pax_global_header00006660000000000000000000000064127471614740014527gustar00rootroot0000000000000052 comment=432071cb5f212d683607ab8f8ff42d31b437fc4a fflas-ffpack-2.2.2/000077500000000000000000000000001274716147400140555ustar00rootroot00000000000000fflas-ffpack-2.2.2/.gitignore000066400000000000000000000051571274716147400160550ustar00rootroot00000000000000Makefile Makefile.in aclocal.m4 autogen.status autom4te.cache benchmarks/Makefile benchmarks/Makefile.in build-aux *.o *.lo *.la *~ *.libs *.trs *log *status *cache *aux config.h config.h.in config.status configure doc/Makefile doc/Makefile.in fflas-ffpack-config fflas-ffpack.pc fflas-ffpack/Makefile fflas-ffpack/Makefile.in fflas-ffpack/config.h fflas-ffpack/fflas-ffpack-optimise.h fflas-ffpack/fflas/Makefile fflas-ffpack/fflas/Makefile.in fflas-ffpack/fflas/fflas_fgemm/Makefile fflas-ffpack/fflas/fflas_fgemm/Makefile.in fflas-ffpack/fflas/fflas_igemm/Makefile fflas-ffpack/fflas/fflas_igemm/Makefile.in fflas-ffpack/fflas/fflas_simd/Makefile fflas-ffpack/fflas/fflas_simd/Makefile.in fflas-ffpack/fflas/fflas_sparse/Makefile fflas-ffpack/fflas/fflas_sparse/Makefile.in fflas-ffpack/fflas/fflas_sparse/coo/Makefile fflas-ffpack/fflas/fflas_sparse/coo/Makefile.in fflas-ffpack/fflas/fflas_sparse/csr/Makefile fflas-ffpack/fflas/fflas_sparse/csr/Makefile.in fflas-ffpack/fflas/fflas_sparse/csr_hyb/Makefile fflas-ffpack/fflas/fflas_sparse/csr_hyb/Makefile.in fflas-ffpack/fflas/fflas_sparse/ell/Makefile fflas-ffpack/fflas/fflas_sparse/ell/Makefile.in fflas-ffpack/fflas/fflas_sparse/ell_simd/Makefile fflas-ffpack/fflas/fflas_sparse/ell_simd/Makefile.in fflas-ffpack/fflas/fflas_sparse/hyb_zo/Makefile fflas-ffpack/fflas/fflas_sparse/hyb_zo/Makefile.in fflas-ffpack/fflas/fflas_sparse/sell/Makefile fflas-ffpack/fflas/fflas_sparse/sell/Makefile.in fflas-ffpack/ffpack/Makefile fflas-ffpack/ffpack/Makefile.in fflas-ffpack/field/Makefile fflas-ffpack/field/Makefile.in fflas-ffpack/interfaces/Makefile fflas-ffpack/interfaces/Makefile.in 
fflas-ffpack/interfaces/libs/Makefile fflas-ffpack/interfaces/libs/Makefile.in fflas-ffpack/paladin/Makefile fflas-ffpack/paladin/Makefile.in fflas-ffpack/utils/Makefile fflas-ffpack/utils/Makefile.in interfaces/ libtool macros/CodeChunk/Makefile macros/CodeChunk/Makefile.in macros/Makefile macros/Makefile.in macros/libtool.m4 macros/ltoptions.m4 macros/ltsugar.m4 macros/ltversion.m4 macros/lt~obsolete.m4 optimiser/Makefile optimiser/Makefile.in stamp-h1 tests/Makefile tests/Makefile.in tests/data/Makefile tests/data/Makefile.in benchmarks/benchmark-fgemm benchmarks/benchmark-pluq tests/regression-check tests/test-compressQ tests/test-det tests/test-echelon tests/test-fadd tests/test-fgemm tests/test-fger tests/test-finit tests/test-fscal tests/test-ftrsm tests/test-lu tests/test-multifile tests/test-rankprofiles tests/test-bini-p tests/test-charpoly-check tests/test-fgemm-check tests/test-ftrsm-check tests/test-invert-check tests/test-permutations tests/test-pluq-check tests/test-simd benchmarks/benchmark-ftrsm fflas-ffpack-2.2.2/AUTHORS000066400000000000000000000005121274716147400151230ustar00rootroot00000000000000François Bissey Brice Boyer Alexis Breust Jean-Guillaume Dumas Pascal Giorgi Gavin Harisson Ashley Lesdalons Clément Pernet Ziad Sultan Bastien Vialla fflas-ffpack-2.2.2/COPYING000066400000000000000000000432541274716147400151200ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. 
This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. 
We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. 
You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. 
But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. 
For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. 
Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. 
If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. fflas-ffpack-2.2.2/COPYING.LESSER000066400000000000000000000636421274716147400161170ustar00rootroot00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. 
To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. 
We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. 
Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. 
Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. 
(For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. 
Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. 
The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. 
(It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. 
You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. 
You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. 
If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
<one line to give the library's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. <signature of Ty Coon>, 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
fflas-ffpack-2.2.2/ChangeLog000066400000000000000000000140671274716147400156370ustar00rootroot000000000000002016-07-30 v2.2.2 * many bug fixes ensuring a consistent support of clang, gcc-4.8 5.3 6.1 icpc on i386 x86_64, ubuntu and fedora, ppcle and osx * new SIMD detection * use pkgconfig * new feature: checkers for Freivalds based verification * improved performance of permutation application * 2016-04-08 v2.2.1 * many fixes to the build system * more consistent use of flags and dependency to precompiled code * fixes all remaining issues for the integration in SageMath * numerous minor fixes to the parallel code 2016-02-23 v2.2.0 * new precompiled interface * improvements and API change for the parallel code * new random matrix generators * fix many bugs 2015-06-11 v2.1.0 Test suite and benchmark improvement : * much larger coverage * run most tests over a wide range of fields * systematic interface and options New features: * parallel PLUQ * computation of rank profiles and rank profile matrices * echelon and reduced echelon forms from both LUdivine and PLUQ * getters to the forms and the transformation matrices * igemm routine for BLAS like gemm on 64bits ints * support of Modular<int64_t> and ModularBalanced<int64_t> using igemm, to support fields of bitsize between 25 and 31 * support of Modular<rint<K> > for Z/pZ with p of size > 32bits (based on Givaro's RecInt multiprecision integers) * support of RNS based gaussian elimination on multiprecision fields * Paladin: DSL for parallel programming addressing OMP, TBB and Kaapi Improvements: * a lot of new sparse mat-vec product improvements * faster parallel and sequential fgemm * many bugs found and removed (no known bugs at release time) * improved helper system, with mode of operations 2014-08-08 v2.0.0 code update : * rank profile * clean namespaces * use field one, zero, etc * fix clang warnings * more blas wrappers (sger, sdot, copy, etc) * simplification of fgemm * simplify blas detection (+cflags) * easier permutation handling * 
improve testers * use std::min, max * many functions have API change to use last pointer argument for return * some more doc * and probably many more in 2+ years ! bugs : * correct permutations * fix fgemm, fgemv, ftrmm, ftrsm bugs * mem leaks * bugs for degenerate cases * fix bounds * and probably many more in 2+ years ! new features : * new pluq 2x2 recursive alg * leftlooking * parallel OMP fgemm, ftrmm, ftrsm * parallel KAAPI fgemm, ftrmm, ftrsm * new testers for pluq, fgemm, etc * new tester for Bini approximate formula * fadd, fsub, finit, fscal, etc * vectorisation using AVX(2) * in place schedules * new Echelon code * helper design for fgemm, fgemv, etc * template factorisation for modular/multiprecision fields * helper traits * automatic matrix field conversion (ie double -> float) * add spmv kernels * enable use of sparse MKL * parallel.h, avx and simd files * new DSL for parallelism * RNS and multiprecision fields * new const_cast, fflas_new etc functions * element_ptr in fields * use Givaro dependency (compulsory now) * new test for regressions (with tickets) * and probably many more in 2+ years ! 2011-04-15 v1.4.0 * Convert project to autotools (à la LinBox et Givaro) 2008-06-05 v1.3.3 * fix the design of specializations to modular modular * give a proper name to ModularBalanced * fix the bugs in the bound computations (Winograd recursion over the finite field was too deep) * prepare the interface for integrating compressed representation for small finite fields 2007-09-28 v1.3.2 * add routines fgetrs and fgesv (cf LAPACK), for system solving. supports rectangular, over/underdetermined systems. 2007-08-29 v1.3.1 * add the benchmark directory, for automatic benchmarking against GOTO and ATLAS BLAS. Adapted from Pascal Giorgi's benchmark system. 2007-08-28 v1.3.0 * new version of ftrmm ftrsm: ftrsm based on a multicascade algorithm reducing the number of modular reductions). 
Automated generation of each of the 48 specializations * several bug fixes * add regression tests: testeur_fgemm, testeur_lqup and testeur_ftrsm 2007-07-05 v1.2.2 * add a transposed version of the LQUP decomposition routine LUdivine * fix many bugs in LUdivine * new schedules for Winograd algorithm for matrix multiplication: 2 cases depending on whether beta = 0 or not, taken from [Huss Ledermann & Al. 96] * add rowEchelon and ReducedRowEchelon routines + associated tests 2007-06-21 v1.2.1 * add the use of float BLAS, if the field cardinality is small enough * improve genericity: gemm can be used over any field domain (not requiring any conversion to an integral representation) * add a variant of Winograd's algorithm with fewer temporaries for the operation C = AxB * add ColumnEchelon and ReducedColumnEchelon routines, using an inplace algorithm, based on the LQUP decomposition of LUdivine * add routines ftrtri (replacing invL), ftrtrm. * fix a bunch of memory leaks in the tests (not yet finished) 2007-03-13 v1.1.2 * change the genericity system for trsm to detect Field implementations over double (compatibility with LinBox) 2007-03-11 v1.1.1 * complete preconditioning phase for the new Charpoly algorithm * new Charpoly algorithm renamed CharpolyArithProg * add exception for failure of the LasVegas algorithm * default charpoly is now: 2 attempts to CharpolyArithProg, then LUKrylov 2007-02-27 v1.1.0 * change some naming conventions in the directories * add a LQUP routine for small dimension (LUdivine_small) and the cascading with LUdivine * put the bound computations in the same file * add dense_generator.C for the generation of random dense matrices in tests * add the new algorithm for characteristic polynomial (temporarily named frobenius) 2006-08-11 v1.0.1 * add the field implementation modular-positive.h, especially for p=2 * add the flag 'balanced' to the finite fields modular, to switch to the appropriate bound computation (fgemm and trsm) * fix a bug in LUDivine LQUP 
elimination (initialisation of the permutation P for N=1 in the terminal case) * fix a bug in the determination of the number of recursive levels of Winograd Algorithm. fflas-ffpack-2.2.2/INSTALL000066400000000000000000000374611274716147400151210ustar00rootroot00000000000000Installation Instructions ************************* Copyright (C) 1994-1996, 1999-2002, 2004-2016 Free Software Foundation, Inc. Copying and distribution of this file, with or without modification, are permitted in any medium without royalty provided the copyright notice and this notice are preserved. This file is offered as-is, without warranty of any kind. Requirements ============ * The FFLAS-FFPACK library requires a version of the GNU C++ compiler greater than or equal to 4.7. Other recent compilers such as clang++ or pathcc are also supported. * the Givaro library v4.0.1 or later: https://github.com/linbox-team/givaro (itself depending on GNU GMP) * a BLAS library: OpenBLAS or ATLAS (recommended) or any other implementation of the Fortran or C blas interface. Basic Installation ================== Briefly, the shell command `./configure && make && make install' should configure, build, and install this package. The following more-detailed instructions are generic; see the `README' file for instructions specific to this package. Some packages provide this `INSTALL' file but do not implement all of the features documented below. The lack of an optional feature in a given package is not necessarily a bug. More recommendations for GNU packages can be found in *note Makefile Conventions: (standards)Makefile Conventions. The `configure' shell script attempts to guess correct values for various system-dependent variables used during compilation. It uses those values to create a `Makefile' in each directory of the package. It may also create one or more `.h' files containing system-dependent definitions. 
Finally, it creates a shell script `config.status' that you can run in the future to recreate the current configuration, and a file `config.log' containing compiler output (useful mainly for debugging `configure'). It can also use an optional file (typically called `config.cache' and enabled with `--cache-file=config.cache' or simply `-C') that saves the results of its tests to speed up reconfiguring. Caching is disabled by default to prevent problems with accidental use of stale cache files. If you need to do unusual things to compile the package, please try to figure out how `configure' could check whether to do them, and mail diffs or instructions to the address given in the `README' so they can be considered for the next release. If you are using the cache, and at some point `config.cache' contains results you don't want to keep, you may remove or edit it. The file `configure.ac' (or `configure.in') is used to create `configure' by a program called `autoconf'. You need `configure.ac' if you want to change it or regenerate `configure' using a newer version of `autoconf'. The simplest way to compile this package is: 1. `cd' to the directory containing the package's source code and type `./configure' to configure the package for your system. Running `configure' might take a while. While running, it prints some messages telling which features it is checking for. 2. Type `make ; make install' to compile the package. 3. Optionally, type `make check' to run any self-tests that come with the package, generally using the just-built uninstalled binaries. 4. Type `make install' to install the programs and any data files and documentation. When installing into a prefix owned by root, it is recommended that the package be configured and built as a regular user, and only the `make install' phase executed with root privileges. 5. Optionally, type `make installcheck' to repeat any self-tests, but this time using the binaries in their final installed location. 
This target does not install anything. Running this target as a regular user, particularly if the prior `make install' required root privileges, verifies that the installation completed correctly. 6. You can remove the program binaries and object files from the source code directory by typing `make clean'. To also remove the files that `configure' created (so you can compile the package for a different kind of computer), type `make distclean'. There is also a `make maintainer-clean' target, but that is intended mainly for the package's developers. If you use it, you may have to get all sorts of other programs in order to regenerate files that came with the distribution. 7. Often, you can also type `make uninstall' to remove the installed files again. In practice, not all packages have tested that uninstallation works correctly, even though it is required by the GNU Coding Standards. 8. Some packages, particularly those that use Automake, provide `make distcheck', which can be used by developers to test that all other targets like `make install' and `make uninstall' work correctly. This target is generally not run by end users. Compilers and Options ===================== Some systems require unusual options for compilation or linking that the `configure' script does not know about. Run `./configure --help' for details on some of the pertinent environment variables. You can give `configure' initial values for configuration parameters by setting variables in the command line or in the environment. Here is an example: ./configure CC=c99 CFLAGS=-g LIBS=-lposix *Note Defining Variables::, for more details. Compiling For Multiple Architectures ==================================== You can compile the package for more than one kind of computer at the same time, by placing the object files for each architecture in their own directory. To do this, you can use GNU `make'. 
`cd' to the directory where you want the object files and executables to go and run the `configure' script. `configure' automatically checks for the source code in the directory that `configure' is in and in `..'. This is known as a "VPATH" build. With a non-GNU `make', it is safer to compile the package for one architecture at a time in the source code directory. After you have installed the package for one architecture, use `make distclean' before reconfiguring for another architecture. On MacOS X 10.5 and later systems, you can create libraries and executables that work on multiple system types--known as "fat" or "universal" binaries--by specifying multiple `-arch' options to the compiler but only a single `-arch' option to the preprocessor. Like this: ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ CPP="gcc -E" CXXCPP="g++ -E" This is not guaranteed to produce working output in all cases, you may have to build one architecture at a time and combine the results using the `lipo' tool if you have problems. Installation Names ================== By default, `make install' installs the package's commands under `/usr/local/bin', include files under `/usr/local/include', etc. You can specify an installation prefix other than `/usr/local' by giving `configure' the option `--prefix=PREFIX', where PREFIX must be an absolute file name. You can specify separate installation prefixes for architecture-specific files and architecture-independent files. If you pass the option `--exec-prefix=PREFIX' to `configure', the package uses PREFIX as the prefix for installing programs and libraries. Documentation and other data files still use the regular prefix. In addition, if you use an unusual directory layout you can give options like `--bindir=DIR' to specify different values for particular kinds of files. 
Run `configure --help' for a list of the directories you can set and what kinds of files go in them. In general, the default for these options is expressed in terms of `${prefix}', so that specifying just `--prefix' will affect all of the other directory specifications that were not explicitly provided. The most portable way to affect installation locations is to pass the correct locations to `configure'; however, many packages provide one or both of the following shortcuts of passing variable assignments to the `make install' command line to change installation locations without having to reconfigure or recompile. The first method involves providing an override variable for each affected directory. For example, `make install prefix=/alternate/directory' will choose an alternate location for all directory configuration variables that were expressed in terms of `${prefix}'. Any directories that were specified during `configure', but not in terms of `${prefix}', must each be overridden at install time for the entire installation to be relocated. The approach of makefile variable overrides for each directory variable is required by the GNU Coding Standards, and ideally causes no recompilation. However, some platforms have known limitations with the semantics of shared libraries that end up requiring recompilation when using this method, particularly noticeable in packages that use GNU Libtool. The second method involves providing the `DESTDIR' variable. For example, `make install DESTDIR=/alternate/directory' will prepend `/alternate/directory' before all installation names. The approach of `DESTDIR' overrides is not required by the GNU Coding Standards, and does not work on platforms that have drive letters. On the other hand, it does better at avoiding recompilation issues, and works well even when some directory options were not specified in terms of `${prefix}' at `configure' time. 
Optional Features ================= If the package supports it, you can cause programs to be installed with an extra prefix or suffix on their names by giving `configure' the option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. Some packages pay attention to `--enable-FEATURE' options to `configure', where FEATURE indicates an optional part of the package. They may also pay attention to `--with-PACKAGE' options, where PACKAGE is something like `gnu-as' or `x' (for the X Window System). The `README' should mention any `--enable-' and `--with-' options that the package recognizes. For packages that use the X Window System, `configure' can usually find the X include and library files automatically, but if it doesn't, you can use the `configure' options `--x-includes=DIR' and `--x-libraries=DIR' to specify their locations. Some packages offer the ability to configure how verbose the execution of `make' will be. For these packages, running `./configure --enable-silent-rules' sets the default to minimal output, which can be overridden with `make V=1'; while running `./configure --disable-silent-rules' sets the default to verbose, which can be overridden with `make V=0'. Particular systems ================== On HP-UX, the default C compiler is not ANSI C compatible. If GNU CC is not installed, it is recommended to use the following options in order to use an ANSI C compiler: ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" and if that doesn't work, install pre-built binaries of GCC for HP-UX. HP-UX `make' updates targets which have the same time stamps as their prerequisites, which makes it generally unusable when shipped generated files such as `configure' are involved. Use GNU `make' instead. On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot parse its `<wchar.h>' header file. The option `-nodtk' can be used as a workaround. 
If GNU CC is not installed, it is therefore recommended to try ./configure CC="cc" and if that doesn't work, try ./configure CC="cc -nodtk" On Solaris, don't put `/usr/ucb' early in your `PATH'. This directory contains several dysfunctional programs; working variants of these programs are available in `/usr/bin'. So, if you need `/usr/ucb' in your `PATH', put it _after_ `/usr/bin'. On Haiku, software installed for all users goes in `/boot/common', not `/usr/local'. It is recommended to use the following options: ./configure --prefix=/boot/common Specifying the System Type ========================== There may be some features `configure' cannot figure out automatically, but needs to determine by the type of machine the package will run on. Usually, assuming the package is built to be run on the _same_ architectures, `configure' can figure that out, but if it prints a message saying it cannot guess the machine type, give it the `--build=TYPE' option. TYPE can either be a short name for the system type, such as `sun4', or a canonical name which has the form: CPU-COMPANY-SYSTEM where SYSTEM can have one of these forms: OS KERNEL-OS See the file `config.sub' for the possible values of each field. If `config.sub' isn't included in this package, then this package doesn't need to know the machine type. If you are _building_ compiler tools for cross-compiling, you should use the option `--target=TYPE' to select the type of system they will produce code for. If you want to _use_ a cross compiler, that generates code for a platform different from the build platform, you should specify the "host" platform (i.e., that on which the generated programs will eventually be run) with `--host=TYPE'. Sharing Defaults ================ If you want to set default values for `configure' scripts to share, you can create a site shell script called `config.site' that gives default values for variables like `CC', `cache_file', and `prefix'. 
`configure' looks for `PREFIX/share/config.site' if it exists, then `PREFIX/etc/config.site' if it exists. Or, you can set the `CONFIG_SITE' environment variable to the location of the site script. A warning: not all `configure' scripts look for a site script. Defining Variables ================== Variables not defined in a site shell script can be set in the environment passed to `configure'. However, some packages may run configure again during the build, and the customized values of these variables may be lost. In order to avoid this problem, you should set them in the `configure' command line, using `VAR=value'. For example: ./configure CC=/usr/local2/bin/gcc causes the specified `gcc' to be used as the C compiler (unless it is overridden in the site shell script). Unfortunately, this technique does not work for `CONFIG_SHELL' due to an Autoconf limitation. Until the limitation is lifted, you can use this workaround: CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash `configure' Invocation ====================== `configure' recognizes the following options to control how it operates. `--help' `-h' Print a summary of all of the options to `configure', and exit. `--help=short' `--help=recursive' Print a summary of the options unique to this package's `configure', and exit. The `short' variant lists options used only in the top level, while the `recursive' variant lists options also present in any nested packages. `--version' `-V' Print the version of Autoconf used to generate the `configure' script, and exit. `--cache-file=FILE' Enable the cache: use and save the results of the tests in FILE, traditionally `config.cache'. FILE defaults to `/dev/null' to disable caching. `--config-cache' `-C' Alias for `--cache-file=config.cache'. `--quiet' `--silent' `-q' Do not print messages saying which checks are being made. To suppress all normal output, redirect it to `/dev/null' (any error messages will still be shown). 
`--srcdir=DIR' Look for the package's source code in directory DIR. Usually `configure' can determine that directory automatically. `--prefix=DIR' Use DIR as the installation prefix. *note Installation Names:: for more details, including other options available for fine-tuning the installation locations. `--no-create' `-n' Run the configure checks, but stop before creating any output files. `configure' also accepts some other, not widely useful, options. Run `configure --help' for more details. fflas-ffpack-2.2.2/Makefile.am000066400000000000000000000046451274716147400161220ustar00rootroot00000000000000
# Copyright (c) 2011 FFLAS-FFPACK
# written by Brice Boyer (briceboyer)
# adapted from LinBox configuration
#
# ========LICENCE========
# This file is part of the library FFLAS-FFPACK.
#
# FFLAS-FFPACK is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
# ========LICENCE========
#/

# Top-level automake input: lists the sub-directories to build, installs the
# pkg-config file and the fflas-ffpack-config script, and forwards the
# docs / perfpublisher / examples / benchmarks targets to their directories.

ACLOCAL_AMFLAGS = -I macros
SUBDIRS=fflas-ffpack tests doc benchmarks macros optimiser examples

pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = fflas-ffpack.pc

# include_HEADERS=fflas-ffpack-config.h
#!@todo add examples dir XXX

docs:doc/fflas-ffpack-html/index.html

doc/fflas-ffpack-html/index.html:
	(cd doc; ${MAKE} docs)

docs_dev:doc/fflas-ffpack-dev-html/index.html

doc/fflas-ffpack-dev-html/index.html:
	(cd doc; ${MAKE} docs_dev)

perfpublisher: benchmarks/perfpublisher tests/perfpublisher

benchmarks/perfpublisher:
	(cd benchmarks; ${MAKE} perfpublisher)

tests/perfpublisher:
	(cd tests; ${MAKE} perfpublisher)

examples:
	(cd examples; ${MAKE} examples)

benchmarks:
	(cd benchmarks; ${MAKE} benchmarks)

# Remove the (possibly non-empty) header tree once the regular uninstall rules
# have run.  Only the package's own directory may be deleted here:
# "$(mandir)", "$(mandir)/man1" and "$(datarootdir)/" are shared with every
# other package installed under the same prefix and must never be passed to
# `rm -rf'.  Removing "$(includedir)/fflas-ffpack/" also removes all of its
# sub-directories (fflas, ffpack, field, utils, paladin, interfaces, ...).
uninstall-hook:
	(test -d "$(includedir)/fflas-ffpack" && rm -rf \
	 "$(includedir)/fflas-ffpack/" ) || \
	true

.PHONY:examples benchmarks

bin_SCRIPTS=fflas-ffpack-config

# Maintainer shortcut: commit everything, then synchronise with the remote.
git:
	git commit -a; git pull; git push

VERSION=2.2.2

# EXTRA_DIST=incremente-versions
fflas-ffpack-2.2.2/NEWS000066400000000000000000000000001274716147400145420ustar00rootroot00000000000000fflas-ffpack-2.2.2/README.md000066400000000000000000000110121274716147400153270ustar00rootroot00000000000000# FFLAS-FFPACK: Finite Field Linear Algebra Subroutines/Package [![Build 
Status](https://ci.inria.fr/linbox/buildStatus/icon?job=FFLAS-FFPACK)](https://ci.inria.fr/linbox/view/LinBox%20ecosystem/job/FFLAS-FFPACK/) ## PURPOSE The FFLAS-FFPACK library provides a set of basic routines for linear algebra over a finite field or the ring of integers with dense and sparse matrices. It is inspired by the BLAS interface (Basic Linear Algebra Subprograms) and the LAPACK library for numerical linear algebra, and shares part of their design. Yet it differs in many aspects due to the specificities of computing over exact domains such as finite fields and the field of rationals: - it is generic with respect to the finite field, so as to accommodate a large variety of field sizes and implementations; - consequently all routines use the C++ template genericity and the library is primarily meant to be used as a source code library, to be included and compiled in the user's software. - However, we also provide a compiled version instantiating most common routines over the most common finite fields. ## LICENSE FFLAS-FFPACK is distributed under the terms of the GNU LGPL v2.1 or later (see LICENSE). ## REQUIREMENTS: - a C++ compiler supporting the C++11 standard. This means g++ v4.7 or greater, clang++ v3.4 or greater, icpc v16 or greater (earlier versions of clang and icpc might also work but have not been tested) - A BLAS library conforming to either the C or Fortran BLAS standard: OpenBLAS (recommended), or ATLAS. Make sure to use a single threaded version of the BLAS library. - [Givaro](https://github.com/linbox-team/givaro) version at least 4.0.1, providing the implementations of the coefficient fields/rings. ## INSTALLATION In brief: ```./configure && make && make install``` The most commonly used options include: - `--with-blas-libs=<libs>` : to specify the arguments for the linker to find the BLAS - `--enable-optimization` : to run configure-time optimizations Type `./configure --help` to list all options available. 
Note that `givaro` is automatically detected by pkg-config, so you no longer need to pass a `--with-givaro=...` option. You may need to set the `PKG_CONFIG_PATH` environment variable to `<givaro-prefix>/lib/pkgconfig` if you have installed it in a non-standard directory. For example, on an x86_64 architecture: - Using OpenBLAS in Fedora: - install the package `openblas-devel.x86_64`, - run `./configure --enable-optimization --with-blas-libs="-lopenblas"` - Using OpenBLAS in Debian, Ubuntu, Mint, and all Debian-based distributions: - avoid using the distribution's package, as it is threaded by default. You need to compile openblas yourself on these systems, - run `./configure --enable-optimization --with-blas-libs="-lopenblas"` - Using ATLAS in Debian, Ubuntu, Mint: - install the package `libatlas-dev`, - run `./configure --enable-optimization --with-blas-libs="-latlas -lcblas"` - Using ATLAS in Fedora: - install the package `atlas-devel.x86_64`, - run `./configure --enable-optimization --with-blas-libs="-L/usr/lib64/atlas -lsatlas"`. - Using Accelerate Framework on OS-X: - run `./configure --enable-optimization --with-blas-libs="-framework Accelerate"`. See INSTALL for further details. ## AVAILABILITY from [linbox-team/fflas-ffpack](https://github.com/linbox-team/fflas-ffpack) ## AUTHORS The FFLAS-FFPACK group (see AUTHORS file for a list of contributors). ## Citing FFLAS-FFPACK If your research depends on the FFLAS-FFPACK library, please consider citing the project as ``` @manual{fflas-ffpack, title = {{FFLAS-FFPACK}: {F}inite {F}ield {L}inear {A}lgebra {S}ubroutines / {P}ackage}, author = {The FFLAS-FFPACK group}, edition = {v2.2.1}, year = {2016}, note = {\url{http://github.com/linbox-team/fflas-ffpack}} } ``` Or you may also consider citing the related research article: ``` @article{DGP:2008, author = {Jean-Guillaume Dumas and Pascal Giorgi and Cl{\'e}ment Pernet}, title = {Dense Linear Algebra over Word-Size Prime Fields: the FFLAS and FFPACK Packages}, journal = {ACM Trans. 
on Mathematical Software (TOMS)}, volume = {35}, number = {3}, year = {2008}, issn = {0098-3500}, pages = {1--42}, doi = {10.1145/1391989.1391992}, publisher = {ACM Press}, address = {New York, NY, USA} } ``` ## Contact and discussion For any bug report, feature or help request, please file an issue on github's [issue tracker](https://github.com/linbox-team/fflas-ffpack/issues). Please address any other request, suggestion and comment to the discussion group [ffpack-devel](http://groups.google.com/group/ffpack-devel). fflas-ffpack-2.2.2/TODO000066400000000000000000000016111274716147400145440ustar00rootroot00000000000000LUdivine-PLUQ * Clean up of all base cases * Only one routine, and automated switch to all implementations FTRTRI/FTRTRM * Optimize base cases Conversion double -> float for small moduli: * should be done in each routine, not only gemm Simplification of helpers: * currently all mmhelpers have Amax,Amin,Bmax,Bmin, Cmax,Cmin,Outmax, Outmin, and all related features for delayed reductions. * this is not suited for other FieldTraits (say Generic, Multiprec,...) TODO: - write a by-default minimal mmhelper - specialize the mmhelper with delayedModular trait with all the machinery * The NeedPreaddreduction system is error-prone and ugly: ==> introduce AddHelpers - carry max min outmax outmin info when used with a DelayedModular FieldTraits - decide when a mod is required in this case - empty otherwise. - Two bool params: add/sub switch, and inplace switch. 
fflas-ffpack-2.2.2/_clang-format000066400000000000000000000030171274716147400165120ustar00rootroot00000000000000--- Language: Cpp # BasedOnStyle: LLVM AccessModifierOffset: -2 ConstructorInitializerIndentWidth: 4 AlignEscapedNewlinesLeft: false AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AllowShortFunctionsOnASingleLine: true AlwaysBreakTemplateDeclarations: false AlwaysBreakBeforeMultilineStrings: false BreakBeforeBinaryOperators: false BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BinPackParameters: true ColumnLimit: 120 ConstructorInitializerAllOnOneLineOrOnePerLine: false DerivePointerBinding: false ExperimentalAutoDetectBinPacking: false IndentCaseLabels: false MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakString: 1000 PenaltyBreakFirstLessLess: 120 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerBindsToType: false SpacesBeforeTrailingComments: 1 Cpp11BracedListStyle: false Standard: Cpp11 IndentWidth: 4 TabWidth: 4 UseTab: Never BreakBeforeBraces: Attach IndentFunctionDeclarationAfterType: false SpacesInParentheses: false SpacesInAngles: false SpaceInEmptyParentheses: false SpacesInCStyleCastParentheses: false SpacesInContainerLiterals: true SpaceBeforeAssignmentOperators: true ContinuationIndentWidth: 4 CommentPragmas: '^ IWYU pragma:' SpaceBeforeParens: ControlStatements ... fflas-ffpack-2.2.2/autogen.sh000077500000000000000000000134441274716147400160640ustar00rootroot00000000000000#!/bin/sh # Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # Bootstrap script: saves its own command line in autogen.status, checks that # autoconf, automake and (when configure.ac uses them) libtool and gettext are # installed, then, in every directory holding a configure.ac and no NO-AUTO-GEN # file, runs libtoolize (only with AC_PROG_LIBTOOL), aclocal, autoheader (only # with AC_CONFIG_HEADERS), automake and autoconf. Unless NOCONFIGURE is set it # finally runs configure with --enable-maintainer-mode plus the given arguments. # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ # Run this to generate all the initial makefiles, etc. # Recover command line, with double-quotes CMDLINE="" for arg in "$@" do WHO="`echo $arg | cut -d'=' -f1`" WHAT="`echo $arg | cut -s -d'=' -f2`" if test "x$WHAT" = "x"; then CMDLINE="$CMDLINE $WHO" else CMDLINE="$CMDLINE $WHO=\"$WHAT\"" fi done echo "$0 $CMDLINE" > autogen.status chmod +x autogen.status # Starts configuring srcdir=`dirname $0` test -z "$srcdir" && srcdir=. PKG_NAME="FFLAS-FFPACK" (test -f $srcdir/configure.ac \ && test -f $srcdir/fflas-ffpack/fflas-ffpack.doxy ) || { echo -n "**Error**: Directory "\`$srcdir\'" does not look like the" echo " top-level "\`$PKG_NAME\'" directory" exit 1 } ORIGDIR=`pwd` cd $srcdir PROJECT=fflasffpack TEST_TYPE=-f DIE=0 # Defaults LIBTOOL=libtool LIBTOOLIZE=libtoolize # Fix OSx problem with GNU libtool (uname -a|grep -v Darwin) < /dev/null > /dev/null 2>&1 || { echo "....Adding fix for OSX" if command -v "glibtoolize" >/dev/null; then LIBTOOL=glibtool LIBTOOLIZE=glibtoolize fi } (autoconf --version) < /dev/null > /dev/null 2>&1 || { echo echo "You must have autoconf installed to compile $PROJECT." 
echo "Download the appropriate package for your distribution," echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/" DIE=1 } (automake --version) < /dev/null > /dev/null 2>&1 || { echo echo "You must have automake installed to compile $PROJECT." echo "Download the appropriate package for your distribution," echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/" DIE=1 } (automake --version) < /dev/null > /dev/null 2>&1 || { echo echo "You must have automake installed to compile $PROJECT." echo "Get ftp://sourceware.cygnus.com/pub/automake/automake-1.4.tar.gz" echo "(or a newer version if it is available)" DIE=1 } (grep "^AC_PROG_LIBTOOL" configure.ac >/dev/null) && { ($LIBTOOLIZE --version) < /dev/null > /dev/null 2>&1 || { echo echo "**Error**: You must have \`libtool' installed to compile $PROJECT." echo "Download the appropriate package for your distribution," echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/" DIE=1 } } grep "^AM_GNU_GETTEXT" configure.ac >/dev/null && { grep "sed.*POTFILES" $srcdir/configure.ac >/dev/null || \ (gettext --version) < /dev/null > /dev/null 2>&1 || { echo echo "**Error**: You must have \`gettext' installed to compile $PROJECT." echo "Download the appropriate package for your distribution," echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/" DIE=1 } } if test "$DIE" -eq 1; then exit 1 fi if test -z "$*"; then echo "I am going to run ./configure with no arguments - if you wish " echo "to pass any to it, please specify them on the $0 command line." fi case $CC in *xlc | *xlc\ * | *lcc | *lcc\ *) am_opt=--include-deps;; esac for coin in `find . 
-name configure.ac -print` do dr=`dirname $coin` if test -f $dr/NO-AUTO-GEN; then echo skipping $dr -- flagged as no auto-gen else echo processing $dr macrodirs=`sed -n -e 's,AM_ACLOCAL_INCLUDE(\(.*\)),\1,gp' < $coin` ( cd $dr aclocalinclude="$ACLOCAL_FLAGS" for k in $macrodirs; do if test -d $k; then aclocalinclude="$aclocalinclude -I $k" ##else ## echo "**Warning**: No such directory \`$k'. Ignored." fi done if grep "^AM_GNU_GETTEXT" configure.ac >/dev/null; then if grep "sed.*POTFILES" configure.ac >/dev/null; then : do nothing -- we still have an old unmodified configure.ac else echo "Creating $dr/aclocal.m4 ..." test -r $dr/aclocal.m4 || touch $dr/aclocal.m4 echo "Running gettextize... Ignore non-fatal messages." echo "no" | gettextize --force --copy echo "Making $dr/aclocal.m4 writable ..." test -r $dr/aclocal.m4 && chmod u+w $dr/aclocal.m4 fi fi if grep "^AM_GNOME_GETTEXT" configure.ac >/dev/null; then echo "Creating $dr/aclocal.m4 ..." test -r $dr/aclocal.m4 || touch $dr/aclocal.m4 echo "Running gettextize... Ignore non-fatal messages." echo "no" | gettextize --force --copy echo "Making $dr/aclocal.m4 writable ..." test -r $dr/aclocal.m4 && chmod u+w $dr/aclocal.m4 fi if grep "^AC_PROG_LIBTOOL" configure.ac >/dev/null; then echo "Running libtoolize..." $LIBTOOLIZE --force --copy fi echo "Running aclocal $aclocalinclude ..." aclocal $aclocalinclude if grep "^AC_CONFIG_HEADERS" configure.ac >/dev/null; then echo "Running autoheader..." autoheader fi echo "Running automake --gnu $am_opt ..." automake -c --add-missing --gnu $am_opt echo "Running autoconf ..." autoconf ) fi done conf_flags="--enable-maintainer-mode" #--enable-iso-c cd "$ORIGDIR" if test x$NOCONFIGURE = x; then echo Running $srcdir/configure $conf_flags "$@" ... $srcdir/configure $conf_flags "$@" \ && echo "Now type \`make install' to compile $PROJECT" || exit 1 else echo Skipping configure process. 
fi fflas-ffpack-2.2.2/benchmarks/000077500000000000000000000000001274716147400161725ustar00rootroot00000000000000fflas-ffpack-2.2.2/benchmarks/Makefile.am000077500000000000000000000064701274716147400202400ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by JGD # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ SUBDIRS = benchmarks: $(BENCHMARKS) AM_CPPFLAGS=-I$(top_srcdir) -g AM_CXXFLAGS = @DEFAULT_CFLAGS@ AM_CPPFLAGS += $(CBLAS_FLAG) $(GIVARO_CFLAGS) $(OPTFLAGS) -I$(top_srcdir)/fflas-ffpack/utils/ -I$(top_srcdir)/fflas-ffpack/fflas/ -I$(top_srcdir)/fflas-ffpack/ffpack -I$(top_srcdir)/fflas-ffpack/field $(CUDA_CFLAGS) $(PARFLAGS) LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) AM_LDFLAGS=-static $(PARLIBS) PERFPUBLISHERFILE=benchmarks-report.xml FFLA_BENCH = benchmark-fgemm benchmark-wino benchmark-ftrsm benchmark-ftrtri benchmark-inverse benchmark-lqup benchmark-pluq benchmark-charpoly benchmark-fgemm-mp benchmark-fgemv-mp benchmark-ftrsm-mp benchmark-lqup-mp benchmark-checkers BLAS_BENCH = benchmark-sgemm$(EXEEXT) benchmark-dgemm benchmark-dtrsm LAPA_BENCH = benchmark-dtrtri benchmark-dgetri benchmark-dgetrf if FFLASFFPACK_HAVE_LAPACK USE_LAPACK_BENCH = $(LAPA_BENCH) 
benchmark_dtrtri_SOURCES = benchmark-dtrtri.C benchmark_dgetri_SOURCES = benchmark-dgetri.C benchmark_dgetrf_SOURCES = benchmark-dgetrf.C endif BENCHMARKS = \ $(FFLA_BENCH) \ $(BLAS_BENCH) \ $(USE_LAPACK_BENCH) \ $(USE_OMP_BENCH) CLEANFILES = $(BENCHMARKS) $(PERFPUBLISHERFILE) EXTRA_PROGRAMS = $(BENCHMARKS) benchmark_sgemm_SOURCES = benchmark-dgemm.C benchmark_dgemm_SOURCES = benchmark-dgemm.C benchmark_dtrsm_SOURCES = benchmark-dtrsm.C benchmark_fgemm_SOURCES = benchmark-fgemm.C benchmark_fgemm_mp_SOURCES = benchmark-fgemm-mp.C benchmark_fgemv_mp_SOURCES = benchmark-fgemv-mp.C benchmark_wino_SOURCES = benchmark-wino.C benchmark_ftrsm_SOURCES = benchmark-ftrsm.C benchmark_ftrsm_mp_SOURCES = benchmark-ftrsm-mp.C benchmark_ftrtri_SOURCES = benchmark-ftrtri.C benchmark_inverse_SOURCES = benchmark-inverse.C benchmark_charpoly_SOURCES = benchmark-charpoly.C benchmark_lqup_SOURCES = benchmark-lqup.C benchmark_lqup_mp_SOURCES = benchmark-lqup-mp.C benchmark_pluq_SOURCES = benchmark-pluq.C benchmark_checkers_SOURCES = benchmark-checkers.C benchmark_sgemm_CXXFLAGS = $(AM_CXXFLAGS) -D__SGEMM__ # Perfpublisher script interaction - AB 2014/11/17 perfpublisher: +./perfpublisher.sh "$(PERFPUBLISHERFILE)" "$(BENCHMARKS)" "$(CXX)" # for compilation of new benchmarks FFLASFFPACK_BIN=@bindir@ define other_compilation $(CXX) $(CXXFLAGS) $(AM_CXXFLAGS) $(OPTFLAGS) $(PARFLAGS) ${INCLUDES} $(AM_CPPFLAGS) $*.C -o $@ $(LDFLAGS) $(LDADD) $(LOADLIBES) endef %:%.C $(other_compilation) %:%.cpp $(other_compilation) fflas-ffpack-2.2.2/benchmarks/Makefile.tests000066400000000000000000000017521274716147400210000ustar00rootroot00000000000000SIZES=6 6 7 8 9 10 11 12 13 BITS=30 60 120 240 480 960 1920 3840 7680 MATR=1000 SHELL := /bin/bash index = $(words $(shell a="$(2)";echo $${a/$(1)*/$(1)} )) swap = $(word $(call index,$(1),${SIZES}),${BITS}) OUTP=output.fgemv MODEL=$(shell cat /proc/cpuinfo | grep "model name" | head -1|cut -d':' -f2| tr -s ' '|sed 's/^ //') EXEC=benchmark-fgemv-mp 
WSRC=${EXEC:%=-W %.C} mkruns = make "OPTFLAGS=-Ofast -DSTD_RECINT_SIZE=$(1) -DBENCH_RECINT" ${EXEC} ${WSRC}; ${EXEC} -b $(call swap,$(1)) -m ${MATR} -k ${MATR} -i 2 |awk '{print "SIZE:",$(1),$$0}' >> ${OUTP}; make "OPTFLAGS=-Ofast -DINTEGER_NO_RNS" ${EXEC} ${WSRC}; echo "NORNS"`${EXEC} -b $(call swap,$(1)) -m ${MATR} -k ${MATR} -i 2`|awk '{print "SIZE:",$(1),$$0}' >> ${OUTP}; all: run split run: - rm ${OUTP} $(foreach siz, ${SIZES}, $(call mkruns,${siz})) split: fgrep RecInt ${OUTP} | sed 's/4rintILm/ /;s/EEE/ /'> ${OUTP}.rint fgrep Givaro ${OUTP} | fgrep NORNS > ${OUTP}.gmp fgrep Givaro ${OUTP} | fgrep -v NORNS > ${OUTP}.rns fflas-ffpack-2.2.2/benchmarks/benchmark-charpoly.C000066400000000000000000000063351274716147400220560ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clement Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; /* Benchmark driver for FFPACK::CharPoly. Options, parsed by FFLAS::parseArguments: -q field characteristic (default 131071), -n matrix dimension (default 2000), -i number of repetitions (default 1), -f input file (empty: the entries are drawn with Field::RandIter), -a index of the algorithmic variant, mapped to an FFPACK_CHARPOLY_TAG by the switch below (values outside 0..6 fall back to FfpackLUK). Only the CharPoly call itself is timed: chrono.usertime() is accumulated in `time`, and the mean per repetition is printed followed by the command string. NOTE(review): the text between '<' and '>' (standard include names, template arguments, part of the repetition loop) appears to have been lost in this copy of the file -- restore it from upstream before compiling. NOTE(review): the result line is written to std::cerr although the comment next to it says "Standard output" and benchmark-dgemm.C prints to std::cout -- confirm which stream is intended. */ int main(int argc, char** argv) { size_t iter = 1; int q = 131071; size_t n = 2000; std::string file = ""; static int variant =0; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the input file (empty for random).", TYPE_STR , &file }, { 'a', "-a algorithm", "Set the algorithmic variant", TYPE_INT, &variant }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); FFPACK::FFPACK_CHARPOLY_TAG CT; switch (variant){ case 0: CT = FFPACK::FfpackLUK; break; case 1: CT = FFPACK::FfpackKG; break; case 2: CT = FFPACK::FfpackDanilevski; break; case 3: CT = FFPACK::FfpackKGFast; break; case 4: CT = FFPACK::FfpackKGFastG; break; case 5: CT = FFPACK::FfpackHybrid; break; case 6: CT = FFPACK::FfpackArithProg; break; default: CT = FFPACK::FfpackLUK; break; } typedef Givaro::ModularBalanced Field; typedef Field::Element Element; Field F(q); FFLAS::Timer chrono; double time=0.0; Element *A; for (size_t i=0;i(n*n); Field::RandIter G(F); for (size_t j=0; j< (size_t)n*n; ++j) G.random(*(A+j)); } std::vector cpol(n); chrono.clear(); chrono.start(); FFPACK::CharPoly (F, cpol, n, A, n, CT); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 std::cerr << "Time: " << time / double(iter) << " Gflops: " << 
"Irrelevant"; FFLAS::writeCommandString(std::cerr, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-checkers.C000066400000000000000000000237351274716147400220270ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Ashley Lesdalons * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #define ENABLE_ALL_CHECKINGS 1 // DO NOT CHANGE #define _NR_TESTS 5 #define _MAX_SIZE_MATRICES 1000 #include "fflas-ffpack/config-blas.h" #include #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include "fflas-ffpack/utils/fflas_randommatrix.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/checkers/checkers_fflas.h" #include "fflas-ffpack/checkers/checkers_ffpack.h" #include using namespace std; /* Benchmark of the FFLAS/FFPACK checkers. For FGEMM, FTRSM, INVERT, PLUQ and CharPoly in turn it runs NR_TESTS trials per band of matrix sizes, accumulating in time1 the user time spent building and running the checker and in time2 the user time of the checked routine, then writes one line per band (band "i-i+Range", pass/NR_TESTS, mean time2, mean time1) to the report file. Options: -q characteristic, -n maximum matrix size, -i repetitions, -r range of sizes, -s seed, -f report file (default checkers_report.txt). The summary on std::cout prints global.realtime() and gffop/global.realtime(). NOTE(review): gffop is only incremented in the PLUQ section and global.stop() precedes the CharPoly section -- confirm both are intended. NOTE(review): the text between '<' and '>' (standard include names, template arguments, loop headers) appears to have been lost in this copy of the file -- restore it from upstream before compiling. */ int main(int argc, char** argv) { size_t NR_TESTS = _NR_TESTS; int q = 131071; size_t MAX_SIZE_MATRICES = _MAX_SIZE_MATRICES; size_t Range = 500; size_t seed( (int) time(NULL) ); std::string file("checkers_report.txt"); Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &MAX_SIZE_MATRICES }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &NR_TESTS }, { 'r', "-r R", "Set the range of matrix sizes.", TYPE_INT , &Range }, { 's', "-s N", "Set the seed.", TYPE_INT , &seed }, { 'f', "-f FILE", "Set the output file.", TYPE_STR , &file }, END_OF_ARGUMENTS }; /* parse the options first, so that -f selects the report file before it is opened */ FFLAS::parseArguments(argc,argv,as); std::ofstream stats_f(file.c_str()); srand (seed); typedef Givaro::Modular Field; typedef std::vector Polynomial; Field F(q); Field::RandIter Rand(F,0,seed); Field::NonZeroRandIter NZRand(Rand); size_t pass; FFLAS::Timer chrono,global; double gffop(0.); global.start(); double time1, time2; Field::Element_ptr A = FFLAS::fflas_new(F,MAX_SIZE_MATRICES+Range,MAX_SIZE_MATRICES+Range); Field::Element_ptr B = FFLAS::fflas_new(F,MAX_SIZE_MATRICES+Range,MAX_SIZE_MATRICES+Range); Field::Element_ptr C = 
FFLAS::fflas_new(F,MAX_SIZE_MATRICES+Range,MAX_SIZE_MATRICES+Range); typename Field::Element alpha,beta,tmp; F.init(alpha, rand()%1000+1); F.init(beta, rand()%1000+1); size_t m,n,k,lda,ldb,ldc; FFLAS::FFLAS_TRANSPOSE ta,tb; stats_f << " Matrix size\tSuccess rate\t\tTime comput.\t\tTime checker\n\n"; // ##### FGEMM ##### stats_f << "FGEMM:\n"; for (size_t i=0; i checker1(Rand,m,n,k,beta,C,ldc); chrono.stop(); time1 += chrono.usertime(); chrono.clear(); chrono.start(); FFLAS::fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); chrono.stop(); time2 += chrono.usertime(); chrono.clear(); chrono.start(); pass += checker1.check(ta,tb,alpha,A,lda,B,ldb,C) ? 1 : 0; chrono.stop(); time1 += chrono.usertime(); } time1 /= NR_TESTS; time2 /= NR_TESTS; stats_f << " " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 << "\t\t" << time1 << endl; } stats_f << endl; // ##### FTRSM ##### stats_f << "FTRSM:\n"; for (size_t i=0; i checker2(Rand, m, n, alpha, B, n); chrono.stop(); time1 += chrono.usertime(); chrono.clear(); chrono.start(); FFLAS::ftrsm(F, side, uplo, trans, diag, m, n, alpha, A, k, B, n); chrono.stop(); time2 += chrono.usertime(); chrono.clear(); chrono.start(); pass += checker2.check(side, uplo, trans, diag, m, n, A, k, B, n); chrono.stop(); time1 += chrono.usertime(); } time1 /= NR_TESTS; time2 /= NR_TESTS; stats_f << " " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 << "\t\t" << time1 << endl; } stats_f << endl; // ##### INVERT ##### stats_f << "INVERT:\n"; int nullity; for (size_t i=0; i checker3(Rand,m,A,m); chrono.stop(); time1 += chrono.usertime(); chrono.clear(); chrono.start(); FFPACK::Invert(F,m,A,m,nullity); chrono.stop(); time2 += chrono.usertime(); chrono.clear(); chrono.start(); pass += checker3.check(A,nullity); chrono.stop(); time1 += chrono.usertime(); } catch(FailureInvertCheck &e) { stats_f << " invert verification failed! 
" << nullity << std::endl; } catch(FailurePLUQCheck &e) { stats_f << " internal PLUQ verification failed! " << std::endl; } } time1 /= NR_TESTS; time2 /= NR_TESTS; stats_f << " " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 << "\t\t" << time1 << endl; } stats_f << endl; // ##### PLUQ ##### stats_f << "PLUQ:\n"; for (size_t i=0; i(m); size_t *Q = FFLAS::fflas_new(n); chrono.clear(); chrono.start(); FFPACK::ForceCheck_PLUQ checker4 (Rand,m,n,A,n); chrono.stop(); time1 += chrono.usertime(); chrono.clear(); chrono.start(); k = FFPACK::PLUQ(F, FFLAS::FflasNonUnit, m, n, A, n, P, Q); chrono.stop(); time2 += chrono.usertime(); #define CUBE(x) ((x)*(x)*(x)) gffop += 2.0/3.0*CUBE(double(k)/1000.0) +2*m/1000.0*n/1000.0*double(k)/1000.0 - double(k)/1000.0*double(k)/1000.0*(m+n)/1000; chrono.clear(); chrono.start(); pass += checker4.check(A,n,k,P,Q); chrono.stop(); time1 += chrono.usertime(); FFLAS::fflas_delete(P,Q); } time1 /= NR_TESTS; time2 /= NR_TESTS; stats_f << " " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 << "\t\t" << time1 << endl; } stats_f << endl; global.stop(); // ##### CharPoly ##### stats_f << "CharPoly:\n"; for (size_t i=0; i checker5(Rand,n,A,n); chrono.stop(); time1 += chrono.usertime(); chrono.clear(); chrono.start(); FFPACK::CharPoly(F,g,n,A,n,FFPACK::FfpackLUK); chrono.stop(); time2 += chrono.usertime(); chrono.clear(); chrono.start(); pass += checker5.check(g); chrono.stop(); time1 += chrono.usertime(); } catch(FailureCharpolyCheck &e) { stats_f << " charpoly verification failed! " << std::endl; } catch(FailurePLUQCheck &e) { stats_f << " internal PLUQ verification failed! 
" << std::endl; } } time1 /= NR_TESTS; time2 /= NR_TESTS; stats_f << " " << i << "-" << i+Range << "\t\t" << pass << "/" << NR_TESTS << "\t\t\t" << time2 << "\t\t" << time1 << endl; } FFLAS::fflas_delete(A); FFLAS::fflas_delete(B); FFLAS::fflas_delete(C); std::cout << "Time: " << global.realtime() << " Gflops: " << gffop/global.realtime() << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-dgemm.C000066400000000000000000000107621274716147400213250ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s //#include "goto-def.h" /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/config-blas.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif #ifndef __SGEMM__ typedef double Floats; #define CBLAS_GEMM cblas_dgemm #else typedef float Floats; #define CBLAS_GEMM cblas_sgemm #endif using namespace std; /* Benchmark driver timing the BLAS product selected by CBLAS_GEMM (cblas_dgemm, or cblas_sgemm when __SGEMM__ is defined) on n x n operands passed as CblasRowMajor with leading dimension n. Options, parsed by FFLAS::parseArguments: -q field characteristic (default 1009), -n dimension (default 2000), -i repetitions (default 1), -f and -g input files for the two operands (empty: random). When more than one repetition is requested, one extra untimed product is computed first (the `if (iter>1)` block). In the main loop only the CBLAS_GEMM call is timed, through chrono.usertime(); the program prints the mean time per repetition and 2*(n/1000)^3 divided by that mean as Gflops, followed by the command string. NOTE(review): the text between '<' and '>' (standard include names, template arguments, loop bodies) appears to have been lost in this copy of the file -- restore it from upstream before compiling. */ int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file1 = ""; std::string file2 = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the first input file (empty for random).", TYPE_STR , &file1 }, { 'g', "-g FILE", "Set the second input file (empty for random).", TYPE_STR , &file2 }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); typedef Givaro::ModularBalanced Field; typedef Field::Element Element; Field F(q); TTimer chrono; double time=0.0;// time2=0.0; Element * A, * B, * C; if (iter>1) { if (!file1.empty()){ A = read_field (F, file1.c_str(), &n, &n); } else{ Field::RandIter G(F); A = FFLAS::fflas_new(n*n); #pragma omp parallel for for (size_t i=0; i(n*n); #pragma omp parallel for for (size_t i=0; i(n*n); CBLAS_GEMM (CblasRowMajor, CblasNoTrans, CblasNoTrans, n,n,n, F.one, A, n, B, n, F.zero, C,n); FFLAS::fflas_delete( A); FFLAS::fflas_delete( B); FFLAS::fflas_delete( C); } for (size_t it=0;it(n*n); #pragma omp parallel for for (size_t i=0; i(n*n); 
#pragma omp parallel for for (size_t i=0; i(n*n); chrono.clear(); chrono.start(); CBLAS_GEMM (CblasRowMajor, CblasNoTrans, CblasNoTrans, n,n,n, F.one, A, n, B, n, F.zero, C,n); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); FFLAS::fflas_delete( B); FFLAS::fflas_delete( C); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 std::cout << "Time: " << time / double(iter) << " Gflops: " << (2.*double(n)/1000.*double(n)/1000.*double(n)/1000.0) / time * double(iter); FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-dgetrf.C000066400000000000000000000064211274716147400215040ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #ifndef __FFLASFFPACK_HAVE_DGETRF #define __FFLASFFPACK_HAVE_DGETRF 1 #endif #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file = ""; size_t NBK = MAX_THREADS; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the input file (empty for random).", TYPE_STR , &file }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); typedef Givaro::Modular Field; typedef Field::Element Element; Field F(q); Field::Element * A; TTimer chrono; double time=0.0; std::vector Piv(n,0); if (iter>1) { if (!file.empty()){ A = read_field(F, file.c_str(), &n, &n); } else { A = FFLAS::fflas_new(n*n); Field::RandIter G(F); PAR_BLOCK{ FFLAS::pfrand(F,G,n,n,A,n/NBK); } clapack_dgetrf(CblasRowMajor,n,n,A,n,&Piv[0]); FFLAS::fflas_delete( A); } } for (size_t it=0;it(n*n); Field::RandIter G(F); PAR_BLOCK{ FFLAS::pfrand(F,G,n,n,A,n/NBK); } } chrono.clear(); chrono.start(); clapack_dgetrf(CblasRowMajor,n,n,A,n,&Piv[0]); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 std::cout << "Time: " << time / double(iter) << " Gflops: " << 
(2.*double(n)/1000.*double(n)/1000.*double(n)/1000.0) / time * double(iter) / 3.; FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-dgetri.C000066400000000000000000000064131274716147400215100ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include // #ifndef __FFLASFFPACK_HAVE_DGETRF // #define __FFLASFFPACK_HAVE_DGETRF 1 // #endif // #ifndef __FFLASFFPACK_HAVE_DGETRI // #define __FFLASFFPACK_HAVE_DGETRI 1 // #endif // #ifndef __FFLASFFPACK_HAVE_DTRTRI // #define __FFLASFFPACK_HAVE_DTRTRI 1 // #endif // #ifndef __FFLASFFPACK_AUTOIMPLEMENT_DGETRI // #define __FFLASFFPACK_AUTOIMPLEMENT_DGETRI 1 // #endif #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the input file (empty for random).", TYPE_STR , &file }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); typedef Givaro::Modular Field; typedef Field::Element Element; vector Piv(n,0); Field F(q); Field::Element * A; TTimer chrono; double time=0.0; for (size_t i=0;i(n*n); Field::RandIter G(F); for (size_t j=0; j<(size_t)n*n; ++j) G.random(*(A+j)); } chrono.clear(); chrono.start(); clapack_dgetrf(CblasRowMajor,n,n,A,n,&Piv[0]); clapack_dgetri(CblasRowMajor,n,A,n,&Piv[0]); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 std::cout << "Time: " << time / double(iter) 
<< " Gflops: " << (2.*double(n)/1000.*double(n)/1000.*double(n)/1000.0) / time * double(iter); FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-dtrsm.C000066400000000000000000000065111274716147400213620ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s // /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file1 = ""; std::string file2 = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the first input file (empty for random).", TYPE_STR , &file1 }, { 'g', "-g FILE", "Set the second input file (empty for random).", TYPE_STR , &file2 }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); typedef Givaro::Modular Field; typedef Field::Element Element; Field F(q); Element * A; Element * B; TTimer chrono; double time=0.0; for (size_t i=0;i(n*n); for (size_t j = 0; j< (size_t)n*n; ++j) G.random(*(A+j)); } if (!file2.empty()){ B = read_field (F, file2.c_str(), &n, &n); } else{ B = FFLAS::fflas_new(n*n); for (size_t j=0 ; j< (size_t)n*n; ++j) G.random(*(A+j)); } for (size_t k=0;k<(size_t)n;++k) while (F.isZero( G.random(*(A+k*(n+1))))); chrono.clear(); chrono.start(); cblas_dtrsm (CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, n,n, F.one, A, n, B, n); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); FFLAS::fflas_delete( B); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 std::cout << "Time: " << time / double(iter) << " 
Gflops: " << (2.*double(n)/1000.*double(n)/1000.*double(n)/1000.0) / time * double(iter) / 3.; FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-dtrtri.C000066400000000000000000000056751274716147400215530ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #define __FFLASFFPACK_HAVE_DTRTRI 1 #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the input file (empty for random).", TYPE_STR , &file }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); typedef Givaro::Modular Field; typedef Field::Element Element; Field F(q); Element * A; TTimer chrono; double time=0.0; Field::RandIter G(F); for (size_t i=0;i(n*n); for (size_t j=0; j<(size_t) n*n; ++j) G.random(*(A+j)); } for (size_t k=0;k<(size_t)n;++k) while (F.isZero( G.random(*(A+k*(n+1))))); chrono.clear(); chrono.start(); clapack_dtrtri(CblasRowMajor,CblasUpper, CblasNonUnit,n,A,n); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 std::cout << "Time: " << time / double(iter) << " Gflops: " << (2.*double(n)/1000.*double(n)/1000.*double(n)/1000.0) / time * double(iter) / 3.; FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-echelon.C000066400000000000000000000246571274716147400216610ustar00rootroot00000000000000/* -*- mode: 
C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clement Pernet , from benchmark-pluq by Ziad Sultan * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include "fflas-ffpack/config-blas.h" #include "fflas-ffpack/fflas/fflas.h" #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; typedef Givaro::ModularBalanced Field; // random generator function: ptrdiff_t myrandom (ptrdiff_t i) { return rand()%i;} // pointer object to it: ptrdiff_t (*p_myrandom)(ptrdiff_t) = myrandom; typename Field::Element* construct_U(const Field& F, Field::RandIter& G, size_t n, size_t r, std::vector& P, size_t commonseed, size_t seed) { size_t lda = n; Field::Element *U=new Field::Element[n*n]; FFLAS::ParSeqHelper::Parallel H; std::vector E(r); PARFOR1D(i,r,H, E[i]=i; ); srand48(commonseed); std::vector Z(n); PARFOR1D(i,n,H, Z[i]=i;); P.resize(r); for(size_t i=0; i& P, size_t seed) { FFLAS::ParSeqHelper::Parallel H; size_t lda = m; size_t taille=m*m; 
Field::Element * L= new Field::Element[taille]; PARFOR1D(i,taille,H, F.init(L[i],F.zero); ); std::vector E(r); PARFOR1D(i,r,H, E[i]=i;); srand48(seed); std::vector Z(m); PARFOR1D(i,m,H, Z[i]=i; ); std::vector Q(r); for(size_t i=0; i(m*n); Field::Element * L, *U; L = FFLAS::fflas_new(m*R); U = FFLAS::fflas_new(R*n); PARFOR1D (i,m*R,H, F.init(L[i], 0.0); ); PARFOR1D (i,n*R,H, F.init(U[i], 0.0); ); PARFOR1D (i,m*n,H, F.init(X[i], 0.0); ); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); PARFOR1D (i,R,H, for (size_t j=0; j(&seed1), sizeof(seed1)); f.read(reinterpret_cast(&seed2), sizeof(seed2)); f.read(reinterpret_cast(&seed3), sizeof(seed3)); f.read(reinterpret_cast(&seed4), sizeof(seed4)); std::vector Index_P(r); Field::RandIter GG(F, seed1); PAR_BLOCK{ pfrand(F,GG,m,n,A,m/NBK); } // std::cout<<"Construct U"<(maxP); size_t *Q = FFLAS::fflas_new(maxQ); FFLAS::ParSeqHelper::Parallel H; PARFOR1D(i,(size_t)m,H, for (size_t j=0; j<(size_t)n; ++j) Acop[i*n+j]= (*(A+i*n+j)); ); for (size_t i=0;i<=iter;++i){ PARFOR1D(j,maxP,H, P[j]=0; ); PARFOR1D(j,maxQ,H, Q[j]=0; ); PARFOR1D(k,(size_t)m,H, for (size_t j=0; j<(size_t)n; ++j) *(A+k*n+j) = *(Acop+k*n+j) ; ); chrono.clear(); if (i) chrono.start(); // Added by AB 2014-12-15 //#ifdef __FFLASFFPACK_USE_OPENMP r = RowEchelonForm(F,m,n,A,n,P,Q,transform,LuTag); if (i) {chrono.stop(); time+=chrono.realtime();} } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 #define CUBE(x) ((x)*(x)*(x)) double gflop = 2.0/3.0*CUBE(double(r)/1000.0) +2*m/1000.0*n/1000.0*double(r)/1000.0 - double(r)/1000.0*double(r)/1000.0*(m+n)/1000; if (transform) gflop += CUBE(double(r)/1000.0)/3.0 + double(r)/1000.0*double(r)/1000.0*double(n-r)/1000.0; std::cout << "Time: " << time / double(iter) << " Gflops: " << gflop / time * double(iter-1); FFLAS::writeCommandString(std::cout, as) << std::endl; //verification if(v) verification_PLUQ(F,Acop,A,P,Q,m,n,R); FFLAS::fflas_delete( A); FFLAS::fflas_delete( Acop); return 0; } 
fflas-ffpack-2.2.2/benchmarks/benchmark-fgemm-mp.C000066400000000000000000000177061274716147400217460ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Pascal Giorgi * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #if not defined(MG_DEFAULT) #define MG_DEFAULT MG_ACTIVE #endif #if not defined(STD_RECINT_SIZE) #define STD_RECINT_SIZE 8 #endif #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include using namespace std; #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "givaro/modular-integer.h" #include "givaro/givcaster.h" #include "fflas-ffpack/paladin/parallel.h" #ifdef BENCH_RECINT #include "recint/recint.h" #endif #ifdef BENCH_FLINT #define __GMP_BITS_PER_MP_LIMB 64 extern "C" { #include "flint/longlong.h" #include "flint/long_extras.h" #include "flint/fmpz_mat.h" #include "flint/fmpz.h" #include "flint/flint.h" } #endif template std::ostream& write_matrix(std::ostream& out, Givaro::Integer p, size_t m, size_t n, T* C, size_t ldc){ size_t www(size_t((double(p.bitsize())*log(2.))/log(10.))); out<<"Matrix("< int tmain(){ srand( (int)seed); srand48(seed); Givaro::Integer::seeding(seed); typedef Givaro::Modular Field; Givaro::Integer p; FFLAS::Timer chrono, TimFreivalds; double time=0.,timev=0.; #ifdef BENCH_FLINT double timeFlint=0.; #endif for (size_t loop=0;loop(ip,p); Givaro::Caster(p,ip); // to check consistency Field F(ip); size_t lda,ldb,ldc; lda=k; ldb=n; ldc=n; typename Field::RandIter Rand(F,seed); typename Field::Element_ptr A,B,C; A= FFLAS::fflas_new(F,m,lda); B= FFLAS::fflas_new(F,k,ldb); C= FFLAS::fflas_new(F,m,ldc); // for (size_t i=0;i(&p))); fmpz_mat_t AA,BB,CC,DD; fmpz_mat_init (AA, m, k); fmpz_mat_init (BB, k, n); fmpz_mat_init (CC, m, n); fmpz_mat_init (DD, m, n); fmpz_t aalpha, bbeta; fmpz_set_mpz(aalpha,*(reinterpret_cast(&alpha))); fmpz_set_mpz(bbeta,*(reinterpret_cast(&beta))); for (size_t i=0;i(A+i*lda+j))); for (size_t i=0;i(B+i*ldb+j))); for (size_t i=0;i(C+i*ldc+j))); chrono.clear();chrono.start(); // DD= A.B fmpz_mat_mul(DD,AA,BB); // CC = beta.C fmpz_mat_scalar_mul_fmpz(CC,CC,bbeta); // CC = CC + DD.alpha 
fmpz_mat_scalar_addmul_fmpz(CC,DD,aalpha); // CC = CC mod p for (size_t i=0;i(); #ifdef BENCH_RECINT r1 += tmain>(); #endif return r1; } fflas-ffpack-2.2.2/benchmarks/benchmark-fgemm.C000066400000000000000000000213361274716147400213260ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s //#include "goto-def.h" /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ // Please do not commit with any of these defines on - AB 2015-01-12 //#define __FFLASFFPACK_USE_TBB //#define __FFLASFFPACK_USE_OPENMP //#define __FFLASFFPACK_USE_DATAFLOW //#define WINO_PARALLEL_TMPS //#define __FFLASFFPACK_FORCE_SEQ //#define PFGEMM_WINO_SEQ 32 //#define CLASSIC_SEQ #define CLASSIC_HYBRID //#define WINO_SEQ //#define DEBUG 1 //#undef NDEBUG //#define FFT_PROFILER //#define PROFILE_FGEMM_MP #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/config-blas.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_KAAPI #include "libkomp.h" #endif using namespace std; using namespace FFLAS; int main(int argc, char** argv) { size_t iter = 3 ; Givaro::Integer q = 131071 ; size_t m = 2000 ; size_t k = 2000 ; size_t n = 2000 ; int nbw = -1 ; int p=0; int t=MAX_THREADS; int NBK = -1; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'm', "-m M", "Set the row dimension of A.", TYPE_INT , &m }, { 'k', "-k K", "Set the col dimension of A.", TYPE_INT , &k }, { 'n', "-n N", "Set the col dimension of B.", TYPE_INT , &n }, { 'w', "-w N", "Set the number of winograd levels (-1 for random).", TYPE_INT , &nbw }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'p', "-p P", "0 for sequential, 1 for 2D iterative, 2 for 2D rec, 3 for 2D rec adaptive, 4 for 3D rc in-place, 5 for 3D rec, 6 for 3D rec adaptive.", TYPE_INT , &p }, { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t }, { 'b', "-b B", "number of numa blocks per dimension for the numa placement", TYPE_INT 
, &NBK }, END_OF_ARGUMENTS }; parseArguments(argc,argv,as); if (NBK==-1) NBK = t; // typedef Givaro::Modular Field; // typedef Givaro::ModularBalanced Field; // typedef Givaro::ModularBalanced Field; typedef Givaro::ModularBalanced Field; // typedef Givaro::Modular Field; typedef Field::Element Element; Field F(q); Timer chrono, TimFreivalds; double time=0.0, timev=0.0; Element * A, * B, * C; Field::RandIter G(F); A = fflas_new(F,m,k,Alignment::CACHE_PAGESIZE); //#pragma omp parallel for collapse(2) schedule(runtime) PAR_BLOCK { pfrand(F,G, m,k,A,m/size_t(NBK)); } B = fflas_new(F,k,n,Alignment::CACHE_PAGESIZE); //#pragma omp parallel for collapse(2) schedule(runtime) PAR_BLOCK { pfrand(F,G, k,n,B,k/NBK); } C = fflas_new(F,m,n,Alignment::CACHE_PAGESIZE); //#pragma omp parallel for collapse(2) schedule(runtime) PAR_BLOCK { pfzero(F, m,n,C,m/NBK); } for (size_t i=0;i<=iter;++i){ // if (argc > 4){ // A = read_field (F, argv[4], &n, &n); // } // else{ chrono.clear(); if (p && p!=7){ // CuttingStrategy meth = RECURSIVE; // StrategyParameter strat = THREADS; typedef CuttingStrategy::Block block; typedef CuttingStrategy::Recursive rec; typedef StrategyParameter::Threads threads; typedef StrategyParameter::TwoD twod; typedef StrategyParameter::TwoDAdaptive twoda; typedef StrategyParameter::ThreeD threed; typedef StrategyParameter::ThreeDAdaptive threeda; typedef StrategyParameter::ThreeDInPlace threedip; PAR_BLOCK{ if (i) { chrono.start(); } switch (p){ case 1:{ MMHelper::value, ParSeqHelper::Parallel > WH(F,nbw, SPLITTER(t,block,threads)); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH); break;} case 2:{ MMHelper::value, ParSeqHelper::Parallel > WH(F,nbw, SPLITTER(t,rec,twod)); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH); break; } case 3:{ MMHelper::value, ParSeqHelper::Parallel > WH(F,nbw, SPLITTER(t,rec,twoda)); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH); break; } 
case 4:{ MMHelper::value, ParSeqHelper::Parallel > WH(F,nbw, SPLITTER(t,rec,threedip)); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH); break; } case 5:{ MMHelper::value, ParSeqHelper::Parallel > WH(F,nbw, SPLITTER(t,rec,threed)); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH); break; } case 6:{ MMHelper::value, ParSeqHelper::Parallel > WH(F,nbw, SPLITTER(t,rec,threeda)); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH); break; } default:{ MMHelper::value, ParSeqHelper::Parallel > WH(F,nbw, SPLITTER(t,block,threads)); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH); break; } } } if (i) {chrono.stop(); time+=chrono.realtime();} }else{ if(p==7){ int nrec = 0; int dim = m; // if(dim < 19000) nrec--; while(dim >= __FFLASFFPACK_WINOTHRESHOLD*2){ dim=dim/2; nrec++; } nrec=std::max(1,nrec); // std::cout<<" WINO_THREShold"<<__FFLASFFPACK_WINOTHRESHOLD<<" nrec = "<//, //typename FieldTraits::value, //ParSeqHelper::Sequential> WH (F, nbw, ParSeqHelper::Sequential()); if (i) chrono.start(); fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n,WH); if (i) {chrono.stop(); time+=chrono.realtime();} } } TimFreivalds.clear(); TimFreivalds.start(); bool pass = freivalds(F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, C,n); TimFreivalds.stop(); timev+=TimFreivalds.usertime(); if (!pass) std::cout<<"FAILED"<s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Pascal Giorgi * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #if not defined(MG_DEFAULT) #define MG_DEFAULT MG_ACTIVE #endif #if not defined(STD_RECINT_SIZE) #define STD_RECINT_SIZE 8 #endif #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include using namespace std; #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "givaro/modular-integer.h" #include "givaro/givcaster.h" #include "fflas-ffpack/paladin/parallel.h" #ifdef BENCH_RECINT #include "recint/recint.h" #endif template std::ostream& write_matrix(std::ostream& out, Givaro::Integer p, size_t m, size_t n, T* C, size_t ldc){ size_t www(size_t((double(p.bitsize())*log(2.))/log(10.))); out<<"Matrix("< int tmain(){ srand( (int)seed); srand48(seed); Givaro::Integer::seeding(seed); typedef Givaro::Modular Field; Givaro::Integer p; FFLAS::Timer chrono, TimFreivalds; double time=0.; for (size_t loop=0;loop(ip,p); Givaro::Caster(p,ip); // to check consistency Field F(ip); size_t lda,ldb,ldc; lda=k; ldb=1; ldc=1; typename Field::RandIter Rand(F,seed); typename Field::Element_ptr A,B,C; A= FFLAS::fflas_new(F,m,lda); B= FFLAS::fflas_new(F,k,ldb); C= FFLAS::fflas_new(F,m,ldc); // for (size_t i=0;i(); #ifdef BENCH_RECINT r1 += tmain>(); #endif return r1; } fflas-ffpack-2.2.2/benchmarks/benchmark-fspmm.C000066400000000000000000000177361274716147400213660ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // 
vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Bastien Vialla * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include #include #include #include #include "fflas-ffpack/config-blas.h" // #include "fflas-ffpac/field/modular-double.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas/fflas_sparse.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; using namespace FFLAS; template T from_string(std::string const &s) { std::stringstream ss(s); T result; ss >> result; // TODO handle errors return result; } template std::pair test_fspmm(size_t iter, const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, int blocksize, typename Field::Element_ptr x, int ldx, typename Field::Element beta, typename Field::Element_ptr y, int ldy) { MatT matrix; sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); TTimer time; time.clear(); time.start(); for (size_t i = 0; i < 
iter; ++i) fspmm(F, matrix, blocksize, x, ldx, 1, y, ldy); time.stop(); sparse_delete(matrix); return make_pair(time.usertime(), matrix.nElements); } template void print_res(pair &p, size_t iter, T as, int blocksize) { // cout << 2*p.second*blocksize*iter << endl; std::cout << "Time: " << p.first / double(iter) << " Gflops: " << ((2*blocksize*p.second)/(1000000.*p.first))*(double(iter)/1000) ; FFLAS::writeCommandString(std::cout, as) << std::endl; } int main(int argc, char **argv) { using Field = Givaro::Modular; using Element = typename Field::Element; size_t iter = 10; int q = 1009; int blocksize = 4; int s = 0; std::string matrixFile = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT, &q }, { 'b', "-b Q", "Set the block size.", TYPE_INT, &blocksize }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT, &iter }, { 's', "-s S", "Compute and print matrix statistics.", TYPE_INT, &s }, { 'f', "-f FILE", "Set matrix file.", TYPE_STR, &matrixFile }, END_OF_ARGUMENTS }; // matrixFile = "matrix/cis.mk8-8.sms"; // matrixFile = "matrix/M06-D9.sms"; // matrixFile = "matrix/GL7d17.sms"; // matrixFile = "data/mat11.sms"; FFLAS::parseArguments(argc, argv, as); // cout << matrixFile << endl; Field F(q); index_t *row = nullptr, *col = nullptr; typename Field::Element_ptr dat; index_t rowdim, coldim; uint64_t nnz; index_t * st = nullptr ; readSmsFormat(matrixFile, F, st, col, dat, rowdim, coldim, nnz); row = fflas_new(nnz); for (index_t j = 0 ; j < rowdim ; ++j) { for (index_t k = st[j] ; k < st[j+1] ; ++k) row[k] = j ; } if (s) { // auto stats = sparse_details::getStat(F, row, col, dat, rowdim, coldim, nnz); // std::cout << "Sparse Matrix statistics : " << std::endl; // stats.print(); std::cout << std::endl; } auto x = FFLAS::fflas_new(F, coldim, blocksize, Alignment::CACHE_LINE); auto y = FFLAS::fflas_new(F, rowdim, blocksize, Alignment::CACHE_LINE); for (size_t i = 0; i < coldim * blocksize; ++i) { x[i] = 1; } for (size_t i = 0; 
i < rowdim * blocksize; ++i) { y[i] = 0; } // auto coo = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, nnz, // blocksize, x, blocksize, 1, y, blocksize); // cout << "COO : "; // print_res(coo, iter, as); // auto coozo = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, // nnz, blocksize, x, blocksize, 1, y, blocksize); // cout << "COO_ZO : "; // print_res(coozo, iter, as); auto csr = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, nnz, blocksize, x, blocksize, 1, y, blocksize); cout << "CSR : "; print_res(csr, iter, as, blocksize); auto ell = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, nnz, blocksize, x, blocksize, 1, y, blocksize); cout << "ELL : "; print_res(ell, iter, as, blocksize); auto ellzo = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, nnz, blocksize, x, blocksize, 1, y, blocksize); cout << "ELL_ZO : "; print_res(ellzo, iter, as, blocksize); // auto csrzo = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, // nnz, blocksize, x, blocksize, 1, y, blocksize); // cout << "CSR_ZO : "; // print_res(csrzo, iter, as); // auto ell = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, nnz, // blocksize, x, blocksize, 1, y, blocksize); // cout << "ELL : "; // print_res(ell, iter, as); // auto ellzo = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, // nnz, blocksize, x, blocksize, 1, y, blocksize); // cout << "ELL_ZO : "; // print_res(ellzo, iter, as); auto hybzo = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, nnz, blocksize, x, blocksize, 1, y, blocksize); cout << "HYB_ZO : "; print_res(hybzo, iter, as, blocksize); auto csrhyb = test_fspmm>(iter, F, row, col, dat, rowdim, coldim, nnz, blocksize, x, blocksize, 1, y, blocksize); cout << "CSR_HYB : "; print_res(csrhyb, iter, as, blocksize); // for (size_t i = 0; i < 10*blocksize; ++i) { // std::cout << y[i] << " "; // } // std::cout << std::endl; // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 // std::cout << "Time: " << coo.first / double(iter) // << " 
Gflops: " << (2*coo.second)/1000000000. / coo.first * double(iter); // FFLAS::writeCommandString(std::cout, as) << std::endl; // std::cout << "Time: " << csr.first / double(iter) // << " Gflops: " << (2*csr.second)/1000000000. / csr.first * double(iter); // FFLAS::writeCommandString(std::cout, as) << std::endl; fflas_delete(x); fflas_delete(y); return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-fspmv.C000066400000000000000000000165451274716147400213740ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Bastien Vialla * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include #include #include #include #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas-ffpack/config-blas.h" // #include "fflas-ffpac/field/modular-double.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas/fflas_sparse.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; using namespace FFLAS; template T from_string(std::string const &s) { std::stringstream ss(s); T result; ss >> result; // TODO handle errors return result; } template std::pair test_fspmv(size_t iter, const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, typename Field::Element_ptr x, typename Field::Element_ptr y, typename Field::Element beta) { MatT matrix; sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); TTimer time; time.clear(); time.start(); for (size_t i = 0; i < iter; ++i) fspmv(F, matrix, x, 1, y); time.stop(); sparse_delete(matrix); return make_pair(time.usertime(), matrix.nElements); } template void print_res(pair &p, size_t iter, T as, int blocksize = 1) { std::cout << "Time: " << p.first / double(iter) << " Gflops: " << (2 * blocksize * p.second) / 1000000000. 
/ p.first * double(iter); FFLAS::writeCommandString(std::cout, as) << std::endl; } int main(int argc, char **argv) { using Field = Givaro::Modular; using Element = typename Field::Element; size_t iter = 10; Givaro::Integer q = 1009; int s = 0; std::string matrixFile = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER, &q }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT, &iter }, { 's', "-s S", "Compute and print matrix statistics.", TYPE_INT, &s }, { 'f', "-f FILE", "Set matrix file.", TYPE_STR, &matrixFile }, END_OF_ARGUMENTS }; // matrixFile = "matrix/cis.mk8-8.sms"; // matrixFile = "matrix/GL7d17.sms"; // matrixFile = "data/mat11.sms"; FFLAS::parseArguments(argc, argv, as); // cout << matrixFile << endl; Field F(q); index_t *row = nullptr, *col = nullptr; typename Field::Element_ptr dat; index_t rowdim = 0, coldim = 0; uint64_t nnz; if ( (matrixFile.find(".sms") != std::string::npos) || (matrixFile.find(".smf") != std::string::npos)) { index_t * st = nullptr ; readSmsFormat(matrixFile, F, st, col, dat, rowdim, coldim, nnz); row = fflas_new(nnz); for (index_t j = 0 ; j < rowdim ; ++j) { for (index_t k = st[j] ; k < st[j+1] ; ++k) row[k] = j ; } } else if (matrixFile.find(".spr") != std::string::npos) { readSprFormat(matrixFile, F, row, col, dat, rowdim, coldim, nnz); } if (s) { //auto stats = sparse_details::getStat(F, row, col, dat, rowdim, coldim, nnz); //std::cout << "Sparse Matrix statistics : " << std::endl; //stats.print(); //std::cout << std::endl; } auto x = FFLAS::fflas_new(F, coldim, 1, Alignment::CACHE_LINE); auto y = FFLAS::fflas_new(F, rowdim, 1, Alignment::CACHE_LINE); for (size_t i = 0; i < coldim; ++i) { x[i] = 1; } for (size_t i = 0; i < rowdim; ++i) { y[i] = 0; } auto coo = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "COO : "; print_res(coo, iter, as); auto coozo = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "COO_ZO : "; 
print_res(coozo, iter, as); auto csr = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "CSR : "; print_res(csr, iter, as); auto csrzo = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "CSR_ZO : "; print_res(csrzo, iter, as); auto ell = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "ELL : "; print_res(ell, iter, as); auto ellzo = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "ELL_ZO : "; print_res(ellzo, iter, as); // auto ellsimd = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, // nnz, x, y, 1); // cout << "ELL_simd : "; // print_res(ellsimd, iter, as); // auto ellsimdzo = test_fspmv>(iter, F, row, col, dat, rowdim, // coldim, nnz, x, y, 1); // cout << "ELL_simd_ZO : "; // print_res(ellsimdzo, iter, as); auto csrhyb = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "CSR_HYB : "; print_res(csrhyb, iter, as); auto hybzo = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "HYB_ZO : "; print_res(hybzo, iter, as); // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 // std::cout << "Time: " << coo.first / double(iter) // << " Gflops: " << (2*coo.second)/1000000000. / coo.first * double(iter); // FFLAS::writeCommandString(std::cout, as) << std::endl; // std::cout << "Time: " << csr.first / double(iter) // << " Gflops: " << (2*csr.second)/1000000000. / csr.first * double(iter); // FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-ftrsm-mp.C000066400000000000000000000063121274716147400217750ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Pascal Giorgi * * This file is Free Software and part of FFLAS-FFPACK. 
* * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include using namespace std; #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "givaro/modular-integer.h" int main(int argc, char** argv){ srand((int)time(NULL)); srand48(time(NULL)); static size_t iters = 3 ; static Givaro::Integer q = -1 ; static unsigned long b = 512 ; static size_t m = 512 ; static size_t n = 512 ; static Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'b', "-b B", "Set the bitsize of the random characteristic.", TYPE_INT , &b }, { 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m }, { 'n', "-n N", "Set the dimension n of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iters }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); size_t seed= time(NULL); typedef Givaro::Modular Field; FFLAS::Timer chrono; double time=0.; Givaro::Integer p; Givaro::IntPrimeDom IPD; for (size_t i=0;is,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * 
========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; int main(int argc, char** argv) { size_t iter = 3; int q = 1009; size_t m = 2000 ; size_t n = 2000; std::string file1 = ""; std::string file2 = ""; int t=MAX_THREADS; int NBK = -1; int p = 0; // 0 for sequential 1 for pIter-sRec ; 2 for pRec; 3 for hybrid Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'm', "-m M", "Set the row dimension of the RHS matrix.", TYPE_INT , &m }, { 'n', "-n N", "Set the col dimension of the RHS matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the first input file (empty for random).", TYPE_STR, &file1 }, { 'g', "-g FILE", "Set the second input file (empty for random).", TYPE_STR, &file2 }, { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT, &t }, { 'b', "-b B", "number of numa blocks per dimension for the numa placement", TYPE_INT, &NBK }, { 
'p', "-p P", "0 for sequential, 1 for Iterative, 2 for Recursive, 3 for Hybrid.", TYPE_INT , &p }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); if (NBK==-1) NBK = t; typedef Givaro::ModularBalanced Field; typedef Field::Element Element; Field F(q); Element * A; Element * B; FFLAS::Timer chrono; double time=0.0; Field::RandIter G(F); // if (argc > 5){ // A = read_field (F, argv[5], &n, &n); // } // else{ if (!file1.empty()){ A = read_field (F, file1.c_str(), &n, &n); } else{ A = FFLAS::fflas_new (F,m,m,Alignment::CACHE_PAGESIZE); PAR_BLOCK{ FFLAS::pfrand(F,G,m,m,A,m/NBK); } for (size_t k=0;k<(size_t)m;++k) while (F.isZero( G.random(*(A+k*(m+1))))); } if (!file2.empty()){ B = read_field (F, file2.c_str(), &m, &n); } else{ B = FFLAS::fflas_new(F,m,n,Alignment::CACHE_PAGESIZE); PAR_BLOCK{ FFLAS::pfrand(F,G,m,n,B,m/NBK); } } //} for (size_t i=0;i<=iter;++i){ chrono.clear(); if (i) chrono.start(); if (!p){ FFLAS::ParSeqHelper::Sequential H; FFLAS::ftrsm (F, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, m,n, F.one, A, m, B, n, H); } else{ FFLAS::ParSeqHelper::Parallel PSH(t); PAR_BLOCK{ switch (p) { case 1: { FFLAS::TRSMHelper > PH (PSH); FFLAS::ftrsm (F, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, m,n, F.one, A, m, B, n, PH); break;} case 2: {FFLAS::TRSMHelper > PH (PSH); FFLAS::ftrsm (F, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, m,n, F.one, A, m, B, n, PH); break;} case 3: FFLAS::TRSMHelper > PH (PSH); FFLAS::ftrsm (F, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, m,n, F.one, A, m, B, n, PH); break; } } } if (i) {chrono.stop(); time+=chrono.realtime();} } FFLAS::fflas_delete( A); FFLAS::fflas_delete( B); // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 std::cout << "Time: " << time / double(iter) << " Gflops: " << (double(m)/1000.*double(m)/1000.*double(n)/1000.0) / time * double(iter); 
FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-ftrtri.C000066400000000000000000000054571274716147400215530ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the input file (empty for random).", TYPE_STR , &file }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); typedef Givaro::Modular Field; typedef Field::Element 
Element; Field F(q); Element * A; FFLAS::Timer chrono; double time=0.0; Field::RandIter G(F); for (size_t i=0;i 4){ A = read_field (F, argv[4], &n, &n); } else { A = FFLAS::fflas_new(n*n); for (size_t j=0; j<(size_t) n*n; ++j) G.random(*(A+j)); } for (size_t k=0;k<(size_t)n;++k) while (F.isZero( G.random(*(A+k*(n+1))))); chrono.clear(); chrono.start(); FFPACK::ftrtri (F,FFLAS::FflasUpper, FFLAS::FflasNonUnit, n, A, n); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 #define CUBE(x) ((x)*(x)*(x)) std::cout << "Time: " << time / double(iter) << " Gflops: " << CUBE(double(n)/1000.) / time * double(iter) / 3.; FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-inverse.C000066400000000000000000000053421274716147400217050ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Clément Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the input file (empty for random).", TYPE_STR , &file }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); typedef Givaro::ModularBalanced Field; typedef Field::Element Element; Field F(q); Field::Element * A; FFLAS::Timer chrono; double time=0.0; for (size_t i=0;i(n*n); Field::RandIter G(F); for (size_t j=0; j<(size_t)n*n; ++j) G.random(*(A+j)); } int nullity=0; chrono.clear(); chrono.start(); FFPACK::Invert (F, n, A, n, nullity); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( A); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 #define CUBE(x) ((x)*(x)*(x)) std::cout << "Time: " << time / double(iter) << " Gflops: " << 2. * CUBE(double(n)/1000.) 
/ time * double(iter); FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-lqup-mp.C000066400000000000000000000062671274716147400216340ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Pascal Giorgi * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include using namespace std; #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include "givaro/modular-integer.h" int main(int argc, char** argv){ srand((int)time(NULL)); srand48(time(NULL)); static size_t iters = 3 ; static Givaro::Integer q = -1 ; static unsigned long b = 512 ; static size_t m = 512 ; static size_t n = 512 ; static Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'b', "-b B", "Set the bitsize of the random characteristic.", TYPE_INT , &b }, { 'm', "-m M", "Set the dimension m of the matrix.", TYPE_INT , &m }, { 'n', "-n N", "Set the dimension n of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iters }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); size_t seed= time(NULL); typedef Givaro::Modular Field; FFLAS::Timer chrono; double time=0.; Givaro::Integer p; Givaro::IntPrimeDom IPD; for (size_t i=0;i(n) ; size_t * Q = FFLAS::fflas_new(m) ; for (size_t ii=0;iis,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; int main(int argc, char** argv) { size_t iter = 1; int q = 1009; size_t n = 2000; std::string file = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'f', "-f FILE", "Set the input file (empty for random).", TYPE_STR , &file }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); FFLAS::parseArguments(argc,argv,as); typedef Givaro::Modular Field; typedef Field::Element Element; Field F(q); FFLAS::Timer chrono; double time=0.0; Element *A; for (size_t i=0;i(n*n); Field::RandIter G(F); for (size_t j=0; j< (size_t)n*n; ++j) G.random(*(A+j)); } size_t * P = FFLAS::fflas_new(n); size_t * Q = FFLAS::fflas_new(n); chrono.clear(); chrono.start(); FFPACK::LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, n, n, A, n, P, Q); chrono.stop(); time+=chrono.usertime(); FFLAS::fflas_delete( P); FFLAS::fflas_delete( Q); FFLAS::fflas_delete( A); } // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 #define CUBE(x) ((x)*(x)*(x)) std::cout << "Time: " << time / double(iter) << " Gflops: " << 2. * CUBE(double(n)/1000.) / 3. 
/ time * double(iter); FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-pfspmv.C000066400000000000000000000145011274716147400215420ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Bastien Vialla * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include #include #include #include #include "fflas-ffpack/config-blas.h" // #include "fflas-ffpac/field/modular-double.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; using namespace FFLAS; template T from_string(std::string const &s) { std::stringstream ss(s); T result; ss >> result; // TODO handle errors return result; } template std::pair test_pfspmv(size_t iter, const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, typename Field::Element_ptr x, typename Field::Element_ptr y, typename Field::Element beta) { MatT matrix; sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); TTimer time; time.clear(); time.start(); for (size_t i = 0; i < iter; ++i) pfspmv(F, matrix, x, 1, y); time.stop(); sparse_delete(matrix); return make_pair(time.usertime(), matrix.nElements); } template void print_res(pair &p, size_t iter, T as, int blocksize = 1) { std::cout << "Time: " << p.first / double(iter) << " Gflops: " << (2 * blocksize * p.second) / 1000000000. 
/ p.first * double(iter); FFLAS::writeCommandString(std::cout, as) << std::endl; } int main(int argc, char **argv) { using Field = FFPACK::Modular; using Element = typename Field::Element; size_t iter = 10; int q = 1009; std::string matrixFile = ""; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT, &q }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT, &iter }, { 'f', "-f FILE", "Set matrix file.", TYPE_STR, &matrixFile }, END_OF_ARGUMENTS }; matrixFile = "matrix/cis.mk8-8.sms"; // matrixFile = "matrix/GL7d17.sms"; // matrixFile = "data/mat11.sms"; // cout << matrixFile << endl; FFLAS::parseArguments(argc, argv, as); Field F(q); index_t *row = nullptr, *col = nullptr; typename Field::Element_ptr dat; index_t rowdim, coldim; uint64_t nnz; if (matrixFile.find(".sms") != std::string::npos) { readSmsFormat(matrixFile, F, row, col, dat, rowdim, coldim, nnz); } else if (matrixFile.find(".spr") != std::string::npos) { readSprFormat(matrixFile, F, row, col, dat, rowdim, coldim, nnz); } if (s) { auto stats = sparse_details::getStat(F, row, col, dat, rowdim, coldim, nnz); std::cout << "Sparse Matrix statistics : " << std::endl; stats.print(); std::cout << std::endl; } auto x = FFLAS::fflas_new(F, coldim, 1, Alignment::CACHE_LINE); auto y = FFLAS::fflas_new(F, rowdim, 1, Alignment::CACHE_LINE); for (size_t i = 0; i < coldim; ++i) { x[i] = 1; } for (size_t i = 0; i < rowdim; ++i) { y[i] = 0; } auto csr = test_pfspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "CSR : "; print_res(csr, iter, as); auto csrzo = test_pfspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "CSR_ZO : "; print_res(csrzo, iter, as); auto ell = test_pfspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "ELL : "; print_res(ell, iter, as); auto ellzo = test_pfspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "ELL_ZO : "; print_res(ellzo, iter, as); auto ellsimd = test_pfspmv>(iter, F, 
row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "ELL_simd : "; print_res(ellsimd, iter, as); auto ellsimdzo = test_pfspmv>(iter, F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "ELL_simd_ZO : "; print_res(ellsimdzo, iter, as); // auto csrhyb = test_fspmv>(iter, F, row, col, dat, rowdim, coldim, // nnz, x, y, 1); // cout << "CSR_HYB : "; // print_res(csrhyb, iter, as); // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 // std::cout << "Time: " << coo.first / double(iter) // << " Gflops: " << (2*coo.second)/1000000000. / coo.first * double(iter); // FFLAS::writeCommandString(std::cout, as) << std::endl; // std::cout << "Time: " << csr.first / double(iter) // << " Gflops: " << (2*csr.second)/1000000000. / csr.first * double(iter); // FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/benchmark-pluq.C000066400000000000000000000200221274716147400212030ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s //#include "goto-def.h" /* Copyright (c) FFLAS-FFPACK * Written by Ziad Sultan * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ //#define __FFLASFFPACK_USE_OPENMP //#define __FFLASFFPACK_USE_TBB //#define __FFLASFFPACK_USE_DATAFLOW //#define __FFLASFFPACK_FORCE_SEQ //#define WINOPAR_KERNEL //#define CLASSIC_SEQ // #define PROFILE_PLUQ // #define MONOTONIC_CYCLES // #define MONOTONIC_MOREPIVOTS // #define MONOTONIC_FEWPIVOTS #ifdef MONOTONIC_CYCLES #define MONOTONIC_APPLYP #endif #ifdef MONOTONIC_MOREPIVOTS #define MONOTONIC_APPLYP #endif #ifdef MONOTONIC_FEWPIVOTS #define MONOTONIC_APPLYP #endif #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include "fflas-ffpack/config-blas.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/fflas_randommatrix.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/args-parser.h" #include "fflas-ffpack/ffpack/ffpack.h" #ifdef __FFLASFFPACK_USE_KAAPI #include "libkomp.h" #endif using namespace std; typedef Givaro::ModularBalanced Field; //typedef Givaro::ModularBalanced Field; //typedef Givaro::ZRing Field; //typedef Givaro::UnparametricZRing Field; void verification_PLUQ(const Field & F, typename Field::Element * B, typename Field::Element * A, size_t * P, size_t * Q, size_t m, size_t n, size_t R) { FFLAS::ParSeqHelper::Parallel H; Field::Element_ptr X = FFLAS::fflas_new (F, m,n); Field::Element_ptr L, U; L = FFLAS::fflas_new(F, m,R); U = FFLAS::fflas_new(F, R,n); PARFOR1D (i, m*R,H, F.init(L[i], 0.0); ); PARFOR1D (i,n*R,H, F.init(U[i], 0.0); ); PARFOR1D (i,m*n,H, F.init(X[i], 0.0); ); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); PARFOR1D (i,R,H, for (size_t j=0; j pWH (MAX_THREADS); TASK(MODE(CONSTREFERENCE(F,U,L,X)), FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m,n,R, F.one, L,R, U,n, 
F.zero, X,n, pWH);); ); } bool fail = false; for(size_t i=0; i> 1; size_t N2 = n >> 1; typename Field::Element * C2 = C + N2; typename Field::Element * C3 = C + M2*ldc; typename Field::Element * C4 = C3 + N2; SYNCH_GROUP( TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C,M2,N2, ldc);); TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C2,M2,n-N2, ldc);); TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C3,m-M2,N2, ldc);); TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C4,m-M2,n-N2, ldc);); ); } } int main(int argc, char** argv) { size_t iter = 3 ; int q = 131071 ; int m = 2000 ; int n = 2000 ; int r = 2000 ; int v = 0; int t=MAX_THREADS; int NBK = -1; bool par=false; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INT , &q }, { 'm', "-m M", "Set the row dimension of A.", TYPE_INT , &m }, { 'n', "-n N", "Set the col dimension of A.", TYPE_INT , &n }, { 'r', "-r R", "Set the rank of matrix A.", TYPE_INT , &r }, { 'i', "-i I", "Set number of repetitions.", TYPE_INT , &iter }, { 'v', "-v V", "Set 1 if need verification of result else 0.", TYPE_INT , &v }, { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t }, { 'b', "-b B", "number of numa blocks per dimension for the numa placement", TYPE_INT , &NBK }, { 'p', "-p P", "whether to run or not the parallel PLUQ", TYPE_BOOL , &par }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); Field F(q); if (r > std::min(m,n)){ std::cerr<<"Warning: rank can not be greater than min (m,n). 
It has been forced to min (m,n)"<(maxP); size_t *Q = FFLAS::fflas_new(maxQ); FFLAS::ParSeqHelper::Parallel H; Acop = FFLAS::fflas_new(F,m,n); PARFOR1D(i,(size_t)m,H, FFLAS::fassign(F, n, A + i*n, 1, Acop + i*n, 1); // for (size_t j=0; j<(size_t)n; ++j) // Acop[i*n+j]= A[i*n+j]; ); size_t BC; for (size_t i=0;i<=iter;++i){ PARFOR1D(j,maxP,H, P[j]=0; ); PARFOR1D(j,maxQ,H, Q[j]=0; ); PARFOR1D(k,(size_t)m,H, FFLAS::fassign(F, n, Acop + k*n, 1, A + k*n, 1); // for (size_t j=0; j<(size_t)n; ++j) // F.assign( A[k*n+j] , Acop[k*n+j]) ; ); chrono.clear(); if (i) chrono.start(); if (par){ PAR_BLOCK{ R = FFPACK::pPLUQ(F, diag, m, n, A, n, P, Q, t); BC = n/NUM_THREADS; } } else{ R = FFPACK::PLUQ(F, diag, m, n, A, n, P, Q); } if (i) {chrono.stop(); time[i-1]=chrono.realtime();} } std::sort(time, time+iter); double meantime = time[iter/2]; delete[] time; // ----------- // Standard output for benchmark - Alexis Breust 2014/11/14 #define CUBE(x) ((x)*(x)*(x)) double gflop = 2.0/3.0*CUBE(double(r)/1000.0) +2*m/1000.0*n/1000.0*double(r)/1000.0 - double(r)/1000.0*double(r)/1000.0*(m+n)/1000; std::cout << "Time: " << meantime << " Gflops: " << gflop / meantime << " BC: "<s,f0,{0,g0,(0,\:0,t0,+0,=s //#include "goto-def.h" /* Copyright (c) 2012 FFLAS-FFPACK * Written by J.G. Dumas * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/args-parser.h" #define CUBE(x) ((x)*(x)*(x)) template void launch_wino(const Field &F, const size_t &n, const size_t &NB, const size_t &wino, const bool &asmax, const size_t &seed, const bool compare) { typedef typename Field::Element Element ; typename Field::RandIter G(F); if (compare) F.write(std::cout << "Field ") << std::endl; double basetime(0.0), time(0.0); Element *A, *C; A = FFLAS::fflas_new(n*n); C = FFLAS::fflas_new(n*n); for (size_t i=0; i WH (F,(int)w); time = 0. ; chrono.clear(); for(size_t i=0; i F1(q); Givaro::Modular F2(q); Givaro::Modular F3(q); Givaro::ModularBalanced F4(q); Givaro::ModularBalanced F5(q); Givaro::ModularBalanced F6(q); // ZZ F7; // ZZ F8; // ZZ F9; launch_wino(F1,n,iter,w,levelasmax,seed,true); launch_wino(F2,n,iter,w,levelasmax,seed,true); launch_wino(F3,n,iter,w,levelasmax,seed,true); launch_wino(F4,n,iter,w,levelasmax,seed,true); launch_wino(F5,n,iter,w,levelasmax,seed,true); launch_wino(F6,n,iter,w,levelasmax,seed,true); // launch_wino(F7,n,iter,winomax,seed); // launch_wino(F8,n,iter,winomax,seed); // launch_wino(F9,n,iter,winomax,seed); } else { if (balanced) { if (type == "double") launch_wino(Givaro::ModularBalanced(q),n,iter,w,levelasmax,seed,false); else if (type == "float") launch_wino(Givaro::ModularBalanced(q),n,iter,w,levelasmax,seed,false); else if (type == "int") launch_wino(Givaro::ModularBalanced(q),n,iter,w,levelasmax,seed,false); } else { if (type == "double") launch_wino(Givaro::Modular(q),n,iter,w,levelasmax,seed,false); else if (type == "float") 
launch_wino(Givaro::Modular(q),n,iter,w,levelasmax,seed,false); else if (type == "int") launch_wino(Givaro::Modular(q),n,iter,w,levelasmax,seed,false); } } if (compare || levelasmax) std::cout << "Lauch with:"; FFLAS::writeCommandString(std::cout, as) << std::endl; return 0; } fflas-ffpack-2.2.2/benchmarks/files/000077500000000000000000000000001274716147400172745ustar00rootroot00000000000000fflas-ffpack-2.2.2/benchmarks/files/mat1916-1916x1916-195985.smf.gz000066400000000000000000020420711274716147400234230ustar00rootroot00000000000000z[Umat1916-1916x1916-195985.smfL9m]׊_G!S69 nE]&Y'uV?cOdmܴw3`c7_#Z~ߨ0S?ob_p3~1*3vL{yN2_dҜe YS W̨ ;оIe)L,X Ysh&)\} >` >J`N14',0Y  ֘_B9dmj-kӚ!@ik-¦+X`+䕲@Vyo5,4@g'9P[v+YMYUR٢FR hI,+yN|t{ǵ]?g[I~_e3o=@gm_uw-w;ç_ԧЧlU#0j293~[Ul~eh'Am4>>0Z?s`VC_XZS8 USj%= b[ſT_%?Y&s}r\o{o 9}~! 1oV n_Tj(QhS?%8 -'3 Aþx/.ؕ)]Ϲ[ r(1);rtB'llYK X,7su*S&}]Lv-W GuY)uJ~_)uJ~_)u+S֔:%NtD O@H2))sJʜ2))sJʜ._綜{%{аO_~PPVI z4^5-_1*]Fuĉ;LEx{^qƌvQӃ;/ 9vՌN{g X+hWMY+ 6-TY%xXSYL\")eLJt!f{ z/My Hrn_S@۷{i6;Ut{~ pP %[ < jӤ+Ҟ`5 &egDK01PrȒ lUFrjo, b jN0aP)nɂSvnpOT}a܄E!mw:A$'Ln]i`Ow܆əՑJ󮇿5~wn櫌ݯwʰ7gJ5-j""O,ߍ ?~͐v}G n 5/X5/`qw?wWVQC 㡊xxyN1#΂L;j s">qz HЧ4&sKʀb`vh֯T `$ ͪN̤[.0c4R0>7J**@yp=4 Lm|)̔ɡj4Y[&Է`(F()܌}q:? 
Σ*v l?&~ ;"d٭ 8Z .0[aOߋyXhv3*؟[4h \-P#]L̩OK&K,&'wG`- &0#_e4A _ WClֈxI,)%ųxϒYxVk')%ųxϲijYwR(C>˒eq~ϒ{_AÇK%zg joL%m jwalI-)%ز*7/ܲr>*otۃ&O~wҿK{ % \r.K5 iRK49&gY_԰xk@W45ft&?u}t՘ -L Lv\5e^B]hEaBK?[&!;l559aver7@߀)+-5U#v?~q5(I8WF5M8_mI+l?~ZQʡXvJ,+% ]?6~W/k"軮\}b_p]p]+nCb=ێjWeLjh*q|Pa408&GH;0r>eLpPKOAxG+JGޑwtv ST01 h=h ;3d1m(0e#H:$2u *?DQ 6}t *YJR@@: \4L;䢣#qH:YX<Z#Hh9Z[8_Rq%t%-_wR_Р0ko]LwCХ\r9henbѧf JQim3W 9дٲ]TT.Umm:7z(J8s qnna*cٗLo)nK$/hEiiMPkR]KeNϊQZn']i{1Mʡp=>Z3H.YtNcJ:N>k9}hΤv4Jm;X>UlKTJ 0])\\ hi]k92$2]?!sjwډo A~] 2)] ;Rph'0oT?9ђY*!,f*OW}5 ӭ ^@ Ы4A5u*X aՊ$`C@;(8GWQIf4" = ʰCڰӎzM`RanN~YRthJqER[XWŋkEK_ݽ/Ȅ$\hU>[Yh4)Ւ-'Ia"݌)!P#vvJ i=hVV6=30P?~/LS?7,(K,ɒ,|]ZuT߂C@Q,R ,eR~/eR.eR^.NC.2eeYnYVeedi>Q]rAdYYBe%L2 h#%,SW<Ȓ ןeydY Y=%e9cYX3R@v3e9cYX3ebYX.ebYXֻ^.%ՕK.e]ebYX)%eeIbYX$%eIbi-M35%eI+fЗ?`Kj[-q%nĭZVKjZ< i3ӗcӶǙ*l2 }hšauCP;uCP;uCEhQ9Qg)_QWTaJb@C_u#hZ0hpE9*j($B9.(VBT?E;-tF{MMMۃM Ȅ8C`b dr҄O҄҄V6ƣi ,j~qlßq\"D{XbWMZC * Q6A![.pd&tf&At"Ւ@Gtp zvʽMtQm#ʨ익r *J *)-!WOqpVUG Ǡ ,}%2)+4̊ JKE;rd{*ťM6SeRQLSTg NUa$$3mS0p/y/y/j%%.ym42E hBiCjӾJJKDH{*-u/5Ni[y_t>×>͕ݕf_d;X7YpR-I-;---ӁLJJJ ؉TA 7-g̤Ii\>>:xqi;)ѥѥwJ!T%tp.}r. Sl@j]ٽ&]˿99jZ ;AOY~O{ f^ЧABK\-Bq]%+Ȑ;A E5py60tŹ+!E K'\)\9[?EfBHq>AL쨔=ZfCrՓF&5Z0r%#LsLd},YKǒd}Tq;_ԎmQO*א6"ߤC(w"?um$ڠɔ.$U4"ᨀpTq:vivi 'QQkNGd1ZTmtD8 ^>Qܐ!M Z Iu9po_mAZ6Sx)hÁC/NunӈTi|ҹMCe+rN7muA.NA*K6e @~.-ۊr:"g/mcEk@*,G)K8d{I9 e3~:7 |dpkquE"tY8.Wl;_?ڄnkH]rtu1Go:8Ζ_4dΫuQMs+%]Y!X9? kで(w]u[GuR&'S_7٬|L/KB)W_$4X2Q :$__„ oOҖۭRW&%G^EHS}|i;!_4U #OCDAJ)dz2mSd51u?^1mϽNgwNKj]Kbj]IW+O䓮՞Q|%t'?زy1tz[O|fLwھ)ihmSJ*[EwnEv,rGѲ[:ی "<0\wڀfh탚a U-5֮] -ۏ߶ ,ݢH(Rt uZBIJ#]J#]J#H&#te SLE# ֲٿaCu/ձjii2;Lo&B[Ke_k=n3X6ôa,5.Lk52J9 !}]foW6&x~e2hgL x8  iNI8ptS$}>M.' 
d>K 2Q (>9he*^8_ :y}aҼ.hfyiXJGЧ`LÅӁE= _'`hvSO!HuNT qfgra /[[^4A)nb"Vt(ߛҧgItd?2E3o2y&kbL-݇|!h%kuLr{.9~\5U-j-r WV FuQ[[[[{U6Bm`0H &A^4>;zֵm'.=HI*]J[/g[R.!4ei+ti+ti+ti+ti+TFGMӠ4(< OӠAx 85[@)P \Y̜t7 +yHoʰP@!*"ns(<2\ņc88 E7MxzՃV_zE_ J @Kt04""W7.JJtH݈"e!݈+$]jDTpВ(ZВ(*-"?n^ySKhu .Ȩ%QJRoRŅ;@- Ц=hI5UӚ\4Zw5>X*#%c~v^Mu#?B[c-J=*y6S QemT2;8+  ;VkrmעJ`EB3L~l/؄ 6L_y6,Hץ%FQ !_r;j,قЧ#pǁaB_4]Aq` aL!e's-Et[@qIA jS&)y?`9 )<\M 8־P&⭐Ji0-/:C8-2kJ].&U3qH{23 1#IU_5u:l6xi%#h,|tB5NZī?8 5ƞE\m&M~P,ц-r[=va Nw"]h^5q_wmXoU5o8'S&2(~B "KP.]!(# (T<4/tF9VFh42w􎦚xϐʀJ(9–kTL}VoFs r+"h^x".5 3*- ܢQ%Q4FhZxTdev.AhjQq/H .MuEK"ʩYW^Z*.PU_z>/@ӢH9_h~Rr_]b8 UZ $7HyPN} q%huS2Ε0مWVD*K\ 6&ǥAUO{0ߟo@j~5{}ņ د=.lA*kFn=3:/&P$ |rC @} ju 0Qϒfo};gXGn?a̦>ml^r\ 6@Cׄ}&f7M@x+}NpzbK׺zRVK׺zUX {trrqoF]zGQfX&sI)PfXfw uWXEխV9N `অagP#FQg R)MjqdlG0F$ zcHu"C ],2$NcP–fܥ>د'҅OB·R /z~a7eaYhXǟ^pPMU@o7 )K$'AOCI*%" ;oE6BFvM$ahX,*&Wd*Q)P>-ƊRkH4l6M,X 8SQTWR͖i<\'槲Qp>_?(#Srb`1}qLg8 L/)WX>|r#V\qH Nkc74 JZ6rKsX?O!6S-Z?h5t d8YeC=q QpvQdrv7継.\*;rWtHWY"M)>^qHYt%M6W7[쀂ʗߊ1N9y&Pg j7| S!ϸij@v ;:Zau̦'LYr3ɣ bMƙSSˮSˮC@} 9@祁u ŅCk4.`x]sGJb7p$X,Kb.rR.R%jbYƒ$=,Izh,^\0i0GQ鄱4/jz"YJACGe莩 l h{9?? n*W1uEUyd,7Uڍ[=Ӯu~:$CuLM&"v g0%1UۅS3M*/VRW\"e?o1իlWh K_R nS=XylT ( ?b@~BBRU)X%咐\KBrIH. %!$$䒐\r'  &aՒϐKBrIH. %Qr P4od=8GBHH #aq$,ܷ+nɍ,T$U&YE1K "U[ѱ~*T J UH@R%*T J U!C 2H@z!CT J ]8}K[i81BֻvIZƅ׻ҟ_2 規ã7X(C^ߟ~+7z .H–E&w,kB! \yX`/~B?Qk uB-ˆo5BZVXx~An'^/;(;teee\ҝZZ {I9}? 
kkksIc.)q%%12 Oy?j+wzvzvzvzvzvrMa_=DI)礊2[~+•gLHc{ ̟|gN*ّ/۾l#(Kn;LAʫS :*"K;Eu*(6)V`h,{aRѢYwD)~۞r*O"Pyh)Toנ=Rf 7]P^̱Ytd|ӴM$Zؖ lۢ=׽rܥ=!$m!HQm*A߶x TBN#(5nKɶX4l20ٖ)k5_,mr[ܖ.mr[ډZxpwrπ+ޯ/}~t/v>LA.0 Ȅ+IP^LM_!EK=er[[ 7Ҿ`P9).,gJ0ІBA %а R΢pttK i= <3 m(9&]%inZ&5ޤM9ǴϫU<RiQ[c:S&S\J-al&NB#{1=tV% RL^?(]9B] E3)Gۤ|yPIo_ݿ 2Җa@-9̶ @X9Kx!輗]}Kпt^@ vO)%>sǃR PgB'49i IW86uSǦ 6W+M31h^y4> FoRVBS$GJKσY'c:8<һ-ڶ*8h^<`G>oUEMNbpJn)&)!P-RzlQBp:O>aăi"bi鬢%TTNQ*q>UX"}`HK.p LYgE"lf6 EyKS e@ p>.~‚nnYub~ϛ{gIեKGŀbҶn5bSl|~EcT^8_cKA|8߂`q,Xx-ұcT>^޲>^޲>^޲paT̍U>t}e!Ci yׄp4>a R]oqY8V>^:W>YaYw每GϯʳKgǽxL 9f//_/_>E|v~}ҫ]>;>헂fe'H@G9 R<ckQLr+ 5n(̖bš0JwJH<2.#pa (e^&ֆ}Xk&;WƕLerq>nR pʥ~5Kީ.hhY~LbC?2Oϳ$Ԥ?PA|; gڻZnG,N #ƈ1k#ƈ1k輻Bȟ)@k%R  V B@WȲrNKFWf_XP 40BaIaIau.Nթ:^t蜛Eid(ܞ,R\Xd"]^W"hreoVjWT h= t}Puc4`p5Tqa$MJ^8hMI5K/]aAH>/glUѡ*:TEA_A /4FlwfVX{0]{2;:t>w|ą|ZXҠfp҅K/]t{{sԟHrzZ]THpk RWҪʃ ݮv7D{ 9o-^ p…I8T0q!?-C 56M’+Ҕ$72R0]Cᜥ5%MzT垫EegR`4Mr۔dr7xBB(!5nPװPL)q?^p KE*)^+ Rzqȩh?۽DGm"D+.m(ZzKnRLc1j}%)ٗ/r{ds ,,,b)2tHp-µ&(s5' ꊨ v󃽦l~::˒:˅HXkOSdgn[pG(K)%9[f)o@s gvDj_ HGY mЦs\T(u.He5m~݂ۅf0h* k&]i{.E˶/1؎s^88MX):N[,A cQUbΧhT'{.r]., RoWZ7Zsˢ u^am渤}9Qy@RK0qPqhLyGA+nM}ُTF צ"ٗpS5;[fsbM)ri]K eEWǵ+7îKynfՆ s'Wz A=iJugV0u$ Kޞz 6@25dDQ5[#˦7/Q'XO(-pK2ܒ 7Z.4C1+7 X:o;Z -Hn [gE4Tn0-rK [JP=nC)i*Vؘ:Ɏ]Kt R)!EGXڿ+o7@!{uO6UsmNӟmtݔW$^߸ h(9H1y[Lޖ倭;y }tnQ"ؐRDi Uq`kh)θ)mKdq %"TwTG:Ve-z}j&\εrζk>4WAa-Z c;_h0\w`]ڏ?}9ä6OLآ.鰤c(x 1)KKM@C,SPW.B5;swȡqt, F:8?>rRp^=niFpt]~y ZJܛdbR*~>$J N9-Ҩa8A;#&(LE|.قЧRyI4@.y)о*Pq_x0Э}C0,\ YV q_)Ly)d9Xpla\[+ژ mL6&Czuk֬B/#vA@Kt6&dxY+1ޘ oL^R+v\)#@s-fmA9X^>}lp?1^!+4Ib%6;"Gm8jۊAjp@Xh5j( *3q]71ɿL;'>d)cߴ=q< u4)r.k絷elE*UGJ4TH՚;^jIѢrD@$AlG}qL3HE-l|%-f2iObK>WWC:In ] 4{.+:v< O`k=1N,fw{>,a6q;iu-j8xyw%Vobњ`vJiBmj65N͠-8sX0d2 '-(92u "fd80+5}>M!BN/nKm)كj"1KPm.}N*yV c8\N4>ӳ,1=KLX|0 ?Uzf~I0'Zm%͙㥥<|1Z%lhn~+Ru:o^NZ,vÕmz6B^Mo-eӫzkzm57iz4>^MW@$MI{p Rr-4 &[fxkѶ/Hx+HqI9guY]z4f^3Mҳiz4=Kz'?{Þ'?G?{tCXK{tG=Q~z5w5w5wԵ:u`@ j1MvA)kрb;>>;vӎ{>z` خirK~x=OA 'q/zKw} 7fkrp=- @ǽqo{Xa~,nq{ǽqo{ǽqo{uuǽqo{ǽqo{x$#NV@Jp鸻=no mS2!gw{EuoVoVwV 
kǝqg|wǝqg|wǝqg|wǝQgOnwz{zzazݛ/uU0:QPqF#hg*PqUn6B?д+n&mm[o]X' L`SȟZ}. [OU(TUıx4Mrh2qkO3dOY4؟f `vagv6)L5-2ٛt *txq${`d[|ҘM-S9Imk"y:\ @E.BtB/Z+ ; !} /mJR_faZl\nXE$E[M H)¦ H  5u^DR2pꪔZ*b89࣬ gа6)!Lvbm_\0@aRә(p m.L+]c[]U济~-}5 qHCO,H}[hZcQ~0e+/,z L8Q csB2q@B[ ٙ` dGY*rXGs5 S&X${^*9E3)(ֆ!Jo ΐ/F QgIʜU]U]U]U]Uwe,N& )ΡQưpqA򤇸Xp1Kizld**Ϋ7 Y8nwYFC<Ƀ`,@hW\iՔ+dΨ2§-xˍHA4anPnPnPnPnPnPnPnTOWĥNW]ww1_M|cz@`#VKׇ3ܺ[qnŭV,] pBg4pKk\ն…! OY] <` sUD>g0BYn*q^B3@}f8y#lb(AiR }1M,-\LYuβ SEnR 8*e KƯ5>=\oyia7uH)4)K̗U.KQ'\^YJ\M[`^+PDmf4N!6ShNSjhqRQ_Tg@JGkW anthtQRa]-ZJ15þ,.t\-]ct5;i}֨&6 -D!Lv 5o0/`yj>]pA0>,]0 !HNZ0򸜥A^0[%["[cqܺqbǰ1 I- ;oQ~*ހڦZ좞L*5UOu<\\ #.:)RlK*$_*2g%fbr e Q-Kp~9?+B[n9ı(mf_}qq<%\xRK¹JrKP.$ʃm,L0 ^t!}./z#鳓R陂&&fƥ<>\ϫOO933T,Y0ӓ&?찒 @΃F$:&a|!f)񔸒/T?SSj*RW; ^In\@={:++~-RPkYavSRZ 6+&GV&{fJEkPApL}~21E8\:T\ bi"ݢB.P)tC[ud[qX;{Ԗ@C ٮI[UG?J/(R=Q秲 vFA)EۤK"u2*mVq!K5t7voC;Q5 /b\M Wu-Ø.ПQC*x3/pI.L}I jL&\EY1Z+[V{02ٍ̏+[+,Ta98Yڄ6dv?ͺwӸѢ.>Re+!CVh׿ZOSgp) )K#ddӼ;þ0>] [&&&Tʙ \W\2?19@vrxtF dnVYR| b9a0Ҹ ` lYZn} bSv|y5, `o莼yTn^30XlYއv%)FdD_ǣiB'|q29u e:5'!eZHR.>rqe"ЖUd~ J9:_ TqE1T\H!OUQ'@]ZP3z3`uhN2 ERtYSbO056:CLn+(o TC&q2舶nMSթcu4(XMi6uS3H1ʅCEGfMm iGݤp9Ee) 2SG<( N˂E yq^yi3Fڥ04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. pG,-g--g-7q MxG)`EQ¡`E0xG]nuZuuFQ\ ԍ={,-ucH*YƄucTǸ]Ro jΈ6uӐd@H3ge6L^]њ.P#|狚ͺ=:l VIÐ~b#2H\k.#8|,`]+qWMceޚ 3>2U*@&I\ㆶ%Ꮂ ^27KTsR}NVCH&cϩ0ǒ|¤셲:Kh`;!vLtǔ^IOwL)w~O%ԇ nzF}r1̦o낖 A6ا oT@B$X$/w5wh,\2k׵ک9eRgw9N/5V/T\I'Kݐ@Lzs@-lמڳQ! `x#Eљ&%7aK\0 s"V){e5`S|&Cbtd!|PBpH5OSljKr@NwT|wCHy$ ؾl,H{<<^ W2t]_z$ƟoߵsB^BXrվ@ MjMB MRZ ;e} 4T1Z)ӫ CvF f|pEЅ!Jyx>| (M)j,@Tgd nUR(\Qgf_3f iRe:>|:@)knOQC_'˒R}6hHH9h+pZcۗZLmlђ} :Rs Aiv_{v&Ņil;Wl\:F M}JSWԃ4 s݂OlMmQ¦s6ݦPesaS#lj{M-}5ܖ/* mJ Uh.SZhtFQcnH.wRyEQm\$u0FHsiRZ)El#m4=%r ;EfGn)bhn-7\~%lQz.'`xSs[=Yi}66!{Hh#OJ>(DG^@!iU#{35Ĭvyw.gL[nKQ8TXsvy*'>ܵJ5va{e8 +)Jtkg?/-3ur{7E_4-Sj&&ݦ U!?)Z.-$nJ))䯮%˵(f&U8Zt྅"崱yh)4찀S8%"Os%5)4)qj]6k0O8Kk9n ``X`B2xƩf=[.߹K.RK8| ҁ\R:E6ۃ `~KG/vnt1\G.H™EOeu Y ɠq%gM}}ca-A  t5>#m>A08~! 
goN~&<_ ͂œR&aG;GV7-ӆ֔G W*x"՘ X Vqwԅ)@Ge5aUc6u4h^ȫfD %:4kdɰ:RSz]LWn<с~z mVXQSPV}Mg[Wi#~]ڸ- ;Ijݏ]^5*pjO6i04gjz(QCEZ;;[5EKfs,̦#’LX?ǻ@."SQQ=:Q|ʛMEQ4?-8Xpg>2)x]|E΍:x.lo޶uW6f9 ^%CQWf :miʊ: ._\ x"4Ad{1MaR<{ EG{&uAt*_̫_k+C?r>#i6AVǎ`HN)@Pg^wDtV6/2SZmMtԂ$,|}?1H2:U | Nz5Svӄ$ɑC;xg⿃@MXj{m)+88߅]ьFFodrq)*z^` &'BhjfՂ KELôN]KuZQҾKUy7Ƶi]Gô_&g>vo7@ceG7wAE:\q}籎GQ?SgQU:µ݈=Yxwp`{g===========e؂slfmfmfmfmfmfmfmfmflfWC v8;GOy=ui>4jxEf^G{ݩttuEԿmr!H'~1\x}B;SCLvSwӳ4cw_h eh(CC|u24 <ľpT0dB;3N:ڞID*`)_]աG> }=]f9P%u\p!,Vh2SUjx}!P+Es0m!ȩrh`=fuhBuiqKrix2^ Oòj֙U)2!%_pLQ(wy;0Q.EK.ɿ@pއJLE΃t8a OXCƠmR*حcGr{ zѲY Ud%~Qdžo/Đ񡓸t] h2 NZSԄ45!MMHSԄ4y2$?yhjc瘩s@E9:M\=cz/ ӭ0nջVg6](wʼnKܘn{ՠqah.Iaa{ӾL[pzfYd.u&7|E 75)Iap:h(p1}&f&S_< `উspRB:N|h!h!‰S54Q>@Hp> },՛RGG$= u(9jG X  HyqF~ttȏq#?^H:|;wCO.yZH[)LS`7:zchN5sRkI".Ќ(tx1xVnk}M\9 `(XEѶź͡(j e1G$|ݮ!!{]MYiE&ɼ}H{'"e]M5LJviIIjjjjj("&?^PTEG--K<W&KйPJ{)+A;ձܺz; UyA2=ŚyQ~|-QҮeP$ "Vpfy 0Qd0)()grUY4M=n)iֲ@탓Tj`!Cfϴ5c?8>gsAW!Mf)d !8}C^|vUj?VX]qP0l`nQ؂%v{WNJH|-9A} a( ?|pkժJi+NP:ͳS=_ s0tz1sC e7~xƳ/1ôuN?TrўHTkqNt_kEls6qe,!.y<K\K&n">gDž,;`]_ўb 3*x7w-n(7nKI6egDb?dN.]xxlabx)6,6C=|za<|k>K$\`Liurtnr( Jw\:̟Wa[,^h|tjS>^Nj|+:)EzqOnR8\|`8,7^XFt%Zpϳ>e9^*o|-k-N6GsQ Sje,=s7_gS<ݾE'*yʅ1iʊrJ(2-4VFrAL^ZK-΂SVVH.6)K( v& ̂R9{7iloOba\Bz"͢gQ2VLtAIۦGQ, 'a[CwrV88 ƨn%{0Kܝ"nA:0v_sH+Ńrm Y%o;_Føkpq".?-JQ?3]4@-Aou%Y=ݖg›BEά}]LtñGals*7)9;*l(^iLdt2{C!p$][fth[e!w{Ըn4+rwzzQ@jc6B At4G]nLc?SyAtWH8 | p:p?1YIޅPUmY.=B'novYq vXBN~Fпu(/˷.|t88B9C# ? ?m"`̇_v 6x)y^UVyR 5q^9?@bT/dxQ[1ԄiqsYef]VHA.r?6"u0-j&ic'єm֛.acBhnwax J&9Vj9mf{7bTRgRe0,jzL<]֎p}9cKS_u%)?0n9< S6mH$fMH- G%cQz9G\Ƣ5t,CWArf&`M4Mυ}v;q}7m/tŲPv*`O;=|jӲlz58^RzJ|>.K ρXn'[ '[ɭTĹqQ9{Oa}> !WM7By]b"qHs5hBHv۟g}]{>Rzn* n5| xg9{uejt uzn?|W|Y"E㫍;QLg EY60cBN |v. pceg8>7>H#=/> bK|qx2]IpSޓF$0Bᕯ˞H -cwH""f,{s]>ڢ&AQIЬX߬D߬D߬DZ 舆|imhUUUU+E/I/l$Y>Ir·汳e^8Cm,8-ډ,l0BC3np;8w#nxI6 pهp^WA/RK9)\vF=Yp:p65:fe,u㵨:\o7.3ѰoGYKIwAФ.̸m ] ; KT ]Li:,s jhoXA:Lic+K#Zkh^-ajMCnknMCnk|aeGhmo+'.B%}visp.0H팪! yy3gpmټ %)c;3MycE**`]aV%DuJW,S LtH6{ea;CQɿxq+v ˡQp;u 1_+|9qQxq%RLDp)w١p] yS%jx_̨} [E6Ӭ8ܬ8|(gSybŻJ? 
Bv6$ZP aJ4emh2/-p syEԸݞJa{*`5 O0 3Mhxц7/e{*mh<*)|8ԍ?<m߀hx C gzɃ <[`yPA#eg񲏛@YtCAp{n[:lt$S tp"ѵ!j=`FDzN֕Vq2&V⨅Zm >Su ptCaXڏ?CpF Ja?xs[;]ʟK=I/e4YB^6egO)Bh'{UB\ ğ.)M,6>#ףO,uj@v{!7yTkp:X'.1(\iՀ< }F9f1:S}KPDn ѿ_٫tgxqb^Ciþxdq( Qn ݔq)Y5·/EX3*T q`p;]PΗQq*YBYBxf f ؤHfJOQ%J^J:tJzK)"K})EܯN|/p\"8\:/S 7s#Ұ&L!Sd3~QQcS]K0/j.4f.Mۛfu zO&P˻}|+YS4.|fGϳh݈WC]V D?C'zCp)S5^>-:׵W>y8zQ8O7 _Bz[ 57@SB'=SB'SBȧO ! t5Χv|:#q 8FJLR@%d#T*zfI;g7u Tk[3+5lZC/ }4t?»җ4%1kY0xX* q<;BBBzi$@wK=zLLc0>wMSEt[S[S7iM\+rJ"5:_%iM#]oMVoHzWoM_i&ԛfBi&ԡ=B:YcOopz(ӛ̓Ho+Ζ}"NomT$Y*J>pԛszsߛ;nS{z]8iM:Gd!c _,R֩U"/nSD EUjvLt p\K!t_ =4_aNO$g ]}xegƩ& IV!5`wsj8rN0o|v]H}x6]x$0P.CA;eLæK~ ?v~'4nz.k3#չE.7#.MeoK쮋o)w%> ^JL&q^>yt$hVz=7 Nǭ.A(JGcLd{xhɿcGSɌ*NK;vil&=Ў:hŶi:z4)EpƲm =JU.#zYK|E6)%Xxeh&{zd{Tk%ȹt ⭾MUB.\rWkpwՠ#縄_GDP(ڦ0&"i<$\?ұq `}|L쭂SE(J:x6{뾋m.pwCQH̔"e-2hűّ[gJc )n֣am Q:Fi_Yo|Q%^!f?/a&?,Vzc_[MKto`h<:|?:M1eO\${)kxaS!2X)ib=WsMѯp9}.Oz4becpl .]1u%MpDiH u1v#nf \Q+Rں4J?scƧ*cy - BZ(>y᫒Z(C elh4-۶pw™%.1dr]>Sv>Op>q *Ln[pҺ\^v"кj+'u0t1sa,@K}*_'E``jvgLǬ˛EG?by˖.ڥG,W\4 x)LkE6#J vO `4AճEK65Qh\Qr5Xd֪ ?wE+fic/2KUGGJ\Lf3gO;^ʰx~,+*.ic9dMLX.3m]gO,ˇ$q)K/ɔ˥ӣS?IfX,Ucmd GǒjX,OO ]jzJnٟ/J\:]xC M_Gx+1Y K\9)/]5QS*.L.M9i+#PtW8hZ޴dieiYK@S-p]8x'RQMž9WuAb ^5I~S]]XԻ.av"^K Jj[ֻ. 2|ѽ(|y&YnK~,47*fo p6sCv8nR 9kZyZvbx(%?5mq8IfH᡻t4TLvbXw] Z2]/"K)LvK)49 ǀϖ9. 
nr]Jp-)/Gn5'MۤvX4LӔ 7n^n6i{Ϋėp`Q7 29t)/C%)JjaKg1n[S[ŔsҔ[d]vmF,3mػLa簳|X*q2p>SJ҆=Rl*cC03ѾĆH^]tw@Wql%{uqk{*&v5-H:V/uxlw~C;홺 eL3ufݍYxi4_R_=d=]:c> &ZK[ wyމz)pxv_S)"o'GR&얯q؈?mED6cޚ>ޚ>ޚ>ޚ>ޚ>ޚ>ޚ>^9^9%t|>i:k)_[9oz/\t):::::99fС:d DtctN \\ԙgχWO+;:::ƃ8To~d*O\M~y>Eu_|sY>Kn?t*>^|>G2?-Xo>y&<}Tx UN#Pak] ?8mw<V4M]RC-F纶ӀÈ99~Q_Zrz NCQX _% 6[g<`_U|hJ9qQ"p@`gy*xέ/cjx%p=|v{S*KM|z.yTG@Y|n=?R(dU5 h ٛ/F'nT#ŮMX-Ia^zHgdޭ\Ⱦ[09{zxw8^\5()W@(t+4 t}{_*i%%YMBtB cFϪ/:|UI];T,ĶйC-vE]U]+YTtUV1w o?He ~,J{^aq'F uYC_7< -^4<#ɣ9Xht$ixAbw=''?], '?>R[*L8$ÿ.ڟKsjK rֲ&d/3k 7)xzJ<=i G_rZ;=+.b qz ;8zgWf $L]vsxj:8:盚NpS83MMKeQ|Mjon(-bVR!)/Bex[E.0O2Gו,mR"g˓rd@W6 ?tNقJ[+oo|Gͧ\4F޼Owh>-&+tub_pP ޿ua^2B}2j’rX>Łg芶C5"ECeڌ'O^b39Q/-MaWΞ㏪BKa{+ QH.{ ڼⲆީcqYΘYGY/ #gj&Ш[4LӔкCCamg1T-raLe._CyJKFxf L&rn_.L.mShBKNBEd\!"Q&0 @wG|83 RGdc|ܙ7= brQ{2D tڰ Ώݝ Wtoۍ/ǹӻt^Õo '≫Fxj<4[8cLuhKox ׳p$xȀ­G ִr:z7HP6Xox娒Lg5ﭤv86R*`ZHn)Z6 ;Tb\ wFT 2/C%*ۍ*X2H_ɼ=K_ 9̔7*/MӲToӖsY5L̎UQն_ܪ6kSVm"ҵ)]u(xYu(#jNkNv.r9隓Bd.\ ֛>ߝ$?ms8Oiqm kõVyd;c<tprB6Y/o5 _$T>P,{Wgwi 2+\סp Tp) ؍q**u1>x1a/y(\tP(3>< pQ|"x[rRwTkYFp50 "_p&'g Z׵"SBՔqOG{z)Lv{..&> L:v|L/G~9aڦ0- f9Q^r(/Gy(oEys%^5eZu6KOj\'5Z.E/5Zڗۣ엖rR:EN~ݿ?}U%HWt%HWt%*R2˶=;tɾ -]hBKZޤ+QYq>헜cT^ޯ^8/6.1yiޑr{9p9pUA .h){S㊾Д,Fxi\@=R(,]iR1ߋ# 2KR!6~k)E_$1 %_"P&faӪuNAA(< RT 3Et:tL5.Lt0M*o'gm SEd{˾04=9zv+3Kߢe3%<&8h) 0mSVٳ=ٳ=ٳ=ٳ=eVleVl7* Z"x;1ގvcP͂C*YC*p 4\,_YBr!(TE:&HIR{4EMye3ɿ)*ًhE(rpR%*txQ،%؁PG :4Sm uU>1|Lc:t>1|Lc:|xez0.H[Q>gojZKRoLdi6n2^2T2FnvmۡmL#e6rmSa`J]c pJ]E;Rpչ6 cOtx|,ȶO} WboFvAC жvPPۧBmTPۧBmTۧBmTPۧBmTۧBmTۧBmTۧBm YPm YPm YPm }Pʒ'.8t4hٌ~}'ĚaBj\ AdSht Z3)Zxդg/N۰nfvaQ(i]]9ߕԝA]JnR»9 )+Sr۝v';Nnwr_r;Nc8 Yp |E`h2Npr;'w8NprKx!I Y&t}S#`q_4M&Ѯ1Nnpj^d{alvQ{q\˳a3ElnNp[ujyKaXUpX.XZQ)ڦ0)4 O˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥^)lcvylvylvm(.hM1P)lv)lv)lv)lv)lv)lv)lv)lv)lv)lv)lv)l~j 97K#\UnQ9IA{-&b-&bBkE)bY / 7it }^.c\¥* W.%Ǐ+ۏ5K,"6ǖjZBMjzM7tA :]fd+b&kE_Ls Np^ 5]BMjPӅ.t 5]BMjPӅ.t 5]BMjPӅ.t 5] 5_YX9)G YO,{&/cc+E2Sqtf Q>;&d{c5bڅӦҝl?3ʡQi6tol&ouZݴb $c\k+mEbKx=Hy2v}\&~;)E.$ F8wSQ*tƒNE-LHNQ*Q\*ѾFga\G) 9$3uQp<\:Nt*4QR܆[s#n,a<<|YM 
k<|L}=T#wc-Lyލ}ToѠtK㸄7wt ٷ.ԉiDy7}\Lsb˛XQ5ߦ{_ЦIC;le"f,ܪWȤ?>|* ^mWs/gw:zB|g=zes{&vө;Z^-o7rጇt3Uv{Up􁑊Y#4k$d췒VJ[~+ٿ UBU}mqm*yWn0BfKr8 .Bwaz\ڹԻ[v?ǭ,9uoGvK:ߎ##\DOIV,ޡۙX-ALWO=jI}yzwCDMZn,LW=`=`F=K/>` Yٿtt]lG{ i2~he;ZَV"h/QfM4hjf޷hj;ڎ?M{.MmGSv4Mm,ԎTGt4Mmz!3RBt)mlvLu:j܎q;j܎lVz0q;j܎qSZ)B(t{0X`(t;   ݎB(t; ݎB  ݎB`UI=jKJ]I㤎c`10e`'QɸM nS 9=oG>'R5yd0'yaEv=o dXQS^R]4Oykl榧.ow;ݾaԻ}1¨wnFQvԻ}1wSF"~FEoG۟zw$RM5(zbTQ^:kTQvTEoG?EHDNyQHuIo*[rGYnV hמ[yox:0 Lp&8Lhȅ޴X^,*%wX`1:.d=?hz-f`*=08#,/2f=1Og|q qz8\7)v?o{K)b:Ȕq׹ie<_K_4xﻧ!佔jSӣYV-y {=Zn`Xp-.| ߮ c˭؃ V~gXy(ቑ~m]RQUUQIMم Vj i4{UQ7&iEn"(},Xh4]p Sv3?{Jԟ] Ԫ= Ԫ=XSva/nk]IԛLԛL;ׄ ArA9+{`J,·<0O;ӹEwvaԳ椛΅:5kfh_AMɃR{lNř:I?MV󂔻kFw{(zv=YF)#ww C3|"C:;kZJ@jD{! ^T}كS)cy'껴83~p?aNټ7;YT*h//CG7l}U(k֫|E\D\D֚ EdlVb[)8 \?cgn\r1!r88iUc1p;*-z"wЛnmM4`>yh؟Ӄ笁Nrm^ݫ}!!: ޖ&i[5]@Ħݰ=ZnOٷe|wV`DtH%/Q[K~Uk03ow-*Fo4-zrzٛܐN\~_4@<-֝_@_RF@i(~(m.Rz\&EU"ԔWƨ6)0. OV۰[P&x4=۵>]t.w2DrUl.TzzǛ'lP34~gh^ Mg:~!ӌP]Smu(z|G:HoN,hz"(c9z>lC>2CKfз9;t}0{Yw H0ɢs芗{}JM퐰 RX} ,a@X2 e ,a@X2 eWX&1]ݯ r )a%rߝոsꬼW*YHǍ7zqǍ7zqǍ7zqǍ?-ZQ0FjFLϱ%ȮvANlBrb;&s:_H Ej鋖*/Th݋J ׶mp`5A9m:Wsёd3Wc{|#Ut칈~9k|9+o,"-4 Z;_k_u}ׂñ WSywDŽ}HuZ&<|W„dZz;_߄ 4 f!,GXY5-'EImL"IspjrYA_;_'ty(x5HHiw$ iu !l ldF6TLyXֲ6sRFPW>:ϊ!9U`a{[4.)gXD?bHOg\^Ureo X ,ZPp03L(9h 1.e>iaANӡ Xp:#X\,.dE|$(Fp X\,8 bO\,La2˜eD\%}rq;rF!ש] w /sHJq uX,pl µ얃`d[8sqDQ-\K`8n: ’nጟ3~@|={*^E0plmU:hקz`j[`lV Z+` VY,A )Zo=E=MW 6]+t˂e W0\t{.OB{{hb캂]A_4:0vbDpɂ=Y'4Z{`'b2-NFp2- __B- 66//`ӂf1NƗf,l&Z_4}Ypޡ=\ pň/`7\ vsADX,DL$}]ň vsn.ň/F|`]2v}/`gEQ5ȹͮ"~1_bmCfclƜ=ṿ4UԻPf1AR~Nmc)m;ho'N ̉Avlr~0l('=&rDNl"'6D2UAj3:d9fN-lӓd#9;{l$'q64go8<7ijTJFrl$'4Cg M3tHB_PLmN5'֚kɮqknsb9ۜmxt7/סid:#nsb9ۜmt1EmkNsvFKωsb9XzN,='3s%ZҨ;/GrA:#}i A'Cyωsb91CЋub91N?'ϋ*o98Fu0Q`tb0:1N F'щ`tm].Z_4Cc9fJe0eW:f:&`545DM0QLcg'`:sĈtbD:K>Q\4I9ƈC#al(&ʆaN1&ƦcӉ$Eը{EzHcoݱyvu >S}q2Pfl/%A*ME.L"/);N -!Wׯ}ҶP?IP&AM&wgps/!<)yRSrzן^^\rt^tD*%m%'v;Ϲu~DQk9."Pܺ* WWj[Vs˪un_fS]\< ׯgi[yMk7ݴv[űȃ^+cDA2xּ]42-wJT;tkIK2'e]j qԯAMvZM+/ ˅1Hr5ήu<]`otq AMQjشh eEj>kA8z@rmi,:cXtƢ3Z vLJXtƢ H5aqԲ>A :60H\$. 
grmҶ*ntˠ[2 O-nt Fk-NYok{HQEdD&PZ ~Hm7nN׌lbe?EMҒs$۩/ eWX,NEC,ࠬ@PqCB(i ^^ҭGz.B9Q'9L)5ewDN$r"9DgIUJ'=JٞE{lϢ=,ZA\hŢV,ZAEEhŪV较%I09I#c#8}Np:$ Cpm(k_ n7"[֦7ݸM7r銭MJQ7#qK>U{vOes7'zoOf/O?5wjE[?ޑݟ{QQj8FwY)mVJij16=M[=dUsL٬[Ds7P$ㅲF[룭h7[Q7룍f}f}(F Yhw)7hwͯyf}Y>R꧱`};c G36hwƂe ]A>ڃb}f} YmG{0Y>zuQ e-[XmGt 룍f}R(ڻ9m.Z1T{Pׯ$8z۱mZ&l7A~ 'iJA4m{W? f;F-Ql2eF)dyvgުgNW֖۵v:).mx,Nq-㶴<;h#"&ioCԛuA)~6kPڋYJqRܬ7+ų~{ҠIEZjJA)OϝC"UeRIU&ҟh/.Kqstzr!㹤%r׍hs>–ωV< uPV9.76z=[?{"~G o):Uoe-xJZRoւ,9jʛ5_fYGn֑u~ȑh)-D-[[f_|7=Mjt6{޾Ǒ=s6.oEݥCT4y"ǝy~}Kumo⹭ޱCf M=ax7zq.2Ү E9EsGG>1VH9=u?gC1FO<|Hrtl*yW!Br|n\*vyIp]jn ~A+xA.jF\ԛ~Ta Qax'BAy@!{7|=4u C%Q^s=X좦1 Y;M|! 9B^(iH5} R:Þ][VhK\o_a!kIx(2;}H9KCAvҒzܻȺR q!r88i\ioN <&&[?gg>eډ3(L]tDJ~2&H}G;Rߑw#H}G;Rߑw#H}G;Rߑw#H}G;R"Dޅ% 9႑`>fZ`}?,0 c,0 c,0 c,&bb,&^dKTir Dί[0X8 `0X8 `0X8HEZ.  E` %C_- E@_M+hմ~}F5"uEY!$BbiyN$ݼ+oOc2>C R#>!Pס@>mJp 1Cp 1Cp 1Cp 1Cp 1Cp Z3DPF;S69Qtw6. Zz.Tx-G$tD-<~ݪ%_>B]59|G9|G9|Ջ8h8cxrgȝ!wڥm_htCtdБǥDȃ4#*bDb Ntf܇Tp2jQM ͬXp‚',8IO{ߓ'=IO{P$ `%ȭju-ȑ;x^yC!nw]vnw]vW!u F ɯ]jEuJ(/~aEMbX/5bX㯼?@ R`B-Wҷ[{-ҰlN#h mEЖuiCR)ڨn6;CC_FuQlO7ӝA4tmڝnv/|t'I7{>QI)Siҍnt#)HFJ7Rҍnt#)HFJ7Rҍnt#)HFJ7R]RՅ5072-xߛy+/jĻQ.2GIp{S&gVF}Mg]IO6L˔MA!$*+46D2!Y끥.}ZꕧMSS^QBޑX.~E<):{ً^tEgk^tҶyѶxgЛ-[xtO*A-ື-~S>tهN1)f:u6>I{`$1l1`$1郑Đc#9H?c*W0z9`+%?5ҺPwy[Ht]k8h oJ\dGה>Gs-:F9񎔾cïCO%XwdΑ9Gk.t 6]du PMRߜ&<2;2P\g(|Eo;4!ۜi3POis3>mn_67ȁ{q8P \JEqo#EC#<0)mGއiA8n^( ~mQ<%_Ty'p4}jf"3Ld`&20 D3 IDv`20  LB&!I$d`2dȐwFk;0:wF3]'3~&`tF3~gοGg;-9>u3] .Fb`t101ntb``10X ,Q'~zC?0 >!*}31rc0!+}a`0:b00eAŔa`00eAŔa u0AXa c1TL c0&QL`6j24A=X , B3_I{С0P* EN'GVCaa'`bô Li60 T``*0L0mjj` 0PC P5@`jaʧir*ip?P/ K@4J4(d\I4\.i8.i,Lu\ FKU3hZ2ꡭ_8y4J J=Ӫi#8h˲d}}.˒i}}.+i}.{iIڧڧiIڧ'hFic:2{!ӐidNڧ>@4z YO# :O#d25> ݧuQڧDhڧ2}Y!}!ԁiiڧ}!;iڧp hF hF hF hڧ0ڧ0ڧi2i2i2i} } }hFiFiF`6Q#Je3f)a{8ʢ/Je)qL z[ /2OzG S(Hn`40 \ ꠃ\5h(:h$Zcю r@8zWBCJA)K^h`@40 π'S80 +~]X  j/풦p(g:"LHHMX[Mg(:P Tb@%6P Tbx*vHJlJll郤tb?4#Q Tg@u6P TgCّ-S=Bu6P Tg@u6P TgcUl/xTCfOz>?6Ц4e`&44BiLw\Mzld{EoCf ]k3tmе٧6tmе6@LV&]ˤ}3ݣpPʨ34rF9C#gh 34rFDd rs/*o<)- *C7g ݜ3tsts\9C7gO7˯e }3uΞ.{ C_g 
}=}]O!AU5&;-+AK޳P3}P>CwP.]3{{=ȅ`Pi7Ii&!LfЛAoaTg&UTQuˮ2 :l7ޞٜ'rPBN(BIyO#yLDWC$W4Ԕ :HM4fH-='Y3[LCi}4ԙfSgΚe֐3:Cb]]~%@KXRN45:ωDÒK:,νH9RuPÒK:,yC:/]RRRRtaN[o7%:Já:phZe?sZsԡSN:}Ue+ @j[ }o5̙ @jh^ ?sf36:XМg54ݟ93 yMׅ԰ZNJ kr2ۀo 6ۀo 6ۀo )-ܿ vU)|m.%E+$bC u5 uEjD Ũag(FZzNqTPQ,PS~`2,ԡԡ/uJ>S"4a N+xt£Nsž_D^'o='9ɿh&{Nsž=4 MsBN"N JD'<8Oy"qD'AMy"qD'<8Oy"qs ڬ,ɞ~GY%u[>(U#KMҞA[yeEZyQEKt]dKjȓW^Ess 5y Dz({c֔oK~fUmEOھD@(^AB@Cy@Ȼ)y_ZyۊyPSζ<-ԯSA^#o?Db4'>G 0kuYp޵δ(쉢Bz*Wr}흩}*L&r;E۟dS`BoV+R#)'>m~ Δgqze?BEEK9B9ަÉDH;+͉/V #L[\_sc(UN /"ǜ #RoMwASj"}8.hh/a]y7mCi("iVǵzYҶ:/W1׀&!oyX꺸u<&w;kWΉ+O8s8+u)u~̞L DMJXd(hmQJצ #vzBʶ%k5gAoU;k3kmV0 Nm쒝zUSWt JY1YS:id9&Wc'AOvQ66AK9,7{\\\tlH.kr85954^"mW6-j-i&ؙV.A9\*ԋ~UiBOzHf]*ԋ:hԘNRIz&VV.%sϓ HiSiP.;zB] уqK!.姻ORtKz(7:9ҼZmʂg5!yP?sp&!f''<_;WWfV|hՠʃ ˡˡ<ߠ 7ߠ7_Շ^TPH;Xnrn)3hù_|@獙e ZB86V/A'sOwҿ'3|uvy$9w\vFZ-Xҹvɹvl8090B6.kS`[lڸV{[lե'N K_ҹɹ(Rmfz]-rHͥL 6MS}i.z[Yֱ]Y]]i]i]i]^i]=- HF6v֯)a Hػ(y"IF隃5i#MF6IAF6Ҵ4mi#M\tVIFFɷ|#CF62Kv.%Cdh 풡]2Kv.P|)C .q/XS|2Og $>ħ3xgBH|:O#>rħStN9)G|:Oi&ħQgepFQgITPPPPN5G"MJ$8 N03>ٳ'ue|RWwFwFwFwF;|5Hc|R>'c|R>ƧGyd|̧CȨCȨCȨCȨCȨǨǨP$F! .2"K iħ.|s\FsP=nm nޛbhd]2hH.ڈx\9~IG@]|V_ Vp #t[[u-"\yEj\~U4ɱ!?&=d> |_/ |9B:25;*U&G0:rڑ;]kkkТw;aҷ.5&1A jLPcԘ~_`o/ b|^7`$ZkL_cט&c+^`TLI܍!҂[¤ d2# _+zޅ/m376uވ1ͭy]Vj[M:ApNRi>pt=P35i  K|#O: KWLpi/ssrhodbrty~dyTɛ҂dbr1~ e A~KY>ǖ? 4_1@F۵u8'j W?H=#s2TD #p8#p8#8` )*-2wG.|ӅG8pGF.R+73loxoxoxoxF ^8'^^_8՝PHR+;6u")oӑ{ѓՕW{=0@+X+H~^L5ڵ{xiێ}Z4F =86RTzz/z6gNg7MRǽz_FgK/˖fJy^Rh؋PNue饒/d|!Ku wh,fb,9_D䠼\k)C4+f XzPP^%k΃SogpASaK/A9B7wAqm>d^#dnpTpEiFXP7ҶIoLF̼DȆfb-֒iͭ!xiivHZ"Z"Z"nhyAZC%A@ DP"(h -~YMxA&VP"sMͼ[BU*PyyuMvl4yY,A 2Y WF;Wx"7 Ze ;+bcz`PL<ۭO[xҀ O*md‘A El>[泱[A|gOkL]h1]iDLUEϽ)uE;QJ|=w"; z{[2|q{A[oɈ }4P@O5? 4P@O5Xί)D?Ā3<@ʛZڃLuޮ;YB 4*nG· 텈B͆6 u2mD~Kk-~}ulMDxLozpsNdLdLՉAf ~5g4\rФ6Rɚ4Ie51 <}Nhp |>#6ȣ|?c~L.IꃶТJZw e>7暶iۙiۙiۙM&vEj*wrg*ʽr"ͲEe4i$p8ν"w u8|&U܆L%M%M>orAUP/B6xx(ap4ϓX-'琩2>hRIw=ag1=)pSM6J~Ӏ̚XfD'4? 
iO~Ӏ4'OOOOO.PCs@x$gҗ!lk^6lk^׀EHIHIHIHIH+!:C6نD7\A][c6`4,&fshF0ڀ6F1ZY2EmbQ6F1(Fh-VmRm+oȷx&h+/ȓSgrl'6p·L[4bzhT` 9w}w8߁2d0z\_)'V"`19bb'9/֑|/BpψL䯼x &H*F楣;;[N,p>#in[GGI*G#vK=ߓp6R; iOF}IsOΨy\+N/Hkɉ+%\ʑèe'f̃A2_BEj,#eA7E#_mX~Z J4sWrqPՃPIAypM 6A3z?Hf ~MJw8#w 6(,^g9ꌫ3u18AA6eRhmIfïț%xv瑙Ϝ5>!Cw@l/B]t" U=]b 43}H3q2 (6UWg I3ߝ4hTT4 L̒uI)ig:2@>lp3ϥ h;ȩ&hbΚM31]x׳%toe1A=A/4ƿ]ar/ W}u:)b3E}pjT2BS>eR3}T\ja<7@RKez"^hVa|*! 5>q5g$x5Lz#*9)>!r ;xjv!m91γ/ xg@(:uI"_3wWW62tY/έ|sUج*ar=_\RRG%'|C"Ɍ{aMQSɳgTꬷ=?M3nD/gk2Hb[?hp /[sT2^E=U\-(\`&Jn[:n[:n[:n[:n[:n[:n[:n[ؾȎ;P-Tv Bey.v8n[(JRn[(J9KdRJ)Gpɋ(aRn[(Jrrm[( nl[(ʶml[(ʶml[(ʶuBٶP-m BFĨhQ)yb MN$d 5BMP-d 5BMxy6{d 5ٺ{5pڻ-Tb_F ?.ZF v$v$v$vEjvgɲwO&=*=*=تzK7r Cm/4yM?} _ozˡinEF(2oFF8: o8|jxV)i '77laq3Aanm(A"#__mBiR576>{lsr]<[ע^Uai:l:l:Bj{07̽a ssEAAo8೓@scuEti9m )ks+AM%OzrI{'>m4am'l'!l/RȰxc٣KI]$7t ]owCEg;Pq%Fq䢤ڥڥmeMpw.E=1__^ .Qח--TXӝE)GRE mIE^E[if/2!d7.2ԥͮL8Ӛޑ4!>""pDp듁t-so}t}Į,Svy{A[&^2`2&Ew%1"utR/uIH]{q-H%$4ҋᠾ\uIbG-Ob9itrI'g$q# y)qq %s!qe#狂y|WrA gqPPp5 G9AE QJN)R/_4@͝ 3H Aw<룋xZiV?E <]HyS-|Q *oCbmA}1U2axN',D$b!y|ޑ'zB5@Txl>O<:)gbu` xU+]uʀAM/++)^H/(2Q RUn@bu{Uj^,9=0O{}8D#}`O7أ3k``8.?Ȳ? |_=ox{=ox{]NhfU4x{=ox{=ox{HpAHTi|0 >`P"n|]1 >`|h9w0䘒RP.cv|.p\>_V(VX}bQ>5#1T VX}bQ>G(VX}bQ>G(VX}bQ>-Yv顼G&E2ۃ,keYx^dnA?5JsPh։ ZWEx眼IoAoAoAoAoPwl?E\KG7w<3E]Kq"" _4@ IoȑhQJJTrF^hVPINb/b/bX‹ߋߧw 5>C-F |h72DZ!hFYB_s75 3k1޽޽s~-_WrOU"JWKʧI[׆춂/U`k+ >/E azz=+W;K2"hڌV,lѵlf36ٌ AfDrmaqq˼ߛ 6I{ m6gcx_[?Ɣq"4U렃77Ɗį4ޑzc\qa-ۤmuwN栩ʙ"iGӜ9͙mr}g~)7{.u7z4yNMy~ǝvĴ#vHDyA)]rPURp4XMMͱ&Z&Z&Z&Z&Z&Z&Z&Z&Z&Z&Z6l;6lS̋:-ޱxf?"7"7"7xxxj[:uBz_G/4JȻw>3f+Vƭ<~0m_|qqqqqqqqqqqqqqqqqqqqqq\p;l@[a8CPnN(77Capppppppp;Capps* : 59ۄaLpLpLpLpLpLpLpLp;Ca8q7qCa8x! 0" /¨C]pN. 
:cuvQm ;"ݓqHQ R:_߸_ߞ@냶JER;2s5м >dist|z)Gj;y M70VnP.R)n<Ztԯ 8XjΞ7g j 6G#Pbs$/uT$/J$/J$/J$/lgN{Jئ$N>PԔ"~WLS^'XMGkSU=)4^}&x'+-@j>n6M蓅d9>YOޓd=Y\O&dq=YRON֙ȅɱ::M:drDpT 4Y'u2Y'u2Y'u2Y'u2Y9h/ldN&dN&dN&d5 S4YgMYud5YgMYud5YgMYud5YgbI͞D.W#R(BP{+&&&&&&AN=moF:hi4B####aA &&&6;!"l"l"l"l"l"l"NH${+ފ&u-2:&:ɉ3b)"\6|a"_1f?OmS3333333ٛԋb2\ !Agp}dog߄6[ُ a][AVF7Al€ rrl<0gVN0{cr d΂pdN\~v./ǿmů_/?wEE6-BC}<UZԶ\8Z!xi4)ʷ~_C-1C7ɲ/LE [?@|݋nrChvQWwH&~?w\H}dQSG R{2I~yJӧ= .z {="y蝶=dzt&BIkס%Z _kyhBl~4Azs}(@<=m<6AjAjAjAjAjAjAjAjAjAjAjAjytzH UT R5H UT R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+R"U+&bR+&bR+&bR+&w;0m1r|]v @SOYgUD\"yg>G*l*z'B0KFcTfxۻAFTAuƤSGXQ ֳVi+t!6 VS36y)a=]/JfX%; ]2[p͆D͚8N3gb}B_L7!y4{{WŻb=RPqDގF~ƒs7/!D78 NtLx -Y*D-%c_9H9 IK9_'o勋rsOHjy,ct Ձ:L`,{eo 7X`,{eo 7X`,{eo 7XH{'U;(($K5]+.jn m!TZ򌙓_Cc$xEx\̱W('--%xOMI^`@ZG-УQZz=jx<5Ŀ(H 4Wj9lVLJݤVڪ69|3-@< j ށ0R:re ξZAG+ C"&j ]6UaV\Bt3C?Q]!m$Jjek҉D-x-xkiɷi8!UNU[s+.-xxEq+ˠíHioE, jAeY[U]6,vN/9aj%3e2mnbl71i|POriNZdD71M &s`nb07VȄlCSJI)g [h} bi|k)x3R'X'lc d51\L &=#1H =䋨GQ@5_Nj:Js&:g(?&f(?f^ uQWɋZa_&ɪeR^P&kK]5,F:(5?IP_t[},7YK^1H{Eý^phR4]fgNQs+W4Uiv< 4'ՄJ4n~5E/A5Ġ!  ]6 @5Ġ! 
1hAC b4Ġ!ĠVbJ ZA+1h%Ġ\.ܞI.z^$Y'z#u3h]/3h].!55ՠu9453T Ug:CPu3TNy3T[*[kz U*ݖM چEm:ii;Cmȟ S!}ʑM7>@[vU5=&.o|]O,[3y.x[Ty'oR)iڰbmX6XV +ֆkEQ V +ֆkÊat=CxwD5U+7h)ǻ, UH-猒猒:9S36dtC̝1A<uhv!~]ߢ7&U٩fyIme&MjZA>7&Tqi+V--JRRg0ֳȶ0Zَme;vh+іDdt$CЌATi m"D|\t'Eɛoi&F ԆQj3xM71@mob a 4xM7 4xMLQ3E}}匌`K-uCjSVoj38Cfp$ CՆj38CՆj38Cfp$ CՆj38CՆjPa0Om6-H>3̨/b~ fĴab >4s҆9i+sV椭I i+V6͊Zhò me1hC+>,cVƠA[2meh+ V,@[Y6O>tÇÂRrs9< -֯]F‹reTr/9(@*/ _9n|x۳hk9尖Zk9尖Zk9尖Zk9尖Zk9尖Zk9尖Zk9尖Zk9尖Z6LZk9尖Zk9尖Zk9尖ZkykykykykykykykykykykykykykykyV$kX+dxJh9C +ŋ Bra؞|hQ"m5wh{ D#=̘l:gӥuWrj/`ˀ- V{[le[- 2Xle[- V{j/tN1T8cnI rKjH fg7lg63[g63[g7lg63[g63 ̆Ufva1ٰlr7j!wc 5]xx3̀7^axC(ތ(ތ(ތ(ތ(ތ(ތ(ތ(ތ(ތ(ތ(ތ(ތ(ތ2 ʿ9&9 oNxf ʕ-exh9Hi NpwRŇ>'7 MVp߄&7Jq}pd8 Mo}&7Y)b԰aj05l.ʑ7a Moxƛ0ބ&7a Moxƛ0ބ&7a Moxƛ0ބ&7a aEհjXQ5ڄ&7a Moxƛ0ބ&7a bY7f1,ƛxobY7f1,ƛxobY7f1,ƛxobY7f1,[x+m&-oɸ-W%ʶiHi׍~s h %/-xiK ^Z҂ r E_UxiK ^Zqq(g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳`U̳yV1*Yxm$O_O_]`O_O_O_y<8ÁJ,YZZZZZZZZZZZZZZʁgkkkkk8DDŽ?t Ή S]ҟSe oz^@*a;)G^sPrM5-(oH]4 Es@cR2E}.Azg:7WE@HKT|Hufyxb]Hl8e?BŚ%y2&N?VݷLN2JS౗e3-sb%Ͻz~л:;tЉ޽‹.‹.‹." /I|V$W<_/A/E;Pdf6tb\Ԕ]42D L2D L2D LH@&d"J&d"jF?jF?jgd|T<δ.:hfT=s;RS1. Gr/OY)>esPS))>ק_E7@EjyzYzc뜗 胂_]i~1E)Yt,c}P`fMb}+a +ڼ-S!=Ec"Y鋜_aH1ϐ:KiuF"&Iu&Yo rVIx)&}B-\s熄?@f g._%;̓Yr8`_r{9a>tg mLYdyG}Շ`!X|CءFzU4SŇ;􃚚4XŇ`!X|Ň`!^cޱWثgnr0piޱWثwWDzGޠz,;}AXw,T {džcޱaذcޱaذcޱaذwl;6ٰ[" maذwl;6ٰ[.D{7/(k^}fe˚U@7pV m(Q* /{=|_E(|UWQ* _Ee4i|i|P7׍ur}-#o7[n9}s9rϼ2cٷz Ibޱyؼwl$7$9ƾls}E;=Qs:ޜb? v{6! moC~ې}C~Öcޱeز m[-{ߐ߆jXw;V}C~êc7!?l6! 
mȏ}C~ې߆Q2 ڻi{zE~ȯ{Y2=o"Be zE~ȯ{YBȯ.++^V}zE~ȯ{YBezy/^}mG,/ )p`>>D:ϸ|| C/yM1K'fq|0x=SG#/ - -*M L럃6lH؅_T#dz D#ys|7/P$'t`~",'.Nޛ:X'VX"Jiׁ%|`I>$u|ׁ%||_/ViZ VӎOL;xN<&t'0xi zvVCp:TsR0i%ニLJu|pH+lP,|4g=^xFz A#׳%4m,|G+mEhEhEȶ ̑ m+m9Zqm+mXc8Pa޶͋;Z]õrTZ x+ j,y~A͝6+Hub/ vpb 6r䟳RԵ辴{3w,}.JJl5 w nRȏ4׆9Q8 k(M8##j @=]|`C>NP|NCV|Cjg5b2C졏9\Cįn [)y+M|H5U7݌A dPF7ltU7nՐezqO>=&R1[p[RMRkN9kN9k ~ BzN=Փe6"EAgmKHD$nޯjR$.L?-KCM T?r75&">*R ㅏZz.@ x9jY08|=A, r"c5/bOMJoݞd&7HNF+zc.X fLwWMM:*S]{GY`er,UfG&Wx ʒ$7Gfh]힨ҶPӒUXr~H9>{pYֱtXIz+Z3GW^sN^'-).k\:e#\zP -m׶m\:z(zv陁x/Y L0kAጩf`c108Ql4 bj-fb-fb-fCAAEoldy3ZkfX FY\8Y \5N;&qk2n- &Wnk2n>hޏZ mBVܤe?ICZ}E-rj'{ ='ڈη+ڈF6Xmj#V!F6"q4|J"KvP:uv;vظ4]rR0L9lɔ;4t\d"Oɑuw+-47Bs:עY!]mKeK:RRz\eX2.M^ygب|,.|4aZN9Fo5z>vrqqAK9?!)E]ƟjjV]]?Koꭞb%WM,#D8iz곃T^=7޶~Sd.j0ĎVޡ.cvS{nw;];cNmVnʣ{I|mH>$;ts ^c*͞?Ao z0tÆlؠ7-Haw3%@_c B&M{)j_Mu1jFm6\x;Hw"%Æ+JkЦ z/iG=0Rf|A5ce@ K6\Ȇ#M^__c BPڸ9pArȁkQ$r!9p[7@zNrȁ#8rAAЦs%8r%^r%^r%^r%^r%^r%^r="|m_s{P+įMysw{*/wɑ^ HZ!%wF]r'kvZh'kɶDjed/Rr'{Q9H9sq"x ⁈"<<<<ڶlJl{֞WA"ޖk[^y$*%yn٨eJepe+KO^? XFr3D4 'ۢc"(?ۘVƴ6-1mimL[\93z״Maj4 5W F F%6*]l |\8vLPoD=xc`=zZ9&(9fII<ȴ;&(4A0&ԸzvjɬBŤɅ ΂!F.܈1g`Tj=],|Zh_V\Tib"=.Rj\sϑN(  ӯNr:y"9*Gn:jT"]Das/C~Y?'f ~\ws887\_\_V![A}4ҹ qӎrQN;iG9(Ǖù?B 29ҙ+ TL.Z\8GLԴ(jerC}Ul(M8^-=mҶ2Ǜ=޴m7~isMmcd{C0?hק$V jZ|Gh-7iV-c 8bq7Ī!B j!@IZP+j%BDVrFݖy@rBmRfG2Ѝ:u׬q/ډ2w] vwH5w+RKl@uN,ꃖZM],:Xs~늄ږ!Ч?3z0xp^ HsuaS/:x$#]6$dC4$QbDJ2s\z|4yQ_Z^pӼB*%c"@r3b";1ȷT$NL&Iь} 9!$9D.v";h$g䌒Q3JrFI(%9h䌒P*9r*9r&彀l MIvdG]7Ȏs.9rιs.9HBSu)T R4H EȜ32{2e#64'hcmA{a9isNDOiaL Һ1- Bl9sͽwhSjXM:::< ) q'%ܪ|a/4A S}ꤞu箝Osa㝃~w\ΞuΞuㄔuB:!eiwNHY';;-B:d@NX'h4֝xǝrNwɽ'hvσ,@*/O#Q-J<;Hགྷ L8;u~~pNVUw~pX9u~p~p3[9u=?x?{?x?x?x?x?xs6u: >ZWN=rAlOZ`:.;?i ЁA Vx2_ wjjUgΙs~ :'~ >'ɬs2~ 9ur\D~G~zzzG~G~G~GB:=PBIQ Ļ\c0 w>? 
0=P;2Lק3:az0 l뉖PEAy mj)%eE}(6:{ȽN^{mWDu"+Dy/^׉SQ|*`׉/PdyMԢh D 9H0 Z(F&68w}*`S{'c4ڶ"΄wDdyC#(V"snfFM9+y=5yxfG1:S5`|&NjZAwҘ\#zpyA7H' Wp@fnYrY?Uªrsu1ku19iSBq gzEn+<|i) ?&O3 \&ǵqmr\&_,cqbByPWf/]U΋ܛiM}y}) 軉NGXC቞1A[pHY/]q#7B.HuAĹuT .H8 U:qn}A R%έ/HuA R%έ/H8N[_TsĹ.HuAľTJۃBתICy[ωT .HuA R%>TW*R]EHu"UTW*R]EHu"UTWꪥ*R]EHu"UTW*R]Es}U>W֞nKs&7 Bh(6 Bh(6 Bh(6 Bh(6 Bh(6DQmBz7oB=GE?PQaEP?ѽ.ꪾu~eViUh6Z%;/ b砭.We뎡#s͛:wɭ#ږ]xWDulQm</̃LuΛjOSTVˋJ޺4df/L ¼HuCo|suwwwG&/p{v`nV(oi2+/I+//׃?h,2 4?sc30 <6O6' V|`~2YNbO.X<4]Iħ̓`]4T[>yA}+: b=AًpO`sЛpP|pA { :k`&\"_㑑'>M"9'A&\ePeP|L(ˠp.R߄ .2h'tI_MẂiʴ3eOKBoq"AM/eE*/MᠭRE&uEP r8HkzHi$_d ~5~5jѠc4/Z i?hp"u[^8xћM hZp\9;3:&9HnA)9MԩAM\ .C;򂼇7ޯ7J X9ݘӭt_E_YBg4FTZDC3=…Hri5m{FG;َxv2='/B[m!"5ǣ#9J?`)9H9$%kK)HIGJ:Rґ쳢#%XeEGJ:RVtU^KzCRERHʯW=KK竞 X liM2X $c>+ c03 g0>XA`|} 9`|FO1Ũ)ۥ5>5m{'Iayt>7R&˴).*m/r=H E^ r>-dMGEDz j<*:hcN+X<_t~ ETQ޻0 `,,.[̋"K EjQ񾨃@w0g/rD;wpjprBhi|[xj+/D,$gϩ >"rBvPnL#Y{5`heEEaxpRa9^HSz` "rnɖBzAF*lIgW2[J0{<)7Ay>I!g5|.yα ,ˮ+ bX*ᬫMq59(7cGupPSkp"肫(T^2ױyNd.sy"sPA|g9|g9|g9|g9|g9|g9|g9|gA#M~^H3>s3/>3/>3/>3/>3/>3/>3/>3/>3/>3ǃNg^|g^|g^|g^|Eb^$Eb^$Eb^$Eb Ђ/t%Ez(g. `Z׃99Er[ `5` Vj0X `5` !I.RofK|W/4Vj0X `5` Vj0X `5` VTHwFFg{I07>3`π= 3`π= r*o03axS{'cjř 483̀3 83L\= 3=3=3=3=QDHBtSŞQŞQb(b(b(b(bϨ`FiFiFi f3&D fLƄ41|h9Hi -NhXL:hh Dyv N(pk*MN(pBr i(;ɘPcuoIE߄ aXG֑A߄&ćud`XGƄ #`G1! 
M:2 aXG֑1!>,!0JA1!>l[1p -ń&ć]c`eeel E|o1c"Y7ʘ1ʘ1f_3F3,E|eeeee,,,,,,,#-CtB,8n1`kZ1V ]֊b*gh-)M{_ 010LA3QR{k1Fk1 [ֈ 1k1 [؂İF kXֈ5b+$ aX#ֈ 1k1$a+7 0$ma`[XALX؂İ# c``AbHl"2U$VXFƪoF*[Ebe<e<e<e<e<e<e<e<e<e<e<e<[3xSsɖp6k's~a[.2uP.{6K͒n%ݫAoI/+CNy,6K͒nwIZ$.a.[uvڦe޾˼Hp,AW $ ;$Y-$4;ͫ_i^2KyyyRC& CI|z[cE>Đ CeHZ}EKĐ Ce V"LW•a+C(^ k%](NVJpRw±zޯԄSQ.^C±חDYICyx`P@Ȝ)7 6rQBKM%<&h:+q;q!d' )Kzvu?!%s\-sѶG|;!D>"._9_xS-(G_7+y%_"cllll4?*7BӬr,]7n%Л`~hz&Tjy&J$TaT"QSM~ G)UE)Em("h'/Z2("h_Jɻƻƻ䨈Css7oPH/RntH?dztHeUDzB ZixPEJiE*$E'"#R*"}C tNBg?`jγ> ^HM" )H/*KtP̤hD'c8hŢbZ׊2?(ʔuݖkomlOZϽY2yo)xCSC>@[<7|u;ө[H}AJIͭF-F]N7 o(6 rQ{mCVVVmzx UwTe@O3j)nn݅Gå*Jk6lHѝfWG roӕt vQ{gi;utN=9+eW 7aoYy몐Z}%Tev~7g sNan|t)l*ewxi/RZ3@S8Jˮ!00e }Mݷܡ;:7^6^67K}( e}5tؔ/'r=o4t!_Ti]َ+ہan1n1nnمޯZWLFrꊅ#K.^Ku4D,}E] hhgK=޹˅N/"x`bBЎ?IIjh`.p6v:?ptN'ѐn笆IdIrN-h/[Gv☿Dݠ%fZ=Wn/'-ץOK/^֥eA/KZc]zC|'" †Y,nn9ՇrT>`մ V@vwuWonnn^rzӡNQ:ui\P^S]0p ^[D :j ꎺ^GAdLu;PrbX]]|G~CNY('N*0ArBA"{|GcR ~ [ Y!}+9sB[ ;$ԯƇ&aVhP}{G/Ehg!{U>_sѩǪ.ҫmY]i綉.u3T[V )g rmT6&٘$jCk?7J? /BJKnsʐwE*V)U>sQlk9}ëGzq‹Py.,څ2s=r`i?x>֡٨n) .|Cz'Թ/: {;-wN鹳oTK](j!| 3D (*-*ֿCCZ GT]]*aڗi ª(E, Uo̷xx 7sŞU匴?T )o4S W^]ֺ/Mp~C/t]o"~["/:"ߝyޝy> ErBBt(狎>EYn|Hо﷥zQPyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvyvR['%}R[IP%rS %:H>tx#y.'H(7)A ԩE N5.һuq|(}wT S 9wrH$64'YoTBҫSjTB'nv:8-e` IM뭉#:PBD"Q.QC rמH->8K\D+Dpޭ*N :JݪD!R_UHA")H| Tq ?Tq I8Tq /RyT:qC\yO %w}( KJ厷UrEUpEzW:ArO[rO[*uPx A%(-J]+utѕ:RGWJH/tbPr.߰vP>4л5]o԰7\Bx />4 #-5]DiaCA,a-' ?+3\y~(&ws>IBw"~yOTVNPFA;[9?tzQTiyŋ7҇uޥO`r#_)//ڔ͗ȗȗSywz9RF|Cs~U._H}yz嗡:_h{;c:(Յ҃!]#eOp_4DiR 1|K~&_Ɗ|Cj\D~MX7T%&qMa9L*$˜DFD"FD"HdhFD"F40i4̖ȤulLZ7Ln ӺaD&ut4\ }^-ѥP%=W9(xcVmt+ƾk9ss.̻u[>]i}NCj}ܺTuC/]{rOi}ܺh=i}N^tZ>9ϻy>[wn}ޭ_ixR271rQǷ'z[JAZZ&%]4uO7CC(Ro]D̓moLxF 弬>Ż%_KbI\~;2hۑ!EG]55XL]Qb}"u:J|eHzx8НQ(CZQ(CįQnH%~FR%~F_įQ4`,'/k!Wrc^nHK|%Gף [2tDr^ )#.8X}! 
*F2 iX:~0>)ʅ2щאp +lNP -(q]w Z\T ZR,_E }<wF;\v'dd}8'-nAw8'J%ΉD(qI$(qI8"-GD#Q.GD#Qrsb.nAw n\݂%1t}MQrb(qX)JnNqb(q]@wn5ݭtVj[Mwn5ݭtVj[Mwn5ݭtVj[Mwn5ݭVsj[q9n5g[q::V݆춬.?TgNݺyCƯU6Y nhtCo]o|NҖucet[eC[6ej:Jfw뎱2-+2 mDCܐ5&DC QrIcrIc(qC!J.iL%nCՌՌP|(1q>8J%W3&26dDž5!;aLaLaL\%rCv8)J%kkkwE~D&dvnMvn7&dvnMvn7ަMvn7&dvnMvn7f v3n;:@8ڗ9 pp8@8 pp8@8 .}<)q8OJ'%ΓIyRrcrE.ua/RΝ pBp8!86Xy.H) fa:l]QG7䁞I./#yKulV>|v@T9J:~z]w}:zHC{H.x_-md-DrE/ùhCŜ\\\\\\+CrYrYqH.(l5Cf`K2k0[;KQ'VC|{wDw=͸VEZuQcZ>N) zUEwqzѭUEX^eܬE2^˻"C2. RqᫌoeAn *ŠZ0bĜ%FfQA-Xm ?įHbPᅦ&5E[)bA(bA(bA(bA(bA(bA(bA(bA(bA(bM(քbM(քbM(քbM(քbM(քbM(քbM(քbM(քbM(քbM(քbM(քbM(քbM(քbM(քE(Z垯8"*F!Ttsb)죖Bt9A)TPCe8PC~OFU2p(CZ. ZB[i)8C )8C )8C )8C )8C )8C )8C )8C )8C )8M ޤM ޤM ޤM ޤM ޤM ޤM ޤM ޤM ޤM ޤM ޤM ޤM ޤM ޤ)BԦE UtCUҽbFewRBOTA e蝽7m@_TMk!CoQYh=%ʺH%f(\ 7d! npC2ܜlp ېf!}aJOd! nf\׆6t mnCw݆6t mnCw݆6t mnCw݆6t mnCw݆6t mnCwC mnCw݆6tvnMwn7tvnMwn7tvnMwn7tvnMwn7vsnqާ-;⸣<, u%EShzXJF"9FKR=.CJnQxHysBâRs]W  Odw @v;݁dw @v;݁dw @v;݁dw @v;݁dw @v;݁dw @v;݁dw dwNi;Mv4ٝ&dwNi;Mv4ٝ&dw =8&M;Mv4ٝ&dwN3i;p4Ýf};3դ(Ͳ&c&Ʉ"g{Q@d }硡7E( aC_Z")U,;weL~J>ݓ>)~ s"W>2~ L( JNYS>8 $#)kʂrz䪪h/jt>,MIi[>|ݐ=kއym}+{t.(dC~d.zmȏklC~d.JU5!?29hv;Wm>4uk(jl*jl)؆\G5!4~T>8ȁh؅hF ȼjl{١&Iِl Q( ] dF:||==AC %O-[z܇aCg MBxMa. wa6ʠ{cУH<+TSH?tǞ~u*F,J Н%/}R+EJixSzCSi)@G8t@G8t@G8thhhA'ԨfJՅՅc9:{p7Tnu;ZiI4I4O]sѽ뇲Z ZD[\Oj2&?W; )k_C_'>IO*}RJT|H>ٕ>gW]wvή^SjDrDwESϿ.F=>G3V|DC.e.f<1v|(x?TbTQ)FbTQ)FbTQ)Fb])֕b])֕b])cu۝^re~~d >e[}7|v#zι. hV.%zH)-EE)>(v߈ k_WM 2@%em^@/.p{ vRZg_"5|m+Aԑ@3 t:tf l=(v;7F I3ڝvghwڝ; s[cLuEF)=gO3 M!rGkWd~ސ9VZs 8 i H)hWdLhLhLHtIkd8$ p6HF+F+F)C@14P >ψPnpδFLkd)|CEr햋h"lu! iBgZ# p$LkZ-4m"6lJ䅞h'zW@^(ʢh,(F@^(ʢW@^hh1hF@^hhh1yh;hW@^(  M^ M^M^Kk ypAbyEWk.A^-gM^M^M^M^ʦʦՐi,m,:-6[ņb`MXl,qA z|ʪJfNҩX@tV5?zYՇw7h|j,U K^|"-HlAb( 1IEX؂$6iMhڤ MhƂ$JiR* RƂ$6i,HlAb C4$"iC [꣱ 1tH@s4$Zs4Zs4Zh4Vj[Mb9999Il5hhh&XMb9999Il5hhhhhhhf̵[}I̅h 3ȌƆi6'3^ͺ6(h39fNtF# JgȐ2 i CȐ2h 9Hmgfz$GQSe! 
}h+!MHҤ4ilxo{h51 ra ~XFwJW\i aH "aHƆ 7\!RUuu0 aH¥\ilXqÊO^\&?> E4vn,R:~ jAPI-X%XPqnܽ+KTR86Ed#\Uhc.V=dL9Lb=e﷗OdoWQ4 M@w4hƁVzh@Z=o;BCaA z`=0A z`#8m^+Ubmhڦ7orDt-R%D4l|m.mHi8@4P>G|l\y/w*iEx&+%9>kt:Q:&tBg?k~aPp`FFbFFiM oXPkȨ۲R̥C"!V㉂k)RJKsRsU4*?wVZJi:uS0v&ꯩ.Z?'S}HEt)T['6G7s8lnc'/gfľκT{pGP ~=ZRw^J9uC:sR2}4S]EKO)RyCw)f#}5"W>*.Ro !J]m)ftYu=CJH Rb*}ꬓ:鬓:鬓:鬓:鬓:鬓:鬓:鬓:鬨椳N:+9鬓ΊoN::;R9;ڪ4@:L4$V^d$$&1|WoIMhF0$&a4 IMhF0$&a4 4hF@=8QNI#FnMºIpZ7 &Bi$'MºIǭ)[^=&ƉC76i|czZ3uCKM/]b~W?<4 3Nl&ZuCJ2sbvbvbvb,:M)M:N_<rG9Kg:r>uKx D'96⢭q胵0!hhڈ(wmE4iDO9[O9 8Et~)+zYzJZn] 5 zzlHS R.&04i,=%~͙ @g4K4Ye]j)bu]E qGJ&&&q֬|CMr͐U˹h(۵:q6q6˹hךD]~8o0]}|[oR,n Q QD9nE9eǸ[b΀[bN'o3ր[b΀[`΀[`N'o3ր[nE9nE9b~m Cpk7g"8g܊s"rr3ր[n D9[95[5[5[[9[9垳垳3[垳垳3znͭͭͭrrr͢,͢,oo@>)Vb+w/6KFsV+E´]l DC BWыJCsҙBwr/#ZXrFb!zwF >xR|/,CwӠ$hl _x&[a`>AҵUFpҹm6y~f?om6y~f?oٻm6{wfn3L`6{wifڲl-~_6]kRe <ڽg{J~Ya_r*?4wOCK>+Ԇ̈?OCu:*V|hse!We5_wƇbFB5z?caܲ g`c:wzйLm_ȳ;G}>Iz uaw]aV(ơi*CxוX{v"ŧfpt<4s/_n fpnAl|]w@qAU41_%VC]q5$')2۞)4[=K)5YRgI%uYRgI%ukYRgI=G=/β 3β,5L쯨oU|~eC~|uIM7 O9(nj_tfEqxL?}/5hHyҋonoLxۥ [|munR,AjkR|5zb^%M Q5z%DJkl" F)oAC9@z-b.Ed Js-f>4RO]781;91;1wRvCa< ~V}#}~t˿HHrYERj`ʩVzCSz4L jBXKwkѡC?xHo-/ӭJKdśC9ox_FZsPA%y:Rp>r~,>dC}pߐP#MIZRoi o}CB7w 2Ъ '0B#1ހ7` ox0ހ7` ox0ހ7` ox0ހ7`  ox0ހ7` ox0hxo4fь7F3hxo4fь7F3^˘.\ Czxo4fь7F3|G+6h)*ƛ0"CCdh !24DSM؄#'9^:>t:4C:4C:C:4cm:*:*Cj:* Lg<Aud\i\i\i\i\i\i\i\i\i\i\i\i\i0τyӸӸӸӸӸ&3a L88m<0)0wwwwwwwwwڄy?^yf3lrY&>"P1=28?ѡ~:C1PP ךZDS3y7~ǸL>8N ͢q驡^wy(Uo`Af?`jعSuvhw+%?8 jM 7$25#u'tzD ꘗ :@>:.LUz97ooz}~_È8];H?kqO/a?Яjzp)p&9U0+ a?[UvE+M 3H\_[꯭CkFqaG;I(+u.zRBRf]Yް!:语}tX;dGne|-Gێw5ݰC:'4~PC HZ4.建P {̆ѐlktawe5xtv=s! K'fM6 ҂y.jPe9(= -ۓ 9mp,2L ̂Cz0@p@x `C2ؐ hB~z ţ=Kfkk ŢU^@O47ؤ f8uoI& Uy|hfҞ )A]P_K%P_0 4ԗ̌Q_S_F!@sdt#LCi0 sdtӞF+Y1KCsiOsiR!^xQfa0piɴ!NZ '=mZJ-kZZ-iSEZ"ŐbHbDOT[ i-C>G4X@Gv%<Д7gMNZ 'q/W+L pi´btG#]%g3L懒S)2 LApi4bn1 9].vN[zŧ4>g3 Lq˯-o}k [p4 OеV[pVϦgi4l~6 ?wMû>k@Vsj[}o5ܷVsj[}o5ܷZzP ܷVsj[}o5ܷVsj[}.5]U  nXŪX5bPUCj(VmÂܰf;QP 1n<2C! 1?km!mrC YnȒ  ! 
YUӞWM!-F1I7LoN7o88/NaKi4|s9mîv#EkKi\*lvݰ+ 9y|P۔ߡ\ĸ- #q/< o7OÛqmi2y>< ZKx7|wn>޽A>PһYz7Kf,wn.sXz7KJݏ`zu>&⳧YN7w!P2vWWȝx^n>픑*ˑEɋSh4rwc;0?D G#ы*mu>;4|^DROso2$m ƀvcYj8Z6ظB PCEFhp |diϧJR,OQçS)zQEx5E OQçEkFwwhvb~~F ?QE|cZY܁֯F\R B!3R%n?p6I~pwaFOsin?X=@*Yvtvސ$YBUlvPqz8YI\ 0N*?ypw7?}rޠՂrzpzP9 8=TN*ӃArp lttCtyHU"m)?qEz s{,eUo}-a/ KwGȔ #HHHHHHHwGwG_uqwqwqwqwqwqwqwqwqwqw|&H{} A^mr!zp5ܕTzmv7M]7N /:zHo=F#~nh QkeOzRZw^s+@0CW?hXj!##}5osl.RjX]\=4s ?g&!˽wj(#,jʰCS٬}/L! R~( K9*ACC1P;_[Wfl8Oй2Y -*&uO'/$AZ~4|ԝc u:~C}DCd"8>_xP81 >qcg\AGx : u 6-:mR,ul8ƇuI,VOC^U7]oZn8cn8;cn8Q|ht )֞cq8FL|heC+F5A?9cl8clО;9v 6 6 6 6|2 / +H2ییییییییzo3}020 |.?h&>pGޗfM۽M>| ɇɇqL>1pL>1pL>L &d`-u1pL>WX 1q@|0ӧ33 1q@3 1ڑgrŠ0p?1p?pc8Ok]pf c☁8f Ok]pׄ0q B1pL>1 wM '܅ɇɇɇɇ5fsl]mmmmmmmm⳹ A|6w5fsl]k6w殶7qlj/7qiSCb,fb,fbaqؙbAcBXfL/ j320H=o¡hT|aaXt|xyzΝ0⣰CbXy8Vn0]iñp={7{7J+ 4J;6 c؀؀kii/"V SpL;VcXtn HHr.v5E!h֚h֚h֚h8g|01x  &fbk&fbk&fbk&fbk&6!C1=&>t=`g4%_Pgkcm=@?؝eto/ԋZwhݡuZwhݡuZwhݡu!xwu;;pEʛr˝񢡰*IQC;TP9nABivhڎsmǹE@*q6n7ێmwhۡmEkK{Ypq8vvupxQ"-[!v&vbU*q‹$"f e`8Beo^eo^eBʑ 7{7{7{7{7{7{O؛wul؛?].4U݃C]g6 {ϛ؛va`oS-:sZpNٶlNŶb۟Y(XV)U2uz3-s@v|q_s5Վj:rqq8gr>"ƨq}RbL]oPDAl1qF&aZ }M)cJ˸=`q}Z#b1*qJH.4U} MG!ΐ1]|CɦѣJ=D*٪B ? E_dhǾ>CIX]υ75juR5~]{ng%T\yH';2.WGAdɼY{l/Jf!e(P$CIB~ƽs^Pu\\k%4~JE)z}EJJEJMPud e'?{Ez!o]4Az ׶++1P|dg*;V" HN(;:T?T;ڙT3hT\%ii]RBBB?(uO\X9{O_|6;h M~5Q?d  s*3̌23h42R?(GI-Ǘ=KUO0ar\wo)Q [. =V ES:ESN:EXޯ*X7YuUdɮwXq/̉ފF(ZDHmM   6&XDh)jEsQythv B*MRP?x`>PzWqh*M~ۗ9ڤo/7@ RBZn%X2NEbƹqn` \T[~ \T l`֓XpƢ*׭=(H*pQ b4 ,FPbQ+pu@Mbkd۪-ߪMMnjDAL 9'*MrElt`x"lWZT:ݷN9r:/x36N7uo֯7)ݤToVQJ7uQXŖ𫬡43l~8(쎾2CE*019LLvb9LB!=dt=>͂jIMVXmh9WӗFI]E̋_7li2bQL&i."M,[W&1$BUI{L&i pE*]UPB? %V$\pgV?voWqz95f4ү5Y?k~W]:Tw5Jwm{ڨhBSoVP\8A_>_8<_hoP\Cm JPH-\^^hBej"͛THmBKR~PJk5USk^P-wkjX-w ݚڻ55-w 5mMjhjrQ*jhj ȇ}PiNSGYI=8ij}^wZGw( ͣyg̨gwwy;-f';Dh:ZCoeaxa^!SKHpPV6ݭսv%CTwYnMcZWւЎtdٯ%au<J ulu᤺e6~n;C!Ik55A;%ƦOC.:Nfy; aFe ?{whЍtA$1dwyFךSv7wR [ZRoX,4\xυ*T>/:s-6#kfb3bi:JiyE[|KKKG=4ߙE W p! 
X":[[Gx aB⹥T , ۵u~wU"Tev96pYKnUۯ~]l~^L؈]o#bKvP,osœ*a6lRuCѻ%N y[KkTv]c^tk>d ZdIC {/sABj!MJE惘Ž4w{Jvq)-Z}@:A*Js$o%jW|9>4s B ȐXٻ ߴ&u_?gkyTatqԱqԱٜ?*l\vl\vl\vl\vc=f%kSFvn/Az). /ޤcjBժ/RU_Bo|荶һJMj"8kkd_ފPVH-mA|4'ު>0/8i/@j56>D{e:m/Uw:wC.M駴!ۺw6Z{2UuvfW0h`]6{JQߦFb׫6՟V=irwɞ4޷+E7&[\.Z^_ta@! [oN"$ B;"U5nv/⍽AJ(Ǡr 1(G6J.2mPVΓ $`=L厞o{uϷ=ߺ\ fE)[ N;TSN;E o :5USN;TSlr(*ߩ|68c;UTSN;U]U]U]U]U]U]*Udv;\ec/m6nh64ujB5 &1A MlfnwFqQBڠSH  ԨB/=&,6zoORG0WT_?ڀ_tn|э/E7_L#zR;Ԯf_ۖhRh^ђ']5u,*Ll"hջulh8h86z&eq) '6:]軵H)nO%n{ϩn`tao[ώwv^q&T[ lmMna[% K֑Fø0n4 Fø0 nHiCE[ kMkݴMkݐցFQ8nސq:i˛i˛![l{C[{3emܨ#7[qF'7 nVr*}:ɗKQI$iNj_m~wϻ0j6n6_U }dhG~h~ M P坭mܽA¡+ύ'a4TJQCG9tlQQ(Q.?@z>4uìN7L1/ʠ(onNv}:aqFǡaBWqF}6Z?D;HnFJ^ mop19_ZZ)E`*Y :iz~fC~"\3~+:}o~| %Cխnz RP/'hBӿRZ6U7Mnz(Oz&2%_{G &Rgy/ ż~0VAkfa S 7ool~Fގr$.LAY?* R3>~*'!}7 ޒRwƹ(h2CN}jJl< >ʧEM&z4T[i&U]8~ &)t]ӳy.1pwdЎz'6)ɝ#FN;ί@ ^Bί&CG.ϯfEÜaps89ᜟfE#r_uyWMtsy+l*ΝOtp:}}~~ #fYd4:,N'K:VO`tz:X=]r^("󔏷+"q c@40jzQ\hw$59۴yeu&=3鋓~7u}8=S;t:CzǤw|ĤOLĤOL*3kʻj9iS;[G(T񯤔sM#gRϓzz]ktvή5::VjEMUiA|LFr~z"S$Lcw; 0MkcZ־Ǵ=xr>=i{L+cZ=Ǵ=y>_k{rx#xc7"GGVn 7jq{L-}"jz|PIע?U'CLR "@$5ucq?_/Bo*Bo*Bo*li΃-XM&pw TMTB 7$I)@TMTMTMTMTMTMQT¸ P:=qq}΁t@)4ycb6TF'`f -`fCotߗ A.bX:-I R-?% +.{{Bu '.||| A#uW(r;C[;߃pJp~OIbxhNhNhNjiփ8)9]}8)9_NoHdR9)9)9)9)9)9)ֻsg+ NZXTFU%Ϟ]/ *Z0պLRSVJ;.RFTK(ZĂh(>qւg-8k^! iKZT6EZ.Z ւ%-F"5f<<|F Ptz֯G%Zt!S ׫֞bOgAo z[:FHyѭKtQ1YP޺*[PbhA~KǨڪפE1X֌t57>fX͍Vszܨ 3 Jrc= n\ 1~/ݱB;C?RY:wɱK_RMތՈ(O\)Rp\.~ntViE =+9*% RO!w9RtMt+r}WeI6ڦ7 {{HN 1}轑.Gƒ <ƒ <s8:cα9G <sh`HD hկ{t7NDM|0?4K.9Aįo "UKHPF>?dJA>7|AD]D̔F)// "-_^4@bYM s|ch sbvbvbvbRu+(}Z㧋6(vMEZzryC Gԣ/wIZv- EQ.IdGk8EZFAiQmMjZr7Ϟ~-UbZ \8~ Vn %!µx`wdЎ~] '6hqht/НA]G餾_:D9%Q%%DC˅VRXB5p^\9A)TT8p@*P T8pL;JtG袬| {ST( ހ7 ho@{ހ7 ho@{ހ7 h9&'@%DC PEGMU. 
A4s)4sJ坨5ZF)fhԕiI\Еƚns-Th&䇪$Fj4R$ЙL^wXb4c[U2I&/R%L 2*ȺZTd]Ad]A3D]y}sWrW"#v?ИBUN;SNs=؇wq:S\\N>QoJM~?Sޅ]ޅ]-1D ̀0e0 ` *@ΐ>4 t@m`[aC>9C-V>h03$rDΐ9C"g`fl\af†Dpù`3kkaC"lH !6$†Dؐ\a\a9C"gH !3$rDΐ\R\R\!J\ +`Ᏸ!6|h+͇9ԇ>X#qH0N !6d  †laC![ؐ-lȒ0z4L Ybb lYB,![%K\;Oh?Kf6ڲlYB,![-K*Yb'KBzH3E˧Jx}"0q Dr- nA FY0ʂQ Q$bDl  ^>y5H <`{0H` Q$ \ǂ=$\DW r r |+2gH4F  `SSp⟫Os5S1f>L)V3EgjC\}Hs5S}fL)V3jXb5Sf]L2,ܬ89 ʷ̇je#9m6La Slb0ņ)6La Sl[[sDBp-ț$6oݷ Slb0ņ)6La Slb0ņ)6La Slb0ņ)_2U27[ͮfWmsAmb0ņ)6L)v3nb7SfL)v3nb7SfL)v3nb7SfL)v3n|h e|m5QC|xyy s[eVmPVv⹻Ku⹥R{;Qz{T_R /\ryxVm%ZтE~ސRTXЀ@&pw _KiUENtQეBQძCwG'z:LWTxaZ~=ڧh~IEwCY6` mlf0ۀ6`C!ڛ]t 6qB7ɯ8iINI9i4'ќ4FshNI9i4'ќ4FshNI9i4'ќ4FshNI9i4'ќ4FshNI9i4'V=wO;Rjd6kWwk.3Y&|5Ll^ZB1f-J/*Մf-ERሹ&5kQ7v7R7( f6d& R9MlfߥLb1ޥ f6٬C\l6aYׇH MxmkyڬESZ^Tb Mf]2*֙n^{H 7=?0bT 7f3Apn6fh\fIvۃCw? vB?K;=O`͚W].`7 v:W?qUK7fTSLa0!=|>ia o}VaJ2Xn`*'AbUZ7EFgҬ۔v}3~AzY~z΋Q2^|Hs*{/kį4Ao9ϑg3&h^;h|(J_NO5(EbA}e C q`x=;[/=OΖ Ơ7F]rQV-"hA_ jW颣F] TRW ZNx9zwAk ZSК#8"ꚓ\3-,EI4DIWbwpGW^7%!guLr+ k=cͥ{^}aGC}''`܃ ,zƺzƺzƊ\CF+.#u'݇G7o榻iv`C\T =mC6! )mHi苖)mHiPц6TYZohCE,UPц6*Pцv~~hTi mhPΌjDEx8]4!GM"etrWT>м 9jG!GM=ZSzЁe[rJcZ{4*ʐT~ʔ!gK91;1;1;1) :?!gKƔjCN.JKQ@{20CE'Rrt君H}hAbDU(U(U(G?U(U(U(U(>~,)ߞ,zB?}z?}zBBYv(zzgcE酂7.:%xcUUh M},C'OrG$6flMbIl6MbtÞmX%M0:9MkaLnjtSdT]<@_RSh(}F5:akZ$F5:aktX+~CiQ5:aktX+ى9Hia)uumCՔΦ0cS" Ø]]D~\S"n`Q(ءc:'5YA@ }.U4 dol &٤<YF?!Fΐf+& k;5;l H+8 8l .)ppxgwagwIo&pxg"+D%G$08 03%pk0øĵpkq&.qpgra2+8W6q& .qp8p9:8 ~ސ99Û9{n]x9gg@Jfy#}c8+fofoxCgG<1HH>`f0H 3A $ÎK. f&!1$`pI%sVR%\VBJ*%K. $`pI%󖐴u[. 
$\pI%|9BJ_#qIH:.YH T} #A1%\%ѳh.h.h.YH T@hZ I T}>ZB mtt#eJ.*UKrF!T%%i܇qc1BFFڠS1m,dG"@c'%jC%jk/I>4kSeI4h, څc7<@ ΆCb&ə1hjWK;!&A1Eݨ=]x\AM򢋪Z6Uˋ.r~n|re~Q*lmibf_P]~fp6͒ls)TZ~0Zsp!aԒ Oi~mbU.g8뇾I8uNanqX 3Lp?8aurhT g3!S.N9!bD|u`tyV1#N'987F{!EkNaq80tp?8tbc"b9t,v9tSn/J}PR<5CK,dYgZ)aKKX,G%,"KX,Y?%,Xbɪ)adՔXb % %,KJ %,Xr읰Xb %ܕpW23Lx*) #YI%씰SJ(aIO)ᚄk&k&`;` &`WJTM%kl&lfc&LLLLLL0,S̟b"r~|L(y`c1D 3q 3L$0/Sw]<@gOL/D2̟b>K!Qq_{H'f&hf"x8vf&]&hf"fO1h& )#ıD>3L3GO<2|Fm-ʪmJ"ži1L5.!DH3q2uEe>Ks1llll(DAz1<3[ 3[ 3[2[2[2KbW_C3+_!FZ0ǀ9́[[9;q򕄘c4sf􆧢9F3hc4sf19F3`Svf(Vզo'FW9Nb~i0h9J26%ʺjD<6'́l"#SADF6MddsC>^USVBzWUZ, Q{[ F Yʪ-r'*fZ2S!O'ǧ"#_ M$w ur:St3żT vL.kbҗcֹ.{fW1F4Tr6TCӜ]sja6| l66BM!NG'sC@[sҹMmj/?2s멗.JZoSsbc\p~m}moi]>͜}9u2=HQW>tCW>tsBzԇN}nt:;'6 S>SBvuj"MSR,هL1k88tC>{7Ķ~nG_J1N'A.JZ< JeYpPI\*?v{KD%Rܷ_!H#/qD;̈́9w 'đG^_q39I͏|#/qxz*?Rcp*18ze&YLj1Jj@~WSM? GaPW̚-$w l0ʹ7[|7H/9 >Ļ%Ph %b QC~I9(@fQqrYzɪCG*s_sït!q(Ci1*sGr*԰*0.| B#G@6b?ȅy} 5D٨ɆF&>4caFxdC,co8 zU}ƨ6__ohb#Ŷb㵡1".^7>Ļ5PL.)MuTc7T,{.L}O GT#l^78 ۼn+78}@?S?"&eu٦?mCU>>JRiҢFcMH~k5]&65^Bv=^t 5{SO?K R%`*1v+{:`6YmcjϦlxERhJEQdjh*|hՠUV Z5h^Gf7RbFfn7A$$2n7 C P3I$ P3mf a|7~訬y+a"j uDL/ڄ)U%4EʹnI `ƢVƢ\=g7*Qg7j&[.3)#ͤ0Fͤ)2$L R B0EͰ%5lI [R3) [RÖL-aKj&ő7fR<f2&7lIov/8U6 7pυ*yRzPۅWh1 zm+: | #v xv,+Km#Ƌ,P]5t&JgL<taH!t&ӵbϜu:bI!/&LWMKi3-uδԙ:3g 0N5gZLK][')- !MP 3Au&ՙ:Tg.%AWXzOP'Ύȃˮ"3W;GC yuj< j5}rHzq[JYm:6Mm:h7-z%oިSK04L--C.Q?D 4cjiZajiX\~\|#s04 ,-MRUG憁a`iXfYaViϬrR}h>X0|0~  a`iX~`ajiZu%a`iЌ?~=4?Zm?b. 
jc<5qp\8߅'4Gqp=Gqp=Gqp\Mmip͖ .zN_F.T8u4NSGh046SGі l%k̅yüa0oyڒ!-d5<ۂ+o7ۂ+o7N6m@˖\~m\7~ R[2ղ%`.~ `.~m<{R>\ Z0~m\ Z0גKVÇqkڒKVs- /x70%*[0 _=xZ\k5sf̵zw5Gb^ЂPB0jZ\k5sf̵V3WU_:twdЎFs̵V3jZ\[ aj̰6 ;miNvڰiqn=fzVج 6ͺ`.ج 6NݍSw8u`5ٚ73373̟sz87sbR0".\ vwO7 3jZ~4rt>a'cu\pԻyԻA|~//"JÎa氣!d"ڎ9l6\pHmFE(,vGNSi:S7] Էӛ+ %-57ѱj4ǧP'&<@ $Fq8n,L(mɮ*晆yaigZ2K<04|N[aijWJ2K0dL0dn%`ai$$3d-l&3dFHR 0 LӒ)aiIKƖmےQn04L1-ےQU$؋:CR^i CST 5չh(GbUcHDPMp>DC|\ol*&8 #&8_ebѬBYͅgh!ZP]s EFLBmhH^>M؛sPvd;nɮ瞙&q&Qn~-TMvGlM&q?]47>0-e} .z9IIIIGH"IscaXX9Vnʱr, +QcWU9vU]cWU9vU]cWU9VR۱r+)Jʱrl(6ʱrl(6ʱ)6>5%MԔħ$>ISSԔıyrlK'QcX:9N۱orGݎU׊)ącg?`@ok77Cx0j.p \8_kȌ!5_Ihnۥ qYA(Hܠ t#nэAF$wt#n;w;;7:7hZQ8jyq;jy<ш8G#8w4"pG#8w4"Fq(Ceq>SC tNBg?`jγXSyxZ C W"= kp\%n8yx9w9w?b8g"Q{8jG=?GRukDȵoq^8/w;ryq^8/w;ryq^8/w!ryq^8/w;ryq^8/w;rwyu9\p3az˽Ι0=V~5aBhUryq^8/w;ry ?_8pV[R07sy3|9Uk|\޳0o\.x6{n揹񊹼˛˛fs(WKR$LB!;bx &FBhuSJ!+~ 9'n RK{~+E)T=4Kb!}wuŢm`$eNPR99?t#i/|Z±?z5 [q:0-5WV]rdvY};ގc.ՅpTŸ}kB!wxl}>U8qbܱw,qbܟŸ Bjh#ێmNJq}QV7m7bw؝;vݹcw؝;1FAY(i57v :*qmPT wיJPL^/\VjW-K|x[ɞޯpAu6|yǟOܱw;ΉEܱwٰc'[gCKjΙ3*Ch{NjECGa:C~P_Br"--/_-.~TR| ,TgȎ)̀!;~\L=T{ÚS|eqYreqY,w\;.MqY,w\;.rqE"w\;W+rqE"w\ss!;a.=k?_B&TjRRyTx*C=t^;?*uc V@x w<;Ɉ8?tTZ^ި9{|8֞\hV}eR4R3ԊI(\(:THd? 
' &Q8Grsl$p|Tx2EG"AYLU8*3o &_X$ 33B2@4F dL#i2im?yB;_C`**Fێ[ R,t #~ZOKi뎟TCsZZ5Z@BUH T!2-~ZO˴3vk) wL0.-,~kءۃNv:e9*gR<tQ`?@MݞsVDa>m^ckJ%x@ <P(1ރ=ރ&gف?gفv FchCSCv!;xbJ b4&4'[5M @>8B@HSj@HS私,Ŕ$HIYLAIYLjBWBRHہ,Ŕ$@RHIYLAIY ) $e,@RHbJ HbBW*'ۿ&|[@{Sbڡ@xZ;FCR$1C(b5P-[t("'N$FCyxRѪ -TET@)Ƀԁlf(fl媭#Av~9vQnLrcrq,J) uaL0xU])PޘQG)S՝B"C,$&I}])L@_}]Ӿ_Ĩ` u0tmӇ\i JH,.I!@sxc 4w.@s` 4w.@sh] /0X`YcR4w $a̭] fQ]b^cΈ$/9#B@~[ Dw@tHt Ӈb?Z[jsƖEzhQ].-̅ra=gf7kJ3:jQGiGe;q Az^i'tljUO1@7"aNl%IC[i%.Q"CS(LhІڰ@NWsڰpڰa]Vyzch+iYW}"RoQZxq>ޢhQZ(-ffՅ7 0Ϡ44B-K]{Pх|4TӅY'ӣa?:t-!BqlແEq0῞<: Iuպpwda]l_wT{j5ZSЋ !Iw$;jxx6r_%~vwz75䧿vvpU5Iŏ EufC\VG~6!%I t3 Rw<:]kut`RqAgm6ZVv.}v?xZ6*v[t?Ho'0֡4X5vMN{SԻN2;M 4鯝MIƵ?Hes(ԓQ(얚?"ngܗ5M+'Zy+`<G#`03Xy+`社GSD?Uxn%*ׁu` `alŽOw)ʺ"Aie+.`V ǂc`X0p,8`ي;`N0p'`,[ek7>!Zjk r ;ʑִW+a0pEWz.R:w?tK8z=GρsW_`䍞tF4s%o7zċ$&&Cy.AncobURZ_ )*DU>G vQg)K: -z.ID Zj_Oh^~C~C~C~)bDi 4~b ,Oc1x|oB~.XOe_Cbc"[Cb*!x{ =oc1x{XLJWb?d2 C!" C!?d2 C!?d2 C!/  Eh1[2- |K%ߒod[2- 1ߒoi_k!hzZ=R8 UP>|bo1^rhz`[= =ȅh[ozdէZM23|φ@CI3[E0Tf 2Af3l |n;֍r%oFFYw_D,Xif3lMRmb)f AףIESFRj6Fk Fh`{0=vF݃'OvvkVwkVwvvF.;`f ?ݑ~a,BcW?k-Jc~l(5vFݻ Kǥqi{\=.t']qi{\=.vKKpDvKǥq h\:l%n 3800 8,˫zCXV|}1 0$#83/UB0 0Fg&h9a0 L0fa:q{t0``:Q t0cqX`:qX<ϳ܅9R6mŽҧ40`X6_v#Ƴ{pzt8=:NGӣi?Mi::2y,x\Pӂi4מyzG4̟~LyOK?Ni"d>D;xowo=f&0 ].LC Ѕi4ta~="$y+5sh)ͩ_k P,5s\X^Ļo*"?T}+W ~5b1ޥ$k"b4ҨrE -_4ArPA̓w R EQ:pV3 =&@D/? ~mpm gkR 6w<<@,p4|mL?Hxum\裡5lr`Zxv+j/5D, !ln\PWh}J[ mE1]C5-}Я Q, ޅiPb;(v@{;3 Rf@"۩OLk0^ /F hxa40^C354C^C354C^C354C^C54/^C5|QFݖZX,ҲUV"ygplT3DC #(o5k^C9Cn^0yajՆɫ W&6L^m35ILz fk0^̤`&}!5gWwzw$eSlT[k4Ϗ<|hPhMm'}E?]i]-є?GS~W_Mٯ=5єk4):>-(HOk5k6l0qUkJ?NzMIBCa\^I/XI/X\z"b1u9XHA\D,F펬ݑYS# 5~, R{a^p'? 
OӚQ`2 LF(0q"5 #-jk"ll=1YS;&kjda&܏){a!ZSׯ6?gsl6?gsdp#B;ա fMʃgk 4:#a`a@1t 00{l<8w i= fdAk("e= fe)PM)R&2L*&[BCQ6*7Yru[9Ϲ[ތL*v0`L>4鹒|(Grkua* rku!UjR:r``w'?ՅՃTz~Ա;s._*>syCY $,n ^z^paUrN\V._n ^ؾg>D˅LJjc"}t,,^?J_Q|?0[w M`L}೹lulu@Un0 `Wp忂4+tVLYX,lgVȗ 3 MYά#߅vf4,a0+a YX¬Fla B|0 K% 9]X,,a0+a IZ0I &if=o|0B&Teײk&c [ˇB.Lt6lb61JUcVH!ʥ~(B~|1_!IZ~5<0Tv_;},.zd zn=&i^4/ބn]jQ?lq&^Ai]1xC|<Z"Iւ0X,?ւiy ìca̱0Xp- [0 s\c--mqar0X\,L.b7 - Cbah0XE-moϼ*~[ۂߖ.K-Xm` Xba,0XK,%bZpϒe0KۏܳzpVjY@\M-eq5&̱9c+]t.VjXMb5]t.ZZVE+j(ڢ ]lbCtmbC@/ ڒ. ]lbC[@!_! qly+- ېɆL6d~!_H׆L6d! ABP/ ABP6d!d BFѯ ldC&OF fځfEU. lyB¯ lfC5b,M5f7즚TjvSnM5f7즚TjvSnM5f?9x\g6^u{Ճ. fyy;o%ty[nbMltK09Sul:i, _q,JCN$D];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QמPWB] u%ԕPWB]3>q'.ԅPQNԵuD];QNԵuD];QNԵuD];QNԵuD]z9eS69eS69eS69eS69eS69eS69eSKцvBi't vBi't vBi't vBi'` v}$Y, nb6Kp͒ ANo8jQpԆ6(B胐M GG1n[yk7oݼvn[yk7oݼvMX+uc6avSl{n7qWvt{n7+uWv|{n7MWv aƯA!֡O: JHdxD)s׭R{sP9N/" >vT]ȃA/\O(U+IyrH4T^~ }P78m4}Q jZy0tvŅ iV[>p| \? Oխ qw"yW ؉ {nNСS)% =*zPu U $Ηh]XJ8.)Wjٻ~`:gl~3p+;mc!峹|6ONh}6Ϧٴ>glZMi}6ϦϺo]MJ)S]τ)_j"Na/6"UG*y%h  ?.J.K@_Am Ie]\r9Aފ:6޴؅mZ\jj5vZ]ƮVcWjj5vںZv烃^@؅h UºZa]VXW+ juºZa] juºZa]VXW+ juº sk53E q5!&,Vj[MrIn5ɭ&̶nlm=:ӳm̮Dg&:3љL,fb1 3X`1Oap4X`1+(A[`1 3X;bazbkz$mNЪljiM m#b1:Ao lp5s 4kB&4kB&˺ZlWЬ ͚N)Y58zC&B&4kB;;IIDhF(܃4X̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚ŬY̚żXb.s\,b1t9tЙCbPu^ sHӷ/J*O$gsT&%ߓےmM$FdUMMR 6-4W Db7zwX?K:I] W| LG70x Ï ~ ՚:%%*Vpu`JT0`ta aN]WM޴M޴M޴M޴M޴M޴M޴M޴M޴M޴M޴MQQ1jJfx$D!X#`5^ܰ*V]hTvG#28 "#:" 28 `,y*B;#A pe.mл6rm#: p )l~d@$M$M$ [VCuw-YN^)!_qS .flflflf]lŖ[lŖ[lŖ[ly.hs mn mk867 67A!67!ˍMAI Qr1Mb/R6767671C>HsJѭ|^sewS?fmC^)8+Yz*!Ղ)WwE@+uyF?P촛vnCyh7qA<CYQ v_>5&BO]̷ BHg~ |5*N}G]C3ߨH gdճ978վSz*m_+z栯~[KTy b?d0r?R㠯rLRF]6HHHȜR A 6XNq9ˉXNJXNr"SΧD,KPb9<J'KBN-a-iҭla-^"Il-2<74Vcn ߱pٱpٱp?p\ݘsuc*)Mi4E2ׯHhLhL1䴻jR4Һ1XNc;+^Fme@ d@ HWqZrAp/kڤm xMUrE?Zz.7ACyxuG=wTqGvmG/KPek!, @;­2Vz*Uzwzahv:3邙.邙.邙.邙.邙.邙.邙.邙.邙.邙.邙.邙.邙.8+9+9+CghA΋/>KZH~ ,X_@7qyQoqKh? `NM9978.HhORSH0pJ4WjUDZ38@`ӻJ=yvRO!Pwdz4ߘ# :h3=i3m @6i#6p)'H·d,$c! 
QdT$شBi|$#HG2>񑌏DMJG^5}G2>ڏ}PldTH58@;Mh|$ti%54>uAqat͠ك"{P8NƯG /P-Ł{V VlAϖl9Ζc-iox#9F7rox !9F7rox#!HWH[9{_)HF7Do$z#HF7Dցېr[wf>kM NuM=Q3։&pݢC-_E/rHϭh -v\zwA[m}ǼH8( [qEQA5u9B)iڧ߿{vRI"WUrP?ͮͮ৻KzEԿE(a:%VG Y͢mmhmmVY6V'A@uP5Ƈj|-c5Q(RƓy[䥩J|֤z|YϪ!"REJ!MPJyAyAt9Y7=RJg~u?wXĹLPO,zVO,zb{buO='V';/Y)N-JEp?Z}m7!u^P`-`-h%N;M4N; 43;s3Kh9{Eά,#sNC)ݡwޝfuݻˀ(n`n`PH'bj _@_@3YSQb08>9{SL@ h20B)Dķ@I Wpy9hph7@fNQOiG/duGKz'Zz'ZzR35<_ID&+TY@=s,PHow3wwh'?OAҷój`60&TPEBezp]dO>깭,唇|RP- - - 4ҍ*u+M*fZ:@@P7Rd6A~]\JAl d Q6އT^~DZCJՔYbkz󩒾g,_g>K`6˅tpװ\F^YM8߯DՌ~|IFH R/H㢧]tN!xruvU.[@&hE#?=4) BVzEjm<)[ |Hi5*~U=B6"h M}eʋagDɴs-P^QyP֍ɟy.4H~O# dӿ5fcRhi*rBk#FH~PH~j$?hյ9L(ߧq{U/!!!!!!!'AQm!W%?=%$bO?k'!?]MZAO|BNxtY?Z m%UABR?l%Bs.|7o~-_ B~-_ B~-r  )OH=)ׇ"{CR+>(@z9WW!AAI=PPr=_{r$LJ{ Z/c! XHB*RT,b>(& X%}h&H* D-Yߒ *,URheo!x~MN^̧{Y_Ϳv[vMqF?4Dlч&(;ʼnƴgP j93dQ|[IH>ҦޡmLX6&,ct]63l5&,៱w}zj mNkgXp@N 9DTyj öAM.yH߫nxq<p>hG;=]Hk0kٲ[*Y}̖Ao~/b0[=0d0[peA}w&"fO 7AίNo-OdA?sqIj&J4/iSQҶ~U'-' x?<4$mKz!^=l:k5%L$_2Iddɱ]_}Ǯ0Ç N %ܩw;{uQJиN?\u=7n}0I?ܤn7釛9m](_;Gv~zi;Gv~чC^x=lcO=~q!{0~h΋':~!99pN99pN眬4@؛Ɂsr|9'C79M|#9H!X, uƂ |j~gWcVrM.Op3Z|[\M]+kkfTæTH~+bf+VCmIhL=XHE[P1?:n4f~[yN哽ҭ5ޯx. 
֩٩viG`*h?Tg5XjjH}HEpoy"\2W*_M#%4Zv[CM uψق٩)aVCo&QficJ(?u^s%hWiQoNrP촋P$SvET);"N;@UD(sVfgٳYz%h=ϞgE^zuW9(@ z}sC9(@XRU h y\9oT)RɓZ1l1l1l1l6ЇvO!crPVAdsCư!c1lnc61x ,_Ctm^}逸1H_>ioBy~ J iOFe2>|ż}娱ZXLtDHb^3JnMWFejt$3h̫!BK"CHMFjjvxP$莆ZCAkwFH#9{$g쑜=G[RX<yW(U:-S| UT2B.Z ՠb[/*b[缾\&~ڿxЧo."E X?@pVUDyj SoV_Xmj'OKY<]i޽hק4Tn]Dާ4ItE-/5 9>չ4EqW#VGX*X갪aUêVo\V:D dXC]>AG:zу׋[VG:zC*bgECTDyFyNZޠSxPRr~w%-Q^5AG:}rѣ;ztG=Gwѣ;ztG躇ꀧݬrtm7ԩ\U|JHύjQ_S!.z_i%-S~ڥZ 3i,+2r_k ca~2(ٚ0tCJ *h+5Yd]lH}Y>EELKOvrQvhcKbKEKmQC64&@5m,Y,---~Cp:eu˚=;e:e:e- B],T*ʒixOPJyAyAj%T_dYl']l\l\Q~>~H<~9TŞυ6lojoͅjgIm6G.6G.q,"}"m\}ibKAj[l\l\8OڍEv,mTz?T rsX .#wA5 e@#XpB;.m/@Ut]Xر\ֱ\+`<a (>A+^.G()9ɱ~}ƅl|-HM`2aVd><~w\=f!$"cDYC.N"BŒ6 @S,eKǴǴTS!R*wlmZ}LHaFQFtmZq\9AN|{_ةA[ew.TAJoAKyKusrrur;(8W^ɵqSW!=S?OR96޽Ϲ[gggp }eRr=?.鏶gCsa hq?ZſG~wZ"9 մHzs@"'9Q?-8!S!SV+r(.~AJ839LN_ ۡvh`my}[^!پ~v!@%).ߖ+P Tr\>e ;JNAER,M,%*9JF*lb'd YT9Ow@%J&p@%+$'<_xR +$'<_xR=QqI:3ѧlbǙ,9D'd Y) OrNx/f|j?۟W>{pz_eҘZe:c-n!v [Bb-n!v [G_mHK e^d Jq.j%חL\_Z{%֗@Pu{>i+qOz'/L0S,3Wj[-cAەvejkW+W˘LfY)5[:;͜fnܝz֯,&?m(em;E5؝PRϕ;s|g98Ҹ,<ߙBRjh~/Yƭ:̢!­%hn4o7[KBѼhn- L9]ָK$typilC*0%G(h7r-r-}fC FmFmJZAwYtmz*6K4#aDkuҪS+Q4qqqB[hPPw @p^^NJl/G[s.P㪽ަ㕽bӯ}5T=\iQE|_=x~"}oSzU9u닳Ee]zACh]_tU@(@z}pf7Ưa;uÏԲ3;\TvTVǡS3́ IҶ߉zhp͕ ) "rli-NZ ii5֏C/Ku3/4껼Ζ<`_C=>|çpRb즳T}ᴆT x~uGC3f`*윻,5hHa&# 7j Yj?x; ~֡S8I w,J3FAE>ĻL*ZEQu)7R. Rɥ]dse O*]O*]O"CO'.~'.~|B?PÑ?& M%]`1GǏ=:?:?:?:?:?:?:?\シmwb(_MS⭤zC&r^h6Pu9a1XpЏXpU<YA޺ $mc+mԝw]׋ t6`A1@Dj4ޖjsЦMk8ZihXz! RBoj&fL(Uz{c?Tw^*՝FR2V|AUS|E =VcY!њAkqAI&YF;68.؁Ff_Fd5 YuY5 Ywu׹NPi(xάtp W5넫BTJpu6Qjvׄ|Y5+0si9 >-kU$8:U3$8:O%zQB[ȟ#?LBjMC:pЧT_  HMJ'O^#~AI m{7Tb߯uA% /~-~}n$9cO.H50TS{t GiYG]oΫEsΊsgǃV:ݔyE0zz8^&?f}RrJ+O)ʔ(7yjs(Vi ՈCo~ 9F+{9"@&H9|H-AŶGO^wi5(.t¹ mzx . ~(@;.BCys{"3?*?] 4[Vu:_9  -.`N 7t-lMa&Ё0ؕRxwQ'm\$:9!B ȜڍTn$:9! :F74 N.)MKJsKJ3 WJ9 ^Ȅ\N{"R&$4j"~X yPM-'Ffr? 
=y`S #G}5>A6n|ep=\*J 7/cгcs;\ #̥A2&GSj/"Gr8\jEQ^ $iڃqy"rNM(hIrkK˭tO.&vx\LK$w$h5qRgrE R2 Z:R 8t"JM@Wj?ⅬPX?"d }ǙM* Pҷ%R)eӒb?4cv<7y.jJ$J;(,Mfnm0m5-Np6%!Dz#DFGWD!7Bj[Mtn5љr]&3)Vn{$gSN lH89% Rl4HQ+Q+j2{gzK]i_ }MqHy+.R^lWHھھ>OҰM7%G)#MҰ97MiHMSG<Ӡ%}%FF~0O/@x0 0 0 9š) k`m X`= k@#N厱"k"0f;fC[@'A[meZp9Ket-TZfIr)o9J+7vHx<ԥYK/Rj|6e(̒,?&ˏcrtrxE @"_|:eʃ>=W&2kZ~&f9+%گa]m*o k$mx,&m=Wgh1#CZhhD;s%%4B-4B[2 Y;ge/!]w>y=s޸Q(u0sRf D)SUk&r!'GkPA0S^T?+fb[mƷٻ Mw^^A Z9Jr9i/J^(yQQS##5"oddGM&GĈKLL{;`W HC ]0.R "GY"{үn-Xݗw{(np/~6aI;{kǩj1jďlх uKI$HR[I+ͷ%wSR#مm5TRp %Fݯ/*?n'Ix#i~ gCc2.j e &k+.=tt7ɖ_Cf-Fg(\W\䙜M\ڙHU:/"TMbdr[lݿjOSmYz9E .oEj?ބlhLƯ̏6yJ6'mO6'M&dHW'7nOnsci<{K2O<^ԓ)>GqOLwn}鞬w}YyFj %1P[N}SnV7)dk*C} Cb{%>Ry(S˟~U~E-c C綾R3 f 0ĕlvSdТPl& B%MAOrnD;'oZadeP+dbk?~EyL{"D ;u*xvذ^lfkAoeyEZ*U:s*6ljɱF6WZ"oU6r2^֟ͫ9N: skQ|TSIBiuZ&. RyhQ} {0mAX WakZ~v`_(rB W7wnEK4w/i^܋]yn_7E|Ӆ]ɊowbHXbYϔRd7VPRI 2ܻbդ8ڦ}MRMkpzY=ܬnVOm7t&6tjn`Y۽6kmNρ{An rv/^۽ ؈۽ {An*UݫpZ%Z%Z%Z%ŋ.^tI#Ȏ ;t!]HtСC%9Jlsvϡ>Ci ZU E|ś8_93`Ao*ooooooΉ:_3#x0]4 z{ x.เ@eM<\s<x.9Nۜ8v -`vhvfhvfhvfhAfhvfhfhhhvhIhIhIԬ 䝒Ԕ%l갖́fb7.-r,JSÛ%_Noiz͂A/Vx`Y9k r'%/r$乤M%LͤJd6f3i6fg6}fgfgsIől̖$ |9_.ܼsy0RߤE/{I|{K$Yw)q#v 3"(V`kcc}1>X{~׾`TVWE5r2DP g}e!)3S.۞rSWp퇶B/R nqi7]3F*3%}ɰypbedoce<@<_;NIdNMu}$t":@`?XiT:F!<|? .tUf9kO'>ayq:2RxQdh‹ meGIwDz!έn聚&~t0{XK烩 kjoxD>8~N9cN 5j j5GQ3| 4{@4{-\Khi-% e-UW_UUW<ڪ/⽣ua-,U<J3yK^6۪R =r.I\u%EuQ7נPF _Dɢ[Xt nU1g{!Fy]MV[uCj񨱨=}B {!;d .^]EKߖHɦMKj_؊9sn # jql\=g6)I`|>ᒗ<0j[cyثwc_'2Z]̮VfW+Y`":љ43|@LƥAx`A )`A )`V23ߺW}%ʆaiuC+3>T?VPVC)1C)1ҴgO}(0BNPY _6V ^T=FD~B>X 7DhE:H1š*Z1=g77J{ۅx'p6YzI}ݤߪnwޝ[6#dE]F. ^k}AYU`kKC]5V6~]?D 6*O#6~Ο~^}T1Osɯ{ZK/~cdKϽ1.h4#:">~Qm^~qu:VX9bglܶXA+MO~oqnA)fPV0vov-0zղOPk=vݣk3~y}=fc[A Ƌqa>BgE!+44]\iCt粒izۛ>hS]t,󩚾C\i_E}{.JP >|H_vx>.jvx>s >|BB5K}?R^-QJSkC,jz05 WEQV|NgK=pQTZ>_E4u UʻkT~23 9%'4@_w 疾r9C%lixUv-=4Aʑ}<+$e:'o$(oO(uBJPR': NE|S_h6H%҅&;މΦ+Z~vφtG{+|U-E}}oۃ 56=[g3|=)a"wTamMd_/0RB*xRf? 
UǂE)P|Qo-MM5,&%AEKN\ߺI!tt ؇ eCbZr_d&ń|_ZHSbZL]k),cI[ZRV4{_z&@/ZJ,b,&ՓɰݿJڷX>_-%6vi<_} [ WwŒHbl-No5q8VjXMc5q&ı8&XW`3i"#E ,,}sӌlU颚7= r r,r,l((Ų\d ޑX -vkhlbew6WV&5lnlnlnlnlnlnlnƨI׷W5[~5glHa׭uyz^׭uyz^kU{ 6zu#zZT#3B?#UW^zUUW^jUB#5BB>tt+m_kA_#~wluTyfFrѸ.2_]ѹH9&uȜH.cccsbf,OW;\{*p"%Hmԯ. J*H_ bs}b}ŽУ8;ҧ:'uN[ U88BL\zCq]CqxYQe#:(~6!&ϛEMvlv:#RM|w9 &>o&>ǵ́ys7zs`(@o@o@o"ؿ# blbŀW^ x%W^ x%PB˾wE2!'k(k(k(k(k(BqH%WED޷R![)[)[)[)[)[)B1 m!(h%H_Tq_(Z mw syC5L"))))~ 지 ,hK*ڒ&hK*ڒK*ڒ-ǒ-(KJ#O(?þ)xwfC[@'++++++RDRRDSO֖hm֖hm֖hm֖hm֖hm֖hm֖hm֖hm֖hm֖k® &우k® &hm֖hmm\_jA΋/W&\pe\X4W^ɂDT S%L g%d=(]2%]2%]2%]2%]2%]2%]2%]2%]2%]2%?D||J<"Z\Q^& 4H '/4!JcPAIS~n' mM7c((( /ԿJ~aܴN~mNJh[@hYB~ F.,;4AgkYɻC@u}O,u;ߜ?GdѐҚS߁Ybxzů,H"=YfO3d *vdy4)L#{ ͞B)4{ ͞B)4{ ͞Btk ~ =\Rjv26fLTj3Qm&D6fژ3`c;QBb^8AThJەm^H Uz W;dMl}gWjbh{4l(BMccl쏍?_Z\R^5d;Wmȋ,}UH9Sg/FPy"oE}+fclb6VƊX1+fclfw5bޡ[uzr0FV :9i3}p}gM LLn2o4>gc]5LnFPSfJLiC;`C1m 횏q -3Ty7D| $N#«۝ d(vO 7{r=9vO'ݓa{r=9vO'pMίZ:ǎ;M`WJz}1`IJ{CCznIyc҅ne+ޱy!uk7Ń xO}pTO3ƯG #/Bom|yETI래ӂs |0-m:X?=)u4|AZ_AV~ m!HuemY&H;}a<ԃ 75MD,Qi h9@ڡ _7Lp5NDS=/B"!5g?IMꐛMްC3߻e;c9h.J4}S;AsTy #)RH9J68Wmp\js窍OA3>9h'9h?yE堹H5(9(B堹OR4V{^[f|r4 A[[f|r\D)|ye'E+zn-s[T^>{|N/*^J|^*GI{$_Hj-/Nz.!'wZmjsw9h6:: d~Qڗe~姿!w򻌏ZClP-J0U~_W|-ZR?*^|Gɿ EAZsu>$:U`? ɩrl( @i~7?5Ho~қ!V uЀIL9P(Q)Jo0 1?`c0 MFә?VsV0y7lalao *,oACU-om0.r@~9W;d%b--9NѴ6-9X-Hu)oED) yK. P.sIy:#hկ#oˆҔ|$F<76!`Cx:^5=Hz>T>J!!Ezbf+2#3ԾI`Etcd$/#c12#%?>5ՇrS{O1HE9"6Dk 'OL z9ܭ@QmU)G% f!$\"2>GShjM}4>GShjM[:6sLY/NS/PČ1S/2WyuAUi31m&Ĵz~V)k2w&ܙ;sgb;]HCФΚ&Sd La?:8ttLL h2GML =hF/3zKLĶS郞wԘp,PQrmSGIS)1 σ41&@Hib M y $!@ %H-x"Kvk;IƓJZ(i{iĆAdLib2ML4uAKa31l& . f43۰mw>d#iLx!2{ᦾM2Bo A^1 em~@ /vhh2|FyWHwEҲjl< |J=h2ϭ#`vkrM~kz. 쿺^bZD^pޠaMh|x\܊@Ų,9ٚ97 kNdkNdkND kNdkN삢_, p kB*XB!{ 9(@z(r ~$o^Wc?;wZ JvМ$L$ CGC+:8Vtoc#pppBDž{G~8$t'}&1. B|[҃{747tTjj]o"@:.{NK&.o}[\)[s\5Wo[s\[s\5Wo[s\ iޚꭹzkޚ՜G0 ǿկYo%t`S`+A!Ķ ?T`0e----#L` S0 `0C(B:NTP0! 
`C  4K P0Ξ}AzTF*%zDd54 MʰwGG|EwNJ7  ):n̝{p@X4@HfI쓼HZdpvN 쎸h9qi55"^:~kDXyy/&o {1{1>P`'C1Z d&Dqꚴ4h6ܞVH~[4xޜ DG2:r8{fzo'!acWx6>MBCH͒]N9(3GGWxdzuM  UGZ>ȳZaᕳ!Zn;#"n}}קnN=/M>srF흓;;'7uN?uO5Sw`rO U)8&T{ۧ˭6? caZMyT5gUzfqwbRG|Wʓybd7@S^~P /*)c(@zԃ s I\\_·;9D#}eDIjܤm@_~fAs՗wAK56}vc;y{Yl~EgGZfV*|?%ζZN5b:t7a ֩􆼭n|:.IGKx?ͷ͏C!sÕ|.L [ֻnRn>NJƠЭ /F-9fCk a6p4]iRJ(=FS;˺7zk^.r9/R3 tg/^%p7A BՌ?٥P15]Mt5&dwz[)(@L3s91&&ĄijyM*y.L 11!&&ĄbbBLL 11!&&ĄbbBLL 11!&&h=-bbVL̊flbY1۬mV6+f͊flbY1۬p9۬mVgV1h U1ۀm@6 f lb1ۀm@&lb1ۀm@6 f lb1Lzp53E)}6Ϧ<>g3lƞMӳiz6MϦMG8pTLMpgLpg#Tgw{11e(`0l6 P) °) D/h D MaMa~x_?BS`````m?X`m?X`m?XqRk~k~Uk~k~k~y~k~k~k6fck6fck6fck6fck ]k&[kfXkfXkZU{ZABi*uyUZS5UZSUE.tQ!] bH!taH!<VC: 0@0ÐCUaH!taH!Od_.taH!<1H 0Ð&-~u!:hӺH+taHof~_Rg0:]av&Ko&KoNz@O)کW+{>E;HE;QE;HykLNʙ6,;QW߂hzOƘ3cvc\FPPJPJ7Qn& TDeϫBChuE kE֐WyITՐgx=#LH4l>[ϧ+Fgk~oI=Hեgk}>^V_C/Ql>r_os_/@C>X9>9>q|Z WspSB yۯ0flflflflf]l k+[@ko7a WV>nkw-26@}Q ͍))%S`c lL}MznR7FF!`clQ1 6F((FU 6V***l}U 6V*XZu uU 6V*XZwRJu:g;`tܜ6/=h"oH<\ԮA >AƼwjB["Cci>ҋesl<(ߚi]3 v; uƵZKɻڕ݂ .#u~D*Rhk2U\pX<04UUHN.RKo9Ԛzك!<Ćl?cEp6\4wmAUa" 7BTrS\]_MyRt˃p)=`_ͼKקNN' YRzagK':"¾Z'*D[ή_~ ڒz9:(mCu~Ea.җsxjAɯ䪃x[l}eXt@jMHdΩ/%F_сT+Fյh?wG_m%>h)@r4  h@ 9@r4i+G롒 hf h G*;$Q:G,VXS_/2JD`p6Uαxnɯ[PgI_4$E%kJ "Mk"XD&5!`MWZ5&5`+X{$XW8YKj/D`ץ霋 }+NB "y>Et$-h6-MYYKs@s.BHz mcm,c餑WH͂$_BNv:h-PVWેieR{`"IaJ0Z_*3Ƈ1>;(f-)L8rٓs&'Iz(ؠ`N<(ɱIj!ѭ]B-$"WTaE_]pP diz *Z<5ӸmjRc;i4>' Oj-R!U~[[AU'i%R4P MHiHi|4_\|Ha[+tAW Q(δEbA|j/PVYzC$1묷|2PT)3ުzFA[(TZY,,j 53XY,z,3]Pn$ h%a@\z S6wKW|j*y[ ` foemm~Y[meC݅cw][M-ktڵ~εut:0j\[Rk(ŵuyW0g ~}?w)ںzC\…p[8So[[&J`mBVVּ 5U.zn)-el햲?0g''\kztsИ cccfD)F)ίx@#˧h&뾳wJҶG /⽦,< N NP; D#:|P^AT^0{A*:|JPӦuaeڏӦ'luԃְ}kV}q]Bht }tlE B[V8qqqqqqqqqqqqqqqqqqqqqqqqq}{Ay{{{{{{{{{{{{{{{ӵ ft}pĻozqdRs*sq77wJ8#TD^tq=~Z5s|as|_3q텋WX@=,@$5|M"_$5|M"C$kLwR0irqqqqqq `!W;q*и͆T@RlH4Q3nO3nO3nO3nO3nO3nOl2=TMK!Ҹ ͸͆K33R.mH48383838383838lU9z;B\jPԣA=zPԣA=zPԣ,%XG˘s[*Id>';NJd s[wEҶҞK8:&A&٤>g3l&NbL>'kB% sV]f:,–zbgdz:˯ɯEiK]ֺt~]]BST^B[r ]n-t.DbYL"Id1,&$Db [r ]n! 
?.l5$EPT_&-)_W/d,d!uղZY= V`nzDGjfX="V#bnzpf_ngCezEQjE_![+ 3U_=t6pdWF_j뾲+뾲+:0FʋwVhn6V\=˸gĸg=#U̐qψqψqψ9k\5ٻq!iE8b8bVhEVhEV 78U|aec6n_D~ }4Gsq49mr$&޺n⭛x&޺nmMoI$KA΁V̂` BR\ K!p)P0 P0 P0 P0 P0 P0 P0 P0 P0 P0  HkGkGe#e##11WW,= FO09]12/ zP8:GG#pt8:GG#pt8:GG coOWHVHZ!& w ,-T6Tϋ IB7x^{7u fe/-tϑqIKZצ~4:Lvu*|0loޜ8p84Zx!ÏGRXsҋj݃`A[nkO".{J8*36=Mh36=g \K5/fc)J2N8!ㄌ2N8!ㄌ2N8!w7Sj^ކw^H[4T^8JqVW777Xz̓L!?VJX+c%~ďޝ;R͋(/r%\zMF{7鹺ލ<6H-\|a7,%ްxoX Ka7gCЃCe X e+yS ._9`H2M{Cօ֨b1WZ*9!wQ vǯJT[)xr>2Zxҧ-AU&l&l&l&l&l&lІ3'?TDu"޼ћ7ngٸy6ngٸy6ngz_oѴ7FhM{Crߐ7! o}CrߐFhڄA"F$b؆c7+"F$b؆1Dl\4νqHmٍ=Jϵv녞ݻum+-+m-x_Mۺ 1RH&s[Ni֨[ )-r1/hȎp6\|ŠP"w+Jn%r[*57ci\3?T89ېB(TBmнȎwo0_׋:M=頧"TVծT>U^E[m=E]--B="x͜oUTĦ)5sF<8RB{~ĭ~(2-¥ M§ M \L xTG NI?$;L:6{-.xk/%F=TVABeShB(g͹g=kr+8 '?]Ppnasnasnasnaw ,L5H9Kzm|L(AqmsmsmsmsmsmsmsmsmsmOSEө"N܈ܔƯƯBssەhh a?#p-|]._sC oD.\r{]]X| ٷ אW zMG<+Fp5|f-zxG`}&U1w[[ݜ[6*EnzЧRt77777:]Ĺn 醴\.醴znu^kmP7979fv W-871#nwo6A <;{"(yQEFMFw[݄^\Wu ΥoΥo.}'ߍpB&ιι rnsnsnzouA7ZTٵ*샊t܃H0ґ#~}8WVڅ<];9.a/&ѓIn$7z=ɍG"^8BL}#nGC q8zBA Rcn=7Gύsq8zn=7GύƩ9^&.(rP hS'fdw|왘=gbL،PBg@xѬy.mԴVi|jߞsO8797aNh jZAYN@<7w{;P?t.w{7Zl~j2`m6Xf, lJMy le2`͹k+d6Xf, le2`Rө, le2`m6Xf, \>gsl.峹|6&٬=Ypw'B|mNuRnEj<hHijx6fULKL@u=/wK.w݃jb.wKߥI'"WL9sZ [pm{n:yn&h  ڶAw'/8yɄ^į_M-$N^psR^M?nԨktf?;s5M rEʫIy5)&E0wٯOW/}p5|Hy=x8W/_Cٮu5ՔRWSjJ]M)u5ՔRWSjJ]M)u5ՔRWSjJ]M)u5ՔRWSZ9~LjVGWZ]cGej4zo V]}gUݪcV=:A-Tócxv ώ1<;gócxv ώݸ5(f jE^{mkmNkȋcxq /18^Ëcxq /18^Ëcxq /18ǘwִBsVkE078pŵu]eFz3c:3 й17pQcvX;j5kG+Qc5֎kGQc=?Y;j5֎kGQcvX9֎Og`\XkGQcvX;jg"z&g"z&vXOJ֎Qc=UYOUSzֲg-Ygc=YO`f=YO`f=YO`e½vJhuQ֯2>]$.Oy͸6H4UuQ ۺ@Qa[}cR ۺ4Ra[x#~}CF#wmF#gh r9ۋE"g{^;N.׶\mA R m -_U+ҙZҦ21].h qM9zȐt*FQpb\."=ε"jV4ɹ1\hΆ_iӍD(LͽW 4^𞷼 /o Hޓۘrм7w{؃TkPم2^zv`D'~&_4$w'’ْҨ} DTfo7 s9{Üa0go7 7, %n\s";jAC5=^|*m`w;v]+TڃpQJY9-"%%W޶yVw6M;h 4%=s4%vxh灇vxh灇vxh灇vxhvB`'vBmN{S齒sg}7T&{pe54 MpOBof½[oH%G[:=$~`I$CyUI/څJ?N'O ~Ҋm'iPNAoIMՠm݌OX|VVVcpjpjpjpjpjpjpjpjpjpjpjpjpjpjpjpjpj|uVK.cEf?AD[zc}ںyT4>m<(^k>O@)]JS ŧYb93SAoKEj.Rj~]3V53QCkATd45]8Nt!A)aFm,*92]v~~iC1`b0e CL '9 ?r>w%@9t48t48t4ϡ1x؃mxJ:h7Ir>_-@Ήgh+|\ 
J·494@^h]Q{piH{pһcp{D)ٝ @Ndw"ٝDv';݉Ndw"bU;݉Ndw"ٝ.9T\| }SO䃆+)&82>(ݩ:M-.)H1]$ųT}A* ϖR<v?P|ѥq^8A%TBBX ^ ^~iH`Jҿ0 `1`a,F d4Е6M:4?1hCkX X`V jZ_-ky_g> sWJPv7D]UZSdEQ-BQ9A d=[δe <<_t#X*LBB_TwFN}qo|I djŀ|ƥzata jE^+zSֽiݛֽiݛֽiݛ֝ gC(wW{zG) Y9c6 g{6 g{6 g{6 B9!V4Dqtʔrjf@惸+M:p:pG`zqwg%H_!}7w}7wOP o ѵJr7DM?T.@Ch<鶁n趁n趁nѺmnFѺmnFѺmnFѺmnFѺmnFѺmnFѺmnFFѺmnFѺmnF$M$M$M$|tqMlMlfl*\ !-\hފhox+[)JVJx+᭄Jx++)x+JD[ o'; (A V[ o%uoxG\V[ o%E[ o%V[ o%uzHsx+lJ6> o I Wh+.Ll}vE;c?0ZGX[ټ[ټ[z?WiQ YeU6YeU6YeU6YeU6YeU6YeU6YeU6YeU6YeU6YeU6YeU6YeU6YeU6Ye.eroim[EZ[! imHkCUR.Ul)[{_ۻ^v;\m6TƺXzc]ou7ƺXD];o3M6Ʀ>R.򶾲^9?dIo,%7ƒXKzcIo,%7ƒXKzcIo,bcS;)B[dS͍6^4zFVjXͻVnyռjm5ﶚw[ͻVnyռjm5ﶚwsnC.N.Vnyռj9x7wsny7ﶚw3ny7I&Vn׻z7_ﶚwSnMݻ{7uԽwSnM>fw2V)oS(wI=Hew_\H9JLOOOOϟ $0RL'wEJ6:Exf]į(e ryRryڪˋH35})\2R.g~`v WCo+. ?vŋ9Q?0ZGסaWg뚍2'r` yqAq?W,Ɓ”jЅ۬(-]h`heM يf[Lo}O[xKo}o}o}2ғғϻJ^RM $n2?x냷><[))פR[TT[[T,,e? nlp5[Fְ-j&>I28⭯yk歯y(~T+0xAV_d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MVjJ~!u^ Հ5E (j@Q:H"㭼>(A[j@Vո' PQJRV)"Y j@V;!Ҍ4׷N/DhVj4[~v S?a4_ Vj4[fKHJmb 6ZGXl5Fh a>2MFh4&d4Fh2MFh4&d4Fh2MFh4&d4Fh2MFh4&d4f|κo"YκI9EJ:w9'\4 E^hS M2 )EUwLHS• ӧPEEw+A_+Q'>$!~ƷV7 \ (EzSS?Ew-PPI\t-J^<(yQ} r9(\t9uFDNSgD/x9u\N S q9uB\NrES.r}?݃}jHr]&acr9r]]s=؅:V. ]$8c>8c>8Fc4x Fm4x 6??h6oh6h6oh6o]oh6oh6oh6?] 6+F7 F?-A[%)p>3iT4& SIԤaj05iSzJTO)Q=%D|3u*ByJ(OI♒[3Hԫ|3|3%oSwJNUwA. uqir+d#2]PE(tyi/ E/!OE"I9HA/ EA(ׅ ^ċE񻸯(~frxQ/ b^%yi^ċ4~|s*@0-ku2bC:w\RIcQia ,9*w)zczcHnW\?420⫶m VzƇヹywՈ:};͡4 g"oN+t;>|9HKȻ(BWƇR[*0>#]O@`)Cc&N xVPNqIaH%X1{y@Ŕ/{z/w=q)s;}E8=}E{q5FXqNE鹓 ==>/VOo8IGpxr]?/I W_ڤUU7\3~ O9hK4|%>f%Kdi,y yQ0,ݬ^=zSx [lLNb*O'MF[+cxnr74vAUP:;5YC5Y򱺤_'&`Bz":H5}^=x{CnQWw GxO8͂)C"USc=$*BWL7FLoٓzq5>CfB ~]OM#Kd:>3D`e'*nDasn䊿^_HNm GJb“ݝSv+h" _<_946i+iRթ :<"o_EB)}{=qnA}{t 4Q$Pt(ۥ3 90^ S+Uxfr ŁCwʏ_!8 o9;3^e2߬yO3Wf_]@'lr`j4^#lM(:܄.-P[Q64;>Qÿ4\Pe\+v_\v;tkC^z+q>`g$k>`q>`.Fb)FbuQ5Q[[[QCF`Q| <E3\?sE3\?sťUT [;ݹd[ Cy4ѡg.˕C R8짐`F{\5YE#IDlSmjMM65٦&qfMM65٦&d>SW65٦&dlSmjM"FR.Ρj~׉g0. 
VNǹDB!B!1b0fVF[oeă:a\s+~?#͘C`p!7c9ZL<0sP/LfL"GcY8lt bI^װђ)D͘Dduk:{e6TxMeeSg'3q#)}b|n:0_$!=ST`pj08 S wj Lh̥@X!4h48a5hӥBc{318p>< g3AǠY<t@ bec]ӫLBv֖AOL%CJl5"j<46-nCK|B.6G1DN!Vog6'[XCE)Et!yT6B+'dl">5&B6Vcmjɢ&a$zpEovq W]pP#3<MEq(^Qx+KH:U6m# R.w-vꒁKEv_{ w&iji⍆(wCIuU%\b*: yO #PCÕ3ޥ_' ϖ[ޥN2#'ηAf|]8U*wF5 nypNZ;ne1opBXM8n>i~-Ţ]!r;^Nwׅ;]]E]Z^68]4t8EyvFcqbP\61;rVs!oXt8rr&;Ytѳ8wSAe-,sܹcIeɝ;Tܹ/6#Ѓ!#Cwp4@ےz[RoKm2tKnI-i%M4ݒ[tKnI-i%M4.M ?0맒"3]r/maP)Ȧ8s+ky6`k`ي-&e4q'_lGc̡$x\F Cy1k6_~i#ĆYm#f [Y֬akְ5kؚ5lf [Y֬akְ5kؚ5lf [YÖ,ݚ5lf [Y֬akְ5kؚ5lf [vkְ5kؚ5lf [Y֬akְ5kؚ5l-%D["xKo-a%l햄<.~vKn1u1' S:ʜATn-Q%*De@TDe@Te%54YJȠ Š\ Q\ Š\ UbЯ*(~UA ^H C !2(! #(! #(!FPB=So JȠu%dPB%dLa%dLᡉ@Bn2MB6DdHDDd aR0X򎐴 Iː Iː<: Iː Iː-|?,B6dZQ \HcO.P{B[@4 T’ zYIem̓+i`2ݮKSYUz '*kR0pl'\i?fA5_;b^ $dwnMIvݕ{5arn4%d\&MT`M@7NtJ<&$wnM»Ix7 &KxYa!;fRCwHa!;ex W[Ru BÈ tN:':@$h %\#7%: sιBsc&0m6tNz]Er)P/6tN: aϘH|b]T>PTtN*:'Cax/ӋEsz98TEm`9htN4:'ΉFDs9htN41W%{]crxM9:S)G -sq?Ś}t>:g]ʏ}k9震;yH:9!qNHULbʃݙ}9!qNH9!qNH:"mP|9qG|9L:Q#8$ğf 3+=\s9%`hA\̵皃EA\s׾= eĥ\3:ЩХNyҭ(`*uKWtKWtKWyN9&tfsQ(1@eRf.eRf kRq.Rq.Rq.Rq K۹K۹K۹K۹K۹K۹9Xb?v*97꺨KJ,F*QR1pxk8wDD(;$FQ糽jo3,D 7Yqw>1HYqw΄MRl,/+n*,ǸÅ:\ÕYJZ_+U#篾@!]Ĝ0ŔQq.ѐȭKFHNBzd,Ĝr$"?\cfNl|1_Մ*Ƅ]? 
ֺmc0@`vNUYRԼMnqK.$Q xվH3vU$1ೃa' yKIxdoK3+Wc*̟9a)OԼ(moP/A^j$a\ҹѹѹѹ"KlƱ8V4%ĻZd v`MF$>kgA{=zٰLĦ)4Q4~omyS Є*妡S[Pn㵳uhC-{h6ݍxzTgQ=Gu~ُ:Q;"Z%#M{X3=;v `N[m!t`l񻽥jfMti]?i]|(φ6bR3nc011ʞl^#\HhR]g$ě}Pd噤kHjH K bʐj2yfXiE::KW{,,ӛH.]ϸr1Ծ- MC!M!NE4MSrY%=;1͘XD+#x"2^ '#ϸg_"..͒ (XX9J:̠4h.k.RӨOT$ P&ԨU\z5 5hBkBկ%Q4QhSSO452^M/ sD3gL)Spf2f6>Z|gjfjfjfjfjf^0{&:\=`Q,ʞEٳ({eϢqhk֯)a/p8\45 \ +wҫK &䅕K UߓT`G\4;y!m\2;WYxnxnhhvW#H ChoA^R\KrC4;o.ͥ7VDD\H9Ua5R' Ҫ' 96'?xH9?yJTǯw};LEHa.\!-]p5qǺ!M@\d f|} =r )"Pɏi #MP@*#$EԇKI_Y~B@zD}3ˑ>w֗:R9~9V>z)[#IYxG v%-+0CH}i 7 䦁MӦ5im:6LMӦitӘCv֋fuNy(*  ㆅq¸aaSvvʎ  >1t^_Ũaˍ ƆqcøaذبʨN)fF bǛ)a0m|Xm|DaiSXm|!aƇiôa0m|XW1C'z 7C!ff PY+ָAb 1nXneٚ5ek>1nO+ܗ6Vseˍ+ ٸbROJ4`7a0M{>¸`.6]l vqƸqcܸHWrfB$ɶij26yc!r Gj"^t1vyL<].iL48~I!y :Br3*d A5KҘUEB*uIWN+I4H|𶼩`58Eח>h6SC#5rZ/rs%&CN2 _wABmOBv+)]$N!V 2b\FŎ.{yhn!d6̹enw[g ҙDfk`ޅ.Et0zKyPR ]vˮsPMh@u|:=܌PNUjȿL_tY[*d>Kk(8e0J"u@ {@;Z)aAY;(aM"E ;(abn٠4h%전.- ;(a%\mĔK^ ;(a%,/*$전=@ž a%iO p W ;$b/6=AS\+'>یo2G kGj,^Xr)+BϝӋۃHdd ՐTCjHP !A5$ՐTCjHP !A5$ՐTCjHP !A5$ՐTd&քuM 95!&ӤxJ-M)Ԭo^>ˤI@+L\bu!;)@*)%VcMN*:Kx:n 9>ۣJzhVWR]Z5PɒFbr5QWR]26u5Ka+.w81'XuY&.@'M,'g3HQ@L;'n:h(+۠NY׻%nC)6yc5&C3`.dbPSsYJ)3QLg*T&n>!Q\%2 3uC)V#3B7Z.^WPScJyL))1<ǔyЄĭ(]uU"E3h&o81XEC3stΔΙ9S:gJL霅==N-&(+^[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/[/1zdT`~ȳDXq\Tt+9QC~a R,V E^\Y\q_\q_\q_\q_\q_\q?vDaR߅/uf]agqrCTsFBh ^%+r:Ri'O8iX]ڞZڞZڞZڞZڞZڞZ4,%F4Scjifi.r]X1𻷥 5zWJ𘪤sq3LT&o*[Z޽T&di8rWU&*cɱPufu5}M)<6Ǐ:R#bg](eSl M)P6ʦ@( 9h\-eSlMѲ)Z6E˦h m 'CpCrlʏj=qHa67Eh-FKa?菄RR\\M\]~~umfek  tb#X ʭQ5*F֨[rkTnʭQؑCjŠ)4@HUk^Γ¢C} 09MdkO{{!凜8UMg:Ux3 yh" fk$tg 7%PJm6ip4(mFhmŖ#5M5FvN;Sz^ 9v{^] ir 5ZoưœLg`k`wbb`vh`vh`_ "{5{h{h{pYg RW}3.MKiֳuDPjKj"til4]b*o2'ZCMwԕgI ssSO)lC*9C0`[dsPW%I.cXN]ؾ) K;ejZt\=PʲB){]-qr<[%lOk%IץP^Ψ̈́NW!,&ː|/CeH !2$TʐP*CBeH !2$TʐP*CBeH !2$TʐP*CBeH YBSϿwtRGx:Sީ)dAxfƳd:]R.NTKKj=1⯵DGT#tDu::Q}ۨNSOӢ@}~~ڋr:u::NNNSө~ejt5u:]MNWS`` {&6NROҢgsOU*uz:J^NRWӫӫ4ӫ}:SөTt*# ©b:(Dr*Mt!k5J4$8DHDRTH/OЙ#,fgq]sҏRG6~;% SR|JOI))>NPO' u:}B>NPO' u:}B>NPO' u:}B>NPO' u:}B>NPO' u:}B>NP_*5NM_tk:5}QҫUtzUccsIv.ml.%Ѹ4^lKP~4s%'?GKs^9]~NBGӣQ(tz:= 
N?C58Q(tz:= NBGӣG#t^,q;nvG9}>~K~YW;D(N哗8> F V`[}pnm0n+QyӹyܡDQQv:s9os9ǜ9Ŝa{ao3tZDn9'Xyܡƛ_m9I(%:|cg uPg uPg uЊa]7@B-ق^[kfa|KnAnAnA : : :\]]]]]]Ƈ4>/Z 9^u mWjUP篝lի-cW]RJTjAHlKK/ 9fȧg| nGcr\7Ii773h¡W94=GpJK_[ 6_afPk$}5+:3}3fɣKuf4̈GhR }:73OGShjrA!g`*BC( U : !)~;?H>z-8Y*QЯ/BOBb:/Z>kI&!O;. $ZtH([J6DC`O q =Ǻ"B̈́.B)XP9_ [O(S)]y])\De_Ռ4 e&k?Q)bgֲEvAd)jA|pv$b\܌Pj).SXZ-uZH\BE0 zL[It0Hx* t0Hx*NQ:Q:v===+li'Ht8C_7L% ==Q:Q:"bD!EtkMV=f 5 hJ )8z̙nX?9ЄO(S)]/t0It0It88:PfS"$L"$L"$L"$L"$L"$L"$L"$L"$L"$L"$L"$L"$DC$8Dv-rʙ8yuq!{q>FXJw)tKLJ`?27`P&@MЁp0?R@ұKpq c`J,KJSʙw L%SbɔX2%LSbɔX2%LRvR`J,cuKĒ)q eþX/%fE3u"=z,L\͏}_l,LX eL j()Rij,WFXSSco38f n2vձdu4XKVǒձdu,Y3 cL.uaԕ*j-e}5v~lTIL%V.U1:*&*\{tE`[`{h1U-)i,KEj\` p Aj-46a2Ev &Ȇ al &Ȇayj cذ<6,Mcش<6-uMdٴA6mMdٴA6m}Wc1l,M eBٴP6-M e_ ޱPBEzKelZ([ʖelY([ʖelY([ʖrpvN FX#,=5h[YmrOzLmGդ-ceX2VJ2nNekcXiT&`l%[fɖY,!n!ceX2V-ceXi ;WCCXm0av0 Ջ#`l6[͖ael6[N1̝-sgiYc,T(_sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2wv; 씹el;[Ζel;[5RV5VVV -f˰2l -f˰2l -f˰2l -f˰2l (&ʰ 6&` aX8 '` a0lMа 6A&h a4lMа 6 4q8ᑮq7<2gLМ 3As&XKb ) XBfJL )!3%6A!a}͑e,<-BEȶ!"dE"BVDȊY!+"dEIB$!{!d/셐B$}e BA2퓄IB6@! dl>Ih$C>' )b)b)9ءC=C=#T:AG O{h$C=eR!R-b)ЖIHyHyh$Cށ{"{]i@" W J+ČwStcܡtcrAcT P9ʎzD-"\\JlK9S?q5QmQ1 +)Q"*"/"q*"D=Ǵ`D&VVn SQ/DۤXU8ʆ傋*fYC[L*[#Ll(t*VFj\B ą/UTMR_yq]*jLčh&>FCuѐ[B[}+` S !Yhh0qr2#zhFzhFzhFzhvD%zKVm(-FѢh_,hh2,}cd)4Y MBи!}rq$~oRMJI7i&Mߤ4}o_{z1 %d A7.'P)"oRMI7)&E8翸wKXV6 Tv ""TvnRMs`ܤs?Qo=zsvn΍sD&8O eL /j9)bsvn_T1TM*I%7PT(*I%7NM.TbN*I%7eD%T=ܤpnMzI7&=ܤpnMzI7&=ܤpnMzIzCw_YqJw^ fzSvjN۩};oף}Wˑ2tn=9ۯ~mӱ CthR]Kqv)[ =V)\4`Hj@ 8¡4\p]Kui. ץ4\e ؓzj-Dz+r-=د~ʮs5tN9]:Kt霋[P4]Kt).Es}BVTǃȷMuG(.EӥhE6TMA\\ Uܭ„lzMR4]Kt).EӥhAW\WR4]b]zKt.=ҥGH#]zKt.=ҥGH#]zKt.=ҥGH#]zKt.=bG zĠGZyhOU/j\psc,B iyҬ$[1:~_rƝ6vzE8s>?h e1yULQ1y3}PL=qhLT{aɰq| MYU;酶͘q5Q"؃.cCNqD=]**ΰ'q laøuZ=q}"{/{/9}P@*}̓}̓L*tM4WĩKl'̧T ydKJbi0t(٩ӉLz^YvܔmA+>BUTR֗JD6T;t'-4;[SAq|)`MF;loL`uDTNI! 
#"s=UY=eiO1ucO1eҞB*1֘)dO7/vQ=FCcD]9;FC)44[4u1sfif1*F(482j 3ĥЭ;7fOd~z֘󍉬MId7ƃ!W74BB6{wY~(Hb9l7톯zHa9l7ݜNi9m7Ni9m7x C74Q;,~-j)hNɛ&uh&:4PWQJ%x)eZ{Wϓ?3 Y/2FBFlpcSwbZMYnZs.ke .DWPhZ~6gTܩT#!?SzڎUh؎.k1 @a;lGO~@6Q2q豩 y2N"N1a/E2OHt.#e$D2]FHt.#e$D2]FHt.#e$zD b.#HDܠ02]FHHt.#e,CE72kcD=PʃaӠDǎM4 ؋.{e/E]^tً.{e/zڋ8ʆ[j؋.{e/E]^tً.{e/EON$6LR]m\Ν.e:LGi$!d /DIX.e,Ce ] {=ž2tY.еZs-\ {.#e$D8H0SpQc& #$uڀM8`؄6M8hڄ6M8g41hktل6!o&p& CV'`&&on򆺙U} yܡ!BَM8kWɺ.i(p(4 y]b*5blg{ЅOb8T`*C␩8d*C␩8d*#Cbgt-9>\C*Tܼ^#1P{>؀y;d C(Ϩw@i:Xh16tFZ nphks2!o,!KoFwkC8K9ggFM؅T1^T̥b.@8?:4X(39jo)P(C)8d C)8d CHSp]B)8d C)8d C)8d C)8d C)8d #Ml{CfY8dCfY8dCfY8dقC-8d قClA+5CnQ7d wCCFݐQ7d uCFݐQ7d uCFݐQ7d uCFݐQW׋N\/:q0_yV9aMNrLtNrܤ7iMZpܤ78 ryhˌ4&M80:'MInQhҘ4&~j! .ZN/"T I/o;mYd0ͦLIl42ͦL)l42ͦL)젳9a4"eMlS۔6etB\۔6eMnS۔6kpLU'o4M)j>˳Jמ2/'aMCSД=4eMCSД4eMASFД4ug"Lz+r-=;R r `Ԙ25L)ScԘr25)bʾ/)W)bkʾ,bʾ/)bkkʾ kʾr/c_LSŔ}1e_LSŔ}15e_LzMSŔ}1Ө[vs|2̇)a|2̇)a|2̇)a|2̇)a|2̇)a|2̇)a|2V~7!gNd/L{/J/sҢ?~ײ%pIaxsF>R#9ZV΢9Eba,|"mo\{7S)E KHFź0ET\BN6vۤm.\Bm -;d'O   b^؄Ja(BTppq+B^q>p}E LUtjժBH F٬a>lYi {6Րzo5`;_W8{A9\.d:TbeD6h]qp~KRLerm̙ w)d=,Wz[}/ 6uADf>(q$U;Xml,"^tR|C(5Ŵ _t YB6ҁX?Z90HB "&rS/_bXɀ/߂2)ea&*)j0هPs3@aCQr<'/IZJ$<1ڬGIuB==H(-W}~0[' 7;:꠾zf|g<;':\HZsas+h_3滦*zea}Uy!/v 'ͼbF~PWp_-Tګ𧩁]A걺$2.b 0]~Wi"_׺Lu=6"5˵[ T}1 *1 z}0YX%6EO!Kx1󉱾Yէ@+}7amƨȖ+E jyoy܀Z@nl*9PEf$369j|P;;n*DdmFJ?BkeD.L%8֘Xc Fqڹ V{6&݃8ՃoW(/!`/7\| A|dGF \"*nl@N0) iҤi'Mئtms~-K<5z,`.!4VYNTb4~2KI?}LBu[򖭼񵋃jgKea^~q h 1BIŔ(s<`_͆#aTHYsW  Hvv\;{_D`]uuкZq* P^wifUǑ 뺨ڼL6j@M6uВZAK:hIǵ3ǒzšAZ'V ͷ" bOZ A;;hg젝vv\;w ;;hg젝vvΎkgF$q΢a"= }EsvuķN,;huVw==Xz. qX! ;daGZأt᮱359QLD}^ '4F0c|0B !7dLސi"2yC&o[zvL(~\9:`'*eb .T ; ^s<i#̽*Å0Cp؅# 4!ⱆPo듨4u5,"M`dg* OkB i> ٧!4dӐ}OCi> ٧!HQ]t+c^ ^eP OK%C[p4*3X%K!c,d1iل+k5H Lc.c(kvY. 
eA7n8v3n9 ׎W\eL̚Kx̚C0cX5;CS4@gѰ^|17Âi{n>_S;)7ӰsivM]yIa|sE>=vi0l1 4Gs4uDAF >^Lpʺ:uuE <1sB} UYե jHmѿi[>dT5tz.m:ޱ6TS:_bX^ &U4Q;y.!a,a^djQt'{V_s ;t!l aAh3[L6(֖^Xtkg"+DfwwJrjO乶" b F8R%0㵕`j|n;)~@Nve"A&ɺvBh!'d e\ʹ& B%vu:Wrq˔!s1X`N),`!Le9\^vmƥ(n:Q7Fܨup:8kS4@0GWD!rhy%^ÂáJE2 p %;C5.6D|xʽoo4(|ɰNxmK|; W%kiW |bͬ%xȾ6I>5#9 C񗐗81i̟3e-^1:1w ml:]cİ܄tltCym$|.}b Сg~v:Bc7bkD"$pWNٓIJwQPx#_t*ƪ/Ev:'Rg.UՏo]Ⱦos+9cNC cu8^l`Wl砹.[jY? V=sxc,o- ב.\Gz'C_+*k|u/(-HˡQcB&YW^2e3'Ꚃח̵!ouz`Lػ:,ȴ<`Q늷Kwu3l0dF,?W8Wpk+н T]7vd %6FJWnm8,A5hmנtp}BT1>L%?H?&YK(x˞Bu*裺A2?\8:w)8JcʪDM؉SyN1UhM~S f145>M|qȷ9.MT u󢳪,V^.uG{ҝ V`DS7nH%DfHt%B@*V$RH :Zq1X4IxsJ>Ol]wQj:)a)IӔ1s(ݡt)qEq7Qo[]HU*VB+t(t+]쌻TKP^=SJWuV&Pq5JDLB.|OP`#,HL>R^Or}uo po .}*t+y>b?ifw>bRlI-u սv3T`b!Y!#.KX :Wh ,źW7]u?@f /KR:zR+2ފSXh0q݇롆 d4vML &(-he43)/X[ߖ۰W.^xJ_11]Y |,:XQBdi16M=m7ӅQg2LFɨ;XB'r֧Oc٩Dc:8Yc.ce112\Ƙsc.cpZ".?\yc2_\H.cxinrݘ)dׇY:3J?\8Z]QkT\sY`f?bi%O[w XX. ea,,ÇPOԋal4fU0\&CɉJ#0?|_T S%]Mv^ bq.#eĹ8elJ7/czmu["[&cq`yHˆsq.#eĹ87%Fˈsq.#eĹ82\Fˈsq.#eĹ82\Fˈsq.#eĹ82<daϹ9=\˞ss.{eϹ9=\˞sq.#eĹ87QXP `L4Oc;fVn _~L0OFі岻\vr].ew.岻E0eLYSE0eLYSE0eLYSE0eLYSE0eLYS*K`X,K`aƽJ), X%KNf>*k dDXZqM,Z j JKTSUK?K4|X/YKi4|i4 _e TC_WXsf %5_m<|%kl<[U%J]Aa/)%W*QX {IaTHђ^^3 Ƀ* فsKvnp@p+p+p+ v@/PnupQ(p>7 %m9%kxb-h1/%<.!nnn+(LP-1,^{k`..DBwT۱1s' 5=6)9='5ݘnY7E}#4 LW#`Ћ)`=8U%5Xo V5n/O8k6s20p>8#5ICj5%_a5dHexh'rs Щ6*ټdͫJ6*ټdͫJ6*ټdU% S($Dv O /؍RQ5JF(޹OdbLl2Q&6F`Fi7O_vn{] m{8 yރO oQ92Ⱥ5RJ3(2.1֯NT2_6s)jCGңSztJr3uݥ;/mV3N_C.\B6+ Y]CY0F|p rHS=1"U0?ahu1rM!#o 15FH#]cktÈ0Ր?CT]]]K a)j&~(/47&ƒ66$-q&CeY49S7U {ze/@=Öuز7e9zv:ů66)>,/:4E{y֡ܫ:4i BsJ[收N i9;;;;;2]א]`rQC#.K #52\TrJK;X1% (ŭtk8<*6̓5\#5\#%]k,8tY,!!!!!90H#׋JG¯kcm\{&n?T#l\{gGnpZ6h 3#lОgq*'#qp$88GT8 ØP;` ;8 ռh4q5q@`d΋Mи&N=ni@o`Cvhi䰭W` Бo5d !cghx 54^CkhxM 14qT=nͿrL>T=nMy{H*}5&dߛ4vq,zܼa3MUTeWS=UMg OT OT OT t]7> ˭6澎>0 ֙WygՍ~0_[lEatM8!Z{ț_A^ [N=CUKKkK Ë?AO\$r瓈"b\DEZ皉h1ܐ}q(S/=%>qY;4!t{*ևT-IzrV">XW_T]OP(`QЅ}ع^՚[P_GKTb=,v$Z7%OARus⾟$S!'-mJMI|>O6hIBDxHg$`xe2jm6vKLo3 y1yCxn ,Q&noo47.։*.;ה.ފMB?&,iֶ|Q־WhXէl [3뭍-k&Tv6롳C}#Q.tF 
6BUbͷ#v^8KnFi#6qн6#nǓqM!1gM>6Pk'D]cj$>>n'P9ny'n(=G"q(U4Zjb.d.1؄V mV᷊Yb V4nʼnPF9(]kæuБA{=qSqj0:N FbItztlG^G?#:R= ċ[`4PW (%ܢ[pnQ-JE xmoQ(<#EHx/t6 ו?D EIx#r |:Q#^B6{fUa >wH:"= m4)͖WJ(DXjsԩ6˪ p6S3( ];}ϿԢ7=V!T69Y sP~pen8.ꝄTxk) 'KWNʇ:N `ἰ| OIB&k/Iz)w>k|yxΔ);SO,e&džolFJ+Jٍ ύ6;"؃.cua}|b݅e7Җі/=n4S/v%lomomo8D)݋bpA W|UnMNCǖ28p8`M؅ChVk"mmIhFrQ]]8r% <%4OSؐ,4Wԁ͋Y]ߐ8oZ$YT|BY|!1dJL[,J ɐ ɐdln 4ppb0ZΞ& ɻ ɻ ɻ Ȼ_Kؠΰ+.]wi+=)_xumL1Xn_/KQXL0,|BN4>H8w pW$ŔɔtC5*PBEPUPdRPV}P.a ~ⴷzRIb{ w6*YJ\yo{ޕ³XuS*Z/ϻpgR|)NO,x>ġ2x,~ߞzK Y1g+t^zW^1ĥ dz9$v- a(ts1q1Y7n0Y!TgJx *1˗5pLV=z֔fS6LJu,%ѱPtTKōRw40:QʒkI|([ 7oR*Q*Q*Q*Q*=WX䤔MUV0OOυVxK%%s<^*((((((($˓NP>o܎'dgn B#zNN0 C<؅1Lrj)?u#+~{O6Ig)ޔZg(WߖHw3=i` [DCǗ"4˙q|\ϙn7 >TbCa{:; uUh7!fPg xx't(`cQHPTհc~7 7 ՝\fHv3ݞ- drRi=DH/B>oYHq IZ.zD95xʭ1gXJYu6~oUt/a姯gC鏌kDgp~38 Ig?C15Ni، cy_mlp mkkk;MǞoESP/T\ZBC!b 3 3j]nk!nMܟg=*C] VswVYzay<~\%~mS6jQ=q:[U9YxFx?\}=,)Ƴ41Esʞ%'T\^xlw-nɫ+7Fb?xl'd 2tRNJIyIJydS;SsJpN Y,\-d=%8ZɞS+SsޕlâΔSsjf1%8S~SsJpYϒ||$%kczWS+JGGGGgo>Rt>Rt>Rt>Rt>Rt>;}雏Gng<):):):):,OCg^G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*G*.HE(T\RqQ*.J%$T\\n7Ln!T\KRqI*.I%$T\KRqI*.I%Ȳ$ҦޒT\[KRq妞?蒊[T\T\T\T\T\T\T\T\ϒzֳ3GGg#o=yOXR [Kg$FdA} f A1!s] uC+![8Cr ![8Q0z1m\.d l-Cp![8M `2b,кAB,кAbZq +ϳ͟'=?rrxnM-ijBK>RLY,@ #ё ӨG#HxxbӻSBǑaJ<|Cx\,qGr#Hx$w<;I'=Oz}?*xG >J%G>J'5Oh .$}{0%wӑ#MG>|4h:tpGQr}n%w(/21Hօ`CamВM6=ooxҷ'Y{`OJkHI6G>IIOG{g$cՙ+?yDh{hF$B'ھG?B#f{Nh\?liIlOb}lf$6'yyf{NwtG{Nw\i8IlG{NzG{Nzǹ0;Q9aA6TiI|6돞I|6\ b@L ,U(\+: ŬcRa7%ЮօkJw)Tc!R>]C8|1d.fBgM'塳 噋ʃ1W\B,\ۆceCOza vJUZf*5ڣڣڣڣڣڣZ.aPۣۣC%})lllllllДzg=AdIЁ *jӊNإ;iEQJZ״sAuM+:A[6A Dk5@tM &]γlņ  DsuM:yZTs\L7j]T"H lsq[cOϳjsLf6 O&Vqa'mυKyny~WN|'o82r3tT_lՏEMucmXwyl֗T U^E}98AΧO%oŁ;RJͧc?ǎw=i;'LD,6Yvp턦:??r g g g <,shlΎxC]8nh8aml-m[4L[ε׹#!ǝ% m'Qsr=˕uqxlz6r.w+➢G8.8K} 7-^[u׃X'K Sm:'SKj B::s[`5}=tOo@1Bɡ50zz;:m?Y\m(8 * ^{x|yO2:/엃:]Waz0!tq1P½p]Wl鱥K9[uR"˅&t )QgﺐBx47MFBb*eRv]HzVRoi]w]HBb"\LB0Xg8u!%cj*%?۞3?.ܔHdcMuKԿ,v.?rW>qe{O<ү 'z'1߱ٔS_ wj ,][S/m?ՇxF7<;"/=8zvYݑ.\SYk<^?<} nJn;k7vJN) ;/0RSըwI.)%Z}e." 
NockIs#gݣyW\l$΍k{^{\87 IAW$#aFM;HyErS=޹uEr+ȜDfr^zW m+{ +OVnW-W ׹vqaؕ'Yg?fUC [|jb}PY,_1ˇbܳqlQ]Ǻ/pSeQW>cqѕѕѕ_+%ĥC;fL}}}urD[[uǡ0llt8GG^n܅!. ~o2M7S|ݸlt+ށL?Sȷa5uzū?\u\K5nZHOC՘.?k-5_k&d{] c7|{i~ݞ8;HWG-] w݂]fŭv&ۆ^<?.G$܌M%dvsRRԻk~Z;*h| hs3M˵]hKϒkC-9P͵dInU8\nO`r< 67zsAϠg3t 6.'|)o F?q`GO D)w~.na"lE&ȸ9TİsqyEK@LMC8:+L&E*:&Ew_ĺL>ULRMG}>q~pC UZ21\~p1S;U-f%pXYW_TA1^u<+*bl֜Ai`pr-޾dh7]V^$@7|)xQI5b=>b_0ZK/R_K ݥR_K/m9PXZ K{]r:]RpTȃ/LbiTȃCaiiiT, ݆mtp|BBC23 .OVN^ٵǯt]}j$VO=.16[V!V@"xdu}A}{uq`0B e_?v!s66pur0l sg8.Źw\T8.Z KG!:֚bVt6uڝQu?ÝϲgtgGE^;;}Wq񣉰r.u+BM7љN^D!ta~ [ 1ku{d].@=x))))4S~ot]xPY{h=K.=<&TcZץY _ms@`{±n '_ͦbpù'?8<^bu#׼<7\usO:E}<5*#wug!]xv]g%xvEg!]xv9g 5xpYÉg xvBGb?.?>7 +Qyh"D}ƫJ(*$ZYohW/=?h?bA{q}A{q} A{q}A{q}A{q}A{q}A{q}A&yq}A{?q}I Ĩ@xg;hػFUroCrV1izIl_z.piEғ/=}&+rd JAqt[am g>`s`T xJ(S#Q({=J%`G أQ%-U9S8̩ \*\r5pgr 69Mlr 6gt9.qQVn3g`(]XҿknrW6+[+3V(>TC醮 "LUt?Bsnu.3]g?p=~B/E*ae[ ?KpMLOxӂx,l]w68ϻp'}(}>''Y׊&?}HJs+Sr(Vkʃ=Þ:000XXXt3(,;-l;(H\ŹŅuB'1 '2RTT–S/clD4''m|]ndAaʒu-{CX7s[q/du-{2Ǐ%^L -Q[.p3&m|vCX4#iJ]ez]b.Y2᳸ﲺe]g2h?f|Lh qPÙzx/~0uBSv N!)D;h{BiIH_hB h-B\~B4^ x!/DhB4^ xY/pr~8k9pr$F8u+\7>g."V <8щo ,d iC aJ % <1X Cܘ(d6u͑x] "@57NZ05yØ"eL2HS)R)c1Eʘ"eL2HSlJt8S6$=&1o 8IGR#)j!e<2IG ^%^?I_u,vOM R4A&HYH[ uSl]Q/m6)"Ff^sImEaU]KeSHsɻcS+hURBVH IMW Ou39,V=iV O `|[1O_?/b уHi:?0úrcDbl 1 "DoA@q߹\8miwS6C8%.ght-$I,3m%aMҿ+=6ـl &a/or5JLJF/f$(sR7yOݘ~x4Re6ho /N]KGr<.Bdؘ7av%2KUn{[ Dm~ƽ[w;ϡºdrc@~VL6۞|IS@Vgny\9p>{< '\ 7.Ve ~+3<9УkaF橛/?a_r_6rM *;GO\v2k:{sK4l8>a~12.8*sX>uK\/`n\x4p8\ (! (! ;p 6!?&kø MjF#bsSf"xC!bq1ĸb\$/SC!Fc1bt 1:8>'9R5&9H ] 8[=^sn@NeO,)&089898989ȳg'*4 z؈5݇wtf~FDk<'1̏k76T̄7Nz`l;5Y E=F0p gLCK1:>y{D`I6av+S&Bnfl()S& 7Æm.3Ό @0\Nxls?Dt'<{sx2 K-QkE*/R:8T@V5v_;/Yov l5b'"Vv N!;ذX`\,0. bqX`\,0. bqZ[7tOv!9?aqga.q!%d-t[2Ӏy{vŏ ;p3fĽz Z+wӪޟ9rFD#*ǬspSfqNŝ2;ew,a'\f[4xgv,~Y*Jfԥ˾;<ɉrk?;i,lͅ,LoH)?] 
&܅}Mx!$ rK-Krᲃ!~w 2R/MX\N9!e#;),~La.b:dNŋ.tJ;N%;I3/>8<]X܍i3 \|p8X]܇|,sco3dΐK'4ԩ9d%>#&K< i!yrwIe.A8OS["DQxkc~j[ͷ#pڜY?щAD>㱓c'n}"<Q"zEd`>㱓c'nqj8B뢯PT@gp9ʓo>^n̙nf-:{>aI)]6{hD .peM.ectٌ5]6`hPአI8Kݙ+r#6CRnhg j.ZEnt:?~f>8HpYCܣ~}{ԏP" ;1 2 2 <3Q4 Iqw$i@$i`_ P$䒩AAH3(HfP7iPi Ҡ f nF f 304(H4p䙩8?gxLy6!&B&M)R6!lBHلW dLB;<SSSS}"~ʺEʔNtM^:~82S?, zRtBNH ):!E'RSStE:|{I }ORStKnI-)%ybzrz&EϤ=gRLI3I=M@uJ\z-}h+OxM>8/"֥-b+ic|}8_oo|-ύ>S  aTv"}2/|n4.*>O|RERGeK_UpuŽQ够)!,iJ_pL1I g_Nr`>?>/;y2e^.1c񃩫s}y[/j'N}}2K$nNjS[Ӕ8>Vxֺ.IےڮOZeNĐQ2ka1!R!һno]@|k ADQ»aCƒ> Cx >:LHz"6. j 3"(bcщ(E}!L~+u~(>cKFqqt]DVL(1+!XP59Ȉ'#6FaP cc{ 0P5 T2kXë0P5 TabiZ+vTA/t7~,or;S8*GMǐ}qOqpZ8E2% Tf簉%.qILi&<09>yvGBrz^mW&z^mW&zQctxw)霔펜&qe):oÛ&:oÛvw-&noۛ&noۛ&n;T:^nNZ 'j 'j 'jwxG12T#ΑYȬsd92X u:`c0555555555551XdddC)Y)ST`]`]`]`]`]`]`]`]`d/K\#SlLxYPkukukukuNmXXXXsrv ^~.#.#uˣZGW]@-E t]@~Җ&6%Ew3 )EJw]qiEw]qiEw]qEvC.i!y;(ZR 5r󷆣;kip4båpOOOOOOmp48\.ЩЩЩЩЩЩЩЩ xxxFOOOOOhp`68 LLLLLLLLL x@C)qงHHHHHHmpóƳ6Y { { { { {8 5222222 u u u uh!CK]|w-\s0?x$,zS056D kC6D k< qN͐8xs3t]9DW  ѡCt:DѡCt:DѡCF4CF4Ct:DѭCt:DѭCt,:Rrץq*ub 3\'0jvRοzcylN(uid&Efrd39Ll&G6#ɑfrd344449L d} K\CI?SelLLL;Af&[lMZ \tSYSYSYUO Z셸%f>+Dbiѓ~/D1O6EL$S41EcLS41EcLS41E3L S溦h)s]S-1EKLSvrc à8 à. J¨$J83* X83bW28,3ˌ2̸e\2 2Et Fhqfgb8B3<. 
E>'ӯc, ֌5`͸e\DDqqfUM>cdd`i a/lMF<5"?=`gpU,*q4gGsќq4g\2Y2 D9]Ax&ή[#w3|;kdg2?tV 3 d[.3ȌF?Sa6 d[A).84i淝voY\l3@M6\~$0; A?lPnQ(|oM7Q& D GG>C܇emw8J$`fbD&Fpd8jb\&ƅqab\&ƅqab\ GM&Fabt&Fabt&FkYTyҧ֟gݨ{0d~XF> ߈V~XXXX6v$ޒ8ϐv|II?mnȒddnjnO K1xaG{+ h)9Meϙºyc 'w L`qA:.24 >0k<6k<+q 9urxc 9> .C8L8XCl>4\KY \?obˬʐPَnGjK%utU'EL =1X%l4B|S셃V`z]"eeLz(n#Oxpva}c)BcᆨA?%B&HB&H&v 2y([_KrY</;&Lb̢ժi:9JД ,}߫`o{p$ zC䰱}] ¥œM~n{܄ IN '~츷=_8XVQY826uk\k>bG\3(3Gtp ͪ ;‰wtf\5>3GMWk(*x"6#2L, AZaNDDQm 2oデnWaZA3w1!Aw7Qt d=LΚruY w ~,τ'܄_ĝURMZ5i!̸snTͤQ6\ ~ VcunD87i78DrY5?OͿBpgob`5[x{H=塦ccʃcfcg5g1Eޕh߄agkpoI.+0Jo_J\ӄ%=S0% S˷,i I[2u!⿱K|y Kn,qբ[}[SN"emM9^P'jD$KԵIzL^R˳.E,:.H,~B#y O7/i[f.X6#[Is]׽+zp?aE^_>+P;L7**,5wpb2Ϝ=zB36vdv#i̦%6Dˡ^;䬷su'1QtIwpxdQnӃ,2wek}wTCzy]uBIBS?;SmLEuQHC2do$k⼣j79íߐ?~a''DYGN]2㭽+%zؽ d:p2够WMONJpi&iqh&iqV]f 6j&iqh&iqhv5~ i| Ro6`otD48M4ND48M4NӸ_1̐4C48M4NӸG밤Rl5t8°;Nkt%iIwZҝ{/;9 Z%J;k_f8ws><*s_aGZm=~7蒃&wTPң6no8|DRkÛ>@ƆJJAqaīG>O< |g~X ZS>@qED$jaeZν>1˱EȢ¨oN߷YD֞:X,8,Z7ߚ=Mw>qy{ss@9uxKjetO 'mc`p3aW*Q><}gLag&f/35~9>>a?X&f'l <_sYm-,]H&{#K`~ }]ϡR9zUnFeG~688aSٹ'6qX.a`4 /F0pFsa?Uu =87GV.u0aC]~Z`mt8s0̷y9Yo ?1f}9=ۄw>qi#AvRN9wJlZrN?,~,W]f663nҹL:I2P|/tU&}F_r?7Şس(#Z? A|3v+=tV@"{Y'~!]^ϖxOml&ANNo4FŢg`WHEgYzEgYzEgY2uQ.Z= e/;_ -Ý;w><;}=puh!IW5s#o5(Ir`KM5o7   5:A/1b^F#_<|M~E?%%r_$r%^6 %rqDC%nnXb- K %zbDO,K=DO,K=7]={N N9mAsXnÚVp`1Fc`TNAkO ND!=C9cƆa9sl޸7gzS$9d1F$c2n5~ڝH^?-mȢe-gn- +.}ץK~7Wv Ka\.ϥM-K_t.}ѥ/F{{ܖܖܖܖ聽 8uA-=xC .@| {p? ~.\6l0@~ Eu&vvS) !@N`G`/666me۠˶A-~j]+'_U>LJax|8jupy##*p1ѵ;x[:hU'::\pt|{ #C=2>ȍϘ=R== D5!H'~Y >mQQ(iQ˄0N!m.~9 I=P(xM5b?P։8Ix&A> z?V 7vA[#Opl RF@x d& ӭC8M^%x ǥ$-ބGFny? 
M 3'١,ŏ@\p2e-^aݳ]KfY,}饠Knz5ܩ0:uF>Qci1%jiÉJ4GDdƃ i0g x@tamuBAwd~| 6dҼ_<3ǨD?T:HBƠ}0h OOOOo4PbP 9 ĠQ0h "h%, xY@ebyCƤdI7Ѡ,TApS?<9l:sxR'Y&~`{?@Qh&kt$2RfRLV.W7V_GFu!9&%Ǥ€Bp9&#b}.d^Ad(1|r谋O{=n1ܠ5<^Şsv cFY !.p1 &spzN+J'eG[=}fT\\7NAB:$ltD%kFj5؃HzMa H<Ѳj~#k>X:`T  *lW剒 oШ[bZlQ{j?"h,?FbRR,6`AI gBRRp))Z76Dfh,6fĕX4om\o-m0 R6^{78J8#/szQUw1 Z8"X5% $W9܄0P+\Vo \vNrHB—$w^pyo K ]8Ŀ5$we?au3~9^;1KN_] GOE?0\ ?!oǝ52?acyx$^!6~09X6qwv/lFФ~ى6;f'dKl-uNIjNT|;'[dKl-]\'9NiSZꔖ:@\ÜNiSF>/V;NiSZV;NiSZV;NiSZV;(j){Jj)vJj)W 䏪)c5_c`YC!:uЁihYJ#XF5:XAHFc`K9\2fv4ځF;h@ǐ842CK 8r4E;'V-/IWeӨNɳi8,)g8ǔ#k Y6>)Erne l^ P!Ox b=;?!Dp2гPMMa4:f+;Jf= rN~N/a]ݥ~U"q$2¶^e']a8yp|,g4qMLs2/L6Fޓ,ZWMU3-KO}b_,UsdǧflUU2߅їf:zBTOoa~ BsHp55Bos 3t}|ۚ0 ""|E&E&oaH^N!yB:B]p]pY-\],o%|/Ϛ^)/m{0j}/7xee<ŝ~;X&6G/ԮlLnܳ׌t7Vqhfl*ySߛ=xnDprbDuר.};6$p'# j8!EF(wN"s8EOފuыA|2XYL%¬&X|m6.,aގ ܄o|tWg\_>uï_w|;8]ݤ!̺x{XאָMPi*A꠽[6A4JFI(I%Iki 8&Q F1g`WȸV <̃;<,$}\4"s $jDml*m~+:w/.x.:\Yw.@8 Yfl8nq Q|Q}[]öKܝ!."]x!&hj"&hj"vt!],6Y K]܃J_MWD|5_MWD|5_MWD|5_MWawx] =4믰:V.z/;e#8lNF:mNF:mNީ흺((,k׮KˢmYoiя㠟os.OVxˍv<>˳ᔼy;b1o(v.ݴߝodeѥv,t.]K|c5;V[OHнtWzK tnTMxKE6%{`/b$^fk,V?bok5Q10!0jE|jlQw\\1 ~Y%)WcK9\!Ag{Ag{AgPA:h Qf,op78 ݨ4.o=b !Q ŇD8MdlH-5YPPԨ]f#C.}0!#ʁ]fb 98{࿱nC??DC?D?DClw}0Ąb CL!a 7DO C=1DO C=1DO քhrR'LI09lM&möY;V7gGZy(6)&դTj洋V/f}6LD 6)f]r!o1sp9XBxR|M&h"irT69*0'GeYvAWJ~+?r'6kg@skބI{Kru)]vJל5'6=F&. qJ$v~8;NShϳ춷i>aeMUOV7klVea jZpHG0Q5hҽqUyY-z,3,$2:E51QGDo$! QduO5弐,f}Y j6;+Fgs"Z ,S߶ ǓM Zd^]3F~['){ ]|y8ޏ}j?N`oKF>6q_}FgRpV2|`jC֔5ZcwMa?%{[baÔ6:%uΏIm [2%Y%w ſK}C l3O2޽lKua:v vr&&גGTī튗EO,%rgI_]56x߆[iocq6)ݰ~)~~˝ix{7.K{~83=oO|{=ݺy_ʎvҶ~;ۮx!-Cܙ071mQ|<5g y,K).?1plOoǶ^]3ic%J8sJ,\tedj'>0މHIolp5~5a"o~!VG'.Bu0#FdziX#v N|(:Cl0AQ0_35b'bZlgL5LE}t_4ME}t_4ME{Oàk/m%[z-Ė^bK/ωĔڤ]b.{ؽKW_^ lG =<%vuZܿ\\n,! HZK%Vf))霒)5 seyXUwXM6j& %.ti[T9E_+5ٍ*&8^]M0' ճ;q XBp# $<_D37w0"FDzG͋gl^o*l|o*\gMc![NDz/-&}Ua}619m\HL$&>U[/m}aS"e%e%e%lj}ﺷM"iO\am{.>:Q<ɉ {l|uq '`w~VW: "w_Q)ˉW_kx;F< e3]x}C>!h^174*w.Ϗ~&{llSE+ gWC_fT,~e^] _/e~]-*X$̭h^[{s}Q}v7|. 
M{x* +;qU ˝?>DCj /VŃF\'xx3]v:Fz~=sUl@;>a 4@BAh"M 4&DBAh"M 4n;h Tf,ȂH\"lpfAg L; CCuA[w@gJ-\\pЈ..'mZZf3-Azpp5fȘ!HEi(4_|QMɃ9`~p_MEc|_4m 8 t߶k!o.[}0\;]hY*Q8`C9s!`*ZWݧ*~ii컰UE<ۓn]tG_fcG7FLtsX2yuO~ః`YuwHvytTŵX.Z%h.ZEtqå3Kƺz zm|=u:2᯲Nm4i&Ӥu Atδ;g7!lmy. wSJ:SJ/0L;J8b9A@ =Դs9Ww;Me4տpϔf/wTfiy紼sZޏp|~D:p[޽8dd}q2e2Ef8Ufԏ-S'~"ϼ;x3>߱8Z^Iw \ă^uew'127"&!"\ăxp.! #H;JDF`05#(<;8;8;8#^4`E. RP(RpQ0hmm킶]ж#zUĠP bp~pqzM=yP\Q$XV \ϲKCêY"ec|D}aGe׬zqϐ KY쬘Lk!nNbˏ&~zsq!1&N$&% ?Yn(5'K~1Y2Y2Y2Yه0[I%BD1B4F=BGA2Yl<%%aj^܅YKҢo>IIIIIImB&IB&IBYB%KfoB%Yb %a 'ZR%Z(NN$PS()Jr %9BIN$PS()Jr %9BIN$PS()Jr %9BIN$PS(yT| .|ܘǯJQPIE=$GCrIMIMIMIMy4&I$GR'U}r$Grz(93=BPrz(9=JN%C{D^gc&\']/3E8^Ѫ|`qQTWJ+U5! R,1JA%G\9+92ی"NvM"]v>ƒLJJ7LwLw']g4];h^ ~iH25n dZ^J?HΘ;[R)k@ր>H޶&؄O[NωxH<KR;,KR;,KR;,eDba)#~MbS찔Qb'SFI;i(zя nOam+g}o\|]>ԑpoع!8I;d őHeP;>k578@Aщ žA  l|: b a|lD~(( fـ)Yܑ YG[C6HIJkatVQp֍(>rsT8?tuԏSmZ?4mf_@?'?LVA6^^.'OGwx;% de gI!GY?ʺQˑ%@~z,ʽ3 g^]S|pqZџ:/v0̽asoYBaAfd5qf#v${*}Q xifs{77X.'&˿ ?y1U}[7ukq[ 6"}wg%L'<σ8̔#:Q }Skz=cgPXS8`ᰴA,jpMX?lD$:OEm_Y8DTa1dr$ݒM8kd6s$Sx Kc.:.;uC~ Sn¨c,pߌeb-Y)wdw/!lyM.<۾ĝ5֧3ٚje Jbt1p5p",.?q)+)m =L,Aj2gqׯQwcx`mPjxx)?Szئ!Őbt1d2] .LC!Őbt1d2] .LC!ŐbkT-pKTh]F|X)~b~#`"}alV9.˷ʡ]Yi@u1P] .Tշ˜bL S5XBw?UW-?=lAiX-qo7sjǃQ]{ЈBPa;˵֗ d+\^ly;7HlкX=n:?)"cBn/?[M^~\,}{Րͺ Nb/ccm+66lFPwdl,c?szxYt` A"`+lh#yTMQ[dTDҲqB!c񸷩[1VHJZ6xCx` ~pHVS}C)>5 f/NelXC:-o1:w'!({Yefu#J1&+m4~-?XڵZ(mߢ!1x2@3>YZUKuiC̴!f3m6Lb 1B&; ͷkSZ4m?Ma$SѬ;]‘f'o6oC̷! 
L!&%0߆oC̷!m65”b 1冘rCL!Sn)7Ĕb 1冘rCL!Sn)7Ĕb 1冘rCLܗ*ެ/ YϺrοw_^2+ϽQwxcot;%UZo~dd )hI ;qOzXK$^14Fav`["'|}^qO6:%!g|yU_k.~淭x[런\o~iս.w@f1Un s4?Ws홹?ֽ`{:wdHr~UGt3&o̪Z픗 Y)OXe\?'yoy74 y WnCZkۼw7;ek˰V;n?:\cɇ'o#,vɋSnÄwŒk'܅Yuﰡcƿ˔7]v?I'˼n\7]^Ž Jwԫv c߄^y?[Ak%\πHjd'j} _!…ؼ^f"ob'kxeƻ.iL2{}uIYX5{]+i!Y+d{{wd{9Yew GK{{dNDuh/ ¡ph/ ¡pA_/\upEPyqwqwqwq̻qAD X.AK_Y<(|Y?uP2x&e4unaS:bٴ:'|9gHtIgư=rHz ݓa.D}0 t23)AZ>1> 5gTI\oMX+']E|XH9ij7/ QeK ^6aߙ:>m\T|DQq(\^g(AP*ƦklCGHub ?Kx^lj݃6:NnN@gJ8N4">lp=HWsqb;Nv1lj~:~:~6̌e5TN5TN5TN58ycPA7uycPA7uycPA7u皮sˤF69g{t71ݍA@;nq;8*~Qo~lۜoqH& t16h:Nv^u1q2;n;:F}t"u0;we:vޠ1sO؄)`8/=,8j2^O.i8~{yO!׽?q/e)ĮӷؤXүe,;e_gl/'&]*X-.V_Mܝ'Eu^=:Z1*+8xnkN&\kT=\*!wH9$[??m`9/_%DǕc`|8up^.@;W4 ),,IC &,"B\eK:ܪK&kۉ{k. ϣGM X#9]KN%3)=39]AWu^ԓuy^S j*u^yIP%Au^yIPOfKwV՚׮{z15K%,)Β,9#睏w>ry#'C9e/̂\ τmg.p{~Ơ`<\<\<\<\<\<\<\<\<\y?CO1̲hZRRRR,K4X}1=)[)[y:י뢻aJ)7)pzf|1nيU(ʯM̻]c|vpT(ǹ/"uG?8<Ԇk77lNÅ\Ood."Bhl`#J'K =54PC4M4D.Cy ˷ꗧ4Gd.sM-v7>ngISڇ'ʕ]M`uwq{p5vy1H/3C:xZ˅Mun4}x2ѵ?qk;\:]E6߭["C3c[9Mp-df3Y;3`dSjUZ%\ l<݆0 w+JKswm)lTOsKd 0vg(v"X*>\K8.R*HK䢟d81/Mq7q_.6]By]ɯMz}uJH}]Z}]SJ:ΩA#"w+X ~#Ft"bG7V3;KoDkү! KJ?k^gt:kf +hFl@5/Eͭ~s(kps;or9?3\CyNE?zq9ɒ79G.wqg궑K?jߒp?ju{c`C &m8hU!b4:M ­m28^tp A lbbJ fP֌3}lO_'Y)=7ٸ ٘(sq !%IWV1C5~5_sj. c+ m 9 =4d"uP{{g;0xA[Խ3lYrc1`Fvn?`ra!RQf#EP$%RvAdd8Em9Hl'baCeJZ 8ĝm s)?J6qG[Ò?3݇)~c3潱Mbecz],Q r=ɬ+b D6я9b9W)Ͱ056aҙ:suP u&tΜ0W3Z 6wl;(J@ h"-G'7a7a~p'+cYn&rde}{no^n5444Y ,lX\%[\%;g*X%;'ࡺ"&_,a{ >6|= [YKXV{0,[s(r5"y/lE!ˈ lg$;zN bKĖ,-Y[ dAlɂخ "غ`d3>`V6@ϊY뮂}#Kzf=MQ7#P\=o1`Y0~7ݥwY:LCמۚ`%F >K3.ҌËn ӆxAz$~v2\q!?Ӊu/"ۦ}yOJC֏EÒ~K8$mK$"w)xÅ|d{,C:ڋYnu׽q%vi[u%Rl)?Ӷ:$a/32tg_s#ԥ3fw^aF>apԺW f dr%{,+/k3Aeuʽ|\`x%u۔.VL+W_WsVn%^^pm^>X)KO cE'>mO]_x9*V'3m _Å S\G1.3n,?``CDᝫCӌ%=]owmJz$=qY0YnX kb9ԻUk/ew.3z6aw.,%C8$V.-ydVI[ai&{/MI3 눇%̵%Lg{#H\"L-;P_f^fn> ]0,J%?gmp5+rꭨhԳU?Zp+=MZ"%٪M}HגX"=(AGtC=[up'zǯ6Vۉ`A&AfeS`q֛R!X$ 7"eY[d<UX8ƅ\@eGNju~&5ro>%Ɇ|*bQX'k6(b'wn@1D7.Kz jDxre$W#̐Po=]&~?ty ˷};˼K9]9M8T! ?‡% y ~\⭍!C~%bL)Y~\fzxQwLIzLڤQJP(eCy2Q!IOR$>IORE)h$-:LG64?0i^. 
@fRofaqL`*GKa-)vKݒb-G bä0)6L bä0)6L bä0)6L bä0)6L bä0)6L av<fch^D;_ȋ^sC{y}sG,aesqI*51y 0L^0y 0L^0y ߌADD&}[&e8E"eR [b?CYvȷo[Cy@8\^I?&Y! ICg`ewY[Kk;e/a{_6>Sf%,\u\4/b .),N VUeeJ^" .r&H.%~$78<Y^;l< ^TP~)b0.岉{нSĸ$tE/+m=xXDaY8h]qI8r &LYV^of[8U٢hkdvp?eOuYr|S]x8':Dۏ+z2\kuTkA@=vyXh!!|&ɇ h!|&ɇ h!|i&ɇ h!|&\ܼD,Q^yX+O񳨏<-M؅!E/!Va0jb5 X CfXC,YĄ1aALXĐmn& kaF&mIaF&mIaFؗ/lIaFxUnN6¤Ár0i#L6¤/a1=81lIaFx4nN6¤0i#L.lIaFb#^!,[#wv|> d:a#Lxbn2s0^b/L S)nK'YM{a0^2I;,a?AbbvYOa /qwl)a0e2vY_daoÏg0L1 S )a0`b0L1 S )a0`b0L1  S )Ôa0`b0LO1 SS )Ôa0`b0L1 S )a0`b0L1 S )a0`b0L1 S )a0`b0L1 S )`{DĠfMVT60`9`4FHc$ Dj`ء}aDֶ ٯmEzrHޖ]X5i4j ۲wy}FDĵ-{cщQlfؖ}ympA>mj[A 2CxIM61Cό9ZTM6Bk>y48BmXް-`ֶ8qU`C{#C;/ۡ}{SXuϥvr6{?zYR }ه߾t׾Ny/;yzSb/bY^qj/abb*&bb*&bܗ}"&q_2g7`;{;8s 1oKY_LIb܋}Y—rybbbST11ULLST1>.a-&fbb-&fbb-&fbb-&fbb-&fٲjaz酅酅 eBYX^XEm- Jm.G9i7vAŨɄ|ժ;e#/ Y VkE|ɊB#qdI|ɒ%%KK+vF0%wiec݅~=XBX7_nd|͗/Y7_wČpo,{7wpO{Bḳ(3OcIU\8ے48ܥq<(r(r(Q"#C.VԺB`9!pn{T!:?n[[ ۃ F>hFF{KzD`Nw 3//^9`5{\ 4yXWM^11Ƞ`ā^17s*8xNCK6] )W7}u4>"Bt}24G"D}#ocQ!?!?!?!?!?!{$BHS!L0&bl&bĝBl3$Q{/^faE !(B6Pĝ@ t6PĝB8}ɷN'xrg8s3p2nd·+wa$,#|\{ܓPKR,>KR,>KRfRfR추.e)nKfbl3IRl[-e&*>KJR쳔m&)YMbd)O)Y}bg)Y}bN/vbj)Zbj)Zbj)Zbj)Zb%m}aѭ=CG/Fߕ[{Ǎ KA>v_D5P/nLp7Wլ`zp~@Gr 썁"q&Ǚz6,o aFxe}N3^U1xz,* 9 T^o {v$+cjk"ݐٓ'!Z!`cllMֈzI`(%u4 -wc#'MvUVi*ouA~'+2 !1DF+Lci2MI`7ٍQG>&lm5\lr55\c6n,9h^XwLZm?2sB7,hHo p&F,f&>Bb@k}B%Z3<(qv"]ER+cpcqen͔_?$Շ%zF8UU{1;'7Dwm7{>mׯ%omX>?~cIReD-)XH7!uP^p j`:`ll0/gtvCڗ,iee\QKeiX=<W>/wva?pdp(pWe PoCҐ;/ =Spu0磷qpGC <v0<>KЀo^砸9uVgrwb[6.8jrgD5u]nH ט7sF ב^c9ENgkDa15yq3fhn9i̍QLb2 kD~f̑cj||tmNDAՋv,*|&60hGl`V{P!WݿёL4Zqu75rG<= 8J;SdSN>pYr"mH)v"mH)v"mu ׇyxj\q1øa\0`W0cq8}lu ;aC2먉K1hp!W!=(D9 s]kΤrj@G9`wύ&&v'x1кvؐbCZ\X!YЦbZlSgc•FZlSmjlV{{XGCLJx: Y8S8 t0 ,'?T!LPm~^ydg99YN~if!2‹n(,iYiօL39ѐύ&J]e}e}ĥ;]ko{uW~'kvB 4`;؛"X9 `:FUN| ep-#,|'|48- n8 ppݲE`S#D9 aTMpG-!Ǒ~Ⱥ AϔEb,`..;Ap1vU}F9VK2\6eRH{}[+_>f{^Ͷ75={R!z[Zy :>PFOrw6GJgR)GqaBң#g&GarD&GarD|"?{g`ELBzQ郮 QY-f҂ef;VS)k)kC/˷~,ab{xXtL p8e2?ܔyx Η?~#+$h]Ŀ:$Fe Cx%#Xټ\-RN&3Mi޳4 w!,JpdƸzI\&ߚ|k\%i2a~4q CKzGS>Ѿ&]xOao/i?Y(n^g}lcKTْz_R׫3=KD5KDQl-w 
6KEdY~WJ\R1_C`Y`ڜ9UpY.!cHG.aY$EĿ|+}*ouoඖ꾲ګkZ-ʯ={uSm3_kԍ+ vBٗ׍7NV \}Cmэ ,XΎw}Cxvɂu/ov}Xfp}"*|Ɖ [>h#HdC_VO~{c}l_o%}vMX 7~;Mͮ .BC4%;?4ՉRñ=?g\fƶX=?.33>`G"Fg;\8'O{/pg?K|z6ofefcq  tIkF],r:'3-h+8Qc׍; )l4 ' .ya㷥 %,RKA^vqqOrNKA^^)=e~[ |+W&Ws-?zSB[x0 f!W, CBg⡃CV&x66.X5Y'k`zs? i\4ׅpK~?{9=%:;~/3m]Po^6|[{.y~gw7m n6I[=lqF%۲l2ͬDh|Ьfxc[wzg! ?l:[&C(eM];n76Vo_ [p.6&_X]l#26Mmp>޾&NSzἽ,a.I70]╼GKDw½z9wNSzal6$mSzbTi[Z;,|r3muu@w׳4)=|ߘËsVf4m fhkਨ WLc.s;Y V6vvOXXaK\6$5oح anu15a3a(Z;uaqoPNo2 awjN~:~?hY8`2`2`2`\:fj`lðҚGc: 6MS\&/v2z#̂wuĂĂĂqe3&L&3.,篎>   ]2'„q23 Y{ x&6YИ|;@Sܓ 6L>L>؞d0@c& Bv'L3f_M&L&N8=X{bɏ!?60;FJ<ۉ+?z7`y.ZZ!>8l J`aJpAzpp5[D'&9؉HC;pUckrlV6V3Yh.|ont[_ ]iuy,>SL35QP0yyvqgi.1y,~]b.1shuYYǛpVv]v&L%&v Ą]հTDr~"Fv\SH"u\q:.Re}؅CX&܅I)"\JJ?.LrR2ÄdrL.E2H&)1 2%2%"Q% Q2w/ɾx' N F.fxm+NC4i!3q4ΐ8CgH i!3n#woSH\N4$406[f\Y!$4FHꮄn_ M!1 m!(j+Ɓp{hYFmȣ1X6i+qBW՘!c///xs zЍIݘԍIݘԍIݘԍIݘԍIݘxbݻk0p5DQ}Dʰ7.0xшoܔԹ oKq)".EĥLh'FyE$|aͫgnq̴5#th^Z~zo:4EnNV͠Ƞ+6+!""4E"6SfLyf/={{gϽ)ߛW#{÷i WؘN]ЁGǹ@vNN\ǘW9:?I?S^uK\"??o5Sl$h^f~K_fJH_f\hϥR>hWzmCml SA &<ߘtM`ՌƷuAo BXwTScwjN ovrGh.BBM|r]i"uFCOw࠻wa''r^@;!`8)Awx~FNѤ#l!TKaGht&IGh&`K?r{?0{C@5rpYͪ6[͟l\|':M:MuB;щQ>| DD 1ES1ES1ES1EBj]B’ B҇spcDS-ޘH;- n&fjW]&8>L[._N{%md=Oo.3P}9s֛.dAzXj }`S/BhA-0w.< |ٗC?DC?DC߲)I.7aI!,rɗCpB +?aKXĆ2D N\N1(%˽˸eMܙ"$UɗYP(C2D Q(C2D Q(CL! 
b)>"'>:&_D>KЀe#"0AKZNJIi9)-7ҕ)yՃNJBNJI 8)'%޳G}{b͍L{elRMʲ)lI#ZS5EhMZS5EhMZ *d8o4,yrI䥅!_y'yjȧy@Oקtn{]0ӆ=e=s^ҹtnٮ+emVd0e]߰t0 LxMB茢6[ N'񳲙 NjicҕZ N,޸ v4`*C9@\ťq" q8h49&4FhrMNi49ޗML2e; !9 @ -6",a1Ŵj ~Wj_~5گFոnp7pTzZ(f0iXd;,EBs?Adohe̢Y2ZfT!5fV;7"dŢվDȢYT"JdQ,*E%DȢYO:KU`8XTwO*n/d 6"mKʋ`QQTP,4W) `Q,jEmDL牴' 3ْ@D/vdNIQD/KEQD/Kfc0bBD/]Yw]6F9 Q'.YwC$o%D.KD뒡qdhD.,KDѺD.KDC:Do7r[nvuXuN Zbe$;DB/湐mF}p5u|f'&m&m&mwZNټ``,`Hl?{jz{H7vD GxWf#2Gob)bڹ%oxo3|\ُ@~!o)\x.OFR!S:bދ-/#CAvcJY"+c5݃R2=o7^]Gjfߦ9dJe;vnw8.'+Cx)WQn_jrko!vu lq\K:ĝĝĝ7XohWsg;9fnzy{Qq=_?oux9Mx1w~˺i[שhLCHzsc&~̓m]x ˷};Y{/{.O䫳1M$wl28:l?ݥ̻K`31Ly^, W,u=,!}dw)iAw$OMa X_;E.z].=Eاa"S}O)pC؄P)E nK\”rح$'܄%_)ᧄLI2ɫLK؅_(] w)l(C(哢|.NoTWXn%.i_0/KbJ(=7?0;&qHT?L۰OSx\fڠdRLIQ2)J&Eɤ(%dRLIQ2)J&Eɤ(%dRLIQ2)J&EU2-L 'E(XRKbIQ,)%E(ŒXRKRl˻`=Xbߛ\nDOP$x%>-3,s)s)s)AlLD}rB-!QBF%>[n|uxK~%iKH^,ixn,tocBwJ.aJ~W_o^J_^4?"Hk!c[יy l㣰̈K^ꇕQ2e pBMd=, $&4) e֙OHz$_%T7P=ll%T7P]G %TIs ˌ7BYoۧN/iehY?qpӪ+7uD*8nx$dT-P`|D~ UXXp뽕g#˪ޗ:Ho;, ]yǃrcK`[uQ+P]<< { ޽h8]ypЕM&'A(\wلCXFpF(?^˷lTr-QqÖ́{N[Yt9ۂuY',:˰%ݮLѲ581_[YM0muCpy ˷o{esAe sRXAv獃KnI\]rv4!%/r^0Ǜټr޶x5zܡ-Ю-P~IK].h]vp a06B 첤M$ݼD.-kΗۡЮP}*"'aSlx< !u' Iۤm6ib4M&__);NS:FvJa#hQ:*acY'IuiwęN:NޅN:Jit*.l1":it*'E(Xy"6,6Kit,6 OD"at,a҃at,n0L0-KX'5rpit&M,qgÚoC/-OxP]ŮbKt%]l.D[-ŖbKt%]l.D[-e-ŖbKt%1t1t]]l.6Cfb3t ]l.6Cfb3t ]l.6Cfb3t ]l.6Cfb3+axاeF fĀ0`2   u+;$aWeI3Q\,uՁ7d-XDbXq;pw=nT{!جl_){``n=g;01~Hvf̅VxMVPTC9-uN.M,&Ŀ/tfYlfa,1c_"o(Y$[' } \m;Аs奄U PF攗'%/OJ^<~w6#ً Y[=Y+߀?'/8-mwe_$osÒfB1qZ ͖HI佫 /KkH͡X죌飀qf &Ij]xG ՐY^<ÀJgYt(%JgYt(YQ@t0е&-bLޱIP@R],i7(% hZ(wjhZ%jh1h^(%h2Z(%h2Z(%h2Z(%h2Z(#5H0. 
aQj7<\{l8F7,7߰tttt D X/8{!ۇzsPonM:THzQHzQHNHNHN a7,7߰~roXn͍q67sG,ަM-yZ6mjnB45>lL} l=b϶fزan-..ayi8Wn)Ifz8$e n?iC6MދHpK O 3!kC~^.6PqШI N~'[i34RVYx<耋-FXFȸq7|l1cɚ'2fOé48JLHLHLH#{\$#$#$#$#$#$#$#$#$#H!DJIi$R Q^g; oooo48 O48 O1N42iL Lp l^r p p=N%8)e.)C1AX%.\k܁<ݘc&A]5]5]_/ D|{{{:UT{@{@{@{:)I<<<<8+y2=cgГg9c9y@C0!%(X:%oJެY+yVfuZX{D2]@tQK:ˈC℆K^SZ5),۔2`0 &Y8?/ \<8*q?B T+WP%J^A*yU T+WP%J^A*53]35G3hy>:y>:y>:#98ysUUɛ7W%oʠ118Ht |%4'4g \k`%J^+y V5Xk`%J^+y V5XkN/4!!!!bCLx a=^K2ƗI 20i1!=B(d/W^ 1oqЄbC+Fa[hM|l?|AKnH [MKx:]9#`.֍~.܈B Y*YM'vJ?\|R'û+K3d890 |q x x x` lW|eee탥-g{)W~X☤GUlce)`yXη\ ,K\^Kz3>,J,gd;e8e"e8KK;CJGn4* 8C(3 8C(ꊢ(ꊢ( {%J~k?P EP EP EP,pppp'-4>'cgҗXK_bK,}/%s9Bu??*5;udPbKr?=} ]Y3&Wo {Pue.L,Ds0B0mn@c>qX]=p܀CC]w@h; $M4F0=]ʷGf6ȁl\ȅFly`sͅhvw@q| d7&}Î 3̅nәa:S?LgLflS ~q!J{d5{ǁ9v9k D-zsǁô^ȸr=:c^<M0'cG<@‡S‹؇GՃن=qFO..ɫ~O g9'I8ƲgӋKdx68`Iل2h>CP௭tza"B <ؼ8ȋ8ȋx#]F +Ժ \7u\;w82Bso+tw! Y^OZOZOZ ߟū$$$$ܨH@@ZCb|$!4rBNCo 7bErBNu.|h>f ),'i9 <\ %& !4|qʽU&YeU&YeU&YeU&YeUxO^=y5*2*2*2*2*21[)`& f`& /ҫI$I$IVṩ⹩⹩aaaaaa q U& _G.Pj D II :ŮAV>8$PC20Sf La) 3a0.q>\[&ډ 3a0$^/fX_p\[tHtHP@5LDO/2;?y=[x[Z$O.P[skOJ#Ll"Jq,x-^Vܳ= t/Dn!^V4HC t/cV qC~q~q~q~..zA3o_o}G(kX|5`/EKԱE /"PNq7?~fXp,smhPW'NK[3MƑt!(ݳ8:{<G}`g:-67?Z\qdcşփbaa8 jK]t -e:.i^^$XI/f:a?K|y/KKOyX?K:vΖ2AɃM ta#e6)˃M `I3O,^LG.qdRvzpoX>0Bݫ^)c!eT,{IK^-bI̚ĬIn]3CX]F2.2.2.x[`gM8̈́L8̈́q_l[p יp יpS<o2 n4@<%}2mn4FMŸ3?n4FsY%xėf{.FS, n4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4Fn4FonFondSntp/.X ةJةJةJة yc]*%R*%R*%R*%R*%R*%R*%R*%R*%R*%R*%R*%R*`$Kf(a(a(a(a(a(a(a(a(a(ǝ ^fa5z/ G[9(͋p"78/ϛ(G ,fVߥ ľciw$D$y2 ?86!j|Qdƅ ɷ, 8K,ojoϨ~>x^wMK"f?)i$%$>,u|~X}.>[ ut>a>ǾÆI"Æ S03~K[}?˳?wl‡6piam 0zFo A7hޠA*ha 0zF$#$M&VH Fo A7hޠ;ޠ4zCހ`?gO/4zC7 1zCR`8)E~B7 ]<7 1zCٹK:m!pbv>,q\a?~|VobX!VobX!VobX!VobX!Vo`aS7S%`GXp1CVobX!VobX!VobX!VobX!VobX!VobX!VobX!Vo՛m&ބ՛zVoMX w\TГoMI7i&ޤћ4zFo^zEp L&Dnp21ɭߤo[bj ud 7iNS3g)pbK:K—o!Xr v!yN1S 8N1KNS4w)/_nbN1Sc8N1S|F/wʇۈM1bS#6ňM1bS#6ňM1bS#6ňM1bS#6ňM1bS#6ňM1bSĚ]<0\k.Ka)jb)jb)jb)jb)jb)jb)jbVê/YZ_ VkV-ZE h-Ժj5d۽U"t6 k$DbƸNwcsym2š(B A8V1 &ǘd 
w3͸Y~~c8D M\_`6ߏp.܄BV Db}ȇ%| )q-XeD>ƹ6<?>nx8#~x|8a1Cf':.xx*mf.Klf. u<1A[p]GKB\źJon&/$M҃]<_:l[Ė.Kl[p]Rf%vu]]bW%vuẎA&]bc%6v]bc%6v]bc%6v]bc%6v]bcesm{,a+ ir+ [v%ta~#"հ@}bhqă/ .Ma/Ncg]ҏ!xY)g5^?l'O0ZlkMI2c^]҉bSp`1GO+o4@H5h}hЇ} Ibݛwc&B7:;;ǾrDɰlbxM:IG6.?5<gN gߞK^l;Qwj]!NȳR9Rrę|K 9]9~R)}Xҙ.uxJpp&e8.ϺTΑ7{ D׸׸~$?'?%$Y;u$u$֑ЋD\~d\` bJ}(]UƸKG!Q3Q%Q3Q%F~,[!]lTɻQe6N%T7CQړ)ag-q8* f1{v8cOv1tI%߶O<8tb 22_7pJx1|y);iRvҤII2Oo(5+$͐t'K46)S|SdHm<~S6,3ҙ.;LBxKI?[bS\)S|zǎjӱ6~ϒ)|8Y3fg͔Ϛ)5S,g[FN2*RT@ HQ)s9W"HQ) E̹RAʜ+ESY]FQ >.@)(|Jǖ8 =D˔ϗ)/S>_|L|DRHIQ")J$E(%2EjURP%URP%URP%URP%¦~IazX'E}R_QIQIQIQIQT N#D(JQJQT#pnP-ERT-ERT-ERS[*=ERwOm?̢оP-ERT-ERT-ERT-ERT-ERT-]ERܕ[T-ER*vN)%/x[JVJvhR[JtKɊBn)-%DR[JtKn)-%DR[JVJ4L)0%?M \f?.0%V̲ܸ5L)0%DÔh Sa;J4IqT} x¢TXaQ"&JD(%bDL1Q"&JD(%bDL1Q"&JD(%K %D@QP" JDCh( %D4P"JDCh( %D4P"JDCh( %D4P"JDCh( %D4E1ܷ 7_:b-€3kp qYo_,ƻ.x_dnYm|#s+fQhx|!*dlx|(j q͜/Dzp!*d_nffYnxYxҿw\(CZ#4A#4ڈ'NsbUs$4 Tw?AsEh7p߃-/xȭfy1'×g_=A\A6]%ew}p.GJR>Ǐ)M2l~8$fܛ,NK"C?4ፏ8k:9vKlsv6YNlM b]zGlmԊa*bg4 zX𩠋n1;7mEt1.L[H}CVGӤA=,<sHB=,ouMF#|75D[ 7!} |.EkXM@$”P' Ÿ SMS0r9`oL<޼ف'Ǐ%zxJ5`w_h G{M*7uAX zo߻Q0}a!y?$-1߱QdK1!hDKdx^O;6bontk]I %X.\98%~:qc-$!.!'Q.aDEp+7㔤z0KN*?Pw?Ng6Ʒ  b ƫ&W`p0_6h'`]0Ó arv|a'<;$~} $k Úxum职tl/#( iw1b^1rc 0I4Aм4oyTF72od|#e`|#e}|YĂV>%ED7h~؃'O`̢Xb;7ck򱞲S xeG65GPAhF\Ub6C*PqzC{2r`AYAYAYAYA:c1|qo5d$1?qE~;œ)@eZPeZPeZPeZPe9#Nw8&C_l8X 8 :Zu TS {KǁkZ2G\l_Л!z3ĺbuBN<{ߖ݇Iq-07<=GtYƌ%/d|-gJ<:x(uP??qO B)B)$B)ĚB)Ě2a'n||uN!Дx>8vt\ě3لql)U S.ա/ z}]E*V5o"-X]p6s rGn8; {`!wN sE+\q`f?zca,D`=AE",aDEz,cVE,2e)LYdp ͺL/O ч0FC}?_{Xʒ/ ORÒ4kVMcx8L;:Bq0¸Cw¬.סAGz=`b%=!8zBCq=!8e?`I2)IMZ8p Ò`ADŽ{hp !48BChp !48BChp Z> ;;gP8pE;5m~!dH,8:^ ۿa7,ߴ}ӺoZMdBz5`څH=]v{v +ݗ,_X,g/cN?BSn.c6Pv{v|,j %B<- xB'Dn I7shIg/څx͸q7schhoށ2.H!Yr?y聋ߡsf#ۼ\~܄7f쎬,o{O,[<~ wb/A\ȃc\v<sd?båp..]<~q?!?6wdpiH7A} G.EV*٢J-d*٢J-d*٢J6]<Y/(M/c=<=WuYgyP(- eB٢P(- eꋥP+[ElQ+[KNElQ.[ElQ.[ElQ.[ElQ.[ElQ._{&jݛmbkXy/'hAb4v ŗԡczi‘yυ\$wxG` \>M9ɜ#G`&M>|?\ptq ER|#G`&M>|6lr~qNf6%%-$2wptj'2?$G|-6Zlkb&_M|-6Zlp"NbLBMhdjB&PZ?~ o2 5xI _̮GĒWIrl>åm mxaoYfP б б б б 
P1gt%3{lX%vĎ٣H'm;NӭE9gεc';׎kǎ;7Ls\;v_@?Sڱsqyƅܹv\;v;׎kεcڱߵck8 X1VyB aEٹ\QܹP|XQvYQvYQvYQl,ώM̞ʕfffff拋qt"xw<C^2#)Kl?c=;`=>*Khc\Y_ L# Zn؃]?4 ]!+Dt"BDW ]+bD,Ƈ"BCXb| 1"BX c!b,DbX^C"BDZH i!"-D-D"B[` l!-D"B[` l!-D_u/a]J*Hڧwe08wxI ._B?Gĉqs@2])+)+)+)+)+)eB;//` P| ŧP| (;S(;ea>ea>ea>ea>Sh:ea>ea>4],]e ]e ]e EPd0e^v?ˮ'. ]\H7bJh4{'~zVNm^U=yUUՓWUO^J=y)ԓRj{q7QOD=Mԫ$\;'/hz.t_ˏz5~3,ώM̎uuuuEKV !)X_KKKKKKKKlqyy榒%%%%%%%%%l4H}&6.z68%kI^G#]#{}fGIb`0w+74>fM%=؋4la2 LxԶ 4\bKlsm.%6\bK>LJ_%6fiV4dU2*f]2*fLJV%vd:Ua/xN}PJA>(%DLJBQ+q|@HvG ]__kpkpJs!n~X>p#V #Wya5 W`/^x׋^ַ^E+^xE{/Dƽx2cƽxa%{6 'ŋ/4BIO&k֋wd֬^#{Údki~ؽx֋W/^Z~X^zF(G@Hwtȑ+K HKYNpKp8 ,ܰHp`c9R2(/?8s3vXf ]f?N򇿕C39ђ~~bM>[<1Y#-bBx EL97u&BaP>ŮXuISIwisq>B9ì0z-f۶~f Y@o#mbb i 4Cs(XQ;JC˳)~bb6/8md8 8? sk_=k`)kXDgA8qM֚FW݋Ys8y%[8 Oŀopu&w q,n&Oɶ3߾3a,'tj| x#`c|<έ=\?̺)ep )OHy"@\`CzX:% ̶+hLgK^[ۭץ>丁]l,7r\BKq=r|nCv!Q.!%x %D(zD9X B\B AXQ|mP&m&m&߽ @qUwwÞ R bEwt/ѽam&ko&ko&ko6o^{6E kBֵq!zs\ w1h,p1o%qZ܏ Q[pޖ-e)l n-\ek nY‹[V sI-5fۓQqm2񳿡 8'X]t74Ccۛv&sCF滇C$mjM$=Z:ěDw-_l?'Н(ULɰ5&dؚ [ako.`UN3hC${OP7F=q_"ڣFh0/@&f(a3&Vy14޵nhf~GFu(fvйBbfAwh-/t\ Ey16 7 ,W,W,K,KL.ނra*L8'K.黔-C;8:T.8+++6b2?Cx3 0 L\V ͺ/c`.R. 
d0 q'؞s)Qv srUp5rKOϳM:AzO[!r[]4 he苇$WpVge }b D`T{O_\$VCBֹBְ!1^'װB1DţF<;%||]45Pc5뼘\MQ ^xqs`p`p` <8&><#|2q Ɛ Z7LBx2'Cx2'Cx2'Cx2'Cx_d>XvOL/KN AG}{|gPLD.9KND.9KN&C 8mi|P޾*x;=[[x,- 6&'ƪ ()!2LN>=;NYAOaae6d:a|Y&eR%E|*_Z#a9A&))Cae^b2CPL SUʔ0eJ"R)S”)aʔ0eJ2%LL S ; .Fy("}9w&L,D[4a fp!213>@u#d p^ lX~ 4؄P#FLa" cAmfQmfQmr WQm>WQm#Z| R #ŕCqaP\B( lD*QEt&D)`&h ťzBz5[^d7%&D`X'jX\bJJJJ8^~׼?ur*%t2m VbJ,X= f:%֬D',|,v֓ٸ-[e+l%IJ+Wb.2{XmMDJX)]bKtU,%BX Yb!K,d,%h!'o1?i^=&";+׾c#~{INYC )W)gf3B~6/D }[j?&!#&D-O;~B E 6;1wB9',PqȴneK,P|@p(}jehJJJvc'c Ⳮ/vGᇙhp`ϗZ c #]\lK0vyV?^ xixtqd ..GW+;.pzrV!5n@>CC%܎p& v G`v2~/DƦ0я=afãgCTg"CN+oXlCn }8w2W|X`..a8p Dq5 yVX(Ua_w\|8P"vID$'XbBc3bx S2 )3bG9pH83 t W)>L:p%:,F,}^l|+Xϣ4- toY^i s^l8)qzyt.=<GO0ftlT5E'TYO=s80IXq)L+.;Y^8o~!~+~[X }9p1t1t/B4yb#>jqP1 'F.~y4ra `&F.D'B}oMھI7i&mߤ훴}M:D=1!4ݐz 夡PclJ&g +1JL0'> {0'> {0x0ݓtw>AļSl(f.4tdaqf_b)xbq  >J$܄h#)RB$/Nc.6 8{t)RLu&?^\x'vs@ ]rQLv-FL*A_[6 L)`<ɻW0֗.S9f2ɔY)H>3yB~`&7:k %-8v))"gș\z "gr-Xq5/UpIgDDDDD&;¾y^8犐9$Lf 6(No9qwtjB |xkplt/ F7޼sr`FWzRO'؉ ÛPri眲d:Ŵ盇%˳ʹ,qb)}׸azx}#x؍M dlRwf!qR(bo7&9.&ɷ~,l %=8&i+YCo0ŶK_sWIeң;Kl&=(]l7)K<†Rv_ӫC.>[ۙ2vI8iL?bJ|U!elR$U!COC fg~'?fmZ&_AboaêmXs;rGva{ot}J?FܳP8z i6׻3jm6Ϸmo<߶ymo~6lلo"7?Xl~`q6oSMKi6|D]Z\ׯךlG-)|آX?H7C^|]g"Μw|g$/ 7>])qKMK]BzXʙ&^&qX9~3Sf)鰿>wT{?L0_R~8]3 O`=ƜC,#!&F:kŚL0w)Mak;H?)v˾a8,;$\ȸ %K,Y w!́lvv!M[SAAXYmp!N6 CP|PivY$D߆`3 6G^K.sx_Z`,4&DbHLkЇ9H{.̉| ` Ҟ$& ( ]8|8Mq{qr)eNi"?H le>qILęt򷳯3&8F+=Y>+Mt=]p̡!^|=X&,KYφ~7NZ"|x f.P2u>[(N >* ;Z28)Jpޕ &_ $'9.锤#4b Yj"KMd,5&DRYj"KMd,5&DRYjOE:4F5Ѩ&՞FFtЫ&zDUjOv!e>Thԃ%ͤi؋9>.t4I)/ZDh]kuM4K%4AvaoYoA ;$C:S:5Sոސkvn?|sѿaدKw{}]@6qߏ9 F4/6w=/D ]iP|s kl^^ѿr6^q8Ͽpnd\f1$/D-@2,E7>f|,!M{9qNtD9qNtD9뿘8<9ttt_ L{KJNoQv-q p!S(fQ(Yp!^&8797Dc9MG/Eq \<1{80$xx 4cw<$<^?L<\&.p =$>1l3.r+֫;鉋qfwśMص W*ˏAZ}A|1uIC.\zͺ.EH_W"}]?pBS?qOL?6xw1Gڽal}18ߙ>,ɀ|5R%d?Xy1zCSO2cJ(k]d L{qH6$ZY6`iĠu.EֺZY"k]du.EֺZY"k]du.EF=[M57_o׊] )xO밁[!wpwйBEXAX(7"EJ#o߁wp.d.d.dY.pQKgǐANZ|ܐ!'dCN|XdI&CH<>B{]&qiF/͆| r=FqB(.N S0>,駤~1>N6$6 ?8'?n٘znfuR ~n6lso0yBޔ):e+Eʖ|0e?`=N2`L3e` ̔20Svĥ Ɣn)RfZ:xɖRvdKɖ-eZnh)R{>qٶ)RvK 
,e'XN`)Rv|O9(Wxdh-)xOo<킧] v~~Z]&Dh;gȲeʦM+W6}l^0M+W6}l^ŵ"^X"9}< B{p}<]\xK ّpd{. l۾X.c^6]pOK~ %{/)h,)Вq撎K&>/N6⨟`;9:bK—|Ϸ ̶=ɻc.]KۧiV,l+27a.վN 4b)-XuI3.ze˄.7OEuN T~~mrә 7 kf,K3#|B3ԙ Ş˗6Q0zMa" '"d \j enmr`@o e5Q6F2F2F2F2F2F2&^Kۑ+:Mʅ_){f>&~0)ILʁ2<,:-8lIoh:a9ZZ<\vyXkU0qsK|! I[*\l &f. pd*xIq "fܡ ̺LGXM^3뵤oxiζ$\5SK|g' !fSڼağU;j/W2 r r r rܾN33wYƅ̸̹9fE{v>:Fƀs4 dЀ A2h@ Ƞ4 dЀ A2h@ Ƞ4 dЀ q H'&CFb0+?x\s1]."n˱c96y>U8c5/PfQ2y1S0*Ctj,=8ϟ`IJVd!b0N<,eH)gIl!cb0 ķ`o۲L?#a~a$!`!!Dż\k*I*I*I*I*I*X@X@;C{q2F!5}}wG"'vInInƉlf<&3al0$|!$K(/Q6^lDxe%K(/Q>;v{L a2v(SXd [La)0)|h2,?$3%[3)[ЮveGC;w?~..Eڷ߁.?~"=$}?!26AڿхmFafvdt!..Dق![p8d l-8Cp٢![tl~m->p[ 7!:P;9}KKk]od޸pacRsq~1pHx/2>\Ɏaׇ'^[os~.=\u bL˳6Mm3M3 xgI.Lt1b-eq1(&p^X1a vnqa.;[h|XO T/4B&w!7+`pza `VLb6lgv2m0m0m0m0:S=8Qvz3 aW Ȣh>by!â2OROROROROROROROOOӶ8\vi9I9Isiqs8ʛ%c@{>FLˁ(N+Cst" Z]pg28yp e 8ˈ٫hY\6{,ڹ޻̪ L8䈠gquO.;[hn\̊W~LL? k(`2x.s]ƻ̍w27enx.s wdBj0h=#h=O8^^ Z u m{4/qW&/j ,D0*^5og7nx;vv-hidd) ww  D`xmUV-|CxݰJAS4EAS4EAS4EAS4EAS4EAS<D5ӅN"Y>wfB'+2B@6@nXNyhT8ᅚАq;~q );zlsmM'oXѠ'k4+yFC7L}%t ?!9 U!؉-{w X-/#^&GLxx?o4ᔼş]tym8%enKBRk2FpCpkїt|鑳h&g>fLQ{Xz>a_ 2`V{!Xe3|u˳waR~`R>'wKYL;lg1`wSOq0- 1$=<KT,&$=S sK;}|^L\`AިE&.osD[$"HE-mFYgn 閐n=ҝ&l n 閐n 閐n 閐n 閐n=%.b||]ib.!b.!-Iy,UJ%\B%\B%\B%\B%\ppfa Lc.{.D`VYao6 g4 ,|`)h)y5 _,e2#đ%N Bv߁~(z!D5=_<[ҡ8&ikofIkRǮ%􃭲4=?L$ߒKկWKnMMӴė4m_,u1y&j>إM3ٞ!#ia̶j~iaUHBFLŶ3~J6m?,(T,$'=nھxl_:i{4m9oR AAAAAABCzU!T=zBCy=!<zz;w?%ݎJt!T:JP*BCt!T:JP*2'%t8%闤S,C;?|5!8PjBo gSO t&{ f\hrMɋ]0)hrM#| )hrM!49&BChrM!49&B|[NKaΎiуVBrCx9I~N)-'r-'yt߁(7oKÓEG%~788GSO)CIs f͇lh0/-%)ev]zyXޝ}ra)aI[}Xd/1{׶lG|ff&aYX)`w\bvcq.?go[/1{Xr)X%,Q?EOSD{އVIپK]J*ɫW_zvoԴB0G[.pHxJ7f؟v#GرLVh٨]>Vo`ҳԇ"baa/<ھb>]lL=^,sOaǔ1nun =Wmq/XKh-EK˸a%[d KĒ-dK,KB-faiX%kZbX,SǙ?ƁeZbX%i=k8=YbX%h5ZbX%h5Z4A|CSCҬgEG[%VdYbEX%VdYbEX%VdYbEX%VdYbEX%VdY2Y2Yb-X%bX2Y2YbX%VaUXb.#=đtVay*, K’d.*, K?K*,, K*, K*, K*, K*, {y^9ڨ7f=varĽ{4p;uߘ m ~` o 0f Y@}) ׎/LBT1oc?PKFH'db/hZ^ 5>Aܒ)p!g&8K1<;%ΒKa;ŭTzշ!.?پva8̷]|Ǥ3]ŀ)W'>8SڶN:bW_<;]3qs?I~PDr8v6}!)FH1(>A¡" .^.-8\+]*`\`n3|X daj2222ܼb}}cÌoJ*--bآ,(--b2ܢ,(--bآ,9?\(I|n[&׾\\ 
$ǖɱErl|n^r!%TȌͫ S0_F_0!e@l+[El+]2`v kc1LJOwdAe vd1Jd1Jd1Jd1Jd9w ;wv,' $Du Y\[ͪ%#$M&VH Z;vd1Jd1Jsl(Y(YR;oM^Sxʳ'X×`/|C,&DHb"YL-l NKb"YL$d1,&D,[X3ٽ)YL$ɒEib"_L䋉|1/&Dut{x@R2RƾueL1)&2Db"SLdL1)2D탵p)q!SLdL1)&2D,n,n9b"GL䈉1#& &rD9ba"GLB1#& &rDS9b"GLɪ1Y0#&rD9ba"GL䈉1#&rD9b"GL䈉1#&rD9b"GL䈉1#&rD9b"GH_|BqreP(N]˕O] qJqQw!;=  YLs>q'.E}O7>lN]+8s7\f܋9|C7>2eyڵRp & MC\t=.f=\Ŭ/p+/6džPh"o,5&w3.1_Xߌp2\z4&qތ{4bթI\4&q$.EhMI\4&q$.EhMI\4&q$ћ7>666Óy4logFԹ3=ہہyjyjyjyjyjyjyj^=^=y 署g[ -![C$ےMts~qM6" 7O3:prq˾ %I|Pn_eÖ KގTP/[Y9kB%>q0ⳛHM[8g7cS[KCLeȾ){qu);Y|0!2T1k!,Ĕbז>21Q3e/w2GbBQG=`ӘnQǛE#)!CQG:du>}!CQG:{YNf eO4K c%=i\x߲e}=.a<xڞSP ,"ݾǓ(Nß3\`gqZ.#gs Fq3/p5lK٥Ϝ1ܙFp1;5?,ucW_ʮ|fRBo||w)f4e_n|t٪%i೟臧K{w`|&|_b]1Ãu9$~p =p! ?[e{ ݅|̘P4:XݵᆅtQw[l8r**'^~f(*U/d %؍Pw7Bg(ouc7Bq7xQYqcBݍ S'7&lL1q ؤPIdBM xs8ok Y!K4d?{߱ϟg{n4b-tt#orэ\=<℔!4ܘa6MӐ(CJH[M! ,v4dhfS )iH9%Ls3X?}xI-qLw^D8ǒ%8%1ۈ8(1C?CYď9LuY.}a8{;| )R5kHאX[dFnX"IG.I,%DR֓X(fdI߄n /Ke,%D^yY"/Ke,%D^yYד?ZDkhYOkv#zWBwY;Kf=i/##e2R5K|)2ReHɗ/#%DhZYK4kf-Ѭ%_FJh٢=3KD,h_?y}6~-oQ{IٓWs| ~/}\X(I q s돤 nBsd4CaĤ`x ??I`@(80x8u*GpI%~c3u:bm6:wp(_%!nq{0▘6/3B$AB0'4ڇYUX ׇ[>AJrVb1B9[^L50Q؋?pJx18!\ViXX&ۋIQKpH8+# AB-n/u}X!x oq0GFۇ`r_ۇY8%DLF"๭ר;3"L,D?$E.d օ܋p3>惐qjk7]7[9p1t1t/B{1d/'5 ^9 }xCHޏs ~ls _H|6"KKc NfqKgwla69$o)`yb̺Ѹ`e/IgKzmNI'b^Vӗ=N:}?,鸤N1O& <.%XfJMMM,))'IIỔ2%aꤳ%oݠ'FF߇f-<@-vu [4cflȁ On\" R(\5 M\6q&.EhmM\6yņL-64;h.1Kxavqby"6p͵dlG:1Ĉ]x +pNW5GAc=#^l{)SQQ9Dg1~&Xtk?>#Ĉ/1q$t8¥ז6  w{K'Xʠ Rt_v^~ܛkFX"%`X"%`X" .&m|xJ,wp`%BePYtqYa1)iJBX A+ 74!)H`)C1%Z"%BhZ"ML'} aYigR W @P-TKՇg9wLN;8٨4;lLA?qCd$ UlSm:p##(Mŵ)|5|5q5f5[5P5E5:5/5$55556UԦTQvvvvzJZ *jڢ-*jN{q,4tzX뮅76Lg)1IX>u1]#k?2ayv .={Î1I/vk$C2U8~c8Ce>9{?&~c%{D~"oOK%-o"-o"&aq-{{p 7pn!O-ow}vT `;-*r8V0H*rF0kۃܢ"-*rܢ"7]_-uRw(MlK}d#qS%s؇`8o09=\\aN2\^ӤQvnzAظbgxJ GM“/}?L*!Ґ29/(!/ \#tv-t"AH-t"AH-0lز c6-0lز c6-0l"/-[2lWėq1As}[{ K [\ٞG`0;p#>Ssat=2/G0ѺSrC(d?նC[n>u11x^mgPN)նC7PlvjFijtptpm'VTsp X|:6xeSgIZ%%\jk"vsn ]OM0vpp1s[>Rb0Livv{qo?xK<;]㰏q9paЇ?k$vL.hxÏpoӞ?ݍ$C]K]6A#D_ÁHKxvKi@YFQ\JI*T&!Xj*՞Ju`-M b5`kXMbR5u>بkXMb5Q&D(VjXMb5Q&D(VjXMbOv$b 
XMb5Q+t6K's(*VjXMJR;rbJ5Q&~)I#(qkYS^6S^*T4Q&DuN{sO,?l?[KN!~!GC^:SJ:S?:_t @AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\]>d%nXnp\ p:rjwxމ;na0}޽<xJx g7yEdwpwQGt޹'8_;?&?q#\?9ǿO3~T5fy/r#A@O@x t.E BWA" ?TAO!NIo)sQrNO AA/f U\T?xTX*Bɹ7qEɹrQr.E.?rB(6W.E4CaG\b(9GB\*)ixDEչrQuh-`rB94_[0_sQr.JEɹ(9%\sQr.JEɹ(9%\sQr.JEɹ(h% J. q)°'? ꂢ. ꂢC]P]P]]wݽ+9kjIA\}אmq0e} cwa$d@qu3֭^Ȍ o:!|>&"܄La1nfw3 c7@׀; t쵺p)'x 7p>|qUși=1Y`yeZ-^?pp]|'W$u~8Z?B~^S#,uHaWIxf8s`~|<7Y{x>D ƧΛCJ0$R`VgfAW!"Β8Ko'G<[ʣ6䟽Sty'X7> qB،` Rc+grfʦ͔M)6S6mlLٴi`I.otC:vFǘ#1[11RʤI9*rT&L1.)GYRO/GǓUb&ɦR>m1Qꑊ/)SWRVRVR6^ܺ0yfxq:$3W,i)qX%mxؤn{HKtO)xK|,,^|vb`pgM5-rKSS—κ') K ؏<%Β-c2&\e2,"syu7E]LKt.='[Η%L} Ck<$0 w 4SҬeG򒶍JjV`XlX앱`C c]¶`%oS+yR3PIbq7j1ՔK πYrpPk:`ya'ʋ%؂..yIoMiʒ9~) .fs/ +`w)KzMpL I Q~׶5{[^(**iB  ŅBvq!]\..d ŅBvq!]\..d ŅBvq!]\..d ŅBvq!]\..d ŅBvq!]\..d ŅBvq!]\..d ō ō ō ō ō ō ō ō ō 1#bBvqBq!idBɦM %J6-lZ(ٴP]]]B6~Enη?36-ԷݑB6~g!?B~ީ9rg~ķޜ %dz\% 8-t8xx,&w΂?Ly:1.pmj՟Ȫ??*P]]d d d[j%lo6LKA3&pRɡxتd^So^4gKh=]2(Pz.O"a$3j}1:oTDžȰWJu{+X%58˙ $=j`cGʇ]pncsKygs썞=,76ֱ'DBÒ~Ll.8o@%zc0若e,K%7mR^޺8L)O/K]1d%^fz})CO ,l^z'=1Mfx/<)p$_baIǗ`1^yX!ᒗ^zaSң%$%\*5=t|DPK?}%8_=_ > s“^0gjЅ__ޔ|CfCC|74&;3H뻗Bԅ(5,NݷW}Vt2nwu qS8aMBFͳd݂56ָ.d u%|GC?ԭW.p&!sLaŞߺυ7 Yo |XWFɮ.pV~+]"̆u!lߋtwbu {u!a8! E%[z!q-X Jyx vA\&؅M<5;?W&=Ί9)$7Sxxc*i/biZ bCmK*x`楋#ǒX[;ko};Ů:z߭ߏGr=!b%F al6؇% mS|^<`Y Iċ/<+Vgmg%ĺ?7H=mK\qvFU !,G#VD:LK۬?E49LF&g#ldr62l 0ޮ bb'~i;GYhrQ&(s9drb291;1鷺&]'au57V"_.g eztr2e5Da29sL\&g.3əer29sL\&g.3əeޙ$}It'pfrj3$ {cF́3ωpLsޅ9ְ}Ls&6S6S6S6S6S6S6S67F#$-roK4^d\NLNLLL>0hM***Io6$602c232!gMN=l=o|e0'V e惩%arͰ3}!r*)x"ON` () wpSlzElGv|9w0l>TSj8WYo ?=\ԃzpQ.E=\Wol<Kx1B-Շ. DFZr]-\?@Cq + < tB%AjfT@g(f)@v PֽrQȌ[) x]wQ. E(x]wQ. E(x]wQ. E(x]wQ. cuƢFsQ;G9YHb]q]C.b8ILO >x/4CŁ\hn|e5 #&ns܇%| )q-+I+b֓"xB8$NH^DD. x3)e3ӞL+d۸@. 
YO &_!Dı _tn/T.QZ<@ xkdZsJ%X 48%~J|7 *P)П8t]O=lOeDRNrUzjkH8;5D.QKD].QKTe/+ߟ_:zjSOmM=6ԦSzjSOm}T6N^̄XM<Zhvy!j6q'&k8p Xs!~[/nM'zH/63;CC@l1Ye7BTKMIs  h*yUT;wP%JA[':=dp!x.xX g)ނ`­o\8}Ń ط[?# [_?fx~G1Vc#Ոo1o3ɄzwI8wII_1cpﶀ';bN_`y,ۖ2)%g0`o{y<+ >Cf؀o؀o؀o؀o؀o؀o؀o؀o؀o؀o؀o؀opsw؋o܋o܋o܋ow/~'&ĸ߸߸߸߸߸*Tssk- Y͸Ƹθhvr!nZ.4jMmJ6S.M/&;Mv\%Mv<“&'LN09 `r$I&'q|#طd3c}`YZM`pk0xrq`9s&L;GP2LəӒW;-yﵝ<4ˆ3 [8p-l1%|I8N!6q Fg#O$|K,ng#D]\FJjX坺S0X]Wbt5p/119cbrjay719obr伉y&&MLΛ719obr伉y&&MLΛ719obr伉y&&MLΛ719obr伉y&&MLΛ719obrɿ_'#O!')=R!Eϥ o!ERԱ!J!J)J/mR(+J-J*K<kT~hi8حTN%W~qDJԩDJԩDJԩDJԩDJԩDJE:uQ.JE(Q%O"N+уǖ{$}n7S |CcV.*E%d좒]TJvQOiZ'\޻墒]TJvQ.*E%d좒]TJvQ.*ٟJR d좒d(dA%dO%O`*E%d좒]T?5 dRoP aj鋕vQ0.f-R^R"X(ce좌] pq AAA|nQy- ^5=ۂ8 8 8   ݠ *۠ *M^AAAAA)qEn7MO?y"7EcMiQxqC0D؆x+/m !6D؆aO$6 `llW;L)^l[ ? 6Dن(elCm QG.`((xCoYo(T^ᒼ%yK L*wo Q[{[+b(PSjBMQ) 5E(PSjBMQ) 5E(PSj>%u< 5?ʿwBSxwHp~۰ұdi,M)4EY"KRb gu KSdi,M)4sa)K-՜ĒqIfŮ-EShMѢ(xKSҗ!Q~/2l<Ɨ횢]~/2UHȺԴ)6EӦhMx uSnMON`nMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7Eh. *Z7EhuSnMѺ)Z7EӴ 8$NJ]{Tt2AR~),?kvMѮ)5Eh횢]SkvMѮ)5EhjZ] ڵ] ڵ] ^okRP[P[PE[TE[TE[TE[TE[TE{:[m]A B9ҵ(]ҵ(]ҵ(]*7G,EiZ"GK$hؤ~?9Z"AK$h-%DH Z"AK$h-%-/듗6}G$XKbZ,Q%jD-(%XK`,Q%JT (2TJ_+Q|%C|6*^ҫ>e!'8 JV*[%rDnȭU"JV*[%rDnȭU"JV*[%rDnȭU"JV*[%r(NS}rpGZn{[x>R{_~zz'WR_b޺nu৷qmuJȁ=㺕ZX<Y"̆}PBd%܄ehW&V@#v$DٯtŁ4 u̇%| )q-ώJogN]l؋ԌɌdK攎돳hw;x1=<̶<#O^CBYЋ~c`YЇ}Za8!Ɍ'ZR n [|[!H|}=e Lه!ni>0+:b:BZ7Z}aXO T&%<%\ u0)aiƊM,/ERL>u0{V 惏`6`bW03 fFb#[! L I-fA]Lppag9/ːi0_@ѺY#vh0"N)CI|cmpfݏ`݂ap7 Æ҆G032d+tж^?[W)wHHfI#9(d&$ìNK拝yd~%<%o019 f?k|qAA2?>ђaIɘ,1{[u\p:&L,DmqM>a ,z9tgw71c dܖvSxMO5=kz)^SxMO5=جf;Eű2_%]Ё6"ߌs~c>3/D[6EԤޕ30FzόVe1X﫧 o(k'/27y}&/?l.qb vA\,<2 [^X~T^\X03Rv0 ў/ ؘqh|0 <,S>M¿ ֗\>HLl f E br`ɼc, o404!>3ir&MΤə49&gLI3ir&MΤə49&gLI3ir&MλpwCo] )xBƋ0<= lN>a.87ԏiq /Ycx0ߋ?X 0{`S0,)h^?1o֋,1&C8u3Q1<%<42c̘OftAf,ȌuARj,ulX0kkw2@ s2׭U F3>֫T@#cǥ~ 8p1t1t/Bτ q,hb*J-3TŢXTbQU,犿0Tt'msa,ʎEٱ(;eǢXcQvt!2C!ͦrs9ʹ\KGW~/\ +;BnX +;BdQ a". 
"v+;t21PMHCb@tx- 3sqeş]&eLÜcR$ ;$.k&d+']v?\v?\v<\v<P?<vcj^"fI:OcsocXGz9"HK4HK4HK4HK4HK4HK4HK4HK4HK4<P&fLjjjjjjƕMVdA IVbg.N|ĊH.+"HPLmY-%t z3wp"r\Ep"r\\W.;+h0&!C1P A 0܍A!܂.A *(ʇ`q7WUt퀥xHx≱𐥁O.`30܄TͿn3z9')Vb5)Vb5)Vb5Xt[$jR&jR&jr[5Iyejt[&eiR&ei{uZg$IJؤMJؤMJؤMJؤMդM |4؈iTvᷕtI9Sf \DKR$IŃ`H Ar ) APٶQ^q1&#"6N̿~R@g,/{=ĤI8VRVR4S,`NʧûK:"O?bX)~ʪg>FO1Rk̡::fI:W.^\Y`EXEXEXEXEXEXEXEXEXEYXEYXEYXEYXEYXEYXEYXEYXufEYm~}aY/f]yrFk Ym(هE^YWv6e?2>h…Z0x,'!34m8p31"W,"+0Ewqՠ(+Ei[4FE Z4K qWbtĈ.1Kߒ=z @튻[ʅO<o^ >[2/0"O<]9adyp5 7:r{?Ȍzpv{4D>tӃ e>tF@0}8ɒkI 's~!_4 !GçQ"^!ß@~}O\=(w`͒|P\HcAd1qa.BcA2.y„ZywcHIx -bKmپØ`&k܂90vO-~x> r% wqU_o 聆]p6}қ?d0`ðexxK!7 [0[hI.<\ql6,KZYν$-'˼[FNjS߽.Sa=D:f×`V9%͔4sJ5OL?=ER,_[0ȳX⌯4r98٨„ZP j_baOBD*T%) E2Hα>Ve8@(TƤʘT*cReLI12&UƤʘT*cReLsRgJS1E{LS=öbKSe[&q88&9"-Z9>la3K.N88[ hHkŬ`Zqb2N,_ ? qw珳XҤУ</&ZҼЯST:ENѯS:ELru- %-X;%ml0$`M}88VHCʦ .X-S?St||.˧)|.˯$|[OS~|[,QÌc{ 8"GLьyK 4,)L[~$}Wz̸K_h&J2<[7dxLҶ!%'X×`IS:dߔ2=)g {𞃥؆)}2RhX1-_,i V/z 7>qs͎zh/P=SrHHI=lmw 6BG,r؄4q?߽oK`-e8{fz4b1N} )YL~Ľf \Bף.nI?@ R%p %\cZKZC( `iI["6ipBM}XdО.c8^fcoS)v]J-fKn-fKz|>Dח61;&ftlmMµI6Y&ڗe,\ψshXiݽ zY"$@$Ymx0;g[&#I+d̸8>~pnBYLkq7،;֍E7~f,80)B\Kelx |v\ 9m@As7i4y\> <8K,rlC0ZrjMp0.aI'G< >=qsGrW2-b 'WimaZ1-|yAu1Zނk`][XVֵuma][XֵuvuqKp]<imaŻ]Qo 7YRb[)iv!XfXu/ ,b)CRÙ>l9L)Kq9]2-,jOc`Q3hQmaT[FQmaT[FQmaT[FQmaT[FQmaT[FQmaT[}55 @ã>WH{Fe3nx֗ ~# p1t1Y7qϸg3n*֨b}&}&}&}&@&+?&=&+<2bof1eM<~Sg/Aѷ}[كpG٦Iy\'x2L&yԃqML\R쭕8pH8;hmiy{Jq`6ܣGGGG=E4}I|l zV_naHlCeaCewM5ٕ2Y0Y0ٕw;{]obؘX&&&Όeocb7e 䛒/vcLvcLvcLݘ%=0)!ݏsxZFWsadxGl⋀ s¹q&i1%n{5DP~6P\lfB=j Dh!͊(/H[$-sĹEqN qnlllllSS, _K·'[MuO"\a.#yI}]TwZvqu}q upQgy?SZeş&vcȅw.zܟǷ"Ơ]]\<\\Z\ЇGBBCCChBM !4!d=50-t;zK҄|BBCC(C2ueY !!!!>ЇBBC}!!>ЇBĉĒuXbmd*?|r ֆkɵprm86\ξL\H kɵprm8鑘tCL '׆=`︕k)k)k)k)k)]$it)k)k)Fnʚ,3֏S֌S֌S֌S|i]dv?e8e8L1n؈=6bÑͤڈ ʻnsC]BQ  'gۄF( FNȸ]t@g,ڞE۳h{mϢY=g,ڞE۳h{mϢY=g,ڞE۳h{mϢY=g,ڞE۳h{"MNCqh{mFBqeP'(n=Ċbŵ)J]۳a/,,,,,ۗ,c+xD4's)Y/Y/Y/IKKJ{r{{ɲzNv=RPJJɒy C)OKKOq`Rd}J aFQ>JG=lE]@0&RDJH )a"%6i (a%6iveOKH )a"%L>?gN`V0Rbا%l>-OKJ}ZVJRbا%̥0R\JK s)a.%̥0Rd.Gq$B>+,I!tn^YU$Td iiu3^ 
SHL#Dh/n_f{B4t``F%-ldM;\1X2gM%.tB>&"܄La3nfw3,E7~f,04`$R8`.lxWЃyFS~2B| noJIx o?r~VoËRKюY©Ԅ?&q1Əh dsdgb>K1 ?w?ES5%I' a8A~.0'gEeG [{Pol MgI%q;1EPS6R6![FŦ<qjJ8E]p8Ǝ1W,ߺė n`פ#ۍaIlt\Qh3%%enKaI_Fw_ RGbkC0+ދ,2f;A I2b?VRLz}aI'9ڭ(l%/h:CGYcdwA`@jXU [PBN#D -In6 $s}A9.]vfb,kV0! ,: SdLu\ډ f1ђ`Lu2A:Te6 `3A:Ϣ CGH=wo[>?xf'lV;j-ȢWC:}v+[sQN0V"f\^ᇖ܂x)e^Y3$_3~kA?~~6~Ha g0_8\";Ǐ]%88SrN 1ў?ύc2cgH8 YV:aCXV:aCXxHmJ3>==,s? !l3[*˴&0Lk´0tIP3as0Jk0AϐuoϐC<_l3{p:XgM3<̿%ÃvM \,zN/ V'=} 4مFDݢa^\?o[B_#^(@>hG Ex3q'Z$m}[iO{Y`!#XI5I&$њ$ZpVDkhMI5I&ּDb sMrI5ɹ&9$\ks˹:cj1zJMw q!` 0lxX6x\Btp~}X ?x w"K F!{6El_x>^Gy>+bz7!xOqm& \_.MC!3uH8X4q> -ˋ&~|k>7iHjJӉ{a``^`>lL?z|xf9>$`^r ˷K_KfyoĜrSյE?8Ca;kҞ"ɝӇ.E޻&wTn0;'2vT/0LH0۳wT j^>_4Ow'Hbt` 'nmf"[!>1t6y\eCg; rz\.sn1@'%`?#Dv:WQ7U`0$\$}߅̘%ko Y* -ilɾB&VHN:!*>b2n/]Hw(csBaloflM…21q츳Ɓ g5a^2t|\uq6<ƂXR~wkWnmLƅK%| ⶮl/2^k  [14[%t)-PX萞8h_[K09wUv:^E;Ո6zi׊Q@w{ȁgmr +P# Ϳ;*c8&u(zNԬOASPc856NA Yc8uѓ;z@c\g\Q^&p!،` o`Ծo^'O>8K,JgĒfzxc-X~{$Rov^ 1:pko{r%[F+6]N*}Iɷ&Z1v)-췋.!ubIꆝP>Aac"=zogIpIqzd2ϫ̩[}я}.el,xvEi"]X%_8wt$%\ꞒfM/a/8aX)c5*T0-6*e8n!s'D[_B$ĐR`R!J奡#2yfrcrs$?7Ebi#QpD@glR O &%* ^~1K|V&p ֽמܾW&nYn_ba_ qN %K]+#7x$+H9:V%[ ovo/=/vo/vo;-p313(o{z~إaS{#e` /\xx N⍪_{vl~C"#EK{ByJsI|rgJ(`b/س[Ug.L%xK|v7 8(v/{B90a:~%TSK&s\013Xf&&`b"&&`b"&&`b"&&`b"&&`b" :`b"N>_`"&&`b"&&`b"Nj t ņɩ US{63):1)g2>٧XN&&`b"&&`b"&&`D@]3Pb:!R/&&`b"&&`b"&&p̑r̅ ̱!u!!?2١&悉`b. b6 {fv gg4Jeڔ80&)`R-4LLSޱ?@mgr\@);;NT{seRAIIMH 'KMtItItItqIp\RvDAtItI }eIt/n:t"j#3.v~Y6xވG'L8ݥ(љ?dSS v2ɓ]|y5X/qT5$|K8d+ w.|ޅϻy>]?>2 A>] w.|ޅϻy>] w.|ޅϻy>] w.|w`cH] w.ޅۻp{n] w.|ޅϻy>@q.ޅûpu=ܻzJgK|2'pܻ+R6po½] v.ۅ{po{#\ /׸R5w{wp wp wpsմ!F ˽b{ wpL@G)ixi>?8 ,X.lFh) 9KGqfgy y!!9<BCs&}LPJ^&}LJNNNNNNJJ^<؝<؝<؝<؝<؝<؝<<<1y;y;I\=W lطBĭ#y>;y>=\Q EqhǤtS)SNmK,gn}L1cL]~po~GaC >eqJ-yw@'$Zl\1x$;? 
oo|kx ƀ+t>Gq vIIi&K8ycccz,'Zq)H8x K0''/f{nj?:_K]4k2bh})9rc})9rc})9BSp N1>ǔpx f;oe B)'SNqSpf]{]eXҔ)'Szø1LW%yw ؆)!RD9VR.o=qWcsbqp?cjX` 3Xb%[',,H -}WgœœūW5wOB7Jy@AB]r,XdEvYdEJYEJYŻw9r$OXcc :CYb]EYEYEYEYEYEYEYE0*Kg& # MĨ?-i}=`c:-4vt"x?0QLc(kflx{*{,"-%–P [BaK(l=B%#TBIK?T;xwPu#x`3ơŔuwF/Co+RdukKxm -%זze[r޽{y.1|זZSKxj O-%<ZSKxj O-%?lgߟbKaoxpjxOO2/xJx 淃C [s(آ=h/`Ô>.#}{nϾH Û,P<<%<$?&l%%]aЈ7E/l-R-MaIs,V%!ݽ}oKPlV`o_qsNHvwu3 s23 s2 0G'ok:ڳt\ Y>&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|amɷ&f|am9`s}\<.ρI ~asX0瀱>ɷ&fdA{R'I1a_Mpm_<~ğp/X`4ɵwPb| S o3\?~?>EՏ Œ&5 :h_}m=̞l9 |KK p9:Tm?<lm?'Ke>ui=;Ӥ~}~qRG}0GQ0 4BnV$_ؕ0Ł /L>0 /L>0…XB!H; :CL| =C U2ㅦj|aɇ$C 5L>0PC 5IŇ&j|aɇ&j|a9ɺ: 3L0: @PC = Ǝspa`9zx=~N?'^%@Tω|]s.qDm֯ w&qa?,cs!$ޘ5x0m8gsaxQ|Gg;Mҷo$\w6~%/F|G1RAgN _870x6{M^qSW/gNoÆ?Mb f7y4B-RVޅ b8XNRԼqjJ843MLS~3µ<=o؟a'o؟a'o؟a'o؟a'o؟a =/۟l'/۟l'/۟l'/۟l.6L^?y\\ 7Oް?WyO^?yO^?<&/П@ '/П@B~;sᝤk'տ0YycɜYpCk/jŕŕŕhs\8u6r w . ,X+dW`KN%;Kv, Xd'`^.:{<%=JbdcaGa=5E"=D'^ZMl({&helyl,,XYD.ѷK6 l,,XYd`.]850{K6 옐A۾p?ԶkF7tFѼDh7xH9 ?w4y7MMw4y7Mލ/.4C.M;wxo5 kEinf՝3vӌ4c7M3v Q g5anfin ^8h7-MvӢݴh7-M3vӌ4c7覚ܴR7M+uJݴR766M+uJݴR7ԍ,s4ܴR7M+u4ˁ+uJbnRX[-V+u?u7ďB  uzbn^X[-å@hnX|8]ⷕJݢ-V+uDÐsԎi ̌FD#agu"dt<,-Ksp'HXEŢbQn"{xrb9nX[,-hXe6X-h!6ц4ڐFhCmH i!6ц4ڐFhCmH i!6ц4ڐFh8 Gh4p4FѨόp4Fh8 G+m/h8 Gh8qy3Eh4p4Fۅ(lH i!6ц4ڐFh8 GhxBc(3 dr.uߓ}O=yU? 
^{O^=yޓ{Oý#pޓ{O= j.4 MB.4{7Q&^4ME{^4ME{D9hFD b ƏbBȣ4т&{&{&v]h&k.ķ5۰MlA[D#&k|3MUق 爃-hb ؂&ts#s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s*?s'XKsZ-zM( fgB%wkŇ|%>'[$Oø_<6 ?,LIGzx/aov0G V/.Ƒ5uK٤lnaɐdx`֥yxgIoG"x ,ɌK.5Evs.\ǻX$Kc?!pH8e`p"x//{XI'8>ᇥ^)i|[3})LX(~p!.lK|cB`9dK9~\C =!dT^:ds4z)XԤ~BavL'/* y y'2GHxJ4-W1["|KTg|+D@T/gS<:F9TC%8Թ JpiJJ$˦ol*ɦl*ɦ*ɛd)Id_iJ$oJ$oJ$oJK{~&.4xS76)b{O)+%ȸ'A oǛRxS os#iBsM{8%T JS)|*OS3S0 rZ%s|{Xt+spΕ¹R8W J\)+spΕ¹R8W J\)+spΕ¹R8W J\)+spΕ¹R8W J\)+sp%1Kv)8W :x2M~J\)+spn­RU JV)*[pn­RU JV)*[pn­RU JV)*[pn­RU JV)*[p?@ͭRU JV)*[pn­RU`a\ɷ .RSs%j864}R8W J\K.%Bd,S|@ 4@ 4@ 7&OT OET$OET$O#.D)<ȣ<ȣ<ȣ0:n5Q<Yo"*"*g.~ .lvUdW%쪄]vU®JU *aW%쪄]vU®|=l @%>vH,DdD)Y$ߋJU *aT%BLoQ}Xtl-Tx*aQ%,EUx]!x 6)߲MJU .uUºJXW *a]%r.X6VJXEY~Qa%l6VJX +ac%l6VJX +ac%l6VJX +ac%l6VJX +ac%l6VJX +ac%lzlp>?ڻ2S"hҁS B}q 㽵qwb@`m>.qH\>p/~kO\l(۷-<ēW|@z5a'ƃNN9L¯A-9Y`H ;pD /wDj3<$71..v7wfuw]O7^ݕHfr v^.xߞafmbLN999'8srrNdNg<)socr\>xHu{^_ 9~/Clo]ҠK`qD:kKCL'Sy2eM`dm2Ȗ^)Y;Dg6>Y`#O)|Z@O=>Kঃ ˉϡ<ąwi]g#%߮"&:}␼eŲL.Jɲ%qbmnaIx8g-I2,Sė\Rε}%xwSX1LǾZo8!Wr38ٞRx羙<?L?dJ)sW0ːXMaJ]S:g<}Z`~HۯI7D''T#LCMąp9gavmsoa!. 
\GQN},bo=smgM̵鞹6o]7ٮ͛lM(UH-Ru?m }@=CeH<ߺ,twpG"R[Nx1~-;mB=6"^sx & ɵErm\[$ɵErm\[$ɵr.es8SmhIn(Xds n8`v S0s"1H-s"1HfHUzo7fͿ@ %`?_ӽ "Df a"3qw@`e&+C,ٗ 2a_&˄}/e¾Lؗ 2a_&˄}/e¸L 2a\&(gWđ[HL 2a\&˄q0.f"L 21ɰNvpc^[&rDn-e¸Ld 2O&Ʉq0.e¸Ld'?&p!Li[a"3Ɍ-3e8VPpXpXp .bE,  "\ĂXA`삋8$d"EDȩKvpr 嵷2Ct8pK]0]&i\}Է^f1_""\Dp."ED""\Dp."ED""\Dp."ED""\Dp."EDsI0K!0Y< gA!"AK-ۓS f9{.R4dGK& dA,hmfY^ybg!Yybg!YL !25D,g%· .`9,$ 1BL,$ 1B̰3, 1B̰Ou|,d!&YIbd!*DVd!&YI"BV [!r+Dnȭ"BVIbȰ,$ g!,$ 1BLåWp6qy9$ 1avI|) "#Cdd !22DF3,D^[J Jrw%YI]I$wWr("RDBHHRhV JY);')"!e$E<윤Ps2S,)2RvERvERvBRvBRvBRj:s]^g7d%&!#|w;!C cJšTJšTJBU%dT^2JFUɨ:%`拑Totz`\\XfSotX .3.8 9~їwkz_?88=2.&E?]c4%ѰoL{}gό-[<3xf16161616161616]06*ttlb!=;g$sq6{">[PfoWůf/|i_yJ{d`_-4[|i-u`|i&0@LyA4( A6V66siK8S!1dJL>dzL2BQ4 F!(dLi2BQ4 F; ;]¤O̭lǪ0U;i'^ҤϿČLI>i' ԁI;iz'Mĵ8I:ņNSl褳K,b+}.n7M:67MqSlܤ3.6ŮMkSڤIlDkSڔY2S6ŮM)3>eƧ2Sf|ʌO)vm]2S6e6ŮMkSbצص)vm9) "]RKtI.)%EHsÌISv}ʧEï uEA,܂K[eLݩĹ($[I\<`O,߲dd%졄=PxacC0&QBKXE=ci\'W^z/@KG]<]eC0,ԓOߍjw T"JS|*OLŒWHJdR #)C%r_BKP=(%D~ȏQ"?JdFQf"`L#'Fr 4C0͐RB2<\)iSӒa֥%œuo) fZJ\1Ƃ?f{@xK]0˹دk~j&3oA=M|f;Ǥz_m=㡙.g2ӳbg&=obv}p{v8 ;'\!G`BuLAA0ՅѡɁ=\lEPGnh3쇒. qb9pXncOom,A(EY;q#Vʟ,C==dLz7q|8Pv|xKx`>POMTًىYAa5DX VCa5DX VCa5DX'.m6]a b[Ap \f[p^l2~IP&KHNF !Br"$!Br"$Ms6SW/}bu@? 
|\}w^{W=L E6_h{!~d*Z¼.R8.'ؐT,l \}87,i~KO~zV'ܤpiߒzqMjY?x`鞇C⤄˷ɱ<,i|P6 w gyj 3>}3_:O p,& I=UοF_[͋l"L6~ı ?wIL޼$emi)}Zˮ[ p~'6<_6K6K6F+VC lѳ9z6G=ΚFgMm^ybl=/-8P⚂Y6]/=, 7 w^2$ -Cr!I,qˀeLg\ؖ`/} @f >pL-SaT2L-SaT}!XHEeݫ>Զq؝$vg`$d"vgFB3mo۝c tk1;+)8q0~7>.<3_x{Ox3Ow8Ъwwx˙?Y?;6^>gO֨ ~h9>E"㻼#d i0B,g<ێq\-{Ůr\*Ů:[Ӓ%{a6?X~yߦ1Β/%-bK],glv\{x3X!yX[-^8\\\\\ xIx1\ӎncf<=c}BLE%OX ŠKґ:¶pY:xH8eIDT $#(<ƒv'Ʌ &wJ|1"&1Ĺġĉq!bB.McӇ`#D|3Ro >'G!CLLvi|ֱ{G&D"VBJX +!b%D"VBJX n "VBJX +!b%D"VBJX +K:2~2K:vKk1!b%DlQ-Aށ iIiA OZ -R8YH)"EZHi"-REH)"T ]`sһDZ(H) EQ" MyK:R3͑o٠)S^f0aS^,eʋL)/~&~O)S^.-S>eʋ2a0Kw!Ǟ@5h!Pէ {v}C~F'tȅBУ|zR81ӦMՋ{]p4B+\1mڕZ=A[@'zQx*J|BD]fE-!y#xG![ (;/B%%xw#>,d|w^O 9p=q=cMB} +dً{ L$)~Lc{;zx?Y}-O_?q[v$ e(;\|G9?ʑ?9?ʑ[u }&+_W&"/h:p+LB5t t::]NWAUN4&?Dqtuᦋ_,[|"t<ٝpM8D#[pߣ!q=}0$I5pU4IlxRa,X镊9|fOtf}j8"z!g"z!g7Lg2 ƓC];2iLOuwX{8K5Ol4IX<"φȳ!l<"Ƒg}2)ȹ7x6oaƯ/;eRi?TI~z?tGx!3>B||`;Kypvs0߲??Bo zY>{]KrY>=q+NNR<|2ldf؊~,2^$Oi~ahėZK^. &fe8Rqxyqg}m. ?:{̩ ׂOH63B Ŀ,EbM($a\,Ţqh\,Ţqh\,ŢqhQ,(XK,%bEĢXde"˒%)KP,,Y(Y8t =<y_txeZ^ZXvF{G`gI|1sڴsRmKL%f%)\p"x0].џ2) 0ov2ʐXbq,8XK%V+c8xKoJ}3%dfa"`/dÌ$ w['wv7 b(D>y@n ҍM:C7˻0XNy!fhM;/u^F뼌y*Y+Kc:/u^뛢0A?4E@o[~q:8ywn"NSvXmpoY#г}KK:8,9w|ۂ9  -B[ܬeL㌅%2@3ZM&O9`q$ߒ%e+KAIlQ[%EIlQ~sïbκ_MO7=.KamQN[ӦƾpN`<<3Y2K2[ep.}ŶEmQl[Ŷ0O-%'͆MOʇlmN,0Ӂܢ (M߉{j?Źrwmh}mkȁ%}_Xt .؝0^y`"~Bd,4jE}},N%]Ui=>)$R˷`ߵoM>\4񃏪nm ގ|q=7#>bO80zZDk.*x }'Iloe?LJѻd?rH1} {2+ .M_gƼْW_spqprodododododododo>C2]aͽ ەoé+ ™8K5Olm1sssssssS0ˏ}n8>uv~9VceqNBBB >- [_1K %ᔸӾp=r[w9O8'jNNN4VƊs7nL7/nBG^SQ/|Wй4w\sq1w\sU_/nR=<=Aad;ECTy*QB ^%-ϙ->- 1SJ=x6d`1%5;*x359uuuuud:.>K\.8P᣾&+Urb$;.qP!*;DeCTv<5M̷&P4×`C[AMl:ﻺX$Lw02󜒃s!*;xa6V}lϐﻺa#w𾫇٧}؜Ip QH웰.8ҫtv-w:KyG;{))^gV߮pHpV66/tJP4%#oz$%q q ?{4??W ?qx?ITITITII/^O*O*O*t̫wOѧ({EӧX4Ky=>)>tJF1xΧ[h͝u.j.j.B{4erA!rQ!rQ!OCusQ7usQ7usQ7us] XtsQ7usQ7R h-\Kp.%ZD h-\Kp.¥Jm*w=P%D}Y>Kd,Q%jDMS GJ3f%%:DXL/ x1_Xz@:DXf;@WJ^+{%zDWJ^+{%zDWJ^+{%zD.>WJ^+{%zDzmIÖAecc%zD=VJX+c%zD=Vcs;? 
D𑁆OWHl!Zث:!go\c/,@GBc/Dz@a IzO1g)ނmk .bVbifS[]?mK߂OΪ3~;>Dñwf/^KbWR{;>6oDVK;,Ů,O/^.iC1JbBK}INieO)eh̋9,8%m(ssA2{]ǸBl 64-gQPy%RIR BnP  #B<oy!xC-[<oy.[nݤrwCy7D wCݠm0j]>s[4|0>uq }avO{X|>roI;u+fy,S—gޏx4L {$<[CK]q߬%oyoyX}UK| 698VH;HT%ߒ8%qZ?sdBwV3"DOV=nmؾ1;ENS1;ENn1;yy΀"f./xY+1~,s;?%`sJ@lX "t"B)"t"B'f|sTuzA-dz~ 8QQ 4fZl6:fN]z ?c 0?M pdf N,<o23C@T=PFIAsP31B pf9-5\J)OI7:Eô圶ߌq,}/0&?ϼd:_ϩF=#9oߨ9)'6ĦؔrbSNlʉ}DGpQ=qیP"`eH1kSzlJM)=6FWz2F\7@Mwߑ #|G.6æp;rw{Gn6UMNSysn[&ޖ e"m0MwA里N`F(_1DL\"tE w1f,b, Pgy`0vP5ڷq6S?nY wRcb6Pccgqgqgqgqg?L?X̢PcSj qqq9_ yny~ Ŕ'rrŁh47/ 8˚[Yu?/D)O]\W9]C^89Rplp xv:s;n @![0ΩJ~lXJΩt jYUYUY~'צΉx@"X Lsbd! h30w|Q^uNѯ?YxM,"?ͯBX:ǂ0BXo:G:1d`#nnnnnO[IECh|y-t;~X(';.4mxKU 8   3>!6b/`],q8da[!S!S!S!S!S!S!SK^:XƳI9P.=3MM0(8T_|HLY:Xʐ)lyO]WF+^IwA/cvi^IN^)+Ex"RWJ^)+Ex"RWᱢ]!x %M.Eإ|~ϳ> a"R]Kv).EإawK[A%@\laIGʿ!S`L)K^%ypL)1E8"ScpL)1E8"ScpL)1E8"ScpL)1E8l[0㧴cpL)1E8"cp,19ZLcQ81±(±(±(Kcp,%D X"Kb@,%D 3˷.]I,?]IIV"J$Y$+d%DHIV"J$Y$+d%BxJ^($tT"J$Sd*L%D2H%L%D2Ht^K;C2H]0D2],ep T"J$Sd*L%D2HT"J$Sd*Jcad}h h:}.!ķx' .D6-Q.dyd$c!I nҰExa6}H+@}$p2] !&Db!Cy2dhJ(r?I ,|;%Βo.X=Heb?N /)c])[R>4ەb)gU> 2oa3bߌ.KBRx7A?;`Oi֖%,\{^^$6 |m 0bqnMa)gXJ99ѠQ4K)I$yyL<(p289XMb he턇~ s"՚a̼Z3?,qdR! _?e^K5O컔hmEkT,km(!j,fY A3UFpPyA0Y d<*pr  X@`Ԁa`,ߺwG,` <~⇨!*~⇨!*~⇨!*~.ֲI)'XCc3%MN#A/|!|*ʇ!|*ʇ!|И~#8ʇ!|*4fڰ~3%/A#aU>hX?qU>DQCTU>DQCTU>DQCTU>DQCTU>DQCTU>DQCTlU>!@_K >q"'tN:}RO*yjX3a"t9I{!ݬ6u`o@8mcsym2š(nƺaȌ o:!lupnBYLkInfw3,E7~f,0C`$Rȅ5<̆N倛DnBKwg)ނ@ŭ/N\l2s/'n3]gpqㇿ{ޏxK #dS|̀2.n0%La SX0%La y:D!O^S?E̙Ip0%"l+D$3 0)` 3 .N /K.f"H!e[0&a c(C02f0YΠ#c0)a c0)a c0)a c0)a c0)a cX X ⧓ɾe_R0U5J;; K̽%2b%qW#4GjЋ{+oa.1~KLL?.8.q<$/xXR2|mkyf;@?0K;F0˶d<6C,=mXf%"z^".N'Y%z]D.Y{] =bvz1H[޺yq6!1<}KDѷD-qKDn!6 !6ۦ`l|`3l[Eml7"ض-힨8--Bn"ض-m`"ض-m`"ض-m˚֖5ִ5~H\BnˮEm`[Em`[Em`[ؖVl"-Jq 2@m`[Em`[ei"-l۲Em[ei"p-l"-l"-l"-Z'jFB/ ӌ2(ӌ[-`K[K[KUKPKPw* єQQ|%Q4B :\}Ŧc+:a0Bng 3(MFX#ゑqȸ`d\02. F#ゑqȸ`d\02. F#ゑqȸ`d\02. 
F#qKϸ`d37;vm:ޜqCθ gz3*x F ALLLLLLLL8.W&qo fC 8 gs/&eޚɾ^}{/.d|8#&R&dDכE}oM7&zdgZ]0JҬ)B}('5'n` L 70&p'n`Lɚɚɚɚ g0 &dɄ?3`L֖L8 g0 &3p`L8 g0 ޜAAd`VN~E}q!ɡN:x|7c_=+ywLG @"W\.qKɕ.\g, OaIelp .E6"ɀgN7|u.ƶ|uOsp]ga&BKÃ'qۗ.e3y2\0Ӂ!:DXa"CuvIx$b=/3GdKx&b=4I&%|U&'`Qn8,".".".".".0Zh,`u69w%Kkf.\ eɒMFhs+qS끅tfןw6 7B o^/NQbo72nQqqX879񛲢~>GݒN۾\R!e;tbKdz1mH^cRd&'ԥ( kO ]9x.N3sߩ8‡!tˆ6l“RxR OJI)<)''Շ] 8S gJL))g\JO))?p|ˌf;SK8&8&8&8&8&8&8&pNR8Y 'Kd),8&8&pVժ7 @ a;w*dH[I[Zgq\o.nW sչ\ܹ*:wEW8[\*z=^E&Vq&V'VIƅs6U*nbf|aU*nb7nbuYlfa,`pսcuo29z9z9z9zqH׋tH4.9 r2=K}l:jµj̼?+fQ(<2/n^2 ]+ Q˫Xu.96^.K/ _.I30<ɍKV^k(wtǵJn ׭i<"BS_ᇯ&p1?83g}zWfR?wuሼg?=~@l W?b5x!K` ,a%L 0ze+I`0fWo.N#.쯄WJ_ +r(a%.p:*! 4d%’mXXKXa +,aVX KXa +,a%VX KXa +,a%VX KXa +,³#'ۚm.T6B{x'0&O]ȒJqz7&j;ʩf\IiMƒ#Y7tR?pJx1 ^1|˷1'd{s2xYQM81k|- VB.*~`˵;xI:xqRй$oa֬3.N@Ew>O̷\fڽaֽUn,geh0ۤU2Z==xmd\,63N]0$dVfśulqIxHx ,O]0آwG qu R~+ 'J06 N}_7"L䵘n^ uLsB90{!>kgƸN|s a;^m'#|α2B u 0xz]BT.tB ^Wp.06C3\-b3,60: [pj`L 8XcA3HbI $1$f ,1VxL1'2n&{!]A>m g߁5 1<_֞ٱw mܵ&m`C@imQZ)-b>/b6. Jk ߢSZ 7lk)'b8l+KlQH)!%X~JXM_cR~28E ųElQ<[ųElQ<[ųElQ<[ųE-yM(-Jhڢ(-g?Ń4P<[1^97< C-[pJAQ5*b1*be\7*>bhx?F~2*w?5D!cxkF֌M!Min)8IibIDh e{v0&D;.,S `VMLWMLĽ=q@욈]k"vMĮ5&bD/SKd9;b. s`DNTgOM .3a&"D0f"LD3a&"D0oνQѠ'Ftu:|["%-dxo&q>? ` .M̞ bw/ #HJxKlR︘x|]tXp|WJoK[nZ2}Yo4 qwY=rY=rY=rY=r![X.híI.W8Q)x]ǥlFqBgT,`Ng)iJBm q"RBR2#/Rb:IOm*]6w.b\K_llllllYX\XU scK"h4;Z[jԋWɺrFX͗!xMS^K ta^0 E!΀ 腋{1%vT :}Q|p!C }oUiI5>F_}NFIE J⡟By/`oH9AW_[0 4*FШBRhT JQ)4*FШBRhT JQ)4*FШBRhTW{Xa D6:w(/U8B^USc EjUV}/hVqs_+ny[^-WqIrUzwxg2QWWQjQWWWJPJP6.Ԡ(%qʥCq'pqk%;m%+M%;m%;m%H%i%i%i%q%q%q%888((/5?<<I}V!q4~1R:R%&v )0%dAdmC§%8ך go tl}Ol%#=|Kw=̷ B̞W\ 7)++&%K9%^q%|8x)OGJH )1tq*8 )\+Y)Y)%7%k%7%7%k%\dk,ߔ,ߔl,ߔl,ߔ,ߔll RVVVVrkpQ5JF (%e%/Yd_8-7a) RXŅU<, }փ~W<ЧD@F&~gDwcŃ h&n u3B|]Sqd5 !у#!>֏tB oI7|,&SX 7 ,хdOQoAtwr}߾ qߊs裷" % &釳xJx o>e{Ûx8퇋'M ߆&zl%d^Ï=ps6we擫So`8~;ſl]~X2I6)d4?<$| 6i0uެ[?aI·1\+|@ LJKURbWBRd{?6-qX-4Ü[13},mL>˷XR?vX2WT1DAه`[50ǤS|j)8ߪه9C2C[50Թ^y[}%̶!y L{ |c.~fi ;rפWk:@UUTu<1pՅtAc..6YPvێ^ ;[ {ك=] kJ4ϯIoiMK%bx[6i(^֫>߷kppH [z'xՇDXi+%s?4s1j!qj!q-#F? 
yi2D uqnA !s|8%d>O|ߺ4\wN{hgs||v`̓n.yx=y~"!~"!~"!~"!~"cuֿxAH<}ο:e7_!&ª6>;&Ё FtR"'<2$϶Ԋm@߆˭m&Jf˅L:df1z%:T'`O/F؄Ƹθ4Gy+mb31_ɀQ{h)nHMh)mv8~ uƀ3.glb⚢(G>ۉ]pȷuߏk8K)6bdSN)?\lg֏:XM}2[`,)f%/t\ )\2J?돘2|PۃhS,螔{gl1F]'9>lvsHv=$[ockKn:%.pN-rI bi[0AÜ&doKa%mn"bLذ d.''OuԐT15?qξ`].q[2!k s5[^` ly-/0~Qvj)׶޹|)7[Ë|x]><B' 9#4^Ë|x]>]"^Ë|xˇĒm͚uYrGX%/ťnI.a^$|{)&4unlK`b=%= "a^B, -xA=<%R[>\{N΀XlK$^:9I[CK;o8-vl1\[I > IyD}Yl~;ڿ*~1=9Rp)?5HYt$fIKi%CS nPނMK:A6 Iig˰_K·$WlYc, x}{Y &cSR^B| s3g13 K 'X56R.)Dx`3YraqX3`Fqo  XB$0f3` Xb,1K̀;>|a,1 KL%&`I$Xb,1 KL%&`I$X$˪K̃%`y;8Rv&Ϙd w3͸Ylfa,3gjGB w[7 Eqhƒ f<.?o???J`Vӏ]~l=l_ߊr`{>K0](C/6y~)ḇJґacS< ,"\t˰CN} ^7ZYV#fnjEʖS,ڨ~L&)-*'x, &^&b%Lix$f}MށK%e+ҏ.zGL/fǡ=Ouħ"n}hnnnnk}sbc+4e\ăģģċK|b4b4ag8IiяFFF]ލ7|qrI3ÙڟI~NPXNd###########~Xvv3 Byl_ Xߣ'GLLk4^RSl'O?%ċT^>_&i:g|iOӞ.=]N{t9]x?8.ᢍ)/_lM{9=·]‹%ߒrhߨ] ww.݅pwwypDžpwqw_$ ww.݅pt]8 0] v.5KAGrDž]w] u.օFC7  ZZAZܷ 2e5+pwy%!*@p4. {{.4]ZCf:a ьL,N]q &L`l#xGpEP#tBd@o Ć2L#$xGH!;B;Bw B 4O4O= ? y{=wp;HU{=L?+?v2wCvCvCvCvCvCvCh !C G-=ܐ@qH|nBxc<ޘ#?pxsxpㄈ37Ҽτǒz<.߱?1?ʍ>v<2LM2BBX !c!d,|qx$8l{<D=B`B#B`BLGCM !6!&!$<Y9BNo"9 9r9d/`B0BF!!#`B0ffϕ&{8G__[R׽ؚMn&cyIG4ZZr--\KK%ҒkiɵZZr--\KK%ҒkiɵFpr78$I.%В hor7If$IҖw}w%fV͒fU$mIrwj a,j\5KrYr,IP]:!v "܄,d_k.gA{XI{O-S)N'w;1l!6թ|hCI9{ M N3{?+xX)AܫS)S)S)S)S)S)S ^gc,IʢUʢUG۷~?M]*~OX'0ƊUʊU a;;ߪV)))Z))))oD*HF["j@Hp)e9e9e9e9e9e9e9e9e9e9e9e9e9e9e90*O FfNNʪM 1JYMYIYMYM)M!OVj)%/7_^RvJSvJoX1tScYͫ}_ԕ(0 ." *" *" *" *" *$(&C 5߬- Sa2.?hLל0YSs?C]B ne k zvzS;+v &db= @f6:8 .̆,axѾݞF>N\,:vSjUtq=%@#u٢5ܸoZpUrjdߴdߴdߴdCE)XK֛o0ddɕ?Sgp9Cѭ|KxolM3sOU9RMA?? _O!O!&OSSxgk~W붾WIQ3/ _{|GWwhԡ/;O1M>.j7o V=M.pG 0xxuCzG-jP @XB~G||{Wš7o 'N)|S8pM7o '^8yK9sgPW'SMQ FB>Zگ]]')VJXTaQ'GWW/hxeW%W6(W]Ly?{x)U'z2c7ŴoiӾ)}SLM1b7ŴoiӾ)}SLMTWگ6Q_mjD~jU&MTŴS:۬~HZRB\/QsekVگs_=jz~*aZE)'ʭg[O#5<ڠulu]=Qw3 Z掩n|jqfICcJkҧCګ?F9SzTG֚vh9H<Ҧ>5SuSu5Վz^*?ЮkZkҧު@#sZ5וOsEi'> Qj\FksMtHksMwuG _ǝy]뼮u^:k׵ZuhYX"[:k׵;j6ܲ-ǸEJ‘7Jn+^ gZu-];Zr6_ԠZu-y]˼^N#2. C `C$ZLuTMu_-y]˼e^/OSKV+_׊'J};\uj׵תZʺkj׵'JfN-y] :Eo:n撕w2OQ盢7EoíQyCwl^xKYwdhi(խIwcO.VGFr:Wo1_NkuٱZ}u.mZyW/qח=uқtGLYzHcv4:q. 
O8WNo\J|yRV󞸔5Ţ|;( |CB<~]ypinyJF{bjF|%u!7 !3Fݑsm]㧋a5ֱbia5J-E/] c%">2q5>af6u_r+:S1/|՚ }䪊w*8d^ɦniL^)9#)^9+.N, +MRIvT_&g,zO ?a'Lɟ0 qk\g~vç'N]ç7s?[| _x4X`gj^qs1oUOC= />=tޮP4Okl̿bW_ctF{qm;Q@ڒ_|UrYV&|ա=<6>R_2ɐ1sNsMճM}&i yZsG~&iQfτݠ;87.jMMM0M0e?^!7a^gUP VA*[Q;^;W7藰k k5+gGZk*k k V}̏F(ߕj`SyJU/f՗եX®Zz?誵6,Ȼ?\8 CZȳq1tu{PnܣFGGxOcT]Rݫ^r\c d-Ϗ<ݐB?l8V[_wOS/QZ _}ݑ(g ݐn8vaӨ.?ݑUbho=ݡu7r2-8_}<]5*tOw4D~i!'jA- 51 ,r&^~Zm/\\\\˥׹y 2 s2 s2 s2 s2 s2 s2 s2 s1Ww]\\\\\\\\\K>{peĚl-[]]1-0-0-0-nn xBfgJbMsx_:|iz1L\Uz*~y먯WTL˰UWV-9וU|uH_wBvZmQcVºvE]튺u+j`j+ꑳ$\ W$+kю#c|YFXv}WC#5<ڠul>i|z!#}]Wy T ZW6^uuՎ`ӽڞ6&|Tx5wꇆA3/<=Oy ]s!}#]]]h&z4:Aniq>`x!*@:`.4⽮"^8Y^H`WwҷFw>t|Zu$^=kp{UWdWQǞ^Hs\⽮O{JW{6˿8W@Ӻ7/jW~Y{| V?PiG]wo]xܧӺ:^=uzSǧ75 4;@4;@4;@4;@ոa&@sl$hQ9E6 /G((DC$9.I'3Dq)ZqKE+V+Zqտ+DqWGN:[EMK9jSV]uʛ;?*+KTCnM#Ϡ6h[.\td3P>jдt ``p~]*&ZwRS[룕Wئԇ*V{O{68oy[_{Zֻ_r5kK#F9;kTK?^TNL˥*_[dY]νqxo KZ?vJJ:dL+5;ȫ~Rjˎ*Ȧj|jEjfX!͑W{ey~JO?=W vHti𮶟utiJ Z+2 [;v.l<<\tWo;o"sakΫq^( gfR^v=Q3~ ڑG8ҫba+Ăwa肿vI14hz졡H\(wO!h(Qfc υ ף7LWm(uV5jW거1hеj,8C}zZ]vhBu6v/Uͳˎy1/53-R^vˎˎx5^2LkKWVj~Vշ~7Z=rI2D,{e坭=rtIeT+ZԽR':,Cd_$:xC$qNUajk:)SLU}갩üJO_)%4JI )u ~7eTޚtGAm:::Ҿ@Ǿ@Ǿ@Ǿ@Ǿ@Ǿ@Ǿ@Ǿ@3M R#0W?OZs?gX [tB!tB!tB!tB^>^>suӽ$P9 9 cLy9Zy9 њnX6q-mOU/V[: : : : q~6ꕖ6e}{w+|]uGYW6,g 򙅖>rAΒ:pGDMmBz(vin- Aƞvi @ 4-^Ўs7>`",޴;:O;:kae|cr47̚2 &7`L~3tFM z І<:&+Rk5m ȵх дvZJTRk7kjcP:]v{ZrMBth@th@th@t1 |dZrB`JA9_ktdޓ]4EEtu  4&mB 4 itz>tU1$d뚿MuZ:cֱNXuӎVi_3{1wku]ǺhG~o8u].kױXuGirkW_{sj_||zT;@jǤ1vLjǤ1vLjǤztZ Z/1Ї+~_ZDŽv4&W|w-}BZ1quL\W5r5qƙSy#U1j5q M\C54q M\ 1hS6ĥchRC16blkhRpC6[f5[ [P#P9,/~Ck:t7^EZd`6MƛM-} >]ݣe`fof(3gWo^ Yc`5fYc`ov(NkkFf7FV}r ߶QϺ;;_T?aܥ<շEW+dk2i>b5Mi0yLi0yLi0yLi0yL!Z1ՊMR':,#_$:KZ&tS+P^)LyM0bSMu3˗kCr<|^r&tتSAޫ9<*mo#Ϡ6hщ ;gKt7h^]]pҎcύ#C5:<(ݡUӾkT*К.WCg(G{ƙf{!li[ӟ0m㣨@!]w]W<yƋYKcf|]5*дܑi!4]6L: CÜp]sd}P5/mKzr=WBkˠU\\nvh2 v3ϙD>3PG<2!}p}OY2˳^D22uz1vr0duŋaaa^v)' #0b #0b #ň,0WQb(b(b^LgMR}q2xoQrT|͚#L[2!W͑kWȑҫ9R&|*"G<ڠul9&ߛ#f31m*D|1ޒsKo7VڕKtݦ޼<Z{o:2LzFn=?x~2X5zAH%8zy85hͶnh-'n UYV鮺`M;HOXU۹e] &k>(P&Fzքcb&k…9ᰘx&3Xg& w`M{JX&9w).O"[ޮ-o׾ޮ0Zrju9N]+që߃'?cul6ʱPBVV}:nzj9і4nC~/GنlQ(pm86e GنlQZ 
AiJC*B|S!> iOC|ru\]!T jU>uUG82đ pd#G82 <5 8oX3 f5p3GAJ_(Q獺mխo;V;꟎vw3 8NOO@:ÿ:[yMtajw|nQ?ԣCwX-jV Z`X-j\v,g\vU~2Yg<][]ps-QOG=ǺS..yi (G8 nt8 p( Qep( Qep( Q@\v Q.\v Qep\v < Np'8 Np'8 NpFxnߔzp#>9jgڷNL{_zRpjv"[~8Wzzfin˺=SpL1Ws8pq+1333_ΕVƐ%z-Y!MZ&nu]o?mӑP#P9MW[Y"uAL-OG3 ױ[moϨ>\mkRKQC׉v<4܊UMBx5oEONf.?];OhrGۧk:vvi *ӺiW:o~2λ4E-G9c]fJ!Җ>+m]y plfWO> ZuN{xޑnu=<:m՟ z?7bZíli^HW[$m_U6ˀz'egR?Yi#zVNyJljȜ_veԉMLLLLLLLLLLLLLLLLջa0knz7Zf V[¬ a0kBz!ZWn^V/Yʭ[`-0kz Zf+VoY[`rkz ZeV{Y^KJKJ%/ku%A[PVQ*<ڠuluMIe7XWUzZWW7X,|{P򝧧;t]zܪC ZgՉ8o_5ǶtEX4E{NwaF5h-_$Z9]=pV3yTeZVMhH~v{NOwϢ0{NO|jЪO96=ȃk6=6MuLJ˨dgSq7J5UľgWVa}7JPη ! _r–RRu iG*Ϋ>r)N1̲>rd7I׺iӔtAtAtAtAtAtAtAtAtAtAtAtAD0S5k,w+ҭ)htk"ZHk okۚ#^Y80SQy+HoM#Ϡ6hk jq=}Z㤢= ٵ[jDZ^7EPnzm:衃:kZ74w]zPzG6QG gkZ2ԭhwmpf +OWaS3?A._nM#Ϡ6ho\ps[(G[(Wkx[(Okjkqy8z?ݡM M(GhBZ=4Fǧ'ʁgbT:zxvҭA>YWqcuj2fi;le,3!1&+b"&+b"&+8f"&+b"ci&+b"&+b"&+b2&a2&a2&a2&a2&a2&a2V1dELVdELVdELVD1dELVdELVdELVdELV*ՕU2(&b2(&b2(&b2(&b2(VJ٦K/T3a[7&dELVdELVdELӺdELVdEL1Y1X1X1X;W&dGu1Xu1Xu1Xu1XӶFtG^wgmzZ,d0@ P*uUE 6ǰgsخi1uʶ1W#1?t\߲3=;K[饻a1ؙvfڐ1eh ͔24SfL)C3eh ͔24SfL)C3eh ͔24SfL)C3eh ͔24SFmLS)C3eh ͔9+uILٜ)3eslΔQ0+RUPa0e~ϔ2?SgLUr*Ugx(3e]˔uQ F.S^)2e]˔u4hL)C3e]˄u.eºLX 2a]6+?O#ZhMX 4a&,Є@Vg>K)K3ai&,6 K3ai.gkOpB(Yfʣ }ufo4x wyC{^ҳg) d.V˽R9vV=r g!Ϊ`G~rԌ?p,/rU܊| {{oJV {dW%!K?K? 
{ed= Yz$dIғUra:[.`Wf:)l&rR_zбCMmDcUU;rԨ5p~78 {e+u6= YͿ[3 Z6G;ruBӒո {MQ?sm8WzjQ|i< iqt/E6 l_}Zuhc-J% 3 c_}Z4FiiMytHOnc,<Yx dAEkeZYVkeZYVkeZYVke2!EkZdEkzZdtEkzZY]Ya^7;r,\+ µpYµp,\+ ޵p,\]ǵp-'\ rµp=w-'\ rµAϵp-'\,rrUU?\KAR0S0S0S0s\KALAk'NѽѽcXo8G0XXWr`9`9 crƼ c}X\u>>0X5s0Xc=HUgFvyU2L11X8eo[[Vv43h75Ry ׺v8eo?t\sx^1Zsv]k+elזڲ][kvmٮ-۵elזڲ][kvmٮ-۵elזڲ][kvmٮ-۵elזڲ][kvmٮ-۵jڲ][kvmٮ-ؖ2c[flˌm-3eƶؖ2c[^-eѶ,ږE۲h[mˢmY-eѶ,ږE۲h[fLLLQl+TRh*Sh*Sh*Sh*myX4X u㧋,a6 ֆ0XkAe)6M&a6 ܆0pnm4εQfth0pnm a6slÐmW} *@=eUct2ݐ ͆x3N!θ g܆pm[)^eRYB"d)B"d)B"d)B"d)B"d)B"d)B"d)BC"d)B"d))2!2!C2!C2!C2!C2!C2!C2!C2!C2GT^ݷ2!C2!C2!C2!C2!C2 c!{-?]C7s~`ՕnS麕r`~7'gzɁ99<\yz]՘WzMsr`NɁ99OvԆ{ՌcqC4'A{33F#H5Vpδh/C_7BA:6C^*5:w?+m|e\`WaOmuZK]CLr*oU=W0:gTOA" 7o轮A/c\ݫaU?zLӞqȯwcgѪknEih䃪9V%W=sLMwF9 PK͕>:vNyByތ4樛n9څgۑ:2ju=ݑg ݐ|z!]u@9QiL (c(}[vخW+plZ8і2]1yH_HWQ0GOճjQ{Km/aT;jtF:ڻ5a; i!0ÐvCaH; i!0ÐvCaH; i!0ÐvC:Ґ2 (92Q;h22CFsh!9ѴOʛ_:SnCrh>WOX]z0t*^iAEc?68v譃o_4W| cw c=cNV|X|`i>4}+foµdejEt1|] }.b.t1@XxjEKXPu ,c:1@QG<#%?E?] }꤈cZc##._j{QE PE PE Px SaIVaEVa5*gOXO &0A "Lq^jNTZn VrŲڛkokEgclz۳ޑ/e\ `\ `\ հ<6, F`D FԞ2sըxzAzba=XXO,'y4O'W_.~Zc8\|GcS=kY=߲Vڞ6땡^8yCW7!248ІBO]yt O9樛n4XO,'K㉥xbi<4XOl  XO,'cb96Ε2[KyA2[q n-ew}Ub{e00AOv~a*t/$LtItsn a6 ֆ`UG_BZL/a6 ֆ0Xk`m S+}~>Ujճ&cߘ7&I~cߘ7&I~cߘ7&I~cNpޥe9gN&堽5oLľ17WЮcq/ϟ6hչ&ycrޘ7& ycBޘ7& ycBޘ7& ycBޘ7& ycBޘ7&y8j^92-bT|#'jrcdbM7JehuڜZ^ IǴOMQմ s3"CyfFE圊x3f*͌<ٌ w]:"ЧfSQu&LDՙϡMDy4TАicƐB:_#DT:3V =g"bDĜẺt!~ջAwg[AS#kWY{Awz*TIvuWF^ u4M F~#,#,ޚqr+==Oy 皨\HHgug> mqh-n)UeO/^񜽫#逅?tk88t!gh-L$47 _?_s—7TW{4nc:6ua^ @ pO`x dUO_4.!6խ:W8mQS E իԵ-UիWmmW{^^ Z`-kX z`meLU1VD[d|(+`mVEB[g]J~#LX+÷ȡ5[Z_/G[#&VYꛑ Zw2D+wP?d`#?;$'d"VAk,%vJUgUG­n  FpWw5F,>UNӖm.{G,%AvWRڽ֡'C^CI|5:(X? 
wti4 yS]5MO(`6|Oļqļf)̻28%]=;4ʟ(s8tmtF펶Iog;ҷiy4!#}jTND1M,i ^Z}c'">%"^mjK"՘u DħeƤA]uF&1a[ W2EyO#?&|۸lgQjfB^=w3ުx vwTIy7z*ٮޭVltM@tM@tM@S,!&{ZVZ.k"&k"&k{z)\ޑCF.y5]ѵ C}e59Ԣel֋vj\D@dMDDd D@d D@d D@d DyCz:  ڣ{}A7y6PX(Lv]:QX*!144[O:ߡ:00__H uC:¹εPOG=to{s~Љ р р р рMOڞ]p~-/hhhBqXG=5?IӺi]TGG~4G~4G~4G~4mx?=tAwh;Т-Т-Т-Т-Т-Т-Т-Т-Т-Т-Т-Т/VF[ oU4Uζkt#߭O}֍|73m킌.JBső*‰.W"߭.WQXtdE#2 eXm]`]^r4 Ygw'VCk5z1*ê} >^^5Q:W&ݑgPhO"]c&K:K6>=|eN^;o{ݡw\Fց:Ԗgês+06-w>=;憎~KnW:Uu:|U_Uc;;S|T'8cXU`WU+vb;\U+bb+vb+vb+v:XUWUUzkN`WO-&`&`&`&`&`&`‘LWv\qU`ǓmJ:wEp!!!ᐳi4D&x<;r11ᐳi p4ĄCL8ĄCL8lr6  9Sϩ1DwH7tCH7tH7tH7tH7tH7vx8j*՛B@@@@@@;5ޚu۪[GPVJ88n7؀lƇtU6BWo^},w7:PlkkkkkkkkNx<8Ϭ$Z <gtkz:u1}f>    7 0hM6A4A4A42p `Ѐl0he6A.     p `Ѐl0h\f.@@@@@@@@@@KɨN:*X l,:Z7d|uWcBEdWDǀ8@yf֥˄8&1! qLcBDŽ8&1! qLcBXB^l5u k XcG`6?gl\l@Ā&p꣱WoEӠBD~G8qDte;dا5F00 ``{,:*txa5u}[YW[G5\5Vi5Vi5Vi5Vi5Vi5֫qı Mzz2Gotl8w95S{&4r̦K:&QjJ>zU >c]C̠ ̠ ̠ ̠ ̠ ̠ ̠ ̠ οI_7Ǥj:槧#G Y/CCX<6fS,Yǣ'{J*5tgOٻd9̝gW{toK EzN`Rq/Qu_*ՕZV5۪2>+utUV]?̻ st{&3GvZ[Pg['%eW+Ug뿼+K: .Ye:ijI9{GzNۼV갇RUco绪W׻xl^˼&M|=ggG]R.3 Hmc-Mلi‡47:Zn6GM~&=G*|?s~_vƞ='z7IϮc:`7hTbMwjbēZkt:16Mε@!?m'ln:X yBZt1,~u+VzFZmA, Cs?U_ΧlV2}JiKv~/u|"u ەZuy%# ;u:,^AMWLhzD1:,Ԗ r4ɩKRsEJW\^zBtNw 85pkx* q]?7<+#>Boa73Q7kP^OSUW|'B~GN6:҇<ã^c56+8B1ڈp`O͡/o_TC?fR(û^@u}qV{˕o^Օg5Ow4Vyu d<tG7Op;T;^p%Ud14ɩKywePτoDo-> >ۏϬz뇞[c? g|g|g|"C-c8cO7@S U Iթ 6mm%6"{lDWz#P!TX9V/5`AĤq@N,@ @W{zCFkGc| nJ(SnJ_(gVF=a[:1 7`pzcc鍍W {Z筷4$tGmȯyJ4A4A-6hp7hp7hp7hp7hp7hp7hp7hp7hp7hp7hp7hp7hp76c^}B;F9ǎtcoȏ-2 2 2 2 WƹO|g nP~t16ota\%w/ 0.1 60.2p« |:S%Dz_QhvN5+υx.s!z WOUԻjs /ΒyL.R[po^o:*h+@[ V````<^C+HIJ PR!F|ؓ/ `O0Fi8&1 pLcJV~x%+pGwq,FEqI)bS8'BZ8v}[vU1D! ''_Cwql l fʀ 5nAS6>`6>BYW_Jy7Cql]v *gev=`2z]]x2^ffdz^qlev:`v:`v:`v:`v:`v:`v:`v:`v:`v:dË^d"ɞI}_?'WtY_`Fݟzn*X_&$GE;rT e6So < .W/TwJLW~=mOwh]ף?R~C9LPBz?:t\|WOcLT9O?GҖ塡ʳo/itAQ5MQyZ,=*OhC Vo XG/!(Wc*K'  57W6 B:QaHHHטw\t<>wC2vXO/qLiџLz8wg:NnHj\Jo-]={-[Luf4i]ӍkLut2 Zg`<'S=%SeV^q[^M ry[U+jEWMHՄTMHՄTMH\HMHՄTRxPF_4ԕ:GP_> > > 4yƹUȟ> i!c? 
䈹zAzUZ_x5Jfi ,M ><*h`h`h`hz:tu L Lr.G>yZC ⋦'9O\ Z(F /-GqV@4@4@j )(=,O/b,ܟayz ݐrС :tߨ !džw&2 FɫD:@t_+*O0*W-.uso0 F7#`xo0 F7#`x ^QzF=VP`gQdS^aS2]Fף+Mm;{qd+ /9e 3J/]Y-R1R{e>mZC+ҿh'OʿU~~uOs?Ҷy /ƮcDtJ{Ǥ;a[z6饱3\u=8:f<ݠ; [5?f\qi CW'f\m"0iI~K3F=.ٙ>Y?ƟAkh/QֻF?Y?utu m~]Ց/ƭt]@6 ]/]>4w_ ~l}G. ޥA*qszMSUՇ jL)QO4͔_DzyM%iB L]Ӗco|Tt@ETt@ETt@ETt@ETt@ETt@ET ddMȚ^5ʏkQx1 /F(7fI02*թJb//RN! L\^!/^ٲ;+w^s.GxwWQ  pV~ 8 _EnSzFqz:~JSlvUHO ^:g~g|3ys~l5~Ɵ_wXj:xZè/'€a0@<N'€a0x<yLyz㯢I>y('R' 8_zjn Puu'x<(W&a1u_( 5*Ĝa~,6h36h36h36h36h36h36Wwh1Oh W0gs0gs0gs0gs0gs<~.oΌ3F&™rtӚ=`^/ XIG5ͼXaؘ_Z~5˭0_& 1a] i_!&1p`CLbFM7mtLA&giԤ\. #JZ`a X؃:l۬LkꀭtI}~;r7~WIΒr:$wnڡy@2n0n0nW? ?HXXe/g8wì=C&t9QB4ҦtB2 gt52*cHڠ±z@((((((((̴l,[0ZzX=3BQ_ZܼTR]ĕe n\<}f c{ڪ5ey_p#n:TB:lաLjR_}c>(gO %Pb%g*/I I_~umnj8/&zT,gf[NqVi*0Qś5Q&DuFJMZWo@LH #&&7l}xSKu&0&dlM` L6&0ل&sJ٩)76zL_}_ ]EҾ֑Jl*|/Ӌv䮼T$\2J*7/t7]+wpUDzINlC#5t jƱD2s<:'ȧn5`G|sGݺ5巎t>QtG96<=T@ ?dJtM;tOJ3q6yzc:hsOP_i6pϏdKgJotk⚦Gz&><蟥еH|Z:]sPy27 Z:ȏ e79~jWվGTʖw2{zo[*?Y6#]YgL3C>3C>x9r2skגdjT_euW#YGU=wT3WV4w횥J]Jݕǯ,flneꨎiUPƬkZ\6ƵDޕukyOOo#!}!}+}WUNͫDtխ8V}ip5됧]|5 粺e|HO/!=<%\Ww\{o]~y~huzu+= y&tò v7xplH2i%{\bvgo n:vh4ʎgqe۳Jie۳[w=:o~PM^qe۳ylC>/ޞ}rBn7U&nclcwwhwZAO˸2{u̴e{.eһLz׎ϔ>~,D_ .JnJ_ 9wɪC ~?;rI2DuM;[a&%SNmN*׫S-jqiT Qe+d"G]%IφkÆ6T)LyM0b6.9JB)8_+\2JR~A\>~N_0+{o&UxLjSμޚAoiP( s(VquI=҃O:˭z55I٘[?,~Awh[SOsBd@Kkb.dZgC>tC>tC>tC> ٮ @4Mťqm :0:0jὁKhpu„L5mKOKv?2 ?uTBF_jWC>tC>tC>tC>tC>tC>tHw~Iuxo`}vET9jʷrK)C!wx]<.!wx]<.rxs}UUp.gI*=ԇ޶2m.ڐGYVzJDSx)zU.wj}ǔ\ bXx[Jwzb`؃+UL@1P b4* y4ɶmڐG5dϚ?VzJFz-o @cn [7n [t c@:1t cY|3o桚T +UJ΋ST3+G;ZNRvj);oS&wl9vN=SکeԲvjY; yV9FHuĝZN,f'YYЉ?g~OW%h zֱ8XN,N_ z Ϲ?CVm/Bfo"$'&ɉIrb$'&ɉIrjKCz=((&fyf"V[4/@5M8&j,9 jfY&febfY&febfY&f?pp#CاF55Go^ f(SEUI~} ty⯃U<*j}!N14-9Vł-פ4-m\*ԑ%ܚ漥9oiϥmir[ӕ٧Zq^k!iSyRR¼z.;So{Ī/9]k ^؉ua]v݅v^}CW_[\|뽌[l_SԼ[ui/<_ZxtЮ0^z uPI-OUןv_ U=-ZꩦmTu;Ʊ q\_x-1gYMyaЬ=V~γV}&q\һk~i0>6f_zڠu8nruQg큼?pbvs,Nw 8Uznm ZcSlytkRLZuޘ-6b֋O\07ގcC#~B~#8*r˧:O"}/%>{k7xTԹW]"x8.WV5gޡ I/w"xE r[03E. 
9:E.w]^ x.:5CJX&9w)m_/ig^ # 0868`wP0yWĝCƺ8;;ߍcQ;?Ƶ깽nZ  :6SO}D܅ 5-k!X8waȯq0 ;c!X8c!X8c!X8c!X8Wqƽ\Ǣ(p, Ǣ(p, Ǣ(p,  Bp,  UrUB/\WOAֱ["vpnw;vp?{<` {'w]ܽwqgx|O}!4E;?|?@-92{e+W||Ʌ[ܽpX)W-7˗h|Ʒh|!o=O"[dE[dE'nQoQ<#͑rWW57y7Cn<` ` `1:WSGuQ㸨1b<*'9x~4^n<܏M{?O?K~ُ*[Q=vh_^K=" B B B BۏZbA_]q_}c&Vg7%"j\!JlEyZWŽ<Ž>lŷ|.wi~OXW{ _}x.d؊i: a~hfHU1gdUcs$^ǦߊiPz\o sa70z\o sa70z\o=7lxfӔD=ߎ]Z.nDPAv)-!$gOWy2ʱTV'vf7tMxz!=u^h"Ot2(Anw:vho*?== z5ʯ>и8OB7ǟ~D8C٠. uesV+3?{s|z] CW&FG]{Z#bbDdA\  vFFs򇦆 VjP=ֆPе tV{S/hy];:p7dеHjxgR?~&6~n;S}&E_;}^dе+=ued1|T~e|+;WVj> ?r_%|}jPq{Ty#{l.YUaSTZQ졚jӪQp鷙i]|}iayFO'H'VnhˏVh7>Z퍏VNh&Z9>Z9ՆhC ԩV#ϗQ 4[3 ZǦjkSuη16h#{q;|3jG[!A4Ehymq~@;[Fh9mzAZlIFhy#{FhyZ}Z|1'l-^|8p>|8p>ဏo蘿] >vƍ$6?ċ|c8bg,sċ-‹]AŮ}Ȥ]|c)BqBANmvE]Qp`SyWʖgW6݆vݎHlHyz@[[ԱA~qM I,Gog *4Uf=>m7^.mȓGJ4QmUG MOV6P~WaaF94ڄ^]~Wo_,i5LZhI 48ء8\=52<yt12<(n9樛naK/i?QɐOkːOkmm نh6lɶb=mHT0 6lOdHdom6l6|Fxgx3 <mنh+ո-lm`m`m`ۆh6mj3@ TժG2$2oW1ޮ^jP>m>m>m>m>m} jI^+|ħH|§(|§(?MHES>ES>)O!| SB§>)O!LB$zzh%Sj`SyW=._ꄦ4)JP RU4=iE(@tA3]e((@)JP R⹔*b') MAR⣣_P]Rp'; Nw@'8 Nq@s5U B'8h\MgnyWW8(QD'8b\M'8 ~}?4e,`{)2 Le,`{)2 ,`X&2 ,`)2 L`\M K_)/~ K_p;%/S_p5%/~Ej1KY`%,f 0KY`%,f 0KY,7(I7n,mR=f<=fjz>Eog}~Q, za|rgn͹{*KYnu^t'W]gvEe,O)|ڑJ_ȳu ^*K=rY@e˿;%MoF:?#3ҟҳI?#K4j>#i,~J!˟^h_oG,~Lug~7C$}zԩ?i]$=My0>i]1Og^8ҽutw@9q, =ui<=&q[_O}k|f4nl+kT)-Eh-o[4I ]y(ij{֥I t)mQcV{ȝSV,duU+e:2O/rVO~(\UXowɞW*іK'RJ߅|8_ ~HoM#Ϡ6hk?&7Lx</[koߚ.=M iCD;Owi[I>nǻc^v~?t~:< "}nUOhuҷU y]to:YmaG5`Pϐʾ-Ow75'\$P_OO㼦DBFz>KWG8&um1v:W}[d>>a^hohyOȣq1Gt91nMy0ݬ|\b6~O.YKJZ>u-W-X#~Jv\Yi$=u?gŧEfwp<+?C ׌r()\?itG$ кͿM2Ocz6|Zu6|Zfc7`q7`q7`q7`q7`qM2O| {aq/<&+W]27xLIϧmHW/Dp>ɸ+y3xQgud 5U_v3]봿_\|ӨB[tfDO,2mO>OpApgh)7>ݑg|.=(Gilj 0zk6Z4ք!]#wtlQ?Q0Sa2Q{?=rCOtsL!=WQ`jtw@11P`5jSm9n鎻19*_O7w賉\4X8_i cT~/^MWlW-]WfT yFƈ譽ppS6Bd@d@d@d@d>#0s2ZG-ъHъ /]wٷ ^iulqLg^yswrv}&7WY3IN]7''ܗranp,}3xӖ xV/~zBoN1x]*߀ڠK౳|LG>M X7#~hO: ;6DtN-agpv@Yzx~nQa=5U(Ow(4w)~ڠu%끹ڵЮ:,a8ڒ   k<|=<2x6#ϨMKCSX+5P=-ʉ(@5Pb kb!][[Qb kXg@5h Dk ZQuf ]B@9PU}5 шE;ŢbNhXS,y-f,;KŒ~u;źbNnXS)vu;źbNhXS,)vv$r/yKdgym*g۞0!#s/r^ò* )ra9ŰbXN1,S )r^ò2S&3 
K+ʰ4`XگpӰUUU`b*J]i+vqׂѮ4Zhv_`Xw-m{8w:[񪲹qҳˆ}Sd[dqd d~ܸ`kg~l_ o6({~؄~ 8L.C \+1rL+1rL+1Yw>X3&k~ab؏;%L5Dkîa)lv}#07c/q 6nDq5nMԸʞ. 5nD0Q&jd7eaMvYn]&d. mDu,D]왬晬ڙԙ0}ad: ;qɜ4ѕDWJ]i+Mt4ѕ&DWJ]i+Mt4ѕ&DWJ]i+ttJtʿl Sq9Zӡ5Zӡ5Zsc5PNԚ5fxZӏ|5h͗Zz lѕ/du7W<6hPe1PhP A]4ABG@ΆV&o!1Ov4:uQ.%%%En6VbH.!bH.!bH.!bH._ /bbyY0)֯9QE. &łI`R_>vĢ^Q}WOVtu v=o^ʽ{xxGGWZZYhjԆhuðѲY,Ҭkx,aߗIWw?2H^RjK]$)HRv|ΫnS"IE$uΛA9tt)Ǘpxw^;Gǝ#ΑqHs$9}R^:ǝ#ΑqH hs$Es;GR^Fǝ#)-9b?w;GR4cʋiޘ; Ъyc]ͭycȦcj?7qqSjʏW-5S^iS)r)?%R.%R.*HKR"Fʑ9N<ɋӁ{ˍoGp h$A6p7VKtEGxὌ"aA]v晟\;M8aeAeY/·-}مYyZtֿ^/Ke:gS?!ņT)!Վ#yX&f\6㒾8*yWa>07Wel͸<Ѯښq[[3.wRNr5d֌S㻶f\˥].up'-k NXr`ϔɶˬsXtyng  '\ve>R+;2 SXu/K2f.u)3vyx.EIn%cɄ_e?K9(~<,S*2]2I~ނe<gQ圈!&Am=,lp@);p2q] ˬ'Vv\pwIޭé guV[rlVkZn4 F!tctn[mkxvk4Q:è'U{%nS>a"r] _ѝ\ctwa~~Y2h5hyoP[#KQhbc41Ml&6Fhbc4F=\D%.Gc_q ]r/O_1&%@}d&նɹ}D9n0@msNtЧZ:Y!+$]RN}c0=y@o6ћ+< /)sb/Oav^0J^fB5YMtVSDg5YMtVDg5YMtVDg5YMtVDg5YMtVDgYun1 kCÁMG:mګS{ujNթ:"6YmxnTY*g{l:1aYu+չr5^ph.ګꢽh.ګꢽh.ګW{%x#|Z+r2Opn+YEEu\6aWg&܅0R޻dEuQv{KuQ|{ywY:&d^g)>R. 7ؽX). _ n0V{]2*v &(./]l%w K· wY/ E)w]?ҷw9wCAwy . .K~c[ܻ(R7GX[ Epå;/3txEw]txEwE_)هZ KEdqou!{0W@b7 ܟ4{;&>bǽޑXF%5 g(C؄]8K8^eT4ɢO0Fd0vF`2 c;'3EDW-hldd#>͆:ulFTƍ&lɎmG r=8a}|~n>lj( =~vvGDpVAԂ]E.]E¶{q݋l݋cfJۋڌ,ȒD4A^w״%qgk|RdȔyïUX9f=#'T|%!RN飌ڂ\o\y/뼗 ic -Q 'o{mJKi&fi2w$b #` -pfX,l!,d%m;ձro4èNN?, c`2C9'ՁO <_w ߶9hDV {Mo1C9 jZ㵵 Ͽz Hu<ho:Lǃ:,K=ҾNl/>jSh&ȋJTo>?gobg!,$)D./IIrr9UL0t{^<$.-X{a2ѽ:٩:StN=ؒ'̃Nҩ /.eVO)6*ak]xCzل a^“yK򄤳&cDA{ZR$u|0&d>DcflKqq5e:S}4JF(uN9c9sX:۰ "/lC6t ]lC6t ]lC6tz.CϯmM]җL 7!b 7ͳ*8e@Y_>pK %pI,K %VG TC,{'`oOPį7v :( IX 2ba xNAuGB (: `'{^yz>NdAT]pԍ%jym5qmx ʓfcr0|,MI 2=(!مMؑ}G' щ!:1dJ=*e9՞c)W'L2MD\+eB'LߐЉpNC''-]=9i3ОQJjǤṿ+C3N{rٓs}7 E 8PuL9:L;=IzozI2=9|Ijhz>{r$5tRCЕa~;7[}/?%oH|GEwV3A=EEKtiZN*kx,LJ=[E1oJjTip:xt7)[hk<36qɋZpv`]ܘWu8 4\FU:CrވO;rjR2~& 7.yrl()˒goW]L2'\'iaF+Y6]DzlQLfHVޫ shDfݡQOeI}/Kҗ u?ytO gzߦbLS)v;`0xk Uo}5ZkpyHb`?̾}0EPMNSFQ>)D;;;n( d`#qfpf8^sJqepe8 # #0Cadq/! 
eg86o-e1b E j.ZPv.gЈ4*ӟ|:xD඄C8%cIAp'P{6ݲjܬX=ʍ,bf:{?t|d=ؙ:˻iMuaLu"Omnwة y KR@3;8X>W]!_9*`el1  ->lDu[a| /A~ "'Yד 8سc}íߦ˒gȵc KN?2ilI1$`~M&|_8zrJ~g8{H/Q?|KpyqLyC#.)eM2Wf "Ĵ1-BL"Ĵ1-BL"Ĵ8{Mۜ+$I)r؅0f4͏RY!&t)_3#I ,ӥ"ir!&fi ̏#1?B̏#1?B̏#2?Fĝl)IQgUTcF7 n\7n0d`nnITHp lL^Uߋ&<3}'#OIczYyne@úuHeMg9A)2(EȠ"RdP +0>^{{~L6v\fm:e jI%&SH) EI2S&}ʤO)>eҧLI2铓~bDP^N M M MnhwC#ЀoW0|Vɻ>yA/3Ȇ'c[汱eED+QuDfwe;pn1?:>¶Y":n VDh.[TA4 |nH!+.?|1d୙?FBMXǖ+`)-E6[pߋ͖g=eoڝ8&lMQiΖΖ9:J`c7>MT¡Q84 F4e7>kpS$q:W ~ /sTأ`7Nۃh1&q$n &aƆ]_<2 %BDh .{`~|8dJ0a@O lx4AS-^1XO `<i8{K8ɳ c q G }0&;[#R};"3da,̎^ 3me@6f"6{r_[6vxIKI!LDh3m&D6f"LDh3m&D6f"ڼD7 Zۻjd8S9e)0YX;&O:0"RuE~Q_\QtxvE~Q_uE~Q_{D?il&ƞ/:< Jm_uE~Q_uEy[ xɼN:N/q- Pj6fT%/:N/rA4.Rԩ.RE"uqH]\..RAbP.(d"kԯ!n?6oGCI.{"pq88B!9Sv"!NDS")qJuJtpKȄ8%B!NN wERR4':+jr˵.zC:|aWf~"Cc` !1`Dzr 6(qb8.Bl fN q\u\Zwa6S q\Ḏ #m@++ݍuXl !9D "Cr@!9D g di:c&pNJNa9ygfN;9*CP䘉v'wrA;9&4cޝW;9^*?wrL1_vrA"S"N/~'K+Jyɓ33N6o̔y‹4.E"6lؘacʆ"\ކՕbuX]oFVۯ<5~ )uؓ K ueֹ΀1qk`ӸHAapP5f 7C!k8( ¬m nnk8( 3cX}k8( ¬l YCГ5f 7zcx1^o 7zcx5vxI:[d5\l29L&9y,\Y);O5Yk]Oet>]f]ػ<.cq >A18(NzW},pY|%yJV2F"AP9o\ql 4|>"/7=6ŵp#N0k<7Ãh|;h:iՅ9QSΕA޹2k;\2zZ'˄2!oaY>dC3 69'+mDCmR^Tp!p=pav#EwÍeֻe^[ s2^t^[Q'˯`3񌠏;I$I%y>3ۋL209yftD D ً@"OL$3FD D 1$Ɍg2L3`oJgD]|s9_/p8 αG{ͱG{ͱG?_3g|s96fcc96fcc 7 4obO.慄sJ8sJ8;S[s5|kv.;9 r:W|+/@@. K벴벴ClIp¼/dY7D֍I^RΒk]Cy8DC!2SX] y8~‹<.}ե*2K\rK\m|!?!s"?@É@s"?ǕV9م.hr߷wS|̅ \ۧ.-K9D~C9D~C9D~ΒGC9٤83<'|K1R)3R'/’)buX"V)buX"V)buX"V/NrEBRy-ٮu!u"b)"v"bɸR7ದ?7EM{S7EM{S7EM{S7EM{S7EM{}2]m"KHK[ ަ)mx"ަ)mx"ަ)mx"ަ)mx"ެě^ wJ!qRꠄ 2`jC4 ,1c|Yi wK"ъ:0 d:0ecA C͌46SRϜѠ:`'&S2ԬFXpNA杼-&o1y c  /OՃe'gPFuiTFuiXQߊV8wk ]h&U*o"or-f XND'{gͷFMo8e7c}pR֘B(!c2<nCFQabTg.a޷BM|Y!e&Yn38bG)#}Շ)ha`c[g91MmM46&DshnG>*fb8M4Ftwa)S"4F7&D=ҡMv7&x<|7&DӛhzMoM47^9Bs-l5\c3ǯ8?. 
sǶbg;;>sc[3ۊ1x<1xb<3;px.aIm>,m>Ç徵}x3&u.eLV2Y]&du.eLV2YWMV n<+=z6!_.xwwHQ3g.Xz]%l%l%lp¨1w wa)mи&,ur)]rߐ2S L,98S؄]\%gIݖԭ?eqe!,qmA3/dYSX?Km7%KTKX&K,Q-YZw1u[XZdjՒ%%˒%%KTK,Q-YZ"vu_%n["%n["(aP/05(΂⌟#Oy lyi4yy^n]W4+"B\pdeÔ:!|VxOpčLz,yAN:[2 =)/elr-1)_]]E[qy)SQ)^/eN,Mx)S)^_A2 "2K̫H<)^MLRx)SmMѶ)6EۦhmmSmMXx,S<)-L)-yS4o^%Yu7Eh͛yS4opROh-Sp^e14rFN)9E#hS4rFNj}'/Ꮮݮ/N}lb x8\@Vw[VA,ł\kaɮ%۵0vr7y9<ܘlC-/C`eԡlz|}0\RJQi#8M/M׶Ͷj> "n:^f}jIPd`=!3R5{D7M4{/͎f ;vHRiiZwl4;a< 't.йxB Svȹ.rEu]l.p8vx;Ӂ~:](_Kv-QdGْeKv-QHbw .vx;r'M:]ew+0-g}YewCl%;Ė[Cl%;Ė[Cl%;>L n2-;_/~w~-#[GlcM?(1ql$G#ggY]P߀9̖LK ޫ:<ʩ/y~M׎Cؖf=}LaqaMg9>aoasWsZOyS>Oy;.4@EFѳ1ϗb <].׺ dL\k2qM&ĵ&cpmͿ:6>pBs;gs;g@8i9[⷗4]3=0d<4wdL$Ywb 043X:2zxɀ^2 %zɀ^2 %zɀFՓ b,o]A".jݘLMKn,d=M< J/l\q}1i&$3%$3%$_#XX=fa>= 2w46"oA'";B'fW[('!t2k|,rW^X Q! 9D (ͫ1ϒkmQ,k, 1 _]IQJJN>,YO.~sӟK߯s".Bdœa?U1PIj2SFrnJ:a.gds0 S+/&EӤ 2RWJ\)+9M_V;.j5nqp#'RЊ+tdxRaA'>aSUL}u6R-T=%QXe^Dr Bz(sw#a)/TS</ /*^U{?k  ߒd5d/ r`A'$*>. Kycxg>Y*SWe L@92UYEn,ʐ]8Qϐzn1XyV?zB >`}g&B ~(4B hW;:C+YB%YhōϬ9"ڇhTc*T@ϒ_ :ٿ":DG#; ~4N; ~tc0omgA=8a<=PC ~5=3RZ;a͟h GymS^^Q ~aa^]X!uKW\fqt >Ü,9F-rlscRp(~Έ@H fdWZ 6ޔ%eCy1p̏ronxZ66aƋ(Mb%%̅%uX2Bbi.rm!?gn.؛}˂89vKQ+j2_qC:AoPÅ8Xッ 0ĀS9xsc>Ǹ`%0DC$i<尔a"eHb M)S )"etxJ\+u4@b M1)9Y7>4ELM10 uX8_R>X"֦]uProqS6EY2;v ?f`Ff`FwQe0oc?Fa^{ պD<&6cbHe#B1&1&6cbHwMkv_^?B4f"L@3mLlD"D6&Ķ+|l~mc"4LlD86&BD!`"L0&BD!`"L7kKӸ^n(dX3<a8M sBwaۄeA^%\[˷M򸤇p2ϒ+u\eWxffiK7,v^#m׼"{=.] waޫlˬOWtcʎ:xZ:gb3ΩVKb]^!qs9ef.e &܅Yzͻ,!D2Z~;,e7s+FƲ5GP7e_:DKnrHnrHnrHnrHnrHnrHnr-Ll0ђ{{{{{{6h&;d)R8kHːS2e܄0ۺ\;%ϔNcO_~@hrgHB }R? x?5ea]x٘˸,eN)SL|ݤ].tMr\yQr2ȅW!\-h~ecR3/9lM4[QYRg_0fk| 862$]&S^;#{z֋bqY^2'^8.&l _D&l|)\/%9 pgƋas$}Ij¬O\kϫ^b6Kz:tX;GV /:;6?=;1g缧V9i6?=Ͷ*6?;]W?q]7yw:ԣ }Q۟7). 
YN"|O>Ӄtۇ;[_9Td]]dWEvu]]d"Tw۲*eNoKeݙw7~8Ï>Y=ϋ<'Å˼2o.˼2o.˼2oq+#㯘+ l:'_, nw1o";ppvX"2<-7yZnirq;hwM>51Ft0 quFGj=ich2zԆ'゚Q:ԡ63 NiXp:'O.ORBmi1ڇ#E!.Eorkؖ,9puf'\"spbd?`pvtw'T .eݞ.OS%}>wRgI?,5058†Qd,_6ٖR/W-4`M !${TZN919XC% id5\[Kҥ4!mG0Y$UC~|`Yüo~.6Lllor3[`@ hw>m W{i6-Qm4][+sh,1_mBa,e3qc!W4a(eDŽF.2zd6v;ÛFcj oIէj6_]6jnYvJ]d.[[P0~~]0}l>vprؕ44qصΧC獉m>^k#}֛򟽥=Z b]㷃yü~PeV|# 851NM|:&Ʃ]#GYvjͶۺpVt.6ǧ-%q%T8QS7r,enqcٽt '[>>z ýU~u2ԭjLmWUɋD=glMԳz6Q&D=glMԳz6Q&D=glMԳz6Q&D=g\Pv2׳U tuz@K׺Z._8L}q$&8,=TVM~&(g~&z`ѫVG}U]0چ7T?еee%wl*\yO>#Vn?[ G'2s7[b̘!6Jخ†~։8A:Kx{,t"gpYIkD^flaT{a7\Jg̰3f؏a7 Qp<7kE5Y\TǙsqglVba 3ni,b.j~m1PrmH.a^4ai&y97+n=xm7OZr `v\<11eÒ+i$daG~̳ME2yDŽDz#sgN⇱rî>AMs u<1]`AFQOa'v7A\µ]µ]µ/`:]va>Rfʵ"%DzXt󷄃ڹ5a=Tل]8.͏مF~1ed>dĚ+K~6>u,eCH,"ӥn)uK˼V]lCfKȒ9f{̅ xX Ak.a..k.....klUmOE`Z;Fh.Z:mU^]ȭa.M! ;6wwVɑs:G`cG9ƼN\~ރ:R m)3DfX[&7.#wHNDH8PBLP3vce>XىTt[#v5b1</e1֕4L~0!\ d ^Qy?Y;nap2vjp¯YcX4>v% aр_ nX.9,|-,ZVվmj_mj_m Cd!y`:g4-%K}]KY~k![ q C !wab세B\-z qB\o!'zzCɇ}c:cD qeלUtj*8wPкTtDx>1 ['Nj,$ÄaB,#d0YX >*q9PɖOF zyΞ gO'ٓpd8{␾ÞaOư'cSbؓ1)))X)%E8I6aegc— eRe>r؅'G]6aي1eށNx2p񃋅ˋ^V8#Fg(Oa7.GC5 V/xLoawˁnߐ #/Ok}o },e냃?xq "9w^1d~Ǭ <|' ǝ׺\_c̰:Dc&fŖ_|ϽzƖ?q}_lc#'̍x?\.p]/1ނcG#u$0a0Rm1Tgm+ ^.(i Oa>ԧ/'L4c/'|1i|AMP\ϧK9)MNs=%IWOs?JsR Aa|cSEWזns}us\PҤQur*-dfn9רmjc^DGu@d#:R˔iqSJt4jIr.U* kp\L)E2;6<}'ܘ9k-Ysم',e)&SΚ#{&gx4ݞGis<6Iwš܂Ek:h=D6ѣWG?6a69e^)2hS2',t)Gl˜rSqHw<>|Nǒ_s\r"X%bIׅ69e˽Ā޶Hه.;t;xI:"\s),us^K9\7I.YS)pWH|lk<3a[z AS]S R1STkbkU9)sfXӫaքF^̭^Vanu[ؽ ë0: Në4: Në4: Në:m~lY7و:mN1LNYŔbJ6OU]L.TS)ŔbJu1R]L.TS)Ŕ?|(;?@ ~fbt1|>] .OQŨbt1j5]SF>.FMQŨ9#FMQŨblts<D0j5%ϒ6.ɿ+侱OIO FMQӯb`T/].K㥋x+lκ9eOaKY72&at1d2]Cҥ|yF0d2] 2Sat1d˵qKXy|/].K㥋YK͕㥋x[bt1X,] .K`bt1X,] .K`bt1X,] .K`b2XIEi5͏҂~HlBrp2Fm:;R' `䋸VNKHFFfypɚu fz,v Y1dcbǐŎ!@8cf6c9ͧ1b~ 1?C̏!~@c)sKV(C 3.I:;ko>nڣI9m˵m4[-C̖!fːuR˵.u^27W 1yq<}=r2C4C'󼀑)Oa !jvO Cf6ko<%bIRft3DR[6-RfC,enyB)'% Ͽ7N">cRjwdجs/1$˾~r{r{Rw@O?>"'r-0E_}ie[LS/kdhP,[+  3>Baz%kԇ\YR7yF`860LҡHk)ll t3L71Mt3~)A6)!8|#%OJ0M4nb&fnb&fnb$eـ kSD)2oC؅% NrDM㏥Df˵e50+TYdE|p Ni 8 w6E֨=ÈH%9;M6k:q1o <q{3DfXAays׈NDuq2ޢ ;yɼ3 e ^a68F`(wV#y鎳#rk] 
3J:q2]#5MrKKe=R,uvNCKRrYt6OqMVmfu&jr\&jr\&jr\&jr\&jrV9ciJxD;;|S EhZ4='Ew ;wmPAq3*]t,:\.UG\HhEAЪ%%%%r\)hsZrLӍϢ3f꒕%~ì'|KV|`>>g=^5H2Ćaf<}K|K|Wǝ=6G Kr#R='yB/Ÿ}yPVŸ- TުINFB'#r'i$&)aPB$b%)VPR$$)@GջΎvYq#3//%%%̥s'㹓b."Ks)YhKm)Yh˻Uі:7\0RKtϏ/RK)ل0ا ;MR~mBOYJYJ)^)^)^'6?0h#RK2#mW}5{<9;(D N|d)Af ..<.8JR%^.-@lr/kMR%e.ɿ2fpMؽ:jSD SSSvK8K:U\)NAlq*rp(Aހx)>2]ʑ|>f{]#7HrAJ{`w>5XEUymD{N#d,815IxKZSK?cJ9$!o:9$?.dlGm^mK)r}Mo+݇ؾ8Wg޴!E'qHۇ(j<4'wp"uqiDQDvaԲ,.,魑xx(O O>]t>(/O 2HYpuyOaY?urYӗ\0̼YjYSBW.ˋuPC ׹Yuy/&܅0Ķƾ;0^.rK%1 s̕s2c.e\>KtSt̓2].e6[.t,HryI:viK{=gl.spx(s`}\d\9l|.e\&exK"K`~Nv92CYqϴ̏ u.K_Wf̏J +V!S+mTF;jڿ3\uaJX-5O[b5?q mK D^roz5%vב:rޫt6*u1-^:'55fc1lw~yH'FQϾO0;wT|L/~8&evH_~.s/ o›9D_6swl`A!醶1TlT;˩.'9ymb6ccMږ}}qD7M}qD7MD|D7MD|D7MD|D7MD|D7MD|˚AߴdUTJ.>:N 6=r[H:E^%/읈r'[[ bÍgvf 1ضq|t"5q2ޢ ;yɼˌ-@jwzuӍ1NUi'X!n_XlD՗%5r7,2A-`X(b Y(b ',y6(7d7d 3\^}r=^^\<(3-
    Xӄ!0陑cόsզxfxfh633y:L!GX[ڔu 9MYOt2t،FSm2rcy~"H!)3!"Ϩ켕>[mdVjGV@<=>O diiiL?M9ML9ML{I{I2y4 3dA]<]1C08U!;9؇x3Q&Dd[; qI}Cp  Wب@{RCޓz {'&Յ4i;MuNi;M}<5oXB}-6xc"2Ni;MNS5ޢ'a;M4i;űSߣ|>PYɎm"rtߖ^G4=!,{eciNi;-{eϳ~ā0u1]_aZS8%]L{Eh.pt1]s.g?̈́5.gb5>qCY;.w!̇M{w)Sꔇ:9 i."]Lu:%rf,9P]:&fX=FQ&!}$_4F&:a:Sq[XIh/vՁؐj̉e-&qn |$_47."K8cKRY3vEĸو,!qE_NbTas,zE_N;[LX Y\[ hD% ƣviFI\#HAR'}k\#HO:2?)2?);)N NJNJzBRb'tRL'tRL'XI1I1IٜIٜIVR6'esR6'׮9\Zҗܺdz6&*xx,UK|fIٜ6/UBriVϭN.-$xdԞKNqA<2!"S\c) );e=!ERRz,E%}AQN;<9mAJY+h+y{ T<6sǣ/㈎#:W_8c?7z]]g|ĭ봎3ەz+N.?C:<`cW ne`DG]ƃo.^ yh7yN|F~篽} Kpqu<$눯&]G|]SdWqY궤]R[D9R7u&]xOak1p&$Ciq"\T K%lˌtNaD. /Ƹ,˳s uY%g}9/ai;K$vq%V߰,e90Dbձ^O^n1f̅h}Y(d^/CX߳̾ ioXg*y0m}9$]|!=҅YNzcwx68`*:%y;]uaI/GBg$r6c>6c>QҮDPF%l8pEXD5Ѡ&Dt;@]O&ZD ՂN_9>s:-C(8gL3Qp& D(8yOr<%$KRv]r/{$z.v}#&g %eLX˜PL&D1(&Ŵ}œ0.鼯˸us(&do] tRfaί @:b2QL&D1(&ŴyJ)D1(&%HeGctL1Q:&JD(ctL1Q:&JD(ctL1Q:&JDx)q(q(tҡC8 ƹtT0ΥC_YQKN\:t.:smѹKtG~wHuGx~w.:KΥCGv:z EBEBEBEBEBEBEBEBEBEBmHH`9XtYtYtY;¥]]jby9>]Ov vjbM<|2`b8p2k~=ˬceeeeeEjHm"]v.ERHjI"]$v.ERHjI"]$v.z^ z!c!cuAs}@!cQ UՕq:vW>\Rzܠ~`: Qp {B<|;6:zg!ܢ^'蜲iy y&/7шQXE.ݬ#vSjYu>O1nCy2൵Y5c|AE9hKj vif0;>z)Y?"~x:SW;\'{ C$y;&:2CGH/|X:x d91{4JGxɺڥ* X],)}m.df~2]:f} 5 }D,.j0aak a. smt>+?aK, /Yh%iIxɒ%%K4ʹD3- /Y^$dIxmD-FKQ6 h6 7\T7D "oJDe"/_}΃N:QIm:o =pP 7ty|C-yfC M-WPm,|!Ո笎[e1崈.Et9-i]N8ݸ ܅_k\_/ɐ7 ȟrm]}C}C$Cl)$xp ycY Y y :,Z,HƵ gyx}; !A  N ZwAu(/C\;8u!v]]q8xB.Į 8xB<!Cbx!6^6^_&j^KxI/i%MgA"&W2x LOxI/iDQs/iD8kc{:<Dc`z/=qp2y?%t{҆Kn)[pI1=pbz&'t9nr:]|]t2.n~+%]C}bPU<>p[Ѹh?HJ)c9f/ÅF? GYTRzSAYι@ò Quh8pL9ADՈgnV֨C]+D+j'Uf8 OW&éE_c;;bxoy3t]N u׸x^./R:R2aA7O#2WE6#5 u0Q"u!ͯS.0gF>.w +Q_ھ, wK˵.q)z&D} |氬˼Wm,2˥x~L7)L8$^.;aK} IՔASn)ui^Mһ0k6zE̶+d{,ȳv.uhry.q̳$"?dY/Oa ,CetioH=S)&!X>ܙI, ?,B/K9SO/Y!ᗧw))1^{.k{-õ(%sN&i#lБgTN^3 Qٰ؀Ml26Bdl>9EN.SG؉;1K{ٌu9X`` 59Xe3sݪ Ŏʼn3b?ae .Į@`=)xhX{>) /{Gi 5x-t6a_ ϣk]yk }όY@kD/4VXB}`7 F1k1k1k1k 0s>9ه>.r%oY~%or<80OL\&sfs?Yf>䙟pf0s^9fm_ 3=$=Kgo&d7r-qN?,G㓦kT:(:|5.e#J 7@u0s9s9s9xgz rl$qъkOmD%&DuFM9>1%}5_885]f>F/LAiwJY|&. f-acL2]\tq. 
eq8aiq-1~u?oq-?c09 uKmt?;0ra/%t^K:;vv?,8^{* Kr>Ԯg;uoJ>J+?ꐮꐮꐮꐮT;G|۽E e$o$gB$/ nz2e$q,6o|{³e\oG)FI+(I JqgRVJqg.)VQbhK6aM#0]k]mC}/KK\&,#/K! 1)1)1)1)޷%ȳ’'>م%-`%,ƺL$t\Y,2QK~Y:eY4avV 0;qp8<)9.`K\fG\5]H)3ZLø.hS5Zm/{㰳F{rH:IƸn"l?<%D^,&$,Bva铲CS܄YC.0e.5g`t4FGoxP&zc)FѨ8'uHi!:Q426vxKT k܋T,b4a? x\omˬ7薿[I;qE_t)\D:5NMSD:+BGCb4J*h"1wrI&ңh"=H;y_H J D$f&3Lo2ӛ&3Lo2ӛ&3L5U(M6J?z!fM]J '9s\2W:䟒GU:lt.y\w:aIK갤籸,evHaqt]qyew]q?N*VNz+'x+ u w>#2cXgщ̰PcیmptQN('](؉8roYelärE9颜E/3^8ru`:Foa7鰛ؼq98~#\sS\sS\sS\s),.w);6ý6Ž6ť6ť68 mmm~xѦ)^)zkmæ):læxѦxѦ)^)^):læ)^)^)^)zn)zn禼OsSt6ŋ6ŋ6ŋ6n72]&D8)qY4|8EgLD95N>ZL8 2\&˄sp~'Sp.e¹L8} ߯pao8'˩1i\&ˤq4.eҸLI2i\&ˤq4.eҬ4;ֱ#?qjEoħD)[IpN"kQl&o}l_mE[}V_mE[}V_ױ{!n|U*_b/ʗXK%VK|'{Udo棠U*_2×XKf_zxHu%BYmœ?k&̾oe3ӆp0O9(H%jp(e)ߥ))eqa)텏Ic6ڐ<&wCq:{l쬛]؄y%dM/bɳXҷ!e] 2gC!'D"|OB$O9tT'!rkJ$KR$KJpI Ķs3"> hFc r!)dDf%U#i7Lv!$UJ~(2iM*ERHI"JQ*8pgÓtJN)/y]Yfu#)E"H"R$R^T%3DӒW gM$˄TI*)R%EH"URJTI*)R%EH"URJRl+ QIOlBgW;#M$iD' ^Rc1Jm"m6lcpnt(-*tacl`?HG 9ܴ$Prdt >K`jr~m܎j/ejֶ dERGZ)٨:UglT {&"f2-(wY7щX};>\fފ8:clm q3qYt"98(Ug I Ы06b@zӭ`ٔc"Vf.T\+LcSEV̧?kopJ_ ~44VKh\ Ue%KuũM.gJJPouijKCp}{p(^s[?v,ˤiqG[(ݡ_cre4-ҹx`4X, KҴH w-|F~6z}=܀禞ެl4` 6;^:07UӴP)␅`4X8 N`4Lxx+eXVMBA @- =lʛ=s4Q׹g|a>gAVi|,˧i|,˧i|,˧i|,˧i|zZ> ^O(͜l4{9̜.35qMAF21tҽ.51u]@0u1mokBPe}+k(r ẗ́ zh3CQ=tGM]]ZL?0`Lêbq0000xb=-vyW?tj2@$>@$>@$>z<$W:-o8T&-lŦ4oL*            Y}cʏx(;٤0z/렝(q3QaݶS$Ze%+FTbyeׁ-YS[~W{}oQvgMΚڝ5;kjwYSJ^eC&8]qUi^ݡ< %ZpaUVA,~& o~oXlV-#My+Na`~q7paд$b< OCjaӎY(GQfp.6ę"*ǚv2:-.rj&+r!Ѩ Ͳ_/7Uq9xijέpùd<=gA{aַ]}gvkRbTG."`V:şuHܷdDk+@]J$z o2@y6gF/%w> Php +,2 FX]/o0E.7v#NSnݔ~f,nY7ˬe֝/ 3jVVݔU7eͿIS)o25FTeMxSޔU75x)ꔭ7eM ӧl)[o֛ܩS)ou^0gn}[b؄!6zpNeFلQ6aM 3VaҹU@[V{J? 
|[6p\'jM,ÚZ0:Vµt,̡N| bC Ac򮹋]ߟw=r{ O`R >}/O~X>J/V;(< B+X`GRYHg#2 JuPTՏ͟Onj~ y(>k|ҨbKNNNNNNNNNȡzɡzɡz^]_&"%.+9*tɝ_^RT]`ʝK\rwr=vvi׽B74~/ WqK]>~ٯ|zf7'w0›J'{?S-sHݭ᭺O)sH}}⤔9̟e++)'eP/Uqxt }oF;ͮ`e&Jj=ۯ8UO2HλoBB/=v^}Zb]}qʠuh;ѿ;W8|;(AwP|;(AwP|;(A $_TCUpQ(vJ/-=wT_8*B:¡Ʌ!yCeSA1Ya!9CNS9T9CNS9 *P ik<ɡL:MaCm*<6FGnj6OyX9VG?FfC+ʍ:Jᩇ]Z[ͭW=l؁n󬲋QCMzkGlA 2A xhڀ6'ZGy_׀5} x_׀5} x_׀5U,a#o[R׀5q x\׀50^ x\U_͓=qk`: <&݈E^ҭY32uy'skḼRd,~m9 ea0'6*m̉m̉m̉m̉m(ֆbm8 7TjaLʴLÍ PEK*hiC6ic҆2:ohC6 *B´0e.Xm.Х wJ wᆻp?w9:_y 7Fşs9 7y gᆳp78~78~78~W[~7?4^skoyʰixP/TOB/ku">'e*'W_sd?AeׇBMp UqBU0ŭ w]~.l*=?x)V:{XLx6f z^i~+0ZHhV3,mQ ><X/H(BɳIf|=Vog;xv(ix B;Zwz/Fk@|,CJOH =[pCx<#\yHOl}bP!r:$6+*S>ou|_Ƴ(Ga<^Sɮױ?kaGHzV30L*:>:S}|IZi.ilY+#Rs6Pv`3=U*iiJZY%['VI+}d4'MI}d4'MI}d4'<f7*26=#$sJGP NAWU.&i0D\T~(!6ı-H!`4" H!`4" H#ZeE*jhjh,KRh,KRhu_&]B`)4X p,KRh]j߲,KRqXe[B`),K#\iE`4X A{~^XqE`4X]E`4X;ޕAEp0ʼ,"8,br񠞩$1i G&Ծ&j>G]z࣎tUs*QS*tjja&gm*ԀH Mʪ xb՘Ӂ/5S/6WUtKjmoV~h@h@h@_ݐU~r*bwT)FCS/ɩćUGyê#Ggiŋiŋ=QV iŋON+gM+^?!\uTd>@d>@d>@d>@d>@d>@d>@d>@dny8WXRngDgDvsHk2\ӸNqD:u$I4h\'ѸNq:PսgR?!z1bw>fMMٗJ>*B~}a)*yt)yt.s +n`W!*S_1y^zrŅ.Xqj4Ak mJ_\:uŃ؀l^nvM->aO&LW^>GO9p4hB.ʯC`32jub!&C/x!g'u)f&iBV7/?g[uPΆrg7lBVo x!J3 v;ɃֻB'и и и 8/isx y K\uZe,t򇸌u﹌u2]ƺT*T9YQXwca;u06'ҙ *Ԯ2ÈX*a; zA!ꭌ{qp;tc:YayE^r /y9`Wq_~wQr˃V"qlcg{V9A8Ap IQEn_;,,XX ތ`<.$Hu*@8(@.lXoDCxW|+(Ϛ\\\\p+ u/G)plOq8"||9~ϡ΍QyyrTЁ>!Q)>蕝 'b@}EsjSAj hjSAjSA([N1 Їj#U[)r~aBޕsͯܿpPS/:ŁtHۿVfw8mPu-kՒjټoR7U)^O`Eót%G9;ڒSVH]<'ro~0 IG]䞂7*WIa;2]fTd+).Ex6/RP.bvp+? 
ď%(<G2i__67 Z w^Zjߪ[ ^Xޛf.CywYkptAEWL}P/X.q Vvk֪4A[ y ꖅpF~0ja舏]4!}C~Xe{Hen߄ţ%D;qQDCG{|8,ioҍs϶ )7W==bQ;dGtGEhju9Aos0Y6^tOPPPK3+ }|յaW*Sa9B}}CxXF'ޜ)ldz(<2bS,x!V]y ț@0YF޶%g~n؀'u+x٭g#.FuK]3?=^û!ex7S(s3Bnկ98Pvb=W#>mz6`9xX1Pn9pCxG8xWQ5t :@@c 1: ܃v1F;=h1RF(unmnC6nC6nC6nC6nC6nC!Qg@] ~Ҽ!Ҽ!ҼQ21$iC6 cIIZp=-r5 Wr5 Wr5 Q5 Q54pqoxYCIԀD&tp9 Do z@7B:Lo z@7#]"zӌÈ*r@7 Do zۛFr.7M.IH@7 $n qH@7 $n qH@IH|"'Sלy)y`lLQԌuOQԌa'W^/.[@>斧斧斧g- Y ]U4ߑ,*t+4<1<8؀l.zP?UMű-SU*$Oph$b"6!b"6UXjI=WCA6\4kI [SM_myO5?Ou$?ZבG{W#[ |kߚCIz  ʶ1rJ9T۟o-/w jr9[sϷV գSoa8|k a!8\k1rXW~?2/ߚâpօúp 59,j#-O>,eL2qX&a8,eL2qX&aDZ&QIee_#(%FQBI0 &!$dM^qFk/{_&!$d }p`(:!t B1cB A:!t 4TX 01B&RO#`bL01B[y.Hg2|>;6MI0I&I$ $$`LI0I&I` U` UA3<50I&I`v->302{y ʔ!`>̇0&Cd is0Ӏ4`Vw9H?@>&M3懏 O`U^x@2<xN2s闌d< xsd<x/byN2d< x@2<xN2_{qU%Ht@D$: Ht@D$: HJ^%ѫ$zDU{y)$C=}'<K$z zeID/ID/IK$KIBĒD@Ё$IBĒD/I& MKstjҲP#Tf,/lY̳  ga..ªWg_򻰙ga5jWg_stIq5<,/,/ۇV9?>)e[ -M?Lhܪ`vJt)~ׂ~-ׂN-ԂN-ԂN-ԂN-ԂN<"]{y_ tB:{6-hӂ6LLŇC%l⤾,˂,hʂ,hʂ,hʂ,hʂ,hʂ,hʂ,hʂ,hNM٥)4eSC9GKIv)ɖ$J+tKIz{jƖflm/y3|)jbk;!"wk+;!?WqfdI+ekfl|2dh_U?0kWƮ =[ڰ i*pڣUҀ OS66tn'gc /3/:FZ.طOF]lCm`jcc]~,Q 0q鄾O1n0n0~ y+` ` ` ` ` ` ` ` ` `-2+z,a!k27gvQ>y<ņ4 %'}U @VAw.?`Μ qgǻBx~`F0ax#o[ߒj7w`w u sַ⭲͡ꛎ<GbLJ]pQQ#G|G v# xv'‹ktqW@ C.X R %v#C|C|g.|؀<h \pzw ?FI%FIamJai%o#"Of9oli4JZ%VFIeEl-ίVJ+Cd4Y'MIudZ:rN攻F9zV3gt]8͈&3th0S~6٥th0Lv/_n(Y 5Ûio̴divj5Hxl7vl7vl7vl7/?܁.g.xYԻ ߕ3aOq&ϥ8C0Lp&yNOóqg6Hl6Hl6y/Fpt6>W:m6Hl6͖=<p_?ʡ$#Ճ8!KVdipv_%Mh0\t`T2 2 2 2 2 2 2 2 2 =e2Xu<Ǫ9VϱzUۯ^Xۯ{xPϿ7qޚ.5v`dLvɤNX]2%~ZvwF6gV4\և a9C2;$C2\o=_J?݉Pf{sJ||vg|'^|SJoTw*C;C;C;[HW!'.ê뒓9鐓9鐍萍萍萍萍8x"9D:W#ϱ<8萊uê\L0¿\AAAAAAAAAAQ:bq8<}PrNQF u8|GMn]|NnΥ/KSqX:b&" ٚ\:82X#O|[j30큑Hg`30 PUPth x+Nt\{n syK ?CSH7P ?=<80~ ME]t\1C؀';0V)؁?C[e<})~y]2b4<3bsˀ Kf@k(30\Рʀ B^t<3aU8N_ҩˀf -|Xe@.6kC `E9FNA>zυtS. 
h߀ h߀ h߀ hthϥ}ڗb:cչXVg¾sV,s`3O&3)]/3)It.2)X:c{&Rh+2Q#H y3HA #"KH0R1-|x 38 >{ujPcCe%,􇩷-)?zAiA `7p \oz,㽎n⷇w?\l H^mhGq\-x8 2 x ;M>5O |jP85pC jP85pC :-X|EtȰ]uٟ t9'~΅簄{F亂;'#V#ͭ8e_Mvgl]T,_<##]*O,Ƙo1F';X <1fL -|8Hg"z@:뱐8h'5csp,`ř5&g5CKm~"s.&D+ǑGqSr.k[u 2F@b*zv/ՊEg޵)|O {}(ԋL]bsJxx1^[o?nV{\Q.OE󫸳UGJwYߩ/{{#]zvO{#]z޼u!h+"/:Wdž6+LqM0b6.8 Oy1k m!X_q$}$S=\p%ܹG/^2ty⼽niYZOք{񪻰/aSϭ1s(R ClLm4-l?܁ߑ>#NMZ֫06+X2+p+r(Pc)qpݚw1m`xv**~coʃ 3 X5ԩ)PΦ+X֡oX|?.jp0m1\U5a lX5a lX5a lYX D_<=; E5ѥg?~:?ֿK#0lO~MP- 26yՈljD5"ۿw' m}Pn9`svG 1 XzxުP}@:Kbdt=_H~Xíiשo H<'>I|Xe㋇?yKa'I`KawԊdN'ŦNaoiCS~4.hIHio5ЮЪU4QE?4C? +n(B&8뽿&XJ@t{nbpA2#K.iu֝ƻA2fq6qI3ev4J0*ZiaS@+ @+ @+Ήx٢&⤕&ojE%M˥K%_዇?y/ފ_<[=( h h h h h h?jpSC s GTn@޵j gN|ZxaT^hY?]|V}0=uE0]&JSu#+ݢ.Z2QhQe[˛.o޽sٺy˛w\.\ [W:o]{XgEsSu9/3 ϚQC:Em]^Ui*rz?g7򰑇]Rs8yhyjWgYtO+XQKV:1E?W;*۪Ng> it'\9/]i \_jiê\e}kַoÎpo料0eq8GYvGٞ 28 Q?앍 sWܴ=ߖl28(Q~ˬLp 2CС^+¨Lw"H,!ti642NQQl]֝rV‵8`-|m8`9Xji-:djQ*r͍gEfxNV9;Hߦ7Cz٪&U#܆>{|liFtrŦsT1B`q (Ǘxx.Fx:a<-1=wƳo,kzVޥWDzUW5<*a=["e)H_s1 +xK쯎ux|sx Ky%-#/*~^pqL] puJ7U897p ox87p ox87p ox껮VuWѾUY.Y.n}rO%<‘i ~ftw맱}jOnrD7 zc={c=5ڳJ `>v^O`W:8·pz@t#^횑Nxm'^ vՑO`IOIϿ:llyX'< 넇uՑJ t{:=NxO' |JgRlUdyL'< tc:1PMxL'RBjPxP_앇I hEɮO/] K3i}C`Kla>la>la Zu a a a k!X{X{X{a>lzzzzzzzzzzz:/7u.gW؇޹uGw f-/yzQ^v3F*T/>#ׄbp+녮\ ]p^w]]v񿶽&^Kra&9vqöT:ߘVuYg_NTƴrLUOpgI妄M-~?TX(w\Q&Ӈgg2vzƾ71j9>~x*XaoJǑ]ww},*opau)BopЋoppL ?FZ^(ˑcV~#Hý>2}X_)ԇUB=jB}x+OuO> ql>0· S7| 0ZӧPlPB}ExP/TOyKښ)V- eY6U͠WhIMop NA*op JۓO\yo+n ?TEXLqYO`U̷dLf^xxe+*B(o5DB˾sa>g]/_5SXc%H#t S?xv8B?raU&~o\@1W=~#>7.[,xLPD/"&>wh5.LXٶB^ല:ZM<,>]hei >ka>UEOEˢeQ^EEuP@[VsQgv]EieZ\asa".Ӣ6ʴ2-zz9[^ߗ5٥k41 v{-p}؁;̇;>}YY?a o#`67F<<ߚy`<]͸~x~S phZuZ:][aG>(}"D|ǻ|(<@ޒ:bd-QRN3 c1Hv R{-ycpW~ªxCE:C9:C9:C9:C9:C9:C9:C9:C9:C9:C9F*(c^R[:J%FĐJ yKTbH%C*1-I*o鐷t[Z< 9CC!g3t: r9C뜗|G$?蟯o: BG 6:tJSYS%D)>PPPy&߇=A$~G޿,q)ހ:Jo@o@7~~߀t:~7x&ArY> ê_GO%J\.瀋sԴwS1 1 1W|_?4l爉)P/x0]y BWJ^߮zpSnK.^rq(yC᳷zvv3g(?S:A+k}u1u֙hj?S`zl *mcpY0}pJw]0.Xyiμ4A%}cDWDT蕝]PűwǮq=Z&p&pgyatP<7ÀCn5kztQsJaΤ8[qO#NSqq]y Xiv{www1>vhMm9f%ܿߗ hYEKp+tdy^j`%;z^jv% Q0K@OX,b X'99pE.Yrɒ_qJ!cE.npQ^+E.BO+g 
ӥ-uA=u[xLT>X$goSynCu{G?F׳]*2Yj#qj?`{}V\ÚAYqDwᬬP}VޛXռ~* QK*xg#h uH<ڗ ?孺llll)Z뺭>uZ3B2R]fܬWw+p5Y5%Zr-3tzm_.e?@ƯXXXXDէMn nY>ٲ y ` Z%5تlqV}l[ÖEgO"'XZ㉍1˻>6 c[?+kxp!:07\ujַxO`4;|1y0CO_nʧLw9tVx:6VWJq\LƔ{7 *8xwzf<̡C8M5E<F:Sy(r\H3T_p<ύw{YG{8d:~J'o8hv~J+pV5t1A݀U߻m;ZOa/jr)_?jraMV/MׅG~VÅY V"g5\r{ku-HRܕWYhN*=gӱ0-}{X >'Z3`a<6؁z)ɯ_V9$^M7ҎMy(×fܾ?z#ߞ.* wvpr~a7<$=t'9a'U}ѿ%r 9{ߒ*^{KyC>{PN8y[y؇`zA??zĢvަqre\c֪Hc*n|?z,AI̅UWJamsTv/sꋧxj0OvC]#^k%?KU8=p(t̯?V^wE-M[xq)5R`.KMi1uyWE}zH:H:H:H?" &_D1" a(c:M锥oO\wtN;:yG'wtɽ3PV¨N>r棍8zQ3k+nkFpډj2(KC9EV[80UFxO`]Ձ;$:a\3N y\.{S\n R2bp1 , hr<9LCm 4C;HUѐ <|nsF,֐ gm8VHk_8p p1$8x{5i^lIV.+Ҵ"M+"HAD&4 i HAbs3=;lz<.6#$HA@4]L<',ѿ&@4 y/` s s م998 s s =l-zB[a =l-I }vM'N; ܭ餭餭餭M; v64liشӰiaNæM; v64l[U; O˳sc?oX)Ɇ51+c0+M;•%ۘ%ۘ%oe p ƚIۘI o&V~6f6f68k4w`5upGF4QnF\UޞY}8Qqo8wwVApppA4\<444~p,e4~pXRDAMU<$֩8CEd1 1 amLmbMi lC(6bcnC(66DccnC46DccnC@6dcnkp؍| Ari_2zI'!rvm4hjB®݅] vvuQGWcw-~Nή3Sm.%Mb=<ۀUtWe[/F%t/Ÿ7V~qq[8޵-yέSF'_l*<+9uُ^xV哖o-x`z`P*)MU(Ta |bTRƻ6;ޛ9kFʋ->; !aud}"iNEa5h@;.(bSB.1V nW~:B=&_'KFY_Ac;Ṽ"{+kޯ_]vy8`V~K#< "7 bֳ,P)  д-z!mAA `|cGJ&g/v@ {y}KGbŽǯ+сNއ b\#qu#|#8kzx7|-`WL_[zmM5ִ^[zmM;5GU~U}ވWhaP>P`Yww*._6_ڏʳ]wҷG]XX5}Ӌ0o/?xQ؀g}K'#ߦc ߊx]H;@)u%=R5\\5KJVtE m9p(?PvЮۃtG_o8苽`Fr#M|o$|a1?7䮐-`<ogQ>zB^Ӏ*Ɛ+ay-b6Ppv ұKM҄ok?*,Yhw][NU 합i|&ɧmi|&;b򎘔G;W9`6p* nsxx wS9WHٲ N=hp"U5| ._\7ykXtF b;v3 lg`;v3nYYYnYn޻n|y׆>;5XONNۧڦ׵M?*B~mݦSߩa԰|;5,r =Nn]{ ^Tf(O8'c X}b11r}}ڪYzWs2< (| O x+Ϲpӻr!(r2O>NhڒR|G|GAv䧘SzqJ:N슴rbO8',#!'Ĭ|)|XҙM[p@C Y8Js  e"9' s{¹=ܞZ,pQdxB'>AĠfb 31L d&$y>1 ;a:#BCN}X,:_eXmWH saI6$pM8&dN 'ل11sxa0IA=߅4C֫zuXa:W^֫zuXa:W^֫zuX5z2Z֨%*Qj\У Ԩ%*!5d Ԑ2PC^T= Vœ]Bv u+dWAV[rְǮK6wmT!sB{Ȝگu`=M]]{Ԇ뷨:9D`KqpL,`Z6z^m|ulw/pkǨfV.&+RxUۓo+gBۊnl4کۏ)q g@8΀p3 ጿښtVw& 0 F&0@ZwRtT#p{I^=8jN`Nl`[:ul`[:ul`[-gm_{k+͏jGA_! 
E3lgwe~Q[ݪ]]ͽZ幥FT2oXMXwGj#9YR % XҀ % X0 Yٮ*j4EJ7ǧKPC %S1*g{8 3 *2 3 *s[SV#`s]K<0?L-lh* /=كF W><:z6+՚υ_iptZ͵ 3 3 0 0OJ>_k,>Ue_/ /V<ZH 27/ 34HGj5^g_,haե~cjQمekVZc僛ҴʧpUnx{8wM<;T4Ԡ< `u/F>W 7wl:•6\[;.Ցy_iۿڻrSzDVݹʃ4A.V}G}ƻ ]/:S c]bt}Q)emJjf_dm[-+jg_kK/aU~W2\ξw1ptAE=;_1mSߖ'\< 5)X)>( A\z[W il;g5I67 MPMPM=hshj7ձz&fDživ\X/%$^yءĢ*+m~zG&gmn~n갵1>-S^Hg +vq+ -<)aWTΕƅqUnqrL|@Țzx)~Gn85`'}8Fi*u Rmw`mnŒ a9mo*xx've4c8hejn9/rRzӲZ .Y~yPdtV 99ddV7@ +h5ptEXJ!BBLE𬏜gx׀Wvjz#_ǻ hS Ň(rw*?p!]8:+m*]UY@<F:a@F<@B ӭ02Q[tT*M;p ^0Pr9@|Xʽ٩ȃrpGjņo1|gzCMZ0;⨂? 0݅B6-j!`x7Ϫ;Grs(Dv{^{/ӗa ur?4}٭ਯt?tuZB~ecxbCx'2&O*#}a?zU^zƻb+Rd(Rd@8 _?GE*+q,:N۶MӦeztl:=렕>?A=S'4z-ppdB5H2京^lF!~`T eur׵:R1)(֯˖{wF,CGr 5z-e:kZ2zm:qw~緥BzE-#yiv-#x-cڿ%"a*r?Zv7k;n#k/v'< 7\xqoOxV\Wa}Xh㹸bPcEauXDQEauXDY>_l?<Lq)Nz[{T;0x5 V_~VCL7԰ X.!U9/|1YRb*OtOtOtOtOtOtOtOtOtϿz~'HGs$< Q((([-+I$ IҐ$ \CNqU$sv䐓`I0$r 9 CN!'`I0$r 9 㯮B{x+!J'3`@ ΀g3`0 p 8΀g3`0 p 8.FϪ po2^`Ks TB  Zc`10p 8I_,gVDs L1Pˇ;Jǐ?`ѳ0p 8`0iw S?`?0c1`?0dzX]V׀5`igiY@}]k)ǺJ\Հu5`]gQU|,-*Kʢ,*+;aeQYYTVEeeQ,*EeLɢ2YT&dQ,*Eu+Y&_rY;0,eLv2Yv(OYeFݏwPIfx*t]ߦɆm0 f2Q~ /3`F̨#eF(e0v;UeF3HHO/5c(R=`c0z F1:C` c0tNJ<1:C`ܶ1 c0t +|:BF:Cbĉ1Mx+wc0t = ;ʠ14`1bTe Ac0h.zx ܀EގW9e^14_ʠ14٭: E{ˈ11#ࡶZF11#`YFysb/ْc0b FEd0\ p1.` b0\ p1.` b0\ Le2pe2\f.̕Y&"f+&"l)e<&οS{r S6 :7*:BtriC&:tiC&:tiC S S S :BޡɅɅɅɅɅɅWWX_y a*a?ra>&kf&f&f&f&f&f&f&f&f_-^j5[01[01[0a=LXÄ0a=LX#~OLLLLLLLLBk`b`Ց7!FYYYY 1O n n n n n n n nvOnv/nv/nvA{{f]"|5(uq]w|0*nɀkR7tށI.Ep)K\R"A' "5rv]#gd9[Bǒh\Zo_i8TER8Tǡ:qCut9h;1Hw*CtJWBf /7LW$_pr/Z<N$`,eLI< DXma?EK@LmJXUhJG L=[Xa5+Q+| k@ņn@nUB?@ ?@ ݹiu<]wqy}GOdJ?{[+yӁ2T7$]I9gNXuÈtjkWCFris3r}oF-[uXT/w}!l N}x# kyaH: +2T/nJV/_X:bXm,OĽa0oˁU&y|zW8?x"yihyŨӉ:cFr#FH'fpV+a}K/獨Fԃ`yZxuyhI^wBVt2w-޵h<'N>F[X5\[-u˿_nWu*nWVmA^ /熗s˹rnx97m,?X~ԋ yxkT/zW^*aDcC6aƷt܎}aOxG7{cцqv+́:Ձ#~3lW:"aƳq-<) nx^x$x+Nm/:V3ʳn=gEt?ҵ_tW{u_z,3խg[Wٯn=_zf~u3խg[Wٯn=_zf~uՙ3UgWWٯ:_]uf~u~3fWٯ7_of~u~3fWٯ7_of~u~3fWٯ7_of~u~3fWٯ7_of~uh~3ۯ$~u&R3ӉTL'RN2He|^ӉT+ɷ_IR31T)}]ܚpGAlz%V_-~{x ϡtYe <e#e#78&~߁JՏPa`SVAb#ja n?Lɿ LqbUR ć!y =xX5?xu`⤉p0j2aG5Z`kaG+)txx0̗Lthi:ZY `;iVvBd'4 
4n7|jvBZ.\ sv[rUowqb|r.n,gSHg੢n\#fo`1߸- V]6cN^#ޕct=bE ޴QlHTHTHTHTHTF0Kt_ChqG+~Zl ۆ.j?E\ @\ @\ @\ ՓzW/y9tQ.=YtqTGuqTN/t%[]&]o.t`0[9 Y8C0Pe #$HL 1@b:tɅm[՗g"1Eb:tɭץD3*0j ͢?" _}뇶zNF~L 9:} u*N 9}X܊:fe6;C6ʃMͬ>-r?tg>~ %D/FrϢTj஗? ~`?0?l<򞳇82QU9 |P)̀ (#.'W? [p0lz/d9_avIpĻ&xvqUny~Ł4)0{x)in[n؀'u+?C.=U.UNsz|D|Ifm7X"T>:uԋK;jêeL-xdϺԦV}̺ԦV}L/:Zf:0"?5b:etʊZ 2uo*glS7?eU̿:ᷯbJ(Y;kuZS'~Ϫ9o=;?Ӌ t{B't{B't{B't{F;UM_wQ~҃sSx໓ }47^< J6^ppmнxNs|n?=r e4hSOŸ Ÿ Ÿ oՉ)씳rй'Z M;՘*B}^~MP-;^LA`X]O|c5k ϮJ t'7StTkr<ߑ3:uG^s\RGhwFS1=juPkܮt1.]݁Zo[2FT[N6X0s|з;LnKeh'EaV9DWM߸PnKP[nO\[7Vs+řQ C~*b񨫎/sGQWWbbbGVW\0[Jh yh yh yh yh yh yh JlfMPU|S3WY-B-B-B-B{YrJ5֣֣֣֣֣֣֣ǽ,W x1S\tzl걩Ϝz۬6YS*WvBqW5̐dU!/mK҆UaN5r͆wE5y x^׀5y x^'5I H `fʉkR` XyL I{o;C w`=ޚ_SၩTx`*<=7#xLkS< 5P׆XeRӸYu=3 `s酿6RV ϒv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-i׺Niגv-iגv`U@ؒ-ؒ-ؒ-ؒ਷ a(S\JіmIіmIіmIƖkIkiqIkIkIrKӊKڵ]Kڵ]K$X X X X 0U0U0U0U p  p  p  p 0ŷ p w /[[[Q[O,#<~0<$d Bxe|LނH':no7V~,'VA/_ 獶T~ [ [ NZ-yąBR-UR}U!߶d-W J bK b鷘~鷘~鷘~_⺊ĕ-zߢ-zߗ+)t VIoqoqoq-N-N@ Ej\Ag\?E[D5עC5TK-z[LAGm8PERLw.yҩ~cO_89չ,1uT:W6z?V8tS..*41o,`O[d7MǬ[7e_9ş+L.o͎ͳ>=he<'r^_spToց]FE'OTN\}0'/On1݁Tv%"LAǢr/Q]ք{FQtC ]q m/-^t:+;J3M ;#I.v< WtyDr j&~؁Cq:U^re.;6oC.6Ϣ%jS]<Μ qTag.$m1# X]/£!# +e.,cM?W]Ky3Gv`W?6g{-MVH+S)R'2,MVH+Si2EL&Si2EL&Si2EZ-?0K*iuEȅ;!#|"LA%-?h9VDb\fj`4 Gq ׀4hZ;#e4M?plYZ9v&D2!Lh0!̆lh0̆lh0̆lhZ}qdzhL(lh0xv:BUJʅ#~ o2,lh0zW fCдaG̃_rt4ud|G`R:gȞ#{;' ~Cs9jq9tph]ܚpGAlz6!?l(_ov/s4Wbz5i.gSOt<=-t@Y.VUK[u@[u@[u@[_mܽtt#181)`G?ʍAmi&Ջ]j16䳸}h߾Us_*NNaKa+a+{EnVDlyyE-xL)/W>0iu 4R[NV+NV?;K!W$L3iu ޴9n:bZ}Uiu ޴oĊi^W?/FIbSg V!<<W?=m|xWۨ{_EGVC?|k|_>]iD#ǛR*70 Lo`z_U xS x7N1S C Ӂ(ogsŮjuۀ@>هwPp ^0 1G8~jpUK7p \oz7p \oz_{fr,3= k5=5kiN)cj^%(X; L6ϷKVL +555vWU BT.qY->܁ 4 bplHG]<W^osJ3];? 
t qTgu*\0x6􍹍դ~w7,d-˙X3,gbY|*TijX__LYi84Sfp򹑇BV2e< | Y}o&L,/MM'Udbź-amMM&L,MMͷ6b9PxS 9N'LM'L'&|ʰ?N'L'&L'&-8nCDB.7@B=~ _rb.o/9g/9ᗜoyOڬ=sysyNPaWs>*<6U>$צ*r(j¦&l j¦:Wրʦa8& ca8& Bal:FWg>rXY6c]6H5*8EVqM)6æpaSN88OxXl Mst9:l M)s9G}st9: ǜca_s2hx ;ߡw9:stcѡSH7Yt?:1gJwh]9?51|;41|\Co.wiCFJi.Yp;4ܡ whCp;4ܡ whC( (R(GOw.͌(9"JCrߓ%q5;dS,  jJ肫Bf」6&x}l v@ɶ'Nv<ٮr-v@ld;'ԮU]F:CeRrǓU Q?rP<9'ǑsE:Kƪ4*r/wrv@v~=n^x[}!'z@pt;ہZZx]؀';0S O`<ԼnL6`*>@|C:ߍg}c{vHx)=0 |ċRas)|:bFiNp;?Ng#͍|] UM-zX9~Sd: m.ķ3+\1-%uŨG]8ʿm=#N׷updzo W>c O|-Bv|U> s_zzcK𢝐^h$4!'O9H<n.Tg_X߭^;`ͅ~w}Sox[q</])[x:8%Eo]F7DaCe҇0.?xq/*\(a5$Fv`a$UI/a~s1şxpVH$ы?-);V /ҏ%v&=0][x+NI`- }b9dssJƏ{ '>n>UZ=G-=O>/6r?M4oi]9툏t!Ny5GMg+N~{T?F)nc.\}X+a塨.8HIqQYűf`F:^[i:H[i0@һ\69z py?~ U `R&riÈojE.GUI2/Foo\">Xj5<,g WQN ʎE^mѹxoEãE~1COEz|ZE~q׳Y*@fpbG ^򰦾wW4`1S.ϵgyJ{ `O)x)q]ׯe)."I../T7(f> _XU#AȠkd52uiفvUVτ :I X*UZ_ TCǀU[Uݵb9K>i,vyŷ<ǐ%/vבOgP|es~r4,:$C:$C:$C:$C:$C:$C:$Czv!Wuduͩ\$xXMHM/.w1͕ wS}[:*\wtPuUwPuUwPuUwPuUwPuUwPuUiM8-?9'KL|~O*|e1o_i;ec&]v7.ToGmQ8\bW|!Vҭ6:CXQm | 8.Zj+]%+oO`aua ym^ʆ10+g m]rs?;E1aW+,<%Dey 61<`lcy 6Lg3ԙgSn|Y3EzYf͔Y35hκB"Cﲂ/r `;&|u<م{L qU|lUweO qg_val/RK=N-dg w'21ܝX 1ab;1АB騴4|NLͧ Qԩe=f'(uR'(uR'(uR'(ubh>+fJn:8/2x"|+> Zj n n n n nsX=?!{īN6WMz[\Ǹ8eɺh.q\c\QJGIu+\9Sw*grfKG+sx ЭАl ]㽦W㊃'5ijSl"a?f ~ <ʪܾJoϭ|et|>Wՙ$J@tߧGFIXT' g0Bcȃgax:0u0(44_w<|FF^Pp yP9& /V>6L^hrFѦ)/>xKm\v8LFnϺ!1SDPMѺ oxKеaX෡3>3$hy\b2Ś yt@5@&_\9G&0Y7<2ϭs+<|[~b[ץ}(܎Qɩ_`v,Լke,q؎̓`߆ncਂU,cg9MfߎYSKdr;f{Z۱gi[]{`=8?H.&ڜ>z!\8vge1v-KW!QS'/GNζkZ!ibu_/6$-oRڷ^3GA5kCMY3Xp `,85 x9!|sC.Ftչp+ p+M?xj+p"+p"+p"MX}[n~p~p1PMRՋ57w ޘ}W~pF략5m7hW*p?8W_v軍+0`J8R~p~pa?tQm `/4e9ap9 .0r\ap9 .0\YN`lasLbcaH{+ފ({+ފ({+do쭐BV [!{+do쭨>Wcz % ^!+dzű,aZV!*`M5uRFeU XV_0U~YVƻeU XVHL`**p6hNş>UsGG(`죀}6Q& D(`lMX죸Q/<:}tp(G9&`DKw ȣAW6Q&:y u37^leڡlMB^ǐGSO >'dAl}O >b=>苲}O0Hls0~?e은w>{ijYP)Q `*Qge(QjvrA:~*כeQpWae?CPRZEgS> KPmlGex ӃM]^6e^6e^6e^6\ϕ~q^oːڋ&mR?ɔq1xX'f ًUӌ!e4" H-`4M?. 
FĠ(ɩr09Uf<312.bsjf\o`WmV`C~:^.61F:a.V &A~bC[~wj϶+ؠ ]<<4e 6]nK\tdza.֠ߦ*C sp}l\TfÒ;&[za.^H˕A䅹x)}i@z.DJ/[mV8F=]~vmjus0jt\d&Mvp`7 vp`7 vp`7 vp`7 vp`7^&'ߵP\?Tvꟕ|JLL۠F?b`?N lV"ke%< S^w`GUFd ]]Fl׫ǧe%w]pǧ2{$}0դs7+խ!Gr u&nav&na~+w 006Pj~4-*?ZIU  :x2 ;~ xq=4?9k^ppcد< y?&Fȣ(GT#3y_ŪlĚSUh}򂅃1˰,UR=sbi$L?+RùvqC^ sRYxڪ494i6&n^[ě[O#Or+ùtp+28z.ǰLbk2 5 J>US(4 |CLX={C1=&0 xz]("/իs3Xb;/~m>77774ʼ08ʺe݌nF7fs`=vTz]"n fFhr v>P.ql,a/ݕ3w J}+qCr`3}kiW \\kk($6(w;p+$`G}\evn\3$`K\ 9XP!9keH+OW>??)qˋvzWPŹX&?rX@QYC!?wڕjk5Zxƛ -q+<^X#,ևFg(3,wP e545ToSy652r 75˵5Stp\VոVpGaq5wtjۅ`tpH6፱Tx$c#o(`emx†+f<(¡x.2 ӌp[^ؤk0tFbCI/z/O[u=[3'ɳZf-xYK_<57LyY ւjk`G-xV ^ՂjkZZ-xV ^ՂjkZZ-l ^ՂjkZZ-xV ^ՂjkJXy*~J^|/D:exd\V jխwnrtϸmy~ φgٰ!6>~ φgW+~:X9}6D{#zù!"!"!"!"!"!"!"!"!"!"!"4.;W'~f%B ە/]8Vr|n&~\8&e6pLlRfS)T1-_޲M.St=ДZ[yS6Mvds&\[ɑv\8XCܒɅTR 1͆cxt|͘H_HW9}c}bWU屟~k+[Anw^pv],%Wd8 ͯ_&6oD\lʓta͗#լ\^x}&+evQ@i\ An . 1@ . bp\ vyFaY.]uoI] _; 5uͮ pF6_ťyӋ35Oc2 q0*L7Hn9Ry0*21L .21L .YcpL`uTzW>JHWD;'lw`ug׺CYw֝ueyYw^֝˺bۼ\,uŶ_Sd].!2\{q.CbbeʦsX\.v5:Jm&XmVVLzp~>ZT;%I<3gb˿OI;,׌5p8\3׌5p8 WB8OcKЯY]֟cϱ }]jOkQn?EF::cί['Qvtun[kʶtؖa[:ڙ³3Qp~YX11l5kq؁׌5p9l<}~:<ߌ_#l-ѠyXIyWr%9|6_fK^3\uͱػq5DwJ7<+ >7|6w(CP~;ߡw(CP~;ߡw(CP~;ߡ;"w@|QQs|gyG_1qT>+Ggٛݓ8_ ĉspeuiHC>eʠԦKh%$!i]/Rʇ\F!P{wt¨IC_:8D@Зn{Xy 8g:#)gڎ l|+ls{+C+cp^}(r^7~!^*a4аoE]NFKg/Vj%: ̴>trx'K2!7G\ Vk MHzz\~`yXm5Cl( (`w@  ,ŧLq$*Dz)H O@~М }Vm!!OfKw.tY9{Lwn}{cwՠ5e>>2̪I3u=zz]?Kq뺩uS^Mz]7nu뺩zD^^躿uSp l‚*VT.76uVF)HoM# m\<})0ic+FT;~os1Գ6_Ea'OOX-tΡߎ[9p1~ӯfJ.7/^ت+Fyߝ[&x/7IX\_`ge9+k9zuHs'>+q`f8āJM+veʛLCݷUu~V9aQSX]mp5ı8 n\GsY"*QϲPFdOF7\<־7곙*?P>K~Uzǁ: @Dvg'cMF`hqmMF_N!:N7UakXok?`wanΎ:5`wbV1)=j>߭S^ևz5j:\uQ0O*qh8TCL5Z-CE"ph8Z-CRl`)6X ,bK_Cx ,k`iaԡ㹵uؐ?9ٴ{ޱcՂk|qXJw[=^KRj`)5@9@$9Xz}8z;NncO`=Iv=^CyQ~ȳUw7=K%1o2h91u$U-毣kWo&3+P25cYL;k#u)Ν)Νy8w&Lͻ?#gZWWϦu.XuA]3[Д!AOUVZ`肻Ub-Łzpj`Ug/vCp gzDW C?zPީGLz Uj]p [O@3:+-|;\\Pݒ.zFוֹ 15J[AG# #58gPgT+5\mP[jZ-,8~aѶZl-,p ahfJWki1 XWa Z:}7>\ ̹ ̹ ̹Y ĺSx!=Tm}` 91,0,0,0,0uI~K vFZ\;;o`s*>;`lv]'dWBvjt7n$u/?X6▍ݿE6[-.I4CPr%[]RuҁU߼w黮];p nUu$yyāCPֽ)fn /Qۆj6lAgn=cdQjMhnm6hngaP:8~Jt527?P7_ȃ=BbMӴ7l [yÝI 
~o`CobeOJoFT/] nC" ^zn^qqWy e7UPyt^xo~VSUWb\47PgW}Qն2 ճ<xqGn1`< c[+?Q`mu]ؐqgZ^X|Lzʳ.ok<@OA诼&ISF$x#]퓷^5 :;~s*]_JȏwD{0G5 s$X}ꐰH؈H؈H؈H؈H؈H؈H؈H؈H؈H؈H7"!3ybZChG03OB ! w;B0J쪘5MOͶR"ȠD%2(A JdP"ȠD%2(A JdP"ȠDy]G^w/?RquQZ=NFNF", \hOP5&dp9\N;Np\;Npk|Gȿc iZp;:c !q!~x|ڱv,kڹv_Ʈo XVwnX/o<&PKaRرv,KaG -/\:ecX:ecX:㌎ciX:ciX:ciw,mwst,mW8̍ciX:@zƬ#ޱu,mK[ֱu_}tGʩ#ƃEZ:#ƃ1kiX:z#.֘ciw,mK[GciX:ciX:aP8 A0(àpaP8 Aw;`{G#liPDQEAePDQEAeP A2(BEȠ!BW$yȠ!"d[ı- F!"d[lm-BCL0Ct!`:LoowXB hx@~ڼ4<v@nt;v@nt;v@n/u;ƩK:Ʃ.mX#YP?yT%ЅR BHů:tT-R[BHn "E*tT}QB7Ců\+j68y3'~OϨ&Cx`5aE.U*؄ߥowpj23.va!>00C|a/!X3w~05~yvoz/^Cʳ17Oz/.2( yq{{]mCE!Xu҃V[VA|qW#xVJ.vk .tbc[5!ub[‡.u qp@B^uuN9V ?bhڋuj[I% V{ֳ^%/?p~u/ZB7n92o.>ͅ_܋j,R]yHT>p*êbh4ӄ{=P޴41Gr{Vs{U `G^򹰽D:v6ȁ&h\._r@=Aa9:0cЧyťye^Tyr)k}^bqW` &X `5` &X `5` &X `5` &XO eч eZZ2z-ܻ쮮{ewu^fv-L|rk9eu-绖]|rk9ߵZw- .+kegޮ6̵HӵZw;VU~*c߱Xw;U~*2Xw;VU~*c߱X?}2anN8v,rOQWV?<\V@U3(wp7;pdQ~zv=\5h x4<W4wSy1\(Q 6\ x.ms1\ x.<\ x.<\zbg/yj2s1V x+qE7Sŀb+1D x"<OĀ'b1D x"<OĀ'b1D x"<OĀ'b1D x"<OĀ'bw. ^DxC<z;0/e l 'ɸ6I1&IlW98X#;dCq. y~`̩,c1 4Bߨ~,#dJ̗,da2M&Sɔa2e:S͔a2eL&Sɔa2ecdv- y P02L oW͟Z"xb/gHkƩro[S=S'/vCyLO^>7^R'~G.H7`lh&,Wx?5>S^W.jG?\Մe5aYMXVՄe5aYMXVՄe5aYMXVՄe5aYMXVՄe5aYMXVՄe5aYMXVՄe5e kԞмUs Ϳypq&, j²&,yoL1ۓ+=out&, wɄ6.*]t+ mB&, mB&, mI5g71Zhnk{6aMXhڄ6aMXhڄ6aMXh m?˭57 Oh&, mr&, m^˭~*mrrSiy&, mr&Iת.6`c1g61+VT_Qʺ[iݭVYwUݪ U*nuۡ_Q-CniChɐ[26 %N7҆҆Ъ/2ELڬ¥}}$ݤ>/tQ^v.zĜ RRwI#-#鮽/66P=xO`/\G G B(†††††чlȟJ /8/wY$]ל,|"0s0TjZ03WW]߶t2-Lr\B]^Z^Z0-Ʌra{iT\^Z^Z^Z0^҂ՊwK,X_X -<4j{ia{ia{ia{iՅ % /VK K K K K K믾q|óҖXo JVӂA +____________7og蟦Ef.8 &~z- ̓K&c׍qo|Un,8 f. ]2A:%*tXMƮK~BMo<6!nXh?Qؐέi6աTHF7dtCF7dѼQZ*a3>>yMXv<8õC5.1-@3#]}}/ lrF* r, A/^0N ,hdE_wyWɟu`>_0;)OM&0L{?<T5LE{\o ڳ.y-q2[Vݢ uY"h? >;|\5puljW.vq]\jWW%>7.vжmm;hAێiW_88vFhtPTjU;AvPTjyjF8AvPTjU;AvPTjUjaWP93(A wP_ڮ2sCv$KDD|,.JbJD%B,bbKġ0 Y@B$u}*C@ԭ~V3?4C3?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03? tOzWXX39039j&I~? 
=S!oԼؓÁRexg\]PyB+xªiJϐ\ Uwe oUn~aSzTpsp~&Me ~>t*a?tx.tj[m&Ů&ߧYӬMOV2ߧY/_\}r=}amgm~v}꾁Ce(?K==PL^Th).ݿ ;ߴy߆s}=Q=JHi+*\x:Wj&h2К v Ne!hi&]o&]M׺s=s/}3K^fey7Z@_@U1O2u83ؐaf!VբYn&j}p*u_hJu^zuV s½G장ʼmKaip2>ؕj.q'R[~RHo+ݻFšKei,ѲJ- 0pY8 ga,-?\ga, ߖ,€Z2w.VW@g=W7sH]*pBU0 'lq1LNzmq>Rzy7s,}uFwntFwntltFwntFwbhV6`t0Y<`U2u`VDk`-?1LcC?1 cC??Xw]<.WjzG 2]^-`MK%XKVՓt'\ޢ:ѩNutSTG:8ѩNutS%cspFCdlX2d]894jDzpz#w$Zw|.R{X=UB-moZ}^ ̡U:٬Ws냏VY2.8 ~Ý/Z<2Cj'XySFSϲ-[|@OLs`/87!L ˫"JxQ7s)}5`S~oF%]:$]UV']<"i܋I:ob{"'|xtcxwh8S8I40 L7GM칚.~Ě]s-57^ԫOkuMIVj;Pu1:Stos{\!ДScJtۜӠctL.XRӠctLi1 :A4ӠctLi1 :A4ӠKo/C `45 F:((幒>64  1 m7ʕtR FԡԭSPh=yy830{h=O`/\g` ̞30{f=2 (CW:l?*=A5fLLIC>Cu?aͪY50fUC>.19:y3o`;%@0*a̚y'a\TqCq75fĚXh c]]]]]]]]]]]]/v!7CKԄCL@L@LŪOM>7 7 7 7 7 7 7߰ [˻IbRc=JLVii~'zKY^Y^I&lV&t?:FShjMΦ$tS)tg+y: NxB'i9i9l /焗s6̋p ww  xW=s:Mn6geyHC]]5%cp<(zq]'nH xTGU~Ŷ'Զ-Q }TaQMzַ ̸hTTbs1&L~qCV3*_/Wl'6QXPb3c[kK!昝%B>A!+?\ π π xÜxP9[HHo1@ߗ~?S%Hq%Ni8O+3-?/ejL͒yde,))Ҧ)A)A KiJijE86UMmWFOoˏU=i~/jfz??8~BS&hB&hbt M;0 "j^nխtjB&tjB&tjB&tjB&tjB&tjB&tjB&tjB&tjB&tjB&t`mi6/ژZP5_5Y5Y5Y5Y5Y5Y5Y5Yi֯5K&4kB&4kB&4kB&4kbtf *-X\ͫegAYZ6eZ6۬rIiلMhلͫYYN`&dĂjB&tjBV*yZ%Oi-Ӓ>-Ӓ>܏%>-Ӓ>-̽% D JD JD JDKQHK)& b & b KGoAXCy:wj'aA`~k0% wևKa]®<1էCyKD`AD`AD`AD`AD`AD`AD`AD`AD`]X_   .׏†tCq/_g%/p/p/p/p/p/p/pNN߹ Y~1/w3v-Bv8Gd]̿Ot[f,b-Eۢ-ߢ-ߢ} w*,Z̐IJZl1ol{Z*OzdCUO?*_{~~r+#ˑ XoUR W}*R PȆlφl,G6gC}6gC}6#곡>ˑȆ}^X+x@aJ1nK6hC6hcf_%JlxR %P %P %0ƒmڻWl"+҆*ml>^곱ww0vrrP89(AqsP>n)sP\dRLɐn^47cf<vPqB2RCš𢐽 /{/C!6 a(8Mnj櫩f(X]}0!8yhpjHUp n87 qÁ@T G X* `aa۽մ:888D'`+:žX)` \,gc낀)pb>ȣ!Wϝ W@ W lí l [xTCy4)$@ l N) N\ѩe]//nʣ(8XPq۔ʟq ?8xX.w*.7a`3~/ *FΧ+'匇=n~BUqgyr.,Q:<ŨƳ4L Yr%w1LuF\Hטxog$wZŽ̕8&x bOTkl-Fn_\`OX 1ʋۭʥ^ ;3v0KF5ܼ?Dpcj*bP4狻[sx(F#]-gV}z> #LXxkx.i!qn_.\^<@Av̪[>h1*c6le m]iHU*h`:UdCYTYM$M$>{kWa-s Is~^+}xfjepmp 5 G/?x"}'®_Q΋x#ݎ}Kqx֐xooooooooFo&!zOyX:ߠ :ߠ :ߴ¾X jb<|7|7|7|nbt^5#X걒oZz_ 7-/6UjHk9o9ov<;ATC؀y, !j\80 Iu'0pK<^K{LKA+V]^=!+%]~YAƻeu jv mu,i~qW~\ʓ!!!دZ l?#)u]a0C.P~ ըTC:T_%~+K*I*CZ:k<yP钖 ](gװtHKrcM5$R!RѯTXa萊萊萊萊W*7ג萍萍萊 xooR~>^?}n ^65DzwzwzwzwzKY +Z>:r~Nˮ+-ԯGN 
j~ل;:|:|:?'77.FůKw}wqwqj ZW0V+JuUƺGTj~$o9!LpLna?x#:2uLnaM1 rQiQv@7\;L^0yn2GP,ujNyfcgϱsԹN88OD\\CCwp0|{;w#.ӱup~;wpc/s-*&~)8|tЫ^W:AzuЫ#zAzuЫ^#WyLIsn<4_ӒZHD-$8ފZRD-)B!>-)B|ZRN-[! -tFn܁SpWW-B˄2!L,lXp 4, K 4, K`` 0.5,ڼ6S8b+!b+!bz`VV!qW!U A (B`1qW';" H+$" H+$" H+$" H+$VI@!# (W@:wp2 U@jP$VYOGP (T@ uųE-N@H`qX]X?9Re# ,B& d\tWTTI@Xx]I5$SKۿ]gvۿ]'vۿKۿ]'>vURoJ}UxVPܕTO+Ռ%x?;UN }p&<6m]\^~Bug4/`Cw)Nj›xU=am}8QBy:wӼbG7b)<F6LṸ!g156_)6klKMbsPN9x,`Vl?^c&b{"5Sl.f?Q>:[Xc8-/sX3EbC:!E)*o`rڡDhx^i/t`lssfDnʛ_[ w}V]Ĩ/Cz! mM<0GoQjwz 9!u><~*tT^~`L|nQtO'^xه]LWCgԇ7:5=,*4NB͐ yf]J*U[*T֧J>U)3jS4 ѭo{1#=+A<7CKI =r}ZK&iIav]A'tMbQ΃Ƈ$0IQت2+|Ηu`QsИOoK,3(gYgy"?10Gev}]my,bo`X*4euEXVf_f_?/SuMu(Ãttk1k~eun]^]R[ם?o[3пyo/w}r/w}r/=JM>B;!7 Ih?ׇ.>t˭]n}r-˭]Q~]n}rQ`?V K;J:~j<-ꕴ4Xx3ybD2hZ^u/v7~zNC: ky=p-C~݋tgư.n֡#'p,u`I=tీ>y5UmW~^DC{֧*6O O ~s6ĆKlĆKlxN-]A/| K%Z[-¾¾¾-Pߺ7"|a~^-P-(-,(n(n>\ Z-P-P-P-P-P{a@k @k @k @k .ss@k T@Y jZu XV jZjZjZjZPѿEM Ԕv~Xogv}Fo]v]][[Qbzbz%+Nwomw[qY.±ca'y#b{} }%h1cd{c{(7r+jol^ox"767[1oɰAj " " 2)yFbmmnG8g,`H_X*R }IkX}ك5CDLĴALĴALĴALĴALĴALĴALIDdڮcuڮcuڇvy":j{kߪ<=&N3]&ci>s[綎m}Ym/m:>M暾v3װc^dcQk?9ؕ~J76Dþ)`osg4 Lhu1i?Woum^y1/404Ui07 4" &am01 Hi01+bihn2IM/8}!ec7 + n`uXz2bO/5MduVUnhZL[4icUQF k.!M#Ƙ(ӭgo`*T)DۇFyI]i].ׂߥA:+]Sq[_JW*R`7(E#q!<qgJ"lceZ8GVJ<qxԲ08pCp:! ap0Vw<.߷!1pC b8!86qlnvC pln;!1pC b8*qTn;!1pC bku_:ƹk7rLsHAw };HAw#d(ҏ":{#/unGH{x}Oʅh=# HBۨ|$:\Cې6+>_+S=R;# H>$#5 nͺzs88>$# Hnкxg$# H>$# HUW_\(2 iLC7b]*2 iL~5@b kX%@ ld$ 58~8. f;%.e{.@p \ .@p(.Dp1[Cok4RČ~cw}򶮋M8THuV~'/V5Ԋ8?IXI{/f~u]'\KK >u]vXӀT~uI㷨C`ۺ p>lMށ )y[K8uݤQʘIo[ lׅmחVa6A`$l {5,)fyxmA6Sr\MQVS(:d`>KQ53!0'2Kx&gBx&gBl&fBT&Ĵ:pp Obw,A A Y%fXs*|]w,WĿ澉3k3k3k'fv]}VVVv#eO@)k %B*Cq;$MAYSЬ-q.q0 iIu[:-`)OR)m ᶆp[Cp[aj<vo]M^W[\8xut5pm2'ڧ(p[CX!Vk gۆPZC(ŪFm Bl ![C!aZCX!VkV}_ضZC(gXq@o `/ h5H5>nk5ml`S܋7[CZFZEZEΛvrm]|wݗ`ʵ->->TT)*׶o -"-2(܃^LnI1j qZC\m֞m{mbf 1dXXXYC̬m0ֆцYۺ?{p$&`2bi XZC,!KkYC!~?k5{m־m{m{m׆q^am0boX5pf9L5br 1\CLeLUUUUdUMO`'`   li"f`JOm6E`lzlށޥh)@KZ R0E(*@QͣV[CIt]‘P0`}%6EJ`4CpĎ- PW u! 
^@Zqէ PW^YOBE<}{]uݷ^z{]uׅ^zOxYPnuunݮ]t_-'*~Jip9B!o7Gț%$y\fΕˇ0GX#,?yRaqYvTzj$<`W$psEፁt04.̭֘5d9Æz7~[[_rk5Ն9~M}tš!EL+b]UDLo`PZZKǶ߽|=uῬO$ T~o wpzH7h2oU  /> ̻`,> ς`,ᳮ#wt3|jl30k0r(Qcge2qvQ`5[ž[ž[ž[ž[ž[ž[̆p8m86 +d ٰB6 +dc^l 8Bq>\coay `¡>j:s4`/YƆamlXƆamḷ#} m[Ɔ!܀U~yxemlX)TJ{xPZ"Vf%`Vf0ˏkINR-SYf0Ӝ0 d$k&Y3ɚwxG-3)I g&1(A g&iKׁu`PA T:0 hPG:eY؄guFd*3?uPzUĴvFjpwu_˾xPy/VD2;wcz%~s//˚E5H+5`YT2`n#]d~`3HE'gPjkMQu.<18&.v/^A0# /h a[qCg<~ ? ΀8π;x>\׀5~ r] WpQ-2׷$w@uwRU#NGԧ;,?SJ= mw?Uf:Bvt:S D|^ܑ1Yr0Qt\x'ވ5J/Mم-II裉3tWTrFg*838 #aEQ4EQ.E D@ D@ DG5fEQ4EQ4EQ4EQ4EQ!cEQ4ES,"rh r8f~:QA@ @ @ @M]QYE @ @ @ @ @w`աȡр^E @yX?z=_ITHVTHVTHV~0rz+ }=GC5aBqX8PVz~nxS3'~[ѵ&8]yrppʮu&R8.[Ot)^*z".ţSFR8;p'.]ӥt)\rҥpBPPR8.36`orӥpWKb 1ʥppZ_ָDkEl/D+nV95j+3FmeBX`P0V`WgkTƈ`CƐ1d`ĀQ@V ܊e7 p`D/ 1C@VJrĐ 1.!wA ketkU2]/X+ G'1-!oA y kz_Cm:`Đ ^V{k%0kK)S냩<⟵A<%6J:zY-+)`N I{$`uԺCeջ̷(Qg޸̒Y{1\[ 5˄2aL)fXjap#r@-9d2& #hڬXC h XM&tH'tj75;rqߞ`o?RXu͝?zᏅ+:Q!{G+sm\0,]ؕ0nZ vZh~t0&ֈfzqŒWX;5ٵXSN)'֔kʉ5Ěr4XS+&#tclP^ȭuĺsb9Wv/f9607' ssjt Nk܉5ĺvb];$g6O)^pyQd]vP:Eh]@0V~%V$ €;@v`=yGom5VWĽm0o*o)SScɆ̈́T䯐߸!iPS|2XQn><\>1t}bCƁ@q 8`0FZxǫ𬡑 ]́\?5޳G#@UªcaձXW-JXL7`IuJjٺwT^vqM.N"/V1쌉2Su#o5V>ؽ̋.3Brk-HuC=1ëFMrB%>8yOV9 5YO%X:*vFPLCTBwXoV&+vqKP>&0 (KӅxWo\<h鯗v5s~ͷ$|ԗG}. ݒW - rW#Y_]7y^u,W}l馾6A=,a[[[[[ZNnI. ]|V, = : ݐ ݐ ݐ pmVW ƺrCB7$tՅwṮQ!!!1 y QJjŖ;(TRԠ%5(AI JjW=+Oέ2Ѡe4A jhPCCj%3=~_JgP:Πt3(A JgP:o+ƮemVgP=Πt3(A JgP:Πt3(A JgP:OxXhb_~^Vu+C-% fP7 }fg`T\>+{pʕA r)Q^u+W3^tmhJ~kRBBµ]]qvRM]]]]]]]]]\\\\\\%U!' BCZ8It`:6\'-v1ᘎpLG8c{g#ұX9 c!'pȆC6X89("ʱeزsl9ؚsl9?-2b:vls9\-֖ck˱rlm9[Oݱnrl79ȰvsxQPO4ɌɌ*3C, \($3r! \BtC((2yC `&`X%*V ffE%p+p+p+p+ywb~MK:B7=w )V JU,n4Gแ<܉],91޽Y uW`#<`;)Nv S 3NvD+3.G&陶9YǶD>t]b:=KL# -S7I`޺(6 *#LUAd*^Kq8Ǔ׬qꅕqU3.6{f݅Jf݁]qb5qZxw?xg>'k[kzGz 伒P9GM4t/*4QKM"Aaֻ~?չ{ت;>tWeJbrt1S3u1S3u GW ՅQuUX٘;9pg[H*&v+k˚&.령령_ʛ7&#Zl^kR akEvb-vbW/61At{ڳhm /T )GA nM8xE 6U5tPCa;K 5ޥæ갩:æ꠆~!ӿvoeJvjE3Yo߁JLH_]{Tڧ*N#w? 
^{;EȈUl ?0{DG֣l6|D-<֐426?UߘJURM?]jW:0[Qgw$GX65xϥ 9K!g)hS 9({ҋ>IY)-)-35=Pu( {w$icS{' Mޔ-A QyE-79GԄ $dB6+,Yѯ%X5!X5!X5!X5!Xy++/#@58eyO,g!߱"2!1!1!1!1uVuʟWhO\ W WMȄLȼ"Re%""2!""2!""2ޗHHs7/^ E.61L['Vb]`?6,r^S[\IAH>vHp:֖H~W8-Pˠ%_Z^ӲiY%/-AK%fYr,-x|*K %ʒx_/^Z,rV9KUΒ ,b\1 Ŵyq^8/,m ˙UqYYpˬ}x2 n̂[fB.VK!c ,`V0 + f*Z,ZV- eN8eAXeUl%, ²  +Uw{ՊdaE<U1jEd1Yڋ[bXbS,H_~bXŠdaE"Y{wxvZ,HV$ J "YX+JSOGzH#kD풨]u>d=5W$jX}j?Ҫ-rmyjKTiKTikrӿڤR-rmWnp[-ۖlϖl7[-R P P P P5ƆlCQ6ec"*"*"]y1x~XVl,+6b×XVl,+6`7777#8ccYXV.XVl,+6`C 6`C UUJJJ*A>eņl,+6 XVl,+6  /ItESo0%e[Qe[QUe#_gleۡ잰X>$eآle(YE&66gHၣPmle(m,ӎM} f޽ & ,6[d6li #CH]V_՛{X%9x:0X1z:m&\A6:wp*8N6yšU=<cQ<8VxxH[1v% >t$^Y `eshy_1yCJ8؄j^I&_ѩk"&b`"&b`" :߉mMLX&MdA4A`rD]zs]zVuR `lXN&YU} m[@d~ 2? J-/<}Xu.{ wڵq`toۦ%zKW؇zEo]~o'hWU% r3烪pB+U_M*7j~MNS5j:M~yso}mX][1*5%# yoo.٤s;ճ޽ϥS#bH0˭3,r0ϭߪ.f:~k^2/Pл~9XëLwOrҡv;C` xL[6&_nMXMS l1͵ ߀:`P0h2+oo 28/ Ӄo M҃1 $=xE~_1oozn ߀/`d0LƁ8`2ηfޟfnf=-}mNԼJ{T ySցQ# oI6GS_ؙڄx GU5=/`S4AB$4A<3A<3A<ѨaK^hjgM'uԮzV[<6~&s0y󪧃#}9Ԟs@zT=ꐛ9͜~k诼b <uM?/M wRK/w5zJI`UDŁzPޔs@;+oW&-YYY:t@-S:tMӡdOMӡz[MiNyrZ,irZoN]p-8_~⦮ɀMxYiyb}aBC?)3~jAV/^<' [l5hyejh?tP}2\`\`\`uH gŪK5ĮKb_(x"mŬt Gjpa1/m_ߊU6TucT2҇5nK%w|i'?jpu"DJӉ)M'R `ŧ$ohMV IrZmN#K8 6m;"[a6 gA΂4i8 6vDvrc߶oXr~g8FzvlXuV݆%ampOmXorPۚz}(P/>ڰ68h6 ٰ6,}9p87176R-\V(z?WP'!fflas-ffpack-2.2.2/benchmarks/files/mat1916-1916x1916-195985.sms.gz000066400000000000000000020421021274716147400234330ustar00rootroot00000000000000z[Umat1916-1916x1916-195985.smsLIm+1QQ3Bf*;9kﰨK_uV?cOdmܴw3`c7_#Z~ߨ0S?ob_p3~1*3vL{yN2_dҜe YS W̨ ;оIe)L,X Ysh&)\} >` >J`N14',0Y  ֘_B9dmj-kӚ!@ik-¦+X`+䕲@Vyo5,4@g'9P[v+YMYUR٢FR hI,+yN|t;B?~鷒|g:{ ɿ^/ :[xg lw?h߃_ BO&OOF `Z7 er*g,GdN ~m9ħ4i2>}v}:4a>~dП?Sӵ'Чq*2JzĸqZ+WJ'ǯMOvYf;r;BBcV? 
k߬2\;z?rXP0&Ƨ86~Kp,ZNs{g@;0wE}P_\ +)S )s PGcCy=1Sv&N2ٲ@XoT4yΏ)&ruf" %x-[l-` wNXg/Yެ;#Oc*BAD?4@L,PM &썔rs5Uis1+ "L~@S9La#Wg ]odd˄UOczS/_̿YFl [qϮj4'IwrT&GY~brfNr甜8%'NɉSrCHBunHluVK%ؕYn!WT* ŋ d!N~eMBa\U=b ǘ%` ; }LfFW[r+ɯ+ȯS:%NɯS:%NɯSD_W&[z)iuJZПP:%eNISR攔9%eNISR攔9%\N m9J-a]ߟN\,_ ݭi^j>[bTnWw8Ɲ;>_9*ywf_Drmh+V ЮVlZ&J/>𰦲T߹0E>S@˘.t-i B&dC6_E ݾЧo@{(d;l"w0Xr 0J xtԦIgW0=U&jLb`c j-%cAPU=Yxb[Fy}S{,X&>^rvTQ^ V&1!H(T]a+($`D'h.I՗m?~ZM6=bԪw>ƛ˿ ?̿߭߅ߠ_HKĽm>{*YLZ/ Vx9__~oO;ݍQ{[9]heeMossssW'j5xbrW'?93[K|˒[˒[˒}Lt}t:"^:`,4S(ܒ/26S4 >C6:tǡINN~_l 3#M]1%k vuwW_aoπjW[jGEEŁoßXx~`>!U=݈j^k^pPS&'t~:AfGUg+ L53 ~LlwEb[1\&]pr\`*Þt#v Bs!J+v~ /l%ݿ1!+ߝG/g!%ؒ[R`ت<[ܒ[Rp}&nKlڛ>}IV/-^ 7.p,A0 IA.9&pg ~Q}o Y]8x'egiUc &ڷHv0&l0%SrפR{ ntv >d -l`笆%ڕ}Lh8ZT(Ԋ&\4@|;%}vϮiG?*b)ܖ,t=]4r5=}uuv|JuGhhhBS"[s6 Hl;]Y2MAŃ?H;\U#Ad@2AbA-=%(IxGہ\OFsPPn,v U<| ԧC; PÔD#H:d~ RԁT@T:D%|ҁtjf%*Je@@*r0퐋ġ#qH :~fb@h9Z#@h!l|JŕLӕLny ܸ~ՇY㭰{gz.”A+v.>5[P}OhnbXi=͖좚$rhlչkETzMƙKhEPvssh]U1˾fKv[ ٿ|Ac-JMnWl-X.Z*sE}Vr;gOKEinRAњAr>s2S8}qYCs&յ۠aVju]ŶA+ хB]@O)˹#N:*Ӆ)X:2v $K ոP#vF-B-j~\'_Zc08ݺP4j>AQS\A7CO|tlF#Ұð ; ;8(`GD !AW%EW$O)% Uf[ukLHe>V5)^S!eFq\-2K Rv (i[%1b'i@&_ڃkEmo3lilOj4xob,,_7MA-8R ,R/R6/R.4B-#߈QnY[eieYFYLEE)Le)dYXR@)4?(6rY>ȾlEϳ,,JzIzYYGecYX3e9cI-d[9I9cYX3e9cYX.ebYX.eebYXR]DbYXֵ^)eebYXV^$%eIbYX$2Aۤ8SczYX$~-n }Q)ĭZVKj[-q%nĭ%.!CL &8ckLZ>iP*=} :6m{&ۍVJfP;uCP;uCP7:Tu.q= Jƨ$4U7V&GPTnA эBM/Cbų6k6A壛.A ^3Bgk7ش9ش;ش=ش?ذAL3 6!@& M$M(Mheh<&&I6%HWu i{Eє5`/jGfOgf~QAAW!R- tDG 0gK9)Y?M6BўY)R\|ޮR*rqhoW! W`-]%wp W-2JìhTC_] Wo)IP\*Zd3Uv*/4Iu6tQ_ARaOqZL2I;q;['Gg_[__Rw_fInI-SK߹v-= K6>4mTOꉴRRTUUZI;3|C|\i]i+iEaACɺyy'2A4ҼrkZn?K4o44o4HJ%䯐yrfZL6ʅCzSz]]jyhBUB'Rܠ00 AݕkrP广ӑ#%N( w`}+tNN~jՒ+ׅ)X yc~PXc/(>k,KlCjΩRRg[!\P|bɕKΕceYXdn W4ȎJe8.Q=hjRS.h.W=1$YKǒd},YKGK߿O_Ry io7߹-M?r*r\׆IM LAB PLC*R  GQGckp204x4MvCyPۯEAGE Ҕؐܐ@ސސd_Pc Јܦe37B6H1i_[6H'4yۿm_rJo *_tӖ[ t/rri)_p TLѲ->#rq&vnj8_ \`o~bD?(L7o+Z6ss B Gv.]w\$*BJL?ߋz9zAM :N|u-G׽]s! 
S0lOCvZ}I9\ۅ% ر6xr~e]ut]q!erR9u.rMQLh.QSM%E.L$m*u U?lAZr$H_)mT<ё[ESp=4TKJr<0-6)Mv[Pc /؆ܫtގkzgadx$&%t.K>jX]u']I|-˛CGt'mt훢鈖6DUtVaXMa2-w-۫P*)u׮ h,l>_&PRc޲mP-t"EJ9zNgcS i_jmfiG>3L.NN; FbjH@.9y΍/i_ؙtݝIwg-vt AiCofwT9Ji`%~C֟[;.dY>mAX_3 wޖ{vAPRRҨ~ Y [֡A#e.e.e.e @v's\tA1,`\PCa [IRDBp*}P'%;4ҥ4ҥ4r! ^ta¹?>I>IQ[0ϊT?`-o;_M_v[&$i=,uk9.P.C6e3L{ƒ/X)mԩV,󯔳_pavohyqh]&v"椫 >A0N ~ڇ؄pR@&h)0룝#ZE/:Rna)1kUt} 4\00XP K@Rɯ hW0DXY~pJ`gv&𲵥eI3&&kEn)} @Gf-S4S&kJF.ƔR}h-܈_VA$'U_5>|0ւ*@+|gZ `TU qZ@NNNWe! `T[赑OcC[>A[o]V8m}҃zr%KIKIKIKIKIKI,@SBBBBBe|4(< OӠ4(< /S#~.?•)HwZ KoJ/[(znx6<7 -q3hjё/Ul8;3ӛZDxzބ7L^=`UzQTE[A .ADGcA#h|n.rqu#bTHԍ(PҍnIPKɌe8FH--Ғ(rV97$V7RZE$jIl/U\#ВmڃDQSC,9_̑em"Gрr3Qeva˄.E~\` թS([dȄ&3 w]PTW(QP.”ԙtBhyMokOC0,agJ*`e%B3d)b` 6z YŽ>)(UM@I =f dzO/ k=8@6kMPRhR4YCyiQ\#J}ɭ"!hGasE)kmT[AQ mKϴʘ,!'9I\>Xz .8,8|uI PTKڻ͆iュN:2kI)jzZR>^>>X>iET7*U9ޢۣb>n3`UݶI%cORpϺcU,v-Vt/4M(`Δ՘8lBlpqIAM.Q}]Zrot%:L ϒ-}:W~ qv.E @FRvR>@A \DWWt6e2# 0)` qI>^ez(1c7.(0c;Tʑ{ϋZYf\:hGG.TtKӈ[^i)[TEfjBR)m: ;"-5cְt!ߵ{Zuц[U*s2e{y *cɌR~'( uqBo\pRz2ҀK+OQ;BgS`i4FC-xGhٍ i (T|z/lA6.AgU&ai4Wqj X" ,<',^PS_j:03Ҳ@{-UZeY Oii4-)KŢmrK֨S(")j(4Hfr+1ٹ3Eu%lN)ub~*kuɱ w"<2ui)'))stpytryKSKͷ-1>buhhlCʜݸ3Z'8J2t Qˤd"bGpS ]±SE];EM!:Ӥbk*m%5UJ˕_'R-^}kʰ\u-6E݃-KF)ϫ RЈ^!GP I!!)$$\e!߯\/P. %!$$䒐\KBrIH. %}9`\- $$䒐\% OEc8LC#!q$$đ8G}2IILReU(RE* K!`H@R%*T J UH@R%*2 "COT J@ =2~H@%.n-ki](e\|+qxOL/nh{=2WI>MKYO[+e O-Z> f4/(9[+ng08q2P0Ժx~&L(dzNj (rr)5 vxz(0*<[R$)eu^AESzmQ0t|APPN7UܦTq*lѧCYMc.5U^ShyO骉٪ Iq]51uԆek<+> ahan=]EU͜"<ܤQ+W /Z8:.Rޘ2]B>w j,*sT\AJ^]2}٩.îvΫao!)Q*q4eV>&LNɁ)xxcej~Q* x_huf Ɋaj_6}ё=۠4)VA,"/NG-5[P*/⩋P:fڀCXtF''0\ю?^/\T] R5j nɫ*vYҥ[Ԥ){,b.ցbca.۴,2U GPN:a;uvNRMLc ެ0Ld=^gŵ$fP5mRxP d_P7Sfcl5j,4m&s15g}t1|b#^,_.{lZZ#eeډ+?av[SsqKwjQvjQvjQv.n-ݩة @[0xOзP;;;9\RK+s9ϏrmWkWkWkWk'hIvQՓKzN.S~(O`u.\/\ykhˤ4g@Ǟi}ᤢzS t@gZJv0bҾ6ٿ`; rBn-HZT2nCLb v@2h& - %x=ILQ)ꏸy)4,R BKv c ;mp%>EG7M.ӔJ?76nqBm71ukLr[ &v.LŖyUϹ%m:@}aCH1 KZi=<ŧ֮/ԖÅCu! MAbZ~AL&Rنhc!IRm̶-s ]CBb}$)mH%T4;RԸ-lˏEf! 
ӑmVuRܖ.mr[ܖ.剧n=  7+ hMb?z7G]@kwR SL͎RDXN^3[- rp, Bɢh|q mH)P ,*NGw ם&P:c b)6cBuPeRQMz4ٞ3LIQ3 Ee*N93EYk_@aRh1ŕH4R:pQhVa$tQ>cN7l5]_ ;E3^՘#ԥY4B|M܎{ Eލ A+m` ݒslZB П䋎{~),?!M( kRC o0w<8- Mu/t}BC:3tcS0?ul 0byѼP4S+GʣyѼh^y4e>) \ *,f궙U'n!~N/_^]:t@}T X:>*,o/mk/f/]#6UW4F8t,X>,.χ-&寏,_8]OU--- gaaFuM]Q-.-.K[\/_:>n[.Ɵ-`wMN@(`sxgxxUcJNkv>*VhAٔ"eAʿt`]q>Zzu\;r3캔Va_m~1}0xrunPԓTW{fn[]Gr`$]JVL\#x5lj^-qb }bْ $-pI1Ip YA% ۂ }V4M]Lv Ry(PP*J CP1b9P*K۵ıJ ([tDKvK}WmS\5'F?0YنˡAG_MzEE >m9`[غЧL% 1.uJ[aXE v ⌛A۱PA/p\.ByGucYV2ǾhaRq;ϥ\.lCs֢0 %xGk)vEkQڗ3LjT Ў-/zK:8i{:x:k jˢ<u/"TC]?wGG׭ϢPn#'E|%ۣ&lkG7ٿk X ĽI&/& CSТJh,*3r h21]b T璝-}* }QZ^aO )R qhl:^0S`Րe_pΔ* ܘR KGVUo-=BYdhc21WVh*|jB0b'DWhc21ޘ oL%ՑbZ̕24)ݒh% Hݧ I!CBRAS,:P2Ia#rԆ3qVj[1|> QVݡNၢpp2s@MGQpJ@΄sRIL:MSdz* p1H_HRܝ.kR|^{LqiQV\utAC^kŚ;>q[h-*GDvw04Tfj( Lp\Bbm&4)λ]sj`jxu5t/v)Lw_Z겢9;+mT9SRm}w oVn'Qfכ_{~jPBk) hV/(tߦoSs :>ٲ3C&S {>ق#S+BhfO_JR'"nzȾݖ=/ӿ2tge0U*pJ,1=KLӳ,'Aù RSEa 3xE\Ҝ9^ZӫʼnY"6[az7{(\M&uR]n7\Mߦj+uRz]6^yMV+9Zz3*^=MWJZhz41Jdg WJ: )BSzmR)\NjWmWtsVեLk5.=KLӳG?{Þ'?{/zG9uzGwԣGKG[sG[sG[sG]] ܠm7(O8v H.)C?Џ;+>;>o7>{Fʟ&;w܎?$pB؋ǻtGpc6H1|:.qz&wwܳ﹝?ޢ R Nqo{ǽqo{ʌGrf{q{ǽqo{WWǽqo{ǽqo{G>dq7lа6%.~v[Tffq|muvwǝqg|wǝqg|wǝqg|uƿ/J}{߫z׽׽R7xY^ n4v"QFa3,[ M[Ϲm?nRm=޶?/m=޶fۅEq«`d 6? ٩炻p[B^UQMG$&4(K%O3i?agvagSi|TMab.IBw7A F')д2ꛓ=PЖI.B @ITœ.L+°Rbٗpafa):*j6Ŗ\=qɥ捁h?\DR1"lppIѐPSE$EKa`8)*s 'J ">pa8 m.\d.U1] &902g@fĹ2Е:5{H_^{`^GPüP0?[0DY -.DǻEZ1.0Svj¢}0cY<67/$# .%ɐ @vur+u0WP0eRL2WSd=bm;t IbuY]U]U]U]U]yWvdB?a{@APA%_h w4,Oz~ wlSv>f~LV*Rx`skvAe4C\Ms63!U.4Cdg7VY(f&ӴLŔEi:\, :@?U&ŀӯiZdZs]aພ# VK&ySH.MRT|Yn*r\u)ڜ| UʿSXL/Ƶ4a3[o<._ Mhy,UԲF[moo<ދ7UcS-Su T$syB<\F.O#K AfF2h31ʼn>P4kW* [U`\@uqdYh)pR돏فN=fK-vAJ٥ݢt#P3?˂±JDž=5F7Y~WlJ[mb 0 qL2_ia7`n^3Qf|p^0 tâue#Y nzhQ-5&ǭno,+v [z:ho޺c2 m՞1.ɔ"_SeQTïUɕ*8 :B"6phM̾ҋ@"z[l&+'ɐZrL w8C[3"ƸC{feKN^W)$$w B <(o.r3Eg?9R27>;+)hNh2i6o\:'. O);S>CNHujN ̂ 3=yAn+yj` ˿_kb( ]#F.>ߗ E-D%R^P:o.n[zh,uI,2n)&q' 0(,^؄r%dxtD5F/*ԎZ/!z) (Y^U{. 
DWd+PrOKE[@BC-C =z+tTR;|q6pݳR)p\tJVlAJb(Xvл[w 1Ȥ,.&wH] RaTM֔2;*C,H]T3h"Y"*ޑfw<%ի% +hbbxi5mg,TH):S1%CI &/-:*񗢡L7UL=xIZCHJm 4횴UyqA"c(:Jz~*[͠0`gqQ bN0jjIPMʎ$,a8P7)OݭV-m~P hOwcj6X@aOU"ƅrU;YQu9?"7\Q]^}!qĩA1ԧРva$i[E je,W )B)սɲKe^MakJfӬ{_?;K--U;d5^vh51u֘:➿ @R=Mf\@:K3 #()պemrhL7=J V4)cP-EGe L ?e,@T^cӨ3ĤXᶂa0M;Tiib-hv4eO=Q:VJ3bnS75ݮ۱ڮ\8[tdƩԖlP|M SiYpZ@-Sh8uy˃,X08ZGőwݑ>c]. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K04. K0xG,xrrMxG, ,7qrQ(j% V$p_7٬5QhXwkE`A؃O,-yP7nLX7HuEK-6٠Hjh-P7 9IZ, 4sqPfäM b X;j":sRRh4%v  l4nMU"H)VL~FH_ H>-mm9o }ķh"kIfEkssߎ6KUܽ.9phpj<sMrq ON!l=}ёRkLq&9.[)qT_*jp}U\W6K7yCP,mf{Kua0B Ҁg*^:j?\_V^dxi)_p%Tn\rGd{1Ux.K3,Z ./n//n//n//ni"Usy[_н;A֠V5VCjHXV7` gUn8V~A.~jOP GEKΗol0(^49KUʺL!eݐuqS.1B16RA18$`:W0s<|AQur9,\J! 9$!$䐄CrHBI"$`CqH0ƅt-\HGH/(*[+C؅-BD):ZT8d䡌.BvHUw"$`(; vXKaI;,i%>DsH)txK5TkN5T=H*$#dr) UINSpIW,0qgqJ.:L_(m/n)K{X` a=^4 aѽH1O$ćGaq٬+ܣpk%h; aa?,'>.S5<2}ֵwD>V ^88k(,Q:oپ Tp!Y5nh[(0e)zI9>'d9dR?q z,w.L^( @ݬ4pߎ ݌ bǔ^IOwLtǔ+)xwQTB}g'sl*O.h|nc}1N%Z+DL⮉Ein:~Wsp9@!vz];:Z&vGǛ3|d nQSh?OΕH{ 4ˤ72)`Jvٮ=+Eʙ{H|h{1]iRBpN ^ =w)rn1]VY v0/gR>$&{@Gf"o: -TA 7TCptQ/y>u$MyHuwW?tG€Ƃ rp{,CGnJbIv]1?;n!.y%] dҤ.ܰ x$uN+ ZGJ ` >ALE}U;0dg |0Θ^oW].0Gza۔f0 IEpLvV%,ȏ_m% kfel<(lƜPV/ \C_PʧrY6h=u,)mS)ꀶ)H̊Tk8prP7)wwS4uN2>fnom*ZbrNb9MZ\KpBnar[eE_N[(r;ZNB H1eӱ_"/k1]PB* .B֥nʳ_`3Ǔeԛ(dJ7D!w7S75)”#ܠ9@&k;qfG1QA!AϬ:<<<79$pCn q!7N͜9r]ӴMvI6"EnVjY 7f%/gn8CIe> P:.uܱ~?2{i r2u!o!MM_uT6%R?/' hToME1d!:JirbI4D0-Ýc:&pH01H{m~B$)\i`O>/f۩*IEۡALdfĵ /t2ݯLn~e_W~e_§) PdKy!|zaf$cpcϾfkfRj(Zr fk.O*˓.Av&s{HKH &}M6x:R3Թx:)sG J0'kwFu$)XTq[gN Wr:<00 ;AWs3= tp l;2,\rl<",cYX?^>^>"R9U/գiǡi8TTJEْxE w3-b׵A]خ9@Rm\9l(mfn0%mP ;u@ovfkIqx ť'.A3h@&PtdmWt@7ﯲ,~n9_NaR|>#C?rꝞ6 n>Tήi|H ܎$ DؑS"]!<^N؄:ma@ሄ"P#β̦i9Ɵi#<|@.R,nsaxx|Z+:EM^\+Esvv>C[o)Icٿ\+ =tWۢHbG)ZN)ڊaRs9~X-rX.vlsEL)VnAˤTZ=l:np 0TX7/X^>TheƢYw?:(prU.^oHo+7_J{` na744 !o~f̺aLW]x:<:@T:We0Mx-I29w.;xl^d :G&;S ]\@(htaL,g<"` Fȟ`q" .V,8lv\-ȚTL4L t[W)틊G,nT|j\&}]{4Lkrn^v;Kɹ_Q=Vvt}soMT8PU7*{x38|UO#\ۍh> НOa{ 
6}8>>>>>>>>>>þqY-8ffffffffffv%9:=ĝ`g񑌳cxS^gxOYWmf=uѝJGw*anXWM}M>k,$,}ᤣɛtHTArmյ _ + ~Jev\R'Rk`'3EN+ aY5,« a 5S?ߗmhnRh9|σ@n87M~!E.$SWL.8OhP#OSMI \hЧOR)up9ZHңPGB|Ԁp- *>vΟ(7V).Y#>7-D-͈"L'(XMZr\$\Q(!8_9jܑۇwb.R$[ä^ifattH^&&&&&)ju E]tT<8I :d\L[3gas6?te^2qt|zlƛRI6`_pLR1=0Ngw?[5cu7 c_ V-XbwHdȋײ:^z⌭^`ob76_f)t EY9JtQ$-E~h-Z_>cKd&!s]tDnZ/` Mj-nw^@[feߎ-.񜱧O6bI?I5u]{@BoBoBoBo^$qۇbJ&Yp)[jZs)c:c9cNt!<+q;pa"9n3m=gC,kK`Kjmſ(`KjC-u"pfpnQ-pA70i(oGy=f}{kA(4p;nxο?| ϙ3-z=^EעkZx-z=^EעZn])ԢkZx-z=^EעkZx-zf<^3KfQṅzl6C?LyfT]'XM'&+ןXtʥoqej#WUmo,!mZ+qg/|6[\t"Y ס\hh6h{Jh+)-Ljet.᥵,H:eE,ibbȾ`mِ,j 3w4)v%G),jy,-oU,DTmzp>t'mЯwagVRJ+;T #zjh5ԺR<(GXֽeXfnt< 7)2ڮ?,=%K# $AoK\w[ޞnKm z{&LQ!T:صD[)?~v6k>rO> 1?> .Bՙf˔HO'1^>;dGչeVAֹxQ6r7kKLa"w׼ 6q>/l#T DGs43uKDgxΨs qӞ T!:]5Iޯ]5ކ) !tvm7e~ k'E/4h [ |gnOG*#s|&_6 â6M.oY5QN`(=\3V:e%YWoc ɓ0{aކ oAl}$9]<:\'so,:_C1XNr:|u$kmR vL4L\n.=loi{>rL],m Di98ӓ^+N˷i6-˦Wk,$>8KZM r|rk|rk>5zJeISiW0bp$q3k~!g~:%V,Njx97\s-&d?yַѵ)Wz>L\1N&^_÷πyvWW&O_z5_̗%Rd?|یb# ׫ zzA>g}o61Z% E ${8>ތOP+xGYJTϸ25_5[Y}JJ9VB]0o8)9:xk,2* Jn͠x( f-LG46#qbÔ}&YqtJtPa1-ay@8g <8 `|7Vx!spSn[<=҃\NNNp)ćp W.ӕ x8=iʘM#^J݁ib:fzWD)(kfβCL{{F7 u\.Q:MNA=gE-j͊JJJMh6*vIYuYuYuYuYoR]O$7|ʊo;[*s=ݖςӢ #4t[16aa|wCq9B XYpqhdn}x%0]_"y 4pegh W mW#hc[Z70^Uy2 vtMŒ 6ӰDu EȔ2ۚv/ns؆:],o p͗6n<.GR=Бf`ZUla.W.ܯp߱ga?x{LtċYA4aȳx&^tpop?^qc|J[?}\Е뼲 R<9wR.e'$:OqSbtO2ݓRiB.:>eO&op.h8[y||Slf?e{ziP6ٞd{i:?S0 4|Bq'lU74پ^ U6_6_6_6_!¶gЦl xuJ}|}|}|}|}|}|}|}Epp=O۵2m'''''''''''''?\uE#*n@)А"""tB189L a9X+t&0_p(g+S^^_^'pQ#붭ОH[w,سl_uEN\+E]kcцURI45wG6C_Yjhߦ#bۮpnmW ]+vLbOA?4ī' 鶥 MG;@'!]KpbnD$gNa]9l'j?hx+ZHqp;Nm_ G884nt7UKѓD|!R6H8K(n\vB._xBP.@MBx>drY=q<ٍx=teeAHĿp?t272U^s]y WnD|p/tG̼uP|d>%|s>%|q>%|JҪ@@G]|Jkχ:џ{p}O;gsh$ 4d-e\Mf?AgS^vSw [pJ5L=B]vE19GC#+} KY z黖 ÈՉ CS+K+K+V*@kAtwCz46szH4\_@LG5E5EzڛFtε"$q,YSխQқƼ4&4nѪ[7i&ޤ՛fBi&ԛfB##572YK?^KX+ZΡq3WVȱiqWrs)lW̴nr.s^LGf!`w 4Q iIy:xq:ʹM7;v{9̨4 l*kfrxZ]l9MR1n 8j,ئףQR> uWtmYb%َPFἏ6MaR׮@7mϹKKM*i_O t[*\-W{Ce[c)E_ڸͰkċ.qve!l&Ej2Gg7&P1>UkF}94HEL)[fJ!S\u4ٻp2馡m=iڦC`TƑe zEY2n`fâ(;,k;EݴDG. 
3^_{SEwy8 ,S՘B&*0#[y5>gt .$XH#1X6bمOX8[QG4:\Gh7_1m,Kc.A(8:f|b:Z(py -듇/M*y mጡ1Pvi)O϶ LNӲm{w+~'a_bJ&P 3e3PC;k.W/eg.ˡf2qR A3sVBfUyar3|v&jDPN qo=x8P Zs|FoχbnYR֮uO݉nsE/;gt</it]D뙮l=g.v__U/n|gpR$zsr}ꁢ!x.uTg:Sk>*\5+B* *MkSՒ-ЅÑB.p?S77F!WC/cz| #ix):Hr8:Zl:R ؞)/P?t Qr4pa`0vuIJ|yNrR\ߛL\Zm!9=Z85xOxj,jE ^mQ8vAp$y,NХ'4܍˥ӅW1KTuY @ḁ.UEJ0U.qb2ʩyҔϪYNr:Ew%MK֛,_66a T:.*p'+J؅s=9|"$sUQ{/J_$ӡp`cujZǦWT0Qh07cQWJ㞊5й9xJAu˙gu *.W lI.w'{ ClziχD`<Ӕ)Ovx 35ЅE6aW*qTPpٮepK-`֌^8q;[χ+ 3!m#Ϧ<6;Kle)MMGyqtVҩWRKob^~PҁTSJm^v=ٞ`],tCn7nK[p睨0"Yxg>g:/R[֊|p ennVDx|yk33t_qHᆣI\xąTGnձwF-ǦiAӚ@?"L[Ь%y9k?][a$y,vt9c?:Dk}v_Lq-93_g0;4777 k-_圌3u2㕅pBWKhA<1):|v͇!7+%@ZI :|vpXc֔o-k8̏{>a,\"n@'#| 4mS(-SM'瓦NKb!>o_[9ǹJyE:znz0`C拟@I+)>@xL)=z|xc_t}P[I5Y>g{laT6Cbeq$z%{<|^,gΓΓΟ^Z.|%/2gb]O_g M ű`:G0Km%Q(Lۛ%Ģ\pIHQm.KGDihud&qԢitk; 8NEN %t;u𥍯_@j@Eap.U5G*t)aW{%/3}>l+B7*'<yULpyiE_cP/|1t _r_8^.ڇj(N<f.h |^C|}|>ϳ>p'>T«wҁ(GlԘR5t%b%M#H8<99 g-kO20ySOO|0~4u+ӓibJR*SX{N~m@2ԅkN:썧Ӌs 79=ԴT6ke7ݤҲ*vh%"TV^ja}> +|t]YaQvO,^<>ݖ酕//-~ѓF{}u)_ԋK//eӠs4\zGx奩VV6l&Vl-4E:z>HaMSq@@@@^;-Lt3}P\E"ke Ɲ£v\XzaP{ ;܄-W0 0Sfq;)ty/,Bzc8֟ǞaO"_.l64oLX9bx]}2%rk־\!zkrn(oZ^QEs{x{x{oz~KwVcoA`~Ms~kmZ~~~B~3fpJC$ .^ -ht˧}4|EiͫtOKhRK NW'/;iPq-#$('F),)『!Zxh;T\#R4\xrG5/)65B}"&9i+.yڼ{i(.k*!;I.)YΘu092}f n jE4Mi{;1+v&C5"1t˱_v"5TνahHHʄi*uಏxVx;E(![LeM-Re "ȃxE[5高&EAjg `t#! tw7=Pp()uO61gʝx(,mew)J@ ˰>qIpMxr?[>@ ;\xj/o'La<|d> uk㍵& ~_*)4yVJanjSia#u 関es6xS[.q#_&?=2NU)s Eb12ufLr12k$1cdJё[r9|{6/F,xs414Radc>35͔OmFpˬQ8U3|cc1RN1M>5|[ |q|_gjk,s -&EbK_䙛3‘E=ݼXCLOI&fL>|z-5kz8.˺gr^J2YQڳU.ڣIպh(]R,ƴ=GX/GS&ٿ=@<@<@<~<~<~<~<~<~<阮wU]XMۛ7+.Oio9VᯡکxzjF-=/x>hM5 rdOC}<<^ǸTF)Ҁ{ހ{ހ{ހ{ހ{ހ{ހ{ހ{ހ{pBH 2ŭcS=qްx(v1ϟ0/\t?x?nPxPx0"gǦQU_" PsjѰ4={U/ ]QU*E0M6)劣N;崓wF1ݕ;حڼwm^j1ڔ"Fd;"iA ^Ҽt [s~a7IG_q.>Ͱs J5y3J>̘| {Ɗ\ld&. Ev5(;Ψ:Ψp*tdƌ:Ψ:Ψں:C\Tmj.qLۭť- @v Tw2(L5XO3 S޶Tw#HuGjb7\tﱇ g)/P#x'>c< Tﲧz;V۾6Fթk`>^|k/nrwV_[w|]J^*4?4o$\Pg|QcJ.LҽUJV9k?%|)6ż//eL1{/+{/_{?ՂT ^޴RASzx|p}j~ {yFr~ {yw?C8ɉI|t8-(Νj3Y/RU7RU={U_)rqB-)ǗJ^$qZ~m<^+-R qLJ {ؕjS$)Ѕ3}v.R_*LѴ~ev7p6 7رmul[VǶZض;-uݱH.mK1U#1oBⶻvvٯ_׌8C? 
R__2kkk۳1T|vs=tկׯmW4Ք]???????دڨٯidw9T/Wlv/ѕN^R |^^^^6-|vC~F"| _QSϗ*oڙvEL=ڞ mτgB3*Ӣ$&;$ܘWf lOq8[S0&\bx DZ;jcw,X=ݑP?O|T|'0'i3hJmxMe?O'uQ hN?Ox]Tw< ?45F-]`144 O‹&1 GӞ=i 1z(*B 0 ށеP, m0M6-1.KݗB_)߫zty$LR& _3;""ErAh_͘?i*R"-]6^V]e m\BZ$(R8)/ߘ c,c@%pHoD 2T2xxz/T\K#4IU˫Lz4- AF>;m:UD숎^UmɭjS6e&K+]u(]򏷙K0]=ḨkNkNjW*9*Qjo/Mq.b9SIr?S9O{f/1sT+kq?ep\@_Bep\أ=\Q({=ex2Gޣ8zfS~PD勺ce=O =IW>{["l 4mQHQʥ-Jq;UDٔ*w?:Q.GYI)k.aviʭRU߮~.aRui^(a0TWhkfO׆Ez93K6xv%]]ϴEGpȾ7 +oYoE\dzʍP2Xɿ|"HqΜ^pP2&M-7q4[-[n9BB?>|>CЂ%.] m]%tzqC[1#]@<ޒ~/.6 co?6&w,@Gcfqa?8>gr p=l;+v\ l? BKT<܇TjEqF?b&1\KkG9C @W.7.d_M6ZqyErLCj+B8wyvWp)S \u(| ןNݸ/\(.JRs0*,^@WR%TMq'idg삉qaRcgK_r#_m ݲ`(/Gy9Q^r׋V7]RYS*UaK^,J[ZReӸ4nq;i|s_ƲW%6${~%URt|aIa>}Q*RIYU8=D.$/S Kjʓ#6]i KzA8SKL-Bfh A,Z,LΔt3%)LIgJ:Sҙ/SlL9N98j ї( Gw+!3V;^J}_}\a RaҮi#cQ{]tD>`}Qb?~THqG)MimJbS/_YdbS!\!~5ӏO?{ BFvvܦ[r”a{h]LXsL…"Gnd~,&/h”"=E BQI1O2+ù2}u0-ΩΩұّro8so npSA/qdzܢaz_ {"@{'IԡF^MaRhZBDzЙ鬝Z{ziۗc#3etNg|YwC\)+ YYYYu)ٽMo\rpG4KcUi MsR4E,5Y.Mۛy:1Xyۅ3PejX7M@rFq.͎R_ՌwKYj}:::ۭ[_ N:&E Aҡ\("mvs$ǻegk`Bxl-+gkxxxxxxxxxxxx#9h+! ^!+{p½BW ^!)IXxRsRd_ӡ}9˱=~i)'CPWUt%HWt%HW"/ӾlsZIpN҅.t MQiq>~9OHyK앎hCs)x9p9p9swW9B{!)զwLtPuM믏t Qm킖ia:uP0;@!^0MٛraԊ7h8#Ř&Ha`h?װ/!t+b澍aaJ^zRukLlА\2;+"];,al6.:jZ4( EհY]?SDNT#oO7 4vrڦ0?UdM^@ Ig\Y4{ h@*ɲ(.p!BYTp:o$ EGS4ߴi}^v?r ]͎20. 7a -UL=XQuTڠC3VqېY[c:t>1|Lc:t>1|̗W&$Uy6&EQ$p]ϤI9f//>O%j vl7vږ 4Rf9?0)a_1E>;iUDN{)uWk>jAǧP͂lTЗ{+Fiomg4A mk YPm }*fAYPm }*fA}*fAYPm }*fA}*fA}*fA}*PPPP ,y"CG臎!WyA6(D鯬ՠ4N@v{=~I5b5WMz 𡑵 gWtH F/;'w8Npr;'wNBeB75gEӴMaRlv;q;EƖ (lGe<6s\ N/v;W׮֚KaKaZ.U*X.m B\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ ˥\ –(=lvyl8=NlvmL*a₶_@v)lv)lv)lv)lv)lv)lv)lv)lv)lv)lv)lv)W C}s4¥Zl)b-&b-&Y"Uh ҰsM`eYjR <ɅJ-\jpU~ <^rxiٿ_þ)h^yl*TtKtnvIM"~jV40`uPӅ.t 5]BMjPӅ.t 5]BMjPӅ.t 5]BMjPӅPꝅٝhnx\"?jr>6\$3}LNZm֞0M|nױL7mo:V.]8m/ݩ3ۭuh&kXNfRVWM>/Ko9VҶ\+ɾ܃*sheRÍoRM9pnsO;M!SeDB9> /m57Bzc·>>>̾ZC5p|G8fkZݸG J9Kx;q3wSxd/-4S@ts=gJ/fJnQSmm4zfX(h2ɭ*+{LJSY@ɧi jn{5|v_n9'DχxvѣW|Q}n>W _nj7#:jyC-xL=SnjW 3=6}W۫=JSun*! 
*8Q}ݩ#gd췒VJ[~+o%זJM4#C6/M[7l VE[=L:];zz.un]AC9F-['>ڌ~s؝gv ⹷ȼhWɸq{*cz!?y;z>§D=o d0R5h|;ߎa z"JvʋBi|3oMߥRvԻnG#z/Fn_wԻ}1(z;ݎz/nHįCuѨwSDFEo_*ދTg**ߎʷ(zbTQ()/ .Oyk<Y7K͊AMsy+oT: g`3>G {5h(&m+GS-?M kڴM+!܋mȽIhC!3ӂ7;ʜ@zzd{hI;Lu{`=X, ܛVA[k|,FUE􀬻p`z=0MpY LgEFެ'ጏ3> a\=0W#q&ſnBm~`)%UL;:>M:[ Һޔc鋦}4>DZ RujztW=ߪ%oQaZ k.Åwu>8Cul{pڃly^ x#%<1ޯB*9*9:)[JM! f*9DR M䃼/A+w3fn`xa.t gRY# rSv[A^Z[A^Z+ۢSSv`.<+I9z7z7zp<;R<;(8geL ]ۃzo:]>N.zvU؜t;޹pP}f+H>yOj/鲼8A:>釹j^rsͱH`OA.G5܈8e$~h/_yh|P2z\~-_TiHh/D{! >jоc/{py t,DW}g.R0 IW2:x0<8]yȶm %{mUzȐȐZ"2pJlK7!W'c}эpQN\??D'-~ \?cg\Զܖ,fN ':f`3ؿ @N ' dZGikir1y.ۧAOzsmMciT56O؟Scz5iY.ݫ{/<9DQR{~$mȓش6gX)ѷ/r= nd;jkQׯj-' axBod=9ပ` JBo~r#G=(@*% ),zpJv:y"9H9δԎA~SJI)^k<O4i)L9   ̐,'r`3 f000.*-Q.2!'7f00 ̐-AJJ1p c0 `8Q6j8L! Ic&ťp)11fq1)r \bIk#nj2F .%f:u1\J 3.%f1VP6p.1Kdk8p.1Kdj88'1GJ]A).xo:g;`Q'_nn[dP92d4$mHp6U үS{{JBϡRdǟnVc7-۴g{?D$\ЉyU?duO є<{!sZO IHTiuC2|#~p@FA_kkk.GJS>rjHd߷|IC$Hh!ERVM^=0 pE-~H奁@?Kұt,/Kұt,/Kұt,/Kұt,/Kұt,/KhSák[:Uv>)7xd8gP5{JǞұt){JǞұt){JǞұt){JǞұt){JǞұt){JǞұt){JǞkeOK({Ÿ-m1yh-Fk1ZK4֑#CcucXG:֑ucXG:֑ucXG:֑ucXG:֑ucXG:֑ucXG:֑5,r,o>˛oɡ泼);mEMy[i-;iH-:_*zxOldQѧZp8udj*zеNDgJ5V`3LKz'0f!,"ߢ0렦ĺ_;i٣M郞^d s:iqq.2RMS3}Pޡ^n:=h{=oIt)M#T!.!VMl`#Fחꢙi9kZ6&R]ƽ*6\mFgӜl3'ɖrrX=X|Mszl$g7w6}sgcO6d8>Φ;lmC&@J:HN6d#9fliη\8ʘ)͉Zsb95Nv3mN6'v.E:6LgdۜmN6'v.fMpmީ4Xzb9XzN,='KωsvFYzD_C~GH=3ZgO1C#!tb:8b91N?'szC;_?'ωsbyQm9:Fu0N F'щ`tb0:1ΡE4f`,c9L JbLTDu0QLc&8& &jiccDu0QLTtN11"NH'q'&i7Gq#rhQ6L eD0Q6L)tbl:16D2HUyo~Ui썱;/Ng/Nׁ?4@ LԟE?H%ȅI%elj5>%U5 j$体NYY,z?'%OJJNKߋ^V.΋|֝Hsb9u^ί(^0y =9B[W%ʐVs˪unY-,S7cJv5[,C3m+Zin<}8YyЫbe} T4H{²2BƗ0ۚWFe.Z~Gs-I|\,8XM!:.u\6I~NV\I׳|奟ray<5i\XY&U` 3 u;: "Jmvm컈T{߇ru-V'rPhTsT?Eg,:cXtSyԎIXi>F9,Z֠4Ab\Cf ąA"LMV-ntˠ[݂Q)5e-np )k }m)?ʿh HdJ$ym1M,=IZRC}d;u1EJ\^TԩhY*0npQ%_4YHӥ\4@S('>'uB);e.sȉDNp"9,[JDtP)۳hAYgўE+ XbъE+X w7)& 9idlo8}NǐdaHP eBՍ@dqt7ݸF8]UI)Fbzd7/zէjld3[o7#LuOͽh ;Ss/4\"HNU7+Ji7M4G)~~룇j.)~룞zDpPvhtk}>Zf+jf}1جv\ڬ[eި_\]7uY.f}Y1٬6GYV4vg,:chwƂf}YX>ڸ^cѾ룸@>pG{Apج66h!G.A t h>ڃna}qجA pZ{w6 EK?"y#j5=5Ao;M˄m<oӂ~ts&nPy!an޼]4F}! 
{/dCap|  ap, ap,  H˅"aH,xHE`0"aHZ{emZ֯Ϩ_$μ( 6CH,"މui\X|Xj'=/:cgق^US !8c!8c!8c!8c!8cAk_(rg&G8ʟT<4A<4@Kυ o9判#H:%;ׯ[dGBS#>r#>r#z's G4?/V:Li__kGܱ6AjQ^Uuk%_[-imڲ4M~(@*EFughr٨n6f{ !9柝N"MŁϴܓn=fO'B:)e*MRҍnt#)HFJ7Rҍnt#)HFJ7Rҍnt#)HFJ7RKJqFv[8཰3{st#o>E-}x==E6@&"#"~99 r9^>*AIɓ_z)ϯCJtwE B,E.j_A2EHkIL:P”*&j,5d!KGqLZ2,%qpȑԐ&-*AIMZ" B].n[CP7 uqkHPCԐ %qCIP7ĭ!A( Bn$?OFuIPOi뒠OiC6<ȼQm{(@*/? oS,J2!'7oCZ!z҆zmw)mw݆zm)H)֑RԻ nC:Rzmw[GJ;RڑҎm(uJ݆Rm)EP6 n%Oi+em+em+emIYF@VAH!FzQ# 5Z w\V EREjBsFy)#]թ7HJIu<)E7Wyz@|.}kCz}/WWSɓNj*AW3myӅ#__$ K˭m++TfyY?iЪA}.JZE Z 9` sQ'O UOkM=┒aPb1=Zj=2{ĺ>Ob7ϭϭwؐq`Nc< ?ix;jT<W5T sCP17Y%>kd5:5CMVW;h}څThJS=gZCgoj?!ʨL:ii|&|9($ߓUzŃ&H&:6k=إRZԣIxy냜4Jh'<Ḉԧ;%ӯv5Ug/:{ً^tlwmً^t]ڶ[;!Vze I%uJS6ڴk.=mӮM6ڴkӮtv /=[nepv,>>_^xi^бlسutD50M-;뢗 ]ѿ] Hy[gAYlS"S"rd՚A9e04IBM0&+î{ " r>}<<D^ʛ6ൡ>d\ůyهN1)f:C#Φc#qO$f>-f$<}0t y`$#9ɑ6gLep y[9lFZj;/ z k 2v M)PFX9ȩSARR< {P-35TS3&sάaY7Ǻ@[F~E&}Xg?M~;~tdX7:n57TdЋ6k%x%}9H_uXH#}E^("ޑwruw92Ȝg󋂚o{ʭH#A9H#A9H#A9H#AatwυI#A9H#Aa\Խ$K$(c>! ل_|y*pPBJ8)GڄwE4~_;iZ "?d s:iDGmsis3>mnƧPf9po85\@ 4pQ(AS Auvsrd\#p4- _ Р 33J6r9眺J#owR׷\xOLd`&20 Df"3x&!LB&!I$d`20  LBFف]wmFg;wF$|tF3wF3~~g;ߵE1ǧntb`t10] .F3ύΘc`10X ,b``1ʔbo#V/Cch?0w'Wo`1Ca |0Cz15 `P:TL c02 L c02u0+b00a Ä1Q:jP3\YiP3 FmXf>(gK%Ph:io:JaAj`( C4 VlbS0mô Li60 @ 5PC jj P5@ 5 aB4P> TN@4P9 ꥁzi^Fi]T+钆K% ?@4% еNud HtIJyU@4\UCWB=S'oF)A)gZ5> =ppY OOeY2> O!Oe7> OeO2> =@4B4> =@4\$(pLGc/2}>I4B2ڧ2hF@o!KԁiDiLڧ.JC4B@4BR#>@4BvR# ?O#d'5>@4dddd@4F4F4>@>@>@> O#aO#aO((ӈ&jDɸxs1%rOGYC,8%#SAo?tEsY㼃BȣAc P @@tyv PDv,ڱ[!@RXhH40:(e|k`@40  DDz}jDc]y KvaCR]aE@v VS )KZb빩 E*Jl*JlO%)4PR-}TbcӑTf"u$l:l鎃I@ ]=][Cg(ꔗҥ1}zϞz' \y]#"X}|_^KuSSSJʃÜs:y PC]Z)s8PCMkgNk:tЩCOsϚ[ }o593h^ ͫag pf36XCkX3tհ3g83^3Q6!BIa-XN|m|m|m!待=1o 6؅heD u5ԡ1H퐈5 AK3P*RC1jQ{/]FP:Ԟ:`R~"tN'{J&:a b%wNxt£_4 {NsžŚ=4Db {Nsž&" yNsB_DީApZ)qD'<8Oy"qD<ђ8Oy"qD'<8Oy"q.DR%S?Bo(Rղ|ѣtjdɗIy>(p+(R+/h}l<[ 4yʫh{N&Ps0He]u_КTmiԬ_TܴIWhetۋ:h\h(y7%oS+S?o[7jj[zٖg=U%u|;k퇨QCSa3@f"Kλؙ=QtVHOwT3պOŝiфRΟ{!}l l_͊]p^j0~ę W tbr'_(ȸh)G;t8Ҝitb9JW1Uw$q9{KГxqeJqAEw7@}1:wB1~=hT_OS"% 6Me#,+m{(}SCE$>q~@/8Yz@}<9$-K]'\tg 8p9q 
x9p'қڼћ<Ȝ# 9p,#bLj1"v\?"Q ou#ͅ&-ks=c^ۘۘۘۘ8G%(QsT1;*Dj?wTvcn7vcn7.5Z?pş{>GsE8~c*x;ymQH=]HR.:o$vrV3r.@V3o5[V3o5 7HE]$` @Ȁ'S^rvP2mw888ȀNd@'2й ƻN `ȀNd@6'2Љ Dt3'vs#Љtbzg\\GBJgqn 6@ ?hT(2;ѡEt(CPD":ѡB:A5p;A`9ps58e;ѡEt(C8ѨLM;DF/ڨ 7Pы6zF/赸Ehm^ѱ r!8P +$oJ[4\ʪ7/nQ㮕V.ki;s/$RF-ma/ Njm_A "JW%?ѣR( 5a n:S3fUgzV 5jq<g.Ώ3iiY>ckL`-*[ d5SS]]V"xoM,HUsgm{m F풝][7 snsD)+&kJ'0,'j9. Ҧ=:h)fk IeMN&粦+UAmʦGWM^YX" F;$/gd=d\\ArOoпqjAJ^-/CyпAr9s9oпAoTF Ћ~C *C^i˭PέP>e8`8Bޖz<1{!AKU 2(p.INwҿd׃O2Ԏ89玓@NH력_+]:.9. &&_er ,qBj/qDҩt ]:5957@ʡb3LR龹f zu7/M[Vv]o++:?666ܫ6[%a HF555%l#a {%y"O$iH9]s4miHF<"9s޴HF6Ҵ4m+!iH(uodh#CFv.%Cdh 풡]2Kv.%C/e(%NK|2OO $>ħ3tRx|O#>rħStN9)G|:O#>̈́4328 (32>0 3****ީH Zr' fp`'{OΨΨΨΨx疯|O|O4<28uuuuuuuuĨ;&ojZMSKQ!)~:*mL A 33J6r9jLv?(kkkPrN-DHӨ1>(aapX^sV4A*Yӈ0ӈ08,  08,  `08 C/84/8 chL#ýp/ZM:ы:ы:ы:ы:ы:Ƌ:Ƌ:Ƌ:Ƌ:Ƌ:Ƌ:ƋicO EephEepT\t]p@]ƕ]p@r]\p,E!ep,E!epqa\øPrw;k< ₳,.8 ₳,.p[ ₳,.:3,.8 ₳,.8 -8w ؂3d-8Y -8E NтS@ɂs,z:::::::::::::;#3LLz4렷 h׸ຯ 2V Gb:@zߏx3jrԇ6$sErPd)$pX;Eρ|+tj Ǎ4- aQ{T" 78K ⹽E1+9(9/+=a#Jdt+uW_ 7HMљ˯&9?R<¤ x/^x/^>8GcUARfǘB#}&]AN;r}G~ ~ ~ Zp']م5&1A jLPc/0 ͂ea|^D+4}kL_cפ| |k0 j28 < т DZpKZTa|La|wk>RϻmƦ474 7Pj[Msi^'HIꖰ11n^jƻf\"5=Bsoɞ_Cga. Ez.=y.R-6 WLp.2 /Js:ySZ WLpx?ޏc?2\1Y|Y: \K4hG]D gxNHupG8pG8pl7e8_eWZ"cp8#p8Ejfm!px / / / /ި $5yË7xË7_  gvpz=A.sB`^["Sr )K8HϷs9!2椑&38Lr$zT)TE]uYuG@jzǦ9\ 6m:r/z*5`h+}끩_?r/>My۱Oc&(?'FUB/ZųQF=,~\생gM.d*|͆.\̪6AT*w`jQP2fYok.F,Q%ϛ'9JXY`*lW ;x[M5ZbI_Q V6lUjzE[3`:<{u׋?u,~JL#}/tTT+zA^Τ=5bUaUY?K~gdVE;;K~g6V`=;Z^WCvV\Zߑl}Ū1^5+?27WMT ҿ;ʊvvf_m08Zd pJT'(F9QN50 "3ɼVDJ wy詁Op%OpOC1_s. 
.9hMI+{Ti7Sg0NzmkSgT?-99J )x_3m1ZA &oI pzTv1 "V긗Q ը?l)ŗ}2A) y^ {i["T2/d|^̀ X̀%um=Hr@=ߑ6}A( ( ګ9y n nH5f^6h,[̲Z;-z5C;3[ BKBKBK -1/Hk DP"(J%Mů9 2/Jt\{7 zyK TE"6v3o7v3oɎ&~3o7@A E9d[&Ah_ OAk"L[ca'|El] uri+p/YA4sqQ@C8|6sD's|66t|6ޖi56m36k"H4_5࠷h'J /s?_dg!CxoK/vs"1Nx_s:h-xX!ܝj~i槁j~i&Q私t%]}S\8cӎ&sɝd;iB?EE;㶃B23`[97rAo&^>qbs6:H%T b+nyrlU2`5tHySK{UUr3Xh6f_HvpS>&ZRoi0z_>mrA߿O]6M[n~Й Й u:9 į2]A*Y3i&&/M\3g$צy2's A/ze?1 _}ZVi~P ng\Ӷ3m;Ӷ3m;Ӷ3m;]Įh>^MTL^ST.[YHlf"ҹw[BNpAgτxې))`S1'M;}J BhP%9N]y_$>2CMj0!;7nZ]iܦZO~YˬӀ4? iO~ӀDIIIIs@x!l^WR;^؁{ k^؆{ k^׀> > > > >qr:$CGsq>۰;~w,ޱc1ڀlmhF0ڀF1(F+YhsbQ6F1(FhmUJis9Glզ2ZU2Q;g &;֯i=sr!BɅ 1'bN.Ĝ\9sr!BɅ 1'bN.Ĝ\9sr!BɅ 1'bN4ٟL|&j=_|/&VEs4gМvebvH4hٷeɮ6٢sIsُ:H#⌃39l&1'q۸m4hΠ9 ^n4gМAsM 3hΠ9 nbW7\tbC7?\.:XM 3hΠ9 3hΠ9 nba7 3hΠ9 9++YxӊhΊhΊhΊhΊhΊhΰ2Vgx՛;Jz&fq z@,ÁM&wLÁMÁXMƛXMn%XM&x@l&6x&6x@l&6x@/|6x o>F)tJ\/k?qTog:#{}&6Fp:<óD)1^j@>Y6>O}M9o;}mEyrT\ Ćqb8.wiCL0mޖ ,!}v8;C |^닀=Ċr:_,&'SL$E:rb9/Eۀ_N=/^R3Tȼtzgsˉt>Χ 5|D2m{KX9I8x}D.s B{F|!3m46Ш5iՃ:o^k|M>9oߡkC9yL<Čyp=^=uAKuPH-#ed$Mym:ids 1:!4zw0:+-H+`}L{FlVHhE~ڛg9Xnmz1p~u1u_=%%C6d`z# A+Z3[aSϚ,gԭir3QxvJH'gf6V G# SU-"i5u~y~TY\<2s'd~E׾+NX+7B=BA^fp7upi&Nf⼁aŦj̹4;i滓*;校ziYr;)V3,QWGfȃڧ 2PtuU~pmG9MYscp& zvΚQ">1g3װ+L᪯^?E̽|NB^ht꛻g=3RjoJK-SUzSOd 2|:Ϟ_%ğ"!]7>0,S&IBo\%='DnapOm.-g;y6ճ %^'.P_|J0V}p:Ź5/t U%z[ KR䄜o\=P$r/r4 s*yJ'i͒eTl 3[}Pl+jf7}kSaN`C}xK:% E [)-tv Bg-tv Bg-tv Bg-tv Bg-tv Bg-tv Bg-tv Bg-tv~*n[;s-r B)P-r B)GpɃ,@T).y1 R-r B)[Z-p BP-m eBٶP-m eBٶP-m eBٶnY(ʶmX[(9 y- !%22ALIT䕚l&[jl&[//f{/l&[wN{JlhE+Hl=Yݤ[[[t[ofY{9{wG=qv1^^^^^^nj6Msg$Hއ3ϣ7| _o94ͭE5UG o8[ /*%W"#uݰ7lu;r:"#̭-Evb:H]d \(}\gcσr[Usc[˞gZԋ4 #m7_au|6_au|6_}[HMso{07̽an.t(H u# |vrvhvnL|. 
6mA;1cmn:ION;iD'-&$$Eo,7{}:KD7t nwCw67lʚx?nH!.\TQMw.EwC~5+֗~12 %^ k}"HJA ?ɺHKh+E&fEt} gZ;f!']^^hv8x.x}2^nX\ZzW5e?Ot`䠔$Pz1ԗ.I,wI,1-Nr>L8.|q!oq\8%""8.|d."IKב&BuCϭ eh7_l5Bx[yRY kW+O -SC{wxso@!O yjSCWj0 SC{Y yerkY|YV+i%9$XV9#c߯~%zB֋Bw }{sׁEzҫ#g?ۧ,y|Q@[v5/J<9<" jAy#69<^ARyy|6J)QsAr<]>hg}tU+7<]"ߛ)o/2PmA -胠2A;w0䘅hђD,4O/;$=BX__jM AC}R=Aր$[c\jR>饸|> ٣ E&0*uVaOj HN{J ;iﯱy{T>`f|m֐,=Y4S⋒=ox{=ox{C>ͬ"@ox{=ox{=o).9H" 3\b|0 >`pJ_ލO+0 >` "X}VX}bQ>G@Fx"ƗՕ UX}bQ>G(VX}bQ>G(VX}bQ>G%X.{"=](դh]#s{pe ,ߋ 27 sTi m:]ZYAj xo_47 x 7 x 7 x 7 x 7 x 'E;.AD)նMRxOi?QAnPA"(ܠp 7(ܠp 7(ܠp 4 KҞH,6H8N~"i K=Qb75lP5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q5[Q'5١f "GI:L0_V_sVuXaUU2諢䋰l8Au8 {ƯR 0km9~@Ûo:Ûo:I4xM7txM7 ~PɴWÛo:Ûo:&חÛo:Ûo:Ûo:I^Ûozozozozozozozozozozozozozozozozozozog]x}X/y_uմ&EFs\` ิp\Zj@⸴{u۞ h9)cW^ x5ՀW^}^J#&C V\z^J=!Ki=/ڀkRZK)s ޑ\pm!)Hdm@x$瑔Eo6 ۀl ?@[ ,i=?xo I?>z H+`N:Z-sI"I2R.DPT|^]EUu`auQeE VjkNjF"&ӣ8={PYtNhcļ)3皌8=N8*yViOjhr=HxR nV%D#ufG(Rucޚzг0=(Ym*>E*,.//////⌯(ˋ88888888k*R"""""""""l$m|k*:I[S{xߋxߋ(ߋ(gAkvk)AYR~U!}97R "JI _W=JHËMJ:I E E Kx{#٧#xcH37 F^ ; 5RxnQ[cQb{-ѻѻb.XܯJҾJW*~@4;)y+ڐV0E xm^[V0E!,A/@gb>~zI^m^ي6c`56clf36!h"TڌH_0l9n9n7{"&iuMl/k+;_'Bؘ2OpQJytu&Xuf ;rRo 3._"sz4C9SWC$42h39srMZBݾO 2U?F>}%FO6n̍~6owA?&i)/O ~5P X-{i9OM#ep@K76  Ɔa.ބ݄݄݄݄l6=7=7=7=7=7=7,"|zszssֽ nN7ԛ+7a12w;nbnbnbnN7/7/7/ σ*[;]ŻN/Cq9 )v>ަ0[a*7&onM>6&oS0m ƻ!ysCA6xx(aps/*AIsoߩ_l T?q(7q(7q(7G#ۃޡ9E,M,m)bInbInbInbInbInbInbIn;6l;~;ҧ}Krs8 9DDDDDDDDDDD&wmr&wmyQ?U;'Z&Z&Z0srrr[~_g"w@^H1qסF ysy[zls܊޸-k:o6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n6n+v9n~G-_|=}" 0p &&v! nnnnnnnn0p! BnBnN7A77y`! n n n n n n n n0p! 0 n&nv! 0o;C^EunۅQglN.4Ja؃m pG{q" 6j!]AY^p}V(_@*ٞ}\f767,mN_o H~'<y"JE*ÉA.55T W_w!hcsd9Jl"v5 vEI^EI^EI^v?7)~O ۔jRr^įq Iq"~mJg^4ޫCvyHmq|&}0,'{,'d,'KBz:=9zCu2Y'u2Y'd|&u2Y'uLV&dN&dN&dN&dN&+ ͡žLdLdLdLVd&:kΚ&:kΚ&:kΚ&;YL,is 9ȅJt"Q gy*s/Da! Dlh e""QXBs] QXB(,D`*p) ){! 
Da(UJV*QX% Da(UP3Udddov]!I6!I6!I6!I6!I7d!clllllͮb*: 5 5 0 0 0 + +w*^!} 1x}lvV( ~o9I(Q-V}ddd l>$$ٛf'D$M$M$M$M$M$M$ doE[Ѥ.EAF'D'9qF?C,EĒ^Ն"L]sS+6~x&x&x&x&x&x&x&{zQLZBk!D2 ξoB&{+8!̠{+ w&V]AN.foL^.4^R; YnPlI|X<.EE<-uhצ:ECSh轏ޢJ ! =w|Q+o| mFޢþ99Zk6桳~p3Y:yHU~󡖯{M.b~.Я4c@_TGK/jA6Aj[^41//8C[iw~ԶGQG>s~/bEߦ|/g|kJ>_"qE>S1^•2e/=8\{'=h! =2zw]rcqиګ槴쇦`@p??_KQY墔|h+BO[nPj|+zN, oMYnrCܐ,7d! YnrCܐ,7d! YnrCܐ,7d! Y~1`E,7d,VJ[r+Yn%˭d,VJ_ku%83֏_s%!S堸!vAP=ykC&Qr$]y]{~VWEۂA6orUz?Hf ~MyCCz3>xG BWj? Y:QmFVO;Fnnh) ཀr*Sćțx"^D]4Oy̓g`o۩F8Hj}@osw=U־^Ѐl-IBFG%nZzK/8 # `45P]H{"*u\'(XY^UBFuhxJlo{=tԈ*7Θt+ ^{zv cB4m.D4fJujQ&/~w=%VUI KTd'KfِYg`=iWgWjњRR b+-6XǿWis6/fOtxW^g_*QOxpF=t'n /%P 2rEMyMAufgcv>,l)jnjК< ZCԎAf>tZ_V^FC4h8hx4@$1]tC;&1?hx4Ġ! 1hAC b4ĠVbJ ZA+1h%Ġk]P:3:Eϋ=KDuΠuu%䢦.G&Pu3T Ug:CPu3T=)PuP1q5Te+z U5TeҴɴaQ۰mCG2-^31{gȟ S!6OtyC#O9cƇh׮Di\՞ek9oJ#mA*%XV +ֆkÊaڰbm!Ja@ҊaڰbmX6Sg|hj[M3> x%įK[wӤ=;L3P26lR[٤ImC<8@+Hм$2.meQʢEWjTzF+VlG[َme;v񙌎Dz(5HYG3w)b>ȑipGNć*AZZ$ԝD(yM7 (a0Jmoi&M P6 Poio)j{[li?|Zm4TmGb Pa0TmGb0TmGb Pa0TmGb0Tm6S cf31ԧbFuE16LLbN0'meNʜ9iò!meCʆYaYmV|XvB[2mV|hŇe 1h+cVƠ,@[Ymeh+ ɇ.>tCRsX]JZRx9t#CV4HxQ__9_}!H}s/uo{6a-rXa-rXa-rXa-rXa-rXa-rXa-rXa-rX ߆osXa-rXa-rXa-rXa-/b-/b-/b-/b-/b-/b-/b-/b-/b-/b-/b-/b-/b-/b-/֊dkl/Ciz-g=ax\(WX.6,۳\-JYxޡ mRh䯇5 [lTJ\lej/`ˀ- 2``e[- 2``ej/XN/;&ug-aBnIM $ x& ̆}f x̆}f x& ̆}f x̆}f>aٰlN6,&-`ՀUCF-n԰zl!ע of+6l|śQśQśQśQśQśQśQśQśQśQśQśQśQY_[7Մ7'9 oS#Ң BRl-)M|8 Nv@p‡&7Jq}p߄&7Y)No}'7 MoRp߄&+El6L  E[9&7a Moxƛ0ބ&7a Moxƛ0ބ&7a Moxƛ0ބ&7a<VT +U0ބ&7a Moxƛ0ބ&7aY7f1,ƛxobY7f1,ƛxobY7f1,ƛxobY7f1,ƛx+o͑d-W%ʶd\u9z)1?znm/-xiK ^Z҂uRtA/-xiKKQ7z>.5,g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳`,g< Y0ςy̳yV1*YO\ߟdS厃7 \d|QAޜ{&i :rA>h iQi=op/[$_RqhR% wR\(sY{y_!LH%'_AirECb\ 1x((((O'N`r}"dF6:Sk _~pᠡ<_דijwL#-@6S*Y jCXHԾH:R)1(_*# /220*uVaOjhf X@\a_XL?0 5JЕ+ݑ8 \rZ㌬_H LtEAfHEEjoS>RSst+-ei)CivQWZW8"Cӣ1=ڍ+hLVդh֤h5)ZM3h^Qǵ~F4T ҿ)ۻ<$hř0 ^sH{U}7 9?HWiҞ>pͩЁ&9L o *ig3ЁY{5rg'*5xʖ^΂(([[i`:u^]QeعH̑\v]J̃[Sv>wi1{W].fRh hjjZ;EjZԿTsv_jZ$|} Z߻Ԝ;z;z;z$} A;ABwBz$} A;ABz}"d;Awzwփ`{;xwփS+p8@%^^^^^^^^^^^^^^]9p ,{-{-{-{-{-󘐿.9b 
kSsL6tM0q?BqzzhT>l5kJ栥4栔h2~Hr >rAj[Һh%A/L'Aj(iJά6/T]}O 2~#Tg]hXdeӧsj*r=eӧ_\ȻH-O>K/w y\a}PkT+;f>1Ew̢j},c}S\̢/r7y9B"p{A>~*B繈wlP$`>}kWz> ;Rg)r藠_äcR1Ϥ:K1 DcR1>1TxnܐO]MZC)׼>wЖ mFA:#vh0e'l ZA5z+ Y%ʶ+Tɛu5KWf}̸{q4VY%*&F7K7K7K7KWݽ,JZ_u,;Z"R B__xlKdy"Kǒ`_rKn/',ߧ2Y,^-i94/,>v;_ҨTCfC`~PSӝC,>C,>ثw;{-V.;{zHWTzr/z_Paذwl;6zl;6zl;6 {dž?vK$ԡ 6 {dž?vKԅh尿c˚U^콬{YUWQxٸ_8h*aU  _EeWQx(|UWQ* _E(L{42 PFQnTpb́@{-oN7[n9_f,S!Il;6{ߐ$7ط {no~Uy' zN؛SgnQ߆6! moC~ÂooC~زwl;[! aޱeېVU{ǪooC~Xw6-{ߐ߆6! ooC~ې6}C~XD{7aޱBoz/+^}"]WV转{YBoz/+^V}zE~e "Boz/+^VL{2=ez" %e!8C?%l':p|A/a%i|"a3>1Č3fghqE%EisЦɑW9>qxvឨx$oO9>EdЁ]ĠZXcQҁĥ^I{UVĪ^D)"]:$X,O:$XO:>EԊ>Mˡj||bidzidX|`">>DOmǦ?dz WƋ4 wA^ώ?\HL=*Pa7D&AY áŽ*?A nutݠ&kpU n nu1:-lJLbns 17A bnsdyQfS 17i7:] 17/RD bns &eE7AMF9|A^O$  7o^DI n2߼" 7HA+&sޱ-qpGhR܎Li􁈷] H!hzv@hEȶȶȶ6a9 [!mE<1G+mŵ k̑*lۖyxT>C}zVp[J+o%P%oҏnc }O< u]`/6<my{世B95 -N;]9Z: *e? ޻EA;RAF}I;{ :A'D{b44*y$M08 .L_+f~FUʃ 䨚¨TVa.ѩn*0uWlqX2zQuPxs"E?Xl͏W\BXo&栫Ut߱z+G>sβQV_Z̻DFsVjPݗu"y&yஙOڅ^Qf.٭W0'U(>tGKKKKKKBr@/Z-ksj)on/BC"rr89RB{ڋs|P+N"ҨKd-NB;Y d-;ٖH̝EjQd/j )32q 2 @D<@ĹǸǸǸǸ6r#D<((Ѵ(((/+H%>%S{)iZLdyѦv6M( M(mjGiS;JC-h 4ԂZP jA&Cg( г3zBf o)zV =+BJgų^Jg\Js^SgiX|2.wiKm1aKw۶o*V"y@e "CeK 2TX ˞ d2TX Pa*,Cex*,CeK =Kی멫ס4UꪞT.ҡTW n^/ɱ;#Ǽ@x 4vXNM?\ ⹘2 pZYU?ȅ8"YJµkvO-r_~˃4r}u~ۊ*6Q^2Eʑ[x92I{_ddAN^']d WMAޑJ1 ze(#O^:롷Ҍ0Zۏ^snBqSqT *q+9OF:W!rQN;iG9(\r88UA76{!2W&G:ser!ɅQ+ ɁELcbhrV-843KMVxӶǛv޴ݯ7m{z۞׷q#{oF5Ī!V Bh-&ͪwLҽ!V{C0NX5D!B "(P+j%BDVrJnҨ:HBN[R^4@Q52BE;QƔ  <"n@!y7CA}|bVRy։E}RW E'Ar1~/w]pP2ĕgF}PoN`+i<.LqECodb ۆld:C샕A)CC&yΙA/Xf4/K z>`1YHdLdH@rCLd'&6uʓى|:u=!$g 90v$Nb'^䌒Q3JrFI(%9$gQ*s@P9Ts@P9Ts@uФQ)0 ;qs9\9ss;Bh."EA)sF朑;#{sZl$צs-s -ho0 9'mI艓1-iAXZ7A9gsٜMcXMRx^x^xx#s<3[/F2(~Rԓܵ3y)a>us֯yΞswswNHY';;-R )넔ugEHY'H uƺr︳rW׉NP7-nٝqyHss$ E)vwz g?8ɪs?8?p:'?8?pf:gg/~:::u/~/~/~/~/~/~pƼN]'GIVn7I+`CeG8"-^2t⡠:P9(Wx /_@NM_^U_L9SuT=@9]@DWD?@d9uNf=@D?@#gYsbя(ѯS_S_S_(я(я(HhA_'{J0=)@x׃} /:x@·'?' 
jzGuTtF'L_^!rtr y= A (/[M 6lȽeF'r׉/r6ͻ*88{ȽN^{(u:Qz}**Olp":qxE,/IZM}>_]=;!Ad'Ol}7sjprL~FۖZ$ޙqx> ohJDub͌H9]|i9g|%oZ2(Fg욯xA+NkDon5/^?oDa nԬ 1 WN8\8'JXUn.f.&'vJ(nAVB 1y}ep/-Ed vf>aq"k69Mke "N__(O^\\6𵋴y{3/2"/a}7 }(<3>hI<+bsCUȸ9 .H8N[_T J['έ/HuAĹ.HuAĹ։s R]*qn8 .Hط Ub^Ts{P1_HZ5i(o9T .HuAýG1*R]EHu"UTW*R]EHu"UTW*R]\EHu"UTW*R]Es}U>W*mNY7F!QmBF!QmBF!QmBF!QmBF!QmBІ7 BhCzqBhgHHtj9*"lߺ מA]շЯ*mJFD@b' E`@l1tdyS^G4.uSr+ʃݼ j:QڠyySt|*JߊSvPWyyA[lٌxa:^b譛oΘ}s(CNpN­9Be 4CfE%i\@CEzŗ7X~fyzxl:f|OO3>V3ى]Ńⓦ+>iStyJy+Ӈ#/p:oQpA '<{Z.0 Lz.0O.0:ԢpqO`Dg Lzӄ ^~^b<2R`ħ \:;QЄ eEp1cEZ9I9P4Mvi\\@[6.rPd޴;,H)4UsȄN=@ʑA.iM 4- įƯF^S :ѴcE 4'q@M^n /z3iM Z>gt1pF"i-(%9i1p:5Ѵ8ir1p"ehQ^Ci?qP_B~ 4}v1s՜.H+KLƈ"[hhfG S.5mmOtGx;"ώ]Eh-dTDsxt$_)񇂴,#%<{-s)%)HIGJ:R}VtHIGJ2^RRVUKzC/y(S(SiC#?RUiC~|WW!+!-?Z"Ikdqr1Xa V`|3 +O1;Ϩ)35>egQc$),OSdvQ3AECMEIaR\BQ^"rhUO> D BݑGEmrlie ] BkίR<h9{9#̃E`RyQdyH-8urF5ENZaxNNY-~OmE,95aG^R>2]mIx$#t ,( nU*L i\_AA"R8^H/=¨Y?r7p =^f=>#[ t3%7H='i:Fϥ"/9]eW`Xv]A ^E"u5).#&uLsjjm\]p˳rQ:6׉Ez.Od::_>s3>s3>s3>s3>s3>s3>s3>s3"z4:hti i9|g9|g^|g^|g^|g^|g^|g^|g^|g^|g^|x\ة̋ϼ̋ϼ̋ϼ̋ϼH̋ļH̋ļH̋ļH,rZdHEd ]z";@z`5` Vj0X `5` V!W $ElEj0X `5` Vj0X `5` Vj*Ȉ= 3`π= rƧ{g{'!].\ms;wpjprL8`1AgpfgIP"g{Fg{Fg{Fg{Fg6HiAC.{*=3=3=VQŞQŞQŞQŞQ"(""(""(""Ҵ `ƄHS՘3Ɯ-)M8 -IMm(/5N䔗8 {P N(XB;mb' Plm95֑1!>#:2S߄ #Ș֑ud(&7! 
aXG֑1!>#:2&ć%dF)1&ćc`X8A߄k c1c1c1きA|o"2fY7f_3F3,+c(cƘE|cccccccccȂ1Z>Tc=gT-bb`X+֊AZ1V Xb iab`&8~&YJrbX#ֈbX#QbAb 1$5b`X#bAb [ֈ5b` 1F rł$ 1k1$5b`X#Ƃİ< bucAbƂİ- l 0 [vaaaa"222<2Hl"U$VƃV*+(X-(XEbHsoxj_c.fͶ5d/bE efIY{5:-ex)/t%fIYmt.^^XŖ;EcN۴wN{w3K䄁U|I1wi>쎡2I` ohLi^i^a|cĐ Cn-P_>ƢXc1dyR:aV"Dw#1dy~u|#H}!Szps$?JWš{IʿRi/]+epc85TԀ g{+sp³xVs:Ps52l>ͦ|?eSn I/uJN\H BR];OhpqW?hQ%uxvH W)9~rK%z% ~^שHjx%x%x%x%?*Oʍr4?4K.}M+/'mE.&ި (`^ 懾Rz(9UX5ռHoTC_mEJUEoE7H7ɋ H/ W'Rh.9*" AC%TD"ҋTEFǹ"("(:/RY޾нBV?TDzRZDz yH?CH1XDBD7GBJGD" )8&3)酫CwG 96Z(Xe>n㵢2%vryq8{([SsoLE.^Իb39OГ^J)%ۨN7F|C環7sVHq *g]fU%[΅t-݇T2B{l\z.\ffs͞'|R`9@ʏ3=욼I]4Ar&gR-eZmӒC |\4AzBm×_____7mffQmmӴwl0ۆmM|iʃ6yЉRR~D+P-{?`Xs'\mܕmm%'|QX7:o;&׎~u#>zZu3MrIO] Vf'd{S"A5BmxS6* oS*ea|H9_\'_tV!R|cmR}kmkmkmsF˷Qmo(6 oo|)^$|||۱ޥzC#k%h%$LZ[[w5pâR)t;<.ht={te{$]C|]tf] ~]|~jcq{OJU tX[}޺*fh U(YܧSo[?4] J&]!{狔y 7ҲkH{>f!#L?C8cv-w΍RJ}cBYm 6e\ 8n4F/UZWav`[L[L[A[Avkb~mҩvRk)>s>Q.KlQF?xٮRwrXDؾ:GR>. )5Ώ4.Ӊ<{4k9aYS zѡ~8/Q7hUۋI uSE/Kub+zY˺RVX^* %y7D ƯFNb5-zzz][[[[W?it(SN]''}\C;BqZ0jQ@Y,"SG]*N=,>VotChQꇫ~zPS?!ku @o;Hj%ߑvjt_C֯‡aVHJbNb֯u I&T_~}#nm1Zo>v~YFϗ?~\t*x[V&CAڹmK~ *?㖼B g}[3I6&$j>d=P%eZ?CNj?2d]Q -U ~w'\T1{N^\;T^ 7Khv @)4~\?Xu{6ʂK> uNEݷSzlBD#7߯ EJ/?Ģbb*ktFktF=Jew*E, dQ x"^d {>$,\EwU9#-*e!@&*eW=vKij{. =\@z6˾9]۬(빈hE>ȋ+;wywwywhp@C1 bQV.;om^a)|||||||||||||||||||||||ԖImIԖyR(BɯA;TBIҫH R>ʍEGPA,uqaC)Snj?:J:8~>D>T 9y\ 9ɩii9I['9ոT$'9ոЉN5NKY>X.HRrozkȟ)>P9HnKnP@d }ܵ'R*N*:\wa‡Rux8#|W'҃DfH !A"U>U?Bҫ*U?B' TuDƢ:9N"%%zBoJRRCUvaU:vU*uPӖӖJT*^rPx JnKRGWJ]+utѕ:R TF7]t@)F:n k5} %Pa"{EzFzHK kFZjXPjįoXˉf)WJI#y{mHo8_ޓ>4S~>QNc^-Z</|,-'JlunHg}H\& QTBi~3(Rm?ׇ"_PD$-)j)=jrFcr U"}Ia"I\S%rDJ10&Ƥ0Ȥ0$40Ȥ0$2"5La %2i0[" ӺaD&uôn-Ii0]3 Wt5@/Gf_WKt>TI;xϕC ؄$Co[1ʻZn>ܜ3q9Kg2n{CtZP|Z}(f>nP>Kqއ\SsZ%ZtZW/>9i}Nn}ޭϻy>[wvA/+ͿAm\-cyɇޖErPV}=o=on] 7oG[$9A7B9/k-xwkAXR1ߎ ).Z }vdH5vQ$~>D 5EkF|oW8u!XnH|%_G'2tD(px8ʐV,px8ʐ>k&F_įQb_įQ(E/k! XK KeHܘ[%_G>Q(C *ܧxBJ8JfL%·CՌɅ! 
q!cr cnȎkkkDkܐN'Ek]Q諩@vn7&dvnMvn7&dvnMvwnMvn7&dvnpn7f epp8@8 pp8@8 K@8OJ'%ΓIyR<)q\\x!Kzyp<NjstuBp8!8N r/RJjz?6[WmFэ>yy ȧr=araey[Յ'_P0UNA]yߩ&RD .*+.xd<"碵D \v p.ZP1'%צ%צ%W%%%JD\i\ih 2[͐5̮̮-$>p/7߸l|]tO3.rѭt]ثS†}r]^t+"byq+W7k8.yѭqЫBTe\*[x0验^e|yRD" 9(xn_ftчNEwݼJ|(xm_TU1AU bPTŠ*U1AU bPbtUU1*J"`*U̐Jx ozM|&CroW³v=d ჰ;7x耔? w?Vk <@mi?įX&Nb>ޖCo+AٿeE9@|7[<@J3~h ~Ei~91;91;1) [n=h$4׳)5x)!i@ $x EFA:$xpk~]+'vEuèj$wQ&1Ơ+m黢n/*j1Š:71'iIYbPA-V+9b kw~VJE(bA(bA(bA(bA(bA(bA(bA(bA(bA(X5X5X5X5X5X5X5X5X5X5X5X5X5X5xJV+E(ȵEQ8t\C'@ =]hNP e8PCe8Pߓ$!?U| 2ʐlVZD )8C )8C )8C )8C )8C )8C )8C )8C )8C )x7)x7)x7)x7)x7)x7)x7)x7)x7)x7)x7)x7)x7)x7)ƾE ))xhB+aդtEA,nb =UECzg/|-܅މCz\4&_OB@ )B@ )B@ )B@ )B@ )B@ )B@ )B@ )B@ )B4)DB4)DB4)DB4)DB4)DB4)DB4)DB4)DB4)DB4)DB4)DB4)DB4)DB4)D*RX")fXiNVX,=Vy( ]%{h ohaX,&_4 zU/ b bY#MIJ%KC3UR1yc򋦾bXrT*qQߺ(ӜfDA ZPЂV} 7QЂ1;1) չ(H wmm괜3Jn]xc$=@oE?$"2)\.ڣ(m[n9 u9 *CU4쫮?._LX =@yԡCwG ٜٟE9}%lz`kk }n@9.UuH[TmZO.6fɵYr=%J=Wd! npC2ܐ 772,6dYmpe,npCfƵ mnCw݆6t mnCw݆6t mnCw݆6t mnCw݆6t mnCw݆qnCw݆6t tvnMwn7tvnMwn7tvnMwn7tvnq9n7vsܹ)|8|@v;,  H]uT$QyCaA|nbR ;znDPWyou@J: q@Y=_'sO} \9ȕHi.ʂS֔N%Hʚஜ*ڋKSRZ?8lG-_f7)CϚ!|r^@[n^C5}5.J~%:^c#sېRUmȏEίB]Uc?5M~ۇJEy5!o4Qm/ͭ6~4r6Z*v!6Z56~4Q2/|}vIR6$)CmndJGCHo<E71_D>vkO;w4?sި@8AлYCЁ"3~u~}SB(FE)}<2(!? ݱC_J"%˩R~h>tgKTJ)xR|7TfTZJG8t@G8t@G8t@G8ZG8ZG8ZGxaI5huhuvN `궳VZcҠ'wd'SW\ty(7V¾Q+דꟚIOxǤBWתIO*}RJT'.w4RO*}vϮٕ>zgW]T{3~]t* QL@/2*K=iL]0 ޸UbTQ)FbTQ)FbTQ)FXWuXWuXWh])vgn~_AxV_tx፷2ȟިs . KRJK7tQCʬJk7ba` b rY۽-⋷ 4\õׇ:@w6_J}P3u$? 
t:x:Y([ }vObg@3>{5B[#$#3:!x P\hڠ#[|BgZ# ЙH.$-=Ӛh M~}ۼ㩈M,[y'zhʢh,(ʢʢhh1hh1y1hF@^y;h;y iiiDW4yiiiDW4yEW.?0c8ZE^-\zD^ hW˙F4yEW4yEW4yEW4yEW4yiiw~5lK#K*NcAgMb`a&Xl,6 z\YStj5cj,U O^,uV*:KgUxzHlAb ["i,H E@4$ Mhڤ6i,HlAbhڤ R*Ji,HlAb* M [؂&HƂ$h,H @}4 4Gc5&$VXkFkFkFkj[Mb999IlG?VXkFkFkFkj[Mb99.....k5sV=w`ߡ~s!3Ȍ2aDɌ|F#h3fN=шC?Ҡ2 i CȐ2 i 9Hhlgfz=QTb;dHcv {HҤ4i Mއ&aZ }M \ilpÀ"ݩ"aHilpÅH ~TmF`(L?Hpi WVܰ)E!$I.ŏOtwBQ) K`?`0oy|-hZA(aR F *T\+wm,UΥM:t{ AKjϣ(_an:S6[zUME@w4;Gqh@Z=V鎲PX(b=0A z`H9DJhڦ-ysśƁ7Q9]4A T-;[w(@._bKG|D4P> O(."-[9GzimfD4 )}IćOڮ4i?0b3:::qz:::qO/z[4JTiii44)_;Nh+ޡ &1ts^,?\oq`jQaB &dywdy,.z lB &4#& f%sr ,%sdNMż4[a TGrɒ9Y2'Kd M 9#j &daB &daB &daB &daB &daB &daB &daB &daB &daB &daB &da6fa6fa6fa6fa6fa6fa6fa6fa6fa .$GdMdMdMdMdM ̀ ̀ ̀ 0W7Sʼ?7ŁRBwC {f/ZBmMzM)w >QO"A[?sM>47())zc †\?J!}Z_ H̽@]0 }?8v? iWopw qHՍ9Tj1އ%Pm͜=E0>Z%2궬s)eHA|xC鱔Ғ~h黵1u܇ԼtE0J]~EZ5~ݔF̡݀kvO>tR3D6u;݇s *4։m-`vy:Y:U^,\QEB_'ԝRwNr>L-M-waaIJFy hT;uCH_Anz',ۅB|W=t@)]uV]O7rJ:뤳N:뤳N:뤳N:뤳N:뤳N:뤳N:뤳N:뤳N:+9鬓Ί~oN:뤳䛓:;k:o;:Ab(%4M-(/N.{A%&M4խ=Y41=0I?ߕnjF0$&a4 IMhF0$&a4 IM4$PNԃh$un.MºIpZ7 뉁unqkt1׭jT3>{>'\?V9(OQ^9r(r뺮 BC(]D85Nt]t1.?t9lzKZ|mגBb.]D85NttMlMl>] !ZImLOqkMZ6ؤeV}z]RӋ}W?<M S,qz]R眘4NSgJa&/QRYꠎOR B'9INh`m>Lg++6"J]qQ(u?SSNgCb:Nf)]etrrR?0:|Flit.Mt.MtrrFY)f rF]sQBܳ}҇d5+)PS60B\3jmmsr.vmN\M\ͨr.Z9Q|_'&d/o1[nE9nE9e΀[eND'o3V3V) 5V3V3V 5ր[`΀[bN_>p+pk8'2pk"H;g0wĹĹ 5ր[nE9{Ζ{hnhnhnV~Ζ{hnmllhnmll猞Fsk4Fsk4Fsk4sssssFh4Fh4[[%PO)'p h+w]l.&M&\wѩ0m.wۻh2?qvЂU~9mܠtEE\KsV)ܵѻEqxm9]ѻzԵ:= ; _4h'* ;2 VX:Ftmoml"3=tn6y~f?om6y~f?om6{wfnwٻL`6fڲl-if%bMWE_rvއR:_V )+MeSү "!c73-+OSлu· ?gZa\mnY~sY~ ם񡘿PXg=q6,C~Xݣ2t-S''Q_=O@+Cy~WJqh.4ue/ƞfphHݪ4ˮjfphfphfphfphfphfphf+yhAfpnf:kP\P.MW2?i2bWj ɉol{̶gl{+C~gʪ~l绠}MuYRgI%uYRgIg&uYRgQK,:k̮:ˮD::+[_?Et~R} nE j?>zo86SO-3F|/ZRj"7v釂_[ƻeKh_m^įXWI~Bom^d~n Қ6ÇE[PG|PިE KGuA\MS #NsNN̯]4A*EԢy:CabU2HߡzE-?R>\CoԻZd?ءcrP#ԅ HZ*A(Z`tkO[K '6QhtFoxsėчaBokl9tIއN&>ܿ_A4|8$Pq߀7$ÈysA}pߐP"~%G]h tP"H7` ox0ހ7` ox0ހ7` ox0ހ7` oxCfd0ހ7` ox0ހ7F3hxo4fь7F3hxo4fь2 W_B7^1hxo4fь7ъ:Zv&"CCdh !24DCku|bS'6 
GNmĦOl8ݦMlĦMlĦmĦMlXۦJlĦJlڦJl<0τyPWqWqWqWqWqWqWqWqWqWqWqWqWqڄy&3a44444 Lg<*N*N0τy&̃f&6a.ǏW`3yf3l ŧ-O'MÓIix4ǻx7|w~/'TwnһYz7Kf,K\$'һRc-rqw=` i? 4~iH9 UF?r5)O;e$dʨrdlyQ\h]1̏#QÑH"J[- ۟ ɹqE[aC10pƀpddG<zn?6n?hdžPC!z0Fl)*S)j5|^ o#=E wQçS)zZ#]]9?QϨg3zXjtw?/F#Bjx!5~ T~zkү/94\]h@oد~l&J]57$(Ilvfb5>;=|>mv *?yӃO^.'MO7k 8=^TN/N*ӃArzP9=\>)]m2bU#H[Ǐa8b,}-3\C KY[b_K8 R@2eB-------C,ua`bmmmmmmmmm! R.ubwCxu\H#/'\ w%^kgvƾe+kS׍+(ީEeOш_/Bo=EZٓ^֝ 5w>4)}3H_[9D)eO?\ϼrx噉}jr]ʈ"%˩2,T6~ߋ.~vTƅJ@ίG28PA5)oN>d!=ez23t.L!B BI I}P֦,3 ugci8~CP2Q;7o9^E},1p3>4qcg8m7:ÇB `CkeK:c ~2u UEU=xDƛV:cn8cC;cn8cCkZ3ݰBJ"'q8FcC+ZğG}vPO9cl8cl8>C{cCN o o o ,3|2` ߵ66666666یz8LpDɼye~YEvol`meᳶz1pL>1pL>'ӂd`2o˹1pL>Ǖ61q@3:L 1q@3 1q@|jGƧvd.?1p?'܅c8cpׄ&܅c☁8f cpׄ&5. B1pL>1pL>|]1 wammm᳹k6w5fsWxxxxxxxljC]k6w5fsl]-E.yM5hhMeڔPb,fb,fbX8v&nbƭnИ12KڌAm 9F.RFDw|p*U1|X~|v8n0i/"us'(ĐXy8V i0iWڱp={ 4J+MҎ c؀86 n&6 nkiڋo.UBT=?ӎcv8np[/R>]M{8k"Zk&Zk&Zk&Zk5L` ~„ؚؚؚؚ aȅCwPLl i#]hɗ+Y#X[:!pvg[t u9wZwhݡuZwhݡuZwhݡu|;r];$q}&.wrg:`h(,xwJRTPC;TmǥEbrPmƹ\q}-J\mͶfvhux^D8vn;!kv^ HVi.!fqvd"9ɥ*w9xٛyٛy9#rdMdMdMdMdMdMbo2&c]2&cOv: eU PvYMfof`G7|؛?\.Vkp1\q8qb{ (@JUiL]瀞z}3_\M?=`j\u;=`jt;1*qt_D,ƻ:.SQPD쀳1qFlIXH_{cJ☒x2n;`oy_ĻֈXhcqJ]<8ν/"o79*!8SǔFǔ1%qLISqǼC;.t5=hr&hr}R{4#zvP4:)pաՊʅ{5O,gCB5~˃M'Xp)sѣJ & *Fz:eߨBhAg}3d}aPy}=D*ѣ_?/BBQ"hkıOPzA~hs፲~߯@T_mמYh WND(.K"Q2d2o*R(~YHJ2d(PЭq/TW|W|ƻ=Z zQ ůި~_}upRz|Rup:v=C={ds^[Mި¾/&h특 l lC9ٙηHvS!ΤN&u2d0Τv&3Ռ.5W}zjZ-(//(+eVΦ(F|B_Mok=Y‡(3̌23͇ m=r4hRvwR"EzzL[_ԮueBoTN:ESN-)/n!bVvrqU8uݿ&Vw -s䶷6Ҷk`mm냂냂M 6oZ\T.Z])PJCO`>?TcUy>D6A ac#F`A{3m&%&c8!bQƹqn`ƢE.*pV,A_@l`6$u}mJ!ҧ \T ,F&XT \]+PٶjrjrS|1QakoO"|Uׇ(]es]^6j?4ԛ&/"24&N5&PhR29&/w-?bZH&-.R|⫶p~R.uTҚmMUTw ݚ,Vl B{FnMl B{}Mjhjd[\ .wUC=^SB&~RW,֌"5}h)5-V ‹™ʲ55-֠kvwbQlieLի̗z"-H )B"-Ht>JE"-HW/-PUVoRՋset'yȻ6*ڨhSktJ[Utt Zf,DZWuUozCk,[5Xh Z`= [&ͻ[S :ԩAt:ꒋe/.sK.t:SN]:]o]mR~Dؠw1_g3EĭWl=ֿ'FJ-UL)\@黣=:2~}Rѯ'iSȳ~hd[xg}Lr|-r9hQ؞zca[uF/[GG5>yлd j;ޑ !cQ:Z} D{* emO'Ӫ7 %1^UqNä} >bZ5?hTB}p|j㳏awQVRN>,vt>aם1D?H(iv,3ݝzwNgk?AFΆFpY^W2`TR34Mwkhu]sC꾼?a"`G[EX֕?:#;YkI{X]o+OC[]8"s;ǻdžm=h%~PoHZ 
wz2FiN_I`a{?i=`g'tSYtN'HaQ>H1t#82}2;I 2ޡѡCfM– *-=s ON5x͈l،XF(RZ^VFFGCw&{B:\֦B,x?xn)U*4 bvm_}h]H7E(p٦]lήM.o2G֦Ҧ뺛vA_ۯ#S**6bۈ}直ؒ]b9ۜ0'mؾGz ۇTl~Pa.g "BVը]WXZ3w7| YP5ü )\ Z}7)g !C)dRѻ!vq/"l)j^R].EJ~2m%Ҝz7[⾚gw&McȇE2;VnCr7mIݿjtW޿:U?*lulul6O XY Zpcy,{C~ۋmޭm&H7Z+APꋔj;zfCsnf3CA|rHi)ٗb.R~ik4$I'96jO.L` NZl w?|DkjC}Q:m٨>dzN[>vEg՝]йˇ~)ESi)-j."l""ީFu=X<͞R_譑>o#F)j>}}fOy}|'m&JQImV6E2Pm~}fo֛| $HU xcoR*1(Ǡr ʱR įAE $`S[|o{uϷAu7W{p5~Q -–B-SN;TSN}{N}MTSN;T>Jw*:N;UTSN{Ww{Ww{Ww{Ww{Ww{WyCU.ݎ:W m4u{nh6n?M=PMbILlf4٠$FqQmw)P6RfCs55F zohL?dcU 4>O6_tn|э/E7=ӈiDåvkZĽtvl;lzWjf5 AZn-΢,κ IYr ͮfW";n.R n,-CIsj`X!]*f㝄]hle~\? Vj`[[iqI撄u0n4 Fø0n4{C|P!gZuZ7uZ7$uQ8nF7$!p\N-o-o-7tLrvۨ#7VF'IdxC܆JNRTt/,i>IWߴm.L6n6͗dsUC*Q?jGyooooC?48B!iygkw!EPp sIX*1RulQ0[8tCG9tl9j9|KO3ޭc E0x S̋2(:ʛۡfu}aq<.s}<e|h*}oQG@Rurzz@*e{Vp@ vQ/JNz_n>P>@׌ N|E?BhnPu[|^BC?ԋx {. ~﻾xsMS&^7ʓ釞Li ׯ^qı`TbY|^sA$#H1&LﮡeEr96aAŔ=E| _w p!SJhT)߲ |HM-i@q!J#;>چ 1S_!鮢`QӅɳ  |Z*dU_I&_lK /'c^{yɳMJrQ5莃9БKY0ps~89pG8Yps89?|W]hU'&??ݢp~ JsDp:t8:}h@w_q|ˆYidpt8Y:?M'K҇;y݋}~8:E<NRS4p*u8:@NNs4pt8c:1wTV?Ga=H+͇(h4:ZV)ŔJ!'íӇ[\d>u%&j+*%ͽ> RӰ#G~ lt>pg'G Ψ3t{8";Cghplv9lz6=Cu`ss8|.:B6_{ʋTjYӡzu3`!/qȀC2:!3qH:X$,IE"`sTyIc\nx}Phg< ك`tz:X=VO\81>1S|Z$ZENZև j+)\5>uș':>:>9d琝Cv9dv0~eG2"Kʯhq]z\W*uqq]z||||||||\W\ Cʾ.N=:OmEI5ԂIq]z|*O~tЛ Л Л [sZj Vt zS7 lR P7zS7zS7zS7zS7zS7zSa0.F!=}1TA~}'0q\qsP Mޘ 777 -`f -`(eK>/"NKT~At '.C '஀B#uH7 NVN;p}X4^8)9)9)ZuZjNhNjyׅdNhNjׅ٤hNhNhNhNhNhNnY $?'QpՃ"uI鳧`c׋&ȅL.SԔUEµ15JV<%Og-8kY Z華W"i-HkAZV%!M*azC uIK:aiŻwo899p*i-^QuV/~BjYۂޖQ/RDt]{Tuz.彴-Z1ꇶ5z8;t B$t֡5#]͍6<Vsܸ7*t X)H ƅ_ )w|!:xwlЎe鏲:T֡]r-(TF7cC5bl.JES,b-E\-AxaCϩko)>JJIu>99hT>C%7]r6 )\UwtAM=7ҥ³{L=zoKg8-ᑀ <ƒ <ƒNXsDΑ <ƒD W#U0FAau5L)Z5Z5Z5Z5lv&IwDԏ|.Zk^dMD?QE>_" RxwN[|>e+o.Rj"QcRPM$_iePQ3QˋrH W; ЪX/E09ߘ|-.Zœool~yy9ʟ(6m *DtV^\P?ݤf]D1taKRQ%yCN}Q֯QPZd[?\ͳ_KV_CbHhȹp-'ciɳM0%y' tg_=Q:oQydITI*)Q rU!EP y=W}hNP p@*P T8p@*Q;(+A~"U-JC7 ho@{ހ7 ho@{ހ7 ho@{ÁCII4P*Q>-9t8uQSBo:Q^Hhoh/"?4FShMq)uUZfސ ō7FShMq)n4ō7FS`yZ<;OhphPhMq)n4ō8yY xsjU0ϩUqVU*LɋsI&Ө UuYWuYW Q׃}^c_\\Ȉ>4PSN;S\}N;;׃%׃%=j[RyS;wa{wa{v )D )C3`%al` * 3$r m`[ 
@ؐOPa :L !3$rDΐ2fWg!6\s$-i3 ZZDؐaC"lH !6$W'W'rDΐ9C"gH !3$''}Ÿ3X#lH Jf}H+aC"lH †laC![ؐ-l6d $  HB![-KȖ%dWaҶ,![-KȖ%d2z6eXL;s)L܂! $\p˂[ Q` 6H` $b|b oA#h A5,xA.c( A.A0H `t 쁮 $\\ǂ=// L@g,8e@.brj\}}̟s5SfLG)>W3Ej3\}Fb5SfL)V3jXbSl1žL1 !8G17+Ns#-m,6p} Slb0ņ)6La Slbܬ.\y #fm0ņ)6La Slb0ņ)6La Slb0ņ)6La LnDnxdU+lv6;h0ņ)6La SfL)v3nb7SfL)v3nb7SfL)v3nb7SfL)v3n8O꘧_֜H~.xZ(9)P:<3<08:~GBo,V>?TD|ݿ (ل&l6}hEf6٬%Cwi Xw)ل&l6k1Ml!Bk^d6kz7p-npY׼ u&e ^7fCA'tf 7}rf3lpnjZp+cR B;_;ЏxOk=حztfjUK> v3`7ݬ?tyEҍSLa0{H=߿kk}F0i؇l4|>I}iMѺY46e:z](̸r^b콌R,1CkspM[s1~ ڠ׎6.""JҗSS JexQ(eXP_P%C-d"AkώK`=/yAϓBj?71Qװ\UZDWU9BkQ<įC'=9Er=^Nz]К5)#"Wm%L nal&l&RuWĔ4x#xc0v#E[. kE/\IqO\UׇwIEv]D.iƺzƺzƊZثggg{sުc_X=yE. EK"{ъKȤCf͛.zڢ]"/=miCOÐ )mHiCJR,DJR,7T m*P.!E7T́Ɔ6T]ڄ)UZ(ohCE*,7T] 7o8p|#H}FnDjᐾCv69P̆b6IJ!]/rM$!$Dvnctp sf̱{P9v3%n];Oh?{ht0nc7sfSqmsS9lq`s8,N_@80ǁ9q`s㰌;lq`s)z1ǁ9q`sE)C 7oolay`Æف/|q_E)s\%80i84sfq9N/bOȑizJ9N3i84sfq9N3i84sfq9N3i84s$BNV6INɚ$YG']2Y)'*Y'C$kd-“pҀhCJ5NfL:iɬ9+gg-,e7*Sߢd/WF13㧺?@3"桪!7N }|QE||t(]5=4/C.zсeQET-*t`V-=X֞@2ls<"2eENNNNAJ776nyOm}EF1ZRT>О 9LPԇ$]D.ku9&"R_>Z2~A~5<0hhh%Ǐ%hhhhAK'GPz?}z?}GPFPxV17LJ$1eaQz~b XUգ/Bjӣ!G >},<,xMx};wQ1PwI'c6&$6flMb}f;f=)ݰg8qVuIvp<;itNӚvS"$>F7%j-$Ta .J:aktXÚִF5FaktXʾ!'PZaktXʾ!bvbR*{uXcJdt]]zvP~5%2i%;!Ɣ&90DFgg$Ɣ$l vICyx`P1B_@ C5Y&ID6)DlbQ(OhpȦ8%}C3يFZ/6q 8<҃+8&8K66A0^qx6q]ᰉ&.@0^qxř83p"008 L \00.q@`1qkɁK\;qƙĵagrLcʆ0΄M K\6A16q8a9|17A`ofo[8{e~-b7s87R9{g9Û9Û9!xO>;B ~$ 0H $`A #i X"؀A $İ#KIH ;. $\pI%%Uh>qI%uJ0o $K. $\pI%%$mK. 
f+\pI0yҗH\K$zUtBg?k~aPpI4DsI,$K$K$K$zU/>:)h%Vi%zBU=ÀOz{PquHJՒ\CQfbgI%iX?>Q6T%~y'Q|%yH)X pFP."lKF+`Y@/ Kv BkawП'Irf'GmF ڀZb/)Bk>ruyCz7jt5mPeaS袪M򢋜_['7_v_ʥ&[[رE~F!4l$\g *4\HnX1`xBb:GGY8?Ns?}q[V'q8Lp?8NanqX 3L3}ȔKSNX_fUp?Ás9x^Hiڅqe8N;ݯCXX]08]]씛R_;DsúMNn`` ơ>BK0ZK0ZK0ZK0JKKs:>:>:>:>j-h-h-8rz}Eq)KVC %fC0#dՔ0[lɪ)᳄>K,Y+%|YVJX,aŒKQ %,Hb$맄KO %,XjJX,Y5%,Xbɱwb %kŒRb %,{',Xb w%ܕ Jx HVR ;%씬&J(ោS~J&ll>&l&R6dSI(4I#)"XI6Sd3E6Sd3E6Sd3E6Sd3E6Sd3E6S$L1K2'\S|h5 g42ؘ?1D 3LL$0 ]e-g/D2L/'OR}o{\מ1=&hff"f&.^&]&chf"f&{'BJAD>3q2L3D>3'Q[*FC*Fg4ZzL5qıDH3L\L2v-FO*\2[*3[*3[*3[*30(1Q^4~A 311VV̖̖̖̒XաP-4Lb~iD [4[*3Gq|-8iI -CQXg:΀up2q2q2q2q2q2##CNX8](ȑxe+^ NW&NW&NW&NW&NWWp2q2+Wp2q2q2q2ӕӕӕר Wzd1ǀ91`sVeVeNܪ|%!c{ha19F3hc4sf1ؔʅCwǰz){ۉ`셓p19F3`bMIƾC<6 s #yDF6Mddٜ:SAADP6MeADP6MeADP6qrV?e+A~WM$h ڜ:-&bm"F&bm"F.]]r.A6MdiYD6MdiG(EZJC3ksi"_&|mN ͩm"dm(mmmmmBPeVjlllllllۅѡCwǰ U8:*UӦٴuiHiNΝf۝/sP=6ɋOY{#}!ԻYaJ_HZݡ5y e\=myAV!=~Ho)m5('NJ+: HI ۋK}7bV΢=x-TFO=WU&U{C KAFgԩV_Q`,?0~BVfˇ\J?daE"nTH)n26ɇ]-9HT; ] L1=įAڦu.&A垙:01x} ? 4gס}Zn ld'MͦfS:P1!tnjlSctKLzꥋRߔ9r"=W'߿[g;u)8g17G׳O3gf>`L#XszmVڳ=%^FRQ*Z5hՠUV ZGki|7~፣9 sP3 C P0LI0La$aaj&ߍ:* oAJiiC05Ƌ6aJU! cQ3mƢfƢ诠2W~ٍv+ٍɖLH3i Q3i 4Â԰ 5 B0yQ3lI [RÖLGÖ԰%5~aKjؒIqd&獆-- [[D N cl? y@dc+\s DT5BGvZdoC^41ʰ{pb1Ckp?!^JR e.O0 TtM<3tҙx:O!taH!3tm3g݅aXRȋ >U@RgZLKi3-u ™A83S͙:REJrŷBHTgLP 3Au&ՙK8s g.AaUo*&T #kv%U%GCf.h*:FMd(^zzܖlRoVMm:hA4 p F`r7jd04L- SK KZajiZa`i!:* A>04 , K DhTՑa`iXYaViU3\TC  !|0|0c`iXt5cjiZajiprjXZ04c`04GV[`Crp=N'@wa qp=Gqp=Gqp=ww[*\b%ë^ASf U8u4NSG046 M8u%C.[rjsa0ox7<fdeK.Y `.~ `.~卓M[2в%_y[0~ _yüԖLl%- _yï- _y[>^h1ׂ̅_y[0̵ׂ!ogo K% o7Lbmɴ̅Gx#W?^V3jZ\k5s]Mb둘 ԃP:$̵V3jZ\k5sfg'c8k4sf̵V3Bbk~ج63ͪaNvڰӆ6iqnfqǸٮެ6ͺ`.ج 6ͺSw8u7NmlMo73373387szT,L7 nSx `6{|IX06(vp$,njߋ;0aG{ج9rmG)Hc0 R椆GQ? ݑӔuNMB0-J-CIKMtmM&)G2)T?I}' $<@Qq8n[2K(04F[yaig晖R043 Ӗbij%dR04 0-A04 0-`iɾ)ai`Z27I&Ɍ$YK%Ɍ$$3dL0aibZalix۶dcۆai6L1 SLK+dcOݖa0R#0s쮟=ɞdOg !{ ٫otDž&7C#{=9L&{34e0s9L&{=ɞda0s9L"( 15_MḟF#zIfo9:$״Ի5չhFMu. 
⑘cXu:ikC!TQ.59ߤ&8J cHD WYX4/kVs9hȳ3p5TbFל&CQӯz?9(xn&d?Tn7َlng&vd;n~ooln߯k s]ѭ&qvdOn}c>ͅ.LeKG2>8qyr<}_{^N'=|'=|'=|'=|ÑD8G\DXX9V۱r, +qU9vU]cWU9vU]cWU9vUc%8v+)Jʱrl(6ʱrl(6ʱrl|ʳOMI|jħ$>5%)OmԔħ$>5%ql'qX:9Nc8v&Qch q?ZLၚ \No?!2GwHz~9v9v)H\Dn 7h݈qt#n7݈ΎnMn yq"ZqEZq8w4"Fш8<ш8q;G8w!pG8w pG8w pGA D n""/䄩4D"OR1o~e1"p|8#p;G8w;y#p;/1XTekE"0Xyx<\0y`/BÕCB;gj7x!c.;Cb.of.of.﹙,JI=SPC"8Zs;ZϔR_BΉ?hҞ_Jg~Q UXH]]>Gh$6 z?vFzNHZ p8@Cޠi ȅgCy)~\h. BKMxWY]Vߎcz; Gu.ՅxZP](:-5^>qvtlbܱw,qbܱg1nAZ7p۱"w\o_sCEۍ؝;vݹcw؝;v__"llPJZ -ÇJ\?0.pDuR((2 Wǻ'կZU.1~e})g6Vm2.(._P _b'w?qǟsbxw/u6ٰ?/7ßs&L~=4O{H JwJP@"xQΐW 66ܰj |o W9j˟˟ռTq;C( cE3GgȎ)8S|?p3_8&qY,w\;eqY,w\;.rG,w\;.reqE"w\;W+rqE"w\;W~t\9\9GgE~taqkZ նڇau^<;yo㎷ Fա@x w<;rG2x w#U;;7jN'ZFc)Z!˝˝֩\xT9i!˅ + 7Nܿnsr I١slH*k@[اOoao y@+ 6b? ? R+Y0Gɂ Ƒ\4@'9ɹ5>*LQQmb4o OQxNpx;'?ގ 'S@|O|(Uj,NK8{waH~`P,{Aua4@FC! Ok3 I̳[IW%V-B?̫L#i2@4F dL#~ZOiW&J4>dEbK-eG숟n-~Z> }l"~ }i*$P@BUHLi2-x-Er]-.LK˴6$tv(v;.0o?ӯNYʙ.!)%]PSg!~,0jvGDZA%x@ <P(J {{ gفv?;&14gفv?;CД?x[bj+0l&m=_.࡬$]uܘ| ңDl )}]S}]) F5]!WZˢ cR.ޘ] 4w.>] 4w.@shK. 5X֘>ݰGIX"skWYTט3" cΈ/]e)R0ŏsƖڅv3.ZT 9c5sa=gl\X͚Ҭ猥y&QZauN\8jPW<0 ]3&0^ΞfMa]N7c:ӿZOR'֓Կ䀘rx'ϻl޾Ν |)gYhSc M䯶Ho/9[ sts|%KoԆkT.@8a6,І՜6,\G)6CAX(hDUrXZJZC߭[}ܻ(-Z-J >oYYulh4 3(?5? pPvW.<)#TDt!8><' UtjyhϮ]&pu eH=;P*n`Qg6~'OuΟxBoy.0:t5-&elF}XWy?ݣ,$k:ڄwzViz5"?hH)I7:e Wa`wwݝM 鯝ݰ?tUM/k#CEِ>աMkHIF*,̅=]3bWm|0kݸ>fyzYlMj廧ַR~>f. pu( |3Vou wc78'h%?|kgSqRY $l?/ȲY6eMI`V3Xy+O<F#`0x <F V3{*Ql"s瀁s~"48ca`0"nh(,xwju`:0p4X[}qo]n廈oPAihي`b0p1ǂc`X0p,8  #XN0p' #X` ƽOoZc~|lŽr5ma0pE" \FᇞcR13Gρs9pp:XF/y'3%o2z^F/y7&G8 a t&KĘƛwŃVFC:FO*pm]TAһ4CE//lvҠ)Q~-DZƐƐƐoX)~?b ,b i 4^D27c"K=8VSbxv&bJ=oc1x{ =/"7`l C!?d2C!?d2 C!?d2 C!ȋ7B=@Cb(Z 6 |K%ߒod[2- |K%{d[C$}rZDzZ=V@_CU}xj'/&pxhZVleBmr$9V,Yb} '!xg:[Wcpxz<\7z[=ƭVqǸc1nz[=ƭVqǸmm R5m l[\dٽdٺC'@Ax3> r7ȇ= (#}(4k0F6f #ad3l0F6f #ad3l4[|B׀!۩OL閹x`t"L) d6f 2Af3l4fu(E -1@(F}"EZj 7F/pT[4|XYCЇ(oe~21QZf0=vF݃h`D`kݚݚ++݃h/3tY9pzC>!Ow_'tح߽8o0 Aa F݃h|Cqi{\=.vK? 
1xקw}j\=.vKǥ~>\:qi{\*8wq{3L070 80˫갼zƁ^(@wx_@38wa?3L"KF938rQ CN%7L8Y`0è``:aTt00X`<`<a,w~ "}i' +8'X_`È^it8=:NGӣpOi?888b8b8L8 8 8}zz  i=͵Ƨ4:͟'4Sҏxz~Oo7M_,`X^` m%_H}a]Y7]BQxa_BExaް#(aGBQ E%``0 'Hs.`Cw[l{l{|<9a) tM“<͓xȳ.T~O<'81"[[YE-LC Ѕi4ta0 ].O`!LB <\ٚ>ȳMG`?K7c:|3_R:E]hh \`m<دe|6$,x Q3 BzHd; ۡ53ZARֶxCo[za{ѫg z2} mK_4kmCi´waڻ;(v@Pbk̂Tj0v(Ӛ!/F hxa40^ / y ͐ y ͐ y ͐мx ͋мx m<_ j"E,lHyh^[!!>Q"~=[͚Ьya0G׀1L^m0yajՆɫ W&Lz fk0^̤`&35I_Hk|aաI<Ֆy~4Ϸ!09?GS~I_Fj|`Z~aW(4另єfrkow k4Mm궎OKmv=.Ӛs⚍5k22Lz\oǚҽ5^vҫPX(fq?Wo,vҋ7vҋ7Wo,޸Xw@"RP# Q#kjwdavG{M {aca^X`/,ׄ'? Or&dQ`2 Li\>eMH +1ZoD0}vLԎɚ1Ye c0^`){a+ql6?gsl6?:yp6\ЎauBwuvF#Dfynt&_+4 >HnnS_?l.j-[%[?PL_2؅++4\`= կa?hV4 ۙB곅bC C?lg3+wa;Y$ K% &iX,,a0+[X¬#߅%fa B|0 K% &iX¬`L҂IڳY-% -jٵ,Z>&(}XAeaP ṀRohwR}r)? <߅uD$-z}~ͼ! *N=E=h*E "z=͋7[CתZnqAmW*0}yDZWw =_f'&=e`oqJv6d! _! lABP/ ABPԯ lBF/dkC&2ِɓџB v rvuolfC5A/kC5P CG1jvSnM5f7즚TjvSnM5f7즚TjvSnُj4>=Chqt? xfCY.st޼ΛwI]fޟpĶvnb+kElb;t3 %Tk-r0 yBƗ}6RĐDDb;QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵuD];QNԵ'ԕPWB] u%ԕPWO\ >ua'EԵuD];QNԵuD];QNԵuD];QNԵuD];Qק?@NMNMNMNMNMNMNxg{?`mNF&lrV)lʦlʦlʦlʦllkWY)pφq6a}80ƶmHv©pj' v j' "fqH8E D):+>ц6|#Bi't vBi't vBi't vBi't vi, n7K͒&d$HDpԆ6 Gm8h'$ $gS-&}~Q vn[yk7oݼvn[yk7oݼ{n /JM􁢰{n7^Ml6vst{n7^|nJ${n7^͇cvS,{nCCP@u-'҄.R:Y 9'G%QJu^#G|ΣӋHU0q JyGR^~* _rC N[:M_T?="<~ Vj.]q;Hi@:?p,Ⓞ}u+C 靈xUvbEC⻢2riAy/tTJ wq=BkzTv@U w#Ӄ~ ^E[ioX 08 WJ+PꋒR$=|X^|H"]|=~E'}*yyp0>xY=yENDOظURߕV~JunTЛ.*)ήh k lU/"WEjPJAk:lRQ.@˞|?7-:~޴x7g a?QmSlzC^QRX-=8$Wu0;!%A<\|]ǏNͯ?aȎnm-û'/.E.-?ͷFvaDuX|fV|yBlWۤz7_ v% > B&0f~!TGoՕY C,9EwS,7r܄] S3JRtL1e@d5^U^1uPpFpFpFpFL1e@ǔS(!ƍ15ܨܨܨܨFY(HSv.9[#[#[#[#[#z2cX)c92cX)c9僂%xWfx ȬPsSZNzzØ]Q6ݡwvG?0!4TRȾG_ꛡ4)<\^u}th݅Q.FWqwjtA`KʕZbNEglZMi[eSRJ)|3b ȼS HՑz^a,@ BKetoBRY*+4uNb7-$5vrE&Z]ƮVcWjj5vZ]ƮVvZlAեVXW+ juºZa]VXW+oºZa]VXW+ juºZa]ǂ\Z LjB\M q5 |Vj[MrIn5GgA61jf[g[<gEg&:3љLtfb1X`1 3XSX9 3X` eJ*n0X`1 bD6bĚ4InvBHs}X`D}f~:5'u6\ !͚Ь ͚Ь Ʋ0n`"4kB&~J@hքfMh~`ސЬ ͚NRi 'f1kf1kf1kf1kf1kf1kf1kf1kf1kf1kf1kf1kf1kf1/s\,b1X+Eg9t搘Cb'^DqZ73ܿ'_.Mfh&fh&fh&fh&fh&b4]lJL1xdY36DLDkƮR^/, QHyV0 jBFy/dI _Bs 
݋x[53{Zο8h]LΘD$&0AiVobLl1 :m g+aeC#lk8홠(lk8ζ|ְ.>h8t.qam@}0o.6rm#g0E¤/_C -9)l6fl6fl6G–PzwKցnJWt-2-2-2-r[n[n[n[+%)K.܄%nhsCyC롢 mn6 mCw67 mnrc`#|P9AjLnhXj͍͍ mnn2\mgtx3Fw$_x\|Oꢄo퐥{E pEJH w=/Aʕ]J]r;f?~|^/O@V{=Q ? xf(yTr %f:3B᤾¥. M<CÅ 7 dk8Z?y[M$A )Zh~bМr攻a\O1;P}AVhůOuͷ Smk_~͵@*ozSQ7*(Ow=⩇YEU P)I<{xJ3!IJ ҹh*m )?` @w^E\үo!OC%AwN5| Ϊ #0 pwz@֩< x3,XRO& zS], 7!HalB=ZLOL}zv[C>~2hڈ -f--2 XHBjT$"ɨ8'6mPyHG2>񑌏d|$#QWMzߦ񑌏d|c4TG2'#C.R P%NS)8AZIAj l!{P\z3(EųӀkQ TAq<&A~F,39[г8[XKZ%9F7rox#9: rH7rox#9aU7VWJ7Do$z#HF7Do$z#ѬyF7ru`E6xrYZSS{eSDԇ쌬u)I-)-EomD'j{Kh1ABy\i=/s+l)Ho7^]r֯zw1/;Bz\{Q7/r~7E]4A{Jځ/lTmR?:AUU;he!6j6j{*(rz}p9J"Gպч*f(<9HmmhE,fu۬nmmVՉ5l)PgbTe|ƇG . K69oSsmSx;hRsd Qg5=pAu{Yu{^η8|/ٛs.B a{刷ZuQe!j1oo\@D]0Ns q(]N):!,t?}zG;G,P;=D}_jH#UVc Ԙ} 2產fK<5kҬI&͚4kvf7kvf7kvn3:EY?7 iMldC Rؗ?RUѐwH4IoxCސǦ%WxC4f%ֻzw[oymnܯzH?hsPz$_}$sՖ괬2s 7 7n(p;oR+hx ZW$(p_ C-EMw{yc*=mPwսg-+i7wCjH9̅nBzOi7BJGwt+j7w KM|( = !7?~OMJwĤE)nyijR6_5^()-lֳj(wTчR@>@ԿR^P^#D@VDE!9h]Ģ'=q;;CXX{buO=aꉫ B?eV7F%.^2zh_}.\9?=6L_Dyo*|hȻ[nݐBIMEҭrLtjuOc#z%V_ xHn륫X X (%.p NS; 4N; \Rz^3+xP -~-zwݡwYzn2 ػػC#,Ԯ!һ A_@_@ g@}|C O^Gꔧd0` ̳P .-(y6CDE4\m% d&m~Sڑ#=Y撞h鉖h鉖L \'t.zng$ y9!"d Hd Hd {@>tJ0J :;/>Ю1>9(T 4wЧ_%~%%;w(iF캍!_,R5e{|? 
)Ar!508@x+~52~҂T ҸoxH94~uVy7 Z*o(HO"@q9uA ԿjT-yGuEj|^@K"M5RFH=)oǘ=T;h\Z}5k5%|luq{`?iܞc ?@@@@@@@@ 5rPT)f[F檁Ou%b:IS:m#I9hWvxS;.9_ť^.*tOք}uC[}IwqP@IМ FC"_ B~-_ B~-_ {|C S#zAί>A+9l r*RPJE!㽥FDye\ʦht9 e\d J{Ȓhɲdw~pn~Cˇ+NhRO?V Άfy{6'0{p/*CU,9(K*eyh.r䋆[>xns4@zD o6yH,x4sPV%SެopO{aH'឵ԫAJ +eE6U -}PSq^75[#^\5G~8Qo0FE>D{̯wJVeAoD >_gw/̖AO ̖?\_лAƝɄH,`&||oAS3Y\\R=wK>iTE_I x?<O9&I^H5[hΚ$p~I&D2I$L-r~ױW߱+=S~-C w;ÝN~]R34u:p~M*į[!7 n7釛M&p~[1xJnѯ_9G^uѯ_9Gk!stW+^SG_m\{7&|_ma{A*<Ox*yBЁ't?ǯ6=@RbρsvQ7$Zhj[Eވ_RBi5xzBPX^ρsx=p֯F^W΁sx8΁s<">rPh0cr.PcpWWWWkrr\7| +e+E#CpSܔ7;99pN99pN99pN9'+?sr8''xM|#9H<xj" .$Z>I_(jhhhhZڿ8ڿ8ڿ8ڿ8//7Sc}#Q ]@W8wZVmQٷUTSk/Z2;~MSF1z]ir9i5X6~ʟe!7O<7*ӴuR;a ?., 7/L^jHa'V+ zs_uB/5{UX\~ FS7 4}ֳGSs .5|Z?9ǰ)?R_ي~[<54C|z=?RT!iNͻٽVSdotg+跅ujvj"%l(Jx)!ZUi ),VlR%R:۴i:.̕ʗH D:ݪ֐rӷ,H3a4{evj@nJؿ±xT}Yvژ'JO]zhC UZӄ r<;-T*ɔv5p~-UsN%y>~$:Q0' DՀsY~,?{o'ީ<@ ggٳq^]EjC P^-zܐ@ 9?~QN@*$G>~E_.rїru_}/w?Pru_}i͗vRP_n2 ^6ln=ps_*Y߆mX߆mX߆Mm,.ZBF^#jȐFj2RZ,")c6 ,aPZu0GrH#9{$g쑼55;y#yF7ЫToԙ$=ۼz[sЯ"BC8x!̊7:2oUCT`dVlE_^=yN4xs8qMEgA&' ~},1ِ?<킙V~jT $p9xQ 4O'xI$^p/8qL)RrPNb&1Vc7ۛmh6/r5N :5tjxbV ]SCZ1DͶf[V]{j6=;::렯:렯HI$n{bM̺N:Cd[d[d[d[d[d[$^n-{/7ٖ=ٖ=ٖ=ٖ=l˞l˞ن҅Ճu3 ;: | pUZ[r~} ;oku}r#O }DOu.Mb:kuV" q:juX갪6WU:Q3P:zуt֯уtJxP)սQQV7T=h|ey.@ wh :zуfAtѣ;ztG=Gwѣ;:i7kgrM}?u*EY!Gd,/R_Ovr%Ӓ]\Կ5ؒؒy{[Ԑ h#eMF,@E bb"eupˁyYfƎsYGcǹsYGcǹsYKP .|dԿR^P^Zf4<)YqI>[=~堏_?&ϭhx,tsoF)[mwsn9ڙm'fd͑͑r" AHo*LX#ϭhxWiϱYi2IcwEPӶ0#zǦ  P.KR1m1mEԭ h@<{[n v6,7RQF`o8>@$׹dNAo$/vjrV] wЧRuwRRݜ@u]C= NWr-}_UwHS?;T9b}"m w/sV:88H_\%~ ?z\?Zh+EѺ]"yHH5%?^>H} rNOK$NȔs>wԅ \>frx}[޾-oߖokvo)Pu˷ Tr'OYNtP!K>e;!K>e ;JN@Q(+ YrB*U#ǝ@%'P_Rɳ0@%'Pɧ '< OT<§ '< OTυrTܱsL)q&:!K3 YrB|u“ǡ' OZ9H[.tw sBJ.aB`#oϡOY`#`#`#=ڟO'Do\:W4Vcg%v [Bb-n!v [BbǑ9(@[:EAR"빲vZ}%ח|ɰ.P/B޿Dڧtuܓ S`-L2ZVXoPvez2]Zڕ2f>YVJ>N3g68w^k1ɏx.zYyz>kǟvy"v`+s%Fߙ;s|g΁4΁4n0,wfРx*aVqhpkIh7ۍ֒n4o7[K_9/kNױ5 ]2{~;3A5;s3;sk?EQ~jvrtgNwtsi wPxdbbR<ZSt<'9}D+DFyHu ?bD;k:Z3V]qwƪ8Z3j-_ZR/h:ΨuF3jQZg:ΨuF3jGɽ?[˼ƲXn{Cz!=t,4 i 7@| 7@| 7@| 7@|u^䠨&m"&m YzEWzEWs0B[<0 #D@4@x}3EDZ)|:7za@ k<; 
FmFmqِ$r-rFV]gݟf!9p^#aDH,<8"yjTEԥnnnn<ᰄ89A72T].Gw{=׳{=׳{=:a?[QV:*k jxe`릭l_ }WZTע}.^_a߃7Ľ4~UrN~,wAYhtznz]j0=6~ae~ٍok-~#1W?&q(Ds`wBw-@seC!悈7[fӤVCZ}?h i/ R .%P2rt#/F6~'0<:(U@@Gw_8a?:fC>^ym9 ;n+;+K0uͳzC{lֱ_yu?Լ =~R+d+d+d+d+d+d]!3 ҌjP.rwQT]J;荔hTr)w(@-\YB%6J?tJ?tS4DГI4I"濼O3w2Tp$ŏIBSIW+ߪ=~Lqcj{׸2>{ǝX/J!WSx+iސﮜ)GmrjX 4c\sC=EVhnhx.IJ[=uuE" /زP $ZmiSΆVEA3ֽlHaoqZ=h*mI9"J^fޘ՝JugE9ϹR1u+k;}1SKm8 h+-ԙ$1w'g^4iv?8g~Í  Q':Bp^x#e nrlc.%zT_gRMC'ʅ@/tPUtQkOE.zEVHfКAkPɱgֆ_  v`_Y_!eBe_eB]gu7<o 3=^g/38];yuE*E:2R'\Fg ]54;_oz  b;7_E5R59nr&nE.& m?]G/J!2ؐx(xQ/h8\)M"++mV\Y۰_̧xa%^ћhNn& 4{RwɞY%DI@Hd6لRHHC%nn'~ o"M(鮤< @'%zR'qv$Bo TzG]PghAҵѕ:#r#(o譟ї=Ѡ~N'`j ?[W{Kw&{Vڊ b+&0M8(&(TLl(wsw#ދHx}dslݲ[6wF6WSRJ$S$1 =h4 tЛp|OJX|iI`ptAx,\D6;qF@rc/0xJ%mPtiZNOyժ&$8 *ΟN̟&$8 *ΟSɟ&v?|Ż'"iO/>*hХ964RSR} WH_P|C(~ؾ(k]4P ȋc_'i d_,cN/R  Ԟ'7?]C>QZQF*+q✳কNgG7e~LOj YE{ESq2e.J4A ~} UB5bЛ_/BzΨQ^Dj RR9mP-9=Sy]DA J!{gpB*怂&_ Ю P=ܞ̏J?FeC7U]8~N5WvKශK}x+SEa6h <[~cSX(t %5|/-v^ux@:W3 NNg):492v#$9! NNg)|\sF^#2n Sj($ e.ܚrk.x$?Si?*o ZM\5rQ3 Ϋ=i:!9~U-.ZrF&~|Ǐx!+?='zǏHqfJ-mlJٴ 72'MK춚.d#J;Kp~jv[{.>v[n. 
iIn5ѭ&FVj[Mt&\Lʸ:  뙷5S <.NFI8% RlJlJ|g̲(RW"/WzyC7yS\ERފ˸hTcx!j8ohoh4,~ oQiH4lMStG;C}瑦84h0k_I>~>S .$17,0ANkf kG`m X`m X~k@69Sc&zkzkzkzkzkzVrh0\Nd0] .UYdi##G-\4A?Tzjo4>mjY֯|m(HS/wש!v m(y_EN싒%/J^la~ }eH$l21QIdbbf/z51-1Ug/R'PC&T4~uuCjmָ{[e &`b1 .܋_`tNujqjZw x7 ~`v#[ta1;5Hݒld'Ʌ':mJ$1AIT`va[~7ǩ<k8C|@oHfLU DžYEFsYZF0($8I NTֹYunV~J$a5IXMrJAh՜+{&%y&g%v&'F΋H+Uܭ`V%7~+jeSfT+`@^$A}r~}=zQ[p9Z7/' 2-Mޭ wM&dx mtVr+A>o;p4@}Nǃ XЛʛǛ[|[|e[fe[f.sNL!1^ (ǁdm!''Ȅ;เ x.เ8|29"@Yo<\s3 |9_ 7o\̆7i;~Quޒ-IgJ-q0on;>:ێ7on;v|P17on;vxsۍ <$݈V`ވFl7bۍnnv#i6w&b[-vVvKn -%x[wKn -%x[wKn -%xoI >,px"~|?-Ikį"/*o~qFH9!_}cR`i^d5(% ~/!FPMQ -ZUw# Y/Z dP mEiR2(Oj5]p~AAd5m <0 .0DxP/g !UOx`>JDU?)>Xkcc}1ޭ/!Us?Uzt5V`*5^;уS}o|-S`5GQ3{7j@4{@KZ@Kf=1A.DFYe lm'lՕ2),x(~]Xj KmE2ҌFޒͶCzB\u0Eˤ~(W]I}{wx]d5(%=j¢[uUCꙠ^xCQ5}?oV]"Z.Bhzۦ^?01Fg §!Q0*cKBx, TAx;;lwjt*oM0z?ِ4oj`J;22 |c7u/?1u !S)/Ň^/'VUc-0qOB!?c ?nth5@ oۿF*nKj:|uxXz |Z^./f:!$f$|QL֙ (BVHt~h&\Y^Tô/ڔ6ǯrh<d6@"s!~AP~ {JIpP(oz~H_FW8 emѿS[R^Yk&uRC.HxeI,OYM^v}:Io4 C^RGn߆ys7}﷪ݮwiQpצ>rsP֯kDF-RyW(ouױQr6n͟Hz ߃d(Uf\ҋs|6͈ƽ~'_zsr}\}᧎峱|6VξX2V)|g w[uPCʡ~v<2 oo ^/=vݣkڌ_jjt}w~@ϼYjEOz}OPk"r}zQ gb?pi7}گݹi~뾷O9/ڔi7B?K~|oA5}y{ߞR-TO;<WKڿ5O;<<@P.AROԧ} D`ԚjnBgUQxR=\sM~] En~UL5HF#G Я{]깥|`P&^7->#[mLUUyEZ_} /TV|YMSǚ_%?PqmYc۽'J@YqMM m~*Drȗc@)xH%P]n<6HUQފo;p\[X*_9.ڪ[NCi JV/A 29/|W!z >\!U`@4ʕ:Z\Gk?8HEdBT_M$w=7_YMMnBwS'T7yrPIMe!}]KMrs} IN~;; JPR': N(uBS/ק/RɻtNw"w6ְݳ!ފj=wUKQ_ oB cٌ=cOAJHU!r}K_/7̆mg<-AtY)k@BF}Q 9i>A&h[ˁxhwSrM0 fɩ~PS"׷njH/&h!9(@YЄY= k1a-9/שԵSbZ XRV=ԿU;o PPm4-żb[Z)Ǿϯ%ئc+pdz2Eo-ϾEW _ ]pOW'aժz}ĝ0?cb5[f[M&ı8VjXMc5q&,La@d 4#[mUMv94 J1Jq~},w>VCjZƀ;gX͕ٝfj餦IM51111111tUz_?h =Ruyz^׭uyz^׭ZނMkݟH#zUUW^zUUWZծHuPO+ݟk`30 (A[hlu׈_?6[4Uཱུ #\4 Wbt.RI]*2"ҨKmXXXص;"6WE ܻA R[ Rh һhWj>\axClXCq8,(Ρ8IHm(Ρ8~8WPkkP;azG(AوC xnk 8|S7y݁Έy7Ahϛ;ys7zs`|́!9Л9Л9Л90d0ā!&*H#èؿ)X1W^ x%W^ x%"ƲpL     PER7x}/={h/wV lV lV lV lV lV lPAA[> Zb /UBݾwzǰ\^yP ~ ~ ~ ~ h))K*ڒ /ڒ-ǒ-hK*-hK*ʒʬϰ/n 68&ƊfhƊfhƊfhƊfhƊfhƊfQcQ%S%Z[%Z[%Z[%Z[%Z[%Z[%Z[%Z[%Z[%우k® &우k® &Z[%Z[jחdqЦĕ W&\,W$<,M$앰W 0UT S%dYyu'AJLtDLtDLtDLtDLtDLtDLtDLtDLtDLtO+*!5,WԹW"I$ 
4HK)M9Rᘠ-T$}A HۼwS͘c&Jn&Jn&J; ~ F7-_a:Ѱ AgktC.aw|]ov7K7gmAu6ѧ3Y4w`ckb<: RgOٓe >Hʿ=Y95} ȞB)4{ ͞B)4{ ͞B-k.ݚBB3]̀4f7fLTj3Qm&D6f ؘNT=;0yy?3v8s҇{EC^Õ6YkbjvZ^)0aoAf?6ccl}gרuxWv8ۮ6Ԕ6SھNc.8` 6vC(|HK8y7U oB=Iӈ"v~,?0 9Ji d39vO'ݓa{r=9vO'ݓa39xֿ߫αN/ؕ~w _zymj:z|sدt[ ewwE^HG]8MS\F*Sx:5kQr ,Fg{_l4UҺ'x\$/L}9hڴOu^|Q9h.R A3>9hJP9h."S"-A՞A2 9(AAV2Q _^n-sQJ[f|r-䖹h9 ׁW" 푴G%ZA@E^*AI].V[\]N#_Կw_s/mȝ.c).!;$g˃o*L~|y)_˃3Gԏ +6w<j`t\mN%BBr\8.J0P_nAlM=қg i7?H4`GyRx1ʁr@TJ`~c0 1?`C~tUĜL~*M[[° +[rPU[r5zՎ-loK{oyKS4 yK25VyK.R][rQJhCޒK\R^54H.ZkH۲4(`AMM#cG21ކWgMC>-8R|$cG2|$cGrަLH.b o҇05rQ0$o'K+r!`wErϤ~Mޓ1zƩ'` R$E,Ȩ)QZzx}w+|n*6P_[Ut=p<l Bȅ j}4>GShjM}4>GSǬ5Sˬ6 Tm31c&f6 U^wPxbLLi31m_Uʚ%d̝31w&ܙ;NW5;P'-46&SǏFQFڤ FSsE}j+G>!5&ˆ#T\w}TJ 9(A%e M 41&@Hi^I>s RK&2xll:h1@񤅒JZ(De2k"r2&&dLib2M|x;dL aư(i°=6lf6-䁏n9:{lh@S'^cL6^GSa̶P曦BW $k@l~"0P&G3=ȋzb:o:-w:"(_Q?=(sѬ,4Gơ)qOn|)s;n/H+f7Z\i_'%O~]J [ݙ{<8((/ߑ_~pwfA_˜1Ü1Ü1<f 30S 3Řz 3txR*-@ `Xʡ *2PR%%{ ~;ZS15f1I1I5If!}ۙ.TF-No?4L0S؝4ؓ4ؓt*NuKnu eU? Y)bX)bX)baXl1@zi W.j+7zXS}m%)9&,&7P,- kNdkN DÚ9ٚ9ٚ{/DÚ9ٚ!E.+Z=Vr8@ 6\4[6c#86]9483383c#hhhhhhhh8c#p8NJ:"qá?Ѡ%@ :b$`_I oCC2tߖ^ ]į} 9%ZWH5^g[g5Wo\5Wo[s\5Wo:\5Wo[s\5W|gBꭹzkޚꭹ5(( `C-o+Gmp 7T3JsimfmfC8Lln`bK`KS0 `0)L P0ζ! `C(B:H5ҧ! 'Eߡ!tޫ!>=n.`,уn>Y H2,ݑa=,ݱ yBJ곎)s:/'M2%Y$R?g#.څdNDZ̈́yWNbDb$FDbD$FD2/V^e [^^j(PLb'C11Qv&! #Ҡv BG7'%ёL=Nl7{Ddzoޛ-ɾvH6޴mn/2f}i}i,JQ)l6v2oAՑVnxlH%ȱ[_߭oS_On9sr{M9O5Sw͟k]&ܓynU(a s?r jXX9VS'le*=h jEY`ܝ'ߕdq BƋJs %m~'u~'H%\ jm!WNyN)H_#A#;*7im?Ww\nsR 6M_]7A^0?bF?0p5!U|o _O e %c7X0/- Fa1u`!o:NgKRypm+fdpe 9K /.8ԟϭS1zt1C틭Ezzh 5 )wpŻԿOQT|5ĿC;w^{蚗\/}TL]Ës MP G5A2rw9mb?96uƮ/w0E"ˍ/xkW@^_Ng7@@ N}#LN$w,ޱxQ{##x۳>z̡t4f 9%325%D.^"/U3K*vV)wvG?0!4URǬ(&!"c#u|m0s4#hwjo_σ,N1Z~ g?0i:9d=یmF62lx\|ˑKкzX~=ʾ|f3 Ij$5z=IFOR'ѓIj$5z=IFOR'[ٳ]`\oaG'f~vz&TLMWj2]M0杸^iV . *&`9u:}NL 11!jZf^J d=bbBLL 11!&&ĄbbBLL 11!&&ĄbbBLL 1u$ů.DozKfĬbY1۬mV6+f͊flbY1۬mV6+:f6+fz`LZlAe@6 f lb1ۀm@6 f{1ۀm@6 f lb1ۀm@@9\ LQlJM)}6&ٌ=glMӳiz67HwnK2g3lƞѴ ]B xW*[bzɖXU(L]=zt]/Hz]>N w{R^X⨪%ϽM caq,,ű8XX cɻZn9(xnY caq,,ű8|jK}DZ=F! d{,l=X ciU"O_tl=XK/RmI ca{,lnT Y8 ם+ PY! +da,BV YX! 
+d݋4,d,쑅=YmGV#j{dp?RǬ`7)um uYm6BV!Fj#dz[m6BV!Fj#dYmVXg3kzZ=c6BVO^j#dzJ[mVngճj#dDz[=ѭVOt'nDz[=ѭVOt'j3Mpv&8TWO=bS111ٝ^iG {İG {t4ǰ= îMa }l æ0l æ8ыBaS6aSص;d?a?a?a?a?X`m?X`m?X`m?t&~k~i~k~k~k޵k~k~kؚؚؚؚyךl֚a֚a֚ViVjf{kڣJq@DTiMTE.tQ*]TbHCaH!taH!OĤջŐC: :a: 0Ðhz,taH!ta'-ė!taH!O/C: 0pI,E_]ڴ6Ҋ!tқ!/ٰp5NW:қ,қ,\5̆|қ,DjR2>F?0VCoDdMz?4C0 ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐ ͐Q 5 1e)C^e(3Ĕ! 2`ʀ)~,^YWEERH)m:CG=mA BtEj9A'H $pN 8AB]닌4Rqt]65^97G#psn97G#psn97G#tu "'#dĝ x2OFܩ _/hE"_D/hE"=wthE/TR!_D/hE"=DA|=DA|ѳD"zˆ0sG=wDi$zFi$zFi$zFi$5}^)5Svj8DN&DNDΫhRk1;r&4KNշS1fj7LƘ}4>A&Mr=/ 9H}%=QzGvr5U+Gc'h#e5dj|>5^|#>[i'gk}ߛvyiRui}FOWՅxqzl>[ϧ G?POa<~RlE?[ϧ諴Jx5єlll|_U:@怜\/ܔ8-x+̰::::z[~xJ)6[a koX{ջ{񿯠"߆ڻ4]4@KL))P_TCsc k zؘS`_SۼQQr(`clQ1 5 dlU [)`clU 6VV~]%H]#`clU 6VVAxԅRt_=j"67gͭK"knhfBOy1/+oVwO.rb[;/ fڭeCL낝xCq-dq1@Rn;veEA`gH]9Q{T>Z*L<<0MtmlKDR*u'&kjH#O(a"X &]8L{X~`6Mz'Ao{=vf;.N8}ZhsjKbt`Quk1G(`W[zJr4 ` G9@r4 h@zh G9:DYy%G9Ѹr %iI@ˤm;TW苌}997@M s,[;>ױJaJyKo3F 'S.N\FrI& 6($JrlҶZHtkFP E.U넦tWW>TgshJx-C4AzFsiϨ)ӡZKT{H_VwIiAZIi  HiASRxAR4_AW6lr 1t$mj<ǕB{ʷ3|o߬Z9 t^r:-h&h UJLj{23ՠj 5Bj'BŌu#8[a@+ ZI*T]7mA^%d84c>Yn[m~Y[_VdA[YPcw]8baS :vmm)9i8smZx(օkGqm]޽Yh_jO<]ʷ^>ąp\{o[8Vyꖠ(XЀծU5/Hgi%[vKn)-el#e{aL'm(|CE2yZ$cm4&AXXQQ3^/2Hi88d4f4Aj"x+O_ETFjC{D>:|:lE#D봩+t]X㴩I%[65gڅtD\.!4]Bh$C?{Q֯oAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAhAh{w>9A^+f???????????????t}0çk->]kan[G|pGFyJ\m~m~ z zy}x-x?x0:]\_s|_3wg_35s|l{#gP$5|M"_$5|M"_7ӥn6g뀓 y|6ɚЅo UWXǥ-tvkkQҥky.];kįjWa{{.B[r ]n-&$DbYL"Id1,&$D8DB[B @j z(՗IKJՁ*d! _; _jfmV`0X۬GZY="V#bX۬zphǹƄWGAYi|QԯZwWH}r3;LiWF *Y}eW'Dߺʺʺdz%]U/ߺߺռ4WGn21wC3d3b3b3bZ}3eMnyHZ779Z9Z zksm%_X6ͼE?Q_C_$\ͅ;@NI0A}1n⭛x&޺n⭛x!tz$[z{% !k R~s( **p).BR\ 4LC4LC4LC4LC4LC4LC4LC4LC4LC4LC8c;qوqو+F+Fz##F; FO0zL8zW̋#pt8:GG#pt8:GG#pt8:>6r1RhVH8bI{]2 %h "B҇ M7{~YuK-(f sd\b"6m]h [ƛ7g"'$|4lŬnytу֜"~A =E?wֳȃ/}=u6=Mh36=M$5hR{K$XJ2N8!ㄌ2N8!ㄌ2N]xcԥma *-݇}8 U״w3NnfU888^ Ssȏ?VJX+c%~wkT"K}\>^ӸލLޥ@/u5C̕!bN]~d삪]/*{*V.k \n|PՄI/I/I/I/I/I/:O,B,}<'y7o捛gٸy6ngٸy6ngٸy6F;pio4퍦Ѵ7Fސ7! 
o}Crߐ7Ѵ76"F$bA"؍J$bA"0D 0A"=MpsoR{vc-euEs-zgn+o[d k[y*WhAӶ;b{cntLT.IܭE"+b%Sy@ZdDZ<5*E";BgdJܭEv˅Zn-#a. _1hJn%r[ܭDV";G XWČ*<Ek6# Pt/#$==ẗs*M%;sH-J^(yQQS##CV7!WAW9U>t݂ss雿KCFo w# >nepsss¶[[[|0;:w9hMcmF`ԑ;a>sǛs].թ#>zv "]152/.atȅ߯A%tD=!7Uv;FNz cXt In$7z=ɍFOr'Ά2SH۬cB=!;eD€ԘGύsq8zn=7Gύs㬹qjNɭ Jfv{)&GB.&lj3&&s2{&f왘=6#^4muA[%5UZ&:ڷDS< s snBVP﮹*M蝀^9!-TG#:99999>1]& 999ݞ0?c6Xf, le2R;f;fA&e2`m6XfsnG}zG}>A , le2`m6X:;At*le2`m6Xf, le6\>gsl. |6kOk.<ݨɁPF7_tSnݨs9񳆔[#R31l&ijx/1z?PD"G] ]RAſ ]*t ]R/w)D gH/E\ '/\[[N BFyj19|%'%oc' N^p2aWS |Fg>ϳ#5"~\ !o)wwB|\jR^MʫIyx|a+ӿ*" )wi\ !)R^~k,Ջy+v]M)u5ՔRWSjJ]M)u5ՔRWSjJ]M)u5ՔRWSjJ]M)u5ՔjEVUenVvdQD;M[iw5jEzWmvCU*&U->Nh gócxv ώ1<;góc7~ J´}Ѯ0^)Zm}^Ëcxq /18^Ëcxq /18^Ëcxq /1]c5_\-#?ZEn:  j{kE-dzq-{]eW,؃zΌuy-tn8o 17pX;j5֎iQc_8rX;j'0kGQcvXO֎kGQcvX;j5sQcvX;j5֎z&g"z&5֓z~vXOUSTeg-YzֲXO`f=YO`f=YO`f=ympn+qz]-OS^i3 M7xopd0rT¶.2rTA_#ԥ¶."TAF)ވ__萳]a[,hl4r63}E"g{^l/rC˵-"=W".rPTj[ED)|ymhWjt&;Ǹs;Ǹ{ Z76E\4@DyA2!Qpb׶Ahsm{ZmjU+Mr@4wӬЛY|pmRDǯNy58~:m݌O[74>44GՒX|槙ir?A0fn4U? O[7 "P| 8h@jԂA|tA~m ħ6qےyڴ4Ժ4`EULwTZy:,MM55]HjjzP/AJoJLz׃_Zz׃z-ܣ{w=h@-AD-ܣ{wh-ѣ%zJvb"ٝDv';݉Ndw"ٝDv~Ndw"ٝDv'KEzN; *H&) J 7 dw*dNSK'RLEjIl)0:UϖR<[gKl)Dt=O+kJ!ųxt]tk,dtiAepg3?qPr~!!!0BB_DZ/L8X8X0 Ec)tM8O!ڪPng-Z0A8>7jᅱZ_VWzv^״7Y/BAgܕR}*)T #zhEV;+{rTdocK@TjN(@ϖ3m?))))%鋲kHjm66_{_JkH3}QE):"sٰ6d̆7_V׈^ְ-^p~69킾_.胶ʫփJ9PU4ѰS__d.k|Y/~p1qD^]Zѫ"^puoZuoZuoZu'z ޼^h){ pBV/DΘ g{6 g{6 g{6 f,pNȸ 2%Y Ё |J=#^qY(@ WH @ ۂwt@MQӷCn>*OmmmmnFѺmnFѺmnFѺmnFѺmnFѺmnFѺmnFѺm4EѺmnFѺmnFѺm4 F`4 F`4 F`4 F3_4Es\4E[4E[4EY4JWCkg x+y+x+[)JVR⭄Jx+᭄Jʽ(@!JR9V[yBJBV[ o/o%V[ ov!V[ o%V[ o\"Jx+ f~k_Rg0 o%`mQkO*%/oeV6oeV6oeցީO*o`T7CVdMVdMVdMVdMVdMVdMVdMVdMVdMVdMVdMVdMVdMVdv[EZ[&{mimHkCZPվT m[7v>y(AzMu7ƺXzc]ou75hیoM˦9izYKzcIo,%7ƒXKzcIo,%7ƒXKz뾬NJ*yc?;*csc?oM ռ7Vnyռjm5ﶚw[ͻVnyռjm5ﶚw[ͻjP S +yռjm5ﶚwsn9x7jMǻ̼jMһIzռwn׻ԽwSnMݻ{7uԽwS.YqUwʝdjA!.RE=WERR3{{{{{': ̟I1=uQͷNz9ޱY;<"JB\^j\^*"LM_AF)?nl:g +mCd?O*%u(AoٺfI\>%Cz恥g^8Cq܃|Pq܅q b0%t6+ pK@Ea.Z2ڠ~qp=HYփ!5o}[xo@'ҩ[[[Wn?''w':OOo}oqyrA5)/)e'''o}o}'89<9<9Gpf~;5'u6\ !0ևQ5lR)a 
~xk歯yk0'>%^d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MV_d5Y}MV_("!ұ8jF:<d5 E (j@QPԀRx+}AJ*Հd5xB&4AUj@VՀ=N4#m" 4l5FhV]Oj y%FhVFB/[fl5:%BoFh2MFh4&d4Fh2MFh4&d4Fh2MFh4&d4Fh2MFh4&Yd4[Hi.usN] MhErBjEzC9rʁwFS'pm;;(Trr]Ew9@JywԉIEmƷ H9Ewr]4A 9G;ho+A-J^lF)]NSgD9)^N S59uB\NS.r唋.\tj_.+E`~p\t.k墻jŘ|\tzF\tva` &$AZ[RIwErE)'4;yBS%KG]訜|G2 $NFj^V[S[ +R[W۫meavB<&wZ]O|Exf,:z{evͤo|F!8STMj_wŦmvnTj_M |=VꪃmR0঄:L,Xsz%J;8wX5\j]Mw5Z9yg% cOX&6hؠa 5s_Ulɦ=`iZN:w3^4@*־Ӵ}+Ś2r8(?2 mᜦU4;}yi"JljmR P99ϕ[uTlC4Tlxws(yQ} xNxη uZ%4i<߾*l~Gh4i =9"\SUR99996} ss<9;͹x# ͑.ةG*a:;u.M Ӽg|p2RVaݫ-faf-ȷMFm4x F:IVn >Ʀ"XPќ4LM& S)Q=%DSzJTO)~ixem|h뒃wa*^Gۜ/"llllllnunnuY0tVV0dm`fsfsfsfsfsfsfsfsfsfsfsfsfsfsfsfsfsfsw[uuw}zBmE(a?zNOgZ*sㆣy{6j͎řރ7g6g6gxBBBB̖^ZESh-l-li-tb~q*t)ZZZZZRE[hKm-UTB̖VZRP[ 1[jKWm骭5-֖Z`[lKmi- 4ؖ`[lKmis EtK, e#mCQ~]%Vv~P6xW(@c'> :*/Y'#6sua+9VRKqRXL77ጴYX6 q+|5_I+#jжOma|*8> ~gXCy_^mAǹY/J >|&/xKwn×cTۋ`-~m|(}.5Pཥ1*T 2<4iԀnTs:WL TL~BYC5YC-KJK|b *i1'\Ӈ%K܃/q?f*uuxrDJ,2$R\5ۘ}.=QC.t% t{c&޸Mp=nWcxi %>0X1RumL/7 _8|GC6A~;]/^4_YiKu5K q–)KHvH%>16܄ME.:hNC );{N} Q εyma~qm#Wa^RK$:tpR4u(J.%un풺]RWǓBǓKZ}d( j0C%7nCi;C6䥗;(*m@~FLq>`Fb)Fb)Q5QQUUU5q>`TQŸq>g3N\?sE3\?sE3\\ZE`šHӝK0N(o}"K\:P/C~ fgKYj\N92`D65٦&dlSmjg&dlSmjMM3|lSmjMM65٦&| k9Q!"AjΡw9~x srk9pKt!B!B~Sa|/jFoeVF L<g31ᙠk} .J|x&)>8{ C}~3MvȄ[yȭ8h,pameFr -R&7kb51=czr:&=4dzMDƭm>'@m@tƸ-tB>dxo\ 9|htBn?CW~ [v=֕=ʄ.dwhm Ĥ[?8[V.Cc6'bs`JtlovnCK|BYU(Uq,[9](RD!Mŝ|1lI/zpr⹨Li /r3[Ȋ;Xh!dcm5V,jFMgQf^ zU 8B;S9لX׋zQ\/ 71r\em36 raKH.IA]{inwU^xwhk֟.!hr;d_XQu0X%:;1?4\9]ul]$#9Rizo|.m'%^|h$>^C wEJ߹Vv6X8  '!-4Mۄc6Cׂ],E[,ǺC Utw@~]+IU{Ѕ|ŎjECG\g7pQh46.%H9E9n#g8֎EǹY/ǹ(m⼳Eǹ= s;ET֒ʒ;wȝ;TܹcIeɝb3 =2>tOiQ-%/CȮLM4ݒ[tKnI-i%M4ݒ[tKnIt~*)13u('9I0ۜl8'jFkwm# P 6x"mRX:OwƝ|qt16Ib*j2~i#6_845K/m6R?akְ5kؚ5lf [Y֬akְ5kؚ5lf [Y֬akְ5kؚ5lҭY֬akְ5kؚ5lf [Y֬akְ%lf [Y֬akְ5kؚ5lf [YÖ["xKo-%D[{q[vKnI!a$S}[p;KD喨[2JTDe@TD%\P_[K JȠ\ Š\ Š_UP.r1WEI22(!2p02p02pki%d36HȠ JHzQGPB%dPBFPB$d&.d@DDdHD&*S=.\L /Iː Iː I˃Jː Iː I˃[nU;P/6w:eHZeHZ^ s%=$C0$C0$C0$C0$C0$C0$C0$C0$C0$C0$C0$C0$C0$C0$C0$~y+JYԯBH*d!dR^m3H Dr>i} ! 
ć!aFr?aΰ/dC%^D=\2` 'T>/ DHSm}Mu+,`Tz!<1 z!4 XipR&*S g;J >q}cXU#ՐMIv7&$dwK] W&FS! 1ݎ^Bƅa$N xDTsىnM»Ix7 &$wn޽wwߺ¼c6!;zT2Sw~e%Ug ? Bs9ADPz?r^s9W+;W= jӆiC紡U$iC紡sp(:魏'~ݕ}L%p#i0 ՁIE礢sR98bz9^tN/:ӋCLY&ΉFDs9htN4:'ΉFDsUr5&הs9r2;'\Gs5ȯW!yHtC!=l`g"8jؿ$=$@0j gR"$]ss`G%ypG!T[6Hi"X!{322NtD81NtD81Nt.,皃X[ĥ\+q9O]5q_sXP6LA\Z5q ]z*ݚ~xN? ֙qRWtKWtKWthBgv۫6ÂOPq7YqC7YqL$*n΢~:\Å:\y܁Ŋ[JexP5?r 4JE S^Lä8\PE0%>%&( r*r*7k'U ~TfY6˲YͲgM۾nk+_|Zy*F9.hL: )K N*VR=fK!^ WQ3"H"8QSvO2i=Wnj/F2;6|b4\'J456dAK[6& 8$Fz%++++.ĖXmcAs\B[Eb dAxx1haО τNlИBc8~[kMi6ߖ7MȪPn1Ŝa(k?5 9^;Q=TYвiHgGu٣:{Tg?G_/2P^2$!9u>ӳxajn$նB [f$Y;I]ͬ]ͬglhoC{_.&56119F=5]4_;l/5\uh~FB ũM[Iԡʐİ- ;/3)ZifXiP\x{jͲ˜p0q䢡z+ CۂXj۴z1!Md\MS4M-,7-,U(I=n0:C9ۼA94:%#15<|6VST^&!/e|0G*tNu?\+!,Xo biY^\XhuӁZFQH³y/!T` VFFF`qEP`ui=v!^:zHzVԉzX[p{Bj!v vY:sّ)Yf]|bыFm4F\i\i\im4ip߁֚"'IpE$A-XiXiXi œ%%CzhP-ղX-\diY "K"ˡ$%"m%X.9_G{p=iػ_d sXxB;ޅJ9 a O65ܛ{SsoVu١K;P`o+ZHTA5h)6kf ڬq)@F⯝a%6kp"hl "ʼn<'4 MnBoCDhF2'BDȜ!"҆(T6%l &P ΥΥ$-&j uu9 Tsk}l / I4aŦٚ]KM]KM]K]K]K]K]K]K]K]K]KckٵԵԱԱ[ՍIt$lgʵ`KA-u.u.u|e%rԹLKM'yšššš/\"BXu\\\\\\\\鹄Bq%%%%%^K8E!dZZZZZZZZZZZZZZZZMCihݦkݦcwΖlY.t.t.t.t.t.t.t.\h!ҹҹҹҹҹҵ;WZZZZZZZZZZZZZzWQkk߅ԍ-e\V\V\V\V鹬RpJJ7t.t.t.t.t.t.t.t.tJGGGGŏ.BZZ\ZZZZZZZZZZZZZZZZb/ZZ8%ѱ ksййййpHx;Y֏S8,kHq^`\.2\e!ԁNlPӌ%M2r1'"upb0xp p(rHrHrH9,ـ(T KҸRO*9h8DE 5hBX%WP&Ը&TZBlИK151_Acj\c*0'Mcӛ"B&W:W:WmFr{p{ 7[vpہUKmp(lCy\02؆64~GlĦ5~\Xwhߡ;uDCs( 塡<4PCC}bOWeմCֹֹvI=jGM5ُ&d?Gd4'̼`=Q}f3S}f2gv /,3`fŧZ|)ƟjfjfjfjfjSˏ}lBCeϢY=gQ,a:kak]a0Ep]0qξrw!ZİjBO^X4@\=K%V}EsݞWߖ%se붉붉&`[=0A-uȥ!:J.R\K}7oȺH@K5NZ[#u zcsb𣁷slLuZ+}g3d^ĎtZIU.%Y[|2x4E`ֈ0]W+=j^ "()~0 2:ARD}4HG,GԇK.9QC~1i1a}#ӫיc5㮗5tw@nWr rSHdĥ@Xq i\7.-=Aܛ#"vZôDoZ&D5b޴Do.?Q†պPuAVU)꾖D`7c}k5|i ߴoOl_W`9z\CG㺾q]߸o\7u}㺾q]߸o\7u}㺾q]߸o\7u}㺾q]߸o\7u}㺾4v1Po\7WUU+~8d6lqw;`0w,w Րmg!i{`=0vv i6ރi 7 379r@nt0m:XӘ֦i`t0m:6iL7鎡1awmhVg씇)aaܰ0nX7,);;eg9@W[0nl76ƍ ˍꔝlf vRƇGya6Gaoh0m|6>LƇu=3t쁘p3ĸb 1nXnXab$ I>[,[3XqĬ}im5WH"l͟++D/ qƸ c܄+{ vb3f`7n7ƍ~՚(@a,'n.DRlxx*j1V"{vy*QjOw].iǴc1vy48Lgi#$~0B0\ppTsKa9*YHNZ$R>x(hNq2Ig o˛/ VST{}#i6`o#19>R#!yɫΉ"gI&BvкnÜ+[֍q7*u .IdvF])\Dg1|1a. 
S/P*e׹:; Մtw _/ߓuzT։Eg\BC[N S$(: Y$D쀈: Rn$-RTzv, JAvPJqْvPՁ+. fHLᱺ=8qKꍔoHlNFXJlM]LjHP !A5$ՐTCjHP !A5$ՐTCjHP !A5$ՐTCjHP !A5KPͿLfB`MY^لSiRr]bu:Dl2':CpȐX=z׬/&lu%ե_,j-!Wu%%#9i3QW3/46Ih||b_U˩lrKtrrf9;I s▪C &mP:b" >ޝ5Yi~;_`6;"nxB@@e▫uU" 0S7TBO^muq)J=2SM tqNy1<ǔRScJyL)W[MAzeJL^Wuq(BmP4Sf6s_;9S:gJL)3stΔY؃mRb굸B-GMh V<;KEEHC5T;P/o~ bpQ/ śCHo7Ѫ:O&]YR'َo~'(;Le1Wa$J+J+J+ >UrI"Wo qbCȑѥE_IRbT].hN3k|1Vk"Չl!n {[ZZkPCwJ:G7Me򦲥MeBV#WzXer29V Ug`VG\3ܷcSx#M1)F~uM)P6ʦ@(eSl MrUMѲ)Z6E˦h-eSl٦hp; h9H(fAh6+aspSlnnMѲjtX@9ߌH(%%Յ:'n'nm\nFAx1P`h M';[rkTnʭQ5*F֨I8`{1DV AS h Mt[E5,<)/,:7ox` 3*DDg鰗r0X~ȉC_Y nz_ی;Ð l+ oF;9MzPQ u & nI҆n^`VlQX?>R3(_ca4s9@mgo&PC[吸 mf *(q6vy'V(vh`vh` Ws{h{h.{} Eu1ԺIJ-pif=[G4IH&B)A9%2&Sx%Ȗc>Nm |[gqZgk7gy:.A5<2SGgRby~uP); CN g<ԐDaPڝ_2/2/-Y'!2/1B)ON9b':J|zgҫgm}'8fa޹#pCvqe#8]X8r<]2S4ڂt| [n؂j3ҪnpEY~ƬM3 7-,q = ٸevє.)x9wՕ(DJl5vmDk?Y ŬE D%6Y'KBb֍D[n9> sj]܌Pgj.*KuTQKo[յ9`h[󄿻#$C׽NX=T2)tzߺS:e o/mviN蔁u^roӿSVxd =mNO[w@ %uv!i{ۃ/8O E;,QP~ At At Aatei>&* "lH !6$†DؐaC"lH ͓G +/diqm',ՋᾚbB ɗ!2$_ːP*CBeH !2$TʐP*CBeH !2$TʐP*CBeH !2$TʐP%T;{W@/5{w;zRz !I&z)7mm%ŧΣMHE>NPO' u:}B>NPO' u:}B>NPO' u:}B>NPO' u:}B>NPO' u:}B>NPO' Eɻh/._tk:5NM_,*^NJW766d\Kɶ]~$Kh9]~.?G_|st9]~4畟(tz:= NBGӣQ#yS[}p`$"+}pnm0n ƭ>Vx[/79ۜ7Njhn9kF9yyQGv 6SAg E;szyJ*1lyՖCHR',q 1vPg uPg uPg :u d-B-y1fƇ444t(ȠРРЏzUhUhUhUhUhUh|KC_ҡUfzի]u+VB96z%DիVҐϐk|zGqf| ?F+DžA~3v~3P+JAC;K3cT *u0is(}n}nFB)Xhm ~87a<T=mFÌx4x/A_0 ާC~3t4uƩ(DXr&9"<2A_ܠP#TׂӸ"$-$VݭcE /mӉP*Ib*EK9 aOL9֚PqSz+)L)+k5pMݠ2ҕJETvUHPf6B.v&.`-K_d D4. 
gG*^qpvjb0zEzR窕%TӠW$zt0Ht0HtyJTQ:a~SiSiS&i6 yt0J;Hu3T@:гhгhQ:Q:/"Mҁ^DTm۞c Pp Cǜ9kM2bHt0It0IS e&!;U)L"$L"$L"$L"$L"$L"$L"$L"$L"$L"$L"$L"$L"K$8DC$g8DC$gg8e=xvS"ogogogog g g g g )v͠͠͠͠͠͠pJDH& !!!!!Q4D` 'ԅKW4 ?!?!?!?Qn0DU6 ߐːːːːːpx9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 ׈ːpQ#~`x8?022G#n.n.n.n.N+N+N+N+cpx: : : : tF!W c)eeЧRm J1FЈC#~h?w1=L|I.>ɐC#S^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#C^#chdkdkd 7oU'G<*$~A{[Uqt}x}xvONk%뗖s8ƼM;swUJTsbӀ7M;(Œ|ؑ '#E6딡:e־|#*_G.=>mY887Xw rg~fWxW ])tPbz}Q6;#L;a);K.egKf^ %dÌt|Il®G/1Z*0!};TkaTLp*R(s65 4Իf-_T 8`\Tַ 73Xo\t˘eCS,r*`0PkMsku uVjK[*ܦpY+&+(w!B}2ygPE^5:~eۋk[}co6jHJThQ}ß2ͪYF-Kt ; Yy`VyW?v^juu@ɋ:yQ'/EMHn>h2lvdxo0rO,C"lI9/*S`g"^J]h.59chPV(SXj7XKVAc/ձdu,YKVǒqp0:RW^K]yRWShZY@TbUR"h©εGW쏋K[eےᛚǒd\T PBc.Sd al &Ȇ alXv۰<6, cش<6-Mc\شA6mMdٴA6mMdw5?6 OBٴP6-M eBٴPP Uy,QtTʦelY([ʖelY([ʖelY(gg4 `1cKyX*vNV.vTMX2V-ceXi4 3a[V8fIefɖYel%;20V-ceX2V Bks58[ߦf PX=͖ael6[͖ae4lS<2w̝:N0w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sg2w̝-sgN;[Ζel;[Ζe\c!.E`Ucaekaekae˰ZX2l -f˰2l -f˰2l -f˰2l -f˰2l a0lM 6 pM 6&h a4lMа 6A&h a8A'Swã~-s&h͙94gB!%h,Y,!%d̔2S"mD0[ !Qf[ "d[lm-BVDȊY!+"dE"BVDh$OBB^ !{!O' Y! d,>Ih$dl B6@퓄IB>C>}b)b)b=C=C=rO{ BT([`j IPLB=C[&!R!2 )bm{HyHLB             V™y H=R?'̊t@~4+zK_S^B~IxC=sFJ+ w Z+H9R^7/KZZt'2RQ_\,w@Q'NFxKukYUݟЉzQ$>`ܶLTK.TwH!BUeW#Gwsw^^qK'וvؘX- $p0ĺJXz7kJ?Jw(1vj,$n=Mz( G"ťĖ3W#Ȭ"B]e*Y*Y*B)ʪHcyL+QyvZ.HdbeU\>za*.a˪HdM%XSlZ.bU81l5YBBnek$*E.d~H\REY u'*ݥ&˔Hxj­Yήlc4TN K-h2ѐ8P0u hh0~Q'?(3zhzhFzhFzh_aG4l`\¯dEѢh(-2&Rhg?I MBd)4Y ;0[hh'}>`O"JI7)&ߤ4}oMI7iŜ'걮ǺP[Btzb B)&Eޤțy"oRsKqJT+Kui. ץ4\p]Kuij YZ=BWLb*gK9[c:WKt.ӥstNιEӥhMR4'dELu<(|TwR4]K\dMťPJ,LȖJt).EӥhMR4]Ktul(.Eӥh/aڥGH#]zKt.=ҥGH#]zKt.=ҥGH#]zKt.=ҥGH#VzĠG zVT&+ u 77r.T<.J1c׎!G~,o9y-mcwWבֿ3]7XL[1y[7SLG1y DE 7j/YΘ Ǘ$1Q1^U ^h،x\+=B>>G>ᬢ {bQȶ<['R8> 7. 
E <<4bJDz+(Lv||P)L /LƾԬ4j1Y-V CB:HʤEщe׬q MަNzDpb(c*Te<CY6L-)D{aɞ2MMqb狏P cD1y8FЙ;cD1r/`NLuK -_='`fcl!+mObB#6:?oL\ ݊˼|3o=Dwl:ߘȚԞ4L|?o<~u1q 9|C,$aylwgꇲݼ|$,vsnׁ}vsnN͹>ݜvsn߻ݜvsnNۍw M0t3A[k7ϋ"`ۉ&$h[&ah^Cu4^r]Ư_O}<30UZ{.ki"l.Thk 7?6uW.vxUO|5\֜p }Ntwpm}Oŝ;1~I؎= ;X Jv׺d -7.鄙XY.+yQ^tً.#HD2]FHt.#e$D2]FHt.#e$D2]FH 2=D c.#e$DD2]2\t3 66\_Ica <6z< JD|옎 D]^tً.{e/E]8Slv]^tً.{e/E]^tDb ]$u5ܙ2]ttF"bKo L䐄e ]2tY.еZs-,Ce ] {=žkaϵ2]FHtp805l 8BR ؄6M8hڄ6M8hڄ{F66LM8h6;n :h" mA0?hEx6!onֺɻ&oy/]UǪ*7"؄(;6!~j™Q8QXa0 Awe+vB[#v]+4 /q01#M H ␩8d*C␩8d*C␩8>$vvNWܒ#U8N5%W}s | \C␁8d 2z' đV9NU8&|cMgU6 6L!o2!Ko,!Koy6Sz:}}!oO؄]NsK\*HS*q0F b)8d C)8d C)8d 4>)di C)8d C)8d C)8d C)8d C)8l ̶Y8dCfY8dCfY8dCf-8d قC-8Y;drZ8d uCFnn}7[;d uCFݐQ7d uCFݐQ7d uCFnQ7d uu׋ nnF-D-7iMZpܤ7iMZp>s -׊In҄ s҄4&&IcnҘfiaB0Kd*Hx̺B(Bu4VOUOl44&M)l42ͦL)l42:L ]`(l^P۔6eMlSLg+ՋaMnS۔6eMn3V 9˴iZuF#Ѵ2<˞)M^|є=4eMCSД=4eMASFД4eMA^Yq].za̴[WLb*gK9[cX( !L)ScԘ25L)W)Scʾ/)bʾr/\싙^/)bʾ/\\싙^x z.\)W)b>Ŕ}1e_LSŔ}1e_LS^SŔה}1e_L3>a:<̇)a|2̇)a|2̇)a|2̇)a|2̇)a|2̇)a|2̇)aw]rDvNήĽ?I:>+-'~-+`[W8W:i#5e,Q] 7 .-FXX%w3Rd 0Z߁T`T QIH%X%䴾kpaaM Q8H%T NCV~p:ʎP eMR.D5 k| "u3 G.\ğTKGY,;`Özbg[ VCfczEBC%V_Ld緄,T&vNʜqWBrEP;ߋ"lCȬ^WHdMX%f"E7!7R_LUAwn+d#CxIkHU 4/l /j"1"% "-(S"Z&? ~iK}%:7TX8UJ/3~RtiAJytj_y,؃r`"\/MAuP 믗m'psH|bυ>y 065mkʺa\FVWlp{.Yj'pxnyE*2N t;!.J2,N"N :awu N /uK@ađQxZGc3*r[\`Mէà"ÀȬ/u^bS1 ]8{X}ij $Bwc6{NƜlJl"]ĠGG 4ƛϦ^dJ2cSz7q(AB4Jf~?ȑ.ۼXvIDO[Bso5kT1NJ`Ugc :k=SQ=vJ RxPp Y}P|PMvhE!vlTS&M6 Y}҄mj8Ow!6Xg޲Sxjv2KchuA%Fg+KCw!d_e+o_8X[*kvTPo #DZL2xl89>F0q Xq;hgǵWQHuкZA:h]u׺L|iVu9 Bر$kSϪ4?A:hS-%t\K:s,iЬA\ExbR|[! 
*ĬvvA;;hgǵA|GvvA;;hgvvhN2젝w.,ځjq*#з]4hi]G|+IJVwkuӓwCv= A+?CXCcD絉Bc3>fJ#ݐ2yC&o &/R(7dFOm]@2xGھH?yP&P"N]3;s0bܫ9\s8dЊ]I?@/kx>LSWÒ_h/Fv`\&d`Ӑ}OCi> ٧!4dӐ}򊏴OEеO 9<:i`EZvĺdh]2d Hí;UD2BX cMZÌ`t>v2v`첐]~#o7?Cs+J~xo~ͥXƔʬGˬ9d 3Ys ;4Et Ns3,vg;Ez< [8N>iWԅ7Yt(J^cÐHoKZp]?@cZ'Md=q@#1*CLPZWX3'ԗPe]]ڠ:H&ֱCIL^CWz);6 ocAQ;%j[EusE,y8Hzbk hkn5A[J @$Z^6tk01nm[^Lv/šJd&{g{-DHk+!+`Sq)Ur S0^[MLލ/],iW&a ّ+nێ(tZ}BP6Ek YbWs{Y/'L2v²Ht8Xoܾ:k6xk\"Fܨupn:Q6Et xtE"]bK5,8 TD+.P P;dyXbC'i}k FCB͗ 뤉׍܆ćK0n}Uiv`]'6j1^b*gKaA#1^3PZ>=ԯ yCSa9_%m a|`&z:-?K MKwV^I8'FB^Rܗ?!P|g*#>+|.F O w=A,q'7EWRh _d/q"^}XUE0ױ16(90V?\M1 vvbjA |lc=׺}7V>z2pu~r[~,$CånCSqBQN] q|SD\:Ȳa"KQw'*ݹ `vH4:qJ]OdDJ.[">8hE"Đ8U9J<.w9TpjsBY)4M;J7^HwՅYўm,XBBB[pYθKTuE*\Û8\q]qUguo WMT$v?‚δ+$W&HP (yBۧBBٞ#Ȟk{Qq 9#,ŖRG[z[k{`gO1C &!a2BD@s%LR{OXUwZdp𲺴n,EC']?M6M4Fc% c1hi v F*nBV@SpA{+3}, A5m5m]m٥a A1h2/52mjߙAKc cd A1h2MƠ&D)kh A1h2MƠ4&cd A1h2MƠ4&cd A1h2MƠ4&cd A{A1h2MƠ4T&˥b ˻2јP&T" X^T({הЂhM4Y& d,@hM4Y;BBYQ4Y& d,@hM4Y& d,@xP~if1he ~`. zR[Ku]gi"llc1cvK&3z&Sԫ^~uB8q+^~TU'db9\!4aə,9/^Zxjᅋ׽6x%̊\sX`?_ņߛF@ Puqxkrbx9m1Ƿ֖K~% xo#ꈷ DnEhcFs11m#c12\Ƙsc.ce112%2b<Õg)1Ky(ka2&ݍ !B|}813a,0ÕhՕhL e,0iv!zs/V򴥈zG(`岰\?|(9q XJLAc&]e>8i 3/N0Uمd^ 2\Fˈ?\Ɩt2V7%2uo 9fG 曇,82\Fˈsq{S]q)`Ĺ82\Fˈsq.#eĹ82\Fˈsq.#eĹ82\Fˈsq.#ӈ[@f˞ss.{eϹ9=\˞ss.{eϹ92\Fˈsq} UD4cv[ojm].ew.岻\vr].<.5[{n]we"nN,]leMAk4ͮA\+2_evջEo֠6x/ڪky b╵7gUl M3iL>2̤!3iL2̤Ǜ.χC'?p47%f&JA| ǒ7cs c2=LCǐ1x FƐ1dd ך]z15<̉k٘!5Kda-?|2QzQ `5,kG. 
i!M=4,1l !<5>!+5K ۦTzRCTi eegZ U͊puY":yH'H\-dġPa4̠6tN:~ڐg<,'!=;gǣg쐞ҳCzvH!=;g쐞ҳCzvH!=;g쐞ҳCzvT.R]Hu"5gEBKP7OpN\s5eTJ7ӇWf*Ցy-ݛwV& P7I[w)W]6 O ^k@G>[Rq RS6_q/1]x?zRQ<>ʗkOaɛ%^ |7&_޹\޹ 13|$ʬm(0}ҾѾSwJNi);}}оSwRDvU/);}gj_<6X?о3/"L1LA\8:UDK՝KL(hA|>*؉%&/H bŠ-u>/nRl ꛱0RSӋ- e\~H4Ɨ T姉.Tb2k(;DM,]T+b"cLYSE0eLYSE0eLYSE0eLYSE0eLYSE0e,K`XfܫT?RlXz[ҁ%h ,ZnY$nYnR@F%%dXɢOf`XN?Y:|`A,d ,M×,i%4 _/M×KU@%q-=}9'oZ_/\/܆WYhv[Y^_IaT^RK {JtK-) Ja/)[DSm cţ.N*}pu&dB_/%}Wk2KzI_/%}^KyI3/i%ͼ4f^KyI3/i%ͼ.O4R4f^+52t^K:xI/%tK^ג'z.-k/A?\K~n*XRK*xI?\KPK*xI/%T ^RK*xI/^K~Q:*xIڧ=cT?k{j')@ՌkyZ G6m|xm|xK7K7xO7fZn `0t_'flt !nN~q)&E=条D<*\tGkRP'B]?T^qIT W`W v`W v`W v`W vsaӫ? +Uծ1{2, ΰ$*Mb*Xt8v.n0!SiЅ B$4H&LAl=/h'zPCk@Th@ L?dyd|p*S9Ð< yo?d@HZ?!? a rrr`4ܡ\ؼ g8ʾ?e7asӋ]ޖS-h1bRRv޽$u%KB$tpaK}3q{`\cyqИssr;Ps!_эƜuSqg1B@LT}5B81B փSURPlE؊Pboq{mø= *# a7 s3R[I ?VSKVcG:[^|"9o jͫJ6*ټdͫJ6*ټdͫJv^UB?ROJTmi`A(^kzRQK)D^q O@&6F(ebLl fn0vteץͰ@߶=@v:]#K_X#=o+m$K%tu ŦI hlzE A ]mu*۶ ºJ\¨qFW ]nFWW1!';jg$'1t+tLlSaޓOC"wJNfQNJN9=4l0,H2\E3C%3um3/_:6AztJN)=:G蘹a)7sZ]]q#]f5ׁ:4R%daw`kU*e84 Sn"4ź쯮!s#ZaӮkiV9 #2#]ckt15FH1üP h=A]]nJ@|/fK}/nn,ichc)MrR+j8\6I`K Ёe/0A3xSŰzXc*1lY-P6W`Sjc#!"CS4@w alʽCf``9-4uZhNiМS:;;;;.seNq ٥( (X:?ދ಄y92\#e*L%3Sbr.,]JSqax(2\~.OE4a- /- _{ 0X>n\Cr_>q(nNNW'mh}8]LϓPUI'*Pb}d/1: ojMJl+B5-p*_JE_Bfg6_c@Eؒ)g-uEn Y+$e]Ko7k[UuD,S`O]_(>Sھ$(%Y_:'H@2ձrJC{Yڦ$ޔ烰hG$ăCʦCʦCʦCʦCʦCʦCʦCʦC(\ TCgxNbW!#6kn6 P|Q80km_V&A۹{RK \om3ݼ:QqM)cbRޚnm l5zjyKkoizU};1֠]5XزmBeg:<0XBgk# T%|k?bJzm60ag k#077oKTKŬmƝ6sfŃ`ز".Ba3Ve}R7"'v1KdM(F'T ZMAA6\8mҵQ6lZɃѰit$:Gtmش>T/(^RF۰zb+]ϚPWzܒC3r"Sk4Q`ʾ&$\K)7 MNasLdI{ s($6FhMbIl4^qDAMiз}Ӡo;C7[ D%[}; 4Z)9[z;4r@nMi 7 䦁4r/ɀ;+㕌YCM 3:+s(=,MWtx}r^}4LП ώHt|y" ꋄX=q}܃>A85`t,DǩӔ9yσ,,Nbߋ.Bz{(:Iaѹb $qY4Lb496Mup"E1gs&f.O۶zFfh&fh&ѥ.4z ]ĥWT$L@8ji|b5mx].-!:jAO >QmA빇AN01pE#8Epk8%1QG/u$ص㵇Xu<8PTj}PA vy|3//Zp&b.W    A!1($MA`$/e|G['ס_;.כKA?%danVSx# # VJli ~B6=WJkaIg35c صK-z`Xc`Iepk>1w a^IHř97IbpRx|W$< NʗD9)_$`I'r'iLϙ3>1B[krlƆo䡴ٍ ύ ȃeENZK޾Gc{1}l3*=B>V..]ȎYz#m9mGF1bPAtXҽ.Wыp͑˷`\u]zy4>|l)Kք]<ԁƓ$_me=ϑ?5S?8GARƫs׊6)mx^ }CCNn$Zw_H^B]h+L"r 
ll:iPٸ)ԞPcFzq)vf$Օ݅C*_R SB1 B>|Eؼ S6HM"O'eSLFI`ĬLbTɐ ɐ 9AFFM/SXk21Ȼ ɻ ɻ ɻu 2z)ړw\'V.\Ot 'Dc) CqJ9]ު; zEJR8^LXȚLhL7T,TUu N,e 7~(N{_','pW:jթ%9])lK̽=RKN W)I|,'ޛę˂K*cyvuśerJYO=,Jsi|8OO{ bQl6F(6flMbIl6&$6ɯ)|B(62[MbIl6&$6flMbIl6&$6flMbIl6&ypI̓ɜQlv6ԫ[O'~Vԓ$6#6#6#6#6#6#66&phl+z6(nmP~8SGGGGG|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|G|uWcr8WHN9)WY ؗ'DkqQ/.!t(.E ˁ\L`]*PFlyq ?)d)j= )4Ws6R@JXW^zCYl\I\Kޚ@ʪHbrPRL7kq BuwPXC|YI \,nMjZongMi6im,MlCq|(YRI'XKTZ* Z|NJ cx,ZoO)Ύ"ApS.slKNJ\eSG\h%'NTBQB=W[SBBBBBBBB8||io*Bc9'v㋻yA%6om^1áYWvlfk uֹ{|9KA 6> H=Q ;6w~CP|#Paixmd7S @&*փN I(-d[`G EG(X}Zgmq4xU^wl#9 ( X8]/Mg1V~jz6Ȉ{6/Lt7Y͠ yF3dz[Њͨj P>ˆp YܦӄzyV=J =BuUUQ-4 P;"0C0{fy6y?y?i2u`5gzWka19W'^6m6YU35QglxoדRr!h< AY>p>yQB|OefpHOr<\}ci8 -ӟi6m>|BNI:)C'e䜔+w@N )9C8%8%BSsJpN Ω)9=%8]&*1,L )9%8fSsJpN Ω])97%8]?qXm@999%H,YGGr\f:w5|||||v#E#E#E#E#E7):Hљ;}|vY?x|||||||||||||||||||||||||||||T\RqQ*.JE(T\KRqI*.Iť % }.1J%$T\KRqI*.I%$T\K,KRqioI*.m-IťM%$WnÌ.MHHHHHHHH, gh=y[?z$z$z6ֳ,)_)##############################6ߦS幟4kSLm -%VR!ĭPnmmϿ0ofLJRzl  έ>'6[}[[}K-~lCe֮\[g[[[[8 PL=m7U6nnn@zhB= @ηA4Bc~N4֞FFAFrڨ~66Nl^UY ڐ~66N%/ ڦՆ~!qxӝ~~6 W''> lG9G9G9G9G9G9G9G9gsn?b?sQٜۏُ8Q<8R&Q&G!Ԑbُbُbُbُbُbُbُb (> {Fb$Aw0.j1!1P7![8BC>!g-yX}n:q_SBrߗMw Sr7h:tGӑM;w4h%w(G>JQr">h]6,6y -t\Xh$aӳ͟'}{ 'Fdc|Omo>|F1>?ZɸÞJF{hO"}{Dh#4Jleojyhf$6'>IlOb}G?=@ltG{NwtǕfOlf{NzG{NzG{Nz)*)~i DjHy(>gّg:@(Y.D:ZİrP:E⬳\:\.*)\,\N}|CȸXr k]t[JA*<b.*cX:ScMk&t۔X{RP:oX<sU/Em=V3WָRQB)5ԴƕָRWjZia&UjsGY$o%7W<`/?qL E5\ỄCCB_WLg?K8k '9ĵTࣻ_99'6x߰ll_?jrYZ======!eZ=Z=Zp{9^ҎюююююM~ۗ.1ihhhhhhhM٩yD9੢9ԋ]z|QVt5kAuM+:Z״sun+8@tM &] Dk5<:6^l@tM :yZTsuM:ytc֕ Mz.dn?68<9d!a=tnbv\ع綞'}u.$gzɓ tI_?g%?I.֟IIc&G|GdGdw5&o׮#2O履SGīQūQūQūQūIīIīIVmLմjc&j&j&j&j&j&j&j&j&jx5Ęīi%$^M՞{īJjJ=jxG#^W{ī=HJ61VbYG\#.Yg%ƞZN.dB="gȳG#y<{Ĝ=b1gG#s9{Ĝ=b1gG#s9{Ĝ=b1gG#s9{Ĝ=b1gG9Ĝcj|N\oq@t >s;sĝkŵtaȸ\Zq-V\K+iV\)zk=ŵZOqVȷq=ŵ<~Q.=b(E" .BwYrZ9qg8O zqWbTL.K\%Γ =6PK\;5kĵFZ#qFs}DZq]FE<]o/?qLPa6WMlCq|(Y+R,,C;eE\-| ]I/ _ϳBqp XW1)e㏲GlQ6(e=2xbixx,ǟ=-|Y+a*GT2Qs=8{R*YxPh0ܦsp2f=o*L\y8=ǫun( V.?`HD#:Xz;ކkqaYZ_uن܏pj幇wAO0:Oy.~93{EpM^֯ 
l"`L_ .MoNgq:Y}zl,e]b;7&d ~)n<׍FMd<:|V#I\W]5L\u\ꦥ1TinϻRpiZJV~Gpٕΰ;FXyʷgP 莍s quT݂u r-e9K]zAkhBmC١2~tM͸D1`_Bf ;+9!ջK׮uBv?ǰ=y1CD\hݵv ,6ԒC \K?]ȵ) 6.`q3.y?g9<y]t :?ΟAϠ<`z—`g03~Abpq+VhAEhL ;?ј\䋹; tQ)8MA”Il]T2M= n]dpA˜τc)\*dqԇ'{ ?0Zu!0KSu:n] ߭^g1xUEElM_UYgɃ2/v!id'*lAvneEqI~×ҏ'^/{ ݳcc2312121212#O~121r=xJ?s\, R_K/߀] K/R_K܆ 3%х,G<.t9*F<8FҨm]F'd)4.3  뙐dU]q8\ H_O^مoOۧABmR=,Ciӽebt+^N]ׇD޷WWù(` ]s;h2gc3;JPG(`8x6!bk\8_}Ebk0l%@m|o ۫!wkEwN[oSQQ31Q,;~MwvT1ιswEN:Aa?+~<_RL tp)IbA7zhOpqVZ Gv]ԃ8N1w+Lun6CÃnB=u]zx~6gO'_{~XlOlj ~)& W:{~%Y8y~U8ǙS_I^ddz:Z2r!]xvug!݅x[QgG!]xvEg!xvYÉg '5xv9g t$&?s#ؠΰ&JTQiZ>ʃ%*/n3%ʓhᦫ/2O/1Tׄ ӍQTGQit#}QT&T%[4Hh}죨>ʃ%*M t*t(D|1Ee|x?oo;}=zX2g{fSKDb{ W:<@T&χ؟g㈢|=)C<^VxLV /xLV~rCyxNHxx=uxYc=c1Ye`xٞV)cx%FPkBޡFГ FYxlM2Q.&PK%zI&ʷi4v|;I<6&(NoɷiXxɋi4n\7M&MkÚc$ir49i{ZNO=6zcSSV69f&zxq}ex+-/?]ΕzWKp9W&ljljljljljljljljljljljljljljljljljljljljljljljljljljljljljljlMpN!݌ΕFJRt.$JKu^arν{KɥRig.BKɥ:/0Q(M~&?J뼅Gi[8LΓ&Id'MΓ?`r49O'ny:09O'MΓy!y:/09O'MΓΓΖ-&I뼅 9 (KB\42%bGأ@Q O]4}20\(г3zs=u<<"HJ%`G أQ({=J%P^ߢ lP ׃J ꇛ)D_d`V6㱄x+>-lR&uGns W^~܇~ҜO:\~ҜOuhS.!߇4W2%l͞؁c$77&B ;!!!wVɏa:dr:dcO$t 2'DL4=}0#QnFs~Ó`M/A{c}u"\:Kq"@4= D(/a^rK'JVǯ&Zm74&7}ȴy-%d ]%K:sL"_=MщMwc'"4=l֝qݩL:HoUw9no.p땭F1(jV^#h{}2kwe> &C5c}HK\Ǜpg8.JK0OobCK7N]n]X,%8avd@eμ qg54gSmo {ga.[jU*V>RV5fp_ƀy!?j8<;66Uv tZ=T DutD옽8n 'btl 727:ijD\O>q* 옽ht}Q5{s*듊ٛ_fon+K8C܇OakH\#oA6II\&qxYP]diû'<4xi<,q9^CYtf525辄]5@z8Zоtg'@ԱYyCdai| H-龤,KҳaDL#߉"2(dt_:aH9CƲEpRz+1-tĠt߭A1$DoWJ/~m,c~硖76d&p|9tXm\A <X uzH Vk'9cstS [HlN4"*fW7k Nbqg5bL`~Yfef3c-&ht!9׼ ,ρ9p4lY8:qUq1>VOG-{epe[YY ),΁] 뗇02O<|够 3knTa8x䲓Y3ٛS$\` v/tQ/]z/} t6GǣkRC?DQC?DQCށ]1q5Y30_PlR0"2Dиb\ 1.C!p'yAbt 1:C!Fc1a9 & &9ϰ~0xAzhR8񚃈nO?pu2 .#~C_gH15AF՟t3#6 "^9`~L\۾`&LվqK`ۉ&AJd.1yτ}<>c]יGD$N[O }l\28er3Ee;EN02a9%6`x \!Kv,^p>aN̙%D/A8ſY5yXIH)7΋X̼[K".3u ~ߺaW? 
:S8_;8Ujad L=j̈N "XL<;y!,V|1Buct3k 3 MME#ltю .˷/ntє.3]fLE W]KYμtp]!!opCS'6ceEH^أ~ZMؑ6Y@Y@Y@ ͘I3 I4 IRp(w"!L Ҡ FAA844%(H4(HfPiP7ip3JP7iAAA3(HL4$(H4dWI$ U"ICvHҐ]%!R5dWIT qw p;lcup!CvnHƐ!;7Bvht!;4BvhЈ+$W/U V!*DZHi"BU V!*DZHi"BU V!*DZHi"BU V!*DZeIM |z%[8DKX%U@L&+RSStHlf}ƞ[RtKnI-3)z&EϤ=gRLImTkA?+Ec_dž'@wxk*a)}.l[I[Ʒy*:*[oV3}v4/'LvaIFVcIzX?rC_}~ɓi.v gw{L]?cR~yT;vؗY%owV6vs V:NGؚ)-$'ƛ%uIږvx:0ךe_/3up%ޗY%K )r=ޭw{%X@o bP$ 7i:LƃY 0tȬ`w_DkqeP]F;=ND9. a;']3GKX:58ȈՈ`""|g:Du4ֶXY594 qAF<1 g]Cm5 Tab Y@^将5 Tab {OZl\ 7~y/seqہPQy<*olrx ?csxB09,8)a28M,qw񿥏OO`J{4YTӶ?=ǔԫMjD6ѫMjD6ѫ펊#C܇KUNI礔~nwT ]DW7Ս/3LMtxD7MtxDk4@7Mt{D7Mt{D7Mt{/ޡzup:Ff8Qk8Qk8QÓ8Yru:Gf#ΑYru,WXs 9u:`<%k<%k<%kOOOɞ 260222222228%{YIdeԿ˂XXXXsrl3Z]Z]Z]ZcݗK8oUwiuiu\8h.Z:#4ٶ)H.RfH.R"H.Ҹ4"H.Ҹ4"H.R<rYH{H!]GI:]@5E5E5EݡH _c4(\GN#. K<j<j<j<j<j<jåpNNNNNNNNmpCCC680 xxxxxF#/5/5/5/5/5/5/5/hgJgJgJgJgJgJgJgJgJmp4368 N{=Ľ,kQ' 6#Ĉ01"L&Qĸ01.L ĸ01.L d8j251:L01:L01:L_K͢λ>մ>F݃5'2YF[7v`]#OmNilsF@H%%sS_t}ZXҌ;;[\iGK?h*{PTδ֍k8`+go!ltAaYYYٍɩ9TetIDNpfR,`)ZRVw~[#]fU vu? U[?(s^ l8Q-boWX'pO*`b/ķZ4)8(cltMDq˥y{4 M7D])2A2A6KO}䐉SUEb^ꍕ6ݗx0aB̖%dVL&_eTT$Of!'^;쬌}C#iE6"l.l­;ll|u&܅9%HrJ8CfǽDzʲUG!3^ڧ׼\뎐;U{=堸F፧a"6w֔ӯ# ,]kcQ&>&,U%j֪^FN%aM<%wj&Z]p_ͷTs#B\Ohx9LAh ɬU6V~93jMpqvNiR;MjCt XYeYtv>~G4=tل ;3cY~]ʯK[~&l~'8(rqPJJA)9(%!7?8H͖@BJJA)9(%ՠh V޻diWXTRC:ΐFy!gwCĐ1A iCľfoAUӘl=YE_; 5/^ b9 :sOvɚyj;?{ZA Rt)ZXo~bvWif "r)-dJ B03a}{llm 2Jd;U6H*;4>cU_3yѵ4yaI{(V \w5;@pߢкӤ(bA+,ߺpȷo,i=|c~G0uG(5886Vs6 >9(D;&;S_s6>~ۘ@^NrYIQxC NS&,陒)ip\%K='=|%YULAnϐo?[0};9Ѽ0$:jVK48M4ND48M4NӸ2gP48M4ND48M4ND㴫qcl08M4NK@|Ŀ{% %h&iqh&iqhZaiqh=Z%5rdkɅ}}5iwZ臰;-NKӒܻ}ɉd.Qڹ_2éMkIP!;jA7Ὣ怒Aw{@(Ł#_{=S{ f&^u>|'&}Y?;s}Ţg`ךyއ+rlo`'"9hT W(Ӛv `^(FFou"mjw&bA4 gպn.Ol+{D{D6ʡ}[r`4V.[Gg{J8oc܇U够Ō);f;55~yA 26K"w9X {m!az{/Oil5!o&яKa tIVz`w4`,W]vac2I4YteҹL:IT}C7N7.g-ŞU.xpDqrA3]ӵˊ8k ZF~{"hc(6}zxwldw1(=8sB-Z=VϢճh,Z=VϢճh,Z=VϢL]2uY8-|1b/lqu>|@GDM~EOjD][j }~ш lh ~p2!eh4//)D/K&.aP/K%:`J}D,1pKtnnXb-K %zbX'%zbX'艽v 5pNp9hsڜ6mNJpguִ4r _{~Ww" ٘)۾76 Ρcƅ9 Ϝ"9&s6"qݾ/[@2=oo7oC-/l9}p3m]liXqv.ե_˽]/^B_t.l?o]K_t.}1jG{{DDDDwn@tn@lC2]v[ffރ{]6lse 
~qGK(ֈ3JYw;|AAAm.] l SZI]8_J>W2p9> QŇQ}>(WA+=8ႇC  NƇzd|Gn|` uD0x:i BY !F >8*,b'"Ce4dL9>M@cmcNǨ ] "Sw,EX =$CS!CJV'q?F$QrA46A`>x?x?x?x!:=bAGh A`(8=t@+e/ ^u6&=$KdB5K5acթ B<@p @2KF3Y&ᤔ2Rfr<61)9&%ǽ0`\Ҁ11ds18&k "CٌYkF]xKpQ=*KUG[7"ZfX}pL䇋!1K/7hsx;g4.qg\,Eq.6[0oYokb 2;绡6Z,lE T-6ƾʵ@\'q&?{CjAd`:(׈6?c69l젍aZ6vAKImAtvܳ wa 'Y3ڤ6M:h"S6锲t`)e't&IlTjtFM:eN٤SʲOʲORORORORO6M:eN٤S6H~vUWvvn:]?[o V8et Nm4õ6t"Vlv"P #ޤa;#tЁZJΎXJΎ]>:snyP`ۥvrfr/t.Wί^gW$|t}]܃e"-]KŻt.]Km9۔s|]&V 8]|^IڍeJPmP&D~ƈ< =tIDuv R lu8~rO.IYb.OiG_qO~GF? K%Ó& 9!m9򰉻3={a4&U;Nى6;f'[dKl-uNT-uk9R'[dKl?t:v')-uJKR)ڼ-uJK2)vJj)vJj)vJj)FٔV;NSZV;NiSZV;NiV'!T`LۯۯՕ`&H !iNc@gV2:hD|6`2 ҕEZ6[ j̐1Cv4ځF;h@>qiZƑqh.9 jyFPgN/FtJeMAftH99Yփll\r<˺/$0Ԯ]:ev +g.Oi ゔ3r&RDʙH9)g"Lj"3~&vk" MXIhbǚHEh"MT4&RD*HE[]Wfͽ&\npQ.zCšPoIok[h; 1̓o$Du )o÷a\-.u(`Y;@8XSUuԡVQXlrF/= Df[@).3.s9r?oΗs9r./s{ir.-Ͼk{Wl"Lm\Mk^ȳ]FU>(V>LW\)_[őjhfȥ:w{wIOqi.X 8n#b'ncZ(02~&d~ 5 =ŽYzŽk!S~ L!zYU+H.,5PmVS*6,l %12v&vvzx Ǧ.M8$k'=: . <Hw ˅}c9äkTgyxgzyP_6dy>גj-0&ߎlG]2[b 4#3=>݆0gi.܅0 zBTxsOOҞCFHA 56|KG&KքY&PUg_+&v/2/2y C:u )::4l̷?fy/~7u~|lL~o%݃T~Y~-/m)2`Uɛ |v#"߆mT'ۨ6FuQ\)v!9oEp;d`FWā t7~.2bG9s2`A,.z*V 6^ z@ŗzg/f5ېo wa vHmN&܅O|;cuǧ?:zI~3 &<a؄g}%hJPiW)Q4JFI(IZm];g}N_\1b01 p =8sBƵZȼ-d\a!9oH Q%jDmuL`kU)oż]թw~qwjku`l2dwUw?-.آ =^tqNwiYo D45MMDSD45MMDSD4S 1eI?jX|V„j"&⫉j"&⫉j"&⫉j"먿śbY):VuU~/{I`#t6Bi#t6Bi#tN]DloEqEdY[v\_Zn;x|K~ &|s"oBsO4\nŷY}  xG鰛v<(~+',.ݴeѥv]&ٱz8|BҀեKR]jKw[j]*)^6{٨X&%2#G]cqY~;]ڟuqgp5` Q[.=೷U`I\*vO] 7h =vϠ3h =vϠ3h =uPAgp88P4`#0oPXF]q}hterf(>$Zqm"gElɂF2HJvA}0DQ2{3lq`%v !r"!r߼!r"gCt!&nnb CL!zb'!zb'!zbմ&Ą%&F:aR'Lamr69lڑv<;C IQ7)&դT6]߷~1볷9`"bȷI6 y1u“kr69@IQ>9* w :< R2{_??Q?Y;6V~[F&tMr[NS攮9I1 6=vqg\S.8&9[t)qJwtG{dUM {/nzҽ)_c}( SxV|բEkG:ڠAj``!ё,JDXiA>"xK'Pi"{Zi_(煼eI7LbnWY_1<AZcqoP}Hx?l]OxU׺$ 5>FOa'υpzU~~$Tq{[j47tƯtfu6:S7KVҶ)u˷m 3);x9$.is~LϔMoK8ߒ.Iϒ4,IK.]opmd)~ f[ Iv+N56Y߽?]$^mW,zbI.;K!a? 
6l]|,J;,3ˇO膽K38k\LۻqYy{:.OlL֥RvvdzT] ?ŝGmQ|􄱽!Yoc19koc _(Nu/})Sd{l>E,'I/8Pѝ5PGwf+# ~V+-?!wNDrxMzcEQyψk 鵲$~8:qclDȅ,q6"ӛl}Hjq3Fg _Q;q SƆgK>cz`/h/h/؋&]~iK/Kl%[z-}N-rd6&ݟvwݻ]bzOX`{?f`.nXK%zXwf)I`Er]b.J7K)NItNIXK,_$mʷɸV3af.qy嘦OJL/Z?ntV1y?, dzx@gn:9V؉hQ&!ؽ"AH }#6";j^d 8u`zSa{S">{o*\c r"wu{i4# +!gϹnEe"1ܯr_ zn/Ox才'.~W'ٴo*Sp4k&,Q:yYμҹ":x+{_} zYpq$!LP9N,r{D;L;}+p"C%(NF/ * Gn6n۪Z0$l6۪94۸D߰Y2B1av 1M1Y lNL`b3ƙu}AYoz}TbUmu^5t#yx~-k{oٺ&үQ?6|+Ұ3qF; %^~=n]qaqlWSug R) /) /) /Ʌ5NfW {׽mzIиw}**5no+pqF։NN4 ow=|c㫋HN=<෺A'!.JYN$Z;5aFDf/#]C=qAo}]܍q Tawy~3dccb/e?X)U8B<0e,_bg|.*mQo $gnE޶wHܛ 7uwQ׆hڛ;P)oX܉oXaM_!Rc^Vlg}E/4߀߷>qpÛ1=xo0i,>geib/k;S <]uK,uqgj𲤇y.+!'uo?~tH~Q®K5ӟ~Y9W۔zW?F9ˋdgn,!^g,E^gl^40#'39>&܅Di{7Zݥc'ƤD`᷶_n7F؉wMe֛kEpTQOOd߅}*I?ߞt?B<%=2f;12gKĒ1?#KD~Dˣ*" MD't]D-EKt].[.)X7fH^l ;֑ epj I4i&S:s9 acMm{sqUwο;ߝR)%Rҏ|aqUA z 顦sιz\si*;Meg{4{ 0CwN;~%"Ё&L'닓..5 y,7n6~l:yѪ|L;"7-\f.?A$y14e"\ăxpiFWb? 2A}Gwp}Gwp}GwQ7k/BpQ0(B)(A.hmm킶] ( %RP(?8?8#kɃ"bz^V]rQ,#1c=`?LLLĕIYaJ~oݖL9WOMLij ?.;_=Xfuףl~hH\bgdB\vwr[~6ooӛ1m.p&1)(LprF!9YɊɊɊɊ>قL-%B 1B=B ʐɍ͒f3),. S.̺XҖxsyNLLLLLhm2I2I,1]0{,S( S8גz-גFqr %9BIN$PS()Jr %9BIN$PS()Jr %9BIN$PS()Jr %9Bɣ%h@Uud=~UrN*!9zH&Oj&Oj&Oj&Oj&ϣ_61H*#9H?>#9H?CɡHCPrz(9=JN#R:{6!:*~Q)zǩ:Vw¼Rrg^QYP0eaVr*9JA%_9qfqhvA0,uBl @]3$MC$tǀ5e'tc VA27wق8: NYʷG5&d~JEwxNԥSF)vXba)vXba)vXba)#;,K駌Sl,eԟbS;)eԟ2OIhFѳ~dp{lCn[8+}?DM0{4S<|΍ L!ֈH\\,D*놲Y]WA' Nd`, Uuh`9[]Hv~GGf#3G@l@0?ߨ~OyTl\:D"] n|G쨝QF~"ŏo l3r1?alv$?y?ʽS)I .K8K~Y?ʺQu]\.+cQO,?R检[L* |!@𵃹7` ;{: d2#3#ߛW9RK3;qr96\cȃ<$7 a~ p m&Պ3]84},_&~]x ˷};Y;{..Ks"_pĿsRkt //]w6f)w2l孖.0a̎xǜ_;eMXXNtnަeyh$K;K/0۹I;7ce s-a >-ؙM#esJwK5풶uywO{㣟0_fꊾ10ks~tv[n$>RnUU{fT~꽩^{>gJl x( ;+c:Id1~_CZ<;;zǚ ]G,"eQ#{%T[.S(UBNӯwevɷяKYIoqgz]f ʄV.Y>~3mRKK '6 .LC!Őbt1d2] .LC!Őbt1d2] .LC_Cm[B2 .NN՗OC3| cghqY}W]Hŀb@u1P] (TcjkiQ/ U5`K MҠ4h!{9UC4v?D߃F\ i^ PL `_b{iFbA/uAP7o|7oHY=?2p{Adjub˭DlEq~ks_d^-fs 4 cmg3d{M <%/X07nfCǾ.σդ%=nzЀ"Bx7&egca,ǽM݊QBzpUҒ,w£t,^ow EʶrNqY0Exv/ve;IJyly1/u? 
IF߻*0Cc1W1Yn'lҮ]Bil1*ڭzXsLb 1ӆiC̴!f3mv2Yh ^~Ңhi 'f%46S?|;̴|bqL` 1?,i6|b 1߆oC̷q/Sn)7Ĕb 1冘rCL!Sn)7Ĕb 1冘rCL!Sn)7ĔbwTf_z|w7^~ {N)*֚xc'k%SLANj؉S}Z"GBDy4%0 ݌v'8JǍ{"+^w)93ΫZ#w30m ZW}+cN6Vu0pcWouOӹ#sG8z~1y;eV *&olBNy*>-K|㾡l#drҺv8}aK^r &,~f\{$?.26׍ 7] IrMOw8Yuv'Etԍ_V^x5n& |ՠOݢ X+zEU&>9WTfRl.e}5 ~[GVO2kdWؐhËNe3:gW|5bmmWFOxgM+_D I?Ybxe.`R!M6&BDw$%q-kWG]xO򯇼t cp>Í`Ý(da2?{y1|L<=\Se;|/3%pIcŒK*Jj.?XI+% J])&D{C&!r'E/K8p]:d{Xuo)2&6&Dhb=MV6&ZD+helMV6&ZD+helMV^uj h~1p4;\H\0:4 C}p_84 C}p_84-/n[^ܶph/>F., ;/B a://.^]x𮇏U!u "C}p_84 C}p_84 U}᪃( ʋ333cŽ " llX?&ҏ틇Y{NII agŏQ9?M\J.7|k"\e?!0.XEaMW]VyI2x/]'^fuv )i㢚#jC=oE R56^c3:BY6:NIqbw@ub?3~VzqbY'dAnwqb;8NvyO}?tyO}?tef,3QuQuQuQlj:o 1Ơ:o 1Ơ:o 1;t[&t798ͨn 2luIWz (g,|C7dEAq2󪣍9,>q;A~4r{.ݹ+q0 8{&Lye QSǖb7~rOcO˓~ƸE֏s| ?(L0%v,Eh&ł~-dgl/3:c{=5yWjqZZ-ldo ?.S$ I,qx\,qx\,qx7tSX](v^liA5D2: X?2: Z"AK$h[:omq8|0лqdddddEWBpN6dd;xN_u^yU99٠b WeB9٠ wiUsAGspp[ kޚu'(Q&ЊVq_yHQwP=pD\sz0Z;JW $lCu!q>Տoo~~P.!8L CƩv$ڹImOa gIJ4f"䪤-_ 1V]21_\TN|܃Xsy58jz^(Rt]r.9]I陔t:ꠞ:pUVS $K:/ $Kzr7]RD%EԽշּv%{}þ |^r>/9gIqgI$9|G;9?8)x)FfgTx&ls=v95s.`''3;RRRRRRRRug~aEעE85222bXyo&@LJ;h-ؽ\\WMI7{w{Vγ~˜m6o|6]vDZNe8Yᰱ<?=~},ڀv/q76h>X?lN-_f֦gYV~[㭥oyKo{)7&>C iW}eRKڗYG2cKŏlu!k&WRrqՊN4.w<nrd dvnU :6Ã@8(5¨oG7q"9u ZGwƪF]4Ƴ:."x~#sA~oEcDQ:YRAh!wASX}7?7T<|?Z%u9Flmɷm`ipVti;m!3˘)};'"Vծ*AV`)6Y[U]k/Oqg`z~¬C]2&U'|;C)R"]qRE^"$ɏyow1r'z#J~mg?\^nocSʭD7rRXu1uNu׎hQ,иϿ[!<ԄuL ?NW7[ > lX|#"^~ iXR(0X:s9X4W ^A3:fk6"y(jnC\s( yIߥ%aJwΛ\vr,Fw&I4Լ7?r;U\^ ǔ|.QT}@U+Gߛb1oA# сm2nm9aD`hHJn`7{~X,PJ5fWd{8J!9H׷xЀo%h@goD֞U6(lL ?،Ps[iͅd`ooc \ρ7!"uۃ=~Ʉ+"(w y_DD,>gu6ڔ3.7tl `Ֆ*-5/ ) "Cu'Ɖ(jCAe>s=b.SzZ%lcL!QQ_;?R]䤟1>L1mX.6'Ubj..If]aeH&],~od(Ni],ѰFΜЙj33't愹]?hy` 1!&I#v҈DY~ROIQ?)'E'I0&O ro@!7$D1|x ّU)bu("J)ta)Ff-RdNSDCy]F15س ȜWdOҏ a_kuUd^:Ã) ꬶby0:s[ܧQxDQQ DrĒ¢k},?+ V 62E\4E;ʕ iik9]04 Nള?\‡ Yհ? 
b5 Y5 Y;*~}Vw3S85S8;Ev B԰)4p@ma "ː-' 6^cSaGPH@ ho??Q?ITtg;Y]20p3#ݼo-,Ö߻xv{r)ťUjaqg**9V*9e79eŕPY]ΰq3aX\Z\Z\Z\Zðaqj]4DQ{a+ \Fd`+ =;u%qDs"G\[ dAlɂؒ% bKvH8]# ״] zVϢ^w nYԻ4]m1t:h9|r}I.aҽ/a6Zg\q5]f^ueȘ6$ǃ #Q2] aw~&N,{46Cx"SZ~,"[^A&i[%yd?qſK$.3S$݋f^u0r{>ݾ+澷L=/bO%&x!r^n!;r,Y=m4v/qg:]yƇ5^[n |^/;K.U6 3B {ֽ6OM4&?,7/kdYy^u n.+ `U K,d\ߦte__-xµr+ zk,ZOq\}_g~W/w8ixQ:u)h\Mpo.vH:qل}8P uca46'* \]&~f,n.M|ۗlS#i˒rj_ ˡޭ:\%%^x,[[uԻU 3cva/!yE|pi;'|pMڪLK4I |mҤM1mXG<,a%,a:[,C9`wlv|ؘ2RwD^0ïw6)^8܄eV($/=kï]SoEDт[{dmZ)QVmEDG:?٪8;~NӀ 2G2k,+:ޔ:H &'V)u)ڂ$qBǡ6.*38uP3|1%,qM6[V9{bR8=^Aq;?w$ZuY”4PMxU#P,& aĕ,z6ü`p᭧SXKY]ʡʹlgġQq>/?eUom ٽ.cj%(ewLq,z{23(S_fMc&RZtX %nI[R%nI>Ztl&ņIaRl&ņIaRl&ņIaRl&ņIaRl&ņIaRl 婈 7{D'jB^ p{st&;el,.OBV&/`a &/`a &/`fT" '56y-))N8ܚRC}j)sʓZ)𠟷N1 ߘIOH;k[.3pXX.˷} ;Hk^ /2D,`)Sk,<{D5u] wa.aJC]Ѹoxeio˰Wl\ l~.ߵ <\f[mǰy I2K[]y8$dX ضKy6lWŏg\Cë aw԰pL["[Lʡ7/) eDA5^0ï!Ր28 K s%{ ^Cw^Cw);E{h]D{Ew]D{:O.⼋8"λ.⼋8WlϜnK'%=rW<sEDCi"/gu쿺Nt7,i;l쩵re$_[uyU{o)ow'K9ު&~Vj!s.`~K\o?廟˞9!~\ _SZd4DC4M>d4DCNC4M>d4DC4%g¨4 ?^yxE}ɴE5nal.,/20~a5 X C!Va0j2S7ĂbA̲ &, b‚ &lvÄ0a7LX 6¤0i#L6¤0i#L6¾d~a#L6īrsF&mlIaF&m}!!`ĉa#L6ģqsF&mIaFطu`#L6a˷V!,Mg5%o]' aʄsSS){a0^b/LwOw^'=lG S)”Iygx 7} K8}`;LSl)K% v|~,[+DrªIQcؖ3""m;=Ngr0fӘMt0ö[m˞K"!oV۲$u`ăMm2،2~fѢj/n] aƁj[ƒme1Dء}YP򭳏~.؇]osh_oջ (e?4]I_>eNenpۗ}Sdۗ}r{ؗˢb_W{H4ULLST11ULLST11ULL4a9cܗY٣IK؅ŏ1|[bzL8L^ĸ0*&bb*&q/Eu l11[Ll11[Ll11[Ll11[Ll11[L̖U ӫW-L/,L/,L/X( ŒƢ-h,n^pUjs>͉HS]& /F O&lL䳮V)y0^W\:\+KVɏ'L%KKė,/Y_$^G0)K/S.ºu%뎘8Э&nd|ɺ#fӗ|7"O kKUCtgiYoW05 d K*>zZоd}B% Kڗ,/Yh_оd}BKvd%%jh{Zw=$%uGv-a T]ܓ(G8iw]UU3/`B;{%d*YJ%d*YJ%K}̯wGMqR\3 y9BuPR720O]'agZ``\~N9m? 
agZԐ VU`ލ̱1:G~Fb,.9,3,ڤ|aXg, dl~'b2(I7TƢ~="XxHfHN=r$"\/(ޏ)S؜OIZYuop"0 7 E"u>}D5l/崽N獬o8߸tbWʍBwFZ(rYLqLمp]D\.wwr>p@Uslr?{?Hzz^vtP0.` /~/?j܋G0HZCdok[: NXMxd8)afGi~'$ Mf>^u,&'c<5g%{p{':XCtgE>x|D KEߖݷEwwYt.eI.G\,,D.G\,,D.G\r skܣZD q lDqj\X.,7"A-܈7"7"ܫ^bֻ `v|Ap !X׼a//H1wҼbn򊹍D=#'68ypP w@dzp"fM^RPPM Φy﫫1ɦ=!; |sDZ #G"7Lxg"Y6 dD6 b!L},׌p0 (B6Pl@"tPRMpo ˷"tKu8@;ÙSMן/pE&6]t[%gz``ػ,lvBB1B1B1B1B1B1ByyVL5 BBBBBBB99999999999VO,ŵ\ܖxO"wk!k!k!k!k!k!k!N!PȄSȄS9222v2beFYO )a#%&RZMx7M܄76R2JZFIs(i%͡䖋<Z0 ^Wd[.Mxx)x39KfrB+""^W19xǛo~N=13oKR,tx8m[-VKRl[-VKRl[-VKRl[-VKRl[-i n?z1ھ=n|]`_z%3 ү!zuc2kzfc{0:O`o 39;o \Id}k0/p!x\ꬪ[eQmh IWM{k+s' \S[(Y<Ϟ=< y `cll?FDֻM"CHF,!oh=o{BVN+UHD+"0;Q>']i d%7\eLciM&`Vn0?o 7aobCSd!;إvcɑ~FEڐcrjAxaFDzx 35b9H70j-"F\+2,њ?vE(ڞ~0XY 3E+.vk!\>,a3j嬂9&kSNl~/akǺqoK/$jI3F ?N]TKLJYfc+~YM?оdI[-n.Z=,+o,HÒɸjD } ф C/K8.߆|d݉=}My e| (g<ۆ+lv(8:`1X|:uU5yz88ڭqQs<>#¯krDz`Ƽ3 gslq-*t">[_#" I̓8(6kFsgqMcnbdQX#3gÔGWCeks" ^{gQ4 F8Zf6px ٸlfRw;b)ɞ[ON OXֿd'4'y~M /U/KK'8t"Jh:236>3 ;ww A!fƔ4LQ'׮MXٙ9?c Y*uRL.,߆|P 5N6%DS1DxEu΋p)X]և~ꪵˢ=D?'C̓!ɸI F́ +` MXo?xAē;ԪWKbw/1c&GYeg Q`h5yeqoZ@,-S8Rdb=Fbc##S@e9>f >EOMS$=EjOS=EjoN)y궄Cx ~ V ,H)RxfavPJy <=wi;ENSi;ENSiS>lPK53P碍 qøa>6NcC_5kؙ>Q$\GM\g)0$g@ %|"8CF1'`̐(QP6_X/v& W:0v_c|n(6(6;; ޅֵЦ_ņؐ:: 2v'mb*VY8q™VΠd`('g9) a2lCM$;YNrH3 a,8^~uC9`O/ O.drɉDM~nD6V.#.#.(6V^{3;y_CKZIiu[G1 "mp4k(<ƀkal>`iI@GpKiT@Ĥ/bZ1_ a0 j E=j 9/tD)? 
}e,c9@[~pq1 ˈ`32qZY,ŕըДEr=߳𥉉Z4;ʭlc Rk]}Y2zp8R:e_J= C_9È09"L#09"L#pC;kpפ#.eқNte`Ϻo0 -5q2XMYKY?zYgd 8,Ǣ0gOȬnS)pKmpI_!1G&%Ա%qD7z/nT^³ƏM/agoLƽL\VnuOD7)oJMn6aO aVχ~&4՛M2[c,I!V9X3\Z;%R15qCx ||OW6Ϻ@q$=c[ʖ^Y" VgנY"eeekImx-%(].e%J%R2ԔͩO'ݥ/)7>Ԙm10E,7qh3.}ɣuCm/$a|)rwqqg=FC=r SX&)4%[S٘|[~ l=۵WE^]{jQ~٫ԍn9ZC,:X33L,>339&wĝq-6?jX|aŧ6&y6REH"u\q:..7.<%LJ))2 WJU!.wqg \<&$drL.E2LH )1)1(}N;Wpj(0r6o3_v87פyH i!3q4ΐ8CgH iq!{GtmTfh$!$4ٺ0 i$!$4FoUw%p[nhBg1 Lxos&H !tFQ[A7۫Eo@gJ=leD|6j+D I(D ]Y%H 3՗T}I՗T}I՗T}IǛ{(=qU+?nLƤnLƤnLƤnLƤnLē]kAׄ!z RH׶]OSo4`:Dvl8* ?ε]OzwrSn&).Iy?.S\x]rif~1g#A\ 2[2PB2F{.=F{hmcm :h7apƤk"fԠn6݈?kMo|šx؝Scwj}cv7sGu?FCuFZhېH{h~k7~ݽ ;9CvϏeZ ljL %76Bv&a aZ;BФ#4M:B75^ߛQUmg阿ݓhijV ]%jd4?9Yma~Xmv<؉Ho]>xЈN3Hv'"bH)b")b")b")bWu\> ݝ-<DulDzlioOdop3a6VVC2I1`jvrݛpg\.i'{($q~zsqg} 7/3v5'K~J@ pf} qp̔_LOvo^>TO-aqOy?5 ~},?#vR:Trydŏoš$?kv#"$?H$A[k`f ^9 B~P A!oDᙸ~P{{K:7W" P(ޜu&#ÚDV >P ~zaB jA-0s|P˾T"!"!"%O?DN2\v0 LaɗKZLoH\ 3]"6ܠ\(!epbrBA .^t\U.lv!9O̲BP(! eBP(! eBbQCL!?!]ױ]w7 '1l\,;xiXrRZNJIiLYͫtbT,pRNJI 8)'%OaA +xK%/-$ B+H8=ɃP G>E0+~J>s˖2Ї6t))[=sv-^X-3m"C6p(5gAL`C-8mlBgq Zp:8όdXp:hD|V Nװt,Zph`%h5V(.1f3@i49&4FhrMNiGlb({QH Y]jhA/ f! iW%g/Uol}`Wj_~5گuϼsB1IO"Adi/g5#wa"|C 1jە@dFJ Z=l.~f?ޅq%zjjiَ!ȫj>2e_)ox[W)nF \{(.3bvmh3b v~e85Nv (MU<7sMMkD_ĸ6&Ƶ.6ѿ&D_yny]4]Զgeͽνþa/ xH T)-J`E-e̢Y21ðڹ'+E%DȢYT"JdQ,*E%DE}tOY: $C! 
۞oZTw t!cdi6X\R^* XgMqE``Q,j%``:O=Yɖ%bX氋 {vO4O%~_*ꗈ%~_2 %~ʺ 3:tɺ'y+?%%(%"wh]"Z m%C%tfh]"Z%uh]"Z!Z69,Cr;eS:esnpxon|7T-k'ىL$׏}1υwoc6샫3{?97i;7i;7iӪwȐcCbس~WC6C#bo?»B(5!8J}KN1- wwx{~ yKsyJ5:Tǖ!eR)e^poϏ<}JSVJYv7{ďpҝtxÜ?>U06%$Pvn.۹]sq8^O?zLtV[{C7յ['a; w{\&&&&1zC`Gο[?ɉ4s#N7x뭮S!l‹q%^Mk.JNeGcB󎐞V^6`lsSXKܙYkL<vy"_Mo?l%qcYuag.e%e]7{al'e/m`H)듲>)듲>q:87;7;7{R'Eاa"S}Op&L)J@v_p\n%H'9>&,J ?%dڠLRI^eB^.BlM OaCB(8pu~ w.vN<.=yY&~P 'E1YC5aچ},2%dRLIQ2)J&Eɤ(%dRLIQ2)J&Eɤ(%dRLIQ2)J&odR`P8) 'E(ŒXRKbIQ,)%E(ŒXe[)=r+%}J$x`'-Aoqa K K Kec" 蓐("n 9r%0J|-QGu[.w[𛵼,qM^GeI3pd|7U=pP|+R2wdC"^ BeaeF\V?(K^6o' di@&4II=X%`/˷<,~B#*ac(e8Jn.z~HK^f%$%TP}>uRNo~UM+G nϊߍV]&}W9SoLDv#\'Zn9Ȅz#_ Ŋ0]<YVA}qg;DC0 ,[ݪZh[wUXEʃ`n29ID &5r4B?$1x0]ekn p{m&szGo^wK?,z-첔 w% )yI`?B9e!mu0mvm3MXM&vA cT5m]-3b0#aP4A`UWv(I Ry!4ʒgFͣ dA,QwW6+H +nj/D+et>̭lF?f?)c6Pެt4V * 9h3gqE qDw{P7b3,E7>f|,۝v`ˢZ E"d$_$/cz#vr=`]txaS?ԨڜI˓/θ}f6{!֕u7sQ싗m~X X(& .R i:8~pw!p)|-9}Q?}q1 AټWD I_uSK0vs"ˋ'xqP,Q:KD,Q:Ks1+9 ٠քu":Xl;6I `@*%>D-Q@Kt2C -QCK5D -QC3: eD-QFKeD-QFKeD-QFKeD-QFKe ۅ?Լ7>Y' 7` G32TȢ߰~ÒNNNNASHz/DyPon͍CI I/ I/ I I I 7,7߰~roXn ˍq67N{Ԓ%oSKަMM'̓{mg[ٖ [v3lͰe78,/~3'4\8%l_lۍ8'x0=p6patH8}"a{Q)vi>xpz)p>R6xmk:5WCCÉ/d+|fpFj38J#pw3fm>f,YqPPi8\I I I idOwwwwwwwww4)]"H DjAs!,Cu_a0mmmmGipGi9f؉F52iy݁N5Q5Q5Q5Q?ݿKN5TN5TNǩ'%e(59aY%qk;pgt$<݀k»&k»&9aOxOxOxOjrxJxOhOhOhO'8)GדՓՓՓg%Os, z z:Dzs,;2Ϡh(Fs5Ĺ\qKQZɛ7k%oJެN뼴kρ2]@t.>aIgqHp+qj0P+z8%eR &8?e,/u;!es) viP`ƇEIٸ8匞l,Y,[:I|ivH UFAT!gBqP!gE]QE]QE]Q!~^8/xMZZZNNNN^uDŖR~,0lXK_bK,}/%dP2G(VV%VFb.#JrU.?yj-0C~*̅IX}N!R]Bw h|G2!kBpbbB ss@CÁG Qw f=9ד 702l .(O=$~رq!"t@v`=؍z:S?LgL0)Vx@7[>.Dz/f8!.yMEo8pc ]'Zg̷#ՋO &]db(oZpJx10hzpw15I%y!,dc0X4~7zXlzqIBܓϦ7)Pǀ} Uс.?#YB/LD(@ဧyqy7o+Hw2]cZW]Ynx]N@FhMwś.d\C!<Y<Y<Y<Y<Y<Y<Y<y III5xU^dddiHkQ]삓4rBNCipu-F̷HrBNCiş͇LXy8%9 !;4מ/nR \wvC$L$L$L$L +ޓW'&YeU&YeU&YeU&YeU&YeU&fk"$L$"Ez5I03I03*<7U<7U<7U<,U<,U<,U<,U<,U<,ua2!cʄs ӅYM3I0(yI7.<,q&e6cyBta,i%ߋI2LUMBGf]B[Q{Ջ8e,7?< Yy/)uëE5Y5wY;i:~ԭQskvt]&e]&e]&e]k|L p p=N#m 
3:3:;x*-XҜŔ&hRf\㳭&hpBIsfǍ&htN0]/~9lhtTwqJ|&h&h&h&h&h&h&h&h&h&h&h&h&h&h&h&h&h&h&h&͍͍ޛlntC Ē5O&V!ڡ&Db!C9j̸P>a%SgIZmBwQYXn v^'W#<%b]>a^އ.˳ŇxKǨ.im>,釤nݥaËe^{ f|~>|GTG?>}>̲y=mK%1ov%KKg.I_e̺gÒg>gCwaR^68Id}!X!x f}ai}XbyVg.mCpцpC8>m}# !0Fo A7hޠ4z}?B} LBTFo~Xdd A7hޠ4zg;C4zFop, %X6Fob!FoX p'%Ohq`!FoX<;`I 81C ·%k8l_ߏ zC7 zC7 zC7 zC7 L>,qv &} ^]z8.s( zC7 zC7 zC7 zC7 zC7 zC7 zC7 zބ՛zVoMX 7aJzMڿI7i&ޤћ4zFoMXNI߄-s.B&&oMڿwK Ca&ijL8N1S x_bIgI-8K^.1$/)pb)pb))q.%2\KM,^C0})p1b )p1ec.\p)FlbĦ)FlbĦ)FlbĦ)FlbĦ)FlbĦ)FlX3 gkbp#c5XM1VSc5XM1VSc5XM1VSc5XM1VSc5XM1VSc5XM1VSc5XM1VSjc]rCzX%+Y_ kj-تE h-ԢZB,QwwJ4BdMHlZ .~l36MFX1VX!8ܡ  tB>&"܄La1nfw3 Ӄ|X?|^H!"|\ؼK̆N倛0P*HA<%R˳z}ȇ8׆gwLJq'wz<<~Wt+9ƋڂM FK1m)87ZC"cz, K:\W\p .:HzeS $Ily{nDh!CG"St^c5v^c5wPn6(VM IG6&>)aɋMx']+C yVQy8SS8o ^K: ^.O n.|_ӤlYgM%PfgcUܞh*쉦~`-v%[R%bmz۰9|aa6*=l#1,eIlRf#EqdoJDžP2Y9r8Yw&;>,f1p1_o2ĘY,DȘ!c"dL12&BDofx;P$v';INb`5;INbw]nfwavfwavfwavfwavfwavfwavfwavfwavfwavk^C IS=%`I_۪$b:{B`rXb9\,pN><gk)s+u.˳.ubZZ kb-\c+뇷`×kb-[Y?BX b!\,p.BX b!\,p.BX b!\,g!ð.ư.ZX b\Up .V*Xb\BU  ` A սv{{a3kVmEVmEV=c 0!"Xcb,BE1!"Xx8qlR1}>,Tr!]Cɻ `($6j&7$7j&7ݨُ%s+*y7̆p 1ةvf(jQ{27̻3.B%!~"fN|;]g ..)>SvgGݒN,XfXkN /ϟ85/e'MN4);iY -!ya)F&eo*`ǏXaJݱ&eF: w)W_o 7 g R,yʔ+eʕܶYOWM<|:YR"O5S>k|LY3fO,`HiYfQRT@ HQ)* eΕ2JQ) E(9W"Hs| >|>kvh;J%(O)tٞAH|2e˔ϗ)/SHIQ")J$E(%DR>_URJ J J J \O>)LOo+ꓢ>)ꓢ>)ꓢ>)ꓢ>)jiT)JT)JjwN ZZ(U{jR(UǒYZZZZZ+ZrZRޮ <8cOv;8uKɪ@n S[JtKn)YQ(-%DR[JtKn)-%DR[JtKɊB)0%di%d9^Y;)0%DÔh SaJ4LqT&?ouCX aQ",JD(%bDL1Q"&JD(%bDL1Q"&JD(%bdID@Q" JJDh( %D4P"JDCh( %D4P"JDCh( %D4P"JDCh( %D4h86Xx:!~:bZG Ep0~Nz_!o!K+%xu ~9܀mq7w +ۻt``n, M/D\%3VXx|!3.zPol;=ЅRcHɡX}1QMDl{5iIut&#Q8gMg8|v.4kةIVPAt@]\KmbZ=4[]LUÒtVAK>tԽU4&wG8f.؅"z"xKoH[b4%` !PYgS-aIըws&hAr&d񽯁ݽh p2BDwaxJc F].7> `~郛Ǜ7;p#d1đ"ZS@O WXù̲nbǖ fy̦`} .A~pu7톯hu;߳ݳ4Bgdh5 >| br@L1 0|kN&h] xw~hrMq49&9>'ZzE~EBn=Hb *c YlK&"Y,!%d,8}b_Ӂe%3 GՂ7`wp1t1tY}.h_>}I0 sPx!cZ/{7*`f|9? ,=?Ģ%;6 Xl0Fo5mhÒ xF?Xm<6:xVΗx-+tAk8څk6O'.b\#3;$87{.yf9^F_#c)\%8' `āA7V,x_҆  fxrx4ӰCNÎ4g﹏d9@PvX!°. 
=9tE~[CBBS?8.]Vxla >ua.;2{ 21cbmd\gǒY^loGځCxlM>Sb Ȧ> X؈+J}Vf C*.C}YbhC]l:#("+("+("+("+(2Wg,>/.󭆌0P``?&g3n8tG"ȴL ʴL ʴL ʴL ʴL;gd'[dhM'cS:^KN^Y`j Dao?Qt|8q CKM0 z3DoX7CNՉ1W~oٽ۲0 u9`㆟'Cgß. ј$󅖌iFfiFı K2fc~ıĩ1C4fLX|Cڽx I㓜'HҊ$HH$DR&hR(αоe2Ba~j6"i#6"i#6"i#6"mAklhܤM&en$'y?IO~I1ep<:xuH95s;Ryu,Q=1璯CRJ=pB:"N)_RH9|SH6dS5XS5XS&=Ío)2'N|2߁x}&b4. ? o4V:45AHXE*ֿmBKc_^Bn&3yd@n-'"ga,r>ysE+?#@WXo,,r⁅'HE",h4p?CHEz,?ȔE,2e)LnTY)S੺SuFɎi0;a0a3z]{ő9sl]ݾtnX |ad+aNz Uc0QNGDX|ґ0 +rȩH 012}G6&rʓğMO>q,A\L[k FMTp1 3K^wKwIߤb)IMobz>/CũXk]Jү)x7?K{Fn,~X6eҋCcf]0۰agZ 0Vot)ps) $;I4\k>yma&!jwO^8yuuJK&MpNt%a3.ɜ\$BTw>nqOSpVw>N[1NȳSwÁS W}xo[M;k39ۻ/&qXc_DJ>Ef{NyG?x?{؛NhknX⸲cY|Pўb'%n'%s[-lc60K<6Ox<}Tadvz2b)vz22bLc)6{4f͞2bN.Q՗=],O哗=]%iyo- fS>< )an)an<0@C ( \saVÂrX}݊-d%[ "aQ$,EOaA9 ;(Xr]w1n+El.ꅅ^X׿WcE[E‚ܗx zἷO̗D0, .t/ ߂,˃qBZHxA0,YV]8g얈%axX"%aRMBbX"$%˪n.ܔ=aKFכ;l% Qa)Kw?Z,KD,KD,KD,KD,K&KD,/KD,/1K&Kd/1K C/1K C/1K C/1K C/1K C/1K C/1K C/1K C [xYkۿa7l߰}ӺoZMi7ܞ]P(Slx@ nd Eyl| Gȸajمxjn\XMc57<]Ǽq^ȭ]8 FbCcn-[E/>w3fnB>m;0УBe)$+_'=p1t1;{`.lxd7 qف06܌ݑy??7coIx gُN%8ypl 1΀`>pObn›B7Op1#f8 F1/4(rbcÛœT%[TUElQ%[TUElQ%[Tɦ"^7e`㠇DZJ!XNA<,ElQ([ElA}jeZ٢V-jeZټbcҩ\(-er٢\(-er٢\(-er٢\(-er٢\(-ko_{Z{ӠM _{ +%d-Hܮ:tlB3M8;﹐Ol;㹁˧i24s|6l#G`K:.zSʐ6>|6l#G`&M>ۛ[#̦d$ŗDNA1DG_䈯&_M|-6Zlkb&_QPCQII LBMhdjB{M&&o2 5@X*_:gtM|_maַO>M_; 7c:6c:6c:6c:6` <:挎99zcx/šĎ9c1{tpsm~5`(ҹvxڱsqƁ`kεcs \;v;.ϸ;׎kεcڱsعv\;vl '!2Ɗ24_(P!(;W+~W;bo+.+.+.+VٱS|q1qHm5\ԀpYvYoV5#'=tYvYvYvYcrpE!\vYvYvYvYvYvYvYvYvYvY' ??Eܖ;_t::ߏ?i(Vl3EH.Bi˴eZ2-tL ]6 Lg&BE("\Pp .BE("\Pp .BE( YN[2\d+ 0; x3M 1b}8ǜG y̠T.\ 9\b.ǕoXC q<OxvWm"Ҕb$@{ Q!1D=Ƨ.['{ԋ?^q8E|p[}GGEcYal4+kaD {p tf+Dt"BDW ]!+Tt[nE"-kid .Ƈ1"BXb| Y` BX c!b,D1vP:<Rw"BDZH i!"-DU#S_P"B[` l!-D"B[` l!-D"زUe/KZI%In_X/iK8>q}YѲ+e?e?e?e?e?SVSVSVSVSVSVSVSVSVSVSVMGuM;6K|I`kI^KVqTqKw}ﻭG!SSSSS-bgk;=b,'I9A)Oo} ݧ̷S>pOO|uePv e,̧,̧,̧,̧t M,̧,̧PpᖦLLLfzˮQ\v=qe׳_Y \,6B c{=yד^"qYܩͫ'zK'/zRY\m/&ɛ绉z51yEӓMO^4=Eӝn1k<\W?\4/tٱѸ^^^^ha)*b8!0$<5ؾpWtkzyyyyyyy-.Y>/Y>/TdddddddddF&ۅ]F#y-k^^bK?~dL;~;(Vn%ZlI74^׳{1ߝ&-c2"LF 
ږ8F\bKlsm.%6\bKlsGWf]2*fLJY%Ӭ]bKY%ӬUԪĮL8>,Rө}PJA>(%D胒TV(j#ochqk~#.^~΃_i.DڭKn$J`*/| +^z{Bx؋^zxȸ^B|56ָWo^?d/ކ}$tBzz|x1F()0S`5{zÚًwd/ޑ~X^#B,йzz -7o^zKӋ7UwSu5nX9R{Ea s)8K ^~x g]{^i~,GRYJBr'gnYKl+\,WiSR~h&gT#Zݯ_̾ 0gG"4}EBc/5ƽ0ބ[< ʇ˳.iJ_6.-t.'@\\:yB}@`l@O0l0-qm[ ]L;1t1MFbaC;JyG)|bay6%O \"?Z ]܆ẁ`~dn5t5p ,e~ z`x,c Z ؈{1k''?C0{N<.$n!Mi;ٶ` ܟp.n9z±)).6,r",r",r",r"G-&`!>f-\s kWz-:uYϩZbx(o;ھpmʁL,uۭv1o֯2_0z\N->[|b`ool,2߹qY1 .! )O9H v^K_P8ٶ}ÃlkPb70e3fr\BKq 9Go}qH^:$%D?D(\('pK}AKa~$ 7 MMwwCz;.nw774ט^ D,ѽa;=2MdMdMdMm܋q/ƽpam\Ⱥ6.d]bU. n8-}$N9AQ!›4j۲,m-\ ~m-_[xqˊ~x!c.<n۲۶pnmO0ǴI.f-ܶv&ynpy[8o m-p޿^ 䍚+d^yJ+ dd^_]ܰ׈+yJ+d>F}eWF2ͧl{2j5.VM&~7A17gKK.B`osa0sa0Iˊ?aYe,eUʅ1 :N3.89~1.NW΂ ^BwyI'U/7` w4 C"Dn? } ʂଌG/^$zK/^댍eo^=g)+)"L'6ˤL OPeWա|d8,'$EP]<s(,K,]f)aJL SUʔ0eJ2%LL S)S”)aʔ0Qt[awGA\Xܨ!œU"g΄IX}C # 01.;Y#@cZ9ȸnl>UWAn+WWA2mt[_*~) Bdal3͢,͢,M^μx*Mާx*sDA*zDqP\c(.,WKubAqmR[% h܄(LR$MRqP\O(8U[f|y D؄SQ TK,UɒCBrBg+SN.T%:dNƳmJ,Xg1[:Ě䒅ϒz29e+l%IJX\bJRaˠVbMI{HKX"Kb.%RBX Yb!K,d,%B->> }^'DĶ|gڷylo/Iw1bh!~` ̬~FO:H?0!`KY͖$d߄Ȣ%iO_(H!~'#W\:DJC=c߭lו{zCOm mYPRɎxdxLA\|Ҏ(0mr8`xX.vR`,Ғ4m f[<\.J?G!X‹һ@2Mc..l!ajZ}uXV?$ bb$њ_Dw"̮w@!R8pG?zC# lxlj Bd|w~ uȭoBVK\% NH2f; *΃+jyYx`. Еdx`KZLhq,y8^ !xJf!}\?>\h5 gw0_n4?J!E[8܇?XλXϋoyy[cƶ%ᒎ-X;ˣ{/doB6ov+/<..E&;;WlGyc#.J@4Dȅ/F. ȅBT蓶oMھI7i&mߤ훴}oI@g'&&q^2@4~Mab%BY}ć=apO|ć=apO~ܓ`{qv߇90w*ёm3ìb%l"L0]?Y8ڽ[o]L6{¼cbB mr$EB \b~i&g.5EʔՋ /WÎt?v~_K?ɮ%؈IXE8hyk&)̜'ybחw*Lf>4|2g&[ȏ̤FgM4e=cNr69e69EL9 TUAL.V=K:#fp 8)q̙蔙蔙蔙蔙|X#sDtGط o\RB?D,f>-Gr9!R Vh:axm<c{ܛwqXWO*xuo;!xx[3V׏[ڻ$Mc%2C}v `aS|Kz*)C [zt;x` 7ͤf9%| gSPtztg]zk;Ӹ~YQN36 ;)'WL/* 5RʓJ8xxvtl^˼$+<_8lX vnGh.}y^Cۈ{GOa7 ٦z1wFmxm|m6Ϸ&M6?[,6msi6-݆ QkuZmH#Ŗ[ifȋLę w%gK88.uiaKH]CTK9S_t0֋3$9ǏqӜs ތ3%玪tV0~!6r>PO\K'K2~Ɯ4ǘ3|3~HC~ $H5bͷX f=8ltapXriX8nw8??s'IfmK&ӌJ/oqUCd!#^@!E9nlϾL3*B1`/ja?ĒWGE6 KKe m]0(nT/tX= H_Ͻʍ_QW5 Rl]{2֓q'5@/|^('&`q!>xvl| RE23? 
g%<$MvwU;0VrB6lȉzKC6ɄzGvyヒ$N1ِOB[Z.v(Rp)X!x f9ԇ%S/F\ٽD۟G-[=M۬Nj 9>V`ͦM?7~nvrCbrCbrCbrCbr S&_\ț4ePlH2oP#|`Ԓ{SL0߀]41`Sl~b`` ̔20Sfʎ-eW[LQKo;RRNl);RvRKٍ-eY`'N60=eWXN`);RvK ,eWʎC Px9gú o3[ %Oo<킧] v.xO 4؄m,Y6ݰl^{eʦM+W2af{eʦM+RKX"<]~!Rh}<]ྋ}Iw0;ΞlϾSmK:?ek} |z/{yd/!8s%Z2\qR`dGF|'G=B,}IsZæ??{F/) B$\Y 7C _.6m3qa !UUUUUUki;r`EIp1q+eۤ&~0)I9P_-M}]>L0GK;~Y.˳| &Nb.v?6i`?ZV%lX4/ #.?Sl;tYI|k0b=Mٖ˳f`̷!d,pJ7?qjX_5 T`AAnAnAnAnWtFpF.˸7?9w=9ho@9_ڇV᛾ڠpN`dЀ A2h@ Ƞ4 dЀ A2h@ Ƞ4 dЀ A2d(H f5ޟk?xq5  Pc96p,>Ujrl` 1*_&/f*X:FepyN?C2p,OJѐ 1$C ) ),ɗC1` 1C 0a ,M[2I g$_/#1 !!?菻SaMTӾ7н[;B DƦ0B7pB# l؎.D5eхȸ}ق![p8d l-8Cp٢![t8d.mϳ->n 0^{&DjGlLSs./s /Vơև;;K~Ka=گk18C\2C\tms1]\lvmsKvm󰄷(^tmp n6k=_+lȘ }+?T@PzTMC.n'#|1q7PQtB;S] 1 L#8#8kM qL0S43M{9S43M+OMT$ԂlMiP6 ʦA4(eӠlM|i[6?v|i6 qm\A`Wqm\A`W%q3m\`Eq۠=Z4;..T/h-\0xK8MSl1)đ2r3%d϶GχC.0A[LSq`y)x1mi&A,|՜n1F[Ÿl1.vn p3 9ƁC # -. tg+t3FڙCljpz!C"4vz+6vzJ.Dc~` T/  X̢ІLRF` F` F` F` Fg'TϘ!vz!# !18SY3 '\hHhHhHhHhHxXLl >S/d!777?,"D) ̙ќgTCYT酈 /ϟ`L}4&>|?RS.hpSeqaMLLas&cߴ``?B~|nF/&yTKl-͜31s&f9|9)V704«sչ\mu:W[VjcZ1{2I1 I ~|BmC; س1st d&;ze;<"ވbfON;"B;G8sy`!*dq];%q!x˳R79er䈗/#^1&W~K.q>a)CHBym>ZHnnpza6/=r,nj3j~[/8K8Xso;KlƞnuS—`y6.Lo2Lv)s;ycM,!=lNcJ>f?RXY`dgJa) .!. !ȁ>>O~,?n<,?!~,|zυ(3X 6 _5-&lfმ/̷Ya6E5%fዥX|8R6)WH^n;E;~|~HskzX:T$Mcm,iC]~U燃[~I|>0jھr!Xi`yR틥.&#Ϥ^Miaߴ03d$5m?lVMsT5m?̶ )CȈiھض`Oiæ%%%dgM-Kg4m>^J>@=Hۃ=Hۃ=Hۃ=Hۃ=Hۃ=HCzU!T=zBCy=!<zBQ/C⧄Q)!T:JP*BCt!T:JP*BQ&.8$teh!8PjBC-lJwaR~nR,˸K,%f=vM?ؽX$,2ŲK.^v,%bc%fK^.eKԵ%"꧈)~p0*)۷3aKI^%yK.2Vh% O l3.q$;с "އMLz޽X@_zvp_0Y],LЗ=,eÐQۗ]Lڇ܋c)29SXS 6Jں-Xw;8`-٢%ݾhx"l_dataɖX%l%[bֳX,,-b-XK,e88LK,˴2g0Kk-FKk-FKֳ45~HxJ|v(X4h+Ċ,"K+Ċ,"K+Ċ,"K+Ċ,"K+Ċ,"K3K3KkZ,K3K3K*, KCS%qޣ`8Nw*7/UXbX%VaUX2Zb̅X%VagUXbX%%VaUXbX%VaUXbX%VaUXbX%VaUXbX%V_w/k;_zƬg.^7cvS b!?BLmBfl\y! 
H/ltfv?%bćgJՉW'^,gK`v&b19ioH.xo<|?!)Fx~|7'H8[KY'+qBlKLmKRrR5lQ [[[[昛W\,o x\IeeEYlQ[eeEYlQ[[eeEYlQ[0 9-2csڗ2"9H-k_.2PRė y!X!x k_f; ȕ-re\"Wȕ-rek_c-c 2 cpN1=n,Fb,Fb,Fb,Fb,.B!YxgSA$QkYdd }^kn,Fb,Fr2ԃC%Q%QVjɋ\p cOyvk,oDHb"YL$R ib"YL$d1,&DHw{ q&7%d1Yrȁ8-_L䋉|1/&DSbnnoy@jRTطL1)&2Db"SLdL1)&2Tb}.% dL1)&2D 9b"GL䈉1#&rdD9b"GL:L䈉1Y0#&rdD9b*GL䈉1Y0#&&rD9b"GLV=L䈉1#&rD9b"GL䈉1#&rD9b"GL䈉1#&rD9b"GL䈷˕OP .Wv\\ ũKpIKT#N ;.dzûGa!Bw3ջ۷^ݞ{wcAp}3݌~|7hRꐋwI\4f܇I"}N'.EW}O\f܇MI}7><Gzpތ{1/ufG,O9V .I\t9ؙ>,pz`܆C-&q$n% qNK&q$Λq/F :5&q$.EhMI\4&q$.EhMI\4&q$.$zp&g&&&`x8l5؈:rƲGp;pp;0O;O;O;O;O;O;O{+{+;o=UbkBp!%9dKrȖx[)`1n/Fdwt[W.n}$ lز5bB9ځSY{qt<+g[ěٞǛ3<{qx>BfaB|vɿ| 'f?~lc+`I`uququququququ w2e?7eG92Tʐ}!f-ĔSbBLV҇Yf0>x VC Cf!C,V>x'pwSύ6>xhw?_ du>}!CQG:du>}!CQg4i,ar]ܲ쉝fUwܷ|6s 1a\x߲' [v޷oy߲'\x%'\xO\s*Bx%);y8ysPԂ q0N5]C|엃{.?(w~X]z)3;h.f懥.RwK՗,2\SM\2ܐYrb;ŌKϼ2\Ҝ4[$|p)sL3]R5k09fx.ďg "HnhSq=N7o0e`ʾ})S L7o0e`ʾbSw>p#w>ٿaoٰ|鍱ͱd;$mb2&Gg۷vkof}Rbxm'fOl ۟w1ΰ=ğ,?}},|>p1t@,3e2Le2Sf)̔Yfl2e6Y]X.,lk;Z[/|p/.l۸qubw{Q׽ŋڠ.k+xq.x&sMΛ̝7;o2((8oߧ.dutl/x5Ԣ=^öⶂC6ܰ.0n+ '@bPNEī/ EڽLF-nF(F/*"+nL1{Ƅ 6& ēJ6)lRI~w<Gm Y!K4d,ѐ;̶¦zeHii}5Ŏ2 LsJa3E= 49i`}K7߷/%I Xdd#F}18f581}u9+Q|6yi.Ke/# goא!%_CJ~ )R"Kvk(٭Q"Kc=¥4I1%DRHz2 , e,%D^yY"/Ke,%D^yY"/zґǂBkhY5inD/JY;Ktg֬5;eq_FJfɗ/#%_FJ|)2ReDhZYK4kf-Ѭ%Hɗ-[Բgy{?ym4Z'oOƯm jOz?){sw7oz!累 % [8dn4~]M6k}vsc{>Mл9t`OdXtg[o8p2š(Ǝ qMly Pop!*do4\nffYnxY`}`0B~/Dzs{/C{s!#&D$l 3~4x}Á 5:AR&KntԾ7\~k&+fQ(Yovpvܞ8"\hv}!#$Ιa:r!N<X`8{.v< _6]fH:씘T< / QaRqG<ɽ,\%G8BRH#ďbsNg@͆[}[nø˹?0D z-n/&WeBO>6[WBdA0+k0b ׇp}8h\IB^\l0!<^(g ڋ& bf?j{'bN /<;-+wK7q{1) byIbt t ge2A^9hC^RH#=Ŵ.K5O-nhqLkq?\y1{?p!b0[=XwBmivVg)ނloXog/[Öك{Ń>ߖ֋]?3G̺%lI?Y-)<[ˊq򾇷`I'~%tR)?)3 !ž˳^RLIS)S)S)SS<3%d?ɚ>)|R撾$3~_tMH~@찅'(=^Z_0cflь-9pᩱX܍Q 0Xd~!1y_ Zc6q&.EhmM\6q&>Ҽؐ|bVf\b4%Fs\b4Ϻ<7/6^,7>\|{tzA-yXHG:q /vd41S>hg qϜ0s* ?8H28@.qxgė%F|=#D9G߆`!c;㾁q/`XKRʟ.XK8p2ԫ؏{sB^KD,KD,KpŤS\pپtO _^Y!X D,*..S0з?>2=%M)[ #!66heFaӘ&0!)_,e(DBK!D-BK!tI)$=,S<+26SLJ?zj3%jZuԱ}~=,N88ѥw]z1Fp=.p!H[?~Lg҇?`GďbdOO-)xI䒷Em[Em[$,n8xK:[ %~on-E-VϮ 
4lGEEnQǪiSEnQHfcmy{P[TEEnQ[T[EbqD^<.pJ7vGۛQ&GG#<\^p4M/(\lOIXIx2|I!DR2e0%Ky@nnEn[$ En[$ eƖm[alنeƖm[alنeƖm[yeKƖ-9.:hq6ck~ a뱵+ۓwOungj. Gڞr">Zw B`h2gvhP5&ϫl8#evFMҎ^m(MҔnn9 J}noS_' c0|ʳS, _K·K^-vMĮq!kIn.1bxnugSPf)n/͓o gK`1.g[=aĎM{bܝz1@uN-b{?C򐖟d+v5Y&q\0hT_k80iI=bn)wb|! v(|=v(JK]Y2ID|8KmZSWt|P&lMb5QT*.[uMb5Q&D(VjXMb5Q&D(VjXMb5Q)֎b\,b5Q&՞bEFqdNEjXMb5QTjRNl5Q&jdï=8i% S0;kˆtKtJItG~wC8888888888888888888888888888888888888888888888Kk9q$ .K8nBGnPN;q- ϻWcOo,&RlC. ;?xG=ބ'?Β؟+!G~?iFÏjftь=AYb$@"@<<(/v.EBWA*]?R*( <))]޻@=qQr.J a!h2UuŌ\TsQuKX(9|0_(9W.JE7]YN(6rE4WC}S9=0?B(Q,%(^(K%=%Mh:#Z.RN(9} ˀsQr.JEɹ(9%\sQr.JEɹ(9%\sQr.JEɹ(9%}\@|?.ED]PD]Pu蠾 껠 ۠날~w%w Ym8) 1# wx.l2š(nƺ q1 B'Dq/$\Pc),>Zb,6nfwƢ3>pGWu?\AXnV<WN>W5D?!z/ 9<}<<8K,ϲ,W%j=ÏGGȏ cq)l*CO6LG~:0 z' ]Sd$B *P%)AbR0&Ç(`ysP SB69*DYgI䈧Ky԰І7`iqj4.4 Sˤ P!*2DEQT$q#^I̗ "ɠpʈ!bI؏!Wuo_7Tg<Չ8=@.=7)8.1ATO<5[Ҕ#!j4D5FCh Q!j4D5FCh Q!j4D5FCh'_a[<Ԣ Q̆_{URDb#:Pݟ#հ7w d|u!sEEEEEEEEEjQ7:&#$\YQ{lLnLٴi3efʦ͔M)6S6m,4qOuH38r$`+!1F3QQ2)GeRʤI9r%(Kɔ5%(>xX$ٔS-?*X=Rq%ec^^^Ƌ[7&7,S/Tgd%M4=%{>:]co-c3pÂI=oWn_,lnɳ1ѷs\nit|xJprÖYY!ebi[^Y%~X&LFS$z;o`7st1u>`ΥGdK"4`75R~8{;{&.fJ5ػLH^Ҷc}_;@IjOB!̞2laK{Y̺Mr }E;XF*)C,&bBS-r72ނbO0kA0XnJc@7=Dy[%B%/)mAYR"'֏=X̷~./al=E|WiI2s!ʯ}/_`#+QO5QBvaaqBq!]\..d ŅBvq!]\..d ŅBvq!]\..d ŅBvq!]\..d ŅBvq!]\..d ŅBvq!]\..d ŅBvqBqBq!QQQQQQQQQ]S1fcQX.nZ(.d7-lZ(ٴPidBɦM %JKK[ƏHxƦ;]Ə3,dGȏ;u8Gv0ԛ1a `sadxd^Sgc%Y)9O'f㡭AYǃWV%JְKְKv3aalK ~ &c <qd_J69[kkl)K%) KKKKKKKKKKKKKKKX,a,a,al(9U]]]]]]]o {s(aDv&\%K%K%K%K70G!K6T*Y.PQrPpVvᬒY%Jg*9Urd dٺdٺdٺdٺdٺdٺdٺVe9x#Oy\t'߲AϥI$;dW?_ ҃ _JX^Sx#O߀> x7N/_:؅h dQ2~'*Y)k6|+w_%Ӌ8O}-8 o9n[$}K)g#xvKg9sGcHR ^]xrn)ϖl3%:a^{Xҏ)'-[{,]/潔v^{evm][ 8e˂?uL/Ŗp[oyɑe@sq' 寧]UR8~ΞK,<,:֔QK?K]"$\K?R/1gaYxK@&`L[ PBߚ,{[kߛo,bbw"\ #tGy~i}\^e|ũjʁN?B7Nu!2u'#Iy[Յ,CndshՅpn$dn),س][uq7! 
m 곐H75JWo\ٰW.Ā{.N<c.; D!^9$R˳AR:/.8uG?qʤY~01F}o ^t}QlȾm2O"]P߼tq߼~Xk5 {gۣog5ҽ]?Xﻵ"#2QHY9#XĈY^9}wX9|x XÒt쭰oRj^쒗􎈇M [l1>,uӬt(I05) d+!L}>,eHI_i }XVR6y.*)pa#vނY/C-/fp.C0ۧ>1maձ a ZыlC<+c!~x1%gŪm =dRP”X75 k?׎׶ hʁHy|U&g#ldr629LF杍t]]Uv:`24Y^LLd6sy(M9er29GLNL&'&y'&V$t$ʷS 嬜,Y/N\杹h3\&g.3əer29sL\&g.3əer29s;s\8 NLNmT\Dwucvqh@09z9p.Tyλ0"iNpffffffffhEtmx 'qv"!ؘW#GXc.N#Ft3 !Ijw A<{a2Oc{;v㜏1bēa&8ӇƋÔÔÔÔ|fYŔYŔY|3 w=&q\1fSf SfW><Igo & L|0?lYNa/DNE>70Å#hQK%h(d偁ZaPBh?C׌jB El 67!ʺW.qp;]wQ. E(x]wQ. E(x]wQ. E(x]wQ. EsNX4h}.uuph?'KIA//bW :qP~ȸ8aMޖﶬyqDqm!A<%RquE00i:\zR?\p +ѝ(pų`3}faړ^|l%Bv=!8=!d+dcȁ8vK-?]p%v=U80oM6PkNKOI&]p?<gS@.Q)P<`mL(Ӄ]InJ]O"~ g2]F(% t]@%jq*ܟeVBOmM=6ԦSzjSOmM=ʆ߉ڋp <C 6/Ds2Dd "Sp^TBn7vy!k.]*k e {|{%-َ;AȢC:!ӎ;/pcb2<̅N7a.doKw $\`fghM:;FȖjI)idb wP%JA*yUT;rvT޻? .KS,[0[m8oxv뇃xcv' o(T|bķ-F| af8P<06 g.)+bN_|0[ 2o5[eR>೵$ FCm/geg             nc.N{{{{ڄ\Ŝj}bNzC@6<\nZ.D5M˅^6iZUySdǿɎ󔋷.vQx$I&'LN09 `r$=x8/ `wı63$<^&2ql)qo 7[;ye0lbÙKP\ ?Sy FW# pja)p bs5|ԮF4S,yؗ)L;FW#OKvy{FK|c]pN䜈8?bt5rq/,]aO"Ζ4{aO}% !6D؆a"l [w& UJ`)!XMvG#Aن(elCm Q!6xCo⍧rN+x*XRNr:e9*7D冨rCTn Q!*7D冨rCTn Q!*7D冨rCTn Q!*7D冨rCTn Q!*7D冨rCTŒ%k?ek.dim Q!6D(`Cl Q! 6D(`Cl Q! 6D(`Cl Q! 6D(`Cl Q! 6D(`Cl| C `Cl Q! 6D(`)Xo<7Ҋii,m Q!6ų}@b)O mZ;T/C0{=j:V ) )e2!E4!@qLvLP.saФM 7a ɸEyaL&(I^I-WFǭ +\W$pI^Im9 ~roKޖ~E PSjBMQ) 5E(PSjBMQ) 5E(PSjBͧP?[P?ZPN|/)oV>U:v,M)4EY"KSdi>YJdi,M)4E|?,qb7X2.ɬuEShMѢ)Z4Evvqϟ`IJ2$QS4jORfM2]SkѯR )Y6EӦhMi>/v_nMѺ)>i 6MѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhuSnMѺ)Z7EhEAe QhuSnMѺ)Z7Eh|vI7vkN&Hʏ8%vMѮ)5Eh횢]SkvMѮ)5EhZ]kAkAkAMx-\ *b *b *b*b*b*b*b*bOQ'` ڢ+h!R(GAkQkQk]e%Ҵ(MKh-4O'GK$h-%DH Z"AK$h-%D֓e}򲾯XKbZ,Q%jD-%XK`,Q%JD *%XQ] _+Q|%DSv/ʮDٕ(eW)81V D(WO!NHCҏ%|Pj%JDSdPd%DS^`*^%ҫDzHU"JW5U"LIg?$t V"Jd.%ᒾ0ȰV" V".g 8!u*^/U"JW*^KBzHl8&^%ҫTz' ?ԑV*[%rDnȭU"JV*[%rDnȭU"JV*[%rDnȭU"JV*[%rDniXOn_˭•u}r<3Gu/#Ooսþ|BJ Q[ֽ!:^_-N?90g\2X 냒~"\ٰJ^,䁛| ]66wDn(؟n8F2vN8A<%RC-X⬟`u鱋{8t?b)М1pq}nG{?9<3ٖgۋ{hX> zqߏ~ > OB>'$\LGH#t[O>,黡dKA}.&%B?,XeH~X ѧnfj|^l? 
fF^HlD|K6$i0,is.؂a,E2r2- 3Z;x3-O=\)e(W`~/~-[0?l"A`~Bfs@ pA֫`C2!spɌ8q$31ޒaפxd~i|3-f;&5$g-/.f|䁌@<֎{)^SxMO5=kz)^SlH86QkDS :fB{.o ug8u&q~ԻrHё'MT.9=9kj5!"KVSB}4] %ae\C&/M%NL.8K%šYw kޏ׋ kffC ׎f!Sa3͸~|^fBQs]%vׇYIB7ar?i-luaBQ,[9Wvu䍚Fp?x:]2G`r&MΤə49&gLI3ir&MΤə49&gLI3ir&MΤyw1|nbh"׻zw14|qջ?_qbhVp{oE;eh~r!S0#D`&ڤUV7C/C.4 n(KT6ܙ9=W"dlIgNI#;id u',  XZg2cX$sl!N0Ófx OI3Pɮ{2x 6C0;oQLR/lM*)"Z}CsnFxIiwoݛvݽiwoݛvݽiwoݛvݽiwoݛvݽiwoݛv7h/^xG{;ڋw׻}1U:Yx5NF)Ϗg L~~Y#fwAOoM=]6Idb][L-Fh Nu쇖l? џ6'a!N #dٛhjc-9ȦWӯ,:6Ww 7hggy`x7t^HR̅!lʣ#&L_j D Qe/.p2FE-W^޲ʫ1utXGd^؂ᵑ,b9Do5 >kx*P¨.¨.¨.¨.4LB\Fhsm.0j0j0j0jt(aWq4l`Fh\N5ʵs80Q (SKƥLRq)߈ !c=d/Dbɱ;F8i\4HUB/CڕZeH2qҸh4Fe\e42[ db&+z&DiffrgrgfMLLL쭾uL447G^=́2` ɢّ`'/nG˯ Ms ʹW2B3bлy^2@qdw/Ё ezs4:a _ h({`=0s̹s I@gqɋ Mg.D&]6TXr.G9Q~jӯ D~vGX KA~vG9*W@7Lą?_?BdN~rNF0 Wwh`Q aqƹ8\q.θ ̣ y|Lpeğl%dˎˎ' N#uLK,Ii̒49}m K3UP/GiiiiiiiiiiiiiiiiiiƕČ5C 2 C 2 C 2 C 2 C 2 C 2Ըt}ꁌ?Ha@4Ɋ~C,lʼn؝X"abE$"\ b>+"wdTAo \W.+A~\\W.+Ep"sw$dh8ZA h&A!h# [%D\eӐ=E0 ?FW[@㪊Pְ_=pO ^<1ָ4”CVCVCVCItfݍyV/$jR&jR&jRn+XMդXMդXMn&Ub5)Vb5,] nҤ,MҤ,Ϳ~@̞$I I I I IIq1.<.)r܌#hI Ar )#2"x I\Fj9TԼ8X,Iŋ r6 l(b(b(b(b(b(b(b(b(b( ( ( ( ( ( ( ( .lb6ͯ ,²Ō+O.6߱;6M`<6p+ʎ0&{Www\Zp3 E2y$DbƁfߵ n&XEVX$}.qq%(m~qӨ(A{a^"JQVbP%r[_5ֽ]qwkX ^>x}dg ]XAIK>'#,6n@c2'c>Y`u>qDѾΏ5݁{"D_sht6J+2~Hqtߏ%Y )|wL<,;&.EB~wL.~? da÷FOzմ ,s o)Ė&]bA} f -iݥ7>u &{ɴ\K0˹%dy)^`w%~Jb9=?̺H,,J;fN)駴H%k fyKߟFC.'>'ZP jaB-K,:sRH%J$|H&9jʘT*cReLI12&UƤʘT*cReLIqN BTc=h)cxvXlҷ=aiLc˷$3$}c9G$常Z 6]=VWM`8,wa['gKd i#u W"NLƉ%kᇡ3q6rKzt>=%qDKu~_)u~_׃)QE!pk @,I_~^ )sHٴK%q|.OO`St]>EOSto)z}^/oe0qloGI?1/!x v%I}+/uJp 2_]Ƙ|+̝34C6dk,iJ_R3L=Sxso0R]>˷2&E?MOE<'nι>іuXjUgJ.B5ItWAF舰C<'ȅtsC(Y8[guhsu)B]͆hZE-Ը6Pl24oy27"[ubU!IxJSgIZ%%\?s %n Dm| db66iz48M)C1aƙ·"`t g{1۬_2u^? 
ϔ>!<)Cߏ7}?적Khz4X߭61(RJ.QB|,Rkɸ\}pqu,ux"iKԆ tBǏp.M(1?c-|1fq7`Ǻό[F"\{ b_ |Ž 7!?(:h&ݗP1g g)ނ[xF]NZ pv?,$9xG5GԀ?q\Jf]dq0-Lk?9/ȴ.P]K2[0{ k ʺ-k º?®."r).·´0-Lb+J-4K_c+3M.8Kc˷.eE],eHj8sڇ-1}0)eɢ6n0K]XEm~ail,b޶0-j ¨0-j ¨0-j ¨0-j ¨0-j ¨0-j v@~a?аhVvxԇ `wH[z-=>qro2ta..:?5n?QUqddddddd`63]FL`"69X\ǏxJ e=(sK:{0(4)KxOӛiVQ$?{z0R14.4VK g8c<-oOi3̆[{H1H1H1H/ o ]/*׍?9MvLV8LvL5ѻ&R&&&Rnǁ}bOuK701M,~IJ7ٙ1Mv`Lv] 7~|Snnn~Ӹ3GF9%:ıqOwXH/z:{pSWQ|ܿp_8/ա$=Fdr#*-2W2L_u4]G-!-[?YsĹE"qn8H:Ω㜛!jbJ%kIpdxKsN]Z\Z\|Y\|Y\W\W\W\W\|V\|V_K;̥q$//T*v}quQ.~0.^./...5,Wu{`JC_y.sx,.zEVW'VHzeßgҦwٵpٵp.Nw.:eeŸ~ǷW#8rM ~wp.ݟ~o]H*J\YYYGR'FV 툷~!!~!!> 1jNzBOBI;n%>v.R8~[]sU@+BC\&Qf: !!2ģ L ^BBC}!!>ЇBBhBȒ{Ȓ{Ȓ{M !4!&,ǣ%ܘNq|G[/aIBBCC(CuueY !!!d:d:>ЇBBC}!!>ЇBBCrqb86X8UK CLOUprm86\N '׆kٗ=X am86\N '=nɵpRcm8em8em8em8em8Eߚ#57e8e8MY3eqʚqʚqʚqњ8?aÎ@)g4`7|?BS4DH)gBqϨV|bOה88888O }i-)dʂp! !Ostl;^bI|M))))Fb"pb$,))Fb"pe5M"p"p"p"Ŭ#Z6 }H,// /m{:/tاYFl8t]Byw͢x`K(j#\~%ҵ 7ڞ,ڞE۳h{mϢY=g,ڞE۳h{mϢY=g,ڞE۳h{mϢY=g,ڞE۳h{mVDqPi(4mϢYH( ́XC_\/R"Ek{6Ò%%ezlod.%K%K%%6i{ C)Yz/Yn/Yb/YV/ɮgvPJJ C)Y2/a(%iɒy2y}?݂Y Qb,u׳O!>JG >- h0&RDJH )a"%Lx`&-a%&-َ i )a"%Lا VRb0RPJا%i [)OKJ}ZVJ0R\JK s)a.%̥0R\(;Dg%%52+?ʐlwa!t 6.t+9d )i Sجts_t7[NL跹 iǁk2B8ƺTBf\ąN8;$\P>c),~Zb,6nfόơD xtE =*pz0#hpYO\/ ύM)o]]3-x{x1RJRw?18X8?\?ӄ?5Q$l,L gԺ"fxhRpJݹ=)$Ay?,=;S޶8D,C!!pKuMt ,$z6Ch*F&#d˨ NM gvc 1[2ۍtd?,闔K?m>a|mI>,n7}"\T{mSpf{ÜXlR|ط`9V1x_3],/vCʘp]?,$^\RVrECxh7clQ.=ssHeݠkL,E0@y56j!> D!ESdLuT;,&ZLu2A:TL3&l:T]Yd\6tNwjYYT*Pc!u}B_ǣ=}%ނx|nq af13|KEpfi aZC5B6 x<,c=Cf^)} 6rr xf\'zK{g_{xEO>)>Dg/t!0?|[4K}m \} %hGCH/t2Dؚ޵o+t/,d$k5I&$њ$ZDk^jhMI5I&$њhyC DpI5ɹ&9$\ksMry9Wgl[:&]\ItVINx>.|9c ˆxݙ XίKso~[d6ٸ=d/& هkx?oهx],[F><O)-$c3K|˥i4dp!ɾ˗5nڇ!#ey]ďuбZp9< IMz:v`>̋̇S`;,ÇkSnaavIk ,@83\n V(ޜ7uȷ8 zb}MS@ocý.q6<ƂX8a1]ߍ I۸|)AՒ-%[|-a~yѼ8xK3RⰙ.?Sj!%<~]~ķ;Cd3^DZwЭ^Li &{Zž%i {Zž%i {Z%!|.^$\:sK97e.]PHr/'P\>`+;O0メ,a(oC`zKXV,a%KXV,a%KXV,a% >ks <_뵈sbzF5Z1:(n9p2,CN^!pjAW~Gc{lǤNEω)q jl)qCظ!kl4Ԣ9z|uTth28[d.-Mc)X g)ނ[IXZoѻK֯.bqZ>0p纳xWx.~?IV}?CMϦgcJXc L5DZp|ݪ/Cp:0wvŒK.irf·9BbD΋!Fn|yqO"x\$}bŦKC]/6 7[+Ƒ.呶v%.@\ >Iݰ7>lL_X/ S, 
1i?Qo"Ly79`˝/y~E[ es>(\4vc? NKSҬ1eG63sb[bقffcs\aG-dȜobK>C*L*q#4Q4tTR&Oì@nvLnN$uSp4V,M4vd?0.]x-X!)WrФַ^EAV"ܫ/s/Ӛ ?P҄@5CK,;5ɃQ!^!^ vIke$Fo%7x#s7u#G5 d}7x;x巃^~;x巃^~;x]b}n5uF`-vB~7ljo,c oIQၧ^ s0~rx{\M W7&&+vF1w.6%'g.oH?]~ķhi^X(^?O~R4UNvLI~eL{̅ oo=sa`\;&LsđxJ8[bĂ`b. &&=q D01LLD01LLD01LLD01LLD01LLpTG0LLÉ҇9 LD01LLD01LLI .T29ajj"{&E9&L㓳' D01LLD01LLD01K7{&SL'CD01LLD01LLD019"[9V#.1$8T&>;\01LsY@FY`,.쌔FILS01Vꛢ%)`b ;\趭С 8BK(7w|wyljjaro[9(;);) )a~|鼓;鼓;鼓;鼓;.)K;輓;鼓;㾲ḯl8鼓M{CdQyueb/GF 6wy'w.tޅλy:B] ti|4cء I]h w.4߅|B]hv1{.%.9\Ӈ9&;/^=I紻%:slJur 6.X&yG(89ހ`鑈Mfqx`LIFrVۅ{pook8G x w.|e?J&|qo l%w.|ޅϻy>] Qڂ8] w.|ޅϻy>] w.|ޅϻy>] w.|ޅ~ # w.ޅۻp{n] w.|ޅϻy>!37.|ޅûpx{@OtOD{weR½] v.ۅ{po½qov;;E^p  }6hUqW,p S (%w  '!#8E -r$qi^9.`L,vz A$!9<BCsyYoІYpv?t:T2xHCHu!!:T|a{G,K~CmQIz/1>B(Ce !!2P#Y !!d1D:$H1[$/C\r)gH9Cmu9}O:Œ"!᜛.#KҔzɷMB(Se L!Ų2Ptf;2l#PtC!K!(P BB(P @!(P BB(P @!(P BB(P @!(+JT[_$Ρ_Ly2Sȼn#tBk,}L^<1yc䵏Jk̓˓'ʓ>&O''ώ''O(%O(%}L^1yP(yc䵏ɃB333333AAk瀒瀒瀒>&Os'Os'{*=\ V葸z$g'g+!( tNx`nm(L>~Hg,?Nÿ%~D- <|私͜͜oms/wS2N!Nb.i6)MqT`G /tLql>BwL/ERDTK#n8}1; !x fulMG⋷`KfMS;M1>ǔSsL1>ǔSsL1>ǔSp N)9rcsVc/lgQ-a\r^ ,p?c~K,}Y%%a_pxҽxҽxUc`_=f@)(HBh]Ż.H)H)xc.]|dIklD@g(KVL˱H?H?H?H?H?H?H?H?&_q{prDE~Z\7-/Lz,_ƎNo& i E=}Qx_slozotXuE6[%–P [BaK(l -G{J(i=Bǻw?tBrbtl8r/㿘{"e@mCxm -ᵥז^[cl#[wKλw/9^r޽%f[Z<^<^_Kxj O-%<ZSKxj O-%<o?TS7<ox0w؂M0)KٗpWJ8\ə3%pNVxzaKx dx 6Pom%"2TΌ, 7|B/| w\̒} F;zw|dpGυ9GM/dju=zL2ẞ LaOխ\ȌMmlј?ܻ3wgp|:!ӗ.ۙ?\3lg AÃlkuՙ?\3Vgp={{E%C!k,A@;L,[">?8tXl3? Tڙ?ScOovphab!b[آ}ҧ__aA)Y8|xEjrqӑёDZ\ہC$ͻ= qsŃM[ʹ"<,i!eު$Ͼ?6 sB1|/^$27&>, .ϒ8-Ǐ+&=^laJm]O0^~HQX҄ A#Mٓ0GtbiNbcK 0GG=m`dy` .m_`g//NꨶH9Jـ'-Ƞ0$l_%l򇙎K;~b{M0۳m)i[WMNYǶm?vKf;c g9~uP'6[¹=} enB?7ɧ{:!>FJ+"ؠ8p6||a&_||a&_p Y(dɘ|Bg(0V087z@fX_C 5L>0PÜd]|aɇ&j|a9ɺPC 5L>0PC 5L>0'Y_g|au&_g|Btj|aa3q.L?'^ω ]s9qwN;;PqD;s!#l!%| bZ<$_Mp'̟`:< p/PlgIK.OEqވO(FJ6)æp)k>`v꥿x)q-qU \ ? 
ŸO[ʻA_Cx؉=\w0NM gp)i oU^ Y\ 7Oް?y 7Oް?y 7Oް?y 7Oް?y 7_BXeO^?yeO^?yeO^?ye׆'o؟+aJ?՟VZk'՟VZkO^?y_6s3sV.4y!+o,3 {hEҿҿҿpm.: ]ns.AAAAAAAAAAA w]%Kv ,x)ނѕ Xd'`N%;Kv,QKTEgCփ]?~SQl,(H^^K t/Ҥp-=͂%K6 %vf͂%K6 l,ѥfc`fz6v׻.ږ}qAVU.h47<0 8'gn&ɻin&ɻѡ}}؅`(҅inڹ &w-h7M3vӌ4c7`nfin7x!j&M3vӌ4c7M3v GWEinZEinfTVꦕinZV¦inZVꦕqenfVꦕinf900`nZ[-V+uJbQran^X[-uza4--KRX[TVJbnR|rq?ш?`v$Nę,bin4y bQ(X[,-Vyo`X[,-qr Ѡ6ц4ڐFhCmH i!6ц4ڐFhCmH i!6ц4ڐFh8 Gh4p4Fh8p4Fh8 GhxB`8 Gh4 './t0p4Fh8cŁ i!6ц4ڐFh8 Gh4 ]h e3]p؅{;'{~bɋ'/{biwdn{bɻaR3a؅&v]hb؅&{&D9؋&hb/؋&hb/(GwÐ5وcA?~!BQR7?XyԂ&Zdodo.4Y5 Mdمv[z-hb hD3YcXo/zFzɺ=[qMlA[ւwN:8`T~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~NT~'J/glIAtAt̎IÓEe!s!^R O &%*S"|Ko,oelZ5Jp:ֽP 8`S 6`S 6dSy TM%TM%TB%yS,%R,+M^IޔMIޔMIޔMIޔxio$ޅVoJ&E ޔ|/≂7p$HMxSwxS oJM3~bav$T>>5 ?X~`ac}p3JS)|*O|| qf fA᳟³RNpÛoKBspΕ¹R8W J\)+spΕ¹R8W J\)+spΕ¹R8W J\)+spΕ¹R8W J\)+spu5`I`b.eJ\Oҏ\)+spn­RU JV)*[pn­RU JV)*[pn­RU JV)*[pn­RU JV)*[pnU JV)*[pn­RU J]0; +VV%_jJr[mF8W J\)+y)%BDȖL~b%|OhTFhTFhT<ƻ@ HHpą(%xTGyTGyTGyT]Fǭ& vUdWEvUdWEvUxoaمͮ쪄]vU®JU *aW%쪄]vU®JUᕯ y]Ǯ跼2?%{1U *aT%QcT)m3=K-T–/s=@`Q%,EU¢k=<O&8[ U ?.2JXW *a]%u^zKbyJX +0/j=_C_~q<6VJX +ac%l6VJX +ac%l6VJX +ac%l6VJX +ac%l6VJX +ac%l6VX +!c%d2VBJX GpH82~ҷ`6*!W%䪄\rUBJU *!W%䪄\rU$W%ǍN#|'B{7B~B$VM:p DZo6>?.bx6.N\(?%^7GomI5%>8  鞡At`vH#Ht&Ņ>֬.\vƫ4w},3[.YC\~`E3 :M 999''8`rrNNɾɾ`` "8em^\ⷘn닓=tbK]xxXtI!7.Hg-wI|ȓ)d<"O )윌cM`Sr+9{'l}V5;OCISg t_a9q991QlU;Z\ẈXvTziSO3Y$V̲c99=,|/)Z&\{ ҞKʹ6ۼyXsnk?wTz}#7B.v~'۳Xc}6bcb`73'"'̳~bc^)e*f2˷6LKQqwJ _ OSi5D}x|*b)bh ." 
MUxM ,e`^k(8ʾ)ŢCmgM̵鞹63Mk&۵yڼb6E'Uo7gwݙqCɃO`^[|Nn`H@V};i/ۯB]%|' U&V[kN/_C:ؤ!H-k"H-k"6_.:eLbgJ-m?[|˘W}@,߮!x f{Bbn[$Ebn[$yÌJ/_լ9?T201DGc7B$ a"3Ld=N̄ceܹ%#2a_&˄}/e¾Lؗ 2a_&˄}/e¸L 2a\&˄q b8K) 2a\&˄q0.e"Ld 2a\&=in rDn-e"L2a\&D>0.e¸LL22d)m a"?Ld=qýejN N .bE,p0{z;y}I3c;^^^^^^^^^t䔎ґ[{ʴ2-'W[^gKc 77.NIX^yyySOFǍ-cXȋ>gY釷`S0۶W^pg-t}XyٓVŜj6cgm\)eՖ]Rc fuOė)?][KےxeՃ[ױ$z.ot?]Gbp'o޻_.eQ, :ۜ2,.KD,.KD,."6W%$a9B\o!ixނM|$d>AFX_H7qaìˑ ؘNJžsKiDZ,KiDZ,KzҢ㸌HeX"9H%cX"!H%bX"!H%bX"!HN2N2:C*> HXXz0:†0{³ hSmJ6/t}hH94ƥCͦCͦĆq!6޾o5DÃ"M?M?۾ 탟`vH/>K:q p7}r.n^{p o{{aWxX[}ؙW0žosވߋnsq|7G:-n->6{->6[|llcfo|[ccK-Qjlٟ/ 8OiOKŗf/_En/fY>o6?!44` f6f1?b g*<&G\\lqqe-..[\\lqkֲŭe[-n-[Zds}e-+[Wlq_⾲}e-+[ẂB1 .+[\VlqYⲲee-.+dLwcsee-.+dm2Mee-.+sY{jzOvYpOpO971r"`ι99bs?2\s2\柋m2]\t.s3򰆳.6\t9\\tN?lyx0fz.se.E2]\t.se.E2]\t.se.E2]\t.se.E2]\t.s1z.FE8hp wcCD !S0d L)2ĒN1`y2C` !S0d0Ĥ1CW ^!+IOk$`&}Ȕ R!S*dJL)2BTȔ R!S*dJL)b҇L)b҇L2BW4 F!(dLi2BQ4 F!(sdK􉹕X&}±j'MKWI>i' A:!iz'M靸v@ iC)6t tx)ނXl4mؾIǦbظ)6ntf]bצص)vm]tZz8vm]2Sfy]bצص)3>eƧ2Sf|ʌO)3>ŮMkSf]bצٟbצص)vm]"R6ŮM:'=<=!]RKtI.)%EHtNzq iv®OTh?([pr`K;տ8şd?qAI[졄=Pb@_=lY~l&Q$JH{ gL#͖+0؋T`} (1KG=#˰lfۂz QP"JS|*O%|BX*IL*a$%rDRBKH~'?<%D~ȏQ"?JG(ʌC$l }}t7^7Ж40!2!2!2!2!2!2!zxf^RJH8%͔tqZ2<̺dx-%6WK?fX,~<o f9av^^D6o_;H~gOlgQ] gÒWq<49e ɡ 58{<;1+(!j"!j"Uץͦ9la6]\l+!klxK+MW/I9@ۄxI|H!9DHC!9DHC 0@`ΦC{~Osp<9<@ߞk s8gq9Cܾ6i({#66 /Db}W1YS[<v Ma^f޺8~%oa֏'[R/`I_M0279ޒBO\,?pHp6968~%<-}&.,OtۧoF}Kg 3yX؁6 ±0ky6M?ς86qp 8i}6֛ 33⺍2>%rp:Okuto&XO'+zzƈ8~t8js]ѳ9z6G=gsYitִ+A !yűԗ{^\S0}Fbpc1ޖ%nKB[!eH7$%b0T Uqr|V82^}l~as2Їe*l [–e*l ׹/KiHl{uՇ2N:,@ThUhwf2m3v}n-qgwu~1q1fO"F?%`oo{ pQ~qouZb9S0ː'}yG0 ` ǧHd|L}s!F,|O'q1.6boU.v]bWUsaVZo=&gK"?YR%}\lmՎko+$VKxŋs_A/ /Ky`,0L~~`gLOȘtK]tpI:RG. 
C ,Q;3xZxGPx$=NZ &BĄ8888ݙ8ڙ8י8ԙ8љ8]b"DLCRiw,[|q{Xm!#D|c3.X5:Ò~Ȅ"VBJX +!b%D"VBJXXX +!VBJX +!b%D"VBJX +!bbIGObI'n)s "&DQ-J%HB;r#!-")-xrwI+E:K)"EZHi"-REH)"EZ Ēk̺@B|NzHK) EQ" RDA)qIG*p9w1S>eʋ,xxxxxxx=,{ʋLy)2)S>eʋWŶ3ߔ`ʧLy333һX& x]>{q`pCB)$3ُ)uc^Oq'O';{Av$1;pe(G\|G9r'GG9rq1~ˑWLeşLyx [\淣?ʗC$|??ɑ[\?Q\,ranCp%J8D%Mn|i?@胢NBU*tt &8tH2nR>tKx o['6NI'<~s {8d?!~DZ"A]߽8U ?+R1bQ 7㉛άOM]G=CD3D =CD3D =]Fxr C& IɡsN >`) &@gCy6D gC8O&99w<&-"}7_~w'0C*-gp6׏u=Zᇎ/Dqg{^o@}}g 9.`N[G C7#BO;RgKbi>?BT3PnU6kا'.v8I7ۚO ~,[ۯ~ؙEBSkvpi2q4$qarg.H@U ?<3=X=)lmlm[pJIÒ>?*1&xq}b.vUC0ǂLNKgdxH..q`84ۂL/F'/%wɩwѺeD I#lLJ]p)2=?MRt˥<2=Cpa챌G##//.LO\-CAYv9U}Z FrA֜x76__҃EhQ,8\\ e߃ EbѸX4.EbѸX4.EbѸX4.-%bEĢXbQ,(XK_,,YdYd1eʒE% %.v߁#TSkaΘc(\?,/#w.6z;?PvyXӲm,9<%ߖ N[=O2S&͢ۂYҹXQR6XK,%+cXbe,2Xo#Moll3lQۙn+c$R6!qC僼~!MA|WGvfy"/D ieh:u^%|yiX祱Kc}S"8Ȣǝw6F"-z"7/T=/΍<\8}m-7vz6v/v wIY#9~B6vts{:VWW(§swùwK+`)!$7UvN¹8?yXn.GD b)b)b)ƊXqݍMkj~wc;Q :掋b;.掋b8?ࠃMwH܂x C0-&ìK{?,yc*o 揹 vR,oBEςsr܈9+lHې"2i9Al ) )R(ـ *.2oNʻ E,q[F=lHlBlBl=lUy*Q!ݓ=۫IETIETIE4Ηyn)b(Sz>FJ SzBOQ) =E=%?oٙa1DhS4zFO1/N%[SZ)=FGyEӧh >w`)OR>EN(O4w ͝S4w\P\P\Px쿶B.(B.*B.*in.n.n.nSn.n.Z8zCj-\Kp.%ZD h-\Kp.%Z0X-]宧=D}Y>Kg,Q%jDMz0]HIqƬdQDX띒@)񥓡6 XKt`S2CxpKt`=lWJ^+{%zDWJ^+{%zDWJ^+{%zDRWJ^+{%zDWO-8)qr8pzpzD=VJX+c%zD=VNJzx |agh>2) =D{5B'D16c(Vz]c(Y57Z!Q _ML)q< y1g%ނ e.A7Hw??PZx҆fT[>Jx D*T A :a0sB-[<oy!xC-[b?xíTn"Ȼ!nt Y?7ԇ~}nb..v=s^|KG-i~.v,O{xJ{@,ϙX?֒saqK!Rw=xɷ5ud:sV:Kr)o2 iiѕ][$NKx.lCH!jSd~H 8RpI|g9Mڡ<̙aR~U`2v)p\a~2BF󰆳.!m4ptby) 5DC PC4 5p ì{J{с%T,ʸc /P~U~v? 
u{5D[Mj / !ӄBdp w Q?ŁvN^nZϟm omDw=\ĭ& : eDȫI'ܣ{Ώ60Rԟpx}3Il+`B|0/N6 DQ1ۍ2!f'//SEo18%ƏenC_CvNi̓}K[ANSD:ENSD褃Ìy73.^:(x֏Z' CC7Jٞ>tج@k[-YzC3 Kgl>rav{b!9a, Vĉq=4 B~{&`q 3~ۈ=#9a~"~"a3F"?",g%K^))IC!`Y1q/d'L+9ߨs 9 '6ĦؔrbSNlʉM9.;J .|J,ݖ⹬)3FsmJM)=6ǦШJ@&9߈)\;rwŦp|G.\x͆հ I~pJx"oN-r2L-f pVNb8(Ib@(PeQ+&` \w!.F،Г8Y@,ow?L[6Φvc"t"=٘{CjLjlYcR,5R,5R,5R,5R,q6Yjq{J<7s<7s<7s-+)I+Ex"RWJ^)+Ex"RW?OR2|[إa7Oyև?a"R]Kv).Eإa".pi+֗=,H;`L)B0ewbɫ$b^)1E8"ScpL)1E8"ScpL)1E8"ScpL)1E8ǃm fpL)1E8"Scp>F#GXɢp, G>B8cQ8cQ8cp,%±D X"Kb@,%D Xxa%k8)凰?{?>IV"J$Y$+d%DHIV"J$Y$+d%DՓXO+]0e_Nq J$Sd*L%D2HtӒD2HT"K0uigHt f_@2H% |!J$Sd*L%D2HT"J$Sd*LEt,-ߜMUE $dȦ%ʅ,7ld;,$}!mSV/LBԦi%ĒN&Vk{ DڄH"d3B M EZr?}!a%>oY-K^IWL^,ǖ)p4e,+} WʇfR^,lʇ]ƒRv||?=`&\q\p%b=a)CHB&yg)O[>$kˋF֞!fC,m<,L+[)''4ÒfI:%锤#=// ޓ4nY'I}[_L BL0o;aN$Z3?lWk%L2!\KL4 a)}/hm|[{4D\,`4 `828(}JAը :A0Y d,GNKH,p LB,' d,,$NdDQCTU>DQCTU>Ds>UU>DQ僆,CsfU>hd?̼ 9Nʇ!|*ʇ!|*ʇ!|*ʇ!|*ʇ!|*ʇ!|*_"uk<';NUN:}RO*IE>"_ y"4B#25[~/DƸN2  ~l>36MFX1X7̿q3 B'Dqc.M(1?c-0,6nfόahD в߂g4ٰ~? prqswA|?wwoo`!c` 3Sf),a K),a>6|9O8$iw Z֟h93D$medSf0LaS)pW3DZ~x fDR1La 0SpfyB p1sd S01La S01La S01La S01La S01La ۜ<d\}AAt_4bK Fi~GrDF^l$NP*rr$">HMzq2~o^<-%o鷞8=E}%KBRϱw-lfڿasI{b[fٖfz"Xpᑥ]Ⱝ ˌDD/KDũX$D\/^uKy@ޖ%k;C.Yo]"fiK[w?/ NF8$<`o["%"n-6D܆!6Dܦ`ll|F]m`"ض-[E5!Emr[Eml[Eml[EmYڲߚ[mٵ"-l"-l"-l۲r [m`[eSi?N:Y-l"-l">m`[Em`[V-q"2m`[θEm`[Em`[Em`[EY 0D9X֨[ edQeqŸBlckckɳjjjN"><<:Fs!CoݼtlEc\' F,}3B|fek2Bձ`d\02. F#ゑqȸ`d\02. F#ゑqȸ`d\02. F#ゑqȸ`d3n[z} gs3-Lg0 rz8 [3C373+?Op?%8~߅ǽa:d]DRz(oM7&zDL[b f]VI53_}oedd L 70&pn`d L 0Y2Ys2Ys2Ys2Ys2 &33u&`L8ڒ g0 &3p`L8 g0 &38 8l=L~*۩bׯȳ/.d?9_f싿g%/UH(W\䊋]b%.v?eKw5 ~WQ=,鴌q .E6"\d?yƘ.e^sض.p]2w? 3_h~yx$ne3y2\|!CG" #"N"N">uG[!M1S!^rqӎp8ZP"Б%NW0k)K·`8.K:.ߺ|]

    %B]$ER]$ER]$ER]$ER]$E^u&C ݦ=Ndɒ2}̅,Ymn%nj=. Άp!1&7\ȸ:SMx&"x>yB-4*.}C|gp&6V1R<~SV4yهCH[ ~=X6lN,``c̷]o < ɋw =9 Sʓ,䄺erߝS߷Ct8Re:)\'pc,RRR ''~!ʜ`{RxO>ރRʐl>‡RP>>p}FM“RxR OJI)<)'“˔gJL))3LK^O))?Δ3u}lbrǤǤǤǤǤǤǤNR8Y 'Kd),pǤǤժZWH[a#@pBN,i+"i+"S+2Ss,nWaQtu.:];WEWUչ\ܹs+4XEs\*nb 56ɸ|&q6XMX P2lb7XM&VMb3,E7t w:b Qp"G/r"G/r"G/r"G/zz՝%GANP׽g?c)@G]@]g, %'x]f-7q@;A;EWq^r!vyKE8G#e%k%i'qz=ke|8V -8mXSh];.QgluQLJ箎?G#zۂJ_=GL/6$` ,a%L 0wwW١l%́fWmv|zۅi:WJ_ +a%[%쯄0zۅTgCT:&VXXKKb +,a%ޖ" KXa +,a%VX KXa +,a%VX KXa +,aEVx7|Bz[~}|:Fh9&@Z; sѾ0YV2t n_^Dߴ~pG9U=?Ixda6]?&NN /Ɵ?Kp16o6&|,so\<~Y>pt6;9Ig:&]t o]PJ8VEޏylVy/iVg/йXj:m[=̚bhaG),C=̺JzXltq fJXf^:e4؆`i f*Qجx>.6  !iQ f[".h>>6Qn!>RR~@ʏ~$b@&3FiCFm7 qY~@c.}w.]tqX=F,Jfo])駴sJۦg/uoxۭ+$Gs*q7j)6TæBW!|nMB=*6¦BThj́ jbSMl}l8Bclj͵}5FVv}Ecli{4tlY!ز?El6[mElz-4o+:xݦ=Ekx؂M?ه9Bl(-Jk?Ela}QC§%XRF_[ebAim[~J&S z a?Dmu- i?%0ءO ˷)qLR٢x(-g٢x(-g٢x(-g٢x(#%%E mQB[ųExgV0b+ƒg|ផe n\)61h61j;ݠFbT,FbT,Fg^\ mcc(ޏQFǰ($t o(ޚQ 㶃4=mb[M/g1=)p M,ha bOS ^~1&D;h{aLT=`ߪ骉']k"vMĮ5&bD욈]{b=q 1gG̅`b,p]ij"쉹 [¥^a&"D0f"LD3a&"D0f"͹7/d@٨^αXG{Kwķx(B(</K (.[2==%M)Cȸ Q>Aނ9BDJHCR~d%XPL')]V\6w.eU`pk)3?+˿꽁qnLxIfGkTz">>YwRը r>:"v/)^~s"q)A5.lKC(dp!_p1uO4fŽ v!Z/ .do Ғ - . GȒjρ.doИ?<vБ N$|- .yÃ^qq%wbIsf^@tF.p)?\+b]p8_!~H{-d-5Fw7 ?4 !X!/Βa~.+xBBW )'hT*} fFШBRhT JQ)4*FШBRhT JQ)4*FШBRhT v+sᖨצQ p_A+|pLHԪU\MU*nn>c-Wq˫U*.IסxCNw_.oX,Q@P%t@Vك"!zp2ƒBBBQLnx݄`Pߍ:a0Bn5w~FϾk*.&!#Z[?z}r gуN=8 &Ϙd ?uf% 0C[?0ߖ>z m>NΠoAw5.[q}Axad}b$pv֗Oom};ÇL0|}xGp1|_a0}$_/M<}Qbߦ|rU|w? ǏwK&^?!懇O&4"Лu~8K:?,i>c6K}EpIJUJ!WlϽF?%˹6fz}rK?faoRbRvYK}X揙~*5(^5̲}ft^5{W><`[50pH9C&wH~ffق:2>ar>$ٶ{}8$/)cvoqlŏ>u4{ u|g[xMUUTuR7aA:>y.0. 
Y}& nka~ka2{7ಧ 7a |V֑58-)b$X Y_u&>^zՇv!kTo[K"mX7_?|N1}.VCT8D5QCT8D5e¨zb12mC萁6-!:dcs"P 'CA[sir,`4y%@'StQCy?DCy?DCy?DCy?DCyo/:_Ǖ/Udz0u+DX5z7:p!ˆNꟜ@ g4\V ٖZ3 HpmԄBpC'3FdP'l:?6(o-XlF0+0jM7EMiz6EMnGo.0u9lc1:Z@LQ\S5Ӈx; |-a ?X=8Ɯbc^llJ) rP'6O&}xKY;ŌX咎KA=KFc[j{pmݓr,TR-#7LJM“n)خ|kqlmiRGԅ^Ωe\t=)wa?_,m+f?sDܘLWm 7MDɀVXEd`zb2`"F9IKfM/I/8W1 ǡ5k x8b/0%c#l f]Bʹ@ R΅R#%DJF4\IHhK]0;&!҆)qBS0;; a%EHs2qe=EaTLJ]%`}K&d ַz_^cf ly-/f/N>e;!&|xˇg@D!bË|xˇ+"^Ë|x/uXY.K#07E-… ̋y]ЙOt3߄Ƹθ٭Mu sL"a^$K󒥡/Y`~ ¼pև]tzk=q)R"K0Z0'`wpkhz'2ðepM?Kb+i݇2>;|mOapGCeߏ2_'gX.%g)5,i#Mdbsj [ vI'($)`kI6{M0+soOp=Ka0ssrlJkPO{n,5ƶz&o˷ƖƁ]ۥ!B(@,s&\RΒ6,c,`Ѩ"-<0KDf_ Xb,1K̀%f3`pч/L%&`I$Xb,1 KL%&`I$Xb,1 ?zYuy|? شߤ|f[(;}lknM6e_ +.&d|s ToM677F7&dl|sctsctoMb/٤ʛTy UBnEB-y mBlGGǖH[~'FmZMnRݏv߸81u{aE?aFEJd4Vhrp^ioJv$lG%-n hs(wtb&n}1z >6#Jd`k.! qq1|ɷ|k`đt\~`I'$tJ)N5/9 VU$Е&ĥ 2m !ږ۪STTEoQ[TEoQ[TEoQ[TEoQ[TEZ{ NGOyko "̾v?2ݏD. l9J gOaR7zԍ^ַ\8PgXwF/uKnR7zCԍ^F/uKnRސu3j2gtBBʮ:"܄La3nfw3,E7~f,04_:^rp`.[rpwp9&7!( [_xpaa^|yƒ߅48K,g/d:c=Em1.Bq.w8q/,mGGyc={GIL9J31|׏-u[Rl[Xc et^[pR5;="]ap\Ñ9VI:2L{l 'xEKn{ȷ8iOKf^u?jlMOHr ނEՏid:>E/E#ī#`W,ߺė) @{ށĬI?;bIlt\EoHwlsQT7T78穎T?XDMMMMMyO`X,slŒ+xxx4x4x1__ba/^&^{>L')"`hhhS0ûFo_[;N6pfw8_8oI lyyyyyyyyyyy.CwݎpD!ޟށK^{BCѱ6 H0n'gzw/5 !NJ뤴NJ뤴NJ}vdNvd~m'636:б N$N$~n' B~E'7~7,/:7'3kΓΓΓΓΓ΅DLw~5,|s& Yyt9rH否!MC.4]ibRB]ot9rxM맵?Yvwg21)ƒsD!Y{zz3yRq'f:Î>𞌳0n$)~wDQQG'9Xh)wKj uqd)9xˇ$M,XÂ遅ɸk2bĂ){?~8jNqH6m uؐXtVw #$xGHp5xGH!Y@Ciip;=wp;yjp;g%NvȎvȎvȎvȎvȎvȎv !T4pye4rH<Mo~oxqF_XAo%<~;rGkugKh!4mdAN`j:CQ7cD;܄LdȌw;F``````````````eِ%3 O_u ( qEnjq7Z\*..QgİZxĹ.[MKJN-웖웖웖,Vc9TKbzS ܛ,ٛ,3LV%<76&ϧ  şBşBMƟ(/@y0B0B(J m}!Coff _{ \4HШC_Qw\37c|p \o^z&X\`.tCA[Tc|5o 'N)|S8pM7o 'NprWϠT+'2WOJ|^ue>]_=Ry{uORȕ2L$'¢Nگ]_~ʮ KlP>R%~JSUONdƴoiӾ)}SLM1b7ŴoiӾ)}SLWگ6Q_mjD~yU&MTWڋi)RuYp(R-^:Ld-ʰ[8_=jz~*aU8ZWڋS)"OF[϶FzkyAJ]3/[y.~ݠ]ږv+/{}iJݠհ >H[fٴO?[tGUN{Z==@D3\;Jo+̧uJc0踌tӱ1QCLǹ2@.~>_\ybE{++G~|R_ڵ)si*% o xF;%mȹ*UؒGGd-ٕatIef=r]55.)sSKU\^%)V1ȶ-53׬Gjh35SS : ;ZC[uh{!s|m֡:ZVjWokaT:H{א]:z}z^ݑgPSW7j]Z^=ǔ֤OW5ir5Ъs>xڥM}kҫ֧kҫT~Fa?P]״֤OUFV٧;.j+61tҪOn#}Z8׃OLS{sոi-tGtG 
x;Uy]뼮u^:k׵ZkѲDu^:kwdle[q:#o*,W|A$Z)wj׵lRAZu-yGve]R *Iֵ꾩Zu-y]˼^+RdV|]+_/O8Lgm*]3K틻zoe js؍gn4c[S'. 7`,,\Lǒr'N;67Zi wEm>>xW~)vUڧF5;tgZblXjXj7vBX'qdի m QYxM*ۋPֽhbV/%5/%5El&fUɦMQ'ффK6ŗC~򨽰]Upcssnfl 9FZ*^.4Z~j 1yإOK_qg߫4Vs ޺NXɼ O+.ձJ~<CO]ʂ]ȿ6>Ӻd5{t:1 p /(tԥtOnQ㼎 tt tC^mY!38Ӫ|Zu^S0]^Z+Vq5޸ƥ:#}#ں` +tO)j>c384j>Z_fKJE'|d5DkL}¦"m6:*꼿:Wb)u >c^H954U8UBqȼMaݚӘS2sG S0sWN][X1W2L~/^꩜&LɟK#XJ0&O ?a3X׸şPO7O(0껆Ooq];~jOz!h:YK;4=Jc>;Jzߍ_ |z]mi*.E(x5Ɗީ/ >v> %t< ?w?:LܷC}{x$˩mp}24&Ѕ8e!Wc.XS:R皪gm;L 3?L(s1 yz#]8-kq5>UO< K1? r!!T[ OџiQU!ge!W8:}noLhS_|Awqoh] ^ ^ F` F` F2=~0pCnb7\ elUMvwTo/av-akWځ%%:aתUBav-av9+Q+5$.ݕ^,ͪ/]9K ] ص!UkmYYw!~((?7q.SmCεg\[cjաܸGwv'tS5b[`[`[`[-0-0|ϔ/wS#̿&{yܼ |o^7/ߛ B`f\7|o.ߛ˵z9Es.'ˉrh.'ˉrh.'ˉrh'4p9h:sѦmrˉp9h'Ép9hW]?7<m?LjA8njUڡwVa9l?9ldr$&90ɁI8VcS;j.^zí5\ˏ9x?Ï9x?Ï9x?Ï9x?Ï9x?Ï9x?Ï9pˁ[r@/Uu@=k@/z9ˁ^GOma? Sgm2[JG8-n9pˁ[r-O)D]/ub9ˁXr @,b9ˁX+J>s.=#Xz9\z㱫åp9\zåpsݕ^(S[;Ҏtq;oa=Fu:R9pn'Ĺ]춋v.vn\Ilmmmi^y&mmm-ܑ[2JEw[tEw[tEw[tKwY7n6n6n6n6n6n6n6n6n6n?F9in AwDG}; gtlEtDAtDAtDAtDZ (۠ ۠ Q Kn8o\mmGt'_:wQP=6kWγCOhgj* ըB_annn#vEĸA ~8K/ѹ&" b b b b b b b b b b b b b b b b b b b b b b b b܏wi'q7q7q7qfCo潰1feto[^dAdAdAdAdAdIdEQdEQdE[#F1b#F1b#F1bCb#1Ĉ!F 1bCbSU5/Tb*!V]a C`0.1`Cw-iC`/{ ^ş>~$H.@riLf4 Yh@,` X@,` 'BjP+ZԊTE!UH@RC3ծB:)N8(Ѯ)HP9 P(yvt'< Oy`NMB'<9㰐'<9z\=ٲp8؎c[ʮRF]RFezh2imn){mܨC斲sKd 5=$]"ӺzZ0tɅYj6hpHG.lO/u~XF (TG;WzvķڞBƏ\]L|{ڠQQOg A̚[Ǟ\vtn:vmm ZHDC #TO#OkT-%RBVH ZTKH Z!U+jTMHՄTMHՄTMHՄTMHՄTMHՄTAGPtQrUu @ @ @])8~ABݕ'9,jAR F(SQO4PO4PO4PO4N4N4N4=<:N)i izwmqlDZտHY"i izfwOtHtHtH"i K/QE4 D@4 DzZm_R BN Z+i &'S}!VD@4MN518摢irB==Ა&'-j(b(=rYu4K4K4K4K4K4K4K4K913%ɥ|u>YҖbϹ!T9Q_K3YaZs+9בV;5UDZ zۢƬ:,uE]튺u+jW+U+jW#guI>k;r+˗I֥/SWGnx_ \FzkyA|}i SyLKxOBV7~_1L q,2z?Tx>gMP f8[Rr GB|$BG։l>zQt+{=>m8Vu3L,jL-2 f:@_Y {=0{]V++mjqFu9n_Ԯh*~Ҏ$ߺ.OSSǧuoqI=u{x˧O/okhvfhvfhvfhvfSǫqLLHH@sl^Q.QPyyy&&TATnp,\dC.!$=`}$Hr\Of>*YS>ėV+ZqW>*W]!u㍋r($2>N7wx]xUW& 5%_ݚtGAm:\ gf\tױ}ԠiLxZFNޱ5Jq>Jw~VSOSWd B8:qPuzdJ4+~Xy(ʯ<=59fSr)9pJ8%NSr<$V_H<pPңFivO0E.Ks9@9@9@9@9  MK̠݊4Hӊ4Hӊ4HO} yY $iHDvI1#ju33UX Z&ĚV&T4neh]wkZ}J cHV ;_`~O}ӭh|QMy:j*\9O}wiՏ_ 
hA@о`jWQ88W~9ܜ?ӹ=@Cb~sXN罄K~uiuh Dd "@DKjt煬A55w5. KЕA7n6n>: Ze+x5>zivuXqЦcÑ+=JGKtYiSI{2`@`@ F`4 F`4 F`4 F`4 F`4 F`4 F`4 F`4_Ý%rݨQ2*CYlЪZ9%GmHZɩæ[HG\Ga#oWkYC;R5n_T,?RzTa Rjҡ,[:6g:,7dn.YSTj׿)Wu^a7-]Jra:qF;2d-?C}$dU2f3L'G+ƱMuU^סlr}iqގv5|Yíw&?~!z%Ɲ0řKU4Ju(ؑN{֥,?X++QÕZuȘ:WVkvzW}[ʥUnqMՊ̰C1#7z]Zdz"}#3> :Wȩ]m?3*CӔʯWdv.l\y4y^ tSv zEfak΅W>Qٹ̾8zn z@f#zqW{a+Vzci4ZV'CC^ڡQ&BQ`C@ ;ܯGoHםQ.хjԮzcack+vYpׇMtM须v 3`0l^꫚g=c_=jfZz옗s1s1k,eԙrרחxo9#oJ {tejY~s;[!?{UVL"?{NuXȾrIVuj+I0Ԋ5t S^)LaSy]^(R2J/i)%ӡR nʨr/H#5<ڠul#t t "hZ]nm}'DXiwt:fvhu@7&[h"o5eL~o0 &7&gѹ=<jw@1 yt^CMVnOkpƱAk iv&nWo[kvǠt JGr嚪 Ѐ Ѐ Ѐ zC/bgȴ _ms|ȼ'i42 itM 4 i(Zz}J cHV}k5+4m1\봎uZ:cֱNX.Lfcf;x뺎uюޠq.]7\tGcױX7:W:rR1 Cw&ՎIcRT;&ՎIcRT;&ՎIf7b539_3 cfW& hCMh][ֵc꘸:&k5j5q3qͧF c5j&khc ѦmK16ĥC16bl CĥpC6m ED;kz}iC Fr2X_&tXoxEl207d[@S}G32Qf,4ZX10k 1Qz׌00# Co= m?Wuw?pw?p۷)Z^չ/Kyo)WFVdp|&j,< &O`4< &O`4< &O`4< &O`4< &O]OCJb3NuXFrIVu:u.9$M]% VSNa:Tէ:ʟg/%,NyfX%MrU )WsxTK6GAm:6]m:gw.3n?nк٥Ǟo׏F$jtxPC}רUڡ5;\Pi8K3w4-jC ZӶا?{a{5GQJ-+OCһ<yi̪j7\1U2Qi#]sCLOmiltf9 Jɞ5k^ږ{J v] A>ȃkejKg3|'egR?~ydB, = g,\)2ߺuWeħ;Pv&z~2d_kt|KZ#q%=p^kШmiZO,'~5XVV]u+w|S{3 s˺`M|%Qf:L `Mք sa1Lf&L8/&ܙ xV3ڜg5j& lM8>& DŽc}&uSy)+Q~QWO󈴹Q/91澲3ܕ24*3}Re3`Ϊt~l,T@Xr.=_7\f0!ڒuӪHC+ƒOuɧCG_u ʡ &9w)f7_Ta_T+5җVbrj ՛^Woz]ݑgPۻ*^mH?<6U~[5^/x_K[V='ƣ͉jէvWP+L{?PnШF{A5[W;~,3m]Kt&^!τ0WU{VVg+ՀӘH~Bal5^ ǡix׾'0qމsMk̉,աіY |i􏣞znv]2oڗq5jW~iej>mHI;1ۮ(3jn dW&.bbӠ5,\ӵ4R_-Y2 麦07qG ef ǵ8O=ߗt/:c/=ksyu?;ڋ~r<ո{!?Íddǫ]mȎOc`o7#;?+~ ^ hgR/.R/.븊2sο&zq`.u F]0zFTQ5).Ʃ*]0QGdqQRvť..uqkk Q]F#T1 ]H@:t 0&+DX`˫'!t@2Ld: W!^Sql/s/]?؋rpk\\vAf2sd 3U̢.u9QUMDThIZ^䏊*¹εPOG=tC9uf>$]!IH@8_So"7?k88ā$Gc@\ )$W;q^ (3tQ3ZӺ! q`CSm0ā!W Wn!gƱ+X V8p`+X V8p`+X V糓/v.bk脿3k 0c*JlPbť0ACaNW-PĠ#󩆂  4Qϭt:Ws4-)i.hKy6o{5" BpmnꆣlP7eA GنlQ(p^Ө%Wo3&W1Z *ݠ *ݠ *ݠ *ݠ *ݠ *ݠ *H*(*(WFy4O4!( AiJCPru\]!> B|\]!>UDru4ħ n5B|4 BBF}%\V qd#G82 pdxjpt f5`kX3 pg;6Fl#=Qu۪[Gw^6w?* pg ppWq9~q9ޟ?4tuΛԆn~GZ`X-jV ZJ*?FwYAdx.?Al\ Zz:u\v]<8NG8 Qppt8 pp(G8 Qe. Qe. Qe(D @\v @. 
x'8 Np'8 Npp7ܸ)))G}Xsδo/h:Q$ʑEӷ> p\%?9:Tuo{&bٯpLWc g g g g+2!YmK9rl[|/A::C\]Lлz;~ڔ#Frzb?=E&O&<[g-cΕ۬0IQ}zGW/פ*t=R?:xh!I>i߫lNΤ^s}iҾG&V~ؾ‘9ͷ|ʨ}"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"ؙ"Ywìջa0knz7Zf+V/Y¬ a0kBzZf^V[[`-0kz ZWnVoY{-0kzZeV{Y՗՗K^%/{J)B +V/yݡJWUzyA|ij!vVGo>n.oY(o ;OOw躸ZѹUiO󝧧q޾kmh|ÌkЪ[H'sz1f8ʴ6&zݡ Z*'ԟE-M99]aV9 }9]56==ՇFݡU|i!s::mzڡ.mzZmwW֩Qɖ/^)=+{%/nΕj}Ϯnƕ.o\+CZv7TaS-.^=;Ҏ U2\W *}Rjbe{}n*,u?R'Ӫ)%,"".""."".""."".""."".""."".""."".""~`Qk X%k)WW[SDzE^nMAv!z5GNqal%VۧޚtGAm:6#^m:zpIE{ZNWkՎcnz'! ݠ;J=tC1tC擧:nhmȣ(L e[@rX. y綦%QNZ4w&,0ȢwXKc_SN1U淈8r*Ug.T.tFZ:̕aW [x_WLSL{2KVB}VQ\Y8fj2*\Co\ݚtGAm:6`Z殷PηPP ryoqpW=~CPdр=zhÃOOWpŬLC u0bٝN[V}4rMZe:ΕvXXgBcLVdELVdELVDq,dELVdELVdELVdELVd:Ld:Ld:Ld:Ld:Ld:Ld:b\ɊɊɊcɊɊɊɊU`+:ePLdPLdPLdPLdP[]9MTaK-^*lfvVoLɊɊɊ=uɊɊ=&+b"+b"+b"+b"w(M\Ɏ.b.b.b.b.mO/I ڂY`?4U(Oh]ZlacԱ]7\Ӳ3;c3Om1cF~c~鸾eg k{vKwQ";c33,;3!cL)C3eh ͔24SfL)C3eh ͔24SfL)C3eh ͔24SfL)C3eh )ڔ24 hSfL)CsdW꒬͙9S6gLٜ)`W64`L)3e~ϔ2?xTΦP6gʺLY)6e]jSeʺLY)2hjД24SfʺLY 2a]&˄u.eºLmV~:7F,Є@hMX 4a&|&SfLXm:fg]2ʣZL77QGchipUeSg)KRV=AZ\l {Rs {Ta R!CUr^Y/X_ā+#{!9%e,ޔ,Ȯ K*CK?,K?,=X,z$dIғ'!^t^]rT C':t6S^LgMR夾cAbb㏥z㏥>g>wQjf= y2oqZ,V2lzW&ݑgPmzw܅%Gq jʣ95~(3pPe*.m]yt*(s!_0 6 lج qzC\}}akazƟ2?:neɱccqx2̟ifnkʃupx2^1 p+9b+Xk4X ֖Wlvmٮ-۵elזڲ][kvmٮ-۵elזڲ][kvmٮ-۵elזڲ][kvmٮ-۵elזڲ][kkմelזڲ][^-3eƶؖ2c[flˌm-3eƶb[mˢmY-eѶ,ږE۲h[mˢmY-eѶ̘VVTٖWLlkTTTT)J)J)J)ԑitOYk`m a6 ֆlˆSlm:Mnm a6 ܆z ikJ/a6 ܆0pnèm86 ن!0d4εUzOd! Che?gC:qθ 0R?gˤ!Kr˅,ER,ER,ER,ER,ER,ER,ERv%,ER,ERYSF#d4BF#d4B dB dB dB dB dB dB dB dTa %oeB dB dB dB dB dB5~1~1E{1E{1E{1E{1E{1E{1E{1E{1E{1E{1E{92+j˹[~7o9?0x90oǛ+ݦu+}oNsry1tsr׏ȁWͷ Mh# 4O4gfFj_Нi_ <>oTulrUjku <~dWj%RM˸W†|WH 2Tުz|a^uϨ̟OExo{]^9oǸһWޫ3lzS=Oད_LU Ҫ%]'UO#r JدzEO笙қΛ=r::+}t37/iQ7Gus ϶#u] eԖ=z#@!!B013ՁrB8w?ҘQ xZZc0ܸ{O/Z!!T3:;ˠ2 /Өi,mG?;ѷ:^~سAw5 e׎t9c1=/aTgEї^Fèv *2uwkvgCaH; i!0ÐvCaH; i!0ÐvCaH; i!0t!eHGQFsdģve4G!9d42CFs\i7u:d),|߯y֕&9wU^uG-CK9d>Ct_CKy|l`jU =F9 {lq[4:i~'hzǂ:Aư{.5K|`i>`,e?W(tk~5xnd HA! 
R~`)?X9 x 0 j)?HG科VHOfQ Q P쿧 ZwB}.b`>@t1@t1@] ] ,b.t1|Χ^G:1@1@1XPu ,c:ƣj#yFJ~;ڋ~.I1>'FF\("{(’"(Š"(kJUaL+ +0A "LaD _/ کz6A "}*Ye7Y{7, ` ΐ8 jg#ӷ_ʸaylX=YyRW:+?!: 'ƹεPOG=tϞaFQ[    a{)t#7D{ ]cC״#d0B#t!!T2!|WF-c3EZYeueSfQg~~9F2/Se5&%qS6eja:4e~&'YgR]CLr*謁Z :$?wuUB Ch(6an CZ-:r:.X|Ze.˫Q CV;Y 뉅zba=XXO,'LiXWO'\p~Ʀzֲzeu/: =C_m(+C_q. t}nC~eiq,ơ !J.7:F usQ7Gtii<4XO,'K㉥xbi<4<7A"$2A1A1XOrx>&qD~͘"L,'$24 xD&$24aqO9[T24 *L,'dJ&K㉥xbi<4\!]oBW^crtD&HdD&HdD&HdD&HdD&HdD&HdDVJYE"Hd"U$DV*Y"%Y"%Y"%Y"%Y"%YZ.A,\[ڣslR#Pʻ*+?c9c9V GA#XXۿtYq4_  qBb_ X@c!b!b!aaaaaacO؎c1 aguT aaꃷO|Gr[lXMp=< ذ K{z!,t3տT6MZxx1hX^kGX`GX5*]S5VZ}t{z"絟5QB}(G[ܤ1ReVѡ1Z/^ş^ 8{9{WiFr p֠qpBЀ[Hi*nw/:\="1Jׅ/o<<.i4ul,8: ,P|oK˫W7@J5h\&Clϫ[u7q[ۣLZ[3Wk[իWW^^ z`-kX Zx*kX ʘhb*VQV%~ k>Xrn+GvWo'Cyk~^~kGzMXRV7#^%{"JeX[pW~G~wIN:E곃p3e]X&9K픪Ϊ*[?!Y=3 zkX(}D-\hXKȃ֥*{#' ڥCOhա//4ҽCY?Z(4jtP~% ^}"Oh;x]ĿjQη?pmVyW\yORw5e ڡq^Kzvh?Q=pfA7/QGmwިvoӪi,_CzGԨ0Wc & ^ZXxXx5 X K,8i §QOW X}b@ *i ǠOD|ZuKDԖDī1a'">O'"^I1i 4L*jq*iΖN^AVרkbUU>ҕg|`H}k(t0Jק5nM#Ϡ6P[D;4LrItpItm}z"\r'wy7h-x:Wk羛U@uo'VO;q,dbUV`WU+v:X?VbV`V`VpUub;X;\UWUUյiu֜;i[LLLLLL#0; *'Jt84G(:B!$B!$B!gӐiMxvHy(&b!&b!g1ᐳi p p4l!<rr\Scn醐nnnnn4 p4Uw7888888wu@~ ~ ~ ~ ~ ~ r r cWo]5xpmӑP#ظ;اـ&41MHk5 Gc:A:'hY:0p#:{PWvR[Ok 81 ``#X ul{UTk^b(k0k0j0j0j0j0j0W؉cΛhGteN('pJ/sj0LhMitLՔ|2tW}ǺVA4A4A4A4A4A4A4Aο"oIOLuOO)GT _̃#RRsxdm^XdݏGO#Tj$;2 柲wɨsؙ;S*8/RRUB*{m^3T+ʝ:kUe}V蒫7~Z w5*MRg:Օ뷔ζOKI֟WyAW>/GTaCu*A]%tbӉgasTNby)daf[7޺wU'~uǯwؼn՟]%runШ:x45eiĮ'= uclRZkhC~ |O*t^nbXVčڂuY9p[eW^l;U%;O d(׵%j#+0Ӗ_lEU%l+KFvyʩ[^xr蔬~喩]2uʦ.%dSlM]J]uK6uɦkSt^]J*A !]2,t$uجUsz.Y%d=\Ksz.YugfP3zk=3oZuNJŸnk;-ukyAֳؖtHSڨQ#qr \K+ݠGCLr^.{`/׳UfA傽\:x,[-TRp=[Xy*j?L@2g*T4];E-۶`lۂm[m m-6OWnFT/>#.Hx=Co-85pkxUX~nyVv G}jA8nSg hoנؙ&׫ OOtltucyt]Gѽ jlV̑quc ZCo_8޾8>~0:QwxI+&ҫ+Bk7'HH7i -4Zmx"O7<5o7' 6wcÐ>.vޕ1%:!i^-Z4^HG9όA#? B4A5B4=蘬zu:٠{1ʔa7ڸ16X=pF6LCr_jHI\CXhh>F21 ֎6OAq>QfwPB?z.ö6tHc n ?>@[oil}I<ڐ_hp7hp76Zl n n n n n n n n n n n n nlԫv r (ǐߐ}[dAdAdAdA=s}AF.(ܠ.=W{clٵnøK6 ^qqqa\+pcm`\$e8UW^:d[uJ . *9kV8s! 
\B<wjՉAA^]%\Hv9h+B&tH/U~VVVx$%>V7C'+^`8&1 pLc㕬{+^ JWO<8XS8ե"pWO9ⱅ4εp6bC"O2O2>LL͔8jr l|l|xg!`㯾4o?P0شG:TPˮze e(4͎̀g4NtNtNtNtNtNtNtNtNtNtN)^ͽ"zE=ȿ~N*K;t$?T=c,&MI==6v*3w˰m 6xXw_]WWW^H˕Vy,ӫ#Q{ڞк Gݥrt<~ZuLiZƘr yt0]?OsOtzo3Wvm02SxAOh5yJ2chJV%U9KCPU=G>=9Ww]2ð7 {ðxWOS}tz#.- CCgr_8 =*OkӣYzTІ5l=:$`]('4^C:QTN5&kocÐ>1^8yZ}e ^H=Ӹ?4/qt<ݐ^ǹ0ު[]z[jW7h2Ӻ4dxNzkcKʸ^q[⶞d#(OHK/VtՊ [>\ZÛ ZťV꽉h+uW'}ا}ا}اis}?٧}ا)ӪC~xjKѨG}اs> 4j*ՇA~9X.A|xZUL L L Lth)))]5 '0\|DEMOrƹεPZE_4y[Nohh& SPzX^HX?@!C:4=,W;uȿQJC: 2M^e~Wt蘿WzUH7hazUtW[]O,#`xo0 F7#`xo0 F7#޿QzF z:-ZR6|ߧN¦z/#eĻxGWw(4/V꛹,_,/s>pE=f^л[h>c^}ڪ{#V.N5댟.ڥm_]?cLj!1#?w/O:5JI-w|ÌlKc'f<ҹ$zqt>xAwHj~≯/6ݫN≯tEaHK f6{wVJo0o鯒%.#tHܼ?pC5d f` f` f>'ů~~DZD;^qY?Pw;3;35Q35Q35Q35Q35Q35{L(s̅ ; 9i\dM蓅>)d0@dHk dTȧǐ>Aocm50Q0Q0Q0Q0Q0Q0Q0Qi fقY`$zf.$y|ﻈ+ܸlyx~JϢ(Lyj*M&|SZNMgm?ټṂ'֦Sj1\ՈܽS^S>{ߧv}xh>,=a'l͞6{fO쩰 W)=aW>^iWU{^eWU{^eW9[zw Қ*q7l[P{b/Yu-Q/=^bG%s^kj\}f LjUkʼ/>Fܼ7t<uتC7iƐ/<_}, QPJ,J,XyU9I_ X ɫl=x[CWuY³X`QGۯA aįh*xC|C!^;vx='K*~ڤۄ:o(Í Z=oWW9Εo-o8|(Pr^#>t_]y}שI*dUn~_o rmxz^CѹɠOHwWǷ1v<ݑg@X{5_m!TӁP9 .u@7:F2ݍF18lZ47j):5M L}y?K3C[kCu nGQFMenݡu=FosԮ };ة>Q-jed)_U,mFΘxg| sg|re4׮%ԨW5ۿPaG;{(f!i5K+UU_YخQ;ҪY4Vl\k+bHHGAQC韏iCBVNO2W<q3LW3j3!O/5k̃seuC7^3?Czy4JtHG9ߺVz@L/躇exo:خ>perӺKr=4tŎt.=(wi?#0ʦgӕ;0*˦g+yrc{*=t:ա>_?eߛ6_4ʾgزM뷫M}^=^ n6L]j?:a/yك6}WqeT먙i#ˤ2]&ˤw.;ޯ)#Ki}qY2~@\rJݕj:,?ArdU~v8tewLh%KʝjtUW}Z<^@W.:EKI =ZC m:SNa:l]r!7Գ-]lj/Sx`8X4,q)P7x zr</soӚ꡸nPŀ*T1x bX}\,z?)@İ5<ԕV(((%Vz-u b(@1iU乧4hѓm{?!kɞ5CZ41,o [n [1,o [@:1t c@:Й :fC54 jeWjfW. wj);2SK٩Rvj);ΧL٨szSکeԲv^&rPj;XN,fP#P9MPMJj:a+ckq:8Xοz@sB6'VM^>D5INL$91INLԖΫzQP81MDDi_j&pM8Yr21L,321L,321L,'D3%2%FOk-k>ZҽA Pj/ŭH9_'yU4B(:chi[rڭu[PͯIoi[sU#KV5-yKsϥK+OWZC$.nV}C7\yy]v\U_r ua'´0.L ;Q{,FC95.ԯFOSʷ0M/{*慩yin_x|j][a_NuyΥz_V[ ןv?V]ꇿ@Rz[.GQSMۨvcC?6. 
mqG[4t.Z]x 롫5hGϟv>ϹJwH7bVOLxR8vN-*OA}bV?,̳L Y]wy~zs+]KZ|z"}!]7$g9| M 㸾'Iw l*a|Mmp'WO9Aq$>K0 yXQY(:p6.F٠;祘nO;1[lX-Ĭƹ` 7nCdž.Fd JGq4UOuE^E7zK|c/ ֺos%E^q\jϼC_HdE.w`fj]n xŇ9rTu]"x.O ]0#Wuj~JLr*R^UgU"'I0_*ϼ,KWG;`Oqlp08`;OtAwu;/"uqwwxǢwx˯[u}:WyFtg>,Ei3O :xN O|Π Od7,Iߖ8±p((ۡ7򠪇=0Ǯ+ ]7]9l_rr 50;YZ;p=Ls]-Ԕ;&>sn>`̠Ok`nzoUP>z4[OMЮ:,QGfH}~ks{(g!ul6Ο kZBp;W=Ð_ja,;wWBp, Bp, Bp, Bp,0_{EcQX8EcQX8EcQX8c!X84^}*'d=cEnw;vpnܝxlAqNݻ{w]v>=B(i7w~ ~3[ؽ/?QsdbW&M3iա0z\o sa70z\o sa70ב{n*پ̾?)z\݈Z%󡡂S[BIBϞ:? d(cѭN튁(Wo4W7'hCz<-zes[Dg dīQ>z;uT~W{z@k_}ڡq;5p.{'VtAo:?}5pA]6ex.Vf@;\iVL FĈlW/ƹ; M  :桀 `{Z CkWO1t^ vtnȠka>UϤ^Ll݊ wM vnɠkW{ cVY:3W"uUUw2|~7!=2J%ՠ| :;G]*9\pUߩOx~^u5ɪR՗j]rHduj:EWl2”t )>(z?~~;uhP;wWǯ_z"~~xaJ)««:>HoM{Am[p߽W ҳ\?^ڥ?stHOj.OjK8[꾻Pz!ꇞ ~|C55U|o3o3Ӻ,%NxzKODnfzZ=WnЪ5cRjS~ΙR/ ,Q] !w_jˌk]?fz7i7ߍrmw=՟>t]ܨ;ʗQ7w6?1wgC\3owc軵_6# s䂜%?O+K^W-\y\jyGG.TUYȰ?GOJ~/7oJaȮ KD+U)U_O3\U2R2,Ƒ/)Id~M|.9$MՐY]% 6t|JtlSۿ-3̔S*lV﬉rQKX]T ]|Nʗ|jo|j;MrG+}rG p.@SC=G/'2i&ݑgPMoպ+j[y҃qjڥtO}O/'QBr)R~\Ʃ r liZ==r]Ѵj(0\yk*PC W i֠U3QҴVc1`iw/Dܳ'Ʒ2鵜Ԍupdٵb\Ԝ {:k1%gSb\Ն\uk/ex'ǯxaRR{SVՙtnɕ'%גkaZw-]K% 7SֵkߵZ|Ynے:[e?Qc].G@#e&%GX=OW0 !zC^W0 !zC^W0 !/QE2B|Yª:נ=-euOWޥA#?@C:a+jhkT8ikB.ʫ5kQE&9t]pĹT 5\߂^s#tn:H:xW;օ? h ё̮&/цnjCsk)OVJRcyݱX:w,;#/QnԲrhY9ZV-+CʁeвrhY9jCʡeвrhY9ZVY sh9ZV-+CʡeWlHd~:ZKk-e]Zrh-90Z:[-+Cʡeвr->ZV tVC+̡ sh9!ZV-+CQT5JjY9Z@-1111111111111111114xZ5m޺5<ڠuls:`NVG2+΁90+ѣA+V+ԫ7ΕҟlYtU?ZXZ<ӥ;꣹>mawVjX͡Oh5xkROΦCu^y c _~6Mwf^(u/ccCs7mкGw fՎxC{i4,>wrֵ֡蓌tG]$#_ _ _ *D COcCp*([:iC`m`m`m`m=XXjKA-@Rx8/qUq q q q q q q q q q q q q q q q q q q q q q q q q qAy n n n j;Ł807mho-------yš'yyyg* *^xy \5`k.Xs \`{ m~-0uEh.Bs HE`Hw{~ji7uKJA`rP叴*B<yBy\WO/9?ZLG1`$#9XBr,`! 9XTޔQNG9$8ԱSm,qmla*nus+0*wqU_@eR# JP\8652%Gw[P8%Gk8h\825 JFBqm)9z-"qD rT= D r5@w"WxCzGƹ|#?ܚ\p>|8p>|8p>|8;I!c|Il~j;cίqX\[x]ǑIASl1℅|$)튂^9$ʛ-ήx+mlŻ tيlc:6>WX:XިV9Tiz}4Ro]]ڐ'_ߏ/in?:<j{Cm6P>ws`7Ui u#7ޯxz+Xj@w;1n xiqlDZCpzjex֧7bdx֧QBusQ7ä _~>̓!Y,!YH7!< $@ ?m mT{ڐ`a<m نh6l6? <mm6l6lggx3 Vq?:[ض? 
l l m`ۆh6PmgjUqdH֧e}1ޮb]462ղ§ | | | | | VOOQOQ~(§(|§(| SB§>)O!| SB.IH*(-,K՜5${\2 M!h @S&Mh {ӊ@)JP _f,P:/P R(s)UskU\Ui;oo O/=ۋi'X|M)ϻ%~_S#=g\J7Ti>{-?P3˖?vvKutFg?g~FS|i|FV#YB?uѾOߏ Yl=nOoҟIS&ӺIz᧛`}&ӺL c3IƱ$=pޥ{q98F9rB Y]z|_3yzLh㸷6h"n?Uװ՟ixgvg>yԟcloo?pg>Oe/Riheڷ(sY+sѾGy|ݞ'VרRZ&Z-" =TΣ-O.nPU q^*yi ޡޚtGAm8~&Mx o0^ y,_00T5]z@vӆUwJ8Ҡ|Wݎwݟ95 8sW~ރtW7tx8@EptaGnoʧx5{t^]ӳ:ZE{ͯk!1}[[oXkNHd5(rHyM{υP}-g-mqMzZb>ȳuе|Z|¼X?: :l Z>li]Ʊ1G#c܅sb<ݚ`YOĘm]|@[~[+0G,Ʊ+z[I(zlYK:~ΊO#T=0#̪ !yV~LB+tiPSй\2߹Iڡudmm.@),n,n,n,n,nVd 0+==&=^ {yLz W |eo\.Oy4invnvn[(ݞ&..ݞ&%".(ͫRMr'vTV5o1xuJo mtkyAX+->yP~atur)wrhHСchF:uif}ڐ_xPu'?bzi_ȿP9ۮ>S~~c Ӂ(ށCq^ ٣*3ZGVo|>rw w{=vx;{؟7/u.ÝbiB]v6J wA:#{$iA?CK#< >;aۑU~hpU!?EdAdAdAuE*    |>q씉/(;wqwwqwwqwwqwwqwwqwPvewPvewPv߷?ޥѮ |>pv;`v;`v;`v;`v;`v;|;qWHg(Eȴ/]'j\;Egjiy/#wC=YGף!xjWU_q9Ta{vհގ tXT9'Mr*謁+ZxSzX^ݒnxd9JG>reO3}ʓx4{W*Sx3![ <U_l4Ҧ&Տ=6=rL7*lӧ7*].JpnGiճV=̎rYo==՟[,# |z#.ާQ͈XdxF}u9<:4ΖSnQo}#͏%\z~+Pʉ`6mzkiC CF؂ۣz!]fad٣z x FCz=j7ң.^hu'mA888882f}3G`(dZ)Z_/:+oU>Y%*50qLMnL \UgfIʻ*oNNv/3X4@{DKh]k\Q廉QNG9hdy /SAJs O/k] uX q%(3x >{[y1dYmtGQ3қꇦVkX{[YQb kX(ziC歅ѷPX(@5l4h Dk Z@5h Dk:DAn.! c>ghĢbNhXZ Y"v%vu;źbNnXS)vu;źbNhXS,))IZ^,%ڤUζ=1aB )F_伆eMUS )ra9ŰbXN1,S )弆e'e2LfVai0, _9 &aaa4lU0ڕFhWJ]iܵp~oU2,,j4Z0ژ+?q=uUesらgw- f &&֝qɺq%֪ؾlQ ` kq^c]q<VcbX9&VcbX9&Vc^i?|0fL"&V;:wJX!&k&ֆ] SxF`෻o?^V+l&*dJ7Q&jDq=]&jd7eaMԸ. 5nD0Q&,LT. ]&d6QׇYԵ6Y3Y3Y3Y3Y3}ad uj',v9 ]i+Mu4ѕ&DWJ]i+Mt4ѕ&DWJ]i+Mt4ѕ&DWzJ/]Е]ЕWs9Ck:Ck:ƨk@ ԩ5kk512Ckњ/=bd٢+:+^N oymР~5(cuѠ.EhP OVŏ鏿ߋ ?>{pM4Cc.*ԟ Ci&uN]ԩKKKVCy\6uYsѬ. wѲ.~qK9}]ao<.uK==YՄ1}G! {]#TFHRs'pZwRȵHdafÞ\$5ȴ7&Q/ĨYרωVm s0QcQĨYרqp!<9-`,jX5Kue,1k5K̚%ffYĬYb,1k5K̚%ffYĬYb֬kX;炀! 
F֏93a,1R5R:-n.̞pH{ԭ k)r/\uy2@^b,1X,K %e`Yb,1X85XKs읝}YsQT5`A0Iw^;竖5XnCҧIUy p,K u OmA=|Z,ucX,K-RֵRP^U+lk]&xr<>r Qj!o!)D(J16Wg a:Q78NVr"1dv\ylo Cd\?keB'['L0PCtB\k3H<[N;S;SIHLIL9E]Q$%N%BGjkDQJ& OGI*#<,I܏܏#y/fDc^Gڏ܏܏r¿ƃQ_OItUd]x},/tS&~=e^"0ZHRv"I֝W=zLE$eI ˝7ϻs&R/fEڙ?Iyw;Gǝ#)oΑqHs$Et;Gǝ#ΑqHw;GR[s$49?whƔӼ1w(3LOfۡUܑy[1fg3Mo$%.%.E,Qs)Qs))Qs)Z>%j.E˧ҦS"RJ\J\U)b!D協#sx?GOKYzODǥDǥX )q)HJt\Jt\Jt\bD华#SG-ϵ\֒vr CGTx\Jx\2at<%cv)=R)=R{5 2B{DYy{0#EifY)fV*N))' 7ܾ2W+C} }g(tܷ5"k]Gqt}xܓ7 +< ă&јבIhEmh>npķ 銎{E‚3?avp'æ,˃,^K[% /}#˵t_0tΦ?9xC RHC[Gv!U{37 L͸l%}q*HU)017}T+9a&n0ːؚqy]52;f\ߥ.Hkk>0wm͸,qiK\ROW[3.K[2j)m ƗYb/D:ȇ.,)AZe'N6|.u)ߥLWveDAXq\e3^2eЗx%]Rfʵɇ\6?Kǒ ~<<$0IwIrDQxX&Rُ0Ue%}dx틃ur/y9;,Ϣ"F/95CDwMqڶ{XfsqRvd.0qYOq:һ$[Sn h6Zm*tߵ*h5t]C 2 xݻdE+th3ЁuuQw+^u!"=/O4Jܦ|^uE$\,;o?r).ɗeдk'HIFm{.hbc41Ml&6Fhz<1`K47\5h3406ֿ^,UcLeK Lms >m?ls8>a>p%2֡Oc/˵.uֳB.K=WH7&`z z0ˁl7Wx^,Sv ^lK&<>a5>j謦:j&:j&:j&:j&:ꥳ݆cvm:w[+J{uh/:W^ګSeuDl,#ڰ֩:Um,tb %ӯeVګs岑kF^]WE{u^]WE{u^]WկJ%GjWd:<V^  *."\wl9(M aw::.H{+xӓuXMɼS}X{>/xos]~{nP{ R(]h >a6b.0;eUaLP]^T(.J3^rʷ3^_:JR׻, Jo+^r+^p烇bEY6\6Ig_A6\ƶ@wQ n&u{'Vg93\os~zKw^g.:û.:\?8,::Sk%5 p6al/2C`F[߯Bo"?ivL}Ď{#wJjP_ pp0˨8iJ?M駿EU-Pݟp)awI Ȋa욍(d@3vNfXzZ F|ț u&<M:٤~+DPzpv;g99|pPzt#S;\gÉ}/ಝȥ\ȥmy6*D'b̺*YR% ih;ioKZ585ɐ)2_)ënI9s;d{GOKCҥ7&GM;6޸^y/,f[^O.1N_HL>Se:H^*^wcDG 5w̰XB*YX'J*f57v"chLea QQXdrJ%EZ'2O"PybSwc@l%msЈ(b"J#T1rUcJcsva浍!{kk%yBғɜgģ7!YLìa?7OMHIWA56~ykFD(w>o{;@c9 "VT鐏(Wolh[M:V蠡sЈC͝hEfGa׷#p0I~!Wc4ժa>{u y!sZG1/y Cqx7`\Z Բ~PRROG[qѳ-WG4v;og|4ρ@bZg;x &>b/Dt%u!vYҗ0+z}q^|MX|~2<Ͼ9CXISv>uڧN9B#9B#i:RQ4JFs,:Ǣs,:mCtK?aaD^؆.&mb؆.mb؆.m2\_۰/Ign.C6tn8gKUlqh--:e}}%2K %^Yb%/1K %XO7*_botPPA/r\dR(:hQuTdN8.F}>4+W'K`?FG+jtB'`=YޛΓ=e(Mу!zPCҳ #0N$CtbȔzU\s=*S0N,6!!:1dՉVʄN !7 lONZ\{r&g=9ԎIG;Vgܡ'=lo2,qrtv:L{{$7dj:A{rɹ|Ij褆Σ+'wso^DJˁTX$.$g1{e%,ӴyUtX{;9>5ニb\,Ԩt(nSx> fS70Gmj52  .11Q-I#p4h|u'܅Q;ğvՄe‹I?&٤|d^/^U;œ9嗟LϷ$M0bLS)v;`Bkȵ0d:} `")x(ba\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\Ђq/L-k(Ѝ"("("a/]F^~hY2LD, Y&"- ?`"LăxgsX Qa"*LDCYFpf *dyf !K5Cj, YT3dfRxK5bBqb"NLĉ8 x8 }}S0w0w0wQ %F22pN) pF\{.sGa [~w'^C@.q?Np:mQ3Z*.cry.幻ys_{mY@Sg 
wfg&r?6 )i(7pQ,!Axk,K6ѫ  ՞ nj5LНqUP}(6l|e7 ݹYձzY.!tTY{3u'1wܛ|nØD &ƱS2eŁf.vj}9qp|BrpٻUb} Z|قqÂ4_ځ&ENd'?qg[#M%ϐk<,eqؒblI-X?5Lp}2:ߕ:p:t^8~ './F\fouc+S˚.ed9188!EibZ!EibZ!EiqR9YWHOS /ah09>09BL!RfFXK=E$19BL!Gb~!Gb~!Ge~$,;[IS"ΪR1 n\7nps`0ɸ$UJ- y   ٘~8 Mōyg+N1!GƑ(y ,/2ugꐜ˚r RdP JA)2(EȠWa}vl|̎\y8>t`JXM"R@H)>eҧLI2S&}ʤO)>e''6Ɖ?2~   8F߮y74Fa (v>bCaw},4^f OTx:cc-=bˈVׇƻ7(̟6VJevED}}kb~u}mEt":.Dє]xhP܎ ~gCV\~b[39~0,n#-W\hS6[Ώm$/$)-OI7~w{m;q6LVkI5--su&8Oo| S' Cph¡!i6n|ƧHuA*Z +A^樺Go bLI8Mf& &y%,6se$DGA"4JF(]&''''q:Ȕ`b8%2i--N Z,cyGx8&q66pg@@|Adfb9c aM 3Iw=FD g!wE fzY.+Afxُ񥖳qc uF6kwlL4<qYy7vf 5;<tdm+At_yuNIx7>7_;hLu"/&6y7c/[lf๭@aE9bLC +A^_-ݸЊLA7eIo%PrvّD[:2/릛/8;ӘO%x]E (a%좄]E!(d좐]BvQ. E!(dJ!dlsA.ANc'p'/A~E?&7Mn 7nrۮ:K_x늧Y\3K|K|K|K|K|K|K|KKKKKDѰD4Ψ);쌚3jΨ);撰y X͘sssSv+M٭4eҔJSv+%Rv+%Rv+M٭4LK٭4L%rɴ\2-%œؤ4Ii&h z7<֐ɭ![Cfpp/^ɽ{1I ;C}lm\7ԏ!Á\D!pq88Bq(s}w{ EB!NDS")┈ qJ8%Bq‹<مiO@uVT;jZk]>tx®+ !1D0"Cc\Am\E1!Q"6 q\Fq̾⸈l/lq(GW W![ٖ) Cr@!9D "Cr@8tb'LᄝI1EЧD^NZ)V'7f%7fʋlޘ)/so؅i \pI.)E6lLٰ1eƔ 3E uG+J~?_yjRH)S'y1&y$O)ǻ|s9_/p8 c9hc9hc/g 8%e|sl6lsl6l7AohٽPŞ\ pN pwDXɷkr4~ђcr TߙPaIs_O@Zr*yU7 ߈Pu̕\"7NԷN2~-GKN~ђI;?c6Ì˾e_KN1.D+I;;s92S<)nˊ˵wwÙ 0sa7jǶ^Z7 an᭶ѓ{* dW;ug,b&'wY#j]Xȵ&!%! 
)3٨|y0ˬr-vY.SKkav.; r\u:W|_JHσՁ\"dieieiׇpݗ?&܅y_Ⱥ!n?!p9%J !p<"CDe֧KۻpمyJ9W]K_UDe>veZv9.782C~C9D~ 5/9D~+?q+!s0Z ]oa0,.q ȻO]f[%!s"?!s"?!s%?''r GuI9)?q0 !JcgxO*7\9gpj<Nj):RF?R<Ƕsl cDshnmM46vjK}&e}U&8q0ShtٍR DhtnM4F{om+>v2]ö.E` "0\s+A|f8w\>|XR}Rۇ}kfL\2q]&u.eLV2Y]&du.e8KiXxWzv lBz]2ޣf:\ʼnDK؞K؞K6Qc $%rR۠qYMX6S$=!e afXrZwq 0 %%S )YqK1ϒ-[ %~%f'C?Y xۂfy_%Aħ~}م0nR7X#K,Q-L,f/YZDb68nDd1{Ւ%%KTK,%KTK,Q-YZDDح+vKaD-vKaD-vQ'T9_aqgkP?EG$"$%%p3aPiJ9 t ܄0hW4DʅHR)uCHҭxxyXl ueLAz,S<)^җZcz)SDDD"Rx)%S)^EHYLRx)S)/e^E./e2KWy,S)4KLRhmmSmMѶ)6EۦhmLXx,S<)Z8{S)Z8Eh޼K\4oMѼ)7Eh҇)Z8E hbhS4rFN)9E#h6O^!z=]_ :H-/pj/'2[q>N-]-Uŭ X<7d’] Kv-ka?toryyZy{^ɒ.kK4\fK4\J4\fJ4~>3>^[6eC)/KZ8"I~Ma̶\>5-/7a_ '99jZijZRGJW+yv,pE;;ӈ ;N>蕷 7.;`W--;y^܉1Q2cWZÍd8:y]-6L惼,qYD'e2dg2D#J qga%e2odɻM eˌ5[ ?أ30΂ zlƤjbQɁlqWCl!Ow?%ϐr'܄0''tR|1O̗0[6g3_ aĊ,pyJZݬ b㍷C`MB&.g@4;fl\ь(31f$/gdMFqP1}`(ve֥L&Gq*,M_mma|6af=E47XuԪa铠 ai{HCf!j&foٛh^;v-0/f;gɥNӺӴXiZwD |'x,A 7O\9xƆ0&hPn#J&0x7 #.nYh&ȋ덢gc;S/y# 2 yx9.sxbbbb2)l2&l=8 &TȬA.z5x$AP_$ٸp"b'MPIgJPIgJPIF@IE {"|{lehrm=ESуOEv"ʅNyͮPOjCFCe0'X .kCrQWc%&LOy⇱BPFR^{ky:^_Up~\%/j/*^NHU|\ЉXU :H^ :V!?ʍ}r{Ux8΁rVeY/܄Y^!p!bUí<.>}LC?PhC?>b wuχV)K iYs53ED /WG2o=ƍT%} *AtEt0FD}Kq?wisϝ?w`@J?zp0yKqGry{j{gNwhÚ?@8.zԇ',wÖ=}Crd3ɹ:2|և9/Xs[:"/ |K2Ǥ>2P:Q#@&;m0)K: s f/Zscd> rՋ˵RmlŒQ\*x߱pb<Ʋ]}q+x! -!VK`H!xpa)yd;yڦ~wȌ+r)sYY9@8D*C8D*9#㵐fC`|acwxUe"K*Bossx-$`$e>%qs$"IH!d$"IH!d$"IH!d$"IHYdn91k5)S&eʤ 4&ɤA5iPMTդA5ŠbPM1L)vʴbDMS#jTbDM1QS)Ffor/à⫙b,M1KS)0E,L SaI.;uy:bt,\GrEKDD.`y\_?$OK+:Ps\";%s\";%s\";.yO]";%s\=u\";%sS\";%ssCv.ٹ;'Kv.Kdٹv\^Dv.Ke%sٹDv.KdٹDv.KdٹDv.Kd%;z>ꜛSZqm1Qf9Djcs Չ6QdeN\̰:f3".cl6vf 16x\eu`A':uJNIx/c+05Gd[c9Z\9= : PlAčqAuF\om8fc8pk.yk9 qل(oenOE4s~ucD9̎A,W џ!3D9@DPNsR?1tilK!Nw'1[W7sP}KCtaL]?CaQOAK?)̺ԠK}{xI:1?fe)b1ۗqYʜRgʗ,I\.帔).Gˣe{ ?C. j&Zx?˒Ǥg_fsٚhF+>ؿapEym.eHx//IgL@4vNG2qdNap\MD6.MR^J.s ":%IՄYz,WlKuvl8_uvl~{-wb" ώyO osl+m~{mUm~wv?g& nR۟tzG3?oS\7 .E~}x-܇ww޷bsxuUŏ=p/? 
Ȯ."ȮEeQUp˜J߲<0;f3}6nHqڇ?|~{yN2 myev]myev]myeVfG_1Wj+VtOX b E#+v*:>Ee0nyZnir&OAvи4n(|jb``p$zzg=e yaUsO}?uCm25g0䝱t5O>->]?hb1G~'C\L嬧ך-m5Y"spOD~"cdxߏ`>ux[Ēl[=2j>TP]Ƭ~D5`L=Ps;QVyo@auwn@%kͺ m"`mDȚkV>"e0pt^Ϥ8μo`* {vU gQ9MFCg(OaVʯ_bH}F\g^d9 9x_xjUN̫#ReM6 I s4at?_k0L!̮[|5gWR2Q5a֨Q.[1rTJ]}.Oة%] )39=9]Kv}lA'Β~Xk a>5kp$9 rYF l -˥|_9Z=,?irAC:Hdscr48K:1k K92hBڲ`H\uy\˵mL:YqWf2 6Fz}~)+,'2l>[\qEh6merWXc,ڌX6g2gB-it?dQˎ Y]elDCa%v7@ēOՊm'2mԎ ݲށXӕP\, ;r`|pyƔtVʇ3Ʈ}`^ خ}+ih⼱k#O獉ۮ}m>F7?{K{ȵKlgo'yKgˬxGDƩqjbtLSFhÏ6Z՚mV%u\rmOs[J#Jp n/3X݆ Z&{Xȓ7Nr}|\,6&ue[#P[jۮ>Գz6Q&D=glMԳz6Q&D=glMԳz6Q&D=glMԳz6Q&kMd|g:lVu{́\aq0THڍMpYnI{vILP3ALP2W}G-d?>a6 o~UkJbU>"&}Fč~XNdVod-Ę1Cm5"?]ׅ qȻu0`YD+,$ֈx¨êo'?Θag̰nxn0֜jN̹Ə3DΊ٬! 0gܲsYTs\b<8wq].iӾ!)))))))/KF.eV WPr\+)JJJ7,ugPPPpHLXsc))yCQYҟ%O %O %JN^?K)ycX곦A$yJZJZJZJZJZP5K{CdP3<-ox7< y],7i%8RLC8"S`)pfQBoMft͡QFULf,Ń E>K 9x-ŃW_<`9~7Pgn|pS:ZEܗՋՊ}-byٽ%'n/d :pGuaIo%Pro\, k^4ocp{T/!APG.>^LXd7u;Dr`cz Y ~y'O;uIw ǘau,L͊-<4{-};FN>~\"^c6ř?fǎFnIXa旓~a日bV6]QoeO?|Os_ObiOs_O4c܏94c6؟Os?rRHOs?{K:2~<$+nvn@{J˯-}ݞp8In9UZ[sjUQƼ2&zmGۯ~lls<潞S1&D9e@68e.OXR 9ؖ9Zc??o )lr-s<{5)3͉m]Dw"v#tE!cZSTk rR&̰&W˭ 0z[̭s0;{S^Waxu^Wixu^Wixu^Wׁeu``0+oz#eu`6X icbEaX&V)Ŕ:lE3R]L.TS)ŔbJu1R]L.TS)`QvRI=çbt1|>] .FMQŨbt1j5|5].FMQs8G.FMQŨl!y8`t1jK%m\ aiW}c 럒.FM_4^/].K㥋W2Y7usʦ?.n.eLbt1d6LJK`t1d2ebk Kiߏ/e`0^/].Ke!.a+/%K㥋ŷ`bt1X,] .K`bt1X,] .K`bt1X,] .K`e::jZ-J;dj9ڬuwN2ljq . .:51ɃA`X1dcbǐŎ!CLp&mrOc1b~ 1?C̏JR2a15\<>tvָ}0Grk9h 1[-C̖!|k]dnh9b 1y3z!<)e\+iOy#S¬Cf7ɟɇ"ld.y.KŒ )3g ҷ"lZ)<"Xܞ%'xPM%;I\!Byr|ROJ<)~'go&E|ƤԞ`ɰY_DkqcHI ?+}#$"SՁU,M{} i?EO?f9=)Z`䟿:1ʶh^:@#L?XWx!gH}F㵃x0")x0")xbeS<9 ?rM$\0!)BkК")BkК")BkК"-FAhlJMi+-PJD&qj?l`~?C;'ؑdMS؄&fnb&fjS9lR7)C:qh{ÅFJ<2!ah 3L71Mt3L71Mt3L?,I7ʴ,צ䑉jSeބ KKx&oA`)˳ZK?뇉66y0y0y_}q1, Kkkk7l,Cۛh{moMfy^0W_},p5WK) k5jaV f9"Ȋ-90@pNlQ }{:K$zsw" l4ub>|y.2cZgщ̰Pc :".8{['$eEg >wy'o =gޅ9l oqQ8FbgGxx, aen|ulK#5puYJ.tcXD$=R!O`pr"rSn CZG.֑ubXG.֑ubXG.֑_I5I+3? 
mv3SXʟ"Owa6ڔ'aiCLF}a59E|,׺f>ud< XGk"&zY2B.b)mlr):, ZM.Vb5XM.Vb5XM.Vb5XM.Vb5XM.Vb5r/K >.wvr-F:{+%h-zN%w5"S">f(UpYt,:\.28:ЊdU]+K\+K\+K\+K4MDS.є&嘦Eg:%+KYO|-.Y[;|݅'|D{jՑhe i :>x|||ׯ;{ldy gGf_Ͱ:ˁyϻ~wƁ]|9 V4B>őO͋0Á >z?\EMϠU~JUPTWR+(BQbԇ,OWlM0*婐"B^"zBDO 1qCBpZ,L!D!" D"BD@!" D| 8G ٥ zN,Å^Z?ء4?'ß[Ɠk'ݽUݓb% NFB'NIE'%LR$-!텰I.9%JR$-XI.9%eIRwi-RG@g^Q_lKnKnKnKNs'\E"R,і"R,іw)-%t!CxOaCsv;yQ4v2Ƞ-Rǝ))))Α"]]x]p8JW\Z6Ł^&ך\[))K\Ieԛ2 {?ucK/%>%>%=q>ct4S8)/y'0T˝PH'SW_},e#)));W_}s7.3xǵ_M a90%DO}=)w`2v˽\R%sɵK7)ӷc`+3pce\Igt<$}t'e\}t9l2]K?rYʑ~+t> ^zʵt\Pr%ɶr\\,M|/>2E.OI:SS8lo!s6d qy (2i\et#:̏˯:^5 GWjCl WLvL +|=԰cg 3Z6jP zk"- S ۖĻ,VY/2?hUjK즯#u8R+ SWm>Tb [8u>Ok(k2!c&C)#Oꑟ}av^p"M쨣4.] +^D7Qs>ߍl:N}7pw2ߗ.7C m9c;j[vS]Nsl8j[= 0ƚ3-2 ⾉o"&⾉o"⛈&"o"⛈&"o"⛈&"o"⛈&"o"⛈5IiHFs]F}tĝElz0qtDJx_$:;Nh?qņ0:clmCye:DTg+k`'$eEg >wy'o1y cՍ/[O:վ# PqcqYŝč vOB"94و1V/Kzk.ykr A=\ҰN A=_G.jziW/6 nC4·zNTnHb!%2-s2V-v)5݅Rfʵ)Z GD^`oSM.ӧ笩o]vo~L.>쇊.%rK~ q`^lMmԶN\TAvD;vl|.af[\wyD9\;d&S2Ԭvl|\y-&oaaaiFa8.NdDj:ǠwbpqcO>: l$' 38WJB1Q_h2h񤢍|nX4eꃎ[6E9RP26P26O~Y I/;mpoQFooATgxV%!$zẋyjQf\[y[LAʵqC`hh쭊V<ܝAJlA6 b 1ȆdC !h)d}'qe@j4˓7 }'F!C|;C|;C|;1."@b 1P(C !dQ2(b 1J%C !Cd!2b 1D"C !Cd!2Zp_i{,L\S:`7 XyWa#0Jf}b05Zvɬ7d6tL%I_ɤ2+Oym3 Hg*k,7,2 :*~Ikeү2WLU&*~Iʤ_eZc)䈪1EaZ)+O1QiD;QEUQEٯ6l,ߎԚe}lTFƗdK%lTF7c㛱بoFчoߌo7c㛱بko7c㛱fl57c㛱fl|36ߌoF]c|3kރݺx$>81:fT;<;/\_P/TWq}D4ƟQy+}R[)3>[Ԏzyf{},&Ds䙼sۙh)L;L`;eixg쇃OxxYcap%Bvrg,L`7y{+%xo ?wF4`8 I=Q,'NL i;wN4wxkl-b܅[llEe0wN)=ʧ kEGOwNi;űwc8G}ǡXC֓ۂ7E0- h{CXN˞4ӜӲwZN˞gabλS´&7pK8.t1]L{bڻD>]1~6 K9.k4]%k3}4v]CiHRfgE<)urTw1]LuSE~MuJ.̺Yr ft:LrͰo{2qMCIh/zMu4,tⶰE|$_8 %!x[4M66&Iho\D0qy7Vǖf(XV‰qYB2XVտh/Z 'ĨX0-ZVտh/wfij4ijшKj?AY3>lQ;G4xN[tW||ui<_-.owZ3+OXݺd iɞ%n%oK,q5-l,)&S,nsJ~LR:,צ\ٷKʯ#& J#wSP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9SP9/cz*z*6vqc ,aVr2/{ٓ^ATYATYAS<}5Wj<\5RP#ZX xRJ'*rJ-BhQȱÕTBSr QN!)D9 ;w4dh2 Qf!,D(eBYҊfK5}UQ9( 6y(ҧ䟜OTO a^Td5wBgk5O2?)󓮪O$e~R'}r )2?F5O *FIt`%e~R'e~R'wRx'XIIIN餘N餘N:b:)b:)9)9)lNlN]'esrgQ/+ugQmLT2;pX8ncG 9'Tm,_,jɥϭjɥ[\ZH.-Ȩ=!?FK) )8ﻌQ\t94ЌSoN_{UmgN8~f=#uم 
asX}=Tž4:kZVa_¼Y]eSǀ]r#Oa}xI_M8$;,K<:mI…r8n2rXMһ&brو2?͏Fh87.&EXMva]iEkETŠhbE4"$xśmK)hb 41O^qFD7QMaִ,t)K9&ך w)%ϒ-ɿnKrX!Q˼D7{uj0*[s焼̺L{OaKY7)#ثCҥyPMT{x'ǀ\?PMyuxett yuXO&ثîLdx:;K[P6u:p.ua쫭7j|˝y֔t:N.:N.:N.:N.:N.:NF0!(i=>6^_Q *״qEȀ״AE16n6>ʱљZx'3UoDO7c'+m\Kvk/0oS\$y\%ԛxo~~H!  ZOPqX΋"yqu>ڰ$7K㐗!/CCC^(yQ8Eqxxȋ!/C^(yQ8Eqȋ!~!~!/C^(y9qd.@}Y!y0P]fP]\f+Ly29OOEOQS>EOQS>;;EOQS>EONNQS>EONNQS>EOQSOO>EOQSvpeBQFzwIsq`"7x}()zw()^)z()yr()yrx}()ywrx}()yrp;E!OQSs}t\[H3Qp& D(8gL3Qp& n`!̏yJH6^.r)I%\rF+s}L, ">.e1eeeeeeeeye8pe:1Ã@@@@{lY%v.REjHmI"]$v.ERHjI"]$v.ERHjI"]$*I $BB>wGc\79-i]NrZD"qx~]IG7 yӉ7Div&9E)]Nrل:_yN{wI~mNhC3+ufeigp:7x.C}Ð<pgG;uwț^5xONh__;xsMMXʟS !C!CB.B.qK$/7,B,Bk7'5$C$C|q$; ׸ ^#o! !oq$?zlU ,HN! !SI܃7-=  &wAuXROXk%t7  v0$C<! !^A7dA2$8dA2$8$P^vo+8qB.q8u!v]]q8xB<!/ Bl,/a%lzMX{ k/$a%4&^KxI/ςd/DE`uM|e6^0^0=qZ2wuy8L0dz_zஃe엍J 4R Ɠbz&'MNr:LO1tÓ!exrM1RNN CONrLO1db轓e؅Hq2#J1R8!ի./\)NJR;)NJRl(6JRl锵{(em:em:ʻ6]r/sI=\8tS s^IWR암{%FIQRl%FIYNQR֦SlkQRl%FIQR֦S֦SlkӨJ nM()k)6Jb()6Jb()k)6Jtb(yϵ>kH?R#H?R#H?R#H?R#H?R#Hη8j|GosU{V<_x3_6ppkc366}QV{,TV<ƇSV'l׈Vy ð6g.wЀQyk`G[ Syoqc|\8u!V4.K<q׽2x޷Pȴ^?𤎆=ZT䗓D~'V],C ٥ q48[ktD-?vTyJPPKP a8K_L?l_2jSҗpH$2CғD]X4_}bR s̐_z 6*`Zsce.1dw pDs܃QӇO VR =VԾqO*rp4LSnJvv\ gzudvPq2]^t؅''L)L: raYKy |Yʑ=exKnRpH˽\wN:r“)S&Ӽ\wa׾)l|mWɾ xYʑg"]V'KL]66gI:E|Ȭ˳/_r/_}YKp.ސz\SMCҡ}3Y?~Xtg_vIr䟒_B/OaRSF-%b*]e3s6#8ks_f<`ٻUqg : ~0 \ʼn]jMp'oPxp2r6?,u(Wa’gJ av\^rq7.......N'8ltqKͨ@?*GՀ'Ϡc5pQ . 
A>Níո8!/Cm>UB1eWcȨVkHޟ/ӯ2į228Ux1.ɸ &]̸J#2$ܤ~I{1e>DC<C<Cބ1(}Yߗ|c`ӥ./ɿ:ۻ6R<& [cjNr@7}Λ>>aNv m99X& LQ7gT<$OXʯf!,ȔHz.+0&\tl>\t,uW],a[ Y*3dUfȪbfh!+4CVhh!+4CVh C4 ?df ͐!+4CVhh!{^v3^3 eho\@ޭ2-"ne)f zp£ 0b1W8GMa>r|.,''l’ߟGQ Ã7>m7 "lֈ^ht(oڍc8fc8fc8fca 0|,pcs}]K'5kUKi$% 'yp4` LD9>$},a3?&:`.f9s~0۾>fzHz21L|3o[Ω:X,'M7֨܇u05Qu7e#Sk\8Fn`n9sn9sn9snL3=ιHљמgۈ:K$Mꌚr|rsc0 JYj9qpjL| ^r.bc2L8]1.<[Ltq..eL2]tY?sq-[scn[~<`s ^ ~<>g3z-/%vX/aY^K:{Ig/%tv`g;;X2;q2TJ-U >}#]v}n W!]!]!]!]?@ũ 4_ v92Ώ{IJI΄IJ_2epHZYmlg.ގR7WQJqa+).c%ŝJ1\R\[)$(9<Юl^Faם׺5?^^򗔹LXG^2WCcRcRcRcRo)޷Kʑg7…%OJ} Kd[&utKXҍu3gxcIn,yc<r.'DlIJl9l_<2+>0Ij¬Osم(Sn<_ٞQ0g̯DJ eLE.#ze,~8+pljŋ&[Pgknt^); Za*YldZGm:6lŵyb*g?8S[v$]}Uq-U7^fl1x T{ND-'.>ZD=e\vaU!9\jO]h0:F%JLA+RQq4/6VO4FҨCuHipel1׮'{͇Y8bi’7Yoj-=t؂.]u.eÒ2aIcqY 퐶2.<2NaX)y鱸̶"ȃ.DyEt16%O͕N.<ǥ>x":=?uafq^1?GE~t\lcH!?ȏQcR$g&!?7C1D~ Cd1Df Cd1Df CdY:w/bP~ ͖l"Ȇ!al"Ȇ!al"ȆrRN+>5熼 y"Ȇ!al"Ȇ!al!榸榸榸&wiSX]h vc7)S5'wTm{m{mKmKmpa.///4E?MMUShS/6EMaSt6EMMMaShShSt6EMaShShShS=7EMsS=7EMy)mmmm^/y >2Ld/#zѦ)oE)o)o)JkT 5&R]iG] DAN9%2ˑ PT l61Mf[~8g [-e2&dLwn2M[-e2&S-e2M4qKLGh2Mtj`}$k?4D֢Mfپh/ꋶh/ꋶhcBV|U*_b/ʗXK%O| GA|Ud/ʗ%ާ%%^ѿ@/@/Jۊ'ׄM}5,g4) `.s0QB-PK$PRK=SSL2;S !,Ӊ8rmȵ!yLRpuzٰY7)'YՕ K$ɚ_ŒgoC )3"eΆyX )Sڋ`(jQ3 y7(y7$&pN"3L`&O O~iMl` 5KV=YI!" C(KrZ.m!:~8HSjK ^ "Cv\CL>oa" ~q5_xGVM"0Cf !3`WЁy b %:T;NRN~¬3. ;O >!'D"|BO >!'Dd IsOB- &I pI <.msfrE|v9?2ѐB&S"3O'7#J"g'25ѵGR>IoڙBHIPeӚ< %UJT)*ERT8x-$Upκ1Z'"R^&J':0G%RDJH))E"HKf^恧%$I "URJTI*)R%EH"URJTI*)R%EHTV0TلlN<5vv%G^ikIӈN\6`(uc1mD1m6l`$Q[Tdaو~(Ari_I<-}x~&dcy pc9uECEL\6a(.]t͟'.wI/2juy;hyzl¬h\^rX Ғ,u6xp! Km,y]v쫲/E"&>C:׋aQxYҍזrisYgAVi|,˧i|,˧i|,˧i|,˧i|zZ> ^O(͜l4{9̜.35qMAF21tҽ.51u]@0u1mokBPe}+k(r ẗ́ zh3CQ=tGM]]ZL?0`Lêbq0000xb=-vyW?tj2@$>@$>@$>z<$W:-o8T&-lŦ4oL*            Y}cʏx(;٤0z/렝(q3QaݶS$Ze%+FTbyeׁ-YS[~W{}oQvgMΚڝ5;kjwYSJ^eC&8]qUi^ݡ< %ZpaUVA,~& o~oXlV-#My+Na`~q7paд$b< OCjaӎY(GQfp.6ę"*ǚv2:-.rj&+r!Ѩ Ͳ_/7Uq9xijέpùd<=gA{aַ]}gvkRbTG."`V:şuHܷdDk+@]J$z o2@y6gF/%w> Php +,2 FX]/o0E.7v#NSnݔ~f,nY7ˬe֝/ 3jVVݔU7eͿIS)o25FTeMxSޔU75x)ꔭ7eM ӧl)[o֛ܩS)ou^0gn}[b؄!6zpNeFلQ6aM 3VaҹU@[V{J? 
|[6p\'jM,ÚZ0:Vµt,̡N| bC Ac򮹋]ߟw=r{ O`R >}/O~X>J/V;(< B+X`GRYHg#2 JuPTՏ͟Onj~ y(>k|ҨbKNNNNNNNNNȡzɡzɡz^]_&"%.+9*tɝ_^RT]`ʝK\rwr=vvi׽B74~/ WqK]>~ٯ|zf7'w0›J'{?S-sHݭ᭺O)sH}}⤔9̟e++)'eP/Uqxt }oF;ͮ`e&Jj=ۯ8UO2HλoBB/=v^}Zb]}qʠuh;ѿ;W8|;(AwP|;(AwP|;(A $_TCUpQ(vJ/-=wT_8*B:¡Ʌ!yCeSA1Ya!9CNS9T9CNS9 *P ik<ɡL:MaCm*<6FGnj6OyX9VG?FfC+ʍ:Jᩇ]Z[ͭW=l؁n󬲋QCMzkGlA 2A xhڀ6'ZGy_׀5} x_׀5} x_׀5U,a#o[R׀5q x\׀50^ x\U_͓=qk`: <&݈E^ҭY32uy'skḼRd,~m9 ea0'6*m̉m̉m̉m̉m(ֆbm8 7TjaLʴLÍ PEK*hiC6ic҆2:ohC6 *B´0e.Xm.Х wJ wᆻp?w9:_y 7Fşs9 7y gᆳp78~78~78~W[~7?4^skoyʰixP/TOB/ku">'e*'W_sd?AeׇBMp UqBU0ŭ w]~.l*=?x)V:{XLx6f z^i~+0ZHhV3,mQ ><X/H(BɳIf|=Vog;xv(ix B;Zwz/Fk@|,CJOH =[pCx<#\yHOl}bP!r:$6+*S>ou|_Ƴ(Ga<^Sɮױ?kaGHzV30L*:>:S}|IZi.ilY+#Rs6Pv`3=U*iiJZY%['VI+}d4'MI}d4'MI}d4'<f7*26=#$sJGP NAWU.&i0D\T~(!6ı-H!`4" H!`4" H#ZeE*jhjh,KRh,KRhu_&]B`)4X p,KRh]j߲,KRqXe[B`),K#\iE`4X A{~^XqE`4X]E`4X;ޕAEp0ʼ,"8,br񠞩$1i G&Ծ&j>G]z࣎tUs*QS*tjja&gm*ԀH Mʪ xb՘Ӂ/5S/6WUtKjmoV~h@h@h@_ݐU~r*bwT)FCS/ɩćUGyê#Ggiŋiŋ=QV iŋON+gM+^?!\uTd>@d>@d>@d>@d>@d>@d>@d>@dny8WXRngDgDvsHk2\ӸNqD:u$I4h\'ѸNq:PսgR?!z1bw>fMMٗJ>*B~}a)*yt)yt.s +n`W!*S_1y^zrŅ.Xqj4Ak mJ_\:uŃ؀l^nvM->aO&LW^>GO9p4hB.ʯC`32jub!&C/x!g'u)f&iBV7/?g[uPΆrg7lBVo x!J3 v;ɃֻB'и и и 8/isx y K\uZe,t򇸌u﹌u2]ƺT*T9YQXwca;u06'ҙ *Ԯ2ÈX*a; zA!ꭌ{qp;tc:YayE^r /y9`Wq_~wQr˃V"qlcg{V9A8Ap IQEn_;,,XX ތ`<.$Hu*@8(@.lXoDCxW|+(Ϛ\\\\p+ u/G)plOq8"||9~ϡ΍QyyrTЁ>!Q)>蕝 'b@}EsjSAj hjSAjSA([N1 Їj#U[)r~aBޕsͯܿpPS/:ŁtHۿVfw8mPu-kՒjټoR7U)^O`Eót%G9;ڒSVH]<'ro~0 IG]䞂7*WIa;2]fTd+).Ex6/RP.bvp+? 
ď%(<G2i__67 Z w^Zjߪ[ ^Xޛf.CywYkptAEWL}P/X.q Vvk֪4A[ y ꖅpF~0ja舏]4!}C~Xe{Hen߄ţ%D;qQDCG{|8,ioҍs϶ )7W==bQ;dGtGEhju9Aos0Y6^tOPPPK3+ }|յaW*Sa9B}}CxXF'ޜ)ldz(<2bS,x!V]y ț@0YF޶%g~n؀'u+x٭g#.FuK]3?=^û!ex7S(s3Bnկ98Pvb=W#>mz6`9xX1Pn9pCxG8xWQ5t :@@c 1: ܃v1F;=h1RF(unmnC6nC6nC6nC6nC6nC!Qg@] ~Ҽ!Ҽ!ҼQ21$iC6 cIIZp=-r5 Wr5 Wr5 Q5 Q54pqoxYCIԀD&tp9 Do z@7B:Lo z@7#]"zӌÈ*r@7 Do zۛFr.7M.IH@7 $n qH@7 $n qH@IH|"'Sלy)y`lLQԌuOQԌa'W^/.[@>斧斧斧g- Y ]U4ߑ,*t+4<1<8؀l.zP?UMű-SU*$Oph$b"6!b"6UXjI=WCA6\4kI [SM_myO5?Ou$?ZבG{W#[ |kߚCIz  ʶ1rJ9T۟o-/w jr9[sϷV գSoa8|k a!8\k1rXW~?2/ߚâpօúp 59,j#-O>,eL2qX&a8,eL2qX&aDZ&QIee_#(%FQBI0 &!$dM^qFk/{_&!$d }p`(:!t B1cB A:!t 4TX 01B&RO#`bL01B[y.Hg2|>;6MI0I&I$ $$`LI0I&I` U` UA3<50I&I`v->302{y ʔ!`>̇0&Cd is0Ӏ4`Vw9H?@>&M3懏 O`U^x@2<xN2s闌d< xsd<x/byN2d< x@2<xN2_{qU%Ht@D$: Ht@D$: HJ^%ѫ$zDU{y)$C=}'<K$z zeID/ID/IK$KIBĒD@Ё$IBĒD/I& MKstjҲP#Tf,/lY̳  ga..ªWg_򻰙ga5jWg_stIq5<,/,/ۇV9?>)e[ -M?Lhܪ`vJt)~ׂ~-ׂN-ԂN-ԂN-ԂN-ԂN<"]{y_ tB:{6-hӂ6LLŇC%l⤾,˂,hʂ,hʂ,hʂ,hʂ,hʂ,hʂ,hʂ,hNM٥)4eSC9GKIv)ɖ$J+tKIz{jƖflm/y3|)jbk;!"wk+;!?WqfdI+ekfl|2dh_U?0kWƮ =[ڰ i*pڣUҀ OS66tn'gc /3/:FZ.طOF]lCm`jcc]~,Q 0q鄾O1n0n0~ y+` ` ` ` ` ` ` ` ` `-2+z,a!k27gvQ>y<ņ4 %'}U @VAw.?`Μ qgǻBx~`F0ax#o[ߒj7w`w u sַ⭲͡ꛎ<GbLJ]pQQ#G|G v# xv'‹ktqW@ C.X R %v#C|C|g.|؀<h \pzw ?FI%FIamJai%o#"Of9oli4JZ%VFIeEl-ίVJ+Cd4Y'MIudZ:rN攻F9zV3gt]8͈&3th0S~6٥th0Lv/_n(Y 5Ûio̴divj5Hxl7vl7vl7vl7/?܁.g.xYԻ ߕ3aOq&ϥ8C0Lp&yNOóqg6Hl6Hl6y/Fpt6>W:m6Hl6͖=<p_?ʡ$#Ճ8!KVdipv_%Mh0\t`T2 2 2 2 2 2 2 2 2 =e2Xu<Ǫ9VϱzUۯ^Xۯ{xPϿ7qޚ.5v`dLvɤNX]2%~ZvwF6gV4\և a9C2;$C2\o=_J?݉Pf{sJ||vg|'^|SJoTw*C;C;C;[HW!'.ê뒓9鐓9鐍萍萍萍萍8x"9D:W#ϱ<8萊uê\L0¿\AAAAAAAAAAQ:bq8<}PrNQF u8|GMn]|NnΥ/KSqX:b&" ٚ\:82X#O|[j30큑Hg`30 PUPth x+Nt\{n syK ?CSH7P ?=<80~ ME]t\1C؀';0V)؁?C[e<})~y]2b4<3bsˀ Kf@k(30\Рʀ B^t<3aU8N_ҩˀf -|Xe@.6kC `E9FNA>zυtS. 
h߀ h߀ h߀ hthϥ}ڗb:cչXVg¾sV,s`3O&3)]/3)It.2)X:c{&Rh+2Q#H y3HA #"KH0R1-|x 38 >{ujPcCe%,􇩷-)?zAiA `7p \oz,㽎n⷇w?\l H^mhGq\-x8 2 x ;M>5O |jP85pC jP85pC :-X|EtȰ]uٟ t9'~΅簄{F亂;'#V#ͭ8e_Mvgl]T,_<##]*O,Ƙo1F';X <1fL -|8Hg"z@:뱐8h'5csp,`ř5&g5CKm~"s.&D+ǑGqSr.k[u 2F@b*zv/ՊEg޵)|O {}(ԋL]bsJxx1^[o?nV{\Q.OE󫸳UGJwYߩ/{{#]zvO{#]z޼u!h+"/:Wdž6+LqM0b6.8 Oy1k m!X_q$}$S=\p%ܹG/^2ty⼽niYZOք{񪻰/aSϭ1s(R ClLm4-l?܁ߑ>#NMZ֫06+X2+p+r(Pc)qpݚw1m`xv**~coʃ 3 X5ԩ)PΦ+X֡oX|?.jp0m1\U5a lX5a lX5a lYX D_<=; E5ѥg?~:?ֿK#0lO~MP- 26yՈljD5"ۿw' m}Pn9`svG 1 XzxުP}@:Kbdt=_H~Xíiשo H<'>I|Xe㋇?yKa'I`KawԊdN'ŦNaoiCS~4.hIHio5ЮЪU4QE?4C? +n(B&8뽿&XJ@t{nbpA2#K.iu֝ƻA2fq6qI3ev4J0*ZiaS@+ @+ @+Ήx٢&⤕&ojE%M˥K%_዇?y/ފ_<[=( h h h h h h?jpSC s GTn@޵j gN|ZxaT^hY?]|V}0=uE0]&JSu#+ݢ.Z2QhQe[˛.o޽sٺy˛w\.\ [W:o]{XgEsSu9/3 ϚQC:Em]^Ui*rz?g7򰑇]Rs8yhyjWgYtO+XQKV:1E?W;*۪Ng> it'\9/]i \_jiê\e}kַoÎpo料0eq8GYvGٞ 28 Q?앍 sWܴ=ߖl28(Q~ˬLp 2CС^+¨Lw"H,!ti642NQQl]֝rV‵8`-|m8`9Xji-:djQ*r͍gEfxNV9;Hߦ7Cz٪&U#܆>{|liFtrŦsT1B`q (Ǘxx.Fx:a<-1=wƳo,kzVޥWDzUW5<*a=["e)H_s1 +xK쯎ux|sx Ky%-#/*~^pqL] puJ7U897p ox87p ox87p ox껮VuWѾUY.Y.n}rO%<‘i ~ftw맱}jOnrD7 zc={c=5ڳJ `>v^O`W:8·pz@t#^횑Nxm'^ vՑO`IOIϿ:llyX'< 넇uՑJ t{:=NxO' |JgRlUdyL'< tc:1PMxL'RBjPxP_앇I hEɮO/] K3i}C`Kla>la>la Zu a a a k!X{X{X{a>lzzzzzzzzzzz:/7u.gW؇޹uGw f-/yzQ^v3F*T/>#ׄbp+녮\ ]p^w]]v񿶽&^Kra&9vqöT:ߘVuYg_NTƴrLUOpgI妄M-~?TX(w\Q&Ӈgg2vzƾ71j9>~x*XaoJǑ]ww},*opau)BopЋoppL ?FZ^(ˑcV~#Hý>2}X_)ԇUB=jB}x+OuO> ql>0· S7| 0ZӧPlPB}ExP/TOyKښ)V- eY6U͠WhIMop NA*op JۓO\yo+n ?TEXLqYO`U̷dLf^xxe+*B(o5DB˾sa>g]/_5SXc%H#t S?xv8B?raU&~o\@1W=~#>7.[,xLPD/"&>wh5.LXٶB^ല:ZM<,>]hei >ka>UEOEˢeQ^EEuP@[VsQgv]EieZ\asa".Ӣ6ʴ2-zz9[^ߗ5٥k41 v{-p}؁;̇;>}YY?a o#`67F<<ߚy`<]͸~x~S phZuZ:][aG>(}"D|ǻ|(<@ޒ:bd-QRN3 c1Hv R{-ycpW~ªxCE:C9:C9:C9:C9:C9:C9:C9:C9:C9:C9F*(c^R[:J%FĐJ yKTbH%C*1-I*o鐷t[Z< 9CC!g3t: r9C뜗|G$?蟯o: BG 6:tJSYS%D)>PPPy&߇=A$~G޿,q)ހ:Jo@o@7~~߀t:~7x&ArY> ê_GO%J\.瀋sԴwS1 1 1W|_?4l爉)P/x0]y BWJ^߮zpSnK.^rq(yC᳷zvv3g(?S:A+k}u1u֙hj?S`zl *mcpY0}pJw]0.Xyiμ4A%}cDWDT蕝]PűwǮq=Z&p&pgyatP<7ÀCn5kztQsJaΤ8[qO#NSqq]y Xiv{www1>vhMm9f%ܿߗ hYEKp+tdy^j`%;z^jv% Q0K@OX,b X'99pE.Yrɒ_qJ!cE.npQ^+E.BO+g 
ӥ-uA=u[xLT>X$goSynCu{G?F׳]*2Yj#qj?`{}V\ÚAYqDwᬬP}VޛXռ~* QK*xg#h uH<ڗ ?孺llll)Z뺭>uZ3B2R]fܬWw+p5Y5%Zr-3tzm_.e?@ƯXXXXDէMn nY>ٲ y ` Z%5تlqV}l[ÖEgO"'XZ㉍1˻>6 c[?+kxp!:07\ujַxO`4;|1y0CO_nʧLw9tVx:6VWJq\LƔ{7 *8xwzf<̡C8M5E<F:Sy(r\H3T_p<ύw{YG{8d:~J'o8hv~J+pV5t1A݀U߻m;ZOa/jr)_?jraMV/MׅG~VÅY V"g5\r{ku-HRܕWYhN*=gӱ0-}{X >'Z3`a<6؁z)ɯ_V9$^M7ҎMy(×fܾ?z#ߞ.* wvpr~a7<$=t'9a'U}ѿ%r 9{ߒ*^{KyC>{PN8y[y؇`zA??zĢvަqre\c֪Hc*n|?z,AI̅UWJamsTv/sꋧxj0OvC]#^k%?KU8=p(t̯?V^wE-M[xq)5R`.KMi1uyWE}zH:H:H:H?" &_D1" a(c:M锥oO\wtN;:yG'wtɽ3PV¨N>r棍8zQ3k+nkFpډj2(KC9EV[80UFxO`]Ձ;$:a\3N y\.{S\n R2bp1 , hr<9LCm 4C;HUѐ <|nsF,֐ gm8VHk_8p p1$8x{5i^lIV.+Ҵ"M+"HAD&4 i HAbs3=;lz<.6#$HA@4]L<',ѿ&@4 y/` s s م998 s s =l-zB[a =l-I }vM'N; ܭ餭餭餭M; v64liشӰiaNæM; v64l[U; O˳sc?oX)Ɇ51+c0+M;•%ۘ%ۘ%oe p ƚIۘI o&V~6f6f68k4w`5upGF4QnF\UޞY}8Qqo8wwVApppA4\<444~p,e4~pXRDAMU<$֩8CEd1 1 amLmbMi lC(6bcnC(66DccnC46DccnC@6dcnkp؍| Ari_2zI'!rvm4hjB®݅] vvuQGWcw-~Nή3Sm.%Mb=<ۀUtWe[/F%t/Ÿ7V~qq[8޵-yέSF'_l*<+9uُ^xV哖o-x`z`P*)MU(Ta |bTRƻ6;ޛ9kFʋ->; !aud}"iNEa5h@;.(bSB.1V nW~:B=&_'KFY_Ac;Ṽ"{+kޯ_]vy8`V~K#< "7 bֳ,P)  д-z!mAA `|cGJ&g/v@ {y}KGbŽǯ+сNއ b\#qu#|#8kzx7|-`WL_[zmM5ִ^[zmM;5GU~U}ވWhaP>P`Yww*._6_ڏʳ]wҷG]XX5}Ӌ0o/?xQ؀g}K'#ߦc ߊx]H;@)u%=R5\\5KJVtE m9p(?PvЮۃtG_o8苽`Fr#M|o$|a1?7䮐-`<ogQ>zB^Ӏ*Ɛ+ay-b6Ppv ұKM҄ok?*,Yhw][NU 합i|&ɧmi|&;b򎘔G;W9`6p* nsxx wS9WHٲ N=hp"U5| ._\7ykXtF b;v3 lg`;v3nYYYnYn޻n|y׆>;5XONNۧڦ׵M?*B~mݦSߩa԰|;5,r =Nn]{ ^Tf(O8'c X}b11r}}ڪYzWs2< (| O x+Ϲpӻr!(r2O>NhڒR|G|GAv䧘SzqJ:N슴rbO8',#!'Ĭ|)|XҙM[p@C Y8Js  e"9' s{¹=ܞZ,pQdxB'>AĠfb 31L d&$y>1 ;a:#BCN}X,:_eXmWH saI6$pM8&dN 'ل11sxa0IA=߅4C֫zuXa:W^֫zuXa:W^֫zuX5z2Z֨%*Qj\У Ԩ%*!5d Ԑ2PC^T= Vœ]Bv u+dWAV[rְǮK6wmT!sB{Ȝگu`=M]]{Ԇ뷨:9D`KqpL,`Z6z^m|ulw/pkǨfV.&+RxUۓo+gBۊnl4کۏ)q g@8΀p3 ጿښtVw& 0 F&0@ZwRtT#p{I^=8jN`Nl`[:ul`[:ul`[-gm_{k+͏jGA_! 
E3lgwe~Q[ݪ]]ͽZ幥FT2oXMXwGj#9YR % XҀ % X0 Yٮ*j4EJ7ǧKPC %S1*g{8 3 *2 3 *s[SV#`s]K<0?L-lh* /=كF W><:z6+՚υ_iptZ͵ 3 3 0 0OJ>_k,>Ue_/ /V<ZH 27/ 34HGj5^g_,haե~cjQمekVZc僛ҴʧpUnx{8wM<;T4Ԡ< `u/F>W 7wl:•6\[;.Ցy_iۿڻrSzDVݹʃ4A.V}G}ƻ ]/:S c]bt}Q)emJjf_dm[-+jg_kK/aU~W2\ξw1ptAE=;_1mSߖ'\< 5)X)>( A\z[W il;g5I67 MPMPM=hshj7ձz&fDživ\X/%$^yءĢ*+m~zG&gmn~n갵1>-S^Hg +vq+ -<)aWTΕƅqUnqrL|@Țzx)~Gn85`'}8Fi*u Rmw`mnŒ a9mo*xx've4c8hejn9/rRzӲZ .Y~yPdtV 99ddV7@ +h5ptEXJ!BBLE𬏜gx׀Wvjz#_ǻ hS Ň(rw*?p!]8:+m*]UY@<F:a@F<@B ӭ02Q[tT*M;p ^0Pr9@|Xʽ٩ȃrpGjņo1|gzCMZ0;⨂? 0݅B6-j!`x7Ϫ;Grs(Dv{^{/ӗa ur?4}٭ਯt?tuZB~ecxbCx'2&O*#}a?zU^zƻb+Rd(Rd@8 _?GE*+q,:N۶MӦeztl:=렕>?A=S'4z-ppdB5H2京^lF!~`T eur׵:R1)(֯˖{wF,CGr 5z-e:kZ2zm:qw~緥BzE-#yiv-#x-cڿ%"a*r?Zv7k;n#k/v'< 7\xqoOxV\Wa}Xh㹸bPcEauXDQEauXDY>_l?<Lq)Nz[{T;0x5 V_~VCL7԰ X.!U9/|1YRb*OtOtOtOtOtOtOtOtOtϿz~'HGs$< Q((([-+I$ IҐ$ \CNqU$sv䐓`I0$r 9 CN!'`I0$r 9 㯮B{x+!J'3`@ ΀g3`0 p 8΀g3`0 p 8.FϪ po2^`Ks TB  Zc`10p 8I_,gVDs L1Pˇ;Jǐ?`ѳ0p 8`0iw S?`?0c1`?0dzX]V׀5`igiY@}]k)ǺJ\Հu5`]gQU|,-*Kʢ,*+;aeQYYTVEeeQ,*EeLɢ2YT&dQ,*Eu+Y&_rY;0,eLv2Yv(OYeFݏwPIfx*t]ߦɆm0 f2Q~ /3`F̨#eF(e0v;UeF3HHO/5c(R=`c0z F1:C` c0tNJ<1:C`ܶ1 c0t +|:BF:Cbĉ1Mx+wc0t = ;ʠ14`1bTe Ac0h.zx ܀EގW9e^14_ʠ14٭: E{ˈ11#ࡶZF11#`YFysb/ْc0b FEd0\ p1.` b0\ p1.` b0\ Le2pe2\f.̕Y&"f+&"l)e<&οS{r S6 :7*:BtriC&:tiC&:tiC S S S :BޡɅɅɅɅɅɅWWX_y a*a?ra>&kf&f&f&f&f&f&f&f&f_-^j5[01[01[0a=LXÄ0a=LX#~OLLLLLLLLBk`b`Ց7!FYYYY 1O n n n n n n n nvOnv/nv/nvA{{f]"|5(uq]w|0*nɀkR7tށI.Ep)K\R"A' "5rv]#gd9[Bǒh\Zo_i8TER8Tǡ:qCut9h;1Hw*CtJWBf /7LW$_pr/Z<N$`,eLI< DXma?EK@LmJXUhJG L=[Xa5+Q+| k@ņn@nUB?@ ?@ ݹiu<]wqy}GOdJ?{[+yӁ2T7$]I9gNXuÈtjkWCFris3r}oF-[uXT/w}!l N}x# kyaH: +2T/nJV/_X:bXm,OĽa0oˁU&y|zW8?x"yihyŨӉ:cFr#FH'fpV+a}K/獨Fԃ`yZxuyhI^wBVt2w-޵h<'N>F[X5\[-u˿_nWu*nWVmA^ /熗s˹rnx97m,?X~ԋ yxkT/zW^*aDcC6aƷt܎}aOxG7{cцqv+́:Ձ#~3lW:"aƳq-<) nx^x$x+Nm/:V3ʳn=gEt?ҵ_tW{u_z,3խg[Wٯn=_zf~u3խg[Wٯn=_zf~uՙ3UgWWٯ:_]uf~u~3fWٯ7_of~u~3fWٯ7_of~u~3fWٯ7_of~u~3fWٯ7_of~uh~3ۯ$~u&R3ӉTL'RN2He|^ӉT+ɷ_IR31T)}]ܚpGAlz%V_-~{x ϡtYe <e#e#78&~߁JՏPa`SVAb#ja n?Lɿ LqbUR ć!y =xX5?xu`⤉p0j2aG5Z`kaG+)txx0̗Lthi:ZY `;iVvBd'4 
4n7|jvBZ.\ sv[rUowqb|r.n,gSHg੢n\#fo`1߸- V]6cN^#ޕct=bE ޴QlHTHTHTHTHTF0Kt_ChqG+~Zl ۆ.j?E\ @\ @\ @\ ՓzW/y9tQ.=YtqTGuqTN/t%[]&]o.t`0[9 Y8C0Pe #$HL 1@b:tɅm[՗g"1Eb:tɭץD3*0j ͢?" _}뇶zNF~L 9:} u*N 9}X܊:fe6;C6ʃMͬ>-r?tg>~ %D/FrϢTj஗? ~`?0?l<򞳇82QU9 |P)̀ (#.'W? [p0lz/d9_avIpĻ&xvqUny~Ł4)0{x)in[n؀'u+?C.=U.UNsz|D|Ifm7X"T>:uԋK;jêeL-xdϺԦV}̺ԦV}L/:Zf:0"?5b:etʊZ 2uo*glS7?eU̿:ᷯbJ(Y;kuZS'~Ϫ9o=;?Ӌ t{B't{B't{B't{F;UM_wQ~҃sSx໓ }47^< J6^ppmнxNs|n?=r e4hSOŸ Ÿ Ÿ oՉ)씳rй'Z M;՘*B}^~MP-;^LA`X]O|c5k ϮJ t'7StTkr<ߑ3:uG^s\RGhwFS1=juPkܮt1.]݁Zo[2FT[N6X0s|з;LnKeh'EaV9DWM߸PnKP[nO\[7Vs+řQ C~*b񨫎/sGQWWbbbGVW\0[Jh yh yh yh yh yh yh JlfMPU|S3WY-B-B-B-B{YrJ5֣֣֣֣֣֣֣ǽ,W x1S\tzl걩Ϝz۬6YS*WvBqW5̐dU!/mK҆UaN5r͆wE5y x^׀5y x^'5I H `fʉkR` XyL I{o;C w`=ޚ_SၩTx`*<=7#xLkS< 5P׆XeRӸYu=3 `s酿6RV ϒv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-iגv-i׺Niגv-iגv`U@ؒ-ؒ-ؒ-ؒ਷ a(S\JіmIіmIіmIƖkIkiqIkIkIrKӊKڵ]Kڵ]K$X X X X 0U0U0U0U p  p  p  p 0ŷ p w /[[[Q[O,#<~0<$d Bxe|LނH':no7V~,'VA/_ 獶T~ [ [ NZ-yąBR-UR}U!߶d-W J bK b鷘~鷘~鷘~_⺊ĕ-zߢ-zߗ+)t VIoqoqoq-N-N@ Ej\Ag\?E[D5עC5TK-z[LAGm8PERLw.yҩ~cO_89չ,1uT:W6z?V8tS..*41o,`O[d7MǬ[7e_9ş+L.o͎ͳ>=he<'r^_spToց]FE'OTN\}0'/On1݁Tv%"LAǢr/Q]ք{FQtC ]q m/-^t:+;J3M ;#I.v< WtyDr j&~؁Cq:U^re.;6oC.6Ϣ%jS]<Μ qTag.$m1# X]/£!# +e.,cM?W]Ky3Gv`W?6g{-MVH+S)R'2,MVH+Si2EL&Si2EL&Si2EZ-?0K*iuEȅ;!#|"LA%-?h9VDb\fj`4 Gq ׀4hZ;#e4M?plYZ9v&D2!Lh0!̆lh0̆lh0̆lhZ}qdzhL(lh0xv:BUJʅ#~ o2,lh0zW fCдaG̃_rt4ud|G`R:gȞ#{;' ~Cs9jq9tph]ܚpGAlz6!?l(_ov/s4Wbz5i.gSOt<=-t@Y.VUK[u@[u@[u@[_mܽtt#181)`G?ʍAmi&Ջ]j16䳸}h߾Us_*NNaKa+a+{EnVDlyyE-xL)/W>0iu 4R[NV+NV?;K!W$L3iu ޴9n:bZ}Uiu ޴oĊi^W?/FIbSg V!<<W?=m|xWۨ{_EGVC?|k|_>]iD#ǛR*70 Lo`z_U xS x7N1S C Ӂ(ogsŮjuۀ@>هwPp ^0 1G8~jpUK7p \oz7p \oz_{fr,3= k5=5kiN)cj^%(X; L6ϷKVL +555vWU BT.qY->܁ 4 bplHG]<W^osJ3];? 
t qTgu*\0x6􍹍դ~w7,d-˙X3,gbY|*TijX__LYi84Sfp򹑇BV2e< | Y}o&L,/MM'Udbź-amMM&L,MMͷ6b9PxS 9N'LM'L'&|ʰ?N'L'&L'&-8nCDB.7@B=~ _rb.o/9g/9ᗜoyOڬ=sysyNPaWs>*<6U>$צ*r(j¦&l j¦:Wրʦa8& ca8& Bal:FWg>rXY6c]6H5*8EVqM)6æpaSN88OxXl Mst9:l M)s9G}st9: ǜca_s2hx ;ߡw9:stcѡSH7Yt?:1gJwh]9?51|;41|\Co.wiCFJi.Yp;4ܡ whCp;4ܡ whC( (R(GOw.͌(9"JCrߓ%q5;dS,  jJ肫Bf」6&x}l v@ɶ'Nv<ٮr-v@ld;'ԮU]F:CeRrǓU Q?rP<9'ǑsE:Kƪ4*r/wrv@v~=n^x[}!'z@pt;ہZZx]؀';0S O`<ԼnL6`*>@|C:ߍg}c{vHx)=0 |ċRas)|:bFiNp;?Ng#͍|] UM-zX9~Sd: m.ķ3+\1-%uŨG]8ʿm=#N׷updzo W>c O|-Bv|U> s_zzcK𢝐^h$4!'O9H<n.Tg_X߭^;`ͅ~w}Sox[q</])[x:8%Eo]F7DaCe҇0.?xq/*\(a5$Fv`a$UI/a~s1şxpVH$ы?-);V /ҏ%v&=0][x+NI`- }b9dssJƏ{ '>n>UZ=G-=O>/6r?M4oi]9툏t!Ny5GMg+N~{T?F)nc.\}X+a塨.8HIqQYűf`F:^[i:H[i0@һ\69z py?~ U `R&riÈojE.GUI2/Foo\">Xj5<,g WQN ʎE^mѹxoEãE~1COEz|ZE~q׳Y*@fpbG ^򰦾wW4`1S.ϵgyJ{ `O)x)q]ׯe)."I../T7(f> _XU#AȠkd52uiفvUVτ :I X*UZ_ TCǀU[Uݵb9K>i,vyŷ<ǐ%/vבOgP|es~r4,:$C:$C:$C:$C:$C:$C:$Czv!Wuduͩ\$xXMHM/.w1͕ wS}[:*\wtPuUwPuUwPuUwPuUwPuUwPuUiM8-?9'KL|~O*|e1o_i;ec&]v7.ToGmQ8\bW|!Vҭ6:CXQm | 8.Zj+]%+oO`aua ym^ʆ10+g m]rs?;E1aW+,<%Dey 61<`lcy 6Lg3ԙgSn|Y3EzYf͔Y35hκB"Cﲂ/r `;&|u<م{L qU|lUweO qg_val/RK=N-dg w'21ܝX 1ab;1АB騴4|NLͧ Qԩe=f'(uR'(uR'(uR'(ubh>+fJn:8/2x"|+> Zj n n n n nsX=?!{īN6WMz[\Ǹ8eɺh.q\c\QJGIu+\9Sw*grfKG+sx ЭАl ]㽦W㊃'5ijSl"a?f ~ <ʪܾJoϭ|et|>#N _Ug$ˊ^E-!%c}u{xlf2 -&WUybSJOB&u>}5stGDXӱ6Y<Ԕe,8g 3xz ^Cܐʳ$`3?qXWyʣ/` 2Z b) tbrKLw+\ӱCFQiK*v Sbq'bЇTwIR(!(haBР-4hA ZhBРF-4hA ZhBРyȮ{Ev+Ͻ"w|{\UurU\U~T&/ ;pT/fo{J|qiQM*|<.|<.o˛渼9o˛p8_}Y풷r\g"\z{RGZjvLXÛp8OGNOnө:įJ Ӏ;zUnq@. q@WF lxwC>9DͯbHWK p+ 􃧆q"+p"+p"+yoV~p~p.ePXy>y w~paTYXvvur?8uj獾r?8ꮴ#A~pPX^~pVaS|p8ZJJHH b JQ|p8r\ap9 .0r\˯l`6d;l.;6_M({+ފ({+ފBV [!{+do쭐BVފizL8Wȼ^"M2BWeT XS/h+}N`O9XV*`Y,e3]eaKYV*`Y,et mTS5oY]}Q> G(`lM6Q& DE>ksGwQ:mbM>4x<telP7C~C x/&p_f& D(ty y>el}XA$1 >'`l}>,֓}>('`lT >㷡SN ;{CewN ;^jw(3흀sp(=흀w0rFN 9 xM 6&` a0lM 6&` a0lM 6![kRLy' _,%_a!.m|g1oS|os3=sU(_EޞWg|*C$jʐG/m(! 
{gd|p .]|8Z'cOU*Xb&ܑgO`6~.Ko+S9oW`WYۥ?a!Uj=qF~0Ů i8z@O8UVvT0=;e^6e^6e^6eSx!\wv/^340Z/tРq 50 x\LL׃0]"k//rn\cX}/{X=9 k&Lxz|E>{b^HjX5ARH-`4" Ht i$O K*Se3>/b/F=`:v_;nW:^6ooibCcbm_ܐޑj.6QG~7\6lb M/PUhza.cȣAS`ɡQ5JKb mj2$07 &/Lel:,ykt\M^җD~bMՆkC`_UgVz{Y7 #K7M`2d7 vp`7 vp`7 vp`7 vp`7 vp5n?1}] I/n/ClYwmĴ znDCQi/Cf%]Vrʠ:Eq{]UmM۵eTfZ].O {Zy|<>]Vriewy|<>].#WLSM1wQn؁2Zݪrt~.Pav&nav&Q~xcc.` wL®;T~^p 7+.ӻ9'C#Ce a8vʳG<# onߎ;a7U0!nQACfcfjEt1: >[IHg蹟2 (X*kix **fQƜ,_aS5uq~K8?t!݁1tm]̐gի=o"2@Y?=7/o?03p#~#~~~/ƠQN܁Q(o(fT́;a6ֳ[.kGE|+*e8*`[oT́O+ڨ`>RR]yM?zԷ7$' 6۷&vPxUBrnq޾BrvUfGfH*?CrƻdHXP^ s^\t׿h狫we E;_ˁq*ia*/ǁq ʛ5sp]yV^ո^+/gmY^:|pB׊و52ˢpm}k}.1#pg POZCZCUF=g[3+/*PC]\[C1ExneUkU wQV\^CqG~] H\aCHK G:L10[ֶ'l(`b#]21/(G(3Xߍ>XA9;M:8CIo+6ts@V;Wl(6bWzCIwec_8@Xp E6]܀70E|XYbiȣaY9UXzFjr +x}7p w wp fZ3YV,vU]Ҫeβj6gmr2kgXMٲS>)[vʖ e6|vSSfn7=eNpS~)?ܔn7凛Mᦌ)yjzsy'z8< ɮҢ&ۄm6}Mx&oYYD@<555555555:Hu"o9~z~y& cts6~B)}9eN:iN. ڼi/tbu"k0:'P'M'\jKm 0'>';'\j{eDľ漆]ꙅ/Km"m¥6aMANAN;NX[AhAh{.}} hb/pbobo"m"mboo2m2m.V}IP]]mTi (y0aLy0aLy0aLy0aLy0aLy0aLyʆgl}6>~ bl}6>~}ܫ}sЋgC7B7;"!"!"!"!"!"!"!"!"!"!"!MHsp:)mV"|.$~]i҅c%fʅcRf Ǥ&e6jBM.Sҽ-1EWnMN7emG6g¥niDžL?K5-\8_KM*l8wM׌ytӷ:'vQy\P)᷶V?}7(:g"1Xrk:OoO^m?jFŦYgA> .RZQfGe֘bp1@ . 
1@ lp\ bWga+UەU#_Y@*7`j\\7<X4&cr1kT#s .21L .2151L ]7IwItMȏck }ζpsО>>QzOVtiBƧ}bp'v-7aZnR[,rpKYqv8aG%ǜuG:cYwv; <?uiyZw^֝ueyX;/˺+RWl[]5EVbqr.Cε2\.?.YVl:bqXjWxC?ffe?CЫE~?AS3+:|&JX(̽;䛤~jr8\3׌5p8\3׌cq%tl:ڟUNm9[|mـ?7kĹֱ^díp89ueg:L[quLqflKm-cͯ)<>u!: næ\3[qfp8\3׌((sG׮^ϳ5Fa *ǯ8o)}%^g5nfa>%[G8~_CzȨtóҘ糩zwagP~;ߡw(CP~;ߡw(CP~;ߡw(CP*wu>}]lQ*9;χz)qTS*G2)q== spPN8_Z?!iIP Nm6]BR!|enMwG'zt;esHt;}A{቎ECx֛>3j~vL\ͷ&<~~>]=G՗҉+5~WFC Vh l_}bա6QsL+O/׏z$3psĕlڑ DwwXe| !\?V:ăRƟ}0Pr:^|j=w?JJ,q(ph:}'AJ:nEVdt]Kw/}]wg=>pW ZYfp(ì>N0c?]=/7Mz]7nu뺩uS^MzG兮{]8v./,rmUOr`Sgk]ք; fPSߗ 6~qoOuC'=cM=o`o5[8pxtP+BJc7z`r/kݹeeN_ d꙰%Wg/V?M48ܐQ/`?ZW uxwфk.F~C;v< us Oi7\6\苌x#݄dnY\Q~F| .HwvhNnɴ].V2(5`j\Ct`q\w֕'#7.ֻgC}3HцqaŦrܬ} xO9;^?X} _Ge!7 ?s{KNz^ԹN/Jsn>o9Վ9)sɜj`9Վ92[ɜj2)&M[j[9&y~t6imҮۤ]In&+LWH{S+]WHVx]WHv]!Bu+3Xu  wXMwsRX'N=; } X!l?_CoֽwYpgVi5Xp \rB6qe5Xg YYR8$r61rbD"kuSe5Xd YE`5Xd YE`5Xd YE`5Xd YE6]zYd.Zpnl\ܐޑyUYEtfg@j< ):ku*,3;f-m\*:k:k:k .XBl魺XIS}5V ܶڭvJ7qgWybg7` Xc0MKm]Ho6rSye5{}_ི흃n/믧_['I=wrg^`CԴOJnW4}[Ug6-7U؅fxWC!? u:%r2?, `O@dtn;x>[w@aLj~-\q2dkOר4`YkeO_¨Cxc\ցF~K쨳[p'o]m#Ƒ:El}gZ]V_~$hY|VCL5TCE"ph8Z-CE"ph8XZ -bKRl`)6X P 8iz}jVF:[KW- әs0`x-M[ۻ;^-[zmճj5XJ ,RD:@9@Kׇύ N{́6ֳjao_K8gu<[u؎:X_jOzwس4^(36QO2?Xub:ڼvf?R %[c<Ŵ6Rgq,Ν)Ν)ΝsgԼ*0r~Upz_l]gU5ʻMdJʻZa. Za-"]G,8p Vub.8 ~Gt0 zTީGP&GA ]퐟 Y <2GOje-~}Rgj޺P}SP#>zpD=0[3'~ 8twp l  ՞՞XM+]?_'JءN_rFӓZzhUŎt#>XX=!=!=!vQi *yV}JV}&9]~BV%ᶀvѠs9a|0 h[L)2 &L `40 &L `40 &L*`E 4+OLk(%IiŵZn=2c>ԥz:[烣~V pvARf ^?M֬E¢mm ^bna6 fts :[ЁuШE{Sź ̹ ̹ ̹ ^uqM<Cݦw9֐,0,0,0,0,0@^纤qx`juQzυH_ xo:.k>ؕ!{b@v.DkFw#F]2e#nو[[d]AS >![\ѕJ+\W*X͛>}M V^UGMG8UX`"jm6l6|Ptvу;FNhn%x+\!Tm܆j6|Vf5ӑߪk*t[HG9_+~%y/l4nD|xMZ7>y_Øk-.C?72ME(]]_xGp s$oXc0G*է }#=.?c䉟_;[ZVVUje anjjeZţLQ;JF)dڱ?_ފ<5ٟ&oV4-kuEGisYkZ֚LQ;h>"4-k혢"?5SJS4Sg="R*5en:c ,:~: X]CzS:bZ:zֱu;#`X:ci0(àpaP8 A0(àp`{G#lw;EAePDQEAePDQEȠ!"dP A2(tE A:I-BEۢ`d:-BEȶ!"t?Dts!pZ0`:L0CtFfzxge.3~?@4?b? 
@_ h~@xu ȣ.hc!nr<՞ iX]x#}J=m h{@=pH.m h{@8$==ޥ<=y@?T2Sz~p(=<=,4<Knt;v@nt;v@nt;!i;Kݎ_vRn_nq*~?KwKů\+](P*tTJCHn "E*tT-R[BHn /*t3TurЫfmpOOh}Ym[?y`:^<qc"?š߆!=N]U.=x `Unwy:rigtbG@O.;.\[SR.F.|80R. k .9QWD]:+_jE.։hڋX'Z~\jw`=;?jYZGjX'Z-j/tCNv-z\Ž*a/ݕWN2*,/VVKP x(^OĀ'b1D x"<OĀ'b1D x"<OĀ'b1D x"<OĀ'b1D x"_}'bPkH5~>O,pߔa2eL&Sɬi1eLL&Sɔa2eL&S 8uz岬_}DeYMXVՄe5aYMXVՄe5aYMXVՄe5aYMXVՄe5aYMXVՄe5aYMXVՄe5aYMXVZVO kYUz๡:מ_ܑghOh²&, j²*= 6PH+ mBpLXh m^ E&, mB&, mB_Xsv3醾6WYhڄ6aMXhڄ6aMXhڄ݀;ܪ[c8yqCzYnD:/mr&, mr诲&,y-7Q7ir&, m›apK~bo1&s6o?1bLy`Jnuʺ[eݭZeݭVYw+|Q26 %CniChɐ[ZZZt^,m-m"?_d8P*\GZGMzbiiiiiInwL̙p(u*uW>>;ZhhahՃµyy`~.l-l--l-l-l-l-l}x`.l0rמ)oMu'3Oը3s\um[Hj*r\0-L˅(ԅr\^Z^Z^Z0!LŅ3pa{ia{i%aom/-x K KK [JK_X|ĂՊ5?;pC㬶_]p^b7<+mt/l5- j____________f|i_nV]!;/~낣`'[в<o2vhfW΂0Lm&%C). ]2rKud'tdlF<ۼy-m ܺ`SLdtCF7dtCF?ͻ*>\F;<SyaCʋ?\.>x!=TN^S{a ; ;w(g4G(,+Ěʊ6YѺ+LV;Y}ʛWHYa}d2L䰿wcO_CZKTĹH=ol 2Q|+eo-:_E.CЯ_SO0WWφ >vq]\jW.vqkx}Uzmm;hAvжisjhtFU;AvPTjU;AKϙfTjU;AvPTjU;AvP_vO9sp*s1G;mGDKd(1dKX"!,Dj ਟ H DQW1~Ї2$ D ~`/8g5C3?43?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?03?0L.Y-Տ5390fp3JFK|ߏ=9o;ؚ~+Z^{y//.:濪 ɅYu^Q'j6GU9 p=?WHᷮo]TpLrLoߊ9+ϛL&Vo2]jho}5>4k*}bŋܷx*M܃ߧYzkgWn*ѧ8TfӻD{̴z{MrxRK9Iۙ :Mo+m\<1ZxJk;{s? 
/ϷY޿ EC~9ȡG=ro6<咽8I35y Ro/nznUj2K-2s$SK\|S+.B+[_%xY괏ʭ{_~`KoKoG@#T z>sa$&g8 ف(Glo?`7϶.OOԒLNSf75O)`$V5dCZЪV۪p=IjhUC>0gxU]j`!U FlUWZժVuU]jUW:ZѪVuU,巅<*ZѪVujTZ5N%Q2CjP/~ _Bj@,fq,60j@;)NSkZF_^2iѴvhZ;v ~zM C{F_ݖlgK~jՋGxS٪#fz RuJS6o+QԜJd/s\'|G=SklL mr`[-iX&2\&YX?ث\p5WgHk(;p*C37f}Cv_w_M--~fp!BՉ̈́C-sNG=[Zr::%ëT%k~:èvՎ队0]Gi:6/⁋w\&jU\ua;c: w⁇g"}}8wUq0ƹ߹c~:Cδ]v6`~[;dKrj||oCnq/+}=*s f?>UG1Yǧ|Lgfr4h&G39 .Sl&G39o3*LfLªL'%ǔlP,`#=?^ v*,`*d,HF 4RhUlo6,ܬ{>={\ԯ녩nca6nrm4WVU3u`+تJTsOnhfHw>=44OߋUf6S{$|8!\t6!L8(3.5ڳmϤlziDiV)leOGi45$lMNyMg+y?:NGg+y< O't9i9rNx9g-<ͼx pGz7᱀>ـw3izڤfy9Q6t?~PܕT)̅;;H |>?OgW$ųz}IaR8iIߞin*ߏ>ls o`oV)w :0Rph(= |OU-NgVM/`(swyXu*WV|U&f[//L=wpz_x\PF^S"?(z#&}0}Ge[mz*xT;l^l yBm>]~%0G{J=g}Kʃi}H&%)/_|Yr?@;ܑǐj\x&c9&6`{ftizhNo%x_<7({`M~[΋+E.M`=0dQY#?~jc{W&Ov!Rݑ_p*î +UJuQ] ,+MyԔ!!|^n_ u'fFꬮJՕgo`ߎtaSNԧDh\F( Ç]u 2  ɀ+09OߘM%6okW/NWx*=n%-Ӓ>-Ӓ>-HYBӒ>-Ӓ>[PJD JD JD JtqQH!b & b & ytF/AXA88#G yRaAFᷦS ~m}aA+JoX}17޽D`AD`AD`AD`AD`AD`AD`AD`AD`A(<   r(lHW;ׯ"Ř .N?ؑzVr/p/p/p/p/p/pk w],r7c"dqTiHDeb-ߢ]-ߢ-ߢ-g p"e I,[k5o0VѾl~+nmG6a_ES~_ru^hznll,G>Bxkk9PȾVu+P}էқ)P XlφlrdC}6gC}6gc9>곱XlϾG5k>ξT㶔hC6hC6nUĆw,%P %P %Pc,8}U &*m;>+}WʣwwZ]l.6V %X]l(ч m"݀y,wc;$T^Jd)FVbd%FVbd%FVbd%AVd%xAK!ݽnU@> 7~by/k|rbPx/kXeWzXX}'/ςuبsh]:l8 xW?T A6.^Uhс(  7,pUYX7޷t\܁g w/oooooooooor^¡lz@hj.٘ :ߠ :ߠM+ՠ+Ɠ7|7|7|7],vLw]32^+oŪPI{Ӓ<^/iSj9o9oWγD> ؑJF%0YTg}A{sOy%$ܞ+!ϴԺbr]%[>i+[Vv j ,.VYǒiWwJϥ<)!!!ʡ%7{Rq 88RW*PJ:TCU?yJZ:CZ6ӑ.iؕ܍rv +IK+'9<4XcKR!R!Jx*萊萊萊萊萊~ |x-萍萍萍萊'V&~)3ck X o`SCwzwzwzwzwz ZI##[DrZ: NzTTp=1M8:|:|:|y* .?x#}.\hJc}m/ f9O(h kKF_B`֪*f YB`֪*f#K9Wy7\Ʋ3'0~kC >tp믾vBcq 7r\uϭ@( _rгKҘ+߂+oDbM.!ȃ˯Tؘ Sac*lL17wl\ؼ.:, LK:زrZ;\[hڕA5;|90/]V~uwTtJ&ץaxdAGM~&euy̙i52fa ,`` V#Ոo`oM=Vo񮷽xPǃ=˹h$ asDdux<|fV}>O(<]&0QT^wyPƼC̎]Gl{S58e q:8S=jR&y]E{FFX}TX#t;c`ͫh/`ggDZ7^1h|5q*)*P (T@⯮xּI`q ,N^x }1GjqwRxLq!EH $j* (c@ й>R{j)}ҷޮ{ҷޮ{)x'x|z`^qUޮ\j_iټ `ܪJ2Jv }'xIo؄߾ pON܁l.3b^Mx{}bSu M\\\s0?[(OGS}<]HW\t9`(3Ԇ)<7,98K96bm)Sl.ֻo)6bsk̤\wOXbsyL7`'*G{ k eNk|\lHG~C~W;|8E 2ST.V;mO+텗>__`|}ΌZm^y ~5k~UP"\eHo|=XͰF-!u0Z-N/6a4g:60‡O x 1փ7R4) 14j FfgeSeF݉S# 
qL6}t4z#{zwٽ9PnudQ[#> .í>+Djr.M%(GIQW3jS[*T֧Jw?4eFVm}t:{]9|/bTw[<'Wf(}2I<ⷳ~sTNzVk$0I;L+H$\l5};yp&i3 [Sfeob?2α>;>xe&vcl<,C9X;~R:U&`L#~-`9\>y_Ce̾(xUe:xxpnn0c\9|z6չLӭk~]ttz7͑__>|<>GI}r/w}[h9R! Zև.>t˭]>t=˭]!X,t<êsiO龀[o];ZK!o`q`=7]vCFܑV!ŎPoQi(Pa-zȯ{[ܚŭ:tnց%,<'ʷqR|:ϋg7خe @x}c#}wQbc8sqGpGpns!xXP䮯@ 䮯@ =닏닏 f=y)")"i}q[Qp{(u V}b f` f~Ǎ^c,cɴL;L;aqN,NNX,:,:ᤜ` ` sE'XtE'XtI9,:aN0sN0sN0sN0sN0sN0sN0sN0sN0sN0s˜V{sN0sN0sN8 '9aNX p@N0CN0CN0CN0CN0CN0CN0d~pׇw}pb^ņ]ofp p ϩk [oaɷDkK_Z[ZZZ[f\/,_ݏ t^[Z[WE<^-P-PRهk@_ ^*[*[*[*[*[X`/,hmhmhmhm֥zn.hm(ka@M Դ[xj@M Դ@M Դ@M Դ@M Դv0iޮ]vqΘ޽3kK}8jWLVy_L֗𶾄ws}nCw[ۻt7]u|n/m:>es[\N}sa !@ߋz6j 'OߦhW1Űm? .&;-Zg ᭮ͫ0/F9xE|4Ӱ6" 4_4 &i01 &UtwWC}Q,mw\M&pŚXy8b/d`bx}!ue6c n`uX[XBp SųƳj4 mXqikc01 ~Lj p%<: ]Z4HgKt*n+q[_JWtܟ: wP嶾(q$n;!<LCpm;bLƒlY+B(Z#N!Z_08apCU08! ߃C;84J8!1pC b86m1pm8!1pC brG6m8!1pC bp:\1K\8wM۾8Ti;HAw };HAw#}$EQ_gve](I}o9 Y|$!Iy"DvyrF}gSr;>o~\ʓ>q|$IG|$Wf߭Y?Bz9ppG|$I Z/|$IG|$I *E2 iF_E2 i 5@b k9H6@ ld$ pp%l%l".@p \ .e s9&x-qF;oR./Cu ۺ~Ū&Z4spS9I3>iO~vLo󶮋U]x}>x)>yЇ󶮋k0җۺ6?ibuHy[·i;pa?8o:xGm]<#q1~gup{؞uY5FwnKkEkEk #a6І3330BFhYèl aT6ʆQ0*;x0җʬ0FbHl8 #սq} m[#a$;w'o]gĦ{EVl$~. lPM5h"cw 1;pު[^x1~k^cw A1F42͝cw 1;|ǀ`1.M<*3~1>2WMI1):&Xup'F9qϚ,ăOWHe͢v-?Jɍ2޷L?\Fƀ1`d kpk;.;9b.6ṫbH/l?`_eX L؅ѻR9OC2&cb]x̼`{}[^<*}O'>\Ā11`L;Z /7ZGѧt0?C3)|m6ykҞ*f:l֗MaW 0ź,:f pYS)j eY lgO:󖹆փS`&愻Uf τLτL̈́LĴvS_ _l%H4!H4!BSv^wB>gʝp00>~~ރ }Ϳ7~>7>cl`[J?q7Chs<<>nK a7I*E!nkn+l^g.^j @K1.-QD"nk5jm_lC~yJkX语!T^TC!bk5!Vk5j aj ۶tZVk=3mޖVMvgQm ᶆpۃ r{BF:~1St\˨\\\yӮUT]|wLQŇ[|Ň5Em1ߖAE[$E[!>{p2܋7)C\!Wk=#3 ` B̬!fֶl6k6k6k3k 0?k[wg5dLZC,!Kk5bim?k5g Y` ` ` 06k6 B쭽YX q8\C8, 1\CL!&kɵɵۻ:Lldd 5њhMAYScfޕ]ny[7pkL1{Eo5J)/N1pS b MC!zi4Do7 ћMC!zi4Do7 ћMC!zi4Dl"6o|Cd!Ei4DH"$ IC!B!i4DH"$ IC!B!i 4sDyz***ѼNM(Gs)^'MFShEZpzp,EXgp®WzUYu B a}>C(!|g:l/po4nCFg3̡w(= at0:C!Fg/t|ozn)c}p6s/ B#lf[3UvfO3}=oE1YQˆ(Z6ԅV T{ZYh:@+CMǨnnznv@h:c j$5ت9SUJ O`' l 0[-M lVSM&Ql{"C/;Ի-h)@KZ RC`E(*y[ak(t]Y8rFՆH ,RfNѿE] PW`. 
H<"@] P 1FuL:'1}POCokbKnm0W:#^]X3`|EL#> 2sSKkw﻽.U]iMJzϐCe⽘ɿR~$3GJLJ{׽ٮ{u]z۴mڮt].^)^); 7=̻rnl Xm mo׀.ջ׃Muhfϐ.Cֻ Y2dː=x3Ө.ֻڋ.y1{<0L v+ЎQD!\:a Nwp_3ʴwwE,<#4!5){0 ^zQf?^U95W2H^<N[s [}?Jb5:R @ځVP*`$̀A ؔ:ppS"o: >` w݀y7` E=݀y7txo` L5LPZ 7tBt!PXPX QErC>0_,앿4}(dC!>8 _umC D5߾XRC`Ρ945CSshjM͡/B z=X.Ǫv xpʟ3?k]u(~F:pO+-_mUC*,#-ԓ*!?Xp`Ul%kZ:eX-V Ղa`X-V Ղa l тq`-G тA l`-D тAh^`]`gY`gy gY0| \0|5|.6}O͂ yf ƀ]Re:Ԑ`윞L]&.u,f}w}w}w}w}w}w˂ٰ`6Q ӆalX!VȆal̋yPG(·ko17,[8ԧSWx2ذ66 kcذ66yubmذ66#o`չ kc7 x^ioJKJJLfq-7 WPe3KLf9a5d$k&Yρ਼p&3)AL:8:3(AL:v:0 *hPA \ׁA hPG,0 𬮐RvR̸XegPu``ȿQV[ν秺ܠ5(aԠoPP`hs^#?1\ԮV)RSz^G/c޾ YPr7>Bt"'e<~hc\Xᢍϼ/j@t>8Nr!O- &8eZ5z=N&HAR3a>C0嗰 :6}rHmÜw}_9]Asqh;E}J ߲LͿXX(< kZ;8?*gkV,`A :Xt0c`>^ܑ8񎵎|; jHA|WsBonŒ|YӴv̝8ˑʛ[QVm+>>pl& WHJz:})=EC8AB®+r9 $t'E®fBG=t`WpGχP_׀5zlG>QGuzpGχpܯqDX#W׸ "QupY"?%QDN_ y (8t%g}Q_VyNQ0]AexF́&h耕7-P[m#KinhYrZdFt DdQ2('{-q11?%ch7jt5dJ?.OgmLGw_>Qt|8"c؋;>~4c:KF9Β\wbbcq&\ ŗ) 坒P2I41I}4qJhPg4qFgC=a$,h"&h" ݥ_EQ4EQ4EQ @QYD@ D@ D@ D@ D@'DHXD@ D |qȡȡFY:Nx!4B!4B!4B!4BF|!opB!4B!4B!4B!4B!4Xu(rh rp4`WB!4@x^iW̡elr-aJB@}=QGCqX8PV(+y(^妟^.7? qt .aW\kp 'At~.{6Vӥp0z9] !z޽H˥paatljK`Wt)|8] t)?Ta<)t{Kþ y-ƛxt)\RXhC̡r)5.x+8Zq+ ъ9sNʌQ[`P0V(+71c=XƐ1d` 7p1`T p+Fne :<'xb'xqE.!wA /pҟ 1.!wA b]CxZ?`xZgt ֊QIoA y b[CނxZжn!oA X+1-%U3>|~v3 -Q,\gZ ` `j}0g-rON6~Vˊ)`:BҞ">k1P|y.8-,nԙ?7.d^o/z2aL)fʄb)byXj$ܡ9PiKN- #h06k.P)h 0&Vg " "ڍ>G\¦=ۏT3GVrsgxy^cJtTr\0&̵y)< (|v[VF]V)0&L57^0故"NMv)'֔kʉ5ĚrbM9p2M)'֔JGݘ1:rki9XwN;畝,&ebNM' ssܜ#]mwb;XNk5IYhrM_6S*' \x^sv~4~Wݧ|1o-~7~k>.hI0XnzfQy%dzU6qoL.>۴[ʻT:Xu3U'+7n{Z~8T )`Tg*7׆O ]>1t}bPq 8`0ƁQ~^*tӳU}sģFw8g/ֳ 2*P+]lF֛ʳ4]R' 6C:Ahta<+46/oo1,ZG륝{M;5tkd9߿D-v. 
;'%QKB$tՂaKBH{VnuM3`+WK!vF[Pt3K&Fnnnnnnnnnn9[7Co.U%KBuª$tCB7$tCB7$tCB7$tCB7orsq5t%ȱܐ uyxkznH膄nH膄nH膄n8_} bCϙV._zR 5P 5PӍƍ}շlo\[?ލgCgN:0Sa kPX֠U5եskx_} <+Ԡ%5(AI JjPRUʓiEskƠe4(A jhPCoc_}LΠt3(A JgP:ۊzns~cTϠz3(A JgP:Πt3(A JgP:Ψt顇9tW_+`F |0PK ͠nu7C%Wre+\yJ&g:u+W64]kҵ_ڤpm&`W`W܃]t`W`W`W`W`W`W`W`W`W4W4W4W4W4W4iI&9qȉcбA8!-iqDR9B0!M>I]pLG8#t9f?#±}ؾsȉC6 l8N-;"ʱrl9t98[sŏc̱EuǶ\m.6ck˱rlm9[[-)֓nwl79M:2\)(GwT$?*#c2b2-ʫ˅B2C, \B,]l((DA! dL&0 V JU쁳YY` NJw6]{q?<_@`?㷐162|M1HU`2 1xn Ol`=w}gW=mvwzB6Uy)H8;)Nv S S><xNJfQ@;Iz7x8dwֱ-%]#C7C|:Ӂ~Bwy%'X?{.JM|pS_مJ_x*m.5.&yza}za̺ ͞YwRYw`4AWga0ɷUӶ&85l?@U=3'~[x.vyއUfGÿ%+1/l M} u3Hx.2 m1|DSm7Pݨh"VFŮޑ^9$8T8|FQ: ݋7MdC"ozXn3cOu.^Fl/NUه<]L]L](U.kuaTGV6f{Nܿ# .kʚ&.k:h:hnf V.[⹔>C)EA/owy y yPw=,y[K#UCJauPC[?yQCMA a;uNRCƦw)j谩:lj谩:_j?߫[ҭ]k}|qLևw "2z{#҅}`WU#oJn¶Wk:2"@)C8 Qѡ8[ &сz ϡ5d37M6|b}g7RU~Tl)u#*Շք{8j8jTQ6oq s)Ըkꛌ~M+uo`u5;X莊1{؀Fww|ȈbKt"yU6 okha<F_N4ܷ^yI~{o'%1xx,v;oܚ-c~q7&iG.*džVws߇6/ޖ4_{W;=$Xֻ8& {X1vM`m$|m]^xVhlV/<-Lʜtq.OEi#BRY 9+ǔBN)< "OERpJ pJ l{.spO'TJB$ `T﫯f zS7z3/~.hKP9G'DmB&Dm^Qz5z ل͊z2K&kV||~ ք`Mք`M֤`Mք`Mքgv^J狣p NY YAw,LDŽpLDŽpLDŽpLDŽpEE,½U~CUcS"2!""2TdȄLȄLȄ+"e;' &ChM (V" yUz;돍@&k˯\x2We*CE=%_"!N2hA+,yZŻKˠeВ#e`ꅫ`.x)K %҂gɽ/^/V9KU*giKŴ+fq^8/x< KrfUzVe2b~^*G,̂[f-Y~rŬR) + faY,~V- Uªei/Fb{/NaYaY{p ˂,ŠdU<]^"YXd{(FdU zZ+&dAL_~疘+&ԧ%& +5jE"YX,HV^~]y0cV$ +ɂ(-҂(-HV$RcnS;|QF~c&+)Q$jDY.%Vjf5jK\v[-R-UR-Uڤoy6)lK\v[[-7ܖmᶥ?[?[덭zcKifCi6fCi6fCi6fCi6ec,P EXclȆlȆlȆCWi^~a 6ˊ em,+6ˊ %` ` ` ` XVl,+6EK6ˊ %P %P}`eņl(l(l(J2;dcYˊeo,+6ˊeo0ol.wRx#y@LmjIVmEVm#hEyd>{aq%}0A {}0A `؀6hA+ Za V®Vsk#۠0A7 a ntð0 9ӍLt#ֻQMSa ntà0A7 a $ӥ ,'^ij;y4<]-TE.w9\Q5.<E(Bx ``찟vNva;lcm8lcmcp 1+OozI1zt=r6~>V=v軉:W\1w:)1 m^A6:wqn@cp#Xr#<<<c+20V]IWֲB6XhZ6Gޗ{`eLEށ{P6e2st0at:uvq:w,[)`&S`2I8Y8MQ״^<܅6y״^Ufc#`lIyz_CV 2?qR+OVKd'^C]voX&G?{޺^i[W[뉹mAZiU ê. 
zU<=hW2MnZm_wiTMNS_z@|4VV́JzD HpGA<䛋G6imcw/s)ȭ;Lr ܺC':Fsj⪋ߚ; 8?:_S}tP;kS/@ɗ[(.@kлɡy1޻aŻ>xgF~+Z-WwG?6ߢ}\lS[kȃ\]ڪ=c][-sn49ȳ$Vߊ[9>i"VD:o Nd+uު_UѫMLML w5Hvi7vi7Nl5j8pbVÉ[ 'n?pl^x3ausn|`+n n n nh DݴqE8uua m8pVI4B/U"o o o Ikqn7~7>Nu]ﮫէE @ @M;%#Hu!]]Qu>p`<7{[V݊{rxA4q^3nCie =.Ӽ.uQ=*R7액躋{i^;׎F zmB^MZń]+uſֵpYg Z[wu.F0v1HWTk``g ZYg~g=k|`֮胧ڤY;_fKF]wciv ]g.V ɰՃ}?֬mk +mb] XMLFILE=$1 benchmarks=$2 COMPILER=$3 # choose gdate on OS X if command -v "gdate" >/dev/null; then DATE=gdate else DATE=date fi #=================# # Plateform infos # #=================# COMPILERVERSION=$($COMPILER --version 2>&1 | head -1) if command -v "lscpu" >/dev/null; then CPUFREQ=$(lscpu | grep "MHz" | rev | cut -f1 -d' ' | rev) else CPUFREQ=$((`sysctl -n hw.cpufrequency`/1000000)) fi ARCH=$(uname -m) OSNAME=$(uname -s) OSVERSION=$(uname -r) if hash lsb_release 2>/dev/null then DISTRIB=$(lsb_release -ds) else DISTRIB='Unknown distribution' fi #==========# # Prologue # #==========# if [[ -f $XMLFILE ]] then echo '----> WARNING: File '$XMLFILE' is not empty.' echo '----> Results will be added to its end.' fi #========# # Header # #========# echo '' >> $XMLFILE echo '' >> $XMLFILE #=======# # Start # #=======# echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE #============# # Benchmarks # #============# for benchmark in $benchmarks do if [[ ! -f $benchmark ]] then #File does not exist: compile it echo '[Compiling]' $benchmark COMPILESTART=$($DATE +%s%3N) COMPILELOG=$(make $benchmark 2>&1; echo 'Returned state: '$?) 
COMPILEEND=$($DATE +%s%3N) COMPILETIME=$(($COMPILEEND - $COMPILESTART)) COMPILECHECK=$(echo $COMPILELOG | grep -o '[^ ]*$') COMPILETIMERELEVANT='true' else #File does exist echo '[Already compiled]' $benchmark COMPILELOG='(Previously compiled)' COMPILETIME='0.0' COMPILECHECK='0' COMPILETIMERELEVANT='false' fi if [[ $COMPILECHECK -ne 0 ]] then #Compilation failure # EXECUTED='no' - keep it to yes so that Jenkins # uses it within its results EXECUTED='yes' PASSED='no' STATE='0' EXECUTIONLOG='(Not executed)' EXECUTIONTIME='0.0' PERFORMANCEFLOPS='0.0' COMPILETIMERELEVANT='false' EXECUTIONTIMERELEVANT='false' PERFORMANCEFLOPSRELEVANT='false' ERRORLOG='Does not compile.' echo '-> Does not compile.' else #Compilation success echo '[Executing]' $benchmark EXECUTED='yes' EXECUTIONLOG=$(./$benchmark 2>&1) if [[ ${EXECUTIONLOG} != "Time:"* ]] then #Execution failure PASSED='no' STATE='0' EXECUTIONTIME='0.0' PERFORMANCEFLOPS='0.0' EXECUTIONTIMERELEVANT='false' PERFORMANCEFLOPSRELEVANT='false' ERRORLOG='Unexpected output.' echo '-> Unexpected output.' 
else #Execution success PASSED='yes' STATE='100' EXECUTIONTIME=$(echo $EXECUTIONLOG | cut -d' ' -f2) PERFORMANCEFLOPS=$(echo $EXECUTIONLOG | cut -d' ' -f4) EXECUTIONTIMERELEVANT='true' if [[ ${PERFORMANCEFLOPS} != "Irrelevant" ]] then PERFORMANCEFLOPSRELEVANT='true' else PERFORMANCEFLOPSRELEVANT='false' PERFORMANCEFLOPS='0.0' fi ERRORLOG='' fi fi echo '' >> $XMLFILE echo 'BENCHMARK' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE # Logs echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE # Times echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE done #========# # Footer # #========# echo '' >> $XMLFILE #==========# # Epilogue # #==========# echo 'Results correctly exported to' $XMLFILE fflas-ffpack-2.2.2/configure.ac000066400000000000000000000243351274716147400163520ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ AC_PREREQ([2.61]) AC_INIT([FFLAS-FFPACK], [2.2.2],[ffpack-devel@googlegroups.com],[fflas-ffpack], [https://github.com/linbox-team/fflas-ffpack]) AC_CONFIG_MACRO_DIR([macros]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_HEADERS([config.h]) AC_CANONICAL_TARGET AM_INIT_AUTOMAKE([1.8 gnu no-dependencies -Wall -Wno-portability foreign]) AX_PREFIX_CONFIG_H(fflas-ffpack/config.h, __FFLASFFPACK) AC_PATH_PROG(RM, rm, $FALSE) RM="$RM -f" AM_MAINTAINER_MODE AM_DEP_TRACK AM_OUTPUT_DEPENDENCY_COMMANDS AM_ACLOCAL_INCLUDE(macros) # work around to fix the backward compatibility issue of automake 1.10 with 1.9 (pb with MKDIR_P) AC_SUBST([MKDIR_P]) AC_LANG([C++]) echo "-----------------------------------------------" AC_DEBUG AC_PROFILE AC_WARNINGS echo "-----------------------------------------------" # CFLAGS=${CFLAGS:-$DEFAULT_CFLAGS} # CXXFLAGS=${CXXFLAGS:-$DEFAULT_CXXFLAGS} ###################################################### # Try and pass different flags according to compiler # ###################################################### # disable default -g -O2 CXXFLAGS : ${CXXFLAGS=""} #set CXX AC_PROG_CXX AC_COMPILER_NAME # We need a C++11 compiler now - AB 2014-12-12 AX_CXX_COMPILE_STDCXX_11([],[mandatory]) AC_SUBST([DEFAULT_CFLAGS]) AC_SUBST([DEBUG_CFLAGS]) AC_SUBST([TESTS_CFLAGS]) TESTS_CFLAGS="-O2" DEBUG_CFLAGS="-g" DEFAULT_CFLAGS="" WARN_CFLAGS="-Wall" #TODO use -fast for icc, -ipa for eko... 
if test "x$DBG" = "xyes" ; then DEFAULT_CFLAGS="-O0 ${DEFAULT_CFLAGS}" #those are CXXFLAGS DEBUG_CFLAGS="${DEBUG_CFLAGS} -DDEBUG -DFFLASFFPACK_DEBUG" else DEFAULT_CFLAGS="-O2 ${DEFAULT_CFLAGS}" DEBUG_CFLAGS="-DNDEBUG -UFFLASFFPACK_DEBUG" fi if test "x$PROF" = "xyes" ; then DEFAULT_CFLAGS="${DEFAULT_CFLAGS} -pg" fi if test "x$WARN" = "xyes" -o "x$WARN" = "xfull" ; then case x${CCNAM} in xicc) WARN_CFLAGS="${WARN_CFLAGS} -Wcheck" WARN_CFLAGS="${WARN_CFLAGS} -Wall -Wno-unused-parameter -Wuninitialized -Wconversion -Wcast-qual -pedantic -Wshadow -Wpointer-arith -Wwrite-strings -Wno-long-long" WARN_CFLAGS="${WARN_CFLAGS} -Wextra -ansi" ;; xeko) WARN_CFLAGS="${WARN_CFLAGS} -Wno-unused-parameter" ;; xgcc|xgcc44) WARN_CFLAGS="${WARN_CFLAGS} -Wextra -Wno-unused-parameter" if test "x${WARN}" = "xfull" ; then WARN_CFLAGS="${WARN_CFLAGS} -Wuninitialized -Wconversion -Wcast-qual -pedantic -Wshadow -Wpointer-arith -Wcast-align -Wwrite-strings -Wno-long-long -Wno-variadic-macros -Wno-vla" fi if test "x${HAVE_CXX11}" = "x0" ; then WARN_CFLAGS="${WARN_CFLAGS} -ansi" fi ;; xgcc48) WARN_CFLAGS="${WARN_CFLAGS} -Wextra -Wno-unused-parameter" if test "x${WARN}" = "xfull" ; then WARN_CFLAGS="${WARN_CFLAGS} -Wuninitialized -Wconversion -Wcast-qual -pedantic -Wshadow -Wpointer-arith -Wcast-align -Wwrite-strings -Wno-long-long -Wno-variadic-macros -Wno-vla" # WARN_CFLAGS="${WARN_CFLAGS} -fsanitize=address" fi if test "x${HAVE_CXX11}" = "x0" ; then WARN_CFLAGS="${WARN_CFLAGS} -ansi" fi ;; xclang) WARN_CFLAGS="${WARN_CFLAGS} -Wextra -Wno-unused-parameter" if test "x${WARN}" = "xfull" ; then WARN_CFLAGS="${WARN_CFLAGS} -Wuninitialized -Wconversion -Wcast-qual -pedantic -Wshadow -Wpointer-arith -Wcast-align -Wwrite-strings -Wno-long-long -Wno-vla-extension -Wno-variadic-macros" WARN_CFLAGS="${WARN_CFLAGS} -D__STRICT_ANSI__" fi ;; xclang31) WARN_CFLAGS="${WARN_CFLAGS} -Wextra -Wno-unused-parameter" if test "x${WARN}" = "xfull" ; then WARN_CFLAGS="${WARN_CFLAGS} -Wuninitialized -Wconversion 
-Wcast-qual -pedantic -Wshadow -Wpointer-arith -Wcast-align -Wwrite-strings -Wno-long-long -g -Wno-vla-extension -Wno-variadic-macros" WARN_CFLAGS="${WARN_CFLAGS} -D__STRICT_ANSI__" # WARN_CFLAGS="${WARN_CFLAGS} -fsanitize=address" fi ;; *) echo echo "*******************************************************" echo "unsupported compiler ($CCNAM). Please file a bug." echo "*******************************************************" echo WARN_CFLAGS="${WARN_CFLAGS}" esac fi DEFAULT_CFLAGS="${DEFAULT_CFLAGS} ${WARN_CFLAGS} ${DEBUG_CFLAGS}" TESTS_CFLAGS="${TESTS_CFLAGS} ${WARN_CFLAGS} ${DEBUG_CFLAGS}" AC_HEADER_STDC AC_PROG_LIBTOOL AC_PROG_EGREP AC_PROG_SED # newer libtool... LT_PREREQ([2.2]) LT_INIT echo "-----------------------------------------------" echo " START FFLAS-FFPACK CONFIG " echo "-----------------------------------------------" FF_CHECK_OMP # checks which SIMD instructions are available and defines HAVE_{SSE_4_1,AVX,AVX2}_INSTRUCTIONS and compiler flags CUSTOM_SIMD="no" FF_CHECK_SIMD #FF_CHECK_SSE #FF_CHECK_AVX arch=`echo $target | cut -d"-" -f1` if [[ "x$CUSTOM_SIMD" = "xno" ]] ; then AX_CHECK_X86_FEATURES([][]) else CXXFLAGS="${CXXFLAGS} ${SSEFLAGS} ${AVXFLAGS}" fi dnl echo "CCNAM = $CCNAM $CUSTOM_SIMD" dnl With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot dnl have overloads for both types without linking error. AS_IF([test "x$CCNAM" = "xgcc48"],[CXXFLAGS="${CXXFLAGS} -fabi-version=6"],[]) PARFLAGS="${OMPFLAGS}" PARLIBS="${OMPFLAGS}" AC_SUBST(PARFLAGS) AC_SUBST(PARLIBS) echo "-----------------------------------------------" # Machine characteristics AC_CHECK_SIZEOF(char, 8) AC_CHECK_SIZEOF(short, 16) AC_CHECK_SIZEOF(int, 32) AC_CHECK_SIZEOF(long, 32) AC_CHECK_SIZEOF(long long, 64) AC_CHECK_SIZEOF(__int64, 64) # Checks for header files. 
AC_HEADER_STDC AC_CHECK_HEADERS([float.h limits.h stddef.h stdlib.h string.h sys/time.h stdint.h pthread.h]) # check endianness of the architecture AC_C_BIGENDIAN( [AC_DEFINE(HAVE_BIG_ENDIAN, 1, [Define that architecture uses big endian storage])], [AC_DEFINE(HAVE_LITTLE_ENDIAN, 1, [Define that architecture uses little endian storage])], []) # Create some useful data types of fixed, known lengths # AC_DEFINE_UNQUOTED(INT8, $LINBOX_INT8, Canonical 8-bit data type) # AC_DEFINE_UNQUOTED(INT16, $LINBOX_INT16, Canonical 16-bit data type) # AC_DEFINE_UNQUOTED(INT32, $LINBOX_INT32, Canonical 32-bit data type) # AC_DEFINE_UNQUOTED(INT64, $LINBOX_INT64, Canonical 64-bit data type) echo "-----------------------------------------------" # Feature checks FF_MISC AC_LANG_CPLUSPLUS echo "-----------------------------------------------" # Getting GMP from Givaro - AB 2014-12-12 #FF_CHECK_GMP PKG_CHECK_MODULES([GIVARO],[givaro]) dnl FF_CHECK_GIVARO(,,[ dnl echo '*******************************************************************************' dnl echo ' WARNING: GIVARO not found!' dnl echo dnl echo ' GIVARO library is required for some tests in this library.' dnl echo ' Please make sure GIVARO is installed and specify its location with the' dnl echo ' option --with-givaro= when running configure.' dnl echo ' Do not forget to set/export LD_LIBRARY_PATH if necessary.' 
dnl echo '*******************************************************************************' dnl exit 1 dnl ]) BLAS_FOUND=false FF_CHECK_BLAS_CFLAGS FF_CHECK_BLAS_LIBS FF_CHECK_MKL FF_CHECK_USER_BLAS FF_CHECK_USER_LAPACK # FF_CHECK_BLAS # FF_CHECK_GOTOBLAS # FF_CHECK_GSL # if test "$BLAS_FOUND" = "false" ; then # FF_CHECK_CBLAS # fi # if test "$BLAS_FOUND" = "false" ; then # FF_CHECK_OTHERBLAS # fi # FF_CHECK_LAPACK # if test "$BLAS_FOUND" = "false" ; then # FF_CHECK_BLAS2 # fi if test "$BLAS_FOUND" = "false" ; then echo '' echo '*******************************************************************************' echo ' ERROR: BLAS not found!' echo echo ' BLAS routines are required for this library to compile. Please' echo ' make sure BLAS are installed and specify its location with the option' echo ' --with-blas-libs= and if necessary --with-blas-cflags=' echo ' when running configure.' echo '*******************************************************************************' exit 1 fi # BLAS_LIBS="${BLAS_LIBS}" # BLAS_LIBS="-L/${BLAS_PATH} ${LAPACK_LIBS} ${BLAS_LIBS}" # AC_SUBST(BLAS_LIBS) # FF_CHECK_CUDA # AM_CONDITIONAL(FFLASFFPACK_HAVE_BLAS, test "x$BLAS_FOUND" != "xfalse") # FF_BENCH FF_DOC # if test ! 
-d ./benchmarks/data ; then # echo "Creating data dir in benchmark" ; # mkdir ./benchmarks/data ; # fi CXXFLAGS="${CXXFLAGS}" AC_SUBST(CXXFLAGS) FF_PRECOMPILE echo "-----------------------------------------------" echo " END FFLAS-FFPACK CONFIG " echo "-----------------------------------------------" AC_CONFIG_FILES([ Makefile macros/Makefile macros/CodeChunk/Makefile fflas-ffpack-config fflas-ffpack/Makefile fflas-ffpack/fflas/Makefile fflas-ffpack/fflas/fflas_fgemm/Makefile fflas-ffpack/fflas/fflas_sparse/Makefile fflas-ffpack/fflas/fflas_sparse/coo/Makefile fflas-ffpack/fflas/fflas_sparse/csr/Makefile fflas-ffpack/fflas/fflas_sparse/ell/Makefile fflas-ffpack/fflas/fflas_sparse/ell_simd/Makefile fflas-ffpack/fflas/fflas_sparse/csr_hyb/Makefile fflas-ffpack/fflas/fflas_sparse/sell/Makefile fflas-ffpack/fflas/fflas_sparse/hyb_zo/Makefile fflas-ffpack/fflas/fflas_igemm/Makefile fflas-ffpack/fflas/fflas_simd/Makefile fflas-ffpack/ffpack/Makefile fflas-ffpack/field/Makefile fflas-ffpack/utils/Makefile fflas-ffpack/paladin/Makefile fflas-ffpack/interfaces/Makefile fflas-ffpack/interfaces/libs/Makefile fflas-ffpack/checkers/Makefile doc/Makefile tests/Makefile tests/data/Makefile benchmarks/Makefile examples/Makefile optimiser/Makefile fflas-ffpack.pc ]) AC_OUTPUT echo "-----------------------------------------------" FF_OPT fflas-ffpack-2.2.2/doc/000077500000000000000000000000001274716147400146225ustar00rootroot00000000000000fflas-ffpack-2.2.2/doc/Doxyfile000066400000000000000000002276641274716147400163510ustar00rootroot00000000000000# Doxyfile 1.8.0 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" "). 
#--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or sequence of words) that should # identify the project. Note that if you do not use Doxywizard you need # to put quotes around the project name if it contains spaces. PROJECT_NAME = FFLAS-FFPACK # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer # a quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify an logo or icon that is # included in the documentation. The maximum height of the logo should not # exceed 55 pixels and the maximum width should not exceed 200 pixels. # Doxygen will copy the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = . 
# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, # Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, # Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. 
Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = YES # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = NO # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. 
# If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) JAVADOC_AUTOBRIEF = YES # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = YES # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. 
SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that acts # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = "bib=\xrefitem bib \"Bibliography\" \"Bibliography\"" # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding # "class=itcl::class" will allow you to use the command class in the # itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. 
With this tag you can assign which parser to use for a given extension. # Doxygen has a built-in mapping, but you can override or extend it using this # tag. The format is ext=language, where ext is a file extension, and language # is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, # C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make # doxygen treat .inc files as Fortran files (default is PHP), and .f files as C # (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions # you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. EXTENSION_MAPPING = # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all # comments according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you # can mix doxygen, HTML, and XML commands with Markdown formatting. # Disable only in case of backward compatibility issues. MARKDOWN_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match function declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); vs. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. 
SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate getter # and setter methods for a property. Setting this option to YES (the default) # will make doxygen replace the get and set methods by a property in the # documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = YES # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and # unions are shown inside the group in which they are included (e.g. using # @ingroup) instead of on a separate page (for HTML and Man pages) or # section (for LaTeX and RTF). INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and # unions with only public data fields will be shown inline in the documentation # of the scope in which they are defined (i.e. file, namespace, or group # documentation), provided this scope is documented. If set to NO (the default), # structs, classes, and unions are shown on a separate page (for HTML and Man # pages) or section (for LaTeX and RTF). 
INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = NO # The SYMBOL_CACHE_SIZE determines the size of the internal cache use to # determine which symbols to keep in memory and which to flush to disk. # When the cache is full, less often used symbols will be written to disk. # For small to medium size projects (<1000 input files) the default value is # probably good enough. For larger projects a too small cache size can cause # doxygen to be busy swapping symbols to and from disk most of the time # causing a significant performance penalty. # If the system has enough physical memory increasing the cache will improve the # performance by keeping more symbols in memory. Note that the value works on # a logarithmic scale so increasing the size by one will roughly double the # memory usage. The cache size is given by this formula: # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols. SYMBOL_CACHE_SIZE = 0 # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given # their name and scope. Since this can be an expensive process and often the # same symbol appear multiple times in the code, doxygen keeps a cache of # pre-resolved symbols. If the cache is too small doxygen will become slower. # If the cache is too large, memory is wasted. 
The cache size is given by this # formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES all members with package or internal scope will be included in the documentation. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespaces are hidden. 
EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = YES # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = YES # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. 
If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = YES # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen # will list include files with double quotes in the documentation # rather than with sharp brackets. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = NO # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen # will sort the (brief and detailed) documentation of class members so that # constructors and destructors are listed first. If set to NO (the default) # the constructors will appear in the respective orders defined by # SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. # This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO # and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. 
If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to # do proper type resolution of all parameters of a function it will reject a # match between the prototype and the implementation of a member function even # if there is only one candidate or it is obvious which candidate to choose # by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen # will still accept a match between prototype and implementation in such cases. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if sectionname ... \endif. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or macro consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. 
# The appearance of the initializer of individual variables and macros in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # If the sources in your project are distributed over multiple directories # then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy # in the documentation. The default is NO. SHOW_DIRECTORIES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command <command> <input-file>, where <command> is the value of # the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. 
# You can optionally specify a file name after the option, if omitted # DoxygenLayout.xml will be used as the name of the layout file. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files # containing the references data. This must be a list of .bib files. The # .bib extension is automatically appended if omitted. Using this command # requires the bibtex tool to be installed. See also # http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style # of the bibliography can be controlled using LATEX_BIB_STYLE. To use this # feature you need bibtex and perl available in the search path. CITE_BIB_FILES = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # The WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. 
If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. WARN_LOGFILE = doxy.debug #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces. INPUT = .. \ ../fflas-ffpack \ ../doc \ ../tests \ ../benchmark # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. 
If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh # *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py # *.f90 *.f *.for *.vhd *.vhdl FILE_PATTERNS = *.cc \ *.cpp \ *.C \ *.h \ *.inl \ *.doxy # The RECURSIVE tag can be used to specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = CVS \ *_src.inl \ Attic # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = */CVS/* \ *_src.inl \ */Attic/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = ..
# If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = *.C \ *.inl \ *INSTALL \ *COPYING \ *HACKING \ *AUTHORS \ *.html # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. EXAMPLE_RECURSIVE = YES # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain images that are included in the documentation (see # the \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command <filter> <input-file>, where <filter> # is the value of the INPUT_FILTER tag, and <input-file> is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. # If FILTER_PATTERNS is specified, this tag will be # ignored. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. # Doxygen will compare the file name with each pattern and apply the # filter if there is a match. # The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty or if # none of the patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES).
FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERNS (if any) # and it is also possible to disable source filtering for a specific pattern # using *.ext= (so without naming a filter). This option only has effect when # FILTER_SOURCE_FILES is enabled. FILTER_SOURCE_PATTERNS = #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C and C++ comments will always remain visible. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. # Otherwise they will link to the documentation.
REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = NO #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = NO # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. HTML_OUTPUT = fflas-ffpack-html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. Note that when using a custom header you are responsible # for the proper inclusion of any scripts and style sheets that doxygen # needs, which is dependent on the configuration options used. # It is advised to generate a default header using "doxygen -w html # header.html footer.html stylesheet.css YourConfigFile" and then modify # that header. Note that the header is subject to change so you typically # have to redo this when upgrading to a newer version of doxygen or when # changing the value of configuration settings such as GENERATE_TREEVIEW! HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If the tag is left blank doxygen # will generate a default style sheet. Note that doxygen will try to copy # the style sheet file to the HTML output directory, so don't put your own # style sheet in the HTML output directory as well, or it will be erased! HTML_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. 
Use the # $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that # the files will be copied as-is; there are no commands or markers available. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. # Doxygen will adjust the colors in the style sheet and background images # according to this color. Hue is specified as an angle on a colorwheel, # see http://en.wikipedia.org/wiki/Hue for more information. # For instance the value 0 represents red, 60 is yellow, 120 is green, # 180 is cyan, 240 is blue, 300 purple, and 360 is red again. # The allowed range is 0 to 359. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of # the colors in the HTML output. For a value of 0 the output will use # grayscales only. A value of 255 will produce the most vivid colors. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to # the luminance component of the colors in the HTML output. Values below # 100 gradually make the output lighter, whereas values above 100 make # the output darker. The value divided by 100 is the actual gamma applied, # so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, # and 100 does not change the gamma. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting # this to NO can help when comparing the output of multiple runs. HTML_TIMESTAMP = YES # If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, # files or namespaces will be aligned in HTML using tables. If set to # NO a bullet list will be used. 
HTML_ALIGN_MEMBERS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. For this to work a browser that supports # JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, # Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari). HTML_DYNAMIC_SECTIONS = YES # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). # To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated # that can be used as input for Qt's qhelpgenerator to generate a # Qt Compressed Help (.qch) of the generated HTML documentation. 
GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = doc # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to # add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see # <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters"> # Qt Help Project / Custom Filters</a>. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's # filter section matches. # <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes"> # Qt Help Project / Filter Attributes</a>. QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files # will be generated, which together with the HTML files, form an Eclipse help # plugin. To install this plugin and make it available under the help contents # menu in Eclipse, the contents of the directory containing the HTML and XML # files needs to be copied into the plugins directory of eclipse. The name of # the directory within the plugins directory should be the same as # the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before # the help appears.
GENERATE_ECLIPSEHELP = NO # A unique identifier for the eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have # this name. ECLIPSE_DOC_ID = org.doxygen.Project # The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) # at top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. Since the tabs have the same information as the # navigation tree you can set this option to YES if you already set # GENERATE_TREEVIEW to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. # If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. # Since the tree basically has the same information as the tab index you # could consider setting DISABLE_INDEX to YES when enabling this option. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values # (range [0,1..20]) that doxygen will group on one line in the generated HTML # documentation. Note that a value of 0 will completely suppress the enum # values from appearing in the overview section. ENUM_VALUES_PER_LINE = 4 # By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, # and Class Hierarchy pages using a tree view instead of an ordered list. USE_INLINE_TREES = NO # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown.
TREEVIEW_WIDTH = 250 # When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open # links to external symbols imported via tag files in a separate window. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of Latex formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are # not supported properly for IE 6.0, but are supported on all modern browsers. # Note that when changing this option you need to delete any form_*.png files # in the HTML output before the changes have effect. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax # (see http://www.mathjax.org) which uses client side Javascript for the # rendering instead of using prerendered bitmaps. Use this if you do not # have LaTeX installed or if you want formulas to look prettier in the HTML # output. When enabled you may also need to install MathJax separately and # configure the path to it using the MATHJAX_RELPATH option. USE_MATHJAX = NO # When MathJax is enabled you need to specify the location relative to the # HTML output directory using the MATHJAX_RELPATH option. The destination # directory should contain the MathJax.js script. For instance, if the mathjax # directory is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to # the MathJax Content Delivery Network so you can quickly see the result without # installing MathJax. # However, it is strongly recommended to install a local # copy of MathJax from http://www.mathjax.org before deployment.
MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension # names that should be enabled during MathJax rendering. MATHJAX_EXTENSIONS = # When the SEARCHENGINE tag is enabled doxygen will generate a search box # for the HTML output. The underlying search engine uses javascript # and DHTML and should work on any modern browser. Note that when using # HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets # (GENERATE_DOCSET) there is already a search function so this one should # typically be disabled. For large projects the javascript based search engine # can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. SEARCHENGINE = NO # When the SERVER_BASED_SEARCH tag is enabled the search engine will be # implemented using a PHP enabled web server instead of at the web client # using Javascript. Doxygen will generate the search PHP script and index # file to put on the web server. The advantage of the server # based approach is that it scales better to large projects and allows # full text search. The disadvantages are that it is more difficult to setup # and does not have live searching capabilities. SERVER_BASED_SEARCH = NO #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path. LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name.
# Note that when enabling USE_PDFLATEX this option is only used for # generating bitmaps for formulas in the HTML output, but not in the # Makefile that is written to the output directory. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = NO # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, letter, legal and # executive. If left blank a4wide will be used. PAPER_TYPE = a4wide # The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = stmaryrd \ amsmath # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for # the generated latex document. The footer should contain everything after # the last chapter. If it is left blank doxygen will generate a # standard footer. Notice: only use this tag if you know what you are doing! LATEX_FOOTER = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = NO # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile.
Set this option to YES to get a # higher quality PDF documentation. USE_PDFLATEX = NO # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include # source code with syntax highlighting in the LaTeX output. # Note that which sources are shown also depends on other settings # such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See # http://en.wikipedia.org/wiki/BibTeX for more info. LATEX_BIB_STYLE = plain #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. 
The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load style sheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. 
MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. 
GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. # This is useful # if you want to understand what is going on. # On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. 
Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the include files # pointed to by INCLUDE_PATH will be searched when a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition that # overrules the definition found in the source code. 
EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all references to function-like macros # that are alone on a line, have an all uppercase name, and do not end with a # semicolon, because these will confuse the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. For each # tag file the location of the external documentation should be added. The # format of a tag file without this location is as follows: # # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths # or URLs. Note that each tag file must have a unique name (where the name does # NOT include the path). If a tag file is not located in the directory in which # doxygen is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). 
PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option also works with HAVE_DOT disabled, but it is recommended to # install and use dot, since it yields more powerful graphs. CLASS_DIAGRAMS = YES # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = NO # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will # base this on the number of processors available in the system. You can set it # explicitly to a value larger than 0 to get control over the balance # between CPU load and processing speed. DOT_NUM_THREADS = 0 # By default doxygen will use the Helvetica font for all dot files that # doxygen generates. 
When you want a differently looking font you can specify # the font name using DOT_FONTNAME. You need to make sure dot is able to find # the font, which can be done by putting it in a standard location or by setting # the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the # directory containing the font. DOT_FONTNAME = FreeSans.ttf # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. # The default size is 10pt. DOT_FONTSIZE = 10 # By default doxygen will tell dot to use the Helvetica font. # If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to # set the path where dot can find it. DOT_FONTPATH = # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. Setting this tag to YES will force the # CLASS_DIAGRAMS tag to NO. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If the UML_LOOK tag is enabled, the fields and methods are shown inside # the class node. If there are many fields or methods and many nodes the # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS # threshold limits the number of items for each type to make the size more # manageable. Set this to 0 for no limit. Note that the threshold may be # exceeded by 50% before the limit is enforced. 
UML_LIMIT_NUM_FIELDS = 10 # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT options are set to YES then # doxygen will generate a call dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command. CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. 
DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are svg, png, jpg, or gif. # If left blank png will be used. If you choose svg you need to set # HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible in IE 9+ (other browsers do not have this requirement). DOT_IMAGE_FORMAT = png # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. # Note that this requires a modern browser other than Internet Explorer. # Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you # need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible. Older versions of IE do not have SVG support. INTERACTIVE_SVG = NO # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the # \mscfile command). MSCFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. 
DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lay further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. MAX_DOT_GRAPH_DEPTH = 1000 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, because dot on Windows does not # seem to support this out of the box. Warning: Depending on the platform used, # enabling this option may lead to badly anti-aliased labels on the edges of # a graph (i.e. they become hard to read). DOT_TRANSPARENT = NO # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. DOT_CLEANUP = YES fflas-ffpack-2.2.2/doc/DoxyfileDev000066400000000000000000002277151274716147400170050ustar00rootroot00000000000000# Doxyfile 1.8.0 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] 
# For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" "). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or sequence of words) that should # identify the project. Note that if you do not use Doxywizard you need # to put quotes around the project name if it contains spaces. PROJECT_NAME = FflasFfpack # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer # a quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify a logo or icon that is # included in the documentation. The maximum height of the logo should not # exceed 55 pixels and the maximum width should not exceed 200 pixels. # Doxygen will copy the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = . 
# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, # Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, # Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. 
Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = YES # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = NO # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. 
# If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) JAVADOC_AUTOBRIEF = YES # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = YES # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. 
SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = "bib=\xrefitem bib \"Bibliography\" \"Bibliography\"" # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding # "class=itcl::class" will allow you to use the command class in the # itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. 
With this tag you can assign which parser to use for a given extension. # Doxygen has a built-in mapping, but you can override or extend it using this # tag. The format is ext=language, where ext is a file extension, and language # is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, # C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make # doxygen treat .inc files as Fortran files (default is PHP), and .f files as C # (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions # you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. EXTENSION_MAPPING = # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all # comments according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you # can mix doxygen, HTML, and XML commands with Markdown formatting. # Disable only in case of backward compatibility issues. MARKDOWN_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); vs. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. 
SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate getter # and setter methods for a property. Setting this option to YES (the default) # will make doxygen replace the get and set methods by a property in the # documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = YES # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and # unions are shown inside the group in which they are included (e.g. using # @ingroup) instead of on a separate page (for HTML and Man pages) or # section (for LaTeX and RTF). INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and # unions with only public data fields will be shown inline in the documentation # of the scope in which they are defined (i.e. file, namespace, or group # documentation), provided this scope is documented. If set to NO (the default), # structs, classes, and unions are shown on a separate page (for HTML and Man # pages) or section (for LaTeX and RTF). 
INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = NO # The SYMBOL_CACHE_SIZE determines the size of the internal cache used to # determine which symbols to keep in memory and which to flush to disk. # When the cache is full, less often used symbols will be written to disk. # For small to medium size projects (<1000 input files) the default value is # probably good enough. For larger projects a too small cache size can cause # doxygen to be busy swapping symbols to and from disk most of the time # causing a significant performance penalty. # If the system has enough physical memory increasing the cache will improve the # performance by keeping more symbols in memory. Note that the value works on # a logarithmic scale so increasing the size by one will roughly double the # memory usage. The cache size is given by this formula: # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols. SYMBOL_CACHE_SIZE = 0 # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given # their name and scope. Since this can be an expensive process and often the # same symbol appears multiple times in the code, doxygen keeps a cache of # pre-resolved symbols. If the cache is too small doxygen will become slower. # If the cache is too large, memory is wasted. 
The cache size is given by this # formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES all members with package or internal scope will be included in the documentation. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespaces are hidden. 
EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = YES # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = YES # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = YES # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. 
If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = YES # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen # will list include files with double quotes in the documentation # rather than with sharp brackets. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = NO # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen # will sort the (brief and detailed) documentation of class members so that # constructors and destructors are listed first. If set to NO (the default) # the constructors will appear in the respective orders defined by # SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. # This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO # and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. 
If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to # do proper type resolution of all parameters of a function it will reject a # match between the prototype and the implementation of a member function even # if there is only one candidate or it is obvious which candidate to choose # by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen # will still accept a match between prototype and implementation in such cases. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if sectionname ... \endif. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or macro consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. 
# The appearance of the initializer of individual variables and macros in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # If the sources in your project are distributed over multiple directories # then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy # in the documentation. The default is NO. SHOW_DIRECTORIES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command <command> <input-file>, where <command> is the value of # the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. 
# You can optionally specify a file name after the option, if omitted # DoxygenLayout.xml will be used as the name of the layout file. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files # containing the references data. This must be a list of .bib files. The # .bib extension is automatically appended if omitted. Using this command # requires the bibtex tool to be installed. See also # http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style # of the bibliography can be controlled using LATEX_BIB_STYLE. To use this # feature you need bibtex and perl available in the search path. CITE_BIB_FILES = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # The WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. 
If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. WARN_LOGFILE = doxydev.debug #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces. INPUT = .. \ ../fflas-ffpack \ ../doc \ ../tests \ ../benchmark # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. 
If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh # *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py # *.f90 *.f *.for *.vhd *.vhdl FILE_PATTERNS = *.cc \ *.cpp \ *.C \ *.h \ *.inl \ *.doxy # The RECURSIVE tag can be used to specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = CVS \ *_src.inl \ Attic # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = */CVS/* \ *_src.inl \ */Attic/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = *__FFLASFFPACK__* # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = .. 
# If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = *.C \ *.inl \ *INSTALL \ *COPYING \ *HACKING \ *AUTHORS \ *.html # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. EXAMPLE_RECURSIVE = YES # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain images that are included in the documentation (see # the \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command <filter> <input-file>, where <filter> # is the value of the INPUT_FILTER tag, and <input-file> is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. # If FILTER_PATTERNS is specified, this tag will be # ignored. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. # Doxygen will compare the file name with each pattern and apply the # filter if there is a match. # The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty or if # none of the patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). 
FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) # and it is also possible to disable source filtering for a specific pattern # using *.ext= (so without naming a filter). This option only has effect when # FILTER_SOURCE_FILES is enabled. FILTER_SOURCE_PATTERNS = #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C and C++ comments will always remain visible. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. # Otherwise they will link to the documentation. 
REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = NO #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = NO # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. HTML_OUTPUT = fflas-ffpack-dev-html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. Note that when using a custom header you are responsible # for the proper inclusion of any scripts and style sheets that doxygen # needs, which is dependent on the configuration options used. # It is advised to generate a default header using "doxygen -w html # header.html footer.html stylesheet.css YourConfigFile" and then modify # that header. Note that the header is subject to change so you typically # have to redo this when upgrading to a newer version of doxygen or when # changing the value of configuration settings such as GENERATE_TREEVIEW! HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If the tag is left blank doxygen # will generate a default style sheet. Note that doxygen will try to copy # the style sheet file to the HTML output directory, so don't put your own # style sheet in the HTML output directory as well, or it will be erased! HTML_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. 
Note # that these files will be copied to the base HTML output directory. Use the # $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that # the files will be copied as-is; there are no commands or markers available. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. # Doxygen will adjust the colors in the style sheet and background images # according to this color. Hue is specified as an angle on a colorwheel, # see http://en.wikipedia.org/wiki/Hue for more information. # For instance the value 0 represents red, 60 is yellow, 120 is green, # 180 is cyan, 240 is blue, 300 purple, and 360 is red again. # The allowed range is 0 to 359. HTML_COLORSTYLE_HUE = 110 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of # the colors in the HTML output. For a value of 0 the output will use # grayscales only. A value of 255 will produce the most vivid colors. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to # the luminance component of the colors in the HTML output. Values below # 100 gradually make the output lighter, whereas values above 100 make # the output darker. The value divided by 100 is the actual gamma applied, # so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, # and 100 does not change the gamma. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting # this to NO can help when comparing the output of multiple runs. HTML_TIMESTAMP = YES # If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, # files or namespaces will be aligned in HTML using tables. If set to # NO a bullet list will be used. 
HTML_ALIGN_MEMBERS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. For this to work a browser that supports # JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, # Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari). HTML_DYNAMIC_SECTIONS = YES # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). # To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. 
DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated # that can be used as input for Qt's qhelpgenerator to generate a # Qt Compressed Help (.qch) of the generated HTML documentation. 
GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = doc # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to # add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters # (Qt Help Project / Custom Filters). QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's # filter section matches. For more information please see # http://doc.trolltech.com/qthelpproject.html#filter-attributes # (Qt Help Project / Filter Attributes). QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files # will be generated, which together with the HTML files, form an Eclipse help # plugin. To install this plugin and make it available under the help contents # menu in Eclipse, the contents of the directory containing the HTML and XML # files needs to be copied into the plugins directory of eclipse. The name of # the directory within the plugins directory should be the same as # the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before # the help appears. 
GENERATE_ECLIPSEHELP = NO # A unique identifier for the eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have # this name. ECLIPSE_DOC_ID = org.doxygen.Project # The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) # at top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. Since the tabs have the same information as the # navigation tree you can set this option to YES if you already set # GENERATE_TREEVIEW to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. # If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. # Since the tree basically has the same information as the tab index you # could consider setting DISABLE_INDEX to YES when enabling this option. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values # (range [0,1..20]) that doxygen will group on one line in the generated HTML # documentation. Note that a value of 0 will completely suppress the enum # values from appearing in the overview section. ENUM_VALUES_PER_LINE = 4 # By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, # and Class Hierarchy pages using a tree view instead of an ordered list. USE_INLINE_TREES = NO # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. 
TREEVIEW_WIDTH = 250 # When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open # links to external symbols imported via tag files in a separate window. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are # not supported properly for IE 6.0, but are supported on all modern browsers. # Note that when changing this option you need to delete any form_*.png files # in the HTML output before the changes have effect. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax # (see http://www.mathjax.org) which uses client side Javascript for the # rendering instead of using prerendered bitmaps. Use this if you do not # have LaTeX installed or if you want formulas to look prettier in the HTML # output. When enabled you may also need to install MathJax separately and # configure the path to it using the MATHJAX_RELPATH option. USE_MATHJAX = NO # When MathJax is enabled you need to specify the location relative to the # HTML output directory using the MATHJAX_RELPATH option. The destination # directory should contain the MathJax.js script. For instance, if the mathjax # directory is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to # the MathJax Content Delivery Network so you can quickly see the result without # installing MathJax. # However, it is strongly recommended to install a local # copy of MathJax from http://www.mathjax.org before deployment. 
MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension # names that should be enabled during MathJax rendering. MATHJAX_EXTENSIONS = # When the SEARCHENGINE tag is enabled doxygen will generate a search box # for the HTML output. The underlying search engine uses javascript # and DHTML and should work on any modern browser. Note that when using # HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets # (GENERATE_DOCSET) there is already a search function so this one should # typically be disabled. For large projects the javascript based search engine # can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. SEARCHENGINE = NO # When the SERVER_BASED_SEARCH tag is enabled the search engine will be # implemented using a PHP enabled web server instead of at the web client # using Javascript. Doxygen will generate the search PHP script and index # file to put on the web server. The advantage of the server # based approach is that it scales better to large projects and allows # full text search. The disadvantages are that it is more difficult to setup # and does not have live searching capabilities. SERVER_BASED_SEARCH = NO #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path. LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. 
# Note that when enabling USE_PDFLATEX this option is only used for # generating bitmaps for formulas in the HTML output, but not in the # Makefile that is written to the output directory. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = NO # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, letter, legal and # executive. If left blank a4wide will be used. PAPER_TYPE = a4wide # The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = stmaryrd \ amsmath # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for # the generated latex document. The footer should contain everything after # the last chapter. If it is left blank doxygen will generate a # standard footer. Notice: only use this tag if you know what you are doing! LATEX_FOOTER = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = NO # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. 
Set this option to YES to get a # higher quality PDF documentation. USE_PDFLATEX = NO # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include # source code with syntax highlighting in the LaTeX output. # Note that which sources are shown also depends on other settings # such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See # http://en.wikipedia.org/wiki/BibTeX for more info. LATEX_BIB_STYLE = plain #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. 
The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load style sheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. 
MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. 
GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. # This is useful # if you want to understand what is going on. # On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. 
Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. MACRO_EXPANSION = NO # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the includes files # pointed to by INCLUDE_PATH will be searched when a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition that # overrules the definition found in the source code. 
EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all references to function-like macros # that are alone on a line, have an all uppercase name, and do not end with a # semicolon, because these will confuse the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. For each # tag file the location of the external documentation should be added. The # format of a tag file without this location is as follows: # # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths # or URLs. Note that each tag file must have a unique name (where the name does # NOT include the path). If a tag file is not located in the directory in which # doxygen is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). 
PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option also works with HAVE_DOT disabled, but it is recommended to # install and use dot, since it yields more powerful graphs. CLASS_DIAGRAMS = YES # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = NO # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will # base this on the number of processors available in the system. You can set it # explicitly to a value larger than 0 to get control over the balance # between CPU load and processing speed. DOT_NUM_THREADS = 0 # By default doxygen will use the Helvetica font for all dot files that # doxygen generates. 
When you want a differently looking font you can specify # the font name using DOT_FONTNAME. You need to make sure dot is able to find # the font, which can be done by putting it in a standard location or by setting # the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the # directory containing the font. DOT_FONTNAME = FreeSans.ttf # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. # The default size is 10pt. DOT_FONTSIZE = 10 # By default doxygen will tell dot to use the Helvetica font. # If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to # set the path where dot can find it. DOT_FONTPATH = # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. Setting this tag to YES will force the # CLASS_DIAGRAMS tag to NO. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If the UML_LOOK tag is enabled, the fields and methods are shown inside # the class node. If there are many fields or methods and many nodes the # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS # threshold limits the number of items for each type to make the size more # manageable. Set this to 0 for no limit. Note that the threshold may be # exceeded by 50% before the limit is enforced. 
UML_LIMIT_NUM_FIELDS = 10 # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT options are set to YES then # doxygen will generate a call dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command. CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. 
DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are svg, png, jpg, or gif. # If left blank png will be used. If you choose svg you need to set # HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible in IE 9+ (other browsers do not have this requirement). DOT_IMAGE_FORMAT = png # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. # Note that this requires a modern browser other than Internet Explorer. # Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you # need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible. Older versions of IE do not have SVG support. INTERACTIVE_SVG = NO # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the # \mscfile command). MSCFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. 
DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lie further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. MAX_DOT_GRAPH_DEPTH = 1000 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, because dot on Windows does not # seem to support this out of the box. Warning: Depending on the platform used, # enabling this option may lead to badly anti-aliased labels on the edges of # a graph (i.e. they become hard to read). DOT_TRANSPARENT = NO # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. DOT_CLEANUP = YES fflas-ffpack-2.2.2/doc/Makefile.am000066400000000000000000000047321274716147400166640ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox documentation # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ if FFLASFFPACK_BUILD_DOC USE_TARGETS = docs INSTALL_TARGETS = install-doc endif docdir=$(FFLASFFPACK_DOC_PATH) #man1_MANS = fflas-ffpack-config.1 all all-local: $(USE_TARGETS) install install-data-local: $(USE_TARGETS) $(INSTALL_TARGETS) docs: sed -i 's/^\\version.*/\\version\ $(VERSION)/' mainpage.doxy if test -d fflas-ffpack-html ; then echo exists; else mkdir fflas-ffpack-html ; fi if test -d fflas-ffpack-dev-html ; then echo exists; else mkdir fflas-ffpack-dev-html ; fi cp ../INSTALL fflas-ffpack-html/ cp ../COPYING fflas-ffpack-html/ cp ../AUTHORS fflas-ffpack-html/ doxygen Doxyfile docs_dev: $(MAKE) docs cp ../INSTALL fflas-ffpack-dev-html/ cp ../COPYING fflas-ffpack-dev-html/ cp ../AUTHORS fflas-ffpack-dev-html/ doxygen DoxyfileDev # cp tutorial.html fflas-ffpack-dev-html/ # cp install-dist.html fflas-ffpack-dev-html/ # cp install-dev.html fflas-ffpack-dev-html/ install-doc: mkdir -p $(DESTDIR)/$(docdir) cp -rp fflas-ffpack-html $(DESTDIR)/$(docdir)/fflas-ffpack-html cp -rp fflas-ffpack-dev-html $(DESTDIR)/$(docdir)/fflas-ffpack-dev-html cp -p fflas-ffpack.html $(DESTDIR)/$(docdir)/fflas-ffpack.html EXTRA_DIST= \ Doxyfile \ DoxyfileDev \ mainpage.doxy \ fflas-ffpack.html # \ # doc.doxy \ # tutorial.doxy \ # 
fflas-ffpack.html \ # fflas-ffpack-config.1 \ #tutorial.html \ # install-dev.html \ # index-dev.html \ # install-dist.html clean-local : rm -rf fflas-ffpack-html rm -rf fflas-ffpack-dev-html fflas-ffpack-2.2.2/doc/fflas-ffpack.html000066400000000000000000000032021274716147400200300ustar00rootroot00000000000000 Starter to FFLAS-FFPACK documentation FFLAS-FFPACK documentation main page: fflas-ffpack-html/index.html (going there in 5 seconds). If it is a dead link, you have to build the documentation first with make docs. fflas-ffpack-2.2.2/doc/mainpage.doxy000066400000000000000000000063131274716147400173130ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox documentation # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ /** @mainpage FFLAS-FFPACK Documentation. * * \section intro Introduction * FFLAS-FFPACK is a LGPL-2.1+ source code library for basic linear algebra operations over a finite field. It is inspired by BLAS interface (Basic Linear Algebra Subprograms) and the LAPACK library for numerical linear algebra, and shares part of their design. 
Yet it differs in many aspects due to the specificities of computing over a finite field: * - it is generic with respect to the finite field, so as to accommodate a large variety of field sizes and implementations; * - it is a pure source code library, to be included and compiled in the user's software. Its build system is only used for tests and benchmarks. * * \section goals Goals * * \section desig Design * * \section using Using FFLAS-FFPACK. * - \subpage copy. * - \subpage tuto. This is a brief introduction to %FFLAS-FFPACK * capabilities. * - \subpage inst. Explains how to configure/install from sources or from the latest * svn version. * - \subpage arch. Describes how FFLAS-FFPACK is organized * - Documentation for Users. If everything around is * blue, then you are reading the lighter, user-oriented, documentation. * - Documentation for Developers. If everything around * is green, then you can get to everything (not necessarily yet) documented. * . * *\section contrib Contributing to fflas-ffpack, getting assistance. * *\version 2.1.0 */ /** @page inst Configuring and Installing FFLAS-FFPACK * FFLAS-FFPACK is a header-only package. However, the configuration process can be tweaked a lot. * Configure looks for BLAS routines and the Givaro library, which are both mandatory dependencies. * See the output of ./configure --help for information about the LAPACK/BLAS discovery strategies. */ /** @page copy Copying and Licence * @brief The FFLAS-FFPACK library is licensed under the terms of the GNU LGPL v2.1 or later. * See https://www.gnu.org/licenses/lgpl-2.1.html */ /** @page tuto Tutorial * @brief no doc. */ /** @page arch Architecture of the library. * @brief no doc. 
*/ // vim:syn=doxygen fflas-ffpack-2.2.2/examples/000077500000000000000000000000001274716147400156735ustar00rootroot00000000000000fflas-ffpack-2.2.2/examples/101-fgemm.C000066400000000000000000000042631274716147400173760ustar00rootroot00000000000000/* Copyright (c) FFLAS-FFPACK * Written by Jean-Guillaume Dumas * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include #include #include #include #include #include using namespace FFLAS; /* Fills a 2x3 matrix A and a 3x2 matrix B over the ring F(101), computes C = A*B with fgemm, and prints A, B and C. */ int main(int argc, char** argv) { typedef Givaro::ModularBalanced Ring; Ring F(101); Ring::Element * A, * B, * C; A = fflas_new(F,2,3); B = fflas_new(F,3,2); C = fflas_new(F,2,2); F.assign(*(A+0),F.one); F.init(*(A+1),2); F.init(*(A+2),3); F.init(*(A+3),5); F.init(*(A+4),7); F.init(*(A+5),11); Ring::Element t,u,v; F.init(t, 2); F.init(u, 4); F.init(v); F.assign(*(B+0),F.zero); // B[0] <- 0 F.assign(*(B+1),t); // B[1] <- 2 F.assign(*(B+2),u); // B[2] <- 4 F.add(v,t,u); F.assign(*(B+3),v); // B[3] <- 2+4 F.mul(*(B+4),t,u); // B[4] <- 2*4 F.add(*(B+5),u,v); // B[5] <- 4+6 write_field(F, std::cout << "A:=", A, 2, 3, 3,true) << std::endl; write_field(F, std::cout << "B:=", B, 3, 2, 2,true) << std::endl; fgemm (F, FflasNoTrans, FflasNoTrans, 2,2,3, F.one, A, 3, B, 
2, F.zero, C, 2 ); write_field(F, std::cout << "C:=", C, 2, 2, 2,true) << std::endl; fflas_delete( A); fflas_delete( B); fflas_delete( C); return 0; } fflas-ffpack-2.2.2/examples/2x2-fgemm.C000066400000000000000000000035361274716147400175120ustar00rootroot00000000000000/* Copyright (c) FFLAS-FFPACK * Written by Jean-Guillaume Dumas * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include #include #include #include #include #include using namespace FFLAS; /* Multiplies the 2x2 matrices A and B over the ring F(11) with fgemm and prints A, B and C = A*B. Each leading dimension passed to fgemm is the column count of the corresponding row-major matrix (k for A, n for B and C). */ int main(int argc, char** argv) { typedef Givaro::Modular Ring; Ring F(11); Ring::Element A[4]{1,2,3,4}, B[4]{5,6,7,8}, * C; size_t m(2),k(2),n(2); C = fflas_new(F,m,n); // A is mxk with leading dimension k write_field(F, std::cout << "A:=", A, m, k, k, true) << std::endl; // B is kxn with leading dimension n write_field(F, std::cout << "B:=", B, k, n, n, true) << std::endl; fgemm (F, FflasNoTrans, FflasNoTrans, m, n, k, F.one, A, k, B, n, F.zero, C, n); // C is mxn with leading dimension n write_field(F, std::cout << "C:=", C, m, n, n, true) << " modulo 11" << std::endl; fflas_delete( C); return 0; } fflas-ffpack-2.2.2/examples/2x2-pluq.C000066400000000000000000000036421274716147400173760ustar00rootroot00000000000000/* -*- mode: C++; 
tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include #include #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/Matio.h" using namespace std; int main(int argc, char** argv) { if (argc > 2){ std::cerr<<"Usage: 2x2-pluq

    "<1?atoi(argv[1]):5); // Creating the finite field Z/pZ Givaro::Modular F(p); size_t m(2),n(2); double A[4] {1,2,3,4}; write_field(F,std::cout<<"A = "<(m); size_t * Q = FFLAS::fflas_new(n); FFPACK::PLUQ (F, FFLAS::FflasNonUnit, m, n, A, n, P, Q); write_perm(std::cout<<"P = "< "< F(p); // Reading the matrix from a file double * A = read_field (F, file.c_str(), &m, &n); size_t * P = FFLAS::fflas_new(m); size_t * Q = FFLAS::fflas_new(n); FFPACK::PLUQ (F, FFLAS::FflasNonUnit, m, n, A, n, P, Q); write_field(F,std::cout<<"PLUQ = "< # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ prefix=@prefix@ exec_prefix=@exec_prefix@ includedir=@includedir@ libdir=@libdir@ major=`echo @VERSION@ | cut -d'.' -f1` minor=`echo @VERSION@ | cut -d'.' -f2` micro=`echo @VERSION@ | cut -d'.' 
-f3` decvr=$((((($major*100)+$minor)*100)+$micro)) cflags=false libs=false usage() { cat <= 4.0.1 Libs: @PARLIBS@ @PRECOMPILE_LIBS@ @CBLAS_LIBS@ Cflags: -I@includedir@ @DEFAULT_CFLAGS@ @CBLAS_FLAG@ @CXXFLAGS@ @PARFLAGS@ @PRECOMPILE_FLAGS@ \-------------------------------------------------------fflas-ffpack-2.2.2/fflas-ffpack/000077500000000000000000000000001274716147400164005ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/Makefile.am000066400000000000000000000022671274716147400204430ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ SUBDIRS=fflas ffpack field utils paladin interfaces checkers EXTRA_DIST=fflas-ffpack.doxy pkginclude_HEADERS = config-blas.h \ fflas-ffpack.h \ config.h \ fflas-ffpack-config.h \ fflas-ffpack-optimise.h fflas-ffpack-2.2.2/fflas-ffpack/checkers/000077500000000000000000000000001274716147400201675ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/checkers/Makefile.am000066400000000000000000000024341274716147400222260ustar00rootroot00000000000000# Copyright (c) 2016 FFLAS-FFPACK # written by Ashley Lesdalons (ash09) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/checkers pkgincludesub_HEADERS= \ checkers_fflas.h \ checkers_fflas.inl \ checkers_ffpack.h \ checkers_ffpack.inl \ checker_empty.h \ checker_pluq.inl \ checker_ftrsm.inl \ checker_fgemm.inl \ checker_charpoly.inl \ checker_invert.inl EXTRA_DIST=checkers.doxy fflas-ffpack-2.2.2/fflas-ffpack/checkers/checker_charpoly.inl000066400000000000000000000121611274716147400242010ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/Checker_charpoly.inl * Copyright (C) 2016 Ashley Lesdalons * * Written by Ashley Lesdalons * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_checker_charpoly_INL #define __FFLASFFPACK_checker_charpoly_INL #include "fflas-ffpack/ffpack/ffpack.h" #ifdef TIME_CHECKER_CHARPOLY #include #endif namespace FFPACK { template class CheckerImplem_charpoly { const Field& F; const size_t n, lda; typename Field::Element lambda, det; bool pass; #ifdef TIME_CHECKER_CHARPOLY Givaro::Timer _time; #endif public: CheckerImplem_charpoly(const Field& F_, const size_t n_, typename Field::ConstElement_ptr A, size_t lda_) : F(F_), n(n_), lda(lda_) { typename Field::RandIter G(F); init(G,A); } CheckerImplem_charpoly(typename Field::RandIter &G, const size_t n_, typename Field::ConstElement_ptr A, size_t lda_) : F(G.ring()), n(n_), lda(lda_) { init(G,A); } ~CheckerImplem_charpoly() { } inline bool check(Polynomial &g) { #ifdef TIME_CHECKER_CHARPOLY Givaro::Timer checktime; checktime.start(); #endif typename Field::Element h = F.zero, t = F.one, u; for (size_t i=0; i < g.size(); ++i) { F.mul(u,g[i],t); F.add(h,h,u); F.mul(t,t,lambda); } // is h == det ? 
pass = pass && F.areEqual(h,det); if (!pass) throw FailureCharpolyCheck(); #ifdef TIME_CHECKER_CHARPOLY checktime.stop(); _time += checktime; std::cerr << "CHARPol CHECK: " << _time << std::endl; #endif return pass; } private: inline void init(typename Field::RandIter &G, typename Field::ConstElement_ptr A) { #ifdef TIME_CHECKER_CHARPOLY Givaro::Timer inittime; inittime.start(); #endif // random lambda G.random(lambda); typename Field::Element_ptr v = FFLAS::fflas_new(F,n,1), w = FFLAS::fflas_new(F,n,1), Ac = FFLAS::fflas_new(F,n,n); FFLAS::frand(F,G,n,v,1); // w <- -A.v FFLAS::fgemv(F, FFLAS::FflasNoTrans, n, n, F.mOne, A, lda, v, 1, F.zero, w, 1); if (!F.isZero(lambda)) { // w <- lambda.v + w FFLAS::faxpy(F, n, lambda, v, 1, w, 1); } // Ac <- A - lambda.I FFLAS::fassign(F,n,n,A,lda,Ac,n); for (size_t i=0; i(n); size_t *Q = FFLAS::fflas_new(n); #ifdef TIME_CHECKER_CHARPOLY Givaro::Timer pluqtime; pluqtime.start(); #endif FFPACK::PLUQ(F, FFLAS::FflasNonUnit, n, n, Ac, n, P, Q); #ifdef TIME_CHECKER_CHARPOLY pluqtime.stop(); _time -= pluqtime; inittime.stop(); _time += inittime; std::cerr << "CHARPol server PLUQ:" << pluqtime << std::endl; inittime.start(); #endif // compute the determinant of A F.init(det,*Ac); for (size_t i=1; is,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/checker_empty.h * Copyright (C) 2016 JG Dumas * * Written by Jean-Guillaume Dumas * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_checkers_empty_H #define __FFLASFFPACK_checkers_empty_H #include "fflas-ffpack/fflas-ffpack-config.h" namespace FFLAS { template struct Checker_Empty { template Checker_Empty(Params... parameters) {} template bool check(Params... parameters) { return true; } }; } #endif fflas-ffpack-2.2.2/fflas-ffpack/checkers/checker_fgemm.inl000066400000000000000000000074451274716147400234640ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/checker_fgemm.inl * Copyright (C) 2016 Ashley Lesdalons * * Written by Ashley Lesdalons * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_checker_fgemm_INL #define __FFLASFFPACK_checker_fgemm_INL namespace FFLAS { template class CheckerImplem_fgemm { const Field& F; const size_t m,n,k,ldc; typename Field::Element_ptr v,w1; public: CheckerImplem_fgemm(const Field &F_, const size_t m_, const size_t n_, const size_t k_, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc_) : F(F_), m(m_), n(n_), k(k_), ldc(ldc_), v(FFLAS::fflas_new(F_,n,1)),w1(FFLAS::fflas_new(F_,m,1)) { typename Field::RandIter G(F); init(G,beta,C); } CheckerImplem_fgemm(typename Field::RandIter &G, const size_t m_, const size_t n_, const size_t k_, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc_) : F(G.ring()), m(m_), n(n_), k(k_), ldc(ldc_), v(FFLAS::fflas_new(F,n,1)),w1(FFLAS::fflas_new(F,m,1)) { init(G,beta,C); } ~CheckerImplem_fgemm() { FFLAS::fflas_delete(v,w1); } inline bool check(const FFLAS::FFLAS_TRANSPOSE ta, const FFLAS::FFLAS_TRANSPOSE tb, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::ConstElement_ptr C) { // w1 <- C.v - w1 FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, C, ldc, v, 1, F.mOne, w1, 1); // w2 <- B.v typename Field::Element_ptr w2 = FFLAS::fflas_new(F,k,1); FFLAS::fgemv(F, tb, k, n, F.one, B, ldb, v, 1, F.zero, w2, 1); // w1 <- alpha.A.w2 - w1 FFLAS::fgemv(F, ta, m, k, alpha, A, lda, w2, 1, F.mOne, w1, 1); FFLAS::fflas_delete(w2); // is w1 == O ? 
bool pass = FFLAS::fiszero(F, m, w1, 1); if (!pass) throw FailureFgemmCheck(); return pass; } private: inline void init(typename Field::RandIter &G, const typename Field::Element beta, typename Field::Element_ptr C) { FFLAS::frand(F,G,n,v,1); // w1 <- beta.C.v FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, beta, C, ldc, v, 1, F.zero, w1, 1); } }; } #endif // __FFLASFFPACK_checker_fgemm_INL fflas-ffpack-2.2.2/fflas-ffpack/checkers/checker_ftrsm.inl000066400000000000000000000106021274716147400235110ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/Checker_ftrsm.inl * Copyright (C) 2016 Ashley Lesdalons * * Written by Ashley Lesdalons * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_checker_ftrsm_INL #define __FFLASFFPACK_checker_ftrsm_INL namespace FFLAS { template class CheckerImplem_ftrsm { const Field& F; typename Field::Element_ptr v,w; public: CheckerImplem_ftrsm(const Field& F_, const size_t m, const size_t n, const typename Field::Element alpha, const typename Field::ConstElement_ptr B, const size_t ldb) : F(F_), v(FFLAS::fflas_new(F_,n,1)), w(FFLAS::fflas_new(F_,m,1)) { typename Field::RandIter G(F); init(G,m,n,B,ldb,alpha); } CheckerImplem_ftrsm(typename Field::RandIter &G, const size_t m, const size_t n, const typename Field::Element alpha, const typename Field::ConstElement_ptr B, const size_t ldb) : F(G.ring()), v(FFLAS::fflas_new(F,n,1)), w(FFLAS::fflas_new(F,m,1)) { init(G,m,n,B,ldb,alpha); } ~CheckerImplem_ftrsm() { FFLAS::fflas_delete(v,w); } inline bool check(const FFLAS::FFLAS_SIDE side, const FFLAS::FFLAS_UPLO uplo, const FFLAS::FFLAS_TRANSPOSE trans, const FFLAS::FFLAS_DIAG diag, const size_t m, const size_t n, #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr #else typename Field::Element_ptr #endif A, size_t lda, const typename Field::ConstElement_ptr X, size_t ldx) { size_t k = (side==FFLAS::FflasLeft?m:n); typename Field::Element_ptr v1 = FFLAS::fflas_new(F,k,1); if (side==FFLAS::FflasLeft) { // (Left) v1 <- X.v // (Left) v1 <- A.v1 // (Left) w <- w - v1 FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, X, ldx, v, 1, F.zero, v1, 1); FFLAS::ftrmm(F, FFLAS::FflasLeft, uplo, trans, diag, k, 1, F.one, A, lda, v1, 1); FFLAS::fsubin(F, m, v1, 1, w, 1); } else { // (Right) v <- A.v // (Right) w <- X.v - w FFLAS::ftrmm(F, FFLAS::FflasLeft, uplo, trans, diag, k, 1, F.one, A, lda, v, 1); FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, X, ldx, v, 1, F.mOne, w, 1); } FFLAS::fflas_delete(v1); bool pass = FFLAS::fiszero(F,m,1,w,1); if (!pass) throw FailureTrsmCheck(); return pass; } private: inline void init(typename Field::RandIter &G, const size_t m, const size_t n, const typename 
Field::ConstElement_ptr B, size_t ldb, const typename Field::Element alpha) { FFLAS::frand(F,G,n,v,1); // w <- alpha.B.v FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, alpha, B, ldb, v, 1, F.zero, w, 1); } }; } #endif // __FFLASFFPACK_checker_ftrsm_INL fflas-ffpack-2.2.2/fflas-ffpack/checkers/checker_invert.inl000066400000000000000000000057541274716147400237010ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/Checker_invert.inl * Copyright (C) 2016 Ashley Lesdalons * * Written by Ashley Lesdalons * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_checker_invert_INL #define __FFLASFFPACK_checker_invert_INL namespace FFPACK { template class CheckerImplem_invert { const Field& F; typename Field::Element_ptr v,w; const size_t m,lda; public: CheckerImplem_invert(const Field& F_, const size_t m_, typename Field::ConstElement_ptr A, const size_t lda_) : F(F_), v(FFLAS::fflas_new(F_,m_,1)), w(FFLAS::fflas_new(F_,m_,1)), m(m_), lda(lda_) { typename Field::RandIter G(F); init(G,m,A,lda); } CheckerImplem_invert(typename Field::RandIter &G, const size_t m_, typename Field::ConstElement_ptr A, const size_t lda_) : F(G.ring()), v(FFLAS::fflas_new(F,m_,1)), w(FFLAS::fflas_new(F,m_,1)), m(m_), lda(lda_) { init(G,m,A,lda); } ~CheckerImplem_invert() { FFLAS::fflas_delete(v,w); } inline bool check(typename Field::ConstElement_ptr A, int nullity) { // v <- A.w - v FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, m, F.one, A, lda, w, 1, F.mOne, v, 1); bool pass = FFLAS::fiszero(F,m,1,v,1) || (nullity != 0); if (!pass) throw FailureInvertCheck(); return pass; } private: void init(typename Field::RandIter &G, const size_t m_, typename Field::ConstElement_ptr A, const size_t lda_) { FFLAS::frand(F,G,m,v,1); // write_field(F,std::cerr<<"init A : ",A,m,m,lda,true)<s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/checker_pluq.inl * Copyright (C) 2016 Jean-Guillaume Dumas * * Written by Ashley Lesdalons * Jean-Guillaume Dumas * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_checker_pluq_INL #define __FFLASFFPACK_checker_pluq_INL #include "fflas-ffpack/ffpack/ffpack.h" #ifdef TIME_CHECKER_PLUQ #include #endif namespace FFPACK { template class CheckerImplem_PLUQ { const Field& F; typename Field::Element_ptr v,w; const size_t m,n; #ifdef TIME_CHECKER_PLUQ Givaro::Timer _time; #endif public: CheckerImplem_PLUQ(const Field& F_, size_t m_, size_t n_, typename Field::ConstElement_ptr A, size_t lda) : F(F_), v(FFLAS::fflas_new(F_,n_,1)), w(FFLAS::fflas_new(F_,m_,1)), m(m_), n(n_) { typename Field::RandIter G(F); init(G,A,lda); } CheckerImplem_PLUQ(typename Field::RandIter &G, size_t m_, size_t n_, typename Field::ConstElement_ptr A, size_t lda) : F(G.ring()), v(FFLAS::fflas_new(F,n_,1)), w(FFLAS::fflas_new(F,m_,1)), m(m_), n(n_) { init(G,A,lda); } ~CheckerImplem_PLUQ() { FFLAS::fflas_delete(v,w); } /** check if the PLUQ factorization is correct. 
* Returns true if w - P(L(U(Q.v))) == 0 * @param A * @param r * @param P * @param Q */ inline bool check(typename Field::ConstElement_ptr A, size_t lda, size_t r, size_t *P, size_t *Q) { #ifdef TIME_CHECKER_PLUQ Givaro::Timer checktime; checktime.start(); #endif // _w = [w1|w2] typename Field::Element_ptr _w = FFLAS::fflas_new(F,m,1); // v <-- Q.v FFPACK::applyP(F, FFLAS::FflasLeft, FFLAS::FflasNoTrans, 1, 0, r, v, 1, Q); // w1 <- V1 && w2 <- 0 FFLAS::fassign(F, r, 1, v, 1, _w, 1); FFLAS::fzero(F, m-r, _w+r, 1); // w1 <- U1.w1 // WARNING: should be ftrmv FFLAS::ftrmm(F, FFLAS::FflasLeft, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, r, 1, F.one, A, lda, _w, 1); // w1 <- U2.V2 + w1 if (r < n) FFLAS::fgemm(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, r, 1, n-r, F.one, A+r, lda, v+r, 1, F.one, _w, 1); // w2 <- L2.w1 if (r < m) FFLAS::fgemm(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m-r, 1, r, F.one, A+r*n, lda, _w, 1, F.zero, _w+r, 1); // w1 <- L1.w1 // WARNING: should be ftrmv FFLAS::ftrmm(F, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasUnit, r, 1, F.one, A, lda, _w, 1); // _w <- P._w FFPACK::applyP(F, FFLAS::FflasRight, FFLAS::FflasNoTrans, 1, 0, r, _w, 1, P); // is _w == w ? 
FFLAS::fsubin(F, m, w, 1, _w, 1); bool pass = FFLAS::fiszero(F,m,_w,1); FFLAS::fflas_delete(_w); if (!pass) throw FailurePLUQCheck(); #ifdef TIME_CHECKER_PLUQ checktime.stop(); _time += checktime; std::cerr << "PLUQ CHECK: " << _time << std::endl; #endif return pass; } private: inline void init(typename Field::RandIter &G, typename Field::ConstElement_ptr A, size_t lda) { #ifdef TIME_CHECKER_PLUQ Givaro::Timer inittime; inittime.start(); #endif FFLAS::frand(F,G,n,v,1); // w <-- A.v FFLAS::fgemv(F, FFLAS::FflasNoTrans, m, n, F.one, A, lda, v, 1, F.zero, w, 1); #ifdef TIME_CHECKER_PLUQ inittime.stop(); _time += inittime; #endif } }; } #endif // __FFLASFFPACK_checker_pluq_INL fflas-ffpack-2.2.2/fflas-ffpack/checkers/checkers.doxy000066400000000000000000000022061274716147400226630ustar00rootroot00000000000000# Copyright (c) 2016 FFLAS-FFPACK # written by Ashley Lesdalons (ash09) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ /** \ingroup fflas-ffpack * \defgroup checker CHECKER * * \brief Class CHECKER provides functions to verify computations in FFLAS and FFPACK. 
* */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/checkers/checkers_fflas.h000066400000000000000000000045501274716147400233060ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/checkers.h * Copyright (C) 2016 Ashley Lesdalons, JG Dumas * * Written by Ashley Lesdalons * Written by Jean-Guillaume Dumas * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_checkers_fflas_H #define __FFLASFFPACK_checkers_fflas_H #include "fflas-ffpack/fflas-ffpack-config.h" #include "checker_empty.h" #ifdef DEBUG #define CHECKING_MODE 1 #define ENABLE_ALL_CHECKINGS 1 #endif #ifdef ENABLE_ALL_CHECKINGS #define ENABLE_CHECKER_fgemm 1 #define ENABLE_CHECKER_ftrsm 1 #endif #ifdef TIME_CHECKERS #include #define TIME_CHECKER_FGEMM #define TIME_CHECKER_FTRSM #endif // definition of the exceptions class FailureFgemmCheck {}; class FailureTrsmCheck {}; namespace FFLAS { template class CheckerImplem_fgemm; template class CheckerImplem_ftrsm; } namespace FFLAS { #ifdef ENABLE_CHECKER_fgemm template using Checker_fgemm = CheckerImplem_fgemm; #else template using Checker_fgemm = FFLAS::Checker_Empty; #endif #ifdef ENABLE_CHECKER_ftrsm template using Checker_ftrsm = CheckerImplem_ftrsm; #else template using Checker_ftrsm = FFLAS::Checker_Empty; #endif } #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas/fflas_enum.h" #include "fflas-ffpack/utils/fflas_memory.h" #endif fflas-ffpack-2.2.2/fflas-ffpack/checkers/checkers_fflas.inl000066400000000000000000000026711274716147400236430ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/checkers.inl * Copyright (C) 2016 Ashley Lesdalons * * Written by Ashley Lesdalons * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef FFLASFFPACK_checkers_fflas_inl_H #define FFLASFFPACK_checkers_fflas_inl_H #include "checker_fgemm.inl" #include "checker_ftrsm.inl" namespace FFLAS { template using ForceCheck_fgemm = CheckerImplem_fgemm; template using ForceCheck_ftrsm = CheckerImplem_ftrsm; } #endif fflas-ffpack-2.2.2/fflas-ffpack/checkers/checkers_ffpack.h000066400000000000000000000052741274716147400234510ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/checkers.h * Copyright (C) 2016 Ashley Lesdalons, JG Dumas * * Written by Ashley Lesdalons * Written by Jean-Guillaume Dumas * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_checkers_ffpack_H #define __FFLASFFPACK_checkers_ffpack_H #include "fflas-ffpack/fflas-ffpack-config.h" #include "checker_empty.h" #ifdef DEBUG #define CHECKING_MODE 1 #define ENABLE_ALL_CHECKINGS 1 #endif #ifdef ENABLE_ALL_CHECKINGS #define ENABLE_CHECKER_PLUQ 1 #define ENABLE_CHECKER_invert 1 #define ENABLE_CHECKER_charpoly 1 #endif #ifdef TIME_CHECKERS #include #define TIME_CHECKER_PLUQ #define TIME_CHECKER_INVERT #define TIME_CHECKER_CHARPOLY #endif // definition of the exceptions class FailurePLUQCheck {}; class FailureInvertCheck {}; class FailureCharpolyCheck {}; namespace FFPACK { template class CheckerImplem_PLUQ; template class CheckerImplem_invert; template class CheckerImplem_charpoly; } namespace FFPACK { #ifdef ENABLE_CHECKER_PLUQ template using Checker_PLUQ = CheckerImplem_PLUQ; #else template using Checker_PLUQ = FFLAS::Checker_Empty; #endif #ifdef ENABLE_CHECKER_invert template using Checker_invert = CheckerImplem_invert; #else template using Checker_invert = FFLAS::Checker_Empty; #endif #ifdef ENABLE_CHECKER_charpoly template using Checker_charpoly = CheckerImplem_charpoly; #else template using Checker_charpoly = FFLAS::Checker_Empty; #endif } #include "fflas-ffpack/ffpack/ffpack.h" #endif fflas-ffpack-2.2.2/fflas-ffpack/checkers/checkers_ffpack.inl000066400000000000000000000031141274716147400237730ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* checkers/checkers.inl * Copyright (C) 2016 Ashley Lesdalons * * Written by Ashley Lesdalons * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef FFLASFFPACK_checkers_ffpack_inl_H #define FFLASFFPACK_checkers_ffpack_inl_H #include "checker_pluq.inl" #include "checker_invert.inl" #include "checker_charpoly.inl" namespace FFPACK { template using ForceCheck_PLUQ = CheckerImplem_PLUQ; template using ForceCheck_invert = CheckerImplem_invert; template using ForceCheck_charpoly = CheckerImplem_charpoly; } #endif fflas-ffpack-2.2.2/fflas-ffpack/config-blas.h000066400000000000000000000502251274716147400207410ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* config-blas.h * Copyright (C) 2005 Pascal Giorgi * 2007 Clement Pernet * Written by Pascal Giorgi * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #ifndef __FFLASFFPACK_config_blas_H #define __FFLASFFPACK_config_blas_H // #include "fflas-ffpack/utils/fflas_memory.h" // #ifndef __FFLASFFPACK_CONFIGURATION // #include "fflas-ffpack/fflas-ffpack-config.h" // #endif // #ifdef OPTIMISATION_MODE // #include "fflas-ffpack/config.h" // #endif #ifdef HAVE_MKL #define __FFLASFFPACK_HAVE_MKL #endif #ifdef __FFLASFFPACK_HAVE_MKL #include #endif #ifndef CBLAS_INT #ifdef blasint /* openblas */ #define CBLAS_INT blasint #elif defined( MKL_INT ) #define CBLAS_INT MKL_INT #else #define CBLAS_INT int #endif /* blasint */ #endif /* CBLAS_INT */ #ifdef CUDA_BLAS #define sgemv_ cublas_sgemv #define sgemm_ cublas_sgemm #define strsm_ cublas_strsm #define strmm_ cublas_strmm #endif // CUDA_BLAS #ifndef __FFLASFFPACK_HAVE_MKL #define CBLAS_ENUM_DEFINED_H enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; // #define CBLAS_INDEX int #ifndef __FFLASFFPACK_HAVE_CBLAS // CBLAS are not available define our own wrapper // define external link to BLAS function extern "C" { #define CBLAS_EXTERNALS static const char* EXT_BLAS_TRANSPOSE (CBLAS_TRANSPOSE t) { if (t == CblasNoTrans) return "N"; else if (t == CblasTrans) return "T"; else return "";} static const char* EXT_BLAS_TRANSPOSE_tr (CBLAS_TRANSPOSE t) { if (t == CblasNoTrans) return "T"; else if (t == CblasTrans) return "N"; else return "";} static const char* EXT_BLAS_UPLO (CBLAS_UPLO t) { if (t == CblasUpper) return "U"; else return "L";} static const char* EXT_BLAS_UPLO_tr (CBLAS_UPLO t) { if (t 
== CblasUpper) return "L"; else return "U";} static const char* EXT_BLAS_DIAG (CBLAS_DIAG t) { if (t == CblasUnit) return "U"; else return "N";} static const char* EXT_BLAS_SIDE (CBLAS_SIDE t) { if (t == CblasLeft) return "L"; else return "R";} static const char* EXT_BLAS_SIDE_tr (CBLAS_SIDE t) { if (t == CblasLeft) return "R"; else return "L";} // level 1 routines void daxpy_ (const int*, const double*, const double*, const int*, double*, const int*); void saxpy_ (const int*, const float*, const float*, const int*, float*, const int*); double ddot_ (const int*, const double*, const int*, const double*, const int*); float sdot_ (const int*, const float*, const int*, const float*, const int*); double dasum_ (const int*, const double*, const int*); int idamax_ (const int*, const double*, const int*); double dnrm2_ (const int*, const double*, const int*); // level 2 routines void dgemv_ (const char*, const int*, const int*, const double*, const double*, const int*, const double*, const int*, const double*, double*, const int*); void sgemv_ (const char*, const int*, const int*, const float*, const float*, const int*, const float*, const int*, const float*, float*, const int*); void dger_ (const int*, const int*, const double*, const double*, const int*, const double*, const int*, double*, const int*); void sger_ (const int*, const int*, const float*, const float*, const int*, const float*, const int*, float*, const int*); void dcopy_ (const int *, const double *, const int *, double *, const int *); void scopy_ (const int *, const float *, const int *, float *, const int *); void dscal_ (const int *, const double *, double *, const int *); void sscal_ (const int *, const float *, float *, const int *); // level 3 routines void dtrsm_ (const char*, const char*, const char*, const char*, const int*, const int*, const double*, const double*, const int*, double*, const int*); void strsm_ (const char*, const char*, const char*, const char*, const int*, const int*, const 
float*, const float*, const int*, float*, const int*); void dtrmm_ (const char*, const char*, const char*, const char*, const int*, const int*, const double*, const double*, const int*, double*, const int*); void strmm_ (const char*, const char*, const char*, const char*, const int*, const int*, const float*, const float*, const int*, float*, const int*); void sgemm_ (const char*, const char*, const int*, const int*, const int*, const float*, const float*, const int*, const float*, const int*, const float*, float*, const int*); void dgemm_ (const char*, const char*, const int*, const int*, const int*, const double*, const double*, const int*, const double*, const int*, const double*, double*, const int*); } // define C wrappers extern "C" { // level 1 routines inline void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY) { daxpy_ (&N,&alpha, X, &incX, Y, &incY); } inline void cblas_saxpy(const int N, const float alpha, const float *X, const int incX, float *Y, const int incY) { saxpy_ (&N,&alpha, X, &incX, Y, &incY); } inline double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY) { return ddot_ (&N, X, &incX, Y, &incY); } inline float cblas_sdot(const int N, const float *X, const int incX, const float *Y, const int incY) { return sdot_ (&N, X, &incX, Y, &incY); } inline double cblas_dasum(const int N, const double *X, const int incX){ return dasum_ (&N, X, &incX); } inline int cblas_idamax(const int N, const double *X, const int incX){ return idamax_ (&N, X, &incX); } inline double cblas_dnrm2(const int N, const double *X, const int incX){ return dnrm2_(&N, X, &incX); } // level 2 routines inline void cblas_dgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY) { if (Order == CblasRowMajor) dgemv_ ( 
EXT_BLAS_TRANSPOSE_tr(TransA), &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY); else dgemv_ ( EXT_BLAS_TRANSPOSE(TransA), &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY); } inline void cblas_sgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY) { if (Order == CblasRowMajor) sgemv_ ( EXT_BLAS_TRANSPOSE_tr(TransA), &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY); else sgemv_ ( EXT_BLAS_TRANSPOSE(TransA), &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY); } inline void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda) { if (Order == CblasRowMajor) dger_ (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda); else dger_ (&M, &N, &alpha, X, &incX, Y, &incY, A, &lda); } inline void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda) { if (Order == CblasRowMajor) sger_ (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda); else sger_ (&M, &N, &alpha, X, &incX, Y, &incY, A, &lda); } void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY) { dcopy_(&N,X,&incX,Y,&incY); } void cblas_scopy(const int N, const float *X, const int incX, float *Y, const int incY) { scopy_(&N,X,&incX,Y,&incY); } void cblas_dscal(const int N, const double alpha, double *Y, const int incY) { dscal_(&N,&alpha,Y,&incY); } void cblas_sscal(const int N, const float alpha, float *Y, const int incY) { sscal_(&N,&alpha,Y,&incY); } // level 3 routines inline void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double 
alpha, const double *A, const int lda, double *B, const int ldb) { if (Order == CblasRowMajor) dtrsm_ ( EXT_BLAS_SIDE_tr(Side), EXT_BLAS_UPLO_tr(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &N, &M, &alpha, A, &lda, B, &ldb); else dtrsm_ ( EXT_BLAS_SIDE(Side), EXT_BLAS_UPLO(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &M, &N, &alpha, A, &lda, B, &ldb); } inline void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb) { if (Order == CblasRowMajor) strsm_ ( EXT_BLAS_SIDE_tr(Side), EXT_BLAS_UPLO_tr(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &N, &M, &alpha, A, &lda, B, &ldb); else strsm_ ( EXT_BLAS_SIDE(Side), EXT_BLAS_UPLO(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &M, &N, &alpha, A, &lda, B, &ldb); } inline void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb) { if (Order == CblasRowMajor) dtrmm_ ( EXT_BLAS_SIDE_tr(Side), EXT_BLAS_UPLO_tr(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &N, &M, &alpha, A, &lda, B, &ldb); else dtrmm_ ( EXT_BLAS_SIDE(Side), EXT_BLAS_UPLO(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &M, &N, &alpha, A, &lda, B, &ldb); } inline void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb) { if (Order == CblasRowMajor) strmm_ ( EXT_BLAS_SIDE_tr(Side), EXT_BLAS_UPLO_tr(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &N, &M, &alpha, A, &lda, B, &ldb); 
else strmm_ ( EXT_BLAS_SIDE(Side), EXT_BLAS_UPLO(Uplo), EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_DIAG(Diag), &M, &N, &alpha, A, &lda, B, &ldb); } inline void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc) { if (Order == CblasRowMajor) dgemm_ ( EXT_BLAS_TRANSPOSE(TransB), EXT_BLAS_TRANSPOSE(TransA), &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); else dgemm_ ( EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_TRANSPOSE(TransB), &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } inline void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc) { if (Order == CblasRowMajor) sgemm_ ( EXT_BLAS_TRANSPOSE(TransB), EXT_BLAS_TRANSPOSE(TransA), &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); else sgemm_ ( EXT_BLAS_TRANSPOSE(TransA), EXT_BLAS_TRANSPOSE(TransB), &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } } #else // CBLAS PRESENT extern "C" { int cblas_errprn(int ierr, int info, char *form, ...); // level 1 routines void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY); void cblas_saxpy(const int N, const float alpha, const float *X, const int incX, float *Y, const int incY); double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY); float cblas_sdot(const int N, const float *X, const int incX, const float *Y, const int incY); double cblas_dasum(const int N, const double *X, const int incX); int cblas_idamax(const int N, const double *X, const int incX); double cblas_dnrm2(const int N, const double *X, const int incX); // 
level 2 routines void cblas_dgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY); void cblas_sgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY); void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda); void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda); void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY); void cblas_scopy(const int N, const float *X, const int incX, float *Y, const int incY); void cblas_dscal(const int N, const double alpha, double *Y, const int incY); void cblas_sscal(const int N, const float alpha, float *Y, const int incY); // level 3 routines void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb); void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb); void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, 
double *B, const int ldb); void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb); void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc) ; void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc) ; } #endif // CBLAS ? #endif // __FFLASFFPACK_HAVE_MKL #ifdef __FFLASFFPACK_HAVE_MKL #define blas_enum #else #define blas_enum enum #endif #ifdef __FFLASFFPACK_HAVE_LAPACK #ifndef __FFLASFFPACK_HAVE_CLAPACK #ifndef CBLAS_EXTERNALS #define CBLAS_EXTERNALS // static const char* EXT_BLAS_TRANSPOSE (CBLAS_TRANSPOSE t) { if (t == CblasNoTrans) return "N"; else if (t == CblasTrans) return "T"; else return "";} // static const char* EXT_BLAS_TRANSPOSE_tr (CBLAS_TRANSPOSE t) { if (t == CblasNoTrans) return "T"; else if (t == CblasTrans) return "N"; else return "";} static const char* EXT_BLAS_UPLO (CBLAS_UPLO t) { if (t == CblasUpper) return "U"; else return "L";} static const char* EXT_BLAS_UPLO_tr (CBLAS_UPLO t) { if (t == CblasUpper) return "L"; else return "U";} static const char* EXT_BLAS_DIAG (CBLAS_DIAG t) { if (t == CblasUnit) return "U"; else return "N";} // static const char* EXT_BLAS_SIDE (CBLAS_SIDE t) { if (t == CblasLeft) return "L"; else return "R";} // static const char* EXT_BLAS_SIDE_tr (CBLAS_SIDE t) { if (t == CblasLeft) return "R"; else return "L";} #endif // CBLAS_EXTERNALS // define external link to LAPACK 
routines extern "C" { //!@bug we should also allow lapacke from MKL void dgetrf_ (const CBLAS_INT *, const CBLAS_INT *, double *, const CBLAS_INT *, CBLAS_INT *, CBLAS_INT *); void dgetri_ (const CBLAS_INT *, double *, const CBLAS_INT *, const CBLAS_INT *, double *, const CBLAS_INT *, CBLAS_INT *); void dtrtri_ (const char *, const char *, const CBLAS_INT *, double *, const CBLAS_INT *, CBLAS_INT *); void dswap_ (const CBLAS_INT *, double *, const CBLAS_INT *, double *, const CBLAS_INT *); } // define C wrappers extern "C" { // LAPACK routines // return A=P.L.U (L unitary) with ColMajor // return A=L.U.P (U unitary) with RowMajor //! @bug Order is not used. we should use ATLAS/interfaces/lapack/C/src/clapack_dgetrf.c or similar /* Calls dgetrf_ on (M, N, A, lda, ipiv) and returns the info value it reports; the unnamed CBLAS_ORDER argument is ignored. */ inline CBLAS_INT clapack_dgetrf(const blas_enum CBLAS_ORDER, const CBLAS_INT M, const CBLAS_INT N, double *A, const CBLAS_INT lda, CBLAS_INT *ipiv) { CBLAS_INT info; dgetrf_ ( &M, &N, A, &lda, ipiv, &info); return info; } /* Default branch calls dgetri_ with an N-element workspace; when __FFLASFFPACK_AUTOIMPLEMENT_DGETRI is defined the branch below uses dtrtri_ and dtrsm_ with an N*N workspace instead. The unnamed CBLAS_ORDER argument is not used in the visible code. */ inline CBLAS_INT clapack_dgetri(const blas_enum CBLAS_ORDER, const CBLAS_INT N, double *A, const CBLAS_INT lda, const CBLAS_INT *ipiv) { CBLAS_INT info; double *work; #ifndef __FFLASFFPACK_AUTOIMPLEMENT_DGETRI // the optimum size of work can be determined via the // Lapack function ilaenv. work= new double[N]; dgetri_ (&N, A, &lda, ipiv, work, &N, &info); delete[] work; #else work= new double[N*N]; dtrtri_("U","N", &N, A, &lda, &info); if (info > 0) return 0; /* NOTE(review): this early return does not delete[] work, which was allocated just above. */ for (CBLAS_INT i=0;ii) A[i*N+j]=0.0; } work[i*N+i]=1.; } double cst=1.; dtrsm_ ("R", "L", "N", "U", &N, &N, &cst, work, &N, A, &N); CBLAS_INT ip; const CBLAS_INT incr=1; for (CBLAS_INT i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) 2012 FFLAS-FFPACK * Written by Brice Boyer (briceboyer) * * * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, * Boston, MA 02110-1301, USA. */ /*! @file fflas-ffpack/fflas-ffpack-config.h * @ingroup optimise * @brief Defaults for optimised values. * While \c fflas-ffpack-optimise.h is created by \c configure script, * (either left blank or filled by optimiser), this file produces the * defaults for the optimised values. If \c fflas-ffpack-optimise.h is not * empty, then its values take precedence over the defaults here. */ #ifndef __FFLASFFPACK_fflas_ffpack_configuration_H #define __FFLASFFPACK_fflas_ffpack_configuration_H #ifndef GCC_VERSION #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #endif #ifdef __CYGWIN__ # ifndef _GLIBCXX_USE_C99 # define _GLIBCXX_USE_C99 true # ifndef _GLIBCXX_USE_C99_MATH_TR1 # include # include # include # undef fma # include # undef strtoull # undef strtoll namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION using ::fma; using ::strtoll; using ::strtoull; /* unsigned long stoul( const std::string& str, std::size_t* pos = 0, int base = 10 ) { return std::strtoul(str.c_str(), NULL, base); } unsigned long long stoull( const std::string& str, std::size_t* pos = 0, int base = 10 ) { return std::strtoull(str.c_str(), NULL, base); } long stol( const std::string& str, std::size_t* pos = 0, int base = 10 ) { return std::strtol(str.c_str(), NULL, base); } long long stoll( const std::string& str, std::size_t* pos = 0, int base = 10 ) { return std::strtoll(str.c_str(), NULL, base); } */ } # else # define _GLIBCXX_USE_C99 true # include # endif # endif #endif #include 
"fflas-ffpack/config.h" #ifdef __FFLASFFPACK_USE_OPENMP # ifndef __GIVARO_USE_OPENMP # define __GIVARO_USE_OPENMP 1 # endif #endif #include "fflas-ffpack/fflas-ffpack-optimise.h" // winograd algorithm threshold (for double) #ifndef __FFLASFFPACK_WINOTHRESHOLD #define __FFLASFFPACK_WINOTHRESHOLD 1000 #endif #ifndef __FFLASFFPACK_WINOTHRESHOLD_FLT #define __FFLASFFPACK_WINOTHRESHOLD_FLT 2000 #endif #ifndef __FFLASFFPACK_WINOTHRESHOLD_BAL #define __FFLASFFPACK_WINOTHRESHOLD_BAL 1000 #endif #ifndef __FFLASFFPACK_WINOTHRESHOLD_BAL_FLT #define __FFLASFFPACK_WINOTHRESHOLD_BAL_FLT 2000 #endif #if defined(_OPENMP) || defined(OMP_H) || defined(__OMP_H) || defined(__pmp_omp_h) #ifndef __FFLASFFPACK_USE_OPENMP #warning "openmp was not detected correctly at configure time, please report this bug" #define __FFLASFFPACK_USE_OPENMP #endif #endif #include "givaro/givconfig.h" #ifdef __GIVARO_HAVE_INT128 #define __FFLASFFPACK_HAVE_INT128 #endif #endif // __FFLASFFPACK_fflas_ffpack_configuration_H fflas-ffpack-2.2.2/fflas-ffpack/fflas-ffpack.doxy000066400000000000000000000021751274716147400216350ustar00rootroot00000000000000// Copyright (c) 2011 FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part of the library FFLAS-FFPACK. // // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** * \defgroup fflasffpack FFLAS-FFPACK * \brief the FFLAS FFPACK library * * C++ header library for fast exact dense linear algebra * * @see fflas * @see ffpack */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/fflas-ffpack.h000066400000000000000000000026121274716147400210750ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) 2011 FFLAS-FFPACK * Written by Brice Boyer (briceboyer) * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ /*! 
@file fflas-ffpack/fflas-ffpack.h * @ingroup fflas-ffpack * @brief Includes FFLAS and FFPACK */ #ifndef __FFLASFFPACK_fflas_ffpack_H #define __FFLASFFPACK_fflas_ffpack_H #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas/fflas.h" #include "ffpack/ffpack.h" #endif // __FFLASFFPACK_fflas_ffpack_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/000077500000000000000000000000001274716147400174735ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/Makefile.am000066400000000000000000000041411274716147400215270ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas SUBDIRS=fflas_fgemm fflas_igemm fflas_simd fflas_sparse sparse=fflas_sparse.h \ fflas_sparse.inl multiprecision= fflas_ftrsm_mp.inl \ fflas_fscal_mp.inl \ fflas_freduce_mp.inl \ fflas_fger_mp.inl \ fflas_fgemv_mp.inl pkgincludesub_HEADERS= \ fflas_bounds.inl \ fflas_fassign.h \ fflas_fassign.inl \ fflas_ftrmm.inl \ fflas.h \ fflas_level1.inl \ fflas_level2.inl \ fflas_level3.inl \ fflas_fadd.h \ fflas_fadd.inl \ fflas_fdot.inl \ fflas_ftrmm_src.inl \ fflas_fgemm.inl \ fflas_pfgemm.inl \ fflas_pftrsm.inl \ fflas_ftrsm.inl \ fflas_fgemv.inl \ fflas_freivalds.inl \ fflas_fscal.h \ fflas_fscal.inl \ fflas_ftrsm_src.inl \ fflas_faxpy.inl \ fflas_fger.inl \ fflas_ftrsv.inl \ fflas_freduce.h \ fflas_freduce.inl \ fflas_helpers.inl \ fflas_simd.h \ fflas_enum.h \ ${sparse} \ ${multiprecision} EXTRA_DIST=fflas.doxy fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas.doxy000066400000000000000000000027011274716147400214730ustar00rootroot00000000000000// Copyright (c) 2011 FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part of the library FFLAS-FFPACK. // // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** \ingroup fflasffpack * \defgroup fflas FFLAS * * \brief The C-style wrapper of BLAS for finite field linear algebra. * * FFLAS, Finite Field Linear Algebra Subroutines, provide basic linear * algebra subroutines based on the BLAS interface. Therefore, the * specifications are in C style; only the field given as a template parameter * requires \p C++. * * As much as possible, these routines use \p ATLAS/BLAS computations and * achieve therefore high efficiency. * * */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas.h000066400000000000000000000116411274716147400207420ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas.h * Copyright (C) 2005,2013,2014 Clement Pernet * * Written by Clement Pernet * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas.h * @author Clément Pernet. * @brief Finite Field Linear Algebra Subroutines */ #ifndef __FFLASFFPACK_fflas_H #define __FFLASFFPACK_fflas_H #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/config.h" #include "fflas-ffpack/config-blas.h" #include #include #ifdef __FFLASFFPACK_USE_OPENMP #include #endif // namespace FFLAS { #ifndef WINOTHRESHOLD #define WINOTHRESHOLD __FFLASFFPACK_WINOTHRESHOLD #endif // } /** Thresholds determining which floating point representation to use, depending * on the cardinality of the finite field. This is only used when the element * representation is not a floating point type. * @bug to be benchmarked. */ #ifndef DOUBLE_TO_FLOAT_CROSSOVER #define DOUBLE_TO_FLOAT_CROSSOVER 800 #endif #include /// @brief FFLAS: Finite Field Linear Algebra Subroutines. #include #include "fflas_enum.h" #include "fflas-ffpack/utils/fflas_memory.h" #include "fflas-ffpack/paladin/parallel.h" //--------------------------------------------------------------------- // Level 1 routines //--------------------------------------------------------------------- #include "fflas_level1.inl" //--------------------------------------------------------------------- // Level 2 routines //--------------------------------------------------------------------- #include "fflas_level2.inl" //--------------------------------------------------------------------- // Level 3 routines //--------------------------------------------------------------------- #include "fflas_level3.inl" #ifdef FFLAS_COMPILED #include "fflas-ffpack/interfaces/libs/fflas_L1_inst.h" #include "fflas-ffpack/interfaces/libs/fflas_L2_inst.h" #include "fflas-ffpack/interfaces/libs/fflas_L3_inst.h" #endif //--------------------------------------------------------------------- // Checkers #include "fflas-ffpack/checkers/checkers_fflas.h" //--------------------------------------------------------------------- //--------------------------------------------------------------------- 
// specialisations and implementation //--------------------------------------------------------------------- #include "fflas_freduce.h" #include "fflas_fadd.h" #include "fflas_fscal.h" #include "fflas_fassign.h" #include "fflas_fgemm.inl" #include "fflas_pfgemm.inl" // fgemm must be before fgemv according to ScalAndReduce function declaration ?!? PG #include "fflas_fgemv.inl" #include "fflas_freivalds.inl" #include "fflas_fger.inl" #include "fflas_ftrsm.inl" #include "fflas_pftrsm.inl" #include "fflas_ftrmm.inl" #include "fflas_ftrsv.inl" #include "fflas_faxpy.inl" #include "fflas_fdot.inl" //--------------------------------------------------------------------- // MultiPrecision routines //--------------------------------------------------------------------- // include multiprecision fields for specialisation #include "fflas-ffpack/field/rns.h" //forward declaration of the multiprecision field #include "fflas_fscal_mp.inl" #include "fflas_freduce_mp.inl" #include "fflas-ffpack/fflas/fflas_fger_mp.inl" #include "fflas_fgemm/fgemm_classical_mp.inl" #include "fflas_ftrsm_mp.inl" #include "fflas_fgemv_mp.inl" #include "fflas-ffpack/field/rns.inl" // real implementation of the multiprecision field #include "fflas-ffpack/paladin/fflas_pfinit.h" //--------------------------------------------------------------------- // Sparse routines //--------------------------------------------------------------------- #include "fflas_sparse.h" //--------------------------------------------------------------------- // Checkers //--------------------------------------------------------------------- #include "fflas-ffpack/checkers/checkers_fflas.inl" #endif // __FFLASFFPACK_fflas_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_bounds.inl000066400000000000000000000146461274716147400226570ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_bounds.inl * 
Copyright (C) 2008 Clement Pernet * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_bounds_INL #define __FFLASFFPACK_fflas_bounds_INL #define FFLAS_INT_TYPE uint64_t #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/utils/flimits.h" #include #include #include namespace FFLAS { namespace Protected { template inline double computeFactorClassic (const Field& F) { //FFLAS_INT_TYPE p=0; Givaro::Integer p=0; F.characteristic(p); return (double) (p-1); } /************************************************************************************* * Specializations for ModularPositive and ModularBalanced over double and float *************************************************************************************/ template <> inline double computeFactorClassic (const Givaro::ModularBalanced& F) { //FFLAS_INT_TYPE p; Givaro::Integer p; F.characteristic(p); return double((p-1) >> 1); } //BB: ajout, pourquoi pas ? 
template <> inline double computeFactorClassic (const Givaro::ModularBalanced& F) { //FFLAS_INT_TYPE p; Givaro::Integer p; F.characteristic(p); return double((p-1) >> 1); } template inline size_t DotProdBoundClassic (const Field& F, const typename Field::Element& beta ) { //FFLAS_INT_TYPE p=0; Givaro::Integer p=0; F.characteristic(p); //unsigned long mantissa = Protected::Mantissa(); if (p == 0) return std::numeric_limits::max(); double kmax; { double c = computeFactorClassic(F); double cplt=0; if (!F.isZero (beta)){ if (F.isOne (beta) || F.areEqual (beta, F.mOne)) cplt = c; else{ double be; F.convert(be, beta); cplt = fabs(be)*c; } } kmax = floor ( (double (double(limits::max()) + 1 - cplt)) / (c*c)); if (kmax <= 1) return 1; } //kmax--; // we computed a strict upper bound return (size_t) std::min ((uint64_t)kmax, 1_ui64 << 31); } } // FFLAS } // Protected namespace FFLAS { inline Givaro::Integer InfNorm (const size_t M, const size_t N, const Givaro::Integer* A, const size_t lda){ Givaro::Integer max = 0; size_t log=0; for (size_t i=0; i= log) && (abs(x) > max)){ max = abs(x); // max = x; log = x.bitsize(); } } return max; } namespace Protected { /** * TRSMBound * * \brief computes the maximal size for delaying the modular reduction * in a triangular system resolution * * This is the default version over an arbitrary field. * It is currently never used (the recursive algorithm is run until n=1 in this case) * * \param F Finite Field/Ring of the computation * */ template inline size_t TRSMBound (const Field&) { return 1; } // /** // * Specialization for positive modular representation over double // * Computes nmax s.t. 
(p-1)/2*(p^{nmax-1} + (p-2)^{nmax-1}) < 2^53 // * See [Dumas Giorgi Pernet 06, arXiv:cs/0601133] // */ // template<> // inline size_t TRSMBound (const Givaro::Modular& F) // { // FFLAS_INT_TYPE pi; // F.characteristic(pi); // unsigned long p = pi; // unsigned long long p1(1), p2(1); // size_t nmax = 0; // unsigned long long max = ( (1 << (DBL_MANT_DIG + 1) ) / ((unsigned long long)(p - 1))); // while ( (p1 + p2) < max ){ // p1*=p; // p2*=p-2; // nmax++; // } // return nmax; // } /** * Specialization for positive modular representation over float. * Computes nmax s.t. (p-1)/2*(p^{nmax-1} + (p-2)^{nmax-1}) < 2^24 * @pbi * See [Dumas Giorgi Pernet 06, arXiv:cs/0601133] */ template inline size_t TRSMBound (const Givaro::Modular& F) { FFLAS_INT_TYPE pi; F.characteristic(pi); double p = pi; double p1 = 1.0, p2 = 1.0; double pm1 = (p - 1) / 2; size_t nmax = 0; unsigned long long max = limits::max(); while ( (p1 + p2)*pm1 <= max ){ p1*=p; p2*=p-2; nmax++; } return std::max((size_t)1,nmax); } /** * Specialization for balanced modular representation over double. * Computes nmax s.t. (p-1)/2*(((p+1)/2)^{nmax-1}) < 2^53 * @bib * - Dumas Giorgi Pernet 06, arXiv:cs/0601133 */ template inline size_t TRSMBound (const Givaro::ModularBalanced& F) { FFLAS_INT_TYPE pi; F.characteristic (pi); double pp1 = (pi + 1) / 2; double pm1 = (pi - 1) / 2; double p1 = 1.0; size_t nmax = 0; double max = limits::max(); while (pm1*p1 <= max){ p1 *= pp1; nmax++; } return std::max((size_t) 1,nmax); } // /** // * Specialization for balanced modular representation over float // * Computes nmax s.t. 
(p-1)/2*(((p+1)/2)^{nmax-1}) < 2^24 // * See [Dumas Giorgi Pernet 06, arXiv:cs/0601133] // */ // template<> // inline size_t TRSMBound (const Givaro::ModularBalanced& F) // { // FFLAS_INT_TYPE pi; // F.characteristic (pi); // unsigned long p = (pi + 1) / 2; // unsigned long long p1(1); // size_t nmax = 0; // unsigned long long max = (1 << (FLT_MANT_DIG + 1)) ; // while ((pi-1)*p1 < max){ // p1 *= p; // nmax++; // } // return nmax; // } } // Protected } // FFLAS #endif // __FFLASFFPACK_fflas_bounds_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_enum.h000066400000000000000000000065621274716147400217740ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_enum.h * Copyright (C) The FFLAS-FFPACK group * * Written by Clement Pernet * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_enum_INL #define __FFLASFFPACK_enum_INL namespace FFLAS { /// Storage by row or col ? enum FFLAS_ORDER { FflasRowMajor=101, /**< row major */ FflasColMajor=102 /**< col major */ }; // public: /// Is matrix transposed ? 
enum FFLAS_TRANSPOSE { FflasNoTrans = 111, /**< Matrix is not transposed */ FflasTrans = 112 /**< Matrix is transposed */ }; /// Is triangular matrix's shape upper ? enum FFLAS_UPLO { FflasUpper = 121, /**< Triangular matrix is Upper triangular (if \f$i>j\f$ then \f$T_{i,j} = 0\f$)*/ FflasLower = 122 /**< Triangular matrix is Lower triangular (if \f$i namespace FFLAS{ namespace Protected { template class AreEqual { public: static const bool value = false; }; template class AreEqual { public: static const bool value = true; }; } // Protected } // class FFLAS namespace FFLAS { template const T &min3(const T &m, const T &n, const T &k) { return std::min(m, std::min(n, k)); } template const T &max3(const T &m, const T &n, const T &k) { return std::max(m, std::min(n, k)); } template const T &min4(const T &m, const T &n, const T &k, const T &l) { return std::min(std::min(m, n), std::min(k, l)); } template const T &max4(const T &m, const T &n, const T &k, const T &l) { return std::max(std::max(m, n), std::max(k, l)); } } // FFLAS #endif // __FFLASFFPACK_enum_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fadd.h000066400000000000000000000231671274716147400217260ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fadd_H #define __FFLASFFPACK_fadd_H namespace FFLAS { template struct support_simd_add : public std::false_type {} ; // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template<> struct support_simd_add : public std::true_type {} ; template<> struct support_simd_add : public std::true_type {} ; #ifdef SIMD_INT template<> struct support_simd_add : public std::true_type {} ; template<> struct support_simd_add : public std::true_type {} ; #endif // SIMD_INT // #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS } // FFLAS #include "fflas_fadd.inl" namespace FFLAS { /***************************/ /* LEVEL 1 */ /***************************/ template void fadd (const Field & F, const size_t N, typename Field::ConstElement_ptr A, const size_t inca, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc) { details::fadd(F,N,A,inca,B,incb,C,incc , typename FieldTraits::category() ); } template void faddin (const Field& F, const size_t N, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc) { fadd(F,N,B,incb,C,incc,C,incc); return; } template void fsub(const Field & F, const size_t N, typename Field::ConstElement_ptr A, const size_t inca, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc) { details::fadd(F,N,A,inca,B,incb,C,incc , typename FieldTraits::category() ); } template void fsubin (const Field& F, const size_t N, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc) { fsub(F,N,C,incc,B,incb,C,incc); return; } // C = A + a B template void fadd 
(const Field& F, const size_t N, typename Field::ConstElement_ptr A, const size_t inca, const typename Field::Element alpha, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc) { if (C == A && inca == incc) return faxpy(F,N,alpha,B,incb,C,incc); if (F.isOne(alpha)) return fadd(F,N,A,inca,B,incb,C,incc); if (F.isMOne(alpha)){ return fsub(F,N,A,inca,B,incb,C,incc); } if (F.isZero(alpha)) return fassign(F,N,A,inca,C,incc); if (inca == 1 && incb == 1 && incc == 1) { for (size_t i = 0 ; i < N ; ++i) { //!@todo optimise here F.mul(C[i],alpha,B[i]); F.addin(C[i],A[i]); } return; } typename Field::ConstElement_ptr Ai = A, Bi = B; typename Field::Element_ptr Ci = C; for (; Ai < A+N*inca; Ai+=inca, Bi+=incb, Ci+=incc) { F.mul(*Ci,alpha,*Bi); F.addin (*Ci, *Ai); } } /***************************/ /* LEVEL 2 */ /***************************/ template void pfadd (const Field & F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc, const size_t numths){ SYNCH_GROUP( FORBLOCK1D(iter, M, SPLITTER(numths), size_t rowsize= iter.end()-iter.begin(); TASK(MODE(CONSTREFERENCE(F) READWRITE(C[iter.begin()*ldc]) READ(A[iter.begin()*lda], B[iter.begin()*ldb])), fadd(F, rowsize, N, A+iter.begin()*lda, lda, B+iter.begin()*ldb, ldb, C+iter.begin()*ldc, ldc); ); ); ); } template void pfsub (const Field & F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc, const size_t numths){ SYNCH_GROUP( FORBLOCK1D(iter, M, SPLITTER(numths), size_t rowsize= iter.end()-iter.begin(); TASK(MODE(CONSTREFERENCE(F) READWRITE(C[iter.begin()*ldc]) READ(A[iter.begin()*lda], B[iter.begin()*ldb])), fsub(F, rowsize, N, A+iter.begin()*lda, lda, B+iter.begin()*ldb, ldb, C+iter.begin()*ldc, ldc); ); ); 
); } template void pfaddin (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc, size_t numths){ SYNCH_GROUP( FORBLOCK1D(iter, M, SPLITTER(numths), size_t rowsize= iter.end()-iter.begin(); TASK(MODE(CONSTREFERENCE(F) READWRITE(C[iter.begin()*ldc]) READ(B[iter.begin()*ldb])), faddin(F, rowsize, N, B+iter.begin()*ldb, ldb, C+iter.begin()*ldc, ldc); ); ); ); } template void pfsubin (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc, size_t numths){ SYNCH_GROUP( FORBLOCK1D(iter, M, SPLITTER(numths), size_t rowsize= iter.end()-iter.begin(); TASK(MODE(CONSTREFERENCE(F) READWRITE(C[iter.begin()*ldc]) READ(B[iter.begin()*ldb])), fsubin(F, rowsize, N, B+iter.begin()*ldb, ldb, C+iter.begin()*ldc, ldc); ); ); ); } template void fadd (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc) { if (N == lda && N == ldb && N == ldc) return fadd(F,M*N,A,1,B,1,C,1); typename Field::ConstElement_ptr Ai = A, Bi = B; typename Field::Element_ptr Ci = C; for (; Ai < A+M*lda; Ai+=lda, Bi+=ldb, Ci+=ldc) fadd(F,N,Ai,1,Bi,1,Ci,1); } template void fsub (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc) { if (N == lda && N == ldb && N == ldc) return fsub(F,M*N,A,1,B,1,C,1); typename Field::ConstElement_ptr Ai = A, Bi = B; typename Field::Element_ptr Ci = C; for (; Ai < A+M*lda; Ai+=lda, Bi+=ldb, Ci+=ldc) fsub(F,N,Ai,1,Bi,1,Ci,1); } template void faddin (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc) 
{ if (N == ldb && N == ldc) return faddin(F,M*N,B,1,C,1); const typename Field::Element *Bi = B; typename Field::Element_ptr Ci = C; for (; Bi < B+M*ldb; Bi+=ldb, Ci+=ldc) faddin(F,N,Bi,1,Ci,1); } template void fsubin (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc) { if (N == ldb && N == ldc) return fsubin(F,M*N,B,1,C,1); typename Field::ConstElement_ptr Bi = B; typename Field::Element_ptr Ci = C; for (; Bi < B+M*ldb; Bi+=ldb, Ci+=ldc) fsubin(F,N,Bi,1,Ci,1); } // C = A + a B template void fadd (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, const typename Field::Element alpha, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr C, const size_t ldc) { if (C == A && lda == ldc) return faxpy(F,M,N,alpha,B,ldb,C,ldc); if (F.isOne(alpha)) return fadd(F,M,N,A,lda,B,ldb,C,ldc); if (F.isMOne(alpha)) return fsub(F,M,N,A,lda,B,ldb,C,ldc); if (F.isZero(alpha)) return fassign(F,M,N,A,lda,C,ldc); if (N == lda && N == ldb && N == ldc) return fadd(F,M*N,A,1,alpha,B,1,C,1); typename Field::ConstElement_ptr Ai = A, Bi = B; typename Field::Element_ptr Ci = C; for (; Ai < A+M*lda; Ai+=lda, Bi+=ldb, Ci+=ldc) for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fadd_INL #define __FFLASFFPACK_fadd_INL #include "fflas-ffpack/fflas/fflas_simd.h" namespace FFLAS { namespace vectorised { #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline typename std::enable_if::value, void>::type VEC_ADD(SimdT & C, SimdT & A, SimdT & B, SimdT & Q, SimdT & T, SimdT & P, SimdT & NEGP, SimdT & MIN, SimdT & MAX) { using simd = Simd; C = simd::add(A, B); Q = simd::vand(simd::greater(C, MAX),NEGP); if (!positive) { T = simd::vand(simd::lesser(C, MIN),P); Q = simd::vor(Q, T); } C = simd::add(C, Q); } template inline typename std::enable_if::value, void>::type addp(Element * T, const Element * TA, const Element * TB, size_t n, Element p, T1 min_, T2 max_) { Element min= (Element)min_, max= (Element)max_; using simd = Simd; using vect_t = typename simd::vect_t; size_t i = 0; if (n < simd::vect_size) { for (; i < n ; i++) { T[i] = TA[i] + TB[i]; T[i] -= (T[i] > max) ? p : 0; if (!positive) { T[i] += (T[i] < min) ? p : 0; } } return; } vect_t A,B,C,Q,P,NEGP,TMP,MIN,MAX; P = simd::set1(p); NEGP= simd::set1(-p); MIN = simd::set1(min); MAX = simd::set1(max); long st = long(T)%simd::alignment; if (st) { // the array T is not 32 byte aligned (process few elements s.t. (T+i) is 32 bytes aligned) for (size_t j=static_cast(st) ; j < simd::alignment ; j += sizeof(Element), i++) { T[i] = TA[i] + TB[i]; T[i] -= (T[i] > max) ? p : 0; if (!positive) T[i] += (T[i] < min) ? 
p : 0; } } FFLASFFPACK_check((long(T+i) % simd::alignment == 0)); if ( (long(TA+i)%simd::alignment==0) && (long(TB+i)%simd::alignment==0)) { // perform the loop using 256 bits SIMD for (; i <= n - simd::vect_size ; i += simd::vect_size) { // C = simd::load(T+i); A = simd::load(TA+i); B = simd::load(TB+i); VEC_ADD(C, A, B, Q, TMP, P, NEGP, MIN, MAX); simd::store(T+i, C); } } // perform the last elt from T without SIMD for (; i < n ; i++) { T[i] = TA[i] + TB[i]; T[i] -= (T[i] > max) ? p : 0; if (!positive) T[i] += (T[i] < min) ? p : 0; } } template inline typename std::enable_if::value, void>::type VEC_SUB(SimdT & C, SimdT & A, SimdT & B, SimdT & Q, SimdT & T, SimdT & P, SimdT & NEGP, SimdT & MIN, SimdT & MAX) { using simd = Simd; C = simd::sub(A, B); T = simd::vand(simd::lesser(C, MIN),P); if (!positive) { Q = simd::vand(simd::greater(C, MAX),NEGP); T = simd::vor(Q, T); } C = simd::add(C, T); } template inline typename std::enable_if::value, void>::type subp(Element * T, const Element * TA, const Element * TB, const size_t n, const Element p, const T1 min_, const T2 max_) { Element min = (Element)min_, max = (Element)max_; using simd = Simd; using vect_t = typename simd::vect_t; size_t i = 0; if (n < simd::vect_size) { for (; i < n ; i++) { T[i] = TA[i] - TB[i]; if (!positive) T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? p : 0; } return; } vect_t A,B,C,Q,P,NEGP,TMP,MIN,MAX; P = simd::set1(p); NEGP= simd::set1(-p); MIN = simd::set1(min); MAX = simd::set1(max); long st = long(T) % simd::alignment; if (st) { // the array T is not 32 byte aligned (process few elements s.t. (T+i) is 32 bytes aligned) for (size_t j = static_cast(st) ; j < simd::alignment ; j += sizeof(Element), i++) { T[i] = TA[i] - TB[i]; if (!positive) T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? 
p : 0; } } FFLASFFPACK_check((long(T+i) % simd::alignment == 0)); if ( (long(TA+i) % simd::alignment == 0) && (long(TB+i) % simd::alignment == 0)) { // perform the loop using 256 bits SIMD for (; i <= n - simd::vect_size ; i += simd::vect_size) { // C = simd::load(T+i); A = simd::load(TA+i); B = simd::load(TB+i); VEC_SUB(C, A, B, Q, TMP, P, NEGP, MIN, MAX); simd::store(T+i, C); } } // perform the last elt from T without SIMD for (; i < n ; i++) { T[i] = TA[i] - TB[i]; if (!positive) T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? p : 0; } } #else // no simd, but faster than F.init() template // inline typename std::enable_if::value, void>::type void subp(Element * T, const Element * TA, const Element * TB, const size_t n, const Element p, const T1 min_, const T2 max_) { Element min = (Element)min_, max = (Element)max_; size_t i = 0; for (; i < n ; i++) { T[i] = TA[i] - TB[i]; if (!positive) T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? p : 0; } return; } template // inline typename std::enable_if::value, void>::type void addp(Element * T, const Element * TA, const Element * TB, const size_t n, const Element p, const T1 min_, const T2 max_) { Element min= (Element)min_, max= (Element)max_; size_t i = 0; for (; i < n ; i++) { T[i] = TA[i] + TB[i]; T[i] -= (T[i] > max) ? p : 0; if (!positive) { T[i] += (T[i] < min) ? 
p : 0; } } return; } #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS } // vectorised } // FFLAS namespace FFLAS { namespace details { /**** Specialised ****/ template typename std::enable_if::value, void>::type fadd (const Field & F, const size_t N, typename Field::ConstElement_ptr A, const size_t inca, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc , FieldCategories::ModularTag ) { if (inca == 1 && incb == 1 && incc == 1) { typename Field::Element p = (typename Field::Element) F.characteristic(); if (ADD) FFLAS::vectorised::addp::balanced>(C,A,B,N,p,F.minElement(),F.maxElement()); else FFLAS::vectorised::subp::balanced>(C,A,B,N,p,F.minElement(),F.maxElement()); } else { for (size_t i=0; i typename std::enable_if::value, void>::type fadd (const Field & F, const size_t N, typename Field::ConstElement_ptr A, const size_t inca, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc , FieldCategories::ModularTag ) { if (inca == 1 && incb == 1 && incc == 1) { for (size_t i=0; i void fadd (const Field & F, const size_t N, typename Field::ConstElement_ptr A, const size_t inca, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc , FieldCategories::GenericTag ) { if (inca == 1 && incb == 1 && incc == 1) { for (size_t i=0; i void fadd (const Field & F, const size_t N, typename Field::ConstElement_ptr A, const size_t inca, typename Field::ConstElement_ptr B, const size_t incb, typename Field::Element_ptr C, const size_t incc , FieldCategories::UnparametricTag ) { for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_fassign.inl * Copyright (C) 2014 FFLAS FFPACK group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fassign_H #define __FFLASFFPACK_fassign_H //! @todo field traits here too #include "fflas_fassign.inl" #endif // __FFLASFFPACK_fassign_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fassign.inl000066400000000000000000000076721274716147400230200ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_fassign.inl * Copyright (C) 2007 Clement Pernet * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fassign_INL #define __FFLASFFPACK_fassign_INL #include #include #include #include #include "fflas-ffpack/utils/debug.h" namespace FFLAS { /***************************/ /* LEVEL 1 */ /***************************/ template inline void fassign (const Field& F, const size_t N, typename Field::ConstElement_ptr Y, const size_t incY, typename Field::Element_ptr X, const size_t incX) { typename Field::Element_ptr Xi = X; typename Field::ConstElement_ptr Yi=Y; if (incX == 1 && incY == 1) { for (; Xi < X+N; ++Xi, ++Yi) F.assign(*Xi,*Yi); } else { for (; Xi < X+N*incX; Xi+=incX, Yi+=incY ) F.assign(*Xi,*Yi); } return; } template<> inline void fassign (const Givaro::Modular& F, const size_t N, const float * Y, const size_t incY, float * X, const size_t incX) { cblas_scopy((int)N,Y,(int)incY,X,(int)incX); return; } template<> inline void fassign (const Givaro::ModularBalanced& F, const size_t N, const float * Y, const size_t incY, float * X, const size_t incX) { cblas_scopy((int)N,Y,(int)incY,X,(int)incX); return; } template<> inline void fassign (const Givaro::ZRing& F, const size_t N, const float * Y, const size_t incY, float * X, const size_t incX) { cblas_scopy((int)N,Y,(int)incY,X,(int)incX); return; } template<> inline void fassign (const Givaro::Modular& F, const size_t N, const double * Y, const size_t incY, double * X, const size_t incX) { cblas_dcopy((int)N,Y,(int)incY,X,(int)incX); return; } template<> inline void fassign (const Givaro::ModularBalanced& F, const size_t N, const double * Y, const size_t incY, double * X, const size_t incX) { cblas_dcopy((int)N,Y,(int)incY,X,(int)incX); return; } template<> inline void fassign (const Givaro::ZRing& F, const size_t N, const double * Y, const size_t incY 
, double * X, const size_t incX) { cblas_dcopy((int)N,Y,(int)incY,X,(int)incX); return; } /***************************/ /* LEVEL 2 */ /***************************/ template void fassign (const Field& F, const size_t m, const size_t n, typename Field::ConstElement_ptr B, const size_t ldb , typename Field::Element_ptr A, const size_t lda) { FFLASFFPACK_check(n<=std::min(lda,ldb)); // if possible, copy one big block if (lda == n && ldb == n) { fassign(F,m*n,B,1,A,1); return ; } // else, copy row after row for (size_t i = 0 ; i < m ; ++i) { fassign(F,n,B+i*ldb,1,A+i*lda,1); } return; } } // FFLAS #endif // __FFLASFFPACK_fassign_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_faxpy.inl000066400000000000000000000061271274716147400225070ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_faxpy.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_faxpy_INL #define __FFLASFFPACK_faxpy_INL namespace FFLAS { template inline void faxpy( const Field& F, const size_t N, const typename Field::Element a, typename Field::ConstElement_ptr X, const size_t incX, typename Field::Element_ptr Y, const size_t incY ) { if (F.isZero(a)) return ; if (F.isOne(a)) return faddin(F,N,X,incX,Y,incY); //return fassign(F,N,X,incX,Y,incY); if (F.isMOne(a)) return fsubin(F,N,X,incX,Y,incY); //return fneg(F,N,X,incX,Y,incY); typename Field::ConstElement_ptr Xi = X; typename Field::Element_ptr Yi=Y; for (; Xi < X+N*incX; Xi+=incX, Yi+=incY ) F.axpyin( *Yi, a, *Xi ); } template<> inline void faxpy( const Givaro::DoubleDomain& , const size_t N, const Givaro::DoubleDomain::Element a, Givaro::DoubleDomain::ConstElement_ptr x, const size_t incx, Givaro::DoubleDomain::Element_ptr y, const size_t incy ) { cblas_daxpy( (int)N, a, x, (int)incx, y, (int)incy); } template<> inline void faxpy( const Givaro::FloatDomain& , const size_t N, const Givaro::FloatDomain::Element a, Givaro::FloatDomain::ConstElement_ptr x, const size_t incx, Givaro::FloatDomain::Element_ptr y, const size_t incy ) { cblas_saxpy( (int)N, a, x, (int)incx, y, (int)incy); } template inline void faxpy( const Field& F, const size_t m, const size_t n, const typename Field::Element a, typename Field::ConstElement_ptr X, const size_t ldX, typename Field::Element_ptr Y, const size_t ldY ) { if (F.isZero(a)) return ; if (F.isOne(a)) return faddin(F,m,n,X,ldX,Y,ldY); //return fassign(F,m,n,X,ldX,Y,ldY); if (F.isMOne(a)) return fsubin(F,m,n,X,ldX,Y,ldY); //return fneg(F,m,n,X,ldX,Y,ldY); if (n == ldX && n == ldY) return faxpy(F,m*n,a,X,1,Y,1); typename Field::ConstElement_ptr Xi = X; typename Field::Element_ptr Yi=Y; for (; Xi < X+m*ldX; Xi+=ldX, Yi+=ldY ) faxpy(F,n,a,Xi,1,Yi,1); } } // FFLAS #endif // __FFLASFFPACK_faxpy_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fdot.inl000066400000000000000000000045271274716147400223160ustar00rootroot00000000000000/* -*- 
mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_fdot.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fdot_INL #define __FFLASFFPACK_fdot_INL // Default implementation // Specializations should be written // to increase efficiency namespace FFLAS { template inline typename Field::Element fdot( const Field& F, const size_t N, typename Field::ConstElement_ptr x, const size_t incx, typename Field::ConstElement_ptr y, const size_t incy ) { typename Field::Element d; typename Field::ConstElement_ptr xi = x; typename Field::ConstElement_ptr yi = y; F.init( d ); for ( ; xi < x+N*incx; xi+=incx, yi+=incy ) F.axpyin( d, *xi, *yi ); return d; } template<> inline Givaro::DoubleDomain::Element fdot( const Givaro::DoubleDomain& , const size_t N, Givaro::DoubleDomain::ConstElement_ptr x, const size_t incx, Givaro::DoubleDomain::ConstElement_ptr y, const size_t incy ) { return cblas_ddot( (int)N, x, (int)incx, y, (int)incy ); } template<> inline Givaro::FloatDomain::Element fdot( const Givaro::FloatDomain& , const size_t N, Givaro::FloatDomain::ConstElement_ptr x, const size_t incx, Givaro::FloatDomain::ConstElement_ptr y, const size_t incy ) { return cblas_sdot( (int)N, x, (int)incx, y, (int)incy ); } } // FFLAS #endif // __FFLASFFPACK_fdot_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm.inl000066400000000000000000000456231274716147400224570ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_fgemm.inl * Copyright (C) 2005 Clement Pernet * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Clement Pernet < Clement.Pernet@imag.fr > * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fgemm_INL #define __FFLASFFPACK_fgemm_INL #include #include #include "fflas-ffpack/utils/debug.h" namespace FFLAS { namespace Protected{ template inline typename Field::Element_ptr fgemm_convert (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A,const size_t lda, typename Field::ConstElement_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { // CP: lda, ldb, ldc can be zero (if m,n or k is 0) and since this may have not // been checked by the caller at this point. // FFLASFFPACK_check(lda); // FFLASFFPACK_check(ldb); // FFLASFFPACK_check(ldc); Givaro::ModularBalanced G((FloatElement) F.characteristic()); FloatElement tmp,alphaf, betaf; // This conversion is quite tricky, but convert and init are required // in sequence e.g. 
for when F is a ModularBalanced field and alpha == -1 F.convert (tmp, beta); G.init(betaf, tmp); F.convert (tmp, alpha); G.init(alphaf, tmp); FloatElement* Af = FFLAS::fflas_new(G, m, k); FloatElement* Bf = FFLAS::fflas_new(G, k, n); FloatElement* Cf = FFLAS::fflas_new(G, m, n); size_t ma, ka, kb, nb; //mb, na if (ta == FflasTrans) { ma = k; ka = m; } else { ma = m; ka = k; } if (tb == FflasTrans) { kb = n; nb = k; } else { kb = k; nb = n; } size_t ldaf = ka, ldbf = nb, ldcf= n; fconvert(F, ma, ka, Af, ka, A, lda); freduce(G, ma, ka, Af, ka); fconvert(F, kb, nb, Bf, nb, B, ldb); freduce(G, kb, nb, Bf, nb); if (!F.isZero(beta)){ fconvert(F, m, n, Cf, n, C, ldc); freduce (G, m, n, Cf, n); } MMHelper, MMHelperAlgo::Winograd> HG(G,H.recLevel, ParSeqHelper::Sequential()); fgemm (G, ta, tb, m, n, k, alphaf, Af, ldaf, Bf, ldbf, betaf, Cf, ldcf, HG); finit (F, m, n, Cf, n, C, ldc); fflas_delete (Af); fflas_delete (Bf); fflas_delete (Cf); return C; } }//Protected }//FFLAS namespace FFLAS{ namespace Protected{ template inline bool NeedPreAddReduction (Element& Outmin, Element& Outmax, Element& Op1min, Element& Op1max, Element& Op2min, Element& Op2max, MMHelper& WH) { Outmin = Op1min + Op2min; Outmax = Op1max + Op2max; if (WH.MaxStorableValue - Op1max < Op2max || WH.MaxStorableValue + Op1min < -Op2min){ // Reducing both Op1 and Op2 Op1min = Op2min = WH.FieldMin; Op1max = Op2max = WH.FieldMax; Outmin = 2*WH.FieldMin; Outmax = 2*WH.FieldMax; return true; } else return false; } template inline bool NeedPreAddReduction (Element& Outmin, Element& Outmax, Element& Op1min, Element& Op1max, Element& Op2min, Element& Op2max, MMHelper& WH) { Outmin = WH.FieldMin; Outmax = WH.FieldMax; return false; } template inline bool NeedPreSubReduction (Element& Outmin, Element& Outmax, Element& Op1min, Element& Op1max, Element& Op2min, Element& Op2max, MMHelper& WH) { Outmin = Op1min - Op2max; Outmax = Op1max - Op2min; if (WH.MaxStorableValue - Op1max < -Op2min || WH.MaxStorableValue - Op2max < 
-Op1min){ // Reducing both Op1 and Op2 Op1min = Op2min = WH.FieldMin; Op1max = Op2max = WH.FieldMax; Outmin = WH.FieldMin-WH.FieldMax; Outmax = -Outmin; return true; } else return false; } template inline bool NeedPreSubReduction (Element& Outmin, Element& Outmax, Element& Op1min, Element& Op1max, Element& Op2min, Element& Op2max, MMHelper& WH) { // Necessary? -> CP: Yes, for generic Mode of op Outmin = WH.FieldMin; Outmax = WH.FieldMax; return false; } //Probable bug here due to overflow of int64_t template inline bool NeedDoublePreAddReduction (Element& Outmin, Element& Outmax, Element& Op1min, Element& Op1max, Element& Op2min, Element& Op2max, Element beta, MMHelper& WH) { // Testing if P5 need to be reduced Outmin = std::min(beta*Op2min,beta*Op2max); Outmax = std::max(beta*Op2min,beta*Op2max); if (Op1max > WH.MaxStorableValue-Outmax || -Op1min > WH.MaxStorableValue+Outmin){ Outmin += WH.FieldMin; Outmax += WH.FieldMax; return true; } else{ Outmin += Op1min; Outmax += Op1max; return false; } } template inline bool NeedDoublePreAddReduction (Element& Outmin, Element& Outmax, Element& Op1min, Element& Op1max, Element& Op2min, Element& Op2max, Element beta, MMHelper& WH) { Outmin = WH.FieldMin; Outmax = WH.FieldMax; return false; } template inline void ScalAndReduce (const Field& F, const size_t N, const typename Field::Element alpha, typename Field::Element_ptr X, const size_t incX, const MMHelper& H) { if (!F.isOne(alpha) && !F.isMOne(alpha)){ typename MMHelper::DFElt al; F.convert(al, alpha); if (al < 0) al = -al; if (std::max(-H.Outmin, H.Outmax) > H.MaxStorableValue/al){ freduce (F, N, X, incX); fscalin (F, N, alpha, X, incX); } else { fscalin (H.delayedField, N, alpha, X, incX); freduce (F, N, X, incX); } } else freduce (F, N, X, incX); } template inline void ScalAndReduce (const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::Element_ptr A, const size_t lda, const MMHelper& H) { if (!F.isOne(alpha) && 
!F.isMOne(alpha)){ typename MMHelper::DFElt al; F.convert(al, alpha); if (al<0) al = -al; if (std::max(-H.Outmin, H.Outmax) > H.MaxStorableValue/al){ freduce (F, M, N, A, lda); fscalin (F, M, N, alpha, A, lda); } else { fscalin (H.delayedField, M, N, alpha, (typename MMHelper::DFElt*)A, lda); freduce (F, M, N, A, lda); } } else freduce (F, M, N, A, lda); } } // Protected } // FFLAS namespace FFLAS { template inline typename Field::Element_ptr fgemm (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper, ParSeqHelper::Sequential> & H) { if (F.cardinality() < DOUBLE_TO_FLOAT_CROSSOVER) return Protected::fgemm_convert(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); else if (16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) return Protected::fgemm_convert(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); // else if (Protected::AreEqual::value) { // // Stays over int64_t // MMHelper HG(H); // H.Outmin=HG.Outmin; // H.Outmax=HG.Outmax; // return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,HG); // } else { // Fall back case: used FFPACK::failure()(__func__,__LINE__,"Invalid ConvertTo Mode for this field"); } return C; } }// FFLAS // fgemm namespace FFLAS { template inline typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const ParSeqHelper::Sequential seq) { MMHelper::value, ParSeqHelper::Sequential > HW (F, m, k, n, seq); return fgemm 
(F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, HW); } template inline typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const ParSeqHelper::Parallel par) { MMHelper::value, ParSeqHelper::Parallel > HW (F, m, k, n, par); return fgemm (F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, HW); } template inline typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc) { if (!m || !n) {return C;} if (!k || F.isZero (alpha)){ fscalin(F, m, n, beta, C, ldc); return C; } Checker_fgemm checker(F,m,n,k,beta,C,ldc); fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,FFLAS::ParSeqHelper::Sequential()); checker.check(ta,tb,alpha,A,lda,B,ldb,C); return C; } template inline typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { MMHelper::value, ModeT, ParSeq> HW (H); return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,HW); } template inline typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t 
k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { if (!m || !n) {return C;} if (!k || F.isZero (alpha)){ fscalin(F, m, n, beta, C, ldc); return C; } #ifndef NDEBUG /* check if alpha is invertible. * XXX do it in F.isInvertible(Element&) ? * XXX do it in return status of F.inv(Element&,Element&) */ typename Field::Element e ; F.assign(e,beta); F.divin(e,alpha); F.mulin(e,alpha); FFLASFFPACK_check(F.areEqual(e,beta)); #endif #if 0 // detect fgemv if (n == 1 and ...) {} // detect fger if (k==1 and ...) {} #endif if (Protected::AreEqual >::value || Protected::AreEqual >::value){ //Givaro::Modular need to switch to float if p too small if (F.characteristic() < DOUBLE_TO_FLOAT_CROSSOVER) return Protected::fgemm_convert(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); } if (Protected::AreEqual >::value || Protected::AreEqual >::value) if (16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) return Protected::fgemm_convert(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H); typename Field::Element alpha_,beta_; if ( !F.isOne(alpha) && !F.isMOne(alpha)){ F.assign (alpha_, F.one); F.div (beta_, beta, alpha); } else { F.assign (alpha_,alpha); F.assign (beta_,beta); } MMHelper HD(H); // std::cerr<<"\n Delayed -> Lazy alpha_ = "< inline typename Field::Element_ptr fsquare (const Field& F, const FFLAS_TRANSPOSE ta, const size_t n, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc) { double alphad, betad; F.convert (alphad, alpha); if (F.isMOne (beta)) betad = -1.0; else F.convert (betad, beta); //! @bug why double ? 
// Double matrices initialisation Givaro::DoubleDomain::Element_ptr Ad = fflas_new (Givaro::DoubleDomain(),n,n); Givaro::DoubleDomain::Element_ptr Cd = fflas_new (Givaro::DoubleDomain(),n,n); // Conversion finite Field = > double fconvert (F, n, n, Ad, n, A, lda); if (!F.isZero(beta)) fconvert(F, n, n, Cd, n, C, ldc); // Call to the blas Multiplication FFLASFFPACK_check(n); cblas_dgemm (CblasRowMajor, (CBLAS_TRANSPOSE)ta, (CBLAS_TRANSPOSE)ta, (int)n, (int)n, (int)n, (Givaro::DoubleDomain::Element) alphad, Ad, (int)n, Ad, (int)n, (Givaro::DoubleDomain::Element) betad, Cd, (int)n); // Conversion double = > Finite Field fflas_delete (Ad); finit (F,n,n, Cd, n, C, ldc); fflas_delete (Cd); return C; } namespace Protected { // F is Modular(Balanced) template < class Field > inline typename Field::Element_ptr fsquareCommon (const Field& F, const FFLAS_TRANSPOSE ta, const size_t n, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc) { if (C==A) { typename Field::Element_ptr Ad = fflas_new (F, n, n); fassign(F,n,n,A,lda,Ad,n); fgemm (F, ta, ta, n, n, n, alpha, Ad, n, Ad, n, beta, C, ldc); fflas_delete (Ad); } else fgemm (F, ta, ta, n, n, n, alpha, A, lda, A, lda, beta, C, ldc); return C; } } // Protected template <> inline double* fsquare (const Givaro::ModularBalanced & F, const FFLAS_TRANSPOSE ta, const size_t n, const double alpha, const double* A, const size_t lda, const double beta, double* C, const size_t ldc) { return Protected::fsquareCommon(F,ta,n,alpha,A,lda,beta,C,ldc); } template <> inline float * fsquare (const Givaro::ModularBalanced & F, const FFLAS_TRANSPOSE ta, const size_t n, const float alpha, const float* A, const size_t lda, const float beta, float* C, const size_t ldc) { return Protected::fsquareCommon(F,ta,n,alpha,A,lda,beta,C,ldc); } template <> inline double* fsquare (const Givaro::Modular & F, const FFLAS_TRANSPOSE ta, const 
size_t n, const double alpha, const double* A, const size_t lda, const double beta, double* C, const size_t ldc) { return Protected::fsquareCommon(F,ta,n,alpha,A,lda,beta,C,ldc); } template <> inline float * fsquare (const Givaro::Modular & F, const FFLAS_TRANSPOSE ta, const size_t n, const float alpha, const float* A, const size_t lda, const float beta, float* C, const size_t ldc) { return Protected::fsquareCommon(F,ta,n,alpha,A,lda,beta,C,ldc); } } // FFLAS #endif // __FFLASFFPACK_fgemm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/000077500000000000000000000000001274716147400217415ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/Makefile.am000066400000000000000000000025141274716147400237770ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_fgemm EXTRA_DIST=matmul.doxy multiprecision=fgemm_classical_mp.inl pkgincludesub_HEADERS= \ fgemm_classical.inl \ fgemm_winograd.inl \ schedule_winograd.inl \ schedule_winograd_acc.inl \ schedule_bini.inl \ schedule_winograd_acc_ip.inl \ schedule_winograd_ip.inl \ ${multiprecision} fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/fgemm_classical.inl000066400000000000000000000263071274716147400255660ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2008, 2014 the FFLAS-FFPACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas_fgemm/fgemm_classical.inl * @brief Classical \f$2n^3\f$ matrix multiplication. 
* @warning The domain is supposed to be a field since some divisions are required for efficiency purposes * An alternative has to be written for finite rings if necessary */ #ifndef __FFLASFFPACK_fflas_fflas_fgemm_classical_INL #define __FFLASFFPACK_fflas_fflas_fgemm_classical_INL #include #include "fflas-ffpack/field/field-traits.h" #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #include "fflas-ffpack/fflas/fflas_igemm/igemm.h" #endif #include "fflas-ffpack/utils/Matio.h" namespace FFLAS { // F is a field supporting delayed reductions template inline void fgemm (const Field & F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { // Input matrices are unreduced: need to figure out the best option between: // - reducing them // - making possibly more blocks (smaller kmax) typedef MMHelper HelperType; typename HelperType::DelayedField::Element alphadf, betadf; betadf = beta; if (F.isMOne (alpha)) { alphadf = -H.delayedField.one; } else { alphadf = F.one; if (! 
F.isOne( alpha)) { // Compute y = A*x + beta/alpha.y // and after y *= alpha FFLASFFPACK_check(!F.isZero(alpha)); typename Field::Element betadalpha; F.init(betadalpha); F.div (betadalpha, beta, alpha); betadf = betadalpha; } } if (F.isMOne(betadf)) betadf = -F.one; size_t kmax = H.MaxDelayedDim (betadf); H.checkA(F,ta, m,k,A,lda); H.checkB(F,tb, k,n,B,ldb); if (kmax <= k/2 || H.Aunfit() || H.Bunfit() ){ // Might as well reduce inputs if (H.Amin < H.FieldMin || H.Amax>H.FieldMax){ H.initA(); freduce_constoverride (F, (ta==FflasNoTrans)?m:k, (ta==FflasNoTrans)?k:m, A, lda); } if (H.Bmin < H.FieldMin || H.Bmax>H.FieldMax){ H.initB(); freduce_constoverride (F, (tb==FflasNoTrans)?k:n, (tb==FflasNoTrans)?n:k, B, ldb); } if (H.Cmin < H.FieldMin || H.Cmax>H.FieldMax){ H.initC(); freduce (F, m, n, C, ldc); } kmax = H.MaxDelayedDim (betadf); } if (!kmax){ MMHelper HG(H); H.initOut(); return fgemm (F, ta, tb, m,n,k,alpha, A, lda, B, ldb, beta, C, ldc, HG); } size_t k2 = std::min(k,kmax); size_t nblock = k / kmax; size_t remblock = k % kmax; if (!remblock) { remblock = kmax; --nblock; } size_t shiftA, shiftB; if (ta == FflasTrans) shiftA = k2*lda; else shiftA = k2; if (tb == FflasTrans) shiftB = k2; else shiftB = k2*ldb; typedef MMHelper DelayedHelper_t; DelayedHelper_t Hfp(H); typedef typename HelperType::DelayedField::Element DFElt; typedef typename HelperType::DelayedField::Element_ptr DFElt_ptr; typedef typename HelperType::DelayedField::ConstElement_ptr DFCElt_ptr; fgemm (H.delayedField, ta, tb, m, n, remblock, alphadf, (DFCElt_ptr)A +nblock*shiftA, lda, (DFCElt_ptr)B +nblock*shiftB, ldb, betadf, (DFElt_ptr)C, ldc, Hfp); for (size_t i = 0; i < nblock; ++i) { freduce (F, m, n, C, ldc); Hfp.initC(); fgemm (H.delayedField, ta, tb, m, n, k2, alphadf, (DFCElt_ptr)A +i*shiftA, lda, (DFCElt_ptr)B +i*shiftB, ldb, F.one, (DFElt_ptr)C, ldc, Hfp); } if (!F.isOne(alpha) && !F.isMOne(alpha)){ DFElt al; F.convert(al, alpha); if (al<0) al = -al; // This cast is needed when Outmin base 
type is int8/16_t, // getting -Outmin returns a int, not the same base type. if (std::max(static_cast(-Hfp.Outmin), Hfp.Outmax) >Hfp.MaxStorableValue/al){ freduce (F, m, n, C, ldc); Hfp.initOut(); } fscalin(H.delayedField, m,n,alpha,(typename DelayedHelper_t::DelayedField_t::Element_ptr)C,ldc); if (alpha>0){ H.Outmin = (const DFElt)(alpha) * Hfp.Outmin; H.Outmax = (const DFElt)alpha * Hfp.Outmax; } else { H.Outmin = (const DFElt)alpha * Hfp.Outmax; H.Outmax = (const DFElt)alpha * Hfp.Outmin; } }else { H.Outmin = Hfp.Outmin; H.Outmax = Hfp.Outmax; } H.checkOut(F,m,n,C,ldc); } } // FFLAS namespace FFLAS { // Classic multiplication over a generic finite field template < class Field> inline void fgemm (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { if (F.isZero (alpha)) { fscalin(F, m, n, beta, C, ldc); return; } // Standard algorithm is performed over the Field, without conversion if (F.isZero (beta)) fzero (F, m, n, C, ldc); else { typename Field::Element betadivalpha; F.init(betadivalpha); F.div (betadivalpha, beta, alpha); fscalin(F,m,n,betadivalpha,C,ldc); } if (ta == FflasNoTrans) if (tb == FflasNoTrans) for (size_t i = 0; i < m; ++i) for (size_t l = 0; l < k; ++l) for (size_t j = 0; j < n; ++j) F.axpyin (*(C+i*ldc+j), *(A+i*lda+l), *(B+l*ldb+j)); else for (size_t i = 0; i < m; ++i) for (size_t j = 0; j < n; ++j) for (size_t l = 0; l < k; ++l) F.axpyin (*(C+i*ldc+j), *(A+i*lda+l), *(B+j*ldb+l)); else if (tb == FflasNoTrans) for (size_t i = 0; i < m; ++i) for (size_t l = 0; l < k; ++l) for (size_t j = 0; j < n; ++j) F.axpyin (*(C+i*ldc+j), *(A+l*lda+i), *(B+l*ldb+j)); else for (size_t i = 0; i < m; ++i) for (size_t j = 0; j < n; ++j) for (size_t l = 0; l < 
k; ++l) F.axpyin (*(C+i*ldc+j), *(A+l*lda+i), *(B+j*ldb+l)); fscalin(F,m,n,alpha,C,ldc); } template < class Field> inline void fgemm (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { MMHelper Hd(F,0); fgemm (F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,Hd); H.setOutBounds (k,alpha,beta); } inline void fgemm (const Givaro::DoubleDomain& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const Givaro::DoubleDomain::Element alpha, Givaro::DoubleDomain::ConstElement_ptr Ad, const size_t lda, Givaro::DoubleDomain::ConstElement_ptr Bd, const size_t ldb, const Givaro::DoubleDomain::Element beta, Givaro::DoubleDomain::Element_ptr Cd, const size_t ldc, MMHelper &H) { FFLASFFPACK_check(lda); FFLASFFPACK_check(ldb); FFLASFFPACK_check(ldc); cblas_dgemm (CblasRowMajor, (CBLAS_TRANSPOSE) ta, (CBLAS_TRANSPOSE) tb, (int)m, (int)n, (int)k, (Givaro::DoubleDomain::Element) alpha, Ad, (int)lda, Bd, (int)ldb, (Givaro::DoubleDomain::Element) beta, Cd, (int)ldc); } inline void fgemm (const Givaro::FloatDomain& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const Givaro::FloatDomain::Element alpha, Givaro::FloatDomain::ConstElement_ptr Ad, const size_t lda, Givaro::FloatDomain::ConstElement_ptr Bd, const size_t ldb, const Givaro::FloatDomain::Element beta, Givaro::FloatDomain::Element_ptr Cd, const size_t ldc, MMHelper & H) { FFLASFFPACK_check(lda); FFLASFFPACK_check(ldb); FFLASFFPACK_check(ldc); cblas_sgemm (CblasRowMajor, (CBLAS_TRANSPOSE) ta, (CBLAS_TRANSPOSE) tb, (int)m, (int)n, (int)k, (Givaro::FloatDomain::Element) alpha, Ad, (int)lda, Bd, (int)ldb, 
(Givaro::FloatDomain::Element) beta,Cd, (int)ldc); } inline void fgemm (const Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const int64_t alpha, const int64_t * Ad, const size_t lda, const int64_t * Bd, const size_t ldb, const int64_t beta, int64_t * Cd, const size_t ldc, MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> & H) { FFLASFFPACK_check(lda); FFLASFFPACK_check(ldb); FFLASFFPACK_check(ldc); #if defined (__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) igemm_ (FflasRowMajor, ta, tb, (int)m, (int)n, (int)k, alpha, Ad, (int)lda, Bd, (int)ldb, beta, Cd, (int)ldc); #else for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas_fgemm/fgemm_classical_mp.inl * @brief matrix multiplication with multiprecision input (either over Z or over Z/pZ) */ #ifndef __FFPACK_fgemm_classical_INL #define __FFPACK_fgemm_classical_INL #include #include #ifdef PROFILE_FGEMM_MP #include "fflas-ffpack/utils/timer.h" #endif #include "fflas-ffpack/field/rns-double.h" #include "fflas-ffpack/field/rns-integer.h" #include "fflas-ffpack/field/rns-integer-mod.h" #include "fflas-ffpack/field/field-traits.h" #include "fflas-ffpack/fflas/fflas_helpers.inl" #include "fflas-ffpack/fflas/fflas_bounds.inl" namespace FFLAS { template struct MMHelper, ParSeqTrait> { typedef MMHelper, ParSeqTrait> Self_t; Givaro::Integer normA,normB; int recLevel; ParSeqTrait parseq; MMHelper() : normA(0), normB(0), recLevel(-1) {} template MMHelper(MMHelper H2) : normA(H2.normA), normB(H2.normB), recLevel(H2.recLevel), parseq(H2.parseq) {} MMHelper(Givaro::Integer Amax, Givaro::Integer Bmax) : normA(Amax), normB(Bmax), recLevel(-1) {} MMHelper(const Field& F, size_t m, size_t n, size_t k, ParSeqTrait PS=ParSeqTrait()) : recLevel(-1), parseq(PS) {F.characteristic(normA);F.characteristic(normB); } MMHelper(const Field& F, int wino, ParSeqTrait PS=ParSeqTrait()) : recLevel(wino), parseq(PS) {F.characteristic(normA);F.characteristic(normB);} void setNorm(Givaro::Integer p){normA=normB=p;} friend std::ostream& operator<<(std::ostream& out, const Self_t& M) { return out <<"Helper: " <).name()<< ' ' << M.parseq < inline typename FFPACK::RNSInteger::Element_ptr fgemm (const FFPACK::RNSInteger &F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const typename FFPACK::RNSInteger::Element alpha, typename FFPACK::RNSInteger::ConstElement_ptr Ad, const size_t lda, typename FFPACK::RNSInteger::ConstElement_ptr Bd, const size_t ldb, const typename FFPACK::RNSInteger::Element beta, typename FFPACK::RNSInteger::Element_ptr Cd, const size_t ldc, MMHelper, MMHelperAlgo::Classic, 
ModeCategories::DefaultTag, ParSeqHelper::Parallel > & H) { // compute each fgemm componentwise size_t s=F.size(); size_t nt=H.parseq.numthreads(); size_t loop_nt = std::min(s,nt); size_t iter_nt = nt / loop_nt; size_t leftover_nt = nt % loop_nt; //std::cerr<<"iter_nt = "< inline Givaro::Integer* fgemm (const Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const Givaro::Integer alpha, const Givaro::Integer* A, const size_t lda, const Givaro::Integer* B, const size_t ldb, Givaro::Integer beta, Givaro::Integer* C, const size_t ldc, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq > & H) { //std::cerr<<"Entering fgemm> ParSeq"<>=1; ++lk;} size_t prime_bitsize= (53-lk)>>1; // compute bound on the output Givaro::Integer mA,mB,mC; size_t logA,logB; mA=H.normA; mB=H.normB; if (H.normA==0) H.normA = InfNorm ((ta==FflasNoTrans)?m:k,(ta==FflasNoTrans)?k:m,A,lda); logA = H.normA.bitsize(); if (H.normB==0) H.normB = InfNorm ((tb==FflasNoTrans)?k:n,(tb==FflasNoTrans)?n:k,B,ldb); logB = H.normB.bitsize(); mC = 2*uint64_t(k)*H.normA*H.normB*abs(alpha); // need to use 2x bound to reach both positive and negative // construct an RNS structure and its associated Domain FFPACK::rns_double RNS(mC, prime_bitsize); typedef FFPACK::RNSInteger RnsDomain; RnsDomain Zrns(RNS); size_t Acold,Arowd,Bcold,Browd; if (ta == FFLAS::FflasNoTrans){Arowd=m; Acold = k; } else { Arowd=k; Acold = m;} if (tb == FFLAS::FflasNoTrans){Browd=k; Bcold = n; } else { Browd=n; Bcold = k;} // allocate data for RNS representation typename RnsDomain::Element_ptr Ap,Bp,Cp; Ap = FFLAS::fflas_new(Zrns,Arowd,Acold); Bp = FFLAS::fflas_new(Zrns,Browd,Bcold); Cp = FFLAS::fflas_new(Zrns,m,n); #ifdef PROFILE_FGEMM_MP chrono.stop(); std::cout<<"-------------------------------"< H2(Zrns,H.recLevel,H.parseq); // compute alpha and beta in RNS typename RnsDomain::Element alphap, betap; Zrns.init(alphap, alpha); Zrns.init(betap, F.zero); // 
call fgemm fgemm(Zrns,ta,tb,m,n,k,alphap,Ap,Acold,Bp,Bcold,betap,Cp,n,H2); #ifdef PROFILE_FGEMM_MP chrono.stop(); std::cout<<"FGEMM_MP: RNS Mul: "< Classic (waiting for Winograd's algorithm to be generic wrt ModeTrait) template inline typename RNS::Element_ptr fgemm (const FFPACK::RNSInteger &F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const typename RNS::Element alpha, typename RNS::ConstElement_ptr Ad, const size_t lda, typename RNS::ConstElement_ptr Bd, const size_t ldb, const typename RNS::Element beta, typename RNS::Element_ptr Cd, const size_t ldc, MMHelper, MMHelperAlgo::Winograd, ModeT, ParSeqHelper::Sequential> & H) { MMHelper, MMHelperAlgo::Classic, ModeT, ParSeqHelper::Sequential> H2(F, H.recLevel,H.parseq); return fgemm(F,ta,tb,m,n,k,alpha,Ad,lda,Bd,ldb,beta,Cd,ldc,H2); } // template // inline Givaro::Integer* // fgemm (const Givaro::ZRing& F, // const FFLAS_TRANSPOSE ta, // const FFLAS_TRANSPOSE tb, // const size_t m, const size_t n,const size_t k, // const Givaro::Integer alpha, // const Givaro::Integer* A, const size_t lda, // const Givaro::Integer* B, const size_t ldb, // Givaro::Integer beta, // Givaro::Integer* C, const size_t ldc, // MMHelper, MMHelperAlgo::Winograd, ModeCategories::ConvertTo, ParSeq > & H) // { // MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeq> H2(F, H.recLevel,H.parseq); // return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H2); // } /************************************ *** MULTIPRECISION FGEMM OVER Fp *** ************************************/ // fgemm for RNSIntegerMod with Winograd Helper template inline typename RNS::Element_ptr fgemm (const FFPACK::RNSIntegerMod &F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const typename RNS::Element alpha, typename RNS::ConstElement_ptr Ad, const size_t lda, typename RNS::ConstElement_ptr Bd, const size_t ldb, const typename RNS::Element beta, 
typename RNS::Element_ptr Cd, const size_t ldc, MMHelper, MMHelperAlgo::Winograd> & H) { // compute the product over Z typedef FFPACK::RNSInteger RnsDomain; RnsDomain Zrns(F.rns()); MMHelper H2(Zrns, H.recLevel,H.parseq); #ifdef BENCH_PERF_FGEMM_MP FFLAS::Timer chrono;chrono.start(); #endif fgemm(Zrns,ta,tb,m,n,k,alpha,Ad,lda,Bd,ldb,beta,Cd,ldc,H2); // reduce the product mod p (note that entries are larger than p, due to RNS modulo reduction) freduce (F, m, n, Cd, ldc); #ifdef BENCH_PERF_FGEMM_MP chrono.stop(); F.t_igemm+=chrono.realtime(); #endif return Cd; } // fgemm for IntegerDomain with Winograd Helper inline Givaro::Integer* fgemm (const Givaro::Modular& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const Givaro::Integer alpha, const Givaro::Integer *A, const size_t lda, const Givaro::Integer *B, const size_t ldb, const Givaro::Integer beta, Givaro::Integer* C, const size_t ldc, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) { // compute the product over Z //std::cerr<<"Entering fgemm>"< IntegerDomain; Givaro::Integer p; F.cardinality(p); IntegerDomain Z; MMHelper > H2(Z,H.recLevel,H.parseq); H2.setNorm(p); fgemm(Z,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H2); // reduce the product mod p freduce (F, m, n, C, ldc); return C; } template inline Givaro::Integer* fgemm (const Givaro::Modular& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n,const size_t k, const Givaro::Integer alpha, const Givaro::Integer *A, const size_t lda, const Givaro::Integer *B, const size_t ldb, const Givaro::Integer beta, Givaro::Integer* C, const size_t ldc, MMHelper, MMHelperAlgo::Auto, ModeCategories::ConvertTo, ParSeq > & H) { // compute the product over Z //std::cerr<<"Entering fgemm> PArSeq"< IntegerDomain; Givaro::Integer p; F.cardinality(p); IntegerDomain Z; MMHelper, ParSeq > H2(Z,H.recLevel,H.parseq); H2.setNorm(p); 
fgemm(Z,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,H2); // reduce the product mod p freduce (F, m, n, C, ldc); return C; } // // PARALLEL VERSION (NOT PARALLEL YET) // template // inline Givaro::Integer* fgemm (const Givaro::ZRing& F, // const FFLAS_TRANSPOSE ta, // const FFLAS_TRANSPOSE tb, // const size_t m, const size_t n,const size_t k, // const Givaro::Integer alpha, // const Givaro::Integer* A, const size_t lda, // const Givaro::Integer* B, const size_t ldb, // Givaro::Integer beta, // Givaro::Integer* C, const size_t ldc, // MMHelper,MMHelperAlgo::Winograd,FieldCategories::UnparametricTag,ParSeqHelper::Parallel > & H){ // MMHelper,MMHelperAlgo::Winograd> H2(F,H.recLevel); // return fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,lda,beta,C,ldc,H2); // } }// END of namespace FFLAS #endif fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/fgemm_winograd.inl000066400000000000000000000433001274716147400254320ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas_fgemm/fgemm_winograd.h * @brief Strassen--Winograd matrix multiplication. * @warning The domain is supposed to be a field since some divisions are required for efficiency purposes * An alternative has to be written for finite rings if necessary */ #ifndef __FFLASFFPACK_fflas_fflas_fgemm_winograd_INL #define __FFLASFFPACK_fflas_fflas_fgemm_winograd_INL #include #include #include #include "fgemm_classical.inl" #include "schedule_winograd.inl" #include "schedule_winograd_acc.inl" #include "schedule_winograd_acc_ip.inl" #include "schedule_winograd_ip.inl" // #include "fflas_fgemm/bini.inl" #ifndef NEWWINO #define NEWWINO #endif //#define OLDWINO #include "fflas-ffpack/fflas-ffpack-config.h" // DynamicPeeling, WinogradCalc namespace FFLAS { namespace Protected { /** \brief Computes the number of recursive levels to perform. 
* * \param m the common dimension in the product AxB */
/* WinogradThreshold: returns the Winograd threshold constant selected by the type of F.
 * The generic version returns __FFLASFFPACK_WINOTHRESHOLD; the three explicit
 * specializations return the _FLT, _BAL and _BAL_FLT variants of that macro.
 * F itself is never read: it only drives overload/specialization selection.
 * NOTE(review): the template parameter lists (bare 'template', and the element type
 * after Givaro::Modular / Givaro::ModularBalanced) look truncated in this copy;
 * confirm against the upstream header before relying on these signatures. */
template inline int WinogradThreshold(const Field& F) {return __FFLASFFPACK_WINOTHRESHOLD;} template<> inline int WinogradThreshold (const Givaro::Modular& F) {return __FFLASFFPACK_WINOTHRESHOLD_FLT;} template<> inline int WinogradThreshold (const Givaro::ModularBalanced & F) {return __FFLASFFPACK_WINOTHRESHOLD_BAL;} template<> inline int WinogradThreshold (const Givaro::ModularBalanced & F) {return __FFLASFFPACK_WINOTHRESHOLD_BAL_FLT;}
/* WinogradSteps: returns the number of times m can be halved (right shift by one)
 * while the running value is still >= WinogradThreshold(F). A value of m below the
 * threshold therefore yields 0.
 * The int threshold is converted to size_t before the comparison.
 * NOTE(review): with a threshold of 0 the condition 'mt >= th' is always true for an
 * unsigned mt and the loop would not terminate; confirm the threshold macros are
 * always strictly positive. */
template inline int WinogradSteps (const Field & F, const size_t & m) { int w = 0; size_t th = WinogradThreshold(F); size_t mt = m; while ( mt >= th) { ++w; mt >>= 1; } return w; }
/* DynamicPeeling: handles the last row/column left over by a product on the even part
 * of an m x n x k problem, using fgemv/fger calls on the trailing row and columns.
 *   m, n, k       full dimensions of the product
 *   mr, nr, kr    leftover in each dimension; they are combined below as a 3-bit mask
 *   alpha, beta   scalars forwarded to the fgemv/fger calls
 *   A/lda, B/ldb  input operands (read only), C/ldc the output
 *   H             helper whose Outmin/Outmax are updated at the end of the function
 *   Cmin, Cmax    bounds on C copied into the HModd and HNodd helpers */
template < class Field, class FieldMode > inline void DynamicPeeling (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H, const typename MMHelper::DelayedField::Element Cmin, const typename MMHelper::DelayedField::Element Cmax) { typename Field::ConstElement_ptr a12, a21, b12, b21; size_t inca12, inca21, incb12, incb21, ma, na, mb, nb;
/* mkn selects the case of the switch that follows: bit 0 = n leftover, bit 1 = k leftover,
 * bit 2 = m leftover. This is only a bit mask when mr, nr, kr are each 0 or 1
 * (the visible call site passes m&0x1, n&0x1, k&0x1). */
size_t mkn = nr + (kr << 1)+ (mr << 2);
/* ma x na are the stored dimensions of A. a12/inca12 walk the last column (index k-1)
 * of op(A) and a21/inca21 its last row (index m-1), for either storage order. */
if (ta == FflasTrans) { ma = k; na = m; a12 = A+(k-1)*lda; inca12 = 1; a21 = A+m-1; inca21 = lda; } else { ma = m; na = k; a12 = A+k-1; inca12 = lda; a21 = A+(m-1)*lda; inca21 = 1; }
/* Same for B: mb x nb stored dimensions, b12 = last column (index n-1) of op(B),
 * b21 = last row (index k-1) of op(B). */
if (tb == FflasTrans) { mb = n; nb = k; b12 = B+(n-1)*ldb; incb12 = 1; b21 = B+k-1; incb21 = ldb; } else { mb = k; nb = n; b12 = B+n-1; incb12 = ldb; b21 = B+(k-1)*ldb; incb21 = 1; }
/* Three working copies of H. Hacc starts its C bounds from H's current output bounds;
 * HModd and HNodd start from the caller-supplied Cmin/Cmax. HModd additionally takes
 * its A bounds from H's B bounds and its B bounds from H's A bounds (swapped). */
MMHelper Hacc(H); MMHelper HModd(H); MMHelper HNodd(H); Hacc.Cmin = H.Outmin; Hacc.Cmax = H.Outmax; HModd.Cmin = Cmin; HModd.Cmax = Cmax; HModd.Amax = H.Bmax; HModd.Amin = H.Bmin; HModd.Bmax = H.Amax; HModd.Bmin = 
H.Amin; HNodd.Cmin = Cmin; HNodd.Cmax = Cmax; switch (mkn) { case 1: // n oddsized fgemv (F, ta, ma, na, alpha, A, lda, b12, incb12, beta, C+n-1,ldc, HNodd); break; case 2: // k oddsized fger (F, m, n, alpha, a12, inca12, b21, incb21, C, ldc, Hacc); break; case 3: // n, k oddsized fgemv (F, ta, ma, na, alpha, A, lda, b12, incb12, beta, C+n-1,ldc, HNodd); fger (F, m, n-1, alpha, a12, inca12, b21, incb21, C, ldc, Hacc); break; case 4: // m oddsized fgemv(F, (tb == FflasTrans)?FflasNoTrans:FflasTrans, mb, nb, alpha, B, ldb, a21, inca21, beta, C+(m-1)*ldc, 1, HModd); break; case 5: // m, n oddsized if (tb == FflasTrans) mb--; else nb--; fgemv (F, ta, ma, na, alpha, A, lda, b12, incb12, beta, C+n-1, ldc, HNodd); fgemv (F, (tb==FflasTrans)?FflasNoTrans:FflasTrans, mb, nb, alpha, B, ldb, a21, inca21, beta, C+(m-1)*ldc, 1, HModd); break; case 6: // m, k oddsized fger (F, m-1, n, alpha, a12, inca12, b21, incb21, C, ldc, Hacc); fgemv(F, (tb==FflasTrans)?FflasNoTrans:FflasTrans, mb, nb, alpha, B, ldb, a21, inca21, beta, C+(m-1)*ldc, 1, HModd); break; case 7: // m, k, n oddsized if (tb == FflasTrans) mb--; else nb--; H.checkA(F,ta, m,k,A,lda); H.checkB(F,tb, k,n,B,ldb); // Block NW fger (F, m-1, n-1, alpha, a12, inca12, b21, incb21, C, ldc, Hacc); // Block SW fgemv (F, (tb==FflasTrans)?FflasNoTrans:FflasTrans, mb, nb, alpha, B, ldb, a21, inca21, beta, C+(m-1)*ldc, 1, HModd); HModd.checkOut(F, m-1,n-1, C, ldc); // Block E fgemv (F, ta, ma, na, alpha, A, lda, b12, incb12, beta, C+n-1, ldc, HNodd); break; } H.Outmin = min4(HModd.Outmin,HNodd.Outmin, Hacc.Outmin, H.Outmin); H.Outmax = max4(HModd.Outmax,HNodd.Outmax, Hacc.Outmax, H.Outmax); H.checkOut(F, m,n, C, ldc); } template < class Field, class FieldMode > inline void DynamicPeeling2 (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const 
size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H, const typename MMHelper::DelayedField::Element Cmin, const typename MMHelper::DelayedField::Element Cmax) { size_t mkn =(size_t)( (bool)(nr > 0)+ ((bool)(kr > 0) << 1)+ ((bool)(mr > 0) << 2)); if (mkn == 0) return; typename Field::ConstElement_ptr a12, a21, b12, b21; if (ta == FflasTrans) { a12 = A+(k-kr)*lda; a21 = A+(m-mr); } else { a12 = A+(k-kr); a21 = A+(m-mr)*lda; } if (tb == FflasTrans) { b12 = B+(n-nr)*ldb; b21 = B+(k-kr); } else { b12 = B+(n-nr); b21 = B+(k-kr)*ldb; } MMHelper Hacc(H); MMHelper HModd(H); MMHelper HNodd(H); Hacc.Cmin = H.Outmin; Hacc.Cmax = H.Outmax; Hacc.recLevel=-1;HModd.recLevel=-1;HNodd.recLevel=-1; HModd.Cmin = Cmin; HModd.Cmax = Cmax; HModd.Amax = H.Bmax; HModd.Amin = H.Bmin; HModd.Bmax = H.Amax; HModd.Bmin = H.Amin; HNodd.Cmin = Cmin; HNodd.Cmax = Cmax; switch (mkn) { case 1: // n oddsized fgemm (F, ta, tb, m, nr, k, alpha, A, lda, b12, ldb, beta, C+(n-nr), ldc, HNodd); break; case 2: // k oddsized fgemm (F, ta, tb, m, n, kr, alpha, a12, lda, b21, ldb, F.one, C, ldc, Hacc); break; case 3: // n, k oddsized fgemm (F, ta, tb, m, nr, k, alpha, A, lda, b12, ldb, beta, C+(n-nr), ldc, HNodd); fgemm (F, ta, tb, m, n-nr, kr, alpha, a12, lda, b21, ldb, F.one, C, ldc, Hacc); break; case 4: // m oddsized fgemm (F, ta, tb, mr, n, k, alpha, a21, lda, B, ldb, beta, C+(m-mr)*ldc, ldc, HModd); break; case 5: // m, n oddsized fgemm (F, ta, tb, m, nr, k, alpha, A, lda, b12, ldb, beta, C+(n-nr), ldc, HNodd); fgemm (F, ta, tb, mr, n-nr, k, alpha, a21, lda, B, ldb, beta, C+(m-mr)*ldc, ldc, HModd); break; case 6: // m, k oddsized fgemm (F, ta, tb, m-mr, n, kr, alpha, a12, lda, b21, ldb, F.one, C, ldc, Hacc); fgemm (F, ta, tb, mr, n, k, alpha, a21, lda, B, ldb, beta, C+(m-mr)*ldc, ldc, HModd); break; case 7: // m, k, n oddsized // Block NW fgemm (F, ta, tb, m-mr, n-nr, kr, alpha, a12, lda, b21, 
ldb, F.one, C, ldc, Hacc); // Block SW fgemm (F, ta, tb, mr, n-nr, k, alpha, a21, lda, B, ldb, beta, C+(m-mr)*ldc, ldc, HModd); // Block NE fgemm (F, ta, tb, m, nr, k, alpha, A, lda, b12, ldb, beta, C+(n-nr), ldc, HNodd); break; } H.Outmin = min4(HModd.Outmin,HNodd.Outmin, Hacc.Outmin, H.Outmin); H.Outmax = max4(HModd.Outmax,HNodd.Outmax, Hacc.Outmax, H.Outmax); H.checkOut(F, m,n, C, ldc); } // #define NEWIP // #define NEWACCIP // Switch between the scheduling for Strassen-Winograd Multiplication template < class Field, class FieldMode > inline void WinogradCalc (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::ConstElement_ptr A,const size_t lda, typename Field::ConstElement_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { #if defined(NEWIP) or defined(NEWACCIP) /* XXX TESTS ONLY */ typedef typename Field::Element Element ; Element_ptr Ac; Element_ptr Bc; if (ta == FflasNoTrans) { Ac = fflas_new (F, mr*2, lda); fassign(F,mr*2,kr*2,A,lda,Ac,lda); } else { Ac = fflas_new (F, kr*2, lda); fassign(F,kr*2,mr*2,A,lda,Ac,lda); } if (tb == FflasNoTrans) { Bc = fflas_new (F, kr*2, ldb); fassign(F,kr*2,nr*2,B,ldb,Bc,ldb); } else { Bc = fflas_new (F, nr*2, ldb); fassign(F,nr*2,kr*2,B,ldb,Bc,ldb); } #endif if (F.isZero(beta)) { #ifdef NEWIP /* NOT IP --- TESTS ONLY */ // (kr == nr && kr <= mr /* if not transposed */) // we copy because they erase stuff // bool normal = (ta == FflasNoTrans && tb == FflasNoTrans) ; bool normal = true; if (kr == nr && kr == mr && normal) { // BLAS3::Winograd_L_S(F,ta,tb,mr,nr,kr,alpha,Ac,lda,Bc,ldb,beta,C,ldc,H); // BLAS3::Winograd_R_S(F,ta,tb,mr,nr,kr,alpha,Ac,lda,Bc,ldb,beta,C,ldc,H); BLAS3::Winograd_LR_S(F,ta,tb,mr,nr,kr,alpha,Ac,lda,Bc,ldb,beta,C,ldc,H); } else #endif { BLAS3::Winograd(F,ta,tb,mr,nr,kr,alpha,A,lda,B,ldb,beta,C,ldc,H); } } else { 
#ifdef NEWACCIP /* test only */ if (kr == nr && kr == mr ) { BLAS3::WinogradAcc_L_S(F,ta,tb,mr,nr,kr,alpha,Ac,lda,Bc,ldb,beta,C,ldc,H); // BLAS3::WinogradAcc_R_S(F,ta,tb,mr,nr,kr,alpha,Ac,lda,Bc,ldb,beta,C,ldc,H); } else { BLAS3::WinogradAcc_LR(F,ta,tb,mr,nr,kr,alpha,Ac,lda,Bc,ldb,beta,C,ldc,H); } #elif defined(NEWWINO) BLAS3::WinogradAcc_3_21(F,ta,tb,mr,nr,kr,alpha,A,lda,B,ldb,beta,C,ldc,H); #elif defined(OLDWINO) BLAS3::WinogradAcc_3_23(F,ta,tb,mr,nr,kr,alpha,A,lda,B,ldb,beta,C,ldc,H); #elif defined(NEWACC) // BLAS3::WinogradAcc_2_24(F,ta,tb,mr,nr,kr,alpha,A,lda,B,ldb,beta,C,ldc,H); BLAS3::WinogradAcc_2_27(F,ta,tb,mr,nr,kr,alpha,A,lda,B,ldb,beta,C,ldc,H); #else #error "you need to make a choice for a BLAS3 mat mul schedule" #endif } #if defined(NEWIP) or defined(NEWACCIP) /* NOT IP --- TESTS ONLY */ fflas_delete (Ac); fflas_delete (Bc); #endif } // WinogradCalc //#define OLD_DYNAMIC_PEELING }// namespace Protected } // FFLAS namespace FFLAS{ template inline typename Field::Element_ptr fgemm (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & H) { if (!m || !n ) return C; if (!k){ //TODO: update helper fscalin(F,m,n,beta,C,ldc); return C; } if (H.recLevel < 0) { H.recLevel = Protected::WinogradSteps (F, min3(m,k,n)); } if (H.recLevel == 0){ MMHelper HC(H); fgemm (F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, HC); H.Outmax = HC.Outmax; H.Outmin = HC.Outmin; return C; } // Then w >0 typedef typename MMHelper::DelayedField::Element DFElt; DFElt Cmin = H.Cmin; DFElt Cmax = H.Cmax; #ifdef OLD_DYNAMIC_PEELING Protected::WinogradCalc (F, ta, tb, m/2, n/2, k/2, alpha, A, lda, B, ldb, beta, C, ldc, H); FFLASFFPACK_check(m-(m/2)*2 == (m&0x1)); 
FFLASFFPACK_check(n-(n/2)*2 == (n&0x1)); FFLASFFPACK_check(k-(k/2)*2 == (k&0x1)); Protected::DynamicPeeling (F, ta, tb, m, n, k, m&0x1, n&0x1, k&0x1, alpha, A, lda, B, ldb, beta, C, ldc, H, Cmin, Cmax); #else size_t ww = (size_t)H.recLevel ; size_t m2 = (m >> ww) << (ww-1) ; size_t n2 = (n >> ww) << (ww-1) ; size_t k2 = (k >> ww) << (ww-1) ; Protected::WinogradCalc (F, ta, tb, m2, n2, k2, alpha, A, lda, B, ldb, beta, C, ldc, H); size_t mr = m -2*m2; size_t nr = n -2*n2; size_t kr = k -2*k2; FFLASFFPACK_check(m == m2*2+mr); FFLASFFPACK_check(n == n2*2+nr); FFLASFFPACK_check(k == k2*2+kr); Protected::DynamicPeeling2 (F, ta, tb, m, n, k, mr, nr, kr, alpha, A, lda, B, ldb, beta, C, ldc, H, Cmin, Cmax); #endif return C; } // fgemm template inline typename Field::Element_ptr fgemm (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper > & H) { if (!m || !n ) return C; if (!k){ //TODO: update helper fscalin(F,m,n,beta,C,ldc); return C; } if (H.recLevel < 0) { H.recLevel = Protected::WinogradSteps (F, min3(m,k,n)); } if (H.recLevel == 0){ #ifdef WINO_SEQ MMHelper HC (F, -1,ParSeqHelper::Sequential()); #elif defined CLASSIC_SEQ MMHelper HC (F, 0,ParSeqHelper::Sequential()); #elif defined CLASSIC_Hybrid typedef StrategyParameter::TwoDAdaptive twoda; typedef CuttingStrategy::Recursive rec; MMHelper::value, FFLAS::ParSeqHelper::Parallel > HC (F, -1, SPLITTER(H.parseq.numthreads(), rec, twoda)); #elif defined PFGEMM_WINO_SEQ MMHelper::value, FFLAS::ParSeqHelper::Parallel> HC (F, -1, ParSeqHelper::Parallel(PFGEMM_WINO_SEQ, RECURSIVE, TWO_D_ADAPT)); #else MMHelper::value, FFLAS::ParSeqHelper::Parallel > HC (F, 0, ParSeqHelper::Parallel(NUM_THREADS)); #endif // MMHelper HC(H); fgemm 
(F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, HC); H.Outmax = HC.Outmax; H.Outmin = HC.Outmin; return C; } /// // Then w >0 typedef typename MMHelper::DelayedField::Element DFElt; DFElt Cmin = H.Cmin; DFElt Cmax = H.Cmax; #ifdef OLD_DYNAMIC_PEELING BLAS3::WinoPar (F, ta, tb, m/2, n/2, k/2, alpha, A, lda, B, ldb, beta, C, ldc, H); FFLASFFPACK_check(m-(m/2)*2 == (m&0x1)); FFLASFFPACK_check(n-(n/2)*2 == (n&0x1)); FFLASFFPACK_check(k-(k/2)*2 == (k&0x1)); MMHelper HC(H); Protected::DynamicPeeling (F, ta, tb, m, n, k, m&0x1, n&0x1, k&0x1, alpha, A, lda, B, ldb, beta, C, ldc, HC, Cmin, Cmax); #else size_t ww = (size_t)H.recLevel ; size_t m2 = (m >> ww) << (ww-1) ; size_t n2 = (n >> ww) << (ww-1) ; size_t k2 = (k >> ww) << (ww-1) ; BLAS3::WinoPar (F, ta, tb, m2, n2, k2, alpha, A, lda, B, ldb, beta, C, ldc, H); size_t mr = m -2*m2; size_t nr = n -2*n2; size_t kr = k -2*k2; FFLASFFPACK_check(m == m2*2+mr); FFLASFFPACK_check(n == n2*2+nr); FFLASFFPACK_check(k == k2*2+kr); MMHelper HC(H); Protected::DynamicPeeling2 (F, ta, tb, m, n, k, mr, nr, kr, alpha, A, lda, B, ldb, beta, C, ldc, HC, Cmin, Cmax); #endif return C; } // fgemm } // FFLAS #endif // __FFLASFFPACK_fflas_fflas_fgemm_winograd_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/matmul.doxy000066400000000000000000000021621274716147400241460ustar00rootroot00000000000000// Copyright (c) 2014 FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part of the library FFLAS-FFPACK. // // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** \ingroup fflas-ffpack * \defgroup MMalgos Matrix Multiplication Algorithms * * \brief Matrix Multiplication (level 3) algorithms * * @todo biblio * */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/schedule_bini.inl000066400000000000000000000055641274716147400252540ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the LinBox group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_fgemm/schedule_bini.inl * @ingroup MMalgos * @brief Bini implementation */ #ifndef __FFLASFFPACK_fgemm_bini_INL #define __FFLASFFPACK_fgemm_bini_INL namespace FFLAS { namespace BLAS3 { template < class Field > inline void Bini (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, const typename Field::Element_ptr A,const size_t lda, const typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const size_t kmax, const size_t w, const FFLAS_BASE base, const size_t rec_level) { FFLASFFPACK_check(F.isZero(beta)); FFLASFFPACK_check(rec_level>0); size_t imaxb, jmaxb, imaxa, jmaxa, ldx2; // size_t x3rd = std::max(mr,kr); const typename Field::Element_ptr d11,d12,d21,d22; typename Field::Element_ptr d11c,d12c,d21c,d22c,dx1,dx2; const typename Field::Element_ptr A11=A, A12, A21, A22; const typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C+nr+mr*ldc; size_t x1rd = std::max(nr,kr); size_t ldx1; if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; imaxa = kr; jmaxa = mr; ldx1 = mr; } else { A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; imaxa = mr; jmaxa = kr; ldx1 = x1rd; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; imaxb = nr; jmaxb = kr; ldx2 = kr; } else { B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; imaxb = kr; ldx2 = jmaxb = nr; } } // Bini } // BLAS3 } // FFLAS #endif // __FFLASFFPACK_fgemm_bini_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/schedule_winograd.inl000066400000000000000000000476461274716147400261540ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the LinBox group * * Written by Clement 
Pernet * Brice Boyer (briceboyer) * Ziad Sultan * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_fgemm/winograd.inl * @ingroup MMalgos * @brief Winograd implementation * @bib ISSAC09 Scheduling */ #ifndef __FFLASFFPACK_fgemm_winograd_INL #define __FFLASFFPACK_fgemm_winograd_INL namespace FFLAS { namespace BLAS3 { template < class Field, class FieldTrait, class Strat, class Param > inline typename Field::Element_ptr WinoPar (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::ConstElement_ptr A,const size_t lda, typename Field::ConstElement_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, // const size_t kmax, const size_t w, const FFLAS_BASE base MMHelper > & WH ) { FFLASFFPACK_check(F.isZero(beta)); // typedef MMHelper MMH_t; typedef MMHelper > MMH_t; const typename MMH_t::DelayedField & DF = WH.delayedField; typedef typename MMH_t::DelayedField::Element DFElt; size_t lb, cb, la, ca, ldX2; // size_t x3rd = std::max(mr,kr); typename Field::ConstElement_ptr A11=A, A12, A21, A22; typename 
Field::ConstElement_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; size_t x1rd = std::max(nr,kr); size_t ldX1; if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr; ca = mr; ldX1 = mr; } else { A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr; ca = kr; ldX1 = x1rd; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr; cb = kr; ldX2 = kr; } else { B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr; ldX2 = cb = nr; } // 11 temporary submatrices are required typename Field::Element_ptr X21 = fflas_new (F, kr, nr); typename Field::Element_ptr X11 = fflas_new (F,mr,x1rd); typename Field::Element_ptr X22 = fflas_new (F, kr, nr); typename Field::Element_ptr X12 = fflas_new (F,mr,x1rd); typename Field::Element_ptr X23 = fflas_new (F, kr, nr); typename Field::Element_ptr X13 = fflas_new (F,mr,x1rd); typename Field::Element_ptr X24 = fflas_new (F, kr, nr); typename Field::Element_ptr X14 = fflas_new (F,mr,x1rd); typename Field::Element_ptr X15 = fflas_new (F,mr,x1rd); typename Field::Element_ptr C_11 = fflas_new (F,mr,nr); typename Field::Element_ptr CC_11 = fflas_new (F,mr,nr); SYNCH_GROUP( // T3 = B22 - B12 in X21 and S3 = A11 - A21 in X11 TASK(MODE(READ(B22, B12) WRITE(X21) CONSTREFERENCE(DF)), pfsub(DF,lb,cb,B22,ldb,B12,ldb,X21,ldX2, NUM_THREADS);); TASK(MODE(READ(A11, A21) WRITE(X11) CONSTREFERENCE(DF)), pfsub(DF,la,ca,A11,lda,A21,lda,X11,ldX1, NUM_THREADS);); // T1 = B12 - B11 in X22 and S1 = A21 + A22 in X12 TASK(MODE(READ(B11, B12) WRITE(X22) CONSTREFERENCE(DF)), pfsub(DF,lb,cb,B12,ldb,B11,ldb,X22,ldX2, NUM_THREADS);); TASK(MODE(READ(A12, A22) WRITE(X12) CONSTREFERENCE(DF)), pfadd(DF,la,ca,A21,lda,A22,lda,X12,ldX1, NUM_THREADS);); CHECK_DEPENDENCIES; // T2 = B22 - T1 in X23 and S2 = S1 - A11 in X13 TASK(MODE(READ(B22, X22) READWRITE(X23) CONSTREFERENCE(DF)), pfsub(DF,lb,cb,B22,ldb,X22,ldX2,X23,ldX2, NUM_THREADS);); TASK(MODE(READ(A11, X12) 
READWRITE(X13) CONSTREFERENCE(DF)), // fsub(DF,la,ca,A11,lda,X12,ldX1,X13,ldX1);); pfsub(DF,la,ca,X12,ldX1,A11,lda,X13,ldX1, NUM_THREADS);); /* fsub(DF,lb,cb,B22,ldb,X2,ldX2,X2,ldX2); fsubin(DF,la,ca,A11,lda,X1,ldX1);); */ CHECK_DEPENDENCIES; // T4 = T2 - B21 in X2 and S4 = A12 -S2 in X1 TASK(MODE(READ(B21, X23) READWRITE(X24) CONSTREFERENCE(DF)), // fsub(DF,lb,cb,B21,ldb,X23,ldX2,X24,ldX2); pfsub(DF,lb,cb,X23,ldX2,B21,ldb,X24,ldX2, NUM_THREADS);); TASK(MODE(READ(A12, X13) READWRITE(X14) CONSTREFERENCE(DF)), pfsub(DF,la,ca,A12,lda,X13,ldX1,X14,ldX1, NUM_THREADS);); /* fsubin(DF,lb,cb,B21,ldb,X2,ldX2); fsub(DF,la,ca,A12,lda,X1,ldX1,X1,ldX1);); */ CHECK_DEPENDENCIES; // P1 = alpha . A11 * B11 in X1 MMH_t H1(F, WH.recLevel-1, WH.Amin, WH.Amax, WH.Bmin, WH.Bmax, 0, 0); MMH_t H7(F, WH.recLevel-1, -(WH.Amax-WH.Amin), WH.Amax-WH.Amin, -(WH.Bmax-WH.Bmin), WH.Bmax-WH.Bmin, 0,0); MMH_t H5(F, WH.recLevel-1, 2*WH.Amin, 2*WH.Amax, -(WH.Bmax-WH.Bmin), WH.Bmax-WH.Bmin, 0, 0); MMH_t H6(F, WH.recLevel-1, 2*WH.Amin-WH.Amax, 2*WH.Amax-WH.Amin, 2*WH.Bmin-WH.Bmax, 2*WH.Bmax-WH.Bmin, 0, 0); MMH_t H3(F, WH.recLevel-1, 2*WH.Amin-2*WH.Amax, 2*WH.Amax-2*WH.Amin, WH.Bmin, WH.Bmax, 0, 0); MMH_t H4(F, WH.recLevel-1, WH.Amin, WH.Amax, 2*WH.Bmin-2*WH.Bmax, 2*WH.Bmax-2*WH.Bmin, 0, 0); MMH_t H2(F, WH.recLevel-1, WH.Amin, WH.Amax, WH.Bmin, WH.Bmax, 0, 0); size_t nt = WH.parseq.numthreads(); size_t nt_rec = nt/7; size_t nt_mod = nt % 7 ; H1.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H2.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H3.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H4.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H5.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H6.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H7.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); TASK(MODE(READ(A11, B11) 
WRITE(X15) CONSTREFERENCE(F,H1)), fgemm (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X15, x1rd, H1);); // P7 = alpha . S3 * T3 in C21 TASK(MODE(READ(X11, X21) WRITE(C21) CONSTREFERENCE(F,H7)), fgemm (F, ta, tb, mr, nr, kr, alpha, X11, ldX1, X21, ldX2, F.zero, C21, ldc, H7);); // P5 = alpha . S1*T1 in C22 TASK(MODE(READ(X12, X22) WRITE(C22) CONSTREFERENCE(F,H5)), fgemm (F, ta, tb, mr, nr, kr, alpha, X12, ldX1, X22, ldX2, F.zero, C22, ldc, H5);); // P6 = alpha . S2 * T2 in C12 TASK(MODE(READ(X13, X23) WRITE(C12) CONSTREFERENCE(F,H6)), fgemm (F, ta, tb, mr, nr, kr, alpha, X13, ldX1, X23, ldX2, F.zero, C12, ldc, H6);); // P3 = alpha . S4*B22 in CC_11 TASK(MODE(READ(X14, B22) WRITE(CC_11) CONSTREFERENCE(F,H3)), fgemm (F, ta, tb, mr, nr, kr, alpha, X14, ldX1, B22, ldb, F.zero, CC_11, nr, H3);); // P4 = alpha . A22 * T4 in C_11 TASK(MODE(READ(A22) WRITE(C_11) READWRITE(X24, X22, X23, X21) CONSTREFERENCE(F,H4)), fgemm (F, ta, tb, mr, nr, kr, alpha, A22, lda, X24, ldX2, F.zero, C_11, nr, H4); ); // P2 = alpha . 
A12 * B21 in C11 TASK(MODE(READ(A12, B21) WRITE(C11) CONSTREFERENCE(F,H2)), fgemm (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, F.zero, C11, ldc, H2);); CHECK_DEPENDENCIES; DFElt U2Min, U2Max; DFElt U3Min, U3Max; DFElt U4Min, U4Max; DFElt U7Min, U7Max; DFElt U5Min, U5Max; // U2 = P1 + P6 in C12 and // U3 = P7 + U2 in C21 and // U4 = P5 + U2 in C12 and // U7 = P5 + U3 in C22 and // U5 = P3 + U4 in C12 // BIG TASK with 5 Addin function calls // TASK(MODE(READWRITE(X15, C12) CONSTREFERENCE(F, DF, WH, U2Min, U2Max, H1.Outmin, H1.Outmax, H6.Outmin, H6.Outmax)), if (Protected::NeedPreAddReduction(U2Min, U2Max, H1.Outmin, H1.Outmax, H6.Outmin, H6.Outmax, WH)){ TASK(MODE(READWRITE(X15) CONSTREFERENCE(F)), pfreduce (F, mr, x1rd, X15, x1rd, NUM_THREADS); ); TASK(MODE(READWRITE(C12) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C12, ldc, NUM_THREADS); ); CHECK_DEPENDENCIES; } TASK(MODE(READWRITE(X15, C12) CONSTREFERENCE(DF)), pfaddin(DF,mr,nr,X15,x1rd,C12,ldc, NUM_THREADS); ); CHECK_DEPENDENCIES; // TASK(MODE(READWRITE(C12, C21) CONSTREFERENCE(F, DF, WH, U3Min, U3Max, U2Min, U2Max)), if (Protected::NeedPreAddReduction(U3Min, U3Max, U2Min, U2Max, H7.Outmin, H7.Outmax, WH)){ TASK(MODE(READWRITE(C12) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C12, ldc, NUM_THREADS); ); TASK(MODE(READWRITE(C21) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C21, ldc, NUM_THREADS); ); CHECK_DEPENDENCIES; } TASK(MODE(READWRITE(C12, C21) CONSTREFERENCE(DF)), pfaddin(DF,mr,nr,C12,ldc,C21,ldc, NUM_THREADS); ); CHECK_DEPENDENCIES; // TASK(MODE(READWRITE(C12, C22) CONSTREFERENCE(F, DF, WH) VALUE(U4Min, U4Max, U2Min, U2Max)), if (Protected::NeedPreAddReduction(U4Min, U4Max, U2Min, U2Max, H5.Outmin, H5.Outmax, WH)){ TASK(MODE(READWRITE(C22) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C22, ldc, NUM_THREADS); ); TASK(MODE(READWRITE(C12) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C12, ldc, NUM_THREADS); ); CHECK_DEPENDENCIES; } TASK(MODE(READWRITE(C12, C22) CONSTREFERENCE(DF, WH)), pfaddin(DF,mr,nr,C22,ldc,C12,ldc, 
NUM_THREADS); ); CHECK_DEPENDENCIES; // TASK(MODE(READWRITE(C22, C21) CONSTREFERENCE(F, DF, WH) VALUE(U3Min, U3Max, U7Min, U7Max)), if (Protected::NeedPreAddReduction (U7Min,U7Max, U3Min, U3Max, H5.Outmin,H5.Outmax, WH) ){ TASK(MODE(READWRITE(C21) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C21, ldc, NUM_THREADS); ); TASK(MODE(READWRITE(C22) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C22, ldc, NUM_THREADS); ); CHECK_DEPENDENCIES; } TASK(MODE(READWRITE(C22, C21) CONSTREFERENCE(DF, WH)), pfaddin(DF,mr,nr,C21,ldc,C22,ldc, NUM_THREADS); ); // TASK(MODE(READWRITE(C12, CC_11) CONSTREFERENCE(F, DF, WH) VALUE(U5Min, U5Max, U4Min, U4Max)), if (Protected::NeedPreAddReduction (U5Min,U5Max, U4Min, U4Max, H3.Outmin, H3.Outmax, WH) ){ TASK(MODE(READWRITE(C12) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C12, ldc, NUM_THREADS); ); TASK(MODE(READWRITE(CC_11) CONSTREFERENCE(F)), pfreduce (F, mr, nr, CC_11, nr, NUM_THREADS); ); CHECK_DEPENDENCIES; } TASK(MODE(READWRITE(C12, CC_11) CONSTREFERENCE(DF, WH)), pfaddin(DF,mr,nr,CC_11,nr,C12,ldc, NUM_THREADS); ); CHECK_DEPENDENCIES; // U6 = U3 - P4 in C21 DFElt U6Min, U6Max; // TASK(MODE(READWRITE(C_11, C21) CONSTREFERENCE(F, DF, WH) VALUE(U6Min, U6Max, U3Min, U3Max)), if (Protected::NeedPreSubReduction (U6Min,U6Max, U3Min, U3Max, H4.Outmin,H4.Outmax, WH) ){ TASK(MODE(READWRITE(CC_11) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C_11, nr, NUM_THREADS); ); TASK(MODE(READWRITE(C21) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C21, ldc, NUM_THREADS); ); CHECK_DEPENDENCIES } TASK(MODE(READWRITE(C_11, C21) CONSTREFERENCE(DF, WH) ), pfsubin(DF,mr,nr,C_11,nr,C21,ldc, NUM_THREADS); ); //CHECK_DEPENDENCIES; // U1 = P2 + P1 in C11 DFElt U1Min, U1Max; // TASK(MODE(READWRITE(C11, X15/*, X14, X13, X12, X11*/) CONSTREFERENCE(F, DF, WH) VALUE(U1Min, U1Max)), if (Protected::NeedPreAddReduction (U1Min, U1Max, H1.Outmin, H1.Outmax, H2.Outmin,H2.Outmax, WH) ){ TASK(MODE(READWRITE(X15) CONSTREFERENCE(F)), pfreduce (F, mr, nr, X15, x1rd, NUM_THREADS); ); 
TASK(MODE(READWRITE(C11) CONSTREFERENCE(F)), pfreduce (F, mr, nr, C11, ldc, NUM_THREADS); ); CHECK_DEPENDENCIES } TASK(MODE(READWRITE(C11, X15) CONSTREFERENCE(DF, WH)), pfaddin(DF,mr,nr,X15,x1rd,C11,ldc, NUM_THREADS); ); WH.Outmin = std::min (U1Min, std::min (U5Min, std::min (U6Min, U7Min))); WH.Outmax = std::max (U1Max, std::max (U5Max, std::max (U6Max, U7Max))); ); // WAIT; fflas_delete (CC_11); fflas_delete (C_11); fflas_delete (X15); fflas_delete (X14); fflas_delete (X24); fflas_delete (X13); fflas_delete (X23); fflas_delete (X12); fflas_delete (X22); fflas_delete (X11); fflas_delete (X21); return C; } //wino parallel
		/** One recursion level of a Winograd schedule for the case beta == 0
		 * (asserted below), using two temporaries X1 and X2.
		 *
		 * The seven products P1..P7 named in the step comments are computed by
		 * recursive fgemm calls, each with its own helper H1..H7 built with
		 * WH.recLevel-1. The U-steps then combine them in the quadrants of C;
		 * a freduce pass is done only when NeedPreAddReduction /
		 * NeedPreSubReduction returns true.
		 *
		 * mr, nr, kr are the quadrant dimensions: C12 = C+nr, C21 = C+mr*ldc,
		 * and A12/B21 are kr away from A/B (along rows or columns depending on
		 * ta/tb). On exit WH.Outmin/WH.Outmax hold the min/max of the bounds
		 * computed for U1, U5, U6 and U7.
		 *
		 * The statement order matters: X1, X2, C11 and C12 are reused as
		 * scratch between steps.
		 *
		 * NOTE(review): MMHelper appears without template arguments in this
		 * text; presumably they were lost in extraction -- confirm against
		 * upstream.
		 */
		template < class Field, class FieldTrait >
		inline void Winograd (const Field& F,
				      const FFLAS_TRANSPOSE ta,
				      const FFLAS_TRANSPOSE tb,
				      const size_t mr, const size_t nr, const size_t kr,
				      const typename Field::Element alpha,
				      typename Field::ConstElement_ptr A,const size_t lda,
				      typename Field::ConstElement_ptr B,const size_t ldb,
				      const typename Field::Element beta,
				      typename Field::Element_ptr C, const size_t ldc,
				      // const size_t kmax, const size_t w, const FFLAS_BASE base
				      MMHelper & WH
				     )
		{
			// This schedule overwrites C, so it is only valid for beta == 0.
			FFLASFFPACK_check(F.isZero(beta));

			typedef MMHelper MMH_t;
			typedef typename MMH_t::DelayedField::Element_ptr DFEptr;
			typedef typename MMH_t::DelayedField::ConstElement_ptr DFCEptr;
			typedef typename MMH_t::DelayedField::Element DFElt;

			// Additions/subtractions below go through the delayed field DF.
			const typename MMH_t::DelayedField & DF = WH.delayedField;

			size_t lb, cb, la, ca, ldX2;
			// size_t x3rd = std::max(mr,kr);
			typename Field::ConstElement_ptr A11=A, A12, A21, A22;
			typename Field::ConstElement_ptr B11=B, B12, B21, B22;
			typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr;

			// X1 first holds sums of A blocks (leading dimension ldX1) and
			// later the mr x nr product P1 (leading dimension nr).
			size_t x1rd = std::max(nr,kr);
			size_t ldX1;
			if (ta == FflasTrans) {
				A21 = A + mr;
				A12 = A + kr*lda;
				A22 = A12 + mr;
				la = kr;
				ca = mr;
				ldX1 = mr;
			}
			else {
				A12 = A + kr;
				A21 = A + mr*lda;
				A22 = A21 + kr;
				la = mr;
				ca = kr;
				ldX1 = x1rd;
			}
			if (tb == FflasTrans) {
				B21 = B + kr;
				B12 = B + nr*ldb;
				B22 = B12 + kr;
				lb = nr;
				cb = kr;
				ldX2 = kr;
			}
			else {
				B12 = B + nr;
				B21 = B + kr*ldb;
				B22 = B21 + nr;
				lb = kr;
				ldX2 = cb = nr;
			}

			// Two temporary submatrices are required
			typename Field::Element_ptr X2 = fflas_new (F, kr, nr);

			// T3 = B22 - B12 in X2
			fsub(DF,lb,cb, (DFCEptr) B22,ldb, (DFCEptr) B12,ldb, (DFEptr)X2,ldX2);

			// S3 = A11 - A21 in X1
			typename Field::Element_ptr X1 = fflas_new (F,mr,x1rd);
			fsub(DF,la,ca,(DFCEptr)A11,lda,(DFCEptr)A21,lda,(DFEptr)X1,ldX1);

			// P7 = alpha . S3 * T3 in C21
			MMH_t H7(F, WH.recLevel-1, -(WH.Amax-WH.Amin), WH.Amax-WH.Amin, -(WH.Bmax-WH.Bmin), WH.Bmax-WH.Bmin, 0,0);
			fgemm (F, ta, tb, mr, nr, kr, alpha, X1, ldX1, X2, ldX2, F.zero, C21, ldc, H7);

			// T1 = B12 - B11 in X2
			fsub(DF,lb,cb,(DFCEptr)B12,ldb,(DFCEptr)B11,ldb,(DFEptr)X2,ldX2);

			// S1 = A21 + A22 in X1
			fadd(DF,la,ca,(DFCEptr)A21,lda,(DFCEptr)A22,lda,(DFEptr)X1,ldX1);

			// P5 = alpha . S1*T1 in C22
			MMH_t H5(F, WH.recLevel-1, 2*WH.Amin, 2*WH.Amax, -(WH.Bmax-WH.Bmin), WH.Bmax-WH.Bmin, 0, 0);
			fgemm (F, ta, tb, mr, nr, kr, alpha, X1, ldX1, X2, ldX2, F.zero, C22, ldc, H5);

			// T2 = B22 - T1 in X2
			fsub(DF,lb,cb,(DFCEptr)B22,ldb,(DFCEptr)X2,ldX2,(DFEptr)X2,ldX2);

			// S2 = S1 - A11 in X1
			fsubin(DF,la,ca,(DFCEptr)A11,lda,(DFEptr)X1,ldX1);

			// P6 = alpha . S2 * T2 in C12
			MMH_t H6(F, WH.recLevel-1, 2*WH.Amin-WH.Amax, 2*WH.Amax-WH.Amin, 2*WH.Bmin-WH.Bmax, 2*WH.Bmax-WH.Bmin, 0, 0);
			fgemm (F, ta, tb, mr, nr, kr, alpha, X1, ldX1, X2, ldX2, F.zero, C12, ldc, H6);

			// S4 = A12 -S2 in X1
			fsub(DF,la,ca,(DFCEptr)A12,lda,(DFCEptr)X1,ldX1,(DFEptr)X1,ldX1);

			// P3 = alpha . S4*B22 in C11
			MMH_t H3(F, WH.recLevel-1, 2*WH.Amin-2*WH.Amax, 2*WH.Amax-2*WH.Amin, WH.Bmin, WH.Bmax, 0, 0);
			fgemm (F, ta, tb, mr, nr, kr, alpha, X1, ldX1, B22, ldb, F.zero, C11, ldc, H3);

			// P1 = alpha . A11 * B11 in X1
			MMH_t H1(F, WH.recLevel-1, WH.Amin, WH.Amax, WH.Bmin, WH.Bmax, 0, 0);
			fgemm (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X1, nr, H1);

			// U2 = P1 + P6 in C12 and
			DFElt U2Min, U2Max;
			// This test will be optimized out
			if (Protected::NeedPreAddReduction(U2Min, U2Max, H1.Outmin, H1.Outmax, H6.Outmin, H6.Outmax, WH)){
				freduce (F, mr, nr, X1, nr);
				freduce (F, mr, nr, C12, ldc);
			}
			faddin(DF,mr,nr,(DFCEptr)X1,nr,(DFEptr)C12,ldc);

			// U3 = P7 + U2 in C21 and
			DFElt U3Min, U3Max;
			// This test will be optimized out
			if (Protected::NeedPreAddReduction(U3Min, U3Max, U2Min, U2Max, H7.Outmin, H7.Outmax, WH)){
				freduce (F, mr, nr, C12, ldc);
				freduce (F, mr, nr, C21, ldc);
			}
			faddin(DF,mr,nr,(DFCEptr)C12,ldc,(DFEptr)C21,ldc);

			// U4 = P5 + U2 in C12 and
			DFElt U4Min, U4Max;
			// This test will be optimized out
			if (Protected::NeedPreAddReduction(U4Min, U4Max, U2Min, U2Max, H5.Outmin, H5.Outmax, WH)){
				freduce (F, mr, nr, C22, ldc);
				freduce (F, mr, nr, C12, ldc);
			}
			faddin(DF,mr,nr,(DFCEptr)C22,ldc,(DFEptr)C12,ldc);

			// U7 = P5 + U3 in C22 and
			DFElt U7Min, U7Max;
			// This test will be optimized out
			if (Protected::NeedPreAddReduction (U7Min,U7Max, U3Min, U3Max, H5.Outmin,H5.Outmax, WH) ){
				freduce (F, mr, nr, C21, ldc);
				freduce (F, mr, nr, C22, ldc);
			}
			faddin(DF,mr,nr,(DFCEptr)C21,ldc,(DFEptr)C22,ldc);

			// U5 = P3 + U4 in C12
			DFElt U5Min, U5Max;
			// This test will be optimized out
			if (Protected::NeedPreAddReduction (U5Min,U5Max, U4Min, U4Max, H3.Outmin, H3.Outmax, WH) ){
				freduce (F, mr, nr, C12, ldc);
				freduce (F, mr, nr, C11, ldc);
			}
			faddin(DF,mr,nr,(DFCEptr)C11,ldc,(DFEptr)C12,ldc);

			// T4 = T2 - B21 in X2
			fsubin(DF,lb,cb,(DFCEptr)B21,ldb,(DFEptr)X2,ldX2);

			// P4 = alpha . A22 * T4 in C11
			// (C11 is free again: P3 was consumed by U5 above)
			MMH_t H4(F, WH.recLevel-1, WH.Amin, WH.Amax, 2*WH.Bmin-2*WH.Bmax, 2*WH.Bmax-2*WH.Bmin, 0, 0);
			fgemm (F, ta, tb, mr, nr, kr, alpha, A22, lda, X2, ldX2, F.zero, C11, ldc, H4);

			fflas_delete (X2);

			// U6 = U3 - P4 in C21
			DFElt U6Min, U6Max;
			// This test will be optimized out
			if (Protected::NeedPreSubReduction (U6Min,U6Max, U3Min, U3Max, H4.Outmin,H4.Outmax, WH) ){
				freduce (F, mr, nr, C11, ldc);
				freduce (F, mr, nr, C21, ldc);
			}
			fsubin(DF,mr,nr,(DFCEptr)C11,ldc,(DFEptr)C21,ldc);

			// P2 = alpha . A12 * B21 in C11
			MMH_t H2(F, WH.recLevel-1, WH.Amin, WH.Amax, WH.Bmin, WH.Bmax, 0, 0);
			fgemm (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, F.zero, C11, ldc, H2);

			// U1 = P2 + P1 in C11
			DFElt U1Min, U1Max;
			// This test will be optimized out
			if (Protected::NeedPreAddReduction (U1Min, U1Max, H1.Outmin, H1.Outmax, H2.Outmin,H2.Outmax, WH) ){
				freduce (F, mr, nr, X1, nr);
				freduce (F, mr, nr, C11, ldc);
			}
			faddin(DF,mr,nr,(DFCEptr)X1,nr,(DFEptr)C11,ldc);

			fflas_delete (X1);

			// Report the bounds of the four result quadrants to the caller.
			WH.Outmin = std::min (U1Min, std::min (U5Min, std::min (U6Min, U7Min)));
			WH.Outmax = std::max (U1Max, std::max (U5Max, std::max (U6Max, U7Max)));

		} // Winograd
} // BLAS3 } // FFLAS #endif // __FFLASFFPACK_fgemm_winograd_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/schedule_winograd_acc.inl000066400000000000000000000457121274716147400267520ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) 2014 the LinBox group * * Written by Clement Pernet * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_fgemm/winograd_acc.inl * @ingroup MMalgos * @brief Winograd implementation * @bib ISSAC09 Scheduling */ #ifndef __FFLASFFPACK_fgemm_winograd_acc_INL #define __FFLASFFPACK_fgemm_winograd_acc_INL namespace FFLAS { namespace BLAS3 { // 3 temps and 23 ops // TODO: Add check for modular reductions before final additions template < class Field,class FieldTrait > inline void WinogradAcc_3_23 (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::ConstElement_ptr A,const size_t lda, typename Field::ConstElement_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(!F.isZero(beta)); typename Field::Element mbeta ; F.neg(mbeta,beta); size_t lb, cb, la, ca; size_t x3rd = std::max(mr,kr); typename Field::ConstElement_ptr A11=A, A12, A21, A22; typename Field::ConstElement_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; size_t ldX3; // Three temporary submatrices are required if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr; ca = mr; } else { // ta == FflasNoTrans A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr; ca = kr; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr; cb = kr; ldX3 = x3rd; } 
else { // ta == FflasNoTrans B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr; ldX3 = cb = nr; } // P2 = alpha . A12 * B21 + beta . C11 in C11 fgemm (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, beta, C11, ldc, H); typename Field::Element_ptr X3 = fflas_new (F, x3rd, nr); // T3 = B22 - B12 in X3 fsub(F,lb,cb,B22,ldb,B12,ldb,X3,ldX3); typename Field::Element_ptr X2 = fflas_new (F, mr, kr); // S3 = A11 - A21 in X2 fsub(F,la,ca,A11,lda,A21,lda,X2,ca); // C22 = C22 - C12 if beta != 0 fsubin(F,mr,nr,C12,ldc,C22,ldc); // C21 = C21 - C22 fsubin(F,mr,nr,C22,ldc,C21,ldc); // P7 = alpha . S3 * T3 + beta . C22 in C22 fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, X3, ldX3, beta, C22, ldc, H); // T1 = B12 - B11 in X3 fsub(F,lb,cb,B12,ldb,B11,ldb,X3,ldX3); // S1 = A21 + A22 in X2 fadd(F,la,ca,A21,lda,A22,lda,X2,ca); // P5 = alpha . S1*T1 + beta . C12 in C12 fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, X3, ldX3, beta, C12, ldc, H); // T2 = B22 - T1 in X3 fsub(F,lb,cb,B22,ldb,X3,ldX3,X3,ldX3); // S2 = S1 - A11 in X2 fsubin(F,la,ca,A11,lda,X2,ca); typename Field::Element_ptr X1 = fflas_new (F, mr, nr); // P6 = alpha . S2 * T2 in X1 fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, X3, ldX3, F.zero, X1, nr, H); // T4 = T2 - B21 in X3 fsubin(F,lb,cb,B21,ldb,X3,ldX3); // S4 = A12 -S2 in X2 fsub(F,la,ca,A12,lda,X2,ca,X2,ca); // P4 = alpha . A22 * T4 - beta . C21 in C21 fgemm (F, ta, tb, mr, nr, kr, alpha, A22, lda, X3, ldX3, mbeta, C21, ldc, H); // P1 = alpha . 
A11 * B11 in X3 fgemm (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X3, nr, H); // U1 = P2 + P1 in C11 faddin(F,mr,nr,X3,nr,C11,ldc); // U2 = P1 + P6 in tmpU2/X1 and faddin(F, mr, nr, X3, nr, X1, nr); // U3 = P7 + U2 in tmpU3/X3 and fadd(F, mr, nr, X1, nr, C22, ldc, X3, nr); // U7 = P5 + U3 in C22 and fadd(F, mr, nr, C12, ldc, X3, nr, C22, ldc); // U4 = P5 + U2 in C12 and faddin(F, mr, nr, X1, nr, C12, ldc); fflas_delete (X1); // U6 = U3 - P4 in C21 and fsub(F, mr, nr, X3, nr, C21, ldc, C21, ldc); fflas_delete (X3); // P3 = alpha . S4*B22 in X1 fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, B22, ldb, F.one, C12, ldc, H); fflas_delete (X2); } // WinogradAccOld // 3 temps and 21 ops template < class Field, class FieldTrait> inline void WinogradAcc_3_21 (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::ConstElement_ptr A,const size_t lda, typename Field::ConstElement_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & WH ) { typedef MMHelper MMH_t; typedef typename MMH_t::DelayedField::Element_ptr DFEptr; typedef typename MMH_t::DelayedField::ConstElement_ptr DFCEptr; typedef typename MMH_t::DelayedField::Element DFElt; const typename MMH_t::DelayedField & DF = WH.delayedField; FFLASFFPACK_check(!DF.isZero(beta)); size_t lb, cb, la, ca; size_t x3rd = std::max(mr,kr); typename Field::ConstElement_ptr A11=A, A12, A21, A22; typename Field::ConstElement_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; typename Field::Element mbeta; F.neg(mbeta,beta); DFElt betadf; if (F.isMOne(beta)) DF.assign(betadf,DF.mOne); else DF.init(betadf, beta); size_t ldX3; if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr; ca = mr; } else { // ta == FflasNoTrans A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr; 
ca = kr; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr; cb = kr; ldX3 = x3rd; } else { // ta == FflasNoTrans B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr; ldX3 = cb = nr; } // Three temporary submatrices are required typename Field::Element_ptr X3 = fflas_new (F, x3rd, nr); // T1 = B12 - B11 in X3 fsub(DF,lb,cb,(DFCEptr)B12,ldb,(DFCEptr)B11,ldb,(DFEptr)X3,ldX3); typename Field::Element_ptr X2 = fflas_new(F,mr,kr); // S1 = A21 + A22 in X2 fadd(DF,la,ca,(DFCEptr)A21,lda,(DFCEptr)A22,lda,(DFEptr)X2,ca); typename Field::Element_ptr X1 = fflas_new(F,mr,nr); // P5 = alpha . S1*T1 in X1 MMH_t H5(F, WH.recLevel-1, 2*WH.Amin, 2*WH.Amax, -(WH.Bmax-WH.Bmin), WH.Bmax-WH.Bmin, 0, 0); fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, X3, ldX3, F.zero, X1, nr, H5); DFElt C22Min, C22Max; DFElt C12Min, C12Max; // This test will be optimized out if (Protected::NeedDoublePreAddReduction (C12Min, C12Max, H5.Outmin, H5.Outmax, WH.Cmin, WH.Cmax, betadf, WH)){ freduce(F,mr,nr,X1,nr); H5.initOut(); } C22Min = C12Min; C22Max = C12Max; // C22 = P5 + beta C22 in C22 fadd(DF,mr,nr,(DFCEptr)X1,nr,betadf,(DFCEptr)C22,ldc,(DFEptr)C22,ldc); // C12 = P5 + beta C12 in C12 fadd(DF,mr,nr,(DFCEptr)X1,nr,betadf,(DFCEptr)C12,ldc,(DFEptr)C12,ldc); // P1 = alpha . A11 * B11 in X1 MMH_t H1(F, WH.recLevel-1, WH.Amin, WH.Amax, WH.Bmin, WH.Bmax, 0, 0); fgemm (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X1, nr, H1); // P2 = alpha . A12 * B21 + beta . 
C11 in C11 MMH_t H2(F, WH.recLevel-1, WH.Amin, WH.Amax, WH.Bmin, WH.Bmax, WH.Cmin, WH.Cmax); fgemm (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, beta, C11, ldc, H2); // U1 = P2 + P1 in C11 DFElt U1Min, U1Max; if (Protected::NeedPreAddReduction (U1Min,U1Max, H1.Outmin, H1.Outmax, H2.Outmin,H2.Outmax, WH) ){ freduce(F,mr,nr,X1,nr); freduce(F,mr,nr,C11,ldc); } faddin(DF,mr,nr,(DFCEptr)X1,nr,(DFEptr)C11,ldc); // T2 = B22 - T1 in X3 fsub(DF,lb,cb,(DFCEptr)B22,ldb,(DFCEptr)X3,ldX3,(DFEptr)X3,ldX3); // S2 = S1 - A11 in X2 fsubin(DF,la,ca,(DFCEptr)A11,lda,(DFEptr)X2,ca); // U2 = P6 + P1 = alpha . S2 * T2 + P1 in X1 MMH_t H6(F, WH.recLevel-1, 2*WH.Amin-WH.Amax, 2*WH.Amax-WH.Amin, 2*WH.Bmin-WH.Bmax, 2*WH.Bmax-WH.Bmin, H1.Outmin, H1.Outmax); fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, X3, ldX3, F.one, X1, nr, H6); // U4 = U2 + C12 in C12 DFElt U4Min, U4Max; if (Protected::NeedPreAddReduction (U4Min, U4Max, H6.Outmin, H6.Outmax, C12Min, C12Max, WH)){ freduce(F,mr,nr,C12,ldc); freduce(F,mr,nr,X1,nr); } faddin(DF,mr,nr,(DFCEptr)X1,nr,(DFEptr)C12,ldc); // T4 = T2 - B21 in X3 fsubin(DF,lb,cb,(DFCEptr)B21,ldb,(DFEptr)X3,ldX3); // S4 = A12 -S2 in X2 fsub(DF,la,ca,(DFCEptr)A12,lda,(DFCEptr)X2,ca,(DFEptr)X2,ca); // P4 = alpha . A22 * T4 - beta . C21 in C21 MMH_t H4(F, WH.recLevel-1, WH.Amin, WH.Amax, 2*WH.Bmin-2*WH.Bmax, 2*WH.Bmax-2*WH.Bmin, WH.Cmin, WH.Cmax); fgemm (F, ta, tb, mr, nr, kr, alpha, A22, lda, X3, ldX3, mbeta, C21, ldc, H4); // U5 = P3 + U4 = alpha . S4*B22 + U4 in C12 MMH_t H3(F, WH.recLevel-1, 2*WH.Amin-2*WH.Amax, 2*WH.Amax-2*WH.Amin, WH.Bmin, WH.Bmax, U4Min, U4Max); fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, B22, ldb, F.one, C12, ldc, H3); // T3 = B22 - B12 in X3 fsub(DF,lb,cb,(DFCEptr)B22,ldb,(DFCEptr)B12,ldb,(DFEptr)X3,ldX3); // S3 = A11 - A21 in X2 fsub(DF,la,ca,(DFCEptr)A11,lda,(DFCEptr)A21,lda,(DFEptr)X2,ca); // U3 = P7 + U2 = alpha . 
S3 * T3 + U2 in X1 MMH_t H7(F, WH.recLevel-1, WH.Amin-WH.Amax, WH.Amax-WH.Amin, WH.Bmin-WH.Bmax, WH.Bmax-WH.Bmin, H6.Outmin, H6.Outmax); fgemm (F, ta, tb, mr, nr, kr, alpha, X2, ca, X3, ldX3, F.one, X1, nr, H7); fflas_delete (X2); fflas_delete (X3); // U7 = U3 + C22 in C22 DFElt U7Min, U7Max; if (Protected::NeedPreAddReduction (U7Min, U7Max, H7.Outmin, H7.Outmax, C22Min, C22Max, WH)){ freduce(F,mr,nr,X1,nr); freduce(F,mr,nr,C22,ldc); } faddin(DF,mr,nr,(DFCEptr)X1,nr,(DFEptr)C22,ldc); // U6 = U3 - P4 in C21 DFElt U6Min, U6Max; if (Protected::NeedPreSubReduction(U6Min, U6Max, H7.Outmin, H7.Outmax, H4.Outmin, H4.Outmax, WH)){ freduce(F,mr,nr,X1,nr); freduce(F,mr,nr,C21,ldc); } fsub(DF,mr,nr,(DFCEptr)X1,nr,(DFCEptr)C21,ldc,(DFEptr)C21,ldc); fflas_delete (X1); // Updating WH with Outmin, Outmax of the result WH.Outmin = min4 (U1Min, H3.Outmin, U6Min, U7Min); WH.Outmax = max4 (U1Max, H3.Outmax, U6Max, U7Max); } // WinogradAcc // 2 temps and 24 ops // TODO: Add check for modular reductions before final additions template < class Field, class FieldTrait > inline void WinogradAcc_2_24 (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, const typename Field::Element_ptr A,const size_t lda, const typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(!F.isZero(beta)); typename Field::Element malpha ; F.neg(malpha,alpha); // A, B and c submatrices const typename Field::Element_ptr A11=A, A12, A21, A22; const typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; size_t la, ca, lb, cb; // lines and columns in A,B sub matrices // Three temporary submatrices are required if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr ; 
ca = mr ; } else { // ta == FflasNoTrans A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr ; ca = kr ; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr ; cb = kr ; } else { // ta == FflasNoTrans B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr ; cb = nr ; } // Z1 = C22 - C12 in C22 fsubin(F,mr,nr,C12,ldc,C22,ldc); // Z3 = C12-C21 in C12 fsubin(F,mr,nr,C21,ldc,C12,ldc); // S1 = A21 + A22 in X typename Field::Element_ptr X = fflas_new(F,mr,std::max(nr,kr)); fadd(F,la,ca,A21,lda,A22,lda,X,ca); // T1 = B12 - B11 in Y typename Field::Element_ptr Y = fflas_new(F,nr,kr); fsub(F,lb,cb,B12,ldb,B11,ldb,Y,cb); // P5 = a S1 T1 + b Z3 in C12 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, beta, C12, ldc, H); // S2 = S1 - A11 in X fsubin(F,la,ca,A11,lda,X,ca); // T2 = B22 - T1 in Y fsub(F,lb,cb,B22,ldb,Y,cb,Y,cb); // P6 = a S2 T2 + b C21 in C21 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, beta, C21, ldc, H); // S4 = A12 - S2 in X fsub(F,la,ca,A12,lda,X,ca,X,ca); // W1 = P5 + beta Z1 in C22 fadd(F,mr,nr,C12,ldc,beta,C22,ldc,C22,ldc); // P3 = a S4 B22 + P5 in C12 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, B22, ldb, F.one, C12, ldc, H); // P1 = a A11 B11 in X fgemm (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X, nr, H); // U2 = P6 + P1 in C21 faddin(F,mr,nr,X,nr,C21,ldc); // P2 = a A12 B21 + b C11 in C11 fgemm (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, beta, C11, ldc, H); // U1 = P1 + P2 in C11 faddin(F,mr,nr,X,nr,C11,ldc); // U5 = U2 + P3 in C12 faddin(F,mr,nr,C21,ldc,C12,ldc); // S3 = A11 - A21 in X ; fsub(F,la,ca,A11,lda,A21,lda,X,ca); // T3 = B22 - B12 in Y fsub(F,lb,cb,B22,ldb,B12,ldb,Y,cb); // U3 = a S3 T3 + U2 in C21 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, F.one, C21, ldc, H); fflas_delete (X); // U7 = U3 + W1 in C22 faddin(F,mr,nr,C21,ldc,C22,ldc); // T1_ = B12 - B11 in Y fsub(F,lb,cb,B12,ldb,B11,ldb,Y,cb); // T2_ = B22 - T1_ in Y fsub(F,lb,cb,B22,ldb,Y,cb,Y,cb); // T4 = T2_ - B21 in Y 
fsub(F,lb,cb,Y,cb,B21,ldb,Y,cb); // U6 = -a A22 T4 + U3 in C21; fgemm (F, ta, tb, mr, nr, kr, malpha, A22, lda, Y, cb, F.one, C21, ldc, H); fflas_delete (Y); } // WinogradAccOld // 2 temps and 27 ops // TODO: Add check for modular reductions before final additions template < class Field, class FieldTrait > inline void WinogradAcc_2_27 (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, const typename Field::Element_ptr A,const size_t lda, const typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper & WH) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(!F.isZero(beta)); typename Field::Element malpha ; F.neg(malpha,alpha); // A, B and c submatrices const typename Field::Element_ptr A11=A, A12, A21, A22; const typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; size_t la, ca, lb, cb; // lines and columns in A,B sub matrices // Three temporary submatrices are required if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr ; ca = mr ; } else { // ta == FflasNoTrans A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr ; ca = kr ; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr ; cb = kr ; } else { // ta == FflasNoTrans B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr ; cb = nr ; } // Z1 = C22 - C12 in C22 fsubin(F,mr,nr,C12,ldc,C22,ldc); // Z3 = C12-C21 in C12 fsubin(F,mr,nr,C21,ldc,C12,ldc); // S1 = A21 + A22 in X typename Field::Element_ptr X = fflas_new(F,mr,std::max(nr,kr)); fadd(F,la,ca,A21,lda,A22,lda,X,ca); // T1 = B12 - B11 in Y typename Field::Element_ptr Y = fflas_new(F,nr,std::max(kr,mr)); fsub(F,lb,cb,B12,ldb,B11,ldb,Y,cb); // P5 = a S1 T1 + b Z3 in C12 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, beta, C12, 
ldc, H); // S2 = S1 - A11 in X fsubin(F,la,ca,A11,lda,X,ca); // T2 = B22 - T1 in Y fsub(F,lb,cb,B22,ldb,Y,cb,Y,cb); // P6 = a S2 T2 + b C21 in C21 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, beta, C21, ldc, H); // S4 = A12 - S2 in X fsub(F,la,ca,A12,lda,X,ca,X,ca); // W1 = P5 + beta Z1 in C22 fadd(F,mr,nr,C12,ldc,beta,C22,ldc,C22,ldc); // P3 = a S4 B22 + P5 in C12 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, B22, ldb, F.zero, Y, nr, H); fadd(F,mr,nr,Y,nr,C12,ldc,C12,ldc); // P1 = a A11 B11 in X fgemm (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X, nr, H); // U2 = P6 + P1 in C21 faddin(F,mr,nr,X,nr,C21,ldc); // P2 = a A12 B21 + b C11 in C11 fgemm (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, F.zero, Y, nr, H); fadd(F,mr,nr,Y,nr,beta,C11,ldc,C11,ldc); // U1 = P1 + P2 in C11 faddin(F,mr,nr,X,nr,C11,ldc); // U5 = U2 + P3 in C12 faddin(F,mr,nr,C21,ldc,C12,ldc); // S3 = A11 - A21 in X ; fsub(F,la,ca,A11,lda,A21,lda,X,ca); // T3 = B22 - B12 in Y fsub(F,lb,cb,B22,ldb,B12,ldb,Y,cb); // U3 = a S3 T3 + U2 in C21 fgemm (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, F.one, C21, ldc, H); // U7 = U3 + W1 in C22 faddin(F,mr,nr,C21,ldc,C22,ldc); // T1_ = B12 - B11 in Y fsub(F,lb,cb,B12,ldb,B11,ldb,Y,cb); // T2_ = B22 - T1_ in Y fsub(F,lb,cb,B22,ldb,Y,cb,Y,cb); // T4 = T2_ - B21 in Y fsub(F,lb,cb,Y,cb,B21,ldb,Y,cb); // U6 = -a A22 T4 + U3 in C21; fgemm (F, ta, tb, mr, nr, kr, alpha, A22, lda, Y, cb, F.zero, X, nr, H); fflas_delete (Y); fsub(F,mr,nr,C21,ldc,X,nr,C21,ldc); fflas_delete (X); } // WinogradAcc3 } // BLAS3 } // FFLAS #endif // __FFLASFFPACK_fgemm_winograd_acc_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/schedule_winograd_acc_ip.inl000066400000000000000000000317571274716147400274460ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) 2014 the LinBox group * * Written by Brice Boyer (briceboyer) * * * 
========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_fgemm/winograd_acc2.inl * @ingroup MMalgos * @brief Winograd implementation * @bib ISSAC09 Scheduling */ #ifndef __FFLASFFPACK_fgemm_winograd_acc_ip_INL #define __FFLASFFPACK_fgemm_winograd_acc_ip_INL namespace FFLAS { namespace BLAS3 { template < class Field, class FieldTrait > inline void WinogradAcc_LR (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::Element_ptr A,const size_t lda, typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(!F.isZero(beta)); typename Field::Element malpha ; F.neg(malpha,alpha); // A, B and c submatrices typename Field::Element_ptr A11=A, A12, A21, A22; typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; typename Field::Element mbeta ; F.neg(mbeta,beta); size_t la, ca, lb, cb; // lines and columns in A,B sub matrices // Three temporary 
submatrices are required if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr ; ca = mr ; } else { // ta == FflasNoTrans A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr ; ca = kr ; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr ; cb = kr ; } else { // ta == FflasNoTrans B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr ; cb = nr ; } // Z1 = C22 - C12 in C22 fsubin(F,mr,nr,C12,ldc,C22,ldc); // S1 = A21 + A22 in X typename Field::Element_ptr X = fflas_new (F, std::max(std::max(mr*nr,kr*nr),mr*kr), 1); fadd(F,la,ca,A21,lda,A22,lda,X,ca); // T1 = B12 - B11 in Y typename Field::Element_ptr Y = fflas_new (F, std::max(mr,kr), nr); fsub(F,lb,cb,B12,ldb,B11,ldb,Y,cb); // Z2 = C21 - Z1 in C21 fsubin(F,mr,nr,C22,ldc,C21,ldc); // T3 = B22 - B12 in B12 ; fsub(F,lb,cb,B22,ldb,B12,ldb,B12,ldb); // S3 = A11 - A21 in A21 fsub(F,la,ca,A11,lda,A21,lda,A21,lda); // P7 = a S3 T3 + b Z1 in C22 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A21, lda, B12, ldb, beta, C22, ldc, H); // S2 = S1 - A11 in A21 fsub(F,la,ca,X,ca,A11,lda,A21,lda); // T2 = B22 - T1 in B12 fsub(F,lb,cb,B22,ldb,Y,cb,B12,ldb); // P5 = a S1 T1 + b C12 in C12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, beta, C12, ldc, H); // T4 = T2 - B21 in X fsub(F,lb,cb,B12,ldb,B21,ldb,X,cb); // W1 = a A22 T4 in Y; fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, X, cb, F.zero, Y, nr, H); // P4 = W1 - b Z2 in C21 fadd(F,mr,nr,Y,nr,mbeta,C21,ldc,C21,ldc); // S4 = A12 - S2 in A22 fsub(F,la,ca,A12,lda,A21,lda,A22,lda); // P6 = a S2 T2 in X fgemm2 (F, ta, tb, mr, nr, kr, alpha, A21, lda, B12, ldb, F.zero, X, nr, H); // W2 = a A12 B21 in Y fgemm2 (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, F.zero, Y, nr, H); // P2 = W2 + beta C11 in C11 fadd(F,mr,nr,Y,nr,beta,C11,ldc,C11,ldc); // P1 = a A11 B11 in Y fgemm2 (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, Y, nr, H); // U1 = P1 + P2 in C11 faddin(F,mr,nr,Y,nr,C11,ldc); // U2 = P6 + P1 in X 
faddin(F,mr,nr,Y,nr,X,nr); fflas_delete (Y); // U3 = U2 + P7 in C22 faddin(F,mr,nr,X,nr,C22,ldc); // U4 = U2 + P5 in X faddin(F,mr,nr,C12,ldc,X,nr); // U6 = U3 - P4 in C21 fsub(F,mr,nr,C22,ldc,C21,ldc,C21,ldc); // U7 = U3 + P5 in C22 faddin(F,mr,nr,C12,ldc,C22,ldc); // P3 = a S4 B22 in C12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, B22, ldb, F.zero, C12, ldc, H); // U5 = U4 + P3 in C12 faddin(F,mr,nr,X,nr,C12,ldc); fflas_delete (X); } // WinogradAccOld template < class Field, class FieldTrait > inline void WinogradAcc_R_S (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, const typename Field::Element_ptr A,const size_t lda, typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(!F.isZero(beta)); typename Field::Element malpha ; F.neg(malpha,alpha); // A, B and c submatrices const typename Field::Element_ptr A11=A, A12, A21, A22; typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; typename Field::Element mbeta ; F.neg(mbeta,beta); size_t la, ca, lb, cb; // lines and columns in A,B sub matrices // Three temporary submatrices are required if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr ; ca = mr ; } else { // ta == FflasNoTrans A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr ; ca = kr ; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr ; cb = kr ; } else { // ta == FflasNoTrans B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr ; cb = nr ; } FFLASFFPACK_check(mr == nr && kr == nr); // Z1 = C22 - C12 in C22 fsubin(F,mr,nr,C12,ldc,C22,ldc); // T1 = B12 - B11 in X // typename Field::Element_ptr X = fflas_new (F, std::max(mr,kr)*nr]; typename 
Field::Element_ptr X = fflas_new (F, mr, nr); fsub(F,lb,cb,B12,ldb,B11,ldb,X,cb); // Z2 = C21 - Z1 in C21 fsubin(F,mr,nr,C22,ldc,C21,ldc); // T3 = B22 - B12 in B12 ; fsub(F,lb,cb,B22,ldb,B12,ldb,B12,ldb); // S3 = A11 - A21 in Y typename Field::Element_ptr Y = fflas_new (F, mr, kr); fsub(F,la,ca,A11,lda,A21,lda,Y,ca); // P7 = a S3 T3 + b Z1 in C22 fgemm2 (F, ta, tb, mr, nr, kr, alpha, Y, ca, B12, ldb, beta, C22, ldc, H); // S1 = A21 + A22 in Y fadd(F,la,ca,A21,lda,A22,lda,Y,ca); // T2 = B22 - T1 in B12 fsub(F,lb,cb,B22,ldb,X,cb,B12,ldb); // P5 = a S1 T1 + b C12 in C12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, Y, ca, X, cb, beta, C12, ldc, H); // T4 = T2 - B21 in X fsub(F,lb,cb,B12,ldb,B21,ldb,X,cb); // P4 = a A22 T4 - b Z2 in C21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, X, cb, mbeta, C21, ldc, H); // W1 = a A12 B21 in X; fgemm2 (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, F.zero, X, nr, H); // P2 = W1 + beta C11 in C11 fadd(F,mr,nr,X,nr,beta,C11,ldc,C11,ldc); // S2 = S1 - A11 in Y fsubin(F,la,ca,A11,lda,Y,ca); // P6 = a S2 T2 in B21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, Y, ca, B12, ldb, F.zero, B21, ldb, H); // S4 = A12 - S2 in Y fsub(F,la,ca,A12,lda,Y,ca,Y,ca); // P1 = a A11 B11 in X fgemm2 (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X, nr, H); // U2 = P6 + P1 in B21 faddin(F,mr,nr,X,nr,B21,ldb); // U3 = U2 + P7 in C22 faddin(F,mr,nr,B21,ldb,C22,ldc); // U4 = U2 + P5 in B21 faddin(F,mr,nr,C12,ldc,B21,ldb); // U6 = U3 - P4 in C21 fsub(F,mr,nr,C22,ldc,C21,ldc,C21,ldc); // U1 = P1 + P2 in C11 faddin(F,mr,nr,X,nr,C11,ldc); fflas_delete (X); // U7 = U3 + P5 in C22 faddin(F,mr,nr,C12,ldc,C22,ldc); // P3 = a S4 B22 in C12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, Y, ca, B22, ldb, F.zero, C12, ldc, H); fflas_delete (Y); // U5 = U4 + P3 in C12 faddin(F,mr,nr,B21,ldb,C12,ldc); } // WinogradAccOld template < class Field ,class FieldTrait> inline void WinogradAcc_L_S (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const 
size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::Element_ptr A,const size_t lda, const typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(!F.isZero(beta)); typename Field::Element malpha ; F.neg(malpha,alpha); // A, B and c submatrices typename Field::Element_ptr A11=A, A12, A21, A22; const typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; typename Field::Element mbeta ; F.neg(mbeta,beta); size_t la, ca, lb, cb; // lines and columns in A,B sub matrices // Three temporary submatrices are required if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr ; ca = mr ; } else { // ta == FflasNoTrans A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr ; ca = kr ; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr ; cb = kr ; } else { // ta == FflasNoTrans B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr ; cb = nr ; } FFLASFFPACK_check(mr == nr && kr == nr); // Z1 = C22 - C12 in C22 fsubin(F,mr,nr,C12,ldc,C22,ldc); // Z2 = C21 - Z1 in C21 fsubin(F,mr,nr,C22,ldc,C21,ldc); // S3 = A11 - A21 in X typename Field::Element_ptr X = fflas_new (F, mr, nr); fsub(F,la,ca,A11,lda,A21,lda,X,ca); // S1 = A21 + A22 in A21 faddin(F,la,ca,A22,lda,A21,lda); // T3 = B22 - B12 in Y ; typename Field::Element_ptr Y = fflas_new (F, mr, kr); fsub(F,lb,cb,B22,ldb,B12,ldb,Y,cb); // P7 = a S3 T3 + b Z1 in C22 fgemm2 (F, ta, tb, mr, nr, kr, alpha, X, ca, Y, cb, beta, C22, ldc, H); // T1 = B12 - B11 in X fsub(F,lb,cb,B12,ldb,B11,ldb,X,cb); // T2 = B22 - T1 in Y fsub(F,lb,cb,B22,ldb,X,cb,Y,cb); // P5 = a S1 T1 + b C12 in C12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A21, lda, X, cb, beta, C12, ldc, H); // S2 = S1 - A11 in A21 fsubin(F,la,ca,A11,lda,A21,lda); // P1 = a A11 
B11 in X fgemm2 (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, X, nr, H); // S4 = A12 - S2 in A11 fsub(F,la,ca,A12,lda,A21,lda,A11,lda); // P2 = a A12 B21 + b C11 in C11; fgemm2 (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, beta, C11, ldc, H); // U1 = P1 + P2 in C11 faddin(F,mr,nr,X,nr,C11,ldc); // P6 = a S2 T2 in A12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A21, lda, Y, cb, F.zero, A12, lda, H); // T4 = T2 - B21 in Y fsubin(F,lb,cb,B21,ldb,Y,cb); // W2 = a A22 T4 in A21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, Y, cb, F.zero, A21, lda, H); // P4 = W2 - beta Z2 in C21 fadd(F,mr,nr,A21,lda,mbeta,C21,ldc,C21,ldc); // U2 = P6 + P1 in X faddin(F,mr,nr,A12,lda,X,nr); // U3 = U2 + P7 in C22 faddin(F,mr,nr,X,nr,C22,ldc); // U6 = U3 - P4 in C21 fsub(F,mr,nr,C22,ldc,C21,ldc,C21,ldc); // U7 = U3 + P5 in C22 faddin(F,mr,nr,C12,ldc,C22,ldc); // U4 = U2 + P5 in C12 faddin(F,mr,nr,X,nr,C12,ldc); fflas_delete (X); // W3 = a S4 B22 in Y fgemm2 (F, ta, tb, mr, nr, kr, alpha, A11, lda, B22, ldb, F.zero, Y, nr, H); // U5 = U4 + W3 in C12 faddin(F,mr,nr,Y,nr,C12,ldc); fflas_delete (Y); } // WinogradAccOld } // BLAS3 } // FFLAS #endif // __FFLASFFPACK_fgemm_winograd_acc_ip_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemm/schedule_winograd_ip.inl000066400000000000000000000264061274716147400266330ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the LinBox group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_fgemm/winograd_ip.inl * @ingroup MMalgos * @brief Winograd implementation * @bib ISSAC09 Scheduling */ #ifndef __FFLASFFPACK_fgemm_winograd_ip_INL #define __FFLASFFPACK_fgemm_winograd_ip_INL namespace FFLAS { namespace BLAS3 { template < class Field, class FieldTrait > inline void Winograd_LR_S (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::Element_ptr A,const size_t lda, typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(F.isZero(beta)); // FFLASFFPACK_check(mr == nr && mr == kr); FFLASFFPACK_check(kr == nr); size_t lb, cb, la, ca; typename Field::Element_ptr A11=A, A12, A21, A22; typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr; ca = mr; } else { A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr; ca = kr; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr; cb = kr; } else { B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr; cb = nr; } // S3 = A11 - A21 in C11 fsub(F,la,ca,A11,lda,A21,lda,C11,ldc); // S1 = A21 + A22 in A21 faddin(F,la,ca,A22,lda,A21,lda); // T1 = B12 - 
B11 in C22 fsub(F,lb,cb,B12,ldb,B11,ldb,C22,ldc); // T3 = B22 - B12 in B12 fsub(F,lb,cb,B22,ldb,B12,ldb,B12,ldb); // P7 = S3 T3 in C21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, C11, ldc, B12, ldb, F.zero, C21, ldc, H); // S2 = S1 - A11 in C12 fsub(F,la,ca,A21,lda,A11,lda,C12,ldc); // P1 = A11 B11 in C11 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, C11, ldc, H); // T2 = B22 - T1 in B11 fsub(F,lb,cb,B22,ldb,C22,ldc,B11,ldb); // P5 = S1 T1 in A11 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A21, lda, C22, ldc, F.zero, A11, lda, H); // T4 = T2 - B21 in C22 fsub(F,lb,cb,B11,ldb,B21,ldb,C22,ldc); // P4 = A22 T4 in A21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, C22, ldc, F.zero, A21, lda, H); // S4 = A12 - S2 in A22 fsub(F,la,ca,A12,lda,C12,ldc,A22,lda); // P6 = S2 T2 in C22 fgemm2 (F, ta, tb, mr, nr, kr, alpha, C12, ldc, B11, ldb, F.zero, C22, ldc, H); // U2 = P1 + P6 in C22 faddin(F,mr,nr,C11,ldc,C22,ldc); // P2 = A12 B21 in C12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, F.zero, C12, ldc, H); // U1 = P1 + P2 in C11 faddin(F,mr,nr,C12,ldc,C11,ldc); // U4 = U2 + P5 in C12 fadd(F,mr,nr,C22,ldc,A11,lda,C12,ldc); // U3 = U2 + P7 in C22 faddin(F,mr,nr,C21,ldc,C22,ldc); // U6 = U3 - P4 in C21 fsub(F,mr,nr,C22,ldc,A21,lda,C21,ldc); // U7 = U3 + P5 in C22 faddin(F,mr,nr,A11,lda,C22,ldc); // P3 = S4 B22 in A12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, B22, ldb, F.zero, A12, lda, H); // U5 = U4 + P3 in C12 faddin(F,mr,nr,A12,lda,C12,ldc); } // WinogradIP template < class Field, class FieldTrait > inline void Winograd_L_S(const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, typename Field::Element_ptr A,const size_t lda, const typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; 
FFLASFFPACK_check(F.isZero(beta)); FFLASFFPACK_check(kr == nr && kr <= mr); size_t lb, cb, la, ca; typename Field::Element_ptr A11=A, A12, A21, A22; const typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr; ca = mr; } else { A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr; ca = kr; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr; cb = kr; } else { B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr; cb = nr; } // S3 = A11 - A21 in C22 fsub(F,la,ca,A11,lda,A21,lda,C22,ldc); // S1 = A21 + A22 in A21 fadd(F,la,ca,A22,lda,A21,lda,A21,lda); // S2 = S1 - A11 in C12 fsub(F,la,ca,A21,lda,A11,lda,C12,ldc); // T1 = B12 - B11 in C21 fsub(F,lb,cb,B12,ldb,B11,ldb,C21,ldc); // P1 = A11 B11 in C11 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, C11, ldc, H); // T3 = B22 - B12 in A11 fsub(F,lb,cb,B22,ldb,B12,ldb,A11,lda); // P7 = S3 T3 in X typename Field::Element_ptr X = fflas_new (F, mr, nr); fgemm2 (F, ta, tb, mr, nr, kr, alpha, C22, ldc, A11, lda, F.zero, X, nr, H); // T2 = B22 - T1 in A11 fsub(F,lb,cb,B22,ldb,C21,ldc,A11,lda); // P5 = S1 T1 in C22 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A21, lda, C21, ldc, F.zero, C22, ldc, H); // S4 = A12 - S2 in C21 fsub(F,la,ca,A12,lda,C12,ldc,C21,ldc); // P3 = S4 B22 in A21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, C21, ldc, B22, ldb, F.zero, A21, lda, H); // P6 = S2 T2 in C21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, C12, ldc, A11, lda, F.zero, C21, ldc, H); // T4 = T2 - B21 in A11 fsubin(F,lb,cb,B21,ldb,A11,lda); // U2 = P1 + P6 in C21 faddin(F,mr,nr,C11,ldc,C21,ldc); // U4 = U2 + P5 in C12 fadd(F,mr,nr,C22,ldc,C21,ldc,C12,ldc); // U3 = U2 + P7 in C21 faddin(F,mr,nr,X,nr,C21,ldc); // U7 = U3 + P5 in C22 faddin(F,mr,nr,C21,ldc,C22,ldc); // U5 = U4 + P3 in C12 faddin(F,la,ca,A21,lda,C12,ldc); // P2 = A12 B21 in X fgemm2 (F, ta, tb, mr, 
nr, kr, alpha, A12, lda, B21, ldb, F.zero, X, nr, H); // U1 = P1 + P2 in C11 faddin(F,mr,nr,X,nr,C11,ldc); fflas_delete (X); // P4 = A22 T4 in A21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, A11, lda, F.zero, A21, lda, H); // U6 = U3 - P4 in C21 fsubin(F,mr,nr,A21,lda,C21,ldc); } // WinogradIP template < class Field, class FieldTrait > inline void Winograd_R_S(const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t mr, const size_t nr, const size_t kr, const typename Field::Element alpha, const typename Field::Element_ptr A,const size_t lda, typename Field::Element_ptr B,const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const MMHelper & WH ) { MMHelper H = WH ; H.recLevel = H.recLevel - 1 ; FFLASFFPACK_check(F.isZero(beta)); FFLASFFPACK_check(kr == nr && kr <= mr); size_t lb, cb, la, ca; const typename Field::Element_ptr A11=A, A12, A21, A22; typename Field::Element_ptr B11=B, B12, B21, B22; typename Field::Element_ptr C11=C, C12=C+nr, C21=C+mr*ldc, C22=C21+nr; if (ta == FflasTrans) { A21 = A + mr; A12 = A + kr*lda; A22 = A12 + mr; la = kr; ca = mr; } else { A12 = A + kr; A21 = A + mr*lda; A22 = A21 + kr; la = mr; ca = kr; } if (tb == FflasTrans) { B21 = B + kr; B12 = B + nr*ldb; B22 = B12 + kr; lb = nr; cb = kr; } else { B12 = B + nr; B21 = B + kr*ldb; B22 = B21 + nr; lb = kr; cb = nr; } // S3 = A11 - A21 in C22 fsub(F,la,ca,A11,lda,A21,lda,C22,ldc); // S1 = A21 + A22 in C21 fadd(F,la,ca,A22,lda,A21,lda,C21,ldc); // T1 = B12 - B11 in C12 fsub(F,lb,cb,B12,ldb,B11,ldb,C12,ldc); // P1 = A11 B11 in C11 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, F.zero, C11, ldc, H); // S2 = S1 - A11 in B11 fsub(F,la,ca,C21,ldc,A11,lda,B11,ldb); // T3 = B22 - B12 in B12 fsub(F,lb,cb,B22,ldb,B12,ldb,B12,ldb); // P7 = S3 T3 in X typename Field::Element_ptr X = fflas_new (F, mr, nr); fgemm2 (F, ta, tb, mr, nr, kr, alpha, C22, ldc, B12, ldb, F.zero, X, nr, H); // T2 = B22 - T1 in B12 
fsub(F,lb,cb,B22,ldb,C12,ldc,B12,ldb); // P5 = S1 T1 in C22 fgemm2 (F, ta, tb, mr, nr, kr, alpha, C21, ldc, C12, ldc, F.zero, C22, ldc, H); // T4 = T2 - B21 in C12 fsub(F,lb,cb,B12,ldb,B21,ldb,C12,ldc); // P6 = S2 T2 in C21 fgemm2 (F, ta, tb, mr, nr, kr, alpha, B11, ldb, B12, ldb, F.zero, C21, ldc, H); // P4 = A22 T4 in B12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A22, lda, C12, ldc, F.zero, B12, ldb, H); // S4 = A12 - S2 in B11 fsub(F,la,ca,A12,lda,B11,ldb,B11,ldb); // U2 = P1 + P6 in C21 faddin(F,mr,nr,C11,ldc,C21,ldc); // U4 = U2 + P5 in C12 fadd(F,mr,nr,C22,ldc,C21,ldc,C12,ldc); // U3 = U2 + P7 in C21 faddin(F,mr,nr,X,nr,C21,ldc); fflas_delete (X); // U7 = U3 + P5 in C22 faddin(F,mr,nr,C21,ldc,C22,ldc); // U6 = U3 - P4 in C21 fsubin(F,mr,nr,B12,ldb,C21,ldc); // P3 = S4 B22 in B12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, B11, ldb, B22, ldb, F.zero, B12, ldb, H); // U5 = U4 + P3 in C12 faddin(F,la,ca,B12,ldb,C12,ldc); // P2 = A12 B21 in B12 fgemm2 (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, F.zero, B12, ldb, H); // U1 = P1 + P2 in C11 faddin(F,mr,nr,B12,ldb,C11,ldc); } // WinogradIP } // BLAS3 } // FFLAS #endif // __FFLASFFPACK_fgemm_winograd_ip_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemv.inl000066400000000000000000000362221274716147400224630ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_fgemv.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fgemv_INL #define __FFLASFFPACK_fgemv_INL #include // DoubleDomain #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #include "fflas-ffpack/fflas/fflas_igemm/igemm.h" #endif namespace FFLAS{ namespace Protected { template inline typename Field::Element_ptr fgemv_convert (const Field& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A,const size_t lda, typename Field::ConstElement_ptr X,const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY) { FFLASFFPACK_check(lda); Givaro::ModularBalanced G((FloatElement) F.characteristic()); FloatElement tmp,alphaf, betaf; F.convert (tmp, beta); G.init(betaf,tmp); F.convert (tmp, alpha); G.init(alphaf,tmp); size_t ma, na; if (ta == FflasTrans) { ma = N; na = M; } else { ma = M; na = N; } // sizet ldaf = na; FloatElement* Af = FFLAS::fflas_new(M*N); FloatElement* Xf = FFLAS::fflas_new(na); FloatElement* Yf = FFLAS::fflas_new(ma); fconvert(F, M, N, Af, N, A, lda); freduce (G, M, N, Af, N); fconvert(F, na, Xf, 1, X, incX); freduce (G, na, Xf, 1); if (!F.isZero(beta)){ fconvert (F, ma, Yf, 1, Y, incY); freduce (G, ma, Yf, 1); } fgemv (G, ta, M, N, alphaf, Af, N, Xf, 1, betaf, Yf, 1); finit(F, ma, Yf, 1, Y, incY); fflas_delete (Af); fflas_delete (Xf); fflas_delete (Yf); return Y; } }// Protected }// FFLAS namespace FFLAS { template inline typename Field::Element_ptr fgemv (const Field& F, const 
FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, MMHelper > & H) { if (F.cardinality() < DOUBLE_TO_FLOAT_CROSSOVER) return Protected::fgemv_convert(F,ta,M,N,alpha,A,lda,X, incX, beta,Y,incY); else if (16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) return Protected::fgemv_convert(F,ta,M,N,alpha,A,lda,X, incX, beta,Y,incY); else { FFPACK::failure()(__func__,__LINE__,"Invalid ConvertTo Mode for this field"); } return Y; } }// FFLAS namespace FFLAS { //--------------------------------------------------------------------- // fgemv: GEneral Matrix Vector Multiplication // Computes Y <- alpha.op(A).X + beta.Y // A is M*N, //--------------------------------------------------------------------- template inline typename Field::Element_ptr fgemv (const Field& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, MMHelper & H) { if (!M) {return Y;} size_t Ydim = (ta == FflasNoTrans)?M:N; size_t Xdim = (ta == FflasNoTrans)?N:M; if (!Xdim || F.isZero (alpha)){ fscalin(F, Ydim, beta, Y, incY); return Y; } typename Field::Element alpha_,beta_; F.assign (alpha_,alpha); F.assign (beta_,beta); if (Protected::AreEqual >::value || Protected::AreEqual >::value){ //Givaro::Modular need to switch to float if p too small if (F.characteristic() < DOUBLE_TO_FLOAT_CROSSOVER) return Protected::fgemv_convert(F,ta,M,N,alpha,A,lda,X,incX,beta,Y,incY); } if (Protected::AreEqual >::value || Protected::AreEqual >::value){ if (16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()) return 
Protected::fgemv_convert(F,ta,M,N,alpha,A,lda,X, incX,beta,Y,incY); else{ // Stay over int64_t MMHelper HG(H); HG.recLevel = 0; if (ta == FflasNoTrans) fgemm(F,FflasNoTrans,FflasNoTrans,M,1,N,alpha,A,lda,X,incX,beta,Y,incY,HG); else fgemm(F,FflasTrans,FflasNoTrans,N,1,M,alpha,A,lda,X,incX,beta,Y,incY,HG); freduce(F,(ta==FflasNoTrans)?M:N, Y,incY); H.initOut(); return Y; } } if ( !F.isOne(alpha) && !F.isMOne(alpha)){ F.assign (alpha_, F.one); F.div (beta_, beta, alpha); } MMHelper HD(F,0); fgemv (F, ta, M, N, alpha_, FFPACK::fflas_const_cast(A), lda, FFPACK::fflas_const_cast(X), incX, beta_, Y, incY, HD); Protected::ScalAndReduce (F, Ydim, alpha, Y, incY, HD); H.initOut(); return Y; } } namespace FFLAS{ template inline typename Field::Element_ptr fgemv (const Field& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, MMHelper & H) { size_t Ydim = (ta==FflasNoTrans)?M:N; if (F.isZero (beta)) fzero (F, Ydim, Y, incY); else { typename Field::Element betadivalpha; FFLASFFPACK_check(!F.isZero(alpha)); F.div (betadivalpha, beta, alpha); fscalin (F, Ydim, betadivalpha, Y, incY); } if (ta == FflasNoTrans) for (size_t i = 0; i < Ydim; ++i) F.addin (Y[i*incY], fdot(F, N, A+i*lda, 1, X, incX)); else for (size_t i = 0; i < Ydim; ++i) F.addin (Y[i*incY], fdot(F, M, A+i, lda, X, incX)); fscalin (F, Ydim, alpha, Y, incY); return Y; } } namespace FFLAS{ template inline typename Field::Element_ptr fgemv (const Field& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, MMHelper & H) { typedef MMHelper 
HelperType; typedef typename HelperType::DelayedField::Element DFElt; typedef typename HelperType::DelayedField::Element_ptr DFElt_ptr; typedef typename HelperType::DelayedField::ConstElement_ptr DFCElt_ptr; DFElt alphadf=alpha, betadf=beta; size_t Ydim = (ta==FflasNoTrans)?M:N; size_t Xdim = (ta==FflasNoTrans)?N:M; if (F.isMOne (alpha)) alphadf = -F.one; else { alphadf = F.one; if (! F.isOne( alpha)) { // Compute y = A*x + beta/alpha.y, then y *= alpha FFLASFFPACK_check(!F.isZero(alpha)); typename Field::Element betadalpha; F.init(betadalpha); F.div (betadalpha, beta, alpha); betadf=betadalpha; } } if (F.isMOne(betadf)) betadf = -F.one; size_t kmax = H.MaxDelayedDim (betadf); if (kmax <= Xdim/2 ){ // Might as well reduce inputs if (H.Amin < H.FieldMin || H.Amax>H.FieldMax){ H.initA(); freduce_constoverride (F, M, N, A, lda); } if (H.Bmin < H.FieldMin || H.Bmax>H.FieldMax){ H.initB(); freduce_constoverride (F, Xdim, X, incX); } if (H.Cmin < H.FieldMin || H.Cmax>H.FieldMax){ H.initC(); freduce (F, Ydim, Y, incY); } kmax = H.MaxDelayedDim (betadf); } if (!kmax){ MMHelper HG(H); H.initOut(); return fgemv (F, ta, M, N, alpha, A, lda, X, incX, beta, Y, incY, HG); } size_t k2 = std::min (Xdim, kmax); size_t nblock = Xdim / kmax; size_t remblock = Xdim % kmax; if (!remblock) { remblock = kmax; --nblock; } size_t shiftA, M1, N1, Mi, Ni; if (ta == FflasTrans) { shiftA = k2*lda; M1 = remblock; Mi = k2; Ni = N1 = N; }else { shiftA = k2; Mi = M1 = M; N1 = remblock; Ni = k2; } MMHelper::field, MMHelperAlgo::Classic, ModeCategories::DefaultBoundedTag> Hfp(H); fgemv (H.delayedField, ta, M1, N1, alphadf, (DFCElt_ptr)A+nblock*shiftA, lda, (DFCElt_ptr)X+nblock*k2*incX, incX, betadf, (DFElt_ptr)Y, incY, Hfp); for (size_t i = 0; i < nblock; ++i) { freduce (F, Ydim ,Y, incY); Hfp.initC(); fgemv (H.delayedField, ta, Mi, Ni, alphadf, (DFCElt_ptr)A+i*shiftA, lda, (DFCElt_ptr)X+i*k2*incX, incX, F.one, (DFElt_ptr)Y, incY, Hfp); } if (!F.isOne(alpha) && !F.isMOne(alpha)){ DFElt al; 
F.convert(al, alpha); if (al<0) al = -al; if (std::max(-Hfp.Outmin, Hfp.Outmax) > Hfp.MaxStorableValue/al){ freduce (F, Ydim, Y, incY); Hfp.initOut(); } fscalin (H.delayedField, Ydim, alpha, (DFElt_ptr)Y, incY); if (alpha>0){ H.Outmin = al*Hfp.Outmin; H.Outmax = al*Hfp.Outmax; } else { H.Outmin = -al*Hfp.Outmax; H.Outmax = -al*Hfp.Outmin; } }else { H.Outmin = Hfp.Outmin; H.Outmax = Hfp.Outmax; } return Y; } } namespace FFLAS{ template inline typename Field::Element_ptr fgemv (const Field& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY) { if (!M) {return Y;} size_t Ydim = (ta == FflasNoTrans)?M:N; size_t Xdim = (ta == FflasNoTrans)?N:M; if (!Xdim || F.isZero (alpha)){ fscalin(F, Ydim, beta, Y, incY); return Y; } MMHelper HW (F, 0); return fgemv (F, ta, M, N, alpha, FFPACK::fflas_const_cast(A), lda, FFPACK::fflas_const_cast(X), incX, beta, Y, incY, HW); } } namespace FFLAS{ inline Givaro::ZRing::Element_ptr fgemv (const Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const int64_t alpha, const int64_t* A, const size_t lda, const int64_t* X, const size_t incX, const int64_t beta, int64_t* Y, const size_t incY, MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> & H) { FFLASFFPACK_check(lda); #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) if (ta == FflasNoTrans) igemm_ (FflasRowMajor, ta, FflasNoTrans,M,1,N,alpha,A,lda,X,incX,beta,Y,incY); else igemm_ (FflasRowMajor, ta, FflasNoTrans,N,1,M,alpha,A,lda,X,incX,beta,Y,incY); #else if (ta == FflasNoTrans){ int64_t* Yi=Y; for (size_t i=0;i & H) { FFLASFFPACK_check(lda); cblas_dgemv (CblasRowMajor, (CBLAS_TRANSPOSE) ta, (int)M, (int)N, (Givaro::DoubleDomain::Element) alpha, A, (int)lda, X, (int)incX, (Givaro::DoubleDomain::Element) beta, Y, 
(int)incY); return Y; } template inline typename Field::Element_ptr fgemv (const Field& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const typename Field::Element alpha, const typename Field::ConstElement_ptr A, const size_t lda, const typename Field::ConstElement_ptr X, const size_t incX, const typename Field::Element beta, typename Field::Element_ptr Y, const size_t incY, MMHelper & H) { H.setOutBounds((ta ==FflasNoTrans)?N:M, alpha, beta); MMHelper Hb(F,0); return fgemv(F, ta, M, N, alpha, A, lda, X, incX, beta, Y, incY, Hb); } inline Givaro::FloatDomain::Element_ptr fgemv (const Givaro::FloatDomain& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const Givaro::FloatDomain::Element alpha, const Givaro::FloatDomain::ConstElement_ptr A, const size_t lda, const Givaro::FloatDomain::ConstElement_ptr X, const size_t incX, const Givaro::FloatDomain::Element beta, Givaro::FloatDomain::Element_ptr Y, const size_t incY, MMHelper & H) { FFLASFFPACK_check(lda); cblas_sgemv (CblasRowMajor, (CBLAS_TRANSPOSE) ta, (int)M, (int)N, (Givaro::FloatDomain::Element) alpha, A, (int)lda, X, (int)incX, (Givaro::FloatDomain::Element) beta, Y, (int)incY); return Y; } } #endif // __FFLASFFPACK_fgemv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fgemv_mp.inl000066400000000000000000000122021274716147400231470ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fgemv_mp_INL #define __FFLASFFPACK_fgemv_mp_INL #include "fflas-ffpack/field/rns-integer-mod.h" namespace FFLAS { // specialization of the fgemv function for the field RNSInteger inline FFPACK::rns_double::Element_ptr fgemv (const FFPACK::RNSInteger& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::ConstElement_ptr A, const size_t lda, FFPACK::rns_double::ConstElement_ptr X, const size_t incX, const FFPACK::rns_double::Element beta, FFPACK::rns_double::Element_ptr Y, const size_t incY, MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> & H) { if (M!=0 && N !=0){ for (size_t i=0;i inline FFPACK::rns_double::Element_ptr fgemv (const FFPACK::RNSIntegerMod& F, const FFLAS_TRANSPOSE ta, const size_t M, const size_t N, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::ConstElement_ptr A, const size_t lda, FFPACK::rns_double::ConstElement_ptr X, const size_t incX, const FFPACK::rns_double::Element beta, FFPACK::rns_double::Element_ptr Y, const size_t incY, MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> & H) { //std::cout<<"HERE 1"<, MMHelperAlgo::Classic, ModeCategories::DefaultTag > H2; //std::cout<<"HERE 2"< inline Givaro::Integer* fgemv (const Givaro::ZRing& F, const FFLAS_TRANSPOSE ta, const size_t m, const size_t n, const Givaro::Integer alpha, Givaro::Integer* A, const size_t lda, Givaro::Integer* X, const size_t ldx, Givaro::Integer beta, 
Givaro::Integer* Y, const size_t ldy, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) { MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; fgemm(F,ta,FFLAS::FflasNoTrans, (ta==FFLAS::FflasNoTrans)?m:n, 1,(ta==FFLAS::FflasNoTrans)?n:m, alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; } // specialization of the fgemv function for the field Givaro::Modular // Calling fgemm, TODO: really specialize fgemv inline Givaro::Integer* fgemv (const Givaro::Modular& F, const FFLAS_TRANSPOSE ta, const size_t m, const size_t n, const Givaro::Integer alpha, Givaro::Integer* A, const size_t lda, Givaro::Integer* X, const size_t ldx, Givaro::Integer beta, Givaro::Integer* Y, const size_t ldy, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) { MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo, ParSeqHelper::Sequential> H2; fgemm(F,ta,FFLAS::FflasNoTrans,(ta==FFLAS::FflasNoTrans)?m:n,1,(ta==FFLAS::FflasNoTrans)?n:m,alpha,A,lda,X,ldx,beta,Y,ldy,H2); return Y; } } // end namespace FFLAS #endif fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fger.inl000066400000000000000000000270331274716147400223020ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_fger.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fger_INL #define __FFLASFFPACK_fger_INL namespace FFLAS { template inline void fger (const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr x, const size_t incx, typename Field::ConstElement_ptr y, const size_t incy, typename Field::Element_ptr A, const size_t lda) { MMHelper H(F,0); fger (F, M, N, alpha, x, incx, y, incy, A, lda, H); freduce (F, M, N, A, lda); } } //FFLAS namespace FFLAS { namespace Protected { template inline void fger_convert (const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr x, const size_t incx, typename Field::ConstElement_ptr y, const size_t incy, typename Field::Element_ptr A, const size_t lda) { Givaro::ModularBalanced G((FloatElement) F.characteristic()); FloatElement alphaf; F.convert (alphaf, alpha); FloatElement* Af = fflas_new (G,M,N); FloatElement* Xf = fflas_new (G,M,1); FloatElement* Yf = fflas_new (G,N,1); fconvert(F, M, N, Af, N, A, lda); freduce(G, M, N, Af, N); fconvert(F, M, Xf, 1, x, incx); freduce(G, M, Xf, 1); fconvert(F, N, Yf, 1, y, incy); freduce(G, N, Yf, 1); fger (G, M, N, alphaf, Xf, 1, Yf, 1, Af, N); finit (F, M, N, Af, N, A, lda); fflas_delete (Af); fflas_delete (Xf); fflas_delete (Yf); } }// Protected }// FFLAS namespace FFLAS{ template inline void fger (const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr x, const size_t incx, typename Field::ConstElement_ptr y, const size_t incy, typename Field::Element_ptr A, const size_t lda, MMHelper > & H) { if (F.isZero(alpha)) { return ; } if 
(F.cardinality() < DOUBLE_TO_FLOAT_CROSSOVER){ return Protected::fger_convert(F,M,N,alpha,x, incx, y,incy, A, lda); } else if (16*F.cardinality() < Givaro::ModularBalanced::maxCardinality()){ return Protected::fger_convert(F,M,N,alpha,x, incx, y,incy, A, lda); } else { FFPACK::failure()(__func__,__LINE__,"Invalid ConvertTo Mode for this field"); } } template inline void fger (const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr x, const size_t incx, typename Field::ConstElement_ptr y, const size_t incy, typename Field::Element_ptr A, const size_t lda, MMHelper & H) { if (F.isZero(alpha)) { return ; } typename Field::Element tmp; typename Field::ConstElement_ptr xi=x, yj=y; typename Field::Element_ptr Ai=A; if ( M < N ){ if ( F.isOne( alpha ) ) for ( ; Ai < A+M*lda; Ai+=lda, xi+=incx ){ yj = y; for (size_t j = 0; j < N; ++j, yj+=incy ) F.axpyin( *(Ai+j), *xi, *yj ); } else if ( F.isMOne( alpha ) ) for ( ; Ai < A+M*lda; Ai+=lda, xi+=incx ){ F.neg( tmp, *xi ); yj = y; for (size_t j = 0; j < N; ++j, yj+=incy ) F.axpyin( *(Ai+j), tmp, *yj ); } else for ( ; Ai < A+M*lda; Ai+=lda, xi+=incx ){ F.mul( tmp, alpha, *xi ); yj = y; for (size_t j = 0; j < N; ++j, yj+=incy ) F.axpyin( *(Ai+j), tmp, *yj ); } } else { if ( F.isOne( alpha ) ){ for ( ; Ai < A+N; ++Ai, yj+=incy ){ xi = x; for (size_t i = 0; i < M; ++i, xi+=incx ) F.axpyin( *(Ai+i*lda), *xi, *yj ); } } else if ( F.isMOne( alpha ) ) for ( ; Ai < A+N; ++Ai, yj+=incy ){ F.neg( tmp, *yj ); xi = x; for (size_t i = 0; i < M; ++i, xi+=incx ) F.axpyin( *(Ai+i*lda), *xi, tmp ); } else for ( ; Ai < A+N; ++Ai, yj+=incy ){ F.mul( tmp, alpha, *yj ); xi = x; for (size_t i = 0; i < M; ++i, xi+=incx ) F.axpyin( *(Ai+i*lda), *xi, tmp ); } } } inline void fger( const Givaro::DoubleDomain& F, const size_t M, const size_t N, const Givaro::DoubleDomain::Element alpha, const Givaro::DoubleDomain::ConstElement_ptr x, const size_t incx, const 
Givaro::DoubleDomain::ConstElement_ptr y, const size_t incy, Givaro::DoubleDomain::Element_ptr A, const size_t lda, MMHelper & H) { if (F.isZero(alpha)) { return ; } FFLASFFPACK_check(lda); cblas_dger( CblasRowMajor, (int)M, (int)N, alpha, x, (int)incx, y, (int)incy, A, (int)lda ); } template inline void fger(const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, const typename Field::ConstElement_ptr x, const size_t incx, const typename Field::ConstElement_ptr y, const size_t incy, typename Field::Element_ptr A, const size_t lda, MMHelper & H) { H.setOutBounds (1, alpha, 1.0); MMHelper Hd(F,0); fger (F, M, N, alpha, x, incx, y, incy, A, lda, Hd); } inline void fger( const Givaro::FloatDomain& F, const size_t M, const size_t N, const Givaro::FloatDomain::Element alpha, const Givaro::FloatDomain::ConstElement_ptr x, const size_t incx, const Givaro::FloatDomain::ConstElement_ptr y, const size_t incy, Givaro::FloatDomain::Element_ptr A, const size_t lda, MMHelper & H) { if (F.isZero(alpha)) { return ; } FFLASFFPACK_check(lda); cblas_sger( CblasRowMajor, (int)M, (int)N, alpha, x, (int)incx, y, (int)incy, A, (int)lda ); } template inline void fger (const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr x, const size_t incx, typename Field::ConstElement_ptr y, const size_t incy, typename Field::Element_ptr A, const size_t lda, MMHelper & H) { if (F.isZero(alpha)) { return ; } typedef MMHelper HelperType; typedef typename HelperType::DelayedField delayedField; typedef typename HelperType::DelayedField::Element DFElt; typedef typename HelperType::DelayedField::ConstElement_ptr DFCElt_ptr; typedef typename HelperType::DelayedField::Element_ptr DFElt_ptr; typedef typename Field::Element Element; typedef typename Field::Element_ptr Element_ptr; typedef MMHelper DelayedHelperType; DelayedHelperType Hfp(H); if (Hfp.MaxDelayedDim(1.0) < 1){ if (Hfp.Amin < H.FieldMin || 
Hfp.Amax>H.FieldMax){ Hfp.initA(); freduce_constoverride (F, M, x, incx); } if (Hfp.Bmin < H.FieldMin || Hfp.Bmax>H.FieldMax){ Hfp.initB(); freduce_constoverride (F, N, y, incy); } if (Hfp.Cmin < H.FieldMin || Hfp.Cmax>H.FieldMax){ Hfp.initC(); freduce (F, M, N, A, lda); } } Hfp.Outmin = Hfp.FieldMin; Hfp.Outmax = Hfp.FieldMax; if (F.isOne(alpha) || F.isMOne(alpha)){ DFElt alphadf; if (F.isMOne( alpha)) alphadf = -F.one; else alphadf = F.one; fger (H.delayedField, M, N, alphadf, (DFCElt_ptr)x, incx, (DFCElt_ptr)y, incy, (DFElt_ptr)A, lda, Hfp); H.Outmin = Hfp.Outmin; H.Outmax = Hfp.Outmax; } else { Element_ptr sY = FFLAS::fflas_new (N); fscal(F, N, alpha, y, incy, sY, 1); fger (H.delayedField, M, N, 1.0, (DFCElt_ptr)x, incx, (DFCElt_ptr) sY, 1, (DFElt_ptr)A, lda, Hfp); FFLAS::fflas_delete(sY); H.setOutBounds (1, alpha, 1.0); } } template inline void fger (const Field& F, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr x, const size_t incx, typename Field::ConstElement_ptr y, const size_t incy, typename Field::Element_ptr A, const size_t lda, MMHelper & H) { if (F.isZero(alpha)) { return ; } if (Protected::AreEqual >::value || Protected::AreEqual >::value){ if (F.cardinality() < Givaro::ModularBalanced::maxCardinality()) return Protected::fger_convert(F,M,N,alpha,x,incx,y,incy, A,lda); else{ // Stay over int64_t MMHelper HG(H); HG.recLevel = 0; fgemm(F,FflasNoTrans,FflasNoTrans,M,N,1,alpha,x,incx,y,incy,F.one,A,lda,HG); freduce(F,M,N,A,lda); H.initOut(); return; } } typedef MMHelper ModularHelperType; typedef typename ModularHelperType::DelayedField delayedField; typedef typename delayedField::Element DFElt; typedef typename delayedField::ConstElement_ptr DFCElt_ptr; typedef typename delayedField::Element_ptr DFElt_ptr; typedef typename Field::Element Element; typedef typename Field::Element_ptr Element_ptr; typedef MMHelper DelayedHelperType; DelayedHelperType Hfp(H); if (F.isOne(alpha) || F.isMOne(alpha)){ 
DFElt alphadf; if (F.isMOne( alpha)) alphadf = -F.one; else alphadf = F.one; fger (H.delayedField, M, N, alphadf, (DFCElt_ptr)x, incx, (DFCElt_ptr)y, incy, (DFElt_ptr)A, lda, Hfp); } else { Element_ptr sY = FFLAS::fflas_new (N); fscal(F, N, alpha, y, incy, sY, 1); fger (H.delayedField, M, N, H.delayedField.one, (DFCElt_ptr)x, incx, (DFCElt_ptr)sY, (size_t)1, (DFElt_ptr)A, lda, Hfp); FFLAS::fflas_delete(sY); } H.initOut(); } } // FFLAS //#include "fflas-ffpack/fflas/fflas_fger_mp.inl" moved to fflas.h #endif // __FFLASFFPACK_fger_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fger_mp.inl000066400000000000000000000071131274716147400227730ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas_fgemm/fgemm_classical_mp.inl * @brief matrix multiplication with multiprecision input (either over Z or over Z/pZ) */ #ifndef __FFPACK_fger_mp_INL #define __FFPACK_fger_mp_INL #include #include #include "fflas-ffpack/fflas/fflas_helpers.inl" #include "fflas-ffpack/fflas/fflas_fgemm/fgemm_classical_mp.inl" #include "fflas-ffpack/field/rns-integer.h" #include "fflas-ffpack/field/rns-integer-mod.h" namespace FFLAS{ inline void fger (const Givaro::Modular& F, const size_t M, const size_t N, const typename Givaro::Integer alpha, typename Givaro::Integer* x, const size_t incx, typename Givaro::Integer* y, const size_t incy, typename Givaro::Integer* A, const size_t lda, MMHelper, MMHelperAlgo::Classic, ModeCategories::ConvertTo > & H) { MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> H2; FFLAS::fger(F,M,N,alpha,x,incx,y,incy,A,lda,H2); } template inline void fger (const FFPACK::RNSInteger& F, const size_t M, const size_t N, const typename FFPACK::RNSInteger::Element alpha, typename FFPACK::RNSInteger::Element_ptr x, const size_t incx, typename FFPACK::RNSInteger::Element_ptr y, const size_t incy, typename FFPACK::RNSInteger::Element_ptr A, const size_t lda, MMHelper, MMHelperAlgo::Classic, ModeCategories::DefaultTag> & H) { for(size_t i=0;i inline void fger (const FFPACK::RNSIntegerMod& F, const size_t M, const size_t N, const typename FFPACK::RNSIntegerMod::Element alpha, typename FFPACK::RNSIntegerMod::Element_ptr x, const size_t incx, typename FFPACK::RNSIntegerMod::Element_ptr y, const size_t incy, typename FFPACK::RNSIntegerMod::Element_ptr A, const size_t lda, MMHelper, MMHelperAlgo::Classic> & H) { typedef FFPACK::RNSInteger RnsDomain; MMHelper H2; RnsDomain Zrns(F.rns()); FFLAS::fger(Zrns,M,N,alpha,x,incx,y,incy,A,lda,H2); // reduce the result mod p freduce (F, M, N, A, lda); } } // namespace FFLAS #endif 
fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_freduce.h000066400000000000000000000120621274716147400224350ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_freduce.inl * Copyright (C) 2014 FFLAS FFPACK group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_freduce_H #define __FFLASFFPACK_fflas_freduce_H #include "fflas-ffpack/fflas/fflas_simd.h" #include "fflas-ffpack/field/field-traits.h" #include "fflas-ffpack/utils/cast.h" namespace FFLAS { template struct support_simd_mod : public std::false_type {} ; #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template<> struct support_simd_mod : public std::true_type {} ; template<> struct support_simd_mod : public std::true_type {} ; #ifdef SIMD_INT template<> struct support_simd_mod : public std::true_type {} ; #endif // SIMD_INT #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS } // FFLAS #include "fflas-ffpack/fflas/fflas_freduce.inl" namespace FFLAS { /***************************/ /* LEVEL 1 */ /***************************/ template void freduce (const Field & F, const size_t m, typename Field::ConstElement_ptr B, const size_t incY, typename Field::Element_ptr A, const size_t incX) { return details::freduce (F,m,B,incY,A,incX,typename FieldTraits::category()); } template void freduce (const Field & F, const size_t m, typename Field::Element_ptr A, const size_t incX) { return details::freduce (F,m,A,incX,typename FieldTraits::category()); } template void freduce_constoverride(const Field & F, const size_t m, typename Field::ConstElement_ptr A, const size_t incX) { return freduce(F, m, FFPACK::fflas_const_cast(A), incX); } // OOOPS // CP: to be moved to a fflas_finit field, if ever needed template void finit (const Field& F, const size_t n, ConstOtherElement_ptr Y, const size_t incY, typename Field::Element_ptr X, const size_t incX) { typename Field::Element_ptr Xi = X ; ConstOtherElement_ptr Yi = Y ; if (incX == 1 && incY == 1) for (; Yi < Y + n ; ++Xi, ++Yi) F.init( *Xi , *Yi); else for (; Yi < Y+n*incY; Xi+=incX, Yi += incY ) F.init( *Xi , *Yi); } /***************************/ /* LEVEL 2 */ /***************************/ template void freduce (const Field& F, const size_t m , const size_t n, typename Field::Element_ptr A, const size_t lda) { 
if (n == lda) freduce (F, n*m, A, 1); else for (size_t i = 0 ; i < m ; ++i) freduce (F, n, A+i*lda, 1); return; } template void pfreduce (const Field& F, const size_t m , const size_t n, typename Field::Element_ptr A, const size_t lda, const size_t numths) { SYNCH_GROUP( FORBLOCK1D(iter, m, SPLITTER(numths), size_t rowsize= iter.end()-iter.begin(); TASK(MODE(CONSTREFERENCE(F) READWRITE(A[iter.begin()*lda])), freduce (F, rowsize, n, A+iter.begin()*lda, lda); ); ); ); return; } template void freduce (const Field& F, const size_t m , const size_t n, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::Element_ptr A, const size_t lda) { for (size_t i = 0 ; i < m ; ++i) { freduce(F,n,B+i*ldb,1,A+i*lda,1); } } template void freduce_constoverride(const Field & F, const size_t m, const size_t n, typename Field::ConstElement_ptr A, const size_t lda) { return freduce(F, m, n, FFPACK::fflas_const_cast(A), lda); } // CP: to be moved to a fflas_finit field, if ever needed template void finit (const Field& F, const size_t m , const size_t n, const OtherElement_ptr B, const size_t ldb, typename Field::Element_ptr A, const size_t lda) { if (n == lda && n == ldb) finit (F, n*m, B, 1, A, 1); else for (size_t i = 0 ; i < m ; ++i) finit (F, n, B + i*ldb, 1, A + i*lda, 1); return; } } // end of namespace FFLAS //#include "fflas_freduce_mp.inl" moved to fflas.h #endif // __FFLASFFPACK_fflas_freduce_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_freduce.inl000066400000000000000000000476571274716147400230120ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_freduce.inl * Copyright (C) 2014 Pascal Giorgi * * Written by Pascal Giorgi * Brice Boyer (briceboyer) * * Part of this code is taken from http://libdivide.com/ * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_freduce_INL #define __FFLASFFPACK_fflas_freduce_INL #include #include "fflas-ffpack/fflas/fflas_fassign.h" #include "fflas-ffpack/utils/bit_manipulation.h" #define FFLASFFPACK_COPY_REDUCE 32 /* TO BENCMARK LATER */ namespace FFLAS { namespace vectorised { /* for casts (?) */ template inline typename std::enable_if< ! std::is_integral::value, T>::type monfmod(T A, T B) { return fmod(A,B); } template inline typename std::enable_if< std::is_integral::value, T>::type monfmod(T A, T B) { return A % B; // B > 0 } template<> inline Givaro::Integer monfmod(Givaro::Integer A, Givaro::Integer B) // @bug B is not integer, but uint64_t usually { return A % B; // B > 0 } template<> inline float monfmod(float A, float B) { return fmodf(A,B); } template<> inline double monfmod(double A, double B) { //std::cerr<<"fmod"< inline RecInt::rmint& monfmod(RecInt::rmint& A, RecInt::rmint& B) { return RecInt::rmint::mod_n(A, B); } template inline typename std::enable_if< ! std::is_integral::value, T>::type monrint(T A)// @bug pass by reference ? 
{ return rint(A); } template inline typename std::enable_if< std::is_integral::value, T>::type monrint( T A) { return A ; } template<> inline double monrint(double A) { return rint(A); } template<> inline float monrint(float A) { return rintf(A); } template<> inline Givaro::Integer monrint(Givaro::Integer A) // @bug B is not integer, but uint64_t usually { return A ; // B > 0 } template inline int64_t monfmod(int64_t A, int64_t p, int8_t shifter, int64_t magic) { if (poweroftwo) { //shift path int64_t q = A + ((A >> 63) & ((1_i64 << shifter) - 1)); q = A - ((q>>shifter)<< shifter) ; return (q<0)?(q+p):q ; } else { int64_t q = mulhi_64(magic, A); if (overflow) { q += A ; } q >>= shifter; A = A - q * p ; if (A >= p) A-= p ; // because of mulhi_fast return A ; } } } // vectorised } // FFLAS namespace FFLAS { namespace vectorised { template inline void fast_mod_generate(bool & overflow, bool & poweroftwo, int8_t & shift, T & magic, T denom) { overflow = false ; poweroftwo = false ; shift = 0 ; magic = 0 ; } //! @pre d > 0 template<> inline void fast_mod_generate(bool & overflow, bool & poweroftwo, int8_t & shift, int64_t & magic, int64_t denom) { // overflow = false ; // poweroftwo = false ; // shift = 0 ; // magic = 0 ; if ((denom & (denom- 1)) == 0) { shift = (int8_t)ctz((uint64_t)denom) ; magic = 0; poweroftwo = true ; } else { const uint32_t floor_log_2_d = 63 - clz((uint64_t)denom); /*the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is floor_log_2_d - 1 */ uint64_t rem, proposed_m; proposed_m = getpoweroftwoden_128(floor_log_2_d, denom, &rem); const uint64_t e = denom- rem; /* We are going to start with a power of floor_log_2_d - 1. This works if works if e < 2**floor_log_2_d. */ if (e < (1_ui64 << floor_log_2_d)) { /* This power works */ shift = (int8_t)(floor_log_2_d - 1); } else { /* We need to go one higher. This should not make proposed_m overflow, but it will make it negative when interpreted as an int32_t. 
*/ proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= (uint64_t)denom || twice_rem < rem) proposed_m += 1; shift = (int8_t) floor_log_2_d ; overflow = true ; } proposed_m += 1; magic = (int64_t)proposed_m ; } } template::value> struct HelperMod ; template struct HelperMod { bool overflow = false ; bool poweroftwo = false ; int8_t shift = 0 ; typename Field::Element magic = (typename Field::Element)0 ; typename Field::Element p; HelperMod() { // std::cout << "empty cstor called" << std::endl; } ; HelperMod( const Field & F) { // std::cout << "field cstor called" << std::endl; p = (typename Field::Element) F.characteristic(); fast_mod_generate(overflow, poweroftwo, shift, magic, p); // std::cout << overflow << ',' << poweroftwo << std::endl; // std::cout << (int) shift << ',' << magic << std::endl; // std::cout << this->shift << std::endl; } int getAlgo() const { // std::cout << "will be " << (2*overflow + poweroftwo) << std::endl; return 2* (int)overflow + (int) poweroftwo ; // return overflow << 1 | poweroftwo ; } } ; template struct HelperMod { typename Field::Element p; typename Field::Element invp; // typename Field::Elmeent min ; // typename Field::Elmeent max ; HelperMod() {} ; HelperMod( const Field & F) { p = (typename Field::Element) F.characteristic(); invp = (typename Field::Element)1/p; // min = F.minElement(); // max = F.maxElement(); } int getAlgo() const { return 0; } } ; template struct HelperMod { typename Field::Element p; // typename Field::Element invp; // typename Field::Elmeent min ; // typename Field::Elmeent max ; HelperMod() {} ; HelperMod( const Field & F) { p = (typename Field::Element) F.characteristic(); // invp = (typename Field::Element)1/p; // min = F.minElement(); // max = F.maxElement(); } int getAlgo() const { return 0; } } ; template struct HelperMod { typename Field::Element p; // typename Field::Element invp; // typename Field::Elmeent min ; // typename Field::Elmeent max ; HelperMod() {} ; 
HelperMod( const Field & F) { p = (typename Field::Element) F.characteristic(); // invp = (typename Field::Element)1/p; // min = F.minElement(); // max = F.maxElement(); } int getAlgo() const { return 0; } } ; #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template::value> struct HelperModSimd ; template struct HelperModSimd : public HelperMod { typedef typename SimdT::vect_t vect_t ; // bool overflow ; // int8_t shift ; // typename Field::Element p; typename Field::Element magic ; vect_t M ; vect_t P ; vect_t MIN ; vect_t MAX ; vect_t NEGP ; vect_t Q ; vect_t T ; HelperModSimd ( const Field & F) : HelperMod(F) { // std::cout << "HelperMod constructed " << this->shift << std::endl; // p = F.characteristic(); P = SimdT::set1(this->p); NEGP = SimdT::set1(-this->p); MIN = SimdT::set1(F.minElement()); MAX = SimdT::set1(F.maxElement()); // fast_mod_generate(overflow, shift, magic, p); M = SimdT::set1(magic); } HelperModSimd( const Field & F, const HelperMod & G) { this->overflow=G.overflow; this->poweroftwo=G.poweroftwo; this->shift=G.shift; this->magic=G.magic; this->p=G.p; // std::cout << "magic is = " << this->magic<< ',' << G.magic<< std::endl; P = SimdT::set1(this->p); NEGP = SimdT::set1(-(this->p)); MIN = SimdT::set1(F.minElement()); MAX = SimdT::set1(F.maxElement()); // fast_mod_generate(overflow, shift, magic, p); M = SimdT::set1(magic); } } ; template struct HelperModSimd : public HelperMod { typedef typename SimdT::vect_t vect_t ; vect_t INVP; vect_t MIN ; vect_t MAX ; vect_t NEGP ; vect_t P ; vect_t Q ; vect_t T ; HelperModSimd( const Field & F) : HelperMod(F) { P = SimdT::set1(this->p); NEGP = SimdT::set1(-(this->p)); // MIN = SimdT::set1(max); MIN = SimdT::set1(F.minElement()); // MAX = SimdT::set1(min); MAX = SimdT::set1(F.maxElement()); INVP = SimdT::set1(this->invp); } HelperModSimd( const Field & F, const HelperMod & G) { this->p = G.p; this->invp = G.invp ; P = SimdT::set1(this->p); NEGP = SimdT::set1(-this->p); // MIN = SimdT::set1(max); MIN = 
SimdT::set1(F.minElement()); // MAX = SimdT::set1(min); MAX = SimdT::set1(F.maxElement()); INVP = SimdT::set1(this->invp); } } ; #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #ifdef __x86_64__ template typename std::enable_if< std::is_same::value , int64_t>::type monfmod (typename Field::Element A, HelperMod & H) { switch(ALGO) { case 3 : // std::cout << 3 << std::endl; return monfmod (A,H.p,H.shift,H.magic); case 2 : // std::cout << 2 << std::endl; return monfmod (A,H.p,H.shift,H.magic); case 1 : // std::cout << 1 << std::endl; return monfmod (A,H.p,H.shift,H.magic); case 0 : // std::cout << "using " << 0 << std::endl; return monfmod(A,H.p,H.shift,H.magic); default : FFLASFFPACK_abort("unknown algo"); } } #endif // __x86_64__ template #ifdef __x86_64__ typename std::enable_if< ! std::is_same::value , typename Field::Element>::type #else typename Field::Element #endif // __x86_64__ monfmod (typename Field::Element A, HelperMod & H) { return monfmod(A,H.p); } template typename Field::Element monfmod (typename Field::Element A, HelperMod & H) { return monfmod(A,H.p); } template typename Field::Element monfmod (typename Field::Element A, HelperMod & H) { return monfmod(A,H.p); } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void VEC_MOD(typename SimdT::vect_t & C, HelperModSimd & H) { C = SimdT::mod( C, H.P, H.INVP, H.NEGP, H.MIN, H.MAX, H.Q, H.T ); } template inline void VEC_MOD(typename SimdT::vect_t & C, HelperModSimd & H) { // std::cout << "magic " << H.magic<< std::endl; // std::cout << H.P << std::endl; switch (ALGO) { case 0 : C = SimdT::template mod( C, H.P, H.shift, H.M, H.NEGP, H.MIN, H.MAX, H.Q, H.T ); break; case 1 : C = SimdT::template mod ( C, H.P, H.shift, H.M, H.NEGP, H.MIN, H.MAX, H.Q, H.T ); break; case 2 : C = SimdT::template mod ( C, H.P, H.shift, H.M, H.NEGP, H.MIN, H.MAX, H.Q, H.T ); break; case 3 : C = SimdT::template mod ( C, H.P, H.shift, H.M, H.NEGP, H.MIN, H.MAX, H.Q, H.T ); break; } } #endif // 
__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS } // vectorised } // FFLAS namespace FFLAS { namespace vectorised { namespace unswitch { #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline typename std::enable_if::value, void>::type modp(const Field &F, typename Field::ConstElement_ptr U, const size_t & n, typename Field::Element_ptr T , HelperMod & G ) { // std::cerr<<"modp vectorized"<; using vect_t = typename simd::vect_t; bool positive = ! FieldTraits::balanced ; // known at compile time HelperModSimd H(F,G); size_t i = 0; if (n < simd::vect_size) { // std::cerr<< n<< " < "<(T[i],H); } else { T[i]=monfmod(U[i],H); } if (!positive) { T[i]-=(T[i]>max)?H.p:0; } T[i]+=(T[i](st) ; j < simd::alignment ; j += sizeof(Element), i++) { if (round) { T[i] = monrint(U[i]); T[i] = monfmod(T[i],H); } else { T[i] = monfmod(U[i],H); } if (!positive) { T[i] -= (T[i] > max) ? H.p : 0; } T[i] += (T[i] < min) ? H.p : 0; } } FFLASFFPACK_check((long(T+i) % simd::alignment == 0)); vect_t C ; if((long(U+i) % simd::alignment == 0)) { // perform the loop using 256 bits SIMD for (; i<= n - simd::vect_size ; i += simd::vect_size) { C = simd::load(U + i); if (round) { C = simd::round(C); } VEC_MOD(C,H); simd::store(T+i, C); } } // perform the last elt from T without SIMD // std::cerr<< n-i<< " unaligned elements left "<(T[i],H); } else { T[i] = monfmod(U[i],H); } if (!positive) { T[i] -= (T[i] > max) ? H.p : 0; } T[i] += (T[i] < min) ? 
H.p : 0; } } #endif // not vectorised but allows better code than % or fmod via helper template inline typename std::enable_if< !FFLAS::support_simd_mod::value, void>::type modp(const Field &F, typename Field::ConstElement_ptr U, const size_t & n, typename Field::Element_ptr T , HelperMod & H ) { // std::cerr<<"modp not vectorized"<::balanced ; size_t i = 0; for (; i < n ; i++) { if (round) { T[i] = monrint(U[i]); T[i] = monfmod(T[i],H); } else { T[i]=monfmod(U[i],H); } if (!positive) { T[i]-=(T[i]>max)?H.p:(typename Field::Element)0; } T[i]+=(T[i] //inline typename std::enable_if::value, void>::type void modp(const Field &F, typename Field::ConstElement_ptr U, const size_t & n, typename Field::Element_ptr T) { HelperMod H(F); int ALGO = H.getAlgo(); switch (ALGO) { case 0 : unswitch::modp(F,U,n,T,H); break; case 1 : unswitch::modp(F,U,n,T,H); break; case 2 : unswitch::modp(F,U,n,T,H); break; case 3 : unswitch::modp(F,U,n,T,H); break; } } } // vectorised } // FFLAS namespace FFLAS { namespace details { // specialised template typename std::enable_if::value, void>::type freduce (const Field & F, const size_t m, typename Field::Element_ptr A, const size_t incX, FieldCategories::ModularTag) { if(incX == 1) { vectorised::modp(F,A,m,A); } else { /* faster with copy, use incX=1, copy back ? */ if (m < FFLASFFPACK_COPY_REDUCE) { typename Field::Element_ptr Xi = A ; for (; Xi < A+m*incX; Xi+=incX ) F.reduce(*Xi); } else { typename Field::Element_ptr Ac = fflas_new (F,m,1) ; fassign (F,m,A,incX,Ac,1); freduce (F,m,Ac,1,FieldCategories::ModularTag()); fassign (F,m,Ac,1,A,incX); fflas_delete (Ac); } } } template typename std::enable_if< ! FFLAS::support_simd_mod::value, void>::type freduce (const Field & F, const size_t m, typename Field::Element_ptr A, const size_t incX, FieldCategories::ModularTag) { /* ??? ( faster with copy, use incX=1, copy back ? */ // CP: no SIMD supported here! 
// if(incX == 1) { // vectorised::modp(F,A,m,A); // } // else { typename Field::Element_ptr Xi = A ; for (; Xi < A+m*incX; Xi+=incX ) F.reduce(*Xi); // } } template void freduce (const Field & F, const size_t m, typename Field::Element_ptr A, const size_t incX, FieldCategories::GenericTag) { typename Field::Element_ptr Xi = A ; for (; Xi < A+m*incX; Xi+=incX ) F.reduce (*Xi); } template void freduce (const Field & F, const size_t m, typename Field::Element_ptr A, const size_t incX, FieldCategories::UnparametricTag) { typename Field::Element_ptr Xi = A ; for (; Xi < A+m*incX; Xi+=incX ) F.reduce (*Xi); } template typename std::enable_if< FFLAS::support_simd_mod::value, void>::type freduce (const Field & F, const size_t m, typename Field::ConstElement_ptr B, const size_t incY, typename Field::Element_ptr A, const size_t incX, FieldCategories::ModularTag) { if(incX == 1 && incY == 1) { vectorised::modp(F,B,m,A); } else { typename Field::Element_ptr Xi = A ; typename Field::ConstElement_ptr Yi = B ; for (; Xi < A+m*incX; Xi+=incX, Yi += incY ) F.reduce (*Xi , *Yi); } } template typename std::enable_if< ! 
FFLAS::support_simd_mod::value, void>::type freduce (const Field & F, const size_t m, typename Field::ConstElement_ptr B, const size_t incY, typename Field::Element_ptr A, const size_t incX, FieldCategories::ModularTag) { typename Field::Element_ptr Xi = A ; typename Field::ConstElement_ptr Yi = B ; for (; Xi < A+m*incX; Xi+=incX, Yi += incY ) F.reduce (*Xi , *Yi); } template void freduce (const Field & F, const size_t m, typename Field::ConstElement_ptr B, const size_t incY, typename Field::Element_ptr A, const size_t incX, FieldCategories::GenericTag) { typename Field::Element_ptr Xi = A ; typename Field::ConstElement_ptr Yi = B ; for (; Xi < A+m*incX; Xi+=incX, Yi += incY ) F.reduce (*Xi , *Yi); } template void freduce (const Field & F, const size_t m, typename Field::ConstElement_ptr B, const size_t incY, typename Field::Element_ptr A, const size_t incX, FieldCategories::UnparametricTag) { typename Field::Element_ptr Xi = A ; typename Field::ConstElement_ptr Yi = B ; for (; Xi < A+m*incX; Xi+=incX, Yi += incY ) F.reduce (*Xi , *Yi); } } // details } // FFLAS #endif // __FFLASFFPACK_fflas_freduce_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_freduce_mp.inl000066400000000000000000000041641274716147400234700ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_freduce_mp.inl * Copyright (C) 2014 FFLAS FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_freduce_mp_INL #define __FFLASFFPACK_fflas_freduce_mp_INL #include "fflas-ffpack/field/rns-integer-mod.h" namespace FFLAS { // specialization of the level1 freduce function for the field RNSInteger template<> inline void freduce (const FFPACK::RNSIntegerMod &F, const size_t n, FFPACK::RNSIntegerMod::Element_ptr A, size_t inc) { if (n==0) return; //cout<<"freduce: "< template<> inline void freduce (const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, FFPACK::rns_double::Element_ptr A, size_t lda) { if (n==0||m==0) return; //cout<<"freduce: "< * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_freivalds_INL #define __FFLASFFPACK_freivalds_INL // #include "fflas-ffpack/utils/Matio.h" namespace FFLAS{ /** @brief freivalds: Freivalds GEneral Matrix Multiply Random Check. * * Randomly Checks \f$C = \alpha \mathrm{op}(A) \times \mathrm{op}(B)\f$ * \param F field. * \param ta if \c ta==FflasTrans then \f$\mathrm{op}(A)=A^t\f$, else \f$\mathrm{op}(A)=A\f$, * \param tb same for matrix \p B * \param m see \p A * \param n see \p B * \param k see \p A * \param alpha scalar * \param A \f$\mathrm{op}(A)\f$ is \f$m \times k\f$ * \param B \f$\mathrm{op}(B)\f$ is \f$k \times n\f$ * \param C \f$C\f$ is \f$m \times n\f$ * \param lda leading dimension of \p A * \param ldb leading dimension of \p B * \param ldc leading dimension of \p C */ template inline bool freivalds (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, typename Field::ConstElement_ptr C, const size_t ldc) { typename Field::Element_ptr v, y, x; v = FFLAS::fflas_new(F,n,1); y = FFLAS::fflas_new(F,k,1); x = FFLAS::fflas_new(F,m,1); typename Field::RandIter G(F); for(size_t j=0; js,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_faxpy.inl * Copyright (C) 2014 FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fscal_INL #define __FFLASFFPACK_fscal_INL namespace FFLAS { namespace vectorised { #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline typename std::enable_if::value, void>::type VEC_SCAL(SimdT & C, SimdT & ALPHA, SimdT & Q, SimdT & T, SimdT & P, SimdT & NEGP, SimdT & INVP, SimdT & MIN, SimdT & MAX) { using simd = Simd; Q = simd::mul(C,INVP); C = simd::mul(C,ALPHA); Q = simd::floor(Q); C = simd::fnmadd(C,Q,P); Q = simd::greater(C,MAX); T = simd::lesser(C,MIN); Q = simd::vand(Q,NEGP); T = simd::vand(T,P); Q = simd::vor(Q,T); C = simd::add(C,Q); } template inline typename std::enable_if::value, void>::type scalp(Element *T, const Element alpha, const Element * U, const size_t n, const Element p, const Element invp, const T1 min_, const T2 max_) { Element min = (Element)min_, max=(Element)max_; using simd = Simd; using vect_t = typename simd::vect_t; size_t i = 0; if (n < simd::vect_size) { for (; i < n ; i++) { T[i]=monfmod(alpha*U[i], p); T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? p : 0; } return; } vect_t C,Q,P,NEGP,INVP,TMP,MIN,MAX,ALPHA; ALPHA = simd::set1(alpha); P = simd::set1(p); NEGP = simd::set1(-p); INVP = simd::set1(invp); MIN = simd::set1(min); MAX = simd::set1(max); long st = long(T) % simd::alignment; if (st) { // the array T is not 32 byte aligned (process few elements s.t. (T+i) is 32 bytes aligned) for (size_t j = static_cast(st) ; j < simd::alignment ; j+=sizeof(Element), i++) { T[i] = monfmod(alpha*U[i], p); T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? 
p : 0; } } FFLASFFPACK_check((long(T+i) % simd::alignment == 0)); if ((long(U+i)%simd::alignment==0)) { // perform the loop using 256 bits SIMD for (;i <= n - simd::vect_size ; i += simd::vect_size) { C = simd::load(U+i); VEC_SCAL(C, ALPHA, Q, TMP, P, NEGP, INVP, MIN, MAX); simd::store(T+i,C); } } // perform the last elt from T without SIMD for (; i < n ; i++) { T[i] = monfmod(alpha*U[i],p); T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? p : 0; } } #else template void scalp(Element *T, const Element alpha, const Element * U, const size_t n, const Element p, const Element invp, const T1 min_, const T2 max_) { Element min = (Element)min_, max=(Element)max_; size_t i = 0; { for (; i < n ; i++) { T[i]=monfmod(alpha*U[i], p); T[i] -= (T[i] > max) ? p : 0; T[i] += (T[i] < min) ? p : 0; } return; } } #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS } // vectorised } // FFLAS namespace FFLAS { /***************************/ /* LEVEL 1 */ /***************************/ template inline void fscal( const Field& F, const size_t N, const typename Field::Element a, typename Field::ConstElement_ptr X, const size_t incX, typename Field::Element_ptr Y, const size_t incY ) { // details::fscal(F,N,a,X,incX,Y,incY, typename FieldTraits::value() ); if (F.isOne(a)) { fassign(F,N,X,incX,Y,incY); return ; } typename Field::ConstElement_ptr Xi = X; typename Field::Element_ptr Yi = Y; if (F.areEqual(a,F.mOne)){ fneg(F,N,X,incX,Y,incY); return; } if (F.isZero(a)){ fzero(F,N,Y,incY); return; } if (incX == 1 && incY == 1) for (size_t i = 0 ; i < N ; ++i) F.mul( Y[i], a, X[i] ); else for (; Xi < X+N*incX; Xi+=incX, Yi+=incY ) F.mul( *Yi, a, *Xi ); } template inline void fscalin (const Field& F, const size_t n, const typename Field::Element a, typename Field::Element_ptr X, const size_t incX) { if (F.isOne(a)) return ; if (F.isMOne(a)){ fnegin(F,n,X,incX); return; } if (F.isZero(a)){ fzero(F,n,X,incX); return; } typename Field::Element_ptr Xi = X ; if ( incX == 1) for (size_t i = 0 ; i < n ; 
++i) F.mulin( X[i], a); else for (; Xi < X+n*incX; Xi+=incX ) F.mulin( *Xi, a); } template<> inline void fscal( const Givaro::DoubleDomain& , const size_t N, const Givaro::DoubleDomain::Element a, Givaro::DoubleDomain::ConstElement_ptr x, const size_t incx, Givaro::DoubleDomain::Element_ptr y, const size_t incy ) { cblas_dcopy( (int)N, x, (int)incy, y, (int)incy); cblas_dscal( (int)N, a, y, (int)incy); } template<> inline void fscal( const Givaro::FloatDomain& , const size_t N, const Givaro::FloatDomain::Element a, Givaro::FloatDomain::ConstElement_ptr x, const size_t incx, Givaro::FloatDomain::Element_ptr y, const size_t incy ) { cblas_scopy( (int)N, x, (int)incy, y, (int)incy); cblas_sscal( (int)N, a, y, (int)incy); } template<> inline void fscalin( const Givaro::DoubleDomain& , const size_t N, const Givaro::DoubleDomain::Element a, Givaro::DoubleDomain::Element_ptr y, const size_t incy ) { cblas_dscal( (int)N, a, y, (int)incy); } template<> inline void fscalin( const Givaro::FloatDomain& , const size_t N, const Givaro::FloatDomain::Element a, Givaro::FloatDomain::Element_ptr y, const size_t incy ) { cblas_sscal( (int)N, a, y, (int)incy); } template<> inline void fscalin( const Givaro::Modular& F , const size_t N, const float a, float * X, const size_t incX ) { if(incX == 1) { float p, invp; p=(float)F.cardinality(); invp=a/p; vectorised::scalp(X,a,X,N,p,invp,0,p-1); } else { float * Xi = X ; for (; Xi < X+N*incX; Xi+=incX ) F.mulin( *Xi , a); } } template<> inline void fscalin( const Givaro::ModularBalanced& F , const size_t N, const float a, float * X, const size_t incX ) { if(incX == 1) { float p, invp; p=(float)F.cardinality(); invp=a/p; vectorised::scalp(X,a,X,N,p,invp,F.minElement(),F.maxElement()); } else { float * Xi = X ; for (; Xi < X+N*incX; Xi+=incX ) F.mulin( *Xi , a); } } template<> inline void fscalin( const Givaro::Modular& F , const size_t N, const double a, double * X, const size_t incX ) { if(incX == 1) { double p, invp; 
p=(double)F.cardinality(); invp=a/p; vectorised::scalp(X,a,X,N,p,invp,0,p-1); } else { double * Xi = X ; for (; Xi < X+N*incX; Xi+=incX ) F.mulin( *Xi , a); } } template<> inline void fscal( const Givaro::Modular& F , const size_t N, const double a, const double * X, const size_t incX, double * Y, const size_t incY ) { if(incX == 1 && incY==1) { double p, invp; p=(double)F.cardinality(); invp=a/p; vectorised::scalp(Y,a,X,N,p,invp,0,p-1); } else { const double * Xi = X ; double * Yi = Y ; for (; Xi < X+N*incX; Xi+=incX,Yi+=incY ) F.mul(*Yi, *Xi , a); } } template<> inline void fscalin( const Givaro::ModularBalanced& F , const size_t N, const double a, double * X, const size_t incX ) { if(incX == 1) { double p, invp; p=(double)F.cardinality(); invp=a/p; vectorised::scalp(X,a,X,N,p,invp,F.minElement(),F.maxElement()); } else { double * Xi = X ; for (; Xi < X+N*incX; Xi+=incX ) F.mulin( *Xi , a); } } /***************************/ /* LEVEL 2 */ /***************************/ template void fscalin (const Field& F, const size_t m , const size_t n, const typename Field::Element a, typename Field::Element_ptr A, const size_t lda) { if (F.isOne(a)) { return ; } else if (F.isZero(a)) { fzero(F,m,n,A,lda); } else if (F.isMOne(a)) { fnegin(F,m,n,A,lda); } else { if (lda == n) { fscalin(F,n*m,a,A,1); } else { for (size_t i = 0 ; i < m ; ++i) fscalin(F,n,a,A+i*lda,1); } return; } } template void fscal (const Field& F, const size_t m , const size_t n, const typename Field::Element a, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr B, const size_t ldb) { if (F.isOne(a)) { fassign(F,m,n,A,lda,B,ldb) ; } else if (F.isZero(a)) { fzero(F,m,n,B,ldb); } else if (F.isMOne(a)) { fneg(F,m,n,A,lda,B,ldb); } else { if (n == lda && m == lda) fscal(F,m*n,a,A,lda,B,ldb); else { for (size_t i = 0; i < m ; ++i) fscal(F,n,a,A+i*lda,1,B+i*ldb,1); } } return; } } // FFLAS #endif // __FFLASFFPACK_fscal_INL 
fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_fscal_mp.inl000066400000000000000000000110221274716147400231320ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fscal_mp_INL #define __FFLASFFPACK_fscal_mp_INL #include "fflas-ffpack/field/rns-integer.h" #include "fflas_fscal.h" #include "fflas_fgemm.inl" namespace FFLAS { /* * specialization for the field RNSInteger */ // level 1 : fscalin template<> inline void fscalin(const FFPACK::RNSInteger &F, const size_t n, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::Element_ptr A, const size_t inc) { for (size_t i=0;i inline void fscal(const FFPACK::RNSInteger &F, const size_t n, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::ConstElement_ptr A, const size_t Ainc, FFPACK::rns_double::Element_ptr B, const size_t Binc) { for (size_t i=0;i inline void fscalin(const FFPACK::RNSInteger &F, const size_t m, const size_t n, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::Element_ptr A, const size_t lda) { for (size_t i=0;i inline void fscal(const FFPACK::RNSInteger &F, const size_t m, const size_t n, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::ConstElement_ptr A, const size_t lda, FFPACK::rns_double::Element_ptr B, const size_t ldb) { for (size_t i=0;i */ // level 1 : fscalin template<> inline void fscalin(const FFPACK::RNSIntegerMod &F, const size_t n, const typename FFPACK::RNSIntegerMod::Element alpha, typename FFPACK::RNSIntegerMod::Element_ptr A, const size_t inc) { fscalin(F.delayed(),n,alpha,A,inc); freduce (F, n, A, inc); } // level 1 : fscal template<> inline void fscal(const FFPACK::RNSIntegerMod &F, const size_t n, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::ConstElement_ptr A, const size_t Ainc, FFPACK::rns_double::Element_ptr B, const size_t Binc) { fscal(F.delayed(),n,alpha,A,Ainc,B,Binc); freduce (F, n, B, Binc); } // level 2 : fscalin template<> inline void fscalin(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::Element_ptr A, const size_t lda) { fscalin(F.delayed(),m,n,alpha,A,lda); freduce (F, m, 
n, A, lda); } // level 2 : fscal template<> inline void fscal(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, const FFPACK::rns_double::Element alpha, FFPACK::rns_double::ConstElement_ptr A, const size_t lda, FFPACK::rns_double::Element_ptr B, const size_t ldb) { fscal(F.delayed(),m,n,alpha,A,lda,B,ldb); freduce (F, m, n, B, ldb); } } //end of namespace FFLAS #endif fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_ftrmm.inl000066400000000000000000000407321274716147400225050ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_ftrmm.inl * Copyright (C) 2007 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ftrmm_INL #define __FFLASFFPACK_ftrmm_INL namespace FFLAS { //--------------------------------------------------------------------- // ftrmm: TRiangular Matrix Multiply // Computes B <- alpha.op(A).B, B <- alpha.B.op(A) // B is M*N, A is M*M if Side==FflasLeft, N*N if Side==FflasRight // //--------------------------------------------------------------------- template inline void ftrmm (const Field& F, const FFLAS_SIDE Side, const FFLAS_UPLO Uplo, const FFLAS_TRANSPOSE TransA, const FFLAS_DIAG Diag, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr B, const size_t ldb) { if (!M || !N ) return; if ( Side==FflasLeft ){ if ( Uplo==FflasUpper){ if (TransA == FflasNoTrans){ if (Diag == FflasUnit) Protected::ftrmmLeftUpperNoTransUnit ()(F,M,N,A,lda,B,ldb); else Protected::ftrmmLeftUpperNoTransNonUnit()(F,M,N,A,lda,B,ldb); } else { if (Diag == FflasUnit) Protected::ftrmmLeftUpperTransUnit()(F,M,N,A,lda,B,ldb); else Protected::ftrmmLeftUpperTransNonUnit()(F,M,N,A,lda,B,ldb); } } else { if (TransA == FflasNoTrans){ if (Diag == FflasUnit) Protected::ftrmmLeftLowerNoTransUnit()(F,M,N,A,lda,B,ldb); else Protected::ftrmmLeftLowerNoTransNonUnit()(F,M,N,A,lda,B,ldb); } else { if (Diag == FflasUnit) Protected::ftrmmLeftLowerTransUnit()(F,M,N,A,lda,B,ldb); else Protected::ftrmmLeftLowerTransNonUnit()(F,M,N,A,lda,B,ldb); } } } else { if ( Uplo == FflasUpper){ if (TransA == FflasNoTrans){ if (Diag == FflasUnit) Protected::ftrmmRightUpperNoTransUnit()(F,M,N,A,lda,B,ldb); else Protected::ftrmmRightUpperNoTransNonUnit()(F,M,N,A,lda,B,ldb); } else { if (Diag == FflasUnit) Protected::ftrmmRightUpperTransUnit()(F,M,N,A,lda,B,ldb); else Protected::ftrmmRightUpperTransNonUnit()(F,M,N,A,lda,B,ldb); } } else { if (TransA == FflasNoTrans){ if (Diag == FflasUnit) Protected::ftrmmRightLowerNoTransUnit()(F,M,N,A,lda,B,ldb); else 
Protected::ftrmmRightLowerNoTransNonUnit()(F,M,N,A,lda,B,ldb); } else { if (Diag == FflasUnit) Protected::ftrmmRightLowerTransUnit()(F,M,N,A,lda,B,ldb); else Protected::ftrmmRightLowerTransNonUnit()(F,M,N,A,lda,B,ldb); } } } if (!F.isOne(alpha)) fscalin(F,M,N,alpha,B,ldb); } #ifndef DOXYGEN_SHOULD_SKIP_THIS namespace Protected { #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__LOW #define 
__FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__LOW 
#undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT //== #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define 
__FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef 
__FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" 
#undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define 
__FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrmm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT } // Protected #endif // SKIPPED BY DOXYGEN } // FFLAS #endif // __FFLASFFPACK_ftrmm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_ftrmm_src.inl000066400000000000000000000211431274716147400233470ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) 2005 C. Pernet * Written by C. Pernet * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 * ========LICENCE========
 *
 */

// Preamble of the ftrmm kernel template.
// It tests the selector macros __FFLAS__LEFT, __FFLAS__UP / __FFLAS__LOW,
// __FFLAS__TRANSPOSE, __FFLAS__UNIT and __FFLAS__DOUBLE / __FFLAS__FLOAT /
// __FFLAS__GENERIC and derives from them the helper macros used by the class
// bodies that follow.  The helper macros are #undef'ed again at the end of
// this file.
//
// NOTE(review): the helper expansions are not parenthesised (for example
// M-nsplit or B + N-nsplit) and they refer to the local names A, B, lda, ldb,
// M, N, i, nsplit, nbblocsplit and nrestsplit of the code that expands them;
// take care before using one inside a larger expression.

// Two-level token pasting: going through my_join lets macro arguments be
// expanded before ## glues them together.
#define Mjoin(pre, nam) my_join(pre, nam)
#define my_join(pre, nam) pre ## nam

// __FFLAS__Acolinc / __FFLAS__Arowinc: increments applied to A by the pointer
// macros below; they are (lda, 1) when __FFLAS__TRANSPOSE is defined and
// (1, lda) otherwise.
// __FFLAS__UPPER / __FFLAS__LOWER: with __FFLAS__TRANSPOSE, __FFLAS__LOW
// selects __FFLAS__UPPER (and its absence selects __FFLAS__LOWER); without
// __FFLAS__TRANSPOSE the choice is the other way round.
#ifdef __FFLAS__TRANSPOSE
#define __FFLAS__Acolinc lda
#define __FFLAS__Arowinc 1
#ifdef __FFLAS__LOW
#define __FFLAS__UPPER
#else
#define __FFLAS__LOWER
#endif
#else
#ifdef __FFLAS__LOW
#define __FFLAS__LOWER
#else
#define __FFLAS__UPPER
#endif
#define __FFLAS__Acolinc 1
#define __FFLAS__Arowinc lda
#endif

// Dimension and pointer macros.  How the class bodies of this file use them:
//   __FFLAS__Na                        order that gets split (M on the left, N on the right)
//   __FFLAS__Mb  x __FFLAS__Nb         dimensions passed with __FFLAS__Atriang / __FFLAS__A3
//   __FFLAS__Mb2 x __FFLAS__Nb2        dimensions of the first recursive call and of its fgemm
//   __FFLAS__Mbrest x __FFLAS__Nbrest  dimensions of the remainder-block call
//   __FFLAS__Mupdate x __FFLAS__Nupdate  fgemm dimensions inside the blocked loop
//   __FFLAS__Bdim, __FFLAS__Bnorminc   length and increment given to fscalin when the order is 1
//   __FFLAS__Atriang/Aupdate/Arest, __FFLAS__Brec/Bupdate/Brest
//                                      operands of the blocked (double/float) class
//   __FFLAS__A1/A2/A3, __FFLAS__B1/B2  operands of the recursive (generic) class
#ifdef __FFLAS__LEFT
// Left side: __FFLAS__Na is M and the B offsets are multiples of ldb.
#define __FFLAS__SIDE Left
#define __FFLAS__Na M
#define __FFLAS__Nb N
#define __FFLAS__Mb nsplit
#define __FFLAS__Nb2 N
#define __FFLAS__Mb2 M-nsplit
#define __FFLAS__Mbrest nrestsplit
#define __FFLAS__Nbrest N
#define __FFLAS__Mupdate nrestsplit + i * nsplit
#define __FFLAS__Nupdate N
#define __FFLAS__Bdim N
#define __FFLAS__Bnorminc 1
#ifdef __FFLAS__LOWER
// Offsets count down from block nbblocsplit as i grows; the remainder block
// sits after the nbblocsplit full blocks.
#define __FFLAS__Atriang A + (nbblocsplit - (i + 1)) * nsplit * (lda + 1)
#define __FFLAS__Aupdate __FFLAS__Atriang + nsplit * __FFLAS__Arowinc
#define __FFLAS__Arest A + nbblocsplit * nsplit * (lda+1)
#define __FFLAS__Brec B + (nbblocsplit - (i+1)) * nsplit * ldb
#define __FFLAS__Bupdate B + (nbblocsplit - i) * nsplit * ldb
#define __FFLAS__Brest B + nbblocsplit * nsplit * ldb
#define __FFLAS__A1 A + (nsplit) * (lda + 1)
#define __FFLAS__A2 A + (nsplit) * __FFLAS__Arowinc
#define __FFLAS__A3 A
#define __FFLAS__B1 B + (nsplit) * ldb
#define __FFLAS__B2 B
#else
// Offsets grow with i; the remainder block is the one at A / B themselves.
#define __FFLAS__Atriang A + (nrestsplit + i * nsplit) * (lda + 1)
#define __FFLAS__Aupdate A + (nrestsplit + i * nsplit) * __FFLAS__Acolinc
#define __FFLAS__Arest A
#define __FFLAS__Brec B + (nrestsplit + i * nsplit) * ldb
#define __FFLAS__Bupdate B
#define __FFLAS__Brest B
#define __FFLAS__A1 A
#define __FFLAS__A2 A + (M-nsplit) * __FFLAS__Acolinc
#define __FFLAS__A3 A + (M-nsplit) * (lda + 1)
#define __FFLAS__B1 B
#define __FFLAS__B2 B + (M-nsplit) * ldb
#endif
#else
// Right side (taken whenever __FFLAS__LEFT is not defined): __FFLAS__Na is N
// and the B offsets are plain element offsets.
#define __FFLAS__SIDE Right
#define __FFLAS__Na N
#define __FFLAS__Nb nsplit
#define __FFLAS__Mb M
#define __FFLAS__Mb2 M
#define __FFLAS__Nb2 N-nsplit
#define __FFLAS__Mbrest M
#define __FFLAS__Nbrest nrestsplit
#define __FFLAS__Mupdate M
#define __FFLAS__Nupdate nrestsplit + i * nsplit
#define __FFLAS__Bdim M
#define __FFLAS__Bnorminc ldb
#ifdef __FFLAS__UPPER
// Offsets count down from block nbblocsplit as i grows.
#define __FFLAS__Atriang A + (nbblocsplit - (i + 1)) * nsplit * (lda + 1)
#define __FFLAS__Aupdate __FFLAS__Atriang + nsplit * __FFLAS__Acolinc
#define __FFLAS__Arest A + nbblocsplit * nsplit * (lda+1)
#define __FFLAS__Brec B + (nbblocsplit - (i+1)) * nsplit
#define __FFLAS__Bupdate B + (nbblocsplit - i) * nsplit
#define __FFLAS__Brest B + nbblocsplit * nsplit
#define __FFLAS__A1 A + (nsplit) * (lda + 1)
#define __FFLAS__A2 A + (nsplit) * __FFLAS__Acolinc
#define __FFLAS__A3 A
#define __FFLAS__B1 B + nsplit
#define __FFLAS__B2 B
#else
// Offsets grow with i.
#define __FFLAS__Atriang A + (nrestsplit + i * nsplit) * (lda + 1)
#define __FFLAS__Aupdate A + (nrestsplit + i * nsplit) * __FFLAS__Arowinc
#define __FFLAS__Arest A
#define __FFLAS__Brec B + (nrestsplit + i * nsplit)
#define __FFLAS__Bupdate B
#define __FFLAS__Brest B
#define __FFLAS__A1 A
#define __FFLAS__A2 A + (N-nsplit) * __FFLAS__Arowinc
#define __FFLAS__A3 A + (N-nsplit) * (lda + 1)
#define __FFLAS__B1 B
#define __FFLAS__B2 B + N-nsplit
#endif
#endif

// Name fragments pasted (through Mjoin) into the class name and into the
// Cblas / Fflas enumerator names.
#ifdef __FFLAS__UP
#define __FFLAS__UPLO Upper
#else
#define __FFLAS__UPLO Lower
#endif

#ifdef __FFLAS__UNIT
#define __FFLAS__DIAG Unit
#else
#define __FFLAS__DIAG NonUnit
#endif

#ifdef __FFLAS__TRANSPOSE
#define __FFLAS__TRANS Trans
#else
#define __FFLAS__TRANS NoTrans
#endif

// Element type, domain and BLAS routine prefix for the double / float cases.
#ifdef __FFLAS__DOUBLE
#define __FFLAS__ELEMENT double
#define __FFLAS__DOMAIN Givaro::DoubleDomain
#define __FFLAS__BLAS_PREFIX d
#endif
#ifdef __FFLAS__FLOAT
#define __FFLAS__ELEMENT float
#define
__FFLAS__DOMAIN Givaro::FloatDomain #define __FFLAS__BLAS_PREFIX s #endif #ifdef __FFLAS__GENERIC #define __FFLAS__ELEMENT Element #endif #ifndef __FFLAS__GENERIC template <> class Mjoin(ftrmm, Mjoin(__FFLAS__SIDE, Mjoin(__FFLAS__UPLO, Mjoin(__FFLAS__TRANS, __FFLAS__DIAG))))<__FFLAS__ELEMENT>{ public: template void delayed (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr B, const size_t ldb) { Mjoin(cblas_,Mjoin(__FFLAS__BLAS_PREFIX,trmm)) (CblasRowMajor, Mjoin (Cblas, __FFLAS__SIDE), Mjoin (Cblas, __FFLAS__UPLO), Mjoin (Cblas, __FFLAS__TRANS), Mjoin (Cblas, __FFLAS__DIAG), (int)M, (int)N, 1.0, A, (int)lda, B, (int)ldb ); freduce(F, M, N, B, ldb); } template void operator () (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr B, const size_t ldb) { if (!M || !N ) return; size_t nsplit = DotProdBoundClassic (F, F.one); size_t nbblocsplit = (__FFLAS__Na-1) / nsplit; size_t nrestsplit = ((__FFLAS__Na-1) % nsplit) +1; FFLASFFPACK_check(__FFLAS__Na == nsplit*nbblocsplit+nrestsplit); if (nrestsplit) this->delayed (F, __FFLAS__Mbrest, __FFLAS__Nbrest, __FFLAS__Arest, lda, __FFLAS__Brest, ldb); for ( size_t i = 0; i < nbblocsplit; ++i) { #ifdef __FFLAS__RIGHT fgemm (F, FflasNoTrans, Mjoin (Fflas, __FFLAS__TRANS), __FFLAS__Mupdate, __FFLAS__Nupdate, nsplit, F.one, __FFLAS__Brec, ldb, __FFLAS__Aupdate, lda, F.one, __FFLAS__Bupdate, ldb); #else fgemm (F, Mjoin (Fflas, __FFLAS__TRANS), FflasNoTrans, __FFLAS__Mupdate, __FFLAS__Nupdate, nsplit, F.one, __FFLAS__Aupdate, lda, __FFLAS__Brec, ldb, F.one, __FFLAS__Bupdate, ldb); #endif this->delayed (F, __FFLAS__Mb, __FFLAS__Nb, __FFLAS__Atriang, lda, __FFLAS__Brec, ldb); } } }; //class ftrmm.... 
#else // __FFLAS__GENERIC template class Mjoin(ftrmm, Mjoin(__FFLAS__SIDE, Mjoin(__FFLAS__UPLO, Mjoin(__FFLAS__TRANS, __FFLAS__DIAG)))) { public: template void operator() (const Field& F, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr B, const size_t ldb) { if (__FFLAS__Na == 1) #ifdef __FFLAS__NONUNIT fscalin(F, __FFLAS__Bdim, *A, B, __FFLAS__Bnorminc); #else ; #endif else { // __FFLAS__Na > 1 size_t nsplit = __FFLAS__Na >> 1; this->operator() (F, __FFLAS__Mb2, __FFLAS__Nb2, __FFLAS__A1, lda, __FFLAS__B1, ldb); #ifdef __FFLAS__RIGHT fgemm (F, FflasNoTrans , Mjoin (Fflas, __FFLAS__TRANS), __FFLAS__Mb2, __FFLAS__Nb2, nsplit, F.one, __FFLAS__B2, ldb, __FFLAS__A2, lda, F.one, __FFLAS__B1, ldb); #else fgemm (F, Mjoin (Fflas, __FFLAS__TRANS), FflasNoTrans, __FFLAS__Mb2, __FFLAS__Nb2, nsplit, F.one, __FFLAS__A2, lda, __FFLAS__B2, ldb, F.one, __FFLAS__B1, ldb); #endif this->operator() (F, __FFLAS__Mb, __FFLAS__Nb, __FFLAS__A3, lda, __FFLAS__B2, ldb); } } }; #endif // __FFLAS__GENERIC #ifdef __FFLAS__LOWER #undef __FFLAS__LOWER #else #undef __FFLAS__UPPER #endif #undef __FFLAS__UPLO #undef __FFLAS__DIAG #undef __FFLAS__SIDE #undef __FFLAS__TRANS #undef __FFLAS__Na #undef __FFLAS__Mb #undef __FFLAS__Nb #undef __FFLAS__Mbrest #undef __FFLAS__Nbrest #undef __FFLAS__Mupdate #undef __FFLAS__Nupdate #undef __FFLAS__Atriang #undef __FFLAS__Aupdate #undef __FFLAS__Arest #undef __FFLAS__Bupdate #undef __FFLAS__Brec #undef __FFLAS__Brest #undef __FFLAS__ELEMENT #undef __FFLAS__BLAS_PREFIX #undef __FFLAS__DOMAIN #undef __FFLAS__A1 #undef __FFLAS__A2 #undef __FFLAS__A3 #undef __FFLAS__B1 #undef __FFLAS__B2 #undef __FFLAS__Nb2 #undef __FFLAS__Mb2 #undef __FFLAS__Bdim #undef __FFLAS__Acolinc #undef __FFLAS__Arowinc #undef __FFLAS__Bnorminc #undef Mjoin #undef my_join fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_ftrsm.inl000066400000000000000000000456251274716147400225210ustar00rootroot00000000000000/* -*- mode: C++; 
tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_ftrsm.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/

#ifndef __FFLASFFPACK_ftrsm_INL
#define __FFLASFFPACK_ftrsm_INL


namespace FFLAS {
	//---------------------------------------------------------------------
	// ftrsm: TRiangular System solve with matrix
	// Computes  B <- alpha.op(A^-1).B,  B <- alpha.B.op(A^-1)
	// B is M*N, A is M*M if Side==FflasLeft, N*N if Side==FflasRight
	//
	// Four overloads follow.  The first three only build a TRSMHelper and
	// forward to the fourth, which selects the kernel.
	// A is a ConstElement_ptr when __FFLAS__TRSM_READONLY is defined and
	// an Element_ptr otherwise.
	//
	// NOTE(review): each `template` keyword below has no parameter list
	// although Field is used as a type, and TRSMHelper, Checker_ftrsm,
	// ParSeqHelper::Parallel and the Protected::ftrsm... functors appear
	// without template arguments (see `TRSMHelper > H(PSH)`).  The text
	// between '<' and '>' looks lost; restore it from the upstream file.
	//---------------------------------------------------------------------

	// Overload without helper argument: creates a sequential helper,
	// builds a Checker_ftrsm before the solve and calls its check()
	// after the solve.
	template
	inline void
	ftrsm (const Field& F, const FFLAS_SIDE Side,
	       const FFLAS_UPLO Uplo,
	       const FFLAS_TRANSPOSE TransA,
	       const FFLAS_DIAG Diag,
	       const size_t M, const size_t N,
	       const typename Field::Element alpha,
#ifdef __FFLAS__TRSM_READONLY
	       typename Field::ConstElement_ptr
#else
	       typename Field::Element_ptr
#endif
	       A, const size_t lda,
	       typename Field::Element_ptr B, const size_t ldb)
	{
		ParSeqHelper::Sequential PSH;
		TRSMHelper H(PSH);
		FFLAS::Checker_ftrsm checker(F, M, N, alpha, B, ldb);
		ftrsm(F, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, H);
		checker.check(Side, Uplo, TransA, Diag, M, N, A, lda, B, ldb);
	}

	// Overload taking a sequential helper: wraps it in a TRSMHelper and
	// forwards (no checker here).
	template
	inline void
	ftrsm (const Field& F, const FFLAS_SIDE Side,
	       const FFLAS_UPLO Uplo,
	       const FFLAS_TRANSPOSE TransA,
	       const FFLAS_DIAG Diag,
	       const size_t M, const size_t N,
	       const typename Field::Element alpha,
#ifdef __FFLAS__TRSM_READONLY
	       typename Field::ConstElement_ptr
#else
	       typename Field::Element_ptr
#endif
	       A, const size_t lda,
	       typename Field::Element_ptr B, const size_t ldb, const ParSeqHelper::Sequential& PSH)
	{
		TRSMHelper H(PSH);
		ftrsm(F, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, H);
	}

	// Overload taking a parallel helper: wraps it in a TRSMHelper and
	// forwards.
	template
	inline void
	ftrsm (const Field& F, const FFLAS_SIDE Side,
	       const FFLAS_UPLO Uplo,
	       const FFLAS_TRANSPOSE TransA,
	       const FFLAS_DIAG Diag,
	       const size_t M, const size_t N,
	       const typename Field::Element alpha,
#ifdef __FFLAS__TRSM_READONLY
	       typename Field::ConstElement_ptr
#else
	       typename Field::Element_ptr
#endif
	       A, const size_t lda,
	       typename Field::Element_ptr B, const size_t ldb, const ParSeqHelper::Parallel& PSH)
	{
		TRSMHelper > H(PSH);
		ftrsm(F, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, H);
	}

	// Dispatching overload.
	// Returns at once when M or N is zero.  Otherwise it picks one of the
	// sixteen Protected::ftrsm<Side><Uplo><Trans><Diag> functors from the
	// four flags and calls it with (F,M,N,A,lda,B,ldb,H).  The functors
	// are called without alpha: B is scaled by alpha afterwards, and only
	// when alpha is not one.
	template
	inline void
	ftrsm (const Field& F, const FFLAS_SIDE Side,
	       const FFLAS_UPLO Uplo,
	       const FFLAS_TRANSPOSE TransA,
	       const FFLAS_DIAG Diag,
	       const size_t M, const size_t N,
	       const typename Field::Element alpha,
#ifdef __FFLAS__TRSM_READONLY
	       typename Field::ConstElement_ptr
#else
	       typename Field::Element_ptr
#endif
	       A, const size_t lda,
	       typename Field::Element_ptr B, const size_t ldb,
	       TRSMHelper & H)
	{
		if (!M || !N ) return;

		if ( Side==FflasLeft ){
			if ( Uplo==FflasUpper){
				if (TransA == FflasNoTrans){
					if (Diag == FflasUnit)
						Protected::ftrsmLeftUpperNoTransUnit ()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmLeftUpperNoTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				} else {
					if (Diag == FflasUnit)
						Protected::ftrsmLeftUpperTransUnit()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmLeftUpperTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				}
			} else {
				if (TransA == FflasNoTrans){
					if (Diag == FflasUnit)
						Protected::ftrsmLeftLowerNoTransUnit()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmLeftLowerNoTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				} else {
					if (Diag == FflasUnit)
						Protected::ftrsmLeftLowerTransUnit()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmLeftLowerTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				}
			}
		} else {
			if ( Uplo == FflasUpper){
				if (TransA == FflasNoTrans){
					if (Diag == FflasUnit)
						Protected::ftrsmRightUpperNoTransUnit()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmRightUpperNoTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				} else {
					if (Diag == FflasUnit)
						Protected::ftrsmRightUpperTransUnit()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmRightUpperTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				}
			} else {
				if (TransA == FflasNoTrans){
					if (Diag == FflasUnit)
						Protected::ftrsmRightLowerNoTransUnit()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmRightLowerNoTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				} else {
					if (Diag == FflasUnit)
						Protected::ftrsmRightLowerTransUnit()(F,M,N,A,lda,B,ldb,H);
					else
						Protected::ftrsmRightLowerTransNonUnit()(F,M,N,A,lda,B,ldb,H);
				}
			}
		}
		// Apply the scalar factor once, after the solve.
		if (!F.isOne(alpha))
			fscalin(F,M,N,alpha,B,ldb);
	}

/* Kernel instantiation: fflas_ftrsm_src.inl is included once per combination of selector macros. The run below shares its source line with the code above and continues on the following lines; it is kept exactly as found. */
#ifndef DOXYGEN_SHOULD_SKIP_THIS namespace Protected { #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__LEFT #define
__FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef 
__FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__GENERIC #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__GENERIC #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT //== #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW 
#define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef 
__FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__DOUBLE #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__DOUBLE #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include 
"fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define 
__FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__FLOAT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__FLOAT #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT } // Protected #endif // SKIPPED BY DOXYGEN } // FFLAS #endif // __FFLASFFPACK_ftrsm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_ftrsm_mp.inl000066400000000000000000000220321274716147400232000ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
*
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 * ========LICENCE========
 *.
 */

/** @file fflas/fflas_ftrsm_mp.inl
 * @brief triangular system with matrix right hand side over multiprecision domain (either over Z or over Z/pZ)
 */

#ifndef __FFPACK_ftrsm_mp_INL
#define __FFPACK_ftrsm_mp_INL

// NOTE(review): the next three #include directives carry no header name;
// the <...> part looks lost.  Restore them from the upstream file.
#include
#include
#include

#include "fflas-ffpack/fflas/fflas_bounds.inl"
#include "fflas-ffpack/fflas/fflas_level3.inl"
#include "fflas-ffpack/field/rns-integer-mod.h"
#include "fflas-ffpack/field/rns-integer.h"

namespace FFLAS {


	// Triangular solve over a multiprecision modular field, carried out
	// in an RNS representation:
	//   1. choose a prime bit size and build the RNS field Zp,
	//   2. convert A and B to RNS (transposed when Side is not FflasLeft),
	//   3. call ftrsm over Zp with the scalar Zp.one,
	//   4. convert the result back into B, reduce it with freduce over F
	//      and scale it by alpha when alpha is not one.
	// Returns immediately when the order K of A (M on the left, N on the
	// right) is zero.
	// With BENCH_PERF_TRSM_MP defined the four phases are timed.
	//
	// NOTE(review): `Givaro::Modular & F` has no template argument, and
	// text is missing in two places flagged below, including the end of
	// this function.  No release of Ap / Bp is visible in this chunk; it
	// may be part of the missing text - confirm against upstream.
	inline void ftrsm (const Givaro::Modular & F,
			   const FFLAS_SIDE Side,
			   const FFLAS_UPLO Uplo,
			   const FFLAS_TRANSPOSE TransA,
			   const FFLAS_DIAG Diag,
			   const size_t M, const size_t N,
			   const Givaro::Integer alpha,
			   const Givaro::Integer * A, const size_t lda,
			   Givaro::Integer * B, const size_t ldb){

#ifdef BENCH_PERF_TRSM_MP
		double t_init=0, t_trsm=0, t_mod=0, t_rec=0;
		FFLAS::Timer chrono;
		chrono.start();
#endif
		Givaro::Integer p;
		F.cardinality(p);
		size_t logp=p.bitsize();
		// K is the order of the triangular matrix A.
		size_t K;
		if (Side == FFLAS::FflasLeft)
			K=M;
		else
			K=N;

		if (K==0) return;

		// compute bit size of feasible prime
		// lk is the bit length of max(K, logp/20).
		size_t _k=std::max(K,logp/20), lk=0;
		while ( _k ) {_k>>=1; ++lk;}
		size_t prime_bitsize= (53-lk)>>1;

		// construct rns basis
		Givaro::Integer maxC= (p-1)*(p-1)*(p-1)*uint64_t(K);
		size_t n_pr =maxC.bitsize()/prime_bitsize;
		// NOTE(review): the statement below is truncated after `(1<`;
		// the text up to the declaration of Zp (including the
		// definition of RNS) is missing.
		maxC=(p-1)*(p-1)*uint64_t(K)*(1< Zp(p, RNS);
#ifdef BENCH_PERF_TRSM_MP
		chrono.stop();
		t_init+=chrono.usertime();
		chrono.clear();chrono.start();
#endif
		// compute A and B in RNS
		FFPACK::rns_double::Element_ptr Ap,Bp;
		Ap = FFLAS::fflas_new(Zp,K,K);
		Bp = FFLAS::fflas_new(Zp,M,N);

		// (logp/16)+(logp%16?1:0) is logp/16 rounded up.
		if (Side == FFLAS::FflasLeft){
			finit_rns(Zp,K,K,(logp/16)+(logp%16?1:0),A,lda,Ap);
			finit_rns(Zp,M,N,(logp/16)+(logp%16?1:0),B,ldb,Bp);
		}
		else {
			finit_trans_rns(Zp,K,K,(logp/16)+(logp%16?1:0),A,lda,Ap);
			finit_trans_rns(Zp,M,N,(logp/16)+(logp%16?1:0),B,ldb,Bp);
		}
#ifdef BENCH_PERF_TRSM_MP
		chrono.stop();
		t_mod+=chrono.usertime();
		chrono.clear();chrono.start();
#endif

		// call ftrsm in rns
		//ftrsm(Zp, Side, Uplo, TransA, Diag, M, N, Zp.one, Ap, K, Bp, N);
		// On the right side the operands were converted transposed, so
		// a left solve is issued with Uplo exchanged, M and N swapped
		// and M as leading dimension of Bp.
		if (Side == FFLAS::FflasLeft)
			ftrsm(Zp, Side, Uplo, TransA, Diag, M, N, Zp.one, Ap, K, Bp, N);
		else {
			if (Uplo == FFLAS::FflasUpper)
				ftrsm(Zp, FFLAS::FflasLeft, FFLAS::FflasLower, TransA, Diag, N, M, Zp.one, Ap, K, Bp, M);
			else
				ftrsm(Zp, FFLAS::FflasLeft, FFLAS::FflasUpper, TransA, Diag, N, M, Zp.one, Ap, K, Bp, M);
		}
#ifdef BENCH_PERF_TRSM_MP
		chrono.stop();
		t_trsm+=chrono.usertime();
		chrono.clear();chrono.start();
#endif
		// reconstruct the result
		if (Side == FFLAS::FflasLeft)
			fconvert_rns(Zp,M,N,F.zero,B,ldb,Bp);
		else{
			fconvert_trans_rns(Zp,M,N,F.zero,B,ldb,Bp);
		}
		// reduce it modulo p
		freduce (F, M, N, B, ldb);
		// scale it with alpha
		if (!F.isOne(alpha))
			fscalin(F,M,N,alpha,B,ldb);

#ifdef BENCH_PERF_TRSM_MP
		chrono.stop();
		t_rec+=chrono.usertime();
		// NOTE(review): the output statement below is truncated after
		// the string literal; the rest of this function and the
		// beginning of the TRSMBound declaration are missing.
		cout<<"FTRSM RNS PERF:"<

	// TRSMBound for the RNS modular field: always 1.
	inline size_t TRSMBound (const FFPACK::RNSIntegerMod &F)
	{
		return 1;
	}

	// DotProdBoundClassic for the RNS modular field.
	// With p = cardinality - 1, b the value converted from beta and
	// M = F.rns()._M, returns (M - b*p) / (p*p), but never less than 1.
	// NOTE(review): FFPACK::RNSIntegerMod appears without template
	// argument here as well.
	template <>
	inline size_t DotProdBoundClassic (const FFPACK::RNSIntegerMod& F,
					   const FFPACK::rns_double_elt& beta)
	{
		Givaro::Integer p,b,M;
		F.cardinality(p);
		p--;
		F.convert(b,beta);
		M=F.rns()._M;
		uint64_t kmax= (M-b*p)/(p*p);
		return  (size_t)std::max(uint64_t(1),kmax);
		//return kmax;
	}

/* Kernel instantiation for the multiprecision case: fflas_ftrsm_src.inl is included once per combination of selector macros. The run below shares its source line with the code above and continues on the following lines; it is kept exactly as found. */
#ifndef __FTRSM_MP_FAST #define __FFLAS_MULTIPRECISION #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include
"fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__LEFT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__LEFT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__LEFT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef __FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__RIGHT #define __FFLAS__UP #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef 
__FFLAS__UP #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__NOTRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__NOTRANSPOSE #undef __FFLAS__UNIT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__NONUNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__NONUNIT #define __FFLAS__RIGHT #define __FFLAS__LOW #define __FFLAS__TRANSPOSE #define __FFLAS__UNIT #include "fflas_ftrsm_src.inl" #undef __FFLAS__RIGHT #undef __FFLAS__LOW #undef __FFLAS__TRANSPOSE #undef __FFLAS__UNIT #endif // #ifndef __FTRSM_MP_FAST } // end of namespace protected #endif // #ifndef DOXYGEN_SHOULD_SKIP_THIS } // END OF NAMESPACE FFLAS #endif fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_ftrsm_src.inl000066400000000000000000000362431274716147400233640ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) 2005 C. Pernet * Written by C. Pernet * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #define Mjoin(pre, nam) my_join(pre, nam) #define my_join(pre, nam) pre ## nam #ifdef __FFLAS__TRANSPOSE #define __FFLAS__Acolinc lda #define __FFLAS__Arowinc 1 #ifdef __FFLAS__LOW #define __FFLAS__UPPER #else #define __FFLAS__LOWER #endif #else #ifdef __FFLAS__LOW #define __FFLAS__LOWER #else #define __FFLAS__UPPER #endif #define __FFLAS__Acolinc 1 #define __FFLAS__Arowinc lda #endif #ifdef __FFLAS__LEFT #define __FFLAS__SIDE Left #define __FFLAS__Na M #define __FFLAS__Nb N #ifdef __FFLAS__TRANSPOSE #define __FFLAS__Acopcolinc __FFLAS__Na #define __FFLAS__Acoprowinc 1 #else // __FFLAS__NOTRANSPOSE #define __FFLAS__Acopcolinc 1 #define __FFLAS__Acoprowinc __FFLAS__Na #endif #define __FFLAS__Mb nsplit #define __FFLAS__Nb2 N #define __FFLAS__Mb2 M-nsplit #define __FFLAS__Mbrest nrestsplit #define __FFLAS__Nbrest N #define __FFLAS__Mupdate M-(i+1)*nsplit #define __FFLAS__Nupdate N #define __FFLAS__Anorminc __FFLAS__Acolinc #define __FFLAS__Acopnorminc __FFLAS__Acopcolinc #define __FFLAS__Bnorminc 1 #define __FFLAS__Bnormnext ldb #define __FFLAS__Bdim N #ifdef __FFLAS__LOWER #define __FFLAS__Atriang A + i * nsplit * (lda + 1) #define __FFLAS__Aupdate A + i * nsplit * (lda + 1) + nsplit*__FFLAS__Arowinc #define __FFLAS__Arest A + (__FFLAS__Na - nrestsplit) * (lda + 1) #define __FFLAS__Anormnext __FFLAS__Arowinc #define __FFLAS__Acopnormnext __FFLAS__Acoprowinc #define __FFLAS__Bupdate B + (i+1)*nsplit*ldb #define __FFLAS__Brec B + i * nsplit * ldb #define __FFLAS__Brest B + (M - nrestsplit) * ldb #define __FFLAS__A1 A #define __FFLAS__A2 A + nsplit * __FFLAS__Arowinc #define __FFLAS__A3 A + nsplit * (lda + 1) #define __FFLAS__B1 B #define __FFLAS__B2 B + nsplit * ldb #define __FFLAS__Normdim i #else // __FFLAS__UPPER #define 
__FFLAS__Atriang A + (__FFLAS__Na - (i + 1) * nsplit) * (lda + 1) #define __FFLAS__Aupdate A + (__FFLAS__Na - (i + 1) * nsplit) * __FFLAS__Acolinc #define __FFLAS__Arest A #define __FFLAS__Anormnext lda + 1 #define __FFLAS__Acopnormnext __FFLAS__Na + 1 #define __FFLAS__Bupdate B #define __FFLAS__Brec B + (M - (i + 1) * nsplit) * ldb #define __FFLAS__Brest B #define __FFLAS__A1 A + (__FFLAS__Na - nsplit) * (lda + 1) #define __FFLAS__A2 A + (__FFLAS__Na - nsplit) * __FFLAS__Acolinc #define __FFLAS__A3 A #define __FFLAS__B1 B + (M - nsplit)*ldb #define __FFLAS__B2 B #define __FFLAS__Normdim __FFLAS__Na-i-1 #endif #else // __FFLAS__RIGHT #define __FFLAS__SIDE Right #define __FFLAS__Na N #define __FFLAS__Nb nsplit #ifdef __FFLAS__TRANSPOSE #define __FFLAS__Acopcolinc __FFLAS__Na #define __FFLAS__Acoprowinc 1 #else // __FFLAS__NOTRANSPOSE #define __FFLAS__Acopcolinc 1 #define __FFLAS__Acoprowinc __FFLAS__Na #endif #define __FFLAS__Mb M #define __FFLAS__Mb2 M #define __FFLAS__Nb2 N-nsplit #define __FFLAS__Mbrest M #define __FFLAS__Nbrest nrestsplit #define __FFLAS__Mupdate M #define __FFLAS__Nupdate N - (i + 1) * nsplit #define __FFLAS__Anorminc __FFLAS__Arowinc #define __FFLAS__Acopnorminc __FFLAS__Acoprowinc #define __FFLAS__Bnorminc ldb #define __FFLAS__Bnormnext 1 #define __FFLAS__Bdim M #ifdef __FFLAS__UPPER #define __FFLAS__Atriang A + i * nsplit * (lda + 1) #define __FFLAS__Aupdate A + i * nsplit * (lda + 1) + nsplit * __FFLAS__Acolinc #define __FFLAS__Arest A + (__FFLAS__Na - nrestsplit) * (lda + 1) #define __FFLAS__Anormnext __FFLAS__Acolinc #define __FFLAS__Acopnormnext __FFLAS__Acopcolinc #define __FFLAS__Bupdate B + (i + 1) * nsplit #define __FFLAS__Brec B + i * nsplit #define __FFLAS__Brest B + (N - nrestsplit) #define __FFLAS__A1 A #define __FFLAS__A2 A + nsplit * __FFLAS__Acolinc #define __FFLAS__A3 A + nsplit * (lda + 1) #define __FFLAS__B1 B #define __FFLAS__B2 B + nsplit #define __FFLAS__Normdim i #else // __FFLAS__LOWER #define __FFLAS__Atriang A + 
(__FFLAS__Na - (i + 1) * nsplit) * (lda + 1) #define __FFLAS__Aupdate A + (__FFLAS__Na - (i + 1) * nsplit) * __FFLAS__Arowinc #define __FFLAS__Arest A #define __FFLAS__Anormnext lda + 1 #define __FFLAS__Acopnormnext __FFLAS__Na + 1 #define __FFLAS__Bupdate B #define __FFLAS__Brec B + (N - (i + 1) * nsplit) #define __FFLAS__Brest B #define __FFLAS__A1 A + (__FFLAS__Na - nsplit) * (lda + 1) #define __FFLAS__A2 A + (__FFLAS__Na - nsplit) * __FFLAS__Arowinc #define __FFLAS__A3 A #define __FFLAS__B1 B + (N - nsplit) #define __FFLAS__B2 B #define __FFLAS__Normdim __FFLAS__Na - i -1 #endif #endif #ifdef __FFLAS__UP #define __FFLAS__UPLO Upper #else #define __FFLAS__UPLO Lower #endif #ifdef __FFLAS__UNIT #define __FFLAS__DIAG Unit #else #define __FFLAS__DIAG NonUnit #endif #ifdef __FFLAS__TRANSPOSE #define __FFLAS__TRANS Trans #else #define __FFLAS__TRANS NoTrans #endif #ifdef __FFLAS__DOUBLE #define __FFLAS__ELEMENT double #define __FFLAS__DOMAIN Givaro::DoubleDomain #define __FFLAS__BLAS_PREFIX d #endif #ifdef __FFLAS__FLOAT #define __FFLAS__ELEMENT float #define __FFLAS__DOMAIN Givaro::FloatDomain #define __FFLAS__BLAS_PREFIX s #endif #ifdef __FFLAS__GENERIC #define __FFLAS__ELEMENT Element #endif #ifdef __FFLAS_MULTIPRECISION #define __FFLAS__ELEMENT FFPACK::rns_double_elt #define __FFLAS__DOMAIN FFPACK::RNSInteger #define __FFLAS__BLAS_PREFIX imp #endif #ifndef __FFLAS__GENERIC template <> class Mjoin(ftrsm, Mjoin(__FFLAS__SIDE, Mjoin(__FFLAS__UPLO, Mjoin(__FFLAS__TRANS, __FFLAS__DIAG))))<__FFLAS__ELEMENT>{ public: // TRSM with delayed updates: assumes input in Zp and ensures output in Zp. 
// The multiple MatMul updates (recursive sequence) are done over Z template void delayed (const Field& F, const size_t M, const size_t N, #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr #else //__FFLAS__TRSM_READONLY typename Field::Element_ptr #endif //__FFLAS__TRSM_READONLY A, const size_t lda, typename Field::Element_ptr B, const size_t ldb, const size_t nblas, size_t nbblocsblas, TRSMHelper & H) { //static __FFLAS__DOMAIN D(F); // is this safe ?? __FFLAS__DOMAIN D(F); // is this safe ?? if ( __FFLAS__Na <= nblas ){ freduce (F, M, N, B, ldb); #define __FFLAS__Atrsm A #define __FFLAS__Atrsm_lda lda #ifndef __FFLAS__UNIT #ifdef __FFLAS__TRSM_READONLY //! @warning this is C99 (-Wno-vla) //typename Field::Element Acop[__FFLAS__Na*__FFLAS__Na]; typename Field::Element_ptr Acop = FFLAS::fflas_new(F,__FFLAS__Na,__FFLAS__Na); typename Field::Element_ptr Acopi = Acop; #undef __FFLAS__Atrsm #undef __FFLAS__Atrsm_lda #define __FFLAS__Atrsm Acop #define __FFLAS__Atrsm_lda __FFLAS__Na #endif //__FFLAS__TRSM_READONLY typename Field::Element inv; #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr #else //__FFLAS__TRSM_READONLY typename Field::Element_ptr #endif //__FFLAS__TRSM_READONLY Ai = A; typename Field::Element_ptr Bi = B; #ifdef __FFLAS__LEFT #ifdef __FFLAS__UP Ai += __FFLAS__Acolinc; #ifdef __FFLAS__TRSM_READONLY Acopi += __FFLAS__Acopcolinc; #endif //__FFLAS__TRSM_READONLY #endif //__FFLAS__UP #endif //__FFLAS__LEFT #ifdef __FFLAS__RIGHT #ifdef __FFLAS__LOW Ai += __FFLAS__Arowinc; #ifdef __FFLAS__TRSM_READONLY Acopi += __FFLAS__Acoprowinc; #endif //__FFLAS__TRSM_READONLY #endif //__FFLAS__LOW #endif //__FFLAS__RIGHT for (size_t i = 0; i < __FFLAS__Na; ++i){ #ifdef _FF_DEBUG if ( F.isZero(*(A+i*(lda+1))) ) throw PreconditionFailed(__func__,__FILE__,__LINE__,"Triangular matrix not invertible"); #endif //_FF_DEBUG F.inv (inv, *(A + i * (lda+1))); #ifndef __FFLAS_MULTIPRECISION #ifdef __FFLAS__TRSM_READONLY fscal (F, __FFLAS__Normdim, inv, Ai, 
__FFLAS__Anorminc, Acopi, __FFLAS__Acopnorminc); Acopi += __FFLAS__Acopnormnext; #else //__FFLAS__TRSM_READONLY fscalin (F, __FFLAS__Normdim, inv, Ai, __FFLAS__Anorminc); #endif //__FFLAS__TRSM_READONLY #endif //__FFLAS_MULTIPRECISION FFLAS::fscalin (F, __FFLAS__Bdim, inv, Bi, __FFLAS__Bnorminc); Ai += __FFLAS__Anormnext; Bi += __FFLAS__Bnormnext; } #endif // __FFLAS__UNIT #ifndef __FFLAS_MULTIPRECISION Mjoin(cblas_,Mjoin(__FFLAS__BLAS_PREFIX,trsm)) (CblasRowMajor, Mjoin (Cblas, __FFLAS__SIDE), Mjoin (Cblas, __FFLAS__UPLO), Mjoin (Cblas, __FFLAS__TRANS), CblasUnit, (int)M, (int)N, D.one, __FFLAS__Atrsm, (int)__FFLAS__Atrsm_lda, B, (int)ldb ); freduce (F, M, N, B, ldb); #endif //__FFLAS_MULTIPRECISION #ifndef __FFLAS__UNIT Ai = A; #ifdef __FFLAS__LEFT #ifdef __FFLAS__UP Ai += __FFLAS__Acolinc; #endif //__FFLAS__UP #endif //__FFLAS__LEFT #ifdef __FFLAS__RIGHT #ifdef __FFLAS__LOW Ai += __FFLAS__Arowinc; #endif //__FFLAS__LOW #endif //__FFLAS__RIGHT #ifndef __FFLAS__TRSM_READONLY #ifndef __FFLAS_MULTIPRECISION for (size_t i = 0; i < __FFLAS__Na; ++i){ fscalin( F, __FFLAS__Normdim, *(A + i * (lda+1)) , Ai, __FFLAS__Anorminc); Ai += __FFLAS__Anormnext; } #endif //__FFLAS_MULTIPRECISION #endif //__FFLAS__TRSM_READONLY #ifdef __FFLAS__TRSM_READONLY FFLAS::fflas_delete(Acop); #endif //__FFLAS__TRSM_READONLY #endif // __FFLAS__UNIT } else { // __FFLAS__Na <= nblas size_t nbblocsup = (nbblocsblas + 1) / 2; size_t nsplit = nbblocsup * nblas; this->delayed (F, __FFLAS__Mb, __FFLAS__Nb, __FFLAS__A1, lda, __FFLAS__B1, ldb, nblas, nbblocsup, H); #ifdef __FFLAS__RIGHT fgemm (D, FflasNoTrans, Mjoin (Fflas, __FFLAS__TRANS), __FFLAS__Mb2, __FFLAS__Nb2, nsplit, D.mOne, __FFLAS__B1, ldb, __FFLAS__A2, lda, F.one, __FFLAS__B2, ldb, H.parseq); #else fgemm (D, Mjoin (Fflas, __FFLAS__TRANS), FflasNoTrans, __FFLAS__Mb2, __FFLAS__Nb2, nsplit, D.mOne, __FFLAS__A2, lda, __FFLAS__B1, ldb, F.one, __FFLAS__B2, ldb, H.parseq); #endif //__FFLAS__RIGHT this->delayed (F, __FFLAS__Mb2, __FFLAS__Nb2, 
__FFLAS__A3, lda, __FFLAS__B2, ldb, nblas, nbblocsblas - nbblocsup, H); } } template void operator () (const Field& F, const size_t M, const size_t N, #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr #else typename Field::Element_ptr #endif //__FFLAS__TRSM_READONLY A, const size_t lda, typename Field::Element_ptr B, const size_t ldb, TRSMHelper & H) { #if defined(__FFLAS_MULTIPRECISION) && defined(BENCH_PERF_FTRSM_MP) FFLAS::Timer chrono;chrono.start(); #endif if (!M || !N ) return; //static __FFLAS__DOMAIN D(F); __FFLAS__DOMAIN D(F); size_t nblas = TRSMBound (F); size_t ndel = DotProdBoundClassic (F, F.one); ndel = (ndel / nblas)*nblas; size_t nsplit = ndel; size_t nbblocsplit = (__FFLAS__Na-1) / nsplit; size_t nrestsplit = ((__FFLAS__Na-1) % nsplit) +1; for ( size_t i = 0; i < nbblocsplit; ++i) { this->delayed (F, __FFLAS__Mb, __FFLAS__Nb, __FFLAS__Atriang, lda, __FFLAS__Brec, ldb, nblas, nsplit / nblas, H); #ifdef __FFLAS__RIGHT fgemm (F, FflasNoTrans, Mjoin (Fflas, __FFLAS__TRANS), __FFLAS__Mupdate, __FFLAS__Nupdate, nsplit, F.mOne, __FFLAS__Brec, ldb, __FFLAS__Aupdate, lda, F.one, __FFLAS__Bupdate, ldb, H.parseq); #else fgemm (F, Mjoin (Fflas, __FFLAS__TRANS), FflasNoTrans, __FFLAS__Mupdate, __FFLAS__Nupdate, nsplit, F.mOne, __FFLAS__Aupdate, lda, __FFLAS__Brec, ldb, F.one, __FFLAS__Bupdate, ldb, H.parseq); #endif //__FFLAS__RIGHT } if (nrestsplit) this->delayed (F, __FFLAS__Mbrest, __FFLAS__Nbrest, __FFLAS__Arest, lda, __FFLAS__Brest, ldb, nblas, nrestsplit / nblas, H); #if defined(__FFLAS_MULTIPRECISION) && defined(BENCH_PERF_FTRSM_MP) chrono.stop(); F.t_trsm+=chrono.usertime(); #endif } }; //class ftrsm.... 
#else // __FFLAS__GENERIC template class Mjoin(ftrsm, Mjoin(__FFLAS__SIDE, Mjoin(__FFLAS__UPLO, Mjoin(__FFLAS__TRANS, __FFLAS__DIAG)))) { public: template void operator() (const Field& F, const size_t M, const size_t N, #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr #else typename Field::Element_ptr #endif A, const size_t lda, typename Field::Element_ptr B, const size_t ldb, TRSMHelper & H) { if (__FFLAS__Na == 1){ #ifndef __FFLAS__UNIT typename Field::Element inv; F.init(inv); #ifdef _FF_DEBUG if ( F.isZero(*A) ) throw PreconditionFailed(__func__,__FILE__,__LINE__,"Triangular matrix not invertible"); #endif //_FF_DEBUG F.inv(inv, *A); FFLAS::fscalin(F, __FFLAS__Bdim, inv, B, __FFLAS__Bnorminc); #endif //__FFLAS__UNIT } else { // __FFLAS__Na > 1 size_t nsplit = __FFLAS__Na >> 1; this->operator() (F, __FFLAS__Mb, __FFLAS__Nb, __FFLAS__A1, lda, __FFLAS__B1, ldb, H); #ifdef __FFLAS__RIGHT fgemm (F, FflasNoTrans , Mjoin (Fflas, __FFLAS__TRANS), __FFLAS__Mb2, __FFLAS__Nb2, nsplit, F.mOne, __FFLAS__B1, ldb, __FFLAS__A2, lda, F.one, __FFLAS__B2, ldb, H.parseq); #else //__FFLAS__RIGHT fgemm (F, Mjoin (Fflas, __FFLAS__TRANS), FFLAS::FflasNoTrans, __FFLAS__Mb2, __FFLAS__Nb2, nsplit, F.mOne, __FFLAS__A2, lda, __FFLAS__B1, ldb, F.one, __FFLAS__B2, ldb, H.parseq); #endif //__FFLAS__RIGHT this->operator() (F, __FFLAS__Mb2, __FFLAS__Nb2, __FFLAS__A3, lda, __FFLAS__B2, ldb, H); } } }; #endif // __FFLAS__GENERIC #ifdef __FFLAS__LOWER #undef __FFLAS__LOWER #else #undef __FFLAS__UPPER #endif #undef __FFLAS__UPLO #undef __FFLAS__DIAG #undef __FFLAS__SIDE #undef __FFLAS__TRANS #undef __FFLAS__Na #undef __FFLAS__Mb #undef __FFLAS__Nb #undef __FFLAS__Mbrest #undef __FFLAS__Nbrest #undef __FFLAS__Mupdate #undef __FFLAS__Nupdate #undef __FFLAS__Atriang #undef __FFLAS__Aupdate #undef __FFLAS__Arest #undef __FFLAS__Bupdate #undef __FFLAS__Brec #undef __FFLAS__Brest #undef __FFLAS__Bnorminc #undef __FFLAS__Bnormnext #undef __FFLAS__Anormnext #undef __FFLAS__Acopnormnext #undef 
__FFLAS__Anorminc #undef __FFLAS__Acopnorminc #undef __FFLAS__ELEMENT #undef __FFLAS__BLAS_PREFIX #undef __FFLAS__DOMAIN #undef __FFLAS__A1 #undef __FFLAS__A2 #undef __FFLAS__A3 #undef __FFLAS__B1 #undef __FFLAS__B2 #undef __FFLAS__Nb2 #undef __FFLAS__Mb2 #undef __FFLAS__Bdim #undef __FFLAS__Normdim #undef __FFLAS__Acolinc #undef __FFLAS__Arowinc #undef __FFLAS__Acopcolinc #undef __FFLAS__Acoprowinc #undef __FFLAS__Atrsm_lda #undef __FFLAS__Atrsm #undef Mjoin #undef my_join fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_ftrsv.inl000066400000000000000000000062271274716147400225250ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_ftrsv.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ftrsv_INL #define __FFLASFFPACK_ftrsv_INL namespace FFLAS { //--------------------------------------------------------------------- // ftrsv: TRiangular System solve with vector // Computes X <- op(A^-1).X // size of X is m //--------------------------------------------------------------------- template inline void ftrsv (const Field& F, const FFLAS_UPLO Uplo, const FFLAS_TRANSPOSE TransA, const FFLAS_DIAG Diag, const size_t N,typename Field::ConstElement_ptr A, size_t lda, typename Field::Element_ptr X, int incX) { typename Field::Element_ptr Xi, Xj, Ximax; typename Field::ConstElement_ptr Ai, Aj; if ( Uplo == FflasLower ){ if ( TransA == FflasTrans){ Ai = A+(N-1)*(lda+1); // bottom right entry of A Ximax = Xi = X+(int)(N-1)*incX; for( ; Xi>=X; Ai-=lda+1,Xi-=incX ){ F.negin( *Xi ); for ( Xj = Xi+incX, Aj=Ai+lda; Xj<=Ximax; Xj+=incX, Aj+=lda){ F.axpyin( *Xi, *Xj, *Aj ); } if ( Diag==FflasNonUnit ){ F.divin(*Xi,*Ai); } F.negin( *Xi ); } } // FflasTrans else{ Ai = A; Xi = X; for( ; Xi=X; Xj-=incX, Aj--){ F.axpyin( *Xi, *Xj, *Aj ); } if ( Diag==FflasNonUnit ) F.divin(*Xi,*Ai); F.negin( *Xi ); } } } // FflasLower else{ if ( TransA == FflasTrans){ Ai = A; Xi = X; for( ; Xi=X; Xj-=incX, Aj-=lda){ F.axpyin( *Xi, *Xj, *Aj ); } if ( Diag==FflasNonUnit ) F.divin(*Xi,*Ai); F.negin( *Xi ); } } // FflasTrans else{ Ai = A+(lda+1)*(N-1); Ximax = Xi = X+incX*(int)(N-1); for( ; Xi>=X; Ai-=lda+1,Xi-=incX ){ F.negin( *Xi ); for ( Xj = Xi+incX, Aj=Ai+1; Xj<=Ximax; Xj+=incX, Aj++){ F.axpyin( *Xi, *Xj, *Aj ); } if ( Diag==FflasNonUnit ) F.divin(*Xi,*Ai); F.negin( *Xi ); } } } } } #endif // __FFLASFFPACK_ftrsv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_helpers.inl000066400000000000000000000346251274716147400230260ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Clement Pernet * 
* * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_helpers.inl * @brief Matrix-Matrix Helper class */ #ifndef __FFLASFFPACK_fflas_fflas_mmhelper_INL #define __FFLASFFPACK_fflas_fflas_mmhelper_INL #include "fflas-ffpack/field/field-traits.h" #include "fflas-ffpack/paladin/parallel.h" #include "fflas-ffpack/utils/flimits.h" #include // std::max namespace FFLAS{ namespace Protected{ /** \brief Computes the number of recursive levels to perform. 
* * \param m the common dimension in the product AxB */ template int WinogradSteps (const Field & F, const size_t & m); }//Protected }//FFLAS namespace FFLAS { namespace Protected{ template inline size_t min_types(DFE& k) {return static_cast(k);} #if __FFLASFFPACK_SIZEOF_LONG == 4 template <> inline size_t min_types(double& k) {return static_cast(std::min(k,double(std::numeric_limits::max())));} template <> inline size_t min_types(int64_t& k) {return static_cast(std::min(k,int64_t(std::numeric_limits::max())));} #endif template <> inline size_t min_types(RecInt::rint<6>& k) {return static_cast(uint64_t(std::min(k,RecInt::rint<6>(uint64_t(std::numeric_limits::max())))));} template <> inline size_t min_types(RecInt::rint<7>& k) {return static_cast(uint64_t(std::min(k,RecInt::rint<7>(uint64_t(std::numeric_limits::max())))));} template <> inline size_t min_types(RecInt::rint<8>& k) {return static_cast(uint64_t(std::min(k,RecInt::rint<8>(uint64_t(std::numeric_limits::max())))));} template <> inline size_t min_types(RecInt::rint<9>& k) {return static_cast(uint64_t(std::min(k,RecInt::rint<9>(uint64_t(std::numeric_limits::max())))));} template <> inline size_t min_types(RecInt::rint<10>& k) {return static_cast(uint64_t(std::min(k,RecInt::rint<10>(uint64_t(std::numeric_limits::max())))));} template <> inline size_t min_types(Givaro::Integer& k) {return static_cast(uint64_t(std::min(k,Givaro::Integer(uint64_t(std::numeric_limits::max())))));} template inline bool unfit(T x){return false;} template <> inline bool unfit(int64_t x){return (x>limits::max());} template inline bool unfit(RecInt::rint x){return (x > RecInt::rint(limits>::max()));} template <> inline bool unfit(RecInt::rint<6> x){return (x > limits::max());} } namespace MMHelperAlgo{ struct Auto{}; struct Classic{}; struct Winograd{}; struct WinogradPar{}; struct Bini{}; } template struct AlgoChooser{typedef MMHelperAlgo::Winograd value;}; template struct AlgoChooser, ParSeq>{typedef MMHelperAlgo::Classic value;}; 
template::value, typename ParSeqTrait = ParSeqHelper::Sequential > struct MMHelper; /*! FGEMM Helper for Default and ConvertTo modes of operation */ template struct MMHelper { typedef MMHelper Self_t; int recLevel ; ParSeqTrait parseq; MMHelper(){} MMHelper(const Field& F, size_t m, size_t k, size_t n, ParSeqTrait _PS) : recLevel(-1), parseq(_PS) {} MMHelper(const Field& F, int w, ParSeqTrait _PS=ParSeqTrait()) : recLevel(w), parseq(_PS) {} // copy constructor from other Field and Algo Traits template MMHelper(MMHelper& WH) : recLevel(WH.recLevel), parseq(WH.parseq) {} friend std::ostream& operator<<(std::ostream& out, const Self_t& M) { return out <<"Helper: " <).name()<< ' ' << M.parseq < void igemm_colmajor( size_t rows, size_t cols, size_t depth , const int64_t alpha , const int64_t* A, size_t lda, const int64_t* B, size_t ldb , int64_t* C, size_t ldc ) ; template void igemm_colmajor( size_t rows, size_t cols, size_t depth , const int64_t alpha , const int64_t* A, size_t lda, const int64_t* B, size_t ldb , int64_t* C, size_t ldc ) ; inline void igemm(const enum FFLAS_TRANSPOSE TransA, const enum FFLAS_TRANSPOSE TransB , size_t rows, size_t cols, size_t depth , const int64_t alpha , const int64_t* A, size_t lda, const int64_t* B, size_t ldb , const int64_t beta , int64_t* C, size_t ldc ) ; } // Protected } // FFLAS namespace FFLAS { /* igemm */ inline void igemm_(const enum FFLAS_ORDER Order, const enum FFLAS_TRANSPOSE TransA, const enum FFLAS_TRANSPOSE TransB , const size_t M, const size_t N, const size_t K , const int64_t alpha , const int64_t *A, const size_t lda, const int64_t *B, const size_t ldb , const int64_t beta , int64_t *C, const size_t ldc); } // FFLAS #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #include "igemm.inl" #endif #endif // __FFLASFFPACK_fflas_igemm_igemm_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_igemm/igemm.inl000066400000000000000000000140351274716147400235510ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; 
indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2013,2014 Pascal Giorgi * * Written by Pascal Giorgi * the code is inspired and adapted from the Eigen library * modified by Brice Boyer (briceboyer) * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_igemm_igemm_INL #define __FFLASFFPACK_fflas_igemm_igemm_INL #include "fflas-ffpack/utils/fflas_memory.h" namespace FFLAS { namespace Protected { // Assume matrices A,B,C are stored in column major order template void igemm_colmajor(size_t rows, size_t cols, size_t depth, const int64_t alpha, const int64_t* A, size_t lda, const int64_t* B, size_t ldb, int64_t* C, size_t ldc) { FFLASFFPACK_check(alpha != 0); switch(alpha) { case 1: igemm_colmajor(rows,cols,depth, alpha,A,lda,B,ldb, C,ldc); break; case -1: igemm_colmajor(rows,cols,depth, alpha,A,lda,B,ldb, C,ldc); break; default: igemm_colmajor(rows,cols,depth, alpha,A,lda,B,ldb, C,ldc); } } template void igemm_colmajor(size_t rows, size_t cols, size_t depth, const int64_t alpha, const int64_t* A, size_t lda, const int64_t* B, size_t ldb, int64_t* C, size_t ldc) { using simd = Simd ; size_t mc,kc,nc; mc=rows; nc=cols; kc=depth; FFLAS::details::BlockingFactor(mc,nc,kc); size_t sizeA = mc*kc; size_t sizeB = kc*cols; size_t sizeW = simd::vect_size*kc*_nr; // store data duplicated by the number of elements fitting in vector register // these data must be simd::alignment byte aligned int64_t *blockA, *blockB, *blockW; blockA = fflas_new(sizeA, (Alignment)simd::alignment); blockB = fflas_new(sizeB, (Alignment)simd::alignment); blockW = fflas_new(sizeW, (Alignment)simd::alignment); // For each horizontal panel of B, and corresponding vertical panel of A for(size_t k2=0; k2(blockB, B+k2, ldb, actual_kc, cols); else FFLAS::details::pack_lhs<_nr,true>(blockB, B+k2*ldb, ldb, cols, actual_kc); // For each mc x kc block of the lhs's vertical panel... 
for(size_t i2=0; i2(blockA, A+i2+k2*lda, lda, actual_mc, actual_kc); else FFLAS::details::pack_rhs<_mr,true>(blockA, A+i2*lda+k2, lda, actual_kc, actual_mc); // call block*panel kernel FFLAS::details::igebp(actual_mc, cols, actual_kc , alpha , blockA, actual_kc, blockB, actual_kc , C+i2, ldc , blockW); } } fflas_delete(blockA); fflas_delete(blockB); fflas_delete(blockW); } void igemm( const enum FFLAS_TRANSPOSE TransA, const enum FFLAS_TRANSPOSE TransB, size_t rows, size_t cols, size_t depth , const int64_t alpha , const int64_t* A, size_t lda, const int64_t* B, size_t ldb , const int64_t beta , int64_t* C, size_t ldc ) { if (!rows || !cols) { return ; } //! @todo use primitive (no Field()) and specialise for int64. // CP: fscalin assumes C in row major mode and we are here in col major mode // hence let's transpose the arguments. fscalin(Givaro::ZRing(),cols,rows, beta,C,ldc); if (!depth || alpha == 0) { return ; } if (TransA == FflasNoTrans) { if (TransB == FflasNoTrans) { igemm_colmajor(rows, cols, depth, alpha, A, lda, B, ldb, C, ldc); } else { igemm_colmajor(rows, cols, depth, alpha, A, lda, B, ldb, C, ldc); } } else { if (TransB == FflasNoTrans) { igemm_colmajor(rows, cols, depth, alpha, A, lda, B, ldb, C, ldc); } else { igemm_colmajor(rows, cols, depth, alpha, A, lda, B, ldb, C, ldc); } } } } // Protected } // FFLAS // igemm namespace FFLAS { inline void igemm_(const enum FFLAS_ORDER Order, const enum FFLAS_TRANSPOSE TransA, const enum FFLAS_TRANSPOSE TransB, const size_t M, const size_t N, const size_t K, const int64_t alpha, const int64_t *A, const size_t lda, const int64_t *B, const size_t ldb, const int64_t beta, int64_t *C, const size_t ldc) { if (Order == FflasColMajor) Protected::igemm(TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc); else Protected::igemm(TransB,TransA,N,M,K,alpha,B,ldb,A,lda,beta,C,ldc); } } // FFLAS #endif // __FFLASFFPACK_fflas_igemm_igemm_INL 
fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_igemm/igemm_kernels.h000066400000000000000000000061751274716147400247470ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2013,2014 Pascal Giorgi * * Written by Pascal Giorgi * the code is inspired and adapted from the Eigen library * modified by Brice Boyer (briceboyer) * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_igemm_igemm_kernels_H #define __FFLASFFPACK_fflas_igemm_igemm_kernels_H namespace FFLAS { namespace details { /* ************* */ /* GEBP KERNELS */ /* ************* */ template inline void igebb44(size_t i, size_t j, size_t depth, size_t pdeth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ); template inline void igebb24(size_t i, size_t j, size_t depth, size_t pdeth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ); template inline void igebb14(size_t i, size_t j, size_t depth, size_t pdeth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ); template inline void igebb41(size_t i, size_t j, size_t depth, size_t pdeth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ); template inline void igebb21(size_t i, size_t j, size_t depth, size_t pdeth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ); template inline void igebb11(size_t i, size_t j, size_t depth, size_t pdeth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ); /************************* * MAIN GEBP OPERATION * ************************/ template void igebp( size_t rows, size_t cols, size_t depth , const int64_t alpha , const int64_t* blockA, size_t lda, const int64_t* blockB, size_t ldb, int64_t* C, size_t ldc, int64_t* blockW); } // details } // FFLAS #include "igemm_kernels.inl" // could be .C #endif // __FFLASFFPACK_fflas_igemm_igemm_kernels_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_igemm/igemm_kernels.inl000066400000000000000000000363541274716147400253040ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2013,2014 Pascal Giorgi * * Written by Pascal Giorgi * the code is inspired and adapted from the 
Eigen library * modified by Brice Boyer (briceboyer) * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_igemm_igemm_kernels_INL #define __FFLASFFPACK_fflas_igemm_igemm_kernels_INL #ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS #define _nr 4 #define _mr 8 #define StepA 4 #define StepB 4 #elif defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) #define _nr 4 #define _mr 4 #define StepA 2 #define StepB 2 #else #error "kernels not supported" #endif // __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS #include "fflas-ffpack/utils/fflas_memory.h" #include "igemm_tools.h" /******************************************************** * KERNEL FOR MATMUL USING SIMD OPERATION AND REGISTERS * ********************************************************/ namespace FFLAS { namespace details { /* kernels */ template inline void igebb44(size_t i, size_t j, size_t depth, size_t pdepth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ) { using simd = Simd; using vect_t = typename simd::vect_t; size_t k; vect_t C0,C1,C2,C3,C4,C5,C6,C7; C0 = simd::zero(); C1 = simd::zero(); C2 = simd::zero(); C3 = simd::zero(); C4 = 
simd::zero(); C5 = simd::zero(); C6 = simd::zero(); C7 = simd::zero(); int64_t *r0 = C+j*ldc+i; int64_t *r1 = r0+ldc; int64_t *r2 = r1+ldc; int64_t *r3 = r2+ldc; prefetch(r0+simd::vect_size); prefetch(r1+simd::vect_size); prefetch(r2+simd::vect_size); prefetch(r3+simd::vect_size); // process the loop by (_mrx4) by (4x4) matrix mul for (k=0;k inline void igebb24(size_t i, size_t j, size_t depth, size_t pdepth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ) { using simd = Simd; using vect_t = typename simd::vect_t; //cout<<"aligned 32:"<< int64_t( blA)% 32 < inline void igebb14(size_t i, size_t j, size_t depth, size_t pdepth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ) { // using simd = Simd; // using vect_t = typename simd::vect_t; size_t k; int64_t *r0 = C+j*ldc+i; int64_t *r1 = r0+ldc; int64_t *r2 = r1+ldc; int64_t *r3 = r2+ldc; for(k=0;k inline void igebb41(size_t i, size_t j, size_t depth, size_t pdepth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ) { using simd = Simd; using vect_t = typename simd::vect_t; size_t k; vect_t C0,C4; C0 = simd::zero(); C4 = simd::zero(); int64_t *r0 = C+j*ldc+i; int64_t *r4 = r0+simd::vect_size; // process the loop by (_mrx1) by (1x1) matrix mul for (k=0;k inline void igebb21(size_t i, size_t j, size_t depth, size_t pdepth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ) { using simd = Simd; using vect_t = typename simd::vect_t; size_t k; vect_t C0; C0 = simd::zero(); int64_t *r0 = C+j*ldc+i; // process the loop by (1/2_mrx1) by (1x1) matrix mul for (k=0;k inline void igebb11(size_t i, size_t j, size_t depth, size_t pdepth , const int64_t alpha , const int64_t *blA, const int64_t* blB , int64_t* C, size_t ldc ) { // using simd = Simd; // using vect_t = typename simd::vect_t; size_t k; int64_t *r0 = C+j*ldc+i; for(k=0;k void igebp( size_t rows, size_t cols, size_t 
depth , const int64_t alpha , const int64_t* blockA, size_t lda, const int64_t* blockB, size_t ldb, int64_t* C, size_t ldc, int64_t* blockW) { using simd = Simd; // using vect_t = typename simd::vect_t; size_t i,j; size_t prows,pcols,pdepth; prows=(rows/_mr)*_mr; pcols=(cols/_nr)*_nr; pdepth=(depth/4)*4; // process columns by pack of _nr for(j=0;j(blockW, blockB+j*ldb,depth*_nr); prefetch(blockW); // process rows by pack of _mr for (i=0;i(i, j, depth, pdepth, alpha, blA, blockW, C, ldc); } i=prows; // process the (rows%_mr) remainings rows int rem=(int)(rows-prows); while (rem >0) { if (rem>=(int)simd::vect_size){ igebb24(i ,j,depth, pdepth, alpha , blockA+i*lda, blockW, C, ldc); i+=simd::vect_size; rem-=(int)simd::vect_size; } else{ // use blockB since no vectorization igebb14(i,j,depth, pdepth, alpha, blockA+i*lda, blockB+j*ldb, C, ldc); i++; rem--; } } } // process the (columns%_nr) remaining columns one by one for (;j(blockW, blockB+j*ldb,depth); prefetch(blockW); // process rows by pack of _mr for (i=0;i(i, j, depth, pdepth, alpha, blA, blockW, C, ldc); } i=prows; // process the (rows%_mr) remainings rows int rem=(int)(rows-prows); while (rem >0) { if (rem>=(int)simd::vect_size){ igebb21(i ,j,depth, pdepth, alpha, blockA+i*lda, blockW, C, ldc); i+=simd::vect_size; rem-=(int)(simd::vect_size); } else{ // use blockB since no vectorization igebb11(i,j,depth, pdepth, alpha, blockA+i*lda, blockB+j*ldb, C, ldc); i++; rem--; } } } } } // details } // FFLAS #endif // __FFLASFFPACK_fflas_igemm_igemm_kernels_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_igemm/igemm_tools.h000066400000000000000000000041141274716147400244330ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2013,2014 Pascal Giorgi * * Written by Pascal Giorgi * the code is inspired and adapted from the Eigen library * modified by Brice Boyer (briceboyer) * * 
========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_igemm_igemm_tools_H #define __FFLASFFPACK_fflas_igemm_igemm_tools_H /* ***** */ /* TOOLS */ /* ***** */ namespace FFLAS { namespace details { /* tools */ // duplicate each entry into vector register template inline void duplicate_vect (int64_t* XX, const int64_t* X, size_t n){} template void pack_lhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols); template void pack_rhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols); void gebp(size_t rows, size_t cols, size_t depth,int64_t* C, size_t ldc, const int64_t* blockA, size_t lda, const int64_t* BlockB, size_t ldb, int64_t* BlockW); void BlockingFactor(size_t& m, size_t& n, size_t& k); } // details } // FFLAS #include "igemm_tools.inl" #endif // __FFLASFFPACK_fflas_igemm_igemm_tools_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_igemm/igemm_tools.inl000066400000000000000000000110661274716147400247720ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2013,2014 Pascal Giorgi * * Written by Pascal Giorgi 
* the code is inspired and adapted from the Eigen library * modified by Brice Boyer (briceboyer) * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_igemm_igemm_tools_INL #define __FFLASFFPACK_fflas_igemm_igemm_tools_INL #include "fflas-ffpack/fflas/fflas_simd.h" namespace FFLAS { namespace details { template<> inline void duplicate_vect<2>(int64_t* XX, const int64_t* X, size_t n) { int64_t *p=XX; for(size_t i=0;i inline void duplicate_vect<4>(int64_t* XX, const int64_t* X, size_t n) { int64_t *p=XX; for(size_t i=0;i void pack_rhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols) { size_t cols_by_k=(cols/k)*k; size_t p=0; // pack by k columns for(size_t j=0;j=StepA){ for(size_t i=0;i void pack_lhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols) { using simd = Simd ; size_t p=0; size_t rows_by_k=(rows/k)*k; // pack rows by group of k for(size_t i=0;i=StepA){ for(size_t j=0;j double matrix // Special design for upper-triangular matrices //--------------------------------------------------------------------- template void MatF2MatD_Triangular (const Field& F, Givaro::DoubleDomain::Element_ptr S, const size_t lds, typename 
Field::ConstElement_ptr const E, const size_t lde, const size_t m, const size_t n) { typename Field::ConstElement_ptr Ei = E; Givaro::DoubleDomain::Element_ptr Si = S; size_t i=0, j; for ( ; i float matrix // Special design for upper-triangular matrices //--------------------------------------------------------------------- //! @todo do finit(...,FFLAS_TRANS,FFLAS_DIAG) //! @todo do fconvert(...,FFLAS_TRANS,FFLAS_DIAG) template void MatF2MatFl_Triangular (const Field& F, Givaro::FloatDomain::Element_ptr S, const size_t lds, typename Field::ConstElement_ptr const E, const size_t lde, const size_t m, const size_t n) { typename Field::ConstElement_ptr Ei = E; Givaro::FloatDomain::Element_ptr Si = S; size_t i=0, j; for ( ; i class ftrsmLeftUpperNoTransNonUnit; template class ftrsmLeftUpperNoTransUnit; template class ftrsmLeftUpperTransNonUnit; template class ftrsmLeftUpperTransUnit; template class ftrsmLeftLowerNoTransNonUnit; template class ftrsmLeftLowerNoTransUnit; template class ftrsmLeftLowerTransNonUnit; template class ftrsmLeftLowerTransUnit; template class ftrsmRightUpperNoTransNonUnit; template class ftrsmRightUpperNoTransUnit; template class ftrsmRightUpperTransNonUnit; template class ftrsmRightUpperTransUnit; template class ftrsmRightLowerNoTransNonUnit; template class ftrsmRightLowerNoTransUnit; template class ftrsmRightLowerTransNonUnit; template class ftrsmRightLowerTransUnit; // Specialized routines for ftrmm template class ftrmmLeftUpperNoTransNonUnit; template class ftrmmLeftUpperNoTransUnit; template class ftrmmLeftUpperTransNonUnit; template class ftrmmLeftUpperTransUnit; template class ftrmmLeftLowerNoTransNonUnit; template class ftrmmLeftLowerNoTransUnit; template class ftrmmLeftLowerTransNonUnit; template class ftrmmLeftLowerTransUnit; template class ftrmmRightUpperNoTransNonUnit; template class ftrmmRightUpperNoTransUnit; template class ftrmmRightUpperTransNonUnit; template class ftrmmRightUpperTransUnit; template class 
ftrmmRightLowerNoTransNonUnit; template class ftrmmRightLowerNoTransUnit; template class ftrmmRightLowerTransNonUnit; template class ftrmmRightLowerTransUnit; } // protected } // FFLAS namespace FFLAS { //--------------------------------------------------------------------- // Level 3 routines //--------------------------------------------------------------------- // set by default for ftrsm to be thread safe // undef it at your own risk, and only if you run it in sequential #define __FFLAS__TRSM_READONLY /** @brief ftrsm: TRiangular System solve with Matrix. * Computes \f$ B \gets \alpha \mathrm{op}(A^{-1}) B\f$ or \f$B \gets \alpha B \mathrm{op}(A^{-1})\f$. * \param F field * \param Side if \c Side==FflasLeft then \f$ B \gets \alpha \mathrm{op}(A^{-1}) B\f$ is computed. * \param Uplo if \c Uplo==FflasUpper then \p A is upper triangular * \param TransA if \c TransA==FflasTrans then \f$\mathrm{op}(A)=A^t\f$. * \param Diag if \c Diag==FflasUnit then \p A is unit. * \param M rows of \p B * \param N cols of \p B * @param alpha scalar * \param A triangular invertible matrix. If \c Side==FflasLeft then \p A is \f$N\times N\f$, otherwise \p A is \f$M\times M\f$ * @param lda leading dim of \p A * @param B matrix of size \p MxN * @param ldb leading dim of \p B * @bug \f$\alpha\f$ must be non zero. */ template void ftrsm (const Field& F, const FFLAS_SIDE Side, const FFLAS_UPLO Uplo, const FFLAS_TRANSPOSE TransA, const FFLAS_DIAG Diag, const size_t M, const size_t N, const typename Field::Element alpha, #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr A, #else typename Field::Element_ptr A, #endif const size_t lda, typename Field::Element_ptr B, const size_t ldb); /** @brief ftrmm: TRiangular Matrix Multiply. * Computes \f$ B \gets \alpha \mathrm{op}(A) B\f$ or \f$B \gets \alpha B \mathrm{op}(A)\f$. * @param F field * \param Side if \c Side==FflasLeft then \f$ B \gets \alpha \mathrm{op}(A) B\f$ is computed. 
* \param Uplo if \c Uplo==FflasUpper then \p A is upper triangular * \param TransA if \c TransA==FflasTrans then \f$\mathrm{op}(A)=A^t\f$. * \param Diag if \c Diag==FflasUnit then \p A is implicitly unit. * \param M rows of \p B * \param N cols of \p B * @param alpha scalar * \param A triangular matrix. If \c Side==FflasLeft then \p A is \f$N\times N\f$, otherwise \p A is \f$M\times M\f$ * @param lda leading dim of \p A * @param B matrix of size \p MxN * @param ldb leading dim of \p B */ template void ftrmm (const Field& F, const FFLAS_SIDE Side, const FFLAS_UPLO Uplo, const FFLAS_TRANSPOSE TransA, const FFLAS_DIAG Diag, const size_t M, const size_t N, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr B, const size_t ldb); /** @brief fgemm: Field GEneral Matrix Multiply. * * Computes \f$C = \alpha \mathrm{op}(A) \times \mathrm{op}(B) + \beta C\f$ * Automatically set Winograd recursion level * \param F field. * \param ta if \c ta==FflasTrans then \f$\mathrm{op}(A)=A^t\f$, else \f$\mathrm{op}(A)=A\f$, * \param tb same for matrix \p B * \param m see \p A * \param n see \p B * \param k see \p A * \param alpha scalar * \param beta scalar * \param A \f$\mathrm{op}(A)\f$ is \f$m \times k\f$ * \param B \f$\mathrm{op}(B)\f$ is \f$k \times n\f$ * \param C \f$C\f$ is \f$m \times n\f$ * \param lda leading dimension of \p A * \param ldb leading dimension of \p B * \param ldc leading dimension of \p C * \param w recursive levels of Winograd's algorithm are used. No argument (or -1) does auto computation of \p w. 
* @warning \f$\alpha\f$ \e must be invertible */ template typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc); template typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const ParSeqHelper::Sequential seq); template typename Field::Element_ptr fgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, const ParSeqHelper::Parallel par); template typename Field::Element* pfgemm_1D_rec( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::Element_ptr A, const size_t lda, const typename Field::Element_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element * C, const size_t ldc, size_t seuil); template typename Field::Element* pfgemm_2D_rec( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::Element_ptr A, const size_t lda, const typename Field::Element_ptr B, 
const size_t ldb, const typename Field::Element beta, typename Field::Element * C, const size_t ldc, size_t seuil); template typename Field::Element* pfgemm_3D_rec( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::Element_ptr A, const size_t lda, const typename Field::Element_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, size_t seuil, size_t * x); template typename Field::Element_ptr pfgemm_3D_rec2( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::Element_ptr A, const size_t lda, const typename Field::Element_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, size_t seuil, size_t *x); /** @brief fgemm: Field GEneral Matrix Multiply. * * Computes \f$C = \alpha \mathrm{op}(A) \times \mathrm{op}(B) + \beta C\f$ * Version with Helper. Input and Output are not supposed to be reduced. * \param F field. 
* \param ta if \c ta==FflasTrans then \f$\mathrm{op}(A)=A^t\f$, else \f$\mathrm{op}(A)=A\f$, * \param tb same for matrix \p B * \param m see \p A * \param n see \p B * \param k see \p A * \param alpha scalar * \param beta scalar * \param A \f$\mathrm{op}(A)\f$ is \f$m \times k\f$ * \param B \f$\mathrm{op}(B)\f$ is \f$k \times n\f$ * \param C \f$C\f$ is \f$m \times n\f$ * \param lda leading dimension of \p A * \param ldb leading dimension of \p B * \param ldc leading dimension of \p C * \param H helper, driving the computation (algorithm, delayed modular reduction, switch of base type, etc) * @warning \f$\alpha\f$ \e must be invertible */ // template // inline typename Field::Element_ptr // fgemm (const Field& F, // const FFLAS_TRANSPOSE ta, // const FFLAS_TRANSPOSE tb, // const size_t m, const size_t n, const size_t k, // const typename Field::Element alpha, // typename Field::Element_ptr A, const size_t lda, // typename Field::Element_ptr B, const size_t ldb, // const typename Field::Element beta, // typename Field::Element_ptr C, const size_t ldc, // MMHelper & H); } // FFLAS #include "fflas-ffpack/paladin/parallel.h" namespace FFLAS { /** @brief fsquare: Squares a matrix. * compute \f$ C \gets \alpha \mathrm{op}(A) \mathrm{op}(A) + \beta C\f$ over a Field \p F * Avoid the conversion of B * @param ta if \c ta==FflasTrans, \f$\mathrm{op}(A)=A^T\f$. 
* @param F field * @param n size of \p A * @param alpha scalar * @param beta scalar * @param A dense matrix of size \c nxn * @param lda leading dimension of \p A * @param C dense matrix of size \c nxn * @param ldc leading dimension of \p C */ template typename Field::Element_ptr fsquare (const Field& F, const FFLAS_TRANSPOSE ta, const size_t n, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc); } // FFLAS #endif // __FFLASFFPACK_fflas_fflas_level3_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_pfgemm.inl000066400000000000000000000070441274716147400226320ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_pfgemm.inl * Copyright (C) 2013 Jean Guillaume Dumas Clement Pernet Ziad Sultan * * Written by Jean Guillaume Dumas Clement Pernet Ziad Sultan * Time-stamp: <27 Nov 15 14:07:46 Jean-Guillaume.Dumas@imag.fr> * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_pfgemm_INL #define __FFLASFFPACK_fflas_pfgemm_INL #define __FFLASFFPACK_SEQPARTHRESHOLD 220 #define __FFLASFFPACK_DIMKPENALTY 1 #ifdef __FFLASFFPACK_USE_KAAPI #include #include "fflas-ffpack/fflas/kaapi_routines.inl" #endif #ifdef __FFLASFFPACK_USE_OPENMP #include #endif #include "fflas-ffpack/paladin/blockcuts.inl" #include "fflas-ffpack/paladin/parallel.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/paladin/pfgemm_variants.inl" namespace FFLAS { template inline typename std::enable_if >::value,typename Field::Element_ptr>::type fgemm( const Field& F, const FFLAS::FFLAS_TRANSPOSE ta, const FFLAS::FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper > & H) { return pfgemm (F, ta, tb, m, n, k ,alpha, A, lda, B, ldb, beta, C, ldc, H); } // template // inline typename std::enable_if >::value,typename Field::Element_ptr>::type // fgemm( const Field& F, // const FFLAS::FFLAS_TRANSPOSE ta, // const FFLAS::FFLAS_TRANSPOSE tb, // const size_t m, // const size_t n, // const size_t k, // const typename Field::Element alpha, // typename Field::ConstElement_ptr A, const size_t lda, // typename Field::ConstElement_ptr B, const size_t ldb, // const typename Field::Element beta, // typename Field::Element_ptr C, const size_t ldc, // MMHelper > & H) // { // std::cerr<<"coucou"<s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_pftrsm.inl * Copyright (C) 2013 Ziad Sultan * * Written by Ziad Sultan < Ziad.Sultan@imag.fr > * Time-stamp: <18 Dec 15 16:09:24 Jean-Guillaume.Dumas@imag.fr> * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_pftrsm_INL #define __FFLASFFPACK_fflas_pftrsm_INL #define PTRSM_HYBRID_THRESHOLD 256 #include "fflas-ffpack/paladin/parallel.h" namespace FFLAS { template inline typename Field::Element_ptr ftrsm( const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_UPLO UpLo, const FFLAS::FFLAS_TRANSPOSE TA, const FFLAS::FFLAS_DIAG Diag, const size_t m, const size_t n, const typename Field::Element alpha, #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr #else typename Field::Element_ptr #endif A, const size_t lda, typename Field::Element_ptr B, const size_t ldb, TRSMHelper > & H) // const FFLAS::CuttingStrategy method, // const size_t numThreads) { typedef TRSMHelper seqRecHelper; SYNCH_GROUP( seqRecHelper SeqH(H); if(Side == FflasRight){ FORBLOCK1D(iter, m, H.parseq, TASK(MODE(READ(A[0]) CONSTREFERENCE(F, A, B, SeqH,H) READWRITE(B[iter.begin()*ldb])), ftrsm( F, Side, UpLo, TA, Diag, iter.end()-iter.begin(), n, alpha, A, lda, B + iter.begin()*ldb, ldb, SeqH)); ); } else { FORBLOCK1D(iter, n, H.parseq, // seqRecHelper SeqH(H); TASK(MODE(READ(A[0]) CONSTREFERENCE(F, A, B, SeqH,H) READWRITE(B[iter.begin()])), ftrsm(F, Side, UpLo, TA, Diag, m, iter.end()-iter.begin(), alpha, A , lda, B + 
iter.begin(), ldb, SeqH)); ); } ); return B; } template inline typename Field::Element_ptr ftrsm( const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_UPLO UpLo, const FFLAS::FFLAS_TRANSPOSE TA, const FFLAS::FFLAS_DIAG Diag, const size_t m, const size_t n, const typename Field::Element alpha, #ifdef __FFLAS__TRSM_READONLY typename Field::ConstElement_ptr #else typename Field::Element_ptr #endif A, const size_t lda, typename Field::Element_ptr B, const size_t ldb, TRSMHelper > & H) // const FFLAS::CuttingStrategy method, // const size_t numThreads) { if(Side == FflasRight){ size_t nt = H.parseq.numthreads(); size_t nt_it,nt_rec; if (m/PTRSM_HYBRID_THRESHOLD < nt){ nt_it = (int)ceil(double(m)/PTRSM_HYBRID_THRESHOLD); nt_rec = (int)ceil(double(nt)/double(nt_it)); } else { nt_it = nt; nt_rec = 1;} // ForStrategy1D iter(m, ParSeqHelper::Parallel((size_t)nt_it,H.parseq.method)); // for (iter.begin(); ! iter.end(); ++iter) { // SYNCH_GROUP(H.parseq.numthreads(), SYNCH_GROUP( ParSeqHelper::Parallel psh(nt_rec); TRSMHelper > SeqH (psh); H.parseq.set_numthreads(nt_it); FORBLOCK1D(iter, m, H.parseq, // std::cerr<<"trsm_rec nt = "<= n){ nt_it>>=1; nt_rec<<=1; while(nt_it*PTRSM_HYBRID_THRESHOLD >= n){ nt_it>>=1; nt_rec<<=1; } nt_it<<=1; nt_rec>>=1; } // if ((int)n/PTRSM_HYBRID_THRESHOLD < nt){ // nt_it = std::min(nt,(int)ceil(double(n)/PTRSM_HYBRID_THRESHOLD)); // nt_rec = ceil(double(nt)/nt_it); // } else { nt_it = nt; nt_rec = 1;} // ForStrategy1D iter(n, ParSeqHelper::Parallel((size_t)nt_it,H.parseq.method)); // for (iter.begin(); ! 
iter.end(); ++iter) { // std::cerr<<"trsm_rec nt_it = "< #include #include #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/utils/debug.h" #if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) #define INLINE __attribute__((always_inline)) inline #else #define INLINE inline #endif #if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) #define CONST __attribute__((const)) #else #define CONST #endif #if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) #define PURE __attribute__((pure)) #else #define PURE #endif #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS namespace std { // Why? - A.B. 2015-04-30 inline std::ostream &operator<<(std::ostream &o, const __m128 &v) { const float *vArray = (const float *)(&v); o << '<'; o << vArray[0] << ',' << vArray[1]; o << ','; o << vArray[2] << ',' << vArray[3]; o << '>'; return o; } inline std::ostream &operator<<(std::ostream &o, const __m128i &v) { const int64_t *vArray = (const int64_t *)(&v); o << '<'; o << vArray[0] << ',' << vArray[1]; o << '>'; return o; } inline std::ostream &operator<<(std::ostream &o, const __m128d &v) { const double *vArray = (const double *)(&v); o << '<'; o << vArray[0] << ',' << vArray[1]; o << '>'; return o; } } // std #ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS namespace std { inline std::ostream &operator<<(std::ostream &o, const __m256 &v) { const float *vArray = (const float *)(&v); o << '<'; o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3]; o << ','; o << vArray[4] << ',' << vArray[5] << ',' << vArray[6] << ',' << vArray[7]; o << '>'; return o; } inline std::ostream &operator<<(std::ostream &o, const __m256i &v) { const int64_t *vArray = (const int64_t *)(&v); o << '<'; o << vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3]; o << '>'; return o; } inline std::ostream &operator<<(std::ostream &o, const __m256d &v) { const double *vArray = (const double *)(&v); o << '<'; o << 
vArray[0] << ',' << vArray[1] << ',' << vArray[2] << ',' << vArray[3]; o << '>'; return o; } } // std #endif // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS namespace FFLAS { template struct support_simd : public std::false_type {}; #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONSUSE_SIMD) template <> struct support_simd : public std::true_type {}; template <> struct support_simd : public std::true_type {}; #ifdef SIMD_INT template <> struct support_simd : public std::true_type {}; template <> struct support_simd : public std::true_type {}; template <> struct support_simd : public std::true_type {}; #endif #endif } // FFLAS #define NORML_MOD(C, P, NEGP, MIN, MAX, Q, T) \ { \ Q = greater(C, MAX); \ T = lesser(C, MIN); \ Q = vand(Q, NEGP); \ T = vand(T, P); \ Q = vor(Q, T); \ C = add(C, Q); \ } #define FLOAT_MOD(C, P, INVP, Q) \ { \ Q = mul(C, INVP); \ Q = floor(Q); \ C = fnmadd(C, Q, P); \ } // to activate SIMD with integers //#define SIMD_INT template struct simdToType; /* * is_simd trait */ template struct is_simd { static const constexpr bool value = false; using type = std::integral_constant; }; // SSE #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) // SSE or better #include "fflas-ffpack/fflas/fflas_simd/simd128.inl" template <> struct simdToType<__m128d> { using type = double; }; template <> struct simdToType<__m128> { using type = float; }; template <> struct is_simd<__m128d> { static const constexpr bool value = true; using type = std::integral_constant; }; template <> struct is_simd<__m128> { static const constexpr bool value = true; using type = std::integral_constant; }; #ifdef SIMD_INT template <> struct is_simd<__m128i> { static const constexpr bool value = true; using type = std::integral_constant; }; #endif #endif // SSE // AVX #if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) #include "fflas-ffpack/fflas/fflas_simd/simd256.inl" template <> struct 
simdToType<__m256d> { using type = double; }; template <> struct simdToType<__m256> { using type = float; }; template <> struct is_simd<__m256d> { static const constexpr bool value = true; using type = std::integral_constant; }; template <> struct is_simd<__m256> { static const constexpr bool value = true; using type = std::integral_constant; }; #ifdef SIMD_INT template <> struct is_simd<__m256i> { static const constexpr bool value = true; using type = std::integral_constant; }; #endif #endif // AVX /* * Simd functors */ template struct NoSimd { /* * alias to 128 bit simd register */ using vect_t = T*; /* * define the scalar type corresponding to the specialization */ using scalar_t = T; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 1; // Test if the pointer p is multiple of alignment template static constexpr bool valid(TT p) { return false; } // Test if n is multiple of vect_size template static constexpr bool compliant(TT n) { return false; } }; // #if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) template ::value, bool = std::is_integral::value> struct SimdChooser {}; template struct SimdChooser { using value = NoSimd; }; template struct SimdChooser // floating number { #ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS using value = Simd256; #elif defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) using value = Simd128; #else using value = NoSimd; #endif }; template struct SimdChooser // integral number { #ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS using value = Simd256; #elif __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS using value = Simd128; #else using value = NoSimd; #endif }; template using Simd = typename SimdChooser::value; // template struct SimdChooser { // #if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) // typedef Simd256 value; // #else // typedef Simd128 value; // #endif // __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS // }; // #elif defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) // not AVX // template using Simd = Simd128; // 
#endif // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) // SSE or better // template struct floating_simd; // template <> struct floating_simd { typedef Simd value; }; // template <> struct floating_simd { typedef Simd value; }; // template <> struct floating_simd { // #if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) // // typedef Simd256 value; // #else // typedef Simd128 value; // #endif // }; #endif #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS namespace FFLAS { /* print helper */ // need friend ? template inline std::ostream &print(std::ostream &os, const typename simdT::vect_t &P) { typename simdT::scalar_t p[simdT::vect_size]; os << '<'; simdT::storeu(p, P); for (size_t i = 0; i < simdT::vect_size; ++i) { os << p[i]; if (i < simdT::vect_size - 1) os << '|'; } os << '>'; return os; } } // FFLAS namespace std { // cannot be instanciated, T is not deductible template inline std::ostream &operator<<(std::ostream &o, const typename Simd128::vect_t &v) { FFLAS::print>(o, v); return o; } } // std #ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS namespace std { // cannot be instanciated, T is not deductible template inline std::ostream &operator<<(std::ostream &o, const typename Simd256::vect_t &v) { FFLAS::print(o, v); return o; } } #endif // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // Provide simd modular support #include #undef INLINE #undef PURE #undef CONST #undef SIMD_INT #endif /* __FFLASFFPACK_utils_simd_H */ fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/000077500000000000000000000000001274716147400216025ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/Makefile.am000066400000000000000000000025711274716147400236430ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_simd EXTRA_DIST=simd.doxy SIMD128= simd128.inl \ simd128_double.inl \ simd128_float.inl \ simd128_int16.inl \ simd128_int32.inl \ simd128_int64.inl SIMD256= simd256.inl \ simd256_double.inl \ simd256_float.inl \ simd256_int16.inl \ simd256_int32.inl \ simd256_int64.inl SIMD_MOD= simd_modular.inl pkgincludesub_HEADERS= \ $(SIMD128) \ $(SIMD256)\ $(SIMD_MOD) fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd.doxy000066400000000000000000000021441274716147400234440ustar00rootroot00000000000000// Copyright (c) 2014 FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part of the library FFLAS-FFPACK. // // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** \ingroup fflas-ffpack * \defgroup simd SIMD wrapper * * \brief wraps SIMD functions * Supporst SSE4.1, AVX, AVX2. * * @todo biblio * */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd128.inl000066400000000000000000000071441274716147400235030ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd128_INL struct Simd128i_base { /* * alias to 128 bit simd register */ using vect_t = __m128i; /* * Return vector of type vect_t with all elements set to zero * Return [0, ...,0] */ static INLINE CONST vect_t zero() { return _mm_setzero_si128(); } /* * Shift packed 128-bit integers in a left by s bits while shifting in zeros, and store the results in vect_t. * Args : [a0] int128_t * Return : [a0 << (s*8)] int128_t */ template static INLINE CONST vect_t sll128(const vect_t a) { return _mm_slli_si128(a, s); } /* * Shift packed 128-bit integers in a right by s while shifting in zeros, and store the results in vect_t. * Args : [a0] int128_t * Return : [a0 >> (s*8)] int128_t */ template static INLINE CONST vect_t srl128(const vect_t a) { return _mm_srli_si128(a, s); } /* * Compute the bitwise AND and store the results in vect_t. * Args : [a0, ..., a127] * [b0, ..., b127] * Return : [a0 AND b0, ..., a127 AND b127] */ static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_si128(b, a); } /* * Compute the bitwise OR and store the results in vect_t. * Args : [a0, ..., a127] * [b0, ..., b127] * Return : [a0 OR b0, ..., a127 OR b127] */ static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_si128(b, a); } /* * Compute the bitwise XOR and store the results in vect_t. * Args : [a0, ..., a127] * [b0, ..., b127] * Return : [a0 XOR b0, ..., a127 XOR b127] */ static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_si128(b, a); } /* * Compute the bitwise AND NOT and store the results in vect_t. 
* Args : [a0, ..., a127] * [b0, ..., b127] * Return : [a0 AND (NOT b0), ..., a127 AND (NOT b127)] */ static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_si128(b, a); } }; template struct Simd128_impl; template using Simd128 = Simd128_impl::value, std::is_integral::value, std::is_signed::value, sizeof(T)>; #include "simd128_float.inl" #include "simd128_double.inl" #ifdef SIMD_INT // Trop d'instructions SSE manquantes pour les int8_t #include "simd128_int16.inl" #include "simd128_int32.inl" #include "simd128_int64.inl" #endif //#ifdef SIMD_INT #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd128_double.inl000066400000000000000000000360141274716147400250330ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_double_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd128_double_INL /* * Simd128 specialized for double */ template <> struct Simd128_impl { #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) /* * alias to 128 bit simd register */ using vect_t = __m128d; /* * define the scalar type corresponding to the specialization */ using scalar_t = double; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 2; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 16; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Return vector of type vect_t with all elements set to zero. * Return [0,0] */ static INLINE CONST vect_t zero() { return _mm_setzero_pd(); } /* * Broadcast double-precision (64-bit) floating-point value a to all elements of vect_t. * Return [x,x] */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_pd(x); } /* * Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values. * Return [x1,x2] */ static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2) { return _mm_set_pd(x2, x1); } /* * Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in * vect_t. * Return [p[idx[0]], p[idx[1]]] */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return _mm_set_pd(p[idx[1]], p[idx[0]]); } /* * Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into vect_t. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. 
* Return [p[0], p[1]] */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_pd(p); } /* * Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into vect_t. * p does not need to be aligned on any particular boundary. * Return [p[0], p[1]] */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_pd(p); } /* * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from p into memory. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. */ static INLINE void store(const scalar_t *p, const vect_t v) { _mm_store_pd(const_cast(p), v); } /* * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from p into memory. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. */ static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm_storeu_pd(const_cast(p), v); } /* * Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory using * a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_pd(const_cast(p), v); } /* * Shuffle double-precision (64-bit) floating-point elements using the control in s, * and store the results in dst. * Args : [a0, a1] double * Return : [a[s[0]], a[s[1]]] double */ #if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) template static INLINE CONST vect_t shuffle(const vect_t a) { return _mm_permute_pd(a, s); } #endif /* * Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst. 
* Args : [a0, a1] double [b0, b1] double * Return : [a0, b0] double */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_pd(a, b); } /* * Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst. * Args : [a0, a1] double [b0, b1] double * Return : [a1, b1] double */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_pd(a, b); } /* * Blend packed double-precision (64-bit) floating-point elements from a and b using control mask s, * and store the results in dst. * Args : [a0, a1] double [b0, b1] double * Return : [s[0]?a0:b0, s[1]?a1:b1] double */ template static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { return _mm_blend_pd(a, b, s); } /* * Blend packed double-precision (64-bit) floating-point elements from a and b using mask, * and store the results in dst. * Args : [a0, a1] double [b0, b1] double * Return : [mask[63]?a0:b0, mask[127]?a1:b1] double */ static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) { return _mm_blendv_pd(a, b, mask); } /* * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [a0+b0, a1+b1] */ static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_pd(a, b); } static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); } /* * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) * floating-point elements in a, and store the results in vect_t. 
* Args : [a0, a1], [b0, b1] * Return : [a0-b0, a1-b1] */ static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_pd(a, b); } static INLINE CONST vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [a0*b0, a1*b1] */ static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm_mul_pd(a, b); } static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); } /* * Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, * and store the results in dst. * Args : [a0, a1], [b0, b1] * Return : [a0/b0, a1/b1] */ static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm_div_pd(a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1], [b0, b1], [c0, c1] * Return : [a0*b0+c0, a1*b1+c1] */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm_fmadd_pd(a, b, c); #else return add(c, mul(a, b)); #endif } static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result * to packed elements in c, and store the results in vect_t. * Args : [a0, a1], [b0, b1], [c0, c1] * Return : [-(a0*b0)+c0, -(a1*b1)+c1] */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm_fnmadd_pd(a, b, c); #else return sub(c, mul(a, b)); #endif } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result * to packed elements in c, and store the results in vect_t. 
* Args : [a0, a1], [b0, b1], [c0, c1] * Return : [-(a0*b0)+c0, -(a1*b1)+c1] */ static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); } static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. * Args : [a0, a1], [b0, b1], [c0, c1] * Return : [a0*b0-c0, a1*b1-c1] */ static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm_fmsub_pd(a, b, c); #else return sub(mul(a, b), c); #endif } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. * Args : [a0, a1], [b0, b1], [c0, c1] * Return : [a0*b0-c0, a1*b1-c1] */ static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); } static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } /* * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0] */ static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_pd(a, b); } /* * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser-than, and store the results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [(a0b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0] */ static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_pd(a, b); } /* * Compare packed double-precision (64-bit) floating-point elements in a and b for greater or equal than, and store the results in vect_t. 
* Args : [a0, a1], [b0, b1] * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0] */ static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm_cmpge_pd(a, b); } /* * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the * results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [a0 AND b0, a1 AND b1] */ static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_pd(a, b); } /* * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the * results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [a0 OR b0, a1 OR b1] */ static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_pd(a, b); } /* * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the * results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [a0 XOR b0, a1 XOR b1] */ static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_pd(a, b); } /* * Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the * results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [a0 AND NOT b0, a1 AND NOT b1] */ static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_pd(a, b); } /* * Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the * results as packed double-precision floating-point elements in vect_t. * Args : [a0, a1] * Return : [floor(a0), floor(a1)] */ static INLINE CONST vect_t floor(const vect_t a) { return _mm_floor_pd(a); } /* * Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the * results as packed double-precision floating-point elements in vect_t. 
* Args : [a0, a1] * Return : [ceil(a0), ceil(a1)] */ static INLINE CONST vect_t ceil(const vect_t a) { return _mm_ceil_pd(a); } /* * Round the packed double-precision (64-bit) floating-point elements in a, and store the results as packed * double-precision floating-point elements in vect_t. * Args : [a0, a1] * Return : [round(a0), round(a1)] */ static INLINE CONST vect_t round(const vect_t a) { return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } /* * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the * results in vect_t. * Args : [a0, a1], [b0, b1] * Return : [a0+a1, b0+b1] */ static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm_hadd_pd(a, b); } /* * Horizontally add double-precision (64-bit) floating-point elements in a. * Args : [a0, a1] * Return : a0+a1 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1]; } static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { FLOAT_MOD(C, P, INVP, Q); NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } #else // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS #error "You need SSE instructions to perform 128bits operations on double" #endif }; #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_double_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd128_float.inl000066400000000000000000000405241274716147400246670ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_float_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd128_float_INL /* * Simd128 specialized for float */ template <> struct Simd128_impl { #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) /* * alias to 128 bit simd register */ using vect_t = __m128; /* * define the scalar type corresponding to the specialization */ using scalar_t = float; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 4; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 16; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Return vector of type vect_t with all elements set to zero * Return [0,0,0,0] */ static INLINE CONST vect_t zero() { return _mm_setzero_ps(); } /* * Broadcast single-precision (32-bit) floating-point value x to all elements of vect_t. 
* Return [x,x,x,x] */ static INLINE CONST vect_t set1(const scalar_t x) { #ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS // return _mm_broadcast_ss(&x); return _mm_set1_ps(x); #else return _mm_set1_ps(x); #endif } /* * Set packed single-precision (32-bit) floating-point elements in vect_t with the supplied values. * Return [x1,x2,x3,x4] */ static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) { return _mm_set_ps(x4, x3, x2, x1); } /* * Gather single-precision (32-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in * vect_t. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return _mm_set_ps(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]); } /* * Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into vect_t. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. * Return [p[0], p[1], p[2], p[3]] */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_ps(p); } /* * Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into vect_t. * p does not need to be aligned on any particular boundary. * Return [p[0], p[1], p[2], p[3]] */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_ps(p); } /* * Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. */ static INLINE void store(const scalar_t *p, const vect_t v) { _mm_store_ps(const_cast(p), v); } /* * Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory. * p does not need to be aligned on any particular boundary. 
*/ static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm_storeu_ps(const_cast(p), v); } /* * Store 128-bits (composed of 4 packed double-precision (32-bit) floating-point elements) from a into memory using * a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(const scalar_t *p, const vect_t v) { _mm_stream_ps(const_cast(p), v); } /* * Shuffle single-precision (32-bit) floating-point elements in a using the control in s, * and store the results in dst. * Args : [a0, a1, a2, a3] float * Return : [a[s[0..1]], ..., a[s[6..7]] float */ #if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) template static INLINE CONST vect_t shuffle(const vect_t a) { return _mm_permute_ps(a, s); } #endif /* * Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst. * Args : [a0, a1, a2, a3] float [b0, b1, b2, b3] float * Return : [a0, b0, a1, b1] float */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_ps(a, b); } /* * Unpack and interleave single-precision (32-bit) floating-point elements from the high half a and b, and store the results in dst. * Args : [a0, a1, a2, a3] float [b0, b1, b2, b3] float * Return : [a2, b2, a3, b3] float */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_ps(a, b); } /* * Blend packed single-precision (32-bit) floating-point elements from a and b using control mask s, * and store the results in dst. * Args : [a0, a1, a2, a3] float [b0, b1, b2, b3] float * Return : [s[0]?a0:b0, , s[3]?a3:b3] float */ template static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { return _mm_blend_ps(a, b, s); } /* * Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst. * and store the results in dst. 
* Args   : [a0, a1, a2, a3] float
     *          [b0, b1, b2, b3] float
     * Return : [mask[31]?b0:a0, ..., mask[127]?b3:a3] float
     *          (_mm_blendv_ps takes the lane of b where the sign bit of the mask lane is set)
     */
    static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) {
        return _mm_blendv_ps(a, b, mask);
    }

    /*
     * Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0+b0, a1+b1, a2+b2, a3+b3]
     */
    static INLINE CONST vect_t add(const vect_t a, const vect_t b) {
        return _mm_add_ps(a, b);
    }

    /*
     * In-place add: a <- a+b. Returns the updated value of a.
     */
    static INLINE vect_t addin(vect_t &a, const vect_t b) {
        return a = add(a, b);
    }

    /*
     * Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit)
     * floating-point elements in a, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0-b0, a1-b1, a2-b2, a3-b3]
     */
    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) {
        return _mm_sub_ps(a, b);
    }

    /*
     * In-place subtract: a <- a-b. Returns the updated value of a.
     * Not tagged CONST: the function writes through the reference a, exactly like addin
     * (CONST is presumably a "no side effects" function attribute -- confirm in simd.h).
     */
    static INLINE vect_t subin(vect_t &a, const vect_t b) {
        return a = sub(a, b);
    }

    /*
     * Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0*b0, a1*b1, a2*b2, a3*b3]
     */
    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) {
        return _mm_mul_ps(a, b);
    }

    /*
     * In-place multiply: a <- a*b. Returns the updated value of a.
     * Not tagged CONST because it writes through the reference a (see subin).
     */
    static INLINE vect_t mulin(vect_t &a, const vect_t b) {
        return a = mul(a, b);
    }

    /*
     * Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b,
     * and store the results in dst.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3] float
     * Return : [a0/b0, a1/b1, a2/b2, a3/b3] float
     */
    static INLINE CONST vect_t div(const vect_t a, const vect_t b) {
        return _mm_div_ps(a, b);
    }

    /*
     * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to
     * packed elements in c, and store the results in vect_t.
* Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3] */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm_fmadd_ps(a, b, c); #else return add(c, mul(a, b)); #endif } static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4] */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm_fnmadd_ps(a, b, c); #else return sub(c, mul(a, b)); #endif } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4] */ static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); } static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. 
* Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3] */ static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm_fmsub_ps(a, b, c); #else return sub(mul(a, b), c); #endif } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3] */ static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); } static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } /* * Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] * Return : [(a0==b0) ? 0xFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFF : 0, (a2==b2) ? 0xFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFF : 0] */ static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_ps(a, b); } /* * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser-than, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] * Return : [(a0b0) ? 0xFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFF : 0, (a2>b2) ? 0xFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFF : 0] */ static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_ps(a, b); } /* * Compare packed single-precision (32-bit) floating-point elements in a and b for greater or equal than, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3] * Return : [(a0>=b0) ? 0xFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFF : 0, (a2>=b2) ? 0xFFFFFFFF : 0, (a3>=b3) ? 
0xFFFFFFFF : 0]
     */
    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm_cmpge_ps(a, b); }

    /*
     * Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the
     * results in vect_t. The operation acts on the raw bit patterns of the lanes.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
     */
    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm_and_ps(a, b); }

    /*
     * Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
     */
    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm_or_ps(a, b); }

    /*
     * Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
     */
    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm_xor_ps(a, b); }

    /*
     * Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store
     * the results in vect_t. As with the underlying _mm_andnot_ps, it is the FIRST operand that is complemented.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(NOT a0) AND b0, (NOT a1) AND b1, (NOT a2) AND b2, (NOT a3) AND b3]
     */
    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm_andnot_ps(a, b); }

    /*
     * Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the
     * results as packed single-precision floating-point elements in vect_t.
* Args   : [a0, a1, a2, a3]
     * Return : [floor(a0), floor(a1), floor(a2), floor(a3)]
     */
    static INLINE CONST vect_t floor(const vect_t a) { return _mm_floor_ps(a); }

    /*
     * Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the
     * results as packed single-precision floating-point elements in vect_t.
     * Args   : [a0, a1, a2, a3]
     * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)]
     */
    static INLINE CONST vect_t ceil(const vect_t a) { return _mm_ceil_ps(a); }

    /*
     * Round the packed single-precision (32-bit) floating-point elements in a to the nearest integer value
     * (_MM_FROUND_TO_NEAREST_INT) with floating-point exceptions suppressed (_MM_FROUND_NO_EXC), and store the
     * results as packed single-precision floating-point elements in vect_t.
     * Args   : [a0, a1, a2, a3]
     * Return : [round(a0), round(a1), round(a2), round(a3)]
     */
    static INLINE CONST vect_t round(const vect_t a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); }

    /*
     * Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the
     * results in vect_t. With the 128-bit _mm_hadd_ps both sums of a come first, followed by both sums of b.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0+a1, a2+a3, b0+b1, b2+b3]
     */
    static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm_hadd_ps(a, b); }

    /*
     * Horizontally add single-precision (32-bit) floating-point elements in a.
* Args : [a0, a1, a2, a3] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] + ((const scalar_t *)&a)[3]; } static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { FLOAT_MOD(C, P, INVP, Q); NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } #else // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS #error "You need SSE instructions to perform 128bits operations on double" #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS }; #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_float_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd128_int16.inl000066400000000000000000000525761274716147400245350ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * Romain Lebreton * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_int16_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd128_int16_INL #ifndef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #error "You need SSE instructions to perform 128 bits operations on int16" #endif /* * Simd128 specialized for int16_t */ template <> struct Simd128_impl : public Simd128i_base { /* * alias to 128 bit simd register */ using vect_t = __m128i; /* * define the scalar type corresponding to the specialization */ using scalar_t = int16_t; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 8; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 16; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 16-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x,x,x,x,x] int16_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi16(x); } /* * Set packed 16-bit integers in dst with the supplied values. * Return [x0, ..., x7] int16_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) { return _mm_set_epi16(x7, x6, x5, x4, x3, x2, x1, x0); } /* * Gather 16-bit integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t. 
* Return [p[idx[0]], ..., p[idx[7]]] int16_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]); } /* * Load 128-bits of integer data from memory into dst. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. * Return [p[0], ..., p[7]] int16_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_si128(reinterpret_cast(p)); } /* * Load 128-bits of integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0], ..., p[7]] int16_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_si128(reinterpret_cast(p)); } /* * Store 128-bits of integer data from a into memory. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm_store_si128(reinterpret_cast(p), v); } /* * Store 128-bits of integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm_storeu_si128(reinterpret_cast(p), v); } /* * Store 128-bits of integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm_stream_si128(reinterpret_cast(p), v); } /* * Shift packed 16-bit integers in a left by s while shifting in zeros, and store the results in vect_t. * Args : [a0, ..., a7] int16_t * Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s] int16_t */ static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi16(a, s); } /* * Shift packed 16-bit integers in a right by s while shifting in zeros, and store the results in vect_t. 
* Args : [a0, ..., a7] int16_t * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int16_t */ static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi16(a, s); } /* * Shift packed 16-bit integers in a right by s while shifting in sign bits, and store the results in vect_t. * Args : [a0, ..., a7] int16_t * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int16_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srai_epi16(a, s); } /* * Shuffle 16-bit integers in a using the control in imm8, and store the results in dst. * Args : [a0, ..., a7] int16_t * Return : [a[s[0..3]], ..., a[s[28..31]] int16_t */ template static INLINE CONST vect_t shuffle(const vect_t a) { //#pragma warning "The simd shuffle function is emulated, it may impact the performances."; Converter conv; conv.v = a; return set (conv.t[( s & 0x0000000F)], conv.t[( s & 0x000000F0)], conv.t[((s>> 8) & 0x0000000F)], conv.t[((s>> 8) & 0x000000F0)], conv.t[((s>>16) & 0x0000000F)], conv.t[((s>>16) & 0x000000F0)], conv.t[((s>>24) & 0x0000000F)], conv.t[((s>>24) & 0x000000F0)]); } /* * Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst. * Args : [a0, ..., a7] int16_t [b0, ..., b7] int16_t * Return : [a0, b0, ..., a3, b3] int16_t */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_epi16(a, b); } /* * Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst. * Args : [a0, ..., a7] int16_t [b0, ..., b7] int16_t * Return : [a4, b4, ..., a7, b7] int16_t */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_epi16(a, b); } /* * Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst. 
* Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [s[0]?b0:a0, ..., s[7]?b7:a7] int16_t
     *          (with _mm_blend_epi16 a set control bit selects the lane of b)
     */
    template <uint8_t s> static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
        return _mm_blend_epi16(a, b, s);
    }

    /*
     * Add packed 16-bits integer in a and b, and store the results in vect_t.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [a0+b0, ..., a7+b7] int16_t
     */
    static INLINE CONST vect_t add(const vect_t a, const vect_t b) {
        return _mm_add_epi16(a, b);
    }

    /*
     * In-place add: a <- a+b. Returns the updated value of a.
     */
    static INLINE vect_t addin(vect_t &a, const vect_t b) {
        return a = add(a, b);
    }

    /*
     * Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in vect_t.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [a0-b0, ..., a7-b7] int16_t
     */
    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) {
        return _mm_sub_epi16(a, b);
    }

    /*
     * In-place subtract: a <- a-b. Returns the updated value of a.
     */
    static INLINE vect_t subin(vect_t &a, const vect_t b) {
        return a = sub(a, b);
    }

    /*
     * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16
     * bits of the intermediate integers in vect_t.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [a0*b0 smod 2^16, ..., a7*b7 smod 2^16] int16_t
     *          where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2
     */
    static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) {
        return _mm_mullo_epi16(a, b);
    }

    /*
     * Alias of mullo.
     */
    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) {
        return mullo(a, b);
    }

    /*
     * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16
     * bits of the intermediate integers in vect_t.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [Floor(a0*b0/2^16), ..., Floor(a7*b7/2^16)] int16_t
     */
    static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) {
        return _mm_mulhi_epi16(a, b);
    }

    /*
     * Multiply the low 8-bit integers from each packed 16-bit element in a and b, and store the signed 16-bit results in vect_t.
* Args : [a0, ..., a7] int16_t [b0, ..., b7] int16_t * Return : [(a0 smod 2^8)*(b0 smod 2^8), ..., (a7 smod 2^8)*(b7 smod 2^8)] int16_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { //#pragma warning "The simd mulx function is emulated, it may impact the performances." vect_t a1, b1, mask1, mask2; mask1 = set1(0x00FF); mask2 = set1(0x0080); a1 = add(a,mask2); a1 = vand(a1,mask1); a1 = sub(a1,mask2); b1 = add(b,mask2); b1 = vand(b1,mask1); b1 = sub(b1,mask2); return mul(a1,b1); } /* * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, * keep the low 16 bits of the intermediate and add the low 16-bits of c. * Args : [a0, ..., a7] int16_t [b0, ..., b7] int16_t [c0, ..., c7] int16_t * Return : [(a0*b0+c0) smod 2^16, ..., (a7*b7+c7) smod 2^16] int16_t */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); } static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply the low 8-bit integers from each packed 16-bit element in a and b, * keep the signed 16-bit results and add the low 16-bits of c. * Args : [a0, ..., a7] int16_t [b0, ..., b7] int16_t [c0, ..., c7] int16_t * Return : [((a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ..., * ((a7 smod 2^8)*(b7 smod 2^8)+c7) smod 2^16] int16_t */ static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } /* * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, * and substract the low 16 bits of the intermediate from elements of c. 
* Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     *          [c0, ..., c7] int16_t
     * Return : [(-a0*b0+c0) smod 2^16, ..., (-a7*b7+c7) smod 2^16] int16_t
     */
    static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
        const vect_t product = mul(a, b);
        return sub(c, product);
    }

    /*
     * In-place variant of fnmadd: c receives c-a*b and the new value of c is returned.
     */
    static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) {
        c = fnmadd(c, a, b);
        return c;
    }

    /*
     * Same as fnmadd, except that only the low 8 bits of each element of a and b take part in the product
     * (see mulx).
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     *          [c0, ..., c7] int16_t
     * Return : [(-(a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ...,
     *           (-(a7 smod 2^8)*(b7 smod 2^8)+c7) smod 2^16] int16_t
     */
    static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) {
        const vect_t product = mulx(a, b);
        return sub(c, product);
    }

    /*
     * In-place variant of fnmaddx: c receives the result and the new value of c is returned.
     */
    static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) {
        c = fnmaddx(c, a, b);
        return c;
    }

    /*
     * Lane-wise multiply-subtract on 16-bit integers: c_i is subtracted from the low 16 bits of each
     * product a_i*b_i.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     *          [c0, ..., c7] int16_t
     * Return : [(a0*b0-c0) smod 2^16, ..., (a7*b7-c7) smod 2^16] int16_t
     */
    static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
        const vect_t product = mul(a, b);
        return sub(product, c);
    }

    /*
     * In-place variant of fmsub: c receives a*b-c and the new value of c is returned.
     */
    static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) {
        c = fmsub(c, a, b);
        return c;
    }

    /*
     * Multiply the low 8-bit integers from each packed 16-bit element in a and b,
     * keep the signed 16-bit results and substract elements of c from them.
* Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     *          [c0, ..., c7] int16_t
     * Return : [((a0 smod 2^8)*(b0 smod 2^8)-c0) smod 2^16, ...,
     *           ((a7 smod 2^8)*(b7 smod 2^8)-c7) smod 2^16] int16_t
     */
    static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) {
        return sub(mulx(a, b), c);
    }

    /*
     * In-place fmsubx: c receives the result. Returns the updated value of c.
     */
    static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) {
        return c = fmsubx(c, a, b);
    }

    /*
     * Compare packed 16-bits in a and b for equality, and store the results in vect_t.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [(a0==b0) ? 0xFFFF : 0, ..., (a7==b7) ? 0xFFFF : 0] int16_t
     */
    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) {
        return _mm_cmpeq_epi16(a, b);
    }

    /*
     * Compare packed 16-bits in a and b for greater-than, and store the results in vect_t.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [(a0>b0) ? 0xFFFF : 0, ..., (a7>b7) ? 0xFFFF : 0] int16_t
     */
    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) {
        return _mm_cmpgt_epi16(a, b);
    }

    /*
     * Compare packed 16-bits in a and b for lesser-than, and store the results in vect_t.
     * NOTE(review): this definition was rebuilt -- the text between a '<' and the next '>' had been lost, taking
     * the function with it, while lesser_eq below still calls it. Confirm against the pristine header.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [(a0<b0) ? 0xFFFF : 0, ..., (a7<b7) ? 0xFFFF : 0] int16_t
     */
    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) {
        return _mm_cmplt_epi16(a, b);
    }

    /*
     * Compare packed 16-bits in a and b for greater or equal than, and store the results in vect_t.
     * Built from greater OR eq.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [(a0>=b0) ? 0xFFFF : 0, ..., (a7>=b7) ? 0xFFFF : 0] int16_t
     */
    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) {
        return vor(greater(a, b), eq(a, b));
    }

    /*
     * Compare packed 16-bits in a and b for lesser or equal than, and store the results in vect_t.
     * Built from lesser OR eq.
     * Args   : [a0, ..., a7] int16_t
     *          [b0, ..., b7] int16_t
     * Return : [(a0<=b0) ? 0xFFFF : 0, ..., (a7<=b7) ? 0xFFFF : 0] int16_t
     */
    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) {
        return vor(lesser(a, b), eq(a, b));
    }

    /*
     * Horizontally add 16-bits elements of a.
* Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter conv; conv.v = a; return scalar_t(conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3] + conv.t[4] + conv.t[5] + conv.t[6] + conv.t[7]); } static INLINE CONST vect_t round(const vect_t a) { return a; } static INLINE CONST vect_t signbits(const vect_t x) { vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1)); return signBits; } static INLINE vect_t mod(vect_t &C, const vect_t &P, const __m64 &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { #ifdef __INTEL_COMPILER C = _mm_rem_epi16(C, P); #else FFLASFFPACK_abort("not implemented"); #endif NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } }; /* * Simd128 specialized for uint16_t */ template <> struct Simd128_impl : public Simd128_impl { /* * define the scalar type corresponding to the specialization */ using scalar_t = uint16_t; /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 16-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x,x,x,x,x] uint16_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi16(x); } /* * Broadcast 16-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x0, ..., x7] uint16_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) { return _mm_set_epi16(x7, x6, x5, x4, x3, x2, x1, x0); } /* * Gather 16-bit unsigned integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t. 
* Return [p[idx[0]],..., p[idx[7]]] uint16_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]); } /* * Load 128-bits of unsigned integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[idx[0]],..., p[idx[7]]] uint16_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_si128(reinterpret_cast(p)); } /* * Load 128-bits of unsigned integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[idx[0]],..., p[idx[7]]] uint16_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_si128(reinterpret_cast(p)); } /* * Store 128-bits of unsigned integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm_store_si128(reinterpret_cast(p), v); } /* * Store 128-bits of unsigned integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm_storeu_si128(reinterpret_cast(p), v); } /* * Store 128-bits of unsigned integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm_stream_si128(reinterpret_cast(p), v); } /* * Shift packed 16-bit unsigned integers in a right by s while shifting in sign bits, and store the results in vect_t. 
* Args : [a0, ..., a7] uint16_t * Return : [Floor(a0/2^s), ..., Floor(a7/2^s)] int16_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srli_epi16(a, s); } static INLINE CONST vect_t greater(vect_t a, vect_t b) { vect_t x; x = set1((static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(a,x); b = sub(b,x); return _mm_cmpgt_epi16(a, b); } static INLINE CONST vect_t lesser(vect_t a, vect_t b) { vect_t x; x = set1((static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(a,x); b = sub(b,x); return _mm_cmplt_epi16(a, b); } static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); } static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, * and store the high 16 bits of the intermediate integers in vect_t. * Args : [a0, ..., a7] uint16_t * [b0, ..., b7] uint16_t * Return : [Floor(a0*b0/2^16), ..., Floor(a7*b7/2^16)] uint16_t */ static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm_mulhi_epu16(a, b); } /* * Multiply the low unsigned 8-bit integers from each packed 16-bit element in a and b, * and store the signed 16-bit results in vect_t. * Args : [a0, ..., a7] uint16_t * [b0, ..., b7] uint16_t * Return : [(a0 mod 2^8)*(b0 mod 2^8), ..., (a7 mod 2^8)*(b7 mod 2^8)] uint16_t */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { //#pragma warning "The simd mulx function is emulated, it may impact the performances." 
vect_t a1, b1, mask1; mask1 = set1(0x00FF); a1 = vand(a,mask1); b1 = vand(b,mask1); return mul(a1,b1); } static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Horizontally add 16-bits elements of a. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter conv; conv.v = a; return scalar_t(conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3] + conv.t[4] + conv.t[5] + conv.t[6] + conv.t[7]); } }; //Simd128_impl #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_int16_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd128_int32.inl000066400000000000000000000555541274716147400245320ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * Romain Lebreton * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_int32_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd128_int32_INL #ifndef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #error "You need SSE instructions to perform 128 bits operations on int32" #endif #include "fflas-ffpack/fflas/fflas_simd/simd128_int64.inl" /* * Simd128 specialized for int32_t */ template <> struct Simd128_impl : public Simd128i_base { /* * alias to 128 bit simd register */ using vect_t = __m128i; /* * define the scalar type corresponding to the specialization */ using scalar_t = int32_t; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 4; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 16; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastd. * Return [x,x,x,x] int32_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi32(x); } /* * Set packed 32-bit integers in dst with the supplied values. 
* Return [x0,x1,x2,x3] int32_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) { return _mm_set_epi32(x3, x2, x1, x0); } /* * Gather 32-bit integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] int32_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]); } /* * Load 128-bits of integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3]] int32_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_si128(reinterpret_cast(p)); } /* * Load 128-bits of integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_si128(reinterpret_cast(p)); } /* * Store 128-bits of integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm_store_si128(reinterpret_cast(p), v); } /* * Store 128-bits of integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm_storeu_si128(reinterpret_cast(p), v); } /* * Store 128-bits of integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm_stream_si128(reinterpret_cast(p), v); } /* * Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t. 
* Args : [a0, a1, a2, a3] int32_t * Return : [a0 << s, a1 << s, a2 << s, a3 << s] int32_t */ static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi32(a, s); } /* * Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t. * Args : [a0, a1, a2, a3] int32_t * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int32_t */ static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi32(a, s); } /* * Shift packed 32-bit integers in a right by s while shifting in sign bits, and store the results in vect_t. * Args : [a0, a1, a2, a3] int32_t * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int32_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srai_epi32(a, s); } /* * Shuffle 32-bit integers in a using the control in imm8, and store the results in dst. * Args : [a0, a1, a2, a3] int32_t * Return : [a[s[0..1]], ..., a[s[6..7]] int32_t */ template static INLINE CONST vect_t shuffle(const vect_t a) { return _mm_shuffle_epi32(a, s); } /* * Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [a0, b0, a1, b1] int32_t */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_epi32(a, b); } /* * Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [a2, b2, a3, b3] int32_t */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_epi32(a, b); } /* * Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst. 
* Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [s[0]?a0:b0, , s[3]?a3:b3] int32_t */ template static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { // _mm_blend_epi16 is faster than _mm_blend_epi32 and require SSE4.1 instead of AVX2 // We have to transform s = [d3 d2 d1 d0]_base2 to s1 = [d3 d3 d2 d2 d1 d1 d0 d0]_base2 constexpr uint8_t s1 = (s & 0x1) * 3 + (((s & 0x2) << 1)*3) + (((s & 0x4) << 2)*3) + (((s & 0x8) << 3)*3); return _mm_blend_epi16(a, b, s1); } /* * Add packed 32-bits integer in a and b, and store the results in vect_t. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [a0+b0, a1+b1, a2+b2, a3+b3] int32_t */ static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi32(a, b); } static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); } /* * Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in vect_t. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [a0-b0, a1-b1, a2-b2, a3-b3] int32_t */ static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi32(a, b); } static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); } /* * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [a0*b0 smod 2^32, ..., a3*b3 smod 2^32] int32_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm_mullo_epi32(a, b); } static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); } /* * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the high 32 bits of the intermediate integers in vect_t. 
* Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [Floor(a0*b0/2^32), ..., Floor(a3*b3/2^32)] int32_t */ static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { // _mm_mulhi_epi32 emul //#pragma warning "The simd mulhi function is emulated, it may impact the performances." #if 0 vect_t a1, a2, b1, b2; a1 = set(_mm_extract_epi32(a, 0), 0, _mm_extract_epi32(a, 2), 0); a2 = set(_mm_extract_epi32(a, 1), 0, _mm_extract_epi32(a, 3), 0); b1 = set(_mm_extract_epi32(b, 0), 0, _mm_extract_epi32(b, 2), 0); b2 = set(_mm_extract_epi32(b, 1), 0, _mm_extract_epi32(b, 3), 0); a1 = _mm_mul_epi32(a1, b1); a2 = _mm_mul_epi32(a2, b2); return set(_mm_extract_epi32(a1, 1), _mm_extract_epi32(a2, 1), _mm_extract_epi32(a1, 3), _mm_extract_epi32(a2, 3)); #else typedef Simd128_impl Simd128_64; vect_t C,A1,B1; C = Simd128_64::mulx(a,b); A1 = Simd128_64::srl(a,32); B1 = Simd128_64::srl(b,32); A1 = Simd128_64::mulx(A1,B1); C = Simd128_64::srl(C,32); A1 = Simd128_64::srl(A1,32); A1 = Simd128_64::sll(A1,32); return Simd128_64::vor(C,A1); #endif } /* * Multiply the low 16-bit integers from each packed 32-bit element in a and b, and store the signed 32-bit results in vect_t. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t * Return : [(a0 smod 2^16)*(b0 smod 2^16), (a1 smod 2^16)*(b1 smod 2^16), * (a2 smod 2^16)*(b2 smod 2^16), (a3 smod 2^16)*(b3 smod 2^16)] int32_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { //#pragma warning "The simd mulx function is emulated, it may impact the performances." 
vect_t a1, b1, mask1, mask2; mask1 = set1(0x0000FFFF); mask2 = set1(0x00008000); a1 = add(a,mask2); a1 = vand(a1,mask1); a1 = sub(a1,mask2); b1 = add(b,mask2); b1 = vand(b1,mask1); b1 = sub(b1,mask2); return mul(a1,b1); } /* * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, * keep the low 32 bits of the intermediate and add the low 32-bits of c. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t [c0, c1, c2, c3] int32_t * Return : [(a0*b0+c0) smod 2^32, ..., (a3*b3+c3) smod 2^32] int32_t */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); } static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply the low 16-bit integers from each packed 32-bit element in a and b, * keep the signed 32-bit results and add the low 32-bits of c. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t [c0, c1, c2, c3] int32_t * Return : [((a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ..., * ((a3 smod 2^16)*(b3 smod 2^16)+c3) smod 2^32] int32_t */ static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } /* * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, * and substract the low 32 bits of the intermediate from elements of c. 
* Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t [c0, c1, c2, c3] int32_t * Return : [(-a0*b0+c0) smod 2^32, ..., (-a3*b3+c3) smod 2^32] int32_t */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); } static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply the low 16-bit integers from each packed 32-bit element in a and b, * keep the signed 32-bit results and add the low 32-bits of c and substract them from elements of c. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t [c0, c1, c2, c3] int32_t * Return : [(-(a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ..., * (-(a3 smod 2^16)*(b3 smod 2^16)+c3) smod 2^32] int32_t */ static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } /* * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, * and substract elements of c to the low 32-bits of the intermediate. * Args : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t [c0, c1, c2, c3] int32_t * Return : [(a0*b0-c0) smod 2^32, ..., (a3*b3-c3) smod 2^32] int32_t */ static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); } static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } /* * Multiply the low 16-bit integers from each packed 32-bit element in a and b, * keep the signed 32-bit results and substract elements of c from them. 
* Args   : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t [c0, c1, c2, c3] int32_t
	 * Return : [((a0 smod 2^16)*(b0 smod 2^16)-c0) smod 2^32, ...,
	 *		   ((a3 smod 2^16)*(b3 smod 2^16)-c3) smod 2^32] int32_t
	 */
	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); }

	// In-place variant of fmsubx: c is overwritten with the result, which is also returned.
	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); }

	/*
	 * Compare packed 32-bits in a and b for equality, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t
	 * Return : [(a0==b0) ? 0xFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFF : 0,
	 *		   (a2==b2) ? 0xFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi32(a, b); }

	/*
	 * Compare packed 32-bits in a and b for greater-than, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t
	 * Return : [(a0 > b0) ? 0xFFFFFFFF : 0, (a1 > b1) ? 0xFFFFFFFF : 0,
	 *		   (a2 > b2) ? 0xFFFFFFFF : 0, (a3 > b3) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm_cmpgt_epi32(a, b); }

	/*
	 * Compare packed 32-bits in a and b for lesser-than, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t
	 * Return : [(a0 < b0) ? 0xFFFFFFFF : 0, (a1 < b1) ? 0xFFFFFFFF : 0,
	 *		   (a2 < b2) ? 0xFFFFFFFF : 0, (a3 < b3) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm_cmplt_epi32(a, b); }

	/*
	 * Compare packed 32-bits in a and b for greater or equal than, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t
	 * Return : [(a0 >= b0) ? 0xFFFFFFFF : 0, (a1 >= b1) ? 0xFFFFFFFF : 0,
	 *		   (a2 >= b2) ? 0xFFFFFFFF : 0, (a3 >= b3) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }

	/*
	 * Compare packed 32-bits in a and b for lesser or equal than, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3] int32_t [b0, b1, b2, b3] int32_t
	 * Return : [(a0<=b0) ? 0xFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFF : 0, (a2<=b2) ? 0xFFFFFFFF : 0, (a3<=b3) ?
0xFFFFFFFF : 0] int32_t */ static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Horizontally add 32-bits elements of a. * Args : [a0, a1, a2, a3] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter conv; conv.v = a; return scalar_t(conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3]); } static INLINE CONST vect_t round(const vect_t a) { return a; } static INLINE CONST vect_t signbits(const vect_t x) { vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1)); return signBits; } static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { #ifdef __INTEL_COMPILER C = _mm_rem_epi32(C, P); #else FFLASFFPACK_abort("pas implementé"); // C = fnmadd(C,_mm_castps_si128(_mm_floor_ps(_mm_mul_ps(INVP,_mm_castsi128_ps(C)))),P); #endif NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } }; /* * Simd128 specialized for uint32_t */ template <> struct Simd128_impl : public Simd128_impl { /* * define the scalar type corresponding to the specialization */ using scalar_t = uint32_t; /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 32-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x] uint32_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi32(x); } /* * Set packed 32-bit unsigned integers in dst with the supplied values. * Return [x0,x1,x2,x3] uint32_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) { return _mm_set_epi32(x3, x2, x1, x0); } /* * Gather 32-bit unsigned integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t. 
* Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] uint32_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]); } /* * Load 128-bits of unsigned integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3]] uint32_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_si128(reinterpret_cast(p)); } /* * Load 128-bits of unsigned integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] uint32_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_si128(reinterpret_cast(p)); } /* * Store 128-bits of unsigned integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm_store_si128(reinterpret_cast(p), v); } /* * Store 128-bits of unsigned integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm_storeu_si128(reinterpret_cast(p), v); } /* * Store 128-bits of unsigned integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm_stream_si128(reinterpret_cast(p), v); } /* * Shift packed 32-bit unsigned integers in a right by s while shifting in sign bits, and store the results in vect_t. 
* Args : [a0, ..., a3] int32_t * Return : [Floor(a0/2^s), ..., Floor(a3/2^s)] int32_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srli_epi32(a, s); } static INLINE CONST vect_t greater(vect_t a, vect_t b) { vect_t x; x = set1((static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(a,x); b = sub(b,x); return _mm_cmpgt_epi32(a, b); } static INLINE CONST vect_t lesser(vect_t a, vect_t b) { vect_t x; x = set1((static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(a,x); b = sub(b,x); return _mm_cmplt_epi32(a, b); } static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); } static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Multiply the packed unsigned 32-bit integers in a and b, producing intermediate 64-bit integers, * and store the high 32 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3] uint32_t * [b0, b1, b2, b3] uint32_t * Return : [Floor(a0*b0/2^32), ..., Floor(a3*b3/2^32)] uint32_t */ static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { // _mm_mulhi_epi32 emul //#pragma warning "The simd mulhi function is emulated, it may impact the performances." typedef Simd128_impl Simd128_64; vect_t C,A1,B1; C = Simd128_64::mulx(a,b); A1 = Simd128_64::srl(a,32); B1 = Simd128_64::srl(b,32); A1 = Simd128_64::mulx(A1,B1); C = Simd128_64::srl(C,32); A1 = Simd128_64::srl(A1,32); A1 = Simd128_64::sll(A1,32); return Simd128_64::vor(C,A1); } /* * Multiply the low unsigned 16-bit integers from each packed 32-bit element in a and b, * and store the signed 32-bit results in vect_t. 
* Args : [a0, a1, a2, a3] uint32_t * [b0, b1, b2, b3] uint32_t * Return : [(a0 mod 2^16)*(b0 mod 2^16), (a1 mod 2^16)*(b1 mod 2^16), * (a2 mod 2^16)*(b2 mod 2^16), (a3 mod 2^16)*(b3 mod 2^16)] uint32_t */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { //#pragma warning "The simd mulx function is emulated, it may impact the performances." vect_t a1, b1, mask1; mask1 = set1(0x0000FFFF); a1 = vand(a,mask1); b1 = vand(b,mask1); return mul(a1,b1); } static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Horizontally add 32-bits elements of a. * Args : [a0, a1, a2, a3] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter conv; conv.v = a; return conv.t[0] + conv.t[1] + conv.t[2] + conv.t[3]; } }; //Simd128_impl #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_int32_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd128_int64.inl000066400000000000000000000614501274716147400245270ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * Romain Lebreton * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd128_int64_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd128_int64_INL #ifndef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #error "You need SSE instructions to perform 128 bits operations on int64" #endif /* * Simd128 specialized for int64_t */ template <> struct Simd128_impl : public Simd128i_base { /* * alias to 128 bit simd register */ using vect_t = __m128i; /* * define the scalar type corresponding to the specialization */ using scalar_t = int64_t; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 2; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 16; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 64-bit integer a to all elements of dst. 
This intrinsic may generate the vpbroadcastw. * Return [x,x] int64_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi64x(x); } /* * Set packed 64-bit integers in dst with the supplied values. * Return [x0,x1] int64_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1) { return _mm_set_epi64x(x1, x0); } /* * Gather 64-bit integer elements with indexes idx[0], idx[1] from the address p in vect_t. * Return [p[idx[0]], p[idx[1]]] int64_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]]); } /* * Load 128-bits of integer data from memory into dst. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1]] int64_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_si128(reinterpret_cast(p)); } /* * Load 128-bits of integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1]] int64_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_si128(reinterpret_cast(p)); } /* * Store 128-bits of integer data from a into memory. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm_store_si128(reinterpret_cast(p), v); } /* * Store 128-bits of integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm_storeu_si128(reinterpret_cast(p), v); } /* * Store 128-bits of integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
*/ static INLINE void stream(scalar_t *p, const vect_t v) { _mm_stream_si128(reinterpret_cast(p), v); } /* * Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t. * Args : [a0, a1] int64_t * Return : [a0 << s, a1 << s] int64_t */ static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm_slli_epi64(a, s); } /* * Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t. * Args : [a0, a1] int64_t * Return : [a0 >> s, a1 >> s] int64_t */ static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm_srli_epi64(a, s); } /* * Shift packed 64-bit integers in a right by s while shifting in sign bits, and store the results in vect_t. * Args : [a0, a1] int64_t * Return : [a0 >> s, a1 >> s] int64_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { #ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS return _mm_srai_epi64(a, s); #else const int b = 63 - s; vect_t m = sll(set1(1), b); vect_t x = srl(a, s); vect_t result = sub(vxor(x, m), m); // result = x^m - m return result; #endif // __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS } /* * Shuffle 64-bit integers in a using the control in imm8, and store the results in dst. * Args : [a0, a1] int64_t * Return : [a[s[0]], a[s[1]]] int64_t */ template static INLINE CONST vect_t shuffle(const vect_t a) { // Transform s = [d1 d0]_base2 to s1 = [2*d1+1 2*d1 2*d0+1 2*d0]_base4 constexpr uint8_t s1 = ((s & 1)?(3*4+2):(1*4+0))+16*((s & 2)?(3*4+2):(1*4+0)); return _mm_shuffle_epi32(a, s1); } /* * Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [a0, b0] int64_t */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { return _mm_unpacklo_epi64(a, b); } /* * Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst. 
* Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [a1, b1] int64_t */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { return _mm_unpackhi_epi64(a, b); } /* * Blend packed 64-bit integers from a and b using control mask imm8, and store the results in dst. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [s[0]?a0:b0, s[1]?a1:b1] int64_t */ template static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { // _mm_blend_epi16 is faster than _mm_blend_epi32 and require SSE4.1 instead of AVX2 // We have to transform s = [d1 d0]_base2 to s1 = [d1 d1 d1 d1 d0 d0 d0 d0]_base2 constexpr uint8_t s1 = (s & 0x1) * 15 + ((s & 0x2) << 3) * 15; return _mm_blend_epi16(a, b, s1); } /* * Add packed 64-bits integer in a and b, and store the results in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [a0+b0, a1+b1] int64_t */ static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm_add_epi64(a, b); } static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); } /* * Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [a0-b0, a1-b1] int64_t */ static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm_sub_epi64(a, b); } static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); } /* * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in vect_t. 
* Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [a0*b0 smod 2^64, a1*b1 smod 2^64] int64_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mullo(const vect_t x0, const vect_t x1) { #ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS _mm_mullo_epi64(x0, x1); #else // _mm_mullo_epi64 emul //#pragma warning "The simd mullo function is emulate, it may impact the performances." Converter c0, c1; c0.v = x0; c1.v = x1; return set((scalar_t)(c0.t[0] * c1.t[0]), (scalar_t)(c0.t[1] * c1.t[1])); #endif // __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS } static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); } /* * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the high 64 bits of the intermediate integers in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [Floor(a0*b0/2^64), Floor(a1*b1/2^64)] int64_t */ #ifdef __FFLASFFPACK_HAVE_INT128 static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { //#pragma warning "The simd mulhi function is emulated, it may impact the performances." Converter c0, c1; c0.v = a; c1.v = b; return set((scalar_t)((int128_t(c0.t[0]) * c1.t[0]) >> 64), (scalar_t)((int128_t(c0.t[1]) * c1.t[1]) >> 64)); } #endif /* * Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [(a0 smod 2^32)*(b0 smod 2^32), (a1 smod 2^32)*(b1 smod 2^32)] int64_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm_mul_epi32(a, b); } /* * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, * keep the low 64 bits of the intermediate and add the low 64-bits of c. 
* Args : [a0, a1] int64_t [b0, b1] int64_t [c0, c1] int64_t * Return : [(a0*b0+c0) smod 2^64, (a1*b1+c1) smod 2^64] int64_t */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); } static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply the low 32-bit integers from each packed 64-bit element in a and b, * keep the signed 64-bit results and add the low 64-bits of c. * Args : [a0, a1] int64_t [b0, b1] int64_t [c0, c1] int64_t * Return : [((a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64, * ((a1 smod 2^32)*(b1 smod 2^32)+c1) smod 2^64] int64_t */ static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } /* * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, * and substract the low 64 bits of the intermediate from elements of c. * Args : [a0, a1] int64_t [b0, b1] int64_t [c0, c1] int64_t * Return : [(-a0*b0+c0) smod 2^64, (-a1*b1+c1) smod 2^64] int64_t */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); } static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply the low 32-bit integers from each packed 64-bit element in a and b, * keep the signed 64-bit results and substract them from elements of c. 
* Args : [a0, a1] int64_t [b0, b1] int64_t [c0, c1] int64_t * Return : [(-(a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64, * (-(a1 smod 2^32)*(b1 smod 2^32)+c1) smod 2^64] int64_t */ static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } /* * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, * and substract elements of c to the low 64-bits of the intermediate. * Args : [a0, a1] int64_t [b0, b1] int64_t [c0, c1] int64_t * Return : [(a0*b0-c0) smod 2^64, (a1*b1-c1) smod 2^64] int64_t */ static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); } static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } /* * Multiply the low 32-bit integers from each packed 64-bit element in a and b, * keep the signed 64-bit results and substract elements of c from them. * Args : [a0, a1] int64_t [b0, b1] int64_t [c0, c1] int64_t * Return : [(-(a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64, * (-(a1 smod 2^32)*(b1 smod 2^32)+c1) smod 2^64] int64_t */ static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Compare packed 64-bits in a and b for equality, and store the results in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm_cmpeq_epi64(a, b); } /* * Compare packed 64-bits in a and b for greater-than, and store the results in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>b1) ? 
0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { #ifdef __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS return _mm_cmpgt_epi64(a, b); #else //#warning "The simd greater function is emulate, it may impact the performances." Converter ca, cb; ca.v = a; cb.v = b; return set((ca.t[0] > cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] > cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0); #endif // __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS } /* * Compare packed 64-bits in a and b for lesser-than, and store the results in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [(a0=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); } /* * Compare packed 64-bits in a and b for lesser or equal than, and store the results in vect_t. * Args : [a0, a1] int64_t [b0, b1] int64_t * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Horizontally add 64-bits elements of a. 
* Args : [a0, a1] int64_t * Return : a0+a1 int64_t */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter conv; conv.v = a; return scalar_t(conv.t[0] + conv.t[1]); } static INLINE CONST vect_t round(const vect_t a) { return a; } static INLINE CONST vect_t signbits(const vect_t x) { vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1)); return signBits; } // mask the high 32 bits of a 64 bits, that is 00000000FFFFFFFF static INLINE CONST vect_t mask_high() { return srl(_mm_set1_epi8(-1), 32); } static INLINE CONST vect_t mulhi_fast(vect_t x, vect_t y); template static INLINE vect_t mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T); }; // Simd128_impl /* * Simd128 specialized for uint64_t */ template <> struct Simd128_impl : public Simd128_impl { /* * define the scalar type corresponding to the specialization */ using scalar_t = uint64_t; /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 64-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x] uint64_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm_set1_epi64x(x); } /* * Set packed 64-bit integers in dst with the supplied values. * Return [x0,x1] uint64_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1) { return _mm_set_epi64x(x1, x0); } /* * Gather 64-bit unsigned integer elements with indexes idx[0], ..., idx[1] from the address p in vect_t. * Return [p[idx[0]], p[idx[1]]] uint64_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]]); } /* * Load 128-bits of unsigned integer data from memory into dst. * p must be aligned on a 16-byte boundary or a general-protection exception will be generated. 
* Return [p[0],p[1]] uint64_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm_load_si128(reinterpret_cast(p)); } /* * Load 128-bits of unsigned integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1]] uint64_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm_loadu_si128(reinterpret_cast(p)); } /* * Store 128-bits of unsigned integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm_store_si128(reinterpret_cast(p), v); } /* * Store 128-bits of unsigned integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm_storeu_si128(reinterpret_cast(p), v); } /* * Store 128-bits of unsigned integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 16-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm_stream_si128(reinterpret_cast(p), v); } /* * Shift packed 64-bit unsigned integers in a right by s while shifting in sign bits, and store the results in vect_t. * Args : [a0, a1] uint64_t * Return : [Floor(a0/2^s), Floor(a1/2^s)] uint64_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm_srli_epi64(a, s); } static INLINE CONST vect_t greater(vect_t a, vect_t b) { #ifdef __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS vect_t x; x = set1(-(static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(x, a); b = sub(x, b); return _mm_cmpgt_epi64(b, a); #else //#pragma warning "The simd greater function is emulated, it may impact the performances." Converter ca, cb; ca.v = a; cb.v = b; return set((ca.t[0] > cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] > cb.t[1]) ? 
0xFFFFFFFFFFFFFFFF : 0); #endif // __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS } static INLINE CONST vect_t lesser(vect_t a, vect_t b) { #ifdef __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS vect_t x; x = set1(-(static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(x, a); b = sub(x, b); return _mm_cmpgt_epi64(a, b); #else //#pragma warning "The simd greater function is emulated, it may impact the performances." Converter ca, cb; ca.v = a; cb.v = b; return set((ca.t[0] < cb.t[0]) ? 0xFFFFFFFFFFFFFFFF : 0, (ca.t[1] < cb.t[1]) ? 0xFFFFFFFFFFFFFFFF : 0); #endif // __FFLASFFPACK_HAVE_SSE4_2_INSTRUCTIONS } static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); } static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Multiply the packed 64-bit unsigned integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in vect_t. * Args : [a0, a1] uint64_t [b0, b1] uint64_t * Return : [a0*b0 mod 2^64, a1*b1 mod 2^64] uint64_t */ static INLINE CONST vect_t mullo(const vect_t x0, const vect_t x1) { // _mm_mullo_epi32 emul //#pragma warning "The simd mullo function is emulated, it may impact the performances." Converter c0, c1; c0.v = x0; c1.v = x1; return set((scalar_t)(c0.t[0] * c1.t[0]), (scalar_t)(c0.t[1] * c1.t[1])); } /* * Multiply the packed unsigned 64-bit integers in a and b, producing intermediate 128-bit integers, * and store the high 64 bits of the intermediate integers in vect_t. * Args : [a0, a1] uint64_t [b0, b1] uint64_t * Return : [Floor(a0*b0/2^16), Floor(a1*b1/2^16)] uint64_t */ #ifdef __FFLASFFPACK_HAVE_INT128 static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { //#pragma warning "The simd mulhi function is emulate, it may impact the performances." 
Converter c0, c1; c0.v = a; c1.v = b; return set((scalar_t)((uint128_t(c0.t[0]) * c1.t[0]) >> 64), (scalar_t)((uint128_t(c0.t[1]) * c1.t[1]) >> 64)); } #endif /* * Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in vect_t. * Args : [a0, a1] uint64_t [b0, b1] uint64_t * Return : [(a0 mod 2^32)*(b0 mod 2^32), (a1 mod 2^32)*(b1 mod 2^32)] uint64_t */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm_mul_epu32(a, b); } static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE CONST vect_t fmsubxin(vect_t c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Horizontally add 64-bits elements of a. * Args : [a0, a1, a2, a3] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter c; c.v = a; return c.t[0] + c.t[1]; } }; //Simd128_impl #define vect_t Simd128_impl::vect_t // warning : may be off by 1 multiple, but we save a mul... 
INLINE CONST vect_t Simd128_impl::mulhi_fast(vect_t x, vect_t y) { // unsigned mulhi starts: // x1 = xy_high = mulhiu_fast(x,y) const vect_t mask = mask_high(); vect_t x0 = vand(x, mask), x1 = srl(x, 32); vect_t y0 = vand(y, mask), y1 = srl(y, 32); x0 = Simd128_impl::mulx(x0, y1); // x0y1 y0 = Simd128_impl::mulx(x1, y0); // x1y0 y1 = Simd128_impl::mulx(x1, y1); // x1y1 x1 = vand(y0, mask); y0 = srl(y0, 32); // x1y0_lo = x1 // y1yo_hi = y0 x1 = srl(add(x1, x0), 32); y0 = add(y1, y0); x1 = add(x1, y0); // unsigned mulhi ends // fixing signs x0 = vand(signbits(x), y); x1 = sub(x1, x0); x0 = vand(signbits(y), x); x1 = sub(x1, x0); // end fixing return x1; } // warning : may be off by 1 multiple, but we save a mul... template INLINE CONST vect_t Simd128_impl::mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { #ifdef __INTEL_COMPILER // Works fine with ICC 15.0.1 - A.B. // #warning "not tested" C = _mm_rem_epi64(C, P); #else if (poweroftwo) { Q = srl(C, 63); vect_t un = set1(1); T = sub(sll(un, shifter), un); Q = add(C, vand(Q, T)); Q = sll(srl(Q, shifter), shifter); C = sub(C, Q); Q = vand(greater(zero(), Q), P); C = add(C, Q); } else { Q = mulhi_fast(C, magic); if (overflow) { Q = add(Q, C); } Q = sra(Q, shifter); vect_t q1 = Simd128_impl::mulx(Q, P); vect_t q2 = sll(Simd128_impl::mulx(srl(Q, 32), P), 32); C = sub(C, add(q1, q2)); T = greater_eq(C, P); C = sub(C, vand(T, P)); } #endif NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } #undef vect_t #endif // __FFLASFFPACK_fflas_ffpack_utils_simd128_int64_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd256.inl000066400000000000000000000140411274716147400234770ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * 
Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd256_INL struct Simd256fp_base { #if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) /* * Shuffle 128-bits selected by imm8 from a and b, and store the results in dst. * Args : [a0, a1] * [b0, b1] * Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1] */ template static INLINE CONST __m256d permute128(const __m256d a, const __m256d b) { return _mm256_permute2f128_pd(a, b, s); } template static INLINE CONST __m256 permute128(const __m256 a, const __m256 b) { return _mm256_permute2f128_ps(a, b, s); } /* * Unpack and interleave 128-bit integers from the low half of a and b, and store the results in dst. * Args : [a0, a1] int128_t [b0, b1] int128_t * Return : [a0, b0] int128_t */ static INLINE CONST __m256d unpacklo128(const __m256d a, const __m256d b) { return permute128<0x20>(a, b); } static INLINE CONST __m256 unpacklo128(const __m256 a, const __m256 b) { return permute128<0x20>(a, b); } /* * Unpack and interleave 128-bit integers from the high half of a and b, and store the results in dst. 
* Args : [a0, a1] int128_t [b0, b1] int128_t * Return : [a1, b1] int128_t */ static INLINE CONST __m256d unpackhi128(const __m256d a, const __m256d b) { return permute128<0x31>(a, b); } static INLINE CONST __m256 unpackhi128(const __m256 a, const __m256 b) { return permute128<0x31>(a, b); } #endif }; struct Simd256i_base { /* * alias to 256 bit simd register */ using vect_t = __m256i; /* * Return vector of type vect_t with all elements set to zero * Return [0, ...,0] */ static INLINE CONST vect_t zero() { return _mm256_setzero_si256(); } #if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) /* * Shift packed 128-bit integers in a left by s bits while shifting in zeros, and store the results in vect_t. * Args : [a0, a1] int128_t * Return : [a0 << (s*8), a1 << (s*8)] int128_t */ template static INLINE CONST vect_t sll128(const vect_t a) { return _mm256_bslli_epi128(a, s); } /* * Shift packed 128-bit integers in a right by s while shifting in zeros, and store the results in vect_t. * Args : [a0, a1] int128_t * Return : [a0 << (s*8), a1 << (s*8)] int128_t */ template static INLINE CONST vect_t srl128(const vect_t a) { return _mm256_bsrli_epi128(a, s); } /* * Compute the bitwise AND and store the results in vect_t. * Args : [a0, ..., a255] * [b0, ..., b255] * Return : [a0 AND b0, ..., a255 AND b255] */ static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_si256(b, a); } /* * Compute the bitwise OR and store the results in vect_t. * Args : [a0, ..., a255] * [b0, ..., b255] * Return : [a0 OR b0, ..., a255 OR b255] */ static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_si256(b, a); } /* * Compute the bitwise XOR and store the results in vect_t. * Args : [a0, ..., a255] * [b0, ..., b255] * Return : [a0 XOR b0, ..., a255 XOR b255] */ static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_si256(b, a); } /* * Compute the bitwise AND NOT and store the results in vect_t. 
* Args : [a0, ..., a255] * [b0, ..., b255] * Return : [a0 AND (NOT b0), ..., a255 AND (NOT b255)] */ static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_si256(b, a); } /* * Shuffle 128-bit integers in a and b using the control in imm8, and store the results in dst. * Args : [a0, a1] int128_t * [b0, b1] int128_t * Return : [s[0..3]?a0:a1:b0:b1, s[4..7]?a0:a1:b0:b1] int128_t */ template static INLINE CONST vect_t permute128(const vect_t a, const vect_t b) { return _mm256_permute2x128_si256(a, b, s); } /* * Unpack and interleave 128-bit integers from the low half of a and b, and store the results in dst. * Args : [a0, a1] int128_t [b0, b1] int128_t * Return : [a0, b0] int128_t */ static INLINE CONST vect_t unpacklo128(const vect_t a, const vect_t b) { return permute128<0x20>(a, b); } /* * Unpack and interleave 128-bit integers from the high half of a and b, and store the results in dst. * Args : [a0, a1] int128_t [b0, b1] int128_t * Return : [a1, b1] int128_t */ static INLINE CONST vect_t unpackhi128(const vect_t a, const vect_t b) { return permute128<0x31>(a, b); } #endif }; template struct Simd256_impl; template using Simd256 = Simd256_impl::value, std::is_integral::value, std::is_signed::value, sizeof(T)>; #include "simd256_float.inl" #include "simd256_double.inl" #ifdef SIMD_INT // To many missing insctructions on int8_t #if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) #include "simd256_int64.inl" #include "simd256_int32.inl" #include "simd256_int16.inl" #endif #endif //#ifdef SIMD_INT #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd256_double.inl000066400000000000000000000427121274716147400250370ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) 
* * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL #if not (defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS)) #error "You need AVX instructions to perform 256bits operations on double" #endif /* * Simd256 specialized for double */ template <> struct Simd256_impl : public Simd256fp_base { /* * alias to 256 bit simd register */ using vect_t = __m256d; /* * define the scalar type corresponding to the specialization */ using scalar_t = double; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 4; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 32; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Return vector of type vect_t with all elements set to zero * Return [0,0,0,0] */ static INLINE CONST vect_t 
zero() { return _mm256_setzero_pd(); } /* * Broadcast double-precision (64-bit) floating-point value x to all elements of vect_t. * Return [x,x,x,x] */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_pd(x); } /* * Set packed double-precision (64-bit) floating-point elements in vect_t with the supplied values. * Return [x1,x2,x3,x4] */ static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4) { return _mm256_set_pd(x4, x3, x2, x1); } /* * Gather double-precision (64-bit) floating-point elements with indexes idx[0], ..., idx[3] from the address p in *vect_t. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { // TODO AVX2 Gather return _mm256_set_pd(p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]); } /* * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0], p[1], p[2], p[3]] */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_pd(p); } /* * Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into vect_t. * p does not need to be aligned on any particular boundary. * Return [p[0], p[1], p[2], p[3]] */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_pd(p); } /* * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_pd(const_cast(p), v); } /* * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from p into memory. * p does not need to be aligned on any particular boundary. 
*/ static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_pd(const_cast(p), v); } /* * Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory using * a non-temporal memory hint. * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_pd(const_cast(p), v); } /* * Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, * and store the results in dst. * Args : [a0, a1, a2, a3] double [b0, b1, b2, b3] double * Return : [a[s[0..1]], ..., a[s[6..7]]] double */ #if defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) template static INLINE CONST vect_t shuffle(const vect_t a) { return _mm256_permute4x64_pd(a, s); } #endif /* * Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, * and store the results in dst. * Args : [a0, a1, a2, a3] double [b0, b1, b2, b3] double * Return : [a0, b0, a2, b2] double */ static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_pd(a, b); } /* * Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, * and store the results in dst. * Args : [a0, a1, a2, a3] double [b0, b1, b2, b3] double * Return : [a1, b1, a3, b3] double */ static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_pd(a, b); } /* * Blend packed double-precision (64-bit) floating-point elements from a and b using control mask s, * and store the results in dst. 
* Args   : [a0, a1, a2, a3] double [b0, b1, b2, b3] double
     * Return : [s[0]?b0:a0, ..., s[3]?b3:a3] double (a set control bit selects the element of b)
     */
    template <uint8_t s> static INLINE CONST vect_t blend(const vect_t a, const vect_t b) {
        return _mm256_blend_pd(a, b, s);
    }

    /*
     * Blend packed double-precision (64-bit) floating-point elements from a and b using mask,
     * and store the results in dst. Only the sign bit of each 64-bit mask element is looked at.
     * Args   : [a0, a1, a2, a3] double [b0, b1, b2, b3] double
     * Return : [mask[63]?b0:a0, ..., mask[255]?b3:a3] double
     */
    static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) {
        return _mm256_blendv_pd(a, b, mask);
    }

    /*
     * Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0+b0, a1+b1, a2+b2, a3+b3]
     */
    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_pd(a, b); }

    /*
     * In-place add: a is overwritten with a+b and the new value is returned.
     */
    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }

    /*
     * Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit)
     * floating-point elements in a, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0-b0, a1-b1, a2-b2, a3-b3]
     */
    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_pd(a, b); }

    /*
     * In-place subtract: a is overwritten with a-b and the new value is returned.
     * Not marked CONST (unlike sub) because it writes through a, same as addin.
     */
    static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }

    /*
     * Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0*b0, a1*b1, a2*b2, a3*b3]
     */
    static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_pd(a, b); }

    /*
     * In-place multiply: a is overwritten with a*b and the new value is returned.
     * Not marked CONST (unlike mul) because it writes through a, same as addin.
     */
    static INLINE vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); }

    /*
     * Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b,
     * and store the results in dst.
* Args : [a0, a1, a2, a3], [b0, b1, b2, b3] * Return : [a0/b0, a1/b1, a2/b2, a3/b3] */ static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm256_div_pd(a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3] */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm256_fmadd_pd(a, b, c); #else return add(c, mul(a, b)); #endif } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3] */ static INLINE CONST vect_t madd(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3] */ static INLINE CONST vect_t maddx(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); } static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result * to packed elements in c, and store the results in vect_t. 
* Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3] */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm256_fnmadd_pd(a, b, c); #else return sub(c, mul(a, b)); #endif } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result * to packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3] */ static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); } static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3] */ static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm256_fmsub_pd(a, b, c); #else return sub(mul(a, b), c); #endif } /* * Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. * Args : [a0, a1, a2, a3], [b0, b1, b2, b3], [c0, c1, c2, c3] * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3] */ static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); } static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } /* * Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in vect_t. 
* Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2==b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser-than, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0<b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2<b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3<b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for lesser or equal than, and store
     * the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0<=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2>b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GT_OS); }

    /*
     * Compare packed double-precision (64-bit) floating-point elements in a and b for greater or equal than, and store
     * the results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(a0>=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0,
     *           (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0]
     */
    static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_pd(a, b, _CMP_GE_OS); }

    /*
     * Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3]
     */
    static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_pd(a, b); }

    /*
     * Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the
     * results in vect_t.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3]
     */
    static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_pd(a, b); }

    /*
     * Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the
     * results in vect_t.
* Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3]
     */
    static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) {
        const vect_t bits = _mm256_xor_pd(a, b);
        return bits;
    }

    /*
     * Compute the bitwise NOT of a, AND it with b, and store the results in vect_t
     * (the intrinsic negates its first operand).
     * NOTE(review): Simd256i_base::vandnot swaps its operands and therefore computes a AND (NOT b);
     * the two conventions differ -- confirm which one callers expect.
     * Args   : [a0, a1, a2, a3], [b0, b1, b2, b3]
     * Return : [(NOT a0) AND b0, (NOT a1) AND b1, (NOT a2) AND b2, (NOT a3) AND b3]
     */
    static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) {
        const vect_t bits = _mm256_andnot_pd(a, b);
        return bits;
    }

    /*
     * Round each packed double down to an integer value, kept as a double.
     * Args   : [a0, a1, a2, a3]
     * Return : [floor(a0), floor(a1), floor(a2), floor(a3)]
     */
    static INLINE CONST vect_t floor(const vect_t a) {
        const vect_t lowered = _mm256_floor_pd(a);
        return lowered;
    }

    /*
     * Round each packed double up to an integer value, kept as a double.
     * Args   : [a0, a1, a2, a3]
     * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3)]
     */
    static INLINE CONST vect_t ceil(const vect_t a) {
        const vect_t raised = _mm256_ceil_pd(a);
        return raised;
    }

    /*
     * Round each packed double to the nearest integer value (floating-point exceptions suppressed),
     * kept as a double.
     * Args   : [a0, a1, a2, a3]
     * Return : [round(a0), round(a1), round(a2), round(a3)]
     */
    static INLINE CONST vect_t round(const vect_t a) {
        const vect_t nearest = _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        return nearest;
    }

    /*
     * Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the
     * results in vect_t.
* Args : [a0, a1, a2, a3], [b0, b1, b2, b3] * Return : [a0+a1, b0+b1, a2+a3, b2+b3] */ static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_pd(a, b); } /* * Horizontally add double-precision (64-bit) floating-point elements in a. * Args : [a0, a1, a2, a3] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] + ((const scalar_t *)&a)[3]; } static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { FLOAT_MOD(C, P, INVP, Q); NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } }; #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_double_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd256_float.inl000066400000000000000000000474121274716147400246740ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_float_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd256_float_INL /* * Simd256 specialized for float */ template <> struct Simd256_impl : public Simd256fp_base { #if defined(__FFLASFFPACK_HAVE_AVX_INSTRUCTIONS) or defined(__FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS) /* * alias to the 256-bit simd register type (holds vect_size scalar_t values) */ using vect_t = __m256; /* * define the scalar type corresponding to the specialization */ using scalar_t = float; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 8; /* * alignment (in bytes) required of a scalar_t pointer for the aligned load/store of a vect_t */ static const constexpr size_t alignment = 32; /* * Check if the address held by the pointer p is a multiple of alignment */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Return vector of type vect_t with all elements set to zero * Return [0,0,0,0,0,0,0,0] */ static INLINE CONST vect_t zero() { return _mm256_setzero_ps(); } /* * Broadcast single-precision (32-bit) floating-point value x to all elements of vect_t. * Return [x,x,x,x,x,x,x,x] */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_ps(x); } /* * Set packed single-precision (32-bit) floating-point elements in vect_t with the supplied values. 
* Return [x1,x2,x3,x4,x5,x6,x7,x8] * (the arguments are handed to _mm256_set_ps in reverse order so that x1 ends up in element 0) */ static INLINE CONST vect_t set(const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7, const scalar_t x8) { return _mm256_set_ps(x8, x7, x6, x5, x4, x3, x2, x1); } /* * Gather single-precision (32-bit) floating-point elements with indexes idx[0], ..., idx[7] from the address p in vect_t. * Emulated with eight scalar reads; no hardware gather instruction is used here. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]] */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { // TODO AVX2 Gather return _mm256_set_ps(p[idx[7]], p[idx[6]], p[idx[5]], p[idx[4]], p[idx[3]], p[idx[2]], p[idx[1]], p[idx[0]]); } /* * Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into vect_t. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]] */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_ps(p); } /* * Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into vect_t. * p does not need to be aligned on any particular boundary. * Return [p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]] */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_ps(p); } /* * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from v into memory at p. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * NOTE: p is declared pointer-to-const but the memory it points to is written (constness is cast away). */ static INLINE void store(const scalar_t *p, const vect_t v) { _mm256_store_ps(const_cast(p), v); } /* * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory. * p does not need to be aligned on any particular boundary. 
*/ static INLINE void storeu(const scalar_t *p, const vect_t v) { _mm256_storeu_ps(const_cast(p), v); } /* * Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from v into memory at p using * a non-temporal memory hint. * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. * NOTE: p is declared pointer-to-const but the memory it points to is written (constness is cast away). */ static INLINE void stream(const scalar_t *p, const vect_t v) { _mm256_stream_ps(const_cast(p), v); } /* * Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in s, * and store the results in dst. The same 8-bit control (four 2-bit selectors) is applied to both lanes. * Args : [a0, ..., a7] float * Return : [a[s[0..1]], a[s[2..3]], a[s[4..5]], a[s[6..7]], a[4+s[0..1]], a[4+s[2..3]], a[4+s[4..5]], a[4+s[6..7]]] float */ template static INLINE CONST vect_t shuffle_twice(const vect_t a) { return _mm256_permute_ps(a, s); } /* * Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, * and store the results in dst. * Args : [a0, ..., a7] float [b0, ..., b7] float * Return : [a0, b0, a1, b1, a4, b4, a5, b5] float */ static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_ps(a, b); } /* * Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, * and store the results in dst. * Args : [a0, ..., a7] float [b0, ..., b7] float * Return : [a2, b2, a3, b3, a6, b6, a7, b7] float */ static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_ps(a, b); } /* * Blend packed single-precision (32-bit) floating-point elements from a and b using control mask s, * and store the results in dst. 
* Args : [a0, ..., a7] float [b0, ..., b7] float
    * Return : [s[0]?b0:a0, ..., s[7]?b7:a7] float
    * (bit i of s set selects element i of b, cleared selects element i of a)
    */
    template static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { return _mm256_blend_ps(a, b, s); }

    /*
    * Blend packed single-precision (32-bit) floating-point elements from a and b using mask,
    * and store the results in dst.
    * Args : [a0, ..., a7] float [b0, ..., b7] float
    * Return : [mask[31]?b0:a0, ..., mask[255]?b7:a7] float
    * (the top bit of each 32-bit mask element selects b when set, a otherwise)
    */
    static INLINE CONST vect_t blendv(const vect_t a, const vect_t b, const vect_t mask) { return _mm256_blendv_ps(a, b, mask); }

    /*
    * Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
    * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
    * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7]
    */
    static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_ps(a, b); }

    /*
    * In-place add: a is overwritten with a+b and the new value of a is returned.
    */
    static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); }

    /*
    * Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit)
    * floating-point elements in a, and store the results in vect_t.
    * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7]
    * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7]
    */
    static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_ps(a, b); }

    /*
    * In-place subtract: a is overwritten with a-b and the new value of a is returned.
    * Not marked CONST (same as addin): the function writes through the reference a, so its effect
    * is not limited to its return value.
    */
    static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); }

    /*
    * Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in vect_t.
* Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7] */ static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return _mm256_mul_ps(a, b); } static INLINE CONST vect_t mulin(vect_t &a, const vect_t b) { return a = mul(a, b); } /* * Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, * and store the results in dst. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [a0/b0, a1/b1, a2/b2, a3/b3, a4/b4, a5/b5, a6/b6, a7/b7] */ static INLINE CONST vect_t div(const vect_t a, const vect_t b) { return _mm256_div_ps(a, b); } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7] */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm256_fmadd_ps(a, b, c); #else return add(c, mul(a, b)); #endif } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7] */ static INLINE CONST vect_t madd(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to * packed elements in c, and store the results in vect_t. 
* Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7] * Return : [a0*b0+c0, a1*b1+c1, a2*b2+c2, a3*b3+c3, a4*b4+c4, a5*b5+c5, a6*b6+c6, a7*b7+c7] */ static INLINE CONST vect_t maddx(const vect_t c, const vect_t a, const vect_t b) { return fmadd(c, a, b); } static INLINE CONST vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result * to packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7] * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3, -(a4*b4)+c4, -(a5*b5)+c5, -(a6*b6)+c6, -(a7*b7)+c7] */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm256_fnmadd_ps(a, b, c); #else return sub(c, mul(a, b)); #endif } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result * to packed elements in c, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7] * Return : [-(a0*b0)+c0, -(a1*b1)+c1, -(a2*b2)+c2, -(a3*b3)+c3, -(a4*b4)+c4, -(a5*b5)+c5, -(a6*b6)+c6, -(a7*b7)+c7] */ static INLINE CONST vect_t nmadd(const vect_t c, const vect_t a, const vect_t b) { return fnmadd(c, a, b); } static INLINE CONST vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. 
* Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7] * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3, a4*b4-c4, a5*b5-c5, a6*b6-c6, a7*b7-c7] */ static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { #ifdef __FMA__ return _mm256_fmsub_ps(a, b, c); #else return sub(mul(a, b), c); #endif } /* * Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from * the intermediate result, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7], [c0, c1, c2, c3, c4, c5, c6, c7] * Return : [a0*b0-c0, a1*b1-c1, a2*b2-c2, a3*b3-c3, a4*b4-c4, a5*b5-c5, a6*b6-c6, a7*b7-c7] */ static INLINE CONST vect_t msub(const vect_t c, const vect_t a, const vect_t b) { return fmsub(c, a, b); } static INLINE CONST vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } /* * Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [(a0==b0) ? 0xFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFF : 0, (a2==b2) ? 0xFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFF : 0, (a4==b4) ? 0xFFFFFFFF : 0, (a5==b5) ? 0xFFFFFFFF : 0, (a6==b6) ? 0xFFFFFFFF : 0, (a7==b7) ? 0xFFFFFFFF : 0] */ static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } /* * Compare packed single-precision (32-bit) floating-point elements in a and b for lesser-than, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [(a0b0) ? 0xFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFF : 0, (a2>b2) ? 0xFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFF : 0, (a4>b4) ? 0xFFFFFFFF : 0, (a5>b5) ? 0xFFFFFFFF : 0, (a6>b6) ? 0xFFFFFFFF : 0, (a7>b7) ? 
0xFFFFFFFF : 0] * Uses the ordered predicate _CMP_GT_OS: an element compares false (0) when either operand is NaN. */ static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_GT_OS); } /* * Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in vect_t. * Uses the ordered predicate _CMP_GE_OS: an element compares false (0) when either operand is NaN. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [(a0>=b0) ? 0xFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFF : 0, (a2>=b2) ? 0xFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFF : 0, (a4>=b4) ? 0xFFFFFFFF : 0, (a5>=b5) ? 0xFFFFFFFF : 0, (a6>=b6) ? 0xFFFFFFFF : 0, (a7>=b7) ? 0xFFFFFFFF : 0] */ static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return _mm256_cmp_ps(a, b, _CMP_GE_OS); } /* * Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the * results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [a0 AND b0, a1 AND b1, a2 AND b2, a3 AND b3, a4 AND b4, a5 AND b5, a6 AND b6, a7 AND b7] */ static INLINE CONST vect_t vand(const vect_t a, const vect_t b) { return _mm256_and_ps(a, b); } /* * Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the * results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [a0 OR b0, a1 OR b1, a2 OR b2, a3 OR b3, a4 OR b4, a5 OR b5, a6 OR b6, a7 OR b7] */ static INLINE CONST vect_t vor(const vect_t a, const vect_t b) { return _mm256_or_ps(a, b); } /* * Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the * results in vect_t. 
* Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [a0 XOR b0, a1 XOR b1, a2 XOR b2, a3 XOR b3, a4 XOR b4, a5 XOR b5, a6 XOR b6, a7 XOR b7] */ static INLINE CONST vect_t vxor(const vect_t a, const vect_t b) { return _mm256_xor_ps(a, b); } /* * Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the * results in vect_t. Here x ANDNOT y means (NOT x) AND y: it is the first operand that is complemented. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [a0 ANDNOT b0, a1 ANDNOT b1, a2 ANDNOT b2, a3 ANDNOT b3, a4 ANDNOT b4, a5 ANDNOT b5, a6 ANDNOT b6, a7 * ANDNOT b7] */ static INLINE CONST vect_t vandnot(const vect_t a, const vect_t b) { return _mm256_andnot_ps(a, b); } /* * Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the * results as packed single-precision floating-point elements in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : [floor(a0), floor(a1), floor(a2), floor(a3), floor(a4), floor(a5), floor(a6), floor(a7)] */ static INLINE CONST vect_t floor(const vect_t a) { return _mm256_floor_ps(a); } /* * Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the * results as packed single-precision floating-point elements in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : [ceil(a0), ceil(a1), ceil(a2), ceil(a3), ceil(a4), ceil(a5), ceil(a6), ceil(a7)] */ static INLINE CONST vect_t ceil(const vect_t a) { return _mm256_ceil_ps(a); } /* * Round the packed single-precision (32-bit) floating-point elements in a, and store the results as packed * single-precision floating-point elements in vect_t. 
* Rounding is to the nearest integer (_MM_FROUND_TO_NEAREST_INT) with exceptions suppressed (_MM_FROUND_NO_EXC). * Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : [round(a0), round(a1), round(a2), round(a3), round(a4), round(a5), round(a6), round(a7)] */ static INLINE CONST vect_t round(const vect_t a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } /* * Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the * results in vect_t. Within each 128-bit lane the two sums from a come first, then the two sums from b. * Args : [a0, a1, a2, a3, a4, a5, a6, a7], [b0, b1, b2, b3, b4, b5, b6, b7] * Return : [a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7] */ static INLINE CONST vect_t hadd(const vect_t a, const vect_t b) { return _mm256_hadd_ps(a, b); } /* * Horizontally add single-precision (32-bit) floating-point elements in a. * The register is read back as eight scalars and summed left to right. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : a0+a1+a2+a3+a4+a5+a6+a7 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { return ((const scalar_t *)&a)[0] + ((const scalar_t *)&a)[1] + ((const scalar_t *)&a)[2] + ((const scalar_t *)&a)[3] + ((const scalar_t *)&a)[4] + ((const scalar_t *)&a)[5] + ((const scalar_t *)&a)[6] + ((const scalar_t *)&a)[7]; } /* * Reduce C in place using the FLOAT_MOD and NORML_MOD macros (defined elsewhere) and return C. * Q and T are scratch registers passed to those macros. */ static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { FLOAT_MOD(C, P, INVP, Q); NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } #else // __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS #error "You need AVX instructions to perform 256bits operations on float" #endif }; #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_float_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd256_int16.inl000066400000000000000000000673511274716147400245340ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * Romain Lebreton * * * ========LICENCE======== * This file is part of the library 
FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_int16_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd256_int16_INL #ifndef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS #error "You need AVX2 instructions to perform 256bits operations on int16_t" #endif /* * Simd256 specialized for int16_t */ template <> struct Simd256_impl : public Simd256i_base { /* * alias to the 256-bit simd register type */ using vect_t = __m256i; /* * alias to the 128-bit simd register type (half the width of vect_t) */ using half_t = __m128i; /* * define the scalar type corresponding to the specialization */ using scalar_t = int16_t; /* * Simd128 for scalar_t, to deal with half_t */ using simdHalf = Simd128; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 16; /* * alignment (in bytes) required of a scalar_t pointer for the aligned load/store of a vect_t */ static const constexpr size_t alignment = 32; /* * Check if the address held by the pointer p is a multiple of alignment */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Converter from vect_t to a tab. 
* example: * Converter conv; * conv.v = a; * scalar_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 16-bit integer x to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x] int16_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi16(x); } /* * Set packed 16-bit integers in dst with the supplied values. * (the arguments are handed to _mm256_set_epi16 in reverse order so that x0 ends up in element 0) * Return [x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15] int16_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7, const scalar_t x8, const scalar_t x9, const scalar_t x10, const scalar_t x11, const scalar_t x12, const scalar_t x13, const scalar_t x14, const scalar_t x15) { return _mm256_set_epi16(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); } /* * Gather 16-bit integer elements with indexes idx[0], ..., idx[15] from the address p in vect_t. * Emulated with sixteen scalar reads passed to set(). * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]], p[idx[8]], p[idx[9]], p[idx[10]], p[idx[11]], p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]] int16_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]], p[idx[8]], p[idx[9]], p[idx[10]], p[idx[11]], p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]); } /* * Load 256-bits of integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]] int16_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_si256(reinterpret_cast(p)); } /* * Load 256-bits of integer data from memory into dst. 
* p does not need to be aligned on any particular boundary. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]] int16_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_si256(reinterpret_cast(p)); } /* * Store 256-bits of integer data from v into memory at p. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm256_store_si256(reinterpret_cast(p), v); } /* * Store 256-bits of integer data from v into memory at p. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm256_storeu_si256(reinterpret_cast(p), v); } /* * Store 256-bits of integer data from v into memory at p using a non-temporal memory hint. * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm256_stream_si256(reinterpret_cast(p), v); } /* * Shift packed 16-bit integers in a left by s while shifting in zeros, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t * Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s, * a8 << s, a9 << s, a10 << s, a11 << s, a12 << s, a13 << s, a14 << s, a15 << s] int16_t */ static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi16(a, s); } /* * Shift packed 16-bit integers in a right by s while shifting in zeros, and store the results in vect_t. 
* Args : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s, * a8 >> s, a9 >> s, a10 >> s, a11 >> s, a12 >> s, a13 >> s, a14 >> s, a15 >> s] int16_t */ static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi16(a, s); } /* * Shift packed 16-bit integers in a right by s while shifting in sign bits, and store the results in vect_t. * Args : [a0, ..., a15] int16_t * Return : [a0 >> s, ..., a15 >> s] int16_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srai_epi16(a, s); } /* * Shuffle 16-bit integers in a using the control in imm8, and store the results in dst. * Args : [a0, ..., a15] int16_t * Return : [a[s[0..3]], ..., a[s[60..63]] int16_t */ template static INLINE CONST vect_t shuffle(const vect_t a) { //#pragma warning "The simd shuffle function is emulated, it may impact the performances."; Converter conv; conv.v = a; return set (conv.t[( s & 0x000000000000000F)], conv.t[( s & 0x00000000000000F0)], conv.t[((s>> 8) & 0x000000000000000F)], conv.t[((s>> 8) & 0x00000000000000F0)], conv.t[((s>>16) & 0x000000000000000F)], conv.t[((s>>16) & 0x00000000000000F0)], conv.t[((s>>24) & 0x000000000000000F)], conv.t[((s>>24) & 0x00000000000000F0)], conv.t[((s>>32) & 0x000000000000000F)], conv.t[((s>>32) & 0x00000000000000F0)], conv.t[((s>>40) & 0x000000000000000F)], conv.t[((s>>40) & 0x00000000000000F0)], conv.t[((s>>48) & 0x000000000000000F)], conv.t[((s>>48) & 0x00000000000000F0)], conv.t[((s>>56) & 0x000000000000000F)], conv.t[((s>>56) & 0x00000000000000F0)]); } /* * Unpack and interleave 16-bit integers from the low half of a and b within 128-bit lanes, and store the results in dst. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a0, b0, a1, b1, ..., a8, b8, a9, b9, ...] 
int16_t * (i.e. [a0, b0, ..., a3, b3] in the low 128-bit lane and [a8, b8, ..., a11, b11] in the high lane) */ static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_epi16(a, b); } /* * Unpack and interleave 16-bit integers from the high half of a and b within 128-bit lanes, and store the results in dst. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a4, b4, a5, b5, ..., a12, b12, a13, b13, ...] int16_t * (i.e. [a4, b4, ..., a7, b7] in the low 128-bit lane and [a12, b12, ..., a15, b15] in the high lane) */ static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_epi16(a, b); } /* * Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst. * The 64-bit blocks of a and b are first permuted to [0,2,1,3] so that the per-lane unpack * (unpacklo_twice) yields the interleave of the full low halves. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a0, b0, ..., a7, b7] int16_t */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { using Simd256_64 = Simd256; vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3] uint64 vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4 return unpacklo_twice(a1, b1); } /* * Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a8, b8, ..., a15, b15] int16_t */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { using Simd256_64 = Simd256; vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4 return unpackhi_twice(a1, b1); } /* * Unpack and interleave 16-bit integers from the low then high half of a and b, and store the results in dst. 
* Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a0, b0, ..., a7, b7] int16_t * [a8, b8, ..., a15, b15] int16_t */ static INLINE CONST void unpacklohi(vect_t& s1, vect_t& s2, const vect_t a, const vect_t b) { using Simd256_64 = Simd256; vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4 s1 = unpacklo_twice(a1, b1); s2 = unpackhi_twice(a1, b1); } /* * Blend packed 16-bit integers from a and b in each 128 lane using control mask imm8, and store the results in dst. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [s[0]?a0:b0, , s[15]?a15:b15] int16_t */ template static INLINE CONST vect_t blend_twice(const vect_t a, const vect_t b) { return _mm256_blend_epi16(a, b, s); } /* * Add packed 16-bits integer in a and b, and store the results in vect_t. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7, a8+b8, a9+b9, a10+b10, a11+b11, a12+b12, a13+b13, a14+b14, a15+b15] int16_t */ static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi16(a, b); } static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); } /* * Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in vect_t. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7, a8-b8, a9-b9, a10-b10, a11-b11, a12-b12, a13-b13, a14-b14, a15-b15] int16_t */ static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi16(a, b); } static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); } /* * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in vect_t. 
* Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t * Return : [a0*b0 smod 2^16, ..., a15*b15 smod 2^16] int16_t * where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm256_mullo_epi16(a, b); } static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); } /* * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t * [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t * Return : [Floor(a0*b0/2^16), ..., Floor(a15*b15/2^16)] int16_t */ static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm256_mulhi_epi16(a, b); } /* * Multiply the low 8-bit integers from each packed 16-bit element in a and b, and store the signed 16-bit results in dst. * Emulated: each operand x is first reduced to (x smod 2^8) by computing ((x + 0x80) AND 0xFF) - 0x80, * then the two reduced operands are multiplied with mul. * Args : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] int16_t * [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15] int16_t * Return : [(a0 smod 2^8)*(b0 smod 2^8), ..., (a15 smod 2^8)*(b15 smod 2^8)] int16_t * where (a smod p) is the signed representative of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mulx(vect_t a, vect_t b) { //#pragma warning "The simd mulx function is emulated, it may impact the performances." vect_t a1, b1, mask1, mask2; mask1 = set1(0x00FF); mask2 = set1(0x0080); a1 = add(a,mask2); a1 = vand(a1,mask1); a1 = sub(a1,mask2); b1 = add(b,mask2); b1 = vand(b1,mask1); b1 = sub(b1,mask2); return mul(a1,b1); } /* * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, * keep the low 16 bits of the intermediate and add the low 16-bits of c. 
* Note the argument order: the accumulator c comes first. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t [c0, ..., c15] int16_t * Return : [(a0*b0+c0) smod 2^16, ..., (a15*b15+c15) smod 2^16] int16_t */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); } /* * In-place variant of fmadd: c is overwritten with the result, which is also returned. */ static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply the low 8-bit integers from each packed 16-bit element in a and b, * keep the signed 16-bit results and add the low 16-bits of c. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t [c0, ..., c15] int16_t * Return : [((a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ..., * ((a15 smod 2^8)*(b15 smod 2^8)+c15) smod 2^16] int16_t */ static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } /* * In-place variant of fmaddx: c is overwritten with the result, which is also returned. */ static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } /* * Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, * and subtract the low 16 bits of the intermediate from elements of c. * Args : [a0, ..., a15] int16_t [b0, ..., b15] int16_t [c0, ..., c15] int16_t * Return : [(-a0*b0+c0) smod 2^16, ..., (-a15*b15+c15) smod 2^16] int16_t */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); } /* * In-place variant of fnmadd: c is overwritten with the result, which is also returned. */ static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply the low 8-bit integers from each packed 16-bit element in a and b, * keep the signed 16-bit results and substract them from elements of c. 
* Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 *          [c0, ..., c15] int16_t
	 * Return : [(-(a0 smod 2^8)*(b0 smod 2^8)+c0) smod 2^16, ...,
	 *           (-(a15 smod 2^8)*(b15 smod 2^8)+c15) smod 2^16] int16_t
	 */
	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mulx(a, b);
		return sub(c, product);
	}

	/*
	 * In-place variant of fnmaddx: c receives fnmaddx(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) {
		c = fnmaddx(c, a, b);
		return c;
	}

	/*
	 * Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers,
	 * and subtract elements of c from the low 16-bits of the intermediate.
	 * Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 *          [c0, ..., c15] int16_t
	 * Return : [(a0*b0-c0) smod 2^16, ..., (a15*b15-c15) smod 2^16] int16_t
	 */
	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mul(a, b);
		return sub(product, c);
	}

	/*
	 * In-place variant of fmsub: c receives fmsub(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) {
		c = fmsub(c, a, b);
		return c;
	}

	/*
	 * Multiply the low 8-bit integers from each packed 16-bit element in a and b,
	 * keep the signed 16-bit results and subtract elements of c from them.
	 * Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 *          [c0, ..., c15] int16_t
	 * Return : [((a0 smod 2^8)*(b0 smod 2^8)-c0) smod 2^16, ...,
	 *           ((a15 smod 2^8)*(b15 smod 2^8)-c15) smod 2^16] int16_t
	 */
	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mulx(a, b);
		return sub(product, c);
	}

	/*
	 * In-place variant of fmsubx: c receives fmsubx(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) {
		c = fmsubx(c, a, b);
		return c;
	}

	/*
	 * Compare packed 16-bits in a and b for equality, and store the results in vect_t.
	 * Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 * Return : [(a0==b0) ? 0xFFFF : 0, (a1==b1) ? 0xFFFF : 0, (a2==b2) ? 0xFFFF : 0, (a3==b3) ? 0xFFFF : 0,
	 *           (a4==b4) ? 0xFFFF : 0, (a5==b5) ? 0xFFFF : 0, (a6==b6) ? 0xFFFF : 0, (a7==b7) ? 0xFFFF : 0,
	 *           (a8==b8) ? 0xFFFF : 0, (a9==b9) ? 0xFFFF : 0, (a10==b10) ? 0xFFFF : 0, (a11==b11) ? 0xFFFF : 0,
	 *           (a12==b12) ? 0xFFFF : 0, (a13==b13) ?
0xFFFF : 0, (a14==b14) ? 0xFFFF : 0, (a15==b15) ? 0xFFFF : 0] int16_t
	 */
	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) {
		const vect_t mask = _mm256_cmpeq_epi16(a, b);
		return mask;
	}

	/*
	 * Compare packed 16-bits in a and b for greater-than, and store the results in vect_t.
	 * Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 * Return : [(a0>b0) ? 0xFFFF : 0, ..., (a15>b15) ? 0xFFFF : 0] int16_t
	 */
	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) {
		const vect_t mask = _mm256_cmpgt_epi16(a, b);
		return mask;
	}

	/*
	 * Compare packed 16-bits in a and b for lesser-than, and store the results in vect_t.
	 * Implemented as the greater-than comparison with the operands swapped.
	 * Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 * Return : [(a0<b0) ? 0xFFFF : 0, ..., (a15<b15) ? 0xFFFF : 0] int16_t
	 */
	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) {
		const vect_t mask = _mm256_cmpgt_epi16(b, a);
		return mask;
	}

	/*
	 * Compare packed 16-bits in a and b for greater or equal than, and store the results in vect_t.
	 * Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 * Return : [(a0>=b0) ? 0xFFFF : 0, ..., (a15>=b15) ? 0xFFFF : 0] int16_t
	 */
	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) {
		const vect_t gt = greater(a, b);
		const vect_t same = eq(a, b);
		return vor(gt, same);
	}

	/*
	 * Compare packed 16-bits in a and b for lesser or equal than, and store the results in vect_t.
	 * Args   : [a0, ..., a15] int16_t
	 *          [b0, ..., b15] int16_t
	 * Return : [(a0<=b0) ? 0xFFFF : 0, (a1<=b1) ? 0xFFFF : 0, (a2<=b2) ?
0xFFFF : 0, (a3<=b3) ? 0xFFFF : 0, (a4<=b4) ? 0xFFFF : 0, (a5<=b5) ? 0xFFFF : 0, (a6<=b6) ? 0xFFFF : 0, (a7<=b7) ? 0xFFFF : 0, (a8<=b8) ? 0xFFFF : 0, (a9<=b9) ? 0xFFFF : 0, (a10<=b10) ? 0xFFFF : 0, (a11<=b11) ? 0xFFFF : 0, (a12<=b12) ? 0xFFFF : 0, (a13<=b13) ? 0xFFFF : 0, (a14<=b14) ? 0xFFFF : 0, (a15<=b15) ? 0xFFFF : 0] int16_t */ static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Horizontally add 16-bits elements of a. * Args : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] * Return : a0+a1+a2+a3+a4+a5+a6+a7+a8+a9+a10+a11+a12+a13+a14+a15 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter ca; ca.v = a; return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7] + ca.t[8] + ca.t[9] + ca.t[10] + ca.t[11] + ca.t[12] + ca.t[13] + ca.t[14] + ca.t[15]); } static INLINE CONST vect_t round(const vect_t a) { return a; } static INLINE CONST vect_t signbits(const vect_t x) { vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1)); return signBits; } static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { #ifdef __INTEL_COMPILER C = _mm256_rem_epi16(C, P); #else FFLASFFPACK_abort("pas implementé"); #endif NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } }; /* * Simd128 specialized for uint16_t */ template <> struct Simd256_impl : public Simd256_impl { /* * define the scalar type corresponding to the specialization */ using scalar_t = uint16_t; /* * Simd128 for scalar_t, to deal half_t */ using simdHalf = Simd128; /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 16-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. 
* Return [x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x] uint16_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi16(x); } /* * Set packed 16-bit unsigned integers in dst with the supplied values. * Return [x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15] uint16_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7, const scalar_t x8, const scalar_t x9, const scalar_t x10, const scalar_t x11, const scalar_t x12, const scalar_t x13, const scalar_t x14, const scalar_t x15) { return _mm256_set_epi16(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); } /* * Gather 16-bit integer elements with indexes idx[0], ..., idx[15] from the address p in vect_t. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]], p[idx[8]], p[idx[9]], p[idx[10]], p[idx[11]], p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]] uint16_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]], p[idx[8]], p[idx[9]], p[idx[10]], p[idx[11]], p[idx[12]], p[idx[13]], p[idx[14]], p[idx[15]]); } /* * Load 256-bits of unsigned integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11]p[12],p[13],p[14],p[15]] uint16_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_si256(reinterpret_cast(p)); } /* * Load 256-bits of unsigned integer data from memory into dst. * p does not need to be aligned on any particular boundary. 
* Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11]p[12],p[13],p[14],p[15]] uint16_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_si256(reinterpret_cast(p)); } /* * Store 256-bits of unsigned integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm256_store_si256(reinterpret_cast(p), v); } /* * Store 256-bits of unsigned integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm256_storeu_si256(reinterpret_cast(p), v); } /* * Store 256-bits of unsigned integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm256_stream_si256(reinterpret_cast(p), v); } /* * Shift packed 16-bit unsigned integers in a right by s while shifting in sign bits, and store the results in vect_t. 
* Args   : [a0, ..., a15] uint16_t
	 * Return : [Floor(a0/2^s), ..., Floor(a15/2^s)] uint16_t
	 */
	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srli_epi16(a, s); }

	/*
	 * Compare packed unsigned 16-bits in a and b for greater-than.
	 * Only a signed comparison intrinsic is used, so 2^15 is subtracted from both operands first;
	 * that maps the unsigned order onto the signed order.
	 * Return : [(a0>b0) ? 0xFFFF : 0, ..., (a15>b15) ? 0xFFFF : 0] uint16_t
	 */
	static INLINE CONST vect_t greater(vect_t a, vect_t b) {
		const vect_t x = set1(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1));
		a = sub(a, x);
		b = sub(b, x);
		return _mm256_cmpgt_epi16(a, b);
	}

	/*
	 * Compare packed unsigned 16-bits in a and b for lesser-than (same bias trick as greater()).
	 * Return : [(a0<b0) ? 0xFFFF : 0, ..., (a15<b15) ? 0xFFFF : 0] uint16_t
	 */
	static INLINE CONST vect_t lesser(vect_t a, vect_t b) {
		const vect_t x = set1(static_cast<scalar_t>(1) << (sizeof(scalar_t) * 8 - 1));
		a = sub(a, x);
		b = sub(b, x);
		return _mm256_cmpgt_epi16(b, a);
	}

	/*
	 * Return : [(a0>=b0) ? 0xFFFF : 0, ..., (a15>=b15) ? 0xFFFF : 0] uint16_t
	 */
	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }

	/*
	 * Return : [(a0<=b0) ? 0xFFFF : 0, ..., (a15<=b15) ? 0xFFFF : 0] uint16_t
	 */
	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }

	/*
	 * Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers,
	 * and store the high 16 bits of the intermediate integers in vect_t.
	 * Args   : [a0, ..., a15] uint16_t
	 *          [b0, ..., b15] uint16_t
	 * Return : [Floor(a0*b0/2^16), ..., Floor(a15*b15/2^16)] uint16_t
	 */
	static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { return _mm256_mulhi_epu16(a, b); }

	/*
	 * Multiply the low unsigned 8-bit integers from each packed 16-bit element in a and b,
	 * and store the unsigned 16-bit results in vect_t.
	 * Args   : [a0, ..., a15] uint16_t
	 *          [b0, ..., b15] uint16_t
	 * Return : [(a0 mod 2^8)*(b0 mod 2^8), ..., (a15 mod 2^8)*(b15 mod 2^8)] uint16_t
	 */
	static INLINE CONST vect_t mulx(vect_t a, vect_t b) {
		//#pragma warning "The simd mulx function is emulated, it may impact the performances."
vect_t a1, b1, mask1; mask1 = set1(0x00FF); a1 = vand(a,mask1); b1 = vand(b,mask1); return mul(a1,b1); } static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Horizontally add 16-bits elements of a. * Args : [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15] * Return : a0+a1+a2+a3+a4+a5+a6+a7+a8+a9+a10+a11+a12+a13+a14+a15 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter ca; ca.v = a; return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7] + ca.t[8] + ca.t[9] + ca.t[10] + ca.t[11] + ca.t[12] + ca.t[13] + ca.t[14] + ca.t[15]); } }; //Simd256_impl #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_int16_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd256_int32.inl000066400000000000000000000663451274716147400245340ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * Romain Lebreton * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_int32_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd256_int32_INL #ifndef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS #error "You need AVX2 instructions to perform 256bits operations on int32_t" #endif #include "fflas-ffpack/fflas/fflas_simd/simd256_int64.inl" /* * Simd256 specialized for int32_t */ template <> struct Simd256_impl : public Simd256i_base { /* * alias to 256 bit simd register */ using vect_t = __m256i; /* * alias to 256 bit simd register */ using half_t = __m128i; /* * define the scalar type corresponding to the specialization */ using scalar_t = int32_t; /* * Simd128 for scalar_t, to deal half_t */ using simdHalf = Simd128; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 8; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 32; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Converter from vect_t to a tab. 
* exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x,x,x,x,x] int32_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi32(x); } /* * Set packed 32-bit integers in dst with the supplied values. * Return [x0,x1,x2,x3,x4,x5,x6,x7] int32_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) { return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0); } /* * Gather 32-bit integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]] int32_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]); } /* * Load 256-bits of integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_si256(reinterpret_cast(p)); } /* * Load 256-bits of integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] int32_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_si256(reinterpret_cast(p)); } /* * Store 256-bits of integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. 
*/
	static INLINE void store(scalar_t *p, vect_t v) { _mm256_store_si256(reinterpret_cast<vect_t *>(p), v); }

	/*
	 * Store 256-bits of integer data from a into memory.
	 * p does not need to be aligned on any particular boundary.
	 */
	static INLINE void storeu(scalar_t *p, vect_t v) { _mm256_storeu_si256(reinterpret_cast<vect_t *>(p), v); }

	/*
	 * Store 256-bits of integer data from a into memory using a non-temporal memory hint.
	 * p must be aligned on a 32-byte boundary or a general-protection exception may be generated.
	 */
	static INLINE void stream(scalar_t *p, const vect_t v) { _mm256_stream_si256(reinterpret_cast<vect_t *>(p), v); }

	/*
	 * Shift packed 32-bit integers in a left by s while shifting in zeros, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 * Return : [a0 << s, a1 << s, a2 << s, a3 << s, a4 << s, a5 << s, a6 << s, a7 << s] int32_t
	 */
	static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi32(a, s); }

	/*
	 * Shift packed 32-bit integers in a right by s while shifting in zeros, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int32_t
	 */
	static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi32(a, s); }

	/*
	 * Shift packed 32-bit integers in a right by s while shifting in sign bits, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s, a4 >> s, a5 >> s, a6 >> s, a7 >> s] int32_t
	 */
	static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srai_epi32(a, s); }

	/*
	 * Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
* Args : [a0, ..., a7] int32_t * Return : [a[s[0..1]], ..., a[s[6..7]],a[4+s[0..1]], ..., a[4+s[6..7]]] int32_t */ template static INLINE CONST vect_t shuffle_twice(const vect_t a) { return _mm256_shuffle_epi32(a, s); } /* * Shuffle 32-bit integers in a using the control in imm8, and store the results in dst. * Args : [a0, ..., a7] int32_t * Return : [a[s[0..3]], ..., a[s[28..31]]] int32_t */ template static INLINE CONST vect_t shuffle(const vect_t a) { //#pragma warning "The simd shuffle function is emulated, it may impact the performances." Converter conv; conv.v = a; return set (conv.t[( s & 0x0000000F)], conv.t[( s & 0x000000F0)], conv.t[((s>> 8) & 0x0000000F)], conv.t[((s>> 8) & 0x000000F0)], conv.t[((s>>16) & 0x0000000F)], conv.t[((s>>16) & 0x000000F0)], conv.t[((s>>24) & 0x0000000F)], conv.t[((s>>24) & 0x000000F0)]); } /* * Unpack and interleave 32-bit integers from the low half of a and b within 128-bit lanes, and store the results in dst. * Args : [a0, ..., a7] int32_t [b0, ..., b7] int32_t * Return : [a0, b0, a1, b1, a4, b4, a5, b5] int32_t */ static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_epi32(a, b); } /* * Unpack and interleave 32-bit integers from the high half of a and b within 128-bit lanes, and store the results in dst. * Args : [a0, ..., a7] int32_t [b0, ..., b7] int32_t * Return : [a2, b2, a3, b3, a6, b6, a7, b7] int32_t */ static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_epi32(a, b); } /* * Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst. 
* Args : [a0, ..., a7] int32_t [b0, ..., b7] int32_t * Return : [a0, b0, ..., a3, b3] int32_t */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { using Simd256_64 = Simd256; vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3] uint64 vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4 return unpacklo_twice(a1, b1); } /* * Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst. * Args : [a0, ..., a7] int32_t [b0, ..., b7] int32_t * Return : [a4, b4, ..., a7, b7] int32_t */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { using Simd256_64 = Simd256; vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4 return unpackhi_twice(a1, b1); } /* * Unpack and interleave 32-bit integers from the low then high half of a and b, and store the results in dst. * Args : [a0, ..., a7] int32_t [b0, ..., b7] int32_t * Return : [a0, b0, ..., a3, b3] int32_t * [a4, b4, ..., a7, b7] int32_t */ static INLINE CONST void unpacklohi(vect_t& s1, vect_t& s2, const vect_t a, const vect_t b) { using Simd256_64 = Simd256; vect_t a1 = Simd256_64::template shuffle<0xD8>(a); // 0xD8 = 3120 base_4 vect_t b1 = Simd256_64::template shuffle<0xD8>(b); // 0xD8 = 3120 base_4 s1 = unpacklo_twice(a1, b1); s2 = unpackhi_twice(a1, b1); } /* * Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst. * Args : [a0, ..., a7] int32_t [b0, ..., b7] int32_t * Return : [s[0]?a0:b0, , s[7]?a7:b7] int32_t */ template static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { return _mm256_blend_epi32(a, b, s); } /* * Add packed 32-bits integer in a and b, and store the results in vect_t. 
* Args : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t * [b0, b1, b2, b3, b4, b5, b6, b7] int32_t * Return : [a0+b0, a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7] int32_t */ static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi32(a, b); } static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); } /* * Subtract packed 32-bits integers in b from packed 32-bits integers in a, and store the results in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t * [b0, b1, b2, b3, b4, b5, b6, b7] int32_t * Return : [a0-b0, a1-b1, a2-b2, a3-b3, a4-b4, a5-b5, a6-b6, a7-b7] int32_t */ static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi32(a, b); } static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); } /* * Multiply the packed 32-bits integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t * [b0, b1, b2, b3, b4, b5, b6, b7] int32_t * Return : [a0*b0 smod 2^32, ..., a7*b7 smod 2^32] int32_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mullo(const vect_t a, const vect_t b) { return _mm256_mullo_epi32(a, b); } static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); } /* * Multiply the packed 32-bits integers in a and b, producing intermediate 64-bit integers, and store the high 32 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t * [b0, b1, b2, b3, b4, b5, b6, b7] int32_t * Return : [Floor(a0*b0/2^32), ..., Floor(a7*b7/2^32)] int32_t */ static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { //#pragma warning "The simd mulhi function is emulated, it may impact the performances." 
#if 0 typedef Simd256_impl Simd256_64; Converter ca, cb; ca.v = a; cb.v = b; vect_t a1, a2, b1, b2, c1, c2; a1 = set(ca.t[0], 0, ca.t[1], 0, ca.t[2], 0, ca.t[3], 0); a2 = set(ca.t[4], 0, ca.t[5], 0, ca.t[6], 0, ca.t[7], 0); b1 = set(cb.t[0], 0, cb.t[1], 0, cb.t[2], 0, cb.t[3], 0); b2 = set(cb.t[4], 0, cb.t[5], 0, cb.t[6], 0, cb.t[7], 0); c1 = Simd256_64::mulx(a1, b1); c2 = Simd256_64::mulx(a2, b2); ca.v = c1; cb.v = c2; return set(ca.t[1], ca.t[3], ca.t[5], ca.t[7], cb.t[1], cb.t[3], cb.t[5], cb.t[7]); #else typedef Simd256_impl Simd256_64; vect_t C,A1,B1; C = Simd256_64::mulx(a,b); A1 = Simd256_64::srl(a,32); B1 = Simd256_64::srl(b,32); A1 = Simd256_64::mulx(A1,B1); C = Simd256_64::srl(C,32); A1 = Simd256_64::srl(A1,32); A1 = Simd256_64::sll(A1,32); return Simd256_64::vor(C,A1); #endif } /* * Multiply the low 16-bit integers from each packed 32-bit element in a and b, and store the signed 32-bit results in dst. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t * [b0, b1, b2, b3, b4, b5, b6, b7] int32_t * Return : [(a0 smod 2^16)*(b0 smod 2^16), ..., (a7 smod 2^16)*(b7 smod 2^16)] int32_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mulx(vect_t a, vect_t b) { //#pragma warning "The simd mulx function is emulated, it may impact the performances." vect_t a1, b1, mask1, mask2; mask1 = set1(0x0000FFFF); mask2 = set1(0x00008000); a1 = add(a,mask2); a1 = vand(a1,mask1); a1 = sub(a1,mask2); b1 = add(b,mask2); b1 = vand(b1,mask1); b1 = sub(b1,mask2); return mul(a1,b1); } /* * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, * keep the low 32 bits of the intermediate and add the low 32-bits of c. 
* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 *          [c0, c1, c2, c3, c4, c5, c6, c7] int32_t
	 * Return : [(a0*b0+c0) smod 2^32, ..., (a7*b7+c7) smod 2^32] int32_t
	 */
	static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mul(a, b);
		return add(c, product);
	}

	/*
	 * In-place variant of fmadd: c receives fmadd(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) {
		c = fmadd(c, a, b);
		return c;
	}

	/*
	 * Multiply the low 16-bit integers from each packed 32-bit element in a and b,
	 * keep the signed 32-bit results and add the low 32-bits of c.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 *          [c0, c1, c2, c3, c4, c5, c6, c7] int32_t
	 * Return : [((a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ...,
	 *           ((a7 smod 2^16)*(b7 smod 2^16)+c7) smod 2^32] int32_t
	 */
	static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mulx(a, b);
		return add(c, product);
	}

	/*
	 * In-place variant of fmaddx: c receives fmaddx(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) {
		c = fmaddx(c, a, b);
		return c;
	}

	/*
	 * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
	 * and subtract the low 32 bits of the intermediate from elements of c.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 *          [c0, c1, c2, c3, c4, c5, c6, c7] int32_t
	 * Return : [(-a0*b0+c0) smod 2^32, ..., (-a7*b7+c7) smod 2^32] int32_t
	 */
	static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mul(a, b);
		return sub(c, product);
	}

	/*
	 * In-place variant of fnmadd: c receives fnmadd(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) {
		c = fnmadd(c, a, b);
		return c;
	}

	/*
	 * Multiply the low 16-bit integers from each packed 32-bit element in a and b,
	 * keep the signed 32-bit results and subtract them from elements of c.
* Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 *          [c0, c1, c2, c3, c4, c5, c6, c7] int32_t
	 * Return : [(-(a0 smod 2^16)*(b0 smod 2^16)+c0) smod 2^32, ...,
	 *           (-(a7 smod 2^16)*(b7 smod 2^16)+c7) smod 2^32] int32_t
	 */
	static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mulx(a, b);
		return sub(c, product);
	}

	/*
	 * In-place variant of fnmaddx: c receives fnmaddx(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) {
		c = fnmaddx(c, a, b);
		return c;
	}

	/*
	 * Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
	 * and subtract elements of c from the low 32-bits of the intermediate.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 *          [c0, c1, c2, c3, c4, c5, c6, c7] int32_t
	 * Return : [(a0*b0-c0) smod 2^32, ..., (a7*b7-c7) smod 2^32] int32_t
	 */
	static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mul(a, b);
		return sub(product, c);
	}

	/*
	 * In-place variant of fmsub: c receives fmsub(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) {
		c = fmsub(c, a, b);
		return c;
	}

	/*
	 * Multiply the low 16-bit integers from each packed 32-bit element in a and b,
	 * keep the signed 32-bit results and subtract elements of c from them.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 *          [c0, c1, c2, c3, c4, c5, c6, c7] int32_t
	 * Return : [((a0 smod 2^16)*(b0 smod 2^16)-c0) smod 2^32, ...,
	 *           ((a7 smod 2^16)*(b7 smod 2^16)-c7) smod 2^32] int32_t
	 */
	static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) {
		const vect_t product = mulx(a, b);
		return sub(product, c);
	}

	/*
	 * In-place variant of fmsubx: c receives fmsubx(c, a, b) and the new value is returned.
	 */
	static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) {
		c = fmsubx(c, a, b);
		return c;
	}

	/*
	 * Compare packed 32-bits in a and b for equality, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 * Return : [(a0==b0) ? 0xFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFF : 0,
	 *           (a2==b2) ? 0xFFFFFFFF : 0, (a3==b3) ?
0xFFFFFFFF : 0,
	 *           (a4==b4) ? 0xFFFFFFFF : 0, (a5==b5) ? 0xFFFFFFFF : 0,
	 *           (a6==b6) ? 0xFFFFFFFF : 0, (a7==b7) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi32(a, b); }

	/*
	 * Compare packed 32-bits in a and b for greater-than, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 * Return : [(a0>b0) ? 0xFFFFFFFF : 0, ..., (a7>b7) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi32(a, b); }

	/*
	 * Compare packed 32-bits in a and b for lesser-than, and store the results in vect_t.
	 * Implemented as the greater-than comparison with the operands swapped.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 * Return : [(a0<b0) ? 0xFFFFFFFF : 0, ..., (a7<b7) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t lesser(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi32(b, a); }

	/*
	 * Compare packed 32-bits in a and b for greater or equal than, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 * Return : [(a0>=b0) ? 0xFFFFFFFF : 0, ..., (a7>=b7) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); }

	/*
	 * Compare packed 32-bits in a and b for lesser or equal than, and store the results in vect_t.
	 * Args   : [a0, a1, a2, a3, a4, a5, a6, a7] int32_t
	 *          [b0, b1, b2, b3, b4, b5, b6, b7] int32_t
	 * Return : [(a0<=b0) ? 0xFFFFFFFF : 0, ..., (a7<=b7) ? 0xFFFFFFFF : 0] int32_t
	 */
	static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); }

	/*
	 * Horizontally add 32-bits elements of a.
* Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : a0+a1+a2+a3+a4+a5+a6+a7 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter ca; ca.v = a; return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7]); } static INLINE CONST vect_t round(const vect_t a) { return a; } static INLINE CONST vect_t signbits(const vect_t x) { vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1)); return signBits; } static INLINE vect_t mod(vect_t &C, const vect_t &P, const vect_t &INVP, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { #ifdef __INTEL_COMPILER C = _mm256_rem_epi32(C, P); #else FFLASFFPACK_abort("pas implementé"); // C = fnmadd(C,_mm256_castps_si128(_mm256_floor_ps(_mm256_mul_ps(INVP,_mm256_castsi128_ps(C)))),P); #endif NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } }; /* * Simd256 specialized for uint32_t */ template <> struct Simd256_impl : public Simd256_impl { /* * define the scalar type corresponding to the specialization */ using scalar_t = uint32_t; /* * Simd128 for scalar_t, to deal half_t */ using simdHalf = Simd128; /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 32-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x,x,x,x,x] uint32_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi32(x); } /* * Set packed 32-bit unsigned integers in dst with the supplied values. * Return [x0,x1,x2,x3,x4,x5,x6,x7] uint32_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3, const scalar_t x4, const scalar_t x5, const scalar_t x6, const scalar_t x7) { return _mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0); } /* * Gather 32-bit unsigned integer elements with indexes idx[0], ..., idx[7] from the address p in vect_t. 
* Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]] uint32_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]], p[idx[4]], p[idx[5]], p[idx[6]], p[idx[7]]); } /* * Load 256-bits of unsigned integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] uint32_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_si256(reinterpret_cast(p)); } /* * Load 256-bits of unsigned integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]] uint32_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_si256(reinterpret_cast(p)); } /* * Store 256-bits of unsigned integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm256_store_si256(reinterpret_cast(p), v); } /* * Store 256-bits of unsigned integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm256_storeu_si256(reinterpret_cast(p), v); } /* * Store 256-bits of unsigned integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm256_stream_si256(reinterpret_cast(p), v); } /* * Shift packed 32-bit unsigned integers in a right by s while shifting in sign bits, and store the results in vect_t. 
* Args : [a0, ..., a7] int32_t * Return : [Floor(a0/2^s), ..., Floor(a7/2^s)] int32_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srli_epi32(a, s); } static INLINE CONST vect_t greater(vect_t a, vect_t b) { vect_t x; x = set1((static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(a,x); b = sub(b,x); return _mm256_cmpgt_epi32(a, b); } static INLINE CONST vect_t lesser(vect_t a, vect_t b) { vect_t x; x = set1((static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(a,x); b = sub(b,x); return _mm256_cmpgt_epi32(b, a); } static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); } static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Multiply the packed unsigned 32-bit integers in a and b, producing intermediate 64-bit integers, * and store the high 32 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] uint32_t * [b0, b1, b2, b3, b4, b5, b6, b7] uint32_t * Return : [Floor(a0*b0/2^32), ..., Floor(a7*b7/2^32)] uint32_t */ static INLINE CONST vect_t mulhi(const vect_t a, const vect_t b) { //#pragma warning "The simd mulhi function is emulated, it may impact the performances." typedef Simd256_impl Simd256_64; vect_t C,A1,B1; C = Simd256_64::mulx(a,b); A1 = Simd256_64::srl(a,32); B1 = Simd256_64::srl(b,32); A1 = Simd256_64::mulx(A1,B1); C = Simd256_64::srl(C,32); A1 = Simd256_64::srl(A1,32); A1 = Simd256_64::sll(A1,32); return Simd256_64::vor(C,A1); } /* * Multiply the low unsigned 16-bit integers from each packed 32-bit element in a and b, * and store the signed 32-bit results in vect_t. 
* Args : [a0, a1, a2, a3, a4, a5, a6, a7] uint32_t * [b0, b1, b2, b3, b4, b5, b6, b7] uint32_t * Return : [(a0 mod 2^16)*(b0 mod 2^16), ..., (a7 mod 2^16)*(b7 mod 2^16)] uint32_t */ static INLINE CONST vect_t mulx(vect_t a, vect_t b) { //#pragma warning "The simd mulx function is emulated, it may impact the performances." vect_t a1, b1, mask1; mask1 = set1(0x0000FFFF); a1 = vand(a,mask1); b1 = vand(b,mask1); return mul(a1,b1); } static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Horizontally add 32-bits elements of a. * Args : [a0, a1, a2, a3, a4, a5, a6, a7] * Return : a0+a1+a2+a3+a4+a5+a6+a7 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter ca; ca.v = a; return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3] + ca.t[4] + ca.t[5] + ca.t[6] + ca.t[7]); } }; //Simd256_impl #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_int32_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd256_int64.inl000066400000000000000000000656201274716147400245340ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * Romain Lebreton * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_ffpack_utils_simd256_int64_INL #define __FFLASFFPACK_fflas_ffpack_utils_simd256_int64_INL #ifndef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS #error "You need AVX2 instructions to perform 256bits operations on int64_t" #endif /* * Simd256 specialized for int64_t */ template <> struct Simd256_impl : public Simd256i_base { /* * alias to 256 bit simd register */ using vect_t = __m256i; /* * alias to 256 bit simd register */ using half_t = __m128i; /* * define the scalar type corresponding to the specialization */ using scalar_t = int64_t; /* * Simd128 for scalar_t, to deal half_t */ using simdHalf = Simd128; /* * number of scalar_t in a simd register */ static const constexpr size_t vect_size = 4; /* * alignement required by scalar_t pointer to be loaded in a vect_t */ static const constexpr size_t alignment = 32; /* * Check if the pointer p is a multiple of alignemnt */ template static constexpr bool valid(T *p) { return (int64_t)p % alignment == 0; } /* * Check if the number n is a multiple of vect_size */ template static constexpr bool compliant(T n) { return n % vect_size == 0; } /* * Converter from vect_t to a tab. 
* exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 64-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x] int64_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi64x(x); } /* * Set packed 64-bit integers in dst with the supplied values. * Return [x0,x1,x2,x3] int64_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) { return _mm256_set_epi64x(x3, x2, x1, x0); } /* * Gather 64-bit integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] int64_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]); } /* * Load 256-bits of integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3]] int32_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_si256(reinterpret_cast(p)); } /* * Load 256-bits of integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1],p[2],p[3]] int64_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_si256(reinterpret_cast(p)); } /* * Store 256-bits of integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm256_store_si256(reinterpret_cast(p), v); } /* * Store 256-bits of integer data from a into memory. * p does not need to be aligned on any particular boundary. 
*/ static INLINE void storeu(scalar_t *p, vect_t v) { _mm256_storeu_si256(reinterpret_cast(p), v); } /* * Store 256-bits of integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. */ static INLINE void stream(scalar_t *p, const vect_t v) { _mm256_stream_si256(reinterpret_cast(p), v); } /* * Shift packed 64-bit integers in a left by s while shifting in zeros, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * Return : [a0 << s, a1 << s, a2 << s, a3 << s] int64_t */ static INLINE CONST vect_t sll(const vect_t a, const int s) { return _mm256_slli_epi64(a, s); } /* * Shift packed 64-bit integers in a right by s while shifting in zeros, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int64_t */ static INLINE CONST vect_t srl(const vect_t a, const int s) { return _mm256_srli_epi64(a, s); } /* * Shift packed 64-bit integers in a right by s while shifting in sign bits, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * Return : [a0 >> s, a1 >> s, a2 >> s, a3 >> s] int64_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { #ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS return _mm256_srai_epi64(a, s); #else const int b = 63 - s; vect_t m = sll(set1(1), b); vect_t x = srl(a, s); vect_t result = sub(vxor(x, m), m); // result = x^m - m return result; #endif } /* * Shuffle 64-bit integers in a using the control in imm8, and store the results in dst. * Args : [a0, ..., a3] int32_t * Return : [a[s[0..1]], ..., a[s[6..7]]] int32_t */ template static INLINE CONST vect_t shuffle(const vect_t a) { return _mm256_permute4x64_epi64(a, s); } /* * Unpack and interleave 64-bit integers from the low half of a and b within 128-bit lanes, and store the results in dst. 
* Args : [a0, a1, a2, a3] int64_t [b0, b1, b2, b3] int64_t * Return : [a0, b0, a2, b2] int64_t */ static INLINE CONST vect_t unpacklo_twice(const vect_t a, const vect_t b) { return _mm256_unpacklo_epi64(a, b); } /* * Unpack and interleave 64-bit integers from the high half of a and b within 128-bit lanes, and store the results in dst. * Args : [a0, a1, a2, a3] int64_t [b0, b1, b2, b3] int64_t * Return : [a1, b1, a3, b3] int64_t */ static INLINE CONST vect_t unpackhi_twice(const vect_t a, const vect_t b) { return _mm256_unpackhi_epi64(a, b); } /* * Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst. * Args : [a0, a1, a2, a3] int64_t [b0, b1, b2, b3] int64_t * Return : [a0, b0, a1, b1] int64_t */ static INLINE CONST vect_t unpacklo(const vect_t a, const vect_t b) { vect_t a1 = shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3] vect_t b1 = shuffle<0xD8>(b); // 0xD8 = 3120 base_4 return unpacklo_twice(a1, b1); } /* * Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst. * Args : [a0, a1, a2, a3] int64_t [b0, b1, b2, b3] int64_t * Return : [a2, b2, a3, b3] int64_t */ static INLINE CONST vect_t unpackhi(const vect_t a, const vect_t b) { vect_t a1 = shuffle<0xD8>(a); // 0xD8 = 3120 base_4 vect_t b1 = shuffle<0xD8>(b); // 0xD8 = 3120 base_4 return unpackhi_twice(a1, b1); } /* * Unpack and interleave 64-bit integers from the low then high half of a and b, and store the results in dst. 
* Args : [a0, a1, a2, a3] int64_t [b0, b1, b2, b3] int64_t * Return : [a0, b0, a1, b1] int64_t * [a2, b2, a3, b3] int64_t */ static INLINE CONST void unpacklohi(vect_t& l, vect_t& h, const vect_t a, const vect_t b) { vect_t a1 = shuffle<0xD8>(a); // 0xD8 = 3120 base_4 so a -> [a0,a2,a1,a3] vect_t b1 = shuffle<0xD8>(b); // 0xD8 = 3120 base_4 l = unpacklo_twice(a1, b1); h = unpackhi_twice(a1, b1); } /* * Blend packed 64-bit integers from a and b using control mask imm8, and store the results in dst. * Args : [a0, a1, a2, a3] int64_t [b0, b1, b2, b3] int64_t * Return : [s[0]?a0:b0, , s[3]?a3:b3] int64_t */ template static INLINE CONST vect_t blend(const vect_t a, const vect_t b) { // _mm_blend_epi16 is faster than _mm_blend_epi32 and require SSE4.1 instead of AVX2 // We have to transform s = [d3 d2 d1 d0]_base2 to s1 = [d3 d3 d2 d2 d1 d1 d0 d0]_base2 constexpr uint8_t s1 = (s & 0x1) * 3 + (((s & 0x2) << 1)*3) + (((s & 0x4) << 2)*3) + (((s & 0x8) << 3)*3); return _mm256_blend_epi32(a, b, s1); } /* * Add packed 64-bits integer in a and b, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [a0+b0, a1+b1, a2+b2, a3+b3] int64_t */ static INLINE CONST vect_t add(const vect_t a, const vect_t b) { return _mm256_add_epi64(a, b); } static INLINE vect_t addin(vect_t &a, const vect_t b) { return a = add(a, b); } /* * Subtract packed 64-bits integers in b from packed 64-bits integers in a, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [a0-b0, a1-b1, a2-b2, a3-b3] int64_t */ static INLINE CONST vect_t sub(const vect_t a, const vect_t b) { return _mm256_sub_epi64(a, b); } static INLINE vect_t subin(vect_t &a, const vect_t b) { return a = sub(a, b); } /* * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in vect_t. 
* Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [a0*b0 smod 2^32, ..., a3*b3 smod 2^32] int64_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mullo(vect_t a, vect_t b) { #ifdef __FFLASFFPACK_HAVE_AVX512F_INSTRUCTIONS return _mm256_mullo_epi64(a, b); #else //#pragma warning "The simd mullo function is emulate, it may impact the performances." Converter ca, cb; ca.v = a; cb.v = b; return set(ca.t[0] * cb.t[0], ca.t[1] * cb.t[1], ca.t[2] * cb.t[2], ca.t[3] * cb.t[3]); #endif } static INLINE CONST vect_t mul(const vect_t a, const vect_t b) { return mullo(a, b); } /* * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the high 64 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [Floor(a0*b0/2^64), ..., Floor(a3*b3/2^64)] int64_t */ #ifdef __FFLASFFPACK_HAVE_INT128 static INLINE CONST vect_t mulhi(vect_t a, vect_t b) { //#pragma warning "The simd mulhi function is emulate, it may impact the performances." // ugly solution, but it works. // tested with gcc, clang, icc Converter ca, cb; ca.v = a; cb.v = b; return set((scalar_t)((int128_t(ca.t[0]) * cb.t[0]) >> 64), (scalar_t)((int128_t(ca.t[1]) * cb.t[1]) >> 64), (scalar_t)((int128_t(ca.t[2]) * cb.t[2]) >> 64), (scalar_t)((int128_t(ca.t[3]) * cb.t[3]) >> 64)); } #endif /* * Multiply the low 32-bits integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. 
* Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [(a0 smod 2^32)*(b0 smod 2^32), ..., (a3 smod 2^32)*(b3 smod 2^32)] int64_t * where (a smod p) is the signed representant of a modulo p, that is -p/2 <= (a smod p) < p/2 */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm256_mul_epi32(a, b); } /* * Multiply packed 64-bit integers in a and b, producing intermediate 128-bit integers, and add the low 64-bits of the intermediate with c, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * [c0, c1, c2, c3] int64_t * Return : [(a0*b0+c0) smod 2^64, ..., (a3*b3+c3) smod 2^64] int64_t */ static INLINE CONST vect_t fmadd(const vect_t c, const vect_t a, const vect_t b) { return add(c, mul(a, b)); } static INLINE vect_t fmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fmadd(c, a, b); } /* * Multiply the low 32-bit integers from each packed 64-bit element in a and b, * keep the signed 64-bit results and add the low 64-bits of c. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * [c0, c1, c2, c3] int64_t * Return : [((a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64, ..., * ((a3 smod 2^32)*(b3 smod 2^32)+c3) smod 2^64] int64_t */ static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } /* * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, * and substract the low 64 bits of the intermediate from elements of c. 
* Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * [c0, c1, c2, c3] int64_t * Return : [(-a0*b0+c0) smod 2^64, ..., (-a3*b3+c3) smod 2^64] int64_t */ static INLINE CONST vect_t fnmadd(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mul(a, b)); } static INLINE vect_t fnmaddin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmadd(c, a, b); } /* * Multiply the low 32-bit integers from each packed 64-bit element in a and b, * keep the signed 64-bit results and substract them from elements of c. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * [c0, c1, c2, c3] int64_t * Return : [(-(a0 smod 2^32)*(b0 smod 2^32)+c0) smod 2^64, ..., * (-(a3 smod 2^32)*(b3 smod 2^32)+c3) smod 2^64] int64_t */ static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } /* * Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, * and substract elements of c to the low 64-bits of the intermediate. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * [c0, c1, c2, c3] int64_t * Return : [(a0*b0-c0) smod 2^64, ..., (a3*b3-c3) smod 2^64] int64_t */ static INLINE CONST vect_t fmsub(const vect_t c, const vect_t a, const vect_t b) { return sub(mul(a, b), c); } static INLINE vect_t fmsubin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsub(c, a, b); } /* * Multiply the low 32-bit integers from each packed 64-bit element in a and b, * keep the signed 64-bit results and substract elements of c from them. 
* Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * [c0, c1, c2, c3] int64_t * Return : [((a0 smod 2^32)*(b0 smod 2^32)-c0) smod 2^64, ..., * ((a3 smod 2^32)*(b3 smod 2^32)-c3) smod 2^64] int64_t */ static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Compare packed 64-bits in a and b for equality, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [(a0==b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1==b1) ? 0xFFFFFFFFFFFFFFFF : 0, (a2==b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3==b3) ? 0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t eq(const vect_t a, const vect_t b) { return _mm256_cmpeq_epi64(a, b); } /* * Compare packed 64-bits in a and b for greater-than, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [(a0>b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>b1) ? 0xFFFFFFFFFFFFFFFF : 0, (a2>b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3>b3) ? 0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t greater(const vect_t a, const vect_t b) { return _mm256_cmpgt_epi64(a, b); } /* * Compare packed 64-bits in a and b for lesser-than, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [(a0=b0) ? 0xFFFFFFFFFFFFFFFF : 0, (a1>=b1) ? 0xFFFFFFFFFFFFFFFF : 0, (a2>=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3>=b3) ? 0xFFFFFFFFFFFFFFFF : 0, (a4>=b4) ? 0xFFFFFFFFFFFFFFFF : 0, (a5>=b5) ? 0xFFFFFFFFFFFFFFFF : 0, (a6>=b6) ? 0xFFFFFFFFFFFFFFFF : 0, (a7>=b7) ? 0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); } /* * Compare packed 64-bits in a and b for lesser or equal than, and store the results in vect_t. * Args : [a0, a1, a2, a3] int64_t * [b0, b1, b2, b3] int64_t * Return : [(a0<=b0) ? 
0xFFFFFFFFFFFFFFFF : 0, (a1<=b1) ? 0xFFFFFFFFFFFFFFFF : 0, (a2<=b2) ? 0xFFFFFFFFFFFFFFFF : 0, (a3<=b3) ? 0xFFFFFFFFFFFFFFFF : 0] int64_t */ static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Horizontally add 64-bits elements of a. * Args : [a0, a1, a2, a3] int64_t * Return : a0+a1+a2+a3 int64_t */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter ca; ca.v = a; return scalar_t(ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3]); } static INLINE CONST vect_t round(const vect_t a) { return a; } static INLINE CONST vect_t signbits(const vect_t x) { vect_t signBits = sub(zero(), srl(x, 4*sizeof(scalar_t)-1)); return signBits; } // mask the high 32 bits of a 64 bits, that is 00000000FFFFFFFF static INLINE CONST vect_t mask_high() { return srl(_mm256_set1_epi8(-1), 32); } static INLINE CONST vect_t mulhi_fast(vect_t x, vect_t y); template static INLINE vect_t mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T); }; // Simd256_impl /* * Simd256 specialized for uint64_t */ template <> struct Simd256_impl : public Simd256_impl { /* * define the scalar type corresponding to the specialization */ using scalar_t = uint64_t; /* * Simd128 for scalar_t, to deal half_t */ using simdHalf = Simd128; /* * Converter from vect_t to a tab. * exple: * Converter conv; * conv.v = a; * scalart_t x = conv.t[1] */ union Converter { vect_t v; scalar_t t[vect_size]; }; /* * Broadcast 64-bit unsigned integer a to all elements of dst. This intrinsic may generate the vpbroadcastw. * Return [x,x,x,x] uint64_t */ static INLINE CONST vect_t set1(const scalar_t x) { return _mm256_set1_epi64x(x); } /* * Set packed 64-bit unsigned integers in dst with the supplied values. 
* Return [x0,x1,x2,x3] uint64_t */ static INLINE CONST vect_t set(const scalar_t x0, const scalar_t x1, const scalar_t x2, const scalar_t x3) { return _mm256_set_epi64x(x3, x2, x1, x0); } /* * Gather 64-bit unsigned integer elements with indexes idx[0], ..., idx[3] from the address p in vect_t. * Return [p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]] uint64_t */ template static INLINE PURE vect_t gather(const scalar_t *const p, const T *const idx) { return set(p[idx[0]], p[idx[1]], p[idx[2]], p[idx[3]]); } /* * Load 256-bits of unsigned integer data from memory into dst. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. * Return [p[0],p[1],p[2],p[3]] uint64_t */ static INLINE PURE vect_t load(const scalar_t *const p) { return _mm256_load_si256(reinterpret_cast(p)); } /* * Load 256-bits of unsigned integer data from memory into dst. * p does not need to be aligned on any particular boundary. * Return [p[0],p[1],p[2],p[3]] uint64_t */ static INLINE PURE vect_t loadu(const scalar_t *const p) { return _mm256_loadu_si256(reinterpret_cast(p)); } /* * Store 256-bits of unsigned integer data from a into memory. * p must be aligned on a 32-byte boundary or a general-protection exception will be generated. */ static INLINE void store(scalar_t *p, vect_t v) { _mm256_store_si256(reinterpret_cast(p), v); } /* * Store 256-bits of unsigned integer data from a into memory. * p does not need to be aligned on any particular boundary. */ static INLINE void storeu(scalar_t *p, vect_t v) { _mm256_storeu_si256(reinterpret_cast(p), v); } /* * Store 256-bits of unsigned integer data from a into memory using a non-temporal memory hint. * p must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
*/ static INLINE void stream(scalar_t *p, const vect_t v) { _mm256_stream_si256(reinterpret_cast(p), v); } /* * Shift packed 64-bit unsigned integers in a right by s while shifting in sign bits, and store the results in vect_t. * Args : [a0, ..., a3] uint64_t * Return : [Floor(a0/2^s), ..., Floor(a3/2^s)] uint64_t */ static INLINE CONST vect_t sra(const vect_t a, const int s) { return _mm256_srli_epi64(a, s); } static INLINE CONST vect_t greater(vect_t a, vect_t b) { vect_t x; x = set1(-(static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(x, a); b = sub(x, b); return _mm256_cmpgt_epi64(b,a); } static INLINE CONST vect_t lesser(vect_t a, vect_t b) { vect_t x; x = set1(-(static_cast(1) << (sizeof(scalar_t) * 8 - 1))); a = sub(x, a); b = sub(x, b); return _mm256_cmpgt_epi64(a, b); } static INLINE CONST vect_t greater_eq(const vect_t a, const vect_t b) { return vor(greater(a, b), eq(a, b)); } static INLINE CONST vect_t lesser_eq(const vect_t a, const vect_t b) { return vor(lesser(a, b), eq(a, b)); } /* * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in vect_t. * Args : [a0, a1, a2, a3] uint64_t [b0, b1, b2, b3] uint64_t * Return : [a0*b0 mod 2^64, a1*b1 mod 2^64, a2*b2 mod 2^64, a3*b3 mod 2^64] uint64_t */ static INLINE CONST vect_t mullo(vect_t a, vect_t b) { //#pragma warning "The simd mullo function is emulate, it may impact the performances." Converter ca, cb; ca.v = a; cb.v = b; return set(ca.t[0] * cb.t[0], ca.t[1] * cb.t[1], ca.t[2] * cb.t[2], ca.t[3] * cb.t[3]); } /* * Multiply the packed 64-bits integers in a and b, producing intermediate 128-bit integers, and store the high 64 bits of the intermediate integers in vect_t. 
* Args : [a0, a1, a2, a3] uint64_t [b0, b1, b2, b3] uint64_t * Return : */ #ifdef __FFLASFFPACK_HAVE_INT128 static INLINE CONST vect_t mulhi(vect_t a, vect_t b) { //#pragma warning "The simd mulhi function is emulate, it may impact the performances." // ugly solution, but it works. // tested with gcc, clang, icc Converter c0, c1; c0.v = a; c1.v = b; return set((scalar_t)(((uint128_t)(c0.t[0]) * c1.t[0]) >> 64), (scalar_t)(((uint128_t)(c0.t[1]) * c1.t[1]) >> 64), (scalar_t)(((uint128_t)(c0.t[2]) * c1.t[2]) >> 64), (scalar_t)(((uint128_t)(c0.t[3]) * c1.t[3]) >> 64)); } #endif /* * Multiply the low 32-bits integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst. * Args : [a0, a1, a2, a3] uint64_t [b0, b1, b2, b3] uint64_t * Return : [a0*b0, a1*b1, a2*b2, a3*b3] uint64_t */ static INLINE CONST vect_t mulx(const vect_t a, const vect_t b) { return _mm256_mul_epu32(a, b); } static INLINE CONST vect_t fmaddx(const vect_t c, const vect_t a, const vect_t b) { return add(c, mulx(a, b)); } static INLINE vect_t fmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmaddx(c, a, b); } static INLINE CONST vect_t fnmaddx(const vect_t c, const vect_t a, const vect_t b) { return sub(c, mulx(a, b)); } static INLINE vect_t fnmaddxin(vect_t &c, const vect_t a, const vect_t b) { return c = fnmaddx(c, a, b); } static INLINE CONST vect_t fmsubx(const vect_t c, const vect_t a, const vect_t b) { return sub(mulx(a, b), c); } static INLINE vect_t fmsubxin(vect_t &c, const vect_t a, const vect_t b) { return c = fmsubx(c, a, b); } /* * Horizontally add 64-bits elements of a. * Args : [a0, a1, a2, a3] * Return : a0+a1+a2+a3 */ static INLINE CONST scalar_t hadd_to_scal(const vect_t a) { Converter ca; ca.v = a; return ca.t[0] + ca.t[1] + ca.t[2] + ca.t[3]; } }; // Simd256_impl #define vect_t Simd256_impl::vect_t // warning : may be off by 1 multiple, but we save a mul... 
INLINE CONST vect_t Simd256_impl::mulhi_fast(vect_t x, vect_t y) { // unsigned mulhi starts: // x1 = xy_high = mulhiu_fast(x,y) const vect_t mask = mask_high(); vect_t x0 = vand(x, mask), x1 = srl(x, 32); vect_t y0 = vand(y, mask), y1 = srl(y, 32); x0 = Simd256_impl::mulx(x0, y1); // x0y1 y0 = Simd256_impl::mulx(x1, y0); // x1y0 y1 = Simd256_impl::mulx(x1, y1); // x1y1 x1 = vand(y0, mask); y0 = srl(y0, 32); // x1y0_lo = x1 // y1yo_hi = y0 x1 = srl(add(x1, x0), 32); y0 = add(y1, y0); x1 = add(x1, y0); // unsigned mulhi ends // fixing signs x0 = vand(signbits(x), y); x1 = sub(x1, x0); x0 = vand(signbits(y), x); x1 = sub(x1, x0); // end fixing return x1; } template INLINE vect_t Simd256_impl::mod(vect_t &C, const vect_t &P, const int8_t &shifter, const vect_t &magic, const vect_t &NEGP, const vect_t &MIN, const vect_t &MAX, vect_t &Q, vect_t &T) { #ifdef __INTEL_COMPILER // Works fine with ICC 15.0.1 - A.B. C = _mm256_rem_epi64(C, P); #else if (poweroftwo) { Q = srl(C, 63); vect_t un = set1(1); T = sub(sll(un, shifter), un); Q = add(C, vand(Q, T)); Q = sll(srl(Q, shifter), shifter); C = sub(C, Q); Q = vand(greater(zero(), Q), P); C = add(C, Q); } else { Q = mulhi_fast(C, magic); if (overflow) { Q = add(Q, C); } Q = sra(Q, shifter); vect_t q1 = Simd256_impl::mulx(Q, P); vect_t q2 = sll(Simd256_impl::mulx(srl(Q, 32), P), 32); C = sub(C, add(q1, q2)); T = greater_eq(C, P); C = sub(C, vand(T, P)); } #endif NORML_MOD(C, P, NEGP, MIN, MAX, Q, T); return C; } #undef vect_t #endif // __FFLASFFPACK_fflas_ffpack_utils_simd256_int64_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_simd/simd_modular.inl000066400000000000000000000135521274716147400247730ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ // functions wih _r are relaxed, meaning no modular reduction template class FieldSimd { public: using Field = _Field; using Element = typename Field::Element; using simd = Simd; using vect_t = typename simd::vect_t; using scalar_t = typename simd::scalar_t; static const constexpr size_t vect_size = simd::vect_size; static const constexpr size_t alignment = simd::alignment; private: using Self = FieldSimd; const Field *_field; vect_t _modulus; vect_t _invmod; vect_t _negmod; vect_t _mask; vect_t _min; vect_t _max; public: FieldSimd(const Field &f) : _field(&f) { init(); } private: void init() { _modulus = simd::set1((Element)_field->characteristic()); _min = simd::set1(_field->minElement()); _max = simd::set1(_field->maxElement()); _negmod = simd::set1(-(Element)_field->characteristic()); if (std::is_floating_point::value) { _invmod = simd::set1(1 / ((Element)_field->characteristic())); } } public: FieldSimd(const Self &) = default; FieldSimd(Self &&) = default; Self &operator=(const Self &) = default; Self &operator=(Self &&) = default; INLINE vect_t init(vect_t &x, const vect_t a) const { return x = mod(a); } INLINE vect_t init(const vect_t a) const { return mod(a); } INLINE vect_t add(vect_t &c, const vect_t a, const 
vect_t b) { c = simd::add(a, b); _mask = simd::greater(c, _max); _mask = simd::vand(_mask, _modulus); return c = simd::sub(c, _mask); } INLINE vect_t add(const vect_t a, const vect_t b) { vect_t c; c = simd::add(a, b); _mask = simd::greater(c, _max); _mask = simd::vand(_mask, _modulus); return c = simd::sub(c, _mask); } INLINE vect_t addin(vect_t &a, const vect_t b) const { return a = add(a, b); } INLINE vect_t add_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::add(a, b); } INLINE vect_t add_r(const vect_t a, const vect_t b) const { return simd::add(a, b); } INLINE vect_t addin_r(vect_t &a, const vect_t b) const { return a = add_r(a, b); } INLINE vect_t sub(vect_t &c, const vect_t a, const vect_t b) { c = simd::sub(a, b); _mask = simd::lesser(c, _min); _mask = simd::vand(_mask, _modulus); return c = simd::add(c, _mask); } INLINE vect_t sub(const vect_t a, const vect_t b) { vect_t c; c = simd::sub(a, b); _mask = simd::greater(c, _max); _mask = simd::vand(_mask, _modulus); return c = simd::add(c, _mask); } INLINE vect_t subin(vect_t &a, const vect_t b) const { return a = sub(a, b); } INLINE vect_t sub_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::sub(a, b); } INLINE vect_t sub_r(const vect_t a, const vect_t b) const { return simd::sub(a, b); } INLINE vect_t subin_r(vect_t &a, const vect_t b) const { return a = sub_r(a, b); } INLINE vect_t zero(vect_t &x) const { return x = simd::zero(); } INLINE vect_t zero() const { return simd::zero(); } INLINE vect_t mod(vect_t &c) const { if (std::is_floating_point::value) { vect_t q, t; q = simd::mul(c, _invmod); q = simd::floor(q); c = simd::fnmadd(c, q, _modulus); q = simd::greater(c, _max); t = simd::lesser(c, _min); q = simd::vand(q, _negmod); t = simd::vand(t, _modulus); q = simd::vor(q, t); return c = simd::add(c, q); } else { FFLASFFPACK_abort("pas implementé"); } } INLINE vect_t mul(vect_t &c, const vect_t a, const vect_t b) const { return c = mod(simd::mul(a, b)); } INLINE 
vect_t mul(const vect_t a, const vect_t b) const { return mod(simd::mul(a, b)); } INLINE vect_t mulin(vect_t &a, const vect_t b) const { return mul(a, a, b); } INLINE vect_t mul_r(vect_t &c, const vect_t a, const vect_t b) const { return c = simd::mul(a, b); } INLINE vect_t mul_r(const vect_t a, const vect_t b) const { return simd::mul(a, b); } INLINE vect_t axpy(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const { return r = mod(simd::fmadd(c, a, b)); } INLINE vect_t axpy(const vect_t c, const vect_t a, const vect_t b) const { return mod(simd::fmadd(c, a, b)); } INLINE vect_t axpyin(vect_t &c, const vect_t a, const vect_t b) const { return c = axpy(c, a, b); } INLINE vect_t axpy_r(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const { return r = simd::fmadd(c, a, b); } INLINE vect_t axpy_r(const vect_t c, const vect_t a, const vect_t b) const { return simd::fmadd(c, a, b); } INLINE vect_t axpyin_r(vect_t &c, const vect_t a, const vect_t b) const { return c = axpy_r(c, a, b); } INLINE vect_t maxpy(vect_t &r, const vect_t a, const vect_t b, const vect_t c) const { return r = mod(simd::fmsub(c, a, b)); } INLINE vect_t maxpy(const vect_t c, const vect_t a, const vect_t b) const { return mod(simd::fmsub(c, a, b)); } INLINE vect_t maxpyin(vect_t &c, const vect_t a, const vect_t b) const { return c = maxpy(c, a, b); } }; fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse.h000066400000000000000000000502341274716147400223200ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_sparse.h */ #ifndef __FFLASFFPACK_fflas_fflas_sparse_H #define __FFLASFFPACK_fflas_fflas_sparse_H #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/config.h" #include "fflas-ffpack/config-blas.h" #include "fflas-ffpack/paladin/parallel.h" #include #include #ifndef index_t #define index_t uint32_t #endif #ifdef __FFLASFFPACK_HAVE_MKL #ifndef _MKL_H_ // temporary #error "MKL (mkl.h) not present, while you have MKL enabled" #endif #undef index_t #define index_t MKL_INT #endif // __FFLASFFPACK_HAVE_MKL // Bigger multiple of s lesser or equal than x, s must be a power of two #ifndef ROUND_DOWN #define ROUND_DOWN(x, s) ((x) & ~((s)-1)) #endif #ifndef __FFLASFFPACK_CACHE_LINE_SIZE #define __FFLASFFPACK_CACHE_LINE_SIZE 64 #endif #if (__GNUC_MAJOR > 4 || (__GNUC_MAJOR == 4 &&__GNUC_MINOR__ >= 7)) || defined(__clang__) #define assume_aligned(pout, pin, v) decltype(pin) __restrict__ pout = static_cast(__builtin_assume_aligned(pin, v)); #elif defined(__INTEL_COMPILER) #define assume_aligned(pout, pin, v) \ decltype(pin) __restrict pout = pin; \ __assume_aligned(pout) #else #define assume_aligned(pout, pin, v) decltype(pin) pout = pin; #endif #define DENSE_THRESHOLD 0.5 #include 
"fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/field/field-traits.h" #include "fflas-ffpack/fflas/fflas_bounds.inl" #include "fflas-ffpack/utils/fflas_memory.h" #include "fflas-ffpack/paladin/parallel.h" #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS #include "fflas-ffpack/fflas/fflas_simd.h" #endif #include #include #include namespace MKL_CONFIG { static const double dalpha = 1; static const float salpha = 1; static const double dbeta = 0; static const float sbeta = 0; static const char metaChar[4] = {'G', ' ', ' ', 'C'}; static const char trans[1] = {'N'}; } namespace FFLAS { enum class SparseMatrix_t { CSR, CSR_ZO, CSC, CSC_ZO, COO, COO_ZO, ELL, ELL_ZO, SELL, SELL_ZO, ELL_simd, ELL_simd_ZO, CSR_HYB, HYB_ZO }; template struct Sparse; } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/sparse_matrix_traits.h" #include "fflas-ffpack/fflas/fflas_sparse/utils.h" #include "fflas-ffpack/fflas/fflas_sparse/csr.h" #include "fflas-ffpack/fflas/fflas_sparse/coo.h" #include "fflas-ffpack/fflas/fflas_sparse/ell.h" #include "fflas-ffpack/fflas/fflas_sparse/csr_hyb.h" #include "fflas-ffpack/fflas/fflas_sparse/ell_simd.h" #include "fflas-ffpack/fflas/fflas_sparse/hyb_zo.h" // #include "fflas-ffpack/fflas/fflas_sparse/sparse_matrix.h" namespace FFLAS { /********************************************************************************************************************* * * Sparse Details * *********************************************************************************************************************/ namespace sparse_details { template inline void init_y(const Field &F, const size_t m, const typename Field::Element b, typename Field::Element_ptr y); template inline void init_y(const Field &F, const size_t m, const size_t n, const typename Field::Element b, typename Field::Element_ptr y, const int ldy); /************************************* fspmv **************************************/ template inline typename std::enable_if< !(std::is_same::value, 
ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value)>::type fspmv_dispatch(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FC fc, MZO mzo); template inline typename std::enable_if< std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value>::type fspmv_dispatch(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FC fc, MZO mzo); // non ZO matrix template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, NotZOSparseMatrix); // ZO matrix template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag, ZOSparseMatrix); template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, ZOSparseMatrix); template inline typename 
std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, ZOSparseMatrix); template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, std::true_type); /************************************* fspmm **************************************/ template inline typename std::enable_if< !(std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value)>::type fspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO); template inline typename std::enable_if< std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value>::type fspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO); template inline void fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int 
ldy, FieldCategories::ModularTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, NotZOSparseMatrix); // ZO matrix template inline void fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, ZOSparseMatrix); template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix); template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix); template inline void fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, ZOSparseMatrix); /************************************* pfspmm **************************************/ template inline typename std::enable_if< !(std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value)>::type pfspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO); template inline typename std::enable_if< std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value>::type pfspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO); template inline void 
pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, NotZOSparseMatrix); template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, NotZOSparseMatrix); // ZO matrix template inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, ZOSparseMatrix); template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix); template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix); template inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, 
typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, ZOSparseMatrix); /************************************* pfspmv **************************************/ template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag, std::false_type); template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, std::false_type); template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, std::false_type); template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag, std::true_type); template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, std::true_type); template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, std::true_type); } // sparse_details /********************************************************************************************************************* * * SpMV, SpMM, pSpMV, pSpMM * *********************************************************************************************************************/ template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, const typename Field::Element &beta, typename Field::Element_ptr y); template inline void fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, const typename Field::Element &beta, typename Field::Element_ptr y, int ldy); #if defined(__FFLASFFPACK_USE_OPENMP) template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, const typename 
Field::Element &beta, typename Field::Element_ptr y);

// Parallel counterpart of fspmm (only declared when OpenMP is in use).
template
inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx,
                   const typename Field::Element &beta, typename Field::Element_ptr y, int ldy);

#endif
}

#include "fflas-ffpack/fflas/fflas_sparse.inl"

#include "fflas-ffpack/fflas/fflas_sparse/read_sparse.h"

namespace FFLAS {

// Bit flags: the low bits name a storage format (coo, csr, ell), bits 32 and
// 33 carry the `aut` and `pm1` options.
// NOTE(review): the meaning of `aut` and `pm1` is not visible here --
// presumably "automatic choice" and "+1/-1 entries"; confirm with the users
// of these flags.
struct HelperFlag {
    static constexpr uint64_t none = 0_ui64;
    static constexpr uint64_t coo = 1_ui64;
    static constexpr uint64_t csr = 1_ui64 << 1;
    static constexpr uint64_t ell = 1_ui64 << 2;
    static constexpr uint64_t aut = 1_ui64 << 32;
    static constexpr uint64_t pm1 = 1_ui64 << 33;
};

// NOTE(review): in the four holders below the template parameter lists and
// the arguments of FFLAS::Sparse look stripped in this copy, which is why the
// members of each struct appear to share one type. Judging by the member
// names they differ by index width (16/32/64) and by the ZO variant.

// Optional CSR representations of one matrix; a null pointer means "not built".
template
struct CsrMat {
    typename FFLAS::Sparse * _csr16 = nullptr;
    typename FFLAS::Sparse * _csr32 = nullptr ;
    typename FFLAS::Sparse * _csr64 = nullptr ;

    typename FFLAS::Sparse * _csr16_zo = nullptr ;
    typename FFLAS::Sparse * _csr32_zo = nullptr ;
    typename FFLAS::Sparse * _csr64_zo = nullptr ;
};

// Optional COO representations of one matrix; a null pointer means "not built".
template
struct CooMat {
    typename FFLAS::Sparse * _coo16 = nullptr;
    typename FFLAS::Sparse * _coo32 = nullptr ;
    typename FFLAS::Sparse * _coo64 = nullptr ;

    typename FFLAS::Sparse * _coo16_zo = nullptr ;
    typename FFLAS::Sparse * _coo32_zo = nullptr ;
    typename FFLAS::Sparse * _coo64_zo = nullptr ;
};

// Optional ELL representations of one matrix; a null pointer means "not built".
template
struct EllMat {
    typename FFLAS::Sparse * _ell16 = nullptr;
    typename FFLAS::Sparse * _ell32 = nullptr ;
    typename FFLAS::Sparse * _ell64 = nullptr ;

    typename FFLAS::Sparse * _ell16_zo = nullptr ;
    typename FFLAS::Sparse * _ell32_zo = nullptr ;
    typename FFLAS::Sparse * _ell64_zo = nullptr ;
};

// Groups the three per-format holders above; each pointer starts out null.
template
struct SpMat {
    typename FFLAS::CooMat * _coo = nullptr ;
    typename FFLAS::CsrMat * _csr = nullptr ;
    typename FFLAS::EllMat * _ell = nullptr ;
};
}

// The helper macros defined at the top of this header are local to it.
#undef ROUND_DOWN
#undef DENSE_THRESHOLD
#undef assume_aligned

#endif // __FFLASFFPACK_fflas_fflas_sparse_H
fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse.inl000066400000000000000000001243531274716147400226570ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4;
indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_sparse.inl */ #ifndef __FFLASFFPACK_fflas_fflas_sparse_INL #define __FFLASFFPACK_fflas_fflas_sparse_INL namespace FFLAS { namespace sparse_details { template inline void init_y(const Field &F, const size_t m, const typename Field::Element b, typename Field::Element_ptr y) { if (!F.isOne(b)) { if (F.isZero(b)) { fzero(F, m, y, 1); } else if (F.isMOne(b)) { fnegin(F, m, y, 1); } else { fscalin(F, m, b, y, 1); } } } template inline void init_y(const Field &F, const size_t m, const size_t n, const typename Field::Element b, typename Field::Element_ptr y, const int ldy) { if (!F.isOne(b)) { if (F.isZero(b)) { fzero(F, m, n, y, ldy); } else if (F.isMOne(b)) { fnegin(F, m, n, y, 1); } else { fscalin(F, m, n, b, y, 1); } } } } // sparse_details namespace sparse_details { /************************************************************************************* * * fspmv dispatch * *************************************************************************************/ template inline typename std::enable_if< !(std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value)>::type fspmv_dispatch(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FC fc, MZO mzo) { sparse_details::fspmv(F, A, x, y, FieldCategories::GenericTag(), MZO()); } template inline typename std::enable_if< std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value>::type fspmv_dispatch(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FC fc, MZO mzo) { sparse_details::fspmv(F, A, x, y, FC(), MZO()); } // non ZO matrix template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag, NotZOSparseMatrix) { sparse_details_impl::fspmv(F, A, x, y, FieldCategories::GenericTag()); } 
template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, NotZOSparseMatrix) { sparse_details_impl::fspmv(F, A, x, y, FieldCategories::UnparametricTag()); } template inline typename std::enable_if::value && support_simd::value >::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, NotZOSparseMatrix) { // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS sparse_details_impl::fspmv_simd(F, A, x, y, FieldCategories::UnparametricTag()); // #else // sparse_details_impl::fspmv(F, A, x, y, FieldCategories::UnparametricTag()); // #endif } template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, NotZOSparseMatrix) { if (A.delayed) { sparse_details::fspmv(F, A, x, y, FieldCategories::UnparametricTag(), std::false_type()); freduce(F, A.m, y, 1); } else { sparse_details_impl::fspmv(F, A, x, y, A.kmax); } } template inline typename std::enable_if::value && support_simd::value >::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, NotZOSparseMatrix) { // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS if (A.delayed) { sparse_details::fspmv(F, A, x, y, FieldCategories::UnparametricTag(), std::false_type()); freduce(F, A.m, y, 1); } else { sparse_details_impl::fspmv_simd(F, A, x, y, A.kmax); } // #else // if (A.delayed) { // sparse_details::fspmv(F, A, x, y, FieldCategories::UnparametricTag(), std::false_type()); // freduce(F, A.m, y, 1); // } else { // sparse_details_impl::fspmv(F, A, x, y, A.kmax); // } // #endif } // ZO matrix template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, 
FieldCategories::GenericTag, ZOSparseMatrix) { if (A.cst == 1) { sparse_details_impl::fspmv_one(F, A, x, y, FieldCategories::GenericTag()); } else if (A.cst == -1) { sparse_details_impl::fspmv_mone(F, A, x, y, FieldCategories::GenericTag()); } else { auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); fscal(F, A.n, A.cst, x, 1, x1, 1); sparse_details_impl::fspmv_one(F, A, x, y, FieldCategories::GenericTag()); fflas_delete(x1); } } template inline typename std::enable_if::value>::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, ZOSparseMatrix) { if (A.cst == 1) { sparse_details_impl::fspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); } else if (A.cst == -1) { sparse_details_impl::fspmv_mone(F, A, x, y, FieldCategories::UnparametricTag()); } else { auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); fscal(F, A.n, A.cst, x, 1, x1, 1); sparse_details_impl::fspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); fflas_delete(x1); } } template inline typename std::enable_if::value && support_simd::value >::type fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag, ZOSparseMatrix) { // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS if (A.cst == 1) { sparse_details_impl::fspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag()); } else if (A.cst == -1) { sparse_details_impl::fspmv_mone_simd(F, A, x, y, FieldCategories::UnparametricTag()); } else { auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); fscal(F, A.n, A.cst, x, 1, x1, 1); sparse_details_impl::fspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag()); fflas_delete(x1); } // #else // if (A.cst == 1) { // sparse_details_impl::fspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // } else if (A.cst == -1) { // sparse_details_impl::fspmv_mone(F, A, x, y, FieldCategories::UnparametricTag()); // } else { // auto 
x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); // fscal(F, A.n, A.cst, x, 1, x1, 1); // sparse_details_impl::fspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // fflas_delete(x1); // } // #endif // SIMD } template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag, std::true_type) { sparse_details::fspmv(F, A, x, y, FieldCategories::UnparametricTag(), std::true_type()); freduce(F, A.m, y, 1); } /************************************************************************************* * * fspmm dispatch * *************************************************************************************/ template inline typename std::enable_if< !(std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value)>::type fspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO) { sparse_details::fspmm(F, A, blockSize, x, ldx, y, ldy, typename FieldCategories::GenericTag(), MZO()); } template inline typename std::enable_if< std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value>::type fspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO) { sparse_details::fspmm(F, A, blockSize, x, ldx, y, ldy, FCat(), MZO()); } template inline void fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, NotZOSparseMatrix) { sparse_details_impl::fspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, 
int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix) { using simd = Simd; if (simd::valid(y) && simd::valid(x) && simd::compliant(blockSize)) { sparse_details_impl::fspmm_simd_aligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { sparse_details_impl::fspmm_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } } template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix) { sparse_details_impl::fspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, NotZOSparseMatrix) { if (A.delayed) { sparse_details::fspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag(), typename std::false_type()); freduce(F, A.m, blockSize, y, ldy); } else { using simd = Simd; if (simd::valid(y) && simd::valid(x) && simd::compliant(blockSize)) { sparse_details_impl::fspmm_simd_aligned(F, A, blockSize, x, ldx, y, ldy, A.kmax); } else { sparse_details_impl::fspmm_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, A.kmax); } } } template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, NotZOSparseMatrix) { if (A.delayed) { sparse_details::fspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag(), NotZOSparseMatrix()); freduce(F, A.m, blockSize, y, ldy); } else { sparse_details_impl::fspmm(F, A, blockSize, x, ldx, y, ldy, A.kmax); } } // ZO matrix template inline void 
fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, ZOSparseMatrix) { if (F.isOne(A.cst)) { sparse_details_impl::fspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } else if (F.isMOne(A.cst)) { sparse_details_impl::fspmm_mone(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } else { auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); sparse_details_impl::fspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); fflas_delete(x1); } } template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix) { using simd = Simd; if (F.isOne(A.cst)) { if (simd::valid(x) && simd::valid(y) && simd::compliant(blockSize)) { sparse_details_impl::fspmm_one_simd_aligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { sparse_details_impl::fspmm_one_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } } else if (F.isMOne(A.cst)) { if (simd::valid(x) && simd::valid(y) && simd::compliant(blockSize)) { sparse_details_impl::fspmm_mone_simd_aligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { sparse_details_impl::fspmm_mone_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } } else { auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); if (simd::valid(x) && simd::valid(y) && simd::compliant(blockSize)) { sparse_details_impl::fspmm_one_simd_aligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { sparse_details_impl::fspmm_one_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, 
FieldCategories::UnparametricTag()); } fflas_delete(x1); } } template inline typename std::enable_if::value>::type fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix) { if (F.isOne(A.cst)) { sparse_details_impl::fspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else if (F.isMOne(A.cst)) { sparse_details_impl::fspmm_mone(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); sparse_details_impl::fspmm_one(F, A, blockSize, x1, ldx, y, ldy, FieldCategories::UnparametricTag()); fflas_delete(x1); } } template inline void fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, ZOSparseMatrix) { sparse_details::fspmm(F, A, blockSize, x, ldx, y, ldy, typename FieldCategories::UnparametricTag(), ZOSparseMatrix()); freduce(F, blockSize, A.m, y, ldy); } #if defined(__FFLASFFPACK_USE_OPENMP) /************************************************************************************* * * pfspmm dispatch * *************************************************************************************/ template inline typename std::enable_if< !(std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, ElementCategories::MachineIntTag>::value)>::type pfspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO) { sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, typename FieldCategories::GenericTag(), MZO()); } template inline typename std::enable_if< std::is_same::value, ElementCategories::MachineFloatTag>::value || std::is_same::value, 
ElementCategories::MachineIntTag>::value>::type pfspmm_dispatch(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FCat, MZO) { sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, FCat(), MZO()); } template inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, NotZOSparseMatrix) { sparse_details_impl::pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix) { using simd = Simd; if (simd::valid(y) && simd::valid(x) && simd::compliant(blockSize)) { sparse_details_impl::pfspmm_simd_aligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { sparse_details_impl::pfspmm_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } } template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, NotZOSparseMatrix) { sparse_details_impl::pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, NotZOSparseMatrix) { if (A.delayed) { sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag(), typename std::false_type()); freduce(F, A.m, blockSize, y, ldy); } else { using simd 
= Simd; if (simd::valid(y) && simd::valid(x) && simd::compliant(blockSize)) { sparse_details_impl::pfspmm_simd_aligned(F, A, blockSize, x, ldx, y, ldy, A.kmax); } else { sparse_details_impl::pfspmm_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, A.kmax); } } } template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, NotZOSparseMatrix) { if (A.delayed) { sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag(), NotZOSparseMatrix()); freduce(F, A.m, blockSize, y, ldy); } else { sparse_details_impl::pfspmm(F, A, blockSize, x, ldx, y, ldy, A.kmax); } } #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // ZO matrix template inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, ZOSparseMatrix) { if (F.isOne(A.cst)) { sparse_details_impl::pfspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } else if (F.isMOne(A.cst)) { sparse_details_impl::pfspmm_mone(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } else { auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); sparse_details_impl::pfspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); fflas_delete(x1); } } #if defined(__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS) template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix) { using simd = Simd; if (F.isOne(A.cst)) { if (simd::valid(x) && simd::valid(y) && simd::compliant(blockSize)) { sparse_details_impl::pfspmm_one_simd_aligned(F, A, blockSize, x, ldx, y, ldy, 
FieldCategories::UnparametricTag()); } else { sparse_details_impl::pfspmm_one_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } } else if (F.isMOne(A.cst)) { if (simd::valid(x) && simd::valid(y) && simd::compliant(blockSize)) { sparse_details_impl::pfspmm_mone_simd_aligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { sparse_details_impl::pfspmm_mone_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } } else { auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); if (simd::valid(x) && simd::valid(y) && simd::compliant(blockSize)) { sparse_details_impl::pfspmm_one_simd_aligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { sparse_details_impl::pfspmm_one_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } fflas_delete(x1); } } template inline typename std::enable_if::value>::type pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, ZOSparseMatrix) { if (F.isOne(A.cst)) { sparse_details_impl::pfspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else if (F.isMOne(A.cst)) { sparse_details_impl::pfspmm_mone(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } else { auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); sparse_details_impl::pfspmm_one(F, A, blockSize, x1, ldx, y, ldy, FieldCategories::UnparametricTag()); fflas_delete(x1); } } template inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, ZOSparseMatrix) { sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, typename 
FieldCategories::UnparametricTag(), ZOSparseMatrix()); freduce(F, blockSize, A.m, y, ldy); } #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // /***************************** pfspmv ******************************/ // #if defined(__FFLASFFPACK_USE_OPENMP) // template // inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, // FieldCategories::MultiPrecisionTag ,FC fc, MZO mzo) { // sparse_details::pfspmv(F, A, x, y, FieldCategories::GenericTag(), MZO()); // } // template // inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, // FieldCategories::GenericTag ,FC fc, MZO mzo) { // sparse_details::pfspmv(F, A, x, y, FC(), MZO()); // } template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag tag, std::false_type) { sparse_details_impl::pfspmv(F, A, x, y, tag); } // template // inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, // FieldCategories::UnparametricTag, std::false_type) { // sparse_details_impl::pfspmv(F, A, x, y, FieldCategories::UnparametricTag()); // } // template // inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, // FieldCategories::ModularTag, std::false_type) { // if (A.delayed) { // sparse_details::pfspmv(F, A, x, y, FieldCategories::UnparametricTag(), std::false_type()); // freduce(F, A.m, y, 1); // } else { // sparse_details_impl::pfspmv(F, A, x, y, A.kmax); // } // } // // ZO matrix // template // inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, // FieldCategories::GenericTag, std::true_type) { // if (A.cst == 1) { // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::GenericTag()); // } else if (A.cst == -1) { // 
sparse_details_impl::pfspmv_mone(F, A, x, y, FieldCategories::GenericTag()); // } else { // auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); // fscal(F, A.n, A.cst, x, 1, x1, 1); // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::GenericTag()); // fflas_delete(x1); // } // } // template // inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, // typename Field::Element_ptr y, FieldCategories::UnparametricTag, std::true_type) { // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // if (A.cst == 1) { // sparse_details_impl::pfspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag()); // } else if (A.cst == -1) { // sparse_details_impl::pfspmv_mone_simd(F, A, x, y, FieldCategories::UnparametricTag()); // } else { // auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); // fscal(F, A.n, A.cst, x, 1, x1, 1); // sparse_details_impl::pfspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag()); // fflas_delete(x1); // } // #else // if (A.cst == 1) { // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // } else if (A.cst == -1) { // sparse_details_impl::pfspmv_mone(F, A, x, y, FieldCategories::UnparametricTag()); // } else { // auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); // fscal(F, A.n, A.cst, x, 1, x1, 1); // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // fflas_delete(x1); // } // #endif // SIMD // } // template // inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr // x, // typename Field::Element_ptr y, FieldCategories::UnparametricTag, std::true_type) { // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // if (A.cst == 1) { // sparse_details_impl::pfspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag()); // } else if (A.cst == -1) { // sparse_details_impl::pfspmv_mone_simd(F, A, x, y, FieldCategories::UnparametricTag()); // } else { // auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); // 
fscal(F, A.n, A.cst, x, 1, x1, 1); // sparse_details_impl::pfspmv_one_simd(F, A, x, y, FieldCategories::UnparametricTag()); // fflas_delete(x1); // } // #else // if (A.cst == 1) { // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // } else if (A.cst == -1) { // sparse_details_impl::pfspmv_mone(F, A, x, y, FieldCategories::UnparametricTag()); // } else { // auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); // fscal(F, A.n, A.cst, x, 1, x1, 1); // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // fflas_delete(x1); // } // #endif // SIMD // } // template // inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, // FieldCategories::UnparametricTag, std::true_type) { // if (A.cst == 1) { // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // } else if (A.cst == -1) { // sparse_details_impl::pfspmv_mone(F, A, x, y, FieldCategories::UnparametricTag()); // } else { // auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); // fscal(F, A.n, A.cst, x, 1, x1, 1); // sparse_details_impl::pfspmv_one(F, A, x, y, FieldCategories::UnparametricTag()); // fflas_delete(x1); // } // } // template // inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, // FieldCategories::ModularTag, std::true_type) { // sparse_details::pfspmv(F, A, x, y, FieldCategories::UnparametricTag(), std::true_type()); // freduce(F, A.m, y, 1); // } // #endif // /***************************** pfspmm *****************************/ // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::MultiPrecisionTag, FCat fc, MZO mz) { // sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, typename FieldCategories::GenericTag(), MZO()); // } // template // inline void 
pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, FCat fc, MZO mz) { // sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, FCat(), MZO()); // } // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, std::false_type) { // // std::cout << "no ZO Generic" << std::endl; // /*sparse_details_impl::*/pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); // } // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, std::false_type) { // // std::cout << "no ZO Unparametric" << std::endl; // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // using simd = Simd; // if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) && // (blockSize % simd::vect_size == 0)) { // // std::cout << "no ZO Unparametric algined" << std::endl; // sparse_details_impl::pfspmm_simd_aligned(F, A, blockSize, x, ldx, y, ldy, // FieldCategories::UnparametricTag()); // } // else{ // sparse_details_impl::fspmm_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); // } // #else // sparse_details_impl::pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); // #endif // } // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, std::false_type) { // // std::cout << "no ZO Modular" << std::endl; // if (A.delayed) { // sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag(), // typename std::false_type()); // freduce(F, A.m, blockSize, y, 
ldy); // } else { // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // using simd = Simd; // if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) && // (blockSize % simd::vect_size == 0)) { // sparse_details_impl::pfspmm_simd_aligned(F, A, blockSize, x, ldx, y, ldy, A.kmax); // } else { // sparse_details_impl::pfspmm_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, A.kmax); // } // #else // sparse_details_impl::pfspmm(F, A, blockSize, x, ldx, y, ldy, A.kmax); // #endif // } // } // // ZO matrix // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag, std::true_type) { // // std::cout << "ZO Generic" << std::endl; // if (F.isOne(A.cst)) { // sparse_details_impl::pfspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); // } else if (F.isMOne(A.cst)) { // sparse_details_impl::pfspmm_mone(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); // } else { // auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); // fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); // sparse_details_impl::pfspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); // fflas_delete(x1); // } // } // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag, std::true_type) { // // std::cout << "ZO Unparametric" << std::endl; // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // using simd = Simd; // if (F.isOne(A.cst)) { // if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) && // (blockSize % simd::vect_size == 0)) { // // std::cout << "ZO Unparametric aligned" << std::endl; // sparse_details_impl::pfspmm_one_simd_aligned(F, A, blockSize, x, ldx, y, ldy, // FieldCategories::UnparametricTag()); // } else { // 
// std::cout << "ZO Unparametric unaligned" << std::endl; // sparse_details_impl::pfspmm_one_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, // FieldCategories::UnparametricTag()); // } // } else if (F.isMOne(A.cst)) { // if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) && // (blockSize % simd::vect_size == 0)) { // sparse_details_impl::pfspmm_mone_simd_aligned(F, A, blockSize, x, ldx, y, ldy, // FieldCategories::UnparametricTag()); // } else { // sparse_details_impl::pfspmm_mone_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, // FieldCategories::UnparametricTag()); // } // } else { // auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); // fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); // if (((uint64_t)y % simd::alignment == 0) && ((uint64_t)x % simd::alignment == 0) && // (blockSize % simd::vect_size == 0)) { // sparse_details_impl::pfspmm_one_simd_aligned(F, A, blockSize, x, ldx, y, ldy, // FieldCategories::UnparametricTag()); // } else { // sparse_details_impl::pfspmm_one_simd_unaligned(F, A, blockSize, x, ldx, y, ldy, // FieldCategories::UnparametricTag()); // } // fflas_delete(x1); // } // #else // if (F.isOne(A.cst)) { // sparse_details_impl::pfspmm_one(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); // } else if (F.isMOne(A.cst)) { // sparse_details_impl::pfspmm_mone(F, A, blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); // } else { // auto x1 = fflas_new(F, A.m, blockSize, Alignment::CACHE_LINE); // fscal(F, A.m, blockSize, A.cst, x, ldx, x1, 1); // sparse_details_impl::pfspmm_one(F, A, blockSize, x1, ldx, y, ldy, FieldCategories::UnparametricTag()); // fflas_delete(x1); // } // #endif // } // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, // typename Field::Element_ptr y, int ldy, FieldCategories::ModularTag, std::true_type) { // // std::cout << "ZO Modular" << std::endl; // if (A.delayed) { // 
sparse_details::pfspmm(F, A, blockSize, x, ldx, y, ldy, typename FieldCategories::UnparametricTag(), // typename std::true_type()); // freduce(F, blockSize, A.m, y, ldy); // } else { // sparse_details_impl::pfspmm(F, A, blockSize, x, ldx, y, ldy, A.kmax); // } // } #endif // __FFLASFFPACK_USE_OPENMP } // sparse details template inline void fspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, const typename Field::Element &beta, typename Field::Element_ptr y) { sparse_details::init_y(F, A.m, beta, y); sparse_details::fspmv_dispatch(F, A, x, y, typename FieldTraits::category(), typename isZOSparseMatrix::type()); } template inline void fspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, const typename Field::Element &beta, typename Field::Element_ptr y, int ldy) { sparse_details::init_y(F, A.m, blockSize, beta, y, ldy); sparse_details::fspmm_dispatch(F, A, blockSize, x, ldx, y, ldy, typename FieldTraits::category(), typename isZOSparseMatrix::type()); } #if defined(__FFLASFFPACK_USE_OPENMP) template inline void pfspmv(const Field &F, const SM &A, typename Field::ConstElement_ptr x, const typename Field::Element &beta, typename Field::Element_ptr y) { sparse_details::init_y(F, A.m, beta, y); sparse_details::pfspmv(F, A, x, y, typename FieldTraits::category(), typename isZOSparseMatrix::type()); } template inline void pfspmm(const Field &F, const SM &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, const typename Field::Element &beta, typename Field::Element_ptr y, int ldy) { sparse_details::init_y(F, A.m, blockSize, beta, y, ldy); sparse_details::pfspmm_dispatch(F, A, blockSize, x, ldx, y, ldy, typename FieldTraits::category(), typename isZOSparseMatrix::type()); } #endif // __FFLASFFPACK_USE_OPENMP // template // inline void pfspmm(const Field &F, const SM &A, size_t blockSize, // typename Field::ConstElement_ptr x, int ldx, // const typename Field::Element &beta, // typename 
Field::Element_ptr y, int ldy) { // sparse_details::init_y(F, A.m, blockSize, beta, y, ldy); // sparse_details::pfspmm( // F, A, blockSize, x, ldx, y, ldy, typename FieldTraits::value(), // typename FieldTraits::category(), // typename isZOSparseMatrix::type()); // } } #endif // __FFLASFFPACK_fflas_fflas_sparse_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/000077500000000000000000000000001274716147400221435ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/Makefile.am000066400000000000000000000023371274716147400242040ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse SUBDIRS=coo csr csr_hyb ell ell_simd hyb_zo sell pkgincludesub_HEADERS= \ sparse_matrix_traits.h \ read_sparse.h \ utils.h \ coo.h \ csr.h \ ell.h \ ell_simd.h \ sell.h \ csr_hyb.h \ hyb_zo.h fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/coo.h000066400000000000000000000053761274716147400231070ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_fspmv_coo.inl * NO DOC */ #ifndef __FFLASFFPACK_fflas_sparse_coo_H #define __FFLASFFPACK_fflas_sparse_coo_H namespace FFLAS { /* COO */ template struct Sparse<_Field, SparseMatrix_t::COO> { using Field = _Field; index_t *col = nullptr; index_t *row = nullptr; typename _Field::Element_ptr dat; bool delayed = false; uint64_t kmax = 0; index_t m = 0; index_t n = 0; uint64_t nnz = 0; uint64_t nElements = 0; uint64_t maxrow = 0; }; template struct Sparse<_Field, SparseMatrix_t::COO_ZO> : public Sparse<_Field, SparseMatrix_t::COO> { using Field = _Field; typename _Field::Element cst = 1; }; template void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template void sparse_delete(const Sparse &A); template void sparse_delete(const Sparse &A); } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/coo/coo_utils.inl" #include "fflas-ffpack/fflas/fflas_sparse/coo/coo_spmv.inl" #include "fflas-ffpack/fflas/fflas_sparse/coo/coo_spmm.inl" #endif // __FFLASFFPACK_fflas_sparse_coo_Hfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/coo/000077500000000000000000000000001274716147400227235ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/coo/Makefile.am000066400000000000000000000021031274716147400247530ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. 
# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/coo pkgincludesub_HEADERS= \ coo_spmv.inl \ coo_spmm.inl \ coo_utils.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/coo/coo_spmm.inl000066400000000000000000000363751274716147400252610ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_coo_spmm_INL #define __FFLASFFPACK_fflas_sparse_coo_spmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nnz; ++i) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[row[i] * ldy + k], dat[i], x[col[i] * ldx + k]); F.axpyin(y[row[i] * ldy + k + 1], dat[i], x[col[i] * ldx + k + 1]); F.axpyin(y[row[i] * ldy + k + 2], dat[i], x[col[i] * ldx + k + 2]); F.axpyin(y[row[i] * ldy + k + 3], dat[i], x[col[i] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[row[i] * ldy + k], dat[i], x[col[i] * ldx + k]); } } template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nnz; ++i) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[row[i] * ldy + k] += dat[i] * x[col[i] * ldx + k]; y[row[i] * ldy + k + 1] += dat[i] * x[col[i] * ldx + k + 1]; y[row[i] * ldy + k + 2] += dat[i] * x[col[i] * ldx + k + 2]; y[row[i] * ldy + k + 3] += dat[i] * x[col[i] * ldx + k + 3]; } for (; k < blockSize; ++k) y[row[i] * ldy + k] += dat[i] * x[col[i] * ldx + k]; } } #ifdef 
__FFLASFFPACK_HAVE_MKL inline void fspmm_mkl(const Givaro::DoubleDomain &F, const Sparse &A, index_t blockSize , Givaro::DoubleDomain::ConstElement_ptr x_, index_t ldx, Givaro::DoubleDomain::Element_ptr y_, index_t ldy, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); MKL_INT A_nnz = A.nnz ; mkl_dcoomm(MKL_CONFIG::trans, &A.m , &blockSize, &A.n, &MKL_CONFIG::dalpha, MKL_CONFIG::metaChar, A.dat, A.row, A.col, &A_nnz, x_, &ldx, &MKL_CONFIG::dbeta, y_, &ldy ); // void mkl_dcoomv (char *transa, MKL_INT *m, MKL_INT *k, double *alpha, char *matdescra, double *val, MKL_INT *rowind, MKL_INT *colind, MKL_INT *nnz, double *x, double *beta, double *y); } inline void fspmm_mkl(const Givaro::FloatDomain &F, const Sparse &A, index_t blockSize , Givaro::FloatDomain::ConstElement_ptr x_, index_t ldx, Givaro::FloatDomain::Element_ptr y_, index_t ldy, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); MKL_INT A_nnz = A.nnz ; mkl_scoomm(MKL_CONFIG::trans, &A.m , &blockSize, &A.n, &MKL_CONFIG::salpha, MKL_CONFIG::metaChar, A.dat, A.row, A.col, &A_nnz, x_, &ldx, &MKL_CONFIG::sbeta, y_, &ldy ); // void mkl_scoomm (char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, float *alpha, char *matdescra, float *val, MKL_INT *rowind, MKL_INT *colind, MKL_INT *nnz, float *b, MKL_INT *ldb, float *beta, float *c, MKL_INT *ldc); } #endif // __FFLASFFPACK_HAVE_MKL #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const 
Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nnz; ++i) { vect_t vy, vx, vdat; vdat = simd::set1(dat[i]); size_t k = 0; for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy = simd::load(y + row[i] * ldy + k); vx = simd::load(x + col[i] * ldx + k); simd::store(y + row[i] * ldy + k, simd::fmadd(vy, vdat, vx)); } for (; k < blockSize; ++k) y[row[i] * ldy + k] += dat[i] * x[col[i] * ldx + k]; } } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); auto x = x_; auto y = y_; for (index_t i = 0; i < A.nnz; ++i) { vect_t vy, vx, vdat; vdat = simd::set1(dat[i]); size_t k = 0; for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy = simd::loadu(y + row[i] * ldy + k); vx = simd::loadu(x + col[i] * ldx + k); simd::storeu(y + row[i] * ldy + k, simd::fmadd(vy, vdat, vx)); } for (; k < blockSize; ++k) y[row[i] * ldy + k] += dat[i] * x[col[i] * ldx + k]; } } #endif // SIMD template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { // TODO } 
template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { // TODO } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { // TODO } template inline void fspmm_one(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nnz; ++i) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[row[i] * ldy + k], x[col[i] * ldx + k]); F.addin(y[row[i] * ldy + k + 1], x[col[i] * ldx + k + 1]); F.addin(y[row[i] * ldy + k + 2], x[col[i] * ldx + k + 2]); F.addin(y[row[i] * ldy + k + 3], x[col[i] * ldx + k + 3]); } for (; k < blockSize; ++k) F.addin(y[row[i] * ldy + k], x[col[i] * ldx + k]); } } template inline void fspmm_mone(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nnz; ++i) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[row[i] * ldy + k], x[col[i] * ldx + k]); F.subin(y[row[i] * ldy + k + 1], x[col[i] * ldx + k + 1]); F.subin(y[row[i] * ldy + k + 2], x[col[i] * ldx + k + 2]); F.subin(y[row[i] * ldy + k + 3], x[col[i] * ldx + k + 
3]); } for (; k < blockSize; ++k) F.subin(y[row[i] * ldy + k], x[col[i] * ldx + k]); } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_one_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nnz; ++i) { vect_t vy, vx; size_t k = 0; for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy = simd::load(y + row[i] * ldy + k); vx = simd::load(x + col[i] * ldx + k); simd::store(y + row[i] * ldy + k, simd::add(vy, vx)); } for (; k < blockSize; ++k) y[row[i] * ldy + k] += x[col[i] * ldx + k]; } } template inline void fspmm_one_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nnz; ++i) { vect_t vy, vx; size_t k = 0; for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy = simd::loadu(y + row[i] * ldy + k); vx = simd::loadu(x + col[i] * ldx + k); simd::storeu(y + row[i] * ldy + k, simd::add(vy, vx)); } for (; k < blockSize; ++k) y[row[i] * ldy + k] += x[col[i] * ldx + k]; } } template inline void fspmm_mone_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename 
Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nnz; ++i) { vect_t vy, vx; size_t k = 0; for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy = simd::load(y + row[i] * ldy + k); vx = simd::load(x + col[i] * ldx + k); simd::store(y + row[i] * ldy + k, simd::sub(vy, vx)); } for (; k < blockSize; ++k) y[row[i] * ldy + k] -= x[col[i] * ldx + k]; } } template inline void fspmm_mone_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nnz; ++i) { vect_t vy, vx; size_t k = 0; for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy = simd::loadu(y + row[i] * ldy + k); vx = simd::loadu(x + col[i] * ldx + k); simd::storeu(y + row[i] * ldy + k, simd::sub(vy, vx)); } for (; k < blockSize; ++k) y[row[i] * ldy + k] -= x[col[i] * ldx + k]; } } // #endif /* __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS */ } // coo_details } // FFLAS #endif // __FFLASFFPACK_fflas_coo_spmm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/coo/coo_spmv.inl000066400000000000000000000222331274716147400252560ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the 
FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_coo_spmv_INL #define __FFLASFFPACK_fflas_sparse_coo_spmv_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t j = 0; for (; j < ROUND_DOWN(A.nnz, 4); j += 4) { F.axpyin(y[row[j]], dat[j], x[col[j]]); F.axpyin(y[row[j + 1]], dat[j + 1], x[col[j + 1]]); F.axpyin(y[row[j + 2]], dat[j + 2], x[col[j + 2]]); F.axpyin(y[row[j + 3]], dat[j + 3], x[col[j + 3]]); } for (; j < A.nnz; ++j) { F.axpyin(y[row[j]], dat[j], x[col[j]]); } } template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat,
(size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t j = 0; for (; j < ROUND_DOWN(A.nnz, 4); j += 4) { y[row[j]] += dat[j] * x[col[j]]; y[row[j + 1]] += dat[j + 1] * x[col[j + 1]]; y[row[j + 2]] += dat[j + 2] * x[col[j + 2]]; y[row[j + 3]] += dat[j + 3] * x[col[j + 3]]; } for (; j < A.nnz; ++j) { y[row[j]] += dat[j] * x[col[j]]; } } #ifdef __FFLASFFPACK_HAVE_MKL inline void fspmv_mkl(const Givaro::DoubleDomain &F, const Sparse &A, Givaro::DoubleDomain::ConstElement_ptr x_, Givaro::DoubleDomain::Element_ptr y_, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); MKL_INT A_nnz = A.nnz ; mkl_dcoomv(MKL_CONFIG::trans, &A.m , &A.n, &MKL_CONFIG::dalpha, MKL_CONFIG::metaChar, A.dat, A.row, A.col, &A_nnz, x_, &MKL_CONFIG::dbeta, y_ ); // void mkl_dcoomv (char *transa, MKL_INT *m, MKL_INT *k, double *alpha, char *matdescra, double *val, MKL_INT *rowind, MKL_INT *colind, MKL_INT *nnz, double *x, double *beta, double *y); } inline void fspmv_mkl(const Givaro::FloatDomain &F, const Sparse &A, Givaro::FloatDomain::ConstElement_ptr x_, Givaro::FloatDomain::Element_ptr y_, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); MKL_INT A_nnz = A.nnz ; mkl_scoomv(MKL_CONFIG::trans, &A.m , &A.n, &MKL_CONFIG::salpha, MKL_CONFIG::metaChar, 
A.dat, A.row, A.col, &A_nnz, x_, &MKL_CONFIG::sbeta, y_ ); // void mkl_scoomv (char *transa, MKL_INT *m, MKL_INT *k, float *alpha, char *matdescra, float *val, MKL_INT *rowind, MKL_INT *colind, MKL_INT *nnz, float *x, float *beta, float *y); } #endif // __FFLASFFPACK_HAVE_MKL template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); size_t w = 0; index_t larow_i = 0; typename Field::Element e; F.init(e, y[larow_i]); size_t accu = 0; while (w < A.nnz) { if (row[w] == larow_i) { // same line if (accu < (size_t)kmax) { e += dat[w] * x[col[w]]; accu += 1; } else { F.axpyin(e, dat[w], x[col[w]]); accu = 0; } } else { // new line F.init(y[larow_i], e); larow_i = row[w]; F.init(e, y[larow_i]); e += dat[w] * x[col[w]]; accu = 1; } ++w; } F.init(y[larow_i], e); } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t j = 0; for (; j < ROUND_DOWN(A.nnz, 4); j += 4) { F.addin(y[row[j]], x[col[j]]); F.addin(y[row[j + 1]], x[col[j + 1]]); F.addin(y[row[j + 2]], x[col[j + 2]]); F.addin(y[row[j + 3]], x[col[j + 3]]); } for (; j < A.nnz; ++j) { F.addin(y[row[j]], x[col[j]]); } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, 
(size_t)Alignment::CACHE_LINE); assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t j = 0; for (; j < ROUND_DOWN(A.nnz, 4); j += 4) { F.subin(y[row[j]], x[col[j]]); F.subin(y[row[j + 1]], x[col[j + 1]]); F.subin(y[row[j + 2]], x[col[j + 2]]); F.subin(y[row[j + 3]], x[col[j + 3]]); } for (; j < A.nnz; ++j) { F.subin(y[row[j]], x[col[j]]); } } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t j = 0; for (; j < ROUND_DOWN(A.nnz, 4); j += 4) { y[row[j]] += x[col[j]]; y[row[j + 1]] += x[col[j + 1]]; y[row[j + 2]] += x[col[j + 2]]; y[row[j + 3]] += x[col[j + 3]]; } for (; j < A.nnz; ++j) { y[row[j]] += x[col[j]]; } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(row, A.row, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t j = 0; for (; j < ROUND_DOWN(A.nnz, 4); j += 4) { y[row[j]] -= x[col[j]]; y[row[j + 1]] -= x[col[j + 1]]; y[row[j + 2]] -= x[col[j + 2]]; y[row[j + 3]] -= x[col[j + 3]]; } for (; j < A.nnz; ++j) { y[row[j]] -= x[col[j]]; } } } // coo_details } // FFLAS #endif // __FFLASFFPACK_fflas_coo_spmv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/coo/coo_utils.inl000066400000000000000000000065341274716147400254370ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- 
*/ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_coo_utils_INL #define __FFLASFFPACK_fflas_sparse_coo_utils_INL namespace FFLAS { template inline void sparse_delete(const Sparse &A) { fflas_delete(A.dat); fflas_delete(A.col); fflas_delete(A.row); } template inline void sparse_delete(const Sparse &A) { fflas_delete(A.col); fflas_delete(A.row); } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(rowdim, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); if (A.kmax > A.maxrow) A.delayed = true; A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.row = fflas_new(nnz, Alignment::CACHE_LINE); A.dat = fflas_new(F, nnz, 1, Alignment::CACHE_LINE); for (uint64_t i = 0; i < nnz; ++i) { A.col[i] = 
(index_t)col[i]; A.row[i] = (index_t)row[i]; A.dat[i] = dat[i]; } } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(A.m, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); if (A.kmax > A.maxrow) A.delayed = true; A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.row = fflas_new(nnz, Alignment::CACHE_LINE); for (uint64_t i = 0; i < nnz; ++i) { A.col[i] = (index_t)col[i]; A.row[i] = (index_t)row[i]; } } } #endif // __FFLASFFPACK_fflas_sparse_coo_spmv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr.h000066400000000000000000000061201274716147400231020ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_fspmv_CSR.inl * NO DOC */ #ifndef __FFLASFFPACK_fflas_sparse_CSR_H #define __FFLASFFPACK_fflas_sparse_CSR_H namespace FFLAS { /* CSR */ template struct Sparse<_Field, SparseMatrix_t::CSR> { using Field = _Field; bool delayed = false; uint64_t kmax = 0; index_t m = 0; index_t n = 0; uint64_t nnz = 0; uint64_t nElements = 0; uint64_t maxrow = 0; index_t *col = nullptr; index_t *st = nullptr; index_t *stend = nullptr; typename _Field::Element_ptr dat; }; template struct Sparse<_Field, SparseMatrix_t::CSR_ZO> : public Sparse<_Field, SparseMatrix_t::CSR> { using Field = _Field; int64_t cst = 1; bool delayed = false; }; template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template inline void sparse_delete(const Sparse &A); template inline void sparse_delete(const Sparse &A); } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/csr/csr_utils.inl" #include "fflas-ffpack/fflas/fflas_sparse/csr/csr_spmv.inl" #include "fflas-ffpack/fflas/fflas_sparse/csr/csr_spmm.inl" #if defined(__FFLASFFPACK_USE_OPENMP) || defined(__FFLASFFPACK_USE_TBB) #include "fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmv.inl" #include "fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmm.inl" #endif #endif // __FFLASFFPACK_fflas_sparse_CSR_Hfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr/000077500000000000000000000000001274716147400227325ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr/Makefile.am000066400000000000000000000021631274716147400247700ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/csr pkgincludesub_HEADERS= \ csr_spmv.inl \ csr_spmm.inl \ csr_pspmv.inl \ csr_pspmm.inl \ csr_utils.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmm.inl000066400000000000000000001120711274716147400254430ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_CSR_pspmm_INL #define __FFLASFFPACK_fflas_sparse_CSR_pspmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); size_t m = A.m; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(CONSTREFERENCE(F) MODE(READ(dat, col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], dat[j], x[col[j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], dat[j], x[col[j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], dat[j], x[col[j] * ldx + k + 2]); F.axpyin(y[i * ldy + k + 3], dat[j], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], dat[j], x[col[j] * ldx + k]); } } } ); ); ); } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); size_t m 
= A.m; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(dat, col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; y[i * ldy + k + 1] += dat[j] * x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] += dat[j] * x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] += dat[j] * x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } ); ); ); /* for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { } for (; k < blockSize; ++k) y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } */ } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; size_t m = A.m; vect_t y1, x1, y2, x2, vdat; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(dat, col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { uint32_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, 
vdat); y2 = simd::fmadd(y2, x2, vdat); simd::store(y + i * ldy + k, y1); simd::store(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::store(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } } ); ); ); /* for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::store(y + i * ldy + k, y1); simd::store(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::store(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } //*/ } template inline void pfspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; size_t m = A.m; vect_t y1, x1, y2, x2, vdat; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(dat, col, 
st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { uint32_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::storeu(y + i * ldy + k, y1); simd::storeu(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::storeu(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } } ); ); ); /* for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2, dat; size_t k = 0; dat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, dat); y2 = simd::fmadd(y2, x2, dat); simd::storeu(y + i * ldy + k, y1); simd::storeu(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, dat); simd::storeu(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } */ } #endif template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(st, A.st, 
(size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { for (size_t k = 0; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } // TODO : replace with freduce FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } for (; j < j_end; ++j) { for (size_t k = 0; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); 
x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::storeu(y + i * ldy + k, y1); simd::storeu(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::storeu(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } // TODO : replace with freduce FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } for (; j < j_end; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::storeu(y + i * ldy + k, y1); simd::storeu(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::storeu(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } } template inline void pfspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); 
assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::store(y + i * ldy + k, y1); simd::store(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::store(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } // TODO : replace with freduce FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } for (; j < j_end; ++j) { vect_t y1, x1, y2, x2, vdat; y1 = simd::zero(); y2 = simd::zero(); size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::store(y + i * ldy + k, y1); simd::store(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); y1 = simd::fmadd(y1, 
x1, vdat); simd::store(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } } #endif // SIMD template inline void pfspmm_one(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); // for (index_t i = 0; i < A.m; ++i) { index_t am=A.m; SYNCH_GROUP( FORBLOCK1D(it, am, SPLITTER(NUM_THREADS), TASK(MODE(CONSTREFERENCE(F) READ(/*dat,*/ col, st, x) READWRITE(y)), for (index_t i = it.begin(); i < it.end(); ++i) { auto start = st[i]; auto stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[i * ldy + k], x[col[j] * ldx + k]); F.addin(y[i * ldy + k + 1], x[col[j] * ldx + k + 1]); F.addin(y[i * ldy + k + 2], x[col[j] * ldx + k + 2]); F.addin(y[i * ldy + k + 3], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.addin(y[i * ldy + k], x[col[j] * ldx + k]); } } ); ); ); // } } template inline void pfspmm_mone(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); // #pragma omp parallel for schedule(static, 32) // for (index_t i = 0; i < A.m; ++i) { index_t am=A.m; SYNCH_GROUP( FORBLOCK1D(it, am, SPLITTER(NUM_THREADS), TASK(MODE(CONSTREFERENCE(F) READ(/*dat,*/ col, st, x) 
READWRITE(y)), for (index_t i = it.begin(); i < it.end(); ++i) { auto start = st[i]; auto stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * ldy + k], x[col[j] * ldx + k]); F.subin(y[i * ldy + k + 1], x[col[j] * ldx + k + 1]); F.subin(y[i * ldy + k + 2], x[col[j] * ldx + k + 2]); F.subin(y[i * ldy + k + 3], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * ldy + k], x[col[j] * ldx + k]); } } ); ); ); // } } template inline void pfspmm_one(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); size_t m = A.m; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(/*dat,*/ col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += x[col[j] * ldx + k]; y[i * ldy + k + 1] += x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] += x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] += x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[j] * ldx + k]; } } } ); ); ); /* for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += x[col[j] * ldx + k]; y[i * ldy + k + 1] += x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] += x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] += x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[j] * ldx + k]; } } */ } template inline void pfspmm_mone(const Field &F, const Sparse &A, size_t blockSize, 
typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); size_t m = A.m; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(/*dat,*/ col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] -= x[col[j] * ldx + k]; y[i * ldy + k + 1] -= x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] -= x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] -= x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[j] * ldx + k]; } } } ); ); ); /* for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] -= x[col[j] * ldx + k]; y[i * ldy + k + 1] -= x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] -= x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] -= x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[j] * ldx + k]; } } */ } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm_one_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; //* size_t m = A.m; vect_t y1, x1, y2, x2, vdat; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(/*dat,*/ col, st, x) 
READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { uint32_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::add(y1, x1)); simd::store(y + i * ldy + k + simd::vect_size, simd::add(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::add(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } } ); ); ); //*/ /* #pragma omp parallel for schedule(static, 256) for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::add(y1, x1)); simd::store(y + i * ldy + k + simd::vect_size, simd::add(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::add(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } //*/ } template inline void pfspmm_one_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, 
(size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; vect_t y1, x1, y2, x2, vdat; size_t m = A.m; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(/*dat,*/ col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { uint32_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::add(y1, x1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::add(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::add(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } } ); ); ); /* for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::add(y1, x1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::add(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::add(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } */ } template inline void pfspmm_mone_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int 
ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; //* size_t m = A.m; vect_t y1, x1, y2, x2, vdat; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(/*dat,*/ col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { uint32_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::sub(y1, x1)); simd::store(y + i * ldy + k + simd::vect_size, simd::sub(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::sub(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } } ); ); ); //*/ /* #pragma omp parallel for schedule(static, 256) for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::sub(y1, x1)); simd::store(y + i * ldy + k + simd::vect_size, simd::sub(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); 
simd::store(y + i * ldy + k, simd::sub(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] -= x[col[j] * ldx + k]; } } } //*/ } template inline void pfspmm_mone_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; size_t m = A.m; vect_t y1, x1, y2, x2, vdat; SYNCH_GROUP( FORBLOCK1D(it, m, SPLITTER(NUM_THREADS), TASK(MODE(READ(/*dat,*/ col, st, x) READWRITE(y)), { for (index_t i = it.begin(); i < it.end(); ++i) { for (index_t j = st[i]; j < st[i + 1]; ++j) { uint32_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::sub(y1, x1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::sub(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::sub(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } } ); ); ); /* for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, 
simd::sub(y1, x1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::sub(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::sub(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] -= x[col[j] * ldx + k]; } } } */ } #endif //__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS } // CSR_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_spmm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr/csr_pspmv.inl000066400000000000000000000410331274716147400254530ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_CSR_pspmv_INL #define __FFLASFFPACK_fflas_sparse_CSR_pspmv_INL #ifdef __FFLASFFPACK_USE_TBB #include "tbb/parallel_for.h" #include "tbb/blocked_range.h" #endif #include namespace FFLAS { namespace sparse_details_impl { template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, dat, col, st](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(diff, 4); j += 4) { F.axpyin(y1, dat[start + j], x[col[start + j]]); F.axpyin(y2, dat[start + j + 1], x[col[start + j + 1]]); F.axpyin(y3, dat[start + j + 2], x[col[start + j + 2]]); F.axpyin(y4, dat[start + j + 3], x[col[start + j + 3]]); } for (; j < diff; ++j) { F.axpyin(y1, dat[start + j], x[col[start + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } }); #else // The minimum size has to be a multiple of cache_line/sizeof(Element) to avoid // cache coherency problem (ex: 8 for double, 16 for float) #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, 
F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(diff, 4); j += 4) { F.axpyin(y1, dat[start + j], x[col[start + j]]); F.axpyin(y2, dat[start + j + 1], x[col[start + j + 1]]); F.axpyin(y3, dat[start + j + 2], x[col[start + j + 2]]); F.axpyin(y4, dat[start + j + 3], x[col[start + j + 3]]); } for (; j < diff; ++j) { F.axpyin(y1, dat[start + j], x[col[start + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } #endif } template inline void pfspmv_task(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const index_t iStart, const index_t iStop, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for(index_t i = iStart ; i < iStop ; ++i){ auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += dat[start + j] * x[col[start + j]]; y2 += dat[start + j + 1] * x[col[start + j + 1]]; y3 += dat[start + j + 2] * x[col[start + j + 2]]; y4 += dat[start + j + 3] * x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += dat[start + j] * x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / 
sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, dat, col, st](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += dat[start + j] * x[col[start + j]]; y2 += dat[start + j + 1] * x[col[start + j + 1]]; y3 += dat[start + j + 2] * x[col[start + j + 2]]; y4 += dat[start + j + 3] * x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += dat[start + j] * x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } }); #else // #pragma omp parallel for schedule(static, 8) // for (index_t i = 0; i < A.m; ++i) { // auto start = st[i], stop = st[i + 1]; // index_t j = 0; // index_t diff = stop - start; // typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; // for (; j < ROUND_DOWN(diff, 4); j += 4) { // y1 += dat[start + j] * x[col[start + j]]; // y2 += dat[start + j + 1] * x[col[start + j + 1]]; // y3 += dat[start + j + 2] * x[col[start + j + 2]]; // y4 += dat[start + j + 3] * x[col[start + j + 3]]; // } // for (; j < diff; ++j) { // y1 += dat[start + j] * x[col[start + j]]; // } // y[i] += y1 + y2 + y3 + y4; // } std::vector pool(6); #endif } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const int64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, kmax, dat, col, st](const tbb::blocked_range &r) { for (index_t i = 
r.begin(), end = r.end(); i < end; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { y[i] += dat[j] * x[col[j]]; } F.reduce(y[i]); } for (; j < j_end; ++j) { y[i] += dat[j] * x[col[j]]; } F.reduce(y[i]); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { y[i] += dat[j] * x[col[j]]; } F.reduce(y[i]); } for (; j < j_end; ++j) { y[i] += dat[j] * x[col[j]]; } F.reduce(y[i]); } #endif } template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); size_t am = A.m; SYNCH_GROUP( FORBLOCK1D(it, am, SPLITTER(NUM_THREADS), TASK(MODE(CONSTREFERENCE(F) READ(col, st, x) READWRITE(y)), for (index_t i = it.begin(); i < it.end(); ++i) { auto start = st[i]; auto stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1; typename Field::Element y2; typename Field::Element y3; typename Field::Element y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(diff, 4); j += 4) { F.addin(y1, x[col[start + j]]); F.addin(y2, x[col[start + j + 1]]); F.addin(y3, x[col[start + j + 2]]); F.addin(y4, x[col[start + j + 3]]); } for (; j < diff; ++j) { F.addin(y1, x[col[start + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } ); ); ); } template inline void pfspmv_mone(const Field &F, const Sparse &A, 
typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); size_t am = A.m; SYNCH_GROUP( FORBLOCK1D(it, am, SPLITTER(NUM_THREADS), TASK(MODE(CONSTREFERENCE(F) READ(col, st, x) READWRITE(y)), for (index_t i = it.begin(); i < it.end(); ++i) { auto start = st[i]; auto stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1; typename Field::Element y2; typename Field::Element y3; typename Field::Element y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(diff, 4); j += 4) { F.addin(y1, x[col[start + j]]); F.addin(y2, x[col[start + j + 1]]); F.addin(y3, x[col[start + j + 2]]); F.addin(y4, x[col[start + j + 3]]); } for (; j < diff; ++j) { F.addin(y1, x[col[start + j]]); } F.subin(y[i], y1); F.subin(y[i], y2); F.subin(y[i], y3); F.subin(y[i], y4); } ); ); ); } template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, col, st](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += 
x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } #endif } template inline void pfspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, col, st](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] -= y1 + y2 + y3 + y4; } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += 
x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] -= y1 + y2 + y3 + y4; } #endif } } // CSR_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_pspmv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr/csr_spmm.inl000066400000000000000000000672111274716147400252700ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_CSR_spmm_INL #define __FFLASFFPACK_fflas_sparse_CSR_spmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], dat[j], x[col[j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], dat[j], x[col[j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], dat[j], x[col[j] * ldx + k + 2]); F.axpyin(y[i * ldy + k + 3], dat[j], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], dat[j], x[col[j] * ldx + k]); } } } template inline void fspmm(const Field &F, const Sparse &A, index_t blockSize, typename Field::ConstElement_ptr x_, index_t ldx, typename Field::Element_ptr y_, index_t ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; y[i * ldy + k + 1] += dat[j] * x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] += dat[j] * x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] += dat[j] * x[col[j] * 
ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } #ifdef __FFLASFFPACK_HAVE_MKL inline void fspmm_mkl(const Givaro::DoubleDomain &F, const Sparse &A, index_t blockSize, Givaro::DoubleDomain::ConstElement_ptr x_, index_t ldx, Givaro::DoubleDomain::Element_ptr y_, index_t ldy, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); mkl_dcsrmm(MKL_CONFIG::trans, &A.m , &blockSize, &A.n, &MKL_CONFIG::dalpha, MKL_CONFIG::metaChar, A.dat, A.col, A.st, A.st+1, x_, &ldx, &MKL_CONFIG::dbeta, y_ , &ldy); // void mkl_dcsrmm (char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, double *alpha, char *matdescra, double *val, MKL_INT *indx, MKL_INT *pntrb, MKL_INT *pntre, double *b, MKL_INT *ldb, double *beta, double *c, MKL_INT *ldc); } inline void fspmm_mkl(const Givaro::FloatDomain &F, const Sparse &A, index_t blockSize, Givaro::FloatDomain::ConstElement_ptr x_, index_t ldx, Givaro::FloatDomain::Element_ptr y_, index_t ldy, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); mkl_scsrmm(MKL_CONFIG::trans, &A.m , &blockSize, &A.n, &MKL_CONFIG::salpha, MKL_CONFIG::metaChar, A.dat, A.col, A.st, A.st+1, x_, &ldx, &MKL_CONFIG::sbeta, y_ , &ldy); // void mkl_scsrmm (char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, float *alpha, char *matdescra, float *val, MKL_INT *indx, MKL_INT *pntrb, MKL_INT *pntre, float *b, MKL_INT *ldb, float *beta, float *c, MKL_INT *ldc);i } #endif // __FFLASFFPACK_HAVE_MKL // 
#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { // std::cout << "spmm simd Unparam aligned" << std::endl; assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::store(y + i * ldy + k, y1); simd::store(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::store(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { // std::cout << "spmm simd Unparam unaligned" << std::endl; assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); 
assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + A.col[j] * ldx + k); x2 = simd::loadu(x + A.col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::storeu(y + i * ldy + k, y1); simd::storeu(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::storeu(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } } } // #endif template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { for (size_t k = 0; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } // TODO : replace with freduce FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } 
for (; j < j_end; ++j) { for (size_t k = 0; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::storeu(y + i * ldy + k, y1); simd::storeu(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::storeu(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } // TODO : replace with freduce // FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } for (; j < j_end; ++j) { 
vect_t y1, x1, y2, x2, vdat; y1 = simd::zero(); y2 = simd::zero(); size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::storeu(y + i * ldy + k, y1); simd::storeu(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::storeu(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } } template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + 
simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::store(y + i * ldy + k, y1); simd::store(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::store(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } // TODO : replace with freduce FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } for (; j < j_end; ++j) { vect_t y1, x1, y2, x2, vdat; size_t k = 0; vdat = simd::set1(dat[j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); y1 = simd::fmadd(y1, x1, vdat); y2 = simd::fmadd(y2, x2, vdat); simd::store(y + i * ldy + k, y1); simd::store(y + i * ldy + k + simd::vect_size, y2); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); y1 = simd::fmadd(y1, x1, vdat); simd::store(y + i * ldy + k, y1); } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[j] * x[col[j] * ldx + k]; } } FFLAS::freduce(F,blockSize,y+i*ldy,1); // for (size_t k = 0; k < blockSize; ++k) { // F.reduce(y[i * ldy + k]); // } } } #endif // SIMD template inline void fspmm_one(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto 
start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[i * ldy + k], x[col[j] * ldx + k]); F.addin(y[i * ldy + k + 1], x[col[j] * ldx + k + 1]); F.addin(y[i * ldy + k + 2], x[col[j] * ldx + k + 2]); F.addin(y[i * ldy + k + 3], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.addin(y[i * ldy + k], x[col[j] * ldx + k]); } } } template inline void fspmm_mone(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * ldy + k], x[col[j] * ldx + k]); F.subin(y[i * ldy + k + 1], x[col[j] * ldx + k + 1]); F.subin(y[i * ldy + k + 2], x[col[j] * ldx + k + 2]); F.subin(y[i * ldy + k + 3], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * ldy + k], x[col[j] * ldx + k]); } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_one_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 
0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::add(y1, x1)); simd::store(y + i * ldy + k + simd::vect_size, simd::add(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::add(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } } template inline void fspmm_one_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::add(y1, x1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::add(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::add(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] += x[col[j] * ldx + k]; } } } } template inline void fspmm_mone_simd_aligned(const Field &F, 
const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::load(y+i*ldy+k); y2 = simd::load(y+i*ldy+k+simd::vect_size); x1 = simd::load(x + col[j] * ldx + k); x2 = simd::load(x + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::sub(y1, x1)); simd::store(y + i * ldy + k + simd::vect_size, simd::sub(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::load(y+i*ldy+k); x1 = simd::load(x + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::sub(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] -= x[col[j] * ldx + k]; } } } } template inline void fspmm_mone_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; for (index_t j = start; j < stop; ++j) { vect_t y1, x1, y2, x2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); y2 = 
simd::loadu(y+i*ldy+k+simd::vect_size); x1 = simd::loadu(x + col[j] * ldx + k); x2 = simd::loadu(x + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::sub(y1, x1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::sub(y2, x2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { y1 = simd::loadu(y+i*ldy+k); x1 = simd::loadu(x + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::sub(y1, x1)); } for (; k < blockSize; ++k) { y[i * ldy + k] -= x[col[j] * ldx + k]; } } } } // #endif //__FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS } // CSR_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_spmm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr/csr_spmv.inl000066400000000000000000000324531274716147400253010ustar00rootroot00000000000000 /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_CSR_spmv_INL #define __FFLASFFPACK_fflas_sparse_CSR_spmv_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(diff, 4); j += 4) { F.axpyin(y1, dat[start + j], x[col[start + j]]); F.axpyin(y2, dat[start + j + 1], x[col[start + j + 1]]); F.axpyin(y3, dat[start + j + 2], x[col[start + j + 2]]); F.axpyin(y4, dat[start + j + 3], x[col[start + j + 3]]); } for (; j < diff; ++j) { F.axpyin(y1, dat[start + j], x[col[start + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } } #if 0 template inline void fspmv_task(const Field &F, const index_t start_, const index_t size_ const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = start_; i < start_+size_; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, 
F.zero); for (; j < ROUND_DOWN(diff, 4); j += 4) { F.axpyin(y1, dat[start + j], x[col[start + j]]); F.axpyin(y2, dat[start + j + 1], x[col[start + j + 1]]); F.axpyin(y3, dat[start + j + 2], x[col[start + j + 2]]); F.axpyin(y4, dat[start + j + 3], x[col[start + j + 3]]); } for (; j < diff; ++j) { F.axpyin(y1, dat[start + j], x[col[start + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } } #endif template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += dat[start + j] * x[col[start + j]]; y2 += dat[start + j + 1] * x[col[start + j + 1]]; y3 += dat[start + j + 2] * x[col[start + j + 2]]; y4 += dat[start + j + 3] * x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += dat[start + j] * x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } } #ifdef __FFLASFFPACK_HAVE_MKL inline void fspmv_mkl(const Givaro::DoubleDomain &F, const Sparse &A, Givaro::DoubleDomain::ConstElement_ptr x_, Givaro::DoubleDomain::Element_ptr y_, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); mkl_dcsrmv(MKL_CONFIG::trans, &A.m , &A.n, &MKL_CONFIG::dalpha, MKL_CONFIG::metaChar, A.dat, A.col, 
A.st, A.st+1, x_, &MKL_CONFIG::dbeta, y_ ); // void mkl_dcsrmv (char *transa, MKL_INT *m, MKL_INT *k, double *alpha, char *matdescra, double *val, MKL_INT *indx, MKL_INT *pntrb, MKL_INT *pntre, double *x, double *beta, double *y); } inline void fspmv_mkl(const Givaro::FloatDomain &F, const Sparse &A, Givaro::FloatDomain::ConstElement_ptr x_, Givaro::FloatDomain::Element_ptr y_, FieldCategories::UnparametricTag) { // assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); // assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); // assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); // assume_aligned(x, x_, (size_t)Alignment::DEFAULT); // assume_aligned(y, y_, (size_t)Alignment::DEFAULT); mkl_scsrmv(MKL_CONFIG::trans, &A.m , &A.n, &MKL_CONFIG::salpha, MKL_CONFIG::metaChar, A.dat, A.col, A.st, A.st+1, x_, &MKL_CONFIG::sbeta, y_ ); // void mkl_scsrmv (char *transa, MKL_INT *m, MKL_INT *k, float *alpha, char *matdescra, float *val, MKL_INT *indx, MKL_INT *pntrb, MKL_INT *pntre, float *x, float *beta, float *y); } #endif // __FFLASFFPACK_HAVE_MKL #if 0 template inline void fspmv_task(const Field &F, const index_t start_, const index_t size_, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = start_; i < start_+size_; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += dat[start + j] * x[col[start + j]]; y2 += dat[start + j + 1] * x[col[start + j + 1]]; y3 += dat[start + j + 2] * x[col[start + j + 2]]; y4 += dat[start + j + 3] * x[col[start + j + 3]]; } for (; j 
< diff; ++j) { y1 += dat[start + j] * x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } } #endif template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const int64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { index_t j = st[i]; index_t j_loc = j; index_t j_end = st[i + 1]; index_t block = (j_end - j_loc) / kmax; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { y[i] += dat[j] * x[col[j]]; } F.reduce(y[i]); } for (; j < j_end; ++j) { y[i] += dat[j] * x[col[j]]; } F.reduce(y[i]); } } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { F.addin(y1, x[col[start + j]]); F.addin(y2, x[col[start + j + 1]]); F.addin(y3, x[col[start + j + 2]]); F.addin(y4, x[col[start + j + 3]]); } for (; j < diff; ++j) { F.addin(y1, x[col[start + j]]); } F.addin(y[i], y1 + y2 + y3 + y4); } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); 
assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { F.addin(y1, x[col[start + j]]); F.addin(y2, x[col[start + j + 1]]); F.addin(y3, x[col[start + j + 2]]); F.addin(y4, x[col[start + j + 3]]); } for (; j < diff; ++j) { F.addin(y1, x[col[start + j]]); } F.subin(y[i], y1 + y2 + y3 + y4); } } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { auto start = st[i], stop = st[i + 1]; index_t j = 0; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += 
x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] -= y1 + y2 + y3 + y4; } } } // sparse_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_spmv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr/csr_utils.inl000066400000000000000000000203131274716147400254440ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ namespace FFLAS { template inline void sparse_delete(const Sparse &A) { fflas_delete(A.dat); fflas_delete(A.col); fflas_delete(A.st); } template inline void sparse_delete(const Sparse &A) { fflas_delete(A.col); fflas_delete(A.st); } template inline std::ostream& sparse_print(std::ostream& os, const Sparse &A) { // for (size_t i = 0; i <= A.m; ++i) // std::cout << A.st[i] << " "; // std::cout << std::endl; for (index_t i = 0; i < A.m; ++i) { auto start = A.st[i], stop = A.st[i + 1]; index_t j = 0; index_t diff = stop - start; os << i << " : "; for (; j < diff; ++j) { os << '(' << A.col[start + j] << ',' << A.dat[start+j] << ") "; } os << std::endl; } return os; } template inline void sparse_init(const Givaro::Modular &F, Sparse, SparseMatrix_t::CSR> &A, const IndexT *row, const IndexT *col, Givaro::Integer* dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(rowdim, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.delayed = true; A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.st = fflas_new(rowdim + 1, Alignment::CACHE_LINE); A.dat = fflas_new(F, nnz, 1, Alignment::CACHE_LINE); for(size_t i = 0 ; i < nnz ; ++i){ if(col[i] >= coldim){ std::cout << "Error col index too big" << std::endl; } } for (size_t i = 0; i < nnz; ++i) { A.col[i] = static_cast(col[i]); A.dat[i] = dat[i]; } A.st[0] = 0; for (size_t i = 1; i <= rowdim; ++i) { A.st[i] = A.st[i - 1] + rows[i - 1]; } } template inline void sparse_init(const Givaro::ZRing &F, Sparse, SparseMatrix_t::CSR_ZO> &A, const IndexT *row, const IndexT *col, Givaro::Integer* dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(rowdim, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.delayed = true; A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.st = fflas_new(rowdim + 1, Alignment::CACHE_LINE); for(size_t i = 0 ; i < nnz ; ++i){ if(col[i] >= 
coldim){ std::cout << "Error col index too big" << std::endl; } } for (size_t i = 0; i < nnz; ++i) { A.col[i] = static_cast(col[i]); } A.st[0] = 0; for (size_t i = 1; i <= rowdim; ++i) { A.st[i] = A.st[i - 1] + rows[i - 1]; } } template inline void sparse_init(const Givaro::ZRing> &F, Sparse>, SparseMatrix_t::CSR_ZO> &A, const IndexT *row, const IndexT *col, typename Givaro::ZRing>::Element_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(rowdim, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.delayed = true; A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.st = fflas_new(rowdim + 1, Alignment::CACHE_LINE); for(size_t i = 0 ; i < nnz ; ++i){ if(col[i] >= coldim){ std::cout << "Error col index too big" << std::endl; } } for (size_t i = 0; i < nnz; ++i) { A.col[i] = static_cast(col[i]); } A.st[0] = 0; for (size_t i = 1; i <= rowdim; ++i) { A.st[i] = A.st[i - 1] + rows[i - 1]; } } template inline void sparse_init(const Givaro::ZRing> &F, Sparse>, SparseMatrix_t::CSR> &A, const IndexT *row, const IndexT *col, typename Givaro::ZRing>::Element_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(rowdim, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.delayed = true; A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.st = fflas_new(rowdim + 1, Alignment::CACHE_LINE); for(size_t i = 0 ; i < nnz ; ++i){ if(col[i] >= coldim){ std::cout << "Error col index too big" << std::endl; } } for (size_t i = 0; i < nnz; ++i) { A.col[i] = static_cast(col[i]); } A.st[0] = 0; for (size_t i = 1; i <= rowdim; ++i) { A.st[i] = A.st[i - 1] + rows[i - 1]; } } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; 
A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(rowdim, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); if (A.kmax > A.maxrow) A.delayed = true; A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.st = fflas_new(rowdim + 1, Alignment::CACHE_LINE); A.stend = fflas_new(rowdim + 1, Alignment::CACHE_LINE); A.dat = fflas_new(F, nnz, 1, Alignment::CACHE_LINE); for (size_t i = 0; i < nnz; ++i) { A.col[i] = static_cast(col[i]); A.dat[i] = dat[i]; } A.st[0] = 0; for (size_t i = 1; i <= rowdim; ++i) { A.st[i] = A.st[i - 1] + rows[i - 1]; } for(size_t i = 0 ; i < rowdim ; ++i){ A.stend[i] = A.st[i+1]; } } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.delayed = true; A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector rows(A.m, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.st = fflas_new(rowdim + 1, Alignment::CACHE_LINE); for (size_t i = 0; i < nnz; ++i) { A.col[i] = static_cast(col[i]); } for (size_t i = 0; i <= rowdim; ++i) { A.st[i] = 0; } for (size_t i = 0; i < nnz; ++i) { A.st[row[i] + 1]++; } for (size_t i = 1; i <= rowdim; ++i) { A.st[i] += A.st[i - 1]; } } } fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb.h000066400000000000000000000047731274716147400237600ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_fspmv_CSR_HYB.inl * NO DOC */ #ifndef __FFLASFFPACK_fflas_sparse_CSR_HYB_H #define __FFLASFFPACK_fflas_sparse_CSR_HYB_H namespace FFLAS { /* CSR_HYB */ template struct Sparse<_Field, SparseMatrix_t::CSR_HYB> { using Field = _Field; bool delayed = false; index_t *col = nullptr; index_t *st = nullptr; typename _Field::Element_ptr dat; uint64_t kmax = 0; index_t m = 0; index_t n = 0; uint64_t nnz = 0; uint64_t nElements = 0; uint64_t maxrow = 0; uint64_t nOnes = 0; uint64_t nMOnes = 0; uint64_t nOthers = 0; }; template void sparse_delete(const Sparse &A); template void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_utils.inl" #include "fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmv.inl" #if defined(__FFLASFFPACK_USE_OPENMP) #include "fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmv.inl" #endif #include "fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmm.inl" // #include "fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmm.inl" #endif // 
__FFLASFFPACK_fflas_sparse_CSR_HYB_Hfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb/000077500000000000000000000000001274716147400235745ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb/Makefile.am000066400000000000000000000022131274716147400256260ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/csr_hyb pkgincludesub_HEADERS= \ csr_hyb_spmv.inl \ csr_hyb_spmm.inl \ csr_hyb_pspmv.inl \ csr_hyb_pspmm.inl \ csr_hyb_utils.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmm.inl000066400000000000000000001073101274716147400271470ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_CSR_HYB_pspmm_INL #define __FFLASFFPACK_fflas_sparse_CSR_HYB_pspmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * blockSize + k], x[A.col[j] * blockSize + k]); F.subin(y[i * blockSize + k + 1], x[A.col[j] * blockSize + k + 1]); F.subin(y[i * blockSize + k + 2], x[A.col[j] * blockSize + k + 2]); F.subin(y[i * blockSize + k + 3], x[A.col[j] * blockSize + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * blockSize + k], x[A.col[j] * blockSize + k]); } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[i * blockSize + k], x[A.col[j] * 
blockSize + k]); F.addin(y[i * blockSize + k + 1], x[A.col[j] * blockSize + k + 1]); F.addin(y[i * blockSize + k + 2], x[A.col[j] * blockSize + k + 2]); F.addin(y[i * blockSize + k + 3], x[A.col[j] * blockSize + k + 3]); } for (; k < blockSize; ++k) F.addin(y[i * blockSize + k], x[A.col[j] * blockSize + k]); } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * blockSize + k], A.dat[startDat + k], x[A.col[j] * blockSize + k]); F.axpyin(y[i * blockSize + k + 1], A.dat[startDat + k], x[A.col[j] * blockSize + k + 1]); F.axpyin(y[i * blockSize + k + 2], A.dat[startDat + k], x[A.col[j] * blockSize + k + 2]); F.axpyin(y[i * blockSize + k + 3], A.dat[startDat + k], x[A.col[j] * blockSize + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * blockSize + k], A.dat[startDat + k], x[A.col[j] * blockSize + k]); } } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * blockSize + k], x[A.col[j] * blockSize + k]); F.subin(y[i * blockSize + k + 1], x[A.col[j] * blockSize + k + 1]); F.subin(y[i * blockSize + k + 2], x[A.col[j] * blockSize + k + 2]); F.subin(y[i * blockSize + k + 3], x[A.col[j] * blockSize + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * blockSize + k], x[A.col[j] * blockSize + k]); } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[i * blockSize + k], x[A.col[j] * blockSize + k]); F.addin(y[i * blockSize + k + 1], x[A.col[j] * blockSize + k + 1]); F.addin(y[i * blockSize + k + 2], x[A.col[j] * blockSize + k + 2]); F.addin(y[i * blockSize + k + 3], x[A.col[j] * blockSize + k + 3]); } for (; k < blockSize; ++k) 
F.addin(y[i * blockSize + k], x[A.col[j] * blockSize + k]); } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * blockSize + k], A.dat[startDat + k], x[A.col[j] * blockSize + k]); F.axpyin(y[i * blockSize + k + 1], A.dat[startDat + k], x[A.col[j] * blockSize + k + 1]); F.axpyin(y[i * blockSize + k + 2], A.dat[startDat + k], x[A.col[j] * blockSize + k + 2]); F.axpyin(y[i * blockSize + k + 3], A.dat[startDat + k], x[A.col[j] * blockSize + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * blockSize + k], A.dat[startDat + k], x[A.col[j] * blockSize + k]); } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * ldy + k], x[A.col[j] * ldx + k]); F.subin(y[i * ldy + k + 1], x[A.col[j] * ldx + k + 1]); F.subin(y[i * ldy + k + 2], x[A.col[j] * ldx + k + 2]); F.subin(y[i * ldy + k + 3], x[A.col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * ldy + k], x[A.col[j] * ldx + k]); } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[i * ldy + k], x[A.col[j] * ldx + k]); F.addin(y[i * ldy + k + 1], x[A.col[j] * ldx + k + 1]); F.addin(y[i * ldy + k + 2], x[A.col[j] * ldx + k + 2]); F.addin(y[i * ldy + k + 3], x[A.col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.addin(y[i * ldy + k], 
x[A.col[j] * ldx + k]); } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], A.dat[startDat + k], x[A.col[j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], A.dat[startDat + k], x[A.col[j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], A.dat[startDat + k], x[A.col[j] * ldx + k + 2]); F.axpyin(y[i * ldy + k + 3], A.dat[startDat + k], x[A.col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], A.dat[startDat + k], x[A.col[j] * ldx + k]); } } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * ldy + k], x[A.col[j] * ldx + k]); F.subin(y[i * ldy + k + 1], x[A.col[j] * ldx + k + 1]); F.subin(y[i * ldy + k + 2], x[A.col[j] * ldx + k + 2]); F.subin(y[i * ldy + k + 3], x[A.col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * ldy + k], x[A.col[j] * ldx + k]); } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[i * ldy + k], x[A.col[j] * ldx + k]); F.addin(y[i * ldy + k + 1], x[A.col[j] * ldx + k + 1]); F.addin(y[i * ldy + k + 2], x[A.col[j] * ldx + k + 2]); F.addin(y[i * ldy + k + 3], x[A.col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.addin(y[i * ldy + k], x[A.col[j] * ldx + k]); } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], A.dat[startDat + k], x[A.col[j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], A.dat[startDat + k], x[A.col[j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], A.dat[startDat + k], x[A.col[j] * ldx + 
k + 2]); F.axpyin(y[i * ldy + k + 3], A.dat[startDat + k], x[A.col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], A.dat[startDat + k], x[A.col[j] * ldx + k]); } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] -= x[A.col[j] * blockSize + k]; y[i * blockSize + k + 1] -= x[A.col[j] * blockSize + k + 1]; y[i * blockSize + k + 2] -= x[A.col[j] * blockSize + k + 2]; y[i * blockSize + k + 3] -= x[A.col[j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] -= x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += x[A.col[j] * blockSize + k]; y[i * blockSize + k + 1] += x[A.col[j] * blockSize + k + 1]; y[i * blockSize + k + 2] += x[A.col[j] * blockSize + k + 2]; y[i * blockSize + k + 3] += x[A.col[j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] += x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[startDat + j] * x[A.col[j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[startDat + j] * x[A.col[j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[startDat + j] * x[A.col[j] * blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[startDat + 
j] * x[A.col[j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] += A.dat[startDat + j] * x[A.col[j] * blockSize + k]; } } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] -= x[A.col[j] * blockSize + k]; y[i * blockSize + k + 1] -= x[A.col[j] * blockSize + k + 1]; y[i * blockSize + k + 2] -= x[A.col[j] * blockSize + k + 2]; y[i * blockSize + k + 3] -= x[A.col[j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] -= x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += x[A.col[j] * blockSize + k]; y[i * blockSize + k + 1] += x[A.col[j] * blockSize + k + 1]; y[i * blockSize + k + 2] += x[A.col[j] * blockSize + k + 2]; y[i * blockSize + k + 3] += x[A.col[j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] += x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[startDat + j] * x[A.col[j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[startDat + j] * x[A.col[j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[startDat + j] * x[A.col[j] * blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[startDat + j] * x[A.col[j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] += A.dat[startDat + j] * x[A.col[j] * blockSize + k]; } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { 
#ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] -= x[A.col[j] * ldx + k]; y[i * ldy + k + 1] -= x[A.col[j] * ldx + k + 1]; y[i * ldy + k + 2] -= x[A.col[j] * ldx + k + 2]; y[i * ldy + k + 3] -= x[A.col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] -= x[A.col[j] * ldx + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += x[A.col[j] * ldx + k]; y[i * ldy + k + 1] += x[A.col[j] * ldx + k + 1]; y[i * ldy + k + 2] += x[A.col[j] * ldx + k + 2]; y[i * ldy + k + 3] += x[A.col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += x[A.col[j] * ldx + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[startDat + j] * x[A.col[j] * ldx + k]; y[i * ldy + k + 1] += A.dat[startDat + j] * x[A.col[j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[startDat + j] * x[A.col[j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[startDat + j] * x[A.col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += A.dat[startDat + j] * x[A.col[j] * ldx + k]; } } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] -= x[A.col[j] * ldx + k]; y[i * ldy + k + 1] -= x[A.col[j] * ldx + k + 1]; y[i * ldy + k + 2] -= x[A.col[j] * ldx + k + 2]; y[i * ldy + k + 3] -= x[A.col[j] * ldx 
+ k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] -= x[A.col[j] * ldx + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += x[A.col[j] * ldx + k]; y[i * ldy + k + 1] += x[A.col[j] * ldx + k + 1]; y[i * ldy + k + 2] += x[A.col[j] * ldx + k + 2]; y[i * ldy + k + 3] += x[A.col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += x[A.col[j] * ldx + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[startDat + j] * x[A.col[j] * ldx + k]; y[i * ldy + k + 1] += A.dat[startDat + j] * x[A.col[j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[startDat + j] * x[A.col[j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[startDat + j] * x[A.col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += A.dat[startDat + j] * x[A.col[j] * ldx + k]; } } #endif } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, LFunc &&lfunc, SFunc &&sfunc, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, lfunc, sfunc](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { vect_t vx1, vx2, vy1, vy2, vdat; index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * blockSize + k); vx2 = lfunc(y + A.col[j] * blockSize + k + 
simd::vect_size); sfunc(y + i * blockSize + k, simd::sub(vy1, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vx1 = lfunc(y + A.col[j] * blockSize + k); sfunc(y + i * blockSize + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * blockSize + k] -= x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * blockSize + k); vx2 = lfunc(y + A.col[j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, simd::add(vy1, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vx1 = lfunc(y + A.col[j] * blockSize + k); sfunc(y + i * blockSize + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * blockSize + k] += x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { for (uint64_t j = start; j < stop; ++j) { size_t k = 0; vdat = simd::set1(A.dat[startDat + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * blockSize + k); vx2 = lfunc(y + A.col[j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vdat, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, simd::fmadd(vy2, vdat, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vx1 = lfunc(y + A.col[j] * 
blockSize + k); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vdat, vx1)); } for (; k < blockSize; ++k) y[i * blockSize + k] -= A.dat[startDat + j] * x[A.col[j] * blockSize + k]; } } } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { vect_t vx1, vx2, vy1, vy2, vdat; index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * blockSize + k); vx2 = lfunc(y + A.col[j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, simd::sub(vy1, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vx1 = lfunc(y + A.col[j] * blockSize + k); sfunc(y + i * blockSize + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * blockSize + k] -= x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * blockSize + k); vx2 = lfunc(y + A.col[j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, simd::add(vy1, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vx1 = lfunc(y + A.col[j] * blockSize + k); sfunc(y + i * blockSize + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * blockSize + k] += x[A.col[j] * blockSize + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < 
stop; ++j) { for (uint64_t j = start; j < stop; ++j) { size_t k = 0; vdat = simd::set1(A.dat[startDat + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * blockSize + k); vx2 = lfunc(y + A.col[j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vdat, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, simd::fmadd(vy2, vdat, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vx1 = lfunc(y + A.col[j] * blockSize + k); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vdat, vx1)); } for (; k < blockSize; ++k) y[i * blockSize + k] -= A.dat[startDat + j] * x[A.col[j] * blockSize + k]; } } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, LFunc &&lfunc, SFunc &&sfunc, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy, lfunc, sfunc](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { vect_t vx1, vx2, vy1, vy2, vdat; index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * ldx + k); vx2 = lfunc(y + A.col[j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::sub(vy1, vx1)); sfunc(y + i * ldy + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vx1 = lfunc(y + A.col[j] * 
ldx + k); sfunc(y + i * ldy + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= x[A.col[j] * ldx + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * ldx + k); vx2 = lfunc(y + A.col[j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::add(vy1, vx1)); sfunc(y + i * ldy + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vx1 = lfunc(y + A.col[j] * ldx + k); sfunc(y + i * ldy + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] += x[A.col[j] * ldx + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { for (uint64_t j = start; j < stop; ++j) { size_t k = 0; vdat = simd::set1(A.dat[startDat + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * ldx + k); vx2 = lfunc(y + A.col[j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); sfunc(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vdat, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vx1 = lfunc(y + A.col[j] * ldx + k); sfunc(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= A.dat[startDat + j] * x[A.col[j] * ldx + k]; } } } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { vect_t vx1, vx2, vy1, vy2, vdat; index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 
* simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * ldx + k); vx2 = lfunc(y + A.col[j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::sub(vy1, vx1)); sfunc(y + i * ldy + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vx1 = lfunc(y + A.col[j] * ldx + k); sfunc(y + i * ldy + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= x[A.col[j] * ldx + k]; } start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * ldx + k); vx2 = lfunc(y + A.col[j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::add(vy1, vx1)); sfunc(y + i * ldy + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vx1 = lfunc(y + A.col[j] * ldx + k); sfunc(y + i * ldy + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] += x[A.col[j] * ldx + k]; } start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; index_t startDat = A.st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { for (uint64_t j = start; j < stop; ++j) { size_t k = 0; vdat = simd::set1(A.dat[startDat + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vx1 = lfunc(y + A.col[j] * ldx + k); vx2 = lfunc(y + A.col[j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); sfunc(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vdat, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = 
lfunc(y + i * ldy + k); vx1 = lfunc(y + A.col[j] * ldx + k); sfunc(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= A.dat[startDat + j] * x[A.col[j] * ldx + k]; } } } #endif } #endif template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, const int64_t kmax) { // TODO } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, const int64_t kmax) { // TODO } } // csr_hyb_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_HYB_pspmm_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_pspmv.inl000066400000000000000000000200671274716147400271630ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_CSR_HYB_pspmv_INL #define __FFLASFFPACK_fflas_sparse_CSR_HYB_pspmv_INL #ifdef __FFLASFFPACK_USE_TBB #include "tbb/parallel_for.h" #include "tbb/blocked_range.h" #endif namespace FFLAS { namespace sparse_details_impl { template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, dat, st, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { F.subin(y[i], x[col[j]]); } start = st[4 * i + 1], stop = st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { F.addin(y[i], x[col[j]]); } start = st[4 * i + 2], stop = st[4 * (i + 1)]; index_t startDat = st[4 * i + 3]; for (uint64_t j = start, k = 0; j < stop; ++j, ++k) { F.axpyin(y[i], dat[startDat + k], x[col[j]]); } } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { F.subin(y[i], x[col[j]]); } start = st[4 * i + 1], stop = st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { F.addin(y[i], x[col[j]]); } start = st[4 * i + 2], stop = st[4 * (i + 1)]; index_t startDat = st[4 * i + 3]; for (uint64_t j = start, k = 0; j < stop; ++j, ++k) { F.axpyin(y[i], dat[startDat + k], x[col[j]]); } } #endif } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename 
Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, col, dat, st](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; uint64_t j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] -= y1 + y2 + y3 + y4; y1 = 0; y2 = 0; y3 = 0; y4 = 0; start = st[4 * i + 1], stop = st[4 * i + 2]; diff = stop - start; j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; y1 = 0; y2 = 0; y3 = 0; y4 = 0; start = st[4 * i + 2], stop = st[4 * (i + 1)]; diff = stop - start; index_t startDat = st[4 * i + 3]; j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += dat[startDat + j] * x[col[start + j]]; y2 += dat[startDat + j + 1] * x[col[start + j + 1]]; y3 += dat[startDat + j + 2] * x[col[start + j + 2]]; y4 += dat[startDat + j + 3] * x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += dat[startDat + j] * x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } }); #else #pragma omp parallel for for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; index_t diff = stop - start; typename Field::Element 
y1 = 0, y2 = 0, y3 = 0, y4 = 0; uint64_t j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] -= y1 + y2 + y3 + y4; y1 = 0; y2 = 0; y3 = 0; y4 = 0; start = st[4 * i + 1], stop = st[4 * i + 2]; diff = stop - start; j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; y1 = 0; y2 = 0; y3 = 0; y4 = 0; start = st[4 * i + 2], stop = st[4 * (i + 1)]; diff = stop - start; index_t startDat = st[4 * i + 3]; j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += dat[startDat + j] * x[col[start + j]]; y2 += dat[startDat + j + 1] * x[col[start + j + 1]]; y3 += dat[startDat + j + 2] * x[col[start + j + 2]]; y4 += dat[startDat + j + 3] * x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += dat[startDat + j] * x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } #endif } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const int64_t kmax) { // TODO } } // CSR_HYB_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_HYB_pspmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmm.inl000066400000000000000000000362201274716147400267700ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_CSR_HYB_spmm_INL #define __FFLASFFPACK_fflas_sparse_CSR_HYB_spmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * ldy + k], x[col[j] * ldx + k]); F.subin(y[i * ldy + k + 1], x[col[j] * ldx + k + 1]); F.subin(y[i * ldy + k + 2], x[col[j] * ldx + k + 2]); F.subin(y[i * ldy + k + 3], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * ldy + k], x[col[j] * ldx + k]); } start = st[4 * i + 1], stop = st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k 
+= 4) { F.addin(y[i * ldy + k], x[col[j] * ldx + k]); F.addin(y[i * ldy + k + 1], x[col[j] * ldx + k + 1]); F.addin(y[i * ldy + k + 2], x[col[j] * ldx + k + 2]); F.addin(y[i * ldy + k + 3], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.addin(y[i * ldy + k], x[col[j] * ldx + k]); } start = st[4 * i + 2], stop = st[4 * (i + 1)]; index_t startDat = st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], dat[startDat + k], x[col[j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], dat[startDat + k], x[col[j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], dat[startDat + k], x[col[j] * ldx + k + 2]); F.axpyin(y[i * ldy + k + 3], dat[startDat + k], x[col[j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], dat[startDat + k], x[col[j] * ldx + k]); } } } template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] -= x[col[j] * ldx + k]; y[i * ldy + k + 1] -= x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] -= x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] -= x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[j] * ldx + k]; } start = st[4 * i + 1], stop = st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += x[col[j] * ldx + k]; y[i * ldy + k + 1] += 
x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] += x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] += x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[j] * ldx + k]; } start = st[4 * i + 2], stop = st[4 * (i + 1)]; index_t startDat = st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += dat[startDat + j] * x[col[j] * ldx + k]; y[i * ldy + k + 1] += dat[startDat + j] * x[col[j] * ldx + k + 1]; y[i * ldy + k + 2] += dat[startDat + j] * x[col[j] * ldx + k + 2]; y[i * ldy + k + 3] += dat[startDat + j] * x[col[j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += dat[startDat + j] * x[col[j] * ldx + k]; } } } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); vect_t vx1, vx2, vy1, vy2, vdat; for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + k + simd::vect_size); vx1 = simd::load(y + col[j] * ldx + k); vx2 = simd::load(y + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::sub(vy1, vx1)); simd::store(y + i * ldy + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); 
vx1 = simd::load(y + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[j] * ldx + k]; } start = st[4 * i + 1], stop = st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + k + simd::vect_size); vx1 = simd::load(y + col[j] * ldx + k); vx2 = simd::load(y + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::add(vy1, vx1)); simd::store(y + i * ldy + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vx1 = simd::load(y + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[j] * ldx + k]; } start = st[4 * i + 2], stop = st[4 * (i + 1)]; index_t startDat = st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; vdat = simd::set1(dat[startDat + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + k + simd::vect_size); vx1 = simd::load(y + col[j] * ldx + k); vx2 = simd::load(y + col[j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); simd::store(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vdat, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vx1 = simd::load(y + col[j] * ldx + k); simd::store(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= dat[startDat + j] * x[col[j] * ldx + k]; } } } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, 
FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); vect_t vx1, vx2, vy1, vy2, vdat; for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(y + col[j] * ldx + k); vx2 = simd::loadu(y + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::sub(vy1, vx1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(y + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[j] * ldx + k]; } start = st[4 * i + 1], stop = st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(y + col[j] * ldx + k); vx2 = simd::loadu(y + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::add(vy1, vx1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(y + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[j] * ldx + k]; } start = 
st[4 * i + 2], stop = st[4 * (i + 1)]; index_t startDat = st[4 * i + 3]; for (uint64_t j = start; j < stop; ++j) { size_t k = 0; vdat = simd::set1(dat[startDat + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(y + col[j] * ldx + k); vx2 = simd::loadu(y + col[j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vdat, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(y + col[j] * ldx + k); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vdat, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= dat[startDat + j] * x[col[j] * ldx + k]; } } } #endif template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { // TODO } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { // TODO } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { // TODO } #endif } // csr_hyb_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_HYB_spmm_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_spmv.inl000066400000000000000000000115061274716147400270010ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien 
Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_CSR_HYB_spmv_INL #define __FFLASFFPACK_fflas_sparse_CSR_HYB_spmv_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; for (uint64_t j = start; j < stop; ++j) { F.subin(y[i], x[col[j]]); } start = st[4 * i + 1], stop = st[4 * i + 2]; for (uint64_t j = start; j < stop; ++j) { F.addin(y[i], x[col[j]]); } start = st[4 * i + 2], stop = st[4 * (i + 1)]; index_t startDat = st[4 * i + 3]; for (uint64_t j = start, k = 0; j < stop; ++j, ++k) { F.axpyin(y[i], dat[startDat + k], x[col[j]]); } } } template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename 
Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (uint64_t i = 0; i < A.m; ++i) { index_t start = st[4 * i], stop = st[4 * i + 1]; index_t diff = stop - start; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; uint64_t j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] -= y1 + y2 + y3 + y4; y1 = 0; y2 = 0; y3 = 0; y4 = 0; start = st[4 * i + 1], stop = st[4 * i + 2]; diff = stop - start; j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; y1 = 0; y2 = 0; y3 = 0; y4 = 0; start = st[4 * i + 2], stop = st[4 * (i + 1)]; diff = stop - start; index_t startDat = st[4 * i + 3]; j = 0; for (; j < ROUND_DOWN(diff, 4); j += 4) { y1 += dat[startDat + j] * x[col[start + j]]; y2 += dat[startDat + j + 1] * x[col[start + j + 1]]; y3 += dat[startDat + j + 2] * x[col[start + j + 2]]; y4 += dat[startDat + j + 3] * x[col[start + j + 3]]; } for (; j < diff; ++j) { y1 += dat[startDat + j] * x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } } template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { return; // TODO } } // CSR_HYB_details } // FFLAS #endif // __FFLASFFPACK_fflas_CSR_HYB_spmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/csr_hyb/csr_hyb_utils.inl000066400000000000000000000150621274716147400271550ustar00rootroot00000000000000/* -*- 
mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_CSR_HYB_utils_INL #define __FFLASFFPACK_fflas_sparse_CSR_HYB_utils_INL // #define CSR_HYB_DEBUG 1 namespace FFLAS { template inline void sparse_delete(const Sparse &A) { fflas_delete(A.dat); fflas_delete(A.col); fflas_delete(A.st); } namespace csr_hyb_details { struct Info { uint64_t size = 0; uint64_t perm = 0; uint64_t begin = 0; Info(uint64_t it, uint64_t s, uint64_t p) : size(s), perm(p), begin(it) {} Info() = default; Info(const Info &) = default; Info(Info &&) = default; Info &operator=(const Info &) = default; Info &operator=(Info &&) = default; }; template struct Coo { using Self = Coo; ValT val = 0; IdxT row = 0; IdxT col = 0; Coo(ValT v, IdxT r, IdxT c) : val(v), row(r), col(c) {} Coo() = default; Coo(const Self &) = default; Coo(Self &&) = default; Self &operator=(const Self &) = default; Self &operator=(Self &&) = default; }; } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { using namespace csr_hyb_details; using coo = Coo; A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; A.nElements = nnz; std::vector data(nnz); for (uint64_t i = 0; i < nnz; ++i) { // data.emplace_back(dat[i], col[i], row[i]); data[i].val = dat[i]; data[i].col = col[i]; data[i].row = row[i]; } std::vector rows(rowdim, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); if (A.kmax > A.maxrow) A.delayed = true; rows.resize(3 * (rowdim + 1)); for (auto &x : rows) x = 0; for (uint64_t i = 0; i < data.size(); ++i) { auto x = data[i]; if (F.isOne(x.val)) { rows[3 * x.row + 1]++; A.nOnes++; } else if (F.isMOne(x.val)) { rows[3 * x.row]++; A.nMOnes++; } else { rows[3 * x.row + 2]++; A.nOthers++; } } A.col = fflas_new(nnz, Alignment::CACHE_LINE); A.st = fflas_new(4 * (rowdim + 1), Alignment::CACHE_LINE); A.dat = 
fflas_new(F, A.nOthers, 1, Alignment::CACHE_LINE); for (uint64_t i = 0; i < 4 * (rowdim + 1); ++i) A.st[i] = 0; for (size_t i = 0; i < nnz; ++i) { A.col[i] = static_cast(data[i].col); } data.shrink_to_fit(); // sort nnz by row with order -1 1 L std::sort(data.begin(), data.end(), [&F](const coo &a, const coo &b) { return (a.row < b.row) || ((a.row == b.row) && (F.isMOne(a.val) && !F.isMOne(b.val))) || ((a.row == b.row) && (F.isMOne(a.val) && F.isMOne(b.val) && (a.col < b.col))) || ((a.row == b.row) && (F.isOne(a.val) && !F.isOne(b.val) && !F.isMOne(b.val))) || ((a.row == b.row) && (F.isOne(a.val) && F.isOne(b.val) && (a.col < b.col))) || ((a.row == b.row) && (!F.isOne(a.val) && !F.isMOne(a.val) && !F.isOne(b.val) && !F.isMOne(b.val)) && (a.col < b.col)); }); #ifdef CSR_HYB_DEBUG for (auto &x : data) { cout << "(" << x.row << "," << x.col << "," << x.val << ") "; } cout << endl; #endif uint64_t it = 0; for (size_t i = 0; i < data.size(); ++i) { if (!F.isOne(data[i].val) && !F.isMOne(data[i].val)) { A.dat[it] = data[i].val; ++it; } } A.st[1] = rows[0]; A.st[2] = rows[1] + A.st[1]; A.st[3] = 0; A.st[4] = rows[2] + A.st[2]; for (uint64_t i = 1; i < rowdim; ++i) { A.st[4 * i + 1] = rows[3 * i] + A.st[4 * i]; A.st[4 * i + 2] = rows[3 * i + 1] + A.st[4 * i + 1]; A.st[4 * i + 3] = rows[3 * (i - 1) + 2] + A.st[4 * (i - 1) + 3]; A.st[4 * (i + 1)] = rows[3 * i + 2] + A.st[4 * i + 2]; } #ifdef CSR_HYB_DEBUG for (uint64_t i = 0; i < it; ++i) cout << A.dat[i] << " "; cout << endl; for (uint64_t i = 0; i < nnz; ++i) cout << A.col[i] << " "; cout << endl; for (uint64_t i = 0; i < rowdim; ++i) cout << "(" << A.st[4 * i] << " , " << A.st[4 * i + 1] << " , " << A.st[4 * i + 2] << " , " << A.st[4 * i + 3] << ") " << endl; cout << endl; cout << endl; for (uint64_t i = 0; i < rowdim; ++i) { index_t start = A.st[4 * i], stop = A.st[4 * i + 1]; index_t diff = stop - start; cout << i << endl; cout << " -1 : "; for (uint64_t j = 0; j < diff; ++j) { cout << A.col[start + j] << " "; } cout << 
endl; start = A.st[4 * i + 1], stop = A.st[4 * i + 2]; diff = stop - start; cout << " 1 : "; for (uint64_t j = 0; j < diff; ++j) { cout << A.col[start + j] << " "; } cout << endl; start = A.st[4 * i + 2], stop = A.st[4 * (i + 1)]; diff = stop - start; index_t startDat = A.st[4 * i + 3]; cout << " l : "; for (uint64_t j = 0; j < diff; ++j) { cout << "(" << A.col[start + j] << " , " << A.dat[startDat + j] << ") "; } cout << endl; } #endif } } // FFLAS #endiffflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell.h000066400000000000000000000060451274716147400230750ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_fspmv_ell.inl * NO DOC */ #ifndef __FFLASFFPACK_fflas_sparse_ell_H #define __FFLASFFPACK_fflas_sparse_ell_H namespace FFLAS { /* ELL */ template struct Sparse<_Field, SparseMatrix_t::ELL> { using Field = _Field; bool delayed = false; uint64_t kmax = 0; index_t m = 0; index_t n = 0; index_t ld = 0; uint64_t nnz = 0; uint64_t nElements = 0; uint64_t maxrow = 0; index_t *col = nullptr; typename _Field::Element_ptr dat; }; template struct Sparse<_Field, SparseMatrix_t::ELL_ZO> : public Sparse<_Field, SparseMatrix_t::ELL> { using Field = _Field; typename _Field::Element cst = 1; }; template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template inline void sparse_delete(const Sparse &A); template inline void sparse_delete(const Sparse &A); } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/ell/ell_utils.inl" #include "fflas-ffpack/fflas/fflas_sparse/ell/ell_spmv.inl" #include "fflas-ffpack/fflas/fflas_sparse/ell/ell_spmm.inl" #if defined(__FFLASFFPACK_USE_OPENMP) || defined(__FFLASFFPACK_USE_TBB) #include "fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmv.inl" // #include "fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmm.inl" #endif #endif // __FFLASFFPACK_fflas_sparse_ELL_Hfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell/000077500000000000000000000000001274716147400227175ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell/Makefile.am000066400000000000000000000021631274716147400247550ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/ell pkgincludesub_HEADERS= \ ell_spmv.inl \ ell_spmm.inl \ ell_pspmv.inl \ ell_pspmm.inl \ ell_utils.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmm.inl000066400000000000000000001043541274716147400254220ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_ELL_pspmm_INL #define __FFLASFFPACK_fflas_sparse_ELL_pspmm_INL #ifdef __FFLASFFPACK_USE_TBB #include "tbb/parallel_for.h" #include "tbb/blocked_range.h" #endif namespace FFLAS { namespace sparse_details_impl { template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * blockSize + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k]); F.axpyin(y[i * blockSize + k + 1], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k + 1]); F.axpyin(y[i * blockSize + k + 2], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k + 2]); F.axpyin(y[i * blockSize + k + 3], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * blockSize + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * blockSize + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k]); F.axpyin(y[i * blockSize + k + 1], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k + 1]); F.axpyin(y[i * blockSize + k + 2], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k + 2]); F.axpyin(y[i * blockSize + k + 3], A.dat[i * A.ld 
+ j], x[A.col[i * A.ld + j] * blockSize + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * blockSize + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * blockSize + k]); } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k + 2]); F.axpyin(y[i * ldy + k + 3], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k + 2]); F.axpyin(y[i * ldy + k + 3], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], A.dat[i * A.ld + j], x[A.col[i * A.ld + j] * ldx + k]); } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag) { #ifdef __FFLASFFPACK_USE_TBB 
tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 3]; } for (; k < blockSize; ++k) y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; 
y[i * ldy + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } #endif } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, LFunc &&lfunc, SFunc &&sfunc, FieldCategories::UnparametricTag) { using simd = Simd; using vec_t = typename simd::vec_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, lfunc, sfunc](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(A.dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); vy2 = lfunc(x + A.col[i * A.ld + j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vx1, vdat)); sfunc(y + i * blockSize + k + 
simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(A.dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); vy2 = lfunc(x + A.col[i * A.ld + j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vx1, vdat)); sfunc(y + i * blockSize + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); sfunc(y + i * blockSize + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, LFunc &&lfunc, SFunc &&sfunc, FieldCategories::UnparametricTag) { using simd = Simd; using vec_t = typename simd::vec_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy, lfunc, sfunc](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(A.dat[i * A.ld + j]); 
for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * ldx + k); vy2 = lfunc(x + A.col[i * A.ld + j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); sfunc(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy1 = lfunc(x + A.col[i * A.ld + j] * ldx + k); sfunc(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(A.dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * ldx + k); vy2 = lfunc(x + A.col[i * A.ld + j] * ldx + k + simd::vect_size); sfunc(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); sfunc(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy1 = lfunc(x + A.col[i * A.ld + j] * ldx + k); sfunc(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } #endif } #endif template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, const int64_t kmax) { index_t block = (A.ld) / kmax; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, kmax](const tbb::blocked_range &r) { for (index_t i = 
r.begin(), end = r.end(); i < end; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 3]; } for (; k < blockSize; ++k) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * blockSize + k]); } } for (; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 3]; } for (; k < blockSize; ++k) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * blockSize + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * 
blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 3]; } for (; k < blockSize; ++k) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * blockSize + k]); } } for (; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; y[i * blockSize + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 1]; y[i * blockSize + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 2]; y[i * blockSize + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k + 3]; } for (; k < blockSize; ++k) { y[i * blockSize + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * blockSize + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * blockSize + k]); } } #endif } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, const int64_t kmax) { index_t block = (A.ld) / kmax; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy, kmax](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) { y[i * ldy + k] 
+= A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } for (; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } for (; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) { y[i * 
ldy + k] += A.dat[i * A.ld + j] * x[A.col[i * A.ld + j] * ldx + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } #endif } template inline void pfspmm_zo(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, Func &&func) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, func](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { func(y[i * blockSize + k], x[A.col[i * A.ld + j] * blockSize + k]); func(y[i * blockSize + k + 1], x[A.col[i * A.ld + j] * blockSize + k + 1]); func(y[i * blockSize + k + 2], x[A.col[i * A.ld + j] * blockSize + k + 2]); func(y[i * blockSize + k + 3], x[A.col[i * A.ld + j] * blockSize + k + 3]); } for (; k < blockSize; ++k) func(y[i * blockSize + k], x[A.col[i * A.ld + j] * blockSize + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { func(y[i * blockSize + k], x[A.col[i * A.ld + j] * blockSize + k]); func(y[i * blockSize + k + 1], x[A.col[i * A.ld + j] * blockSize + k + 1]); func(y[i * blockSize + k + 2], x[A.col[i * A.ld + j] * blockSize + k + 2]); func(y[i * blockSize + k + 3], x[A.col[i * A.ld + j] * blockSize + k + 3]); } for (; k < blockSize; ++k) func(y[i * blockSize + k], x[A.col[i * A.ld + j] * blockSize + k]); } } #endif } template inline void pfspmm_zo(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, Func &&func) { #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy, func](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; 
++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { func(y[i * ldy + k], x[A.col[i * A.ld + j] * ldx + k]); func(y[i * ldy + k + 1], x[A.col[i * A.ld + j] * ldx + k + 1]); func(y[i * ldy + k + 2], x[A.col[i * A.ld + j] * ldx + k + 2]); func(y[i * ldy + k + 3], x[A.col[i * A.ld + j] * ldx + k + 3]); } for (; k < blockSize; ++k) func(y[i * ldy + k], x[A.col[i * A.ld + j] * ldx + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { func(y[i * ldy + k], x[A.col[i * A.ld + j] * ldx + k]); func(y[i * ldy + k + 1], x[A.col[i * A.ld + j] * ldx + k + 1]); func(y[i * ldy + k + 2], x[A.col[i * A.ld + j] * ldx + k + 2]); func(y[i * ldy + k + 3], x[A.col[i * A.ld + j] * ldx + k + 3]); } for (; k < blockSize; ++k) func(y[i * ldy + k], x[A.col[i * A.ld + j] * ldx + k]); } } #endif } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm_zo(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, VectFunc &&vfunc, ScalFunc &&scalfunc, LFunc &&lfunc, SFunc &&sfunc) { using simd = Simd; using vec_t = typename simd::vec_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, vfunc, scalfunc, lfunc, sfunc](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); vy2 = lfunc(x + A.col[i * A.ld + j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, vfunc(vy1, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, vfunc(vy2, vx2)); } for (; 
k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); sfunc(y + i * blockSize + k, vfunc(vy1, vx1)); } for (; k < blockSize; ++k) scalfunc(y[i * blockSize + k], x[A.col[i * A.ld + j] * blockSize + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy2 = lfunc(y + i * blockSize + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); vy2 = lfunc(x + A.col[i * A.ld + j] * blockSize + k + simd::vect_size); sfunc(y + i * blockSize + k, vfunc(vy1, vx1)); sfunc(y + i * blockSize + k + simd::vect_size, vfunc(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * blockSize + k); vy1 = lfunc(x + A.col[i * A.ld + j] * blockSize + k); sfunc(y + i * blockSize + k, vfunc(vy1, vx1)); } for (; k < blockSize; ++k) scalfunc(y[i * blockSize + k], x[A.col[i * A.ld + j] * blockSize + k]); } } #endif } template inline void pfspmm_zo(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, LFunc &&lfunc, SFunc &&sfunc, FieldCategories::UnparametricTag) { using simd = Simd; using vec_t = typename simd::vec_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for( tbb::blocked_range(0, A.m), [&F, &A, &x, &y, blockSize, ldx, ldy, vfunc, scalfunc, lfunc, sfunc](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * ldx 
+ k); vy2 = lfunc(x + A.col[i * A.ld + j] * ldx + k + simd::vect_size); sfunc(y + i * ldx + k, vfunc(vy1, vx1)); sfunc(y + i * ldx + k + simd::vect_size, vfunc(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy1 = lfunc(x + A.col[i * A.ld + j] * ldy + k); sfunc(y + i * ldx + k, vfunc(vy1, vx1)); } for (; k < blockSize; ++k) scalfunc(y[i * ldy + k], x[A.col[i * A.ld + j] * ldx + k]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vec_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy2 = lfunc(y + i * ldy + k + simd::vect_size); vy1 = lfunc(x + A.col[i * A.ld + j] * ldx + k); vy2 = lfunc(x + A.col[i * A.ld + j] * ldx + k + simd::vect_size); sfunc(y + i * ldx + k, vfunc(vy1, vx1)); sfunc(y + i * ldx + k + simd::vect_size, vfunc(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = lfunc(y + i * ldy + k); vy1 = lfunc(x + A.col[i * A.ld + j] * ldy + k); sfunc(y + i * ldx + k, vfunc(vy1, vx1)); } for (; k < blockSize; ++k) scalfunc(y[i * ldy + k], x[A.col[i * A.ld + j] * ldx + k]); } } #endif } #endif } // ell_details } // FFLAS #endif // __FFLASFFPACK_fflas_ELL_pspmm_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell/ell_pspmv.inl000066400000000000000000000366771274716147400254470ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_ELL_pspmv_INL #define __FFLASFFPACK_fflas_sparse_ELL_pspmv_INL #ifdef __FFLASFFPACK_USE_TBB #include "tbb/parallel_for.h" #include "tbb/blocked_range.h" #endif namespace FFLAS { namespace sparse_details_impl { template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, x, y, dat, col, &A](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.axpyin(y1, dat[i * A.ld + j], x[col[i * A.ld + j]]); F.axpyin(y2, dat[i * A.ld + j + 1], x[col[i * A.ld + j + 1]]); F.axpyin(y3, dat[i * A.ld + j + 
2], x[col[i * A.ld + j + 2]]); F.axpyin(y4, dat[i * A.ld + j + 3], x[col[i * A.ld + j + 3]]); } for (; j < A.ld; ++j) { F.axpyin(y1, dat[i * A.ld + j], x[col[i * A.ld + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.axpyin(y1, dat[i * A.ld + j], x[col[i * A.ld + j]]); F.axpyin(y2, dat[i * A.ld + j + 1], x[col[i * A.ld + j + 1]]); F.axpyin(y3, dat[i * A.ld + j + 2], x[col[i * A.ld + j + 2]]); F.axpyin(y4, dat[i * A.ld + j + 3], x[col[i * A.ld + j + 3]]); } for (; j < A.ld; ++j) { F.axpyin(y1, dat[i * A.ld + j], x[col[i * A.ld + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } #endif } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, x, y, dat, col, &A](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += dat[i * A.ld + j] * x[col[i * A.ld + j]]; y2 += dat[i * A.ld + j + 1] * x[col[i * A.ld + j + 1]]; y3 += dat[i * A.ld + j + 2] * x[col[i * A.ld + j + 2]]; y4 += dat[i * A.ld + j + 3] * x[col[i * A.ld + j + 3]]; } for (; j < A.ld; ++j) { y1 += dat[i * A.ld + j] * x[col[i * A.ld + j]]; } y[i] 
+= y1 + y2 + y3 + y4; } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += dat[i * A.ld + j] * x[col[i * A.ld + j]]; y2 += dat[i * A.ld + j + 1] * x[col[i * A.ld + j + 1]]; y3 += dat[i * A.ld + j + 2] * x[col[i * A.ld + j + 2]]; y4 += dat[i * A.ld + j + 3] * x[col[i * A.ld + j + 3]]; } for (; j < A.ld; ++j) { y1 += dat[i * A.ld + j] * x[col[i * A.ld + j]]; } y[i] += y1 + y2 + y3 + y4; } #endif } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const int64_t kmax) { index_t block = (A.ld) / kmax; assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, kmax, block, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { y[i] += dat[i * A.ld + j] * x[col[i * A.ld + j]]; } F.reduce(y[i]); } for (; j < A.ld; ++j) { y[i] += dat[i * A.ld + j] * x[col[i * A.ld + j]]; } F.reduce(y[i]); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { y[i] += dat[i * A.ld + j] * x[col[i * A.ld + j]]; } F.reduce(y[i]); } for (; j < A.ld; ++j) { y[i] += dat[i * A.ld + j] * x[col[i * A.ld + j]]; } F.reduce(y[i]); } #endif } template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, 
FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.addin(y1, x[col[i * A.ld + j]]); F.addin(y2, x[col[i * A.ld + j + 1]]); F.addin(y3, x[col[i * A.ld + j + 2]]); F.addin(y4, x[col[i * A.ld + j + 3]]); } for (; j < A.ld; ++j) { F.addin(y1, x[col[i * A.ld + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.addin(y1, x[col[i * A.ld + j]]); F.addin(y2, x[col[i * A.ld + j + 1]]); F.addin(y3, x[col[i * A.ld + j + 2]]); F.addin(y4, x[col[i * A.ld + j + 3]]); } for (; j < A.ld; ++j) { F.addin(y1, x[col[i * A.ld + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } #endif } template inline void pfspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, 
x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.addin(y1, x[col[i * A.ld + j]]); F.addin(y2, x[col[i * A.ld + j + 1]]); F.addin(y3, x[col[i * A.ld + j + 2]]); F.addin(y4, x[col[i * A.ld + j + 3]]); } for (; j < A.ld; ++j) { F.addin(y1, x[col[i * A.ld + j]]); } F.subin(y[i], y1); F.subin(y[i], y2); F.subin(y[i], y3); F.subin(y[i], y4); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.addin(y1, x[col[i * A.ld + j]]); F.addin(y2, x[col[i * A.ld + j + 1]]); F.addin(y3, x[col[i * A.ld + j + 2]]); F.addin(y4, x[col[i * A.ld + j + 3]]); } for (; j < A.ld; ++j) { F.addin(y1, x[col[i * A.ld + j]]); } F.subin(y[i], y1); F.subin(y[i], y2); F.subin(y[i], y3); F.subin(y[i], y4); } #endif } template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += x[col[i * A.ld + j]]; y2 += x[col[i * A.ld + j + 1]]; y3 += x[col[i * A.ld + j + 2]]; y4 += x[col[i * A.ld + j + 3]]; } for (; j < A.ld; 
++j) { y1 += x[col[i * A.ld + j]]; } y[i] += y1 + y2 + y3 + y4; } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += x[col[i * A.ld + j]]; y2 += x[col[i * A.ld + j + 1]]; y3 += x[col[i * A.ld + j + 2]]; y4 += x[col[i * A.ld + j + 3]]; } for (; j < A.ld; ++j) { y1 += x[col[i * A.ld + j]]; } y[i] += y1 + y2 + y3 + y4; } #endif } template inline void pfspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #if defined(__FFLASFFPACK_USE_TBB) int step = __FFLASFFPACK_CACHE_LINE_SIZE / sizeof(typename Field::Element); tbb::parallel_for(tbb::blocked_range(0, A.m, step), [&F, &A, x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += x[col[i * A.ld + j]]; y2 += x[col[i * A.ld + j + 1]]; y3 += x[col[i * A.ld + j + 2]]; y4 += x[col[i * A.ld + j + 3]]; } for (; j < A.ld; ++j) { y1 += x[col[i * A.ld + j]]; } y[i] -= y1 + y2 + y3 + y4; } }); #else #pragma omp parallel for for (index_t i = 0; i < A.m; ++i) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += x[col[i * A.ld + j]]; y2 += x[col[i * A.ld + j + 1]]; y3 += x[col[i * A.ld + j + 2]]; y4 += x[col[i * A.ld + j + 3]]; } for (; j < A.ld; ++j) { y1 += x[col[i * A.ld + j]]; } y[i] -= y1 + y2 + y3 + y4; } #endif } } // ELL_details } // FFLAS #endif // 
__FFLASFFPACK_fflas_ELL_pspmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell/ell_spmm.inl000066400000000000000000000667401274716147400252500ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_ELL_spmm_INL #define __FFLASFFPACK_fflas_sparse_ELL_spmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.axpyin(y[i * ldy + k], dat[i * A.ld + j], x[col[i * A.ld + j] * ldx + k]); F.axpyin(y[i * ldy + k + 1], dat[i * A.ld + j], x[col[i * A.ld + j] * ldx + k + 1]); F.axpyin(y[i * ldy + k + 2], dat[i * A.ld + j], x[col[i * A.ld + j] * ldx + k + 2]); F.axpyin(y[i * ldy + k + 3], dat[i * A.ld + j], x[col[i * A.ld + j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.axpyin(y[i * ldy + k], dat[i * A.ld + j], x[col[i * A.ld + j] * ldx + k]); } } } template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 
3]; } for (; k < blockSize; ++k) y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } } } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + k + simd::vect_size); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); vx2 = simd::load(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); simd::store(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); simd::store(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } } } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); 
assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); vx2 = simd::loadu(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } } } #endif template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t block = (A.ld) / kmax; for (index_t i = 0; i < A.m; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[i * A.ld + j] * 
x[col[i * A.ld + j] * ldx + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } for (; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) { y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; index_t block = (A.ld) / kmax; for (index_t i = 0; i < A.m; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { vect_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + k + simd::vect_size); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); vx2 = simd::load(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); simd::store(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, 
simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); simd::store(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } for (; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + k + simd::vect_size); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); vx2 = simd::load(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::store(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); simd::store(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); simd::store(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, const int64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; index_t block = (A.ld) / kmax; for (index_t i = 0; i < A.m; ++i) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < 
j_loc; ++j) { vect_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); vx2 = simd::loadu(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } for (; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2, vdat; size_t k = 0; vdat = simd::set1(dat[i * A.ld + j]); for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); vx2 = simd::loadu(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); simd::storeu(y + i * ldy + k + simd::vect_size, simd::fmadd(vy2, vx2, vdat)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); simd::storeu(y + i * ldy + k, simd::fmadd(vy1, vx1, vdat)); } for (; k < blockSize; ++k) y[i * ldy + k] += dat[i * A.ld + j] * x[col[i * A.ld + j] * ldx + k]; } // TODO : replace with freduce for (size_t k = 0; k < blockSize; ++k) { F.reduce(y[i * ldy + k]); } } } #endif // SIMD template inline void fspmm_mone(const Field &F, 
const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.subin(y[i * ldy + k], x[col[i * A.ld + j] * ldx + k]); F.subin(y[i * ldy + k + 1], x[col[i * A.ld + j] * ldx + k + 1]); F.subin(y[i * ldy + k + 2], x[col[i * A.ld + j] * ldx + k + 2]); F.subin(y[i * ldy + k + 3], x[col[i * A.ld + j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.subin(y[i * ldy + k], x[col[i * A.ld + j] * ldx + k]); } } } template inline void fspmm_one(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { F.addin(y[i * ldy + k], x[col[i * A.ld + j] * ldx + k]); F.addin(y[i * ldy + k + 1], x[col[i * A.ld + j] * ldx + k + 1]); F.addin(y[i * ldy + k + 2], x[col[i * A.ld + j] * ldx + k + 2]); F.addin(y[i * ldy + k + 3], x[col[i * A.ld + j] * ldx + k + 3]); } for (; k < blockSize; ++k) F.addin(y[i * ldy + k], x[col[i * A.ld + j] * ldx + k]); } } } template inline void fspmm_mone(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); 
for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] -= x[col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] -= x[col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] -= x[col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] -= x[col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[i * A.ld + j] * ldx + k]; } } } template inline void fspmm_one(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(blockSize, 4); k += 4) { y[i * ldy + k] += x[col[i * A.ld + j] * ldx + k]; y[i * ldy + k + 1] += x[col[i * A.ld + j] * ldx + k + 1]; y[i * ldy + k + 2] += x[col[i * A.ld + j] * ldx + k + 2]; y[i * ldy + k + 3] += x[col[i * A.ld + j] * ldx + k + 3]; } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[i * A.ld + j] * ldx + k]; } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_one_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + 
k + simd::vect_size); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); vx2 = simd::load(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::store(y + i * ldx + k, simd::add(vy1, vx1)); simd::store(y + i * ldx + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vx1 = simd::load(x + col[i * A.ld + j] * ldy + k); simd::store(y + i * ldx + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[i * A.ld + j] * ldx + k]; } } } template inline void fspmm_one_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); vx2 = simd::loadu(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldx + k, simd::add(vy1, vx1)); simd::storeu(y + i * ldx + k + simd::vect_size, simd::add(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(x + col[i * A.ld + j] * ldy + k); simd::storeu(y + i * ldx + k, simd::add(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] += x[col[i * A.ld + j] * ldx + k]; } } } template inline void fspmm_mone_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename 
Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vy2 = simd::load(y + i * ldy + k + simd::vect_size); vx1 = simd::load(x + col[i * A.ld + j] * ldx + k); vx2 = simd::load(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::store(y + i * ldx + k, simd::sub(vy1, vx1)); simd::store(y + i * ldx + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::load(y + i * ldy + k); vx1 = simd::load(x + col[i * A.ld + j] * ldy + k); simd::store(y + i * ldx + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[i * A.ld + j] * ldx + k]; } } } template inline void fspmm_mone_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x_, int ldx, typename Field::Element_ptr y_, int ldy, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.m; ++i) { for (index_t j = 0; j < A.ld; ++j) { vect_t vx1, vx2, vy1, vy2; size_t k = 0; for (; k < ROUND_DOWN(blockSize, 2 * simd::vect_size); k += 2 * simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vy2 = simd::loadu(y + i * ldy + k + simd::vect_size); vx1 = simd::loadu(x + col[i * A.ld + j] * ldx + k); vx2 = simd::loadu(x + col[i * A.ld + j] * ldx + k + simd::vect_size); simd::storeu(y + i * ldx + k, 
simd::sub(vy1, vx1)); simd::storeu(y + i * ldx + k + simd::vect_size, simd::sub(vy2, vx2)); } for (; k < ROUND_DOWN(blockSize, simd::vect_size); k += simd::vect_size) { vy1 = simd::loadu(y + i * ldy + k); vx1 = simd::loadu(x + col[i * A.ld + j] * ldy + k); simd::storeu(y + i * ldx + k, simd::sub(vy1, vx1)); } for (; k < blockSize; ++k) y[i * ldy + k] -= x[col[i * A.ld + j] * ldx + k]; } } } // #endif /* __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS */ } // ell_details } // FFLAS #endif // __FFLASFFPACK_fflas_ELL_spmm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell/ell_spmv.inl000066400000000000000000000231631274716147400252510ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_ELL_spmv_INL #define __FFLASFFPACK_fflas_sparse_ELL_spmv_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t start = 0; for (index_t i = 0; i < A.m; ++i, start += A.ld) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.axpyin(y1, dat[start + j], x[col[start + j]]); F.axpyin(y2, dat[start + j + 1], x[col[start + j + 1]]); F.axpyin(y3, dat[start + j + 2], x[col[start + j + 2]]); F.axpyin(y4, dat[start + j + 3], x[col[start + j + 3]]); } for (; j < A.ld; ++j) { F.axpyin(y1, dat[start + j], x[col[start + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } } template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t start = 0; for (index_t i = 0; i < A.m; ++i, start += A.ld) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += dat[start + j] * x[col[start + j]]; y2 += dat[start + j + 1] * x[col[start + j + 1]]; y3 += dat[start + j + 2] * x[col[start + j + 2]]; y4 += dat[start + j + 3] * x[col[start + j + 3]]; } for (; j < A.ld; ++j) { y1 += dat[start + j] * x[col[start + j]]; } y[i] += 
y1 + y2 + y3 + y4; } } template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t start = 0; index_t block = (A.ld) / kmax; for (index_t i = 0; i < A.m; ++i, start += A.ld) { index_t j_loc = 0, j = 0; for (index_t l = 0; l < (index_t)block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { y[i] += dat[start + j] * x[col[start + j]]; } F.reduce(y[i]); } for (; j < A.ld; ++j) { y[i] += dat[start + j] * x[col[start + j]]; } F.reduce(y[i]); } } // template // inline void // fspmv(const Field &F, const Sparse &A, // typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, // Func &&func, FieldCategories::GenericTag) { // index_t start = 0; // for (index_t i = 0; i < A.m; ++i, start += A.ld) { // index_t j = 0; // typename Field::Element y1, y2, y3, y4; // F.assign(y1, F.zero); // F.assign(y2, F.zero); // F.assign(y3, F.zero); // F.assign(y4, F.zero); // for (; j < ROUND_DOWN(A.ld, 4); j += 4) { // func(y1, x[col[start + j]]); // func(y2, x[col[start + j + 1]]); // func(y3, x[col[start + j + 2]]); // func(y4, x[col[start + j + 3]]); // } // for (; j < A.ld; ++j) { // func(y1, x[col[start + j]]); // } // F.addin(y[i], y1); // F.addin(y[i], y2); // F.addin(y[i], y3); // F.addin(y[i], y4); // } // } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t start = 0; for (index_t i = 0; i < A.m; ++i, start += A.ld) { index_t j = 0; typename Field::Element y1, y2, y3, y4; 
F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.addin(y1, x[col[start + j]]); F.addin(y2, x[col[start + j + 1]]); F.addin(y3, x[col[start + j + 2]]); F.addin(y4, x[col[start + j + 3]]); } for (; j < A.ld; ++j) { F.addin(y1, x[col[start + j]]); } F.addin(y[i], y1); F.addin(y[i], y2); F.addin(y[i], y3); F.addin(y[i], y4); } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t start = 0; for (index_t i = 0; i < A.m; ++i, start += A.ld) { index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for (; j < ROUND_DOWN(A.ld, 4); j += 4) { F.addin(y1, x[col[start + j]]); F.addin(y2, x[col[start + j + 1]]); F.addin(y3, x[col[start + j + 2]]); F.addin(y4, x[col[start + j + 3]]); } for (; j < A.ld; ++j) { F.addin(y1, x[col[start + j]]); } F.subin(y[i], y1); F.subin(y[i], y2); F.subin(y[i], y3); F.subin(y[i], y4); } } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t start = 0; for (index_t i = 0; i < A.m; ++i, start += A.ld) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < A.ld; ++j) { y1 += x[col[start + j]]; } y[i] += y1 + y2 + y3 + y4; } } template inline 
void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t start = 0; for (index_t i = 0; i < A.m; ++i, start += A.ld) { index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for (; j < ROUND_DOWN(A.ld, 4); j += 4) { y1 += x[col[start + j]]; y2 += x[col[start + j + 1]]; y3 += x[col[start + j + 2]]; y4 += x[col[start + j + 3]]; } for (; j < A.ld; ++j) { y1 += x[col[start + j]]; } y[i] -= y1 + y2 + y3 + y4; } } } // ELL_details } // FFLAS #endif // __FFLASFFPACK_fflas_ELL_spmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell/ell_utils.inl000066400000000000000000000075061274716147400254270ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_ELL_utils_INL #define __FFLASFFPACK_fflas_sparse_ELL_utils_INL #include namespace FFLAS { template inline void sparse_delete(const Sparse &A) { fflas_delete(A.dat); fflas_delete(A.col); } template inline void sparse_delete(const Sparse &A) { fflas_delete(A.col); } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; // cout << A.m << " " << A.n << endl; std::vector rows(A.m, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); // cout << "maxrow : " << A.maxrow << endl; A.ld = A.maxrow; if (A.kmax > A.maxrow) A.delayed = true; // cout << A.ld << " " << rowdim << " " << nnz << " " << A.ld*rowdim << endl; A.nElements = A.m * A.ld; A.col = fflas_new(rowdim * A.ld, Alignment::CACHE_LINE); A.dat = fflas_new(F, rowdim * A.ld, 1, Alignment::CACHE_LINE); for (size_t i = 0; i < rowdim * A.ld; ++i) { A.col[i] = 0; F.assign(A.dat[i], F.zero); } size_t currow = row[0], it = 0; for (size_t i = 0; i < nnz; ++i) { if (row[i] != currow) { it = 0; currow = row[i]; } A.col[row[i] * A.ld + it] = col[i]; A.dat[row[i] * A.ld + it] = dat[i]; ++it; } } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; std::vector rows(A.m, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i]]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); A.ld = A.maxrow; if (A.kmax > A.maxrow) A.delayed = true; A.nElements = A.m * A.ld; A.col = fflas_new(rowdim * A.ld, Alignment::CACHE_LINE); for (size_t i = 0; i < rowdim * A.ld; ++i) { A.col[i] = 0; } size_t 
currow = row[0], it = 0; for (size_t i = 0; i < nnz; ++i) { if (row[i] != currow) { it = 0; currow = row[i]; } A.col[row[i] * A.ld + it] = col[i]; ++it; } } } #endiffflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_r.h000066400000000000000000000057071274716147400234220ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_fspmv_ELL_R.inl * NO DOC */ #ifndef __FFLASFFPACK_fflas_sparse_ELL_R_H #define __FFLASFFPACK_fflas_sparse_ELL_R_H namespace FFLAS { /* ELL_R */ template struct Sparse<_Field, SparseMatrix_t::ELL_R> { bool delayed = false; uint64_t kmax = 0; index_t m = 0; index_t n = 0; index_t ld = 0; uint64_t nnz = 0; uint64_t maxrow = 0; uint64_t mRow = 0; index_t *col = nullptr; index_t *row = nullptr; typename _Field::Element_ptr dat; }; template struct Sparse<_Field, SparseMatrix_t::ELL_R_ZO> : public Sparse<_Field, SparseMatrix_t::ELL_R> { typename _Field::Element cst = 1; }; template void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, const typename Field::Element &beta, typename Field::Element_ptr y); template void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, const typename Field::Element &beta, typename Field::Element_ptr y); template void fspmm(const Field &F, const Sparse &A, const size_t blockSize, const typename Field::Element_ptr &x, const int ldx, const typename Field::Element &beta, typename Field::Element_ptr &y, const int ldy); template void fspmm(const Field &F, const Sparse &A, const size_t blockSize, const typename Field::Element_ptr &x, const int ldx, const typename Field::Element &beta, typename Field::Element_ptr &y, const int ldy); } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/ell_r_spmv.inl" // #include "fflas-ffpack/fflas/fflas_sparse/ell_r_spmm.inl" #endif // __FFLASFFPACK_fflas_sparse_ELL_R_Hfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_r/000077500000000000000000000000001274716147400232405ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_r/Makefile.am000066400000000000000000000020301274716147400252670ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/ell_r pkgincludesub_HEADERS= \ ell_r_spmv.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_r/ell_r_spmv.inl000066400000000000000000000265541274716147400261220ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_ELL_R_spmv_INL #define __FFLASFFPACK_fflas_sparse_ELL_R_spmv_INL namespace FFLAS{ namespace ell_r_details{ template inline void fspmv(const Field & F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag){ index_t start = 0; for(index_t i = 0 ; i < A.mRow ; ++i, start+=A.ld){ index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for(; j < ROUND_DOWN(A.ld, 4) ; j+=4){ F.axpyin(y1,A.dat[start+j],x[A.col[start+j]]); F.axpyin(y2,A.dat[start+j+1],x[A.col[start+j+1]]); F.axpyin(y3,A.dat[start+j+2],x[A.col[start+j+2]]); F.axpyin(y4,A.dat[start+j+3],x[A.col[start+j+3]]); } for(; j < A.ld ; ++j){ F.axpyin(y1,A.dat[start+j],x[A.col[start+j]]); } F.addin(y[A.row[i]], y1); F.addin(y[A.row[i]], y2); F.addin(y[A.row[i]], y3); F.addin(y[A.row[i]], y4); } } template inline void fspmv(const Field & F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag){ index_t start = 0; for(index_t i = 0 ; i < A.mRow ; ++i, start+=A.ld){ index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for(; j < ROUND_DOWN(A.ld, 4) ; j+=4){ y1 += A.dat[start+j] * x[A.col[start+j]]; y2 += A.dat[start+j+1] * x[A.col[start+j+1]]; y3 += A.dat[start+j+2] * x[A.col[start+j+2]]; y4 += A.dat[start+j+3] * x[A.col[start+j+3]]; } for(; j < A.ld ; ++j){ y1 += A.dat[start+j] * x[A.col[start+j]]; } y[A.row[i]] += y1+y2+y3+y4; } } template inline void fspmv(const Field & F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, const int64_t kmax){ index_t start = 0; index_t block = 
(A.ld)/kmax ; for (index_t i = 0 ; i < A.mRow ; ++i, start+=A.ld) { index_t j_loc = 0, j = 0; for (index_t l = 0 ; l < (index_t) block ; ++l) { j_loc += kmax ; for ( ; j < j_loc ; ++j) { y[A.row[i]] += A.dat[start+j] * x[A.col[start+j]]; } F.reduce(y[A.row[i]]); } for ( ; j < A.ld ; ++j) { y[A.row[i]] += A.dat[start+j] * x[A.col[start+j]]; } F.reduce(y[A.row[i]]; } } template inline void fspmv(const Field & F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, Func && func, FieldCategories::GenericTag){ index_t start = 0; for(index_t i = 0 ; i < A.mRow ; ++i, start+=A.ld){ index_t j = 0; typename Field::Element y1, y2, y3, y4; F.assign(y1, F.zero); F.assign(y2, F.zero); F.assign(y3, F.zero); F.assign(y4, F.zero); for(; j < ROUND_DOWN(A.ld, 4) ; j+=4){ func(y1,x[A.col[start+j]]); func(y2,x[A.col[start+j+1]]); func(y3,x[A.col[start+j+2]]); func(y4,x[A.col[start+j+3]]); } for(; j < A.ld ; ++j){ func(y1,x[A.col[start+j]]); } F.addin(y[A.row[i]], y1); F.addin(y[A.row[i]], y2); F.addin(y[A.row[i]], y3); F.addin(y[A.row[i]], y4); } } template inline void fspmv(const Field & F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, Func && func, FieldCategories::UnparametricTag){ index_t start = 0; for(index_t i = 0 ; i < A.mRow ; ++i, start+=A.ld){ index_t j = 0; typename Field::Element y1 = 0, y2 = 0, y3 = 0, y4 = 0; for(; j < ROUND_DOWN(A.ld, 4) ; j+=4){ func(y1,x[A.col[start+j]]); func(y2,x[A.col[start+j+1]]); func(y3,x[A.col[start+j+2]]); func(y4,x[A.col[start+j+3]]); } for(; j < A.ld ; ++j){ func(y1,x[A.col[start+j]]); } y[A.row[i]] += y1+y2+y3+y4; } } }// ELL_R_details template inline void fspmv(const Field& F, const Sparse & A, typename Field::ConstElement_ptr x, const typename Field::Element & beta, typename Field::Element_ptr y){ sparse_details::init_y(F, A.m, beta, y, typename FieldTraits::category()); fspmv(F, A, x, y, typename FieldTraits::category()); } template inline void fspmv(const Field& F, 
const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag){ ell_r_details::fspmv(F, A, x, y, FieldCategories::GenericTag()); } template inline void fspmv(const Field& F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag){ ell_r_details::fspmv(F, A, x, y, FieldCategories::UnparametricTag()); } template inline void fspmv(const Field& F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag){ if(A.delayed){ ell_r_details::fspmv(F, A, x, y, FieldCategories::UnparametricTag()); freduce(F, A.m, y, 1); }else{ ell_r_details::fspmv(F, A, x, y, A.kmax); } } template inline void fspmv(const Field& F, const Sparse & A, typename Field::ConstElement_ptr x, const typename Field::Element & beta, typename Field::Element_ptr y){ sparse_details::init_y(F, A.m, beta, y, typename FieldTraits::category()); fspmv(F, A, x, y, typename FieldTraits::category()); } template inline void fspmv(const Field& F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag){ using Element = typename Field::Element; if(A.cst == 1){ ell_r_details::fspmv(F, A, x, y, [&F](Element & a, const Element & b){F.addin(a, b);}, FieldCategories::GenericTag()); }else if(A.cst == -1){ ell_r_details::fspmv(F, A, x, y, [&F](Element & a, const Element & b){F.subin(a, b);}, FieldCategories::GenericTag()); }else{ auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); fscal(F, A.n, A.cst, x, 1, x1, 1); ell_r_details::fspmv(F, A, x, y, [&F](Element & a, const Element & b){F.addin(a, b);}, FieldCategories::GenericTag()); fflas_delete(x1); } } template inline void fspmv(const Field& F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag){ using Element = typename Field::Element; if(A.cst == 1){ ell_r_details::fspmv(F, A, x, y, 
[](Element & a, const Element & b){a += b;}, FieldCategories::UnparametricTag()); }else if(A.cst == -1){ ell_r_details::fspmv(F, A, x, y, [](Element & a, const Element & b){a -= b;}, FieldCategories::UnparametricTag()); }else{ auto x1 = fflas_new(F, A.n, 1, Alignment::CACHE_LINE); fscal(F, A.n, A.cst, x, 1, x1, 1); ell_r_details::fspmv(F, A, x, y, [](Element & a, const Element & b){a += b;}, FieldCategories::UnparametricTag()); fflas_delete(x1); } } template inline void fspmv(const Field& F, const Sparse & A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag){ fspmv(F, A, x, y, FieldCategories::UnparametricTag()); freduce(F, A.m, y, 1); } template inline void sparse_delete(const Sparse & A){ fflas_delete(A.dat); fflas_delete(A.col); } template inline void sparse_delete(const Sparse & A){ fflas_delete(A.col); } template inline void sparse_init(const Field & F, Sparse & A, const IndexT * row, const IndexT * col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz){ // TODO // A.kmax = Protected::DotProdBoundClassic(F,F.one); // A.m = rowdim; // A.n = coldim; // A.nnz = nnz; // std::vector rows(A.m, 0); // for(uint64_t i = 0 ; i < A.nnz ; ++i) // rows[row[i]]++; // A.maxrow = *(std::max_element(rows.begin(), rows.end())); // A.ld = A.maxrow; // for(auto & x : rows) // if(x != 0) // A.mRow++; // if(A.kmax > A.maxrow) // A.delayed = true; // A.col = fflas_new(A.mRow*A.ld, Alignment::CACHE_LINE); // A.dat = fflas_new(F, rowdim*A.ld, 1, Alignment::CACHE_LINE); // for(size_t i = 0 ; i < rowdim*A.ld ; ++i){ // A.col[i] = 0; // F.assign(A.dat[i], F.zero); // } // size_t currow = row[0], it = 0; // for(size_t i = 0 ; i < nnz ; ++i){ // if(row[i] != currow){ // it = 0; // currow = row[i]; // } // A.col[row[i]*A.ld + it] = col[i]; // A.dat[row[i]*A.ld + it] = dat[i]; // ++it; // } } template inline void sparse_init(const Field & F, Sparse & A, const IndexT * row, const IndexT * col, typename 
Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz){ // TODO // A.kmax = Protected::DotProdBoundClassic(F,F.one); // A.m = rowdim; // A.n = coldim; // A.nnz = nnz; // std::vector rows(A.m, 0); // for(uint64_t i = 0 ; i < A.nnz ; ++i) // rows[row[i]]++; // A.maxrow = *(std::max_element(rows.begin(), rows.end())); // A.ld = A.maxrow; // if(A.kmax > A.maxrow) // A.delayed = true; // A.col = fflas_new(rowdim*A.ld, Alignment::CACHE_LINE); // for(size_t i = 0 ; i < rowdim*A.ld ; ++i){ // A.col[i] = 0; // } // size_t currow = row[0], it = 0; // for(size_t i = 0 ; i < nnz ; ++i){ // if(row[i] != currow){ // it = 0; // currow = row[i]; // } // A.col[row[i]*A.ld + it] = col[i]; // ++it; // } } } // FFLAS #endif // __FFLASFFPACK_fflas_ELL_R_spmv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_simd.h000066400000000000000000000060531274716147400241100ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_fspmv_ELL_simd.inl * NO DOC */ #ifndef __FFLASFFPACK_fflas_sparse_ELL_simd_H #define __FFLASFFPACK_fflas_sparse_ELL_simd_H namespace FFLAS { /* ELL_simd */ template struct Sparse<_Field, SparseMatrix_t::ELL_simd> { bool delayed = false; int chunk = 0; index_t m = 0; index_t n = 0; index_t ld = 0; uint64_t kmax = 0; uint64_t nnz = 0; uint64_t nElements = 0; uint64_t maxrow = 0; uint64_t nChunks = 0; index_t *col = nullptr; typename _Field::Element_ptr dat; }; template struct Sparse<_Field, SparseMatrix_t::ELL_simd_ZO> : public Sparse<_Field, SparseMatrix_t::ELL_simd> { typename _Field::Element cst = 1; }; template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz); template inline void sparse_delete(const Sparse &A); template inline void sparse_delete(const Sparse &A); } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_utils.inl" #include "fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_spmv.inl" #if defined(__FFLASFFPACK_USE_OPENMP) #include "fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_pspmv.inl" #endif // #include "fflas-ffpack/fflas/fflas_sparse/ell_simd_spmm.inl" #endif // __FFLASFFPACK_fflas_sparse_ELL_simd_Hfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_simd/000077500000000000000000000000001274716147400237335ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_simd/Makefile.am000066400000000000000000000021301274716147400257630ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/ell_simd pkgincludesub_HEADERS= \ ell_simd_spmv.inl \ ell_simd_pspmv.inl \ ell_simd_utils.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_pspmv.inl000066400000000000000000000670671274716147400274740ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_ELL_simd_pspmv_INL #define __FFLASFFPACK_fflas_sparse_ELL_simd_pspmv_INL #ifdef __FFLASFFPACK_USE_TBB #include "tbb/parallel_for.h" #include "tbb/blocked_range.h" #endif namespace FFLAS { namespace sparse_details_impl { template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; for (; j < A.ld; ++j) { for (index_t k = 0; k < A.chunk; ++k) { F.axpyin(y[i * A.chunk + k], dat[i * A.ld * A.chunk + j * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { for (index_t k = 0; k < A.chunk; ++k) { F.axpyin(y[i * A.chunk + k], dat[i * A.ld * A.chunk + j * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } } #endif } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { using simd = Simd; using vect_t = typename simd::vect_t; assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, 
(size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { dat1 = simd::load(dat + i * A.ld * A.chunk + j * A.chunk); dat2 = simd::load(dat + i * A.ld * A.chunk + (j + 1) * A.chunk); x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::fmadd(y1, dat1, x1); y2 = simd::fmadd(y2, dat2, x2); } for (; j < A.ld; ++j) { dat1 = simd::load(dat + i * A.ld * A.chunk + j * A.chunk); x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::fmadd(y1, dat1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::add(yy, simd::add(y1, y2))); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { dat1 = simd::load(dat + i * A.ld * A.chunk + j * A.chunk); dat2 = simd::load(dat + i * A.ld * A.chunk + (j + 1) * A.chunk); x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::fmadd(y1, dat1, x1); y2 = simd::fmadd(y2, dat2, x2); } for (; j < A.ld; ++j) { dat1 = simd::load(dat + i * A.ld * A.chunk + j * A.chunk); x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::fmadd(y1, dat1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::add(yy, simd::add(y1, y2))); } #endif } #endif // SIMD template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, 
FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + J * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[i * A.ld * A.chunk + j * A.chunk + k + 1] * x[col[i * A.ld * A.chunk + J * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[i * A.ld * A.chunk + j * A.chunk + k + 2] * x[col[i * A.ld * A.chunk + J * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[i * A.ld * A.chunk + j * A.chunk + k + 3] * x[col[i * A.ld * A.chunk + J * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + J * A.chunk + k]]; } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[i * A.ld * A.chunk + j * A.chunk + k + 1] * x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[i * A.ld * A.chunk + j * A.chunk + k + 2] * x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[i * A.ld * A.chunk + j * A.chunk + k + 3] * x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } #endif // TBB } 
#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t block = (A.ld) / kmax; // use DIVIDE_INTO from pfspmvgpu index_t chunk = A.chunk; using simd = Simd; using vect_t = typename simd::vect_t; vect_t X, Y, D, C, Q, TMP, NEGP, INVP, MIN, MAX, P; double p = (typename Field::Element)F.characteristic(); P = simd::set1(p); NEGP = simd::set1(-p); INVP = simd::set1(1 / p); MIN = simd::set1(F.minElement()); MAX = simd::set1(F.maxElement()); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, P, NEGP, INVP, MAX, MIN, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; index_t j_loc = 0; Y = simd::load(y + i * chunk); for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { D = simd::load(dat + i * A.chunk * A.ld + j * A.chunk); X = simd::gather(x, col + i * A.chunk * A.ld + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); } for (; j < A.ld; ++j) { D = simd::load(dat + i * A.chunk * A.ld + j * A.chunk); X = simd::gather(x, col + i * A.chunk * A.ld + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); simd::store(y + i * A.chunk, Y); } }); #else #pragma omp parallel for for (size_t i = 0; i < A.nChunks; ++i) { index_t j = 0; index_t j_loc = 0; Y = simd::load(y + i * chunk); for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { D = simd::load(dat + i * A.chunk * A.ld + j * A.chunk); X = simd::gather(x, col + i * A.chunk * A.ld + j * A.chunk); Y = simd::fmadd(Y, D, X); } 
simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); } for (; j < A.ld; ++j) { D = simd::load(dat + i * A.chunk * A.ld + j * A.chunk); X = simd::gather(x, col + i * A.chunk * A.ld + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); simd::store(y + i * A.chunk, Y); } #endif // TBB } #endif template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { index_t block = (A.ld) / kmax; // use DIVIDE_INTO from pfspmvgpu // index_t chunk = A.chunk; // size_t end = (A.m % chunk == 0) ? A.m : A.m + A.m % chunk; assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, P, NEGP, INVP, MAX, MIN, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; index_t j_loc = 0; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { for (size_t k = 0; k < A.chunk; ++k) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } for (size_t k = 0; k < A.chunk; ++k) F.reduce(y[i * A.chunk + k], y[i * A.chunk + k]); } for (; j < A.ld; ++j) { for (size_t k = 0; k < A.chunk; ++k) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } for (size_t k = 0; k < A.chunk; ++k) F.reduce(y[i * A.chunk + k], y[i * A.chunk + k]); } }); #else #pragma omp parallel for for (size_t i = 0; i < A.nChunks; ++i) { index_t j = 0; index_t j_loc = 0; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { for (size_t k = 0; k < A.chunk; ++k) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * 
A.ld * A.chunk + j * A.chunk + k]]; } } for (size_t k = 0; k < A.chunk; ++k) F.reduce(y[i * A.chunk + k], y[i * A.chunk + k]); } for (; j < A.ld; ++j) { for (size_t k = 0; k < A.chunk; ++k) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } for (size_t k = 0; k < A.chunk; ++k) F.reduce(y[i * A.chunk + k], y[i * A.chunk + k]); } #endif // TBB } template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { F.addin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); F.addin(y[i * A.chunk + k + 1], x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]); F.addin(y[i * A.chunk + k + 2], x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]); F.addin(y[i * A.chunk + k + 3], x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]); } for (; k < A.chunk; ++k) F.addin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { F.addin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); F.addin(y[i * A.chunk + k + 1], x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]); F.addin(y[i * A.chunk + k + 2], x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]); F.addin(y[i * A.chunk + k + 3], x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]); } for (; k < A.chunk; ++k) 
F.addin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } #endif } template inline void pfspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, dat, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { F.subin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); F.subin(y[i * A.chunk + k + 1], x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]); F.subin(y[i * A.chunk + k + 2], x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]); F.subin(y[i * A.chunk + k + 3], x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]); } for (; k < A.chunk; ++k) F.subin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { F.subin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); F.subin(y[i * A.chunk + k + 1], x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]); F.subin(y[i * A.chunk + k + 2], x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]); F.subin(y[i * A.chunk + k + 3], x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]); } for (; k < A.chunk; ++k) F.subin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } #endif } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmv_one_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, 
A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { dat1 = simd::load(dat + i * A.ld * A.chunk + j * A.chunk); dat2 = simd::load(dat + i * A.ld * A.chunk + (j + 1) * A.chunk); x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y1 = simd::add(y2, x2); } for (; j < A.ld; ++j) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::add(y1, dat1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::add(yy, simd::add(y1, y2))); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y1 = simd::add(y2, x2); } for (; j < A.ld; ++j) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::add(y1, dat1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::add(yy, simd::add(y1, y2))); } #endif } template inline void pfspmv_mone_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, 
(size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y1 = simd::add(y2, x2); } for (; j < A.ld; ++j) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::add(y1, dat1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::sub(yy, simd::add(y1, y2))); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y1 = simd::add(y2, x2); } for (; j < A.ld; ++j) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::add(y1, dat1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::sub(yy, simd::add(y1, y2))); } #endif } #endif // SIMD template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; for (; j < 
A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] += x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] += x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } #endif // TBB } template inline void pfspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nChunks, 2), [&F, &A, x, y, col](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] -= x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; 
++k) y[i * A.chunk + k] -= x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] -= x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] -= x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } #endif // TBB } } // ELL_simd_details } // FFLAS #endif // __FFLASFFPACK_fflas_ELL_simd_pspmv_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_spmv.inl000066400000000000000000000371261274716147400273050ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_ELL_simd_spmv_INL #define __FFLASFFPACK_fflas_sparse_ELL_simd_spmv_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { for (index_t k = 0; k < A.chunk; ++k) { F.axpyin(y[i * A.chunk + k], dat[i * A.ld * A.chunk + j * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; index_t chunk = A.chunk; for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { dat1 = simd::load(dat + i * A.ld * A.chunk + j * chunk); dat2 = simd::load(dat + i * A.ld * A.chunk + (j + 1) * chunk); x1 = simd::gather(x, col + i * A.ld * A.chunk + j * chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * chunk); y1 = simd::fmadd(y1, dat1, x1); y2 = simd::fmadd(y2, dat2, x2); } for (; j < A.ld; ++j) { dat1 = simd::load(dat + i * A.ld * A.chunk + j * chunk); x1 = simd::gather(x, col + i * A.ld * A.chunk + j * chunk); y1 = simd::fmadd(y1, dat1, x1); } yy = simd::load(y + i * 
chunk); simd::store(y + i * chunk, simd::add(yy, simd::add(y1, y2))); } } // #endif template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { for (index_t j = 0; j < A.ld; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[i * A.ld * A.chunk + j * A.chunk + k + 1] * x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[i * A.ld * A.chunk + j * A.chunk + k + 2] * x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[i * A.ld * A.chunk + j * A.chunk + k + 3] * x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t block = (A.ld) / kmax; // use DIVIDE_INTO from fspmvgpu index_t chunk = A.chunk; size_t end = (A.m % chunk == 0) ? 
A.m : A.m + A.m % chunk; using simd = Simd; using vect_t = typename simd::vect_t; vect_t X, Y, D, C, Q, TMP, NEGP, INVP, MIN, MAX, P; double p = (typename Field::Element)F.characteristic(); P = simd::set1(p); NEGP = simd::set1(-p); INVP = simd::set1(1 / p); MIN = simd::set1(F.minElement()); MAX = simd::set1(F.maxElement()); for (size_t i = 0; i < end / chunk; ++i) { index_t j = 0; index_t j_loc = 0; Y = simd::load(y + i * chunk); for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { D = simd::load(dat + i * A.chunk * A.ld + j * A.chunk); X = simd::gather(x, col + i * A.chunk * A.ld + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); } for (; j < A.ld; ++j) { D = simd::load(dat + i * A.chunk * A.ld + j * A.chunk); X = simd::gather(x, col + i * A.chunk * A.ld + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); simd::store(y + i * A.chunk, Y); } } // #endif template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t block = (A.ld) / kmax; // use DIVIDE_INTO from fspmvgpu index_t chunk = A.chunk; size_t end = (A.m % chunk == 0) ? 
A.m : A.m + A.m % chunk; for (size_t i = 0; i < end / chunk; ++i) { index_t j = 0; index_t j_loc = 0; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { for (size_t k = 0; k < A.chunk; ++k) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } for (size_t k = 0; k < A.chunk; ++k) F.reduce(y[i * A.chunk + k], y[i * A.chunk + k]); } for (; j < A.ld; ++j) { for (size_t k = 0; k < A.chunk; ++k) { y[i * A.chunk + k] += dat[i * A.ld * A.chunk + j * A.chunk + k] * x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } for (size_t k = 0; k < A.chunk; ++k) F.reduce(y[i * A.chunk + k], y[i * A.chunk + k]); } } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { F.addin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); F.addin(y[i * A.chunk + k + 1], x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]); F.addin(y[i * A.chunk + k + 2], x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]); F.addin(y[i * A.chunk + k + 3], x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]); } for (; k < A.chunk; ++k) F.addin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { 
index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { F.subin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); F.subin(y[i * A.chunk + k + 1], x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]); F.subin(y[i * A.chunk + k + 2], x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]); F.subin(y[i * A.chunk + k + 3], x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]); } for (; k < A.chunk; ++k) F.subin(y[i * A.chunk + k], x[col[i * A.ld * A.chunk + j * A.chunk + k]]); } } } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] += x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; for (; j < A.ld; ++j) { index_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] -= x[col[i * A.ld * A.chunk + j * A.chunk + k]]; y[i * A.chunk + k + 1] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] 
-= x[col[i * A.ld * A.chunk + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] -= x[col[i * A.ld * A.chunk + j * A.chunk + k + 3]]; } for (; k < A.chunk; ++k) y[i * A.chunk + k] -= x[col[i * A.ld * A.chunk + j * A.chunk + k]]; } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmv_one_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y1 = simd::add(y2, x2); } for (; j < A.ld; ++j) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::add(y1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::add(yy, simd::add(y1, y2))); } } template inline void fspmv_mone_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nChunks; ++i) { index_t j = 0; vect_t y1, y2, x1, x2, dat1, dat2, yy; y1 = simd::zero(); y2 = simd::zero(); for (; j < ROUND_DOWN(A.ld, 2); j += 2) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); x2 = simd::gather(x, col + i * A.ld * A.chunk + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y1 = simd::add(y2, x2); } for (; j 
< A.ld; ++j) { x1 = simd::gather(x, col + i * A.ld * A.chunk + j * A.chunk); y1 = simd::add(y1, x1); } yy = simd::load(y + i * A.chunk); simd::store(y + i * A.chunk, simd::sub(yy, simd::add(y1, y2))); } } // #endif } // ELL_simd_details } // FFLAS #endif // __FFLASFFPACK_fflas_ELL_simd_spmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/ell_simd/ell_simd_utils.inl000066400000000000000000000133031274716147400274470ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_ELL_simd_utils_INL #define __FFLASFFPACK_fflas_sparse_ELL_simd_utils_INL namespace FFLAS { template inline void sparse_delete(const Sparse &A) { fflas_delete(A.dat); fflas_delete(A.col); } template inline void sparse_delete(const Sparse &A) { fflas_delete(A.col); } template inline void sparse_print(const Sparse &A) { for (size_t i = 0; i < A.nChunks; ++i) { for (size_t k = 0; k < A.chunk; ++k) { std::cout << i *A.chunk + k << " : "; for (size_t j = 0; j < A.ld; ++j) { std::cout << A.dat[i * A.ld * A.chunk + j * A.chunk + k] << " "; } std::cout << std::endl; } } } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS using simd = Simd; A.chunk = simd::vect_size; #else A.chunk = 8; #endif A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; std::vector rows(A.m + 1, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i] + 1]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); A.ld = A.maxrow; if (A.kmax > A.maxrow) A.delayed = true; for (size_t i = 1; i <= A.m; ++i) { rows[i] += rows[i - 1]; } index_t m = (A.m % A.chunk == 0) ? 
A.m : ROUND_DOWN(A.m, A.chunk) + A.chunk; // cout << A.m << " " << ROUND_DOWN(A.m, simd::vect_size)+simd::vect_size << // " " << m/A.chunk << endl; A.nChunks = m / A.chunk; A.col = fflas_new(A.nChunks * A.chunk * A.ld, Alignment::CACHE_LINE); A.dat = fflas_new(F, A.nChunks * A.chunk * A.ld, 1, Alignment::CACHE_LINE); A.nElements = A.nChunks * A.chunk * A.ld; for (size_t i = 0; i < A.nChunks * A.chunk * A.ld; ++i) { A.col[i] = 0; F.assign(A.dat[i], F.zero); } for (size_t i = 0; i < A.nChunks; ++i) { for (size_t k = 0; k < A.chunk; ++k) { if (i * A.chunk + k < rowdim) { uint64_t start = rows[i * A.chunk + k], stop = rows[i * A.chunk + k + 1]; // cout << "start " << start << " stop " << stop << endl; for (size_t j = 0; j < stop - start; ++j) { A.dat[i * A.chunk * A.ld + j * A.chunk + k] = dat[start + j]; A.col[i * A.chunk * A.ld + j * A.chunk + k] = col[start + j]; } } } } } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS using simd = Simd; A.chunk = simd::vect_size; #else A.chunk = 8; #endif A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; std::vector rows(A.m + 1, 0); for (uint64_t i = 0; i < A.nnz; ++i) rows[row[i] + 1]++; A.maxrow = *(std::max_element(rows.begin(), rows.end())); A.ld = A.maxrow; if (A.kmax > A.maxrow) A.delayed = true; for (size_t i = 1; i <= A.m; ++i) { rows[i] += rows[i - 1]; } index_t m = (A.m % A.chunk == 0) ? 
A.m : ROUND_DOWN(A.m, A.chunk) + A.chunk; // cout << A.m << " " << ROUND_DOWN(A.m, simd::vect_size)+simd::vect_size << // " " << m/A.chunk << endl; A.nChunks = m / A.chunk; A.col = fflas_new(A.nChunks * A.chunk * A.ld, Alignment::CACHE_LINE); A.nElements = A.nChunks * A.chunk * A.ld; for (size_t i = 0; i < A.nChunks * A.chunk * A.ld; ++i) { A.col[i] = 0; } for (size_t i = 0; i < A.nChunks; ++i) { for (size_t k = 0; k < A.chunk; ++k) { if (i * A.chunk + k < rowdim) { uint64_t start = rows[i * A.chunk + k], stop = rows[i * A.chunk + k + 1]; // cout << "start " << start << " stop " << stop << endl; for (size_t j = 0; j < stop - start; ++j) { A.col[i * A.chunk * A.ld + j * A.chunk + k] = col[start + j]; } } } } } } #endiffflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo.h000066400000000000000000000042631274716147400236130ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
 */

/** @file fflas/fflas_sparse/hyb_zo.h
 * Storage descriptor for the HYB_ZO sparse format: one matrix kept as up to
 * three sub-matrices (see the pointer members of the struct below).
 */

#ifndef __FFLASFFPACK_fflas_sparse_HYB_ZO_H
#define __FFLASFFPACK_fflas_sparse_HYB_ZO_H

namespace FFLAS {

/* HYB_ZO */

/*
 * Aggregate describing a HYB_ZO matrix. All members have in-class defaults,
 * so a value-initialised object has every counter at zero and every
 * sub-matrix pointer null.
 */
template struct Sparse<_Field, SparseMatrix_t::HYB_ZO> {
    using Field = _Field;
    typedef Sparse<_Field, SparseMatrix_t::HYB_ZO> Self_t;
    bool delayed = false;
    uint64_t kmax = 0;
    index_t m = 0; /* presumably the row dimension -- confirm against sparse_init */
    index_t n = 0; /* presumably the column dimension -- confirm against sparse_init */
    uint64_t nnz = 0;
    uint64_t maxrow = 0;
    uint64_t nElements = 0;
    /* Sub-matrices, each null until set. The CSR one carries explicit values;
     * the two CSR_ZO ones are named after the constants +1 and -1. */
    Sparse<_Field, SparseMatrix_t::CSR> *dat = nullptr;
    Sparse<_Field, SparseMatrix_t::CSR_ZO> *one = nullptr;
    Sparse<_Field, SparseMatrix_t::CSR_ZO> *mone = nullptr;
};

} // FFLAS

#include "fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_utils.inl"
#include "fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmv.inl"
#include "fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmm.inl"
#if defined(__FFLASFFPACK_USE_OPENMP)
#include "fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmv.inl"
#include "fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmm.inl"
#endif

#endif // __FFLASFFPACK_fflas_sparse_HYB_ZO_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo/000077500000000000000000000000001274716147400234355ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo/Makefile.am000066400000000000000000000022061274716147400254710ustar00rootroot00000000000000
# Copyright (c) 2014 FFLAS-FFPACK
# written by Bastien Vialla
#
#
# ========LICENCE========
# This file is part of the library FFLAS-FFPACK.
#
# FFLAS-FFPACK is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/hyb_zo pkgincludesub_HEADERS= \ hyb_zo_spmv.inl \ hyb_zo_spmm.inl \ hyb_zo_pspmm.inl \ hyb_zo_pspmv.inl \ hyb_zo_utils.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmm.inl000066400000000000000000000157171274716147400266620ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_HYB_ZO_pspmm_INL #define __FFLASFFPACK_fflas_sparse_HYB_ZO_pspmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag) { if (A.one != nullptr) sparse_details_impl::pfspmm_one(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); if (A.mone != nullptr) sparse_details_impl::pfspmm_mone(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); if (A.dat != nullptr) sparse_details_impl::pfspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::pfspmm_one(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmm_mone(F, *(A.mone), blockSize, x, ldx, y, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::pfspmm_one_simd_aligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmm_mone_simd_aligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmm_simd_aligned(F, *(A.dat), blockSize, x, ldx, y, ldy, 
FieldCategories::UnparametricTag()); } template inline void pfspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::pfspmm_one_simd_unaligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmm_mone_simd_unaligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmm_simd_unaligned(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } #endif template inline void pfspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::pfspmm_one(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmm_mone(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax); } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::pfspmm_one_simd_aligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmm_mone_simd_aligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmm_simd_aligned(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax); } template inline void pfspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename 
Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::pfspmm_one_simd_unaligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmm_mone_simd_unaligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmm_simd_unaligned(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax); } #endif } // HYB_ZO_details } // FFLAS #endif // __FFLASFFPACK_fflas_HYB_ZO_pspmm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_pspmv.inl000066400000000000000000000057741274716147400266750ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_HYB_ZO_pspmv_INL #define __FFLASFFPACK_fflas_sparse_HYB_ZO_pspmv_INL namespace FFLAS { namespace sparse_details_impl { template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag) { if (A.one != nullptr) sparse_details_impl::pfspmv_one(F, *(A.one), x, y, FieldCategories::GenericTag()); if (A.mone != nullptr) sparse_details_impl::pfspmv_mone(F, *(A.mone), x, y, FieldCategories::GenericTag()); if (A.dat != nullptr) sparse_details_impl::pfspmv(F, *(A.dat), x, y, FieldCategories::GenericTag()); } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::pfspmv_one(F, *(A.one), x, y, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmv_mone(F, *(A.mone), x, y, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmv(F, *(A.dat), x, y, FieldCategories::UnparametricTag()); } template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::pfspmv_one(F, *(A.one), x, y, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::pfspmv_mone(F, *(A.mone), x, y, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::pfspmv(F, *(A.dat), x, y, kmax); } } // HYB_ZO_details } // FFLAS #endif // __FFLASFFPACK_fflas_HYB_ZO_pspmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmm.inl000066400000000000000000000156551274716147400265030ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * 
Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_HYB_ZO_spmm_INL #define __FFLASFFPACK_fflas_sparse_HYB_ZO_spmm_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::GenericTag) { if (A.one != nullptr) sparse_details_impl::fspmm_one(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); if (A.mone != nullptr) sparse_details_impl::fspmm_mone(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); if (A.dat != nullptr) sparse_details_impl::fspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::GenericTag()); } template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::fspmm_one(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmm_mone(F, *(A.mone), blockSize, x, 
ldx, y, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::fspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::fspmm_one_simd_aligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmm_mone_simd_aligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::fspmm_simd_aligned(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::fspmm_one_simd_unaligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmm_mone_simd_unaligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::fspmm_simd_unaligned(F, *(A.dat), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); } #endif template inline void fspmm(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::fspmm_one(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmm_mone(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) 
sparse_details_impl::fspmm(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax); } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmm_simd_aligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::fspmm_one_simd_aligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmm_mone_simd_aligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::fspmm_simd_aligned(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax); } template inline void fspmm_simd_unaligned(const Field &F, const Sparse &A, size_t blockSize, typename Field::ConstElement_ptr x, int ldx, typename Field::Element_ptr y, int ldy, uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::fspmm_one_simd_unaligned(F, *(A.one), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmm_mone_simd_unaligned(F, *(A.mone), blockSize, x, ldx, y, ldy, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::fspmm_simd_unaligned(F, *(A.dat), blockSize, x, ldx, y, ldy, kmax); } #endif } // HYB_ZO_details } // FFLAS #endif // __FFLASFFPACK_fflas_HYB_ZO_spmm_INL fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_spmv.inl000066400000000000000000000057521274716147400265110ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_HYB_ZO_spmv_INL #define __FFLASFFPACK_fflas_sparse_HYB_ZO_spmv_INL namespace FFLAS { namespace sparse_details_impl { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::GenericTag) { if (A.one != nullptr) sparse_details_impl::fspmv_one(F, *(A.one), x, y, FieldCategories::GenericTag()); if (A.mone != nullptr) sparse_details_impl::fspmv_mone(F, *(A.mone), x, y, FieldCategories::GenericTag()); if (A.dat != nullptr) sparse_details_impl::fspmv(F, *(A.dat), x, y, FieldCategories::GenericTag()); } template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::UnparametricTag) { if (A.one != nullptr) sparse_details_impl::fspmv_one(F, *(A.one), x, y, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmv_mone(F, *(A.mone), x, y, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::fspmv(F, *(A.dat), x, y, FieldCategories::UnparametricTag()); } template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, 
uint64_t kmax) { if (A.one != nullptr) sparse_details_impl::fspmv_one(F, *(A.one), x, y, FieldCategories::UnparametricTag()); if (A.mone != nullptr) sparse_details_impl::fspmv_mone(F, *(A.mone), x, y, FieldCategories::UnparametricTag()); if (A.dat != nullptr) sparse_details_impl::fspmv(F, *(A.dat), x, y, kmax); } } // HYB_ZO_details } // FFLAS #endif // __FFLASFFPACK_fflas_HYB_ZO_spmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/hyb_zo/hyb_zo_utils.inl000066400000000000000000000105021274716147400266510ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_HYB_ZO_utils_INL #define __FFLASFFPACK_fflas_sparse_HYB_ZO_utils_INL namespace FFLAS { // #define HYB_ZO_DEBUG 1 template inline void sparse_delete(const Sparse &A) { if (A.dat != nullptr) sparse_delete(*(A.dat)); if (A.one != nullptr) sparse_delete(*(A.one)); if (A.mone != nullptr) sparse_delete(*(A.mone)); } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) { A.m = rowdim; A.n = coldim; A.nnz = nnz; A.delayed = true; A.nElements = nnz; uint64_t nOnes = 0, nMOnes = 0, nOthers = 0; for (uint64_t i = 0; i < nnz; ++i) { if (F.isOne(dat[i])) nOnes++; else if (F.isMOne(dat[i])) nMOnes++; else nOthers++; } typename Field::Element_ptr dat2; index_t *colOne = nullptr, *colMOne = nullptr, *colOther = nullptr, *rowOne = nullptr, *rowMOne = nullptr, *rowOther = nullptr; if (nOnes) { colOne = fflas_new(nOnes, Alignment::CACHE_LINE); rowOne = fflas_new(nOnes, Alignment::CACHE_LINE); } if (nMOnes) { colMOne = fflas_new(nMOnes, Alignment::CACHE_LINE); rowMOne = fflas_new(nMOnes, Alignment::CACHE_LINE); } if (nOthers) { dat2 = fflas_new(F, nOthers, 1, Alignment::CACHE_LINE); colOther = fflas_new(nOthers, Alignment::CACHE_LINE); rowOther = fflas_new(nOthers, Alignment::CACHE_LINE); } uint64_t itOne = 0, itMOne = 0, itOther = 0; for (uint64_t i = 0; i < nnz; ++i) { if (F.isOne(dat[i])) { colOne[itOne] = col[i]; rowOne[itOne] = row[i]; ++itOne; } else if (F.isMOne(dat[i])) { colMOne[itMOne] = col[i]; rowMOne[itMOne] = row[i]; ++itMOne; } else { dat2[itOther] = dat[i]; colOther[itOther] = col[i]; rowOther[itOther] = row[i]; ++itOther; } } if (nOnes) { A.one = new Sparse(); sparse_init(F, *(A.one), rowOne, colOne, nullptr, rowdim, coldim, nOnes); } if (nMOnes) { A.mone = new Sparse(); sparse_init(F, *(A.mone), rowMOne, colMOne, nullptr, rowdim, coldim, nMOnes); A.mone->cst = -1; } if (nOthers) { A.dat = new 
Sparse(); sparse_init(F, *(A.dat), rowOther, colOther, dat2, rowdim, coldim, nOthers); } if (nOnes) { fflas_delete(colOne); fflas_delete(rowOne); } if (nMOnes) { fflas_delete(colMOne); fflas_delete(rowMOne); } if (nOthers) { fflas_delete(colOther); fflas_delete(rowOther); fflas_delete(dat2); } } template std::ostream& operator<<(std::ostream& os, const Sparse<_Field, SparseMatrix_t::HYB_ZO>& A) { return sparse_print(os << "non-ones: ", *(A.dat)); } } #endif fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/read_sparse.h000066400000000000000000000322061274716147400246070ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * * Written by Bastien Vialla * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas/fflas_sparse/read_sparse.h */ #ifndef __FFLASFFPACK_fflas_fflas_sparse_read_sparse_H #define __FFLASFFPACK_fflas_fflas_sparse_read_sparse_H #include "fflas-ffpack/fflas-ffpack-config.h" // #include // #include #include /* getline */ #include // #include #include #include /* istream_iterator */ namespace FFLAS { namespace details_spmv { template struct Coo { private: using Self = Coo; public: typename Field::Element val = 0; index_t col = 0; index_t row = 0; bool deleted = false; Coo() = default; Coo(typename Field::Element v, index_t r, index_t c) : val(v), col(c), row(r) {} Coo(const Self &) = default; Coo(Self &&) = default; Self &operator=(const Self &) = default; Self &operator=(Self &&) = default; }; } // details_spmv } // FFLAS namespace FFLAS { template void readSmsFormat(const std::string &path, const Field &f, index_t *&row, index_t *&col, typename Field::Element_ptr &val, index_t &rowdim, index_t &coldim, uint64_t &nnz) { using namespace details_spmv; std::ifstream file(path, std::ios::in); std::string line; std::vector> data; while (true) { /* comments ? 
*/ std::getline(file,line); if (line.empty()) { continue ; } std::istringstream ligne (line); std::string comm ; if (ligne >> comm ) { if (comm[0] != '%') { break; } } else { std::cerr << " the impossible happened, continuing for now " << std::endl; break; } } bool sms = false ; std::istringstream ligne (line); std::string nbnz_s ; if (ligne >> rowdim >> coldim >> nbnz_s) { if (nbnz_s == "M") { sms = true ; // nnz = 0; } else nnz = std::strtoull(nbnz_s.c_str(),NULL,0); } else { std::cerr << "file " << path << " is not in sms/smf format " << line << std::endl; exit(1); } row = fflas_new(rowdim+1); std::memset(row,0, sizeof(index_t)*(rowdim+1)); assert(!row[0] && !row[rowdim]); std::vector colid((sms)?0:nnz); std::vector dat((sms)?0:nnz); /* got header */ if (!rowdim || !coldim) exit(-1) ; if (!sms && !nnz) exit(-1) ; size_t i=0,l,c ; int64_t d ; while (true) { if ((!sms) && (i == nnz)){ break ; } std::getline(file,line); // std::cout << i << ',' << nnz << std::endl; if (file.bad() || file.eof()) exit(-3); if (line.empty()){ continue; } std::istringstream lign (line); if (lign >> l >> c >> d){ // std::cout << l << ' ' << c << ' ' << d << std::endl; if (sms) { if (l == 0 && c == 0 && d == 0) break ; // nnz ++; } typename Field::Element v; assert(l && c); f.init(v, d); if (!f.isZero(v)) { if (!sorted) { data.emplace_back(v, l-1, c-1); } else { row[l] += 1 ; if (!sms) { colid[i] = c-1 ; dat[i] = v ; } else { colid.push_back(c-1); dat.push_back(v); } } } } else { exit(1); } ++i ; } if (sms) { nnz=dat.size(); } assert(i == nnz); col = fflas_new(nnz); val = fflas_new(f, nnz, 1); if (!sorted) { assert(nnz == data.size()); std::sort(data.begin(), data.end(), [](const Coo &a, const Coo &b) { return (a.row < b.row) || ((a.row == b.row) && (a.col < b.col)); }); auto rowmax = (std::max_element(data.begin(), data.end(), [](const Coo &a, const Coo &b) { return a.row < b.row; }))->row; if (rowdim != rowmax + 1) { std::cout << "Matrix row dimension change : " << rowdim << " -> " << 
rowmax << std::endl; rowdim = rowmax; } for (size_t j = 0, end = data.size(); j < end; ++j) { val[j] = data[j].val; col[j] = data[j].col; row[data[j].row+1]+=1; } } else { assert(nnz==dat.size()); for (size_t j = 0, end = nnz; j < end; ++j) { val[j] = dat[j]; col[j] = colid[j]; } } for (size_t j = 0, end = rowdim ; j < end; ++j) { row[j+1] += row[j] ; } } template void readSprFormat(const std::string &path, const Field &f, index_t *&row, index_t *&col, typename Field::Element_ptr &val, index_t &rowdim, index_t &coldim, uint64_t &nnz) { using namespace details_spmv; std::ifstream file(path, std::ios::in); std::vector tokens; std::string line; // while(std::getline(file, line) && line.size()!=0); std::getline(file, line); std::istringstream is(line); // std::cout << "line : " << line << std::endl; std::copy(std::istream_iterator(is), std::istream_iterator(), std::back_inserter>(tokens)); // std::cout << tokens.size() << std::endl; // std::cout << " " << std::stoull(tokens[0]) << " " << std::stoull(tokens[1]) << std::endl; rowdim = static_cast(std::stoull(tokens[0])); coldim = static_cast(std::stoull(tokens[1])); std::vector> data; nnz = 0; uint64_t itLine = 0; while (std::getline(file, line)) { tokens.resize(0); std::istringstream iss(line); std::copy(std::istream_iterator(iss), std::istream_iterator(), std::back_inserter>(tokens)); // if (!(tokens[0] == "0" && tokens[1] == "0" && tokens[2] == "0")) { uint64_t nElements = std::stoull(tokens[0]); for (uint64_t i = 0; i < nElements; ++i) { index_t c = std::stoull(tokens[2 * i + 1]) - 1; typename Field::Element v; int64_t vtmp = std::stoll(tokens[2 * (i + 1)]); f.init(v, vtmp); data.emplace_back(v, itLine, c); } // typename Field::Element v; // f.init(v, std::stol(tokens[2])); // index_t r = (index_t)(std::stoull(tokens[0])) - 1; // index_t c = (index_t)(std::stoull(tokens[1])) - 1; // data.emplace_back(v, r, c); // } ++itLine; } std::sort(data.begin(), data.end(), [](const Coo &a, const Coo &b) { return (a.row < b.row) 
|| ((a.row == b.row) && (a.col < b.col)); }); auto rowmax = (std::max_element(data.begin(), data.end(), [](const Coo &a, const Coo &b) { return a.row < b.row; }))->row; if (rowdim != rowmax + 1) { std::cout << "Matrix row dimension change : " << rowdim << " -> " << rowmax << std::endl; rowdim = rowmax; } row = fflas_new(data.size()); col = fflas_new(data.size()); val = fflas_new(f, data.size(), 1); nnz = data.size(); std::cout << "nnz : " << nnz << std::endl; for (size_t i = 0, end = data.size(); i < end; ++i) { val[i] = data[i].val; col[i] = data[i].col; row[i] = data[i].row; } } #define DNS_BIN_VER 0 #define mask_t uint64_t template struct readMyMachineType { typedef typename Field::Element Element ; typedef typename Field::Element_ptr Element_ptr ; void operator() (const Field &F, Element & modulo, Element_ptr val, std::ifstream & file, const uint64_t dims, const mask_t data_type, const mask_t field_desc); }; template struct readMyMachineType { typedef typename Field::Element Element ; typedef typename Field::Element_ptr Element_ptr ; void operator() (const Field &F, Element & modulo, Element_ptr val, std::ifstream & file, const uint64_t dims, const mask_t data_type, const mask_t field_desc); }; template void readMyMachineType:: operator() (const Field &F, Element & modulo, Element_ptr val, std::ifstream & file, const uint64_t dims, const mask_t data_type, const mask_t field_desc) { if (field_desc ==1) { /* modulo */ T modulo_read ; file.read((char*) &modulo_read, sizeof(T)); F.init(modulo,modulo_read); } /* do something with field_desc and multiprec... */ T * data_read = fflas_new(dims); file.read((char*)data_read,sizeof(T)); /* TODO freduce ? 
*/ for (size_t i = 0 ; i< dims ; ++i) { F.init(val[i],data_read[i]); } } template void readMyMachineType:: operator() (const Field &F, typename Field::Element & modulo, typename Field::Element_ptr val, std::ifstream & file, const uint64_t dims, const mask_t data_type, const mask_t field_desc) { /* need to use FILE * instead of std::ifstream */ throw("not implemented, use mpz_in_raw, but FILE*..."); } template std::enable_if::value,int> getDataType() { return (1<<(sizeof(T)-1))+ std::is_unsigned::value ; } template std::enable_if::value,int> getDataType() { return (1<<8)+std::is_same::value ; } template std::enable_if::value,int> getDataType() { return (1<<16) ; } template int getDataType() { return -1 ; } template void readMachineType(const Field &F, typename Field::Element & modulo, typename Field::Element_ptr val, std::ifstream & file, const uint64_t dims, const mask_t data_type, const mask_t field_desc) { // switch(data_type) { // case (1<<0) + 0 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<0) + 1 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<1) + 0 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<1) + 1 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<2) + 0 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<2) + 0 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<3) + 0 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<3) + 0 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<8) : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<8)+1 : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // case (1<<16) : // readMyMachineType() (F,val, modulo, file,dims,data_type,field_desc); // default : // throw("bad data type 
descriptor"); // } } template void readDnsFormat(const std::string &path, const Field &F, index_t &rowdim, index_t &coldim, typename Field::Element_ptr &val) { std::ifstream file(path, std::ifstream::binary); mask_t magic, field_desc, data_type ; typename Field::Element modulo ; file.read((char*) &magic , sizeof(int64_t)) ; if (magic != DNS_BIN_VER) { throw("bad version"); } file.read((char*) &field_desc, sizeof(int64_t)) ; file.read((char*) &data_type , sizeof(int64_t)) ; file.read((char*) &rowdim , sizeof(int64_t)) ; file.read((char*) &coldim , sizeof(int64_t)) ; val = fflas_new(F,rowdim*coldim,1); readMachineType(F,val, modulo, file,rowdim*coldim,field_desc,data_type); } template void writeDnsFormat(const std::string &path, const Field &F, const index_t &rowdim, const index_t &coldim, typename Field::Element_ptr A, index_t ldA) { typedef typename Field::Element Element ; std::ofstream file(path, std::ofstream::binary); mask_t field_desc = getFieldDesc(F); mask_t magic = DNS_BIN_VER ; mask_t data_type = getDataType(F); Element modulo ; file.write((char*) &magic , sizeof(int64_t)) ; file.write((char*) &field_desc, sizeof(int64_t)) ; file.write((char*) &data_type , sizeof(int64_t)) ; file.write((char*) &rowdim , sizeof(int64_t)) ; file.write((char*) &coldim , sizeof(int64_t)) ; // writeMachineType(F,A, modulo, file,rowdim,coldim,lda,field_desc,data_type); } }// FFLAS #endif /* __FFLASFFPACK_fflas_fflas_sparse_read_sparse_H */ fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/sell.h000066400000000000000000000045111274716147400232540ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas/fflas_fspmv_sell.inl * NO DOC */ #ifndef __FFLASFFPACK_fflas_sparse_sell_H #define __FFLASFFPACK_fflas_sparse_sell_H namespace FFLAS { /* SELL */ template struct Sparse<_Field, SparseMatrix_t::SELL> { using Field = _Field; bool delayed = false; int chunk = 0; index_t kmax = 0; index_t m = 0; index_t n = 0; index_t maxrow = 0; index_t sigma = 0; index_t nChunks = 0; uint64_t nnz = 0; uint64_t nElements = 0; index_t *perm = nullptr; uint64_t *st = nullptr; index_t *chunkSize = nullptr; index_t *col = nullptr; typename _Field::Element_ptr dat; }; template struct Sparse<_Field, SparseMatrix_t::SELL_ZO> : public Sparse<_Field, SparseMatrix_t::SELL> { using Field = _Field; typename _Field::Element cst = 1; }; } // FFLAS #include "fflas-ffpack/fflas/fflas_sparse/sell/sell_utils.inl" #include "fflas-ffpack/fflas/fflas_sparse/sell/sell_spmv.inl" #if defined(__FFLASFFPACK_USE_OPENMP) #include "fflas-ffpack/fflas/fflas_sparse/sell/sell_pspmv.inl" #endif // #include "fflas-ffpack/fflas/fflas_sparse/sell/sell_spmm.inl" // #include "fflas-ffpack/fflas/fflas_sparse/sell/sell_pspmm.inl" #endif // 
__FFLASFFPACK_fflas_sparse_SELL_Hfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/sell/000077500000000000000000000000001274716147400231025ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/sell/Makefile.am000066400000000000000000000021101274716147400251300ustar00rootroot00000000000000# Copyright (c) 2014 FFLAS-FFPACK # written by Bastien Vialla # # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/fflas/fflas_sparse/sell pkgincludesub_HEADERS= \ sell_spmv.inl \ sell_utils.inl \ sell_pspmv.inl fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/sell/sell_pspmv.inl000066400000000000000000000721231274716147400257770ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_sell_pspmv_INL #define __FFLASFFPACK_fflas_sparse_sell_pspmv_INL #ifdef __FFLASFFPACK_USE_TBB #include "tbb/parallel_for.h" #include "tbb/blocked_range.h" #endif namespace FFLAS { namespace sparse_details_impl { template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, col, st, dat, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.axpyin(y[i * A.chunk + k], dat[start + j * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } }); #else #pragma omp parallel for for 
(index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.axpyin(y[i * A.chunk + k], dat[start + j * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } #endif } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, dat, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2, dat1, dat2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { dat1 = simd::load(dat + start + j * A.chunk); dat2 = simd::load(dat + start + (j + 1) * A.chunk); x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::fmadd(y1, dat1, x1); y2 = simd::fmadd(y2, dat2, x2); } if (size % 2 != 0) { dat1 = simd::load(dat + start + j * A.chunk); x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::fmadd(y1, dat1, x1); } simd::store(y + i * A.chunk, simd::add(simd::load(y + i * A.chunk), simd::add(y1, y2))); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2, dat1, dat2; y1 = simd::zero(); y2 = 
simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { dat1 = simd::load(dat + start + j * A.chunk); dat2 = simd::load(dat + start + (j + 1) * A.chunk); x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::fmadd(y1, dat1, x1); y2 = simd::fmadd(y2, dat2, x2); } if (size % 2 != 0) { dat1 = simd::load(dat + start + j * A.chunk); x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::fmadd(y1, dat1, x1); } simd::store(y + i * A.chunk, simd::add(simd::load(y + i * A.chunk), simd::add(y1, y2))); } #endif // TBB } #endif // SIMD template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, dat, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * A.chunk + k + 1] * x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * A.chunk + k + 2] * x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * A.chunk + k + 3] * x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; } } } }); #else 
#pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * A.chunk + k + 1] * x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * A.chunk + k + 2] * x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * A.chunk + k + 3] * x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; } } } #endif // TBB } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const int64_t kmax) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t chunk = A.chunk; using simd = Simd; using vect_t = typename simd::vect_t; vect_t X, Y, D, C, Q, TMP, NEGP, INVP, MIN, MAX, P; double p = (typename Field::Element)F.characteristic(); P = simd::set1(p); NEGP = simd::set1(-p); INVP = simd::set1(1 / p); MIN = simd::set1(F.minElement()); MAX = simd::set1(F.maxElement()); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, dat, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; index_t j_loc = 0; Y = simd::load(y + i * A.chunk); index_t size = chunkSize[i]; index_t start = st[i]; index_t block = size / kmax; for (size_t l = 0; l < block; 
++l) { j_loc += kmax; for (; j < j_loc; ++j) { D = simd::load(dat + start + j * A.chunk); X = simd::gather(x, col + start + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); } for (; j < size; ++j) { D = simd::load(dat + start + j * A.chunk); X = simd::gather(x, col + start + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); simd::store(y + i * A.chunk, Y); } }); #else #pragma omp parallel for for (size_t i = 0; i < A.nChunks; ++i) { index_t j = 0; index_t j_loc = 0; Y = simd::load(y + i * A.chunk); index_t size = chunkSize[i]; index_t start = st[i]; index_t block = size / kmax; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { D = simd::load(dat + start + j * A.chunk); X = simd::gather(x, col + start + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); } for (; j < size; ++j) { D = simd::load(dat + start + j * A.chunk); X = simd::gather(x, col + start + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); simd::store(y + i * A.chunk, Y); } #endif // TBB } #endif // SIMD template inline void pfspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const int64_t kmax) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t chunk = A.chunk; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, &x, &y, st, col, dat, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t j = 0; index_t j_loc = 0; index_t size = chunkSize[i]; index_t start = st[i]; 
index_t block = size / kmax; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * A.chunk + k + 1] * x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * A.chunk + k + 2] * x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * A.chunk + k + 3] * x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; } } for (size_t k = 0; k < size; ++k) { F.reduce(y[i * A.chunk + k]); } } for (; j < size; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * A.chunk + k + 1] * x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * A.chunk + k + 2] * x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * A.chunk + k + 3] * x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; } } for (size_t k = 0; k < size; ++k) { F.reduce(y[i * A.chunk + k]); } } }); #else #pragma omp parallel for for (size_t i = 0; i < A.nChunks; ++i) { index_t j = 0; index_t j_loc = 0; index_t size = chunkSize[i]; index_t start = st[i]; index_t block = size / kmax; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * A.chunk + k + 1] * x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * A.chunk + k + 2] * x[col[start + j * A.chunk + k + 2]]; y[i * 
A.chunk + k + 3] += dat[start + j * A.chunk + k + 3] * x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; } } for (size_t k = 0; k < size; ++k) { F.reduce(y[i * A.chunk + k]); } } for (; j < size; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * A.chunk + k + 1] * x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * A.chunk + k + 2] * x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * A.chunk + k + 3] * x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; } } for (size_t k = 0; k < size; ++k) { F.reduce(y[i * A.chunk + k]); } } #endif // TBB } template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.addin(y[i * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < 
A.chunk; ++k) { F.addin(y[i * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } #endif } template inline void pfspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.subin(y[i * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.subin(y[i * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } #endif } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void pfspmv_one_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = 
r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y2 = simd::add(y2, x2); } if (size % 2 != 0) { x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::add(y1, x1); } simd::store(y + i * A.chunk, simd::add(simd::load(y + i * A.chunk), simd::add(y1, y2))); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y2 = simd::add(y2, x2); } if (size % 2 != 0) { x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::add(y1, x1); } simd::store(y + i * A.chunk, simd::add(simd::load(y + i * A.chunk), simd::add(y1, y2))); } #endif // TBB } template inline void pfspmv_mone_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; 
for (; j < ROUND_DOWN(size, 2); j += 2) { x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y2 = simd::add(y2, x2); } if (size % 2 != 0) { x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::add(y1, x1); } simd::store(y + i * A.chunk, simd::sub(simd::load(y + i * A.chunk), simd::add(y1, y2))); } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y2 = simd::add(y2, x2); } if (size % 2 != 0) { x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::add(y1, x1); } simd::store(y + i * A.chunk, simd::sub(simd::load(y + i * A.chunk), simd::add(y1, y2))); } #endif // TBB } #endif // SIMD template inline void pfspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; j++) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += x[col[start + j * A.chunk + k + 2]]; 
y[i * A.chunk + k + 3] += x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += x[col[start + j * A.chunk + k]]; } } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; j++) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += x[col[start + j * A.chunk + k]]; } } } #endif // TBB } template inline void pfspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); #ifdef __FFLASFFPACK_USE_TBB tbb::parallel_for(tbb::blocked_range(0, A.nbChunks, 2), [&F, &A, x, y, st, col, chunkSize](const tbb::blocked_range &r) { for (index_t i = r.begin(), end = r.end(); i < end; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; j++) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] -= x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] -= x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] -= x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] -= x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] -= x[col[start + j * A.chunk + k]]; } } } }); #else #pragma omp parallel for for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; 
for (index_t j = 0; j < size; j++) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] -= x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] -= x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] -= x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] -= x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] -= x[col[start + j * A.chunk + k]]; } } } #endif // TBB } } // SELL_details } // FFLAS #endif // __FFLASFFPACK_fflas_SELL_spmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/sell/sell_spmv.inl000066400000000000000000000415671274716147400256270ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_sparse_sell_spmv_INL #define __FFLASFFPACK_fflas_sparse_sell_spmv_INL // #define SELL_DEBUG 1 namespace FFLAS { namespace sparse_details_impl { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.axpyin(y[i * A.chunk + k], dat[start + j * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2, dat1, dat2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { dat1 = simd::load(dat + start + j * A.chunk); dat2 = simd::load(dat + start + (j + 1) * A.chunk); x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * 
A.chunk); y1 = simd::fmadd(y1, dat1, x1); y2 = simd::fmadd(y2, dat2, x2); } if (size % 2 != 0) { dat1 = simd::load(dat + start + j * A.chunk); x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::fmadd(y1, dat1, x1); } simd::store(y + i * A.chunk, simd::add(simd::load(y + i * A.chunk), simd::add(y1, y2))); } } // #endif template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * A.chunk + k + 1] * x[col[start + j * A.chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * A.chunk + k + 2] * x[col[start + j * A.chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * A.chunk + k + 3] * x[col[start + j * A.chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * A.chunk + k] * x[col[start + j * A.chunk + k]]; } } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmv_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); 
assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t chunk = A.chunk; using simd = Simd; using vect_t = typename simd::vect_t; vect_t X, Y, D, C, Q, TMP, NEGP, INVP, MIN, MAX, P; double p = (typename Field::Element)F.characteristic(); P = simd::set1(p); NEGP = simd::set1(-p); INVP = simd::set1(1 / p); MIN = simd::set1(F.minElement()); MAX = simd::set1(F.maxElement()); for (size_t i = 0; i < A.nChunks; ++i) { index_t j = 0; index_t j_loc = 0; Y = simd::load(y + i * chunk); index_t size = chunkSize[i]; index_t start = st[i]; index_t block = size / kmax; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { D = simd::load(dat + start + j * A.chunk); X = simd::gather(x, col + start + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); } for (; j < size; ++j) { D = simd::load(dat + start + j * A.chunk); X = simd::gather(x, col + start + j * A.chunk); Y = simd::fmadd(Y, D, X); } simd::mod(Y, P, INVP, NEGP, MIN, MAX, Q, TMP); simd::store(y + i * A.chunk, Y); } } // #endif template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, const uint64_t kmax) { assume_aligned(dat, A.dat, (size_t)Alignment::CACHE_LINE); assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); index_t chunk = A.chunk; for (size_t i = 0; i < A.nChunks; ++i) { index_t j = 0; index_t j_loc = 0; index_t size = chunkSize[i]; index_t start = st[i]; index_t block = size / kmax; for (size_t l = 0; l < block; ++l) { j_loc += kmax; for (; j < j_loc; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * chunk + k] * x[col[start + j * chunk + k]]; y[i 
* A.chunk + k + 1] += dat[start + j * chunk + k + 1] * x[col[start + j * chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * chunk + k + 2] * x[col[start + j * chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * chunk + k + 3] * x[col[start + j * chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * chunk + k] * x[col[start + j * chunk + k]]; } } for (size_t k = 0; k < size; ++k) { F.reduce(y[i * A.chunk + k]); } } for (; j < size; ++j) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += dat[start + j * chunk + k] * x[col[start + j * chunk + k]]; y[i * A.chunk + k + 1] += dat[start + j * chunk + k + 1] * x[col[start + j * chunk + k + 1]]; y[i * A.chunk + k + 2] += dat[start + j * chunk + k + 2] * x[col[start + j * chunk + k + 2]]; y[i * A.chunk + k + 3] += dat[start + j * chunk + k + 3] * x[col[start + j * chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += dat[start + j * chunk + k] * x[col[start + j * chunk + k]]; } } for (size_t k = 0; k < size; ++k) { F.reduce(y[i * A.chunk + k]); } } } template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.addin(y[i * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::GenericTag) { assume_aligned(col, A.col, 
(size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; index_t j = 0; for (; j < size; j++) { for (index_t k = 0; k < A.chunk; ++k) { F.subin(y[i * A.chunk + k], x[col[start + j * A.chunk + k]]); } } } } // #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template inline void fspmv_one_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y2 = simd::add(y2, x2); } if (size % 2 != 0) { x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::add(y1, x1); } simd::store(y + i * A.chunk, simd::add(simd::load(y + i * A.chunk), simd::add(y1, y2))); } } template inline void fspmv_mone_simd(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); 
assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); using simd = Simd; using vect_t = typename simd::vect_t; for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; vect_t x1, x2, y1, y2; y1 = simd::zero(); y2 = simd::zero(); index_t j = 0; for (; j < ROUND_DOWN(size, 2); j += 2) { x1 = simd::gather(x, col + start + j * A.chunk); x2 = simd::gather(x, col + start + (j + 1) * A.chunk); y1 = simd::add(y1, x1); y2 = simd::add(y2, x2); } if (size % 2 != 0) { x1 = simd::gather(x, col + start + j * A.chunk); y1 = simd::add(y1, x1); } simd::store(y + i * A.chunk, simd::sub(simd::load(y + i * A.chunk), simd::add(y1, y2))); } } // #endif template inline void fspmv_one(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, (size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); auto chunk = A.chunk; for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; j++) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] += x[col[start + j * chunk + k]]; y[i * A.chunk + k + 1] += x[col[start + j * chunk + k + 1]]; y[i * A.chunk + k + 2] += x[col[start + j * chunk + k + 2]]; y[i * A.chunk + k + 3] += x[col[start + j * chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] += x[col[start + j * chunk + k]]; } } } } template inline void fspmv_mone(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x_, typename Field::Element_ptr y_, FieldCategories::UnparametricTag) { assume_aligned(col, A.col, (size_t)Alignment::CACHE_LINE); assume_aligned(st, A.st, 
(size_t)Alignment::CACHE_LINE); assume_aligned(chunkSize, A.chunkSize, (size_t)Alignment::CACHE_LINE); assume_aligned(x, x_, (size_t)Alignment::DEFAULT); assume_aligned(y, y_, (size_t)Alignment::DEFAULT); auto chunk = A.chunk; for (index_t i = 0; i < A.nChunks; ++i) { index_t start = st[i]; index_t size = chunkSize[i]; for (index_t j = 0; j < size; j++) { size_t k = 0; for (; k < ROUND_DOWN(A.chunk, 4); k += 4) { y[i * A.chunk + k] -= x[col[start + j * chunk + k]]; y[i * A.chunk + k + 1] -= x[col[start + j * chunk + k + 1]]; y[i * A.chunk + k + 2] -= x[col[start + j * chunk + k + 2]]; y[i * A.chunk + k + 3] -= x[col[start + j * chunk + k + 3]]; } for (; k < size; ++k) { y[i * A.chunk + k] -= x[col[start + j * chunk + k]]; } } } } } // SELL_details } // FFLAS #endif // __FFLASFFPACK_fflas_SELL_spmv_INLfflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/sell/sell_utils.inl000066400000000000000000000213241274716147400257670ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_sparse_sell_utils_INL #define __FFLASFFPACK_fflas_sparse_sell_utils_INL namespace FFLAS { template inline void fspmv(const Field &F, const Sparse &A, typename Field::ConstElement_ptr x, typename Field::Element_ptr y, FieldCategories::ModularTag) { fspmv(F, A, x, y, FieldCategories::UnparametricTag()); freduce(F, A.m, y, 1); } template inline void sparse_delete(const Sparse &A) { fflas_delete(A.dat); fflas_delete(A.col); fflas_delete(A.st); fflas_delete(A.chunkSize); } template inline void sparse_delete(const Sparse &A) { fflas_delete(A.col); fflas_delete(A.st); fflas_delete(A.chunkSize); } namespace sell_details { struct Info { uint64_t size = 0; uint64_t perm = 0; uint64_t begin = 0; Info(uint64_t it, uint64_t s, uint64_t p) : size(s), perm(p), begin(it) {} Info() = default; Info(const Info &) = default; Info(Info &&) = default; Info &operator=(const Info &) = default; Info &operator=(Info &&) = default; }; template struct Coo { using Self = Coo; ValT val = 0; IdxT row = 0; IdxT col = 0; Coo(ValT v, IdxT r, IdxT c) : val(v), row(r), col(c) {} Coo() = default; Coo(const Self &) = default; Coo(Self &&) = default; Self &operator=(const Self &) = default; Self &operator=(Self &&) = default; }; } template inline void sparse_print(const Sparse &A) { uint64_t it = 0; for (size_t i = 0; i < A.nChunks; ++i) { for (size_t k = 0; k < A.chunk; ++k) { std::cout << i *A.chunk + k << " : "; for (size_t j = 0; j < A.chunkSize[i]; ++j) { std::cout << A.dat[it + j * A.chunk + k] << " "; } std::cout << std::endl; } it += A.chunkSize[i] * A.chunk; } } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t 
coldim, uint64_t nnz, uint64_t sigma = 0) { using namespace sell_details; using coo = Coo; if (!sigma) sigma = rowdim; A.kmax = Protected::DotProdBoundClassic(F, F.one); A.m = rowdim; A.n = coldim; A.nnz = nnz; #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS using simd = Simd; A.chunk = simd::vect_size; #else A.chunk = 8; #endif index_t m = (A.m % A.chunk == 0) ? A.m : ROUND_DOWN(A.m, A.chunk) + A.chunk; A.nChunks = (m / A.chunk); std::vector data; std::vector infos(A.nChunks * A.chunk); for (uint64_t i = 0; i < nnz; ++i) { data.emplace_back(dat[i], row[i], col[i]); } IndexT currow = row[0]; infos[currow].begin = 0; for (uint64_t i = 0; i < nnz; ++i) { if (row[i] != currow) { currow = row[i]; infos[currow].begin = i; } infos[row[i]].size++; } A.maxrow = (std::max_element(infos.begin(), infos.end(), [](const Info &a, const Info &b) { return a.size >= b.size; }))->size; // cout << "maxrow : " << A.maxrow << endl; if (A.maxrow < A.kmax) A.delayed = true; for (uint64_t i = 0; i < rowdim; ++i) { infos[i].perm = i; } #ifdef SELL_DEBUG for (auto &x : infos) { cout << x.size << " "; } std::cout << std::endl; #endif uint64_t it = 0; for (; it < ROUND_DOWN(rowdim, sigma); it += sigma) { std::sort(infos.begin() + it, infos.begin() + it + sigma, [](const Info &a, const Info &b) { return a.size >= b.size; }); } if (it != rowdim) { std::sort(infos.begin() + it, infos.end(), [](Info a, Info b) { return a.size >= b.size; }); } // cout << "sorted : " << std::is_sorted(infos.begin(), infos.end(), [](Info // a, Info b){ // return a.size >= b.size; // }) << endl; for (size_t i = 0; i < infos.size(); ++i) { if (infos[i].begin > nnz) std::cout << "ERROR sort " << i << " size : " << infos[i].size << " begin : " << infos[i].begin << " perm : " << infos[i].perm << std::endl; } #ifdef SELL_DEBUG for (auto &x : infos) { cout << x.size << " "; } std::cout << std::endl; #endif A.perm = fflas_new(rowdim, Alignment::CACHE_LINE); // cout << "perm : "; for (uint64_t i = 0; i < rowdim; ++i) { // cout 
<< "(" << i << " , " << infos[i].perm << ") "; A.perm[infos[i].perm] = i; } // for(size_t i = 0 ; i < A.m ; ++i) // cout << A.perm[i] << " "; // cout << endl; // cout << endl; // add info if rowdim%chunk != 0, with empty infos (size = 0, begin = 0) // infos.resize(A.nChunks*A.chunk); // for(auto & x:infos) // if(x.begin > nnz) // cout << "ERROR resize" << endl; A.chunkSize = fflas_new(A.nChunks, Alignment::CACHE_LINE); for (uint64_t i = 0; i < A.nChunks; ++i) A.chunkSize[i] = 0; for (uint64_t i = 0; i < A.nChunks; ++i) { for (uint64_t j = 0; j < A.chunk; ++j) { if (infos[i * A.chunk + j].size >= A.chunkSize[i]) A.chunkSize[i] = infos[i * A.chunk + j].size; } } #ifdef SELL_DEBUG for (uint64_t i = 0; i < A.nChunks; ++i) cout << "chunk " << i << " : " << A.chunkSize[i] << endl; ; #endif uint64_t sum = 0; for (uint64_t i = 0; i < A.nChunks; ++i) sum += A.chunkSize[i]; #ifdef SELL_DEBUG cout << "sum : " << sum << " chunk : " << A.chunk << endl; #endif A.col = fflas_new(sum * A.chunk, Alignment::CACHE_LINE); A.dat = fflas_new(F, sum * A.chunk, 1, Alignment::CACHE_LINE); A.nElements = sum * A.chunk; for (uint64_t i = 0; i < sum * A.chunk; ++i) { A.col[i] = 0; F.assign(A.dat[i], F.zero); } it = 0; for (uint64_t i = 0; i < A.nChunks; ++i) { for (uint64_t k = 0; k < A.chunk; ++k) { uint64_t start = infos[i * A.chunk + k].begin; #ifdef SELL_DEBUG cout << it << " " << start << " " << infos[i * A.chunk + k].size << endl; cout << " "; #endif for (uint64_t j = 0; j < infos[i * A.chunk + k].size; ++j) { if (it + k + j * A.chunk >= sum * A.chunk) std::cout << "error : " << it + k + j *A.chunk << " " << sum *A.chunk << std::endl; A.dat[it + k + j * A.chunk] = data[start + j].val; A.col[it + k + j * A.chunk] = data[start + j].col; #ifdef SELL_DEBUG cout << data[start + j].val << " "; #endif } #ifdef SELL_DEBUG cout << endl; #endif } it += A.chunkSize[i] * A.chunk; } A.st = fflas_new(A.nChunks, Alignment::CACHE_LINE); A.st[0] = 0; for (uint64_t i = 1; i < A.nChunks; ++i) { A.st[i] = 
A.chunkSize[i - 1] * A.chunk; } for (uint64_t i = 1; i < A.nChunks; ++i) { A.st[i] += A.st[i - 1]; } #ifdef SELL_DEBUG cout << "st : "; for (uint64_t i = 0; i < A.nChunks; ++i) cout << A.st[i] << " "; cout << endl; #endif } template inline void sparse_init(const Field &F, Sparse &A, const IndexT *row, const IndexT *col, typename Field::ConstElement_ptr dat, uint64_t rowdim, uint64_t coldim, uint64_t nnz) {} } // FFLAS #endiffflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/sparse_matrix_traits.h000066400000000000000000000270121274716147400265650ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Bastien Vialla * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_SPARSEMATRIX_TRAITS_H #define __FFLASFFPACK_SPARSEMATRIX_TRAITS_H #include namespace FFLAS { /**************************************************************************************************************** * * SparseMatrix Traits * ****************************************************************************************************************/ template struct isSparseMatrix : public std::false_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isSparseMatrix> : public std::true_type {}; template struct isZOSparseMatrix : public std::false_type {}; template struct isZOSparseMatrix> : public std::true_type {}; template struct isZOSparseMatrix> : public std::true_type {}; template struct isZOSparseMatrix> : public std::true_type {}; template struct isZOSparseMatrix> : public std::true_type {}; template struct isZOSparseMatrix> : public std::true_type {}; using ZOSparseMatrix = std::true_type; using NotZOSparseMatrix = std::false_type; template struct isSparseMatrixSimdFormat : public std::false_type {}; #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS template struct isSparseMatrixSimdFormat> : public support_simd::type {}; template struct isSparseMatrixSimdFormat> : public support_simd::type {}; #endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS using SimdSparseMatrix = std::true_type; using NoSimdSparseMatrix = 
std::false_type; template struct isSparseMatrixMKLFormat : public std::false_type {}; #ifdef __FFLASFFPACK_HAVE_MKL template struct isSparseMatrixMKLFormat> : public std::true_type {}; template struct isSparseMatrixMKLFormat> : public std::true_type {}; #endif // __FFLASFFPACK_HAVE_MKL using MKLSparseMatrixFormat = std::true_type; using NotMKLSparseMatrixFormat = std::false_type; /******************************************************************************************************** * * Traits to test if operator +, -, *, =, +=, -=, *= exists * ********************************************************************************************************/ #if 0 #define function_to_functor(X) \ struct tfn_##X { \ template \ auto operator()(Args&&... args) const \ ->decltype(X(std::forward(args)...)){ \ return X(std::forward(args)...); } } #endif struct tfn_plus { template auto operator()(Args&&... args) const ->decltype(operator+(std::forward(args)...)) { return operator+(std::forward(args)...); } }; struct tfn_mul { template auto operator()(Args&&... args) const ->decltype(operator+(std::forward(args)...)) { return operator*(std::forward(args)...); } }; struct tfn_mul_eq { template auto operator()(Args&&... args) const ->decltype(operator+(std::forward(args)...)) { return operator*=(std::forward(args)...); } }; struct tfn_minus { template auto operator()(Args&&... args) const ->decltype(operator+(std::forward(args)...)) { return operator-(std::forward(args)...); } }; struct tfn_plus_eq { template auto operator()(Args&&... args) const ->decltype(operator+(std::forward(args)...)) { return operator+=(std::forward(args)...); } }; struct tfn_minus_eq { template auto operator()(Args&&... 
args) const ->decltype(operator+(std::forward(args)...)) { return operator+=(std::forward(args)...); } }; template struct has_plus_impl { private: // Test for non member operator template static constexpr auto check(T*) -> typename std::is_same::type, T>::type; // Test for member operator template static constexpr auto check(T*) -> typename std::is_same().operator+(std::declval())), T>::type; template static constexpr std::false_type check(...); typedef decltype(check(0)) type; public: static constexpr bool value = type::value; }; template struct has_mul_impl { private: // Test for non member operator template static constexpr auto check(T*) -> typename std::is_same::type, T>::type; // Test for member operator template static constexpr auto check(T*) -> typename std::is_same().operator*(std::declval())), T>::type; template static constexpr std::false_type check(...); typedef decltype(check(0)) type; public: static constexpr bool value = type::value; }; template struct has_mul_eq_impl { private: // Test for non member operator template static constexpr auto check(T*) -> typename std::is_same::type, T>::type; // Test for member operator template static constexpr auto check(T*) -> typename std::is_same().operator*=(std::declval())), typename std::add_lvalue_reference::type>::type; template static constexpr std::false_type check(...); typedef decltype(check(0)) type; public: static constexpr bool value = type::value; }; template struct has_plus_eq_impl { private: // Test for non member operator template static constexpr auto check(T*) -> typename std::is_same::type, T>::type; // Test for member operator template static constexpr auto check(T*) -> typename std::is_same().operator+=(std::declval())), typename std::add_lvalue_reference::type>::type; template static constexpr std::false_type check(...); typedef decltype(check(0)) type; public: static constexpr bool value = type::value; }; template struct has_minus_eq_impl { private: // Test for non member operator template 
static constexpr auto check(T*) -> typename std::is_same::type, T>::type; // Test for member operator template static constexpr auto check(T*) -> typename std::is_same().operator-=(std::declval())), typename std::add_lvalue_reference::type>::type; template static constexpr std::false_type check(...); typedef decltype(check(0)) type; public: static constexpr bool value = type::value; }; template struct has_minus_impl { private: // Test for non member operator template static constexpr auto check(T*) -> typename std::is_same::type, T>::type; // Test for member operator template static constexpr auto check(T*) -> typename std::is_same().operator-(std::declval())), T>::type; template static constexpr std::false_type check(...); typedef decltype(check(0)) type; public: static constexpr bool value = type::value; }; template using has_plus = typename std::conditional::value, std::true_type, has_plus_impl>::type; template using has_minus = typename std::conditional::value, std::true_type, has_minus_impl>::type; template using has_equal = typename std::conditional::value, std::true_type, std::is_copy_assignable>::type; template using has_plus_eq = typename std::conditional::value, std::true_type, has_plus_eq_impl>::type; template using has_minus_eq = typename std::conditional::value, std::true_type, has_minus_eq_impl>::type; template using has_mul = typename std::conditional::value, std::true_type, has_mul_impl>::type; template using has_mul_eq = typename std::conditional::value, std::true_type, has_mul_eq_impl>::type; template struct has_operation{ static constexpr bool value = (has_plus::value && has_minus::value && has_equal::value && has_plus_eq::value && has_minus_eq::value && has_mul::value && has_mul_eq::value); }; } // FFLAS #endif // __FFLASFFPACK_SPARSEMATRIX_TRAITS_H fflas-ffpack-2.2.2/fflas-ffpack/fflas/fflas_sparse/utils.h000066400000000000000000000107001274716147400234520ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; 
c-basic-offset: 8 -*- */
// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
/*
 * Copyright (C) 2014 the FFLAS-FFPACK group
 *
 * Written by Bastien Vialla
 *
 *
 * ========LICENCE========
 * This file is part of the library FFLAS-FFPACK.
 *
 * FFLAS-FFPACK is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 * ========LICENCE========
 *.
 */

/** @file fflas/fflas_sparse/utils.h
 * @brief Structural statistics for sparse matrices.
 */

#ifndef __FFLASFFPACK_fflas_fflas_sparse_utils_H
#define __FFLASFFPACK_fflas_fflas_sparse_utils_H

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <numeric>
#include <vector>

namespace FFLAS{

/** @brief Plain record of structural statistics about a sparse matrix.
 *
 * Every counter is zero-initialised. getStat() only fills the fields it
 * assigns explicitly; the remaining ones keep their default value.
 */
struct StatsMatrix {
    uint64_t rowdim = 0;
    uint64_t coldim = 0;
    uint64_t nOnes = 0;
    uint64_t nMOnes = 0;
    uint64_t nOthers = 0;
    uint64_t nnz = 0;
    uint64_t maxRow = 0;
    uint64_t minRow = 0;
    uint64_t averageRow = 0;
    uint64_t deviationRow = 0;
    uint64_t maxCol = 0;
    uint64_t minCol = 0;
    uint64_t averageCol = 0;
    uint64_t deviationCol = 0;
    uint64_t minColDifference = 0;
    uint64_t maxColDifference = 0;
    uint64_t averageColDifference = 0;
    uint64_t deviationColDifference = 0;
    uint64_t minRowDifference = 0;
    uint64_t maxRowDifference = 0;
    uint64_t averageRowDifference = 0;
    uint64_t deviationRowDifference = 0;
    uint64_t nDenseRows = 0;
    uint64_t nDenseCols = 0;
    uint64_t nEmptyRows = 0;
    uint64_t nEmptyCols = 0;
    uint64_t nEmptyColsEnd = 0;
    std::vector<uint64_t> denseRows;
    std::vector<uint64_t> denseCols;
};

/** @brief Population standard deviation of the values in [begin, end).
 *
 * The computation is carried out in double precision so that neither the mean
 * nor the variance is truncated to an integer, and so that large unsigned
 * counters are not summed into an int.
 *
 * @param begin iterator on the first value
 * @param end   iterator past the last value
 * @return the standard deviation, or 0 when the range is empty
 */
template <class It>
double computeDeviation(It begin, It end) {
    const auto count = std::distance(begin, end);
    if (count <= 0) {
        // Empty range: avoid dividing by zero.
        return 0.0;
    }
    const double n = static_cast<double>(count);

    double total = 0.0;
    for (It it = begin; it != end; ++it) {
        total += static_cast<double>(*it);
    }
    const double mean = total / n;

    double squares = 0.0;
    for (It it = begin; it != end; ++it) {
        const double delta = static_cast<double>(*it) - mean;
        squares += delta * delta;
    }
    return std::sqrt(squares / n);
}

/** @brief Gather structural statistics on a sparse matrix.
 *
 * @param F      field, used to classify each value through isOne / isMOne
 * @param row    row start offsets; entries row[0] .. row[rowdim] are read and
 *               row[i+1] - row[i] is taken as the number of stored entries of
 *               row i
 * @param col    column index of each stored entry (nnz entries are read)
 * @param val    value of each stored entry (nnz entries are read)
 * @param rowdim number of rows
 * @param coldim number of columns
 * @param nnz    number of stored entries
 * @return the filled StatsMatrix; per-row (resp. per-column) statistics are
 *         left at zero when rowdim (resp. coldim) is zero
 */
template <class Field>
StatsMatrix getStat(const Field &F, const index_t *row, const index_t *col,
                    typename Field::ConstElement_ptr val, uint64_t rowdim,
                    uint64_t coldim, uint64_t nnz) {
    StatsMatrix stats;
    stats.nnz = nnz;
    stats.rowdim = rowdim;
    stats.coldim = coldim;

    // Number of stored entries per row / per column.
    std::vector<uint64_t> rows(rowdim, 0);
    std::vector<uint64_t> cols(coldim, 0);

    for (uint64_t i = 0; i < nnz; ++i) {
        cols[col[i]]++;
        if (F.isOne(val[i])) {
            stats.nOnes++;
        } else if (F.isMOne(val[i])) {
            stats.nMOnes++;
        } else {
            stats.nOthers++;
        }
    }

    // One count per row: the difference of consecutive start offsets.
    // (row[0] itself is an offset, not a count, so it is not recorded.)
    for (uint64_t i = 0; i < rowdim; ++i) {
        rows[i] = row[i + 1] - row[i];
    }

    stats.nEmptyRows = std::count(rows.begin(), rows.end(), uint64_t(0));
    stats.nEmptyCols = std::count(cols.begin(), cols.end(), uint64_t(0));

    if (!rows.empty()) {
        const auto rowMinMax = std::minmax_element(rows.begin(), rows.end());
        stats.minRow = *rowMinMax.first;
        stats.maxRow = *rowMinMax.second;
        // 64-bit accumulator: the sum equals the number of stored entries.
        stats.averageRow = std::accumulate(rows.begin(), rows.end(), uint64_t(0)) / rowdim;
        stats.deviationRow = (uint64_t)computeDeviation(rows.begin(), rows.end());
    }
    if (!cols.empty()) {
        const auto colMinMax = std::minmax_element(cols.begin(), cols.end());
        stats.minCol = *colMinMax.first;
        stats.maxCol = *colMinMax.second;
        stats.averageCol = std::accumulate(cols.begin(), cols.end(), uint64_t(0)) / coldim;
        stats.deviationCol = (uint64_t)computeDeviation(cols.begin(), cols.end());
    }

    // Scan the whole range (begin..end); the thresholds are kept as written.
    // NOTE(review): a row holds at most coldim entries, so the row threshold
    // may have been meant to scale with coldim (and conversely) - confirm.
    stats.nDenseRows = std::count_if(rows.begin(), rows.end(),
                                     [rowdim](uint64_t x) { return x >= DENSE_THRESHOLD * rowdim; });
    stats.nDenseCols = std::count_if(cols.begin(), cols.end(),
                                     [coldim](uint64_t x) { return x >= DENSE_THRESHOLD * coldim; });

    return stats;
}

} // FFLAS

#endif
fflas-ffpack-2.2.2/fflas-ffpack/ffpack/000077500000000000000000000000001274716147400176325ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/ffpack/Makefile.am000066400000000000000000000034661274716147400216770ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/ffpack multiprecision= ffpack_ludivine_mp.inl ffpack_pluq_mp.inl pkgincludesub_HEADERS= \ ffpack_charpoly_danilevski.inl \ ffpack_charpoly.inl \ ffpack_charpoly_kgfastgeneralized.inl \ ffpack.h \ ffpack_charpoly_kgfast.inl \ ffpack_krylovelim.inl \ ffpack_charpoly_kglu.inl \ ffpack_echelonforms.inl \ ffpack_ludivine.inl \ ffpack_pluq.inl \ ffpack_ppluq.inl \ ffpack_frobenius.inl \ ffpack_minpoly_construct.inl \ ffpack_minpoly.inl \ ffpack.inl\ ffpack_invert.inl\ ffpack_fgesv.inl\ ffpack_fgetrs.inl\ ffpack_permutation.inl\ ffpack_ftrtr.inl\ ffpack_rankprofiles.inl\ $(multiprecision) EXTRA_DIST=ffpack.doxy fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack.doxy000066400000000000000000000021401274716147400217660ustar00rootroot00000000000000// Copyright (c) 2011 
FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part of the library FFLAS-FFPACK. // // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** \ingroup fflas-ffpack * \defgroup ffpack FFPACK * * \brief Class FFPACK provides functions using fflas much as Lapack uses BLAS. * */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack.h000066400000000000000000001527371274716147400212540ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack.h * Copyright (C) 2005 Clement Pernet * 2014 FFLAS-FFPACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file ffpack.h * \brief Set of elimination based routines for dense linear algebra. * Matrices are supposed over finite prime field of characteristic less than 2^26. */ #ifndef __FFLASFFPACK_ffpack_H #define __FFLASFFPACK_ffpack_H #include #ifdef __FFLASFFPACK_USE_OPENMP #include #endif #include "fflas-ffpack/fflas/fflas.h" //#include "parallel.h" #include #include #include // std::cout #include // The use of the small size LQUP is currently disabled: // need for a better handling of element base (double, float, generic) combined // with different thresholds. // TransPosed version has to be implemented too. #ifndef __FFPACK_LUDIVINE_CUTOFF #define __FFPACK_LUDIVINE_CUTOFF 0 #endif #ifndef __FFPACK_CHARPOLY_THRESHOLD #define __FFPACK_CHARPOLY_THRESHOLD 30 #endif /** @brief Finite Field PACK * Set of elimination based routines for dense linear algebra. * * This namespace enlarges the set of BLAS routines of the class FFLAS, with higher * level routines based on elimination. 
\ingroup ffpack */ namespace FFPACK { /* tags */ enum FFPACK_LU_TAG { FfpackSlabRecursive = 1, FfpackTileRecursive = 2, FfpackSingular = 3 }; enum FFPACK_CHARPOLY_TAG { FfpackLUK=1, FfpackKG=2, FfpackHybrid=3, FfpackKGFast=4, FfpackDanilevski=5, FfpackArithProg=6, FfpackKGFastG=7 }; class CharpolyFailed{}; enum FFPACK_MINPOLY_TAG { FfpackDense=1, FfpackKGF=2 }; } namespace FFPACK { /* Permutations */ /*****************/ /* PERMUTATIONS */ /*****************/ void LAPACKPerm2MathPerm (size_t * MathP, const size_t * LapackP, const size_t N); void MathPerm2LAPACKPerm (size_t * LapackP, const size_t * MathP, const size_t N); template void MatrixApplyS (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4); template void PermApplyS (Element* A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4); template void MatrixApplyT (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4); template void PermApplyT (Element* A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4); void composePermutationsP (size_t * MathP, const size_t * P1, const size_t * P2, const size_t R, const size_t N); void composePermutationsQ (size_t * MathP, const size_t * Q1, const size_t * Q2, const size_t R, const size_t N); void cyclic_shift_mathPerm (size_t * P, const size_t s); template void cyclic_shift_row_col(Base_t * A, size_t m, size_t n, size_t lda); template void cyclic_shift_row(const Field& F, typename Field::Element_ptr A, size_t m, size_t n, size_t lda); template void cyclic_shift_col(const Field& F, typename Field::Element_ptr A, size_t m, size_t n, size_t lda); /** Apply a permutation P, stored in the LAPACK format (a 
sequence of transpositions) * between indices ibeg and iend of P to (iend-ibeg) vectors of size M stored in A (as column for NoTrans and rows for Trans). * Side==FFLAS::FflasLeft for row permutation Side==FFLAS::FflasRight for a column permutation * Trans==FFLAS::FflasTrans for the inverse permutation of P * @param F * @param Side * @param Trans * @param M * @param ibeg * @param iend * @param A * @param lda * @param P * @warning not sure the submatrix is still a permutation and the one we expect in all cases... examples for iend=2, ibeg=1 and P=[2,2,2] */ template void applyP( const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t ibeg, const size_t iend, typename Field::Element_ptr A, const size_t lda, const size_t * P ); /** Apply a R-monotonically increasing permutation P, to the matrix A. * The permutation represented by P is defined as follows: * - the first R values of P is a LAPACK reprensentation (a sequence of transpositions) * - the remaining iend-ibeg-R values of the permutation are in a monotonically increasing progression * Side==FFLAS::FflasLeft for row permutation Side==FFLAS::FflasRight for a column permutation * Trans==FFLAS::FflasTrans for the inverse permutation of P * @param F * @param Side * @param Trans * @param M * @param ibeg * @param iend * @param A * @param lda * @param P * @param R */ template void MonotonicApplyP (const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t ibeg, const size_t iend, typename Field::Element_ptr A, const size_t lda, const size_t * P, const size_t R); template void MonotonicCompress (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * P, const size_t R, const size_t maxpiv, const size_t rowstomove, const std::vector &ispiv); template void MonotonicCompressMorePivots (const Field& F, const FFLAS::FFLAS_SIDE Side, 
const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * MathP, const size_t R, const size_t rowstomove, const size_t lenP); template void MonotonicCompressCycles (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * MathP, const size_t lenP); template void MonotonicExpand (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * MathP, const size_t R, const size_t maxpiv, const size_t rowstomove, const std::vector &ispiv); //! Parallel applyP with OPENMP tasks template void papplyP( const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t m, const size_t ibeg, const size_t iend, typename Field::Element_ptr A, const size_t lda, const size_t * P ); //! Parallel applyT with OPENMP tasks template void pMatrixApplyT (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) ; //! Parallel applyS tasks with OPENMP tasks template void pMatrixApplyS (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) ; template size_t pPLUQ(const Field& Fi, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Q, int nt); //#endif } // FFPACK permutations // #include "ffpack_permutation.inl" namespace FFPACK { /* fgetrs, fgesv */ /** Solve the system \f$A X = B\f$ or \f$X A = B\f$. * Solving using the \c LQUP decomposition of \p A * already computed inplace with \c LUdivine(FFLAS::FflasNoTrans, FFLAS::FflasNonUnit). * Version for A square. 
* If A is rank deficient, a solution is returned if the system is consistent, * otherwise \p info is set to a positive value. * * @param F field * @param Side Determine whether the resolution is left or right looking. * @param M row dimension of \p B * @param N col dimension of \p B * @param R rank of \p A * @param A input matrix * @param lda leading dimension of \p A * @param P column permutation of the \c LQUP decomposition of \p A * @param Q column permutation of the \c LQUP decomposition of \p A * @param B Right/Left hand side matrix. Initially stores \p B, finally stores the solution \p X. * @param ldb leading dimension of \p B * @param info Success of the computation: 0 if successful, >0 if system is inconsistent */ template void fgetrs (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, typename Field::Element_ptr A, const size_t lda, const size_t *P, const size_t *Q, typename Field::Element_ptr B, const size_t ldb, int * info); /** Solve the system A X = B or X A = B. * Solving using the LQUP decomposition of A * already computed inplace with LUdivine(FFLAS::FflasNoTrans, FFLAS::FflasNonUnit). * Version for A rectangular. * If A is rank deficient, a solution is returned if the system is consistent, * otherwise info is set to a positive value. * * @param F field * @param Side Determine whether the resolution is left or right looking. * @param M row dimension of A * @param N col dimension of A * @param NRHS number of columns (if Side = FFLAS::FflasLeft) or rows (if Side = FFLAS::FflasRight) of the matrices X and B * @param R rank of A * @param A input matrix * @param lda leading dimension of A * @param P column permutation of the LQUP decomposition of A * @param Q column permutation of the LQUP decomposition of A * @param X solution matrix * @param ldx leading dimension of X * @param B Right/Left hand side matrix. 
* @param ldb leading dimension of B * @param info Success of the computation: 0 if successful, >0 if system is inconsistent */ template typename Field::Element_ptr fgetrs (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t NRHS, const size_t R, typename Field::Element_ptr A, const size_t lda, const size_t *P, const size_t *Q, typename Field::Element_ptr X, const size_t ldx, typename Field::ConstElement_ptr B, const size_t ldb, int * info); /** @brief Square system solver * @param F The computation domain * @param Side Determine whether the resolution is left or right looking * @param M row dimension of B * @param N col dimension of B * @param A input matrix * @param lda leading dimension of A * @param B Right/Left hand side matrix. Initially contains B, finally contains the solution X. * @param ldb leading dimension of B * @param info Success of the computation: 0 if successful, >0 if system is inconsistent * @return the rank of the system * * Solve the system A X = B or X A = B. * Version for A square. * If A is rank deficient, a solution is returned if the system is consistent, * otherwise info is set to a positive value. */ template size_t fgesv (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr B, const size_t ldb, int * info); /** @brief Rectangular system solver * @param F The computation domain * @param Side Determine whether the resolution is left or right looking * @param M row dimension of A * @param N col dimension of A * @param NRHS number of columns (if Side = FFLAS::FflasLeft) or rows (if Side = FFLAS::FflasRight) of the matrices X and B * @param A input matrix * @param lda leading dimension of A * @param B Right/Left hand side matrix. Initially contains B, finally contains the solution X. 
* @param ldb leading dimension of B * @param X solution matrix * @param ldx leading dimension of X * @param info Success of the computation: 0 if successful, >0 if system is inconsistent * @return the rank of the system * * Solve the system A X = B or X A = B. * Version for A rectangular. * If A is rank deficient, a solution is returned if the system is consistent, * otherwise info is set to a positive value. */ template size_t fgesv (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t NRHS, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, typename Field::ConstElement_ptr B, const size_t ldb, int * info); /** Solve the system Ax=b. * Solving using LQUP factorization and * two triangular system resolutions. * The input matrix is modified. * @param F The computation domain * @param M row dimension of the matrix * @param A input matrix * @param lda leading dimension of A * @param x solution vector * @param incx increment of x * @param b right hand side vector * @param incb increment of b */ } // FFPACK fgesv, fgetrs // #include "ffpack_fgesv.inl" // #include "ffpack_fgetrs.inl" namespace FFPACK { /* ftrtr */ /** Compute the inverse of a triangular matrix. * @param F * @param Uplo whether the matrix is upper or lower triangular * @param Diag whether the matrix is unit diagonal * @param N * @param A * @param lda * */ template void ftrtri (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG Diag, const size_t N, typename Field::Element_ptr A, const size_t lda); template void trinv_left( const Field& F, const size_t N, typename Field::ConstElement_ptr L, const size_t ldl, typename Field::Element_ptr X, const size_t ldx ); /** Compute the product UL. * Product UL of the upper, resp lower triangular matrices U and L * stored one above the other in the square matrix A. 
* Diag == Unit if the matrix U is unit diagonal * @param F * @param diag * @param N * @param A * @param lda * */ template void ftrtrm (const Field& F, const FFLAS::FFLAS_DIAG diag, const size_t N, typename Field::Element_ptr A, const size_t lda); } // FFPACK ftrtr // #include "ffpack_ftrtr.inl" namespace FFPACK { /* PLUQ */ /** @brief Compute the PLUQ factorization of the given matrix. * Uses a block algorithm and returns its rank. * The permutations P and Q are represented * using LAPACK's convention. * @param F field * @param Diag whether U should have a unit diagonal or not * @param M matrix row dimension * @param N matrix column dimension * @param A input matrix * @param lda leading dimension of \p A * @param P the row permutation * @param Q the column permutation * @return the rank of \p A * @bib * - Dumas J-G., Pernet C., and Sultan Z. \c Simultaneous computation of the row and column rank profiles , ISSAC'13, 2013 * . */ template size_t PLUQ (const Field& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q); } // FFPACK PLUQ // #include "ffpack_pluq.inl" namespace FFPACK { /* ludivine */ /** @brief Compute the CUP factorization of the given matrix. * Uses * a block algorithm and returns its rank. * The permutations P and Q are represented * using LAPACK's convention. * @param F field * @param Diag whether U should have a unit diagonal or not * @param trans \c LU of \f$A^t\f$ * @param M matrix row dimension * @param N matrix column dimension * @param A input matrix * @param lda leading dimension of \p A * @param P the column permutation * @param Qt the transpose of the row permutation \p Q * @param LuTag flag for setting the early termination if the matrix * is singular * @param cutoff UNKNOWN TAG, probably a switch to a faster algo below \c cutoff * * @return the rank of \p A * @bib * - Jeannerod C-P, Pernet, C. and Storjohann, A. 
\c Rank-profile revealing Gaussian elimination and the CUP matrix decomposition , J. of Symbolic Comp., 2013 * - Pernet C, Brassel M \c LUdivine, une divine factorisation \c LU, 2002 * . */ template size_t LUdivine (const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Qt, const FFPACK_LU_TAG LuTag = FfpackSlabRecursive, const size_t cutoff=__FFPACK_LUDIVINE_CUTOFF); template class callLUdivine_small; //! LUdivine small case template size_t LUdivine_small (const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Q, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); //! LUdivine gauss template size_t LUdivine_gauss (const Field& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Q, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); namespace Protected { //--------------------------------------------------------------------- // LUdivine_construct: (Specialisation of LUdivine) // LUP factorisation of X, the Krylov base matrix of A^t and v, in A. // X contains the nRowX first vectors v, vA, .., vA^{nRowX-1} // A contains the LUP factorisation of the nUsedRowX first row of X. // When all rows of X have been factorized in A, and rank is full, // then X is updated by the following scheme: X <= ( X; X.B ), where // B = A^2^i. // This enables to make use of Matrix multiplication, and stop computing // Krylov vector, when the rank is not longer full. 
// P is the permutation matrix stored in an array of indexes //--------------------------------------------------------------------- template size_t LUdivine_construct( const Field& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, typename Field::Element_ptr u, size_t* P, bool computeX, const FFPACK_MINPOLY_TAG MinTag= FfpackDense , const size_t kg_mc =0 , const size_t kg_mb =0 , const size_t kg_j =0 ); } // Protected } //FFPACK ludivine, turbo // #include "ffpack_ludivine.inl" namespace FFPACK { /* echelon */ /*****************/ /* ECHELON FORMS */ /*****************/ /** Compute the Column Echelon form of the input matrix in-place. * * If LuTag == FfpackTileRecursive, then after the computation A = [ M \ V ] * such that AU = C is a column echelon decomposition of A, * with U = P^T [ V ] and C = M + Q [ Ir ] * [ 0 In-r ] [ 0 ] * If LuTag == FfpackTileRecursive then A = [ N \ V ] such that the same holds with M = Q N * * Qt = Q^T * If transform=false, the matrix V is not computed. * See also test-colechelon for an example of use * @param F * @param M * @param N * @param A * @param lda * @param P the column permutation * @param Qt the row position of the pivots in the echelon form * @param transform */ template size_t ColumnEchelonForm (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Qt, bool transform = false, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); /** Compute the Row Echelon form of the input matrix in-place. * * If LuTag == FfpackTileRecursive, then after the computation A = [ L \ M ] * such that X A = R is a row echelon decomposition of A, * with X = [ L 0 ] P and R = M + [Ir 0] Q^T * [ In-r] * If LuTag == FfpackTileRecursive then A = [ L \ N ] such that the same holds with M = N Q^T * Qt = Q^T * If transform=false, the matrix L is not computed. 
* See also test-rowechelon for an example of use * @param F * @param M * @param N * @param A * @param lda * @param P the row permutation * @param Qt the column position of the pivots in the echelon form * @param transform */ template size_t RowEchelonForm (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Qt, const bool transform = false, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); /** Compute the Reduced Column Echelon form of the input matrix in-place. * * After the computation A = [ V ] such that AX = R is a reduced col echelon * [ M 0 ] * decomposition of A, where X = P^T [ V ] and R = Q [ Ir ] * [ 0 In-r ] [ M 0 ] * Qt = Q^T * If transform=false, the matrix X is not computed and the matrix A = R * * @param F * @param M * @param N * @param A * @param lda * @param P * @param Qt * @param transform */ template size_t ReducedColumnEchelonForm (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Qt, const bool transform = false, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); /** Compute the Reduced Row Echelon form of the input matrix in-place. * * After the computation A = [ V1 M ] such that X A = R is a reduced row echelon * [ V2 0 ] * decomposition of A, where X = [ V1 0 ] P and R = [ Ir M ] Q^T * [ V2 In-r ] [ 0 ] * Qt = Q^T * If transform=false, the matrix X is not computed and the matrix A = R * @param F * @param M * @param N * @param A * @param lda * @param P * @param Qt * @param transform */ template size_t ReducedRowEchelonForm (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Qt, const bool transform = false, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); /** Variant by the block recursive algorithm. * (See A. Storjohann Thesis 2000) * !!!!!! Warning !!!!!! * This code is NOT WORKING properly for some echelon structures. 
* This is due to a limitation of the way we represent permutation matrices * (LAPACK's standard): * - a composition of transpositions Tij of the form * P = T_{1,j1} o T_{2,j2} o...oT_{r,jr}, with jk>k for all 0 < k <= r <= n * - The permutation on the columns, performed by this block recursive algorithm * cannot be represented with such a composition. * Consequently this function should only be used for benchmarks */ template size_t ReducedRowEchelonForm2 (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Qt, const bool transform = true); //! REF template size_t REF (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, const size_t colbeg, const size_t rowbeg, const size_t colsize, size_t* Qt, size_t* P); } // FFPACK // #include "ffpack_echelonforms.inl" namespace FFPACK { /* invert */ /*****************/ /* INVERSION */ /*****************/ /** @brief Invert the given matrix in place * or compute its nullity if it is singular. * * An inplace \f$2n^3\f$ algorithm is used. * @param F The computation domain * @param M order of the matrix * @param [in,out] A input matrix (\f$M \times M\f$) * @param lda leading dimension of A * @param nullity dimension of the kernel of A * @return pointer to \f$A\f$ and \f$A \gets A^{-1}\f$ */ template typename Field::Element_ptr Invert (const Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, int& nullity); /** @brief Invert the given matrix into \p X * or compute its nullity if it is singular. * * @pre \p X is preallocated and should be large enough to store the * \f$ m \times m\f$ matrix \p A. * * @param F The computation domain * @param M order of the matrix * @param [in] A input matrix (\f$M \times M\f$) * @param lda leading dimension of \p A * @param [out] X this is the inverse of \p A if \p A is invertible * (non \c NULL and \f$ \mathtt{nullity} = 0\f$). It is untouched * otherwise. 
* @param ldx leading dimension of \p X * @param nullity dimension of the kernel of \p A * @return pointer to \f$X = A^{-1}\f$ */ template typename Field::Element_ptr Invert (const Field& F, const size_t M, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, int& nullity); /** @brief Invert the given matrix or compute its nullity if it is singular. * * A \f$2n^3\f$ algorithm is used. * This routine can be faster than FFPACK::Invert but is not totally inplace. * * @pre \p X is preallocated and should be large enough to store the * \f$ m \times m\f$ matrix \p A. * * @warning A is overwritten here ! * @bug not tested. * @param F * @param M order of the matrix * @param [in,out] A input matrix (\f$M \times M\f$). On output, \p A * is modified and represents a "psychological" factorisation \c LU. * @param lda leading dimension of A * @param [out] X this is the inverse of \p A if \p A is invertible * (non \c NULL and \f$ \mathtt{nullity} = 0\f$). It is untouched * otherwise. 
* @param ldx leading dimension of \p X * @param nullity dimension of the kernel of \p A * @return pointer to \f$X = A^{-1}\f$ */ template typename Field::Element_ptr Invert2( const Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, int& nullity); } // FFPACK invert // #include "ffpack_invert.inl" namespace FFPACK { /* charpoly */ /*****************************/ /* CHARACTERISTIC POLYNOMIAL */ /*****************************/ /** * Compute the characteristic polynomial of A using Krylov * Method, and LUP factorization of the Krylov matrix */ template std::list& CharPoly( const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, const FFPACK_CHARPOLY_TAG CharpTag= FfpackArithProg); template Polynomial & mulpoly(const Field& F, Polynomial &res, const Polynomial & P1, const Polynomial & P2); template Polynomial& CharPoly( const Field& F, Polynomial& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, const FFPACK_CHARPOLY_TAG CharpTag= FfpackArithProg); namespace Protected { template std::list& KellerGehrig( const Field& F, std::list& charp, const size_t N, typename Field::ConstElement_ptr A, const size_t lda ); template int KGFast ( const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t * kg_mc, size_t* kg_mb, size_t* kg_j ); template std::list& KGFast_generalized (const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda); template void fgemv_kgf( const Field& F, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr X, const size_t incX, typename Field::Element_ptr Y, const size_t incY, const size_t kg_mc, const size_t kg_mb, const size_t kg_j ); template std::list& LUKrylov( const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, typename 
Field::Element_ptr U, const size_t ldu); template std::list& Danilevski (const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda); template std::list& LUKrylov_KGFast( const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx); } // Protected } // FFPACK charpoly // #include "ffpack_charpoly_kglu.inl" // #include "ffpack_charpoly_kgfast.inl" // #include "ffpack_charpoly_kgfastgeneralized.inl" // #include "ffpack_charpoly_danilevski.inl" // #include "ffpack_charpoly.inl" namespace FFPACK { /* frobenius, charpoly */ template std::list& CharpolyArithProg (const Field& F, std::list& frobeniusForm, const size_t N, typename Field::Element_ptr A, const size_t lda, const size_t c); } // FFPACK frobenius // #include "ffpack_frobenius.inl" namespace FFPACK { /* minpoly */ /**********************/ /* MINIMAL POLYNOMIAL */ /**********************/ /** Compute the minimal polynomial. 
* Minpoly of (A,v) using an LUP * factorization of the Krylov Base (v, Av, .., A^kv) * U,X must be (n+1)*n * U contains the Krylov matrix and X, its LSP factorization */ template Polynomial& MinPoly( const Field& F, Polynomial& minP, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, size_t* P, const FFPACK_MINPOLY_TAG MinTag= FFPACK::FfpackDense, const size_t kg_mc=0, const size_t kg_mb=0, const size_t kg_j=0 ); } // FFPACK minpoly // #include "ffpack_minpoly.inl" namespace FFPACK { /* Krylov Elim */ template size_t KrylovElim( const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q, const size_t deg, size_t *iterates, size_t * inviterates, const size_t maxit,size_t virt); template size_t SpecRankProfile (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, const size_t deg, size_t *rankProfile); } // FFPACK KrylovElim // #include "ffpack_krylovelim.inl" namespace FFPACK { /* Solutions */ /********/ /* RANK */ /********/ /** Computes the rank of the given matrix using an LQUP factorization. * The input matrix is modified. * @param F field * @param M row dimension of the matrix * @param N column dimension of the matrix * @param A input matrix * @param lda leading dimension of A */ template size_t Rank( const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda) ; /********/ /* DET */ /********/ /** Returns true if the given matrix is singular. * The method is a block elimination * using LQUP factorization with early termination. * If M != N, * then the matrix is virtually padded with zeros to make it square and * its determinant is zero. * @warning The input matrix is modified. * @param F field * @param M row dimension of the matrix * @param N column dimension of the matrix. 
* @param [in,out] A input matrix * @param lda leading dimension of A */ template bool IsSingular( const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda); /** @brief Returns the determinant of the given matrix. * @details The method is a block elimination * using LQUP factorization with early termination. * If M != N, * then the matrix is virtually padded with zeros to make it square and * its determinant is zero. * @warning The input matrix is modified. * @param F field * @param M row dimension of the matrix * @param N column dimension of the matrix. * @param [in,out] A input matrix * @param lda leading dimension of A */ template typename Field::Element Det( const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda); /*********/ /* SOLVE */ /*********/ /// Solve linear system using LQUP factorization. template typename Field::Element_ptr Solve( const Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr x, const int incx, typename Field::ConstElement_ptr b, const int incb ); //! Solve L X = B or X L = B in place. //! L is M*M if Side == FFLAS::FflasLeft and N*N if Side == FFLAS::FflasRight, B is M*N. //! Only the R non-trivial columns of L are stored in the M*R matrix L //! Requirement : so that L could be expanded in-place template void solveLB( const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, typename Field::Element_ptr L, const size_t ldl, const size_t * Q, typename Field::Element_ptr B, const size_t ldb ); //! Solve L X = B in place. //! L is M*M or N*N, B is M*N. //! 
Only the R non trivial column of L are stored in the M*R matrix L template void solveLB2( const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, typename Field::Element_ptr L, const size_t ldl, const size_t * Q, typename Field::Element_ptr B, const size_t ldb ); /*************/ /* NULLSPACE */ /*************/ /** Computes a vector of the Left/Right nullspace of the matrix A. * * @param F The computation domain * @param Side * @param M * @param N * @param[in,out] A input matrix of dimension M x N, A is modified to its LU version * @param lda * @param[out] X output vector * @param incX * */ template void RandomNullSpaceVector (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t incX); /** Computes a basis of the Left/Right nullspace of the matrix A. * return the dimension of the nullspace. * * @param F The computation domain * @param Side * @param M * @param N * @param[in,out] A input matrix of dimension M x N, A is modified * @param lda * @param[out] NS output matrix of dimension N x NSdim (allocated here) * @param[out] ldn * @param[out] NSdim the dimension of the Nullspace (N-rank(A)) * */ template size_t NullSpaceBasis (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr& NS, size_t& ldn, size_t& NSdim); /*****************/ /* RANK PROFILES */ /*****************/ /** @brief Computes the row rank profile of A. * * @param F * @param M * @param N * @param A input matrix of dimension M x N * @param lda * @param rkprofile return the rank profile as an array of row indexes, of dimension r=rank(A) * @param LuTag: chooses the elimination algorithm. SlabRecursive for LUdivine, TileRecursive for PLUQ * * A is modified * rkprofile is allocated during the computation. 
* @returns R */ template size_t RowRankProfile (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* &rkprofile, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); /** @brief Computes the column rank profile of A. * * @param F * @param M * @param N * @param A input matrix of dimension * @param lda * @param rkprofile return the rank profile as an array of row indexes, of dimension r=rank(A) * @param LuTag: chooses the elimination algorithm. SlabRecursive for LUdivine, TileRecursive for PLUQ * * A is modified * rkprofile is allocated during the computation. * @returns R */ template size_t ColumnRankProfile (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* &rkprofile, const FFPACK_LU_TAG LuTag=FfpackSlabRecursive); /** @brief Recovers the column/row rank profile from the permutation of an LU decomposition. * * Works with both the CUP/PLE decompositions (obtained by LUdivine) or the PLUQ decomposition * Assumes that the output vector containing the rank profile is already allocated. * @param P the permutation carrying the rank profile information * @param N the row/col dimension for a row/column rank profile * @param R the rank of the matrix ( * @param rkprofile return the rank profile as an array of indices * @param LuTag: chooses the elimination algorithm. SlabRecursive for LUdivine, TileRecursive for PLUQ * * A is modified * */ void RankProfileFromLU (const size_t* P, const size_t N, const size_t R, size_t* rkprofile, const FFPACK_LU_TAG LuTag); /** @brief Recovers the row and column rank profiles of any leading submatrix from the PLUQ decomposition. * * Only works with the PLUQ decomposition * Assumes that the output vectors containing the rank profiles are already allocated. 
* * @param P the permutation carrying the rank profile information * @param M the row dimension of the initial matrix * @param N the column dimension of the initial matrix * @param R the rank of the initial matrix * @param LSm the row dimension of the leading submatrix considered * @param LSn the column dimension of the leading submatrix considered * @param P the row permutation of the PLUQ decomposition * @param Q the column permutation of the PLUQ decomposition * @param RRP return the row rank profile of the leading * @param LuTag: chooses the elimination algorithm. SlabRecursive for LUdivine, TileRecursive for PLUQ * @return the rank of the LSm x LSn leading submatrix * * A is modified * @bib * - Dumas J-G., Pernet C., and Sultan Z. \c Simultaneous computation of the row and column rank profiles , ISSAC'13. */ size_t LeadingSubmatrixRankProfiles (const size_t M, const size_t N, const size_t R, const size_t LSm, const size_t LSn, const size_t* P, const size_t* Q, size_t* RRP, size_t* CRP); /** RowRankProfileSubmatrixIndices. * Computes the indices of the submatrix r*r X of A whose rows correspond to * the row rank profile of A. * * @param F * @param M * @param N * @param A input matrix of dimension * @param rowindices array of the row indices of X in A * @param colindices array of the col indices of X in A * @param lda * @param[out] R * * rowindices and colindices are allocated during the computation. * A is modified * @returns R */ template size_t RowRankProfileSubmatrixIndices (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*& rowindices, size_t*& colindices, size_t& R); /** Computes the indices of the submatrix r*r X of A whose columns correspond to * the column rank profile of A. 
* * @param F * @param M * @param N * @param A input matrix of dimension M x N * @param rowindices array of the row indices of X in A * @param colindices array of the col indices of X in A * @param lda * @param[out] R * * rowindices and colindices are allocated during the computation. * @warning A is modified * \return R */ template size_t ColRankProfileSubmatrixIndices (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*& rowindices, size_t*& colindices, size_t& R); /** Computes the r*r submatrix X of A, by picking the row rank profile rows of A. * * @param F * @param M * @param N * @param A input matrix of dimension M x N * @param lda * @param X the output matrix * @param[out] R * * A is not modified * X is allocated during the computation. * @return R */ template size_t RowRankProfileSubmatrix (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr& X, size_t& R); /** Compute the \f$ r\times r\f$ submatrix X of A, by picking the column rank profile columns of A. * * * @param F * @param M * @param N * @param A input matrix of dimension M x N * @param lda * @param X the output matrix * @param[out] R * * A is not modified * X is allocated during the computation. * \returns R */ template size_t ColRankProfileSubmatrix (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr& X, size_t& R); /*********************************************/ /* Accessors to Triangular and Echelon forms */ /*********************************************/ /** Extracts a triangular matrix from a compact storage A=L\U of rank R. 
* if OnlyNonZeroVectors is false, then T and A have the same dimensions * Otherwise, T is R x N if UpLo = FflasUpper, else T is M x R * @param F: base field * @param UpLo: selects if the upper or lower triangular matrix is returned * @param diag: selects if the triangular matrix unit-diagonal * @param M: row dimension of T * @param N: column dimension of T * @param R: rank of the triangular matrix (how many rows/columns need to be copied) * @param A: input matrix * @param lda: leading dimension of A * @param T: output matrix * @param ldt: leading dimension of T * @param OnlyNonZeroVectors: decides whether the last zero rows/columns should be ignored */ template void getTriangular (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr T, const size_t ldt, const bool OnlyNonZeroVectors = false); /** Cleans up a compact storage A=L\U to reveal a triangular matrix of rank R. * @param F: base field * @param UpLo: selects if the upper or lower triangular matrix is revealed * @param diag: selects if the triangular matrix unit-diagonal * @param M: row dimension of A * @param N: column dimension of A * @param R: rank of the triangular matrix * @param A: input/output matrix * @param lda: leading dimension of A */ template void getTriangular (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, typename Field::Element_ptr A, const size_t lda); /** Extracts a matrix in echelon form from a compact storage A=L\U of rank R obtained by * RowEchelonForm or ColumnEchelonForm. * Either L or U is in Echelon form (depending on Uplo) * The echelon structure is defined by the first R values of the array P. 
* row and column dimension of T are greater or equal to that of A * @param F: base field * @param UpLo: selects if the upper or lower triangular matrix is returned * @param diag: selects if the echelon matrix has unit pivots * @param M: row dimension of T * @param N: column dimension of T * @param R: rank of the triangular matrix (how many rows/columns need to be copied) * @param P: positions of the R pivots * @param A: input matrix * @param lda: leading dimension of A * @param T: output matrix * @param ldt: leading dimension of T * @param OnlyNonZeroVectors: decides whether the last zero rows/columns should be ignored * @param LuTag: which factorized form (CUP/PLE if FfpackSlabRecursive, PLUQ if FfpackTileRecursive) */ template void getEchelonForm (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr T, const size_t ldt, const bool OnlyNonZeroVectors = false, const FFPACK_LU_TAG LuTag = FfpackSlabRecursive); /** Cleans up a compact storage A=L\U obtained by RowEchelonForm or ColumnEchelonForm * to reveal an echelon form of rank R. * Either L or U is in Echelon form (depending on Uplo) * The echelon structure is defined by the first R values of the array P. 
* @param F: base field * @param UpLo: selects if the upper or lower triangular matrix is returned * @param diag: selects if the echelon matrix has unit pivots * @param M: row dimension of A * @param N: column dimension of A * @param R: rank of the triangular matrix (how many rows/columns need to be copied) * @param P: positions of the R pivots * @param A: input/output matrix * @param lda: leading dimension of A * @param LuTag: which factorized form (CUP/PLE if FfpackSlabRecursive, PLUQ if FfpackTileRecursive) */ template void getEchelonForm (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, typename Field::Element_ptr A, const size_t lda, const FFPACK_LU_TAG LuTag = FfpackSlabRecursive); /** Extracts a transformation matrix to echelon form from a compact storage A=L\U * of rank R obtained by RowEchelonForm or ColumnEchelonForm. * If Uplo == FflasLower: * T is N x N (already allocated) such that A T = C is a transformation of A in * Column echelon form * Else * T is M x M (already allocated) such that T A = E is a transformation of A in * Row Echelon form * @param F: base field * @param UpLo: Lower means Transformation to Column Echelon Form, Upper, to Row Echelon Form * @param diag: selects if the echelon matrix has unit pivots * @param M: row dimension of A * @param N: column dimension of A * @param R: rank of the triangular matrix * @param P: permutation matrix * @param A: input matrix * @param lda: leading dimension of A * @param T: output matrix * @param ldt: leading dimension of T * @param LuTag: which factorized form (CUP/PLE if FfpackSlabRecursive, PLUQ if FfpackTileRecursive) */ template void getEchelonTransform (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, const size_t* Q, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr T, const size_t 
ldt, const FFPACK_LU_TAG LuTag = FfpackSlabRecursive); /** Extracts a matrix in echelon form from a compact storage A=L\U of rank R obtained by * ReducedRowEchelonForm or ReducedColumnEchelonForm with transform = true. * Either L or U is in Echelon form (depending on Uplo) * The echelon structure is defined by the first R values of the array P. * row and column dimension of T are greater or equal to that of A * @param F: base field * @param UpLo: selects if the upper or lower triangular matrix is returned * @param diag: selects if the echelon matrix has unit pivots * @param M: row dimension of T * @param N: column dimension of T * @param R: rank of the triangular matrix (how many rows/columns need to be copied) * @param P: positions of the R pivots * @param A: input matrix * @param lda: leading dimension of A * @param ldt: leading dimension of T * @param LuTag: which factorized form (CUP/PLE if FfpackSlabRecursive, PLUQ if FfpackTileRecursive) * @param OnlyNonZeroVectors: decides whether the last zero rows/columns should be ignored */ template void getReducedEchelonForm (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr T, const size_t ldt, const bool OnlyNonZeroVectors = false, const FFPACK_LU_TAG LuTag = FfpackSlabRecursive); /** Cleans up a compact storage A=L\U of rank R obtained by ReducedRowEchelonForm or * ReducedColumnEchelonForm with transform = true. * Either L or U is in Echelon form (depending on Uplo) * The echelon structure is defined by the first R values of the array P. 
* @param F: base field * @param UpLo: selects if the upper or lower triangular matrix is returned * @param diag: selects if the echelon matrix has unit pivots * @param M: row dimension of A * @param N: column dimension of A * @param R: rank of the triangular matrix (how many rows/columns need to be copied) * @param P: positions of the R pivots * @param A: input/output matrix * @param lda: leading dimension of A * @param LuTag: which factorized form (CUP/PLE if FfpackSlabRecursive, PLUQ if FfpackTileRecursive) */ template void getReducedEchelonForm (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, typename Field::Element_ptr A, const size_t lda, const FFPACK_LU_TAG LuTag = FfpackSlabRecursive); /** Extracts a transformation matrix to echelon form from a compact storage A=L\U * of rank R obtained by RowEchelonForm or ColumnEchelonForm. * If Uplo == FflasLower: * T is N x N (already allocated) such that A T = C is a transformation of A in * Column echelon form * Else * T is M x M (already allocated) such that T A = E is a transformation of A in * Row Echelon form * @param F: base field * @param UpLo: selects Col or Row Echelon Form * @param diag: selects if the echelon matrix has unit pivots * @param M: row dimension of A * @param N: column dimension of A * @param R: rank of the triangular matrix * @param P: permutation matrix * @param A: input matrix * @param lda: leading dimension of A * @param T: output matrix * @param ldt: leading dimension of T * @param LuTag: which factorized form (CUP/PLE if FfpackSlabRecursive, PLUQ if FfpackTileRecursive) */ template void getReducedEchelonTransform (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, const size_t* Q, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr T, const size_t ldt, const FFPACK_LU_TAG LuTag = FfpackSlabRecursive); /** Auxiliary routine: determines the 
permutation that changes a PLUQ decomposition * into an echelon form revealing PLUQ decomposition */ void PLUQtoEchelonPermutation (const size_t N, const size_t R, const size_t * P, size_t * outPerm); } // FFPACK // #include "ffpack.inl" namespace FFPACK { /* not used */ /** LQUPtoInverseOfFullRankMinor. * Suppose A has been factorized as L.Q.U.P, with rank r. * Then Qt.A.Pt has an invertible leading principal r x r submatrix * This procedure efficiently computes the inverse of this minor and puts it into X. * @note It changes the lower entries of A_factors in the process (NB: unless A was nonsingular and square) * * @param F * @param rank rank of the matrix. * @param A_factors matrix containing the L and U entries of the factorization * @param lda * @param QtPointer theLQUP->getQ()->getPointer() (note: getQ returns Qt!) * @param X desired location for output * @param ldx */ template typename Field::Element_ptr LQUPtoInverseOfFullRankMinor( const Field& F, const size_t rank, typename Field::Element_ptr A_factors, const size_t lda, const size_t* QtPointer, typename Field::Element_ptr X, const size_t ldx); } // FFPACK // include precompiled instantiation headers (to avoid recompiling them) #ifdef FFPACK_COMPILED #include "fflas-ffpack/interfaces/libs/ffpack_inst.h" #endif //--------------------------------------------------------------------- // Checkers #include "fflas-ffpack/checkers/checkers_ffpack.h" //--------------------------------------------------------------------- #include "ffpack_fgesv.inl" #include "ffpack_fgetrs.inl" #include "ffpack_ftrtr.inl" //--------------------------------------------------------------------- // Checkers #include "fflas-ffpack/checkers/checkers_ffpack.inl" //--------------------------------------------------------------------- #include "ffpack_pluq.inl" #include "ffpack_pluq_mp.inl" #include "ffpack_ppluq.inl" #include "ffpack_ludivine.inl" #include "ffpack_ludivine_mp.inl" #include "ffpack_echelonforms.inl" #include 
"ffpack_invert.inl" #include "ffpack_charpoly_kglu.inl" #include "ffpack_charpoly_kgfast.inl" #include "ffpack_charpoly_kgfastgeneralized.inl" #include "ffpack_charpoly_danilevski.inl" #include "ffpack_charpoly.inl" #include "ffpack_frobenius.inl" #include "ffpack_minpoly.inl" #include "ffpack_krylovelim.inl" #include "ffpack_permutation.inl" #include "ffpack_rankprofiles.inl" #include "ffpack.inl" #endif // __FFLASFFPACK_ffpack_H fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack.inl000066400000000000000000000264201274716147400215740ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack.inl * Copyright (C) 2014 FFLAS-FFACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ffpack_INL #define __FFLASFFPACK_ffpack_INL namespace FFPACK { template size_t Rank (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda) { if (M == 0 and N == 0) return 0 ; size_t *P = FFLAS::fflas_new(N); size_t *Q = FFLAS::fflas_new(M); size_t R = LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, N, A, lda, P, Q); FFLAS::fflas_delete( Q); FFLAS::fflas_delete( P); return R; } template bool IsSingular (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda) { if ( (M==0) and (N==0) ) return false; if ( (M==0) or (N==0) ) return true; if ( M != N ) return true ; size_t *P = FFLAS::fflas_new(N); size_t *Q = FFLAS::fflas_new(M); bool singular = !LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, N, A, lda, P, Q, FfpackSingular); FFLAS::fflas_delete( P); FFLAS::fflas_delete( Q); return singular; } template typename Field::Element Det( const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda) { if ( (M==0) and (N==0) ) return F.one ; if ( (M==0) or (N==0) ) return F.zero ; if ( M != N ) return F.zero ; typename Field::Element det; F.init(det); bool singular; size_t *P = FFLAS::fflas_new(N); size_t *Q = FFLAS::fflas_new(M); singular = !LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, N, A, lda, P, Q, FfpackSingular); if (singular){ F.assign(det,F.zero); FFLAS::fflas_delete( P); FFLAS::fflas_delete( Q); return det; } else{ F.assign(det,F.one); typename Field::Element_ptr Ai=A; for (; Ai < A+ M*lda+N; Ai+=lda+1 ) F.mulin( det, *Ai ); int count=0; for (size_t i=0;i typename Field::Element_ptr Solve( const Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr x, const int incx, typename Field::ConstElement_ptr b, const int incb ) { size_t *P = FFLAS::fflas_new(M); size_t *rowP = FFLAS::fflas_new(M); if (LUdivine( F, FFLAS::FflasNonUnit, 
FFLAS::FflasNoTrans, M, M, A, lda, P, rowP) < M){ std::cerr<<"SINGULAR MATRIX"< void RandomNullSpaceVector (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t incX) { // Right kernel vector: X s.t. AX == 0 if (Side == FFLAS::FflasRight) { size_t* P = FFLAS::fflas_new(N); size_t* Qt = FFLAS::fflas_new(M); size_t R = LUdivine(F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, N, A, lda, P, Qt); FFLAS::fflas_delete(Qt); // Nullspace is {0} if (N == R) { FFLAS::fzero(F, N, X, incX); FFLAS::fflas_delete(P); return; } // We create t (into X) not null such that U * t == 0, i.e. U1 * t1 == -U2 * t2 // Random after rank is passed (t2) typename Field::RandIter g(F); for (size_t i = R; i < N; ++i) g.random(*(X + i * incX)); // Nullspace is total, any random vector would do if (R == 0) { FFLAS::fflas_delete(P); return; } // Compute -U2 * t2 (into t1 as temporary) FFLAS::fgemv(F, FFLAS::FflasNoTrans, R, N - R, F.mOne, A + R, lda, X + R * incX, incX, 0u, X, incX); // Now get t1 such that U1 * t1 == -U2 * t2 FFLAS::ftrsv(F, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, R, A, lda, X, (int)incX); applyP(F, FFLAS::FflasLeft, FFLAS::FflasTrans, 1u, 0u, (int) R, X, 1u, P); FFLAS::fflas_delete(P); } // Left kernel vector else { size_t* P = FFLAS::fflas_new(M); size_t* Qt = FFLAS::fflas_new(N); size_t R = LUdivine(F, FFLAS::FflasNonUnit, FFLAS::FflasTrans, M, N, A, lda, P, Qt); FFLAS::fflas_delete(Qt); // Nullspace is {0} if (M == R) { FFLAS::fzero(F, M, X, incX); FFLAS::fflas_delete(P); return; } // We create t (into X) not null such that t * L == 0, i.e. 
t1 * L1 == -t2 * L2 // Random after rank is passed (t2) typename Field::RandIter g(F); for (size_t i = R; i < M; ++i) g.random(*(X + i * incX)); // Nullspace is total, any random vector would do if (R == 0) { FFLAS::fflas_delete(P); return; } // Compute -t2 * L2 (into t1 as temporary) FFLAS::fgemv(F, FFLAS::FflasTrans, M - R, R, F.mOne, A + R * lda, lda, X + R * incX, incX, 0u, X, incX); // Now get t1 such that t1 * L1 == -t2 * L2 FFLAS::ftrsv(F, FFLAS::FflasLower, FFLAS::FflasTrans, FFLAS::FflasNonUnit, R, A, lda, X, (int)incX); applyP(F, FFLAS::FflasRight, FFLAS::FflasNoTrans, 1u, 0u, (int) R, X, 1u, P); FFLAS::fflas_delete(P); } } template size_t NullSpaceBasis (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr& NS, size_t& ldn, size_t& NSdim) { if (Side == FFLAS::FflasRight) { // Right NullSpace size_t* P = FFLAS::fflas_new(N); size_t* Qt = FFLAS::fflas_new(M); size_t R = LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, N, A, lda, P, Qt); delete [] Qt; ldn = N-R; NSdim = ldn; if (NSdim == 0) { FFLAS::fflas_delete( P); NS = NULL ; return NSdim ; } NS = FFLAS::fflas_new (F, N, ldn); if (R == 0) { FFLAS::fflas_delete( P); FFLAS::fidentity(F,N,ldn,NS,ldn); return NSdim; } FFLAS::fassign (F, R, ldn, A + R, lda, NS , ldn ); ftrsm (F, FFLAS::FflasLeft, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, R, ldn, F.mOne, A, lda, NS, ldn); FFLAS::fidentity(F,NSdim,NSdim,NS+R*ldn,ldn); applyP (F, FFLAS::FflasLeft, FFLAS::FflasTrans, NSdim, 0,(int) R, NS, ldn, P); delete [] P; return NSdim; } else { // Left NullSpace size_t* P = FFLAS::fflas_new(M); size_t* Qt = FFLAS::fflas_new(N); size_t R = LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasTrans, M, N, A, lda, P, Qt); delete [] Qt; ldn = M; NSdim = M-R; if (NSdim == 0) { FFLAS::fflas_delete( P); NS = NULL; return NSdim; } NS = FFLAS::fflas_new (F, NSdim, ldn); if (R == 0) { FFLAS::fflas_delete( P); 
FFLAS::fidentity(F,NSdim,ldn,NS,ldn); return NSdim; } FFLAS::fassign (F, NSdim, R, A + R *lda, lda, NS, ldn); ftrsm (F, FFLAS::FflasRight, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, NSdim, R, F.mOne, A, lda, NS, ldn); FFLAS::fidentity(F,NSdim,NSdim,NS+R,ldn); applyP (F, FFLAS::FflasRight, FFLAS::FflasNoTrans, NSdim, 0,(int) R, NS, ldn, P); delete [] P; return NSdim; } } template void solveLB( const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, typename Field::Element_ptr L, const size_t ldl, const size_t * Q, typename Field::Element_ptr B, const size_t ldb ) { size_t LM = (Side == FFLAS::FflasRight)?N:M; int i = (int)R ; for (; i--; ){ // much faster for if ( Q[i] > (size_t) i){ //for (size_t j=0; j<=Q[i]; ++j) //F.init( *(L+Q[i]+j*ldl), 0 ); //std::cerr<<"1 deplacement "<"< (size_t) ii){ //for (size_t j=0; j<=Q[ii]; ++j) //F.init( *(L+Q[ii]+j*ldl), 0 ); FFLAS::fassign( F, LM-Q[ii]-1, L+Q[ii]*(ldl+1)+ldl,ldl, L+(Q[ii]+1)*ldl+ii, ldl ); for ( size_t j=Q[ii]*ldl; j void solveLB2( const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, typename Field::Element_ptr L, const size_t ldl, const size_t * Q, typename Field::Element_ptr B, const size_t ldb ) { typename Field::Element_ptr Lcurr, Rcurr, Bcurr; size_t ib, Ldim; int k; if ( Side == FFLAS::FflasLeft ){ size_t j = 0; while ( j= 0 ) { ib = Q[j]; k = (int) ib; while ( (j >= 0) && ( (int)Q[j] == k) ) {--k;--j;} Ldim = ib-(size_t)k; Lcurr = L + j+1 + (k+1)*(int)ldl; Bcurr = B + ib+1; Rcurr = Lcurr + Ldim*ldl; fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M, Ldim, N-ib-1, F.mOne, Bcurr, ldb, Rcurr, ldl, F.one, Bcurr-Ldim, ldb); ftrsm (F, Side, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasUnit, M, Ldim, F.one, Lcurr, ldl , Bcurr-Ldim, ldb ); } } } } // FFPACK #endif // __FFLASFFPACK_ffpack_INL 
fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack_charpoly.inl000066400000000000000000000265101274716147400234750ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_charpoly.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_charpoly_INL #define __FFLASFFPACK_charpoly_INL namespace FFPACK { // template // std::list >& // CharPoly_convert (const Field& F, std::list >& charp, const size_t N, // typename Field::Element_ptr A, const size_t lda, // const FFPACK_CHARPOLY_TAG CharpTag) // { // Givaro::ModularBalanced G((FloatElement) F.cardinality()); // FloatElement* Af = FFLAS::fflas_new(N*N); // typename std::list< Polynomial > charp_float; // fconvert(F, M, N, Af, N, A, lda); // //convertir aussi le poly // CharPoly (G, charp_float, N, Af, N, CharpTag); // finit(F, ma, Yf, 1, Y, incY); // fflas_delete (Af); // return charp; // } template std::list& CharPoly (const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, const FFPACK_CHARPOLY_TAG CharpTag) { // if (Protected::AreEqual >::value || // Protected::AreEqual >::value){ // if (F.characteristic() < DOUBLE_TO_FLOAT_CROSSOVER) // return CharPoly_convert (F, charp, N, A, lda, CharpTag); // } switch (CharpTag) { case FfpackLUK: { typename Field::Element_ptr X = FFLAS::fflas_new (F, N, N+1); Protected::LUKrylov (F, charp, N, A, lda, X, N); FFLAS::fflas_delete (X); return charp; } case FfpackKG: { return Protected::KellerGehrig (F, charp, N, A, lda); // break; } case FfpackDanilevski: { return Danilevski (F, charp, N, A, lda); // break; } case FfpackKGFast: { size_t mc, mb, j; if (Protected::KGFast (F, charp, N, A, lda, &mc, &mb, &j)){ std::cerr<<"NON GENERIC MATRIX PROVIDED TO KELLER-GEHRIG-FAST"<(F.characteristic()); // Heuristic condition (the pessimistic theoretical one being p<2n^2. 
if (p < static_cast(N)){ return CharPoly (F, charp, N, A, lda, FfpackLUK); } do{ try { CharpolyArithProg (F, charp, N, A, lda, __FFPACK_CHARPOLY_THRESHOLD); } catch (CharpolyFailed){ if (attempts++ < 2) cont = true; else return CharPoly(F, charp, N, A, lda, FfpackLUK); } } while (cont); return charp; } default: { typename Field::Element_ptr X = FFLAS::fflas_new (F, N, N+1); Protected::LUKrylov (F, charp, N, A, lda, X, N); FFLAS::fflas_delete (X); return charp; } } } template Polynomial & mulpoly(const Field& F, Polynomial &res, const Polynomial & P1, const Polynomial & P2) { size_t i,j; // Warning: assumes that res is allocated to the size of the product res.resize(P1.size()+P2.size()-1); FFLAS::fzero(F,res.size(),&res[0],1); for ( i=0;i Polynomial& CharPoly( const Field& F, Polynomial& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, const FFPACK_CHARPOLY_TAG CharpTag/*= FfpackArithProg*/) { Checker_charpoly checker(F,N,A,lda); std::list factor_list; CharPoly (F, factor_list, N, A, lda, CharpTag); typename std::list::const_iterator it; it = factor_list.begin(); charp.resize(N+1); Polynomial P = charp = *(it++); while( it!=factor_list.end() ){ mulpoly (F,charp, P, *it); P = charp; ++it; } checker.check(charp); return charp; } namespace Protected { template std::list& LUKrylov (const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx) { typedef typename Field::Element elt; elt* Ai, *Xi, *X2=X; int Ncurr=int(N); charp.clear(); int nbfac = 0; while (Ncurr > 0){ size_t *P = FFLAS::fflas_new((size_t)Ncurr); Polynomial minP;//=new Polynomial(); FFPACK::MinPoly (F, minP, (size_t)Ncurr, A, lda, X2, ldx, P); int k = int(minP.size()-1); // degre of minpoly if ((k==1) && F.isZero ((minP)[0])){ // minpoly is X Ai = A; int j = Ncurr*Ncurr; while (j-- && F.isZero(*(Ai++))) ; if (!j){ // A is 0, CharPoly=X^n minP.resize((size_t)Ncurr+1); (minP)[1] = F.zero; 
(minP)[(size_t)Ncurr] = F.one; k=Ncurr; } } nbfac++; charp.push_front (minP); if (k==Ncurr){ FFLAS::fflas_delete( P); return charp; } size_t Nrest = (size_t)(Ncurr-k); elt * X21 = X2 + k*(int)ldx; elt * X22 = X21 + k; // Compute the n-k last rows of A' = PA^tP^t in X2_ // A = A . P^t applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, (size_t)Ncurr, 0, (int)k, A, lda, P); // Copy X2_ = (A'_2)^t for (Xi = X21, Ai = A+k; Xi != X21 + Nrest*ldx; Ai++, Xi+=ldx-(size_t)Ncurr) for (size_t jj=0; jj<(size_t)Ncurr*lda; jj+=lda) *(Xi++) = *(Ai+jj); // A = A . P : Undo the permutation on A applyP (F, FFLAS::FflasRight, FFLAS::FflasNoTrans, (size_t)Ncurr, 0, (int)k, A, lda, P); // X2_ = X2_ . P^t (= (P A^t P^t)2_) applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, Nrest, 0, (int)k, X21, ldx, P); FFLAS::fflas_delete( P ); // X21 = X21 . S1^-1 ftrsm(F, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasUnit, Nrest, (size_t)k, F.one, X2, ldx, X21, ldx); // Creation of the matrix A2 for recurise call for (Xi = X22, Ai = A; Xi != X22 + Nrest*ldx; Xi += (ldx-Nrest), Ai += (lda-Nrest)) for (size_t jj=0; jj std::list& LUKrylov_KGFast (const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx) { size_t kg_mc, kg_mb, kg_j; if (!KGFast (F, charp, N, A, lda, &kg_mc, &kg_mb, &kg_j)) return charp; else{// Matrix A is not generic Polynomial *minP = new Polynomial(); typename Field::ConstElement_ptr Ai; typename Field::Element_ptr A2i, Xi; size_t *P = FFLAS::fflas_new(N); FFPACK::MinPoly (F, *minP, N, A, lda, X, ldx, P, FfpackKGF, kg_mc, kg_mb, kg_j); size_t k = minP->size()-1; // degre of minpoly if ((k==1) && F.isZero ((*minP)[0])){ // minpoly is X Ai = A; int j = int(N*N); while (j-- && F.isZero(*(Ai++))) ; if (!j){ // A is 0, CharPoly=X^n minP->resize((size_t)N+1); (*minP)[1] = F.zero; (*minP)[N] = F.one; k=N; } } if (k==N){ charp.clear(); charp.push_front(*minP); // CharPoly = MinPoly 
FFLAS::fflas_delete( P); return charp; } size_t Nrest = N-k; typename Field::Element_ptr X21 = X + k*ldx; typename Field::Element_ptr X22 = X21 + k; // Creates the matrix A //size_t lambda = std::max(0,N - kg_mc*(kg_j+1) - kg_mb); // uint >= 0 !!! size_t lambda = kg_mc*(kg_j+1) + kg_mb; if (lambda > N) lambda = 0 ; else lambda = N - lambda ; size_t imax = kg_mc+kg_mb; // First Id for (size_t j = 0; j < lambda; ++j){ for (size_t i=0; i*/(A), lda, P); // Copy X2_ = (A'2_) for (Xi = X21, Ai = A+k*lda; Xi != X21 + Nrest*ldx; Ai+=lda-N, Xi+=ldx-N){ for (size_t jj=0; jj*/(A), lda, P); // X2_ = X2_ . P^t (= (P A P^t)2_) applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, Nrest, 0,(int) k, X21, ldx, P); // X21 = X21 . S1^-1 ftrsm(F, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasUnit, Nrest, k, F.one, X, ldx, X21, ldx); // Creation of the matrix A2 for recurise call typename Field::Element_ptr A2 = FFLAS::fflas_new (F, Nrest, Nrest); for (Xi = X22, A2i = A2; Xi != X22 + Nrest*ldx; Xi += (ldx-Nrest)){ for (size_t jj=0; jjs,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_charpoly_danilevski.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_ffpack_charpoly_danilveski_INL #define __FFLASFFPACK_ffpack_charpoly_danilveski_INL namespace FFPACK { //--------------------------------------------------------------------- // CharPoly: Compute the characteristic polynomial of A using // Danilevski's algorithm. //--------------------------------------------------------------------- template std::list& Danilevski (const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda) { charp.clear(); size_t dtot=0; typename Field::Element_ptr pivot,e,u1; typename Field::Element invp; for (size_t k=0; k k + 1) { FFLAS::fswap (F, N-k, e, 1, pivot, 1); FFLAS::fswap (F, N, A+i, lda, A+k+1, lda); } F.inv (invp, *pivot); FFLAS::fscalin (F, N-k-1, invp, pivot+1, 1); FFLAS::fscalin (F, N-dtot, *pivot, A+dtot*lda+k+1, lda); // X <- X - uw FFLAS::fger (F, k + 1-dtot, N - k -1, F.mOne, A + dtot*lda + k, lda, pivot+1, 1, A+k+1+dtot*lda, lda); if (koperator[](i), *(Ai+i*lda)); } F.assign( (*P)[d], F.one); charp.push_front(*P); dtot+=d; } } return charp; } } // FFPACK #endif // __FFLASFFPACK_ffpack_charpoly_danilveski_INL fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack_charpoly_kgfast.inl000066400000000000000000000215051274716147400250330ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_charpoly_kgfast.inl * Copyright (C) 2004 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_ffpack_charpoly_kgfast_INL #define __FFLASFFPACK_ffpack_charpoly_kgfast_INL namespace FFPACK { namespace Protected { //--------------------------------------------------------------------- // CharPoly: Compute the characteristic polynomial of A using // Keller-Gehrig's fast algorithm. A must be generic. 
//--------------------------------------------------------------------- template int KGFast ( const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t * kg_mc, size_t* kg_mb, size_t* kg_j ) { //std::cerr<<"Dans KGFast"<>1; // Matrix A is transformed into a mc_Frobenius form size_t mb=N-mc; typename Field::Element_ptr C, B; while ( mc > 0 ) { // size_t r; #if 0 std::cerr<<"Boucle1: mc,mb,N="<(mc); size_t * Q = FFLAS::fflas_new(mc); if ( (LUdivine( F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, mc, mc, LUP, mc, P, Q)) < mc ){ * kg_mc = mc; * kg_mb = mb; * kg_j = j; FFLAS::fflas_delete( P); FFLAS::fflas_delete( Q); FFLAS::fflas_delete (LUP); return -1; } #if 0 std::cerr<<"LUP="< B2;B1 typename Field::Element_ptr tmp = FFLAS::fflas_new (F, mc, mb); // for (size_t i=0; i 0 ){ #if 0 std::cerr<<"lambda>0"<>=1; mb -= mc; } Polynomial *minP = new Polynomial(); minP->resize(N+1); minP->operator[](N) = F.one; typename Polynomial::iterator it = minP->begin(); for (size_t j=0; j void fgemv_kgf( const Field& F, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::ConstElement_ptr X, const size_t incX, typename Field::Element_ptr Y, const size_t incY, const size_t kg_mc, const size_t kg_mb, const size_t kg_j ) { size_t big_truc =kg_mb-kg_mc*(kg_j+1) ; size_t lambda = (Ns,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas-ffpack/ffpack/ffpack_charpoly_kgfast.inl * Copyright (C) 2004 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_ffpack_charpoly_kgfastgeneralized_INL #define __FFLASFFPACK_ffpack_charpoly_kgfastgeneralized_INL //--------------------------------------------------------------------- // CharPoly: Compute the characteristic polynomial of A using // Keller-Gehrig's fast algorithm. //--------------------------------------------------------------------- //#define LB_DEBUG #include // std::cout #ifdef LB_DEBUG #include "tests/Matio.h" namespace FFPACK { template void printA(const Field& F, std::ostream& os, typename Field::ConstElement_ptr E, typename Field::ConstElement_ptr C, const size_t lda, const size_t*B, const size_t*T, const size_t me,const size_t mc, const size_t lambda, const size_t mu) { typename Field::Element_ptr A = buildMatrix(F,E,C,lda,B,T,me,mc,lambda,mu); size_t N = mc+me+lambda+mu; write_field(F,os,A,N,N,N); FFLAS::fflas_delete (A); } } // FFPACK #endif namespace FFPACK { template typename Field::Element_ptr buildMatrix (const Field& F, typename Field::ConstElement_ptr E, typename Field::ConstElement_ptr C, const size_t lda, const size_t*B, const size_t*T, const size_t me, const size_t mc, const size_t lambda, const size_t mu) { size_t N = mc+me+lambda+mu; typename Field::Element_ptr A = FFLAS::fflas_new (F, N, N); for (size_t j=0; j std::list& KGFast_generalized (const Field& F, std::list& charp, const size_t N, typename Field::Element_ptr A, const size_t lda) { //std::cerr<<"Dans KGFast"<>1; // Matrix A is transformed into a mc_Frobenius form size_t 
me=N-mc; // B[i] = j, the row of the 1 if the col Ai is sparse; // B[i] = n+k, if the col Ai is the kth col of E size_t * B = FFLAS::fflas_new(N); bool * allowedRows = FFLAS::fflas_new(N); for (size_t i=0;i<(N+1)/2;++i) allowedRows[i]=true; // T[i] = j si T_i,j = 1 size_t * T = FFLAS::fflas_new(N); for (size_t i=0;i 0) { #ifdef LB_DEBUG std::cerr<<"Boucle1: mc,me,lambda="<(ncols); size_t * Q = FFLAS::fflas_new(lambda+me); for (size_t i=0; i(lambda+me+mc); for (size_t i=0; i< lambda+me+mc; ++i) tempP[i] = i; for (int i = int(r) ; i--; ) if (Q[i] > (size_t) i){ #ifdef LB_DEBUG std::cerr<<"Permutation de tempP["<i) FFLAS::fassign(F, i, LUP+Q[i]*mc,1, LUP+i*mc, 1); #ifdef LB_DEBUG std::cerr<<"..done"< E2;E1 std::cerr<<"// Shifting E: E1;E2 -> E2;E1"; #endif typename Field::Element_ptr tmp = FFLAS::fflas_new (F, r, me); for (size_t i=0; i C_{2,2};C_{1,2} std::cerr<<"// Shifting C_{*,2}: C_{1,2};C_{2,2} -> C_{2,2};C_{1,2}"; #endif tmp = FFLAS::fflas_new (F, r, mc-r); for (size_t i=0; i= N){ FFLAS::fassign (F, r, C+i*lda, 1, tmp+(B[i]-N)*r, 1); } fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, mu + r, r, me, F.one, E+(N-mu-r)*lda, lda, tmp, r, F.one, C+(N-mu-mc)*lda, lda); FFLAS::fflas_delete (tmp); #ifdef LB_DEBUG std::cerr<<"..done"<= (int) (N -mu-r); --i) FFLAS::fassign (F, r, C+((size_t)i-mc+r)*lda, 1, C+i*(int)lda, 1); #ifdef LB_DEBUG std::cerr<<"..done"<= N){ #ifdef LB_DEBUG std::cerr<<"saving in row "< C_2"; #endif tmp = FFLAS::fflas_new (F, N, r); for (size_t j = 0; j= N){ #ifdef LB_DEBUG std::cerr<<"B["< typename Field::Element_ptr fgetrs (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t NRHS, const size_t R, typename Field::Element_ptr A, const size_t lda, const size_t *P, const size_t *Q, typename Field::Element_ptr X, const size_t ldx, typename Field::ConstElement_ptr B, const size_t ldb, int * info) { *info =0; typename Field::Element_ptr W; size_t ldw; if (Side == FFLAS::FflasLeft) { // Left looking solve A X = B 
// Initializing X to 0 (to be optimized) FFLAS::fzero(F,N,NRHS,X,ldx); // for (size_t i = 0; i N){ // Cannot copy B into X W = FFLAS::fflas_new (F, M, NRHS); ldw = NRHS; FFLAS::fassign(F,M,NRHS,B,ldb,W,ldw); solveLB2 (F, FFLAS::FflasLeft, M, NRHS, R, A, lda, Q, W, ldw); applyP (F, FFLAS::FflasLeft, FFLAS::FflasNoTrans, NRHS, 0,(int) R, W, ldw, Q); bool consistent = true; for (size_t i = R; i < M; ++i) for (size_t j = 0; j < NRHS; ++j) if (!F.isZero (*(W + i*ldw + j))) consistent = false; if (!consistent) { std::cerr<<"System is inconsistent"<=N FFLAS::fassign(F,NRHS,N,B,ldb,X,ldx); applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, NRHS, 0,(int) R, X, ldx, P); ftrsm (F, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, NRHS, R, F.one, A, lda , X, ldx); fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, NRHS, N-R, R, F.one, X, ldx, A+R, lda, F.mOne, X+R, ldx); bool consistent = true; for (size_t i = 0; i < NRHS; ++i) for (size_t j = R; j < N; ++j) if (!F.isZero (*(X + i*ldx + j))) consistent = false; if (!consistent) { std::cerr<<"System is inconsistent"<s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas-ffpack/ffpack/ffpack_frobenius.inl * Copyright (C) 2007 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include //--------------------------------------------------------------------- // CharpolyArithProg: Las Vegas algorithm to compute the Charpoly // over a large field (Z/pZ, s.t. p > 2n^2) //--------------------------------------------------------------------- // // namespace FFPACK { namespace Protected { template void CompressRows (Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs); template void CompressRowsQK (Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d,const size_t deg, const size_t nb_blocs); template void DeCompressRows (Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs); template void DeCompressRowsQK (Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t deg, const size_t nb_blocs); template void CompressRowsQA (Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs); template void DeCompressRowsQA (Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs); } // Protected } // FFPACK template std::list& FFPACK::CharpolyArithProg (const Field& F, std::list& frobeniusForm, const size_t N, typename 
Field::Element_ptr A, const size_t lda, const size_t c) { FFLASFFPACK_check(c); size_t * rp = FFLAS::fflas_new(2*N); size_t noc = static_cast(ceil(double(N)/double(c))); size_t Nnoc = N*noc; // Building the workplace matrix typename Field::Element_ptr K = FFLAS::fflas_new (F, Nnoc, c); typename Field::Element_ptr K2 = FFLAS::fflas_new (F, Nnoc, c); // for (size_t i = 0 ; i < Nnoc*c ; ++i) // K[i] = F.zero; size_t ldk = N; size_t *dA = FFLAS::fflas_new(N); //PA size_t *dK = FFLAS::fflas_new(noc*c); for (size_t i=0; i nzg (g); for (size_t i = 0; i < noc; ++i) for (size_t j = 0; j < N; ++j) g.random( *(K + i*ldk +j) ); for (size_t i = 0; i < noc; ++i) nzg.random (*(K + i*ldk +i)); // Computing the bloc Krylov matrix [U AU .. A^(c-1) U]^T for (size_t i = 1; i(N); size_t * Qk = FFLAS::fflas_new(N); for (size_t i=0; i dold){ // std::cerr << "FAIL in preconditionning phase:" // << " degree sequence is not monotonically not increasing" // << std::endl; FFLAS::fflas_delete( rp); FFLAS::fflas_delete (K); FFLAS::fflas_delete( Pk); FFLAS::fflas_delete( Qk); FFLAS::fflas_delete(dA); FFLAS::fflas_delete( dK); throw CharpolyFailed(); } dK[k] = dold = d; Mk++; if (d == c) nb_full_blocks++; if (row_idx < N) ii = Qk[row_idx]; } // Selection of the last iterate of each block typename Field::Element_ptr K3 = FFLAS::fflas_new (F, Mk, N); typename Field::Element_ptr K4 = FFLAS::fflas_new (F, Mk, N); size_t bk_idx = 0; for (size_t i = 0; i < Mk; ++i){ FFLAS::fassign (F, N, (K2 + (bk_idx + dK[i]-1)*ldk), 1, (K3+i*ldk), 1); bk_idx += c; } FFLAS::fflas_delete (K2); // K <- K A^T fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasTrans, Mk, N, N,F.one, K3, ldk, A, lda, F.zero, K4, ldk); // K <- K P^T applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, Mk, 0,(int) R, K4, ldk, Pk); // K <- K U^-1 ftrsm (F, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, Mk, R,F.one, K, ldk, K4, ldk); // L <- Q^T L applyP(F, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N, 0,(int) R, K, ldk, Qk); 
// K <- K L^-1 ftrsm (F, FFLAS::FflasRight, FFLAS::FflasLower, FFLAS::FflasNoTrans, FFLAS::FflasUnit, Mk, R,F.one, K, ldk, K4, ldk); //undoing permutation on L applyP(F, FFLAS::FflasLeft, FFLAS::FflasTrans, N, 0,(int) R, K, ldk, Qk); // Recovery of the completed invariant factors size_t Ma = Mk; size_t Ncurr = R; size_t offset = Ncurr-1; for (size_t i=Mk-1; i>=nb_full_blocks+1; --i){ if (dK[i] >= 1){ for (size_t j = offset+1; j polyList; polyList.clear(); // Recursive call on the complementary subspace CharPoly(F, polyList, Nrest, Arec, ldarec); FFLAS::fflas_delete (Arec); frobeniusForm.merge(polyList); } FFLAS::fflas_delete( Pk); FFLAS::fflas_delete( Qk); size_t deg = c+1; for (size_t i=0; i= 1) && (Mk > 1)) { size_t block_idx, it_idx, rp_val; FFLAS::fflas_delete (K); FFLAS::fflas_delete (K3); K = FFLAS::fflas_new (F, Ncurr, Ma); K3 = FFLAS::fflas_new (F, Ncurr, Ma); ldk = Ma; // Computation of the rank profile for (size_t i=0; i < Ncurr; ++i) for (size_t j=0; j < Ma; ++j) *(Arp + j*ldarp + Ncurr-i-1) = *(Ac + i*ldac + j); for (size_t i=0; i<2*Ncurr; ++i) rp[i] = 0; size_t RR; try{ RR = SpecRankProfile (F, Ma, Ncurr, Arp, ldarp, deg-1, rp); } catch (CharpolyFailed){ FFLAS::fflas_delete (Arp); FFLAS::fflas_delete (Ac); FFLAS::fflas_delete (K); FFLAS::fflas_delete (K3); FFLAS::fflas_delete( rp); FFLAS::fflas_delete( dA); FFLAS::fflas_delete( dK); throw CharpolyFailed(); } if (RR < Ncurr){ //std::cerr<<"FAIL RR dK[block_idx-1])){ FFLAS::fflas_delete (Arp); FFLAS::fflas_delete (Ac); FFLAS::fflas_delete (K); FFLAS::fflas_delete (K3); FFLAS::fflas_delete( rp); FFLAS::fflas_delete( dA); FFLAS::fflas_delete(dK); throw CharpolyFailed(); //std::cerr<<"FAIL d non decroissant"<(Mk); size_t *Q=FFLAS::fflas_new(Mk); if (LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, Mk, Mk , K3 + (Ncurr-Mk)*ldk, ldk, P, Q) < Mk){ // should never happen (not a LAS VEGAS check) //std::cerr<<"FAIL R2 < MK"<=nb_full_blocks+1; --i) if (dK[i] >= 1){ Polynomial PP (dK [i]+1); 
F.assign(PP[dK[i]],F.one); for (size_t j=0; j < dK[i]; ++j) F.neg( PP[dK[i]-j-1], *(K + i + (offset-j)*ldk)); frobeniusForm.push_front(PP); offset -= dK[i]; Ncurr -= dK[i]; } for (size_t i= offset+1; i void CompressRowsQK (Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t deg,const size_t nb_blocs) { int currtmp = 0; size_t currw = d[0]-1; size_t currr = d[0]-1; for (int i = 0; i< int(nb_blocs)-1; ++i){ // FFLAS::fassign(F,deg-d[i],M,A+currr*lda,lda,tmp+(size_t)currtmp*ldtmp); for (int j = int(d[i]-1); j void CompressRows (Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs) { size_t currd = d[0]-1; size_t curri = d[0]-1; for (int i = 0; i< int(nb_blocs)-1; ++i){ FFLAS::fassign(F, M, A + currd*lda, 1, tmp + i*(int)ldtmp, 1); for (int j=0; j < int(d[i+1]) -1; ++j){ FFLAS::fassign(F, M, A+(currd+(size_t)j+1)*lda, 1, A + (curri++)*lda, 1); } currd += d[i+1]; } for (int i=0; i < int(nb_blocs)-1; ++i){ FFLAS::fassign (F, M, tmp + i*(int)ldtmp, 1, A + (curri++)*lda, 1); } } template void DeCompressRows (Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs) { for (int i=0; i void DeCompressRowsQK (Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t deg,const size_t nb_blocs) { size_t zeroblockdim = 1; // the last block contributes with 1 size_t currtmp = 0; for (int i=0; i void CompressRowsQA (Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs) { size_t currd = 0; size_t curri = 0; for 
(size_t i = 0; i< nb_blocs; ++i){ FFLAS::fassign(F, M, A + currd*lda, 1, tmp + i*ldtmp, 1); for (size_t j=0; j < d[i] -1; ++j) FFLAS::fassign(F, M, A+(currd+j+1)*lda, 1, A + (curri++)*lda, 1); currd += d[i]; } for (size_t i=0; i < nb_blocs; ++i) FFLAS::fassign (F, M, tmp + i*ldtmp, 1, A + (curri++)*lda, 1); } template void DeCompressRowsQA (Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t ldtmp, const size_t * d, const size_t nb_blocs) { for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ffpack_ftrtr_INL #define __FFLASFFPACK_ffpack_ftrtr_INL namespace FFPACK { template void ftrtri (const Field& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG Diag, const size_t N, typename Field::Element_ptr A, const size_t lda) { if (N == 1){ if (Diag == FFLAS::FflasNonUnit) F.invin (*A); } else { size_t N1 = N/2; size_t N2 = N - N1; ftrtri (F, Uplo, Diag, N1, A, lda); ftrtri (F, Uplo, Diag, N2, A + N1*(lda+1), lda); if (Uplo == FFLAS::FflasUpper){ ftrmm (F, FFLAS::FflasLeft, Uplo, FFLAS::FflasNoTrans, Diag, N1, N2, F.one, A, lda, A + N1, lda); ftrmm (F, FFLAS::FflasRight, Uplo, FFLAS::FflasNoTrans, Diag, N1, N2, F.mOne, A + N1*(lda+1), lda, A + N1, lda); } else { ftrmm (F, FFLAS::FflasLeft, Uplo, FFLAS::FflasNoTrans, Diag, N2, N1, F.one, A + N1*(lda+1), lda, A + N1*lda, lda); ftrmm (F, FFLAS::FflasRight, Uplo, FFLAS::FflasNoTrans, Diag, N2, N1, F.mOne, A, lda, A + N1*lda, lda); } } } template void ftrtrm (const Field& F, const FFLAS::FFLAS_DIAG diag, const size_t N, typename Field::Element_ptr A, const size_t lda) { if (N == 1) return; size_t N1 = N/2; size_t N2 = N-N1; ftrtrm (F, diag, N1, A, lda); fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, N1, N1, N2, F.one, A+N1, lda, A+N1*lda, lda, F.one, A, lda); ftrmm (F, FFLAS::FflasRight, FFLAS::FflasLower, FFLAS::FflasNoTrans, (diag == FFLAS::FflasUnit) ? 
FFLAS::FflasNonUnit : FFLAS::FflasUnit, N1, N2, F.one, A + N1*(lda+1), lda, A + N1, lda); ftrmm (F, FFLAS::FflasLeft, FFLAS::FflasUpper, FFLAS::FflasNoTrans, diag, N2, N1, F.one, A + N1*(lda+1), lda, A + N1*lda, lda); ftrtrm (F, diag, N2, A + N1*(lda+1), lda); } template void trinv_left( const Field& F, const size_t N, typename Field::ConstElement_ptr L, const size_t ldl, typename Field::Element_ptr X, const size_t ldx ) { FFLAS::fassign(F,N,N,L,ldl,X,ldx); ftrtri (F, FFLAS::FflasLower, FFLAS::FflasUnit, N, X, ldx); //invL(F,N,L,ldl,X,ldx); } } // FFPACK #endif // __FFLASFFPACK_ffpack_ftrtr_INL fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack_invert.inl000066400000000000000000000073631274716147400231700ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ffpack_invert_INL #define __FFLASFFPACK_ffpack_invert_INL namespace FFPACK { template typename Field::Element_ptr Invert (const Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, int& nullity) { FFLASFFPACK_check(lda >= M); Checker_invert checker(F,M,A,lda); if (M == 0) { nullity = 0 ; return NULL ; } size_t * P = FFLAS::fflas_new(M); size_t * Q = FFLAS::fflas_new(M); size_t R = ReducedColumnEchelonForm (F, M, M, A, lda, P, Q, true); nullity = (int)(M - R); applyP (F, FFLAS::FflasLeft, FFLAS::FflasTrans, M, 0, (int)R, A, lda, P); delete [] P; delete [] Q; checker.check(A,nullity); return A; } template typename Field::Element_ptr Invert (const Field& F, const size_t M, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, int& nullity) { FFLASFFPACK_check(lda >= M); FFLASFFPACK_check(ldx >= M); if (M == 0) { nullity = 0 ; return NULL ; } FFLAS::fassign(F,M,M,A,lda,X,ldx); Invert (F, M, X, ldx, nullity); return X; } template typename Field::Element_ptr Invert2( const Field& F, const size_t M, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, int& nullity) { FFLASFFPACK_check(lda >= M); FFLASFFPACK_check(ldx >= M); if (M == 0) { nullity = 0 ; return NULL ; } size_t *P = FFLAS::fflas_new(M); size_t *rowP = FFLAS::fflas_new(M); nullity = int(M - LUdivine( F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, M, A, lda, P, rowP)); if (nullity > 0){ FFLAS::fflas_delete( P); FFLAS::fflas_delete( rowP); return NULL; } else { // Initializing X to 0 #if 0/* timer remnants */ t1.clear(); t1.start(); #endif //! 
@todo this init is not all necessary (done after ftrtri) FFLAS::fzero(F,M,M,X,ldx); // X = L^-1 in n^3/3 ftrtri (F, FFLAS::FflasLower, FFLAS::FflasUnit, M, A, lda); for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_krylovelim.inl * Copyright (C) 2007 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_ffpack_krylovelim_INL #define __FFLASFFPACK_ffpack_krylovelim_INL // A is m x n with m <= n // Ensures : rankprof is the row rankprofil of the matrix k x n matrix B formed as follows (k = sum d_i): // for d_i < j < d_{i+1} the jth row B_j = is e_j // B_{d_i} = A_i // iterates must be initialized by [1, 2, ...] 
// inviterates is the inverse finction of iterates template inline size_t FFPACK::KrylovElim( const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q, const size_t deg, size_t *iterates, size_t * inviterates,size_t maxit, size_t virt) { if ( !(M && N) ) return 0; if (M == 1){ virt += deg; for (size_t i=0; i1 size_t Nup = M>>1; size_t Ndown = M - Nup; // Recursive call on NW size_t R = KrylovElim (F, Nup, N, A, lda, P, Q, deg, iterates, inviterates, maxit, virt); typename Field::Element_ptr Ar = A + Nup*lda; // SW typename Field::Element_ptr Ac = A + R; // NE typename Field::Element_ptr An = Ar + R; // SE if (R){ // Ar <- Ar.P applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, Ndown, 0, (int)R, Ar, lda, P); // Ar <- Ar.U1^-1 ftrsm( F, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, FFLAS::FflasNonUnit, Ndown, R, F.one, A, lda, Ar, lda); // An <- An - Ar*Ac fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, Ndown, N-R, R, F.mOne, Ar, lda, Ac, lda, F.one, An, lda); } // Recursive call on SE size_t R2 = KrylovElim (F, Ndown, N-R, An, lda,P+R, Q+Nup, deg, iterates, inviterates, maxit, std::min(maxit-deg,(virt+Nup*deg))); for (size_t i = R; i < R + R2; ++i) P[i] += R; if (R2) // An <- An.P2 applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, Nup, (int)R, (int)(R+R2), A, lda, P); // Non zero row permutations for (size_t i = Nup; i < M; i++) Q[i] += Nup; if (R < Nup){ // Permutation of the 0 rows for ( size_t i = Nup, j = R ; i < Nup + R2; ++i, ++j){ FFLAS::fassign( F, N - j, A + i*lda + j, 1, A + j*(lda + 1), 1); for (typename Field::Element_ptr Ai = A + i*lda + j; Ai != A + i*lda + N; ++Ai) F.assign (*Ai, F.zero); size_t t = Q[j]; Q[j]=Q[i]; Q[i] = t; } } return R + R2; } } template size_t FFPACK::SpecRankProfile (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, const size_t deg, size_t *rankProfile) { //size_t deg = (N-1)/M+1; // Number of trivial iterates 
per blocs size_t * Q = FFLAS::fflas_new(M); size_t * P = FFLAS::fflas_new(N); size_t * iterates = FFLAS::fflas_new(N); size_t * inviterates = FFLAS::fflas_new(N+1); for (size_t i=0; i < N; ++i) inviterates[i+1] = iterates[i] = i+1; size_t R = KrylovElim (F, M, N, A, lda, P, Q, deg, iterates, inviterates, N,0); #if 0 cerr<<"Apres tout iterates = "<s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_ludivine.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ffpack_ludivine_INL #define __FFLASFFPACK_ffpack_ludivine_INL #include "fflas-ffpack/fflas/fflas_bounds.inl" //#define LB_DEBUG namespace FFPACK { template inline size_t LUdivine_gauss( const Field& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag) { size_t MN = std::min(M,N); typename Field::Element_ptr Acurr = A; size_t r = 0; for (size_t k = 0; k < MN; ++k){ size_t p = r; Acurr = A+k*lda+r; while ((p < N) && F.isZero (*(Acurr++))) p++; if (p < N){ P[r] = p; if (r < k){ FFLAS::fassign (F, N-r, (A+k*lda+r),1, (A + r*(lda+1)), 1); Acurr = A+r+k*lda; for (size_t i=r; i class callLUdivine_small; template inline size_t LUdivine_small( const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag) { return callLUdivine_small () (F, Diag, trans, M, N, A, lda, P, Q, LuTag); } template class callLUdivine_small { public: template inline size_t operator()( const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag) { if ( !(M && N) ) return 0; typedef typename Field::Element elt; typedef typename Field::Element_ptr elt_ptr; elt_ptr Aini = A; elt_ptr Acurr; size_t rowp = 0; size_t R = 0; size_t k = 0; while ((rowp i){ FFLAS::fassign (F, l-i, Aini+(Q[i]-i)*lda, 1, Aini, 1); for (size_t j=0; j class callLUdivine_small { public: template inline size_t operator()( const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag) { if ( !(M && N) ) return 0; typedef typename 
Field::Element elt; elt * Aini = A; elt * Acurr; size_t rowp = 0; size_t R = 0; size_t k = 0; size_t delay =0; size_t kmax = FFLAS::Protected::DotProdBoundClassic (F, F.one) -1; // the max number of delayed operations while ((rowp= kmax){ // Reduction has to be done delay = 0; FFLAS::freduce (F, M-rowp-1,N-k-1, Aini+lda+1, lda); // for (size_t i=1; i i){ FFLAS::fassign (F, l-i, Aini+(Q[i]-i)*lda, 1, Aini, 1); for (size_t j=0; j class callLUdivine_small { public: template inline size_t operator()( const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag) { if ( !(M && N) ) return 0; typedef typename Field::Element elt; elt * Aini = A; elt * Acurr; size_t rowp = 0; size_t R = 0; size_t k = 0; size_t delay =0; size_t kmax = FFLAS::Protected::DotProdBoundClassic (F, F.one) -1; // the max number of delayed operations while ((rowp= kmax){ // Reduction has to be done delay = 0; FFLAS::freduce (F, M-rowp-1, N-k-1, Aini+lda+1, lda); // for (size_t i=1; i i){ FFLAS::fassign (F, l-i, Aini+(Q[i]-i)*lda, 1, Aini, 1); for (size_t j=0; j inline size_t LUdivine (const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q , const FFPACK::FFPACK_LU_TAG LuTag // =FFPACK::FfpackSlabRecursive , const size_t cutoff // =__FFPACK_LUDIVINE_CUTOFF ) { if ( !(M && N) ) return 0; typedef typename Field::Element elt; size_t MN = std::min(M,N); size_t incRow, incCol, rowDim, colDim; if (trans == FFLAS::FflasTrans){ incRow = 1; incCol = lda; colDim = M; rowDim = N; } else { incRow = lda; incCol = 1; colDim = N; rowDim = M; } if ((rowDim < cutoff) && (colDim < 2*cutoff)) { // the coeff 2 is experimentally determined! 
return LUdivine_small (F, Diag, trans, M, N, A, lda, P, Q, LuTag); } else { // recursively : if (MN == 1){ size_t ip=0; while (F.isZero (*(A+ip*incCol))) if (++ip == colDim) break; *Q=0; if (ip == colDim){ // current row is zero *P=0; if (colDim == 1){ //while (ip1){ // Normalisation of the row FFLAS::fscalin(F,colDim-1,invpiv,A+incCol,incCol); } else if ( (colDim==1) &&(Diag==FFLAS::FflasNonUnit) ){ if (++ip < rowDim){ FFLAS::fscalin(F,rowDim-ip,invpiv,A+ip*incRow,incRow); } } return 1; } else { // MN>1 size_t Nup = rowDim >> 1; size_t Ndown = rowDim - Nup; // FFLASFFPACK_check(Ndown < rowDim); // Recursive call on NW size_t R, R2; if (trans == FFLAS::FflasTrans){ R = LUdivine (F, Diag, trans, colDim, Nup, A, lda, P, Q, LuTag, cutoff); typename Field::Element_ptr Ar = A + Nup*incRow; // SW typename Field::Element_ptr Ac = A + R*incCol; // NE typename Field::Element_ptr An = Ar+ R*incCol; // SE if (!R){ if (LuTag == FFPACK::FfpackSingular ) return 0; } else { FFPACK::applyP (F, FFLAS::FflasLeft, FFLAS::FflasNoTrans, Ndown, 0,(int) R, Ar, lda, P); // Ar <- L1^-1 Ar FFLAS::ftrsm( F, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, Diag, R, Ndown, F.one, A, lda, Ar, lda); // An <- An - Ac*Ar if (colDim>R) fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, colDim-R, Ndown, R, F.mOne, Ac, lda, Ar, lda, F.one, An, lda); } // Recursive call on SE R2 = LUdivine (F, Diag, trans, colDim-R, Ndown, An, lda, P + R, Q + Nup, LuTag, cutoff); for (size_t i = R; i < R + R2; ++i) P[i] += R; if (R2) { // An <- An.P2 FFPACK::applyP (F, FFLAS::FflasLeft, FFLAS::FflasNoTrans, Nup,(int) R, (int)(R+R2), A, lda, P); } else { if (LuTag == FFPACK::FfpackSingular) return 0; } } else { // trans == FFLAS::FflasNoTrans R = LUdivine (F, Diag, trans, Nup, colDim, A, lda, P, Q, LuTag, cutoff); typename Field::Element_ptr Ar = A + Nup*incRow; // SW typename Field::Element_ptr Ac = A + R*incCol; // NE typename Field::Element_ptr An = Ar+ R*incCol; // SE if (!R){ if (LuTag == 
FFPACK::FfpackSingular ) return 0; } else { /* R>0 */ // Ar <- Ar.P FFPACK::applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, Ndown, 0,(int) R, Ar, lda, P); // Ar <- Ar.U1^-1 ftrsm( F, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, Diag, Ndown, R, F.one, A, lda, Ar, lda); // An <- An - Ar*Ac if (colDim>R) fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, Ndown, colDim-R, R, F.mOne, Ar, lda, Ac, lda, F.one, An, lda ); } // Recursive call on SE R2=LUdivine (F, Diag, trans, Ndown, N-R, An, lda,P+R, Q+Nup, LuTag, cutoff); for (size_t i = R; i < R + R2; ++i) P[i] += R; if (R2) // An <- An.P2 FFPACK::applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans, Nup,(int) R, (int)(R+R2), A, lda, P); else if (LuTag == FFPACK::FfpackSingular) return 0; } // Non zero row permutations for (size_t i = Nup; i < Nup + R2; i++) Q[i] += Nup; if (R < Nup){ // Permutation of the 0 rows if (Diag == FFLAS::FflasNonUnit){ for ( size_t i = Nup, j = R ; i < Nup + R2; ++i, ++j){ FFLAS::fassign( F, colDim - j, A + i*incRow + j*incCol, incCol, A + j * (lda + 1), incCol); for (typename Field::Element_ptr Ai = A + i*incRow + j*incCol; Ai != A + i*incRow + colDim*incCol; Ai+=incCol) F.assign (*Ai, F.zero); ///@todo std::swap ? size_t t = Q[j]; Q[j]=Q[i]; Q[i] = t; } } else { // Diag == FFLAS::FflasUnit for ( size_t i = Nup, j = R+1 ; i < Nup + R2; ++i, ++j){ FFLAS::fassign( F, colDim - j, A + i*incRow + j*incCol, incCol, A + (j-1)*incRow + j*incCol, incCol); for (typename Field::Element_ptr Ai = A + i*incRow + j*incCol; Ai != A + i*incRow + colDim*incCol; Ai+=incCol) F.assign (*Ai, F.zero); size_t t = Q[j-1]; Q[j-1]=Q[i]; Q[i] = t; } } } return R + R2; } } } namespace Protected { //--------------------------------------------------------------------- // LUdivine_construct: (Specialisation of LUdivine) // LUP factorisation of the Krylov base matrix of A^t and v. 
// When all rows have been factorized in A, and rank is full, // then new krylov vectors are computed and then triangularized // P is the permutation matrix stored in the lapack style // nRowX is the number of Krylov vectors already computed, // nUsedRowX is the number of Krylov vectors already triangularized //--------------------------------------------------------------------- template size_t LUdivine_construct( const Field& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr X, const size_t ldx, typename Field::Element_ptr u, size_t* P, bool computeX , const FFPACK::FFPACK_MINPOLY_TAG MinTag //= FFPACK::FfpackDense , const size_t kg_mc// =0 , const size_t kg_mb// =0 , const size_t kg_j // =0 ) { size_t MN = std::min(M,N); if (MN == 1){ size_t ip=0; while (ip1 && computeX)// Only appends when A is 1 by 1 F.mul(*(X+ldx),*X, *A); return 1; } else{ // MN>1 size_t Nup = MN>>1; size_t Ndown = M - Nup; // Recursive call on NW size_t R = LUdivine_construct(F, Diag, Nup, N, A, lda, X, ldx, u, P, computeX, MinTag, kg_mc, kg_mb, kg_j ); if (R==Nup){ typename Field::Element_ptr Xr = X + Nup*ldx; // SW typename Field::Element_ptr Xc = X + Nup; // NE typename Field::Element_ptr Xn = Xr + Nup; // SE typename Field::Element_ptr Xi = Xr; if ( computeX ){ if (MinTag == FFPACK::FfpackDense) for (size_t i=0; i< Ndown; ++i, Xi+=ldx){ fgemv(F, FFLAS::FflasNoTrans, N, N, F.one, A, lda, u, 1, F.zero, Xi,1); FFLAS::fassign(F, N,Xi, 1, u,1); } else // Keller-Gehrig Fast algorithm's matrix for (size_t i=0; i< Ndown; ++i, Xi+=ldx){ FFPACK::Protected::fgemv_kgf( F, N, A, lda, u, 1, Xi, 1, kg_mc, kg_mb, kg_j ); FFLAS::fassign(F, N,Xi, 1, u,1); } } // Apply the permutation on SW FFPACK::applyP( F, FFLAS::FflasRight, FFLAS::FflasTrans, Ndown, 0,(int) R, Xr, ldx, P); // Triangular block inversion of NW and apply to SW // Xr <- Xr.U1^-1 ftrsm( F, FFLAS::FflasRight, FFLAS::FflasUpper, 
FFLAS::FflasNoTrans, Diag, Ndown, R, F.one, X, ldx, Xr, ldx); // Update of SE // Xn <- Xn - Xr*Xc fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, Ndown, N-Nup, Nup, F.mOne, Xr, ldx, Xc, ldx, F.one, Xn, ldx); // Recursive call on SE size_t R2 = LUdivine_construct(F, Diag, Ndown, N-Nup, A, lda, Xn, ldx, u, P + Nup, false, MinTag, kg_mc, kg_mb, kg_j); for ( size_t i=R;is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFPACK_ludivine_mp_INL #define __FFPACK_ludivine_mp_INL #include #include #ifdef BENCH_PERF_LQUP_MP #define BENCH_PERF_FGEMM_MP #endif #include "fflas-ffpack/field/rns-integer-mod.h" #include "fflas-ffpack/field/rns-integer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/ffpack/ffpack_ludivine.inl" namespace FFPACK { template <> inline size_t LUdivine (const Givaro::Modular& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, typename Givaro::Integer* A, const size_t lda, size_t*P, size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag, const size_t cutoff ) { #ifdef BENCH_PERF_LQUP_MP double t_init=0, t_lqup=0, t_mod=0, t_rec=0; FFLAS::Timer chrono; chrono.start(); #endif Givaro::Integer p; F.cardinality(p); size_t logp=p.bitsize(); size_t K = std::max(M,N); // compute bit size of feasible prime size_t _k=std::max(K+1,logp/20), lk=0; while ( _k ) {_k>>=1; ++lk;} size_t prime_bitsize= (53-lk)>>1; // construct rns basis Givaro::Integer maxC= (p-1)*(p-1)*(p-1)*uint64_t(K); uint64_t n_pr =uint64_t(ceil(double(maxC.bitsize())/double(prime_bitsize))); maxC=(p-1)*(p-1)*uint64_t(K)*(1< Zp(p, RNS); #ifdef BENCH_PERF_LQUP_MP chrono.stop(); t_init+=chrono.usertime(); chrono.clear();chrono.start(); #endif // compute A in RNS FFPACK::rns_double::Element_ptr Ap; Ap = FFLAS::fflas_new(Zp,M,N); FFLAS::finit_rns(Zp,M,N,(logp/16)+(logp%16?1:0),A,lda,Ap); #ifdef BENCH_PERF_LQUP_MP chrono.stop(); t_mod+=chrono.usertime(); chrono.clear();chrono.start(); #endif // call lqup in rns size_t R=FFPACK::LUdivine(Zp, Diag, trans, M, N, Ap, N, P, Q, LuTag, cutoff); //std::cout<<"LUDivine RNS done"<s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_minpoly.inl * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_ffpack_minpoly_INL #define __FFLASFFPACK_ffpack_minpoly_INL namespace FFPACK { template Polynomial& MinPoly( const Field& F, Polynomial& minP, const size_t N ,typename Field::ConstElement_ptr A, const size_t lda ,typename Field::Element_ptr X, const size_t ldx ,size_t* P ,const FFPACK_MINPOLY_TAG MinTag// = FfpackDense ,const size_t kg_mc// =0 ,const size_t kg_mb//=0 ,const size_t kg_j //=0 ) { // nRow is the number of row in the krylov base already computed size_t j, k ; //size_t nRow = 2; typename Polynomial::iterator it; typename Field::Element_ptr Xi, Ui; typename Field::RandIter g (F); bool KeepOn=true; typename Field::Element_ptr U = FFLAS::fflas_new (F, N, 1); // Picking a non zero vector do{ for (Ui=U, Xi = X; Ui(k); FFLAS::fassign( F, k, X+k*ldx, 1, U, 1); ftrsv( F, FFLAS::FflasLower, FFLAS::FflasTrans, FFLAS::FflasNonUnit, k, X, ldx, U, 1); it = minP.begin(); for (j=0; js,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas-ffpack/ffpack/ffpack_minpoly_construct.inl * Copyright (C) 2003 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_ffpack_minpoly_construct_INL #define __FFLASFFPACK_ffpack_minpoly_construct_INL #error "not included anywhere" namespace FFPACK { //#define LB_DEBUG //--------------------------------------------------------------------- // MinPoly: Compute the minimal polynomial of (A,v) using an LUP // factorization of the Krylov Base (v, Av, .., A^kv) // U must be (n+1)*n //--------------------------------------------------------------------- template Polynomial& MinPoly( const Field& F, Polynomial& minP, const size_t N, typename Field::ConstElement_ptr A, const size_t lda, typename Field::Element_ptr U, size_t ldu, typename Field::Element_ptr X, size_t ldx, size_t* P) { // nRow is the number of row in the krylov base already computed size_t j, k, nRow = 2; typename Field::Element_ptr B = FFLAS::fflas_new (F, N, N); typename Polynomial::iterator it; typename Field::Element_ptr Xi, *Ui; typename Field::RandIter g (F); bool KeepOn=true; // Creating the Krylov Base copy matrix X where to factorize //typename Field::Element_ptr X = FFLAS::fflas_new((N+1)*N); #ifdef LB_DEBUG for (j=0;j<(N+1)*N;j++) X[j] = zero; #endif // Creating the copy of A, where to compute A^2^i // Try memcopy here for (size_t i=0; 
is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_ffpack_permutation_INL #define __FFLASFFPACK_ffpack_permutation_INL #include #include "fflas-ffpack/fflas/fflas_fassign.h" #define FFLASFFPACK_PERM_BKSIZE 32 namespace FFPACK { /** MonotonicApplyP * Apply a permutation defined by the first R entries of the vector P (the pivots). * The non pivot elements, are located in montonically increasing order. 
*/ template void MonotonicApplyP (const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t ibeg, const size_t iend, typename Field::Element_ptr A, const size_t lda, const size_t * P, const size_t R) { const size_t B = FFLASFFPACK_PERM_BKSIZE; size_t lenP = iend-ibeg; size_t * MathP = new size_t[lenP]; for (size_t i=0; i ispiv(lenP,false); size_t pivrowstomove = 0; size_t nonpivrowstomove = 0; size_t maxpiv = R-1; for (size_t i=0; i void MonotonicCompress (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * MathP, const size_t R, const size_t maxpiv, const size_t rowstomove, const std::vector &ispiv) { // Storing pivot rows in temp typename Field::Element_ptr temp= FFLAS::fflas_new (F, rowstomove, M); size_t ldtemp=M; for (size_t i=0,j=0; i= (int)R){ if ((src >= 0) && ispiv[src]){ // src points to a pivot row: skip it src--; continue; } FFLAS::fassign(F, M, A+src*lda, incA, A+dest*lda, incA); src--; dest--; } // Moving the pivots to their position in the first R rows for (size_t i=0, j=0; i void MonotonicCompressMorePivots (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * MathP, const size_t R, const size_t rowstomove, const size_t lenP) { std::vector done(lenP,false); typename Field::Element_ptr temp= FFLAS::fflas_new (F, rowstomove, M); size_t ldtemp=M; // Move every non pivot row to temp #ifdef VERBOSE std::cerr<<"R = "< temp["< A[j] #ifdef VERBOSE std::cerr<<"Moving pivots 1 A["< A["< tmprow"< A[j] #ifdef VERBOSE std::cerr<<"Moving pivots 2 A["< A["< A["< A["< void MonotonicCompressCycles (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * MathP, const size_t lenP) { std::vector done(lenP,false); // Move every non pivot row to temp 
#ifdef VERBOSE write_perm(std::cerr<<"MathP = ",MathP,lenP); #endif // Moving the remaining cycles using one vector temp typename Field::Element_ptr tmprow = FFLAS::fflas_new(F,1,FFLASFFPACK_PERM_BKSIZE); for (size_t i=0; i tmprow"< A[j] #ifdef VERBOSE std::cerr<<"Moving pivots A["< A["< A["< void MonotonicExpand (const Field& F, const FFLAS::FFLAS_SIDE Side, const size_t M, typename Field::Element_ptr A, const size_t lda, const size_t incA, const size_t * MathP, const size_t R, const size_t maxpiv, const size_t rowstomove, const std::vector &ispiv) { // Storing pivot rows in temp typename Field::Element_ptr temp= FFLAS::fflas_new (F, rowstomove, M); size_t ldtemp=M; for (size_t i=0,j=0; i void applyP_block (const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t ibeg, const size_t iend, typename Field::Element_ptr A, const size_t lda, const size_t * P) { if ( Side == FFLAS::FflasRight ) { if ( Trans == FFLAS::FflasTrans ){ for ( size_t i=(size_t)ibeg; i<(size_t) iend; ++i) if ( P[i]!= i ) FFLAS::fswap( F, M, A + P[i]*1, lda, A + i*1, lda); } else { // Trans == FFLAS::FflasNoTrans for (size_t i=iend; i-->ibeg; ) if ( P[i]!=(size_t)i ) FFLAS::fswap( F, M, A + P[i]*1, lda, A + i*1, lda); } } else { // Side == FFLAS::FflasLeft if ( Trans == FFLAS::FflasNoTrans ) { for (size_t i=(size_t)ibeg; i<(size_t)iend; ++i) if ( P[i]!= (size_t) i ) FFLAS::fswap( F, M, A + P[i]*lda, 1, A + i*lda, 1); } else { // Trans == FFLAS::FflasTrans for (size_t i=iend; i-->ibeg; ) if ( P[i]!= (size_t) i ) FFLAS::fswap( F, M, A + P[i]*lda, 1, A + i*lda, 1); } } } template void applyP( const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t ibeg, const size_t iend, typename Field::Element_ptr A, const size_t lda, const size_t * P ) { const size_t bk = FFLASFFPACK_PERM_BKSIZE; const size_t NB = M/bk; const size_t last = M%bk; const size_t incA = (Side == FFLAS::FflasLeft)? 
1:lda; const size_t inc = bk*incA; for (size_t i = 0; i inline void doApplyS (const Field& F, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { FFLAS::fassign(F, M2-R1-R2, width, A + (R1+R2)*lda, lda, tmp, width); FFLAS::fassign(F, R3+R4, width, A + M2*lda, lda, A + (R1+R2)*lda, lda); FFLAS::fassign(F, M2-R1-R2, width, tmp, width, A + (R1+R2+R3+R4)*lda, lda); } template inline void MatrixApplyS (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { typename Field::Element_ptr tmp = FFLAS::fflas_new (F, M2-R1-R2, width); doApplyS (F, A, lda, tmp, width, M2, R1, R2, R3, R4); FFLAS::fflas_delete (tmp); } template inline void PermApplyS (T* A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { Givaro::ZRing D; T* tmp = FFLAS::fflas_new((M2-R1-R2)*width); doApplyS (D, A, lda, tmp, width, M2, R1, R2, R3, R4); FFLAS::fflas_delete( tmp); } template inline void doApplyT (const Field& F, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr tmp, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { for (size_t k = 0; k < width; ++k){ FFLAS::fassign(F, N2-R1, A+R1+k*lda, 1, tmp + k*(N2-R1), 1); FFLAS::fassign(F, R2, A+N2+k*lda, 1, A + R1 + k*lda, 1); FFLAS::fassign(F, R3, tmp + k*(N2-R1), 1, A+R1+R2+k*lda, 1); FFLAS::fassign(F, R4, A + N2 + R2 + k*lda, 1, A + R1+R2+R3 + k*lda, 1); FFLAS::fassign(F, N2-R1-R3, tmp + R3 + k*(N2-R1), 1, A+R1+R2+R3+R4+k*lda, 1); } } template inline void MatrixApplyT (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { typename 
Field::Element_ptr tmp = FFLAS::fflas_new (F, N2-R1, width); doApplyT (F, A, lda, tmp, width, N2, R1, R2, R3, R4); FFLAS::fflas_delete (tmp); } template inline void PermApplyT (T* A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { Givaro::ZRing D; T* tmp = FFLAS::fflas_new((N2-R1)*width); doApplyT (D, A, lda, tmp, width, N2, R1, R2, R3, R4); FFLAS::fflas_delete( tmp); } /** * Conversion of a permutation from LAPACK format to Math format */ inline void LAPACKPerm2MathPerm (size_t * MathP, const size_t * LapackP, const size_t N) { for (size_t i=0; i(N); size_t * Tinv = FFLAS::fflas_new(N); for (size_t i=0; i // Need a rewrite in order to support RNSModP field template inline void cyclic_shift_row_col(const Field & F, typename Field::Element_ptr A, size_t m, size_t n, size_t lda) { typedef typename Field::Element Element; typedef typename Field::Element_ptr Element_ptr; #ifdef MEMCOPY // std::cerr << "BEF m: " << m << ", n: " << n << std::endl; if (m > 1) { const size_t mun(m-1); if (n > 1) { // std::cerr << "m: " << m << ", n: " << n << std::endl; const size_t nun(n-1); const size_t blo(sizeof(Element)); // const size_t bmu(blo*mun); const size_t bnu(blo*nun); Element_ptr b = FFLAS::fflas_new(F,mun); for(size_t i=0; i0; --i) memcpy(A+1+i*lda, A+(i-1)*lda, bnu); memcpy(A, dc, bnu+blo); for(size_t i=0; i0; --i) A[i*lda]=A[(i-1)*lda]; *A=d; } } else { if ((m!=0) && (n > 1)) { const size_t nun(n-1); const size_t blo(sizeof(Element)); const size_t bnu(blo*nun); Element d = A[nun]; // std::cerr << "d: " << d << std::endl; Element_ptr tmp = FFLAS::fflas_new(F,nun); memcpy(tmp,A,bnu); memcpy(A+1,tmp,bnu); // std::copy(A,A+nun,A+1); *A=d; delete [] tmp; } } // std::cerr << "AFT m: " << m << ", n: " << n << std::endl; #else // std::cerr << "BEF m: " << m << ", n: " << n << std::endl; if (m > 1) { const size_t mun(m-1); if (n > 1) { const size_t nun(n-1); Element_ptr b = FFLAS::fflas_new (F,mun); 
Element_ptr Ainun = A+nun; for(size_t i=0; i0; --i, Ai-=lda) FFLAS::fassign(F, nun, Ai,1,Ai+1+lda,1); // std::copy(Ai, Ai+nun, Ai+1+lda); FFLAS::fassign(F, n, dc, 1, A, 1); //std::copy(dc, dc+n, A); Element_ptr Aipo = A+lda; for(size_t i=0; i 1)) { const size_t nun(n-1); Element d = A[nun]; FFLAS::fassign(F,nun,A,1,A+1,1); //std::copy(A,A+nun,A+1); *A=d; } } #endif } template inline void cyclic_shift_row(const Field& F, typename Field::Element_ptr A, size_t m, size_t n, size_t lda) { #ifdef MEMCOPY if (m > 1) { const size_t mun(m-1); typename Field::Element_ptr b = FFLAS::fflas_new (F,n,1); typename Field::Element_ptr Ai = A+mun*lda; //@BUG not safe with RNSModp field memcpy (b,Ai,n*sizeof(typename Field::Element)); for(typename Field::Element_ptr Ac = A+mun*lda; Ac!=A;Ac-=lda) memcpy (Ac, Ac-lda, n*sizeof(typename Field::Element)); memcpy ( A, b, n*sizeof(typename Field::Element)); FFLAS::fflas_delete (b); } #else if (m > 1) { const size_t mun(m-1); typename Field::Element_ptr b = FFLAS::fflas_new (F, n, 1); typename Field::Element_ptr Ai = A+mun*lda; for(size_t i=0; i inline void cyclic_shift_row(const RNSIntegerMod& F, typename T::Element_ptr A, size_t m, size_t n, size_t lda) { if (m > 1) { const size_t mun(m-1); typename T::Element_ptr b = FFLAS::fflas_new (F, n, 1); typename T::Element_ptr Ai = A+mun*lda; for(size_t i=0; i inline void cyclic_shift_col(const Field& F, typename Field::Element_ptr A, size_t m, size_t n, size_t lda) { if (n > 1) { const size_t nun(n-1); for(typename Field::Element_ptr Ai=A; Ai!= A+m*lda; Ai+=lda) { typename Field::Element tmp; F.init(tmp); F.assign(tmp, Ai[nun]); //@BUG: not safe with RNSModP field std::copy_backward(Ai, Ai+nun, Ai+n); *Ai=tmp; } } } template inline void cyclic_shift_col(const RNSIntegerMod& F, typename T::Element_ptr A, size_t m, size_t n, size_t lda) { if (n > 1) { const size_t nun(n-1); for(typename T::Element_ptr Ai=A; Ai!= A+m*lda; Ai+=lda) { typename T::Element tmp; F.init(tmp); F.assign(tmp, Ai[nun]); 
//std::copy_backward(Ai, Ai+nun, Ai+n); typename T::Element_ptr Xi = Ai+nun; typename T::ConstElement_ptr Yi=Ai+nun-1; for (size_t i =0;i void papplyP( const Field& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t m, const size_t ibeg, const size_t iend, typename Field::Element_ptr A, const size_t lda, const size_t * P ) { int numthreads = MAX_THREADS; size_t BLOCKSIZE=std::max(2*m/numthreads,(size_t)1); // Assume that there is at least 2 ApplyP taking place in parallel size_t NBlocks = m/BLOCKSIZE; size_t LastBlockSize = m % BLOCKSIZE; if (LastBlockSize) NBlocks++; else LastBlockSize=BLOCKSIZE; SYNCH_GROUP( for (size_t t = 0; t < NBlocks; ++t) { size_t BlockDim = BLOCKSIZE; if (t == NBlocks-1) BlockDim = LastBlockSize; TASK(MODE(CONSTREFERENCE(F, A,P) READ(A[BLOCKSIZE*t*((Side == FFLAS::FflasRight)?lda:1)])), applyP(F, Side, Trans, BlockDim, ibeg, iend, A+BLOCKSIZE*t*((Side == FFLAS::FflasRight)?lda:1), lda, P);); } ); //#pragma omp taskwait } template void pMatrixApplyT (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { int numthreads = MAX_THREADS;//omp_get_max_threads(); size_t BLOCKSIZE=std::max(width/numthreads,(size_t)1); size_t NBlocks = width/BLOCKSIZE; size_t LastBlockSize = width % BLOCKSIZE; if (LastBlockSize) NBlocks++; else LastBlockSize=BLOCKSIZE; SYNCH_GROUP( for (size_t t = 0; t < NBlocks; ++t) { size_t BlockDim = BLOCKSIZE; if (t == NBlocks-1) BlockDim = LastBlockSize; TASK(MODE(CONSTREFERENCE(F, A) READWRITE(A[BLOCKSIZE*t*lda])), {MatrixApplyT(F,A+BLOCKSIZE*t*lda, lda, BlockDim, N2, R1, R2, R3, R4);} ); } ); } template void pMatrixApplyS (const Field& F, typename Field::Element_ptr A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { int numthreads = MAX_THREADS;//omp_get_max_threads(); size_t 
BLOCKSIZE=std::max(width/numthreads,(size_t)1); size_t NBlocks = width/BLOCKSIZE; size_t LastBlockSize = width % BLOCKSIZE; if (LastBlockSize) NBlocks++; else LastBlockSize=BLOCKSIZE; SYNCH_GROUP( for (size_t t = 0; t < NBlocks; ++t) { size_t BlockDim = BLOCKSIZE; if (t == NBlocks-1) BlockDim = LastBlockSize; //#pragma omp task shared (F, A) firstprivate(BlockDim) TASK(MODE(CONSTREFERENCE(F,A) READ(A[BLOCKSIZE*t])), MatrixApplyS (F, A+BLOCKSIZE*t, lda, BlockDim, M2, R1, R2, R3, R4);); } ); //#pragma omp taskwait } //#endif // __FFLASFFPACK_USE_OPENMP } // FFPACK #endif // __FFLASFFPACK_ffpack_permutation_INL fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack_pluq.inl000066400000000000000000000525171274716147400226430ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_pluq.inl * Copyright (C) 2012 Clement Pernet * * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ffpack_pluq_INL #define __FFLASFFPACK_ffpack_pluq_INL //#define BCONLY //#define CROUT //#define BCV2 //#define BCV3 //#define LEFTLOOKING #ifndef BASECASE_K #define BASECASE_K 256 #endif namespace FFPACK { template inline size_t PLUQ_basecaseV3 (const Field& Fi, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element * A, const size_t lda, size_t*P, size_t *Q) { typedef typename Field::Element Element; size_t row = 0; size_t col = 0; size_t rank = 0; size_t * MathP = new size_t[M]; size_t * MathQ = new size_t[N]; for (size_t i=0; i rank) cyclic_shift_mathPerm(MathP+rank, piv2-rank+1); //if(piv3 > rank) cyclic_shift_mathPerm(MathQ+rank, piv3-rank+1); Element invpiv; Fi.inv (invpiv, A3[piv3]); if (Diag==FFLAS::FflasUnit){ #ifdef LEFTLOOKING // Normalizing the pivot row for (size_t i=piv3+1; i rank || piv2 > rank) { cyclic_shift_row_col(A+rank*(1+lda), piv2-rank+1, piv3-rank+1, lda); cyclic_shift_row(Fi,A+rank*lda, piv2-rank+1, rank, lda); cyclic_shift_row(Fi,A+rank*lda+piv3+1, piv2-rank+1, N-1-piv3, lda); cyclic_shift_col(Fi,A+rank, rank, piv3-rank+1, lda); cyclic_shift_col(Fi,A+rank+(piv2+1)*lda, M-1-piv2, piv3-rank+1, lda); } /* if(piv2 > rank) cyclic_shift_row(A+rank*lda, piv2-rank+1, N, lda); if(piv3 > rank) cyclic_shift_col(A+rank, M, piv3-rank+1, lda); */ #ifdef LEFTLOOKING // Need to update the cols already updated for (size_t i=piv2+1; i inline size_t PLUQ_basecaseV2 (const Field& Fi, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element * A, const size_t lda, size_t*P, size_t *Q) { typedef typename Field::Element Element; size_t row = 0; size_t col = 0; size_t rank = 0; std::vector pivotRows(M,false); std::vector pivotCols(N,false); size_t * MathP = new size_t[M]; size_t * MathQ = new size_t[N]; // size_t npp=0; // size_t npq=0; #ifdef LEFTLOOKING Element* Ltemp = new Element[M*N]; for (size_t i=0; i inline size_t PLUQ_basecaseCrout (const Field& Fi, const 
FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q) { size_t row = 0; size_t rank = 0; typename Field::Element_ptr CurrRow=A; size_t * MathP = FFLAS::fflas_new(M); size_t * MathQ = FFLAS::fflas_new(N); for (size_t i=0; i rank){ // Column rotation to move pivot on the diagonal // on U cyclic_shift_col(Fi, A+rank, rank, i-rank+1, lda); cyclic_shift_mathPerm(MathQ+rank, (size_t)(i-rank+1)); // on A cyclic_shift_col(Fi, CurrRow+lda+rank, M-row-1, i-rank+1, lda); Fi.assign(*(A+rank*(lda+1)), *(CurrRow+i)); FFLAS::fzero (Fi, i-rank, A+rank*(lda+1)+1, 1); } if (row > rank){ // Row rotation for L // Optimization: delay this to the end cyclic_shift_row(Fi, A+rank*lda, row-rank+1, rank, lda); cyclic_shift_mathPerm(MathP+rank, (size_t) (row-rank+1) ); // Row rotation for U (not moving the 0 block) FFLAS::fassign (Fi, N-i-1, CurrRow+i+1, 1, A+rank*lda+i+1, 1); Fi.assign(*(A+rank*(lda+1)), *(CurrRow+i)); FFLAS::fzero (Fi, row-rank, A+rank*(lda+1)+lda, lda); Fi.assign(*(CurrRow+i),Fi.zero); // only needed once here } rank++; } CurrRow+=lda; row++; } // size_t nonpiv = rank; // for (size_t i = 0; i inline size_t _PLUQ (const Field& Fi, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q) { #ifdef BCONLY #ifdef CROUT return PLUQ_basecaseCrout(Fi,Diag,M,N,A,lda,P,Q); #elif defined BCV2 return PLUQ_basecaseV2(Fi,Diag,M,N,A,lda,P,Q); #elif defined BCV3 return PLUQ_basecaseV3(Fi,Diag,M,N,A,lda,P,Q); #else return PLUQ_basecase(Fi,Diag,M,N,A,lda,P,Q); #endif #endif for (size_t i=0; i> 1; size_t N2 = N >> 1; size_t * P1 = FFLAS::fflas_new(M2); size_t * Q1 = FFLAS::fflas_new(N2); size_t R1,R2,R3,R4; // A1 = P1 [ L1 ] [ U1 V1 ] Q1 // [ M1 ] R1 = _PLUQ (Fi, Diag, M2, N2, A, lda, P1, Q1); typename Field::Element_ptr A2 = A + N2; typename Field::Element_ptr A3 = A + M2*lda; typename Field::Element_ptr A4 = A3 + N2; typename Field::Element_ptr 
F = A2 + R1*lda; typename Field::Element_ptr G = A3 + R1; // [ B1 ] <- P1^T A2 // [ B2 ] #ifdef MONOTONIC_APPLYP MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M2, A2, lda, P1, R1); MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N2, A3, lda, Q1, R1); #else applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M2, A2, lda, P1); // [ C1 C2 ] <- A3 Q1^T applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N2, A3, lda, Q1); #endif // D <- L1^-1 B1 ftrsm (Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R1, N-N2, Fi.one, A, lda, A2, lda); // E <- C1 U1^-1 ftrsm (Fi, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, Diag, M-M2, R1, Fi.one, A, lda, A3, lda); // F <- B2 - M1 D fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M2-R1, N-N2, R1, Fi.mOne, A + R1*lda, lda, A2, lda, Fi.one, A2+R1*lda, lda); // G <- C2 - E V1 fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M-M2, N2-R1, R1, Fi.mOne, A3, lda, A+R1, lda, Fi.one, A3+R1, lda); // H <- A4 - ED fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M-M2, N-N2, R1, Fi.mOne, A3, lda, A2, lda, Fi.one, A4, lda); // F = P2 [ L2 ] [ U2 V2 ] Q2 // [ M2 ] size_t * P2 = FFLAS::fflas_new(M2-R1); size_t * Q2 = FFLAS::fflas_new(N-N2); R2 = _PLUQ (Fi, Diag, M2-R1, N-N2, F, lda, P2, Q2); // G = P3 [ L3 ] [ U3 V3 ] Q3 // [ M3 ] size_t * P3 = FFLAS::fflas_new(M-M2); size_t * Q3 = FFLAS::fflas_new(N2-R1); R3 = _PLUQ (Fi, Diag, M-M2, N2-R1, G, lda, P3, Q3); // [ H1 H2 ] <- P3^T H Q2^T // [ H3 H4 ] #ifdef MONOTONIC_APPLYP MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N-N2, A4, lda, Q2, R2); MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M-M2, A4, lda, P3, R3); #else applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, size_t(0), N-N2, A4, lda, Q2); applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, size_t(0), M-M2, A4, lda, P3); #endif // [ E1 ] <- 
P3^T E // [ E2 ] #ifdef MONOTONIC_APPLYP MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M-M2, A3, lda, P3, R3); #else applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M-M2, A3, lda, P3); #endif // [ M11 ] <- P2^T M1 // [ M12 ] #ifdef MONOTONIC_APPLYP MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M2-R1, A+R1*lda, lda, P2, R2); // [ D1 D2 ] <- D Q2^T MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N-N2, A2, lda, Q2, R2); // [ V1 V2 ] <- V1 Q3^T MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N2-R1, A+R1, lda, Q3, R3); #else applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, size_t(0), M2-R1, A+R1*lda, lda, P2); // [ D1 D2 ] <- D Q2^T applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N-N2, A2, lda, Q2); // [ V1 V2 ] <- V1 Q3^T applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, size_t(0), N2-R1, A+R1, lda, Q3); #endif // I <- H U2^-1 // K <- H3 U2^-1 ftrsm (Fi, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, Diag, M-M2, R2, Fi.one, F, lda, A4, lda); // J <- L3^-1 I (in a temp) typename Field::Element_ptr temp = FFLAS::fflas_new (Fi, R3, R2); FFLAS::fassign (Fi, R3, R2, A4 , lda, temp , R2); ftrsm (Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R3, R2, Fi.one, G, lda, temp, R2); // N <- L3^-1 H2 ftrsm (Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R3, N-N2-R2, Fi.one, G, lda, A4+R2, lda); // O <- N - J V2 fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, R3, N-N2-R2, R2, Fi.mOne, temp, R2, F+R2, lda, Fi.one, A4+R2, lda); FFLAS::fflas_delete (temp); // R <- H4 - K V2 - M3 O typename Field::Element_ptr R = A4 + R2 + R3*lda; fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M-M2-R3, N-N2-R2, R2, Fi.mOne, A4+R3*lda, lda, F+R2, lda, Fi.one, R, lda); fgemm (Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M-M2-R3, N-N2-R2, R3, Fi.mOne, G+R3*lda, lda, A4+R2, lda, 
Fi.one, R, lda); // H4 = P4 [ L4 ] [ U4 V4 ] Q4 // [ M4 ] size_t * P4 = FFLAS::fflas_new(M-M2-R3); size_t * Q4 = FFLAS::fflas_new(N-N2-R2); R4 = _PLUQ (Fi, Diag, M-M2-R3, N-N2-R2, R, lda, P4, Q4); // [ E21 M31 0 K1 ] <- P4^T [ E2 M3 0 K ] // [ E22 M32 0 K2 ] #ifdef MONOTONIC_APPLYP MonotonicApplyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N2+R2, size_t(0), M-M2-R3, A3+R3*lda, lda, P4, R4); // [ D21 D22 ] [ D2 ] // [ V21 V22 ] <- [ V2 ] Q4^T // [ 0 0 ] [ 0 ] // [ O1 O2 ] [ O ] MonotonicApplyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M2+R3, size_t(0), N-N2-R2, A2+R2, lda, Q4, R4); #else applyP (Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N2+R2, size_t(0), M-M2-R3, A3+R3*lda, lda, P4); // [ D21 D22 ] [ D2 ] // [ V21 V22 ] <- [ V2 ] Q4^T // [ 0 0 ] [ 0 ] // [ O1 O2 ] [ O ] applyP (Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M2+R3, size_t(0), N-N2-R2, A2+R2, lda, Q4); #endif // P <- Diag (P1 [ I_R1 ] , P3 [ I_R3 ]) // [ P2 ] [ P4 ] size_t* MathP = FFLAS::fflas_new(M); composePermutationsP (MathP, P1, P2, R1, M2); composePermutationsP (MathP+M2, P3, P4, R3, M-M2); FFLAS::fflas_delete( P1); FFLAS::fflas_delete( P2); FFLAS::fflas_delete( P3); FFLAS::fflas_delete( P4); for (size_t i=M2; i(N); composePermutationsQ (MathQ, Q1, Q3, R1, N2); composePermutationsQ (MathQ+N2, Q2, Q4, R2, N-N2); FFLAS::fflas_delete( Q1); FFLAS::fflas_delete( Q2); FFLAS::fflas_delete( Q3); FFLAS::fflas_delete( Q4); for (size_t i=N2; i inline size_t PLUQ (const Field& Fi, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*P, size_t *Q) { Checker_PLUQ checker (Fi,M,N,A,lda); size_t R = FFPACK::_PLUQ(Fi,Diag,M,N,A,lda,P,Q); checker.check(A,lda,R,P,Q); return R; } } // namespace FFPACK #endif // __FFLASFFPACK_ffpack_pluq_INL fflas-ffpack-2.2.2/fflas-ffpack/ffpack/ffpack_pluq_mp.inl000066400000000000000000000074331274716147400233340ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ 
// vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFPACK_pluq_mp_INL #define __FFPACK_pluq_mp_INL #ifdef BENCH_PERF_LQUP_MP #define BENCH_PERF_FGEMM_MP #endif #include "fflas-ffpack/field/rns-integer-mod.h" #include "fflas-ffpack/field/rns-integer.h" #include "fflas-ffpack/fflas-ffpack.h" #include "givaro/givinteger.h" #include "givaro/modular-integer.h" namespace FFPACK { template <> inline size_t PLUQ (const Givaro::Modular& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Givaro::Integer* A, const size_t lda, size_t*P, size_t *Q) { #ifdef BENCH_PERF_LQUP_MP double t_init=0, t_lqup=0, t_mod=0, t_rec=0; FFLAS::Timer chrono; chrono.start(); #endif Givaro::Integer p; F.cardinality(p); size_t logp=p.bitsize(); size_t K = std::max(M,N); // compute bit size of feasible prime size_t _k=std::max(K,logp/20), lk=0; while ( _k ) {_k>>=1; ++lk;} size_t prime_bitsize= (53-lk)>>1; // construct rns basis Givaro::Integer maxC= (p-1)*(p-1)*(p-1)*uint64_t(K); uint64_t n_pr =uint64_t(ceil(double(maxC.bitsize())/double(prime_bitsize))); 
maxC=(p-1)*(p-1)*uint64_t(K)*(1< Zp(p, RNS); #ifdef BENCH_PERF_LQUP_MP chrono.stop(); t_init+=chrono.usertime(); chrono.clear();chrono.start(); #endif // compute A in RNS FFPACK::rns_double::Element_ptr Ap; Ap = FFLAS::fflas_new(Zp,M,N); FFLAS::finit_rns(Zp,M,N,(logp/16)+(logp%16?1:0),A,lda,Ap); #ifdef BENCH_PERF_LQUP_MP chrono.stop(); t_mod+=chrono.usertime(); chrono.clear();chrono.start(); #endif // call lqup in rns size_t R=FFPACK::PLUQ(Zp, Diag, M, N, Ap, N, P, Q); #ifdef BENCH_PERF_LQUP_MP chrono.stop(); t_lqup+=chrono.usertime(); chrono.clear();chrono.start(); #endif //Zp.write(std::cout,*Ap); // reconstruct the result FFLAS::fconvert_rns(Zp,M,N,F.zero,A,lda,Ap); #ifdef BENCH_PERF_LQUP_MP chrono.stop(); t_rec+=chrono.usertime(); chrono.clear();chrono.start(); #endif // reduce it modulo p FFLAS::freduce (F,M,N,A,lda); //F.write(std::cout,*A); #ifdef BENCH_PERF_LQUP_MP chrono.stop(); //t_rec+=chrono.usertime(); cout<<"PLUQ RNS PERF:"<s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack/ffpack_ppluq.inl * Copyright (C) 2014 Ziad Sultan * * Written by Ziad.Sultan@imag.fr * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, WRITE to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ffpack_ppluq_INL #define __FFLASFFPACK_ffpack_ppluq_INL //#ifdef __FFLASFFPACK_USE_OPENMP #define __FFLAS__TRSM_READONLY #define PBASECASE_K 256 namespace FFPACK { template void threads_fgemm(const size_t m, const size_t n, const size_t r, int nbthreads, size_t * W1, size_t * W2, size_t * W3, size_t gamma) { size_t H1, H2, H3; size_t M2 = m>>1; size_t N2 = n>>1; H1 = ((m-N2)*r*(N2-r))<<1; H2 = ((M2-r)*r*(n-N2))<<1; H3 = ((m-M2)*r*(n-N2))<<1; // if we take into account 2 concurrent pluq calls.... size_t h; size_t z1= h*((m-M2)*(N2-r)*(N2-r)-(N2-r)*(N2-r)*(N2-r)/3); size_t z2= h*((n-N2)*(M2-r)*(M2-r)-(M2-r)*(M2-r)*(M2-r)/3); H1+= z1; H2+= z2; // compute number of threads for each fgemm call *W1=std::max(H1*nbthreads/(H1+H2+H3),(size_t)1); *W2=std::max(H2*nbthreads/(H1+H2+H3),(size_t)1); *W3=std::max(nbthreads-*W1-*W2,(size_t)1); // add gamma factor to change number of threads for pluq calls W1-= gamma*z1/(z1+z2); W2-= gamma*(1-z1/(z1+z2)); W3+= gamma; } template void threads_ftrsm(const size_t m, const size_t n, int nbthreads, size_t * t1, size_t * t2) { *t1 = nbthreads*m/(m+n); *t2 = nbthreads-(int)*t1; } // TODO: replace pPLUQ and "int nt", by PLUQ and a Parallel Helper ... template inline size_t pPLUQ(const Field& Fi, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* P, size_t* Q, int nt) { for (size_t i=0; i PBASECASE_K) return PLUQ_basecaseCrout (Fi, Diag, M, N, A, lda, P, Q); #endif FFLAS::FFLAS_DIAG OppDiag = (Diag == FFLAS::FflasUnit)? 
FFLAS::FflasNonUnit : FFLAS::FflasUnit; size_t M2 = M >> 1; size_t N2 = N >> 1; size_t * P1 = FFLAS::fflas_new (M2); size_t * Q1 = FFLAS::fflas_new (N2); size_t* MathP = 0; size_t* MathQ = 0; size_t* P2,*P3,*Q2,*Q3,*P4,*Q4; size_t R1,R2,R3,R4; // A1 = P1 [ L1 ] [ U1 V1 ] Q1 // [ M1 ] R1 = pPLUQ (Fi, Diag, M2, N2, A, lda, P1, Q1,nt); typename Field::Element * A2 = A + N2; typename Field::Element * A3 = A + M2*lda; typename Field::Element * A4 = A3 + N2; typename Field::Element * F = A2 + R1*lda; typename Field::Element * G = A3 + R1; // const FFLAS::CuttingStrategy meth = FFLAS::RECURSIVE; // const FFLAS::StrategyParameter strat = FFLAS::TWO_D_ADAPT; typename FFLAS::ParSeqHelper::Parallel pWH (std::max(nt,1)); typename FFLAS::ParSeqHelper::Parallel PH (std::max(nt,1)); SYNCH_GROUP( // [ B1 ] <- P1^T A2 // [ B2 ] TASK(MODE(READ(P1) CONSTREFERENCE(Fi, P1, A2) READWRITE(A2[0])), { papplyP( Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, 0, M2, A2, lda, P1); } ); // [ C1 C2 ] <- A3 Q1^T TASK(MODE(READ(Q1) CONSTREFERENCE(Fi, Q1, A3) READWRITE(A3[0])), papplyP( Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, 0, N2, A3, lda, Q1);); CHECK_DEPENDENCIES; // D <- L1^-1 B1 TASK(MODE(READ(A[0], R1, PH) CONSTREFERENCE(Fi, PH, A2) READWRITE(A2[0])), ftrsm( Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R1, N-N2, Fi.one, A, lda, A2, lda, PH)); // E <- C1 U1^-1 TASK(MODE(READ(R1, A[0], PH) CONSTREFERENCE(A3, Fi, M2, R1, PH) READWRITE(A3[0])), ftrsm(Fi, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, Diag, M-M2, R1, Fi.one, A, lda, A3, lda, PH)); CHECK_DEPENDENCIES; // F <- B2 - M1 D TASK(MODE(READ(A2[0], A[R1*lda], pWH) READWRITE(F[0]) CONSTREFERENCE(A, A2, F, pWH, Fi)), fgemm( Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M2-R1, N-N2, R1, Fi.mOne, A + R1*lda, lda, A2, lda, Fi.one, F, lda, pWH)); // G <- C2 - E V1 TASK(MODE(READ(R1, A[R1], A3[0], pWH) READWRITE(G[0]) CONSTREFERENCE(Fi, A, A3, G, pWH)), fgemm( Fi, FFLAS::FflasNoTrans, 
FFLAS::FflasNoTrans, M-M2, N2-R1, R1, Fi.mOne, A3, lda, A+R1, lda, Fi.one, G, lda, pWH)); CHECK_DEPENDENCIES; P2 = FFLAS::fflas_new(M2-R1); Q2 = FFLAS::fflas_new(N-N2); //typename Field::Element * A4R2 = 0; // F = P2 [ L2 ] [ U2 V2 ] Q2 // [ M2 ] TASK(MODE(CONSTREFERENCE(Fi, P2, Q2, F,/* A4R2,*/ R2) WRITE(R2/*, A4R2[0]*/) READWRITE(F[0], P2, Q2) ), R2 = pPLUQ( Fi, Diag, M2-R1, N-N2, F, lda, P2, Q2,nt/2) //A4R2 = A4+R2; ); //R2 = PLUQ (Fi, Diag, M2-R1, N-N2, F, lda, P2, Q2); P3 = FFLAS::fflas_new(M-M2); Q3 = FFLAS::fflas_new(N2-R1); // G = P3 [ L3 ] [ U3 V3 ] Q3 // [ M3 ] TASK(MODE(CONSTREFERENCE(Fi, G, Q3, P3, R3) WRITE(R3, P3, Q3) READWRITE(G[0])), R3 = pPLUQ( Fi, Diag, M-M2, N2-R1, G, lda, P3, Q3,nt/2)); // H <- A4 - ED TASK(MODE(CONSTREFERENCE(Fi, A3, A2, A4, pWH) READ(M2, N2, R1, A3[0], A2[0]) READWRITE(A4[0])), fgemm( Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M-M2, N-N2, R1, Fi.mOne, A3, lda, A2, lda, Fi.one, A4, lda, pWH)); CHECK_DEPENDENCIES; // [ H1 H2 ] <- P3^T H Q2^T // [ H3 H4 ] TASK(MODE(READ(P3, Q2) CONSTREFERENCE(Fi, A4, Q2, P3) READWRITE(A4[0])), papplyP( Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M-M2, 0, N-N2, A4, lda, Q2); papplyP( Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N-N2, 0, M-M2, A4, lda, P3);); CHECK_DEPENDENCIES; // [ E1 ] <- P3^T E // [ E2 ] TASK(MODE(READ(P3) CONSTREFERENCE(Fi, P3, A3) READWRITE(A3[0])), papplyP( Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, 0, M-M2, A3, lda, P3)); //applyP( Fi, FflasLeft, FflasNoTrans, R1, 0, M-M2, A3, lda, P3); // [ M11 ] <- P2^T M1 // [ M12 ] TASK(MODE(READ(P2) CONSTREFERENCE(P2, A, Fi) READWRITE(A[R1*lda])), papplyP(Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, R1, 0, M2-R1, A+R1*lda, lda, P2)); //applyP(Fi, FflasLeft, FflasNoTrans, R1, 0, M2-R1, A+R1*lda, lda, P2); // [ D1 D2 ] <- D Q2^T TASK(MODE(READ(Q2) CONSTREFERENCE(Fi, Q2, A2) READWRITE(A2[0])), papplyP( Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, 0, N-N2, A2, lda, Q2)); //papplyP( Fi, FflasRight, FflasTrans, R1, 0, N-N2, A2, lda, Q2); 
// [ V1 V2 ] <- V1 Q3^T TASK(MODE(READ(Q3) CONSTREFERENCE(Fi, Q3, A) READWRITE(A[R1])), papplyP( Fi, FFLAS::FflasRight, FFLAS::FflasTrans, R1, 0, N2-R1, A+R1, lda, Q3)); //applyP( Fi, FflasRight, FflasTrans, R1, 0, N2-R1, A+R1, lda, Q3); // CHECK_DEPENDENCIES; // I <- H1 U2^-1 // K <- H3 U2^-1 TASK(MODE(READ(R2, F[0], P2) CONSTREFERENCE(Fi, A4, F, PH, R2) READWRITE(A4[0])), ftrsm( Fi, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, Diag, M-M2, R2, Fi.one, F, lda, A4, lda, PH)); //pftrsm( Fi, FflasRight, FflasUpper, FflasNoTrans, Diag, M-M2, R2, Fi.one, F, lda, A4, lda, method, NUM); //ftrsm( Fi, FflasRight, FflasUpper, FflasNoTrans, Diag, M-M2, R2, Fi.one, F, lda, A4, lda); CHECK_DEPENDENCIES; typename Field::Element_ptr temp = 0; TASK(MODE(READ(A4[0], R3, P2) READWRITE(temp[0], R2) CONSTREFERENCE(Fi, A4, temp, R2, R3)), temp = FFLAS::fflas_new (Fi, R3, R2); FFLAS::fassign (Fi, R3, R2, A4, lda, temp, R2); ); CHECK_DEPENDENCIES; // J <- L3^-1 I (in a temp) TASK(MODE(READ(R2, R3, G[0]) CONSTREFERENCE(Fi, G, temp, R2, R3, PH) READWRITE(temp[0])), ftrsm( Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R3, R2, Fi.one, G, lda, temp, R2, PH);); // N <- L3^-1 H2 TASK(MODE(READ(R3, R2, G[0]) CONSTREFERENCE(Fi, G, A4, R3, R2, PH) READWRITE(A4[R2])), ftrsm(Fi, FFLAS::FflasLeft, FFLAS::FflasLower, FFLAS::FflasNoTrans, OppDiag, R3, N-N2-R2, Fi.one, G, lda, A4+R2, lda, PH)); CHECK_DEPENDENCIES; // O <- N - J V2 TASK(MODE(READ(R2, F[R2]) CONSTREFERENCE(Fi, R2, A4, R3, temp, pWH) READWRITE(A4[R2], temp[0])), fgemm( Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, R3, N-N2-R2, R2, Fi.mOne, temp, R2, F+R2, lda, Fi.one, A4+R2, lda, pWH); FFLAS::fflas_delete (temp); // delete[] temp; temp=0; ); typename Field::Element_ptr R = 0; // R <- H4 - K V2 TASK(MODE(READ(R2, R3, M2, N2, A4[R3*lda], F[R2]) CONSTREFERENCE(Fi, R, F, R2, R3, pWH) READWRITE(R[0])), R = A4 + R2 + R3*lda; fgemm( Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M-M2-R3, N-N2-R2, R2, Fi.mOne, 
A4+R3*lda, lda, F+R2, lda, Fi.one, R, lda, pWH) ); //fgemm( Fi, FflasNoTrans, FflasNoTrans, M-M2-R3, N-N2-R2, R2, Fi.mOne, A4+R3*lda, lda, F+R2, lda, Fi.one, R, lda); CHECK_DEPENDENCIES; // R <- R - M3 O TASK(MODE(READ(R3, R2, A4[R2], G[R3*lda]) CONSTREFERENCE(Fi, A4, R, R3, R2, G, pWH) READWRITE(R[0])), fgemm( Fi, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M-M2-R3, N-N2-R2, R3, Fi.mOne, G+R3*lda, lda, A4+R2, lda, Fi.one, R, lda, pWH)); //fgemm( Fi, FflasNoTrans, FflasNoTrans, M-M2-R3, N-N2-R2, R3, Fi.mOne, G+R3*lda, lda, A4+R2, lda, Fi.one, R, lda); CHECK_DEPENDENCIES; /* size_t * P4 = FFLAS::fflas_new(M-M2-R3); size_t * Q4 = FFLAS::fflas_new(N-N2-R2); */ // H4 = P4 [ L4 ] [ U4 V4 ] Q4 // [ M4 ] //TASK(READ(Fi), NOWRITE(R4), READWRITE(R, P4, Q4), PPLUQ, R4, Fi, Diag, M-M2-R3, N-N2-R2, R, lda, P4, Q4); TASK(MODE(CONSTREFERENCE(Fi, R4, R, P4, Q4, R2, R3, M2, N2) READWRITE(R[0]) WRITE(R4, P4[0], Q4[0])), P4 = FFLAS::fflas_new(M-M2-R3); Q4 = FFLAS::fflas_new(N-N2-R2); R4 = pPLUQ (Fi, Diag, M-M2-R3, N-N2-R2, R, lda, P4, Q4,nt); ); CHECK_DEPENDENCIES; // [ E21 M31 0 K1 ] <- P4^T [ E2 M3 0 K ] // [ E22 M32 0 K2 ] TASK(MODE(READ(P4[0], R2, R3, M2) CONSTREFERENCE(Fi, P4, A3, R2, R3) READWRITE(A3[R3*lda])), papplyP(Fi, FFLAS::FflasLeft, FFLAS::FflasNoTrans, N2+R2, 0, M-M2-R3, A3+R3*lda, lda, P4)); //applyP( Fi, FflasLeft, FflasNoTrans, N2+R2, 0, M-M2-R3, A3+R3*lda, lda, P4); // [ D21 D22 ] [ D2 ] // [ V21 V22 ] <- [ V2 ] Q4^T // [ 0 0 ] [ 0 ] // [ O1 O2 ] [ O ] TASK(MODE(READ(Q4[0], R2, N2, M2, R3) CONSTREFERENCE(Fi, Q4, A2, R2, R3) READWRITE(A2[R2])), papplyP( Fi, FFLAS::FflasRight, FFLAS::FflasTrans, M2+R3, 0, N-N2-R2, A2+R2, lda, Q4)); //applyP( Fi, FflasRight, FflasTrans, M2+R3, 0, N-N2-R2, A2+R2, lda, Q4); // P <- Diag (P1 [ I_R1 ] , P3 [ I_R3 ]) // [ P2 ] [ P4 ] WAIT; // TASK(MODE(CONSTREFERENCE(P1, P2, P3, P4, R1, R3, MathP, M2) READ(P1, P2, R1, R3, P3, P4, M2) READWRITE(MathP)), MathP = FFLAS::fflas_new(M); composePermutationsP (MathP, P1, P2, R1, M2); 
composePermutationsP (MathP+M2, P3, P4, R3, M-M2); for (size_t i=M2; i(N); TASK(MODE(CONSTREFERENCE(Q1, Q2, Q3, Q4, R1, R2) READ(Q1[0], Q2[0], Q3[0], Q4[0], R1, R2) READWRITE(MathQ[0])), composePermutationsQ (MathQ, Q1, Q3, R1, N2); composePermutationsQ (MathQ+N2, Q2, Q4, R2, N-N2); for (size_t i=N2; is,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack_rankprofiles.inl * Copyright (C) 2015 FFLAS-FFACK group * * Written by Clement Pernet * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_ffpack_rank_profiles_INL #define __FFLASFFPACK_ffpack_rank_profiles_INL namespace FFPACK{ template inline size_t RowRankProfile (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* &rkprofile, const FFPACK_LU_TAG LuTag){ size_t *P = FFLAS::fflas_new((LuTag==FfpackSlabRecursive)?N:M); size_t *Q = FFLAS::fflas_new((LuTag==FfpackSlabRecursive)?M:N); size_t R; if (LuTag == FfpackSlabRecursive){ R = LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, N, A, lda, P, Q); std::swap(P,Q); } else R = PLUQ (F, FFLAS::FflasNonUnit, M, N, A, lda, P, Q); rkprofile = FFLAS::fflas_new (R); RankProfileFromLU (P, M, R, rkprofile, LuTag); FFLAS::fflas_delete (Q); FFLAS::fflas_delete (P); return R; } template inline size_t ColumnRankProfile (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t* &rkprofile, const FFPACK_LU_TAG LuTag){ size_t *P = FFLAS::fflas_new(M); size_t *Q = FFLAS::fflas_new(N); size_t R; if (LuTag == FfpackSlabRecursive){ R = LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasTrans, M, N, A, lda, P, Q); } else R = PLUQ (F, FFLAS::FflasNonUnit, M, N, A, lda, P, Q); rkprofile = FFLAS::fflas_new (R); RankProfileFromLU (Q, N, R, rkprofile, LuTag); FFLAS::fflas_delete (P); FFLAS::fflas_delete (Q); return R; } inline void RankProfileFromLU (const size_t* Q, const size_t N, const size_t R, size_t* rkprofile, const FFPACK_LU_TAG LuTag){ if (LuTag == FfpackSlabRecursive) std::copy(Q, Q+R, rkprofile); else { size_t * RP = FFLAS::fflas_new(N); for (size_t i=0;i < N; ++i) RP [i] = i; for (size_t i=0; i(M); size_t* MathQ = FFLAS::fflas_new(N); LAPACKPerm2MathPerm (MathP, P, M); LAPACKPerm2MathPerm (MathQ, Q, N); for (size_t i = 0; i < R; i++) if (MathP[i] < LSm && MathQ[i] < LSn){ RRP [LSr] = MathP[i]; CRP [LSr] = MathQ[i]; LSr++; } std::sort (RRP, RRP+LSr); std::sort (CRP, CRP+LSr); FFLAS::fflas_delete(MathP); FFLAS::fflas_delete(MathQ); return LSr; 
} template size_t RowRankProfileSubmatrixIndices (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*& rowindices, size_t*& colindices, size_t& R) { size_t *P = FFLAS::fflas_new(N); size_t *Q = FFLAS::fflas_new(M); R = LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, M, N, A, lda, P, Q); rowindices = FFLAS::fflas_new(M); colindices = FFLAS::fflas_new(N); for (size_t i=0; i size_t ColRankProfileSubmatrixIndices (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, size_t*& rowindices, size_t*& colindices, size_t& R) { size_t *P = FFLAS::fflas_new(M); size_t *Q = FFLAS::fflas_new(N); R = LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasTrans, M, N, A, lda, P, Q); rowindices = FFLAS::fflas_new(M); colindices = FFLAS::fflas_new(N); for (size_t i=0; i size_t RowRankProfileSubmatrix (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr& X, size_t& R) { size_t * rowindices, * colindices; typename Field::Element_ptr A2 = FFLAS::fflas_new (F, M, N) ; FFLAS::fassign(F,M,N,A,lda,A2,N); RowRankProfileSubmatrixIndices (F, M, N, A2, N, rowindices, colindices, R); X = FFLAS::fflas_new (F, R, R); for (size_t i=0; i size_t ColRankProfileSubmatrix (const Field& F, const size_t M, const size_t N, typename Field::Element_ptr A, const size_t lda, typename Field::Element_ptr& X, size_t& R) { size_t * rowindices, * colindices; typename Field::Element_ptr A2 = FFLAS::fflas_new (F, M, N); FFLAS::fassign(F,M,N,A,lda,A2,N); ColRankProfileSubmatrixIndices (F, M, N, A2, N, rowindices, colindices, R); X = FFLAS::fflas_new (F, R, R); for (size_t i=0; i typename Field::Element_ptr LQUPtoInverseOfFullRankMinor( const Field& F, const size_t rank, typename Field::Element_ptr A_factors, const size_t lda, const size_t* QtPointer, typename Field::Element_ptr X, const size_t ldx) { // upper entries are okay, just need to move up 
bottom ones const size_t* srcRow = QtPointer; for (size_t row=0; row # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ pkgincludesubdir=$(pkgincludedir)/field RNS=rns.h \ rns.inl \ rns-double.h \ rns-double-elt.h \ rns-double.inl \ rns-double-recint.inl \ rns-integer.h \ rns-integer-mod.h pkgincludesub_HEADERS= \ field-traits.h \ $(RNS) EXTRA_DIST=field.doxy fflas-ffpack-2.2.2/fflas-ffpack/field/field-traits.h000066400000000000000000000314571274716147400222350ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file field/field-traits.h * @brief Field Traits */ #ifndef __FFLASFFPACK_field_field_traits_H #define __FFLASFFPACK_field_field_traits_H #include // CXX11 #include "fflas-ffpack/field/rns-double-elt.h" // ----- Forward declarations #include "recint/rmint.h" #include "givaro/modular-general.h" #include "givaro/zring.h" namespace RecInt { template class rint; template class ruint; } namespace Givaro { template class ModularBalanced; template class Montgomery; } namespace FFPACK { template class RNSInteger; template class RNSIntegerMod; } namespace FFLAS { /* Categories */ //! Traits and categories will need to be placed in a proper file later namespace FieldCategories { // Classify //! generic ring. struct GenericTag{}; //! This is a modular field like e.g. Modular or ModularBalanced struct ModularTag{}; //! If the field uses a representation with infix operators struct UnparametricTag{}; } //! Specifies the mode of action for an algorithm w.r.t. its field //! namespace ModeCategories { //! No specific mode of action: use standard field operations struct DefaultTag{}; //! Use standard field operations, but keeps track of bounds on input and output struct DefaultBoundedTag{}; //! Force conversion to appropriate element type of ElementCategory T. //! e.g. //! - ConvertTo tries conversion //! of Modular to Modular //! - ConvertTo tries conversion //! of Modular to Modular > //! - ConvertTo tries conversion //! of Modular to RNSInteger //! . template struct ConvertTo{}; //! 
Performs field operations with delayed mod reductions. Ensures result is reduced. struct DelayedTag{}; //! Performs field operations with delayed mod only when necessary. Result may not be reduced. struct LazyTag{}; } namespace ElementCategories { //! default is generic struct GenericTag{}; //! float or double struct MachineFloatTag{}; //! short, int, long, long long, and unsigned variants struct MachineIntTag{}; //! Fixed precision integers above machine precision: Givaro::recInt struct FixedPrecIntTag{}; //! Arbitrary precision integers: GMP struct ArbitraryPrecIntTag{}; //! Representation in a Residue Number System struct RNSElementTag{}; //- If it can support SIMD operations (ie \c double or \c int32_t, etc) // struct SIMDTag : public GenericTag{}; } } // FFLAS namespace FFLAS { /* Traits */ /*! ElementTraits */ template struct ElementTraits {typedef typename ElementCategories::GenericTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineFloatTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineFloatTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::MachineIntTag value;}; template<> struct ElementTraits {typedef ElementCategories::ArbitraryPrecIntTag value;}; template struct ElementTraits > {typedef ElementCategories::FixedPrecIntTag value;}; template struct ElementTraits > {typedef ElementCategories::FixedPrecIntTag 
value;}; template struct ElementTraits >{typedef ElementCategories::FixedPrecIntTag value;}; template<> struct ElementTraits{typedef ElementCategories::RNSElementTag value;}; /*! ModeTraits */ template struct ModeTraits {typedef typename ModeCategories::DefaultTag value;}; template struct ModeTraits >{typedef typename ModeCategories::DelayedTag value;}; template struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; #ifndef INTEGER_NO_RNS template struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; #endif template struct ModeTraits >{typedef typename ModeCategories::DelayedTag value;}; template <> struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template <> struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template <> struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; #ifndef INTEGER_NO_RNS template <> struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; template <> struct ModeTraits > {typedef typename ModeCategories::ConvertTo value;}; #endif // These ones are here temporarily, to ensure // In the long term ZRing should be in DefaultTag, and forced to be in DefaultBoundedTag be the caller. However this would prevent these rings to use Winograd's algorithm (extensive use of bounded helpers) in the current implementation. Needs work. 
template <> struct ModeTraits > {typedef typename ModeCategories::DefaultBoundedTag value;}; template <> struct ModeTraits > {typedef typename ModeCategories::DefaultBoundedTag value;}; template struct ModeTraits > {typedef typename ModeCategories::DefaultBoundedTag value;}; /*! FieldTrait */ template struct FieldTraits { typedef typename FieldCategories::GenericTag category; // typedef false_type balanced ; static const bool balanced = false ; }; // RecInt template struct FieldTraits > > { //typedef FieldCategories::FloatingPointConvertibleTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; // Modular // ModularBalanced template struct FieldTraits > { typedef FieldCategories::ModularTag category; static const bool balanced = false ; }; template struct FieldTraits > { typedef FieldCategories::ModularTag category; static const bool balanced = true ; }; // ZRing< float|double > template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; // ZRing< intX > template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointConvertibleTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointConvertibleTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointConvertibleTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointConvertibleTag value; typedef 
FieldCategories::UnparametricTag category; static const bool balanced = false ; }; template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointConvertibleTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; template<> struct FieldTraits > { // typedef FieldCategories::FloatingPointConvertibleTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; // ZRing template<> struct FieldTraits > { // typedef FieldCategories::MultiPrecisionTag value; typedef FieldCategories::UnparametricTag category; static const bool balanced = false ; }; // RNSInteger template struct FieldTraits > { // typedef FieldCategories::MultiPrecisionTag value; typedef FieldCategories::UnparametricTag category; // typedef true_type balanced ; static const bool balanced = false ; }; // RNSIntegerMod template struct FieldTraits >{ // typedef FieldCategories::MultiPrecisionTag value; typedef FieldCategories::ModularTag category; // typedef true_type balanced ; static const bool balanced = false ; }; } // FFLAS namespace FFLAS { /* associatedDelayedField */ template struct associatedDelayedField{ typedef Field field; typedef Field& type; // reference to avoid copying heavy fields }; template struct associatedDelayedField> { typedef Givaro::ZRing field; typedef Givaro::ZRing type; }; template struct associatedDelayedField> { typedef Givaro::ZRing field; typedef Givaro::ZRing type; }; template struct associatedDelayedField> { typedef Givaro::ZRing field; typedef Givaro::ZRing type; }; template struct associatedDelayedField> { typedef FFPACK::RNSInteger field; typedef FFPACK::RNSInteger type; }; } // FFLAS #endif // __FFLASFFPACK_field_field_traits_H fflas-ffpack-2.2.2/fflas-ffpack/field/field.doxy000066400000000000000000000021761274716147400214610ustar00rootroot00000000000000// Copyright (c) 2014 FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part 
of the library FFLAS-FFPACK. // // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** \ingroup fflas-ffpack * \defgroup field FFLAS-FFPACK fields * * \brief fields in the FFLAS-FFPACK library * * Unparametric/Random elements * * @todo biblio * */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/field/rns-double-elt.h000066400000000000000000000152231274716147400224730ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /*! @file field/rns-double-elt.h * @ingroup field * @brief rns elt structure with double support */ #ifndef __FFLASFFPACK_field_rns_double_elt_INL #define __FFLASFFPACK_field_rns_double_elt_INL #include "fflas-ffpack/utils/fflas_memory.h" #include "fflas-ffpack/utils/cast.h" namespace FFPACK { // forward declaration struct rns_double_elt_ptr; struct rns_double_elt_cstptr; // element of the rns structure (allow virtualization of element from an array of double) struct rns_double_elt { double *_ptr; size_t _stride; bool _alloc; // specify wether Element owns its memory; alloc is true only through F.init() and _ptr==NULL (this is to handle Element allocated within a matrix) rns_double_elt(): _ptr(NULL), _stride(0), _alloc(false) {} ~rns_double_elt(){ if (_alloc) FFLAS::fflas_delete(_ptr);} rns_double_elt(double* p, size_t r, size_t a=false) : _ptr(p), _stride(r), _alloc(a) {} inline rns_double_elt_ptr operator&() ; inline rns_double_elt_cstptr operator&()const ; rns_double_elt(const rns_double_elt& x) : _ptr(x._ptr),_stride(x._stride),_alloc(false) {} }; // pointer to element of the rns structure (allow virtualization of element from an array of double) struct rns_double_elt_ptr : public rns_double_elt { rns_double_elt other; rns_double_elt_ptr(){} rns_double_elt_ptr(double* p, size_t r) : rns_double_elt(p,r,false){} rns_double_elt_ptr(const rns_double_elt_ptr &x) : rns_double_elt(x._ptr,x._stride,false){} rns_double_elt_ptr(const rns_double_elt_cstptr &x); rns_double_elt_ptr(rns_double_elt_ptr &&)=default; //inline operator rns_double_elt_cstptr(); inline rns_double_elt_ptr* operator&(){return this;} inline rns_double_elt& operator*() {return static_cast(*this);} inline rns_double_elt operator[](size_t i) const {return 
rns_double_elt(_ptr+i,_stride);} // BUGGY inline rns_double_elt& operator[](size_t i) {other=rns_double_elt(_ptr+i,_stride);return other;} // BUGGY inline rns_double_elt_ptr operator++() {return rns_double_elt_ptr(_ptr++,_stride);} inline rns_double_elt_ptr operator--() {return rns_double_elt_ptr(_ptr--,_stride);} inline rns_double_elt_ptr operator+(size_t inc) {return rns_double_elt_ptr(_ptr+inc,_stride);} inline rns_double_elt_ptr operator-(size_t inc) {return rns_double_elt_ptr(_ptr-inc,_stride);} inline rns_double_elt_ptr& operator+=(size_t inc) {_ptr+=inc;return *this;} inline rns_double_elt_ptr& operator-=(size_t inc) {_ptr-=inc;return *this;} inline rns_double_elt_ptr& operator=(const rns_double_elt_ptr& x); bool operator< (const rns_double_elt_ptr& x) {return _ptr < x._ptr;} bool operator!= (const rns_double_elt_ptr& x) {return _ptr != x._ptr;} }; struct rns_double_elt_cstptr : public rns_double_elt { rns_double_elt other; rns_double_elt_cstptr(){} rns_double_elt_cstptr(double* p, size_t r) : rns_double_elt(p,r,false){} rns_double_elt_cstptr(const rns_double_elt_ptr& x) : rns_double_elt(x._ptr,x._stride,false){} rns_double_elt_cstptr(const rns_double_elt_cstptr& x) : rns_double_elt(x._ptr,x._stride,false){} rns_double_elt_cstptr(rns_double_elt_cstptr &&)=default; inline rns_double_elt_cstptr* operator&(){return this;} inline rns_double_elt& operator*() const { return *const_cast(static_cast(this)); } inline rns_double_elt operator[](size_t i)const {return rns_double_elt(_ptr+i,_stride);} inline rns_double_elt& operator[](size_t i) {other=rns_double_elt(_ptr+i,_stride);return other;} // BUGGY //inline rns_double_elt& operator[](size_t i)const {return *((*this)+i);}// BUGGY inline rns_double_elt_cstptr operator++() {return rns_double_elt_cstptr(_ptr++,_stride);} inline rns_double_elt_cstptr operator--() {return rns_double_elt_cstptr(_ptr--,_stride);} inline rns_double_elt_cstptr operator+(size_t inc)const {return rns_double_elt_cstptr(_ptr+inc,_stride);} 
inline rns_double_elt_cstptr operator-(size_t inc)const {return rns_double_elt_cstptr(_ptr-inc,_stride);} inline rns_double_elt_cstptr& operator+=(size_t inc) {_ptr+=inc;return *this;} inline rns_double_elt_cstptr& operator-=(size_t inc) {_ptr-=inc;return *this;} inline rns_double_elt_cstptr& operator=(const rns_double_elt_cstptr& x); bool operator< (const rns_double_elt_cstptr& x) {return _ptr < x._ptr;} bool operator!= (const rns_double_elt_cstptr& x) {return _ptr != x._ptr;} }; inline rns_double_elt_ptr& rns_double_elt_ptr::operator=(const rns_double_elt_ptr& x) { if (this != &x){ if (_alloc) FFLAS::fflas_delete(_ptr); _ptr= x._ptr; _stride=x._stride; _alloc=false; } return *this; } inline rns_double_elt_cstptr& rns_double_elt_cstptr::operator=(const rns_double_elt_cstptr& x) { if (this != &x){ if (_alloc) FFLAS::fflas_delete(_ptr); _ptr= x._ptr; _stride=x._stride; _alloc=false; } return *this; } inline rns_double_elt_ptr::rns_double_elt_ptr(const rns_double_elt_cstptr &x) : rns_double_elt(x._ptr,x._stride,false){} //inline rns_double_elt_ptr::operator rns_double_elt_cstptr(){return rns_double_elt_cstptr(_ptr,_stride);} inline rns_double_elt_ptr rns_double_elt::operator&() {return rns_double_elt_ptr(_ptr,_stride);} inline rns_double_elt_cstptr rns_double_elt::operator&() const {return rns_double_elt_cstptr(_ptr,_stride);} template<> inline rns_double_elt_ptr fflas_const_cast (rns_double_elt_cstptr x){return x;} template<> inline rns_double_elt_cstptr fflas_const_cast (rns_double_elt_ptr x){return x;} } // end namespace FFPACK: #endif // __FFLASFFPACK_field_rns_double_elt_INL fflas-ffpack-2.2.2/fflas-ffpack/field/rns-double-recint.inl000066400000000000000000000257111274716147400235310ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2016 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This 
file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_field_rns_double_recint_INL #define __FFLASFFPACK_field_rns_double_recint_INL #include "fflas-ffpack/fflas/fflas_freduce.h" namespace FFPACK { // Arns must be an array of m*n*_size // abs(||A||) < 2^(16k) template inline void rns_double::init(size_t m, size_t n, double* Arns, size_t rda, const RecInt::ruint* A, size_t lda, size_t k, bool RNS_MAJOR) const { if (k>_ldm){ FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_struct: init (too large entry)"); std::cerr<<"k="<(), FFLAS::FflasNoTrans,FFLAS::FflasTrans,_size,mn,k,1.0,_crt_in.data(),_ldm,A_beta,k,0.,Arns,rda, // FFLAS::ParSeqHelper::Parallel()); FFLAS::ParSeqHelper::Parallel()); } tfgemm.stop(); //if(m>1 && n>1) std::cerr<<"fgemm : "<1 && n>1) std::cerr<<"Reduce : "< inline void rns_double::convert(size_t m, size_t n, integer gamma, RecInt::ruint* A, size_t lda, const double* Arns, size_t rda, integer p,bool RNS_MAJOR) const { if (p==0 && _M > integer(1)<<(1< too small for the rns basis log[2](M)="<<_M.bitsize()<>1; size_t mn= m*n; double *A_beta= FFLAS::fflas_new(mn*_ldm); Givaro::Timer tfgemmc;tfgemmc.start(); if (RNS_MAJOR==false) // compute A_beta = Ap^T x M_beta PAR_BLOCK{ 
FFLAS::fgemm(Givaro::ZRing(),FFLAS::FflasTrans, FFLAS::FflasNoTrans,(int) mn,(int) _ldm,(int) _size, 1.0 , Arns,(int) rda, _crt_out.data(),(int) _ldm, 0., A_beta,(int)_ldm, FFLAS::ParSeqHelper::Parallel()); // FFLAS::ParSeqHelper::Parallel()); } else // compute A_beta = Ap x M_Beta cblas_dgemm(CblasRowMajor,CblasNoTrans, CblasNoTrans, (int)mn, (int)_ldm, (int)_size, 1.0 , Arns, (int)_size, _crt_out.data(), (int)_ldm, 0., A_beta,(int)_ldm); tfgemmc.stop(); //if(m>1 && n>1) std::cerr<<"fgemm Convert : "<* Aiter= A; size_t k=_ldm; if ((_ldm+3)*16 > (1< -> convert needs "<<(_ldm+3)*16<<"bits ...aborting"<>2)+ (((k+3)%4==0)?0:1); std::vector A0(k4<<2,0),A1(k4<<2,0),A2(k4<<2,0),A3(k4<<2,0); integer a0,a1,a2,a3,res; mpz_t *m0,*m1,*m2,*m3; m0= reinterpret_cast(&a0); m1= reinterpret_cast(&a1); m2= reinterpret_cast(&a2); m3= reinterpret_cast(&a3); mp_limb_t *m0_d,*m1_d,*m2_d,*m3_d; m0_d = m0[0]->_mp_d; m1_d = m1[0]->_mp_d; m2_d = m2[0]->_mp_d; m3_d = m3[0]->_mp_d; m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc = m3[0]->_mp_alloc = (int) (k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability m0[0]->_mp_size = m1[0]->_mp_size = m2[0]->_mp_size = m3[0]->_mp_size = (int) (k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability Givaro::Timer tkroc; tkroc.start(); // auto sp=SPLITTER(); // PARFOR1D(i,m,sp, for(size_t i=0;i(&tmp); A0[l ]= tptr[0]; A1[l+1]= tptr[1]; A2[l+2]= tptr[2]; A3[l+3]= tptr[3]; } // see A0,A1,A2,A3 as a the gmp integers a0,a1,a2,a3 m0[0]->_mp_d= reinterpret_cast(&A0[0]); m1[0]->_mp_d= reinterpret_cast(&A1[0]); m2[0]->_mp_d= reinterpret_cast(&A2[0]); m3[0]->_mp_d= reinterpret_cast(&A3[0]); res = a0;res+= a1;res+= a2;res+= a3; res%=_M; if (p!=0) res%=p; // get the correct result according to the expected sign of A if (res>hM) res-=_M; if (gamma==0) Aiter[j+i*lda]=RecInt::ruint(res); else if (gamma==integer(1)) Aiter[j+i*lda]+=RecInt::ruint(res); else if (gamma==integer(-1)) Aiter[j+i*lda]=RecInt::ruint(res)-Aiter[j+i*lda]; else{ 
Aiter[j+i*lda]*=RecInt::ruint(gamma); Aiter[j+i*lda]+=RecInt::ruint(res); } } tkroc.stop(); //if(m>1 && n>1) std::cerr<<"Kronecker Convert : "<_mp_d = m0_d; m1[0]->_mp_d = m1_d; m2[0]->_mp_d = m2_d; m3[0]->_mp_d = m3_d; m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc= m3[0]->_mp_alloc = 1; m0[0]->_mp_size = m1[0]->_mp_size = m2[0]->_mp_size = m3[0]->_mp_size = 0; } else { //size_t k4=((k+3)>>2)+ (((k+3)%4==0)?0:1); std::vector A0(1<<(K-4),0),A1(1<<(K-4),0),A2(1<<(K-4),0),A3(1<<(K-4),0); RecInt::ruint *a0,*a1,*a2,*a3,res; Givaro::Timer tkroc; tkroc.start(); // auto sp=SPLITTER(); // PARFOR1D(i,m,sp, for(size_t i=0;i(&tmp); A0[l ]= tptr[0]; A1[l+1]= tptr[1]; A2[l+2]= tptr[2]; A3[l+3]= tptr[3]; } a0= reinterpret_cast*>(&A0[0]); a1= reinterpret_cast*>(&A1[0]); a2= reinterpret_cast*>(&A2[0]); a3= reinterpret_cast*>(&A3[0]); res = *a0;res+= *a1;res+= *a2;res+= *a3; res%= RecInt::ruint(_M); // get the correct result according to the expected sign of A //if (res>hM) // res-=_M; if (gamma==0) Aiter[j+i*lda]=res; else if (gamma==1) Aiter[j+i*lda]+=res; else if (gamma==-1) Aiter[j+i*lda]=res-Aiter[j+i*lda]; else{ Aiter[j+i*lda]*=RecInt::ruint(gamma); Aiter[j+i*lda]+=res; } } tkroc.stop(); } //if(m>1 && n>1) std::cerr<<"Kronecker Convert : "< with log[2](M)="<<_M.bitsize()<s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /*! @file field/rns-double.h * @ingroup field * @brief rns structure with double support */ #ifndef __FFPACK_rns_double_H #define __FFPACK_rns_double_H // Bigger multiple of s lesser or equal than x, s must be a power of two #ifndef ROUND_DOWN #define ROUND_DOWN(x, s) ((x) & ~((s)-1)) #endif #include #include #include #include #include "givaro/modular-extended.h" #include #include "fflas-ffpack/config-blas.h" #include "fflas-ffpack/utils/fflas_memory.h" #include "fflas-ffpack/utils/align-allocator.h" #include "fflas-ffpack/field/rns-double-elt.h" namespace FFPACK { /* Structure that handles rns representation given a bound and bitsize for prime moduli * support sign representation (i.e. the bound must be twice larger then ||A||) */ struct rns_double { typedef Givaro::Integer integer; typedef Givaro::Modular ModField; std::vector> _basis; // the rns moduli (mi) std::vector> _basisMax; // (mi-1) std::vector> _negbasis; // (-mi) std::vector> _invbasis; // the inverse of rns moduli (1/mi) std::vector _field_rns; // the associated prime field for each mi integer _M; // the product of the mi's std::vector _Mi; // _M/mi std::vector _MMi; // (_Mi)^(-1) mod mi std::vector _crt_in; // 2^(16*j) mod mi std::vector _crt_out; // (_Mi._MMi) written in base 2^16 size_t _size; // the size of the rns basis (number of mi's) size_t _pbits; // the size in bit of the mi's size_t _ldm; // log[2^16](_M) typedef double BasisElement; typedef rns_double_elt Element; typedef rns_double_elt_ptr Element_ptr; typedef rns_double_elt_cstptr ConstElement_ptr; rns_double(const integer& bound, size_t pbits, bool rnsmod=false, long seed=time(NULL)) : _M(1), _size(0), _pbits(pbits) { integer::seeding(seed); Givaro::IntPrimeDom IPD; integer prime; integer 
sum=1; while (_M < bound*sum) { _basis.resize(_size+1); do { integer::random_exact_2exp(prime, _pbits-1); IPD.nextprimein(prime); } while (_M%prime == 0); _basis[_size]=prime; _size++; _M*=prime; if (rnsmod) sum+=prime; } precompute_cst(); } rns_double(size_t pbits, size_t size, long seed=time(NULL)) : _M(1), _size(size), _pbits(pbits) { integer::seeding(seed); Givaro::IntPrimeDom IPD; integer prime; integer sum=1; _basis.resize(size); _negbasis.resize(size); _basisMax.resize(size); for(size_t i = 0 ; i < _size ; ++i){ integer::random_exact_2exp(prime, _pbits-1); IPD.nextprimein(prime); _basis[i]=prime; _basisMax[i] = prime-1; _negbasis[i] = 0-prime; _M*=prime; } precompute_cst(); } template rns_double(const Vect& basis, bool rnsmod=false, long seed=time(NULL)) : _basis(basis.begin(),basis.end()), _basisMax(basis.size()), _negbasis(basis.size()), _M(1), _size(basis.size()), _pbits(0) { for(size_t i=0;i<_size;i++){ //std::cout<<"basis["< inline void fflas_delete (FFPACK::rns_double_elt_ptr A) {FFLAS::fflas_delete( A._ptr);} template<> inline void fflas_delete (FFPACK::rns_double_elt_cstptr A) {delete[] A._ptr;} } #endif // __FFPACK_rns_double_Hfflas-ffpack-2.2.2/fflas-ffpack/field/rns-double.inl000066400000000000000000000475331274716147400222550ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_field_rns_double_INL #define __FFLASFFPACK_field_rns_double_INL #include "fflas-ffpack/fflas/fflas_freduce.h" namespace FFPACK { // Arns must be an array of m*n*_size // abs(||A||) < 2^(16k) inline void rns_double::init(size_t m, size_t n, double* Arns, size_t rda, const integer* A, size_t lda, size_t k, bool RNS_MAJOR) const { if (k>_ldm){ FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_struct: init (too large entry)"); std::cerr<<"k="<(), FFLAS::FflasNoTrans,FFLAS::FflasTrans,_size,mn,k,1.0,_crt_in.data(),_ldm,A_beta,k,0.,Arns,rda, // FFLAS::ParSeqHelper::Parallel()); FFLAS::ParSeqHelper::Parallel()); tfgemm.stop(); //if(m>1 && n>1) std::cerr<<"fgemm : "<1 && n>1) std::cerr<<"Reduce : "<_ldm) FFPACK::failure()(__func__,__FILE__,__LINE__,"rns_struct: init (too large entry)"); size_t mn=m*n; double *A_beta = FFLAS::fflas_new(mn*k); const integer* Aiter=A; // split A into A_beta according to a Kronecker transform in base 2^16 for(size_t j=0;j(Aiter+j+i*lda); const uint16_t* m0_ptr = reinterpret_cast(m0[0]->_mp_d); size_t l=0; //size_t maxs=std::min(k,(Aiter[j+i*lda].size())<<2); size_t maxs=std::min(k,(Aiter[j+i*lda].size())*sizeof(mp_limb_t)/2); // to ensure 32 bits portability if (m0[0]->_mp_size >= 0) for (;l>1; size_t mn= m*n; double *A_beta= FFLAS::fflas_new(mn*_ldm); Givaro::Timer tfgemmc;tfgemmc.start(); if (RNS_MAJOR==false) // compute A_beta = Ap^T x M_beta FFLAS::fgemm(Givaro::ZRing(),FFLAS::FflasTrans, FFLAS::FflasNoTrans,(int) mn,(int) 
_ldm,(int) _size, 1.0 , Arns,(int) rda, _crt_out.data(),(int) _ldm, 0., A_beta,(int)_ldm, FFLAS::ParSeqHelper::Parallel()); // FFLAS::ParSeqHelper::Parallel()); else // compute A_beta = Ap x M_Beta cblas_dgemm(CblasRowMajor,CblasNoTrans, CblasNoTrans, (int)mn, (int)_ldm, (int)_size, 1.0 , Arns, (int)_size, _crt_out.data(), (int)_ldm, 0., A_beta,(int)_ldm); tfgemmc.stop(); //if(m>1 && n>1) std::cerr<<"fgemm Convert : "<>2)+ (((k+3)%4==0)?0:1); std::vector A0(k4<<2,0),A1(k4<<2,0),A2(k4<<2,0),A3(k4<<2,0); integer a0,a1,a2,a3,res; mpz_t *m0,*m1,*m2,*m3; m0= reinterpret_cast(&a0); m1= reinterpret_cast(&a1); m2= reinterpret_cast(&a2); m3= reinterpret_cast(&a3); mp_limb_t *m0_d,*m1_d,*m2_d,*m3_d; m0_d = m0[0]->_mp_d; m1_d = m1[0]->_mp_d; m2_d = m2[0]->_mp_d; m3_d = m3[0]->_mp_d; m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc = m3[0]->_mp_alloc = (int) (k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability m0[0]->_mp_size = m1[0]->_mp_size = m2[0]->_mp_size = m3[0]->_mp_size = (int) (k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability Givaro::Timer tkroc; tkroc.start(); // auto sp=SPLITTER(); // PARFOR1D(i,m,sp, for(size_t i=0;i(&tmp); A0[l ]= tptr[0]; A1[l+1]= tptr[1]; A2[l+2]= tptr[2]; A3[l+3]= tptr[3]; } // see A0,A1,A2,A3 as a the gmp integers a0,a1,a2,a3 m0[0]->_mp_d= reinterpret_cast(&A0[0]); m1[0]->_mp_d= reinterpret_cast(&A1[0]); m2[0]->_mp_d= reinterpret_cast(&A2[0]); m3[0]->_mp_d= reinterpret_cast(&A3[0]); res = a0;res+= a1;res+= a2;res+= a3; res%=_M; // get the correct result according to the expected sign of A if (res>hM) res-=_M; if (gamma==0) Aiter[j+i*lda]=res; else if (gamma==integer(1)) Aiter[j+i*lda]+=res; else if (gamma==integer(-1)) Aiter[j+i*lda]=res-Aiter[j+i*lda]; else{ Aiter[j+i*lda]*=gamma; Aiter[j+i*lda]+=res; } } tkroc.stop(); //if(m>1 && n>1) std::cerr<<"Kronecker Convert : "<_mp_d = m0_d; m1[0]->_mp_d = m1_d; m2[0]->_mp_d = m2_d; m3[0]->_mp_d = m3_d; m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc= m3[0]->_mp_alloc = 1; 
m0[0]->_mp_size = m1[0]->_mp_size = m2[0]->_mp_size = m3[0]->_mp_size = 0; FFLAS::fflas_delete( A_beta); #ifdef CHECK_RNS bool ok=true; for (size_t i=0;i>1; size_t mn= m*n; double *A_beta= FFLAS::fflas_new(mn*_ldm); if (RNS_MAJOR==false) // compute A_beta = Ap^T x M_beta cblas_dgemm(CblasRowMajor,CblasTrans, CblasNoTrans,(int) mn,(int) _ldm,(int) _size, 1.0 , Arns,(int) rda, _crt_out.data(),(int) _ldm, 0., A_beta,(int)_ldm); else // compute A_beta = Ap x M_Beta cblas_dgemm(CblasRowMajor,CblasNoTrans, CblasNoTrans, (int)mn, (int)_ldm, (int)_size, 1.0 , Arns, (int)_size, _crt_out.data(), (int)_ldm, 0., A_beta,(int)_ldm); // compute A using inverse Kronecker transform of A_beta expressed in base 2^log_beta integer* Aiter= A; size_t k=_ldm; size_t k4=((k+3)>>2)+ (((k+3)%4==0)?0:1); std::vector A0(k4<<2,0),A1(k4<<2,0),A2(k4<<2,0),A3(k4<<2,0); integer a0,a1,a2,a3,res; mpz_t *m0,*m1,*m2,*m3; m0= reinterpret_cast(&a0); m1= reinterpret_cast(&a1); m2= reinterpret_cast(&a2); m3= reinterpret_cast(&a3); mp_limb_t *m0_d,*m1_d,*m2_d,*m3_d; m0_d = m0[0]->_mp_d; m1_d = m1[0]->_mp_d; m2_d = m2[0]->_mp_d; m3_d = m3[0]->_mp_d; m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc = m3[0]->_mp_alloc = (int32_t)(k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability m0[0]->_mp_size = m1[0]->_mp_size = m2[0]->_mp_size = m3[0]->_mp_size = (int32_t)(k4*8/sizeof(mp_limb_t)); // to ensure 32 bits portability for (size_t j=0;j(&tmp); A0[l ]= tptr[0]; A1[l+1]= tptr[1]; A2[l+2]= tptr[2]; A3[l+3]= tptr[3]; } // see A0,A1,A2,A3 as a the gmp integers a0,a1,a2,a3 m0[0]->_mp_d= reinterpret_cast(&A0[0]); m1[0]->_mp_d= reinterpret_cast(&A1[0]); m2[0]->_mp_d= reinterpret_cast(&A2[0]); m3[0]->_mp_d= reinterpret_cast(&A3[0]); res = a0;res+= a1;res+= a2;res+= a3; res%=_M; // get the correct result according to the expected sign of A if (res>hM) res-=_M; if (gamma==0) Aiter[j+i*lda]=res; else if (gamma==integer(1)) Aiter[j+i*lda]+=res; else if (gamma==integer(-1)) Aiter[j+i*lda]=res-Aiter[j+i*lda]; 
else{ Aiter[j+i*lda]*=gamma; Aiter[j+i*lda]+=res; } } m0[0]->_mp_d = m0_d; m1[0]->_mp_d = m1_d; m2[0]->_mp_d = m2_d; m3[0]->_mp_d = m3_d; m0[0]->_mp_alloc = m1[0]->_mp_alloc = m2[0]->_mp_alloc= m3[0]->_mp_alloc = 1; m0[0]->_mp_size = m1[0]->_mp_size = m2[0]->_mp_size = m3[0]->_mp_size = 0; FFLAS::fflas_delete( A_beta); #ifdef CHECK_RNS bool ok=true; for (size_t i=0;i; using vect_t = typename simd::vect_t; if(_size % simd::vect_size == 0){ for(size_t i = 0 ; i < n ; i++){ vect_t tmp1, tmp2, tmp3, v, max, basis, inv, neg; for(size_t j = 0 ; j < _size ; j+=simd::vect_size){ basis = simd::load(_basis.data()+j); inv = simd::load(_invbasis.data()+j); max = simd::load(_basisMax.data()+j); neg = simd::load(_negbasis.data()+j); v = simd::load(Arns+i*_size+j); tmp1 = simd::floor(simd::mul(v, inv)); tmp2 = simd::fnmadd(v, tmp1, basis); tmp1 = simd::greater(tmp2, max); tmp3 = simd::lesser(tmp2, simd::zero()); tmp1 = simd::vand(tmp1, neg); tmp3 = simd::vand(tmp3, basis); tmp1 = simd::vor(tmp1, tmp3); tmp2 = simd::add(tmp2, tmp1); simd::store(Arns+i*_size+j, tmp2); } } }else{ for(size_t i = 0 ; i < n ; i++){ vect_t tmp1, tmp2, tmp3, v, max, basis, inv, neg; size_t j = 0; for( ; j < ROUND_DOWN(_size, simd::vect_size) ; j+=simd::vect_size){ basis = simd::load(_basis.data()+j); inv = simd::load(_invbasis.data()+j); max = simd::load(_basisMax.data()+j); neg = simd::load(_negbasis.data()+j); v = simd::loadu(Arns+i*_size+j); tmp1 = simd::floor(simd::mul(v, inv)); tmp2 = simd::fnmadd(v, tmp1, basis); tmp1 = simd::greater(tmp2, max); tmp3 = simd::lesser(tmp2, simd::zero()); tmp1 = simd::vand(tmp1, neg); tmp3 = simd::vand(tmp3, basis); tmp1 = simd::vor(tmp1, tmp3); tmp2 = simd::add(tmp2, tmp1); simd::storeu(Arns+i*_size+j, tmp2); } for( ; j < _size ; ++j){ // std::cout << j << std::endl; // auto x = std::floor(Arns[i*_size+j] * _invbasis[j]); Arns[i*_size+j] -= std::floor(Arns[i*_size+j]*_invbasis[j])*_basis[j]; // Arns[i*_size+j] = std::fma(Arns[i*_size+j], -x, _basis[j]); 
if(Arns[i*_size+j] >= _basis[j]){ Arns[i*_size+j] -= _basis[j]; }else if(Arns[i*_size+j] < 0){ Arns[i*_size+j] += _basis[j]; } } } } #else for(size_t i = 0 ; i < n ; i+= _size){ for(size_t j = 0 ; j < _size ; ++j){ //_field_rns.reduce(Arns+i*_size+j); _field_rns[i].reduce(Arns[i*_size+j]); } } #endif } else { // NOT IN RNS MAJOR // #ifndef __FFLASFFPACK_SEQUENTIAL // auto sp=SPLITTER(MAX_THREADS); // #else // auto sp=SPLITTER(1); // #endif PARFOR1D(i,_size,SPLITTER(NUM_THREADS), //for(size_t i=0;i<_size;i++) FFLAS::freduce (_field_rns[i],n,Arns+i*rda,1); ); } } // TODO: less naive implementation inline void rns_double_extended::init(size_t m, double* Arns, const integer* A, size_t lda) const{ for(size_t i = 0 ; i < m ; ++i){ for(size_t j = 0 ; j < _size ; ++j){ Arns[i*_size+j] = (double)((A[i*lda]%integer(_basis[j]))[0]); } } } // TODO: less naive implementation inline void rns_double_extended::convert(size_t m, integer *A, const double *Arns) const{ integer hM= (_M-1)/2; for(size_t i = 0 ; i < m ; ++i){ A[i] = 0; integer tmp; for(size_t j = 0 ; j < _size ; ++j){ A[i] += ((integer(Arns[i*_size+j])*integer(_MMi[j]))%integer(_basis[j]))*integer(_Mi[j]); } A[i] %= _M; if(A[i] > hM) A[i] -= _M; } } // reduce entries of Arns to be less than the rns basis elements inline void rns_double_extended::reduce(size_t n, double* Arns, size_t rda, bool RNS_MAJOR) const{ #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS using simd = Simd; using vect_t = typename simd::vect_t; if(_size % simd::vect_size == 0){ //#pragma omp parallel for schedule(static, 256) for(size_t i = 0 ; i < n ; i++){ vect_t tmp1, tmp2, tmp3, v, max, basis, inv, neg; for(size_t j = 0 ; j < _size ; j+=simd::vect_size){ basis = simd::load(_basis.data()+j); inv = simd::load(_invbasis.data()+j); max = simd::load(_basisMax.data()+j); neg = simd::load(_negbasis.data()+j); v = simd::load(Arns+i*_size+j); tmp2 = modSimd(v, basis, inv, neg); tmp1 = simd::greater(tmp2, max); tmp3 = simd::lesser(tmp2, simd::zero()); tmp1 = 
simd::vand(tmp1, neg); tmp3 = simd::vand(tmp3, basis); tmp1 = simd::vor(tmp1, tmp3); tmp2 = simd::add(tmp2, tmp1); simd::store(Arns+i*_size+j, tmp2); } } }else{ //#pragma omp parallel for schedule(static, 256) for(size_t i = 0 ; i < n ; i++){ vect_t tmp1, tmp2, tmp3, v, max, basis, inv, neg; size_t j = 0; for( ; j < ROUND_DOWN(_size, simd::vect_size) ; j+=simd::vect_size){ basis = simd::load(_basis.data()+j); inv = simd::load(_invbasis.data()+j); max = simd::load(_basisMax.data()+j); neg = simd::load(_negbasis.data()+j); v = simd::loadu(Arns+i*_size+j); tmp2 = modSimd(v, basis, inv, neg); tmp1 = simd::greater(tmp2, max); tmp3 = simd::lesser(tmp2, simd::zero()); tmp1 = simd::vand(tmp1, neg); tmp3 = simd::vand(tmp3, basis); tmp1 = simd::vor(tmp1, tmp3); tmp2 = simd::add(tmp2, tmp1); simd::storeu(Arns+i*_size+j, tmp2); } for( ; j < _size ; ++j){ _field_rns[j].reduce(Arns[i*_size+j]); } } } #else // TODO : SIMD version for(size_t i = 0 ; i < n ; i+= _size){ for(size_t j = 0 ; j < _size ; ++j){ //_field_rns.reduce(Arns+i*_size+j); _field_rns[i].reduce(Arns[i*_size+j]); } } #endif } } // FFPACK #endif // __FFLASFFPACK_field_rns_double_INL fflas-ffpack-2.2.2/fflas-ffpack/field/rns-integer-mod.h000066400000000000000000000726751274716147400226670ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /*! @file field/rns-integer-mod.h * @ingroup field * @brief representation of Z/pZ using RNS representation (note: fixed precision) */ #ifndef __FFPACK_rns_integer_mod_H #define __FFPACK_rns_integer_mod_H #include #include #include #include #include #include #include "givaro/modular-extended.h" #include "fflas-ffpack/field/rns-double.h" #include "fflas-ffpack/field/rns-integer.h" #include "fflas-ffpack/fflas/fflas_level1.inl" #include "fflas-ffpack/fflas/fflas_level2.inl" #include "fflas-ffpack/fflas/fflas_level3.inl" #include "fflas-ffpack/fflas/fflas_enum.h" namespace FFPACK { template class RNSIntegerMod; } #include "fflas-ffpack/fflas/fflas_fscal_mp.inl" #if defined(BENCH_PERF_FGEMM_MP) || defined(BENCH_PERF_TRSM_MP) || defined(BENCH_PERF_LQUP_MP) #define BENCH_PERF_SCAL_MP #define BENCH_MODP #endif namespace FFPACK { template class RNSIntegerMod { public: typedef typename RNS::Element Element; typedef typename RNS::Element_ptr Element_ptr; typedef typename RNS::ConstElement_ptr ConstElement_ptr; typedef rnsRandIter RandIter; protected: typedef typename RNS::BasisElement BasisElement; typedef Givaro::Modular ModField; typedef Givaro::Integer integer; integer _p; std::vector> _Mi_modp_rns; std::vector> _iM_modp_rns; const RNS *_rns; Givaro::Modular _F; RNSInteger _RNSdelayed; public: Element one, mOne,zero; #ifdef BENCH_MODP mutable double t_modp, t_igemm, t_scal,t_trsm; mutable size_t n_modp; #endif RNSIntegerMod(const integer& p, const RNS& myrns) : _p(p), 
_Mi_modp_rns(myrns._size*myrns._size), _iM_modp_rns(myrns._size*myrns._size), _rns(&myrns), _F(p), _RNSdelayed(myrns){ init(one,1); init(zero,0); init(mOne,-1); integer iM=0; size_t mysize=myrns._size; integer sum=0; //std::cout << "M: " << myrns._M << std::endl; for (size_t i=0;i& delayed() const {return _RNSdelayed;} size_t size() const {return _rns->_size;} bool isOne(const Element& x) const { bool isone=true; for (size_t i=0;i<_rns->_size;i++) isone&= (one._ptr[i]== x._ptr[i]); return isone; } bool isMOne(const Element& x) const { bool ismone=true; for (size_t i=0;i<_rns->_size;i++) ismone&= (mOne._ptr[i]== x._ptr[i]); return ismone; } bool isZero(const Element& x) const { //write(std::cout,x)<<" == "; //write(std::cout,zero)<(_rns->_size); x._stride=1; x._alloc=true; } return x; } Element& init(Element& x, const Givaro::Integer& y) const{ init(x); size_t k =(_p.bitsize())/16+((_p.bitsize())%16?1:0); _rns->init(1,1,x._ptr,x._stride, &y,1,k); return x; } // assume this is the mod p operation Element& reduce (Element& x, const Element& y) const{ Givaro::Integer tmp; convert(tmp,y); tmp %= _p; init (x,tmp); return x; } Element& reduce (Element& x) const{ Givaro::Integer tmp; convert (tmp, x); tmp %= _p; return init (x, tmp); } Element& init(Element& x, const Element& y) const{ return reduce (x, y); } Givaro::Integer convert(Givaro::Integer& x, const Element& y)const { _rns->convert(1,1,integer(0),&x,1,y._ptr,y._stride); return x; } Element& assign(Element& x, const Element& y) const { for(size_t i=0;i<_rns->_size;i++) x._ptr[i*x._stride] = y._ptr[i*y._stride]; return x; } Element& add(Element& x, const Element& y, const Element& z) const { for(size_t i=0;i<_rns->_size;i++) _rns->_field_rns[i].add((x._ptr)[i*x._stride], (y._ptr)[i*y._stride], (z._ptr)[i*z._stride]); return x; } Element& sub(Element& x, const Element& y, const Element& z) const { for(size_t i=0;i<_rns->_size;i++) _rns->_field_rns[i].sub((x._ptr)[i*x._stride], (y._ptr)[i*y._stride], 
(z._ptr)[i*z._stride]); return x; } Element& neg(Element& x, const Element& y) const { for(size_t i=0;i<_rns->_size;i++) _rns->_field_rns[i].neg((x._ptr)[i*x._stride], (y._ptr)[i*y._stride]); return x; } Element& mul(Element& x, const Element& y, const Element& z) const { for(size_t i=0;i<_rns->_size;i++) _rns->_field_rns[i].mul((x._ptr)[i*x._stride], (y._ptr)[i*y._stride], (z._ptr)[i*z._stride]); return x; } Element& axpyin(Element& x, const Element& y, const Element& z) const { for(size_t i=0;i<_rns->_size;i++) _rns->_field_rns[i].axpyin((x._ptr)[i*x._stride], (y._ptr)[i*y._stride], (z._ptr)[i*z._stride]); return x; } Element& inv(Element& x, const Element& y) const { Givaro::Integer tmp; convert(tmp,y); _F.invin(tmp); init(x,tmp); return x; } bool areEqual(const Element& x, const Element& y) const { for(size_t i=0;i<_rns->_size;i++) if (!_rns->_field_rns[i].areEqual((x._ptr)[i*x._stride],(y._ptr)[i*y._stride])) return false; return true; } std::ostream& write(std::ostream& os, const Element& y) const { os<<"[ "<< (long) (y._ptr)[0]; for(size_t i=1;i<_rns->_size;i++) os<<" , "<< (long) ((y._ptr)[i*y._stride]); return os<<" ]"; } std::ostream& write(std::ostream& os) const { os<<"M:=[ "<< (long) _rns->_basis[0]; for(size_t i=1;i<_rns->_size;i++) os<<" , "<< (long) _rns->_basis[i]; return os<<" ]"<_size; BasisElement *Gamma, *alpha, *A; A=B._ptr; size_t rda = B._stride; Givaro::ZRing D; Gamma = FFLAS::fflas_new(D,_size,n); alpha = FFLAS::fflas_new(D,n,1); // compute Gamma //for(size_t i=0;i<_size;i++) // FFLAS::fscal(_rns->_field_rns[i], n, _rns->_MMi[i], A+i*rda, 1, Gamma+i*n,1); typename RNS::Element mmi(const_cast(_rns->_MMi.data()),1); FFLAS::fscal(_RNSdelayed, n, mmi, B, 1, typename RNS::Element_ptr(Gamma,n), 1); // compute A = _Mi_modp_rns.Gamma (note must be reduced mod m_i, but this is postpone to the end) FFLAS::fgemm(D,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, _size, n, _size, D.one, _Mi_modp_rns.data(), _size, Gamma, n, D.zero, A, rda); 
//std::cout<<"fgemv (Y)..."; //std::cout<<"fgemv (Y)..."< "<<_size<_invbasis.data(), 1 , D.zero, alpha, 1); //std::cout<<"done"<_field_rns[i], n, A+i*rda, 1); FFLAS::fflas_delete(Gamma); FFLAS::fflas_delete(alpha); #ifdef BENCH_MODP chrono.stop(); t_modp+=chrono.usertime(); #endif } std::ostream& write_matrix(std::ostream& c, double* E, int n, int m, int lda) const { c<_size; size_t mn=m*n; BasisElement *Gamma, *alpha, *z, *A; A=B._ptr; size_t rda=B._stride; Gamma = FFLAS::fflas_new(mn*_size); alpha = FFLAS::fflas_new(mn); z = FFLAS::fflas_new(mn*_size); // compute Gamma //for(size_t i=0;i<_size;i++) // FFLAS::fscal(_rns->_field_rns[i], m, n, _rns->_MMi[i], A+i*rda, lda, Gamma+i*mn,n); typename RNS::Element mmi(const_cast(_rns->_MMi.data()),1); FFLAS::fscal(_RNSdelayed, m, n, mmi, B, lda, typename RNS::Element_ptr(Gamma,mn), n); // compute Gamma = _Mi_modp_rns.Gamma (note must be reduced mod m_i, but this is postpone to the end) Givaro::ZRing D; FFLAS::fgemm(D,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,_size, mn, _size, D.one, _Mi_modp_rns.data(), _size, Gamma, mn, D.zero, z, mn); //write_matrix(std::cout,Gamma, mn, _size, mn); // compute alpha = _invbase.Gamma //std::cout<<"fgemv (X)..."< "<<_size<<" "<_invbasis.data(), 1 , D.zero, alpha, 1); //std::cout<<"done"<_field_rns[i], m, n, A+i*rda, lda); FFLAS::fflas_delete(Gamma); FFLAS::fflas_delete(alpha); FFLAS::fflas_delete(z); #ifdef BENCH_MODP chrono.stop(); t_modp+=chrono.usertime(); #endif } #ifdef __DLP_CHALLENGE #define DELTA 27 template inline void splitSimd(const SimdT x, SimdT & x_h, SimdT & x_l) const { using simd = Simd; using vect_t = typename simd::vect_t; vect_t vc = simd::set1((double)((1_ui64 << DELTA) + 1_ui64)); vect_t tmp = simd::mul(vc, x); x_h = simd::add(tmp, simd::sub(x, tmp)); x_l = simd::sub(x, x_h); } template inline void multSimd(const SimdT va, const SimdT vb, SimdT & vs, SimdT & vt) const{ using simd = Simd; using vect_t = typename simd::vect_t; vect_t vah, val, vbh, vbl; splitSimd(va, vah, 
val); splitSimd(vb, vbh, vbl); vs = simd::mul(va, vb); vt = simd::add(simd::add(simd::sub(simd::mul(vah, vbh), vs), simd::mul(vah, vbl)), simd::add(simd::mul(val, vbh), simd::mul(val, vbl))); } template inline SimdT multModSimd(const SimdT a, const SimdT b, const SimdT p, const SimdT ip, const SimdT np) const{ using simd = Simd; using vect_t = typename simd::vect_t; vect_t abh, abl, pqh, pql; multSimd(a, b, abh, abl); vect_t q = simd::floor(simd::mul(abh, ip)); multSimd(p, q, pqh, pql); vect_t r = simd::add(simd::sub(abh, pqh), simd::sub(abl, pql)); abh = simd::greater_eq(r, p); abl = simd::lesser(r, simd::zero()); abh = simd::vand(abh, np); abl = simd::vand(abl, p); abh = simd::vor(abh, abl); return r = simd::add(r, abh); } inline void split(const double x, const int delta, double &x_h, double &x_l) const { double c = (double)((1_ui64 << delta) + 1_ui64); x_h = (c*x)+(x-(c*x)); x_l = x - x_h; } inline void mult(const double a, const double b, double &s, double &t) const{ double ah, al, bh, bl; s = a*b; #ifdef __FMA__ t = std::fma(-a, b, s); #else split(a, DELTA, ah, al); split(b, DELTA, bh, bl); t = ((((-s+ah*bh)+(ah*bl))+(al*bh))+(al*bl)); #endif } inline double multmod(const double a, const double b, const double p, const double ip, const double np) const{ double abh, abl, pqh, pql; mult(a, b, abh, abl); double q = floor(abh*ip); mult(p, q, pqh, pql); double r = (abh-pqh)+(abl-pql); if(r > p) r -= p; else if(r < 0) r += p; return r; } void reduce_modp_rnsmajor_scal_quad(size_t n, Element_ptr B) const { // std::cout << "modp scalar quad" << std::endl; // using namespace modp_details; using simd = Simd; using vect_t = typename simd::vect_t; FFLAS::Timer T; size_t _size= _rns->_size; Givaro::ZRing D; std::vector> Fields; for(size_t i = 0 ; i < _size ; ++i){ Fields.emplace_back(_rns->_basis[i]); } /* if((int64_t)B._ptr%simd::alignment == 0 && _size%simd::vect_size==0){ for(size_t j = 0 ; j < n ; ++j){ BasisElement *A, *Gamma, *tabTmp; A = FFLAS::fflas_new(_size, 
Alignment::CACHE_LINE); Gamma = FFLAS::fflas_new(_size, Alignment::CACHE_LINE); tabTmp = FFLAS::fflas_new(_size, Alignment::CACHE_LINE); vect_t vA, vB, vG, vp, vnp, vip, vRNS, vtmp; // Compute Gamma for(size_t i = 0 ; i < _size ; i+= simd::vect_size){ vB = simd::load(B._ptr+j*_size+i); vp = simd::load(_rns->_basis.data()+i); vip = simd::load(_rns->_invbasis.data()+i); vnp = simd::load(_rns->_negbasis.data()+i); vRNS = simd::load(_rns->_MMi.data()+i); vG = multModSimd(vB, vRNS, vp, vip, vnp); simd::store(Gamma+i, vG); } // Compute A=Gamma*Mi in rns for(size_t k = 0 ; k < _size ; ++k){ for(size_t i = 0 ; i < _size ; i+= simd::vect_size){ vG = simd::load(Gamma+i); vp = simd::set1(_rns->_basis[k]); vip = simd::set1(_rns->_invbasis[k]); vnp = simd::set1(_rns->_negbasis[k]); vRNS = simd::load(_Mi_modp_rns.data()+k*_size+i); vtmp = multModSimd(vG, vRNS, vp, vip, vnp); simd::store(tabTmp+i, vtmp); } for(size_t i = 0 ; i < _size ; ++i){ Fields[k].addin(A[k], tabTmp[i]); } } double alpha = 0; for(size_t k = 0 ; k < _size ; ++k){ alpha += Gamma[k]*_rns->_invbasis[k]; } // -= alpha long aa= (long)rint(alpha); for(size_t k = 0; k < _size ; k++){ Fields[k].sub(B._ptr[j*_size+k], A[k], _iM_modp_rns[aa+k*_size]); } FFLAS::fflas_delete(Gamma); FFLAS::fflas_delete(A); FFLAS::fflas_delete(tabTmp); } }else{ for(size_t j = 0 ; j < n ; ++j){ BasisElement *A, *Gamma, *tabTmp; A = FFLAS::fflas_new(_size, Alignment::CACHE_LINE); Gamma = FFLAS::fflas_new(_size, Alignment::CACHE_LINE); tabTmp = FFLAS::fflas_new(_size, Alignment::CACHE_LINE); vect_t vA, vB, vG, vp, vnp, vip, vRNS, vtmp; // Compute Gamma size_t i = 0; for(; i < ROUND_DOWN(_size, simd::vect_size) ; i+= simd::vect_size){ vB = simd::load(B._ptr+j*_size+i); vp = simd::load(_rns->_basis.data()+i); vip = simd::load(_rns->_invbasis.data()+i); vnp = simd::load(_rns->_negbasis.data()+i); vRNS = simd::load(_rns->_MMi.data()+i); vG = multModSimd(vB, vRNS, vp, vip, vnp); simd::store(Gamma+i, vG); } for(; i < _size ; ++i){ 
Fields[i].mul(Gamma[i], B._ptr[j*_size+i], _rns->_MMi[i]); } // Compute A=Gamma*Mi in rns for(size_t k = 0 ; k < _size ; ++k){ i = 0; A[k] = 0; for( ; i < ROUND_DOWN(_size, simd::vect_size); i+= simd::vect_size){ vG = simd::load(Gamma+i); vp = simd::set1(_rns->_basis[k]); vip = simd::set1(_rns->_invbasis[k]); vnp = simd::set1(_rns->_negbasis[k]); vRNS = simd::load(_Mi_modp_rns.data()+k*_size+i); vtmp = multModSimd(vG, vRNS, vp, vip, vnp); simd::store(tabTmp+i, vtmp); } for(; i < _size ; ++i){ Fields[k].mul(tabTmp[i], Gamma[i], _Mi_modp_rns[i]); } for(size_t i = 0 ; i < _size ; ++i){ Fields[k].addin(A[k], tabTmp[i]); } } double alpha = 0; for(size_t k = 0 ; k < _size ; ++k){ alpha += Gamma[k]*_rns->_invbasis[k]; } // -= alpha long aa= (long)rint(alpha); for(size_t k = 0; k < _size ; k++){ Fields[k].sub(B._ptr[j*_size+k], A[k], _iM_modp_rns[aa+k*_size]); } FFLAS::fflas_delete(Gamma); FFLAS::fflas_delete(A); FFLAS::fflas_delete(tabTmp); } } //*/ //* #pragma omp parallel for schedule(static, 256) for(size_t i = 0 ; i < n; ++i){ double* Ad; BasisElement *Gamma; Gamma = FFLAS::fflas_new(_size); Ad = FFLAS::fflas_new(_size); // Compute Gamma // std::cout << "B: " << std::endl; // for(size_t j = 0 ; j < _size ; ++j){ // std::cout << B._ptr[i*_size+j] << " "; // } // std::cout << std::endl; for(size_t k = 0; k < _size ; ++k){ Fields[k].mul(Gamma[k], B._ptr[i*_size+k], _rns->_MMi[k]); } // std::cout << "Gamma: " << std::endl; // for(size_t j = 0 ; j < _size ; ++j){ // std::cout << Gamma[j] << " "; // } // std::cout << std::endl; // std::cout << "MMi: " << std::endl; // for(size_t j = 0 ; j < _size ; ++j){ // std::cout << _rns->_MMi[j] << " "; // } // std::cout << std::endl; // FFLAS::fgemm(D,FFLAS::FflasNoTrans,FFLAS::FflasTrans, n, _size, _size, D.one, Gamma, _size, _Mi_modp_rns.data(), _size, D.zero, A, _size); // Mul by Mi_modp for(size_t k = 0 ; k < _size ; ++k){ Ad[k] = FFLAS::fdot(Fields[k], _size, Gamma, 1, _Mi_modp_rns.data()+k*_size,1); } // std::cout << 
"_Mi_modp_rns: " << std::endl; // std::cout << "["; // for(size_t j = 0 ; j < _size ; ++j){ // std::cout << "["; // for(size_t k = 0 ; k < _size-1 ; ++k){ // std::cout << _Mi_modp_rns[j*_size+k] << " ,"; // } // std::cout << _Mi_modp_rns[j*_size+_size-1] << "],"; // } // std::cout << "]" << std::endl; // std::cout << "Ad: " << std::endl; // for(size_t j = 0 ; j < _size ; ++j){ // std::cout << Ad[j] << " "; // } // std::cout << std::endl; // std::cout << "_iM_modp_rns: " << std::endl; // std::cout << "["; // for(size_t j = 0 ; j < _size ; ++j){ // std::cout << "["; // for(size_t k = 0 ; k < _size-1 ; ++k){ // std::cout << _iM_modp_rns[j*_size+k] << " ,"; // } // std::cout << _iM_modp_rns[j*_size+_size-1] << "],"; // } // std::cout << std::endl; // compute alpha // FFLAS::fgemv(D,FFLAS::FflasNoTrans, n, _size, D.one, Gamma, _size, _rns->_invbasis.data(), 1 , D.zero, alpha, 1); double alpha = 0; for(size_t k = 0 ; k < _size ; ++k){ alpha += Gamma[k]*_rns->_invbasis[k]; } // std::cout << "alpha: " << alpha << std::endl; // -= alpha // long aa= (int64_t)alpha; long aa= (long)rint(alpha); //std::cout << "aa: " << aa << std::endl; for(size_t k = 0; k < _size ; k++){ // std::cout << Ad[k] << " - " << _iM_modp_rns[aa+k*_size] << " = "; Fields[k].sub(B._ptr[i*_size+k], Ad[k], _iM_modp_rns[aa+k*_size]); // std::cout<reduce(n,B._ptr,1,true); } #endif // __DLP_CHALLENGE void reduce_modp_rnsmajor(size_t n, Element_ptr B) const{ // std::cout << "modp BLAS" << std::endl; #ifdef BENCH_MODP FFLAS::Timer chrono; chrono.start(); #endif size_t _size= _rns->_size; BasisElement *Gamma, *alpha, *A; A=B._ptr; Givaro::ZRing D; FFLAS::Timer T; // T.start(); Gamma = FFLAS::fflas_new(D,n,_size); alpha = FFLAS::fflas_new(D,n,1); // T.stop(); // std::cout << "Alloc: " << T << std::endl; // compute Gamma (NOT EFFICIENT) //for(size_t i=0;i<_size;i++) // // FFLAS::fscal(_rns->_field_rns[i], n, _rns->_MMi[i], A+i, _size, Gamma+i,_size); T.start(); #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS using 
simd = Simd; using vect_t = typename simd::vect_t; if(((int64_t)A%simd::alignment == 0) && (_size%simd::vect_size==0)){ auto MMi = _rns->_MMi.data(); for(size_t i = 0 ; i < n ; ++i){ vect_t vA1, vA2, vMi1, vMi2, tmp1, tmp2, tmp3, v, max, basis, inv_, neg_; size_t k = 0; for( ; k < ROUND_DOWN(_size, simd::vect_size) ; k+=simd::vect_size){ basis = simd::load(_rns->_basis.data()+k); inv_ = simd::load(_rns->_invbasis.data()+k); max = simd::load(_rns->_basisMax.data()+k); neg_ = simd::load(_rns->_negbasis.data()+k); vA1 = simd::load(A+i*_size+k); vMi1 = simd::load(MMi+k); v = simd::mul(vA1, vMi1); tmp1 = simd::floor(simd::mul(v, inv_)); tmp2 = simd::fnmadd(v, tmp1, basis); tmp1 = simd::greater(tmp2, max); tmp3 = simd::lesser(tmp2, simd::zero()); tmp1 = simd::vand(tmp1, neg_); tmp3 = simd::vand(tmp3, basis); tmp1 = simd::vor(tmp1, tmp3); tmp2 = simd::add(tmp2, tmp1); simd::store(Gamma+i*_size+k, tmp2); } } }else{ vect_t vA1, vA2, vMi1, vMi2, tmp1, tmp2, tmp3, v, max, basis, inv_, neg_; auto MMi = _rns->_MMi.data(); for(size_t i = 0 ; i < n ; ++i){ size_t k = 0; for( ; k < ROUND_DOWN(_size, simd::vect_size) ; k+=simd::vect_size){ basis = simd::load(_rns->_basis.data()+k); inv_ = simd::load(_rns->_invbasis.data()+k); max = simd::load(_rns->_basisMax.data()+k); neg_ = simd::load(_rns->_negbasis.data()+k); vA1 = simd::loadu(A+i*_size+k); vMi1 = simd::loadu(MMi+k); v = simd::mul(vA1, vMi1); tmp1 = simd::floor(simd::mul(v, inv_)); tmp2 = simd::fnmadd(v, tmp1, basis); tmp1 = simd::greater(tmp2, max); tmp3 = simd::lesser(tmp2, simd::zero()); tmp1 = simd::vand(tmp1, neg_); tmp3 = simd::vand(tmp3, basis); tmp1 = simd::vor(tmp1, tmp3); tmp2 = simd::add(tmp2, tmp1); simd::storeu(Gamma+i*_size+k, tmp2); } for(; k < _size ; ++k){ Gamma[i*_size+k] = A[i*_size+k]*MMi[k]; Gamma[i*_size+k] -= std::floor(Gamma[i*_size+k]*_rns->_invbasis[k])*_rns->_basis[k]; if(Gamma[i*_size+k] >= _rns->_basis[k]){ Gamma[i*_size+k] -= _rns->_basis[k]; }else if(Gamma[i*_size+k] < 0){ Gamma[i*_size+k] += 
_rns->_basis[k]; } } } } // _rns->reduce(n,Gamma,1,true); #else typename RNS::Element mmi(const_cast(_rns->_MMi.data()),1); FFLAS::fscal(_RNSdelayed, n, mmi, B, 1, typename RNS::Element_ptr(Gamma,1), 1); #endif T.stop(); // std::cout << "Gamma: " << T << std::endl; // compute A = Gamma._Mi_modp_rns^T (note must be reduced mod m_i, but this is postpone to the end) T.start(); FFLAS::fgemm(D,FFLAS::FflasNoTrans,FFLAS::FflasTrans, n, _size, _size, D.one, Gamma, _size, _Mi_modp_rns.data(), _size, D.zero, A, _size); T.stop(); // std::cout << "fgemm: " << T << std::endl; // std::cout<<"fgemv (Y)..."; //std::cout<<"fgemv (Y)..."< "<<_size<_invbasis.data(), 1 , D.zero, alpha, 1); T.stop(); // std::cout << "fgemv: " << T << std::endl; //std::cout<<"done"<reduce(n,A,1,true); T.stop(); // std::cout << "reduce: "<< T << std::endl; // T.start(); FFLAS::fflas_delete(Gamma); FFLAS::fflas_delete(alpha); // T.stop(); // std::cout << "delete: " << T << std::endl; #ifdef BENCH_MODP chrono.stop(); t_modp+=chrono.usertime(); #endif } }; // end of class RNSIntegerMod } // end of namespace FFPACK namespace FFLAS { // specialization for the fflas alloc function template<> inline FFPACK::rns_double_elt_ptr fflas_new(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, const Alignment align){ return FFPACK::rns_double_elt_ptr(FFLAS::fflas_new(m*n*F.size(),align),m*n); } // function to convert from integer to RNS (note: this is not the finit function from FFLAS, extra k) template void finit_rns(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, size_t k, const Givaro::Integer *B, const size_t ldb, typename RNS::Element_ptr A) { F.rns().init(m,n,A._ptr,A._stride, B,ldb,k); } template void finit_trans_rns(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, size_t k, const Givaro::Integer *B, const size_t ldb, typename RNS::Element_ptr A) { F.rns().init_transpose(m,n,A._ptr,A._stride, B,ldb,k); } // function to convert from RNS to integer (note: this is not 
the fconvert function from FFLAS, extra alpha) template void fconvert_rns(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, Givaro::Integer alpha, Givaro::Integer *B, const size_t ldb, typename RNS::ConstElement_ptr A) { F.rns().convert(m,n,alpha,B,ldb,A._ptr,A._stride); } template void fconvert_trans_rns(const FFPACK::RNSIntegerMod &F, const size_t m, const size_t n, Givaro::Integer alpha, Givaro::Integer *B, const size_t ldb, typename RNS::ConstElement_ptr A) { F.rns().convert_transpose(m,n,alpha,B,ldb,A._ptr,A._stride); } } // end of namespace FFLAS #undef DELTA #endif fflas-ffpack-2.2.2/fflas-ffpack/field/rns-integer.h000066400000000000000000000122001274716147400220640ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Pascal Giorgi * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /*! 
@file field/rns-integer.h * @ingroup field * @brief representation of Z using RNS representation (note: fixed precision) */ #ifndef __FFPACK_unparametric_rns_integer_H #define __FFPACK_unparametric_rns_integer_H #include #include "fflas-ffpack/field/rns-double.h" namespace FFPACK { template class RNSInteger { protected: const RNS *_rns; // the rns structure typedef typename RNS::BasisElement BasisElement; typedef Givaro::Integer integer; public: typedef typename RNS::Element Element; typedef typename RNS::Element_ptr Element_ptr; typedef typename RNS::ConstElement_ptr ConstElement_ptr; Element one, mOne,zero; RNSInteger(const RNS& myrns) : _rns(&myrns) { init(one,1); init(zero,0); init(mOne,-1); } template RNSInteger(const T &F) : _rns(&(F.rns())) { init(one,1); init(zero,0); init(mOne,-1); } const RNS& rns() const {return *_rns;} size_t size() const {return _rns->_size;} bool isOne(const Element& x) const { bool isone=true; for (size_t i=0;i<_rns->_size;i++) isone&= (one._ptr[i]== x._ptr[i]); return isone; } bool isMOne(const Element& x) const { bool ismone=true; for (size_t i=0;i<_rns->_size;i++) ismone&= (mOne._ptr[i]== x._ptr[i]); return ismone; } bool isZero(const Element& x) const { bool iszero=true; for (size_t i=0;i<_rns->_size;i++) iszero&= (zero._ptr[i]== x._ptr[i]); return iszero; } integer characteristic(integer &p) const { return p=0;} integer cardinality(integer &p) const { return p=-1;} Element& init(Element& x) const{ if (x._ptr == NULL){ x._ptr = FFLAS::fflas_new(_rns->_size); x._stride=1; x._alloc=true; } return x; } Element& init(Element& x, const Givaro::Integer& y) const{ init(x); size_t k =(y.bitsize())/16+((y.bitsize())%16?1:0); _rns->init(1,1,x._ptr,x._stride, &y,1,k); return x; } Element& reduce (Element& x, const Element& y) const {return assign (x,y);} Element& reduce (Element& x) const {return x;} Givaro::Integer convert(Givaro::Integer& x, const Element& y)const { _rns->convert(1,1,integer(0),&x,1,y._ptr,y._stride); return x; } Element& 
assign(Element& x, const Element& y) const { for(size_t i=0;i<_rns->_size;i++) x._ptr[i*x._stride] = y._ptr[i*y._stride]; return x; } std::ostream& write(std::ostream& os, const Element& y) const { os<<"[ "<< (long) (y._ptr)[0]; for(size_t i=1;i<_rns->_size;i++) os<<" , "<< (long) (y._ptr)[i*y._stride]; return os<<" ]"; } std::ostream& write(std::ostream& os) const { os<<"M:=[ "<< (long) _rns->_basis[0]; for(size_t i=1;i<_rns->_size;i++) os<<" , "<< (long) _rns->_basis[i]; return os<<" ]"< } // end of namespace FFPACK namespace FFLAS { // specialization for the fflas alloc function template<> inline FFPACK::rns_double_elt_ptr fflas_new(const FFPACK::RNSInteger &F, const size_t m, const size_t n, const Alignment align){ double *ptr=FFLAS::fflas_new(m*n*F.size(), align); return FFPACK::rns_double_elt_ptr(ptr,m*n); } // function to convert from integer to RNS (note: this is not the finit function from FFLAS, extra k) template void finit_rns(const FFPACK::RNSInteger &F, const size_t m, const size_t n, size_t k, const Givaro::Integer *B, const size_t ldb, typename FFPACK::RNSInteger::Element_ptr A) { F.rns().init(m,n,A._ptr,A._stride, B,ldb,k); } // function to convert from RNS to integer (note: this is not the fconvert function from FFLAS, extra alpha) template void fconvert_rns(const FFPACK::RNSInteger &F, const size_t m, const size_t n, Givaro::Integer alpha, Givaro::Integer *B, const size_t ldb, typename FFPACK::RNSInteger::ConstElement_ptr A) { F.rns().convert(m,n,alpha,B,ldb,A._ptr,A._stride); } } // end of namespace FFLAS #endif // __FFPACK_unparametric_rns_integer_H fflas-ffpack-2.2.2/fflas-ffpack/field/rns.h000066400000000000000000000026101274716147400204350ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS group * * Written by Brice Boyer (briceboyer) * * ========LICENCE======== * This file is part of the library 
FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ /*! @file field/rns.h * @ingroup field * @defgroup rns RNS * @brief just include them all */ #ifndef __FFLASFFPACK_field_rns_H #define __FFLASFFPACK_field_rns_H namespace FFPACK{ template class RNSInteger; template class RNSIntegerMod; } #endif // __FFLASFFPACK_field_rns_H fflas-ffpack-2.2.2/fflas-ffpack/field/rns.inl000066400000000000000000000024071274716147400207740ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS group * * Written by Pascal Giorgi * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #ifndef __FFLASFFPACK_field_rns_INL #define __FFLASFFPACK_field_rns_INL #include "rns-double.h" #include "rns-integer.h" #include "rns-integer-mod.h" #endif // __FFLASFFPACK_field_rns_INL fflas-ffpack-2.2.2/fflas-ffpack/interfaces/000077500000000000000000000000001274716147400205235ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/interfaces/Makefile.am000066400000000000000000000017571274716147400225710ustar00rootroot00000000000000# Copyright (c) 2010 the LinBox group # Written by Brice Boyer (briceboyer) # ========LICENCE======== # This file is part of the library LinBox. # # LinBox is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== pkgincludesubdir=$(pkgincludedir)/interfaces SUBDIRS=libs EXTRA_DIST=interfaces.doxy fflas-ffpack-2.2.2/fflas-ffpack/interfaces/interfaces.doxy000066400000000000000000000021351274716147400235540ustar00rootroot00000000000000// Copyright (c) 2011 FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part of the library FFLAS-FFPACK. 
// // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** \ingroup fflasffpack * \defgroup interfaces Interfaces * * \brief Intefaces for FFLAS-FFPACK * * C interface in folder @see libs */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/000077500000000000000000000000001274716147400214545ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/Makefile.am000066400000000000000000000054141274716147400235140ustar00rootroot00000000000000# Copyright (c) 2010 the LinBox group # Written by Brice Boyer (briceboyer) # ========LICENCE======== # This file is part of the library LinBox. # # LinBox is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== if FFLASFFPACK_PRECOMPILED pkgincludesubdir=$(pkgincludedir)/interfaces/libs AM_CPPFLAGS=-I$(top_srcdir) AM_CXXFLAGS = @DEFAULT_CFLAGS@ AM_CPPFLAGS += $(OPTFLAGS) -I$(top_srcdir)/fflas-ffpack/utils/ -I$(top_srcdir)/fflas-ffpack/fflas/ -I$(top_srcdir)/fflas-ffpack/ffpack -I$(top_srcdir)/fflas-ffpack/field $(GIVARO_CFLAGS) $(CBLAS_FLAG) $(CUDA_CFLAGS) $(PARFLAGS) LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARFLAGS) #AM_LDFLAGS=-static pkgincludesub_HEADERS=fflas_c.h \ ffpack_c.h \ fflas_L3_inst.h \ fflas_L3_inst_implem.inl \ fflas_L2_inst.h \ fflas_L2_inst_implem.inl \ fflas_L1_inst.h \ fflas_L1_inst_implem.inl \ ffpack_inst.h \ ffpack_inst_implem.inl lib_LTLIBRARIES=libfflas.la \ libffpack.la \ libfflas_c.la \ libffpack_c.la libfflas_la_SOURCES= fflas_L1_inst.C \ fflas_L1_inst_implem.inl\ fflas_L2_inst.C \ fflas_L2_inst_implem.inl \ fflas_L3_inst.C \ fflas_L3_inst_implem.inl libfflas_la_LDFLAGS= $(LDADD) -version-info 1:0:0 \ -no-undefined libffpack_la_SOURCES= ffpack_inst.C \ ffpack_inst_implem.inl libffpack_la_LDFLAGS= $(LDADD) -version-info 1:0:0 \ -no-undefined -lfflas EXTRA_libffpack_la_DEPENDENCIES= libfflas.la libfflas_c_la_SOURCES=fflas_lvl1.C \ fflas_lvl2.C \ fflas_lvl3.C \ fflas_sparse.C #libfflas_c_la_CPPFLAGS=$(AM_CPPFLAGS) -DFFLAS_COMPILED -DFFPACK_COMPILED libfflas_c_la_LDFLAGS= $(LDADD) -version-info 1:0:0 \ -no-undefined -lfflas EXTRA_libfflas_c_la_DEPENDENCIES=libfflas.la libffpack_c_la_SOURCES=ffpack.C #libffpack_c_la_CPPFLAGS=$(AM_CPPFLAGS) -DFFLAS_COMPILED -DFFPACK_COMPILED libffpack_c_la_LDFLAGS= $(LDADD) -version-info 1:0:0 \ -no-undefined -lfflas -lffpack EXTRA_libffpack_c_la_DEPENDENCIES=libffpack.la EXTRA_DIST=c_libs.doxy endif 
fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/c_libs.doxy000066400000000000000000000023471274716147400236220ustar00rootroot00000000000000// Copyright (c) 2011 FFLAS-FFPACK // written by Brice Boyer (briceboyer) // // ========LICENCE======== // This file is part of the library FFLAS-FFPACK. // // FFLAS-FFPACK is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // ========LICENCE======== // /** \ingroup interfaces * * \brief C library intefaces for FFLAS-FFPACK * * Routines will look like their C++ counterpart : * * freduce(Modular,m,n, double *) * * becomes * * freduce_modular_double(p,m,n, double *,positive) * */ // vim:syn=doxygen fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L1_inst.C000066400000000000000000000037421274716147400242520ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_L1_inst.h * Copyright (C) 2015 FFLAS-FFPACK group * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include "fflas-ffpack/fflas-ffpack-config.h" #ifndef __FFLAS_L1_INST_C #define __FFLAS_L1_INST_C #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas.h" #include "fflas_helpers.inl" #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif // __FFLAS_L1_INST_C fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L1_inst.h000066400000000000000000000037331274716147400243170ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_L1_inst.h * Copyright (C) 2015 FFLAS-FFPACK group * 
Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLAS_L1_INST_H #define __FFLAS_L1_INST_H #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas/fflas_helpers.inl" #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL <> #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L1_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif //__FFLAS_L1_INST_H fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L1_inst_implem.inl000066400000000000000000000240551274716147400262150ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // 
vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014,2015 the FFLAS-FFPACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ namespace FFLAS { //--------------------------------------------------------------------- // Level 1 routines //--------------------------------------------------------------------- /** freduce * \f$x \gets x mod F\f$. * @param F field * @param n size of the vectors * \param X vector in \p F * \param incX stride of \p X * @bug use cblas_(d)scal when possible */ template INST_OR_DECL void freduce (const FFLAS_FIELD& F, const size_t n, FFLAS_ELT* X, const size_t incX); /** freduce * \f$x \gets y mod F\f$. * @param F field * @param n size of the vectors * \param Y vector of \p Element * \param incY stride of \p Y * \param X vector in \p F * \param incX stride of \p X * @bug use cblas_(d)scal when possible */ template INST_OR_DECL void freduce (const FFLAS_FIELD& F, const size_t n, const FFLAS_ELT* Y, const size_t incY, FFLAS_ELT* X, const size_t incX); /** finit * \f$x \gets y mod F\f$. 
* @param F field * @param n size of the vectors * \param Y vector of \p OtherElement * \param incY stride of \p Y * \param X vector in \p F * \param incX stride of \p X * @bug use cblas_(d)scal when possible */ template INST_OR_DECL void finit (const FFLAS_FIELD& F, const size_t n, const FFLAS_ELT* Y, const size_t incY, FFLAS_ELT* X, const size_t incX); /** fconvert * \f$x \gets y mod F\f$. * @param F field * @param n size of the vectors * \param Y vector of \p F * \param incY stride of \p Y * \param X vector in \p OtherElement * \param incX stride of \p X * @bug use cblas_(d)scal when possible */ template INST_OR_DECL void fconvert (const FFLAS_FIELD& F, const size_t n, FFLAS_ELT* X, const size_t incX, const FFLAS_ELT* Y, const size_t incY); // { // OtherElement_ptr Xi = X ; // const FFLAS_ELT* Yi = Y ; // for (; Xi < X+n*incX; Xi+=incX, Yi += incY ) // F.convert( *Xi , *Yi); // } /** fnegin * \f$x \gets - x\f$. * @param F field * @param n size of the vectors * \param X vector in \p F * \param incX stride of \p X * @bug use cblas_(d)scal when possible */ template INST_OR_DECL void fnegin (const FFLAS_FIELD& F, const size_t n, FFLAS_ELT* X, const size_t incX); // { // FFLAS_ELT* Xi = X ; // for (; Xi < X+n*incX; Xi+=incX ) // F.negin( *Xi ); // } /** fneg * \f$x \gets - y\f$. * @param F field * @param n size of the vectors * \param X vector in \p F * \param incX stride of \p X * \param Y vector in \p F * \param incY stride of \p Y * @bug use cblas_(d)scal when possible */ template INST_OR_DECL void fneg (const FFLAS_FIELD& F, const size_t n, const FFLAS_ELT* Y, const size_t incY, FFLAS_ELT* X, const size_t incX); // { // FFLAS_ELT* Xi = X ; // const FFLAS_ELT* Yi = Y ; // for (; Xi < X+n*incX; Xi+=incX,Yi+=incY ) // F.neg( *Xi, *Yi ); // } /** \brief fzero : \f$A \gets 0 \f$. 
* @param F field * @param n number of elements to zero * \param X vector in \p F * \param incX stride of \p X */ template INST_OR_DECL void fzero (const FFLAS_FIELD& F, const size_t n, FFLAS_ELT* X, const size_t incX); // { // if (incX == 1) { // contigous data // // memset(X,(int)F.zero,n); // might be bogus ? // for (size_t i = 0 ; i < n ; ++i) // F.assign(*(X+i), F.zero); // } // else { // not contiguous (strided) // for (size_t i = 0 ; i < n ; ++i) // F.assign(*(X+i*incX), F.zero); // } // } /** \brief fiszero : test \f$X = 0 \f$. * @param F field * @param n vector dimension * \param X vector in \p F * \param incX increment of \p X */ template INST_OR_DECL bool fiszero (const FFLAS_FIELD& F, const size_t n, const FFLAS_ELT* X, const size_t incX); // { // bool res=true; // for (size_t i = 0 ; i < n ; ++i) // res &= F.isZero (X [i*incX]); // return res; // } /** \brief fequal : test \f$X = Y \f$. * @param F field * @param n vector dimension * \param X vector in \p F * \param incX increment of \p X * \param Y vector in \p F * \param incY increment of \p Y */ template INST_OR_DECL bool fequal (const FFLAS_FIELD& F, const size_t n, const FFLAS_ELT* X, const size_t incX, const FFLAS_ELT* Y, const size_t incY); // { // bool res=true; // for (size_t i = 0 ; i < n ; ++i) // res &= F.areEqual (X [i*incX], Y [i*incY]); // return res; // } /** \brief fassign : \f$x \gets y \f$. * X is preallocated * @todo variant for triagular matrix * @param F field * @param N size of the vectors * \param [out] X vector in \p F * \param incX stride of \p X * \param [in] Y vector in \p F * \param incY stride of \p Y */ template INST_OR_DECL void fassign (const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT* Y, const size_t incY , FFLAS_ELT* X, const size_t incX); /** fscalin * \f$x \gets \alpha \cdot x\f$. 
* @param F field * @param n size of the vectors * @param alpha scalar * \param X vector in \p F * \param incX stride of \p X * @bug use cblas_(d)scal when possible * @internal * @todo check if comparison with +/-1,0 is necessary. */ template INST_OR_DECL void fscalin (const FFLAS_FIELD& F, const size_t n, const FFLAS_ELT alpha, FFLAS_ELT* X, const size_t incX); /** fscal * \f$y \gets \alpha \cdot x\f$. * @param F field * @param n size of the vectors * @param alpha scalar * \param[in] X vector in \p F * \param incX stride of \p X * \param[out] Y vector in \p F * \param incY stride of \p Y * @bug use cblas_(d)scal when possible * @internal * @todo check if comparison with +/-1,0 is necessary. */ template INST_OR_DECL void fscal (const FFLAS_FIELD& F, const size_t n , const FFLAS_ELT alpha , const FFLAS_ELT* X, const size_t incX , FFLAS_ELT* Y, const size_t incY); /** \brief faxpy : \f$y \gets \alpha \cdot x + y\f$. * @param F field * @param N size of the vectors * @param alpha scalar * \param[in] X vector in \p F * \param incX stride of \p X * \param[in,out] Y vector in \p F * \param incY stride of \p Y */ template INST_OR_DECL void faxpy (const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT alpha, const FFLAS_ELT* X, const size_t incX, FFLAS_ELT* Y, const size_t incY ); /** \brief faxpby : \f$y \gets \alpha \cdot x + \beta \cdot y\f$. * @param F field * @param N size of the vectors * @param alpha scalar * \param[in] X vector in \p F * \param incX stride of \p X * \param beta scalar * \param[in,out] Y vector in \p F * \param incY stride of \p Y * \note this is a catlas function */ // template INST_OR_DECL // void // faxpby (const FFLAS_FIELD& F, const size_t N, // const FFLAS_ELT alpha, // const FFLAS_ELT* X, const size_t incX, // const FFLAS_ELT beta, // FFLAS_ELT* Y, const size_t incY ); /** \brief fdot: dot product \f$x^T y\f$. 
* @param F field * @param N size of the vectors * \param X vector in \p F * \param incX stride of \p X * \param Y vector in \p F * \param incY stride of \p Y */ template INST_OR_DECL FFLAS_ELT fdot (const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT* X, const size_t incX, const FFLAS_ELT* Y, const size_t incY ); /** \brief fswap: \f$ X \leftrightarrow Y\f$. * @bug use cblas_dswap when double * @param F field * @param N size of the vectors * \param X vector in \p F * \param incX stride of \p X * \param Y vector in \p F * \param incY stride of \p Y */ template INST_OR_DECL void fswap (const FFLAS_FIELD& F, const size_t N, FFLAS_ELT* X, const size_t incX, FFLAS_ELT* Y, const size_t incY ); // { // FFLAS_ELT tmp; F.init(tmp); // FFLAS_ELT* Xi = X; // FFLAS_ELT* Yi=Y; // for (; Xi < X+N*incX; Xi+=incX, Yi+=incY ){ // F.assign( tmp, *Xi ); // F.assign( *Xi, *Yi ); // F.assign( *Yi, tmp ); // } // } template INST_OR_DECL void fadd (const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT* A, const size_t inca, const FFLAS_ELT* B, const size_t incb, FFLAS_ELT* C, const size_t incc); template INST_OR_DECL void fsub (const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT* A, const size_t inca, const FFLAS_ELT* B, const size_t incb, FFLAS_ELT* C, const size_t incc); template INST_OR_DECL void faddin (const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT* B, const size_t incb, FFLAS_ELT* C, const size_t incc); // template INST_OR_DECL // void // fsubin (const FFLAS_FIELD& F, const size_t N, // FFLAS_ELT* C, const size_t incc); template INST_OR_DECL void fadd (const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT* A, const size_t inca, const FFLAS_ELT alpha, const FFLAS_ELT* B, const size_t incb, FFLAS_ELT* C, const size_t incc); } // FFLAS fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L2_inst.C000066400000000000000000000037421274716147400242530ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // 
vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_L2_inst.h * Copyright (C) 2015 FFLAS-FFPACK group * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include "fflas-ffpack/fflas-ffpack-config.h" #ifndef __FFLAS_L2_INST_C #define __FFLAS_L2_INST_C #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas.h" #include "fflas_helpers.inl" #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif // __FFLAS_L2_INST_C 
fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L2_inst.h000066400000000000000000000037331274716147400243200ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_L2_inst.h * Copyright (C) 2015 FFLAS-FFPACK group * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLAS_L2_INST_H #define __FFLAS_L2_INST_H #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas/fflas_helpers.inl" #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL <> #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L2_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif //__FFLAS_L2_INST_H fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L2_inst_implem.inl000066400000000000000000000327431274716147400262210ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014,2015 the FFLAS-FFPACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ namespace FFLAS { //--------------------------------------------------------------------- // Level 2 routines //--------------------------------------------------------------------- /** \brief fassign : \f$A \gets B \f$. * @param F field * @param m number of rows to copy * @param n number of cols to copy * \param A matrix in \p F * \param lda stride of \p A * \param B vector in \p F * \param ldb stride of \p B */ template INST_OR_DECL void fassign (const FFLAS_FIELD& F, const size_t m, const size_t n, const FFLAS_ELT* B, const size_t ldb , FFLAS_ELT* A, const size_t lda ); /** \brief fzero : \f$A \gets 0 \f$. * @param F field * @param m number of rows to zero * @param n number of cols to zero * \param A matrix in \p F * \param lda stride of \p A * @warning may be buggy if Element is larger than int */ template INST_OR_DECL void fzero (const FFLAS_FIELD& F, const size_t m, const size_t n, FFLAS_ELT* A, const size_t lda); // { // /* use memset only with Elements that are ok */ // if (n == lda) { // contigous data // // memset(A,(int) F.zero,m*n); // might be bogus ? // fzero(F,m*n,A,1); // } // else { // not contiguous (strided) // for (size_t i = 0 ; i < m ; ++i) // // memset(A+i*lda,(int) F.zero,n) ; // might be bogus ? // fzero(F,n,A+i*lda,1); // } // } /** \brief fequal : test \f$A = B \f$. 
* @param F field * @param m row dimension * @param n column dimension * \param A m x n matrix in \p F * \param lda leading dimension of A * \param B m x n matrix in \p F * \param ldb leading dimension of B */ template INST_OR_DECL bool fequal (const FFLAS_FIELD& F, const size_t m, const size_t n, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* B, const size_t ldb); // { // bool res=true; // for (size_t i = 0 ; i < m ; ++i) // res &= fequal (F, n, A + i*lda, 1, B + i*ldb, 1); // return res; // } /** \brief fiszero : test \f$A = 0 \f$. * @param F field * @param m row dimension * @param n column dimension * \param A m x n matrix in \p F * \param lda leading dimension of A */ template INST_OR_DECL bool fiszero (const FFLAS_FIELD& F, const size_t m, const size_t n, const FFLAS_ELT* A, const size_t lda); // { // bool res=true; // for (size_t i = 0 ; i < m ; ++i) // res &= fiszero (F, n, A + i*lda, 1); // return res; // } //! creates a diagonal matrix template INST_OR_DECL void fidentity (const FFLAS_FIELD& F, const size_t m, const size_t n, FFLAS_ELT* A, const size_t lda, const FFLAS_ELT & d); // { // fzero(F,m,n,A,lda); // for (size_t i = 0 ; i < std::min(m,n) ; ++i) // F.assign(A[i*lda+i],d); // } //! creates a diagonal matrix template INST_OR_DECL void fidentity (const FFLAS_FIELD& F, const size_t m, const size_t n, FFLAS_ELT* A, const size_t lda); // { // fzero(F,m,n,A,lda); // for (size_t i = 0 ; i < std::min(m,n) ; ++i) // F.assign(A[i*lda+i],F.one); // } /** freduce * \f$A \gets A mod F\f$. * @param F field * @param m number of rows * @param n number of cols * \param A matrix in \p F * \param lda stride of \p A * @internal */ template INST_OR_DECL void freduce (const FFLAS_FIELD& F, const size_t m , const size_t n, FFLAS_ELT* A, const size_t lda); /** freduce * \f$A \gets B mod F\f$. 
* @param F field * @param m number of rows * @param n number of cols * \param A matrix in \p F * \param lda stride of \p A * \param B matrix in \p Element * \param ldb stride of \p B * @internal */ template INST_OR_DECL void freduce (const FFLAS_FIELD& F, const size_t m , const size_t n, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* A, const size_t lda); /** finit * \f$A \gets B mod F\f$. * @param F field * @param m number of rows * @param n number of cols * \param A matrix in \p F * \param lda stride of \p A * \param B matrix in \p F * \param ldb stride of \p B * @internal */ template INST_OR_DECL void finit (const FFLAS_FIELD& F, const size_t m , const size_t n, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* A, const size_t lda); /** fnegin * \f$A \gets - A\f$. * @param F field * @param m number of rows * @param n number of cols * \param A matrix in \p F * \param lda stride of \p A * @internal */ template INST_OR_DECL void fnegin (const FFLAS_FIELD& F, const size_t m , const size_t n, FFLAS_ELT* A, const size_t lda); // { // //!@todo check if n == lda // for (size_t i = 0 ; i < m ; ++i) // fnegin(F,n,A+i*lda,1); // return; // } /** fneg * \f$A \gets - B\f$. * @param F field * @param m number of rows * @param n number of cols * \param A matrix in \p F * \param lda stride of \p A * @internal */ template INST_OR_DECL void fneg (const FFLAS_FIELD& F, const size_t m , const size_t n, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* A, const size_t lda); // { // //!@todo check if n == lda // for (size_t i = 0 ; i < m ; ++i) // fneg(F,n,B+i*ldb,1,A+i*lda,1); // return; // } /** fscalin * \f$A \gets a \cdot A\f$. * @param F field * @param m number of rows * @param n number of cols * @param alpha homotecie scalar * \param A matrix in \p F * \param lda stride of \p A * @internal */ template INST_OR_DECL void fscalin (const FFLAS_FIELD& F, const size_t m , const size_t n, const FFLAS_ELT alpha, FFLAS_ELT* A, const size_t lda); /** fscal * \f$B \gets a \cdot A\f$. 
* @param F field * @param m number of rows * @param n number of cols * @param alpha homotecie scalar * \param[in] A matrix in \p F * \param lda stride of \p A * \param[out] B matrix in \p F * \param ldb stride of \p B * @internal */ template INST_OR_DECL void fscal (const FFLAS_FIELD& F, const size_t m , const size_t n, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* B, const size_t ldb); /** \brief faxpy : \f$y \gets \alpha \cdot x + y\f$. * @param F field * @param m row dimension * @param n column dimension * @param alpha scalar * \param[in] X vector in \p F * \param ldx leading dimension of \p X * \param[in,out] Y vector in \p F * \param ldy leading dimension of \p Y */ template INST_OR_DECL void faxpy (const FFLAS_FIELD& F, const size_t m, const size_t n , const FFLAS_ELT alpha, const FFLAS_ELT* X, const size_t ldx, FFLAS_ELT* Y, const size_t ldy ); /** \brief faxpby : \f$y \gets \alpha \cdot x + \beta \cdot y\f$. * @param F field * @param m row dimension * @param n column dimension * @param alpha scalar * \param[in] X vector in \p F * \param ldx leading dimension of \p X * \param beta scalar * \param[in,out] Y vector in \p F * \param ldy leading dimension of \p Y * \note this is a catlas function */ // template INST_OR_DECL // void // faxpby (const FFLAS_FIELD& F, const size_t m, const size_t n, // const FFLAS_ELT alpha, // const FFLAS_ELT* X, const size_t ldx, // const FFLAS_ELT beta, // FFLAS_ELT* Y, const size_t ldy ); /** \brief fmove : \f$A \gets B \f$ and \f$ B \gets 0\f$. * @param F field * @param m number of rows to copy * @param n number of cols to copy * \param A matrix in \p F * \param lda stride of \p A * \param B vector in \p F * \param ldb stride of \p B */ template INST_OR_DECL void fmove (const FFLAS_FIELD& F, const size_t m, const size_t n, FFLAS_ELT* A, const size_t lda, FFLAS_ELT* B, const size_t ldb ); // { // fassign(F,m,n,A,lda,B,ldb); // fzero(F,m,n,B,ldb); // } /** fadd : matrix addition. 
* Computes \p C = \p A + \p B. * @param F field * @param M rows * @param N cols * @param A dense matrix of size \c MxN * @param lda leading dimension of \p A * @param B dense matrix of size \c MxN * @param ldb leading dimension of \p B * @param C dense matrix of size \c MxN * @param ldc leading dimension of \p C */ template INST_OR_DECL void fadd (const FFLAS_FIELD& F, const size_t M, const size_t N, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* C, const size_t ldc); /** fsub : matrix subtraction. * Computes \p C = \p A - \p B. * @param F field * @param M rows * @param N cols * @param A dense matrix of size \c MxN * @param lda leading dimension of \p A * @param B dense matrix of size \c MxN * @param ldb leading dimension of \p B * @param C dense matrix of size \c MxN * @param ldc leading dimension of \p C */ template INST_OR_DECL void fsub (const FFLAS_FIELD& F, const size_t M, const size_t N, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* C, const size_t ldc); //! fsubin //! C = C - B template INST_OR_DECL void fsubin (const FFLAS_FIELD& F, const size_t M, const size_t N, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* C, const size_t ldc); /** fadd : matrix addition with scaling. * Computes \p C = \p A + alpha \p B. * @param F field * @param M rows * @param N cols * @param A dense matrix of size \c MxN * @param lda leading dimension of \p A * @param alpha some scalar * @param B dense matrix of size \c MxN * @param ldb leading dimension of \p B * @param C dense matrix of size \c MxN * @param ldc leading dimension of \p C */ template INST_OR_DECL void fadd (const FFLAS_FIELD& F, const size_t M, const size_t N, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT alpha, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* C, const size_t ldc); //! 
faddin template INST_OR_DECL void faddin (const FFLAS_FIELD& F, const size_t M, const size_t N, const FFLAS_ELT* B, const size_t ldb, FFLAS_ELT* C, const size_t ldc); /** @brief finite prime FFLAS_FIELD GEneral Matrix Vector multiplication. * * Computes \f$Y \gets \alpha \mathrm{op}(A) X + \beta Y \f$. * @param F field * \param TransA if \c TransA==FflasTrans then \f$\mathrm{op}(A)=A^t\f$. * @param M rows * @param N cols * @param alpha scalar * @param A dense matrix of size \c MxN * @param lda leading dimension of \p A * @param X dense vector of size \c N * @param incX stride of \p X * @param beta scalar * @param[out] Y dense vector of size \c M * @param incY stride of \p Y */ template INST_OR_DECL FFLAS_ELT* fgemv (const FFLAS_FIELD& F, const FFLAS_TRANSPOSE TransA, const size_t M, const size_t N, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* X, const size_t incX, const FFLAS_ELT beta, FFLAS_ELT* Y, const size_t incY); /** @brief fger: rank one update of a general matrix * * Computes \f$A \gets \alpha x . 
y^T + A\f$ * @param F field * @param M rows * @param N cols * @param alpha scalar * @param[in,out] A dense matrix of size \c MxN and leading dimension \p lda * @param lda leading dimension of \p A * @param x dense vector of size \c M * @param incx stride of \p X * @param y dense vector of size \c N * @param incy stride of \p Y */ template INST_OR_DECL void fger (const FFLAS_FIELD& F, const size_t M, const size_t N, const FFLAS_ELT alpha, const FFLAS_ELT* x, const size_t incx, const FFLAS_ELT* y, const size_t incy, FFLAS_ELT* A, const size_t lda); /** @brief ftrsv: TRiangular System solve with Vector * Computes \f$ X \gets \mathrm{op}(A^{-1}) X\f$ * @param F field * @param X vector of size \p N on a field \p F * @param incX stride of \p X * @param A a matrix of leading dimension \p lda and size \p N * @param lda leading dimension of \p A * @param N number of rows or columns of \p A according to \p TransA * \param TransA if \c TransA==FflasTrans then \f$\mathrm{op}(A)=A^t\f$. * \param Diag if \c Diag==FflasUnit then \p A is unit. * \param Uplo if \c Uplo==FflasUpper then \p A is upper triangular */ template INST_OR_DECL void ftrsv (const FFLAS_FIELD& F, const FFLAS_UPLO Uplo, const FFLAS_TRANSPOSE TransA, const FFLAS_DIAG Diag, const size_t N,const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* X, int incX); } // FFLAS fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L3_inst.C000066400000000000000000000037411274716147400242530ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_L3_inst.h * Copyright (C) 2015 FFLAS-FFPACK group * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include "fflas-ffpack/fflas-ffpack-config.h" #ifndef __FFLAS_L3_INST_C #define __FFLAS_L3_INST_C #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas.h" #include "fflas_helpers.inl" #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif // __FFLAS_L3_INST_C fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L3_inst.h000066400000000000000000000037331274716147400243210ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas_L3_inst.h * Copyright (C) 2015 FFLAS-FFPACK group * 
Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLAS_L3_INST_H #define __FFLAS_L3_INST_H #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas/fflas_helpers.inl" #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL <> #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "fflas_L3_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif //__FFLAS_L3_INST_H fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_L3_inst_implem.inl000066400000000000000000000165071274716147400262220ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // 
vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ namespace FFLAS { //--------------------------------------------------------------------- // Level 3 routines //--------------------------------------------------------------------- // set by default for ftrsm to be thread safe // undef it at your own risk, and only if you run it in sequential #define __FFLAS__TRSM_READONLY /** @brief ftrsm: TRiangular System solve with Matrix. * Computes \f$ B \gets \alpha \mathrm{op}(A^{-1}) B\f$ or \f$B \gets \alpha B \mathrm{op}(A^{-1})\f$. * \param F field * \param Side if \c Side==FflasLeft then \f$ B \gets \alpha \mathrm{op}(A^{-1}) B\f$ is computed. * \param Uplo if \c Uplo==FflasUpper then \p A is upper triangular * \param TransA if \c TransA==FflasTrans then \f$\mathrm{op}(A)=A^t\f$. * \param Diag if \c Diag==FflasUnit then \p A is unit. * \param M rows of \p B * \param N cols of \p B * @param alpha scalar * \param A triangular invertible matrix. 
If \c Side==FflasLeft then \p A is \f$N\times N\f$, otherwise \p A is \f$M\times M\f$ * @param lda leading dim of \p A * @param B matrix of size \p MxN * @param ldb leading dim of \p B * @bug \f$\alpha\f$ must be non zero. */ template INST_OR_DECL void ftrsm (const FFLAS_FIELD & F, const FFLAS_SIDE Side, const FFLAS_UPLO Uplo, const FFLAS_TRANSPOSE TransA, const FFLAS_DIAG Diag, const size_t M, const size_t N, const FFLAS_ELT alpha, #ifdef __FFLAS__TRSM_READONLY const FFLAS_ELT* A, #else FFLAS_ELT* A, #endif const size_t lda, FFLAS_ELT* B, const size_t ldb); /** @brief ftrmm: TRiangular Matrix Multiply. * Computes \f$ B \gets \alpha \mathrm{op}(A) B\f$ or \f$B \gets \alpha B \mathrm{op}(A)\f$. * @param F field * \param Side if \c Side==FflasLeft then \f$ B \gets \alpha \mathrm{op}(A) B\f$ is computed. * \param Uplo if \c Uplo==FflasUpper then \p A is upper triangular * \param TransA if \c TransA==FflasTrans then \f$\mathrm{op}(A)=A^t\f$. * \param Diag if \c Diag==FflasUnit then \p A is implicitly unit. * \param M rows of \p B * \param N cols of \p B * @param alpha scalar * \param A triangular matrix. If \c Side==FflasLeft then \p A is \f$N\times N\f$, otherwise \p A is \f$M\times M\f$ * @param lda leading dim of \p A * @param B matrix of size \p MxN * @param ldb leading dim of \p B */ template INST_OR_DECL void ftrmm (const FFLAS_FIELD & F, const FFLAS_SIDE Side, const FFLAS_UPLO Uplo, const FFLAS_TRANSPOSE TransA, const FFLAS_DIAG Diag, const size_t M, const size_t N, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* B, const size_t ldb); /** @brief fgemm: Field GEneral Matrix Multiply. * * Computes \f$C = \alpha \mathrm{op}(A) \times \mathrm{op}(B) + \beta C\f$ * Automatically set Winograd recursion level * \param F field. 
* \param ta if \c ta==FflasTrans then \f$\mathrm{op}(A)=A^t\f$, else \f$\mathrm{op}(A)=A\f$, * \param tb same for matrix \p B * \param m see \p A * \param n see \p B * \param k see \p A * \param alpha scalar * \param beta scalar * \param A \f$\mathrm{op}(A)\f$ is \f$m \times k\f$ * \param B \f$\mathrm{op}(B)\f$ is \f$k \times n\f$ * \param C \f$C\f$ is \f$m \times n\f$ * \param lda leading dimension of \p A * \param ldb leading dimension of \p B * \param ldc leading dimension of \p C * \param w recursive levels of Winograd's algorithm are used. No argument (or -1) does auto computation of \p w. * @warning \f$\alpha\f$ \e must be invertible */ template INST_OR_DECL FFLAS_ELT* fgemm( const FFLAS_FIELD & F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* B, const size_t ldb, const FFLAS_ELT beta, FFLAS_ELT* C, const size_t ldc); template INST_OR_DECL FFLAS_ELT* fgemm( const FFLAS_FIELD & F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* B, const size_t ldb, const FFLAS_ELT beta, FFLAS_ELT* C, const size_t ldc, const ParSeqHelper::Sequential seq); template INST_OR_DECL FFLAS_ELT* fgemm( const FFLAS_FIELD & F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* B, const size_t ldb, const FFLAS_ELT beta, FFLAS_ELT* C, const size_t ldc, const ParSeqHelper::Parallel par); template INST_OR_DECL FFLAS_ELT* fgemm( const FFLAS_FIELD & F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT* B, const size_t ldb, const FFLAS_ELT beta, FFLAS_ELT* C, const size_t ldc, const 
ParSeqHelper::Parallel par); /** @brief fsquare: Squares a matrix. * compute \f$ C \gets \alpha \mathrm{op}(A) \mathrm{op}(A) + \beta C\f$ over a FFLAS_FIELD \p F * Avoid the conversion of B * @param ta if \c ta==FflasTrans, \f$\mathrm{op}(A)=A^T\f$. * @param F field * @param n size of \p A * @param alpha scalar * @param beta scalar * @param A dense matrix of size \c nxn * @param lda leading dimension of \p A * @param C dense matrix of size \c nxn * @param ldc leading dimension of \p C */ template INST_OR_DECL FFLAS_ELT* fsquare (const FFLAS_FIELD & F, const FFLAS_TRANSPOSE ta, const size_t n, const FFLAS_ELT alpha, const FFLAS_ELT* A, const size_t lda, const FFLAS_ELT beta, FFLAS_ELT* C, const size_t ldc); } // FFLAS fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_c.h000066400000000000000000000275771274716147400232440ustar00rootroot00000000000000/* -*- mode: C++; tAb-width: 8; indent-tAbs-mode: t; c-basic-offset: 8 -*- */ /* vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s */ /* * Copyright (C) 2015 FFLAS-FFPACK * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more detAils. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas-c.h * @author Brice Boyer * @brief C functions calls for FFLAS * @see fflas/fflas.h */ #ifndef __FFLASFFPACK_interfaces_libs_fflas_c_H #define __FFLASFFPACK_interfaces_libs_fflas_c_H //#include "fflas-ffpack/fflas-ffpack-config.h" #ifndef FFLAS_COMPILED #define FFLAS_COMPILED #endif #include #include #include #ifdef __cplusplus extern "C" { #endif /// Storage by row or col ? enum FFLAS_C_ORDER{ FflasRowMajor=101, /**< row major */ FflasColMajor=102 /**< col major */ }; // public: /// Is matrix transposed ? enum FFLAS_C_TRANSPOSE { FflasNoTrans = 111, /**< Matrix is not transposed */ FflasTrans = 112 /**< Matrix is transposed */ }; /// Is triangular matrix's shape upper ? enum FFLAS_C_UPLO { FflasUpper = 121, /**< Triangular matrix is Upper triangular (if \f$i>j\f$ then \f$T_{i,j} = 0\f$)*/ FflasLower = 122 /**< Triangular matrix is Lower triangular (if \f$i */ /* ModularBalanced */ void freducein_1_modular_double (const double p, const size_t n, double * X, const size_t incX , bool positive ); void freduce_1_modular_double (const double F, const size_t n, const double * Y, const size_t incY, double * X, const size_t incX , bool positive ); void fnegin_1_modular_double (const double F, const size_t n, double * X, const size_t incX , bool positive ); void fneg_1_modular_double (const double p, const size_t n, const double * Y, const size_t incY, double * X, const size_t incX , bool positive ); void fzero_1_modular_double (const double p, const size_t n, double * X, const size_t incX , bool positive ); bool fiszero_1_modular_double (const double p, const size_t n, const double * X, const size_t incX , bool positive ); bool fequal_1_modular_double (const double p, const size_t n, const double * X, const size_t incX, const double * Y, const size_t incY , bool positive ); void fassign_1_modular_double (const double p, const size_t n, const double * Y, const size_t incY , double * X, const size_t incX , bool positive ); void fscalin_1_modular_double 
(const double p, const size_t n, const double alpha, double * X, const size_t incX , bool positive ); void fscal_1_modular_double (const double p, const size_t n , const double alpha , const double * X, const size_t incX , double * Y, const size_t incY , bool positive ); void faxpy_1_modular_double (const double p, const size_t n, const double alpha, const double * X, const size_t incX, double * Y, const size_t incY , bool positive ); #if 0 void faxpby_1_modular_double (const double p, const size_t n, const double alpha, const double * X, const size_t incX, const double betA, double * Y, const size_t incY , bool positive ); #endif double fdot_1_modular_double (const double p, const size_t n, const double * X, const size_t incX, const double * Y, const size_t incY , bool positive ); void fswap_1_modular_double (const double p, const size_t n, double * X, const size_t incX, double * Y, const size_t incY , bool positive ); void fadd_1_modular_double (const double p, const size_t n, const double * A, const size_t incA, const double * B, const size_t incB, double * C, const size_t incC , bool positive ); void fsub_1_modular_double (const double p, const size_t n, const double * A, const size_t incA, const double * B, const size_t incB, double * C, const size_t incC , bool positive ); void faddin_1_modular_double (const double p, const size_t n, const double * B, const size_t incB, double * C, const size_t incC , bool positive ); void fsubin_1_modular_double (const double p, const size_t n, const double * B, const size_t incB, double * C, const size_t incC , bool positive ); /* ******** * * LEVEL1.5 * * ******** */ // fspmv /* ******** * * LEVEL2 * * ******** */ /* Modular */ /* ModularBalanced */ void fassign_2_modular_double (const double p, const size_t m, const size_t n, const double * B, const size_t ldB , double * A, const size_t ldA , bool positive ); void fzero_2_modular_double (const double p, const size_t m, const size_t n, double * A, const size_t ldA , bool 
positive ); bool fequal_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t ldA, const double * B, const size_t ldB , bool positive ); bool fiszero_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t ldA , bool positive ); void fidentity_2_modular_double (const double p, const size_t m, const size_t n, double * A, const size_t ldA, const double d , bool positive ); void freducein_2_modular_double (const double p, const size_t m , const size_t n, double * A, const size_t ldA , bool positive ); void freduce_2_modular_double (const double p, const size_t m , const size_t n, const double * B, const size_t ldB, double * A, const size_t ldA , bool positive ); void fnegin_2_modular_double (const double p, const size_t m , const size_t n, double * A, const size_t ldA , bool positive ); void fneg_2_modular_double (const double p, const size_t m , const size_t n, const double * B, const size_t ldB, double * A, const size_t ldA , bool positive ); void fscalin_2_modular_double (const double p, const size_t m , const size_t n, const double alpha, double * A, const size_t ldA , bool positive ); void fscal_2_modular_double (const double p, const size_t m , const size_t n, const double alpha, const double * A, const size_t ldA, double * B, const size_t ldB , bool positive ); void faxpy_2_modular_double (const double p, const size_t m, const size_t n , const double alpha, const double * X, const size_t ldX, double * Y, const size_t ldY , bool positive ); #if 0 void faxpby_2_modular_double (const double p, const size_t m, const size_t n, const double alpha, const double * X, const size_t ldX, const double betA, double * Y, const size_t ldY , bool positive ); #endif void fmove_2_modular_double (const double p, const size_t m, const size_t n, double * A, const size_t ldA, double * B, const size_t ldB , bool positive ); void fadd_2_modular_double (const double p, const size_t m, const size_t n, 
const double * A, const size_t ldA, const double * B, const size_t ldB, double * C, const size_t ldC , bool positive ); void fsub_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t ldA, const double * B, const size_t ldB, double * C, const size_t ldC , bool positive ); void fsubin_2_modular_double (const double p, const size_t m, const size_t n, const double * B, const size_t ldB, double * C, const size_t ldC , bool positive ); void faddin_2_modular_double (const double p, const size_t m, const size_t n, const double * B, const size_t ldB, double * C, const size_t ldC , bool positive ); double * fgemv_2_modular_double (const double p, const enum FFLAS_C_TRANSPOSE TransA, const size_t m, const size_t n, const double alpha, const double * A, const size_t ldA, const double * X, const size_t incX, const double betA, double * Y, const size_t incY , bool positive ); void fger_2_modular_double (const double p, const size_t m, const size_t n, const double alpha, const double * x, const size_t incX, const double * y, const size_t incY, double * A, const size_t ldA , bool positive ); void ftrsv_2_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_TRANSPOSE TransA, const enum FFLAS_C_DIAG Diag, const size_t n,const double * A, const size_t ldA, double * X, int incX , bool positive ); /* ******** * * LEVEL2.5 * * ******** */ // fspmm /* ******** * * LEVEL3 * * ******** */ void ftrsm_3_modular_double (const double p, const enum FFLAS_C_SIDE Side, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_TRANSPOSE TransA, const enum FFLAS_C_DIAG Diag, const size_t m, const size_t n, const double alpha, const double * A, const size_t ldA, double * B, const size_t ldB , bool positive ); void ftrmm_3_modular_double (const double p, const enum FFLAS_C_SIDE Side, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_TRANSPOSE TransA, const enum FFLAS_C_DIAG Diag, const size_t m, const size_t n, const double alpha, double * 
A, const size_t ldA, double * B, const size_t ldB , bool positive ); double * fgemm_3_modular_double( const double p, const enum FFLAS_C_TRANSPOSE tA, const enum FFLAS_C_TRANSPOSE tB, const size_t m, const size_t n, const size_t k, const double alpha, const double * A, const size_t ldA, const double * B, const size_t ldB, const double betA, double * C, const size_t ldC , bool positive ); double * fsquare_3_modular_double (const double p, const enum FFLAS_C_TRANSPOSE tA, const size_t n, const double alpha, const double * A, const size_t ldA, const double betA, double * C, const size_t ldC , bool positive ); #ifdef __cplusplus } #endif #endif // __FFLASFFPACK_interfaces_libs_fflas_c_H fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_lvl1.C000066400000000000000000000163031274716147400236140ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 FFLAS-FFPACK * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ /** @file fflas_lvl1.C * @author Brice Boyer * @brief C functions calls for level 1 FFLAS in fflas-c.h * @see fflas/fflas_level1.inl */ #include "fflas-ffpack/interfaces/libs/fflas_c.h" #include "fflas-ffpack/fflas/fflas.h" #include "givaro//modular-balanced.h" #include "givaro//modular.h" using Givaro::Modular ; using Givaro::ModularBalanced ; using namespace FFLAS ; #ifdef __cplusplus extern "C" { #endif /* * level 1 */ void freducein_1_modular_double (const double p, const size_t n, double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); freduce(F,n,X,incX); } else { ModularBalanced F(p); freduce(F,n,X,incX); } } void freduce_1_modular_double (const double p, const size_t n, const double * Y, const size_t incY, double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); freduce(F,n,Y,incY,X,incX); } else { ModularBalanced F(p); freduce(F,n,Y,incY,X,incX); } } void fnegin_1_modular_double (const double p, const size_t n, double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); fnegin(F,n,X,incX); } else { ModularBalanced F(p); fnegin(F,n,X,incX); } } void fneg_1_modular_double (const double p, const size_t n, const double * Y, const size_t incY, double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); fneg(F,n,Y,incY,X,incX); } else { ModularBalanced F(p); fneg(F,n,Y,incY,X,incX); } } void fzero_1_modular_double (const double p, const size_t n, double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); fzero(F,n,X,incX); } else { ModularBalanced F(p); fzero(F,n,X,incX); } } bool fiszero_1_modular_double (const double p, const size_t n, const double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); return fiszero(F,n,X,incX); } else { ModularBalanced F(p); return fiszero(F,n,X,incX); } } bool fequal_1_modular_double (const double p, const size_t n, const double * X, const size_t incX, const double * Y, const size_t incY , bool 
positive ) { if (positive) { Modular F(p); return fequal(F,n,Y,incY,X,incX); } else { ModularBalanced F(p); return fequal(F,n,Y,incY,X,incX); } } void fassign_1_modular_double (const double p, const size_t n, const double * Y, const size_t incY , double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); fassign(F,n,Y,incY,X,incX); } else { ModularBalanced F(p); fassign(F,n,Y,incY,X,incX); } } void fscalin_1_modular_double (const double p, const size_t n, const double alpha, double * X, const size_t incX , bool positive ) { if (positive) { Modular F(p); fscalin(F,n,alpha,X,incX); } else { ModularBalanced F(p); fscalin(F,n,alpha,X,incX); } } void fscal_1_modular_double (const double p, const size_t n , const double alpha , const double * X, const size_t incX , double * Y, const size_t incY , bool positive ) { if (positive) { Modular F(p); fscal(F,n,alpha,X,incX,Y,incY); } else { ModularBalanced F(p); fscal(F,n,alpha,X,incX,Y,incY); } } void faxpy_1_modular_double (const double p, const size_t n, const double alpha, const double * X, const size_t incX, double * Y, const size_t incY , bool positive ) { if (positive) { Modular F(p); faxpy(F,n,alpha,X,incX,Y,incY); } else { ModularBalanced F(p); faxpy(F,n,alpha,X,incX,Y,incY); } } #if 0 void faxpby_1_modular_double (const double p, const size_t n, const double alpha, const double * X, const size_t incX, const double beta, double * Y, const size_t incY , bool positive ) { if (positive) { Modular F(p); faxpby(F,n,alpha,X,incX,beta,Y,incY); } else { ModularBalanced F(p); faxpby(F,n,alpha,X,incX,beta,Y,incY); } } #endif double fdot_1_modular_double (const double p, const size_t n, const double * X, const size_t incX, const double * Y, const size_t incY , bool positive ) { if (positive) { Modular F(p); return fdot(F,n,Y,incY,X,incX); } else { ModularBalanced F(p); return fdot(F,n,Y,incY,X,incX); } } void fswap_1_modular_double (const double p, const size_t n, double * X, const size_t incX, double * Y, 
const size_t incY , bool positive ) { if (positive) { Modular F(p); fswap(F,n,Y,incY,X,incX); } else { ModularBalanced F(p); fswap(F,n,Y,incY,X,incX); } } void fadd_1_modular_double (const double p, const size_t n, const double * A, const size_t incA, const double * B, const size_t incB, double * C, const size_t incC , bool positive ) { if (positive) { Modular F(p); fadd(F,n,A,incA,B,incB,C,incC); } else { ModularBalanced F(p); fadd(F,n,A,incA,B,incB,C,incC); } } void fsub_1_modular_double (const double p, const size_t n, const double * A, const size_t incA, const double * B, const size_t incB, double * C, const size_t incC , bool positive ) { if (positive) { Modular F(p); fsub(F,n,A,incA,B,incB,C,incC); } else { ModularBalanced F(p); fsub(F,n,A,incA,B,incB,C,incC); } } void faddin_1_modular_double (const double p, const size_t n, const double * B, const size_t incB, double * C, const size_t incC , bool positive ) { if (positive) { Modular F(p); faddin(F,n,B,incB,C,incC); } else { ModularBalanced F(p); faddin(F,n,B,incB,C,incC); } } void fsubin_1_modular_double (const double p, const size_t n, const double * B, const size_t incB, double * C, const size_t incC , bool positive ) { if (positive) { Modular F(p); fsubin(F,n,B,incB,C,incC); } else { ModularBalanced F(p); fsubin(F,n,B,incB,C,incC); } } #ifdef __cplusplus } #endif fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_lvl2.C000066400000000000000000000220731274716147400236160ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright_2_modular_double (C) 2015 FFLAS-FFPACK * * Written by Brice Boyer_2_modular_double (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or_2_modular_double (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas_lvl2.C * @author Brice Boyer * @brief C functions calls for level 2 FFLAS in fflas-c.h * @see fflas/fflas_level2.inl */ #include "fflas-ffpack/interfaces/libs/fflas_c.h" #include "fflas-ffpack/fflas/fflas.h" #include "givaro//modular-balanced.h" #include "givaro//modular.h" using Givaro::Modular ; using Givaro::ModularBalanced ; using namespace FFLAS ; #ifdef __cplusplus extern "C" { #endif void fassign_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t lda , double * B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); fassign(F,m,n,A,lda,B,ldb); } else { ModularBalanced F(p); fassign(F,m,n,A,lda,B,ldb); } } void fzero_2_modular_double (const double p, const size_t m, const size_t n, double * A, const size_t lda , bool positive ) { if (positive) { Modular F(p); fzero(F,m,n,A,lda); } else { ModularBalanced F(p); fzero(F,m,n,A,lda); } } bool fequal_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t lda, const double * B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); return fequal(F,m,n,A,lda,B,ldb); } else { ModularBalanced F(p); return 
fequal(F,m,n,A,lda,B,ldb); } } bool fiszero_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t lda , bool positive ) { if (positive) { Modular F(p); return fiszero(F,m,n,A,lda); } else { ModularBalanced F(p); return fiszero(F,m,n,A,lda); } } void fidentity_2_modular_double (const double p, const size_t m, const size_t n, double * A, const size_t lda, const double d , bool positive ) { if (positive) { Modular F(p); fidentity(F,m,n,A,lda,d); } else { ModularBalanced F(p); fidentity(F,m,n,A,lda,d); } } void freducein_2_modular_double (const double p, const size_t m , const size_t n, double * A, const size_t lda , bool positive ) { if (positive) { Modular F(p); freduce(F,m,n,A,lda); } else { ModularBalanced F(p); freduce(F,m,n,A,lda); } } void freduce_2_modular_double (const double p, const size_t m , const size_t n, const double * A, const size_t lda, double * B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); freduce(F,m,n,A,lda,B,ldb); } else { ModularBalanced F(p); freduce(F,m,n,A,lda,B,ldb); } } void fnegin_2_modular_double (const double p, const size_t m , const size_t n, double * A, const size_t lda , bool positive ) { if (positive) { Modular F(p); fnegin(F,m,n,A,lda); } else { ModularBalanced F(p); fnegin(F,m,n,A,lda); } } void fneg_2_modular_double (const double p, const size_t m , const size_t n, const double * A, const size_t lda, double * B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); fneg(F,m,n,A,lda,B,ldb); } else { ModularBalanced F(p); fneg(F,m,n,A,lda,B,ldb); } } void fscalin_2_modular_double (const double p, const size_t m , const size_t n, const double alpha, double * A, const size_t lda , bool positive ) { if (positive) { Modular F(p); fscalin(F,m,n,alpha,A,lda); } else { ModularBalanced F(p); fscalin(F,m,n,alpha,A,lda); } } void fscal_2_modular_double (const double p, const size_t m , const size_t n, const double alpha, const double * A, const size_t lda, double 
* B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); fscal(F,m,n,alpha,A,lda,B,ldb); } else { ModularBalanced F(p); fscal(F,m,n,alpha,A,lda,B,ldb); } } void faxpy_2_modular_double (const double p, const size_t m, const size_t n , const double alpha, const double * A, const size_t lda, double * B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); faxpy(F,m,n,alpha,A,lda,B,ldb); } else { ModularBalanced F(p); faxpy(F,m,n,alpha,A,lda,B,ldb); } } #if 0 void faxpby_2_modular_double (const double p, const size_t m, const size_t n, const double alpha, const double * A, const size_t lda, const double beta, double * B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); faxpby(F,m,n,alpha,A,lda,beta,B,ldb); } else { ModularBalanced F(p); faxpby(F,m,n,alpha,A,lda,beta,B,ldb); } } #endif void fmove_2_modular_double (const double p, const size_t m, const size_t n, double * A, const size_t lda, double * B, const size_t ldb , bool positive ) { if (positive) { Modular F(p); fmove(F,m,n,A,lda,B,ldb); } else { ModularBalanced F(p); fmove(F,m,n,A,lda,B,ldb); } } void fadd_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t lda, const double * B, const size_t ldb, double * C, const size_t ldc , bool positive ) { if (positive) { Modular F(p); fadd(F,m,n,A,lda,B,ldb,C,ldc); } else { ModularBalanced F(p); fadd(F,m,n,A,lda,B,ldb,C,ldc); } } void fsub_2_modular_double (const double p, const size_t m, const size_t n, const double * A, const size_t lda, const double * B, const size_t ldb, double * C, const size_t ldc , bool positive ) { if (positive) { Modular F(p); fsub(F,m,n,A,lda,B,ldb,C,ldc); } else { ModularBalanced F(p); fsub(F,m,n,A,lda,B,ldb,C,ldc); } } void fsubin_2_modular_double (const double p, const size_t m, const size_t n, const double * B, const size_t ldb, double * C, const size_t ldc , bool positive ) { if (positive) { Modular F(p); fsubin(F,m,n,B,ldb,C,ldc); } else { 
ModularBalanced F(p); fsubin(F,m,n,B,ldb,C,ldc); } } void faddin_2_modular_double (const double p, const size_t m, const size_t n, const double * B, const size_t ldb, double * C, const size_t ldc , bool positive ) { if (positive) { Modular F(p); faddin(F,m,n,B,ldb,C,ldc); } else { ModularBalanced F(p); faddin(F,m,n,B,ldb,C,ldc); } } double * fgemv_2_modular_double (const double p, const enum FFLAS_C_TRANSPOSE TransA, const size_t m, const size_t n, const double alpha, const double * A, const size_t lda, const double * X, const size_t incX, const double beta, double * Y, const size_t incY , bool positive ) { if (positive) { Modular F(p); return fgemv(F,(enum FFLAS::FFLAS_TRANSPOSE)TransA,m,n,alpha,A,lda,X,incX,beta,Y,incY); } else { ModularBalanced F(p); return fgemv(F,(enum FFLAS::FFLAS_TRANSPOSE)TransA,m,n,alpha,A,lda,X,incX,beta,Y,incY); } return nullptr; } void fger_2_modular_double (const double p, const size_t m, const size_t n, const double alpha, const double * X, const size_t incX, const double * Y, const size_t incY, double * A, const size_t lda , bool positive ) { if (positive) { Modular F(p); fger(F,m,n,alpha,X,incX,Y,incY,A,lda); } else { ModularBalanced F(p); fger(F,m,n,alpha,X,incX,Y,incY,A,lda); } } void ftrsv_2_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_TRANSPOSE TransA, const enum FFLAS_C_DIAG Diag, const size_t n,const double * A, const size_t lda, double * X, int incX , bool positive ) { if (positive) { Modular F(p); ftrsv(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_TRANSPOSE)TransA,(enum FFLAS::FFLAS_DIAG)Diag,n,A,lda,X,incX); } else { ModularBalanced F(p); ftrsv(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_TRANSPOSE)TransA,(enum FFLAS::FFLAS_DIAG)Diag,n,A,lda,X,incX); } } #ifdef __cplusplus } #endif fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_lvl3.C000066400000000000000000000101721274716147400236140ustar00rootroot00000000000000/* -*- mode: C++; tAb-width: 8; indent-tAbs-mode: t; 
c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 FFLAS-FFPACK * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more detAils. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas_lvl3.C * @author Brice Boyer * @brief C functions calls for level 3 FFLAS in fflas-c.h * @see fflas/fflas_level3.inl */ #include "fflas-ffpack/interfaces/libs/fflas_c.h" #include "fflas-ffpack/fflas/fflas.h" #include "givaro//modular-balanced.h" #include "givaro//modular.h" using Givaro::Modular ; using Givaro::ModularBalanced ; using namespace FFLAS ; #ifdef __cplusplus extern "C" { #endif void ftrsm_3_modular_double (const double p, const enum FFLAS_C_SIDE Side, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_TRANSPOSE tA, const enum FFLAS_C_DIAG Diag, const size_t m, const size_t n, const double alpha, const double * A, const size_t ldA, double * B, const size_t ldB , bool positive ) { if (positive) { Modular F(p); ftrsm(F,(enum FFLAS_SIDE)Side,(enum FFLAS_UPLO)Uplo,(FFLAS_TRANSPOSE)tA,(enum FFLAS_DIAG)Diag,m,n,alpha,A,ldA,B,ldB); } else { ModularBalanced F(p); ftrsm(F,(enum FFLAS_SIDE)Side,(enum FFLAS_UPLO)Uplo,(FFLAS_TRANSPOSE)tA,(enum 
FFLAS_DIAG)Diag,m,n,alpha,A,ldA,B,ldB); } } void ftrmm_3_modular_double (const double p, const enum FFLAS_C_SIDE Side, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_TRANSPOSE tA, const enum FFLAS_C_DIAG Diag, const size_t m, const size_t n, const double alpha, double * A, const size_t ldA, double * B, const size_t ldB , bool positive ) { if (positive) { Modular F(p); ftrmm(F,(enum FFLAS_SIDE)Side,(enum FFLAS_UPLO)Uplo,(FFLAS_TRANSPOSE)tA,(enum FFLAS_DIAG)Diag,m,n,alpha,A,ldA,B,ldB); } else { ModularBalanced F(p); ftrmm(F,(enum FFLAS_SIDE)Side,(enum FFLAS_UPLO)Uplo,(FFLAS_TRANSPOSE)tA,(enum FFLAS_DIAG)Diag,m,n,alpha,A,ldA,B,ldB); } } double * fgemm_3_modular_double( const double p, const enum FFLAS_C_TRANSPOSE tA, const enum FFLAS_C_TRANSPOSE tB, const size_t m, const size_t n, const size_t k, const double alpha, const double * A, const size_t ldA, const double * B, const size_t ldB, const double betA, double * C, const size_t ldC, bool positive ) { if (positive) { Modular F(p); return fgemm(F,(FFLAS_TRANSPOSE)tA,(FFLAS_TRANSPOSE)tB,m,n,k,alpha,A,ldA,B,ldB,betA,C,ldC); } else { ModularBalanced F(p); return fgemm(F,(FFLAS_TRANSPOSE)tA,(FFLAS_TRANSPOSE)tB,m,n,k,alpha,A,ldA,B,ldB,betA,C,ldC); } return nullptr; } double * fsquare_3_modular_double (const double p, const enum FFLAS_C_TRANSPOSE tA, const size_t n, const double alpha, const double * A, const size_t ldA, const double betA, double * C, const size_t ldC , bool positive ) { if (positive) { Modular F(p); return fsquare(F,(FFLAS_TRANSPOSE)tA,n,alpha,A,ldA,betA,C,ldC); } else { ModularBalanced F(p); return fsquare(F,(FFLAS_TRANSPOSE)tA,n,alpha,A,ldA,betA,C,ldC); } return nullptr; } #ifdef __cplusplus } #endif fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/fflas_sparse.C000066400000000000000000000024671274716147400242410ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 
FFLAS-FFPACK * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file fflas_sparse.C * @author Brice Boyer * @brief C functions calls for level 1.5 and 2.5 FFLAS in fflas-c.h * @see fflas/fflas_sparse.h */ // struct COO { // }; // fspmv // COO // CSR // fspmm // COO // CSR fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/ffpack.C000066400000000000000000001000311274716147400230050ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 FFLAS-FFPACK * * Written by Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /** @file ffpack.C * @author Brice Boyer * @brief C functions calls for FFPACK in ffpack-c.h * @see ffpack/ffpack.h */ #include "fflas-ffpack/interfaces/libs/ffpack_c.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "givaro//modular-balanced.h" #include "givaro//modular.h" using Givaro::Modular ; using Givaro::ModularBalanced ; using namespace FFLAS ; using namespace FFPACK; /*****************/ /* PERMUTATIONS */ /*****************/ void LAPACKPerm2MathPerm (size_t * MathP, const size_t * LapackP, const size_t N) { FFPACK::LAPACKPerm2MathPerm(MathP,LapackP,N); } void MathPerm2LAPACKPerm (size_t * LapackP, const size_t * MathP, const size_t N) { FFPACK::MathPerm2LAPACKPerm(LapackP, MathP, N); } void MatrixApplyS_modular_double (const double p, double * A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4 , bool positive) { if (positive) { Modular F(p); MatrixApplyS(F,A,lda,width,M2,R1,R2,R3,R4); } else { ModularBalanced F(p); MatrixApplyS(F,A,lda,width,M2,R1,R2,R3,R4); } } void PermApplyS_double (double * A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { PermApplyS(A,lda,width,M2,R1,R2,R3,R4); } void MatrixApplyT_modular_double (const double p, double * A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4 , bool positive) { if (positive) { Modular F(p); MatrixApplyT(F,A,lda,width,N2,R1,R2,R3,R4); } else { ModularBalanced F(p); MatrixApplyT(F,A,lda,width,N2,R1,R2,R3,R4); } } void PermApplyT_double 
(double * A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) { PermApplyT(A,lda,width,N2,R1,R2,R3,R4); } void composePermutationsP (size_t * MathP, const size_t * P1, const size_t * P2, const size_t R, const size_t N) { FFPACK::composePermutationsP(MathP,P1,P2,R,N); } void composePermutationsQ (size_t * MathP, const size_t * Q1, const size_t * Q2, const size_t R, const size_t N) { FFPACK::composePermutationsQ(MathP,Q1,Q2,R,N); } void cyclic_shift_mathPerm (size_t * P, const size_t s) { FFPACK::cyclic_shift_mathPerm(P,s); } #if 0 template void cyclic_shift_row_col(Base_t * A, size_t m, size_t n, size_t lda); #endif void cyclic_shift_row_modular_double(const double p, double * A, size_t m, size_t n, size_t lda , bool positive) { if (positive) { Modular F(p); cyclic_shift_row(F,A,m,n,lda); } else { ModularBalanced F(p); cyclic_shift_row(F,A,m,n,lda); } } void cyclic_shift_col_modular_double(const double p, double * A, size_t m, size_t n, size_t lda , bool positive) { if (positive) { Modular F(p); cyclic_shift_col(F,A,m,n,lda); } else { ModularBalanced F(p); cyclic_shift_col(F,A,m,n,lda); } } void applyP_modular_double( const double p, const enum FFLAS::FFLAS_SIDE Side, const enum FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t ibeg, const size_t iend, double * A, const size_t lda, const size_t * P , bool positive) { if (positive) { Modular F(p); applyP(F,(enum FFLAS::FFLAS_SIDE)Side,(enum FFLAS::FFLAS_TRANSPOSE)Trans,M,ibeg,iend,A,lda,P); } else { ModularBalanced F(p); applyP(F,(enum FFLAS::FFLAS_SIDE)Side,(enum FFLAS::FFLAS_TRANSPOSE)Trans,M,ibeg,iend,A,lda,P); } } /* fgetrs, fgesv */ void fgetrsin_modular_double (const double p, const enum FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, double * A, const size_t lda, const size_t *P, const size_t *Q, double * B, const size_t ldb, int * info , bool positive) { if (positive) { Modular F(p); fgetrs(F,(enum 
FFLAS::FFLAS_SIDE)Side,M,N,R,A,lda,P,Q,B,ldb,info); } else { ModularBalanced F(p); fgetrs(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,R,A,lda,P,Q,B,ldb,info); } } double * fgetrsv_modular_double (const double p, const enum FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t NRHS, const size_t R, double * A, const size_t lda, const size_t *P, const size_t *Q, double * X, const size_t ldx, const double * B, const size_t ldb, int * info , bool positive) { if (positive) { Modular F(p); return fgetrs(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,NRHS,R,A,lda,P,Q,X,ldx,B,ldb,info); } else { ModularBalanced F(p); return fgetrs(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,NRHS,R,A,lda,P,Q,X,ldx,B,ldb,info); } } size_t fgesvin_modular_double (const double p, const enum FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, double * A, const size_t lda, double * B, const size_t ldb, int * info , bool positive) { if (positive) { Modular F(p); return fgesv(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,A,lda,B,ldb,info); } else { ModularBalanced F(p); return fgesv(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,A,lda,B,ldb,info); } } size_t fgesv_modular_double (const double p, const enum FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t NRHS, double * A, const size_t lda, double * X, const size_t ldx, const double * B, const size_t ldb, int * info , bool positive) { if (positive) { Modular F(p); return fgesv(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,NRHS,A,lda,X,ldx,B,ldb,info); } else { ModularBalanced F(p); return fgesv(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,NRHS,A,lda,X,ldx,B,ldb,info); } } /* ftrtr */ void ftrtri_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo, const enum FFLAS::FFLAS_DIAG Diag, const size_t N, double * A, const size_t lda , bool positive) { if (positive) { Modular F(p); ftrtri(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,N,A,lda); } else { ModularBalanced F(p); ftrtri(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,N,A,lda); } } void 
trinv_left_modular_double( const double p, const size_t N, const double * L, const size_t ldl, double * X, const size_t ldx , bool positive) { if (positive) { Modular F(p); trinv_left(F,N,L,ldl,X,ldx); } else { ModularBalanced F(p); trinv_left(F,N,L,ldl,X,ldx); } } void ftrtrm_modular_double (const double p, const enum FFLAS::FFLAS_DIAG Diag, const size_t N, double * A, const size_t lda , bool positive) { if (positive) { Modular F(p); ftrtrm(F,(enum FFLAS::FFLAS_DIAG)Diag,N,A,lda); } else { ModularBalanced F(p); ftrtrm(F,(enum FFLAS::FFLAS_DIAG)Diag,N,A,lda); } } /* PLUQ */ size_t PLUQ_modular_double (const double p, const enum FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, double * A, const size_t lda, size_t*P, size_t *Q , bool positive) { if (positive) { Modular F(p); return PLUQ(F,(enum FFLAS::FFLAS_DIAG)Diag,M,N,A,lda,P,Q); } else { ModularBalanced F(p); return PLUQ(F,(enum FFLAS::FFLAS_DIAG)Diag,M,N,A,lda,P,Q); } } size_t LUdivine_modular_double (const double p, const enum FFLAS::FFLAS_DIAG Diag, const enum FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t N, double * A, const size_t lda, size_t* P, size_t* Qt, const enum FFPACK_C_LU_TAG LuTag, const size_t cutoff , bool positive) { if (positive) { Modular F(p); return LUdivine(F,(enum FFLAS::FFLAS_DIAG)Diag,(enum FFLAS::FFLAS_TRANSPOSE)Trans,M,N,A,lda,P,Qt,(enum FFPACK::FFPACK_LU_TAG)LuTag,cutoff); } else { ModularBalanced F(p); return LUdivine(F,(enum FFLAS::FFLAS_DIAG)Diag,(enum FFLAS::FFLAS_TRANSPOSE)Trans,M,N,A,lda,P,Qt,(enum FFPACK::FFPACK_LU_TAG)LuTag,cutoff); } } #if 0 /* UTILE ?? 
*/ size_t LUdivine_small_modular_double (const double p, const enum FFLAS::FFLAS_DIAG Diag, const enum FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t N, double * A, const size_t lda, size_t* P, size_t* Q, const enum FFPACK_C_LU_TAG LuTag); size_t LUdivine_gauss_modular_double (const double p, const enum FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, double * A, const size_t lda, size_t* P, size_t* Q, const enum FFPACK_C_LU_TAG LuTag); #endif /*****************/ /* ECHELON FORMS */ /*****************/ size_t ColumnEchelonForm_modular_double (const double p, const size_t M, const size_t N, double * A, const size_t lda, size_t* P, size_t* Qt, bool transform , const enum FFPACK_C_LU_TAG LuTag , bool positive) { if (positive) { Modular F(p); return ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag); } else { ModularBalanced F(p); return ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag); } } size_t RowEchelonForm_modular_double (const double p, const size_t M, const size_t N, double * A, const size_t lda, size_t* P, size_t* Qt, const bool transform, const enum FFPACK_C_LU_TAG LuTag , bool positive) { if (positive) { Modular F(p); return RowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag); } else { ModularBalanced F(p); return RowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag); } } size_t ReducedColumnEchelonForm_modular_double (const double p, const size_t M, const size_t N, double * A, const size_t lda, size_t* P, size_t* Qt, const bool transform, const enum FFPACK_C_LU_TAG LuTag , bool positive) { if (positive) { Modular F(p); return ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag); } else { ModularBalanced F(p); return ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag); } } size_t ReducedRowEchelonForm_modular_double (const double p, const size_t M, const size_t N, 
	double * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/*
 * Single-precision (float) variants.  Same shape as the wrappers for double:
 * the field F is built from p (`Modular` if `positive`, `ModularBalanced`
 * otherwise), LuTag is cast to FFPACK::FFPACK_LU_TAG, and the result of the
 * routine of the same base name is returned.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm the element type
 * used for these float variants against the original source.
 */

/* Forwards to ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
ColumnEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, bool transform
	, const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Forwards to RowEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
RowEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return RowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return RowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Forwards to ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
ReducedColumnEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Forwards to ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
ReducedRowEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return
		ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/*
 * 32-bit integer (int32_t) variants.  Same shape as the floating-point
 * wrappers: the field F is built from p (`Modular` if `positive`,
 * `ModularBalanced` otherwise), LuTag is cast to FFPACK::FFPACK_LU_TAG, and
 * the result of the routine of the same base name is returned.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm the element type
 * used for these int32_t variants against the original source.
 */

/* Forwards to ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
ColumnEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, bool transform
	, const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return ColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Forwards to RowEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
RowEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return RowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return RowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Forwards to ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
ReducedColumnEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return ReducedColumnEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Forwards to ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,LuTag). */
size_t
ReducedRowEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return
		ReducedRowEchelonForm(F,M,N,A,lda,P,Qt,transform,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/*
 * Wrappers below build the field F from p (`Modular` if `positive`,
 * `ModularBalanced` otherwise) and return the result of the forwarded call.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm against the original.
 */

/* Forwards to ReducedRowEchelonForm2(F,M,N,A,lda,P,Qt,transform); takes no LuTag. */
size_t
ReducedRowEchelonForm2_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ReducedRowEchelonForm2(F,M,N,A,lda,P,Qt,transform);
	} else {
		ModularBalanced F(p);
		return ReducedRowEchelonForm2(F,M,N,A,lda,P,Qt,transform);
	}
}

/* Forwards to REF(F,M,N,A,lda,colbeg,rowbeg,colsize,Qt,P). */
size_t
REF_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	const size_t colbeg, const size_t rowbeg, const size_t colsize,
	size_t* Qt, size_t* P
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return REF(F,M,N,A,lda,colbeg,rowbeg,colsize,Qt,P);
	} else {
		ModularBalanced F(p);
		return REF(F,M,N,A,lda,colbeg,rowbeg,colsize,Qt,P);
	}
}

/*****************/
/* INVERSION */
/*****************/

/*
 * The three inversion wrappers pass `*nullity` to the callee, so `nullity`
 * is dereferenced unconditionally and must point to a valid int.
 */

/* Returns the pointer produced by the five-argument Invert(F,M,A,lda,*nullity). */
double *
Invertin_modular_double (const double p, const size_t M,
	double * A, const size_t lda,
	int * nullity
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return Invert(F,M,A,lda,*nullity);
	} else {
		ModularBalanced F(p);
		return Invert(F,M,A,lda,*nullity);
	}
}

/* Returns the pointer produced by Invert(F,M,A,lda,X,ldx,*nullity); A is const here. */
double *
Invert_modular_double (const double p, const size_t M,
	const double * A, const size_t lda,
	double * X, const size_t ldx,
	int* nullity
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return Invert(F,M,A,lda,X,ldx,*nullity);
	} else {
		ModularBalanced F(p);
		return Invert(F,M,A,lda,X,ldx,*nullity);
	}
}

/* Returns the pointer produced by Invert2(F,M,A,lda,X,ldx,*nullity); A is non-const here. */
double *
Invert2_modular_double( const double p, const size_t M,
	double * A, const size_t lda,
	double * X, const size_t ldx,
	int* nullity
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return Invert2(F,M,A,lda,X,ldx,*nullity);
	} else {
		ModularBalanced F(p);
		return Invert2(F,M,A,lda,X,ldx,*nullity);
	}
}

/*****************************/
/* CHARACTERISTIC POLYNOMIAL */
/*****************************/

#if 0 /* not for now */
template
std::list&
CharPoly( const double p, std::list& charp, const size_t N,
	double * A, const
	size_t lda,
	const enum FFPACK_C_CHARPOLY_TAG CharpTag= FfpackArithProg);

template
Polynomial &
mulpoly_modular_double(const double p, Polynomial &res, const Polynomial & P1, const Polynomial & P2);

template
Polynomial&
CharPoly_modular_double( const double p, Polynomial& charp, const size_t N,
	double * A, const size_t lda,
	const enum FFPACK_C_CHARPOLY_TAG CharpTag= FfpackArithProg);

template
std::list&
CharpolyArithProg_modular_double (const double p, std::list& frobeniusForm,
	const size_t N, double * A, const size_t lda, const size_t c);
#endif

/**********************/
/* MINIMAL POLYNOMIAL */
/**********************/

#if 0 /* not for now */
template
Polynomial&
MinPoly_modular_double( const double p, Polynomial& minP, const size_t N,
	const double * A, const size_t lda,
	double * X, const size_t ldx, size_t* P,
	const enum FFPACK_C_MINPOLY_TAG MinTag= FFPACK::FfpackDense,
	const size_t kg_mc=0, const size_t kg_mb=0, const size_t kg_j=0 );
#endif

/* Krylov Elim */

/*
 * Wrappers below build the field F from p (`Modular` if `positive`,
 * `ModularBalanced` otherwise) and return the size_t produced by the routine
 * of the same base name.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm against the original.
 */

/* Forwards to KrylovElim(F,M,N,A,lda,P,Q,deg,iterates,inviterates,maxit,virt). */
size_t
KrylovElim_modular_double( const double p, const size_t M, const size_t N,
	double * A, const size_t lda, size_t*P,
	size_t *Q, const size_t deg, size_t *iterates, size_t * inviterates, const size_t maxit,size_t virt
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return KrylovElim(F,M,N,A,lda,P,Q,deg,iterates,inviterates,maxit, virt);
	} else {
		ModularBalanced F(p);
		return KrylovElim(F,M,N,A,lda,P,Q,deg,iterates,inviterates,maxit, virt);
	}
}

/* Forwards to SpecRankProfile(F,M,N,A,lda,deg,rankProfile). */
size_t
SpecRankProfile_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda, const size_t deg, size_t *rankProfile
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return SpecRankProfile(F,M,N,A,lda,deg,rankProfile);
	} else {
		ModularBalanced F(p);
		return SpecRankProfile(F,M,N,A,lda,deg,rankProfile);
	}
}

/********/
/* RANK */
/********/

/* Forwards to Rank(F,M,N,A,lda). */
size_t
Rank_modular_double( const double p, const size_t M, const size_t N,
	double * A, const size_t lda
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return Rank(F,M,N,A,lda);
	} else
	{
		ModularBalanced F(p);
		return Rank(F,M,N,A,lda);
	}
}

/*
 * Wrappers below build the field F from p (`Modular` if `positive`,
 * `ModularBalanced` otherwise) and forward to the routine of the same base
 * name.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm against the original.
 */

/********/
/* DET */
/********/

/* Returns the bool produced by IsSingular(F,M,N,A,lda). */
bool
IsSingular_modular_double( const double p, const size_t M, const size_t N,
	double * A, const size_t lda
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return IsSingular(F,M,N,A,lda);
	} else {
		ModularBalanced F(p);
		return IsSingular(F,M,N,A,lda);
	}
}

/* Returns the value of Det(F,M,N,A,lda) as a double. */
double
Det_modular_double( const double p, const size_t M, const size_t N,
	double * A, const size_t lda
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return Det(F,M,N,A,lda);
	} else {
		ModularBalanced F(p);
		return Det(F,M,N,A,lda);
	}
}

/*********/
/* SOLVE */
/*********/

/* Returns the pointer produced by Solve(F,M,A,lda,x,incx,b,incb). */
double *
Solve_modular_double( const double p, const size_t M,
	double * A, const size_t lda,
	double * x, const int incx,
	const double * b, const int incb
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return Solve(F,M,A,lda,x,incx,b,incb);
	} else {
		ModularBalanced F(p);
		return Solve(F,M,A,lda,x,incx,b,incb);
	}
}

/*
 * Calls solveLB(F,Side,M,N,R,L,ldl,Q,B,ldb); returns nothing.
 * This definition takes the trailing `positive` parameter and branches on it,
 * so any C prototype for it must declare that parameter as well.
 */
void
solveLB_modular_double( const double p, const enum FFLAS::FFLAS_SIDE Side,
	const size_t M, const size_t N, const size_t R,
	double * L, const size_t ldl,
	const size_t * Q,
	double * B, const size_t ldb
	, bool positive)
{
	if (positive) {
		Modular F(p);
		solveLB(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,R,L,ldl,Q,B,ldb);
	} else {
		ModularBalanced F(p);
		solveLB(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,R,L,ldl,Q,B,ldb);
	}
}

/* Calls solveLB2(F,Side,M,N,R,L,ldl,Q,B,ldb); returns nothing. */
void
solveLB2_modular_double( const double p, const enum FFLAS::FFLAS_SIDE Side,
	const size_t M, const size_t N, const size_t R,
	double * L, const size_t ldl,
	const size_t * Q,
	double * B, const size_t ldb
	, bool positive)
{
	if (positive) {
		Modular F(p);
		solveLB2(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,R,L,ldl,Q,B,ldb);
	} else {
		ModularBalanced F(p);
		solveLB2(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,R,L,ldl,Q,B,ldb);
	}
}

/*************/
/* NULLSPACE */
/*************/

/* Calls RandomNullSpaceVector(F,Side,M,N,A,lda,X,incX); returns nothing. */
void
RandomNullSpaceVector_modular_double (const double p, const enum FFLAS::FFLAS_SIDE Side,
	const size_t M, const size_t N,
	double * A, const size_t lda,
	double * X, const size_t incX
	, bool
	positive)
{
	if (positive) {
		Modular F(p);
		RandomNullSpaceVector(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,A,lda,X,incX);
	} else {
		ModularBalanced F(p);
		RandomNullSpaceVector(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,A,lda,X,incX);
	}
}

/*
 * Wrappers below that take `p` build the field F from it (`Modular` if
 * `positive`, `ModularBalanced` otherwise) before forwarding.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm against the original.
 */

/*
 * Returns the value of NullSpaceBasis(F,Side,M,N,A,lda,*NS,*ldn,*NSdim).
 * NS, ldn and NSdim are dereferenced unconditionally, so all three must be
 * valid pointers.
 */
size_t
NullSpaceBasis_modular_double (const double p, const enum FFLAS::FFLAS_SIDE Side,
	const size_t M, const size_t N,
	double * A, const size_t lda,
	double ** NS, size_t* ldn, size_t * NSdim
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return NullSpaceBasis(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,A,lda,*NS,*ldn,*NSdim);
	} else {
		ModularBalanced F(p);
		return NullSpaceBasis(F,(enum FFLAS::FFLAS_SIDE)Side,M,N,A,lda,*NS,*ldn,*NSdim);
	}
}

/*****************/
/* RANK PROFILES */
/*****************/

/*
 * Returns the value of RowRankProfile(F,M,N,A,lda,*rkprofile,LuTag).
 * rkprofile is dereferenced unconditionally; LuTag is cast to
 * FFPACK::FFPACK_LU_TAG.
 */
size_t
RowRankProfile_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t ** rkprofile,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return RowRankProfile(F,M,N,A,lda,*rkprofile,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return RowRankProfile(F,M,N,A,lda,*rkprofile,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Same as above, forwarding to ColumnRankProfile(F,M,N,A,lda,*rkprofile,LuTag). */
size_t
ColumnRankProfile_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t ** rkprofile,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ColumnRankProfile(F,M,N,A,lda,*rkprofile,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		return ColumnRankProfile(F,M,N,A,lda,*rkprofile,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/*
 * Field-independent: takes neither `p` nor `positive` and calls
 * FFPACK::RankProfileFromLU directly, after casting LuTag.
 */
void
RankProfileFromLU (const size_t* P, const size_t N, const size_t R,
	size_t* rkprofile, const enum FFPACK_C_LU_TAG LuTag)
{
	FFPACK::RankProfileFromLU(P,N,R,rkprofile,(enum FFPACK::FFPACK_LU_TAG)LuTag);
}

/* Field-independent: returns the value of FFPACK::LeadingSubmatrixRankProfiles(...). */
size_t
LeadingSubmatrixRankProfiles (const size_t M, const size_t N, const size_t R,
	const size_t LSm, const size_t LSn,
	const size_t* P, const size_t* Q,
	size_t* RRP, size_t* CRP)
{
	return
		FFPACK::LeadingSubmatrixRankProfiles(M,N,R,LSm,LSn,P,Q,RRP,CRP);
}

/*
 * Wrappers below build the field F from p (`Modular` if `positive`,
 * `ModularBalanced` otherwise) and return the size_t produced by the routine
 * of the same base name.  Pointer-to-pointer and `R` arguments are
 * dereferenced unconditionally before the call, so they must be valid.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm against the original.
 */

/* Forwards to RowRankProfileSubmatrixIndices(F,M,N,A,lda,*rowindices,*colindices,*R). */
size_t
RowRankProfileSubmatrixIndices_modular_double (const double p,
	const size_t M, const size_t N,
	double * A,
	const size_t lda,
	size_t ** rowindices,
	size_t ** colindices,
	size_t * R
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return RowRankProfileSubmatrixIndices(F,M,N,A,lda,*rowindices,*colindices,*R);
	} else {
		ModularBalanced F(p);
		return RowRankProfileSubmatrixIndices(F,M,N,A,lda,*rowindices,*colindices,*R);
	}
}

/* Forwards to ColRankProfileSubmatrixIndices(F,M,N,A,lda,*rowindices,*colindices,*R). */
size_t
ColRankProfileSubmatrixIndices_modular_double (const double p,
	const size_t M, const size_t N,
	double * A,
	const size_t lda,
	size_t** rowindices,
	size_t** colindices,
	size_t* R
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ColRankProfileSubmatrixIndices(F,M,N,A,lda,*rowindices,*colindices,*R);
	} else {
		ModularBalanced F(p);
		return ColRankProfileSubmatrixIndices(F,M,N,A,lda,*rowindices,*colindices,*R);
	}
}

/* Forwards to RowRankProfileSubmatrix(F,M,N,A,lda,*X,*R). */
size_t
RowRankProfileSubmatrix_modular_double (const double p,
	const size_t M, const size_t N,
	double * A,
	const size_t lda,
	double ** X, size_t* R
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return RowRankProfileSubmatrix(F,M,N,A,lda,*X,*R);
	} else {
		ModularBalanced F(p);
		return RowRankProfileSubmatrix(F,M,N,A,lda,*X,*R);
	}
}

/* Forwards to ColRankProfileSubmatrix(F,M,N,A,lda,*X,*R). */
size_t
ColRankProfileSubmatrix_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	double ** X, size_t* R
	, bool positive)
{
	if (positive) {
		Modular F(p);
		return ColRankProfileSubmatrix(F,M,N,A,lda,*X,*R);
	} else {
		ModularBalanced F(p);
		return ColRankProfileSubmatrix(F,M,N,A,lda,*X,*R);
	}
}

/*********************************************/
/* Accessors to Triangular and Echelon forms */
/*********************************************/

/* Out-of-place variant: calls the overload getTriangular(F,Uplo,Diag,M,N,R,A,lda,T,ldt,OnlyNonZeroVectors). */
void
getTriangular_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const enum FFLAS::FFLAS_DIAG Diag,
	const size_t M, const size_t N, const size_t R,
	const double * A, const size_t lda,
	double * T, const size_t ldt,
	const bool OnlyNonZeroVectors
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getTriangular(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,A,lda,T,ldt,OnlyNonZeroVectors);
	} else {
		ModularBalanced F(p);
		getTriangular(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,A,lda,T,ldt,OnlyNonZeroVectors);
	}
}

/*
 * Accessor wrappers below build the field F from p (`Modular` if `positive`,
 * `ModularBalanced` otherwise) and return nothing.  The `...in` names call the
 * shorter overload of the same routine, which receives only A/lda (no T/ldt).
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm against the original.
 */

/* Calls the overload getTriangular(F,Uplo,Diag,M,N,R,A,lda) on a non-const A. */
void
getTriangularin_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const enum FFLAS::FFLAS_DIAG Diag,
	const size_t M, const size_t N, const size_t R,
	double * A, const size_t lda
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getTriangular(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,A,lda);
	} else {
		ModularBalanced F(p);
		getTriangular(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,A,lda);
	}
}

/* Calls getEchelonForm(F,Uplo,Diag,M,N,R,P,A,lda,T,ldt,OnlyNonZeroVectors,LuTag); LuTag is cast to FFPACK::FFPACK_LU_TAG. */
void
getEchelonForm_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const enum FFLAS::FFLAS_DIAG Diag,
	const size_t M, const size_t N, const size_t R, const size_t* P,
	const double * A, const size_t lda,
	double * T, const size_t ldt,
	const bool OnlyNonZeroVectors,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,P,A,lda,T,ldt,OnlyNonZeroVectors,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		getEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,P,A,lda,T,ldt,OnlyNonZeroVectors,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Calls the overload getEchelonForm(F,Uplo,Diag,M,N,R,P,A,lda,LuTag) on a non-const A. */
void
getEchelonFormin_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const enum FFLAS::FFLAS_DIAG Diag,
	const size_t M, const size_t N, const size_t R, const size_t* P,
	double * A, const size_t lda,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,P,A,lda,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		getEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum
			FFLAS::FFLAS_DIAG)Diag,M,N,R,P,A,lda,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/*
 * Accessor wrappers below build the field F from p (`Modular` if `positive`,
 * `ModularBalanced` otherwise), cast Uplo/Diag/LuTag to the FFLAS/FFPACK enum
 * types, forward the call and return nothing.
 * NOTE(review): `Modular` / `ModularBalanced` carry no template argument list
 * in this text; presumably lost in extraction -- confirm against the original.
 */

/* Calls getEchelonTransform(F,Uplo,Diag,M,N,R,P,Q,A,lda,T,ldt,LuTag). */
void
getEchelonTransform_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const enum FFLAS::FFLAS_DIAG Diag,
	const size_t M, const size_t N, const size_t R, const size_t* P, const size_t* Q,
	const double * A, const size_t lda,
	double * T, const size_t ldt,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getEchelonTransform(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,P,Q,A,lda,T,ldt,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		getEchelonTransform(F,(enum FFLAS::FFLAS_UPLO)Uplo,(enum FFLAS::FFLAS_DIAG)Diag,M,N,R,P,Q,A,lda,T,ldt,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Calls getReducedEchelonForm(F,Uplo,M,N,R,P,A,lda,T,ldt,OnlyNonZeroVectors,LuTag); takes no Diag. */
void
getReducedEchelonForm_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const size_t M, const size_t N, const size_t R, const size_t* P,
	const double * A, const size_t lda,
	double * T, const size_t ldt,
	const bool OnlyNonZeroVectors,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getReducedEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,M,N,R,P,A,lda,T,ldt,OnlyNonZeroVectors,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		getReducedEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,M,N,R,P,A,lda,T,ldt,OnlyNonZeroVectors,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Calls the shorter overload getReducedEchelonForm(F,Uplo,M,N,R,P,A,lda,LuTag) on a non-const A. */
void
getReducedEchelonFormin_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const size_t M, const size_t N, const size_t R, const size_t* P,
	double * A, const size_t lda,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getReducedEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,M,N,R,P,A,lda,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		getReducedEchelonForm(F,(enum FFLAS::FFLAS_UPLO)Uplo,M,N,R,P,A,lda,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/* Calls getReducedEchelonTransform(F,Uplo,M,N,R,P,Q,A,lda,T,ldt,LuTag). */
void
getReducedEchelonTransform_modular_double (const double p, const enum FFLAS::FFLAS_UPLO Uplo,
	const size_t
	M, const size_t N, const size_t R, const size_t* P, const size_t* Q,
	const double * A, const size_t lda,
	double * T, const size_t ldt,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive)
{
	if (positive) {
		Modular F(p);
		getReducedEchelonTransform(F,(enum FFLAS::FFLAS_UPLO)Uplo,M,N,R,P,Q,A,lda,T,ldt,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	} else {
		ModularBalanced F(p);
		getReducedEchelonTransform(F,(enum FFLAS::FFLAS_UPLO)Uplo,M,N,R,P,Q,A,lda,T,ldt,(enum FFPACK::FFPACK_LU_TAG)LuTag);
	}
}

/*
 * Field-independent: takes neither `p` nor `positive` and forwards its four
 * arguments unchanged to FFPACK::PLUQtoEchelonPermutation.
 */
void
PLUQtoEchelonPermutation (const size_t N, const size_t R, const size_t * P, size_t * outPerm)
{
	FFPACK::PLUQtoEchelonPermutation(N,R,P,outPerm);
}
fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/ffpack_c.h000066400000000000000000000462021274716147400233650ustar00rootroot00000000000000
/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s */
/*
 * Copyright (C) 2015 FFLAS-FFPACK
 *
 * Written by Brice Boyer (briceboyer)
 *
 *
 * ========LICENCE========
 * This file is part of the library FFLAS-FFPACK.
 *
 * FFLAS-FFPACK is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 * ========LICENCE========
 *.
 */

/** @file ffpack_c.h
 * @author Brice Boyer
 * @brief C function calls for FFPACK
 * @see ffpack/ffpack.h
 */

#ifndef __FFLASFFPACK_interfaces_libs_ffpack_c_H
#define __FFLASFFPACK_interfaces_libs_ffpack_c_H
//#include "fflas-ffpack/fflas-ffpack-config.h"

#ifndef FFPACK_COMPILED
#define FFPACK_COMPILED
#endif

/* NOTE(review): the three include targets below are missing in this text;
 * presumably angle-bracket header names were lost in extraction -- restore
 * them from the original source. */
#include
#include
#include

#ifdef __cplusplus
extern "C" {
#endif

/* The FFLAS_C_* enums are defined here only when the guard macro
 * __FFLASFFPACK_interfaces_libs_fflas_c_H is not already defined. */
#ifndef __FFLASFFPACK_interfaces_libs_fflas_c_H
enum FFLAS_C_ORDER
{
	FflasRowMajor=101,
	FflasColMajor=102
};
enum FFLAS_C_TRANSPOSE
{
	FflasNoTrans = 111,
	FflasTrans = 112
};
enum FFLAS_C_UPLO
{
	FflasUpper = 121,
	FflasLower = 122
};
enum FFLAS_C_DIAG
{
	FflasNonUnit = 131,
	FflasUnit = 132
};
enum FFLAS_C_SIDE
{
	FflasLeft = 141,
	FflasRight = 142
};
#endif // __FFLASFFPACK_interfaces_libs_fflas_c_H

/* C-side tag enums; the C++ implementation casts FFPACK_C_LU_TAG values
 * directly to FFPACK::FFPACK_LU_TAG.
 * NOTE(review): that cast assumes both enums use the same numeric values --
 * confirm against ffpack/ffpack.h. */
enum FFPACK_C_LU_TAG
{
	FfpackSlabRecursive = 1,
	FfpackTileRecursive = 2,
	FfpackSingular = 3
};

enum FFPACK_C_CHARPOLY_TAG
{
	FfpackLUK=1,
	FfpackKG=2,
	FfpackHybrid=3,
	FfpackKGFast=4,
	FfpackDanilevski=5,
	FfpackArithProg=6,
	FfpackKGFastG=7
};

enum FFPACK_C_MINPOLY_TAG
{
	FfpackDense=1,
	FfpackKGF=2
};

/*****************/
/* PERMUTATIONS */
/*****************/

/* Declarations without a modulus `p` take no `positive` flag.
 * NOTE(review): for the `_modular_double` entry points declared here,
 * `positive` presumably selects the field representation as in the other
 * wrappers of this interface -- their definitions are not shown here; confirm. */

void LAPACKPerm2MathPerm (size_t * MathP, const size_t * LapackP,
	const size_t N);

void MathPerm2LAPACKPerm (size_t * LapackP, const size_t * MathP,
	const size_t N);

void MatrixApplyS_modular_double (const double p, double * A, const size_t lda, const size_t width,
	const size_t M2,
	const size_t R1, const size_t R2,
	const size_t R3, const size_t R4
	, bool positive );

void PermApplyS_double (double * A, const size_t lda, const size_t width,
	const size_t M2,
	const size_t R1, const size_t R2,
	const size_t R3, const size_t R4);

void MatrixApplyT_modular_double (const double p, double * A, const size_t lda, const size_t width,
	const size_t N2,
	const size_t R1, const size_t R2,
	const size_t R3, const size_t R4
	, bool positive );

void PermApplyT_double (double * A, const size_t lda, const size_t width,
	const size_t N2,
	const size_t R1, const size_t R2,
	const
	size_t R3, const size_t R4);

void composePermutationsP (size_t * MathP,
	const size_t * P1,
	const size_t * P2,
	const size_t R, const size_t N);

void composePermutationsQ (size_t * MathP,
	const size_t * Q1,
	const size_t * Q2,
	const size_t R, const size_t N);

void cyclic_shift_mathPerm (size_t * P, const size_t s);

#if 0
template
void cyclic_shift_row_col(Base_t * A, size_t m, size_t n, size_t lda);
#endif

void cyclic_shift_row_modular_double(const double p, double * A, size_t m, size_t n, size_t lda
	, bool positive );

void cyclic_shift_col_modular_double(const double p, double * A, size_t m, size_t n, size_t lda
	, bool positive );

void applyP_modular_double( const double p,
	const enum FFLAS_C_SIDE Side,
	const enum FFLAS_C_TRANSPOSE Trans,
	const size_t M, const size_t ibeg, const size_t iend,
	double * A, const size_t lda, const size_t * P
	, bool positive );

/* fgetrs, fgesv */

void fgetrsin_modular_double (const double p,
	const enum FFLAS_C_SIDE Side,
	const size_t M, const size_t N, const size_t R,
	double * A, const size_t lda,
	const size_t *P, const size_t *Q,
	double * B, const size_t ldb,
	int * info
	, bool positive );

double * fgetrs_modular_double (const double p,
	const enum FFLAS_C_SIDE Side,
	const size_t M, const size_t N, const size_t NRHS, const size_t R,
	double * A, const size_t lda,
	const size_t *P, const size_t *Q,
	double * X, const size_t ldx,
	const double * B, const size_t ldb,
	int * info
	, bool positive );

size_t fgesvin_modular_double (const double p,
	const enum FFLAS_C_SIDE Side,
	const size_t M, const size_t N,
	double * A, const size_t lda,
	double * B, const size_t ldb,
	int * info
	, bool positive );

/* NOTE(review): unlike fgesvin_modular_double and the other `_modular_double`
 * declarations in this header, this prototype has no trailing `bool positive`
 * parameter.  Its definition is not shown here; check that the two agree and
 * add the parameter if the definition takes it. */
size_t fgesv_modular_double (const double p,
	const enum FFLAS_C_SIDE Side,
	const size_t M, const size_t N, const size_t NRHS,
	double * A, const size_t lda,
	double * X, const size_t ldx,
	const double * B, const size_t ldb,
	int * info);

/* ftrtr */

void ftrtri_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_DIAG Diag,
	const
	size_t N, double * A, const size_t lda
	, bool positive );

void trinv_left_modular_double( const double p, const size_t N, const double * L, const size_t ldl,
	double * X, const size_t ldx
	, bool positive );

void ftrtrm_modular_double (const double p, const enum FFLAS_C_DIAG diag, const size_t N,
	double * A, const size_t lda
	, bool positive );

/* PLUQ */

size_t PLUQ_modular_double (const double p, const enum FFLAS_C_DIAG Diag,
	const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t*P, size_t *Q
	, bool positive );

size_t LUdivine_modular_double (const double p, const enum FFLAS_C_DIAG Diag,
	const enum FFLAS_C_TRANSPOSE trans,
	const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Qt,
	const enum FFPACK_C_LU_TAG LuTag,
	const size_t cutoff
	, bool positive );

/* NOTE(review): in the implementation text accompanying this header,
 * LUdivine_small_modular_double and LUdivine_gauss_modular_double appear only
 * as prototypes inside an `#if 0` block (and without `positive`).  Confirm a
 * definition exists; otherwise any call to these two will fail to link. */
size_t LUdivine_small_modular_double (const double p, const enum FFLAS_C_DIAG Diag,
	const enum FFLAS_C_TRANSPOSE trans,
	const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Q,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t LUdivine_gauss_modular_double (const double p, const enum FFLAS_C_DIAG Diag,
	const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Q,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

/*****************/
/* ECHELON FORMS */
/*****************/

/* In the definitions of the echelon-form entry points, `positive` selects the
 * field built from p: `Modular` when true, `ModularBalanced` when false. */

size_t ColumnEchelonForm_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Qt, bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t RowEchelonForm_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ColumnEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t
RowEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

/* Echelon-form entry points for the remaining element types.  In their
 * definitions, `positive` selects the field built from p: `Modular` when
 * true, `ModularBalanced` when false. */

size_t ColumnEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t RowEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ReducedColumnEchelonForm_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ReducedRowEchelonForm_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ReducedColumnEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ReducedRowEchelonForm_modular_float (const float p, const size_t M, const size_t N,
	float * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ReducedColumnEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ReducedRowEchelonForm_modular_int32_t (const int32_t p, const size_t M, const size_t N,
	int32_t * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t
ReducedRowEchelonForm2_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t* P, size_t* Qt, const bool transform
	, bool positive );

size_t REF_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	const size_t colbeg, const size_t rowbeg, const size_t colsize,
	size_t* Qt, size_t* P
	, bool positive );

/*****************/
/* INVERSION */
/*****************/

/* The definitions of the three inversion entry points dereference `nullity`
 * unconditionally, so callers must pass a valid pointer. */

double * Invertin_modular_double (const double p, const size_t M,
	double * A, const size_t lda,
	int * nullity
	, bool positive );

double * Invert_modular_double (const double p, const size_t M,
	const double * A, const size_t lda,
	double * X, const size_t ldx,
	int* nullity
	, bool positive );

double * Invert2_modular_double( const double p, const size_t M,
	double * A, const size_t lda,
	double * X, const size_t ldx,
	int* nullity
	, bool positive );

/*****************************/
/* CHARACTERISTIC POLYNOMIAL */
/*****************************/

#if 0 /* not for now */
template
std::list& CharPoly( const double p, std::list& charp, const size_t N,
	double * A, const size_t lda,
	const enum FFPACK_C_CHARPOLY_TAG CharpTag= FfpackArithProg);

template
Polynomial & mulpoly_modular_double(const double p, Polynomial &res, const Polynomial & P1, const Polynomial & P2);

template
Polynomial& CharPoly_modular_double( const double p, Polynomial& charp, const size_t N,
	double * A, const size_t lda,
	const enum FFPACK_C_CHARPOLY_TAG CharpTag= FfpackArithProg);

template
std::list& CharpolyArithProg_modular_double (const double p, std::list& frobeniusForm,
	const size_t N, double * A, const size_t lda, const size_t c);
#endif

/**********************/
/* MINIMAL POLYNOMIAL */
/**********************/

#if 0 /* not for now */
template
Polynomial& MinPoly_modular_double( const double p, Polynomial& minP, const size_t N,
	const double * A, const size_t lda,
	double * X, const size_t ldx, size_t* P,
	const enum FFPACK_C_MINPOLY_TAG MinTag=
FfpackDense, const size_t kg_mc=0, const size_t kg_mb=0, const size_t kg_j=0 ); #endif /* Krylov Elim */ size_t KrylovElim_modular_double( const double p, const size_t M, const size_t N, double * A, const size_t lda, size_t*P, size_t *Q, const size_t deg, size_t *iterates, size_t * inviterates, const size_t maxit,size_t virt , bool positive ); size_t SpecRankProfile_modular_double (const double p, const size_t M, const size_t N, double * A, const size_t lda, const size_t deg, size_t *rankProfile , bool positive ); /********/ /* RANK */ /********/ size_t Rank_modular_double( const double p, const size_t M, const size_t N, double * A, const size_t lda , bool positive ) ; /********/ /* DET */ /********/ bool IsSingular_modular_double( const double p, const size_t M, const size_t N, double * A, const size_t lda , bool positive ); double Det_modular_double( const double p, const size_t M, const size_t N, double * A, const size_t lda , bool positive ); /*********/ /* SOLVE */ /*********/ double * Solve_modular_double( const double p, const size_t M, double * A, const size_t lda, double * x, const int incx, const double * b, const int incb , bool positive ); void solveLB_modular_double( const double p, const enum FFLAS_C_SIDE Side, const size_t M, const size_t N, const size_t R, double * L, const size_t ldl, const size_t * Q, double * B, const size_t ldb ); void solveLB2_modular_double( const double p, const enum FFLAS_C_SIDE Side, const size_t M, const size_t N, const size_t R, double * L, const size_t ldl, const size_t * Q, double * B, const size_t ldb , bool positive ); /*************/ /* NULLSPACE */ /*************/ void RandomNullSpaceVector_modular_double (const double p, const enum FFLAS_C_SIDE Side, const size_t M, const size_t N, double * A, const size_t lda, double * X, const size_t incX , bool positive ); size_t NullSpaceBasis_modular_double (const double p, const enum FFLAS_C_SIDE Side, const size_t M, const size_t N, double * A, const size_t lda, double ** 
	NS, size_t* ldn, size_t * NSdim
	, bool positive );

/*****************/
/* RANK PROFILES */
/*****************/

/* In the definitions of the `_modular_double` entry points below, `positive`
 * selects the field built from p (`Modular` when true, `ModularBalanced` when
 * false), and every pointer-to-pointer argument as well as `R` is
 * dereferenced unconditionally, so callers must pass valid pointers.
 * RankProfileFromLU and LeadingSubmatrixRankProfiles take no modulus. */

size_t RowRankProfile_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t ** rkprofile,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

size_t ColumnRankProfile_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	size_t ** rkprofile,
	const enum FFPACK_C_LU_TAG LuTag
	, bool positive );

void RankProfileFromLU (const size_t* P, const size_t N, const size_t R,
	size_t* rkprofile, const enum FFPACK_C_LU_TAG LuTag);

size_t LeadingSubmatrixRankProfiles (const size_t M, const size_t N, const size_t R,
	const size_t LSm, const size_t LSn,
	const size_t* P, const size_t* Q,
	size_t* RRP, size_t* CRP);

size_t RowRankProfileSubmatrixIndices_modular_double (const double p,
	const size_t M, const size_t N,
	double * A,
	const size_t lda,
	size_t ** rowindices,
	size_t ** colindices,
	size_t * R
	, bool positive );

size_t ColRankProfileSubmatrixIndices_modular_double (const double p,
	const size_t M, const size_t N,
	double * A,
	const size_t lda,
	size_t** rowindices,
	size_t** colindices,
	size_t* R
	, bool positive );

size_t RowRankProfileSubmatrix_modular_double (const double p,
	const size_t M, const size_t N,
	double * A,
	const size_t lda,
	double ** X, size_t* R
	, bool positive );

size_t ColRankProfileSubmatrix_modular_double (const double p, const size_t M, const size_t N,
	double * A, const size_t lda,
	double ** X, size_t* R
	, bool positive );

/*********************************************/
/* Accessors to Triangular and Echelon forms */
/*********************************************/

void getTriangular_modular_double (const double p, const enum FFLAS_C_UPLO Uplo,
	const enum FFLAS_C_DIAG diag,
	const size_t M, const size_t N, const size_t R,
	const double * A, const size_t lda,
	double * T, const size_t ldt,
	const bool OnlyNonZeroVectors
	, bool positive );

void getTriangularin_modular_double
(const double p, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_DIAG diag, const size_t M, const size_t N, const size_t R, double * A, const size_t lda , bool positive ); void getEchelonForm_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, const double * A, const size_t lda, double * T, const size_t ldt, const bool OnlyNonZeroVectors, const enum FFPACK_C_LU_TAG LuTag , bool positive ); void getEchelonFormin_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, double * A, const size_t lda, const enum FFPACK_C_LU_TAG LuTag , bool positive ); void getEchelonTransform_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const enum FFLAS_C_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, const size_t* Q, const double * A, const size_t lda, double * T, const size_t ldt, const enum FFPACK_C_LU_TAG LuTag , bool positive ); void getReducedEchelonForm_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, const double * A, const size_t lda, double * T, const size_t ldt, const bool OnlyNonZeroVectors, const enum FFPACK_C_LU_TAG LuTag , bool positive ); void getReducedEchelonFormin_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, double * A, const size_t lda, const enum FFPACK_C_LU_TAG LuTag , bool positive ); void getReducedEchelonTransform_modular_double (const double p, const enum FFLAS_C_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, const size_t* Q, const double * A, const size_t lda, double * T, const size_t ldt, const enum FFPACK_C_LU_TAG LuTag , bool positive ); void PLUQtoEchelonPermutation (const size_t N, const size_t R, const size_t * P, size_t 
* outPerm); #ifdef __cplusplus } #endif #endif // __FFLASFFPACK_interfaces_libs_ffpack_c_H fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/ffpack_compiled_spec.inl000066400000000000000000000035201274716147400263000ustar00rootroot00000000000000#include "givaro//modular-balanced.h" #include "givaro//modular.h" #include "fflas-ffpack/ffpack/ffpack.h" #define PASTER(x,y) x ## _ ## y #define EVALUATOR(x,y) PASTER(x,y) #define NAME(fun) EVALUATOR(fun, FFLAS_TYPE) #if FFLAS_FIELD == Modular #define FFLAS_POSITIVE true #else #define FFLAS_POSITIVE false #endif namespace FFPACK{ template <> size_t ColumnEchelonForm (const Givaro::FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_TYPE* A, const size_t lda, size_t* P, size_t* Qt, bool transform, const FFPACK::FFPACK_LU_TAG LuTag){ return NAME(ColumnEchelonForm_modular) (F.cardinality(), M, N, A, lda, P, Qt, transform, LuTag, FFLAS_POSITIVE); } template <> size_t RowEchelonForm (const Givaro::FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_TYPE* A, const size_t lda, size_t* P, size_t* Qt, bool transform, const FFPACK::FFPACK_LU_TAG LuTag){ return NAME(RowEchelonForm_modular) (F.cardinality(), M, N, A, lda, P, Qt, transform, LuTag, FFLAS_POSITIVE); } template <> size_t ReducedColumnEchelonForm (const Givaro::FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_TYPE* A, const size_t lda, size_t* P, size_t* Qt, bool transform, const FFPACK::FFPACK_LU_TAG LuTag){ return NAME(ReducedColumnEchelonForm_modular) (F.cardinality(), M, N, A, lda, P, Qt, transform, LuTag, FFLAS_POSITIVE); } template <> size_t ReducedRowEchelonForm (const Givaro::FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_TYPE* A, const size_t lda, size_t* P, size_t* Qt, bool transform, const FFPACK::FFPACK_LU_TAG LuTag){ return NAME(ReducedRowEchelonForm_modular) (F.cardinality(), M, N, A, lda, P, Qt, transform, LuTag, FFLAS_POSITIVE); } } #undef FFLAS_POSITIVE #undef PASTER #undef EVALUATOR #undef NAME 
fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/ffpack_inst.C000066400000000000000000000041251274716147400240510ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack_inst.C * Copyright (C) 2015 FFLAS-FFPACK group * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFPACK_INST_C #define __FFPACK_INST_C // The ffpack lib should link to the compiled fflas lib #ifndef FFLAS_COMPILED #define FFLAS_COMPILED #endif #include "fflas-ffpack/fflas-ffpack-config.h" #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "ffpack.h" // This is a C file: we do template instantiations #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif // __FFPACK_INST_C fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/ffpack_inst.h000066400000000000000000000040721274716147400241170ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack_inst.h * Copyright (C) 2015 FFLAS-FFPACK group * Written by Clement Pernet * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFPACK_INST_H #define __FFPACK_INST_H // The ffpack lib should link to the compiled fflas lib #ifndef FFLAS_COMPILED #define FFLAS_COMPILED #endif #include "givaro/modular.h" #include "givaro/modular-balanced.h" #include "fflas-ffpack/ffpack/ffpack.h" // This is a H file: we do template declarations #ifdef INST_OR_DECL #undef INST_OR_DECL #endif #define INST_OR_DECL <> #define FFLAS_FIELD Givaro::ModularBalanced #define FFLAS_ELT double #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #define FFLAS_FIELD Givaro::Modular #define FFLAS_ELT double #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT float #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #define FFLAS_ELT int32_t #include "ffpack_inst_implem.inl" #undef FFLAS_ELT #undef FFLAS_FIELD #endif //__FFPACK_INST_H fflas-ffpack-2.2.2/fflas-ffpack/interfaces/libs/ffpack_inst_implem.inl000066400000000000000000000404541274716147400260210ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* ffpack_inst_implem.inl * Copyright (C) 2005 Clement Pernet * 2014 FFLAS-FFPACK group * 2015 FFLAS-FFPACK group * Written by Clement Pernet * Brice Boyer (briceboyer) * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ namespace FFPACK { template INST_OR_DECL void MatrixApplyS (const FFLAS_FIELD& F, FFLAS_ELT* A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4); template INST_OR_DECL void MatrixApplyT (const FFLAS_FIELD& F, FFLAS_ELT* A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4); void composePermutationsP (size_t * MathP, const size_t * P1, const size_t * P2, const size_t R, const size_t N); void composePermutationsQ (size_t * MathP, const size_t * Q1, const size_t * Q2, const size_t R, const size_t N); void cyclic_shift_mathPerm (size_t * P, const size_t s); template void cyclic_shift_row_col(Base_t * A, size_t m, size_t n, size_t lda); template INST_OR_DECL void cyclic_shift_row(const FFLAS_FIELD& F, FFLAS_ELT* A, size_t m, size_t n, size_t lda); template INST_OR_DECL void cyclic_shift_col(const FFLAS_FIELD& F, FFLAS_ELT* A, size_t m, size_t n, size_t lda); template INST_OR_DECL void applyP( const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t M, const size_t ibeg, const size_t iend, FFLAS_ELT* A, const size_t lda, const size_t * P ); 
template INST_OR_DECL void papplyP( const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_TRANSPOSE Trans, const size_t m, const size_t ibeg, const size_t iend, FFLAS_ELT* A, const size_t lda, const size_t * P ); template INST_OR_DECL void pMatrixApplyT (const FFLAS_FIELD& F, FFLAS_ELT* A, const size_t lda, const size_t width, const size_t N2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) ; template INST_OR_DECL void pMatrixApplyS (const FFLAS_FIELD& F, FFLAS_ELT* A, const size_t lda, const size_t width, const size_t M2, const size_t R1, const size_t R2, const size_t R3, const size_t R4) ; template INST_OR_DECL size_t pPLUQ(const FFLAS_FIELD& Fi, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Q, int nt); template INST_OR_DECL void fgetrs (const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, FFLAS_ELT* A, const size_t lda, const size_t *P, const size_t *Q, FFLAS_ELT* B, const size_t ldb, int * info); template INST_OR_DECL FFLAS_ELT* fgetrs (const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t NRHS, const size_t R, FFLAS_ELT* A, const size_t lda, const size_t *P, const size_t *Q, FFLAS_ELT* X, const size_t ldx, const FFLAS_ELT* B, const size_t ldb, int * info); template INST_OR_DECL size_t fgesv (const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, FFLAS_ELT* B, const size_t ldb, int * info); template INST_OR_DECL size_t fgesv (const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t NRHS, FFLAS_ELT* A, const size_t lda, FFLAS_ELT* X, const size_t ldx, const FFLAS_ELT* B, const size_t ldb, int * info); template INST_OR_DECL void ftrtri (const FFLAS_FIELD& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG Diag, const size_t N, FFLAS_ELT* A, const size_t lda); template 
INST_OR_DECL void trinv_left( const FFLAS_FIELD& F, const size_t N, const FFLAS_ELT* L, const size_t ldl, FFLAS_ELT* X, const size_t ldx ); template INST_OR_DECL void ftrtrm (const FFLAS_FIELD& F, const FFLAS::FFLAS_DIAG diag, const size_t N, FFLAS_ELT* A, const size_t lda); template INST_OR_DECL size_t PLUQ (const FFLAS_FIELD& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t*P, size_t *Q); template INST_OR_DECL size_t LUdivine (const FFLAS_FIELD& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Qt, const FFPACK_LU_TAG LuTag, const size_t cutoff); template INST_OR_DECL size_t LUdivine_small (const FFLAS_FIELD& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Q, const FFPACK_LU_TAG LuTag); template INST_OR_DECL size_t LUdivine_gauss (const FFLAS_FIELD& F, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Q, const FFPACK_LU_TAG LuTag); template INST_OR_DECL size_t RowEchelonForm (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Qt, const bool transform, const FFPACK_LU_TAG LuTag); template INST_OR_DECL size_t ReducedRowEchelonForm (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Qt, const bool transform, const FFPACK_LU_TAG LuTag); template INST_OR_DECL size_t ColumnEchelonForm (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Qt, const bool transform, const FFPACK_LU_TAG LuTag); template INST_OR_DECL size_t ReducedColumnEchelonForm (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* P, size_t* Qt, const bool transform, const FFPACK_LU_TAG 
LuTag); template INST_OR_DECL FFLAS_ELT* Invert (const FFLAS_FIELD& F, const size_t M, FFLAS_ELT* A, const size_t lda, int& nullity); template INST_OR_DECL FFLAS_ELT* Invert (const FFLAS_FIELD& F, const size_t M, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* X, const size_t ldx, int& nullity); template INST_OR_DECL FFLAS_ELT* Invert2( const FFLAS_FIELD& F, const size_t M, FFLAS_ELT* A, const size_t lda, FFLAS_ELT* X, const size_t ldx, int& nullity); template INST_OR_DECL std::list >& CharPoly (const FFLAS_FIELD& F, std::list >& charp, const size_t N, FFLAS_ELT* A, const size_t lda, const FFPACK_CHARPOLY_TAG CharpTag); template INST_OR_DECL std::vector & mulpoly(const FFLAS_FIELD& F, std::vector &res, const std::vector & P1, const std::vector & P2); template INST_OR_DECL std::vector& CharPoly( const FFLAS_FIELD& F, std::vector& charp, const size_t N, FFLAS_ELT* A, const size_t lda, const FFPACK_CHARPOLY_TAG CharpTag); template INST_OR_DECL std::list>& CharpolyArithProg (const FFLAS_FIELD& F, std::list>& frobeniusForm, const size_t N, FFLAS_ELT* A, const size_t lda, const size_t c); template INST_OR_DECL std::vector& MinPoly( const FFLAS_FIELD& F, std::vector& minP, const size_t N, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* X, const size_t ldx, size_t* P, const FFPACK::FFPACK_MINPOLY_TAG MinTag, const size_t kg_mc, const size_t kg_mb, const size_t kg_j ); template INST_OR_DECL size_t KrylovElim( const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t*P, size_t *Q, const size_t deg, size_t *iterates, size_t * inviterates, const size_t maxit,size_t virt); template INST_OR_DECL size_t SpecRankProfile (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, const size_t deg, size_t *rankProfile); template INST_OR_DECL size_t Rank (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda); template INST_OR_DECL bool IsSingular (const FFLAS_FIELD& F, const size_t M, 
const size_t N, FFLAS_ELT* A, const size_t lda); template INST_OR_DECL FFLAS_ELT Det (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda); template INST_OR_DECL FFLAS_ELT* Solve( const FFLAS_FIELD& F, const size_t M, FFLAS_ELT* A, const size_t lda, FFLAS_ELT* x, const int incx, const FFLAS_ELT* b, const int incb ); template INST_OR_DECL void solveLB( const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, FFLAS_ELT* L, const size_t ldl, const size_t * Q, FFLAS_ELT* B, const size_t ldb ); template INST_OR_DECL void solveLB2( const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, const size_t R, FFLAS_ELT* L, const size_t ldl, const size_t * Q, FFLAS_ELT* B, const size_t ldb ); template INST_OR_DECL void RandomNullSpaceVector (const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, FFLAS_ELT* X, const size_t incX); template INST_OR_DECL size_t NullSpaceBasis (const FFLAS_FIELD& F, const FFLAS::FFLAS_SIDE Side, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, FFLAS_ELT*& NS, size_t& ldn, size_t& NSdim); template INST_OR_DECL size_t RowRankProfile (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* &rkprofile, const FFPACK_LU_TAG LuTag); template INST_OR_DECL size_t ColumnRankProfile (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t* &rkprofile, const FFPACK_LU_TAG LuTag); void RankProfileFromLU (const size_t* P, const size_t N, const size_t R, size_t* rkprofile, const FFPACK_LU_TAG LuTag); size_t LeadingSubmatrixRankProfiles (const size_t M, const size_t N, const size_t R, const size_t LSm, const size_t LSn, const size_t* P, const size_t* Q, size_t* RRP, size_t* CRP); template INST_OR_DECL size_t RowRankProfileSubmatrixIndices (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const 
size_t lda, size_t*& rowindices, size_t*& colindices, size_t& R); template INST_OR_DECL size_t ColRankProfileSubmatrixIndices (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, size_t*& rowindices, size_t*& colindices, size_t& R); template INST_OR_DECL size_t RowRankProfileSubmatrix (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, FFLAS_ELT*& X, size_t& R); template INST_OR_DECL size_t ColRankProfileSubmatrix (const FFLAS_FIELD& F, const size_t M, const size_t N, FFLAS_ELT* A, const size_t lda, FFLAS_ELT*& X, size_t& R); template INST_OR_DECL void getTriangular > (const FFLAS_FIELD & F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* T, const size_t ldt, const bool OnlyNonZeroVectors); template INST_OR_DECL void getTriangular >(const FFLAS_FIELD& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, FFLAS_ELT* A, const size_t lda); template INST_OR_DECL void getEchelonForm > (const FFLAS_FIELD& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* T, const size_t ldt, const bool OnlyNonZeroVectors, const FFPACK_LU_TAG LuTag); template INST_OR_DECL void getEchelonForm > (const FFLAS_FIELD& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, FFLAS_ELT* A, const size_t lda, const FFPACK_LU_TAG LuTag); template INST_OR_DECL void getEchelonTransform > (const FFLAS_FIELD& F, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_DIAG diag, const size_t M, const size_t N, const size_t R, const size_t* P, const size_t* Q, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* T, const size_t ldt, const FFPACK_LU_TAG LuTag); template INST_OR_DECL void 
getReducedEchelonForm > (const FFLAS_FIELD & F, const FFLAS::FFLAS_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* T, const size_t ldt, const bool OnlyNonZeroVectors, const FFPACK_LU_TAG LuTag); template INST_OR_DECL void getReducedEchelonForm > (const FFLAS_FIELD& F, const FFLAS::FFLAS_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, FFLAS_ELT* A, const size_t lda, const FFPACK_LU_TAG LuTag); template INST_OR_DECL void getReducedEchelonTransform > (const FFLAS_FIELD& F, const FFLAS::FFLAS_UPLO Uplo, const size_t M, const size_t N, const size_t R, const size_t* P, const size_t* Q, const FFLAS_ELT* A, const size_t lda, FFLAS_ELT* T, const size_t ldt, const FFPACK_LU_TAG LuTag); void PLUQtoEchelonPermutation (const size_t N, const size_t R, const size_t * P, size_t * outPerm); template INST_OR_DECL FFLAS_ELT* LQUPtoInverseOfFullRankMinor( const FFLAS_FIELD& F, const size_t rank, FFLAS_ELT* A_factors, const size_t lda, const size_t* QtPointer, FFLAS_ELT* X, const size_t ldx); } // FFPACK fflas-ffpack-2.2.2/fflas-ffpack/paladin/000077500000000000000000000000001274716147400200105ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/paladin/Makefile.am000066400000000000000000000020041274716147400220400ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== pkgincludesubdir=$(pkgincludedir)/paladin pkgincludesub_HEADERS= fflas_pfinit.h \ blockcuts.inl \ pfgemm_variants.inl \ parallel.h \ kaapi_routines.inl fflas-ffpack-2.2.2/fflas-ffpack/paladin/blockcuts.inl000066400000000000000000000434631274716147400225170ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_bounds.inl * Copyright (C) 2013 Jean-Guillaume Dumas * * Written by Jean-Guillaume Dumas * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_fflas_blockcuts_INL #define __FFLASFFPACK_fflas_blockcuts_INL #include #include #define __FFLASFFPACK_MINBLOCKCUTS ((size_t)256) namespace FFLAS { // enum CuttingStrategy { // SINGLE , // ROW , // COLUMN , // BLOCK , // RECURSIVE // }; // enum StrategyParameter { // FIXED , // THREADS , // GRAIN , // TWO_D , // THREE_D_INPLACE , // THREE_D_ADAPT , // TWO_D_ADAPT , // THREE_D // }; namespace CuttingStrategy{ struct Single{}; struct Row{}; struct Column{}; struct Block{}; struct Recursive{}; } namespace StrategyParameter{ struct Fixed{}; struct Threads{}; struct Grain{}; struct TwoD{}; struct TwoDAdaptive{}; struct ThreeD{}; struct ThreeDInPlace{}; struct ThreeDAdaptive{}; } /*! ParSeqHelper for both fgemm and ftrsm */ /*! ParSeqHelper for both fgemm and ftrsm */ namespace ParSeqHelper { template struct Parallel{ typedef C Cut; typedef P Param; Parallel(size_t n=NUM_THREADS):_numthreads(n){} friend std::ostream& operator<<(std::ostream& out, const Parallel& p) { return out << "Parallel: " << p.numthreads(); } size_t numthreads() const { return _numthreads; } size_t& set_numthreads(size_t n) { return _numthreads=n; } // CuttingStrategy method() const { return _method; } // StrategyParameter strategy() const { return _param; } private: size_t _numthreads; // CuttingStrategy _method; // StrategyParameter _param; }; struct Sequential{ Sequential() {} template Sequential(Parallel& ) {} friend std::ostream& operator<<(std::ostream& out, const Sequential&) { return out << "Sequential"; } size_t numthreads() const { return 1; } // CuttingStrategy method() const { return SINGLE; } // // numthreads==1 ==> a single block // StrategyParameter strategy() const { return THREADS; } }; } template inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t numthreads); template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t numthreads) { 
assert(numthreads==1); RBLOCKSIZE = std::max(m,(size_t)1); CBLOCKSIZE = std::max(n,(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t numthreads) { RBLOCKSIZE = std::max(std::min(m,__FFLASFFPACK_MINBLOCKCUTS),(size_t)1); CBLOCKSIZE = std::max(n,(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t grainsize) { RBLOCKSIZE = std::max(std::min(m,grainsize),(size_t)1); CBLOCKSIZE = std::max(n,(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t grainsize) { RBLOCKSIZE = std::max(std::min(m,grainsize),(size_t)1); CBLOCKSIZE = std::max(std::min(n,grainsize),(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t numthreads) { RBLOCKSIZE = std::max(m,(size_t)1); CBLOCKSIZE = std::max(std::min(n,__FFLASFFPACK_MINBLOCKCUTS),(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t grainsize) { RBLOCKSIZE = std::max(m,(size_t)1); CBLOCKSIZE = std::max(std::min(n,grainsize),(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t numthreads) { RBLOCKSIZE = std::max(std::min(m,__FFLASFFPACK_MINBLOCKCUTS),(size_t)1); CBLOCKSIZE = std::max(std::min(n,__FFLASFFPACK_MINBLOCKCUTS),(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t numthreads) { RBLOCKSIZE = std::max(m/numthreads,(size_t)1); CBLOCKSIZE = std::max(n,(size_t)1); } template<> inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE, const size_t m, const size_t n, const size_t numthreads) { RBLOCKSIZE = std::max(m,(size_t)1); CBLOCKSIZE = std::max(n/numthreads,(size_t)1); } 
// 2D cut driven by the number of threads: the m x n iteration space is tiled
// by a grid of gridRows x gridCols blocks with gridRows * gridCols equal to
// the thread count, gridRows being the largest divisor of the thread count
// that does not exceed its square root (so the grid is as square as possible
// and gridRows <= gridCols).
//
// RBLOCKSIZE receives m / gridRows and CBLOCKSIZE receives n / gridCols, each
// clamped to at least 1.
//
// A thread count of 0 is treated as 1; the table lookup below would otherwise
// be indexed out of bounds.
//
// NOTE(review): the template argument list of this specialization is missing
// in this copy of the file; by its body it is presumably the Block/Threads
// variant -- confirm against upstream.
template<>
inline void BlockCuts(size_t& RBLOCKSIZE, size_t& CBLOCKSIZE,
		      const size_t m, const size_t n,
		      const size_t numthreads) {
	const size_t nt = std::max(numthreads, (size_t)1);
	size_t gridRows = 1;
	size_t gridCols = nt;

	if (nt<65) {
		//CP: Let's not compute these values all the time
		// Precomputed factorizations for 1..64 threads: entry t-1 holds the
		// grid for t threads. Static, so the tables are built only once.
		static const short maxtc[64] = {1,2,3,2,5,3,7,4,3,5,11,4,13,7,5,4,17,6,19,5,7,11,23,6,5,13,9,7,29,6,31,8,11,17,7,6,37,19,13,8,41,7,43,11,9,23,47,8,7,10,17,13,53,9,11,8,19,29,59,10,61,31,9,8};
		static const short maxtr[64] = {1,1,1,2,1,2,1,2,3,2,1,3,1,2,3,4,1,3,1,4,3,2,1,4,5,2,3,4,1,5,1,4,3,2,5,6,1,2,3,5,1,6,1,4,5,2,1,6,7,5,3,4,1,6,5,7,3,2,1,6,1,2,7,8};
		gridRows = (size_t)maxtr[nt-1];
		gridCols = (size_t)maxtc[nt-1];
	} else {
		// Search downwards from floor(sqrt(nt)) for the first divisor of nt;
		// i == 1 always divides, so the search always succeeds.
		const size_t maxt = (size_t)sqrt((double)nt);
		for (size_t i=maxt; i>=1; --i) {
			if (nt % i == 0) {
				gridRows = i;
				gridCols = nt / i;
				break;
			}
		}
	}

	RBLOCKSIZE = std::max(m/gridRows,(size_t)1);
	CBLOCKSIZE = std::max(n/gridCols,(size_t)1);
}

// inline void BlockCuts(size_t& r, size_t& c,
// size_t m, size_t n,
// const CuttingStrategy method,
// const StrategyParameter strategy,
// const size_t t) {
// switch(method) {
// case CuttingStrategy::Block:
// switch(strategy) {
// case StrategyParameter::Threads: BlockCuts(r,c,m,n,t); break;
// case StrategyParameter::Grain: BlockCuts(r,c,m,n,t); break;
// case StrategyParameter::Fixed: BlockCuts(r,c,m,n,t); break;
// default: BlockCuts(r,c,m,n,t);
// }
// break;
// case CuttingStrategy::Row:
// switch(strategy) {
// case StrategyParameter::Threads: BlockCuts(r,c,m,n,t); break;
// case StrategyParameter::Grain: BlockCuts(r,c,m,n,t); break;
// case StrategyParameter::Fixed: BlockCuts(r,c,m,n,t); break;
// default: BlockCuts(r,c,m,n,t);
// }
// break;
// case CuttingStrategy::Column:
// switch(strategy) {
// case StrategyParameter::Threads: BlockCuts(r,c,m,n,t); break;
// case StrategyParameter::Grain: BlockCuts(r,c,m,n,t); break;
// case
StrategyParameter::Fixed: BlockCuts(r,c,m,n,t); break; // default: BlockCuts(r,c,m,n,t); // } // break; // default: BlockCuts(r,c,m,n,t); // }; // } template inline void BlockCuts(size_t& rowBlockSize, size_t& colBlockSize, size_t& lastRBS, size_t& lastCBS, size_t& changeRBS, size_t& changeCBS, size_t& numRowBlock, size_t& numColBlock, size_t m, size_t n, // const CuttingStrategy method, // const StrategyParameter strategy, const size_t numthreads) { BlockCuts(rowBlockSize, colBlockSize, m, n, numthreads); numRowBlock = m/rowBlockSize; numColBlock = n/colBlockSize; changeRBS = m-rowBlockSize*numRowBlock; lastRBS = rowBlockSize; if (changeRBS) ++rowBlockSize; changeCBS = n-colBlockSize*numColBlock; lastCBS = colBlockSize; if (changeCBS) ++colBlockSize; /* // Better preserve numRowBlock and numColBlock if (lastRBS) { lastRBS = m-rowBlockSize*numRowBlock; ++rowBlockSize; } else lastRBS = rowBlockSize; if (lastCBS) { lastCBS = n-colBlockSize*numColBlock; ++colBlockSize; } else lastCBS = colBlockSize; */ // // Better preserve rowBlockSize and colBlockSize // lastRBS = m % rowBlockSize; // lastCBS = n % colBlockSize; // if (lastRBS) ++numRowBlock; else lastRBS = rowBlockSize; // if (lastCBS) ++numColBlock; else lastCBS = colBlockSize; } } namespace FFLAS { template struct ForStrategy1D { ForStrategy1D(const blocksize_t n, const ParSeqHelper::Parallel H) { build(n,H); } ForStrategy1D(const blocksize_t b, const blocksize_t e, const ParSeqHelper::Parallel H) { build(e-b,H); } void build(const blocksize_t n, const ParSeqHelper::Parallel H) { // std::cout<<"FS1D n : "<::value ) { numBlock = std::max((blocksize_t)(H.numthreads()),(blocksize_t)1); } else if ( Protected::AreEqual::value ) { numBlock = std::max(n/ (blocksize_t)(H.numthreads()), (blocksize_t)1); } else { numBlock = std::max(n/(blocksize_t)(__FFLASFFPACK_MINBLOCKCUTS),(blocksize_t)1); } firstBlockSize = n/numBlock; if (firstBlockSize<1) { firstBlockSize = (blocksize_t)1; numBlock = n; } changeBS = n - 
numBlock*firstBlockSize; lastBlockSize = firstBlockSize; if (changeBS) ++firstBlockSize; // std::cout<<"FS1D 1BLOCKSIZE : "< struct ForStrategy2D { ForStrategy2D(const blocksize_t m, const blocksize_t n, const ParSeqHelper::Parallel H) { BlockCuts(rowBlockSize, colBlockSize, lastRBS, lastCBS, changeRBS, changeCBS, numRowBlock, numColBlock, m, n, // H.method(), H.strategy(), H.numthreads()); BLOCKS = numRowBlock * numColBlock; } blocksize_t initialize() { _ibeg = 0; _iend = rowBlockSize; _jbeg = 0; _jend = colBlockSize; return current = 0; } bool isTerminated() const { return current == BLOCKS; } blocksize_t ibegin() const { return _ibeg; } blocksize_t jbegin() const { return _jbeg; } blocksize_t iend() const { return _iend; } blocksize_t jend() const { return _jend; } blocksize_t operator++() { ++current; blocksize_t icurr = current/numColBlock; blocksize_t jcurr = current%numColBlock; if (jcurr) { _jbeg = _jend; _jend += (jcurrs,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_pfinit.inl * Copyright (C) 2015 Jean Guillaume Dumas Clement Pernet Ziad Sultan * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/
#include "fflas-ffpack/paladin/parallel.h"

namespace FFLAS {

	// Parallel fzero: applies fzero to the m x n array C block by block.
	//
	// F  : field, forwarded to fzero
	// m,n: number of rows / columns of C
	// C  : array addressed row-major with leading dimension n (block
	//      (i,j) starts at C + i*n + j and is passed to fzero with ld n)
	// BS : requested block dimension; raised to at least
	//      Protected::WinogradThreshold(F) before use
	//
	// The iteration space is cut by FORBLOCK2D with the helper built by
	// SPLITTER(BS, Block, Grain), and one TASK is spawned per block covering
	// rows [ibegin, iend) and columns [jbegin, jend). SYNCH_GROUP delimits
	// the group of tasks (whether it waits depends on the back-end selected
	// in parallel.h).
	// NOTE(review): the template parameter list (declaring Field) is missing
	// in this copy of the file.
	template void pfzero(const Field& F,
			     size_t m, size_t n,
			     typename Field::Element_ptr C,
			     size_t BS=0) {
		using FFLAS::CuttingStrategy::Block;
		using FFLAS::StrategyParameter::Grain;
		BS=std::max(BS, (size_t)Protected::WinogradThreshold(F) );
		SYNCH_GROUP(
			FORBLOCK2D(iter, m, n, SPLITTER(BS, Block, Grain),
				   TASK(MODE(CONSTREFERENCE(F)),
					{
						fzero(F,
						      iter.iend()-iter.ibegin(),
						      iter.jend()-iter.jbegin(),
						      C+iter.ibegin()*n+iter.jbegin(),
						      n);
					}
					);
				   );
			);
	}

	// Parallel frand: applies frand with the generator G to the m x n array
	// C block by block.
	//
	// Same parameters, block cutting and addressing of C (row-major, leading
	// dimension n) as pfzero above; G is additionally forwarded to frand.
	// NOTE(review): every task receives the same G by reference
	// (CONSTREFERENCE(F,G)); whether concurrent use of G is safe cannot be
	// told from here -- confirm for the RandIter types in use.
	// NOTE(review): the template parameter list (declaring Field and
	// RandIter) is missing in this copy of the file.
	template void pfrand(const Field& F,
			     RandIter& G,
			     size_t m, size_t n,
			     typename Field::Element_ptr C,
			     size_t BS=0) {
		using FFLAS::CuttingStrategy::Block;
		using FFLAS::StrategyParameter::Grain;
		BS=std::max(BS, (size_t)Protected::WinogradThreshold(F) );
		SYNCH_GROUP(
			FORBLOCK2D(iter, m, n, SPLITTER(BS, Block, Grain),
				   TASK(MODE(CONSTREFERENCE(F,G)),
					{
						frand(F, G,
						      iter.iend()-iter.ibegin(),
						      iter.jend()-iter.jbegin(),
						      C+iter.ibegin()*n+iter.jbegin(),
						      n);
					}
					);
				   );
			);
	}

} // FFLAS fflas-ffpack-2.2.2/fflas-ffpack/paladin/kaapi_routines.inl000066400000000000000000000141411274716147400235320ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_pftrsm.inl * Copyright (C) 2013 Ziad Sultan * * Written by Ziad Sultan < Ziad.Sultan@imag.fr > * Time-stamp: <17 Jun 14 14:32:29 Jean-Guillaume.Dumas@imag.fr> * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_KAAPI_ROUTINES_INL #define __FFLASFFPACK_KAAPI_ROUTINES_INL #ifdef __FFLASFFPACK_USE_KAAPI namespace FFLAS { template struct Taskfgemm15 : public ka::Task<15>::Signature< Field, FFLAS_TRANSPOSE, FFLAS_TRANSPOSE, size_t , size_t , size_t , typename Field::Element, ka::R, size_t , ka::R, size_t , typename Field::Element, ka::RW, size_t , Helper //size_t // winograd >{}; /* template struct Taskfgemm14 : public ka::Task<14>::Signature< Field, FFLAS_TRANSPOSE, FFLAS_TRANSPOSE, size_t , size_t , size_t , typename Field::Element, ka::R, size_t , ka::R, size_t , typename Field::Element, ka::RW, size_t >{}; */ template struct Taskftrsm12: public ka::Task<12>::Signature< Field , /* Field F */ FFLAS::FFLAS_SIDE , FFLAS::FFLAS_UPLO , FFLAS::FFLAS_TRANSPOSE , FFLAS::FFLAS_DIAG , size_t , /* size : M */ size_t , /* size : N */ typename Field::Element , ka::R, /* Matrix A */ size_t , /* lda */ ka::RW, /* Matrix B */ size_t /* ldb */ >{}; template void spawnerfgemm(const Field& F, const FFLAS::FFLAS_TRANSPOSE ta, const FFLAS::FFLAS_TRANSPOSE tb, size_t BlockRowDim, size_t BlockColDim, size_t k, const typename Field::Element alpha, ka::pointer_r A, const size_t lda, ka::pointer_r B, const size_t ldb, const typename Field::Element beta, ka::pointer_rw C, const size_t ldc, Helper WH){ ka::Spawn >()( F, ta, tb, BlockRowDim, BlockColDim, k, alpha, A.ptr(), lda, B.ptr() , ldb, beta, C.ptr(), ldc, WH); } } template struct TaskBodyCPU >{ void operator()(const Field& F, const FFLAS::FFLAS_TRANSPOSE ta, const FFLAS::FFLAS_TRANSPOSE tb, size_t BlockRowDim, size_t BlockColDim, size_t k, const typename Field::Element alpha, ka::pointer_r A, const size_t lda, 
ka::pointer_r B, const size_t ldb, const typename Field::Element beta, ka::pointer_rw C, const size_t ldc, Helper WH // Helper & WH // size_t w ) { FFLAS::MMHelper::value> W(WH); /* FFLAS::MMHelper WH; WH(F,w);*/ FFLAS::fgemm( F, ta, tb, BlockRowDim, BlockColDim, k, alpha, A.ptr(), lda, B.ptr() , ldb, beta, C.ptr(), ldc, W); } }; /* template struct TaskBodyCPU >{ void operator()(const Field& F, const FFLAS::FFLAS_TRANSPOSE ta, const FFLAS::FFLAS_TRANSPOSE tb, size_t BlockRowDim, size_t BlockColDim, size_t k, const typename Field::Element alpha, ka::pointer_r A, const size_t lda, ka::pointer_r B, const size_t ldb, const typename Field::Element beta, ka::pointer_rw C, const size_t ldc) { FFLAS::fgemm( F, ta, tb, BlockRowDim, BlockColDim, k, alpha, A.ptr(), lda, B.ptr() , ldb, beta, C.ptr(), ldc); } }; */ template struct TaskBodyCPU > { void operator()(const Field & F, const FFLAS::FFLAS_SIDE Side, const FFLAS::FFLAS_UPLO Uplo, const FFLAS::FFLAS_TRANSPOSE TransA, const FFLAS::FFLAS_DIAG Diag, const size_t M, const size_t N, const typename Field::Element alpha, ka::pointer_r A, const size_t lda, ka::pointer_rw B, const size_t ldb ) { FFLAS::ftrsm(F, Side, Uplo, TransA, Diag, M, N, alpha, A.ptr(), lda, B.ptr(), ldb); } }; #endif #endif // __FFLASFFPACK_KAAPI_ROUTINES_INL fflas-ffpack-2.2.2/fflas-ffpack/paladin/parallel.h000066400000000000000000000516721274716147400217700ustar00rootroot00000000000000/* fflas/parallel.h * Copyright (C) 2013 Jean Guillaume Dumas Clement Pernet Ziad Sultan * * Written by Jean Guillaume Dumas Clement Pernet Ziad Sultan * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_fflas_parallel_H #define __FFLASFFPACK_fflas_parallel_H #include "fflas-ffpack/config.h" #ifndef __FFLASFFPACK_USE_OPENMP #define __FFLASFFPACK_SEQUENTIAL #else #include "omp.h" #endif #ifdef __FFLASFFPACK_SEQUENTIAL #undef __FFLASFFPACK_USE_OPENMP #undef __FFLASFFPACK_USE_TBB #undef __FFLASFFPACK_USE_KAAPI #elif defined (__FFLASFFPACK_USE_KAAPI) #undef __FFLASFFPACK_SEQUENTIAL #undef __FFLASFFPACK_USE_TBB #undef __FFLASFFPACK_USE_OPENMP #include "kaapi++" #include "fflas-ffpack/fflas/kaapi_routines.inl" #elif defined __FFLASFFPACK_USE_TBB #undef __FFLASFFPACK_SEQUENTIAL #undef __FFLASFFPACK_USE_OPENMP #undef __FFLASFFPACK_USE_KAAPI #include #include #include #include /* extern "C" { tbb::task_group g; } */ #ifdef __FFLASFFPACK_HAVE_MKL #ifndef _MKL_H_ // temporary #error "MKL (mkl.h) not present, while you have MKL enabled" #endif #undef index_t #define index_t MKL_INT #endif // __FFLASFFPACK_HAVE_MKL #endif #ifdef __FFLASFFPACK_FORCE_SEQ #undef __FFLASFFPACK_USE_OPENMP #undef __FFLASFFPACK_USE_KAAPI #undef __FFLASFFPACK_USE_TBB #define __FFLASFFPACK_SEQUENTIAL #endif #ifndef index_t #define index_t size_t #endif /*********************************************************/ /*********************** SEQUENTIAL***********************/ /*********************************************************/ #ifdef __FFLASFFPACK_SEQUENTIAL // MACRO for sequential execution // TASK is a function call #define TASK(M, I) {I;} #define WAIT #define CHECK_DEPENDENCIES #define 
BARRIER #define PAR_BLOCK #define SYNCH_GROUP(Args...) {{Args};} #define NUM_THREADS 1 #define MAX_THREADS 1 #define READ(Args...) #define WRITE(Args...) #define READWRITE(Args...) #define CONSTREFERENCE(...) #define VALUE(...) #define BEGIN_PARALLEL_MAIN(Args...) int main(Args) { #define END_PARALLEL_MAIN(void) return 0; } // for 1D with iterator control and range access through iter (strategy 1D) #define FORBLOCK1D(iter, m, Helper, Args...) \ { FFLAS::ForStrategy1D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param> iter(m, Helper); \ for(iter.initialize(); !iter.isTerminated(); ++iter) \ {Args;} } // for strategy 1D #define FOR1D(i, m, Helper, Args...) \ FORBLOCK1D(_internal_iterator, m, Helper, \ for(auto i=_internal_iterator.begin(); i!=_internal_iterator.end(); ++i) \ { Args; }) // PARFOR1D does normal execution of the loop #define PARFORBLOCK1D(iter, m, Helper, Args...) \ for(std::remove_const::type iter=0; iter::type iter=0; iter::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param> iter(m,n,Helper); \ for(iter.initialize(); !iter.isTerminated(); ++iter) \ { Args; } } // for strategy 2D #define FOR2D(i, j, m, n, Helper, Args...) \ FORBLOCK2D(_internal_iterator, m, n, Helper, \ for(auto i=_internal_iterator.ibegin(); i!=_internal_iterator.iend(); ++i) \ for(auto j=_internal_iterator.jbegin(); j!=_internal_iterator.jend(); ++j) \ { Args; }) // parallel for strategy 2D with access to the range and control of iterator #define PARFORBLOCK2D(iter, m, n, Helper, Args...) \ FORBLOCK2D(iter, m, n, Helper, Args) // parallel for strategy 2D #define PARFOR2D(i, j, m, n, Helper, Args...) \ FOR2D(i, j, m, n, Helper, Args) #endif // Macro for sequential /*********************************************************/ /************************* OPENMP ************************/ /*********************************************************/ #ifdef __FFLASFFPACK_USE_OPENMP //OpenMP macros #define PRAGMA_OMP_IMPL(Args...) 
_Pragma(#Args) #define TASK(M, I) \ PRAGMA_OMP_IMPL(omp task M) \ {I;} #define SYNCH_GROUP(Args...) {{Args};}\ WAIT; // macro omp taskwait (waits for all childs of current task) #define WAIT PRAGMA_OMP_IMPL(omp taskwait) #define GLOBALSHARED(a, Args...) shared(Args) #define CONSTREFERENCE(Args...) shared(Args) #define VALUE(Args...) firstprivate(Args) #define BARRIER PRAGMA_OMP_IMPL(omp barrier) //////////////////// CUTTING LOOP MACROS 1D ////////////////////// // for with iterator control and range access through iter (strategy 1D) // Warning: by default we assume that there is no dependency between each iteration, hence we pass an empty MODE() to the tasks. // TODO: add an optional MODE argument to the parameter list of FORBLOCK1D #define FORBLOCK1D(iter, m, Helper, Args...) \ { FFLAS::ForStrategy1D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param > iter(m, Helper); \ for(iter.initialize(); !iter.isTerminated(); ++iter){ {Args;} } } // for strategy 1D // WARNING: the inner code Args should not contain any coma outside parenthesis (e.g. declaration lists, and template param list) #define FOR1D(i, m, Helper, Args...) \ FORBLOCK1D(_internal_iterator, m, Helper, \ TASK( , \ {for(auto i=_internal_iterator.begin(); i!=_internal_iterator.end(); ++i) \ { Args; } });) \ WAIT; /* #define PARFORBLOCK1D(iter, m, Helper, Args...) \ { FFLAS::ForStrategy1D::type > iter(m, Helper); \ PRAGMA_OMP_IMPL(omp parallel for num_threads(iter.numblocks()) schedule(runtime)) \ for(iter.initialize(); !iter.isTerminated(); ++iter) \ {Args;} } */ //parallel for 1D with iterator control and range access cannot be done with openmp: syntax of openmp does not allow the use of the iterator syntax // Thus, PARFORBLOCK1D and PARFOR1D have the same implementation with no cutting. 
If using OpenMP the user can specify the cutting in runtime using the environmental variable: (see OpenMP spec for more details) // export OMP_SCHEDULE="DYNAMIC" // or export OMP_SCHEDULE="GUIDED,4" // or export OMP_SCHEDULE="STATIC" // or export OMP_SCHEDULE="AUTO" #define PARFORBLOCK1D(iter, m, Helper, Args...) \ { FFLAS::ForStrategy1D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param > OMPstrategyIterator(m, Helper); \ PRAGMA_OMP_IMPL(omp parallel for num_threads(OMPstrategyIterator.numblocks()) schedule(runtime)) \ for(std::remove_const::type iter=0; iter::type, typename decltype(h)::Cut, typename decltype(h)::Param > iter(m,n,h); \ for(iter.initialize(); !iter.isTerminated(); ++iter) \ {Args;} } // for strategy 2D // WARNING: the inner code Args should not contain any coma outside parenthesis (e.g. declaration lists, and template param list) #define FOR2D(i, j, m, n, Helper, Args...) \ FORBLOCK2D(_internal_iterator, m, n, Helper, \ TASK(, \ for(auto i=_internal_iterator.ibegin(); i!=_internal_iterator.iend(); ++i) \ for(auto j=_internal_iterator.jbegin(); j!=_internal_iterator.jend(); ++j) \ { Args; });) \ WAIT; // parallel for strategy 2D with access to the range and control of iterator // WARNING: This is not doable : OMP requires an iteration over an interval of ints. /* #define PARFORBLOCK2D(iter, m, n, Helper, Args...) \ * { FFLAS::ForStrategy2D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param > iter(m,n,Helper); \ * PRAGMA_OMP_IMPL(omp parallel for num_threads(iter.rownumblocks()*iter.colnumblocks()) schedule(runtime)) \ * for(iter.initialize(); !iter.isTerminated(); ++iter) \ * {Args;} } */ // parallel for strategy 2D /* #define PARFOR2D(i, j, m, n, Helper, Args...) 
\ * PARFORBLOCK2D(_internal_iterator, m, n, Helper, \ * for(auto i=_internal_iterator.ibegin(); i!=_internal_iterator.iend(); ++i) \ * for(auto j=_internal_iterator.jbegin(); j!=_internal_iterator.jend(); ++j) \ * { Args; }) */ // parallel region #define PAR_BLOCK PRAGMA_OMP_IMPL(omp parallel) \ PRAGMA_OMP_IMPL(omp single) // get the number of threads in the parallel region # define NUM_THREADS omp_get_num_threads() // get the number of threads specified with the global variable OMP_NUM_THREADS # define MAX_THREADS omp_get_max_threads() #define BEGIN_PARALLEL_MAIN(Args...) int main(Args) { #define END_PARALLEL_MAIN(void) return 0; } ////////////////////////////////////////////// /////////////// dataflow macros ////////////// #ifdef __FFLASFFPACK_USE_DATAFLOW // OMP dataflow synch DSL features #define READ(Args...) depend(in: Args) #define WRITE(Args...) depend(out: Args) #define READWRITE(Args...) depend(inout: Args) //computes dependencies (no wait here) #define CHECK_DEPENDENCIES #else // OPENMP3.1 (explicit synch mode) #define CHECK_DEPENDENCIES PRAGMA_OMP_IMPL(omp taskwait) #define READ(Args...) #define WRITE(Args...) #define READWRITE(Args...) 
#endif // end DATAFLOW FLAG /////////////////////////////////////////////// /////////////////////////////////////////////// #endif // OpenMP macros /*********************************************************/ /*************************** TBB ************************/ /*********************************************************/ #ifdef __FFLASFFPACK_USE_TBB // workaround to overload macro CONSTREFERENCE // CONSTREFERENCE macro /* #define REF1(a) =,&a */ /* #define REF2(a,b) =,&a, &b */ /* #define REF3(a,b,c) =,&a,&b,&c */ /* #define REF4(a,b,c,d) =,&a,&b,&c,&d */ /* #define REF5(a,b,c,d,e) =,&a,&b,&c,&d,&e */ /* #define REF6(a,b,c,d,e,f) =,&a,&b,&c,&d,&e,&f */ /* #define REF7(a,b,c,d,e,f,g) =,&a,&b,&c,&d,&e,&f,&g */ /* #define REF8(a,b,c,d,e,f,g,h) =,&a,&b,&c,&d,&e,&f,&g,&h */ /* #define REF9(a,b,c,d,e,f,g,h,i) =,&a,&b,&c,&d,&e,&f,&g,&h,&i */ /* #define REF10(a,b,c,d,e,f,g,h,i,enough) =,&a,&b,&c,&d,&e,&f,&g,&h,&i,&enough */ /* #define GET_REF(_1,_2,_3,_4,_5,_6,_7,_8,_9,_10, NAME,...) NAME */ /* #define CONSTREFERENCE(...) GET_REF(__VA_ARGS__, REF10,REF9,REF8,REF7,REF6,REF5,REF4,REF3,REF2,REF1)(__VA_ARGS__) */ #define REF1(a) ,&a #define REF2(a,b) ,&a, &b #define REF3(a,b,c) ,&a,&b,&c #define REF4(a,b,c,d) ,&a,&b,&c,&d #define REF5(a,b,c,d,e) ,&a,&b,&c,&d,&e #define REF6(a,b,c,d,e,f) ,&a,&b,&c,&d,&e,&f #define REF7(a,b,c,d,e,f,g) ,&a,&b,&c,&d,&e,&f,&g #define REF8(a,b,c,d,e,f,g,h) ,&a,&b,&c,&d,&e,&f,&g,&h #define REF9(a,b,c,d,e,f,g,h,i) ,&a,&b,&c,&d,&e,&f,&g,&h,&i #define REF10(a,b,c,d,e,f,g,h,i,enough) ,&a,&b,&c,&d,&e,&f,&g,&h,&i,&enough #define GET_REF(_1,_2,_3,_4,_5,_6,_7,_8,_9,_10, NAME,...) NAME #define CONSTREFERENCE(...) GET_REF(__VA_ARGS__, REF10,REF9,REF8,REF7,REF6,REF5,REF4,REF3,REF2,REF1)(__VA_ARGS__) // workaround to overload macro VALUE #define VAL1(a) ,a #define VAL2(a,b) ,a, b #define VAL3(a,b,c) ,a,b,c #define VAL4(a,b,c,d) ,a,b,c,d #define VAL5(a,b,c,d,e) ,a,b,c,d,e #define GET_VAL(_1,_2,_3,_4,_5, NAME,...) NAME #define VALUE(...) 
GET_VAL(__VA_ARGS__, VAL5,VAL4,VAL3,VAL2,VAL1)(__VA_ARGS__) // need task_group to lunch a group of tasks in parallel #define SYNCH_GROUP(Args...) \ {tbb::task_group g; \ {{Args};} \ g.wait();} // TBB task #define TASK(M, I) \ { \ g.run([=M](){I;}); \ } //#define MODE(Args...) Args #define WAIT g.wait() #define CHECK_DEPENDENCIES g.wait() #define BARRIER #define PAR_BLOCK #define NUM_THREADS tbb::task_scheduler_init::default_num_threads() #define MAX_THREADS tbb::task_scheduler_init::default_num_threads() #define READ(Args...) #define WRITE(Args...) #define READWRITE(Args...) #define BEGIN_PARALLEL_MAIN(Args...) int main(Args) { #define END_PARALLEL_MAIN(void) return 0; } #define CAPTURE(Args...) [Args] // for strategy 1D with access to the iterator #define FORBLOCK1D(iter, m, Helper, Args...) \ { FFLAS::ForStrategy1D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param > iter(m, Helper); \ for(iter.initialize(); !iter.isTerminated(); ++iter) \ {Args;} } // for strategy 1D #define FOR1D(i, m, Helper, Args...) \ FORBLOCK1D(_internal_iterator, m, Helper, \ for(auto i=_internal_iterator.begin(); i!=_internal_iterator.end(); ++i) \ { Args; } ) // tbb parallel for 1D #define PARFORBLOCK1D(iter, m, Helper, Args...) \ { FFLAS::ForStrategy1D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param> iter(m, Helper); \ tbb::parallel_for( \ tbb::blocked_range::type >(0, m, iter.blocksize() ), \ [=, &iter](const tbb::blocked_range::type > &iter) { \ {Args;} }); \ } // tbb parallel for 1D /* #define PARFOR1D(i, m, Helper, Args...) \ PARFORBLOCK1D(_internal_iterator, m, Helper, \ for(auto i=_internal_iterator.begin(); i!=_internal_iterator.end(); ++i) \ { Args; } ) */ #define PARFOR1D(i, m, Helper, Args...) 
\ { FFLAS::ForStrategy1D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param> TBBstrategyIterator(m, Helper); \ tbb::parallel_for( \ tbb::blocked_range::type >(0, m, TBBstrategyIterator.blocksize() ), \ [=](const tbb::blocked_range::type > &TBBblockrangeIterator) { \ for(auto i = TBBblockrangeIterator.begin(); \ i < TBBblockrangeIterator.end() ; ++i){ \ {Args;} }}); \ } // for strategy 2D with access to the iterator #define FORBLOCK2D(iter, m, n, Helper, Args...) \ { FFLAS::ForStrategy2D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param> iter(m,n,Helper); \ for(iter.initialize(); !iter.isTerminated(); ++iter) \ {Args;} } // for strategy 2D #define FOR2D(i, j, m, n, Helper, Args...) \ FORBLOCK2D(_internal_iterator, m, n, Helper, \ for(auto i=_internal_iterator.ibegin(); i!=_internal_iterator.iend(); ++i) \ for(auto j=_internal_iterator.jbegin(); j!=_internal_iterator.jend(); ++j) \ { Args; }) // parallel for strategy 2D with access to the range and control of iterator #define PARFORBLOCK2D(iter, m, n, Helper, Args...) \ { FFLAS::ForStrategy2D::type, typename decltype(Helper)::Cut, typename decltype(Helper)::Param> iter(m,n,Helper); \ tbb::parallel_for( \ tbb::blocked_range2d::type >(0, m, iter.rowblocksize(), 0, n, iter.colblocksize() ), \ [=, &i](const tbb::blocked_range2d::type > &iter) { \ {Args;} }); \ } // parallel for strategy 2D #define PARFOR2D(i, j, m, n, Helper, Args...) 
\ PARFORBLOCK2D(_internal_iterator, m, n, Helper, \ for(auto i=_internal_iterator.ibegin(); i!=_internal_iterator.iend(); ++i) \ for(auto j=_internal_iterator.jbegin(); j!=_internal_iterator.jend(); ++j) \ { Args; }) #endif // end TBB macros /*********************************************************/ /************************* KAAPI *************************/ /*********************************************************/ #ifdef __FFLASFFPACK_USE_KAAPI // KAAPI #define SPAWN(f,N) CONCATENATE_ARGS(ka::Spawn()(argc, argv); \ com.leave(); \ ka::System::terminate();} \ catch (const std::exception& E) { ka::logfile() << "Catch : " << E.what() << std::endl;} \ catch (...) { ka::logfile() << "Catch unknown exception: " << std::endl;} \ return 0;} #define SYNCH_GROUP(Args...) {{Args};} #endif // KAAPI macros /*********************************************************/ /********************* common macros *********************/ /*********************************************************/ #define COMMA , #define MODE(...) __VA_ARGS__ #define RETURNPARAM(f, P1, Args...) P1=f(Args) // Macro computes number of Arguments #define NUMARGS(...) \ PP_NARG_(__VA_ARGS__,PP_RSEQ_N()) #define PP_NARG_(...) \ PP_ARG_N(__VA_ARGS__) #define PP_ARG_N( \ _1, _2, _3, _4, _5, _6, _7, _8, _9,_10, \ _11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \ _21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \ _31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \ _41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \ _51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \ _61,_62,_63,N,...) 
N #define PP_RSEQ_N() \ 63,62,61,60, \ 59,58,57,56,55,54,53,52,51,50, \ 49,48,47,46,45,44,43,42,41,40, \ 39,38,37,36,35,34,33,32,31,30, \ 29,28,27,26,25,24,23,22,21,20, \ 19,18,17,16,15,14,13,12,11,10, \ 9,8,7,6,5,4,3,2,1,0 #define NOSPLIT() FFLAS::ParSeqHelper::Sequential() // overload of SPLITTER #define splitting_0() FFLAS::ParSeqHelper::Parallel() #define splitting_1(a) FFLAS::ParSeqHelper::Parallel(a) #define splitting_2(a,c) FFLAS::ParSeqHelper::Parallel(a) #define splitting_3(a,b,c) FFLAS::ParSeqHelper::Parallel(a) #define splitt(_1,_2,_3, NAME,...) NAME #define SPLITTER(...) splitt(__VA_ARGS__, splitting_3, splitting_2, splitting_1, splitting_0)(__VA_ARGS__) #include "fflas-ffpack/paladin/blockcuts.inl" #endif //__FFLASFFPACK_fflas_parallel_H fflas-ffpack-2.2.2/fflas-ffpack/paladin/pfgemm_variants.inl000066400000000000000000000517731274716147400237130ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_pfgemm.inl * Copyright (C) 2013 Jean Guillaume Dumas Clement Pernet Ziad Sultan * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ namespace FFLAS { template typename Field::Element* pfgemm(const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::ConstElement_ptr A, const size_t lda, const typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element * C, const size_t ldc, MMHelper > & H){ { H.parseq.set_numthreads( std::min(H.parseq.numthreads(), std::max((size_t)1,(size_t)(m*n/(__FFLASFFPACK_SEQPARTHRESHOLD*__FFLASFFPACK_SEQPARTHRESHOLD)))) ); MMHelper SeqH (H); size_t sa = (ta==FFLAS::FflasNoTrans)?lda:1; size_t sb = (tb==FFLAS::FflasNoTrans)?1:ldb; SYNCH_GROUP({FORBLOCK2D(iter,m,n,H.parseq, TASK( MODE( READ(A[iter.ibegin()*sa],B[iter.jbegin()*sb]) CONSTREFERENCE(F, SeqH) READWRITE(C[iter.ibegin()*ldc+iter.jbegin()])), fgemm( F, ta, tb, iter.iend()-iter.ibegin(), iter.jend()-iter.jbegin(), k, alpha, A+iter.ibegin()*sa, lda, B+iter.jbegin()*sb, ldb, beta, C+iter.ibegin()*ldc+iter.jbegin(), ldc, SeqH);); ); }); } return C; } template typename Field::Element* pfgemm(const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::ConstElement_ptr AA, const size_t lda, const typename Field::ConstElement_ptr BB, const size_t ldb, const typename Field::Element beta, typename Field::Element * C, const size_t ldc, MMHelper > & H){ typename Field::Element a = alpha; typename Field::Element b = beta; typename Field::ConstElement_ptr B = BB; typename Field::ConstElement_ptr A = AA; if (!m || !n) {return C;} if (!k || F.isZero (alpha)){ fscalin(F, m, n, beta, C, ldc); return C; } if (H.parseq.numthreads()<=1 || std::min(m*n,std::min(m*k,k*n))<=__FFLASFFPACK_SEQPARTHRESHOLD*__FFLASFFPACK_SEQPARTHRESHOLD){ MMHelper SeqH(H); return fgemm(F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, SeqH); } typedef MMHelper > MMH_t; 
MMH_t H1(H); MMH_t H2(H); if(__FFLASFFPACK_DIMKPENALTY*m > k && m >= n) { SYNCH_GROUP(size_t M2= m>>1; H1.parseq.set_numthreads(H1.parseq.numthreads() >> 1); H2.parseq.set_numthreads(H.parseq.numthreads() - H1.parseq.numthreads()); typename Field::ConstElement_ptr A1= A; typename Field::ConstElement_ptr A2= A+M2*((ta==FFLAS::FflasTrans)?1:lda); typename Field::Element_ptr C1= C; typename Field::Element_ptr C2= C+M2*ldc; // 2 multiply (1 split on dimension m) TASK(MODE(CONSTREFERENCE(F, H1) READ(A1,B) READWRITE(C1)), {pfgemm( F, ta, tb, M2, n, k, alpha, A1, lda, B, ldb, beta, C1, ldc, H1);} ); TASK(MODE(CONSTREFERENCE(F,H2) READ(A2,B) READWRITE(C2)), {pfgemm(F, ta, tb, m-M2, n, k, alpha, A2, lda, B, ldb, beta, C2, ldc, H2);} ); ); } else if (__FFLASFFPACK_DIMKPENALTY*n > k) { SYNCH_GROUP( size_t N2 = n>>1; H1.parseq.set_numthreads( H1.parseq.numthreads() >> 1); H2.parseq.set_numthreads(H.parseq.numthreads() - H1.parseq.numthreads()); typename Field::ConstElement_ptr B1= B; typename Field::ConstElement_ptr B2= B+N2*((tb==FFLAS::FflasTrans)?ldb:1); typename Field::Element_ptr C1= C; typename Field::Element_ptr C2= C+N2; TASK(MODE(CONSTREFERENCE(F,H1) READ(A,B1) READWRITE(C1)), pfgemm(F, ta, tb, m, N2, k, a, A, lda, B1, ldb, b, C1, ldc, H1)); TASK(MODE(CONSTREFERENCE(F,H2) READ(A,B2) READWRITE(C2)), pfgemm(F, ta, tb, m, n-N2, k, a, A, lda, B2, ldb, b,C2, ldc, H2)); ); } else { size_t K2 = k>>1; typename Field::ConstElement_ptr B1= B; typename Field::ConstElement_ptr B2= B+K2*((tb==FFLAS::FflasTrans)?1:ldb); typename Field::ConstElement_ptr A1= A; typename Field::ConstElement_ptr A2= A+K2*((ta==FFLAS::FflasTrans)?lda:1); typename Field::Element_ptr C2 = fflas_new (F, m, n,Alignment::CACHE_PAGESIZE); H1.parseq.set_numthreads(H1.parseq.numthreads() >> 1); H2.parseq.set_numthreads(H.parseq.numthreads()-H1.parseq.numthreads()); SYNCH_GROUP( TASK(MODE(CONSTREFERENCE(F,H1) READ(A1,B1) READWRITE(C)), pfgemm(F, ta, tb, m, n, K2, a, A1, lda, B1, ldb, b, C, ldc, H1)); 
TASK(MODE(CONSTREFERENCE(F,H2) READ(A2,B2) READWRITE(C2)), pfgemm(F, ta, tb, m, n, k-K2, a, A2, lda, B2, ldb, F.zero, C2, n, H2)); CHECK_DEPENDENCIES; TASK(MODE(CONSTREFERENCE(F) READ(C2) READWRITE(C)),faddin(F, n, m, C2, n, C, ldc)); ); fflas_delete(C2); } return C; } template typename Field::Element* pfgemm (const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::ConstElement_ptr AA, const size_t lda, const typename Field::ConstElement_ptr BB, const size_t ldb, const typename Field::Element beta, typename Field::Element * C, const size_t ldc, MMHelper > & H){ typename Field::Element a = alpha; typename Field::Element b = beta; typename Field::ConstElement_ptr B = BB; typename Field::ConstElement_ptr A = AA; if (!m || !n) {return C;} if (!k || F.isZero (alpha)){ fscalin(F, m, n, beta, C, ldc); return C; } if (H.parseq.numthreads()<=1 || m*n<=__FFLASFFPACK_SEQPARTHRESHOLD*__FFLASFFPACK_SEQPARTHRESHOLD){ MMHelper SeqH(H); return fgemm(F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, SeqH); } typedef MMHelper > MMH_t; MMH_t H1(H); MMH_t H2(H); H1.parseq.set_numthreads(H1.parseq.numthreads() >> 1); H2.parseq.set_numthreads(H.parseq.numthreads() - H1.parseq.numthreads()); if(m >= n) { size_t M2= m>>1; typename Field::ConstElement_ptr A1= A; typename Field::ConstElement_ptr A2= A+M2*((ta==FFLAS::FflasTrans)?1:lda); typename Field::Element_ptr C1= C; typename Field::Element_ptr C2= C+M2*ldc; SYNCH_GROUP( TASK(MODE(CONSTREFERENCE(F,H1, A1, B) READ(M2, A1[0],B[0]) READWRITE(C1[0])), pfgemm(F, ta, tb, M2, n, k, alpha, A1, lda, B, ldb, beta, C1, ldc, H1)); TASK(MODE(CONSTREFERENCE(F,H2, A2, B) READ(M2, A2[0],B[0]) READWRITE(C2[0])), pfgemm(F, ta, tb, m-M2, n, k, alpha, A2, lda, B, ldb, beta, C2, ldc, H2)); ); } else { size_t N2 = n>>1; typename Field::ConstElement_ptr B1= B; typename Field::ConstElement_ptr B2= B+N2*((tb==FFLAS::FflasTrans)?ldb:1); 
typename Field::Element_ptr C1= C; typename Field::Element_ptr C2= C+N2; SYNCH_GROUP( TASK(MODE(CONSTREFERENCE(F,H1, A, B1) READ(N2, A[0], B1[0]) READWRITE(C1[0])), pfgemm(F, ta, tb, m, N2, k, a, A, lda, B1, ldb, b, C1, ldc, H1)); TASK(MODE(CONSTREFERENCE(F,H2, A, B2) READ(N2, A[0], B2[0]) READWRITE(C2[0])), pfgemm(F, ta, tb, m, n-N2, k, a, A, lda, B2, ldb, b,C2, ldc, H2)); ); } return C; } template typename Field::Element* pfgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::ConstElement_ptr AA, const size_t lda, const typename Field::ConstElement_ptr BB, const size_t ldb, const typename Field::Element beta, typename Field::Element * C, const size_t ldc, MMHelper > & H){ typename Field::Element a = alpha; typename Field::Element b = beta; typename Field::ConstElement_ptr B = BB; typename Field::ConstElement_ptr A = AA; if (!m || !n) {return C;} if (!k || F.isZero (alpha)){ fscalin(F, m, n, beta, C, ldc); return C; } if(H.parseq.numthreads()<=1|| m*n<=__FFLASFFPACK_SEQPARTHRESHOLD*__FFLASFFPACK_SEQPARTHRESHOLD){ MMHelper SeqH(H); return fgemm(F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, SeqH); } else { size_t M2= m>>1; size_t N2= n>>1; typename Field::ConstElement_ptr A1= A; typename Field::ConstElement_ptr A2= A+M2*((ta==FFLAS::FflasTrans)?1:lda); typename Field::ConstElement_ptr B1= B; typename Field::ConstElement_ptr B2= B+N2*((tb==FFLAS::FflasTrans)?ldb:1); typename Field::Element_ptr C11= C; typename Field::Element_ptr C21= C+M2*ldc; typename Field::Element_ptr C12= C+N2; typename Field::Element_ptr C22= C+N2+M2*ldc; typedef MMHelper > MMH_t; MMH_t H1(H); MMH_t H2(H); MMH_t H3(H); MMH_t H4(H); size_t nt = H.parseq.numthreads(); size_t nt_rec = nt/4; size_t nt_mod = nt%4; H1.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H2.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 
0)?1:0))); H3.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H4.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); SYNCH_GROUP( TASK(MODE(CONSTREFERENCE(F,H1) READ(A1,B1) READWRITE(C11)), pfgemm(F, ta, tb, M2, N2, k, alpha, A1, lda, B1, ldb, beta, C11, ldc, H1)); TASK(MODE(CONSTREFERENCE(F,H2) READ(A1,B2) READWRITE(C12)), pfgemm(F, ta, tb, M2, n-N2, k, alpha, A1, lda, B2, ldb, beta, C12, ldc, H2)); TASK(MODE(CONSTREFERENCE(F,H3) READ(A2,B1) READWRITE(C21)), pfgemm(F, ta, tb, m-M2, N2, k, a, A2, lda, B1, ldb, b, C21, ldc, H3)); TASK(MODE(CONSTREFERENCE(F,H4) READ(A2,B2) READWRITE(C22)), pfgemm(F, ta, tb, m-M2, n-N2, k, a, A2, lda, B2, ldb, b,C22, ldc, H4)); ); } return C; } template typename Field::Element_ptr pfgemm(const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::ConstElement_ptr A, const size_t lda, const typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper > & H){ if (!m || !n) {return C;} if (!k || F.isZero (alpha)){ fscalin(F, m, n, beta, C, ldc); return C; } if(H.parseq.numthreads() <= 1|| std::min(m*n,std::min(m*k,k*n))<=__FFLASFFPACK_SEQPARTHRESHOLD*__FFLASFFPACK_SEQPARTHRESHOLD){ FFLAS::MMHelper WH (H); return fgemm(F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, WH); } else { typename Field::Element a = alpha; typename Field::Element b = 0; size_t M2= m>>1; size_t N2= n>>1; size_t K2= k>>1; typename Field::ConstElement_ptr A11= A; typename Field::ConstElement_ptr A12= A+K2*((ta==FFLAS::FflasTrans)?lda:1); typename Field::ConstElement_ptr A21= A+M2*((ta==FFLAS::FflasTrans)?1:lda); typename Field::ConstElement_ptr A22= A12+M2*((ta==FFLAS::FflasTrans)?1:lda); typename Field::ConstElement_ptr B11= B; typename Field::ConstElement_ptr B12= B+N2*((tb==FFLAS::FflasTrans)?ldb:1); typename 
Field::ConstElement_ptr B21= B+K2*((tb==FFLAS::FflasTrans)?1:ldb); typename Field::ConstElement_ptr B22= B12+K2*((tb==FFLAS::FflasTrans)?1:ldb); typename Field::Element_ptr C11= C; typename Field::Element_ptr C_11 = fflas_new (F, M2, N2,Alignment::CACHE_PAGESIZE); typename Field::Element_ptr C12= C+N2; typename Field::Element_ptr C_12 = fflas_new (F, M2, n-N2,Alignment::CACHE_PAGESIZE); typename Field::Element_ptr C21= C+M2*ldc; typename Field::Element_ptr C_21 = fflas_new (F, m-M2, N2,Alignment::CACHE_PAGESIZE); typename Field::Element_ptr C22= C+N2+M2*ldc; typename Field::Element_ptr C_22 = fflas_new (F, m-M2, n-N2,Alignment::CACHE_PAGESIZE); // 1/ 8 multiply in parallel //omp_set_task_affinity(omp_get_locality_domain_num_for( C11)); typedef MMHelper > MMH_t; MMH_t H1(H); MMH_t H2(H); MMH_t H3(H); MMH_t H4(H); MMH_t H5(H); MMH_t H6(H); MMH_t H7(H); MMH_t H8(H); size_t nt = H.parseq.numthreads(); size_t nt_rec = nt/8; size_t nt_mod = nt % 8 ; H1.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H2.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H3.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H4.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H5.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H6.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H7.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H8.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); SYNCH_GROUP( TASK(MODE(CONSTREFERENCE(F,H1) READ(A11,B11) READWRITE(C11)), pfgemm(F, ta, tb, M2, N2, K2, alpha, A11, lda, B11, ldb, beta, C11, ldc, H1)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C_11)); TASK(MODE(CONSTREFERENCE(F,H2) READ(A12,B21) WRITE(C_11)), pfgemm(F, ta, tb, M2, N2, k-K2, a, A12, lda, B21, ldb, b,C_11, N2, H2)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C12)); 
TASK(MODE(CONSTREFERENCE(F,H3) READ(A12,B22) READWRITE(C12)), pfgemm(F, ta, tb, M2, n-N2, k-K2, alpha, A12, lda, B22, ldb, beta, C12, ldc, H3)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C_12)); TASK(MODE(CONSTREFERENCE(F,H4) READ(A11,B12) WRITE(C_12)), pfgemm(F, ta, tb, M2, n-N2, K2, a, A11, lda, B12, ldb, b, C_12, n-N2, H4)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C21)); TASK(MODE(CONSTREFERENCE(F,H5) READ(A22,B21) READWRITE(C21)), pfgemm(F, ta, tb, m-M2, N2, k-K2, alpha, A22, lda, B21, ldb, beta, C21, ldc, H5)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C_21)); TASK(MODE(CONSTREFERENCE(F,H6) READ(A21,B11) WRITE(C_21)), pfgemm(F, ta, tb, m-M2, N2, K2, a, A21, lda, B11, ldb, b,C_21, N2, H6)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C22)); TASK(MODE(CONSTREFERENCE(F,H7) READ(A21,B12) READWRITE(C22)), pfgemm(F, ta, tb, m-M2, n-N2, K2, alpha, A21, lda, B12, ldb, beta, C22, ldc, H7)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C_22)); TASK(MODE(CONSTREFERENCE(F,H8) READ(A22,B22) WRITE(C_22)), pfgemm(F, ta, tb, m-M2, n-N2, k-K2, a, A22, lda, B22, ldb, b,C_22, n-N2, H8)); CHECK_DEPENDENCIES; // 2/ final add //omp_set_task_affinity(omp_get_locality_domain_num_for( C11)); TASK(MODE(CONSTREFERENCE(F) READ(C_11) READWRITE(C11)), faddin(F, M2, N2, C_11, N2, C11, ldc)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C12)); TASK(MODE(CONSTREFERENCE(F) READ(C_12) READWRITE(C12)),faddin(F, M2, n-N2, C_12, n-N2, C12, ldc)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C21)); TASK(MODE(CONSTREFERENCE(F) READ(C_21) READWRITE(C21)), faddin(F, m-M2, N2, C_21, N2, C21, ldc)); //omp_set_task_affinity(omp_get_locality_domain_num_for( C22)); TASK(MODE(CONSTREFERENCE(F) READ(C_22) READWRITE(C22)), faddin(F, m-M2, n-N2, C_22, n-N2, C22, ldc)); ); FFLAS::fflas_delete (C_11); FFLAS::fflas_delete (C_12); FFLAS::fflas_delete (C_21); FFLAS::fflas_delete (C_22); } return C; } template typename 
Field::Element* pfgemm( const Field& F, const FFLAS_TRANSPOSE ta, const FFLAS_TRANSPOSE tb, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::ConstElement_ptr A, const size_t lda, const typename Field::ConstElement_ptr B, const size_t ldb, const typename Field::Element beta, typename Field::Element_ptr C, const size_t ldc, MMHelper > & H){ if (!m || !n) {return C;} if (!k || F.isZero (alpha)){ fscalin(F, m, n, beta, C, ldc); return C; } if(H.parseq.numthreads() <= 1|| std::min(m*n,std::min(m*k,k*n))<=__FFLASFFPACK_SEQPARTHRESHOLD*__FFLASFFPACK_SEQPARTHRESHOLD){ // threshold FFLAS::MMHelper WH (H); return fgemm(F, ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, WH); }else{ size_t M2= m>>1; size_t N2= n>>1; size_t K2= k>>1; typename Field::ConstElement_ptr A11= A; typename Field::ConstElement_ptr A12= A+K2*((ta==FFLAS::FflasTrans)?lda:1); typename Field::ConstElement_ptr A21= A+M2*((ta==FFLAS::FflasTrans)?1:lda); typename Field::ConstElement_ptr A22= A12+M2*((ta==FFLAS::FflasTrans)?1:lda); typename Field::ConstElement_ptr B11= B; typename Field::ConstElement_ptr B12= B+N2*((tb==FFLAS::FflasTrans)?ldb:1); typename Field::ConstElement_ptr B21= B+K2*((tb==FFLAS::FflasTrans)?1:ldb); typename Field::ConstElement_ptr B22= B12+K2*((tb==FFLAS::FflasTrans)?1:ldb); typename Field::Element_ptr C11= C; typename Field::Element_ptr C12= C+N2; typename Field::Element_ptr C21= C+M2*ldc; typename Field::Element_ptr C22= C+N2+M2*ldc; typedef MMHelper > MMH_t; MMH_t H1(H); MMH_t H2(H); MMH_t H3(H); MMH_t H4(H); size_t nt = H.parseq.numthreads(); size_t nt_rec = nt/4; size_t nt_mod = nt%4; H1.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H2.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H3.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); H4.parseq.set_numthreads(std::max(size_t(1),nt_rec + ((nt_mod-- > 0)?1:0))); SYNCH_GROUP( // 1/ 4 multiply 
TASK(MODE(CONSTREFERENCE(F,H1) READ(A11,B11) READWRITE(C11)), pfgemm(F, ta, tb, M2, N2, K2, alpha, A11, lda, B11, ldb, beta, C11, ldc, H1)); TASK(MODE(CONSTREFERENCE(F,H2) READ(A12,B22) READWRITE(C12)), pfgemm(F, ta, tb, M2, n-N2, k-K2, alpha, A12, lda, B22, ldb, beta, C12, ldc, H2)); TASK(MODE(CONSTREFERENCE(F,H3) READ(A22,B21) READWRITE(C21)), pfgemm(F, ta, tb, m-M2, N2, k-K2, alpha, A22, lda, B21, ldb, beta, C21, ldc, H3)); TASK(MODE(CONSTREFERENCE(F,H4) READ(A21,B12) READWRITE(C22)), pfgemm(F, ta, tb, m-M2, n-N2, K2, alpha, A21, lda, B12, ldb, beta, C22, ldc, H4)); CHECK_DEPENDENCIES; // 2/ 4 add+multiply TASK(MODE(CONSTREFERENCE(F,H1) READ(A12,B21) READWRITE(C11)), pfgemm(F, ta, tb, M2, N2, k-K2, alpha, A12, lda, B21, ldb, F.one, C11, ldc, H1)); TASK(MODE(CONSTREFERENCE(F,H2) READ(A11,B12) READWRITE(C12)), pfgemm(F, ta, tb, M2, n-N2, K2, alpha, A11, lda, B12, ldb, F.one, C12, ldc, H2)); TASK(MODE(CONSTREFERENCE(F,H3) READ(A21,B11) READWRITE(C21)), pfgemm(F, ta, tb, m-M2, N2, K2, alpha, A21, lda, B11, ldb, F.one, C21, ldc, H3)); TASK(MODE(CONSTREFERENCE(F,H4) READ(A22,B22) READWRITE(C22)), pfgemm(F, ta, tb, m-M2, n-N2, k-K2, alpha, A22, lda, B22, ldb, F.one, C22, ldc, H4)); ); } return C; } } // FFLAS fflas-ffpack-2.2.2/fflas-ffpack/utils/000077500000000000000000000000001274716147400175405ustar00rootroot00000000000000fflas-ffpack-2.2.2/fflas-ffpack/utils/Makefile.am000066400000000000000000000024361274716147400216010ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. 
# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== pkgincludesubdir=$(pkgincludedir)/utils pkgincludesub_HEADERS= \ align-allocator.h \ args-parser.h \ debug.h \ fflas_memory.h \ fflas_randommatrix.h \ flimits.h \ Matio.h \ bit_manipulation.h \ print-utils.h \ timer.h \ cast.h \ fflas_intrinsic.h ### is this really extra dist ? ### this is wrong in a header only lib # EXTRA_DIST=timer.h fflas-ffpack-2.2.2/fflas-ffpack/utils/Matio.h000066400000000000000000000075061274716147400207720ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) LinBox,FFLAS-FFPACK * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #ifndef __FFLASFFPACK_matio_H #define __FFLASFFPACK_matio_H #include #include #include //#include "fflas-ffpack/fflas/fflas.h" #include "fflas_memory.h" // Reading and writing matrices over field // Reading a matrice from a (eventually zipped) file template typename Field::Element_ptr read_field(const Field& F, const char * mat_file,size_t * tni,size_t* tnj) { char *UT = NULL; const char* File_Name; int is_gzipped = 0; size_t s = strlen(mat_file); typename Field::Element_ptr X = NULL; if ((mat_file[--s] == 'z') && (mat_file[--s] == 'g') && (mat_file[--s] == '.')) { is_gzipped = 1; char tmp_nam[] = "/tmp/bbXXXXXX_"; if (mkstemp(tmp_nam)) printf("Error opening file]\n"); File_Name = tmp_nam; UT = FFLAS::fflas_new(s+34+strlen(File_Name)); sprintf(UT,"gunzip -c %s > %s", mat_file, File_Name); if (system(UT)) printf("Error uncompressing file\n"); sprintf(UT,"\\rm %s", File_Name); } else { File_Name = mat_file; } FILE* FileDes = fopen(File_Name, "r"); if (FileDes != NULL) { char tmp [200];// unsigned long tni, tnj; if (fscanf(FileDes,"%lu %lu %199s\n",tni, tnj, tmp)<0) printf("Error Reading first line of file \n"); int n=*tni; int p=*tnj; X = FFLAS::fflas_new(n*p); for (int i=0;i std::ostream& write_field(const Field& F,std::ostream& c, typename Field::ConstElement_ptr E, int n, int m, int id, bool mapleFormat = false, bool column_major=false) { // typename Field::Element tmp; // double tmp; // Givaro::Integer tmp; typename Field::Element tmp; F.init(tmp); if (mapleFormat) c << "Matrix(" << n <<',' << m << ",\n[" ; for (int i = 0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by * * STL align allocator inspired by MAlloc from Stephan T. 
Lavavej, Visual C++ Libraries Developer * (http://blogs.msdn.com/b/vcblog/archive/2008/08/28/the-mallocator.aspx) * Update to c++11 * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_align_allocator_H #define __FFLASFFPACK_align_allocator_H #include "fflas-ffpack/config.h" #ifdef __FFLASFFPACK_HAVE_CXX11 #include #include #include #include #include #include "fflas-ffpack/utils/fflas_intrinsic.h" //#include // Alignment Type enum class Alignment : size_t { NONE = 0, Normal = sizeof(void*), SSE = 16, AVX = 32, XEON_PHI = 64, CACHE_LINE = 64, CACHE_PAGESIZE = 4096, DEFAULT = #ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS 32 #else 16 #endif }; /* * Allocate T[size] with address aligned to alignement * ex : int* tab = malloc_align(100, Alignment::AVX) */ template T* malloc_align(size_t size, Alignment alignment = Alignment::DEFAULT) noexcept { void* p = nullptr; int err = 0; err = posix_memalign(&p, (size_t) alignment, size*sizeof(T)); if(err) std::cout << "posix_memalign error" << std::endl; //return new(p) T[size]; return static_cast(p); } namespace detail { inline void* allocate(size_t align, size_t size) { assert(align >= sizeof(void*)); if (size == 0) { return nullptr; } 
void* ptr = nullptr; int rc = posix_memalign(&ptr, align, size); if (rc != 0) { return nullptr; } return ptr; } inline void deallocate(void* ptr) noexcept { return free(ptr); } } /* STL Aligned Allocator * ex : std::vector>; * * template using vector = std::vector>; */ template class AlignedAllocator; template class AlignedAllocator { public: using value_type = T; using pointer = T*; using const_pointer = const T*; using reference = T&; using const_reference = const T&; using size_type = std::size_t; using difference_type = ptrdiff_t; using propagate_on_container_move_assignment = std::true_type; template struct rebind { using other = AlignedAllocator; }; public: AlignedAllocator() noexcept {} template AlignedAllocator(const AlignedAllocator&) noexcept {} size_type max_size() const noexcept { return (size_type(~0) - size_type(Align)) / sizeof(T); } pointer address(reference x) const noexcept { return std::addressof(x); } const_pointer address(const_reference x) const noexcept { return std::addressof(x); } pointer allocate(size_type n, typename AlignedAllocator::const_pointer = 0) { const size_type alignment = static_cast(Align); void* ptr = detail::allocate(alignment, n * sizeof(T)); if (ptr == nullptr) { throw std::bad_alloc(); } return reinterpret_cast(ptr); } void deallocate(pointer p, size_type) noexcept { return detail::deallocate(p); } template void construct(U* p, Args&&... 
args) { ::new (reinterpret_cast(p)) U(std::forward(args)...); } void destroy(pointer p) { p->~T(); } }; /* * Specialization for void* */ template class AlignedAllocator { public: using pointer = void*; using const_pointer = const void*; using value_type = void; template struct rebind { using other = AlignedAllocator; }; }; /* * Specialization for const T */ template class AlignedAllocator { public: using value_type = T; using pointer = const T*; using const_pointer = const T*; using reference = const T&; using const_reference = const T&; using size_type = std::size_t; using difference_type = ptrdiff_t; using propagate_on_container_move_assignment = std::true_type; template struct rebind { using other = AlignedAllocator; }; public: AlignedAllocator() noexcept {} template AlignedAllocator(const AlignedAllocator&) noexcept {} size_type max_size() const noexcept { return (size_type(~0) - size_type(Align)) / sizeof(T); } const_pointer address(const_reference x) const noexcept { return std::addressof(x); } pointer allocate(size_type n, typename AlignedAllocator::const_pointer = 0) { const size_type alignment = static_cast(Align); void* ptr = detail::allocate(alignment, n * sizeof(T)); if (ptr == nullptr) { throw std::bad_alloc(); } return reinterpret_cast(ptr); } void deallocate(pointer p, size_type) noexcept { return detail::deallocate(p); } template void construct(U* p, Args&&... args) { ::new (reinterpret_cast(p)) U(std::forward(args)...); } void destroy(pointer p) { p->~T(); } }; template inline bool operator==(const AlignedAllocator&, const AlignedAllocator&) noexcept { return TAlign == UAlign; } template inline bool operator!=(const AlignedAllocator&, const AlignedAllocator&) noexcept { return TAlign != UAlign; } #else // C++11 #error "You need a c++11 compiler." 
#endif // C++11 #endif /* _FFLASFFPACK_align_allocator_h */ fflas-ffpack-2.2.2/fflas-ffpack/utils/args-parser.h000066400000000000000000000261361274716147400221470ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* utils/args-parser.C * Copyright (C) 2001, 2002 Bradford Hovinen * * Written by Bradford Hovinen * Modified by Dmitriy Morozov . May 27, 2002. * Modified 2011 Brice Boyer (more types,...) * * Added parametrization to the VectorCategory tags to make them fit the * Rootbeer meeting design of VectorCategories being parametrized by * VectorTraits. * * ------------------------------------ * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #ifndef __FFLASFFPACK_args_parser_H #define __FFLASFFPACK_args_parser_H #include #include #include #include #include #include #include #include #include #include "fflas-ffpack/utils/print-utils.h" enum ArgumentType { TYPE_NONE, TYPE_INT, TYPE_LONGLONG, TYPE_INTEGER, TYPE_DOUBLE, TYPE_INTLIST, TYPE_STR }; #define TYPE_BOOL TYPE_NONE #define END_OF_ARGUMENTS \ { '\0', "\0", "\0", TYPE_NONE, NULL } #ifdef _GIVARO_CONFIG_H #define type_integer Givaro::Integer #else #define type_integer long int #endif struct Argument { char c; const char *example ; const char *helpString ; ArgumentType type ; void *data ; }; // example may be passed as null and will be generated intelligently // eg "-b {YN+-}" for bools, "-v v" for all else namespace FFLAS { void parseArguments (int argc, char **argv, Argument *args, bool printDefaults = true); } void printHelpMessage (const char *program, Argument *args, bool printDefaults = false) { int i, l; // Skip past libtool prefix in program name if (!strncmp (program, "lt-", strlen ("lt-"))) program += strlen ("lt-"); std::cout << "Usage: " << program << " [options] []" << std::endl; std::cout << std::endl; std::cout << "Where [options] are the following:" << std::endl; bool messageboolean(false),messageprimality(false); for (i = 0; args[i].c != '\0'; ++i) { if (args[i].example != 0) { std::cout << " " << args[i].example; l = 10 - (int)strlen (args[i].example); do std::cout << ' '; while (--l > 0); } else if (args[i].type == TYPE_NONE) { std::cout << " -" << args[i].c << " {YN+-} "; messageboolean = true; } else std::cout << " -" << args[i].c << ' ' << args[i].c << " "; std::cout << args[i].helpString; if (strncmp(args[i].helpString,"Operate over the \"field\"",24) == 0) messageprimality = true; if 
(printDefaults) { l = 54 - (int)strlen (args[i].helpString); do std::cout << ' '; while (--l > 0); std::cout << " (default "; switch (args[i].type) { case TYPE_NONE: std::cout << ((*(bool *)args[i].data)?"ON":"OFF"); break; case TYPE_INT: std::cout << *(int *) args[i].data; break; case TYPE_LONGLONG: std::cout << *(long long *) args[i].data; break; case TYPE_INTEGER: std::cout << *(type_integer *) args[i].data; break; case TYPE_DOUBLE: std::cout << *(double *) args[i].data; break; case TYPE_INTLIST: std::cout << *(std::list *) args[i].data ; break; case TYPE_STR: std::cout << "\"" << *(std::string *) args[i].data << "\"" ; break; } std::cout << ")"; } std::cout << std::endl; } std::cout << " -h or -? Display this message" << std::endl; if (messageboolean) std::cout << "For boolean switches, the argument may be omitted, meaning the switch should be ON" << std::endl; std::cout << std::endl; std::cout << "If is '-' the report is written to std output. If is" << std::endl; std::cout << "not given, then no detailed reporting is done. This is suitable if you wish only" << std::endl; std::cout << "to determine whether the tests succeeded." << std::endl; std::cout << std::endl; if (messageprimality) std::cout << "[1] N.B. This program does not verify the primality of Q, and does not use a" << std::endl << " field extension in the event that Q=p^n, n > 1" << std::endl; std::cout << std::endl; } /* Find an argument in the argument list for a character */ Argument *findArgument (Argument *args, char c) { int i; for (i = 0; args[i].c != '\0' && args[i].c != c; ++i) ; if (args[i].c != '\0') return &(args[i]); else return (Argument *) 0; } /* Parse command line arguments */ /*! @internal *  @brief transforms a string list of ints to a list of int * string "12,13,15" is turned into list of ints {12,13,15} * @param outlist list once converted * @param instring list to be converted * @return status message. 
*/ int getListArgs(std::list & outlist, std::string & instring) { int start = 0 ; int count = 0 ; size_t i = 0 ; for( ; i < instring.size() ; ++i) { if (isdigit(instring[i])) { ++count; continue ; } if (ispunct(instring[i])) { if (!count) { std::cout << std::endl << "ill formed list " << instring << std::endl; for (size_t sp = 0 ; sp < 16+i ; ++sp) std::cout << '-' ; std::cout << '^' << std::endl; return(1); } int j = atoi(instring.substr((size_t)start,(size_t)count).c_str()); outlist.push_front(j); count = 0 ; start = int(i+1) ; } else { std::cout << std::endl << "ill formed list " << instring << std::endl; for (size_t sp = 0 ; sp < 16+i ; ++sp) std::cout << '-' ; std::cout << '^' << std::endl; return(1); } } std::cout << std::endl; if (!count) { std::cout << std::endl << "ill formed list " << instring << std::endl; for (size_t sp = 0 ; sp < 15+i ; ++sp) std::cout << '-' ; std::cout << '^' << std::endl; return(1); } int j = atoi(instring.substr((size_t)start,(size_t)count).c_str()); outlist.push_front(j); return 0 ; } namespace FFLAS { void parseArguments (int argc, char **argv, Argument *args, bool printDefaults) { int i; Argument *current; for (i = 1; i < argc; ++i) { // std::cout << "i=" << i << std::endl; if (argv[i][0] == '-') { if (argv[i][1] == 0) { std::cout << "Writing report data to cout (intermingled with brief report)" << std::endl << std::endl; std::cout.flush (); } else if (argv[i][1] == 'h' || argv[i][1] == '?' 
|| argv[i][1] == '-') { printHelpMessage (argv[0], args, printDefaults); exit (1); } else if ((current = findArgument (args, argv[i][1])) != (Argument *) 0) { switch (current->type) { case TYPE_NONE: { if (argc == i+1 || (argv[i+1][0] == '-' && argv[i+1][1] != '\0')) { // if at last argument, or next argument is a switch, set to true *((bool *) current->data) = true; break; } *(bool *) current->data = (argv[i+1][0] == '+' || argv[i+1][0] == 'Y' || argv[i+1][0] == 'y' || argv[i+1][0] == 'T' || argv[i+1][0] == 't') ; ++i; } break; case TYPE_INT: { *(int *) current->data = atoi (argv[i+1]); ++i; } break; case TYPE_LONGLONG: { *(long long *) current->data = atoi (argv[i+1]); ++i; } break; case TYPE_INTEGER: { #ifdef _GIVARO_CONFIG_H type_integer tmp(argv[i+1]); #else type_integer tmp = atol(argv[i+1]); #endif *(type_integer *) current->data = tmp; } ++i; break; case TYPE_DOUBLE: { *(double *) current->data = atof (argv[i+1]); ++i; } break; case TYPE_INTLIST: { std::string lst = argv[i+1] ; std::list LST ; getListArgs(LST,lst); *(std::list *) current->data = LST ; ++i; } break; case TYPE_STR: { *(std::string *) current->data = argv[i+1] ; ++i; } break; } } else { std::cerr << "ERROR: Bad argument " << argv[i] << std::endl; break; } } else { std::cout << "Writing report data to " << argv[i] << std::endl << std::endl; std::cout.flush (); } } } /** writes the values of all arguments, preceded by the programName */ std::ostream& writeCommandString (std::ostream& os, Argument *args, char* programName = nullptr) { if (programName != nullptr) os << programName; for (int i = 0; args[i].c != '\0'; ++i) { os << " -" << args[i].c; switch (args[i].type) { case TYPE_NONE: if ((*(bool *)args[i].data)) os << " Y"; else os << " N"; break; case TYPE_INT: os << ' ' << *(int *) args[i].data; break; case TYPE_LONGLONG: os << ' ' << *(long long *) args[i].data; break; case TYPE_INTEGER: os << ' ' << *(Givaro::Integer *) args[i].data; break; case TYPE_DOUBLE: os << ' ' << *(double *) 
args[i].data; break; case TYPE_INTLIST: os << ' ' << *(std::list *) args[i].data; break; case TYPE_STR: os << " \"" << *(std::string *) args[i].data << "\""; break; } } return os; } } #undef type_integer #endif // __FFLASFFPACK_args_parser_H fflas-ffpack-2.2.2/fflas-ffpack/utils/bit_manipulation.h000066400000000000000000000110561274716147400232520ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK group * * Written by Brice Boyer (briceboyer) * * Part of this code is taken from http://libdivide.com/ * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_utils_bit_manipulation_H #define __FFLASFFPACK_utils_bit_manipulation_H #ifndef __has_builtin #define __has_builtin(x) 0 // Compatibility with non-clang compilers. #endif #include #include "fflas-ffpack/fflas-ffpack-config.h" // count leading zeros inline int32_t clz(uint64_t val) { #if __GNUC__ || __has_builtin(__builtin_clzll) return __builtin_clzll(val); #else if (! val) return 64 ; int32_t result = 0; while (! 
(val & (1_ui64 << 63))) { val <<= 1; result++; } return result; #endif } inline int32_t clz(uint32_t val) { #if __GNUC__ || __has_builtin(__builtin_clzll) return __builtin_clz(val); #else if (! val) return 32 ; int32_t result = 0; while (! (val & (1 << 31))) { val <<= 1; result++; } return result; #endif } // count trailing zeros inline int32_t ctz(uint32_t val) { #if __GNUC__ || __has_builtin(__builtin_ctz) return __builtin_ctz(val); #else if (!val) return 32; int32_t result = 0; val = (val ^ (val - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest while (val) { val >>= 1; result++; } return result; #endif } // count trailing zeros inline int32_t ctz(uint64_t val) { #if __GNUC__ || __has_builtin(__builtin_ctzll) return __builtin_ctzll(val); #else if (!val) return 64; uint32_t lo = val & 0xFFFFFFFF; if (lo != 0) return ctz(lo); return 32 + ctz(val >> 32); #endif } #ifdef __FFLASFFPACK_HAVE_INT128 // division 128bits by 64 bits // int128_t(u1,u0) = u1*2^64+u0, div v, rem r // return quo static uint64_t divide_128(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { // u0 -> rax // u1 -> rdx // divq uint64_t result; __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1) ); return result; } #endif static uint64_t getpoweroftwoden_128(uint32_t d, uint64_t q, uint64_t *r) { #ifdef __FFLASFFPACK_HAVE_INT128 return divide_128(1_ui64 << (d - 1), 0, q, r); #else lldiv_t ta; ta = lldiv(1ULL<<63,q); lldiv_t br; br = lldiv(ta.rem<> 32); } static inline int64_t mulhi_64(int64_t x, int64_t y) { #ifdef __FFLASFFPACK_HAVE_INT128 int128_t xl = x, yl = y; int128_t rl = xl * yl; return (int64_t)(rl >> 64); #else const uint32_t mask = 0xFFFFFFFF; const uint32_t x0 = (uint32_t)(x & mask), y0 = (uint32_t)(y & mask); const int32_t x1 = (int32_t)(x >> 32), y1 = (int32_t)(y >> 32); const uint32_t x0y0_hi = mullhi_u32(x0, y0); const int64_t t = x1*(int64_t)y0 + x0y0_hi; const int64_t w1 = x0*(int64_t)y1 + (t & mask); return x1*(int64_t)y1 + (t >> 32) + (w1 >> 32); 
#endif } static inline int64_t mulhi_fast_64(int64_t x, int64_t y) { #ifdef __FFLASFFPACK_HAVE_INT128 int128_t xl = x, yl = y; int128_t rl = xl * yl; return (int64_t)(rl >> 64); #else const uint32_t mask = 0xFFFFFFFF; const uint32_t x0 = (uint32_t)(x & mask), y0 = (uint32_t)(y & mask); const int32_t x1 = (int32_t)(x >> 32), y1 = (int32_t)(y >> 32); // const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); const int64_t t = x1*(int64_t)y0 ; // + x0y0_hi; const int64_t w1 = x0*(int64_t)y1 ; // + (t & mask); return x1*(int64_t)y1 + (t >> 32) + (w1 >> 32); #endif } #endif // __FFLASFFPACK_utils_bit_manipulation_H fflas-ffpack-2.2.2/fflas-ffpack/utils/cast.h000066400000000000000000000022721274716147400206460ustar00rootroot00000000000000/* * This file is part of FFLAS-FFPACK * Copyright (C) 2011 Brice Boyer (briceboyer) * * ------------------------------------ * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_const_H #define __FFLASFFPACK_const_H namespace FFPACK { template T fflas_const_cast (CT x) { return const_cast(x); } } #endif fflas-ffpack-2.2.2/fflas-ffpack/utils/debug.h000066400000000000000000000123721274716147400210040ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* utils/debug.h * * Copyright (C) 2011 Fflas-ffpack * Modified by BB, from LinBox * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ /*! @file utils/debug.h * @ingroup util * Various utilities for debugging. * @todo we should put vector printing elsewhere. 
*/ #ifndef __FFLASFFPACK_util_debug_H #define __FFLASFFPACK_util_debug_H #include #include #include #include #include "fflas-ffpack/fflas-ffpack-config.h" #ifdef __FFLASFFPACK_HAVE_STDINT_H #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif #include #include // If somebody nasty previously included without __STDC_LIMIT_MACROS :) #ifndef INT64_MAX #define INT64_MAX std::numeric_limits::max() #endif #ifndef UINT64_MAX #define UINT64_MAX std::numeric_limits::max() #endif #ifndef INT32_MAX #define INT32_MAX std::numeric_limits::max() #endif #ifndef UINT32_MAX #define UINT32_MAX std::numeric_limits::max() #endif #ifndef INT16_MAX #define INT16_MAX std::numeric_limits::max() #endif #ifndef UINT16_MAX #define UINT16_MAX std::numeric_limits::max() #endif #ifndef INT8_MAX #define INT8_MAX std::numeric_limits::max() #endif #ifndef UINT8_MAX #define UINT8_MAX std::numeric_limits::max() #endif #else #error "you need intXX_t types" #endif #ifndef NDEBUG #include #define FFLASFFPACK_check(check) \ if (!(check)) {\ FFPACK::failure()(__func__, __FILE__, __LINE__, #check); \ throw std::runtime_error(#check); \ } #define FFLASFFPACK_abort(msg) \ {\ FFPACK::failure()(__func__, __FILE__, __LINE__, msg); \ throw std::runtime_error(msg); \ } #else #define FFLASFFPACK_check(check) ((void) 0) #define FFLASFFPACK_abort(mst) ((void) 0) #endif namespace FFPACK { /*! A precondtion failed. * @ingroup util * The \c throw mechanism is usually used here as in \code if (!check) failure()(__func__,__LINE__,"this check just failed"); \endcode * The parameters of the constructor help debugging. */ class Failure { protected: std::ostream *_errorStream; public: Failure() {} /*! @internal * A precondtion failed. * @param function usually \c __func__, the function that threw the error * @param line usually \c __LINE__, the line where it happened * @param check a string telling what failed. 
*/ void operator() (const char *function, int line, const char *check) { if (_errorStream == (std::ostream *) 0) _errorStream = &std::cerr; (*_errorStream) << std::endl << std::endl; (*_errorStream) << "ERROR (" << function << ":" << line << "): "; (*_errorStream) << "Precondition not met:" << check << std::endl; } /*! @internal * A precondtion failed. * The parameter help debugging. This is not much different from the previous * except we can digg faster in the file where the exception was triggered. * @param function usually \c __func__, the function that threw the error * @param file usually \c __FILE__, the file where this function is * @param line usually \c __LINE__, the line where it happened * @param check a string telling what failed. */ void operator() (const char* function, const char *file, int line, const char *check) { if (_errorStream == (std::ostream *) 0) _errorStream = &std::cerr; (*_errorStream) << std::endl << std::endl; (*_errorStream) << "ERROR (at " << function << " in " << file << ':' << line << "): " << std::endl; (*_errorStream) << "Precondition not met:" << check << std::endl; } void setErrorStream (std::ostream &stream); /*! @internal overload the virtual print of LinboxError. 
* @param o output stream */ std::ostream &print (std::ostream &o) const { if (std::ostringstream * str = dynamic_cast(_errorStream)) return o << str->str() ; else throw "FFLAS-FFPACK ERROR: Failure exception is not initialized correctly"; } }; inline Failure& failure() { static Failure failure_internal; return failure_internal; } template inline bool isOdd (const T & a) { return (a%2); } inline bool isOdd(const float &a) { return (bool)(int)fmodf(a,2.f); } inline bool isOdd(const double &a) { return (bool)(int)fmod(a,2.); } } // FFPACK #endif // __FFLASFFPACK_util_debug_H fflas-ffpack-2.2.2/fflas-ffpack/utils/fflas_intrinsic.h000066400000000000000000000043161274716147400230720ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2016 the FFLAS-FFPACK group * * Written by * * Includes the proper intrinsic definitions, according to the architecture and system. * Code proposed by Marat Dukhan http://stackoverflow.com/questions/11228855/header-files-for-simd-intrinsics * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #if defined(_MSC_VER) /* Microsoft C/C++-compatible compiler */ #include #elif (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) && (defined(__x86_64__) || defined(__i386__)) /* GCC-compatible compiler, targeting x86/x86-64 */ #include #elif (defined(__GNUC__) || defined(__clang__)) && defined(__ARM_NEON__) /* GCC-compatible compiler, targeting ARM with NEON */ #include #elif (defined(__GNUC__) || defined(__clang__)) && defined(__IWMMXT__) /* GCC-compatible compiler, targeting ARM with WMMX */ #include #elif (defined(__GNUC__) || defined(__xlC__) || defined(__clang__)) && (defined(__VEC__) || defined(__ALTIVEC__)) /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ #include #elif (defined(__GNUC__) || defined(__clang__)) && defined(__SPE__) /* GCC-compatible compiler, targeting PowerPC with SPE */ #include #endif fflas-ffpack-2.2.2/fflas-ffpack/utils/fflas_memory.h000066400000000000000000000367631274716147400224130ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* fflas/fflas_memory.h * Copyright (C) 2014 fflas-ffpack group * * Written by Clement Pernet * * The cache size detection has been copied from the Eigen library, * a lightweight C++ template library for linear algebra, licensed under * the Mozilla * Public License v. 2.0. If a copy of the MPL was not distributed * with this file, You can obtain one at http://mozilla.org/MPL/2.0/. * Copyright (C) 2008-2010 Gael Guennebaud * Copyright (C) 2008-2009 Benoit Jacob * Copyright (C) 2009 Kenneth Riddile * Copyright (C) 2010 Hauke Heibel * Copyright (C) 2010 Thomas Capricelli * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #ifndef __FFLASFFPACK_memory_H #define __FFLASFFPACK_memory_H #include "fflas-ffpack/utils/align-allocator.h" #include namespace FFLAS{ template inline bool alignable() { return true ; } // BB : segfault in Givaro::Integer::logcpy otherwise template<> inline bool alignable() { return false; } template inline typename Field::Element_ptr fflas_new (const Field& F, const size_t m, const size_t n, const Alignment align = Alignment::DEFAULT) { if (alignable() ) { return malloc_align(m*n, align); } else { return new typename Field::Element[m*n]; } } template inline Element* fflas_new (const size_t m, const Alignment align = Alignment::DEFAULT) { if (alignable() ) { return malloc_align(m, align); } else { return new Element[m]; } } template inline void fflas_delete(Element_ptr A) { if (alignable() ) free(A); else delete[] A; } template inline void fflas_delete(Ptr p, Args ... 
args){ fflas_delete(p); fflas_delete(std::forward(args)...); } #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS inline void prefetch(const int64_t* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } #else inline void prefetch(const int64_t*) {} #endif #define __CPUID(abcd,func,id) \ __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id) ); inline void getCacheSize(int& l1, int& l2, int& l3) { int abcd[4]; l1 = l2 = l3 = 0; int cache_id = 0; int cache_type = 0; do { abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; __CPUID(abcd,0x4,cache_id); cache_type = (abcd[0] & 0x0F) >> 0; if(cache_type==1||cache_type==3) // data or unified cache { int cache_level = (abcd[0] & 0xE0) >> 5; // A[7:5] int ways = (abcd[1] & 0xFFC00000) >> 22; // B[31:22] int partitions = (abcd[1] & 0x003FF000) >> 12; // B[21:12] int line_size = (abcd[1] & 0x00000FFF) >> 0; // B[11:0] int sets = (abcd[2]); // C[31:0] int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1); switch(cache_level) { case 1: l1 = cache_size; break; case 2: l2 = cache_size; break; case 3: l3 = cache_size; break; default: break; } } cache_id++; } while(cache_type>0 && cache_id<16); } inline void getTLBSize(int& tlb) { int abcd[4]={}; int sTLB=0; int lTLB; __CPUID(abcd,0x2,0); unsigned char * bytes = reinterpret_cast(abcd)+2; for(int i=0; i<14; ++i) switch(bytes[i]){ case 0x03: sTLB=64; break; case 0x04: lTLB=8; break; case 0x05: lTLB=32; break; case 0x56: lTLB=16; break; case 0x57: sTLB=16; break; case 0x59: sTLB=16; break; case 0x5A: lTLB=32; break; case 0x5B: sTLB=lTLB=64; break; case 0x5C: sTLB=lTLB=128; break; case 0x5D: sTLB=lTLB=256; break; case 0xB3: sTLB=128; break; case 0xB4: sTLB=256; break; case 0xBA: sTLB=64; break; case 0xC0: sTLB=lTLB=8; break; case 0xCA: sTLB=512; break; default: break; } //cout<<"small TLB: "< 1500) && ( defined(_M_IX86) || defined(_M_X64) ) # define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id) # endif # 
endif #endif #ifdef EIGEN_CPUID inline bool cpuid_is_vendor(int abcd[4], const char* vendor) { return abcd[1]==(reinterpret_cast(vendor))[0] && abcd[3]==(reinterpret_cast(vendor))[1] && abcd[2]==(reinterpret_cast(vendor))[2]; } inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) { int abcd[4]; l1 = l2 = l3 = 0; int cache_id = 0; int cache_type = 0; do { abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; EIGEN_CPUID(abcd,0x4,cache_id); cache_type = (abcd[0] & 0x0F) >> 0; if(cache_type==1||cache_type==3) // data or unified cache { int cache_level = (abcd[0] & 0xE0) >> 5; // A[7:5] int ways = (abcd[1] & 0xFFC00000) >> 22; // B[31:22] int partitions = (abcd[1] & 0x003FF000) >> 12; // B[21:12] int line_size = (abcd[1] & 0x00000FFF) >> 0; // B[11:0] int sets = (abcd[2]); // C[31:0] int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1); switch(cache_level) { case 1: l1 = cache_size; break; case 2: l2 = cache_size; break; case 3: l3 = cache_size; break; default: break; } } cache_id++; } while(cache_type>0 && cache_id<16); } inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3) { int abcd[4]; abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; l1 = l2 = l3 = 0; EIGEN_CPUID(abcd,0x00000002,0); unsigned char * bytes = reinterpret_cast(abcd)+2; bool check_for_p2_core2 = false; for(int i=0; i<14; ++i) { switch(bytes[i]) { case 0x0A: l1 = 8; break; // 0Ah data L1 cache, 8 KB, 2 ways, 32 byte lines case 0x0C: l1 = 16; break; // 0Ch data L1 cache, 16 KB, 4 ways, 32 byte lines case 0x0E: l1 = 24; break; // 0Eh data L1 cache, 24 KB, 6 ways, 64 byte lines case 0x10: l1 = 16; break; // 10h data L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64) case 0x15: l1 = 16; break; // 15h code L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64) case 0x2C: l1 = 32; break; // 2Ch data L1 cache, 32 KB, 8 ways, 64 byte lines case 0x30: l1 = 32; break; // 30h code L1 cache, 32 KB, 8 ways, 64 byte lines case 0x60: l1 = 16; break; // 60h data L1 cache, 16 KB, 8 ways, 64 byte lines, 
sectored case 0x66: l1 = 8; break; // 66h data L1 cache, 8 KB, 4 ways, 64 byte lines, sectored case 0x67: l1 = 16; break; // 67h data L1 cache, 16 KB, 4 ways, 64 byte lines, sectored case 0x68: l1 = 32; break; // 68h data L1 cache, 32 KB, 4 ways, 64 byte lines, sectored case 0x1A: l2 = 96; break; // code and data L2 cache, 96 KB, 6 ways, 64 byte lines (IA-64) case 0x22: l3 = 512; break; // code and data L3 cache, 512 KB, 4 ways (!), 64 byte lines, dual-sectored case 0x23: l3 = 1024; break; // code and data L3 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored case 0x25: l3 = 2048; break; // code and data L3 cache, 2048 KB, 8 ways, 64 byte lines, dual-sectored case 0x29: l3 = 4096; break; // code and data L3 cache, 4096 KB, 8 ways, 64 byte lines, dual-sectored case 0x39: l2 = 128; break; // code and data L2 cache, 128 KB, 4 ways, 64 byte lines, sectored case 0x3A: l2 = 192; break; // code and data L2 cache, 192 KB, 6 ways, 64 byte lines, sectored case 0x3B: l2 = 128; break; // code and data L2 cache, 128 KB, 2 ways, 64 byte lines, sectored case 0x3C: l2 = 256; break; // code and data L2 cache, 256 KB, 4 ways, 64 byte lines, sectored case 0x3D: l2 = 384; break; // code and data L2 cache, 384 KB, 6 ways, 64 byte lines, sectored case 0x3E: l2 = 512; break; // code and data L2 cache, 512 KB, 4 ways, 64 byte lines, sectored case 0x40: l2 = 0; break; // no integrated L2 cache (P6 core) or L3 cache (P4 core) case 0x41: l2 = 128; break; // code and data L2 cache, 128 KB, 4 ways, 32 byte lines case 0x42: l2 = 256; break; // code and data L2 cache, 256 KB, 4 ways, 32 byte lines case 0x43: l2 = 512; break; // code and data L2 cache, 512 KB, 4 ways, 32 byte lines case 0x44: l2 = 1024; break; // code and data L2 cache, 1024 KB, 4 ways, 32 byte lines case 0x45: l2 = 2048; break; // code and data L2 cache, 2048 KB, 4 ways, 32 byte lines case 0x46: l3 = 4096; break; // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines case 0x47: l3 = 8192; break; // code and data L3 cache, 
8192 KB, 8 ways, 64 byte lines case 0x48: l2 = 3072; break; // code and data L2 cache, 3072 KB, 12 ways, 64 byte lines case 0x49: if(l2!=0) l3 = 4096; else {check_for_p2_core2=true; l3 = l2 = 4096;} break;// code and data L3 cache, 4096 KB, 16 ways, 64 byte lines (P4) or L2 for core2 case 0x4A: l3 = 6144; break; // code and data L3 cache, 6144 KB, 12 ways, 64 byte lines case 0x4B: l3 = 8192; break; // code and data L3 cache, 8192 KB, 16 ways, 64 byte lines case 0x4C: l3 = 12288; break; // code and data L3 cache, 12288 KB, 12 ways, 64 byte lines case 0x4D: l3 = 16384; break; // code and data L3 cache, 16384 KB, 16 ways, 64 byte lines case 0x4E: l2 = 6144; break; // code and data L2 cache, 6144 KB, 24 ways, 64 byte lines case 0x78: l2 = 1024; break; // code and data L2 cache, 1024 KB, 4 ways, 64 byte lines case 0x79: l2 = 128; break; // code and data L2 cache, 128 KB, 8 ways, 64 byte lines, dual-sectored case 0x7A: l2 = 256; break; // code and data L2 cache, 256 KB, 8 ways, 64 byte lines, dual-sectored case 0x7B: l2 = 512; break; // code and data L2 cache, 512 KB, 8 ways, 64 byte lines, dual-sectored case 0x7C: l2 = 1024; break; // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored case 0x7D: l2 = 2048; break; // code and data L2 cache, 2048 KB, 8 ways, 64 byte lines case 0x7E: l2 = 256; break; // code and data L2 cache, 256 KB, 8 ways, 128 byte lines, sect. 
(IA-64) case 0x7F: l2 = 512; break; // code and data L2 cache, 512 KB, 2 ways, 64 byte lines case 0x80: l2 = 512; break; // code and data L2 cache, 512 KB, 8 ways, 64 byte lines case 0x81: l2 = 128; break; // code and data L2 cache, 128 KB, 8 ways, 32 byte lines case 0x82: l2 = 256; break; // code and data L2 cache, 256 KB, 8 ways, 32 byte lines case 0x83: l2 = 512; break; // code and data L2 cache, 512 KB, 8 ways, 32 byte lines case 0x84: l2 = 1024; break; // code and data L2 cache, 1024 KB, 8 ways, 32 byte lines case 0x85: l2 = 2048; break; // code and data L2 cache, 2048 KB, 8 ways, 32 byte lines case 0x86: l2 = 512; break; // code and data L2 cache, 512 KB, 4 ways, 64 byte lines case 0x87: l2 = 1024; break; // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines case 0x88: l3 = 2048; break; // code and data L3 cache, 2048 KB, 4 ways, 64 byte lines (IA-64) case 0x89: l3 = 4096; break; // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines (IA-64) case 0x8A: l3 = 8192; break; // code and data L3 cache, 8192 KB, 4 ways, 64 byte lines (IA-64) case 0x8D: l3 = 3072; break; // code and data L3 cache, 3072 KB, 12 ways, 128 byte lines (IA-64) default: break; } } if(check_for_p2_core2 && l2 == l3) l3 = 0; l1 *= 1024; l2 *= 1024; l3 *= 1024; } inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) { if(max_std_funcs>=4) queryCacheSizes_intel_direct(l1,l2,l3); else queryCacheSizes_intel_codes(l1,l2,l3); } inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) { int abcd[4]; abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; EIGEN_CPUID(abcd,0x80000005,0); l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; EIGEN_CPUID(abcd,0x80000006,0); l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB } #endif /** \internal * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively 
*/ inline void queryCacheSizes(int& l1, int& l2, int& l3) { #ifdef EIGEN_CPUID int abcd[4]; // identify the CPU vendor EIGEN_CPUID(abcd,0x0,0); int max_std_funcs = abcd[1]; if(cpuid_is_vendor(abcd,"GenuineIntel")) queryCacheSizes_intel(l1,l2,l3,max_std_funcs); else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!")) queryCacheSizes_amd(l1,l2,l3); else // by default let's use Intel's API queryCacheSizes_intel(l1,l2,l3,max_std_funcs); // here is the list of other vendors: // ||cpuid_is_vendor(abcd,"VIA VIA VIA ") // ||cpuid_is_vendor(abcd,"CyrixInstead") // ||cpuid_is_vendor(abcd,"CentaurHauls") // ||cpuid_is_vendor(abcd,"GenuineTMx86") // ||cpuid_is_vendor(abcd,"TransmetaCPU") // ||cpuid_is_vendor(abcd,"RiseRiseRise") // ||cpuid_is_vendor(abcd,"Geode by NSC") // ||cpuid_is_vendor(abcd,"SiS SiS SiS ") // ||cpuid_is_vendor(abcd,"UMC UMC UMC ") // ||cpuid_is_vendor(abcd,"NexGenDriven") #else l1 = l2 = l3 = -1; #endif } /** \internal * \returns the size in Bytes of the L1 data cache */ inline int queryL1CacheSize() { int l1(-1), l2, l3; queryCacheSizes(l1,l2,l3); return l1; } /** \internal * \returns the size in Bytes of the L2 or L3 cache if this later is present */ inline int queryTopLevelCacheSize() { int l1, l2(-1), l3(-1); queryCacheSizes(l1,l2,l3); return (std::max)(l2,l3); } } // namespace FFLAS #endif // __FFLASFFPACK_memory_H fflas-ffpack-2.2.2/fflas-ffpack/utils/fflas_randommatrix.h000066400000000000000000000272101274716147400235730ustar00rootroot00000000000000/* * Copyright (C) FFLAS-FFPACK * Written by Brice Boyer (briceboyer) * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /*! @file utils/fflas_randommatrix.h * @ingroup tests * @brief Utilities to create matrices with prescribed shapes, properties,... * To be used in benchmarks/tests */ #ifndef __FFLASFFPACK_randommatrix_H #define __FFLASFFPACK_randommatrix_H #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/utils/debug.h" #include "fflas-ffpack/ffpack/ffpack.h" #include #include #include #include #include namespace FFPACK { /*! @brief Random Matrix. * Creates a \c m x \c n matrix with random entries. * @param F field * @param A pointer to the matrix (preallocated to at least \c m x \c lda field elements) * @param m number of rows in \p A * @param n number of cols in \p A * @param lda leading dimension of \p A * @return pointer to \c A. */ template typename Field::Element * RandomMatrix(const Field & F, typename Field::Element * A, size_t m, size_t n, size_t lda, size_t b=0) { typedef typename Field::RandIter Randiter ; Randiter R(F, b); for (size_t i=0 ; i=a); return x ; } /*! @brief Random Matrix with prescribed rank. * Creates an \c m x \c n matrix with random entries and rank \c r. * @param F field * @param A pointer to the matrix (preallocated to at least \c m x \c lda field elements) * @param r rank of the matrix to build * @param m number of rows in \p A * @param n number of cols in \p A * @param lda leading dimension of \p A * @return pointer to \c A. 
*/ template typename Field::Element_ptr RandomMatrixWithRank (const Field & F, typename Field::Element_ptr A, size_t lda, size_t r, size_t m, size_t n) { FFLASFFPACK_check(r <= std::min(m,n)); FFLASFFPACK_check(n <= lda); typedef typename Field::RandIter Randiter ; typedef typename Field::Element_ptr Element_ptr; Randiter R(F); Givaro::GeneralRingNonZeroRandIter nzR(R); size_t * P = FFLAS::fflas_new(n); size_t * Q = FFLAS::fflas_new(m); for (size_t i = 0 ; i < m ; ++i ) Q[i] = 0; for (size_t i = 0 ; i < n ; ++i ) P[i] = 0; Element_ptr U = FFLAS::fflas_new(F,m,n); Element_ptr L = FFLAS::fflas_new(F,m,m); /* Create L, lower invertible */ for (size_t i=0 ; i rows(N,false); while (curr void RandomMatrixWithRankandRPM (const Field& F, typename Field::Element_ptr A, size_t lda, size_t R, size_t M, size_t N, const size_t * RRP, const size_t * CRP){ typedef typename Field::RandIter Randiter ; Randiter RI(F); Givaro::GeneralRingNonZeroRandIter nzR(RI); typename Field::Element_ptr L= FFLAS::fflas_new(F,M,N); FFLAS::pfzero(F, M, N, L, N); FFLAS::ParSeqHelper::Parallel H; SYNCH_GROUP ( FOR1D(k, R, H, { size_t i = RRP[k]; size_t j = CRP[k]; nzR.random (L [i*N+j]); for (size_t l=i+1; l < M; ++l) RI.random (L [l*N+j]); })); typename Field::Element_ptr U= FFLAS::fflas_new(F,N,N); FFLAS::pfzero(F, N, N, U, N); SYNCH_GROUP ( FOR1D(i, N, H, { nzR.random (U [i*N+i]); for (size_t j=i+1; j < N; ++j) RI.random (U [i*N+j]); })); typename Field::Element alpha, beta; F.init(alpha,1.0); F.init(beta,0.0); // auto sp=SPLITTER(); //CP: broken with Modular. Need to reorganize the helper behaviour with ParSeq and ModeTraits auto sp=NOSPLIT(); FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, M,N,N, alpha, L, N, U, N, beta, A, lda, sp); FFLAS::fflas_delete(L); FFLAS::fflas_delete(U); } /*! @brief Random Matrix with prescribed rank, with random rank profile matrix * Creates an \c m x \c n matrix with random entries, rank \c r and with a rank profile matrix * chosen uniformly at random. 
* @param F field * @param A pointer to the matrix (preallocated to at least \c m x \c lda field elements) * @param r rank of the matrix to build * @param m number of rows in \p A * @param n number of cols in \p A * @param lda leading dimension of \p A * @return pointer to \c A. */ template void RandomMatrixWithRankandRandomRPM (const Field& F, typename Field::Element_ptr A, size_t lda, size_t R, size_t M, size_t N){ // generate the r pivots in the rank profile matrix E size_t pivot_r[R]; size_t pivot_c[R]; RandomRankProfile (M, R, pivot_r); RandomRankProfile (N, R, pivot_c); RandomMatrixWithRankandRPM (F, A, lda, R, M, N, pivot_r, pivot_c); } /*! @brief Random Matrix with prescribed det. * @bug duplicate with linbox * Creates a \c m x \c n matrix with random entries and rank \c r. * @param F field * @param A pointer to the matrix (preallocated to at least \c m x \c lda field elements) * @param r rank of the matrix to build * @param m number of rows in \p A * @param n number of cols in \p A * @param lda leading dimension of \p A * @return pointer to \c A. 
*/ template typename Field::Element * RandomMatrixWithDet(const Field & F, typename Field::Element * A, typename Field::Element d, size_t n, size_t lda) { FFLASFFPACK_check(n <= lda); typedef typename Field::RandIter Randiter ; typedef typename Field::Element Element ; Randiter R(F); Givaro::GeneralRingNonZeroRandIter nzR(R); size_t * P = FFLAS::fflas_new(n); size_t * Q = FFLAS::fflas_new(n); for (size_t i = 0 ; i < n ; ++i ) Q[i] = 0; for (size_t i = 0 ; i < n ; ++i ) P[i] = 0; Element * U = FFLAS::fflas_new(n*lda); Element * L = FFLAS::fflas_new(n*n); /* Create a random P,Q */ for (size_t i = 0 ; i < n ; ++i) P[i] = i + RandInt(0U,n-i); for (size_t i = 0 ; i < n ; ++i) Q[i] = i + RandInt(0U,n-i); /* det of P,Q */ int d1 =1 ; for (size_t i = 0 ; i < n ; ++i) if (P[i] != i) d1 = -d1; for (size_t i = 0 ; i < n ; ++i) if (Q[i] != i) d1 = -d1; /* Create L, lower det d */ for (size_t i=0 ; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #ifndef __FFLASFFPACK_limits_H #define __FFLASFFPACK_limits_H //#include #include #include #include #include template struct limits; // { // constexpr inline static T max() noexcept {return 0;} // constexpr inline static T min() noexcept {return 0;} // }; template <> struct limits { typedef unsigned char T ; constexpr inline static unsigned char max() noexcept {return UCHAR_MAX;} constexpr inline static unsigned char min() noexcept {return 0;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef signed char T ; constexpr inline static signed char max() noexcept {return SCHAR_MAX;} constexpr inline static signed char min() noexcept {return SCHAR_MIN;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef char T ; constexpr inline static char max() noexcept {return CHAR_MAX;} constexpr inline static char min() noexcept {return CHAR_MIN;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef unsigned short int T ; constexpr inline static unsigned short int max() noexcept {return USHRT_MAX;} constexpr inline static unsigned short int min() noexcept {return 0;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef short int T ; constexpr inline static short int max() noexcept {return SHRT_MAX;} constexpr inline static short int min() noexcept {return SHRT_MIN;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef unsigned int T ; constexpr inline static unsigned int max() noexcept {return UINT_MAX;} constexpr inline static unsigned int min() noexcept {return 0;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef int T ; 
constexpr inline static int max() noexcept {return INT_MAX;} constexpr inline static int min() noexcept {return INT_MIN;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef unsigned long T ; constexpr inline static unsigned long max() noexcept {return ULONG_MAX;} constexpr inline static unsigned long min() noexcept {return 0;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef long T ; constexpr inline static long max() noexcept {return LONG_MAX;} constexpr inline static long min() noexcept {return LONG_MIN;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef unsigned long long T ; constexpr inline static unsigned long long max() noexcept { return ULLONG_MAX; } constexpr inline static unsigned long long min() noexcept {return 0;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef long long T ; constexpr inline static long long max() noexcept {return LLONG_MAX;} constexpr inline static long long min() noexcept {return LLONG_MIN;} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef float T ; constexpr inline static int32_t max() noexcept {return (int32_t(1) << FLT_MANT_DIG) - 1;} constexpr inline static int32_t min() noexcept {return -((int32_t(1) << FLT_MANT_DIG) - 1);} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; template <> struct limits { typedef double T; constexpr inline static int64_t max() noexcept {return (int64_t(1) << DBL_MANT_DIG) - 1;} constexpr inline static int64_t min() noexcept {return -((int64_t(1) << DBL_MANT_DIG) - 1);} constexpr inline static int32_t digits() noexcept {return std:: numeric_limits::digits ;} }; 
template <> struct limits { typedef Givaro::Integer T; constexpr inline static int max() noexcept {return -1;} constexpr inline static int min() noexcept {return 0;} }; template struct limits > { typedef RecInt::ruint T; constexpr inline static RecInt::ruint max() noexcept {return RecInt::ruint(-1);} constexpr inline static RecInt::ruint min() noexcept {return 0;} }; /* limits for RecInt::rint: T names the signed type being described. max() is the all-ones ruint shifted right by one bit; min() is max() + 1 (presumably wrapping to the most negative value -- confirm against RecInt). */ template struct limits > { typedef RecInt::rint T; constexpr inline static RecInt::rint max() noexcept {return RecInt::rint(RecInt::ruint(-1) >> 1u);} constexpr inline static RecInt::rint min() noexcept {return max() + 1;} }; // template struct limits > { // constexpr inline static RecInt::rint max() noexcept {return RecInt::rint(RecInt::ruint(-1))/2;} // constexpr inline static RecInt::rint min() noexcept {return -RecInt::rint(RecInt::ruint(-1))/2;} // }; // template struct limits > { // constexpr inline static RecInt::ruint max() noexcept {return RecInt::rmint(-1);} // constexpr inline static RecInt::ruint min() noexcept {return 0;} // }; /* * in_range, determine if an element e of type E fit in a type T */ template typename std::enable_if::value == std::is_signed::value, bool>::type in_range(E e) { return (e >= limits::min() && e <= limits::max()); } template typename std::enable_if<(std::is_signed::value) && !(std::is_signed::value), bool>::type in_range(E e) { return (e <= static_cast(limits::max())); } template typename std::enable_if::value) && (std::is_signed::value), bool>::type in_range(E e) { return ((e >= 0) && (static_cast(e) <= limits::max())); } #endif /* __FFLASFFPACK_limits_H */ fflas-ffpack-2.2.2/fflas-ffpack/utils/print-utils.h000066400000000000000000000073421274716147400222110ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* tests/print-utils.h * Copyright (C) 2011, Brice Boyer (briceboyer) * Bastien Vialla * ========LICENCE======== * This file is part of the 
library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #ifndef __FFLASFFPACK_print_utils_H #define __FFLASFFPACK_print_utils_H #include #include // #include #include #include #include namespace std { /*! Prints a vector on output. * @param o output stream * @param v vector * @warning <<(ostream&,T&) exists ! */ template std::ostream & operator<<(std::ostream&o, const std::vector & v) { o << '[' ; std::copy(v.begin(), v.end(), std::ostream_iterator(o, " ")); // if (v.size()) { // size_t i = 0 ; // for (; i < v.size()-1 ; ++i) // o << v[i] << ',' ; // o << v[i] ; // } return o << ']' ; } /*! Prints a pair. * @param o output stream * @param C a pair * @warning <<(ostream&,T&) exists ! */ template std::ostream& operator<<(std::ostream& o, const std::pair & C) { o << '(' << C.first << ", " << C.second << ')'; return o ; } /*! Prints a list. * @param o output stream * @param C a pair * @warning <<(ostream&,T&) exists ! */ template std::ostream& operator<< (std::ostream& o, const std::list & L) { o << '{' ; std::copy(L.begin(), L.end(), std::ostream_iterator(o, " ")); return o << '}' ; // typename std::list::const_iterator it = L.begin() ; // if (it != L.end() ) // while(true) { // o << *it ; // if (++it != L.end()) // o << ", " ; // else // break; // } } /*! 
Prints a set. * @param o output stream * @param C a pair * @warning <<(ostream&,T&) exists ! */ template std::ostream& operator<< (std::ostream& o, const std::set & S) { o << '|' ; std::copy(S.begin(), S.end(), std::ostream_iterator(o, " ")); return o << '|' ; // typename std::set::const_iterator it = L.begin() ; // o << '|' ; // if (it != L.end() ) // while(true) { // o << *it ; // if (++it != L.end()) // o << ", " ; // else // break; // } // return o << '|' ; } #if 0 std::ostream &operator << (std::ostream &out, const std::vector &S) { std::vector::const_iterator i; for (i = S.begin (); i != S.end (); ++i) { out << ((*i) ? "1" : "0"); if (i != S.end () - 1) out << ", "; } return out; } template class Container> std::ostream& operator<< (std::ostream& o, const Container& C) { for(typename Container::const_iterator refs = C.begin(); refs != C.end() ; ++refs ) o << (*refs) << " " ; return o << std::endl; } #endif } #endif // __FFLASFFPACK_print_utils_H fflas-ffpack-2.2.2/fflas-ffpack/utils/timer.h000066400000000000000000000042061274716147400210330ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* test/timer.h * Copyright (C) 1994-1997 Givaro Team * * Written by T. Gautier * * ------------------------------------ * Modified by Bradford Hovinen * * Added _start_t member to BaseTimer, so that stop () does not clobber the * class' memory of its start time. This allows it to be called repeatedly to * get elapsed times. * ------------------------------------ * Modified by Clement Pernet * integrated into FFLAS_FFPACK * * ------------------------------------ * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * * * This file makes the Givaro timer classes (Timer, BaseTimer, UserTimer, * SysTimer and, when __GIVARO_USE_OPENMP is defined, OMPTimer) available * in namespace FFLAS. */ #ifndef __FFLASFFPACK_timer_H #define __FFLASFFPACK_timer_H #include /* defining __FFLASFFPACK_USE_OPENMP also turns on __GIVARO_USE_OPENMP */ #ifdef __FFLASFFPACK_USE_OPENMP # ifndef __GIVARO_USE_OPENMP # define __GIVARO_USE_OPENMP 1 # endif #endif #include #ifdef __GIVARO_USE_OPENMP #include #endif /* FFLAS-level aliases for the Givaro timers */ namespace FFLAS { typedef Givaro::Timer Timer ; typedef Givaro::BaseTimer BaseTimer ; typedef Givaro::UserTimer UserTimer ; typedef Givaro::SysTimer SysTimer ; #ifdef __GIVARO_USE_OPENMP typedef Givaro::OMPTimer OMPTimer ; #endif } #endif // __FFLASFFPACK_timer_H fflas-ffpack-2.2.2/incremente-versions000077500000000000000000000077551274716147400200200ustar00rootroot00000000000000#!/bin/csh -f # Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. 
# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ set conf = configure.ac set ver = Makefile.am #verbatim second argument of AC_INIT set verb = `grep ^AC_INIT $conf | cut -d',' -f2` #removes spaces and brackets set vern = `echo "$verb" | sed 's/ //g;s/\[//;s/\]//'` echo "Current version is $vern." echo -n "Increment library version ? (y/n)" set answ = $< if ("$answ" == "y") then set line = `fgrep -n ^AC_INIT $conf | cut -d':' -f1` #gets the line set macro = `echo "$vern" | cut -d'.' -f1` #a version number is macro.minor.micro set minor = `echo "$vern" | cut -d'.' -f2` set micro = `echo "$vern" | cut -d'.' -f3` set tmpfile = `mktemp` #tempfile set sedfile = `mktemp` #temp sed file set pmicro = `echo $micro` @ pmicro ++ set pminor = `echo $minor` @ pminor ++ set pmacro = `echo $macro` @ pmacro ++ echo "Increment micro revision number ($vern -> $macro.$minor.$pmicro) ? press '0' " echo "Increment minor revision number ($vern -> $macro.$pminor.0) ? press '1' " echo -n "Increment macro revision number ($vern -> $pmacro.0.0) ? press '2' " set increm = $< switch ($increm) case 0: set newv = "[$macro.$minor.$pmicro]" breaksw case 1: set newv = "[$macro.$pminor.0]" breaksw case 2: set newv = "[$pmacro.0.0]" breaksw default: set newv = "$verb" echo "'$increm' read. Not incrementing anything." breaksw endsw #replacing [ ] and . 
with escaped version for sed would understand them as 'operators' echo "$line s/$verb/$newv/" | sed 's/\./\\\./g;s/\[/\\\[/g;s/\]/\\\]/g' > $sedfile sed -f $sedfile $conf > $tmpfile #clean up \rm -f $sedfile #diff for changes diff -u0 $conf $tmpfile #if something was changed, confirm incrementation : if ("$newv" != "$verb") then echo -n "Confirmation of incrementation ? (yes/no)" set answ = $< set backupconf = $conf.back$$ if ("$answ" == "yes") then \cp -p $conf $backupconf echo "Back-up of $conf made in $backupconf. Now overwriting $conf." \mv -f $tmpfile $conf else echo "'$answ' read. Not incrementing anything." \rm -f $tmpfile exit 0 ; endif #now change Makefile accordingly echo -n "Incrementing Makefile revision accordingly" set tmpfile = `mktemp` #tempfile set sedfile = `mktemp` #tempfile switch ($increm) case 0: echo -n "s/VERSION.*/VERSION=$macro.$minor.$pmicro/" >> $sedfile breaksw case 1: echo "s/VERSION.*/VERSION=$macro.$pminor.0/" > $sedfile breaksw case 2: echo "s/VERSION.*/VERSION=$pmacro.0.0/" > $sedfile breaksw default: echo "Something abnormal happened" exit 1 breaksw endsw sed -f $sedfile $ver > $tmpfile \rm -f $sedfile diff -u0 $ver $tmpfile echo -n "Confirmation of incrementation ? (yes/no) " set answ = $< if ("$answ" == "yes") then \mv -f $tmpfile $ver echo " your old $conf is destroyed..." \rm -f $backupconf else echo "'$answ' read. Not incrementing anything." echo " your old $conf is restored..." \rm -f $tmpfile \mv -f $backupconf $conf exit 0 endif endif else echo "'$answ' read. Not doing anything." 
endif exit 0 fflas-ffpack-2.2.2/macros/000077500000000000000000000000001274716147400153415ustar00rootroot00000000000000fflas-ffpack-2.2.2/macros/CodeChunk/000077500000000000000000000000001274716147400172045ustar00rootroot00000000000000fflas-ffpack-2.2.2/macros/CodeChunk/Makefile.am000066400000000000000000000022411274716147400212370ustar00rootroot00000000000000# Copyright (c) 2013 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ EXTRA_DIST= \ clapack.C \ lapack.C \ cblas.C \ givaro.C \ cuda.C \ sse.C \ avx.C \ gmp.C fflas-ffpack-2.2.2/macros/CodeChunk/avx.C000066400000000000000000000002671274716147400201130ustar00rootroot00000000000000#include int main() { __m256d P ; double p = 0; P = _mm256_set1_pd(p); P = _mm256_add_pd(P,P); #ifdef __try_avx2 P = _mm256_fnmadd_pd(P,P,P); #endif return 0; } fflas-ffpack-2.2.2/macros/CodeChunk/cblas.C000066400000000000000000000024611274716147400203770ustar00rootroot00000000000000/* * Copyright (C) 2013 FFLAS-FFPACK group. * * Extirpé form a m4 macro by Brice Boyer (briceboyer) . * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #define __FFLASFFPACK_CONFIGURATION #include "fflas-ffpack/config-blas.h" int main () { double a[4] = {1.,2.,3.,4.}; double b[4] = {4.,3.,2.,1.}; double c[4]; cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans,2,2,2,1., a,2,b,2,0.,c,2); if ( (c[0]!=8.) && (c[1]!=5.) && (c[2]!=20.) && (c[3]!=13)) return -1; else return 0; } fflas-ffpack-2.2.2/macros/CodeChunk/clapack.C000066400000000000000000000025031274716147400207060ustar00rootroot00000000000000/* * Copyright (C) 2013 FFLAS-FFPACK group. * * Extirpé form a m4 macro by Brice Boyer (briceboyer) . * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #define __FFLASFFPACK_CONFIGURATION #define __FFLASFFPACK_HAVE_LAPACK 1 #define __FFLASFFPACK_HAVE_CLAPACK 1 #include "fflas-ffpack/config-blas.h" int main () { double a[4] = {1.,2.,3.,4.}; CBLAS_INT ipiv[2]; clapack_dgetrf(CblasRowMajor, 2, 2, a, 2, ipiv); if ( (a[0]!=2.) && (a[1]!=0.5) && (a[2]!=4.) && (a[3]!=1.)) return -1; else return 0; } fflas-ffpack-2.2.2/macros/CodeChunk/cuda.C000066400000000000000000000002311274716147400202200ustar00rootroot00000000000000#include #include #include int main() { cusparseHandle_t handle = 0; cusparseCreate( &handle ); return 0 ; } fflas-ffpack-2.2.2/macros/CodeChunk/givaro.C000066400000000000000000000023071274716147400206010ustar00rootroot00000000000000/* * Copyright (C) 2013 FFLAS-FFPACK group. * * Extirpé form a m4 macro by Brice Boyer (briceboyer) . * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #include int main () { if (GIVARO_VERSION < $version_min || GIVARO_VERSION >= $version_max || GIVARO_VERSION>0x030000) return -1; else return 0; /* old version of Givaro are defined as hexa 0x03yyzz*/ } fflas-ffpack-2.2.2/macros/CodeChunk/gmp.C000066400000000000000000000002271274716147400200740ustar00rootroot00000000000000#include int main () { if (__GNU_MP_VERSION < 4) return -1; mpz_class a(2),b(3),c(5); if ( a+b == c ) return 0; else return -1; } fflas-ffpack-2.2.2/macros/CodeChunk/lapack.C000066400000000000000000000025061274716147400205460ustar00rootroot00000000000000/* * Copyright (C) 2013 FFLAS-FFPACK group. * * Extirpé form a m4 macro by Brice Boyer (briceboyer) . * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #define __FFLASFFPACK_CONFIGURATION #define __FFLASFFPACK_HAVE_LAPACK 1 // #define __FFLASFFPACK_HAVE_CLAPACK 1 #include "fflas-ffpack/config-blas.h" int main () { double a[4] = {1.,2.,3.,4.}; CBLAS_INT ipiv[2]; clapack_dgetrf(CblasRowMajor, 2, 2, a, 2, ipiv); if ( (a[0]!=2.) && (a[1]!=0.5) && (a[2]!=4.) && (a[3]!=1.)) return -1; else return 0; } fflas-ffpack-2.2.2/macros/CodeChunk/sse.C000066400000000000000000000002471274716147400201050ustar00rootroot00000000000000#include int main() { // SSE 2 __m128d P ; double p = 0; P = _mm_set1_pd(p); P = _mm_add_pd(P,P); // SSE 4.1 P = _mm_floor_pd(P); return 0; } fflas-ffpack-2.2.2/macros/Makefile.am000066400000000000000000000024211274716147400173740ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # adapted from LinBox configuration # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ SUBDIRS=CodeChunk EXTRA_DIST= \ aclocal-include.m4 \ fflas-ffpack-blas.m4 \ config-header.m4 \ debug.m4 \ fflas-ffpack-doc.m4 \ fflas-ffpack-misc.m4 \ fflas-ffpack-opt.m4 \ fflas-ffpack-precompile.m4\ givaro-check.m4 \ mkl-check.m4 \ avx-check.m4 \ omp-check.m4 \ cuda-check.m4 fflas-ffpack-2.2.2/macros/aclocal-include.m4000066400000000000000000000025561274716147400206320ustar00rootroot00000000000000dnl aclocal-include.m4 dnl Copyright (c) 2011 FFLAS-FFPACK dnl written by Brice Boyer (briceboyer) dnl adapted from LinBox configuration dnl dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ dnl This macro adds the name macrodir to the set of directories dnl that `aclocal' searches for macros. 
dnl serial 1 dnl AM_ACLOCAL_INCLUDE(macrodir) AC_DEFUN([AM_ACLOCAL_INCLUDE], [ AM_CONDITIONAL(INSIDE_GNOME_COMMON, test x = y) test -n "$ACLOCAL_FLAGS" && ACLOCAL="$ACLOCAL $ACLOCAL_FLAGS" for k in $1 ; do ACLOCAL="$ACLOCAL -I $k" ; done ]) fflas-ffpack-2.2.2/macros/avx-check.m4000066400000000000000000000055711274716147400174640ustar00rootroot00000000000000dnl Check for AVX dnl Copyright (c) 2011 FFLAS-FFPACK dnl Created by BB, 2014-03-25 dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. 
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with this library; if not, write to the Free Software
dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
dnl ========LICENCE========
dnl

dnl FF_CHECK_AVX
dnl
dnl turn on AVX or AVX2 extensions if available
dnl
dnl Unless --disable-avx is given, the program in macros/CodeChunk/avx.C is
dnl compiled and run, first with no extra flag and then with "-mavx". On
dnl success HAVE_AVX_INSTRUCTIONS is defined and the flag that worked is
dnl stored in AVXFLAGS. The same program is then tried with __try_avx2
dnl defined, with no extra flag and then with "-mfma -mavx2"; on success
dnl HAVE_AVX2_INSTRUCTIONS is defined and AVXFLAGS is replaced by AVX2FLAGS.
dnl When cross compiling both extensions are reported as not found.
dnl CXXFLAGS is restored to its initial value once the tests are done.

AC_DEFUN([FF_CHECK_AVX],
[
	AC_ARG_ENABLE(avx,[AC_HELP_STRING([--disable-avx], [ Disable Intel(r) AVX])])
	AC_MSG_CHECKING(for AVX)
	AS_IF([ test "x$enable_avx" != "xno" ],
	[
		BACKUP_CXXFLAGS=${CXXFLAGS}
		CODE_AVX=`cat macros/CodeChunk/avx.C`
		dnl Check for AVX
		dnl Intel compilers usually do not require option to enable avx
		dnl Thus, we test with no option on
		for switch_avxflags in "" "-mavx"; do
			CXXFLAGS="${BACKUP_CXXFLAGS} -O0 ${switch_avxflags}"
			AC_TRY_RUN([
				${CODE_AVX}
			],
			[
				avx_found="yes"
				AVXFLAGS=${switch_avxflags}
				break
			],
			[
				avx_found="no"
			],
			[
				echo "cross compiling...disabling"
				avx_found="no"
				break
			])
		done
		dnl Is AVX found?
		AS_IF([ test "x$avx_found" = "xyes" ],
		[
			AC_MSG_RESULT(yes)
			AC_DEFINE(HAVE_AVX_INSTRUCTIONS,1,[Define if AVX is available])
			dnl Check for AVX2
			AC_MSG_CHECKING(for AVX2)
			for switch_avx2flags in "" "-mfma -mavx2"; do
				CXXFLAGS="${BACKUP_CXXFLAGS} -O0 ${switch_avx2flags}"
				AC_TRY_RUN(
				[
					#define __try_avx2
					${CODE_AVX}
				],
				[
					avx2_found="yes"
					AVX2FLAGS=${switch_avx2flags}
					break
				],
				[
					avx2_found="no"
				],
				[
					echo "cross compiling...disabling"
					dnl shell assignment: no blanks around the equal sign
					avx2_found="no"
					break
				])
			done
			dnl Is AVX2 found?
			AS_IF([ test "x$avx2_found" = "xyes" ],
			[
				AC_MSG_RESULT(yes)
				AC_DEFINE(HAVE_AVX2_INSTRUCTIONS,1,[Define if AVX2 is available])
				AVXFLAGS=${AVX2FLAGS}
			],
			[
				dnl No AVX2
				AC_MSG_RESULT(no)
			]
			)
		],
		[
			dnl No AVX
			AC_MSG_RESULT(no)
		]
		)
		CXXFLAGS=${BACKUP_CXXFLAGS}
	],
	[
		dnl --disable-avx
		AC_MSG_RESULT(no [disabled])
	]
	)
])
fflas-ffpack-2.2.2/macros/ax_check_x86_features.m4000066400000000000000000000066561274716147400217700ustar00rootroot00000000000000# =========================================================================== # http://www.gnu.org/software/autoconf-archive/ax_check_x86_features.html # =========================================================================== # # SYNOPSIS # # AX_CHECK_X86_FEATURES([ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND]) # # DESCRIPTION # # Checks if the host cpu supports various x86 instruction set, the # instructions that will get tested are "mmx, popcnt, sse, sse2, sse3, # sse4.1, sse4.2, sse4a, avx, avx2, avx512f, fma, fma4, bmi, bmi2". If the # instruction set is supported by the host cpu, the C preprocessor macro # HAVE_XXX_INSTRUCTIONS is set to 1. The XXX is up-cased instruction case # with dot replaced by underscore. For example, the test for "sse4.2" # would export HAVE_SSE4_2_INSTRUCTIONS=1. Also the compiler flag # "-msse4.2" would be added to X86_FEATURE_CFLAGS variable, that can be # obtained in Makefile.am using @X86_FEATURE_CFLAGS@. # # If any of the test for the instruction set were succeeded, the configure # script would run ACTION-IF-FOUND if it is specified, or append # X86_FEATURE_CFLAGS to CFLAGS. If none of the instruction were found, # ACTION-IF-NOT-FOUND hook is triggered. # # This macro requires gcc extended builtin function "__builtin_cpu_init" # and "__builtin_cpu_supports" to detect the cpu features. It will error # out if the compiler doesn't has these builtins. # # See also AX_GCC_X86_CPU_SUPPORTS, which is the actual macro that perform # the checks for the instruction sets. 
# # LICENSE # # Copyright (c) 2016 Felix Chern # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro. You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. # # This special exception to the GPL applies to versions of the Autoconf # Macro released by the Autoconf Archive. When you make and distribute a # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. 
#serial 1 AC_DEFUN([AX_CHECK_X86_FEATURES], [m4_foreach_w( [ax_x86_feature], [mmx popcnt sse sse2 sse3 sse4.1 sse4.2 sse4a avx avx2 avx512f fma fma4 bmi bmi2], [AX_GCC_X86_CPU_SUPPORTS(ax_x86_feature, [X86_FEATURE_CFLAGS="$X86_FEATURE_CFLAGS -m[]ax_x86_feature"], []) ]) AC_SUBST([X86_FEATURE_CFLAGS]) m4_ifval([$1],[$1], [CXXFLAGS="$CXXFLAGS $X86_FEATURE_CFLAGS"]) $2 ]) fflas-ffpack-2.2.2/macros/ax_cxx_compile_stdcxx_11.m4000066400000000000000000000107631274716147400225120ustar00rootroot00000000000000# ============================================================================ # http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html # ============================================================================ # # SYNOPSIS # # AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional]) # # DESCRIPTION # # Check for baseline language coverage in the compiler for the C++11 # standard; if necessary, add switches to CXXFLAGS to enable support. # # The first argument, if specified, indicates whether you insist on an # extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. # -std=c++11). If neither is specified, you get whatever works, with # preference for an extended mode. # # The second argument, if specified 'mandatory' or if left unspecified, # indicates that baseline C++11 support is required and that the macro # should error out if no mode with that support is found. If specified # 'optional', then configuration proceeds regardless, after defining # HAVE_CXX11 if and only if a supporting mode is found. # # LICENSE # # Copyright (c) 2008 Benjamin Kosnik # Copyright (c) 2012 Zack Weinberg # Copyright (c) 2013 Roy Stogner # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. 
#serial 3 m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [ template struct check { static_assert(sizeof(int) <= sizeof(T), "not big enough"); }; typedef check> right_angle_brackets; int a; decltype(a) b; typedef check check_type; check_type c; check_type&& cr = static_cast(c); auto d = a; ]) AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl m4_if([$1], [], [], [$1], [ext], [], [$1], [noext], [], [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl m4_if([$2], [], [ax_cxx_compile_cxx11_required=true], [$2], [mandatory], [ax_cxx_compile_cxx11_required=true], [$2], [optional], [ax_cxx_compile_cxx11_required=false], [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])dnl AC_LANG_PUSH([C++])dnl ac_success=no AC_CACHE_CHECK(whether $CXX supports C++11 features by default, ax_cv_cxx_compile_cxx11, [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], [ax_cv_cxx_compile_cxx11=yes], [ax_cv_cxx_compile_cxx11=no])]) if test x$ax_cv_cxx_compile_cxx11 = xyes; then ac_success=yes fi m4_if([$1], [noext], [], [dnl if test x$ac_success = xno; then for switch in -std=gnu++11 -std=gnu++0x; do cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch]) AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch, $cachevar, [ac_save_CXXFLAGS="$CXXFLAGS" CXXFLAGS="$CXXFLAGS $switch" AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], [eval $cachevar=yes], [eval $cachevar=no]) CXXFLAGS="$ac_save_CXXFLAGS"]) if eval test x\$$cachevar = xyes; then CXXFLAGS="$CXXFLAGS $switch" ac_success=yes break fi done fi]) m4_if([$1], [ext], [], [dnl if test x$ac_success = xno; then for switch in -std=c++11 -std=c++0x; do cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch]) AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch, $cachevar, [ac_save_CXXFLAGS="$CXXFLAGS" CXXFLAGS="$CXXFLAGS $switch" AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], [eval $cachevar=yes], [eval $cachevar=no]) 
CXXFLAGS="$ac_save_CXXFLAGS"]) if eval test x\$$cachevar = xyes; then CXXFLAGS="$CXXFLAGS $switch" ac_success=yes break fi done fi]) AC_LANG_POP([C++]) if test x$ax_cxx_compile_cxx11_required = xtrue; then if test x$ac_success = xno; then AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.]) fi else if test x$ac_success = xno; then HAVE_CXX11=0 AC_MSG_NOTICE([No compiler with C++11 support was found]) else HAVE_CXX11=1 AC_DEFINE(HAVE_CXX11,1, [define if the compiler supports basic C++11 syntax]) fi AC_SUBST(HAVE_CXX11) fi ]) fflas-ffpack-2.2.2/macros/ax_gcc_x86_cpu_supports.m4000066400000000000000000000101231274716147400223570ustar00rootroot00000000000000# =========================================================================== # http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpu_supports.html # =========================================================================== # # SYNOPSIS # # AX_GCC_X86_CPU_SUPPORTS(X86-INSTRUCTION-SET, # [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND]) # # DESCRIPTION # # Checks if the host cpu supports X86-INSTRUCTION-SET. The instruction set # that can be tested are "mmx, popcnt, sse, sse2, sse3, sse4.1, sse4.2, # sse4a, avx, avx2, avx512f, fma, fma4, bmi, bmi2". If the instruction set # is supported by the host cpu, the C preprocessor macro # HAVE_XXX_INSTRUCTIONS is set to 1. The XXX is up-cased instruction case # with dot replaced by underscore. For example, the test for "sse4.2" # would export HAVE_SSE4_2_INSTRUCTIONS=1. This macro requires gcc # extended builtin function "__builtin_cpu_init" and # "__builtin_cpu_supports" to detect the cpu features. It will error out # if the compiler doesn't has these builtins. # # If the test for the instruction set succeeded, the hook ACTION-IF-FOUND # would run. Otherwise the hook ACTION-IF-NOT-FOUND would run if # specified. # # See also AX_CHECK_X86_FEATURES, which checks all the possible # instruction set and export the corresponding CFLAGS. 
# # LICENSE # # Copyright (c) 2016 Felix Chern # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro. You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. # # This special exception to the GPL applies to versions of the Autoconf # Macro released by the Autoconf Archive. When you make and distribute a # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. 
#serial 1 AC_DEFUN_ONCE([_AX_GCC_X86_CPU_INIT], [AC_LANG_PUSH([C]) AC_CACHE_CHECK([for gcc __builtin_cpu_init function], [ax_cv_gcc_check_x86_cpu_init], [AC_RUN_IFELSE( [AC_LANG_PROGRAM([#include ], [__builtin_cpu_init ();]) ], [ax_cv_gcc_check_x86_cpu_init=yes], [ax_cv_gcc_check_x86_cpu_init=no])]) AS_IF([test "X$ax_cv_gcc_check_x86_cpu_init" = "Xno"], [AC_MSG_ERROR([Need GCC to support X86 CPU features tests])]) ]) AC_DEFUN([AX_GCC_X86_CPU_SUPPORTS], [AC_REQUIRE([AC_PROG_CC]) AC_REQUIRE([_AX_GCC_X86_CPU_INIT]) AC_LANG_PUSH([C]) AS_VAR_PUSHDEF([gcc_x86_feature], [AS_TR_SH([ax_cv_gcc_x86_cpu_supports_$1])]) AC_CACHE_CHECK([for x86 $1 instruction support], [gcc_x86_feature], [AC_RUN_IFELSE( [AC_LANG_PROGRAM( [#include ], [ __builtin_cpu_init (); if (__builtin_cpu_supports("$1")) return 0; return 1; ])], [gcc_x86_feature=yes], [gcc_x86_feature=no] )] ) AC_LANG_POP([C]) AS_VAR_IF([gcc_x86_feature],[yes], [AC_DEFINE( AS_TR_CPP([HAVE_$1_INSTRUCTIONS]), [1], [Define if $1 instructions are supported]) $2], [$3] ) AS_VAR_POPDEF([gcc_x86_feature]) ]) fflas-ffpack-2.2.2/macros/config-header.m4000066400000000000000000000101341274716147400202750ustar00rootroot00000000000000dnl Copyright (c) 2011 FFLAS-FFPACK dnl This file is part of FFLAS-FFPACK dnl This was copied from Givaro's dnl dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. 
dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ AC_DEFUN([AX_PREFIX_CONFIG_H],[AC_REQUIRE([AC_CONFIG_HEADERS]) AC_CONFIG_COMMANDS([ifelse($1,,$PACKAGE-config.h,$1)],[dnl AS_VAR_PUSHDEF([_OUT],[ac_prefix_conf_OUT])dnl AS_VAR_PUSHDEF([_DEF],[ac_prefix_conf_DEF])dnl AS_VAR_PUSHDEF([_PKG],[ac_prefix_conf_PKG])dnl AS_VAR_PUSHDEF([_LOW],[ac_prefix_conf_LOW])dnl AS_VAR_PUSHDEF([_UPP],[ac_prefix_conf_UPP])dnl AS_VAR_PUSHDEF([_INP],[ac_prefix_conf_INP])dnl m4_pushdef([_script],[conftest.prefix])dnl m4_pushdef([_symbol],[m4_cr_Letters[]m4_cr_digits[]_])dnl _OUT=`echo ifelse($1, , $PACKAGE-config.h, $1)` _DEF=`echo _$_OUT | sed -e "y:m4_cr_letters:m4_cr_LETTERS[]:" -e "s/@<:@^m4_cr_Letters@:>@/_/g"` _PKG=`echo ifelse($2, , $PACKAGE, $2)` _LOW=`echo _$_PKG | sed -e "y:m4_cr_LETTERS-:m4_cr_letters[]_:"` _UPP=`echo $_PKG | sed -e "y:m4_cr_letters-:m4_cr_LETTERS[]_:" -e "/^@<:@m4_cr_digits@:>@/s/^/_/"` _INP=`echo "ifelse($3,,,$3)" | sed -e 's/ *//'` if test ".$_INP" = "."; then for ac_file in : $CONFIG_HEADERS; do test "_$ac_file" = _: && continue case "$ac_file" in *.h) _INP=$ac_file ;; *) esac test ".$_INP" != "." && break done fi if test ".$_INP" = "."; then case "$_OUT" in */*) _INP=`basename "$_OUT"` ;; *-*) _INP=`echo "$_OUT" | sed -e "s/@<:@_symbol@:>@*-//"` ;; *) _INP=config.h ;; esac fi if test -z "$_PKG" ; then AC_MSG_ERROR([no prefix for _PREFIX_PKG_CONFIG_H]) else if test ! 
-f "$_INP" ; then if test -f "$srcdir/$_INP" ; then _INP="$srcdir/$_INP" fi fi AC_MSG_NOTICE(creating $_OUT - prefix $_UPP for $_INP defines) if test -f $_INP ; then echo "s/@%:@undef *\\(@<:@m4_cr_LETTERS[]_@:>@\\)/@%:@undef $_UPP""_\\1/" > _script echo "s/@%:@undef *\\(@<:@m4_cr_letters@:>@\\)/@%:@undef $_LOW""_\\1/" >> _script echo "s/@%:@def[]ine *\\(@<:@m4_cr_LETTERS[]_@:>@@<:@_symbol@:>@*\\)\\(.*\\)/@%:@ifndef $_UPP""_\\1 \\" >> _script echo "@%:@def[]ine $_UPP""_\\1 \\2 \\" >> _script echo "@%:@endif/" >>_script echo "s/@%:@def[]ine *\\(@<:@m4_cr_letters@:>@@<:@_symbol@:>@*\\)\\(.*\\)/@%:@ifndef $_LOW""_\\1 \\" >> _script echo "@%:@define $_LOW""_\\1 \\2 \\" >> _script echo "@%:@endif/" >> _script # now executing _script on _DEF input to create _OUT output file echo "@%:@ifndef $_DEF" >$tmp/pconfig.h echo "@%:@def[]ine $_DEF 1" >>$tmp/pconfig.h echo ' ' >>$tmp/pconfig.h echo /'*' $_OUT. Generated automatically at end of configure. '*'/ >>$tmp/pconfig.h sed -f _script $_INP >>$tmp/pconfig.h echo ' ' >>$tmp/pconfig.h echo '/* once:' $_DEF '*/' >>$tmp/pconfig.h echo "@%:@endif" >>$tmp/pconfig.h if cmp -s $_OUT $tmp/pconfig.h 2>/dev/null; then AC_MSG_NOTICE([$_OUT is unchanged]) else ac_dir=`AS_DIRNAME(["$_OUT"])` AS_MKDIR_P(["$ac_dir"]) rm -f "$_OUT" mv $tmp/pconfig.h "$_OUT" fi else AC_MSG_ERROR([input file $_INP does not exist - skip generating $_OUT]) fi rm -f conftest.* fi m4_popdef([_symbol])dnl m4_popdef([_script])dnl AS_VAR_POPDEF([_INP])dnl AS_VAR_POPDEF([_UPP])dnl AS_VAR_POPDEF([_LOW])dnl AS_VAR_POPDEF([_PKG])dnl AS_VAR_POPDEF([_DEF])dnl AS_VAR_POPDEF([_OUT])dnl ],[PACKAGE="$PACKAGE"])]) fflas-ffpack-2.2.2/macros/cuda-check.m4000066400000000000000000000070361274716147400176000ustar00rootroot00000000000000dnl Check for CUDA dnl Copyright(c)'1994-2009,2003,2013 by The Givaro group dnl This file is part of FFLAS-FFPACK dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. 
dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ dnl Modified by Pascal Giorgi, 2003-12-03 dnl Modified by BB, 2013-5-22 and other times dnl Test for CUDA dnl Sets CUDA_CFLAGS and CUDA_LIBS dnl Defines HAVE_CUDA AC_DEFUN([FF_CHECK_CUDA], [ AC_ARG_WITH(cuda, [AC_HELP_STRING([--with-cuda=|yes|no],[ Use CUDA library. If argument is no, you do not have the library installed on your machine. If argument is yes or that means the library is reachable with the standard search path "/usr" or "/usr/local" (set as default). Otherwise you give the to the directory which contain the library. ])], [if test "$withval" = yes ; then CUDA_HOME_PATH="${DEFAULT_CHECKING_PATH}" elif test "$withval" != no ; then CUDA_HOME_PATH="$withval ${DEFAULT_CHECKING_PATH}" fi], [CUDA_HOME_PATH="${DEFAULT_CHECKING_PATH}"]) min_cuda_version=ifelse([$1], ,5.5.0,$1) dnl Check for existence BACKUP_CXXFLAGS=${CXXFLAGS} BACKUP_LIBS=${LIBS} AC_MSG_CHECKING(for CUDA >= $min_cuda_version ) dnl todo lib (32) and lib64. 
CUDA_PATH= for CUDA_HOME in ${CUDA_HOME_PATH} do if test "x$CUDA_HOME" != "x/usr" -a "x$CUDA_HOME" != "x/usr/local"; then if test -r "$CUDA_HOME/include/cuda.h" ; then CUDA_CFLAGS="-I${CUDA_HOME}/include" CUDA_PATH="-L${CUDA_HOME}/lib64" CUDA_LIBS="-L${CUDA_HOME}/lib64 -lcusparse" else echo "($CUDA_HOME) seems an invalid CUDA prefix" echo "Searching CUDA in PATH" CUDA_CFLAGS="" CUDA_LIBS="-lcusparse" fi else CUDA_CFLAGS="" CUDA_LIBS="-lcusparse" fi CXXFLAGS="${CXXFLAGS} ${CUDA_CFLAGS}" LIBS="${LIBS} ${CUDA_LIBS}" CODE_CUDA=`cat macros/CodeChunk/cuda.C` AC_TRY_LINK( [ #include ], [ CUresult a;], [ dnl # See if we are running CUDA 4.0 with --enable-cxx AC_TRY_RUN( [ ${CODE_CUDA} ], [ AC_MSG_RESULT(found) AC_DEFINE(HAVE_CUDA,1,[Define if CUDA is installed]) dnl CUDA_VERSION="" dnl I could find it but why is it here ? CUDA_LIBS="${CUDA_PATH} -lcusparse" dnl AC_SUBST(CUDA_VERSION) AC_SUBST(CUDA_LIBS) AC_SUBST(CUDA_CFLAGS) break; ],[ AC_MSG_RESULT(no : cuda is too old or not found) dnl AC_SUBST(CUDA_VERSION) ],[ dnl This should never happen AC_MSG_RESULT(no) ]) ],[ AC_MSG_RESULT(unknown) echo "WARNING: You appear to be cross compiling, so there is no way to determine" echo "whether your CUDA version is new enough. I am assuming it is." AC_SUBST(CUDA_CFLAGS) AC_SUBST(CUDA_LIBS) AC_DEFINE(HAVE_CUDA,1,[Define if CUDA is installed]) ]) unset CUDA_CFLAGS unset CUDA_LIBS done CXXFLAGS=${BACKUP_CXXFLAGS} LIBS=${BACKUP_LIBS} #unset LD_LIBRARY_PATH ]) fflas-ffpack-2.2.2/macros/debug.m4000066400000000000000000000106151274716147400166740ustar00rootroot00000000000000dnl Copyright(c)'2011 FFLAS-FFPACK dnl Written by Brice Boyer (briceboyer) dnl dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. 
dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ dnl enable basic debug mode. AC_DEFUN([AC_DEBUG], [AC_MSG_CHECKING([whether to enable debugging options in the library]) AC_ARG_ENABLE(debug, [AC_HELP_STRING([--enable-debug=yes|no], [enable debugging options in library])], USE_DEBUG=$enableval, USE_DEBUG=no) AC_MSG_RESULT([$USE_DEBUG]) AM_CONDITIONAL(DEBUG, [test x$USE_DEBUG = xyes]) DBG=$USE_DEBUG AC_SUBST(DBG)dnl ] ) AC_DEFUN([AC_PROFILE], [AC_MSG_CHECKING([whether to enable profiling everything in the library]) AC_ARG_ENABLE(profile, [AC_HELP_STRING([--enable-profile=yes|no], [enable profiling options in library])], USE_PROFILE=$enableval, USE_PROFILE=no) AC_MSG_RESULT([$USE_PROFILE]) AM_CONDITIONAL(PROFILE, [test $USE_PROFILE = yes]) PROF=$USE_PROFILE AC_SUBST(PROF)dnl ] ) dnl Enable warnings from compiler. AC_DEFUN([AC_WARNINGS], [AC_MSG_CHECKING([whether to enable warnings when compiling the library]) AC_ARG_ENABLE(warnings, [AC_HELP_STRING([--enable-warnings=yes|full|no], [enable warnings when compiling the library. If nothing or yes is given, more aggressive compiler warnings are passed to the compiler. 
If full is given, we become paranoïd about warnings and treat them as errors.])], USE_WARNINGS=$enableval, USE_WARNINGS=no) AC_MSG_RESULT([$USE_WARNINGS]) dnl AM_CONDITIONAL(WARNINGS, [test $USE_WARNINGS = yes]) WARN=$USE_WARNINGS AC_SUBST(WARN)dnl ]dnl )dnl CCNAM="" AC_DEFUN([AC_COMPILER_NAME], [ AC_MSG_CHECKING(for family name of compiler) dnl CHECKING for various compilers dnl ICC ? AC_TRY_RUN( [ #ifdef __INTEL_COMPILER int main() { return 0 ; } #else pas intel #endif], [ AC_MSG_RESULT(icc) CCNAM=icc AC_SUBST(CCNAM) ]) dnl PATHSCALE > 4 ? AS_IF([ test -z "${CCNAM}"], [ AC_TRY_RUN( [ #ifdef __PATHSCALE__ int main() { return !(__PATHCC__ >= 4) ; } #else pas ekopath non plus. #endif], [ AC_MSG_RESULT(eko) CCNAM=eko AC_SUBST(CCNAM) ]) ]) dnl CLANG > 3.1 ? AS_IF([ test -z "${CCNAM}"], [ AC_TRY_RUN( [ #ifdef __clang__ int main() { return !(__clang_major__ >=3 && __clang_minor__ >=1) ; } #else pas clang non plus. #endif], [ AC_MSG_RESULT(clang31) CCNAM=clang31 AC_SUBST(CCNAM) ]) ]) dnl CLANG > 3 ? AS_IF([ test -z "${CCNAM}"], [ AC_TRY_RUN( [ #ifdef __clang__ int main() { return !(__clang_major__ >=3) ; } #else pas clang non plus. #endif], [ AC_MSG_RESULT(clang31) CCNAM=clang AC_SUBST(CCNAM) ]) ]) dnl GCC >= 4.8 ? AS_IF([ test -z "${CCNAM}"], [ AC_TRY_RUN( [ #ifdef __GNUC__ int main() { return !(__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7 )) ; } #else pas gcc non plus ??? #endif], [ CCNOM=gcc AS_IF([ test -n "${CC}" ], [CCNOM="`$CC --version 2>&1| awk 'NR<2{print $1}'`"]) CCNAM=gcc48 AC_SUBST(CCNAM) AC_MSG_RESULT($CCNOM) ]) ]) dnl GCC > 4.2 ? AS_IF([ test -z "${CCNAM}"], [ AC_TRY_RUN( [ #ifdef __GNUC__ int main() { return !(__GNUC__ >= 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) ; } #else pas gcc non plus ??? #endif], [ CCNOM=gcc AS_IF([ test -n "${CC}" ], [CCNOM="`$CC --version 2>&1| awk 'NR<2{print $1}'`"]) CCNAM=gcc AC_SUBST(CCNAM) AC_MSG_RESULT($CCNOM) ]) ]) dnl autre ? 
AS_IF([ test -z "${CCNAM}"], [ AC_MSG_RESULT(unknown) CCNAM=unknown AC_SUBST(CCNAM) echo echo " *** unknow compiler. please file a bug " echo ]) ]) fflas-ffpack-2.2.2/macros/fflas-ffpack-blas.m4000066400000000000000000000114511274716147400210470ustar00rootroot00000000000000dnl Check for BLAS dnl Copyright 2014 Brice Boyer (briceboyer) dnl This file is part of FFLAS-FFPACK dnl dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ dnl Tests BLAS for and define CBLAS_FLAG and CBLAS_LIBS dnl Defines HAVE_LAPACK, HAVE_CLAPACK, HAVE_BLAS, HAVE_CBLAS if available AC_DEFUN([FF_CHECK_BLAS_CFLAGS], [ AC_ARG_WITH(blas-cflags, [AC_HELP_STRING([--with-blas-cflags=], [ CFLAGS for BLAS/LAPACK (i.e. -I/path/to/toto-blas) ]) ]) CBLAS_FLAG="$with_blas_cflags -D__FFLASFFPACK_HAVE_CBLAS" AC_SUBST(CBLAS_FLAG) dnl echo $CBLAS_FLAG; ] ) dnl AC_DEFUN([FF_CHECK_BLAS_LIBS], [ AC_ARG_WITH(blas-libs, [AC_HELP_STRING([--with-blas-libs=], [ LIBS for BLAS/LAPACK (i.e. 
-L/path/to/toto-blas -ltoto-blas) ]) ]) CBLAS_LIBS="$with_blas_libs" AC_SUBST(CBLAS_LIBS) dnl echo $CBLAS_LIBS; ] ) dnl AC_DEFUN([FF_CHECK_USER_BLAS], [ BACKUP_CXXFLAGS=${CXXFLAGS} BACKUP_LIBS=${LIBS} saved_LD_RUN_PATH="$LD_RUN_PATH" blas_lib_path=`echo $CBLAS_LIBS | $EGREP '\-L' | $SED -e 's/-L//;s/ .*//'` LD_RUN_PATH="${LD_RUN_PATH:+$LD_RUN_PATH$PATH_SEPARATOR}$blas_lib_path" export LD_RUN_PATH CODE_CBLAS=`cat macros/CodeChunk/cblas.C` AC_MSG_CHECKING(for USER BLAS) CXXFLAGS="${BACKUP_CXXFLAGS} ${CBLAS_FLAG} -I. -I.. -I`pwd` -I`pwd`/fflas-ffpack ${GIVARO_CFLAGS}" LIBS="${BACKUP_LIBS} ${CBLAS_LIBS}" AC_TRY_LINK( [ #define __FFLASFFPACK_CONFIGURATION #include "fflas-ffpack/config-blas.h"], [double a;], [ AC_TRY_RUN( [ ${CODE_CBLAS} ],[ blas_found="yes" ],[ blas_problem="$problem" ],[ blas_found="yes" blas_cross="yes" ]) ], [ blas_found="no" ]) AS_IF([ test "x$blas_found" = "xyes" ], [ BLAS_VENDOR="USER" AC_SUBST(BLAS_VENDOR) dnl AC_SUBST(CBLAS_FLAG) dnl AC_SUBST(BLAS_PATH) AC_DEFINE(HAVE_BLAS,1,[Define if BLAS is installed]) AC_DEFINE(HAVE_CBLAS,1,[Define if C interface to BLAS is installed]) BLAS_FOUND=true AC_SUBST(BLAS_FOUND) dnl AC_DEFINE(BLAS_AVAILABLE,,[Define if BLAS routines are available]) #echo ${CBLAS_FLAG} #echo ${CBLAS_LIBS} HAVE_BLAS=yes AS_IF([test "x$blas_cross" != "xyes"], [ AC_MSG_RESULT(found (cblas)) ] , [AC_MSG_RESULT(unknown) echo "WARNING: You appear to be cross compiling, so there is no way to determine" echo "whether your BLAS are good. 
I am assuming it is."]) ], [ AC_MSG_RESULT(problem) ] ) AM_CONDITIONAL(FFLASFFPACK_HAVE_BLAS, test "x$HAVE_BLAS" = "xyes") CXXFLAGS=${BACKUP_CXXFLAGS} LIBS=${BACKUP_LIBS} LD_RUN_PATH="$saved_LD_RUN_PATH" export LD_RUN_PATH unset saved_LD_RUN_PATH dnl unset LD_LIBRARY_PATH ] ) dnl AC_DEFUN([FF_CHECK_USER_LAPACK], [ BACKUP_CXXFLAGS=${CXXFLAGS} BACKUP_LIBS=${LIBS} CODE_CLAPACK=`cat macros/CodeChunk/clapack.C` CODE_LAPACK=`cat macros/CodeChunk/lapack.C` AC_MSG_CHECKING(for USER LAPACK) CXXFLAGS="${BACKUP_CXXFLAGS} ${CBLAS_FLAG} -I. -I.. -I`pwd` -I`pwd`/fflas-ffpack ${GIVARO_CFLAGS}" LIBS="${BACKUP_LIBS} ${CBLAS_LIBS}" AC_TRY_RUN( [ ${CODE_CLAPACK} ], [ dgetrf_found="yes" ], [ dgetrf_problem="problem" ], [ dgetrf_found="" ] ) AS_IF([ test "${dgetrf_found}" = "yes"], [ AC_MSG_RESULT( yes (clapack)) AC_DEFINE(HAVE_LAPACK,1,[Define if LAPACK is installed]) AC_DEFINE(HAVE_CLAPACK,1,[Define if C interface to LAPACK is available]) HAVE_LAPACK=yes ], [ AC_TRY_RUN( [ ${CODE_LAPACK} ], [ dgetrf_found="yes"], [ dgetrf_problem="$problem"], [ dgetrf_found="" ] ) AS_IF([ test "x${dgetrf_found}" = "xyes"], [ AC_SUBST(LAPACK_LIBS) AC_MSG_RESULT( yes (lapack)) AC_DEFINE(HAVE_LAPACK,1,[Define if LAPACK is installed]) HAVE_LAPACK=yes ], dnl clapack not found. looking for lapack [ AC_MSG_RESULT( no ) ] ) ] ) dnl AM_CONDITIONAL(FFLASFFPACK_HAVE_LAPACK, test "x$HAVE_LAPACK" = "xyes") CXXFLAGS=${BACKUP_CXXFLAGS} LIBS=${BACKUP_LIBS} dnl unset LD_LIBRARY_PATH ] ) fflas-ffpack-2.2.2/macros/fflas-ffpack-doc.m4000066400000000000000000000042411274716147400206720ustar00rootroot00000000000000dnl Copyright(c)'2011 FFLAS-FFPACK dnl Written by Brice Boyer (briceboyer) dnl dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. 
dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ AC_DEFUN([FF_DOC], [ AC_MSG_CHECKING(whether to build documentation) AC_ARG_WITH(docdir, [AC_HELP_STRING([--with-docdir=], [Where the FFLAS-FFPACK documentation should be installed])], [ FFLASFFPACK_DOC_PATH="$withval" ], [ eval FFLASFFPACK_DOC_PATH="${prefix}/docs" ]) AC_SUBST(FFLASFFPACK_DOC_PATH) AC_ARG_WITH(doxygen, [AC_HELP_STRING([--with-doxygen=], [Give the path to Doxygen. Note: --enable-doc needed])], [ DOXYGEN_PATH="$PATH $withval" ], [ DOXYGEN_PATH="$PATH" ]) AC_ARG_ENABLE(doc,[AC_HELP_STRING([--enable-doc], [Enable building documentation])], [ AC_MSG_RESULT(yes) AC_MSG_CHECKING(whether doxygen works) export PATH=$DOXYGEN_PATH (doxygen --version) < /dev/null > /dev/null 2>&1 || { AC_MSG_RESULT(no) echo echo "You must have doxygen installed to create documentation for" echo "FFLAS-FFPACK. This error only happens if you use --enable-doc." 
echo "Download the appropriate package for your distribution, or get" echo "the source tarball from http://www.stack.nl/~dimitri/doxygen/" exit -1 } AC_MSG_RESULT(yes) AM_CONDITIONAL(FFLASFFPACK_BUILD_DOC, true) ], [ AC_MSG_RESULT(no) AM_CONDITIONAL(FFLASFFPACK_BUILD_DOC, false) ]) ]) fflas-ffpack-2.2.2/macros/fflas-ffpack-misc.m4000066400000000000000000000054101274716147400210570ustar00rootroot00000000000000dnl fflas-ffpack miscellaneous functonnnalities dnl Copyright (c) fflas-ffpack dnl This file comes from LinBox' linbox-misc.m4 dnl dnl Copyright(c)'2011 FFLAS-FFPACK dnl Written by Brice Boyer (briceboyer) dnl dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ AC_DEFUN([FF_MISC], [ AC_ARG_WITH(default, [AC_HELP_STRING([--with-default=], [Add to the default path for external package checking. Set as default with /usr and /usr/local. 
])], [if test "$withval" = yes ; then echo "Default path = /usr /usr/local" DEFAULT_CHECKING_PATH="/usr /usr/local" else echo "Default path = $withval /usr /usr/local" DEFAULT_CHECKING_PATH="$withval /usr /usr/local" fi ], [ echo "Default path = /usr /usr/local" DEFAULT_CHECKING_PATH="/usr /usr/local" ]) AC_ARG_WITH(all, [AC_HELP_STRING([--with-all= |yes|no], [Use all external packages. If the argument is no, you not sure that all libraries are reachable with the default path. If the argument is yes or , that means that all libraries are reachable with the default path. Otherwise add to default path and enable all external packages. ])], [if test "$withval" = yes ; then check_all="yes" echo "Checking all external packages in ${DEFAULT_CHECKING_PATH}" elif test "$withval" != no ; then check_all="yes" DEFAULT_CHECKING_PATH="$withval ${DEFAULT_CHECKING_PATH}" echo "Checking all external packages in ${DEFAULT_CHECKING_PATH}" fi ], []) if test -n "$check_all"; then GMP_HOME_PATH="${DEFAULT_CHECKING_PATH}" GIVARO_HOME_PATH="${DEFAULT_CHECKING_PATH}" # NTL_HOME_PATH="${DEFAULT_CHECKING_PATH}" # LIDIA_HOME_PATH="${DEFAULT_CHECKING_PATH}" # SACLIB_HOME_PATH="${DEFAULT_CHECKING_PATH}" # MAPLE_HOME_PATH="${DEFAULT_CHECKING_PATH} unknown" # EXPAT_HOME_PATH="${DEFAULT_CHECKING_PATH}" BLAS_HOME_PATH="${DEFAULT_CHECKING_PATH}" fi ]) fflas-ffpack-2.2.2/macros/fflas-ffpack-opt.m4000066400000000000000000000141271274716147400207330ustar00rootroot00000000000000dnl Copyright (c) 2012 FFLAS-FFPACK dnl Written by Clément Pernet, Brice Boyer. dnl This file was taken from LinBox linbox-opt.m4 dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. 
dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ AC_DEFUN([FF_OPT], [ AC_MSG_CHECKING([whether to use run time optimization]) AC_ARG_ENABLE(optimization, [AC_HELP_STRING([--disable-optimization], [ Disable run time optimization in FflasFpack code])]) dnl creating the optimise file unconditionally echo "#ifndef __FFLASFFPACK_optimise_H" > fflas-ffpack/fflas-ffpack-optimise.h echo "#define __FFLASFFPACK_optimise_H" >> fflas-ffpack/fflas-ffpack-optimise.h echo "" >> fflas-ffpack/fflas-ffpack-optimise.h dnl The optimise.h file has to be correcly written, so we close the #if ! echo "#endif // optimise.h" >> fflas-ffpack/fflas-ffpack-optimise.h AS_IF([test "x$enable_optimization" == "xyes"], [ AC_MSG_RESULT(yes) BACKUP_CXXFLAGS=${CXXFLAGS} BACKUP_LIBS=${LIBS} echo " *** OPTIMIZATION *** " AC_MSG_CHECKING([best threshold for Strassen-Winograd matrix multiplication]) AC_MSG_RESULT([see below]) CXXFLAGS_ALL="-I. -I.. 
-I`pwd` -I`pwd`/fflas-ffpack ${BACKUP_CXXFLAGS} ${AVXFLAGS} ${DEFAULT_CFLAGS} ${GIVARO_CFLAGS} ${CBLAS_FLAG} ${OMPFLAGS}" LIBS="${BACKUP_LIBS} ${CBLAS_LIBS} ${GIVARO_LIBS}" WINO=`cat optimiser/winograd.C` ADDFLAGS="-DOPTIMISATION_MODE" saved_LD_RUN_PATH="$LD_RUN_PATH" LD_RUN_PATH="${LD_RUN_PATH:+$LD_RUN_PATH$PATH_SEPARATOR}$givaro_lib_path" export LD_RUN_PATH dnl for Wino threshold for double echo " == Wino/BLAS threshold for Givaro::Modular == " CXXFLAGS="${CXXFLAGS_ALL} -DFLTTYPE=Givaro::Modular ${ADDFLAGS}" AC_RUN_IFELSE([AC_LANG_SOURCE([${WINO}])],[ dnl remove last line dnl sed -i '$d' fflas-ffpack/fflas-ffpack-optimise.h ; dnl -i does not work on BSD sed sed '$d' fflas-ffpack/fflas-ffpack-optimise.h > fflas-ffpack/fflas-ffpack-optimise.back.h ; mv fflas-ffpack/fflas-ffpack-optimise.back.h fflas-ffpack/fflas-ffpack-optimise.h ; dnl append new definition cat WinoThreshold >> fflas-ffpack/fflas-ffpack-optimise.h ; dnl close the file echo "#endif // optimise.h" >> fflas-ffpack/fflas-ffpack-optimise.h dnl echo done : `cat WinoThreshold` WINOT=`cat WinoThreshold | awk 'NR==2' | awk '{print $ 3}'` dnl cleaning service ! 
rm WinoThreshold ; AC_MSG_RESULT(done (${WINOT})) ],[ AC_MSG_RESULT(problem) break ],[ AC_MSG_RESULT(cross compilation) break ]) dnl for WinoThreshold for float echo " == Wino/BLAS threshold for Givaro::Modular == " CXXFLAGS="${CXXFLAGS_ALL} -DFLTTYPE=Givaro::Modular ${ADDFLAGS}" AC_RUN_IFELSE([AC_LANG_SOURCE([${WINO}])],[ dnl remove last line dnl sed -i '$ d' fflas-ffpack/fflas-ffpack-optimise.h ; sed '$d' fflas-ffpack/fflas-ffpack-optimise.h > fflas-ffpack/fflas-ffpack-optimise.back.h ; mv fflas-ffpack/fflas-ffpack-optimise.back.h fflas-ffpack/fflas-ffpack-optimise.h ; dnl append new definition cat WinoThreshold >> fflas-ffpack/fflas-ffpack-optimise.h ; dnl close the file echo "#endif // optimise.h" >> fflas-ffpack/fflas-ffpack-optimise.h dnl echo done : `cat WinoThreshold` WINOT=`cat WinoThreshold | awk 'NR==2' | awk '{print $ 3}'` dnl cleaning service ! rm WinoThreshold ; AC_MSG_RESULT(done (${WINOT})) ],[ AC_MSG_RESULT(problem) break ],[ AC_MSG_RESULT(cross compilation) break ]) dnl for Wino threshold for double echo " == Wino/BLAS threshold for Givaro::ModularBalanced == " CXXFLAGS="${CXXFLAGS_ALL} -DFLTTYPE=Givaro::ModularBalanced ${ADDFLAGS}" AC_RUN_IFELSE([AC_LANG_SOURCE([${WINO}])],[ dnl remove last line dnl sed -i '$d' fflas-ffpack/fflas-ffpack-optimise.h ; dnl -i does not work on BSD sed sed '$d' fflas-ffpack/fflas-ffpack-optimise.h > fflas-ffpack/fflas-ffpack-optimise.back.h ; mv fflas-ffpack/fflas-ffpack-optimise.back.h fflas-ffpack/fflas-ffpack-optimise.h ; dnl append new definition cat WinoThreshold >> fflas-ffpack/fflas-ffpack-optimise.h ; dnl close the file echo "#endif // optimise.h" >> fflas-ffpack/fflas-ffpack-optimise.h dnl cleaning service ! 
WINOT=`cat WinoThreshold | awk 'NR==2' | awk '{print $ 3}'` dnl echo done : `cat WinoThreshold` rm WinoThreshold ; AC_MSG_RESULT(done (${WINOT})) ],[ AC_MSG_RESULT(problem) break ],[ AC_MSG_RESULT(cross compilation) break ]) dnl for WinoThreshold for float echo " == Wino/BLAS threshold for Givaro::ModularBalanced == " CXXFLAGS="${CXXFLAGS_ALL} -DFLTTYPE=Givaro::ModularBalanced ${ADDFLAGS}" AC_RUN_IFELSE([AC_LANG_SOURCE([${WINO}])],[ dnl remove last line dnl sed -i '$ d' fflas-ffpack/fflas-ffpack-optimise.h ; sed '$d' fflas-ffpack/fflas-ffpack-optimise.h > fflas-ffpack/fflas-ffpack-optimise.back.h ; mv fflas-ffpack/fflas-ffpack-optimise.back.h fflas-ffpack/fflas-ffpack-optimise.h ; dnl append new definition cat WinoThreshold >> fflas-ffpack/fflas-ffpack-optimise.h ; dnl close the file echo "#endif // optimise.h" >> fflas-ffpack/fflas-ffpack-optimise.h dnl echo done : `cat WinoThreshold` WINOT=`cat WinoThreshold | awk 'NR==2' | awk '{print $ 3}'` dnl cleaning service ! rm WinoThreshold ; AC_MSG_RESULT(done (${WINOT})) ],[ AC_MSG_RESULT(problem) break ],[ AC_MSG_RESULT(cross compilation) break ]) LD_RUN_PATH="$saved_LD_RUN_PATH" unset givaro_lib_path ], [AC_MSG_RESULT(no optimization)] ) ]) fflas-ffpack-2.2.2/macros/fflas-ffpack-precompile.m4000066400000000000000000000031031274716147400222600ustar00rootroot00000000000000dnl Copyright (c) 2012 FFLAS-FFPACK dnl Written by Clément Pernet, Brice Boyer. dnl This file was taken from LinBox linbox-opt.m4 dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. 
dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with this library; if not, write to the Free Software dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA dnl ========LICENCE======== dnl/ AC_DEFUN([FF_PRECOMPILE], [ AC_MSG_CHECKING([whether to compile the standard specializations]) AC_ARG_ENABLE(precompilation, [AC_HELP_STRING([--enable-precompilation], [ Enable precompilation of the standard specializations])]) AM_CONDITIONAL(FFLASFFPACK_PRECOMPILED, test "x$enable_precompilation" == "xyes") AS_IF([test "x$enable_precompilation" == "xyes"], [ AC_MSG_RESULT(yes) PRECOMPILE_FLAGS="-DFFLAS_COMPILED -DFFPACK_COMPILED" PRECOMPILE_LIBS="-L${libdir} -lfflas -lffpack" AC_SUBST(PRECOMPILE_FLAGS) AC_SUBST(PRECOMPILE_LIBS) ], [AC_MSG_RESULT(no)] ) ]) fflas-ffpack-2.2.2/macros/givaro-check.m4000066400000000000000000000102671274716147400201530ustar00rootroot00000000000000dnl Check for GIVARO dnl Copyright (c) the Givaro group dnl This file is part of FFLAS-FFPACK dnl ========LICENCE======== dnl This file is part of the library FFLAS-FFPACK. dnl dnl FFLAS-FFPACK is free software: you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public dnl License as published by the Free Software Foundation; either dnl version 2.1 of the License, or (at your option) any later version. dnl dnl This library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. 
dnl adapted from LinBox by BB.
dnl
dnl FF_CHECK_GIVARO ([MINIMUM-VERSION [, ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]]])
dnl
dnl Tests for Givaro and defines GIVARO_CFLAGS and GIVARO_LIBS.
dnl Defines HAVE_GIVARO and the automake conditional FFLASFFPACK_HAVE_GIVARO.
dnl
dnl Every directory listed in GIVARO_HOME_PATH (the --with-givaro argument,
dnl if one was given, followed by DEFAULT_CHECKING_PATH) is probed in order:
dnl   1. include/givaro/givconfig.h must be readable;
dnl   2. compiler and linker flags are taken from bin/givaro-config;
dnl   3. a program using Givaro::Integer is linked;
dnl   4. a program comparing GIVARO_VERSION with the window
dnl      version_min <= GIVARO_VERSION < version_max is run.
dnl The version window is hard-coded below; the MINIMUM-VERSION argument is
dnl accepted for compatibility but is not consulted.
dnl ACTION-IF-FOUND runs on success, ACTION-IF-NOT-FOUND when the version
dnl check failed or no installation was located.
dnl CXXFLAGS, LIBS and LD_RUN_PATH are restored before the macro returns.

AC_DEFUN([FF_CHECK_GIVARO],
[

AC_ARG_WITH(givaro,
[AC_HELP_STRING([--with-givaro=<path>|yes], [Use Givaro library. This library is mandatory for
	LinBox compilation. If argument is yes or <empty> that means the library is reachable
	with the standard search path (/usr or /usr/local). Otherwise you give the <path> to
	the directory which contains the library.
])],
	[if test "$withval" = yes ; then
		GIVARO_HOME_PATH="${DEFAULT_CHECKING_PATH}"
	elif test "$withval" != no ; then
		GIVARO_HOME_PATH="$withval ${DEFAULT_CHECKING_PATH}"
	fi],
	[GIVARO_HOME_PATH="${DEFAULT_CHECKING_PATH}"])

dnl -------------- dnl
dnl GIVARO VERSION dnl
dnl -------------- dnl

dnl As we need Integer and Modular, should be updated on each interface changes
version_min=40001
version_max=40003

dnl Check for existence

BACKUP_CXXFLAGS=${CXXFLAGS}
BACKUP_LIBS=${LIBS}
saved_LD_RUN_PATH="$LD_RUN_PATH"

AC_MSG_CHECKING(for GIVARO >= $version_min and < $version_max)

for GIVARO_HOME in ${GIVARO_HOME_PATH}
do
	if test -r "$GIVARO_HOME/include/givaro/givconfig.h"; then

		# Givaro Libs + CFlags contain GMP info - AB 2014-12-12
		GIVARO_LIBS=`$GIVARO_HOME/bin/givaro-config --libs`
		GIVARO_CFLAGS=`$GIVARO_HOME/bin/givaro-config --cflags`
		givaro_lib_path=`$GIVARO_HOME/bin/givaro-config --prefix`/lib

		CXXFLAGS="${BACKUP_CXXFLAGS} ${GIVARO_CFLAGS}"
		LIBS="${BACKUP_LIBS} ${GIVARO_LIBS}"
		# Let the run test below locate the shared library without
		# requiring LD_LIBRARY_PATH to be set by the user.
		LD_RUN_PATH="${LD_RUN_PATH:+$LD_RUN_PATH$PATH_SEPARATOR}$givaro_lib_path"
		export LD_RUN_PATH

		AC_TRY_LINK(
		[#include <givaro/givinteger.h>],
		[Givaro::Integer a;],
		[
		AC_TRY_RUN(
		[#include <givaro/givconfig.h>
		 int main () { if (GIVARO_VERSION >= $version_min && GIVARO_VERSION < $version_max) return 0; else return -1; /* old version of Givaro are defined as hexa 0x03yyzz*/ }
		],[
		givaro_found="yes"
		break
		],[
		# Version outside the accepted window: remember every rejected
		# prefix. The previous code read the never-set variable problem,
		# which dropped all but the last entry.
		givaro_problem="$givaro_problem $GIVARO_HOME"
		unset GIVARO_CFLAGS
		unset GIVARO_LIBS
		],[
		givaro_found="yes"
		givaro_cross="yes"
		break
		])
		],
		[
		# NOTE(review): a link failure is still recorded as found and the
		# flags are kept. This looks deliberate, since the unsets below
		# are commented out, but should be confirmed.
		givaro_found="yes"
		# Accumulate into the same variable; the previous code read the
		# never-set variable checked.
		givaro_checked="$givaro_checked $GIVARO_HOME"
#unset GIVARO_CFLAGS
#unset GIVARO_LIBS
		break
		])
	else
		givaro_found="no"
	fi
done

if test "x$givaro_found" = "xyes" ; then
	AC_SUBST(GIVARO_CFLAGS)
	AC_SUBST(GIVARO_LIBS)
	dnl echo $GIVARO_CFLAGS $GIVARO_LIBS
	AC_DEFINE(HAVE_GIVARO,1,[Define if GIVARO is installed])
	HAVE_GIVARO=yes
	if test "x$givaro_cross" != "xyes"; then
		AC_MSG_RESULT(found)
	else
		AC_MSG_RESULT(unknown)
		echo "WARNING: You appear to be cross compiling, so there is no way to determine"
		echo "whether your GIVARO version is new enough. I am assuming it is."
	fi
	ifelse([$2], , :, [$2])
elif test -n "$givaro_problem"; then
	AC_MSG_RESULT(problem)
	echo "Sorry, your GIVARO version is too old. Disabling."
	ifelse([$3], , :, [$3])
elif test "x$givaro_found" = "xno" ; then
	AC_MSG_RESULT(not found)
	ifelse([$3], , :, [$3])
fi

AM_CONDITIONAL(FFLASFFPACK_HAVE_GIVARO, test "x$HAVE_GIVARO" = "xyes")

# Restore the caller's build environment.
CXXFLAGS=${BACKUP_CXXFLAGS}
LIBS=${BACKUP_LIBS}
LD_RUN_PATH="$saved_LD_RUN_PATH"
export LD_RUN_PATH
unset saved_LD_RUN_PATH
#unset LD_LIBRARY_PATH

])
dnl FF_CHECK_MKL
dnl
dnl Reports whether the BLAS link line held in CBLAS_LIBS mentions MKL
dnl (case-insensitive match). When it does, HAVE_MKL is defined and USE_MKL
dnl is set to "true"; otherwise USE_MKL stays "false". The matching text is
dnl left in MKL_USED.

AC_DEFUN([FF_CHECK_MKL],
[
	AC_MSG_CHECKING(for use of MKL)
	AC_SUBST(USE_MKL)

	dnl echo $CBLAS_LIBS
	MKL_USED=`echo $CBLAS_LIBS | grep -i MKL`

	AS_IF([test -z "$MKL_USED"],
	[
		USE_MKL="false"
		AC_MSG_RESULT( no )
	],
	[
		AC_DEFINE(HAVE_MKL,1,[Define if we use MKL for blas/lapack])
		USE_MKL="true"
		AC_MSG_RESULT( yes )
	])
])
dnl FF_CHECK_OMP
dnl
dnl Turns on OpenMP if available.
dnl
dnl Unless --disable-openmp is given, a small program calling
dnl omp_get_num_threads is compiled with -fopenmp and run. On success
dnl USE_OPENMP is defined, OMPFLAGS is substituted as "-fopenmp" and
dnl HAVE_OMP is set to yes; on failure, or when cross compiling, OMPFLAGS
dnl is substituted empty. The automake conditional FFLASFFPACK_HAVE_OMP
dnl reflects HAVE_OMP. CXXFLAGS is restored afterwards.

AC_DEFUN([FF_CHECK_OMP],
[
	AC_ARG_ENABLE(openmp,
		[AC_HELP_STRING([--enable-openmp],
			[ Use OpenMP ])
		],
		[ avec_omp=$enable_openmp],
		[ avec_omp=yes ]
	)
	AC_MSG_CHECKING(for OpenMP)
	AS_IF([ test "x$avec_omp" != "xno" ],
	[
		BACKUP_CXXFLAGS=${CXXFLAGS}
		OMPFLAGS="-fopenmp"
		CXXFLAGS="${BACKUP_CXXFLAGS} ${OMPFLAGS}"
		dnl The header name had been lost from the include line, leaving
		dnl omp_get_num_threads undeclared so the probe could never build.
		AC_TRY_RUN([
#include <omp.h>
			int main() {
			int p = omp_get_num_threads();
			return 0;
			}
		],
		[ omp_found="yes" ],
		[ omp_found="no" ],
		[
			echo "cross compiling...disabling"
			omp_found="no"
		])
		AS_IF([ test "x$omp_found" = "xyes" ],
		[
			AC_DEFINE(USE_OPENMP,1,[Define if OMP is available])
			AC_SUBST(OMPFLAGS)
			AC_MSG_RESULT(yes)
			HAVE_OMP=yes
		],
		[
			OMPFLAGS=
			AC_SUBST(OMPFLAGS)
			AC_MSG_RESULT(no)
		])
		CXXFLAGS=${BACKUP_CXXFLAGS}
	],
	[
		AC_MSG_RESULT(no)
	])
	AM_CONDITIONAL(FFLASFFPACK_HAVE_OMP, test "x$HAVE_OMP" = "xyes")
])
dnl FF_CHECK_SIMD
dnl
dnl Turns on SSE4.1, AVX and AVX2 extensions if available.
dnl
dnl Unless --disable-simd is given, and unless the compiler is tagged gcc48
dnl on an x86_64 or i686 target, the macro runs its own probes:
dnl   - SSE4.1: macros/CodeChunk/sse.C built with -msse4.1;
dnl   - AVX:    macros/CodeChunk/avx.C built with no flag, then with -mavx;
dnl   - AVX2:   the same chunk with __try_avx2 defined, built with no flag,
dnl             then with -mfma -mavx2.
dnl Each successful probe defines the matching HAVE_*_INSTRUCTIONS symbol.
dnl SSEFLAGS and AVXFLAGS receive the flags that worked, and CUSTOM_SIMD is
dnl set to yes whenever these custom probes ran or SIMD was disabled.
dnl CXXFLAGS is restored afterwards.

AC_DEFUN([FF_CHECK_SIMD],
[
	AC_ARG_ENABLE(simd,[AC_HELP_STRING([--disable-simd], [ Disable vectorized instructions: SSE4.1, AVX, AVX2])])
	AS_IF([ test "x$enable_simd" != "xno" ],
	[
		AS_ECHO("SIMD enabled")
		arch=`echo $target | cut -d"-" -f1`
		# if we are on a x86 (32 or 64 bits) with gcc>=4.8 then run the AX_CHECK_X86_FEATURES macro
		AS_IF([test "x$arch" = "xx86_64" -o "x$arch" = "xi686"],
			[archx86="yes"],
			[archx86="no"]
		)
		AS_IF([ test "x$CCNAM" != "xgcc48" -o "x$archx86" = "xno" ],
		[
			CUSTOM_SIMD="yes"
			echo "Compiling with $CCNAM for a $arch target: running custom checks for SSE4.1 and AVX1,2"
			AC_MSG_CHECKING(for SSE 4.1)
			BACKUP_CXXFLAGS=${CXXFLAGS}
			SSEFLAGS="-msse4.1"
			CXXFLAGS="${BACKUP_CXXFLAGS} ${SSEFLAGS}"
			# Read the code chunk from the source tree so that configure
			# also works when it is run from a separate build directory.
			CODE_SSE=`cat "$srcdir/macros/CodeChunk/sse.C"`
			AC_TRY_RUN([
				${CODE_SSE}
			],
			[ sse_found="yes" ],
			[ sse_found="no" ],
			[
				echo "cross compiling...disabling"
				sse_found="no"
			])
			AS_IF([ test "x$sse_found" = "xyes" ],
			[
				AC_DEFINE(HAVE_SSE4_1_INSTRUCTIONS,1,[Define if SSE is available])
				AC_SUBST(SSEFLAGS)
				AC_MSG_RESULT(yes)
			],
			[
				SSEFLAGS=""
				AC_MSG_RESULT(no)
			])
			CXXFLAGS=${BACKUP_CXXFLAGS}

			dnl Check for AVX
			AC_MSG_CHECKING(for AVX)
			CODE_AVX=`cat "$srcdir/macros/CodeChunk/avx.C"`
			dnl Intel compilers usually do not require option to enable avx
			dnl Thus, we test with no option on
			for switch_avxflags in "" "-mavx"; do
				CXXFLAGS="${BACKUP_CXXFLAGS} -O0 ${switch_avxflags}"
				AC_TRY_RUN([
					${CODE_AVX}
				],
				[
					avx_found="yes"
					AVXFLAGS=${switch_avxflags}
					break
				],
				[ avx_found="no" ],
				[
					echo "cross compiling...disabling"
					avx_found="no"
					break
				])
			done

			dnl Is AVX found?
			AS_IF([ test "x$avx_found" = "xyes" ],
			[
				AC_MSG_RESULT(yes)
				AC_DEFINE(HAVE_AVX_INSTRUCTIONS,1,[Define if AVX is available])

				dnl Check for AVX2
				AC_MSG_CHECKING(for AVX2)
				for switch_avx2flags in "" "-mfma -mavx2"; do
					CXXFLAGS="${BACKUP_CXXFLAGS} -O0 ${switch_avx2flags}"
					AC_TRY_RUN(
					[
						#define __try_avx2
						${CODE_AVX}
					],
					[
						avx2_found="yes"
						AVX2FLAGS="${switch_avx2flags}"
						break
					],
					[ avx2_found="no" ],
					[
						echo "cross compiling...disabling"
						# A shell assignment takes no blanks around the
						# equal sign; the spaced form ran a command named
						# avx2_found and left the variable untouched.
						avx2_found="no"
						break
					])
				done

				dnl Is AVX2 found?
				AS_IF([ test "x$avx2_found" = "xyes" ],
				[
					AC_MSG_RESULT(yes)
					AC_DEFINE(HAVE_AVX2_INSTRUCTIONS,1,[Define if AVX2 is available])
					AVXFLAGS=${AVX2FLAGS}
				],
				[
					AC_MSG_RESULT(no)
				])
			],
			[
				dnl No AVX
				AC_MSG_RESULT(no)
			])
			CXXFLAGS=${BACKUP_CXXFLAGS}
		],
		[ ])
	],[
		AS_ECHO("SIMD disabled")
		CUSTOM_SIMD="yes"
	])
])
dnl FF_CHECK_SSE
dnl
dnl Turns on SSE4.1 extensions if available.
dnl
dnl Unless --disable-sse is given, macros/CodeChunk/sse.C is compiled with
dnl -msse4.1 and run. On success HAVE_SSE4_1_INSTRUCTIONS is defined and
dnl SSEFLAGS is substituted. On failure, when cross compiling, or when SSE
dnl is disabled, enable_avx is forced to "no". CXXFLAGS is restored
dnl afterwards.

AC_DEFUN([FF_CHECK_SSE],
[
	AC_ARG_ENABLE(sse,[AC_HELP_STRING([--disable-sse], [ Disable Intel(r) SSE 4.1])])
	AC_MSG_CHECKING(for SSE 4.1)
	AS_IF([ test "x$enable_sse" != "xno" ],
	[
		BACKUP_CXXFLAGS=${CXXFLAGS}
		dnl SSEFLAGS="-msse2"
		SSEFLAGS="-msse4.1"
		CXXFLAGS="${BACKUP_CXXFLAGS} ${SSEFLAGS}"
		# Read the code chunk from the source tree so that configure also
		# works when it is run from a separate build directory.
		CODE_SSE=`cat "$srcdir/macros/CodeChunk/sse.C"`
		AC_TRY_RUN([
			${CODE_SSE}
		],
		[ sse_found="yes" ],
		[ sse_found="no" ],
		[
			echo "cross compiling...disabling"
			sse_found="no"
		])
		AS_IF([ test "x$sse_found" = "xyes" ],
		[
			AC_DEFINE(HAVE_SSE4_1_INSTRUCTIONS,1,[Define if SSE is available])
			AC_SUBST(SSEFLAGS)
			AC_MSG_RESULT(yes)
		],
		[
			SSEFLAGS=""
			dnl Forcing to disable AVX
			enable_avx="no"
			AC_MSG_RESULT(no)
		])
		CXXFLAGS=${BACKUP_CXXFLAGS}
	],
	[
		dnl --disable-sse
		AC_MSG_RESULT(no [disabled])
		dnl Forcing to disable AVX
		enable_avx="no"
	])
])
See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ EXTRA_DIST= winograd.C fflas-ffpack-2.2.2/optimiser/winograd.C000066400000000000000000000134751274716147400200200ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2012 FFLAS-FFPACK group. * * Extirpé form a m4 macro by Brice Boyer (briceboyer) . * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ //#define LinBoxSrcOnly #define DOUBLE_TO_FLOAT_CROSSOVER 0 #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #ifndef FLTTYPE #define FLTTYPE Givaro::Modular #endif template bool balanced(const Field & ) { return false; } template bool balanced(const Givaro::ModularBalanced&) { return true; } #ifdef __GIVARO_USE_OPENMP typedef Givaro::OMPTimer TTimer; #else typedef Givaro::Timer TTimer; #endif #define MFLOPS (2.0*iter/chrono.realtime()*(double)n/100.0*(double)n/100.0*(double)n/100.0) #define GFLOPS (2.0*iter/chrono.realtime()*(double)n/1000.0*(double)n/1000.0*(double)n/1000.0) #ifdef __FFLASFFPACK_HAVE_CXX11 #include #endif //using namespace LinBox; int main () { using namespace std; typedef FLTTYPE Field ; Field F(17); typedef Field::Element Element ; size_t n=768, nmax=5000, prec=512, nbest=0, count=0; TTimer chrono; bool bound=false; Field::RandIter G(F); Element *A,*B,*C; A = FFLAS::fflas_new(nmax*nmax); B = FFLAS::fflas_new(nmax*nmax); C = FFLAS::fflas_new(nmax*nmax); for (size_t i=0; i ClassicH(F,0, FFLAS::ParSeqHelper::Sequential()); FFLAS::MMHelper WinogradH(F,1, FFLAS::ParSeqHelper::Sequential()); int iter=3; //warm up computation FFLAS::fgemm(F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, n, n, n, F.mOne, A, n, B, n, F.one, C, n, ClassicH); chrono.start(); for (int i=0;i time ){ count++; if (count > 1){ nbest=n; bound=true; prec=prec>>1; n-=prec; } } else{ count=0; if (bound) prec=prec>>1; n+=prec; } } while ((prec > 64 ) && (n < nmax)); std::ofstream out("WinoThreshold"); if (nbest != 0 ) { if (typeid(Element).name() == typeid(double).name()) { if ( balanced(F) ) { out << "#ifndef 
__FFLASFFPACK_WINOTHRESHOLD_BAL" << endl; out << "#define __FFLASFFPACK_WINOTHRESHOLD_BAL" << ' ' << nbest << endl; } else { out << "#ifndef __FFLASFFPACK_WINOTHRESHOLD" << endl; out << "#define __FFLASFFPACK_WINOTHRESHOLD" << ' ' << nbest << endl; } out << "#endif" << endl << endl; } if (typeid(Element).name() == typeid(float).name()) { if ( balanced(F) ) { out << "#ifndef __FFLASFFPACK_WINOTHRESHOLD_BAL_FLT" << endl; out << "#define __FFLASFFPACK_WINOTHRESHOLD_BAL_FLT" << ' ' << nbest << endl; } else { out << "#ifndef __FFLASFFPACK_WINOTHRESHOLD_FLT" << endl; out << "#define __FFLASFFPACK_WINOTHRESHOLD_FLT" << ' ' << nbest << endl; } out << "#endif" << endl << endl; } } out.close(); outlog << "defined __FFLASFFPACK_WINOTHRESHOLD to " << nbest << "" << std::endl; outlog.close(); FFLAS::fflas_delete( A); FFLAS::fflas_delete( C); return 0; } fflas-ffpack-2.2.2/tests/000077500000000000000000000000001274716147400152175ustar00rootroot00000000000000fflas-ffpack-2.2.2/tests/Makefile.am000066400000000000000000000143231274716147400172560ustar00rootroot00000000000000# Copyright (c) 2011 FFLAS-FFPACK # written by Brice Boyer (briceboyer) # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. # # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ SUBDIRS = data check: $(BASE_TESTS) AM_CPPFLAGS=-I$(top_srcdir) -g AM_CXXFLAGS = @TESTS_CFLAGS@ $(OPTFLAGS) $(GIVARO_CFLAGS) $(CBLAS_FLAG) $(CUDA_CFLAGS) $(PARFLAGS) $(PRECOMPILE_FLAGS) AM_CPPFLAGS += -I$(top_srcdir)/fflas-ffpack/ -I$(top_srcdir)/fflas-ffpack/utils/ -I$(top_srcdir)/fflas-ffpack/fflas/ -I$(top_srcdir)/fflas-ffpack/ffpack -I$(top_srcdir)/fflas-ffpack/field #LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARFLAGS) $(PRECOMPILE_LIBS) AM_LDFLAGS=-static $(PARFLAGS) #-L$(prefix)/lib -lfflas -lffpack -lfflas_c -lffpack_c EXTRA_DIST= test-utils.h PERFPUBLISHERFILE=tests-report.xml BASIC_TESTS = \ test-lu \ test-det \ test-echelon \ test-rankprofiles \ test-compressQ \ test-permutations \ test-fadd \ test-finit \ test-fscal \ test-fgemm \ test-pluq-check \ test-fgemm-check \ test-ftrsm-check \ test-invert-check \ test-charpoly-check \ test-fger \ test-ftrsm \ test-multifile \ test-maxdelayeddim \ regression-check if FFLASFFPACK_PRECOMPILED LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARLIBS) \ $(top_builddir)/fflas-ffpack/interfaces/libs/libfflas.la \ $(top_builddir)/fflas-ffpack/interfaces/libs/libffpack.la INTERFACE_TESTS= test-interfaces-c test_interfaces_c_LDADD = \ $(top_builddir)/fflas-ffpack/interfaces/libs/libfflas_c.la \ $(top_builddir)/fflas-ffpack/interfaces/libs/libffpack_c.la \ -lm -lstdc++ else LDADD = $(CBLAS_LIBS) $(GIVARO_LIBS) $(CUDA_LIBS) $(PARLIBS) endif NOT_A_TEST = \ test-lqup2 \ test-charpoly \ benchlqup \ test-fsquare \ test-redcolechelon \ benchfgemm \ test-rank \ test-krylov-elim \ test-rowechelon \ test-fgemv \ test-colechelon \ test-fullranksubmatrix \ test-redrowechelon \ test-ftrtri \ test-redechelon \ test-frobenius \ test-fgesv \ test-invert \ test-nullspace 
INFINITE_TEST= \ testeur_fgemm \ testeur_ftrsm \ testeur_lqup EXTRA_PROGRAMS = \ $(BASIC_TESTS) $(USE_OMP_TESTS) $(INTERFACE_TESTS) CLEANFILES = \ $(NOT_A_TEST) $(EXTRA_PROGRAMS) $(PERFPUBLISHERFILE) TESTS = $(EXTRA_PROGRAMS) test_compressQ_SOURCES = test-compressQ.C test_permutations_SOURCES = test-permutations.C test_lu_SOURCES = test-lu.C #test_lqup2_SOURCES = test-lqup2.C test_det_SOURCES = test-det.C test_pluq_check_SOURCES = test-pluq-check.C test_fgemm_check_SOURCES = test-fgemm-check.C test_ftrsm_check_SOURCES = test-ftrsm-check.C test_invert_check_SOURCES = test-invert-check.C test_charpoly_check_SOURCES = test-charpoly-check.C test_echelon_SOURCES = test-echelon.C test_rankprofiles_SOURCES = test-rankprofiles.C test_fgemm_SOURCES = test-fgemm.C test_fger_SOURCES = test-fger.C test_multifile_SOURCES = test-multifile1.C test-multifile2.C # test_fgemm_SOURCES = test-fgemm.C # test_charpoly_SOURCES = test-charpoly.C # benchfgemm_SOURCES = benchfgemm.C # test_fsquare_SOURCES = test-fsquare.C # test_rank_SOURCES = test-rank.C # benchlqup_SOURCES = benchlqup.C # test_ftrmm_SOURCES = test-ftrmm.C # test_redcolechelon_SOURCES = test-redcolechelon.C # testeur_fgemm_SOURCES = testeur_fgemm.C test_ftrsm_SOURCES = test-ftrsm.C # test_redechelon_SOURCES = test-redechelon.C # testeur_ftrsm_SOURCES = testeur_ftrsm.C # test_ftrtri_SOURCES = test-ftrtri.C # test_redrowechelon_SOURCES = test-redrowechelon.C # testeur_lqup_SOURCES = testeur_lqup.C # test_fullranksubmatrix_SOURCES = test-fullranksubmatrix.C # test_rowechelon_SOURCES = test-rowechelon.C # test_invert_SOURCES = test-invert.C # test_fgemv_SOURCES = test-fgemv.C # test_krylov_elim_SOURCES = test-krylov-elim.C # test_colechelon_SOURCES = test-colechelon.C # test_fgesv_SOURCES = test-fgesv.C # test_frobenius_SOURCES = test-frobenius.C # test_nullspace_SOURCES = test-nullspace.C test_fadd_SOURCES = test-fadd.C test_fscal_SOURCES = test-fscal.C test_finit_SOURCES = test-finit.C test_interfaces_c_SOURCES = 
test-interfaces-c.c test_maxdelayeddim_SOURCES = test-maxdelayeddim.C #test_interfaces_c_CFLAGS= -std=c11 -I/$(prefix)/include $(AM_CPPFLAGS) $(AM_CXXFLAGS) $(PARFLAGS) #test_interfaces_c_LDFLAGS= $(LDFLAGS) $(LDADD) $(AM_LDFLAGS) -L/$(prefix)/lib/ -lfflas_c -lffpack_c -lstdc++ # test_fspmv_SOURCES = test-fspmv.C regression_check_SOURCES = regression-check.C dense_generator: dense_generator.C $(CXX) $(CXXFLAGS) $(AM_CXXFLAGS) dense_generator.C -o dense_generator # dense_generator_SOURCES = dense_generator.C # Perfpublisher script interaction - AB 2014/11/17 perfpublisher: +./perfpublisher.sh "$(PERFPUBLISHERFILE)" "$(EXTRA_PROGRAMS)" "$(CXX)" # for compilation of new tests FFLASFFPACK_BIN=@bindir@ new_examp_comp = $(CXX) $(CXXFLAGS) $(AM_CXXFLAGS) ${INCLUDES} $(AM_CPPFLAGS) $^ -o $@ $(LDFLAGS) $(LDADD) $(LOADLIBES) %:%.C $(new_examp_comp) %:%.cpp $(new_examp_comp) fflas-ffpack-2.2.2/tests/Makefile.template000066400000000000000000000025501274716147400204730ustar00rootroot00000000000000#---------------------------------------------------------- # Parameters to be configured by the user # root for the blas library, for ex. 
/home/foo/ATLAS/lib/Linux_P4SSE2 BLASROOT = # ATLAS BLAS users : uncomment these lines: #CXXFLAGS+=-D__LINBOX_HAVE_CBLAS #LOADLIBES+=-L${BLASROOT} -lcblas -latlas # GotoBlas BLAS users : uncomment this line: #LOADLIBES+=-L${BLASROOT} -lgoto # Other BLAS users, uncomment this line: #LOADLIBES+=-L${BLASROOT} -lcblas # architecture parameter for gcc: #ARCH = -march=pentium3 #ARCH = -march=pentium4 #ARCH = -march=athlon #ARCH = -march=opteron #ARCH = -m64 -mtune=k8 # Givaro/GMP root (only necessary for compiling the regression tests testeur_fgemm, testeur_lqup and testeur_ftrsm) #GIVARO_ROOT= #GMP_ROOT= #INCLUDES+= -I ${GIVARO_ROOT}/include -I ${GMP_ROOT}/include #LOADLIBES+= -L ${GIVARO_ROOT}/lib -lgivaro -L ${GMP_ROOT}/lib -lgmp -lgmpxx #---------------------------------------------------------- OPTFLAGS+=-O3 #OPTFLAGS+=-g OPTFLAGS+= ${ARCH} CXXFLAGS+=${OPTFLAGS} INCLUDES+=-I. -I../include CXX=g++ ${INCLUDES} all: test-fgemm test-invert test-det test-rank test-charpoly test-lqup test-nullspace dense_generator regression: testeur_fgemm testeur_lqup testeur_ftrsm clean: rm -f test-fgemm test-fgemv test-invert test-det test-rank test-charpoly test-lqup dense_generator testeur_fgemm testeur_lqup testeur_ftrsm fflas-ffpack-2.2.2/tests/benchfgemm.C000066400000000000000000000053741274716147400174270ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s //#include "goto-def.h" /* * Copyright (c) FFLAS-FFPACK * Written by Clement Pernet * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/field/modular-positive.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" using namespace std; using namespace FFPACK; int main(int argc, char** argv) { // parameter: p, n, iteration, file1, file2 double p = atof(argv[1]); int n = atoi(argv[2]); size_t w = atoi (argv[3]); size_t iter = atoi(argv[4]); // typedef Givaro::Modular Field; // typedef Givaro::Modular Field; // typedef ModularBalanced Field; typedef ModularBalanced Field; typedef Field::Element Element; Field F((Field::Element)p); Element one,zero; F.init(one, 1.0); F.init(zero,0.0); FFLAS::Timer chrono; double time=0.0; // double time2=0.0; // int singular; Element * A, * B, * C; for (size_t i=0;i(n*n); for (size_t i=0; i<(size_t)n*n; ++i) G.random (*(A+i)); B = FFLAS::fflas_new(n*n); for (size_t i=0; i<(size_t)n*n; ++i) G.random(*(B+i)); C = FFLAS::fflas_new(n*n); chrono.clear(); chrono.start(); FFLAS::MMHelper WH (F,w); FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, n,n,n, one, A, n, B, n, zero, C,n, WH); chrono.stop(); time+=chrono.realtime(); FFLAS::fflas_delete( A); FFLAS::fflas_delete( B); FFLAS::fflas_delete( C); } std::cerr<<"n: "<s,f0,{0,g0,(0,\:0,t0,+0,=s // /* * Copyright (c) FFLAS-FFPACK * Written by Clement Pernet * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" using namespace std; using namespace FFPACK; int main(int argc, char** argv) { // parameter: p, n, iteration, file float p = (float)atof(argv[1]); int n = atoi(argv[2]); size_t iter = atoi(argv[3]); typedef ModularBalanced Field; // typedef ModularBalanced Field; typedef Field::Element Element; Field F(p); FFLAS::Timer chrono; double time=0.0; // int singular; Element *A; for (size_t i=0;i(n*n); Field::RandIter G(F); for (size_t i=0; i< (size_t)n*n; ++i) G.random(*(A+i)); size_t * P = FFLAS::fflas_new(n); size_t * Q = FFLAS::fflas_new(n); chrono.clear(); chrono.start(); FFPACK::LUdivine (F, FFLAS::FflasNonUnit, FFLAS::FflasNoTrans, n, n, A, n, P, Q); chrono.stop(); time+=chrono.realtime(); FFLAS::fflas_delete( P); FFLAS::fflas_delete( Q); FFLAS::fflas_delete( A); } cerr<<"n: "< # # ========LICENCE======== # This file is part of the library FFLAS-FFPACK. 
# # FFLAS-FFPACK is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # ========LICENCE======== #/ SUBDIRS = EXTRA_DIST= mat11.sms fflas-ffpack-2.2.2/tests/data/mat11.sms000066400000000000000000000003771274716147400176060ustar00rootroot0000000000000011 11 M 1 3 2 1 4 3 1 10 1 3 1 2 3 3 888 3 4 1 3 5 -1 3 11 6 4 1 3 4 3 1 4 4 4 4 7 12 4 10 -13 5 3 -1 6 6 1 6 8 1 6 10 1 7 4 12 8 6 1 8 8 500 8 9 400 8 10 300 8 11 200 9 8 400 10 1 1 10 4 -13 10 6 1 10 8 300 10 10 10 10 11 1 11 10 1 11 8 200 11 3 6 0 0 0 fflas-ffpack-2.2.2/tests/dense_generator.C000066400000000000000000000033601274716147400204710ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s // /* * Copyright (c) FFLAS-FFPACK * Written by Clement Pernet * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
/**
 * Draws a pseudo-random value with lrand48() and stores it in @p r.
 *
 * @param r     destination, overwritten with the drawn value converted to T.
 * @param size  range selector:
 *              - size > 0 : the value lies in [0, size-1];
 *              - size < 0 : the value lies in [size, -size-1];
 *              - size == 0: the value is 0 (the previous code evaluated
 *                lrand48() % 0, which is undefined behaviour).
 * @return a reference to @p r.
 */
template<class T>
T& myrand (T& r, long size)
{
	// An empty range has a single sensible outcome; never divide by zero.
	if (size == 0)
		return r = T(0);
	// Negative size: draw from the 2*|size| values centred on zero.
	if (size < 0)
		return r = T( (lrand48() % (-size-size)) + size );
	return r = T( lrand48() % size );
}
# Jenkins build driver for FFLAS-FFPACK.
#
# The compiler name and the SSE variant are not passed as arguments: they are
# read from the current working directory, whose last component is the SSE
# variant and whose third-from-last component is the compiler name.
# The script configures, installs and then runs `make perfpublisher`,
# stopping with the failing exit status if configure or install fails.

SOURCE_DIRECTORY=$( cd "$( dirname "$0" )" && pwd )

#=============================#
# Change only these variables #
#=============================#
CXX=`pwd | awk -F/ '{print $(NF-2)}'`
SSE=`pwd | awk -F/ '{print $NF}'`

# Job fflas-ffpack with SSE option flag
# by default sse is enabled
if [ "$SSE" == "withoutSSE" ]; then
    FFLAS_SSEFLAG="--disable-simd"
fi

# Everything before the first "/workspace/" component of this script's path.
JENKINS_DIR=${SOURCE_DIRECTORY%%/workspace/*}
LOCAL_DIR="$JENKINS_DIR"/local

# Add path to compilers (if needed)
export PATH=$PATH:/usr/local/bin:"$LOCAL_DIR/$CXX/bin"
echo "$PATH"

# Where are blas installed (/lib/.so)
# And their name (libtotoblas)
BLAS_HOME="$LOCAL_DIR/$CXX"
BLAS_NAME=openblas

# Change these if necessary
BLAS_LIBS="-L$BLAS_HOME/lib/ -l$BLAS_NAME"
# NOTE(review): BLAS_CFLAGS is computed but never handed to autogen.sh below;
# confirm whether it should be passed to configure.
BLAS_CFLAGS=-I"$BLAS_HOME"/include

# Where to install fflas-ffpack binaries
# Keep default for local installation.
PREFIX_INSTALL="$LOCAL_DIR/$CXX/$SSE"

# Add specific locations (if needed)
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH":/usr/local/lib:"$LOCAL_DIR/$CXX/lib":"$PREFIX_INSTALL"/lib
echo "LD_LIBRARY_PATH = ${LD_LIBRARY_PATH}"
export PKG_CONFIG_PATH=${PKG_CONFIG_PATH}:"$LOCAL_DIR/$CXX/lib/pkgconfig"
echo "PKG_CONFIG_PATH = ${PKG_CONFIG_PATH}"

# /!\ Warning /!\ This could be an issue if you changed
# the local installation directory
rm -rf "$PREFIX_INSTALL"/bin/fflas-ffpack* "$PREFIX_INSTALL"/include/fflas-ffpack*

#================#
# Setup Variables#
#================#

if [ "$CXX" == "icpc" ]; then
    distribution=`uname -m`
    CC=icc
    if [ "$distribution" == "i686" ]; then
        source /usr/local/bin/compilervars.sh ia32
    else
        source /usr/local/bin/compilervars.sh intel64
    fi
fi

# Particular case for Fedora23: g++=g++-5.3
vm_name=`uname -n | cut -d"-" -f1`
if [[ "$vm_name" == "fedora" && "$CXX" == "g++-5.3" ]]; then
    CXX="g++"
    CC=gcc
fi

if [ -z "$CC" ]; then
    if [[ $CXX == g++* ]]; then
        # Derive the C compiler name by turning every '+' into 'c'
        # (g++ -> gcc, g++-5 -> gcc-5). Done with bash substitution so the
        # result does not depend on a sed that understands -r.
        CC=${CXX//+/c}
    else
        CC="clang"
    fi
fi

#==================================#
# Automated installation and tests #
#==================================#

# Build the argument list once. The SIMD switch is appended only when it is
# set, so that configure never receives an empty "" argument.
configure_args=(
    "CXX=$CXX"
    "CC=$CC"
    --prefix="$PREFIX_INSTALL"
    --with-blas-libs="$BLAS_LIBS"
    --enable-optimization
    --enable-precompilation
)
if [ -n "$FFLAS_SSEFLAG" ]; then
    configure_args+=("$FFLAS_SSEFLAG")
fi

echo "|=== JENKINS AUTOMATED SCRIPT ===| ./autogen.sh CXX=$CXX CC=$CC --prefix=$PREFIX_INSTALL --with-blas-libs=$BLAS_LIBS --enable-optimization --enable-precompilation $FFLAS_SSEFLAG"
./autogen.sh "${configure_args[@]}"
V="$?"; if test "x$V" != "x0"; then exit "$V"; fi

# The installation prefix was fixed at configure time above.
echo "|=== JENKINS AUTOMATED SCRIPT ===| make prefix=$PREFIX_INSTALL install"
make install
V="$?"; if test "x$V" != "x0"; then exit "$V"; fi

echo "|=== JENKINS AUTOMATED SCRIPT ===| make perfpublisher"
make perfpublisher
-f $test ]] then #File does not exist: compile it echo '[Compiling]' $test COMPILESTART=$($DATE +%s%3N) COMPILELOG=$(make $test 2>&1; echo 'Returned state: '$?) COMPILEEND=$($DATE +%s%3N) COMPILETIME=$(($COMPILEEND - $COMPILESTART)) COMPILECHECK=$(echo $COMPILELOG | grep -o '[^ ]*$') COMPILETIMERELEVANT='true' else #File does exist echo '[Already compiled]' $benchmark COMPILELOG='(Previously compiled)' COMPILETIME='0.0' COMPILECHECK='0' COMPILETIMERELEVANT='false' fi if [[ $COMPILECHECK -ne 0 ]] then #Compilation failure # EXECUTED='no' - keep it to yes so that Jenkins # uses it within its results EXECUTED='yes' PASSED='no' STATE='0' EXECUTIONLOG='(Not executed)' EXECUTIONTIME='0.0' COMPILETIMERELEVANT='false' EXECUTIONTIMERELEVANT='false' ERRORLOG='Does not compile.' echo '-> Does not compile.' else #Compilation success echo '[Executing]' $test EXECUTED='yes' EXECUTIONSTART=$($DATE +%s%3N) EXECUTIONLOG=$(./$test 2>&1; echo 'Returned state: '$?) EXECUTIONEND=$($DATE +%s%3N) EXECUTIONTIME=$(($EXECUTIONEND - $EXECUTIONSTART)) EXECUTIONCHECK=$(echo $EXECUTIONLOG | grep -o '[^ ]*$') if [[ $EXECUTIONCHECK -ne 0 ]] then #Execution failure PASSED='no' STATE='0' EXECUTIONTIMERELEVANT='false' ERRORLOG='Execution failure.' echo '-> Execution failure.' 
else #Execution success PASSED='yes' STATE='100' EXECUTIONTIMERELEVANT='true' ERRORLOG='' fi fi echo '' >> $XMLFILE echo 'TEST' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE # Logs echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE # Times echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE echo '' >> $XMLFILE done #========# # Footer # #========# echo '' >> $XMLFILE #==========# # Epilogue # #==========# echo 'Results correctly exported to' $XMLFILE fflas-ffpack-2.2.2/tests/regression-check.C000066400000000000000000000041171274716147400205610ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* tests/regression-check.C * Copyright (C) 2014 the FFLAS-FFPACK group * * Written by all reporters of bugs (see ffpack-devel@googlegroups.com) * * ------------------------------------ * * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include "fflas-ffpack/fflas-ffpack.h" /* #1 */ bool check1 () ; /* #2 */ bool check2() { Givaro::Modular F(2); Givaro::Modular::RandIter R(F); size_t ok = 0 ; size_t tot = 500 ; for (size_t i = 0 ; i < tot ; ++i) { double elt ; R.random(elt); if (elt == 1) ++ok ; } double f = (double) ok / (double) tot ; if (f < 0.3 or f > 0.7) return false ; return true ; } /* #3 */ bool check3() { Givaro::Modular F(2); double * A = NULL ; double d = FFPACK::Det(F,0,0,A,0); return F.areEqual(d,F.one); } /* #4 */ bool check4() { typedef int32_t Element; Givaro::Modular F(2); Element * A = NULL ; Element * X = NULL ; int nul; FFPACK::Invert2(F,0,A,0,X,0,nul); return true ; } int main() { bool pass = true ; pass &= check2(); pass &= check3(); pass &= check4(); return !pass; } fflas-ffpack-2.2.2/tests/test-bini-p.C000066400000000000000000001764321274716147400174730ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Brice Boyer (briceboyer) * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas-ffpack-config.h" #include "test-utils.h" #include "assert.h" #include "fflas-ffpack/utils/args-parser.h" #include "fflas-ffpack/utils/flimits.h" #include // using namespace FFPACK; #define NEWWINO // #define NOTRANDOM // #define DIVIDE_INTO(x,y) (((x) + (y) - 1)/(y)) const int algos = 6 ; const int algos_k = 2 ; using Givaro::Modular; using Givaro::ModularBalanced; using Givaro::Timer; using FFLAS::FieldTraits; typedef std::vector time_v ; typedef std::vector int_v ; const int selec[] = { 0 ,1 ,2 ,3 ,4 ,5 }; const int selec_k[] = { 0 ,1 }; const char * descr[] = { "322 low mem" , "322 first 1" , "322 4 tmp " , "223 low mem" , "232 first 1" , "232 all tmp" , "comp left " , "comp right " // , "322 sqrt " }; const char * descr_k[] = { "comp left " , "comp right " }; namespace FFLAS { /* compression */ template struct Packer ; template<> struct Packer { uint64_t bits = (limits::digits()/2) ; double base = (double) (1_ui64 << bits) ; uint64_t mask = (1_ui64 << bits) - 1_ui64 ; template void accu(double * p, T * w) { *p *= base ; *p += (double)*w ; } } ; /* ****** */ /* pack */ /* ****** */ /* pack nb words (a,b,c) -> [a|b|c] */ template void pack_word( pack_T * packed, const wide_T * words, int32_t stride, Packer & packer) ; template void pack_word/**/( double * packed, const wide_T * words, int32_t stride, Packer & packer) { // std::cout << "pack " << *words << '+' << *(words+stride) << " * " << (uint64_t) packer.base << " = "; // words += stride ; *packed = (double) *words ; words += stride ; packer.accu(packed,words); // std::cout << (uint64_t) *packed << std::endl; } /* pack nb words (a,b) -> [a|b|0] 
filling with zeros */ template void pack_word_part( pack_T * packed, int32_t nb, const wide_T * words, int32_t stride, Packer & packer) ; template void pack_word_part/* */( double * packed, int32_t nb, const wide_T * words, int32_t stride, Packer & packer) { assert(nb == 1); *packed = (double) *words ; // words += stride ; // packer.accu(packed,words); *packed *= packer.base ; } /* ****** */ /* unpack */ /* ****** */ template void unpack_word( wide_T * words, int32_t stride, const pack_T * packed, Packer & packer); template void unpack_word/* */( wide_T * words, int32_t stride, const double * packed, Packer & packer) { uint64_t pck = (uint64_t) *packed ; words += stride ; *words = (double) (pck & packer.mask) ; words -= stride ; pck >>= packer.bits ; *words = (double) pck /* & packer.mask */ ; } template void unpack_word_part( wide_T * words, int32_t stride, const pack_T * packed, int32_t nb, Packer & packer); template void unpack_word_part/* */( wide_T * words, int32_t stride, const double * packed, int32_t nb, Packer & packer) { assert(nb == 1); words += stride ; *words = 0 ; words -= stride ; uint64_t pck = (uint64_t) *packed ; pck >>= packer.bits ; *words = (double)pck /* & packer.mask */ ; } /* ****** */ /* pack */ /* ****** */ template void pack_matrix( pack_T * packed, int32_t row_p, int32_t col_p, int32_t ldm_p, const wide_T * elemts, int32_t row_e, int32_t col_e, int32_t ldm_e, Packer & packer) { if (row_packed == true) { for (int32_t i = 0 ; i < row_e ; i++ ) { const wide_T * e_p = elemts + i * ldm_e ; pack_T * p_p = packed + i * ldm_p ; int32_t j = 0 ; for ( ; j < col_e/Nb*Nb ; j+=Nb, e_p+=Nb, p_p++) { pack_word(p_p,e_p,1,packer); } if (j < col_e) pack_word_part(p_p,col_e-j,e_p,1,packer); } } else { /* col_packed */ int32_t i = 0 ; int32_t ii = 0 ; for ( ; i < row_e/Nb*Nb ; i += Nb , ii++) { const wide_T * e_p = elemts + i * ldm_e ; pack_T * p_p = packed + ii * ldm_p ; for (int32_t j = 0 ; j < col_e ; j++, e_p++, p_p++) { pack_word(p_p,e_p,ldm_e,packer); 
} } if (i < row_e) pack_word_part(packed+i*ldm_p,row_e-i,elemts+ii*ldm_e,ldm_e,packer); } } /* ****** */ /* unpack */ /* ****** */ template void unpack_matrix( wide_T * elemts, int32_t row_e, int32_t col_e, int32_t ldm_e, const pack_T * packed, int32_t row_p, int32_t col_p, int32_t ldm_p, Packer & packer) { if (row_packed == true) { for (int32_t i = 0 ; i < row_e ; i++ ) { wide_T * e_p = elemts + i * ldm_e ; const pack_T * p_p = packed + i * ldm_p ; int32_t j = 0 ; for ( ; j < col_e/Nb*Nb ; j+=Nb, e_p+=Nb, p_p++) { unpack_word(e_p,1,p_p,packer); } if (j < col_e) unpack_word_part(e_p,1,p_p,col_e-j,packer); } } else { /* col_packed */ int32_t i = 0 ; int32_t ii = 0 ; for ( ; i < row_e/Nb*Nb ; i += Nb , ii++) { wide_T * e_p = elemts + i * ldm_e ; const pack_T * p_p = packed + ii * ldm_p ; for (int32_t j = 0 ; j < col_e ; j++, e_p++, p_p++) { unpack_word(e_p,ldm_e,p_p,packer); } } if (i < row_e) unpack_word_part(elemts+i*ldm_e,ldm_e,packed+ii*ldm_p,row_e-i,packer); } } /* compress A */ template void fgemm_compressed(const Field & F, int m, int n, int k, const typename Field::Element * A, int lda, const typename Field::Element * B, int ldb, typename Field::Element * C, int ldc ) { Givaro::ZRing NoField; double * A_k, * B_k, * C_k ; typedef typename Field::Element elem_t ; Packer packer ; int m_k = m , n_k = n , lda_k = lda, ldb_k = ldb, ldc_k = ldc ; if (left_compress) { m_k = DIVIDE_INTO(m,2)*2 ; lda_k = m_k ; ldc_k = n ; A_k = FFLAS::fflas_new(m_k*k) ; //!@bug don't zero all, just the "border" FFLAS::fzero(NoField,m_k,k,A_k,k); B_k = const_cast(B) ; pack_matrix(A_k,m_k,k,lda_k, A,m,k,lda, packer); } else { n_k = DIVIDE_INTO(n,2)*2 ; ldb_k = n_k ; ldc_k = n_k ; A_k = const_cast(A) ; B_k = FFLAS::fflas_new(k*n_k) ; //!@bug don't zero all, just the "border" FFLAS::fzero(NoField,k,n_k,B_k,n_k); pack_matrix(B_k,k,n_k,ldb_k, B,k,n,ldb, packer); } C_k = FFLAS::fflas_new(m_k*n_k) ; //!@bug don't zero all, just the "border" FFLAS::fzero(NoField,m_k,n_k,C_k,n_k); 
pack_matrix(C_k,m_k,n_k,ldc_k, C,m,n,ldc, packer); #if 0 double * C_e = FFLAS::fflas_new(m*ldc); unpack_matrix(C_e,m,n,ldc, C_k,m_k,n_k,ldc_k, packer); int faux = 0 ; for (int i = 0 ; i < m ; ++i) { for (int j = 0 ; j < n ; ++j) { if (! (C[i*ldc+j] == C_e[i*ldc+j]) ) { ++faux ; } } } if (faux) { std::cout << "bad pack/unpack ; bad/all = " << faux << '/' << m*n << " ~~ " << (double)faux/(double)(m*n) << std::endl; } if (faux && (n<20)) { std::cout << "IN " << std::endl; for (int i = 0 ; i < m ; ++i) { for (int j = 0 ; j < n ; ++j) std::cout << C[i*ldc+j] << ' '; std::cout << std::endl; } std::cout << "OUT" << std::endl; for (int i = 0 ; i < m ; ++i) { for (int j = 0 ; j < n ; ++j) std::cout << C_e[i*ldc+j] << ' '; std::cout << std::endl; } } if (faux) exit(-1); #endif Givaro::DoubleDomain G ; fgemm(G,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, m_k,n_k,k, 1, A_k,lda_k, B_k,ldb_k, 0, C_k, ldc_k); // cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans, // m_k,n_k,k, 1, A_k,lda_k, B_k,ldb_k, 0, C_k, ldc_k); unpack_matrix(C,m,n,ldc, C_k,m_k,n_k,ldc_k, packer); if (left_compress) FFLAS::fflas_delete(A_k); else FFLAS::fflas_delete(B_k); FFLAS::fflas_delete(C_k); } } namespace FFLAS { /* tools */ template void finit_fuzzy(Field & F, size_t m, size_t n, double * C, size_t ldc) { if (n == ldc) // FFLAS::vectorised::modp(C,C,m*n,p,invp,0,p-1); FFLAS::vectorised::modp(F,C,m*n,C); else for (size_t i = 0 ; i < m ; ++i) // FFLAS::vectorised::modp(C+i*ldc,C+i*ldc,n,p,invp,0,p-1); FFLAS::vectorised::modp(F,C+i*ldc,n,C+i*ldc); } // C = a*A + B void add(const size_t m, const size_t n, double a, const double *A, const size_t lda, const double *B, const size_t ldb, double *C, const size_t ldc) { const double *Ai = A,*Bi = B; double *Ci = C; for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) for (size_t j = 0 ; j < n ; ++j) Ci[j] = a * Ai[j] + Bi[j]; } // C = C-(A+B) void subadd(const size_t m, const size_t n, const double *A, const size_t lda, const double *B, const size_t ldb, double *C, 
const size_t ldc) { const double *Ai = A,*Bi = B; double *Ci = C; for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) for (size_t j = 0 ; j < n ; ++j) { Ci[j] = Ci[j] - Ai[j] - Bi[j] ; } } // C = -(A+B) void negadd(const size_t m, const size_t n, const double *A, const size_t lda, const double *B, const size_t ldb, double *C, const size_t ldc) { const double *Ai = A,*Bi = B; double *Ci = C; for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) for (size_t j = 0 ; j < n ; ++j) { Ci[j] = - Ai[j] - Bi[j] ; } } // C = C+A-B void addsub(const size_t m, const size_t n, const double *A, const size_t lda, const double *B, const size_t ldb, double *C, const size_t ldc) { const double *Ai = A,*Bi = B; double *Ci = C; for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) for (size_t j = 0 ; j < n ; ++j) { Ci[j] = Ci[j] + Ai[j] - Bi[j] ; } } // C = (C+B)/e template void addscalinf(const Field & F, const size_t m, const size_t n, const double *B, const size_t ldb, double e, double *C, const size_t ldc) { const double * Bi = B; double * Ci = C; for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb) for (size_t j = 0 ; j < n ; ++j) Ci[j]= (Ci[j]+Bi[j])*e ; // F.init( Ci[j], (Ci[j]+Bi[j])/e ); } // C = (C-B)/e template void subscalinf(const Field & F, const size_t m, const size_t n, const double *B, const size_t ldb, double e, double *C, const size_t ldc) { const double * Bi = B; double * Ci = C; for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb) for (size_t j = 0 ; j < n ; ++j) Ci[j]= (Ci[j]-Bi[j])*e ; // F.init( Ci[j], (Ci[j]-Bi[j])/e ); } // C = (D-B)/e template void subscal(const Field & F, const size_t m, const size_t n, const double *D, const size_t ldd, const double *B, const size_t ldb, double e, double *C, const size_t ldc) { const double * Bi = B; const double * Di = D; double * Ci = C; for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd) for (size_t j = 0 ; j < n ; ++j) Ci[j] = (Di[j]-Bi[j])*e ; } // C = (D+B)/e template void addscal(const Field & F, const size_t m, const size_t n, const double *D, const size_t ldd, 
const double *B, const size_t ldb, double e, double *C, const size_t ldc) { const double * Bi = B; const double * Di = D; double * Ci = C; for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd) for (size_t j = 0 ; j < n ; ++j) Ci[j] = (Di[j]+Bi[j])*e ; } // C = C + (D-B)/e template void subscalacc(const Field & F, const size_t m, const size_t n, const double *D, const size_t ldd, const double *B, const size_t ldb, double e, double *C, const size_t ldc) { const double * Bi = B; const double * Di = D; double * Ci = C; for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd) for (size_t j = 0 ; j < n ; ++j) Ci[j] += (Di[j]-Bi[j])*e ; } #ifndef TRE // #ifndef NDEBUG // #define TRE 1 // #else #define TRE (size_t)(__FFLASFFPACK_WINOTHRESHOLD) // #define TRE (size_t)(__FFLASFFPACK_WINOTHRESHOLD*0.9) // #endif #endif template double * gemm_fflas(const Field & F, const size_t m, const size_t n, const size_t k, const double *A, size_t lda, const double *B, size_t ldb, double * C, size_t ldc, int rec = 0) { Givaro::DoubleDomain R; FFLAS::fgemm(R, FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, m,n,k, 1, A,lda, B,ldb, 0, C, ldc); // cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans, // m,n,k,1,A,lda,B,ldb,0,C,ldc); return C; } } // FFLAS namespace FFLAS { namespace Protected { namespace Rec { // Field must be Givaro::Modular template double * gemm_bini_322_0(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { Givaro::ZRing NoField; // const double p = (double)F.characteristic(); size_t M = (n>m)?std::min(k,m):std::min(k,n); // std::cout << rec << ',' << M << std::endl; // Field G(p*p); if ( M < TRE || rec <= 0) { return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); } assert(k/2*2==k); // k divisible par 2 assert(n/2*2==n); // n divisible par 2 assert(m/3*3==m); // m divisible par 3 size_t n2 = n/2; size_t k2 = k/2; size_t m3 = m/3; // 
std::cout << "€ = " << epsilon << std::endl; // sub matrices in A const double * A11 = A; const double * A12 = A +k2; const double * A21 = A +lda*m3; const double * A22 = A21 +k2; const double * A31 = A21 +lda*m3; const double * A32 = A31 +k2; // sub matrices in C double * C11 = C; double * C12 = C +n2; double * C21 = C +ldc*m3; double * C22 = C21 +n2; double * C31 = C21 +ldc*m3; double * C32 = C31 +n2; // sub matrices in B const double * B11 = B; const double * B12 = B +n2; const double * B21 = B +ldb*k2; const double * B22 = B21 +n2; FFLAS::fzero(NoField,m,n,C,ldc); /* * Algo : * S1 := A11 +A22; * S4 := e*A12+A22; * S5 := A11 +e*A12; * S6 := A21 +A32; * S9 := A21 +e*A31; * S10 := e*A31+A32; * * T1 := e*B11 +B22; * T2 := B21 +B22; * T4 := -e*B11+B21; * T5 := e*B12 +B22; * T6 := B11 +e*B22; * T7 := B11 +B12; * T9 := B12 -e*B22; * T10 := B11 +e*B21; * * P1 := S1 *T1; * P2 := A22*T2; * P3 := A11*B22; * P4 := S4 *T4; * P5 := S5 *T5; * P6 := S6 *T6; * P7 := A21*T7; * P8 := A32*B11; * P9 := S9 *T9; * P10:= S10*T10; * * C11 := (P1-P2-P3+P4)/e; * C12 := (P3-P5)/(-e) ; * C21 := P4+P6-P10 ; * C22 := P1-P5+P9; * C31 := (-P8+P10)/e; * C32 := (P6-P7-P8+P9)/e; * */ double * S1 = FFLAS::fflas_new(m3*k2) ; // double * C11t = FFLAS::fflas_new(n2*m3) ; // S1 := A11 +A22; FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,S1,k2); // T1 := e*B11 +B22; double * T1 = FFLAS::fflas_new(n2*k2) ; // ou aire add(k2,n2,epsilon,B11,ldb,B22,ldb,T1,n2); // P1 := S1 *T1; (dans C22) gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C22,ldc,rec-1,epsilon); // S4 := e*A12+A22; double * eA12 = FFLAS::fflas_new(m3*k2); FFLAS::fscal(NoField,m3,k2,epsilon,A12,lda,eA12,k2) ; FFLAS::fadd(NoField,m3,k2,eA12,k2,A22,lda,S1,k2); // T4 := -e*B11+B21; add(k2,n2,-epsilon,B11,ldb,B21,ldb,T1,n2); // P4 := S4 *T4; (dans C21) gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C21,ldc,rec-1,epsilon); // C11 = P1+P4 FFLAS::fadd(NoField,m3,n2,C21,ldc,C22,ldc,C11,ldc); // T2 := B21 +B22; FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,T1,n2); // P2 := 
A22*T2; double * P1 = FFLAS::fflas_new(n2*m3) ; // ou aire gemm_bini_322_0(F,m3,n2,k2,A22,lda,T1,n2,P1,n2,rec-1,epsilon); // P3 := A11*B22; (dans C12) gemm_bini_322_0(F,m3,n2,k2,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon); // C11 -= (P2+P3) subadd(m3,n2,P1,n2,C12,ldc,C11,ldc); // S5 := A11 +e*A12; FFLAS::fadd(NoField,m3,k2,eA12,k2,A11,lda,S1,k2); // T5 := e*B12 +B22; add(k2,n2,epsilon,B12,ldb,B22,ldb,T1,n2); // P5 := S5 *T5; double * P2 = FFLAS::fflas_new(n2*m3) ; // ou aire gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P2,n2,rec-1,epsilon); // C12 -= P5 subscalinf(NoField,m3,n2,P2,n2,-(double)1/epsilon,C12,ldc); // S6 := A21 +A32; FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,S1,k2); // T6 := B11 +e*B22; add(k2,n2,epsilon,B22,ldb,B11,ldb,T1,n2); // P6 := S6 *T6; dans C32 gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C32,ldc,rec-1,epsilon); // C21+= P6 FFLAS::faddin(NoField,m3,n2,C32,ldc,C21,ldc); // T7 := B11 +B12; FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,T1,n2); // P7 := A21*T7; !signe gemm_bini_322_0(F,m3,n2,k2,A21,lda,T1,n2,P1,n2,rec-1,epsilon); // P8 := A32*B11; dans C31 !signe gemm_bini_322_0(F,m3,n2,k2,A32,lda,B11,ldb,C31,ldc,rec-1,epsilon); // C32 -= P8+P7 subadd(m3,n2,P1,n2,C31,ldc,C32,ldc); // S9 := A21 +e*A31; double * eA31 = eA12 ; FFLAS::fscal(NoField,m3,k2,epsilon,A31,lda,eA31,k2); FFLAS::fadd(NoField,m3,k2,eA31,k2,A21,lda,S1,k2); // T9 := B12 -e*B22; add(k2,n2,-epsilon,B22,ldb,B12,ldb,T1,n2); // P9 := S9 *T9; gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P1,n2,rec-1,epsilon); // C32= (C32+P9)/p addscalinf(NoField,m3,n2,P1,n2,(double)1/epsilon,C32,ldc); // C22+= P9-P5 addsub(m3,n2,P1,n2,P2,n2,C22,ldc); FFLAS::fflas_delete( P2); // S10 := e*A31+A32; FFLAS::fadd(NoField,m3,k2,eA31,k2,A32,lda,S1,k2); FFLAS::fflas_delete( eA12 ); // T10 := B11 +e*B21; add(k2,n2,epsilon,B21,ldb,B11,ldb,T1,n2); // P10:= S10*T10; gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P1,n2,rec-1,epsilon); FFLAS::fflas_delete( S1); FFLAS::fflas_delete( T1); // C21-= P10 FFLAS::fsubin(NoField,m3,n2,P1,n2,C21,ldc); // 
C31= (C31-P10)/(-epsilon) subscalinf(NoField,m3,n2,P1,n2,-(double)1/epsilon,C31,ldc); FFLAS::fflas_delete( P1); // C11 := (P1+P-P3+P4)/e; FFLAS::fscalin(NoField,m3,n2,(double)1/epsilon,C11,ldc); return C; } // Field must be Givaro::Modular template double * gemm_bini_322_mem(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { Givaro::ZRing NoField; // const double p = (double)F.characteristic(); size_t M = (n>m)?std::min(k,m):std::min(k,n); // std::cout << rec << ',' << M << std::endl; // Field G(p*p); if ( M < TRE || rec <= 0) { // std::cout << "ffw" << std::endl; return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); } assert(k/2*2==k); // k divisible par 2 assert(n/2*2==n); // n divisible par 2 assert(m/3*3==m); // m divisible par 3 // std::cout << "tested" << std::endl; size_t n2 = n/2; size_t k2 = k/2; size_t m3 = m/3; // std::cout << "€ = " << epsilon << std::endl; // sub matrices in A const double * A11 = A; const double * A12 = A +k2; const double * A21 = A +lda*m3; const double * A22 = A21 +k2; const double * A31 = A21 +lda*m3; const double * A32 = A31 +k2; // sub matrices in C double * C11 = C; double * C12 = C +n2; double * C21 = C +ldc*m3; double * C22 = C21 +n2; double * C31 = C21 +ldc*m3; double * C32 = C31 +n2; // sub matrices in B const double * B11 = B; const double * B12 = B +n2; const double * B21 = B +ldb*k2; const double * B22 = B21 +n2; FFLAS::fzero(F,m,n,C,ldc); /* * Algo : * S1 := A11 +A22; * S4 := e*A12+A22; * S5 := A11 +e*A12; * S6 := A21 +A32; * S9 := A21 +e*A31; * S3 := e*A31+A32; * * T1 := e*B11 +B22; * T2 := B21 +B22; * T4 := -e*B11+B21; * T5 := e*B12 +B22; * T6 := B11 +e*B22; * T7 := B11 +B12; * T9 := B12 -e*B22; * T3 := B11 +e*B21; * * P1 := S1 *T1; * P2 := A22*T2; * P10 := A11*B22; * P4 := S4 *T4; * P5 := S5 *T5; * P6 := 
S6 *T6; * P7 := A21*T7; * P8 := A32*B11; * P9 := S9 *T9; * P3:= S3*T3; * * C11 := (P1-P2-P10+P4)/e; * C12 := (P10-P5)/(-e) ; * C21 := P4+P6-P3 ; * C22 := P1-P5+P9; * C31 := (-P8+P3)/e; * C32 := (P6-P7-P8+P9)/e; * */ // P10 gemm_bini_322_mem(F,m3,n2,k2,A11,lda,B22,ldb,C11,ldc,rec-1,epsilon); // S5 double * X = FFLAS::fflas_new(m3*k2); add(m3,k2,epsilon,A12,lda,A11,lda,X,k2); // T5 // double * Y = FFLAS::fflas_new(std::max(k2,m3)*n2); double * Y = FFLAS::fflas_new(k2*n2); add(k2,n2,epsilon,B12,ldb,B22,ldb,Y,n2); // P5 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C22,ldc,rec-1,epsilon); // C12 subscal(NoField,m3,n2,C22,ldc,C11,ldc,(double)1/epsilon,C12,ldc); // T2 FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,Y,n2); // P2 gemm_bini_322_mem(F,m3,n2,k2,A22,lda,Y,n2,C31,ldc,rec-1,epsilon); // C11 FFLAS::faddin(NoField,m3,n2,C31,ldc,C11,ldc); // S1 FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,X,k2); // T1 add(k2,n2,epsilon,B11,ldb,B22,ldb,Y,n2); // P1 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C21,ldc,rec-1,epsilon); // C22 FFLAS::fsub(NoField,m3,n2,C21,ldc,C22,ldc,C22,ldc); // C11 FFLAS::fsub(NoField,m3,n2,C21,ldc,C11,ldc,C11,ldc); // S4 add(m3,k2,epsilon,A12,lda,A22,lda,X,k2); // T4 add(k2,n2,-epsilon,B11,ldb,B21,ldb,Y,n2); // P4 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C21,ldc,rec-1,epsilon); // C11 addscalinf(NoField,m3,n2,C21,ldc,(double)1/epsilon,C11,ldc); // S9 add(m3,k2,epsilon,A31,lda,A21,lda,X,k2); // T9 add(k2,n2,-epsilon,B22,ldb,B12,ldb,Y,n2); // P9 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C32,ldc,rec-1,epsilon); // C22 FFLAS::faddin(NoField,m3,n2,C32,ldc,C22,ldc); // S6 FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,X,k2); // T6 add(k2,n2,epsilon,B22,ldb,B11,ldb,Y,n2); // P6 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon); // C21 FFLAS::faddin(NoField,m3,n2,C31,ldc,C21,ldc); // C32 FFLAS::faddin(NoField,m3,n2,C31,ldc,C32,ldc); // T7 FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,Y,n2); // P7 gemm_bini_322_mem(F,m3,n2,k2,A21,lda,Y,n2,C31,ldc,rec-1,epsilon); // if (epsilon > 1 
&& rec == 2) { FFLAS::finit(G,m3,n2,C31,ldc);} // C32 FFLAS::fsubin(NoField,m3,n2,C31,ldc,C32,ldc); // S3 add(m3,k2,epsilon,A31,lda,A32,lda,X,k2); // T3 add(k2,n2,epsilon,B21,ldb,B11,ldb,Y,n2); // P3 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon); FFLAS::fflas_delete( X); FFLAS::fflas_delete( Y ); // C21 FFLAS::fsubin(NoField,m3,n2,C31,ldc,C21,ldc); // P8 Y = FFLAS::fflas_new(m3*n2); gemm_bini_322_mem(F,m3,n2,k2,A32,lda,B11,ldb,Y,n2,rec-1,epsilon); // C31 subscalinf(NoField,m3,n2,Y,n2,(double)1/epsilon,C31,ldc); // FFLAS::fsubin(NoField,m3,n2,Y,n2,C31,ldc); // C32 subscalinf(NoField,m3,n2,Y,n2,(double)1/epsilon,C32,ldc); // FFLAS::fsubin(NoField,m3,n2,Y,n2,C32,ldc); // FFLAS::fscalin(NoField,m3,n,(double)1/epsilon,C31,ldc); FFLAS::fflas_delete( Y ); return C; } // Field must be Givaro::Modular template double * gemm_bini_223_mem(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { Givaro::ZRing NoField; // const double p = (double)F.characteristic(); size_t M = (n>m)?std::min(k,m):std::min(k,n); // std::cout << rec << ',' << M << std::endl; // Field G(p*p); if ( M < TRE || rec <= 0) { // std::cout << "ffw" << std::endl; return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); } assert(k/2*2==k); // k divisible par 2 assert(n/3*3==n); // n divisible par 2 assert(m/2*2==m); // m divisible par 3 // std::cout << "tested" << std::endl; size_t m2 = m/2; size_t k2 = k/2; size_t n3 = n/3; // std::cout << "€ = " << epsilon << std::endl; // sub matrices in A const double * A11 = A; const double * A12 = A +k2; const double * A21 = A +lda*m2; const double * A22 = A21 +k2; // sub matrices in C double * C11 = C; double * C12 = C +n3; double * C13 = C +2*n3; double * C21 = C +ldc*m2; double * C22 = C21 +n3; double * C23 = C21 +2*n3; // sub matrices in B 
const double * B11 = B; const double * B12 = B +n3; const double * B13 = B +2*n3; const double * B21 = B +ldb*k2; const double * B22 = B21 +n3; const double * B23 = B21 +2*n3; FFLAS::fzero(F,m,n,C,ldc); /* * Algo : * S1 := B11 +B22; * S4 := e*B21+B22; * S5 := B11 +e*B21; * S6 := B12 +B23; * S9 := B12 +e*B13; * S3 := e*B13+B23; * * T1 := e*A11 +A22; * T2 := A12 +A22; * T4 := -e*A11+A12; * T5 := e*A21 +A22; * T6 := A11 +e*A22; * T7 := A11 +A21; * T9 := A21 -e*A22; * T3 := A11 +e*A12; * * P1 := S1 *T1; * P2 := T2 * B22; * P10 := A22 * B11; * P4 := S4 *T4; * P5 := S5 *T5; * P6 := S6 *T6; * P7 := T7*B12; * P8 := A11*B23; * P9 := S9 *T9; * P3 := S3*T3; * * C11 := (P1-P2-P10+P4)/e; * C21 := (P10-P5)/(-e) ; * C12 := P4+P6-P3 ; * C22 := P1-P5+P9; * C13 := (-P8+P3)/e; * C23 := (P6-P7-P8+P9)/e; * */ // P10 gemm_bini_223_mem(F,m2,n3,k2,A22,lda,B11,ldb,C11,ldc,rec-1,epsilon); // S5 double * Y = FFLAS::fflas_new(k2*n3); add(k2,n3,epsilon,B21,ldb,B11,ldb,Y,n3); // T5 double * X = FFLAS::fflas_new(m2*k2); add(m2,k2,epsilon,A21,lda,A22,lda,X,k2); // P5 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C22,ldc,rec-1,epsilon); // C12 subscal(NoField,m2,n3,C22,ldc,C11,ldc,(double)1/epsilon,C21,ldc); // T2 FFLAS::fadd(NoField,m2,k2,A12,lda,A22,lda,X,k2); // P2 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B22,ldb,C13,ldc,rec-1,epsilon); // C11 FFLAS::faddin(NoField,m2,n3,C13,ldc,C11,ldc); // S1 FFLAS::fadd(NoField,k2,n3,B11,ldb,B22,ldb,Y,n3); // T1 add(m2,k2,epsilon,A11,lda,A22,lda,X,k2); // P1 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); // C22 FFLAS::fsub(NoField,m2,n3,C12,ldc,C22,ldc,C22,ldc); // C11 FFLAS::fsub(NoField,m2,n3,C12,ldc,C11,ldc,C11,ldc); // S4 add(k2,n3,epsilon,B21,ldb,B22,ldb,Y,n3); // T4 add(m2,k2,-epsilon,A11,lda,A12,lda,X,k2); // P4 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); // C11 addscalinf(NoField,m2,n3,C12,ldc,(double)1/epsilon,C11,ldc); // S9 add(k2,n3,epsilon,B13,ldb,B12,ldb,Y,n3); // T9 add(m2,k2,-epsilon,A22,lda,A21,lda,X,k2); // P9 
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C23,ldc,rec-1,epsilon); // C22 FFLAS::faddin(NoField,m2,n3,C23,ldc,C22,ldc); // S6 FFLAS::fadd(NoField,k2,n3,B12,ldb,B23,ldb,Y,n3); // T6 add(m2,k2,epsilon,A22,lda,A11,lda,X,k2); // P6 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); // C21 FFLAS::faddin(NoField,m2,n3,C13,ldc,C12,ldc); // C32 FFLAS::faddin(NoField,m2,n3,C13,ldc,C23,ldc); // T7 FFLAS::fadd(NoField,m2,k2,A11,lda,A21,lda,X,k2); // P7 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B12,ldb,C13,ldc,rec-1,epsilon); // if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m2,n3,C31,ldc);} // C32 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C23,ldc); // S3 add(k2,n3,epsilon,B13,ldb,B23,ldb,Y,n3); // T3 add(m2,k2,epsilon,A12,lda,A11,lda,X,k2); // P3 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); FFLAS::fflas_delete( Y ); FFLAS::fflas_delete( X ); // C21 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C12,ldc); // P8 Y = FFLAS::fflas_new(m2*n3); gemm_bini_223_mem(F,m2,n3,k2,A11,lda,B23,ldb,Y,n3,rec-1,epsilon); // C31 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C13,ldc); // C32 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C23,ldc); FFLAS::fflas_delete( Y ); return C; } // Field must be Givaro::Modular template double * gemm_bini_322_2(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { Givaro::ZRing NoField; // const double p = (double)F.characteristic(); size_t M = (n>m)?std::min(k,m):std::min(k,n); // std::cout << rec << ',' << M << std::endl; // Field G(p*p); if ( M < TRE || rec <= 0) { // std::cout << "ffw" << std::endl; return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); } assert(k/2*2==k); // k divisible par 2 assert(n/2*2==n); // n divisible par 2 assert(m/3*3==m); // m divisible par 3 // std::cout << "tested" << std::endl; size_t n2 = n/2; size_t k2 = 
k/2; size_t m3 = m/3; // std::cout << "€ = " << epsilon << std::endl; // sub matrices in A const double * A11 = A; const double * A12 = A +k2; const double * A21 = A +lda*m3; const double * A22 = A21 +k2; const double * A31 = A21 +lda*m3; const double * A32 = A31 +k2; // sub matrices in C double * C11 = C; double * C12 = C +n2; double * C21 = C +ldc*m3; double * C22 = C21 +n2; double * C31 = C21 +ldc*m3; double * C32 = C31 +n2; // sub matrices in B const double * B11 = B; const double * B12 = B +n2; const double * B21 = B +ldb*k2; const double * B22 = B21 +n2; FFLAS::fzero(F,m,n,C,ldc); /* * Algo : * S1 := A11 +A22; * S4 := e*A12+A22; * S5 := A11 +e*A12; * S3 := e*A31+A32; * S6 := A21 +A32; * S9 := A21 +e*A31; * * T1 := e*B11 +B22; * T2 := B21 +B22; * T3 := B11 +e*B21; * T4 := -e*B11+B21; * T5 := e*B12 +B22; * T6 := B11 +e*B22; * T7 := B11 +B12; * T9 := B12 -e*B22; * * P1 := S1 *T1; * P2 := A22*T2; * P10 := A11*B22; * P4 := S4 *T4; * P5 := S5 *T5; * P6 := S6 *T6; * P7 := A21*T7; * P8 := A32*B11; * P9 := S9 *T9; * P3:= S3*T3; * * C11 := (P1-P2-P10+P4)/e; * C12 := (P10-P5)/(-e) ; * C21 := P4+P6-P3 ; * C22 := P1-P5+P9; * C31 := (-P8+P3)/e; * C32 := (P6-P7-P8+P9)/e; * */ double * U = FFLAS::fflas_new(m3*n2); double * V = FFLAS::fflas_new(m3*n2); double * X = FFLAS::fflas_new(m3*std::max(k2,n2)); double * Y = FFLAS::fflas_new(std::max(k2,m3)*n2); // S4 add(m3,k2,epsilon,A12,lda,A22,lda,X,k2); // T4 add(k2,n2,-epsilon,B11,ldb,B21,ldb,Y,n2); // P4 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon); // S9 add(m3,k2,epsilon,A31,lda,A21,lda,X,k2); // T9 add(k2,n2,-epsilon,B22,ldb,B12,ldb,Y,n2); // P9 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,V,n2,rec-1,epsilon); // S5 add(m3,k2,epsilon,A12,lda,A11,lda,X,k2); // T5 add(k2,n2,epsilon,B12,ldb,B22,ldb,Y,n2); // P5 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,C12,ldc,rec-1,epsilon); // S3 add(m3,k2,epsilon,A31,lda,A32,lda,X,k2); // T3 add(k2,n2,epsilon,B21,ldb,B11,ldb,Y,n2); // P3 
gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon); // C22 = P9-P5 FFLAS::fsub(NoField,m3,n2,V,n2,C12,ldc,C22,ldc); // C21 = P4-P3 FFLAS::fsub(NoField,m3,n2,U,n2,C31,ldc,C21,ldc); // T2 FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,Y,n2); // P2 gemm_bini_322_2(F,m3,n2,k2,A22,lda,Y,n2,X,n2,rec-1,epsilon); // XXX approximate // C11 = (P4 - P2) / e subscal(NoField,m3,n2,U,n2,X,n2,1./epsilon,C11,ldc); // T7 FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,Y,n2); // P7 gemm_bini_322_2(F,m3,n2,k2,A21,lda,Y,n2,X,n2,rec-1,epsilon); // XXX approximate // C32 = (P9-P7) / e subscal(NoField,m3,n2,V,n2,X,n2,1./epsilon,C32,ldc); // S1 FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,X,k2); // T1 add(k2,n2,epsilon,B11,ldb,B22,ldb,Y,n2); // P1 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon); // C22 += P1 FFLAS::faddin(NoField,m3,n2,U,n2,C22,ldc); // P10 gemm_bini_322_2(F,m3,n2,k2,A11,lda,B22,ldb,V,n2,rec-1,epsilon); // C12 = (P5-P10)/e subscalinf(NoField,m3,n2,V,n2,1./epsilon,C12,ldc); // XXX approximate // C11 = C11 + (P1-P10)/e subscalacc(NoField,m3,n2,U,n2,V,n2,1./epsilon,C11,ldc); // S6 FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,X,k2); // T6 add(k2,n2,epsilon,B22,ldb,B11,ldb,Y,n2); // P6 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon); // C21 += P6 FFLAS::faddin(NoField,m3,n2,U,n2,C21,ldc); // P8 gemm_bini_322_2(F,m3,n2,k2,A32,lda,B11,ldb,V,n2,rec-1,epsilon); // C31 = (P3-P8)/2 subscalinf(NoField,m3,n2,V,n2,1./epsilon,C31,ldc); // XXX approximate // C32 = C32 + (P6-P8)/e subscalacc(NoField,m3,n2,U,n2,V,n2,1./epsilon,C32,ldc); FFLAS::fflas_delete( X); FFLAS::fflas_delete( Y ); FFLAS::fflas_delete( U); FFLAS::fflas_delete( V); return C; } // Field must be Givaro::Modular template double * gemm_bini_232_2(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { Givaro::ZRing NoField; // const double p = 
(double)F.characteristic(); size_t M = (n>m)?std::min(k,m):std::min(k,n); // Field G(p*p); if ( M < TRE || rec <= 0) { // std::cout << "ffw" << std::endl; return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); } assert(k/3*3==k); // k divisible par 3 assert(n/2*2==n); // n divisible par 2 assert(m/2*2==m); // m divisible par 2 // std::cout << "tested" << std::endl; size_t n2 = n/2; size_t k3 = k/3; size_t m2 = m/2; // std::cout << "€ = " << epsilon << std::endl; // sub matrices in B const double * B11 = B; const double * B12 = B +n2; const double * B21 = B +ldb*k3; const double * B22 = B21 +n2; const double * B31 = B21 +ldb*k3; const double * B32 = B31 +n2; // sub matrices in C double * C11 = C; double * C12 = C +n2; double * C21 = C +ldc*m2; double * C22 = C21 +n2; // sub matrices in A const double * A11 = A; const double * A12 = A +k3; const double * A13 = A +2*k3; const double * A21 = A +lda*m2; const double * A22 = A21 +k3; const double * A23 = A21 +2*k3; FFLAS::fzero(F,m,n,C,ldc); /* * Algo : * * S1 := A11 +A22*e; * S3 := -(A11+A21); * S4 := A11+A12*e; * S5 := A21 - A22*e; * S6 := A12*e + A23; * S8 := -(A13+A23): * S9 := A22*e + A23; * S10 := -A12*e+A13; * * T1 := B11 +B22; * T4 := e*B12+B22; * T5 := B11 +e*B12; * T6 := B21 +B32; * T9 := B21 + e*B31; * T10 := e*B31 +B32; * * P1 := Bini232(S1,T1 ,e); * P2 := Bini232(A11,B22 ,e); * P3 := Bini232(S3,B11,e); * P4 := Bini232(S4,T4 ,e); * P5 := Bini232(S5,T5 ,e); * P6 := Bini232(S6,T6 ,e); * P7 := Bini232(A23,B21 ,e); * P8 := Bini232(S8,B32,e); * P9 := Bini232(S9,T9 ,e); * P10:= Bini232(S10,T10,e); * * * C11 := evalm(P1-P4+(P6-P7+P8+P10)/e); * C12 := evalm((-P2+P4)/e+P10) ; * C21 := evalm(P5+(-P7+P9)/e) ; * C22 := evalm((P1-P2+P3+P5)/e+P6-P9); * */ double * U = FFLAS::fflas_new(m2*n2); double * V = FFLAS::fflas_new(m2*n2); double * X = FFLAS::fflas_new(m2*k3); double * Y = FFLAS::fflas_new(k3*n2); // S1 add(m2,k3,epsilon,A22,lda,A11,lda,X,k3); // T1 
FFLAS::fadd(NoField,k3,n2,B11,ldb,B22,ldb,Y,n2); // P1 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // S3 negadd(m2,k3,A11,lda,A21,lda,X,k3); // P3 (in V) gemm_bini_232_2(F,m2,n2,k3,X,k3,B11,ldb,V,n2,rec-1,epsilon); // C22 = (P1+P3)/e // FFLAS::fadd(NoField,m2,n2,U,n2,V,n2,C22,ldc); // XXX acc addscal(NoField,m2,n2,U,n2,V,n2,(double)1/epsilon,C22,ldc); // S6 add(m2,k3,epsilon,A12,lda,A23,lda,X,k3); // T6 FFLAS::fadd(NoField,k3,n2,B21,ldb,B32,ldb,Y,n2); // P6 (in V) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); // C22 += P6 FFLAS::faddin(NoField,m2,n2,V,n2,C22,ldc); // S8 negadd(m2,k3,A13,lda,A23,lda,X,k3); // P8 (in C11) gemm_bini_232_2(F,m2,n2,k3,X,k3,B32,ldb,C11,ldc,rec-1,epsilon); // C11 = (P8+P6)/e addscalinf(NoField,m2,n2,V,n2,(double)1/epsilon,C11,ldc); // C11 += P1 FFLAS::faddin(NoField,m2,n2,U,n2,C11,ldc); // S4 add(m2,k3,epsilon,A12,lda,A11,lda,X,k3); // T4 add(k3,n2,epsilon,B12,ldb,B22,ldb,Y,n2); // P4 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // C11 -= P4 FFLAS::fsubin(NoField,m2,n2,U,n2,C11,ldc); // P2 (in C12) gemm_bini_232_2(F,m2,n2,k3,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon); // S5 add(m2,k3,-epsilon,A22,lda,A21,lda,X,k3); // T5 add(k3,n2,epsilon,B12,ldb,B11,ldb,Y,n2); // P5 (in V) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); // C22 += (P5-P2)/e subscalacc(NoField,m2,n2,V,n2,C12,ldc,(double)1/epsilon,C22,ldc); // C12 = (P4-P2)/e subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C12,ldc); // S9 add(m2,k3,epsilon,A22,lda,A23,lda,X,k3); // T9 add(k3,n2,epsilon,B31,ldb,B21,ldb,Y,n2); // P9 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // C22 -= P9 FFLAS::fsubin(NoField,m2,n2,U,n2,C22,ldc); // P7 (in C21) gemm_bini_232_2(F,m2,n2,k3,A23,lda,B21,ldb,C21,ldc,rec-1,epsilon); // C11 = C11 - P7/e add(m2,n2,-(double)1/epsilon,C21,ldc,C11,ldc,C11,ldc); // C21 = (P9-P7)/e subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C21,ldc); // C21 += P5 
FFLAS::faddin(NoField,m2,n2,V,n2,C21,ldc); // S10 add(m2,k3,-epsilon,A12,lda,A13,lda,X,k3); // T10 add(k3,n2,epsilon,B31,ldb,B32,ldb,Y,n2); // P10 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // C12 += P10 FFLAS::faddin(NoField,m2,n2,U,n2,C12,ldc); // C11 += P10/e add(m2,n2,(double)1/epsilon,U,n2,C11,ldc,C11,ldc); FFLAS::fflas_delete( X ); FFLAS::fflas_delete( Y ); FFLAS::fflas_delete( U ); FFLAS::fflas_delete( V ); return C; } template double * gemm_bini_232_3_acc(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { if (rec != 0) exit(-1); Givaro::DoubleDomain R; FFLAS::fgemm(R, FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, m,n,k, 1, A,lda, B,ldb, 1, C, ldc); } template double * gemm_bini_232_3(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { Givaro::ZRing NoField; // const double p = (double)F.characteristic(); size_t M = (n>m)?std::min(k,m):std::min(k,n); // Field G(p*p); if ( M < TRE || rec <= 0) { // std::cout << "ffw" << std::endl; return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); } assert(k/3*3==k); // k divisible par 3 assert(n/2*2==n); // n divisible par 2 assert(m/2*2==m); // m divisible par 2 // std::cout << "tested" << std::endl; size_t n2 = n/2; size_t k3 = k/3; size_t m2 = m/2; // std::cout << "€ = " << epsilon << std::endl; // sub matrices in B const double * B11 = B; const double * B12 = B +n2; const double * B21 = B +ldb*k3; const double * B22 = B21 +n2; const double * B31 = B21 +ldb*k3; const double * B32 = B31 +n2; // sub matrices in C double * C11 = C; double * C12 = C +n2; double * C21 = C +ldc*m2; double * C22 = C21 +n2; // sub matrices in A const double 
* A11 = A; const double * A12 = A +k3; const double * A13 = A +2*k3; const double * A21 = A +lda*m2; const double * A22 = A21 +k3; const double * A23 = A21 +2*k3; FFLAS::fzero(F,m,n,C,ldc); /* * Algo : * * S1 := A11 +A22*e; * S3 := -(A11+A21); * S4 := A11+A12*e; * S5 := A21 - A22*e; * S6 := A12*e + A23; * S8 := -(A13+A23): * S9 := A22*e + A23; * S10 := -A12*e+A13; * * T1 := B11 +B22; * T4 := e*B12+B22; * T5 := B11 +e*B12; * T6 := B21 +B32; * T9 := B21 + e*B31; * T10 := e*B31 +B32; * * P1 := Bini232(S1,T1 ,e); * P2 := Bini232(A11,B22 ,e); * P3 := Bini232(S3,B11,e); * P4 := Bini232(S4,T4 ,e); * P5 := Bini232(S5,T5 ,e); * P6 := Bini232(S6,T6 ,e); * P7 := Bini232(A23,B21 ,e); * P8 := Bini232(S8,B32,e); * P9 := Bini232(S9,T9 ,e); * P10:= Bini232(S10,T10,e); * * * C11 := evalm(P1-P4+(P6-P7+P8+P10)/e); * C12 := evalm((-P2+P4)/e+P10) ; * C21 := evalm(P5+(-P7+P9)/e) ; * C22 := evalm((P1-P2+P3+P5)/e+P6-P9); * */ // could be just one band for the scalings double * U = FFLAS::fflas_new(m2*n2); double * V = FFLAS::fflas_new(std::max(k3,m2)*n2); double * X = FFLAS::fflas_new(m2*k3); double * Y = FFLAS::fflas_new(k3*n2); // S1 double * eA22 = FFLAS::fflas_new(std::max(m2,n2)*k3); FFLAS::fscal(NoField,m2,k3,epsilon,A22,lda,eA22,k3); FFLAS::fadd(NoField,m2,k3,eA22,k3,A11,lda,X,k3); // T1 FFLAS::fadd(NoField,k3,n2,B11,ldb,B22,ldb,Y,n2); // P1 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // S3 negadd(m2,k3,A11,lda,A21,lda,X,k3); // P3 (in V) gemm_bini_232_2(F,m2,n2,k3,X,k3,B11,ldb,V,n2,rec-1,epsilon); // C22 = (P1+P3)/e addscal(NoField,m2,n2,U,n2,V,n2,(double)1/epsilon,C22,ldc); // S6 double * eA12 = FFLAS::fflas_new(m2*k3); FFLAS::fscal(NoField,m2,k3,epsilon,A12,lda,eA12,k3); FFLAS::fadd(NoField,m2,k3,eA12,k3,A23,lda,X,k3); // T6 FFLAS::fadd(NoField,k3,n2,B21,ldb,B32,ldb,Y,n2); // P6 (in V) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); // C22 += P6 FFLAS::faddin(NoField,m2,n2,V,n2,C22,ldc); // S8 negadd(m2,k3,A13,lda,A23,lda,X,k3); // P8 (in C11) 
gemm_bini_232_2(F,m2,n2,k3,X,k3,B32,ldb,C11,ldc,rec-1,epsilon); // C11 = (P8+P6)/e addscalinf(NoField,m2,n2,V,n2,(double)1/epsilon,C11,ldc); // C11 += P1 FFLAS::faddin(NoField,m2,n2,U,n2,C11,ldc); // S4 FFLAS::fadd(NoField,m2,k3,eA12,k3,A11,lda,X,k3); // T4 double * eB12 = V ; // FFLAS::fflas_new(n2*k3); FFLAS::fscal(NoField,k3,n2,epsilon,B12,ldb,eB12,n2); FFLAS::fadd(NoField,k3,n2,eB12,n2,B22,ldb,Y,n2); // P4 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // C11 -= P4 FFLAS::fsubin(NoField,m2,n2,U,n2,C11,ldc); // P2 (in C12) gemm_bini_232_2(F,m2,n2,k3,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon); // S5 FFLAS::fsub(NoField,m2,k3,A21,lda,eA22,k3,X,k3); // T5 FFLAS::fadd(NoField,k3,n2,eB12,n2,B11,ldb,Y,n2); // FFLAS::fflas_delete( eB12); // P5 (in V) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); // C22 += (P5-P2)/e subscalacc(NoField,m2,n2,V,n2,C12,ldc,(double)1/epsilon,C22,ldc); // C12 = (P4-P2)/e subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C12,ldc); // S9 FFLAS::fadd(NoField,m2,k3,eA22,k3,A23,lda,X,k3); double * eB31 = eA22 ; FFLAS::fscal(NoField,k3,n2,epsilon,B31,ldb,eB31,n2); // T9 FFLAS::fadd(NoField,k3,n2,eB31,n2,B21,ldb,Y,n2); // P9 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // C22 -= P9 FFLAS::fsubin(NoField,m2,n2,U,n2,C22,ldc); // P7 (in C21) gemm_bini_232_2(F,m2,n2,k3,A23,lda,B21,ldb,C21,ldc,rec-1,epsilon); // C11 = C11 - P7/e add(m2,n2,-(double)1/epsilon,C21,ldc,C11,ldc,C11,ldc); // C21 = (P9-P7)/e subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C21,ldc); // C21 += P5 FFLAS::faddin(NoField,m2,n2,V,n2,C21,ldc); // S10 FFLAS::fsub(NoField,m2,k3,A13,lda,eA12,k3,X,k3); FFLAS::fflas_delete( eA12); // T10 FFLAS::fadd(NoField,k3,n2,eB31,n2,B32,ldb,Y,n2); FFLAS::fflas_delete( eA22); // P10 (in U) gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); // C12 += P10 FFLAS::faddin(NoField,m2,n2,U,n2,C12,ldc); // C11 += P10/e add(m2,n2,(double)1/epsilon,U,n2,C11,ldc,C11,ldc); FFLAS::fflas_delete( X ); 
FFLAS::fflas_delete( Y ); FFLAS::fflas_delete( U ); FFLAS::fflas_delete( V ); return C; } #if 0 template double * gemm_bini_322_sqrt(const Field & F , const size_t m , const size_t n , const size_t k , const double *A , const size_t lda , const double *B , const size_t ldb , double *C , const size_t ldc , int rec , const double & epsilon ) { Givaro::ZRing NoField; // const double p = (double)F.characteristic(); size_t M = (n>m)?std::min(k,m):std::min(k,n); // std::cout << rec << ',' << M << std::endl; // Field G(p*p); if ( M < TRE || rec <= 0) { // std::cout << "ffw" << std::endl; return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); } assert(k/2*2==k); // k divisible par 2 assert(n/3*3==n); // n divisible par 2 assert(m/2*2==m); // m divisible par 3 // std::cout << "tested" << std::endl; size_t m2 = m/2; size_t k2 = k/2; size_t n3 = n/3; // std::cout << "€ = " << epsilon << std::endl; // sub matrices in A const double * A11 = A; const double * A12 = A +k2; const double * A21 = A +lda*m2; const double * A22 = A21 +k2; // sub matrices in C double * C11 = C; double * C12 = C +n3; double * C13 = C +2*n3; double * C21 = C +ldc*m2; double * C22 = C21 +n3; double * C23 = C21 +2*n3; // sub matrices in B const double * B11 = B; const double * B12 = B +n3; const double * B13 = B +2*n3; const double * B21 = B +ldb*k2; const double * B22 = B21 +n3; const double * B23 = B21 +2*n3; FFLAS::fzero(F,m,n,C,ldc); /* * Algo : * S1 := B11 +B22; * S4 := e*B21+B22; * S5 := B11 +e*B21; * S6 := B12 +B23; * S9 := B12 +e*B13; * S3 := e*B13+B23; * * T1 := e*A11 +A22; * T2 := A12 +A22; * T4 := -e*A11+A12; * T5 := e*A21 +A22; * T6 := A11 +e*A22; * T7 := A11 +A21; * T9 := A21 -e*A22; * T3 := A11 +e*A12; * * P1 := S1 *T1; * P2 := T2 * B22; * P10 := A22 * B11; * P4 := S4 *T4; * P5 := S5 *T5; * P6 := S6 *T6; * P7 := T7*B12; * P8 := A11*B23; * P9 := S9 *T9; * P3 := S3*T3; * * C11 := (P1-P2-P10+P4)/e; * C21 := (P10-P5)/(-e) ; * C12 := P4+P6-P3 
; * C22 := P1-P5+P9; * C13 := (-P8+P3)/e; * C23 := (P6-P7-P8+P9)/e; * */ // P10 gemm_bini_223_mem(F,m2,n3,k2,A22,lda,B11,ldb,C11,ldc,rec-1,epsilon); // S5 double * Y = FFLAS::fflas_new(k2*n3); add(k2,n3,epsilon,B21,ldb,B11,ldb,Y,n3); // T5 double * X = FFLAS::fflas_new(m2*k2); add(m2,k2,epsilon,A21,lda,A22,lda,X,k2); // P5 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C22,ldc,rec-1,epsilon); // C12 subscal(NoField,m2,n3,C22,ldc,C11,ldc,(double)1/epsilon,C21,ldc); // T2 FFLAS::fadd(NoField,m2,k2,A12,lda,A22,lda,X,k2); // P2 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B22,ldb,C13,ldc,rec-1,epsilon); // C11 FFLAS::faddin(NoField,m2,n3,C13,ldc,C11,ldc); // S1 FFLAS::fadd(NoField,k2,n3,B11,ldb,B22,ldb,Y,n3); // T1 add(m2,k2,epsilon,A11,lda,A22,lda,X,k2); // P1 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); // C22 FFLAS::fsub(NoField,m2,n3,C12,ldc,C22,ldc,C22,ldc); // C11 FFLAS::fsub(NoField,m2,n3,C12,ldc,C11,ldc,C11,ldc); // S4 add(k2,n3,epsilon,B21,ldb,B22,ldb,Y,n3); // T4 add(m2,k2,-epsilon,A11,lda,A12,lda,X,k2); // P4 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); // C11 addscalinf(NoField,m2,n3,C12,ldc,(double)1/epsilon,C11,ldc); // S9 add(k2,n3,epsilon,B13,ldb,B12,ldb,Y,n3); // T9 add(m2,k2,-epsilon,A22,lda,A21,lda,X,k2); // P9 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C23,ldc,rec-1,epsilon); // C22 FFLAS::faddin(NoField,m2,n3,C23,ldc,C22,ldc); // S6 FFLAS::fadd(NoField,k2,n3,B12,ldb,B23,ldb,Y,n3); // T6 add(m2,k2,epsilon,A22,lda,A11,lda,X,k2); // P6 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); // C21 FFLAS::faddin(NoField,m2,n3,C13,ldc,C12,ldc); // C32 FFLAS::faddin(NoField,m2,n3,C13,ldc,C23,ldc); // T7 FFLAS::fadd(NoField,m2,k2,A11,lda,A21,lda,X,k2); // P7 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B12,ldb,C13,ldc,rec-1,epsilon); // if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m2,n3,C31,ldc);} // C32 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C23,ldc); // S3 add(k2,n3,epsilon,B13,ldb,B23,ldb,Y,n3); // T3 add(m2,k2,epsilon,A12,lda,A11,lda,X,k2); // 
P3 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); FFLAS::fflas_delete( Y ); FFLAS::fflas_delete( X ); // C21 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C12,ldc); // P8 Y = FFLAS::fflas_new(m2*n3); gemm_bini_223_mem(F,m2,n3,k2,A11,lda,B23,ldb,Y,n3,rec-1,epsilon); // C31 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C13,ldc); // C32 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C23,ldc); FFLAS::fflas_delete( Y ); return C; } #endif } // Rec } // Protected } // FFLAS namespace FFLAS { namespace Protected { template typename Field::Element * gemm_bini_p(const Field &F , const size_t m , const size_t n , const size_t k , const typename Field::Element *A , const size_t lda , const typename Field::Element *B , const size_t ldb , typename Field::Element *C , const size_t ldc , int rec , size_t algo ) { assert(k/6*6==k); // k divisible par 6 assert(n/6*6==n); // n divisible par 6 assert(m/6*6==m); // m divisible par 6 // e-formule double epsilon = (double) F.characteristic() ; switch(algo) { case 0 : Rec::gemm_bini_322_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); FFLAS::finit_fuzzy(F,m,n,C,ldc); // FFLAS::finit(F,m,n,C,ldc); break; case 1 : Rec::gemm_bini_322_0(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); FFLAS::finit_fuzzy(F,m,n,C,ldc); // FFLAS::finit(F,m,n,C,ldc); break; case 2 : Rec::gemm_bini_322_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); FFLAS::finit_fuzzy(F,m,n,C,ldc); break; case 3 : Rec::gemm_bini_223_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); FFLAS::finit_fuzzy(F,m,n,C,ldc); // FFLAS::finit(F,m,n,C,ldc); break; case 4 : Rec::gemm_bini_232_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); FFLAS::finit_fuzzy(F,m,n,C,ldc); break; case 5 : Rec::gemm_bini_232_3(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); FFLAS::finit_fuzzy(F,m,n,C,ldc); break; #if 0 case 8 : { double epsilon2 = sqrt((double)epsilon); std::cout << epsilon2 << std::endl; Rec::gemm_bini_322_sqrt(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon2); // FFLAS::finit_fuzzy(F,m,n,C,ldc); for(size_t i = 0 ; i < m ; ++i) { 
for(size_t j = 0 ; j < n ; ++j) C[i*ldc+j] = rint(fmod(C[i*ldc+j],epsilon2)); } break; } #endif default : std::cout << " not an algo :" << algo << std::endl;; exit(-1); } return C; } template typename Field::Element * gemm_bini_e(const Field &F , const size_t m , const size_t n , const size_t k , const typename Field::Element *A , const size_t lda , const typename Field::Element *B , const size_t ldb , typename Field::Element *C , const size_t ldc , int rec , size_t algo ) { assert(k/2*2==k); // k divisible par 2 assert(n/2*2==n); // n divisible par 2 assert(m/3*3==m); // m divisible par 3 // e-formule double epsilon = 1./(1<<27); switch(algo) { case 0 : Rec::gemm_bini_322_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); break; case 1 : Rec::gemm_bini_322_0(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); break; case 2 : Rec::gemm_bini_322_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); break; case 3 : Rec::gemm_bini_223_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); break; case 4 : Rec::gemm_bini_232_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); break; case 5 : Rec::gemm_bini_232_3(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); break; default : std::cout << " not an algo :" << algo << std::endl;; exit(-1); } // vire les e. 
// FFLAS::finit_fuzzy(F,m,n,C,ldc); FFLAS::finit_fuzzy(F,m,n,C,ldc); return C; } template typename Field::Element * gemm_compress(const Field &F , const size_t m , const size_t n , const size_t k , const typename Field::Element *A , const size_t lda , const typename Field::Element *B , const size_t ldb , typename Field::Element *C , const size_t ldc , int rec , size_t algo ) { assert(k/6*6==k); // k divisible par 6 assert(n/6*6==n); // n divisible par 6 assert(m/6*6==m); // m divisible par 6 switch(algo) { case 0 : fgemm_compressed(F,(int)m,(int)n,(int)k,A,(int)lda,B,(int)ldb,C,(int)ldc); FFLAS::freduce(F,m,n,C,ldc); break; case 1 : fgemm_compressed(F,(int)m,(int)n,(int)k,A,(int)lda,B,(int)ldb,C,(int)ldc); FFLAS::freduce(F,m,n,C,ldc); break; default : std::cout << " not an algo :" << algo << std::endl;; exit(-1); } return C; } } // Protected } // FFLAS template void check_equal(const Field & F,int m,int n, typename Field::Element * D,int ldd, typename Field::Element * E,int lde, const char * nomalgo, const char * madescr, int & ok_p) { int faux = 0 ; for (int i = 0 ; i < m ; ++i) { for (int j = 0 ; j < n ; ++j) { if (!F.areEqual(D[i*ldd+j],E[i*lde+j])) { ++faux ; } } } if (faux) { std::cout << nomalgo << " " << madescr << " : bad/all = " << faux << '/' << m*n << " ~~ " << (double)faux/(double)(m*n) << std::endl; } else ok_p ++ ; #if 1 if (faux && (n<20)) { std::cout << "OK" << std::endl; for (int i = 0 ; i < m ; ++i) { for (int j = 0 ; j < n ; ++j) std::cout << D[i*ldd+j] << ' '; std::cout << std::endl; } std::cout << "KO" << std::endl; for (int i = 0 ; i < m ; ++i) { for (int j = 0 ; j < n ; ++j) std::cout << E[i*lde+j] << ' '; std::cout << std::endl; } std::cout << "Diff" << std::endl; for (int i = 0 ; i < m ; ++i) { for (int j = 0 ; j < n ; ++j) std::cout << D[i*ldd+j]-E[i*lde+j] << ' '; std::cout << std::endl; } } #endif } template void test_algos(const Field &F, int m, int n, int k , const typename Field::Element * A, int lda , const typename Field::Element * 
B, int ldb , int r , time_v & tim_k, time_v & tim_e , time_v & tim_p , int_v & ok_k, int_v & ok_e, int_v & ok_p , FFLAS::Timer & tim_wd, int & nb_wd , bool with_e , bool with_k ) { FFLAS::Timer tmp ; typedef typename Field::Element Element; Element * D = FFLAS::fflas_new(m*n); Element * C = FFLAS::fflas_new(m*n); tmp.clear();tmp.start(); fgemm(F,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, m,n,k, 1, A,k, B,n, 0, D, n); tmp.stop(); tim_wd += tmp ; nb_wd ++; /* bini_p */ if (with_e) { for (int algo = 0 ; algo < algos ; ++algo) { tmp.clear();tmp.start(); FFLAS::Protected::gemm_bini_e(F,m,n,k,A,k,B,n,C,n,r,selec[algo]); tmp.stop(); tim_e[algo] += tmp ; /* checking */ check_equal(F,m,n,D,n,C,n,"bini_e",descr[algo],ok_e[algo]) ; } } /* compress */ if (with_k && std::is_same::category,FFLAS::FieldCategories::ModularTag>::value && (! FieldTraits::balanced)) { for (int algo = 0 ; algo < algos_k ; ++algo) { tmp.clear();tmp.start(); FFLAS::Protected::gemm_compress(F,m,n,k,A,k,B,n,C,n,r,selec_k[algo]); tmp.stop(); tim_k[algo] += tmp ; /* checking */ check_equal(F,m,n,D,n,C,n,"compress",descr_k[algo],ok_k[algo]) ; } } /* bini_p */ for (int algo = 0 ; algo < algos ; ++algo) { tmp.clear();tmp.start(); FFLAS::Protected::gemm_bini_p(F,m,n,k,A,k,B,n,C,n,r,selec[algo]); tmp.stop(); tim_p[algo] += tmp ; /* checking */ check_equal(F,m,n,D,n,C,n,"bini_p",descr[algo],ok_p[algo]) ; } FFLAS::fflas_delete(C); FFLAS::fflas_delete(D); } template struct changeField { typedef T other ; }; template<> struct changeField > { typedef Givaro::Modular other; }; template<> struct changeField > { typedef ModularBalanced other; }; double descrip(int algo, int_v & ok_e, time_v & tim_e, int iters, const char ** madescr, const char * nom) { int min_e = -1 ; double bini_e = -1 ; for (int i = 0 ; i < algo ; ++i){ if (ok_e[i] == (int)iters) { double bini1 = tim_e[i].usertime()/(double)ok_e[i] ; if (bini_e < 0) { bini_e = bini1; min_e = (int) i ; } else if (bini1 < bini_e) { min_e = (int)i ; bini_e = bini1 ; } } } 
for (int i = 0 ; i < algo ; ++i){ if (ok_e[i] == (int)iters) { double bini1 = tim_e[i].usertime()/(double)ok_e[i] ; std::cout << nom << " ( " << madescr[i] << " ) : " ; if ((int)i == min_e) std::cout << " * " ; else std::cout << " "; std::cout << bini1 << 's'<< std::endl; } } return bini_e ; } template void test(int m, int k, int n, int p, int r, bool with_e, bool with_k, int iters = 4) { typedef typename Field::Element Element; Element * A = FFLAS::fflas_new(m*k); Element * B = FFLAS::fflas_new(n*k); Field F(p); F.write(std::cout<< " * Field " ) << std::endl; typedef typename changeField::other Field_f ; typedef typename Field_f::Element Element_f ; Field_f F_f(p); Element_f * A_f = FFLAS::fflas_new(m*k); Element_f * B_f = FFLAS::fflas_new(n*k); Element_f * C_f = FFLAS::fflas_new(m*n); #if defined(NOTRANDOM) int i0 ; int j0 ; Element p2 ; F.init(p2,(int)F.mOne/2); std::cout << p2 << std::endl; #warning "not random" for (int i = 0 ; i < m ; ++i) for (int j = 0 ; j < k ; ++j) { i0 = i/(m/3); j0 = j/(k/2); if (i0 == 0 and j0 == 0) A[i*k+j] = F.mOne ; else if (i0 == 0 and j0 == 1) A[i*k+j] = F.zero ; else if (i0 == 1 and j0 == 0) A[i*k+j] = F.mOne ; else if (i0 == 1 and j0 == 1) A[i*k+j] = F.mOne ; else if (i0 == 2 and j0 == 0) A[i*k+j] = F.mOne ; else if (i0 == 2 and j0 == 1) A[i*k+j] = F.mOne ; else A[i*k+j] = F.mOne ; } for (int i = 0 ; i < k ; ++i) for (int j = 0 ; j < n ; ++j) { i0 = i/(k/2); j0 = j/(n/2); if (i0 == 0 and j0 == 0) B[i*n+j] = F.mOne ; else if (i0 == 0 and j0 == 1) B[i*n+j] = F.mOne ; else if (i0 == 1 and j0 == 0) B[i*n+j] = F.mOne ; else if (i0 == 1 and j0 == 1) B[i*n+j] = F.zero ; else B[i*n+j] = F.mOne ; } #endif time_v tim_e(algos), tim_p(algos), tim_k(algos_k); FFLAS::Timer tim_wd; tim_wd.clear(); FFLAS::Timer tim_wf; tim_wf.clear(); FFLAS::Timer tmp; for (int i = 0 ; i < algos ; ++i) { tim_e[i].clear(); tim_p[i].clear(); } for (int i = 0 ; i < algos_k ; ++i) { tim_k[i].clear(); } int_v ok_p(algos,0), ok_e(algos,0), ok_k(algos_k,0); int nb_wd 
= 0 , nb_wf = 0 ; for (int b = 0 ; b < iters ; ++b) { std::cout << "iter " << b+1 << " of " << iters << std::endl; #if not defined(NOTRANDOM) FFPACK::RandomMatrix(F,A,m,k,k); FFPACK::RandomMatrix(F,B,k,n,n); #endif FFLAS::finit(F_f,m,k,A,k,A_f,k); FFLAS::finit(F_f,k,n,B,n,B_f,n); tmp.clear();tmp.start(); fgemm(F_f,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, m,n,k, 1, A_f,k, B_f,n, 0, C_f, n); tmp.stop(); tim_wf += tmp ; nb_wf ++ ; test_algos(F,m,n,k,A,k,B,n,r, tim_k,tim_e,tim_p, ok_k,ok_e,ok_p, tim_wd,nb_wd, with_e,with_k); } std::cout << std::endl << "results" << std::endl; double bini_e = descrip(algos,ok_e,tim_e,iters,descr,"Bini_e"); double bini_p = descrip(algos,ok_p,tim_p,iters,descr,"Bini_p"); double bini_k = descrip(algos_k,ok_k,tim_k,iters,descr_k,"Bini_k"); double t_wd = tim_wd.usertime()/(double)(nb_wd); double t_wf = tim_wf.usertime()/(double)(nb_wf); std::cout << "Wino d : " << t_wd << 's'<< std::endl; std::cout << "Wino f : " << t_wf << 's'<< std::endl; double wino = std::min(t_wd,t_wf) ; if (bini_e>=0) std::cout << "Gain e: " << ((bini_e-wino)/wino)*100 << '%' << std::endl; if (bini_p>=0) std::cout << "Gain p: " << ((bini_p-wino)/wino)*100 << '%' << std::endl; if (bini_k>=0) std::cout << "Gain k: " << ((bini_k-wino)/wino)*100 << '%' << std::endl; FFLAS::fflas_delete( A ); FFLAS::fflas_delete( B); FFLAS::fflas_delete( A_f ); FFLAS::fflas_delete( B_f); FFLAS::fflas_delete( C_f); } int main(int ac, char **av) { static int m = 36 ; static int n = 12 ; static int k = 18 ; static int p = 101; bool eps = false ; bool kom = false ; int r = 1 ; int seed = (int) time(NULL); int iters = 4; static Argument as[] = { { 'p', "-p P", "Set the field characteristic.", TYPE_INT , &p }, { 'n', "-n N", "Set the number of cols in C.", TYPE_INT , &n }, { 'm', "-m N", "Set the number of rows in C.", TYPE_INT , &m }, { 'k', "-k N", "Set the number of rows in B.", TYPE_INT , &k }, { 'r', "-k N", "Set the recursive number Bini.", TYPE_INT , &r }, { 'i', "-i N", "Set the numebr of 
iterations.", TYPE_INT , &iters }, { 's', "-s N", "Set the seed .", TYPE_INT , &seed }, { 'e', "-e " , "epsilon .", TYPE_NONE , &eps }, { 'c', "-c " , "compress .", TYPE_NONE , &kom}, END_OF_ARGUMENTS }; FFLAS::parseArguments(ac,av,as); srand(seed); srand48(seed); std::cout << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl; std::cout << "size: " << m << ',' << k << ',' << n << std::endl; std::cout << "mod : " << p << std::endl; std::cout << "rec : " << r << std::endl; std::cout << "seed: " << seed << std::endl; std::cout << "thre: " << TRE << std::endl; std::cout << "=====================================================" << std::endl; test > (m,k,n,p,r,eps,kom,iters); std::cout << "=====================================================" << std::endl; test > (m,k,n,p,r,eps,kom,iters); std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; return 0; } /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s fflas-ffpack-2.2.2/tests/test-charpoly-check.C000066400000000000000000000064341274716147400212030ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Ashley Lesdalons * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ //-------------------------------------------------------------------------- // Test for Checker_charpoly //-------------------------------------------------------------------------- #define ENABLE_ALL_CHECKINGS 1 #include #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/args-parser.h" template void printPolynomial (const Field &F, Polynomial &v) { for (int i = v.size() - 1; i >= 0; i--) { F.write (std::cout, v[i]); if (i > 0) std::cout << " x^" << i << " + "; } std::cout << std::endl; } int main(int argc, char** argv) { srand (time(NULL)); typedef Givaro::ModularBalanced Field; Givaro::Integer q = 131071; size_t iter = 3; size_t MAXN = 100; size_t n = 0; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'n', "-n N", "Set the size of the matrix.", TYPE_INT , &n }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); Field F(q); Field::RandIter Rand(F); typedef std::vector Polynomial; size_t pass = 0; for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for charpoly // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- #define ENABLE_ALL_CHECKINGS 1 #include #include #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/field/modular-positive.h" // #include "fflas-ffpack/field/modular-int.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; //typedef ModularBalanced Field; typedef Givaro::ModularBalanced Field; //typedef Givaro::Modular Field; //typedef Givaro::Modular Field; //typedef Givaro::Modular Field; //typedef Givaro::Modular Field; typedef vector Polynomial; using namespace FFPACK; template void printPolynomial (const Field &F, const Polynomial &v) { for (int i = v.size () - 1; i >= 0; i--) { F.write (cout, v[i]); if (i > 0) cout << " x^" << i << " + "; } cout << endl; } template bool launch_test(const Field & F, typename Field::Element * A, int n, size_t p, size_t nbit, FFPACK::FFPACK_CHARPOLY_TAG CT) { FFLAS::Timer tim,t; t.clear();tim.clear(); list P_list; for(size_t i = 0;i::iterator P_it = P_list.begin(); for (;P_it!=P_list.end();++P_it) printPolynomial ( F, *P_it); F.write(cerr<<"n = "<(F,const_cast(filestring),&n,&n); bool passed = launch_test(F,A,n,p,nbit,CT); 
FFLAS::fflas_delete( A); return !passed ; } else { std::cerr << std::endl << "##################################"<< std::endl; std::cerr << std::endl << " **** not implemented yet ! ***" << std::endl; std::cerr << std::endl << "##################################"<< std::endl; // create A random // create A diagonal // create A nilpotent // create A non invertible return false ; } } fflas-ffpack-2.2.2/tests/test-colechelon.C000066400000000000000000000124571274716147400204240ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s // /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for the column echelon factorisation //-------------------------------------------------------------------------- // usage: test-colechelon p A n, for n computations // of A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 1 // Debug option 0: no debug // 1: check A = LQUP //------------------------------------------------------------------------- using namespace std; //#define __LUDIVINE_CUTOFF 1 #include #include #include "Matio.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK ; typedef Givaro::Modular Field; int main(int argc, char** argv){ cerr< "<(n); size_t *Q = FFLAS::fflas_new(m); // size_t cutoff = atoi(argv[3]); nbf = atoi(argv[3]); FFLAS::Timer tim,timc; timc.clear(); for ( i=0;i(m*n); Field::Element * U = FFLAS::fflas_new(n*n); Field::Element * X = FFLAS::fflas_new(m*n); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); for (int i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for the krylov-elimination //-------------------------------------------------------------------------- // usage: test-krylov-elim p A, to compute the rank profile of the (n+m)xn matrix B // formed by the n identity vectors and the mxn matrix A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 0 #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include "Matio.h" using namespace std; typedef Givaro::Modular Field; //! 
@bug does not belong here template std::ostream& printvect(std::ostream& o, vector& vect){ for(size_t i=0; i < vect.size(); ++i) o << vect[i] << " " ; return o << std::endl; } int main(int argc, char** argv) { static Argument as[] = { END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); // int m,n; Field F(65521); size_t N = 17; double * A = FFLAS::fflas_new(N*N); double * tmp = FFLAS::fflas_new(N*N); size_t * deg = FFLAS::fflas_new(N); for (size_t i=0; i<(size_t)N*N; ++i) A[i] = 0; for (size_t i=0; i<3; ++i) A[i+i*N] = 1; for (size_t i=3; i<6; ++i) A[i+1+i*N] = 1; for (size_t i=6; i<9; ++i) A[i+2+i*N] = 1; A[12+9*N] = 1; A[13+10*N] = 1; A[14+12*N] = 1; A[15+15*N] = 1; A[16+16*N] = 1; deg[0] = 4; deg[1] = 4; deg[2] = 4;deg[3] = 2; deg[4] = 1; deg[5] =2; for (size_t i=0; i(N*N) ; FFLAS::fassign(F,N*N,A,1,B,1); // write_field(F, cerr, A, N, N, N); FFPACK::Protected::CompressRowsQK (F, N, A+9*N, N, tmp, N, deg+3, 4, 3 ); // write_field(F, cerr, A, N, N, N); FFPACK::Protected::DeCompressRowsQK (F, N, N-9, A+9*N, N, tmp, N, deg+3, 4, 3 ); // write_field(F, cerr, A, N, N, N); int ok = 0 ; for (size_t i = 0 ; i < (size_t)N * (size_t)N ; ++i) if (A[i] != B[i]) { ok = 1 ; break ; } FFLAS::fflas_delete( A ); FFLAS::fflas_delete( tmp) ; FFLAS::fflas_delete(deg) ; FFLAS::fflas_delete( B ); return ok ; } fflas-ffpack-2.2.2/tests/test-det.C000066400000000000000000000074231274716147400170620ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for det // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include "test-utils.h" #include "Matio.h" // using namespace std; template bool test_det(Field &F, size_t n, int iter) { typedef typename Field::Element Element; //! 
@todo test with stride Element * A = FFLAS::fflas_new(n*n); // A = read_field(F,argv[2],&n,&n); bool pass = true; #ifdef TIME_IT FFLAS::Timer tim,t; t.clear();tim.clear(); #endif Element d=0; Element dt=-4; for(int i = 0;i <" < Field; Field F(p); pass &= test_det(F,n,iters); // pass &= test_det(F,0,iters); return ((pass==true)?0:1); } fflas-ffpack-2.2.2/tests/test-echelon.C000066400000000000000000000321171274716147400177210ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for the echelon factorisation //-------------------------------------------------------------------------- //#define __LUDIVINE_CUTOFF 1 #define __FFLASFFPACK_SEQUENTIAL #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include "test-utils.h" #include "Matio.h" using namespace std; using namespace FFPACK; using Givaro::Modular; using Givaro::ModularBalanced; template bool test_colechelon(Field &F, size_t m, size_t n, size_t r, size_t iters, FFPACK::FFPACK_LU_TAG LuTag) { typedef typename Field::Element Element ; Element * A = FFLAS::fflas_new (F,m,n); Element * B = FFLAS::fflas_new (F,m,n); Element * L = FFLAS::fflas_new (F,m,n); Element * U = FFLAS::fflas_new (F,n,n); Element * X = FFLAS::fflas_new (F,m,n); size_t lda = n; //!@todo check lda size_t *P = FFLAS::fflas_new(n); size_t *Q = FFLAS::fflas_new(m); size_t R = (size_t)-1; bool pass=true; for (size_t l=0;l bool test_rowechelon(Field &F, size_t m, size_t n, size_t r, size_t iters, FFPACK::FFPACK_LU_TAG LuTag) { typedef typename Field::Element Element ; Element * A = FFLAS::fflas_new (F,m,n); Element * B = FFLAS::fflas_new (F,m,n); Element * L = FFLAS::fflas_new (F,m,m); Element * U = FFLAS::fflas_new (F,m,n); Element * X = FFLAS::fflas_new (F,m,n); size_t lda = n; //!@todo check lda size_t *P = FFLAS::fflas_new(m); size_t *Q = FFLAS::fflas_new(n); size_t R = (size_t)-1; bool pass=true; for (size_t l=0;l bool test_redrowechelon(Field &F, size_t m, size_t n, size_t r, size_t iters, FFPACK::FFPACK_LU_TAG LuTag) { typedef typename Field::Element Element ; Element * A = FFLAS::fflas_new (F,m,n); Element * B = FFLAS::fflas_new (F,m,n); Element * L = FFLAS::fflas_new (F,m,m); Element * U = FFLAS::fflas_new (F,m,n); Element * X = FFLAS::fflas_new (F,m,n); size_t lda = n; 
//!@todo check lda size_t *P = FFLAS::fflas_new(m); size_t *Q = FFLAS::fflas_new(n); size_t R = (size_t)-1; bool pass=true; for (size_t l=0;lwrite(std::cerr) << std::endl; #endif ok &= test_colechelon(*F,m,n,r,iters, FFPACK::FfpackSlabRecursive); std::cout<<"."; ok &= test_colechelon(*F,m,n,r,iters, FFPACK::FfpackTileRecursive); std::cout<<"."; ok &= test_redcolechelon(*F,m,n,r,iters, FFPACK::FfpackSlabRecursive); std::cout<<"."; ok &= test_redcolechelon(*F,m,n,r,iters, FFPACK::FfpackTileRecursive); std::cout<<"."; ok &= test_rowechelon(*F,m,n,r,iters, FFPACK::FfpackSlabRecursive); std::cout<<"."; ok &= test_rowechelon(*F,m,n,r,iters, FFPACK::FfpackTileRecursive); std::cout<<"."; ok &= test_redrowechelon(*F,m,n,r,iters, FFPACK::FfpackSlabRecursive); std::cout<<"."; ok &= test_redrowechelon(*F,m,n,r,iters, FFPACK::FfpackTileRecursive); std::cout<<"."; nbit--; if ( !ok ) std::cout << "FAILED "< >(q,b,m,n,r,iters); ok &= run_with_field >(q,b,m,n,r,iters); ok &= run_with_field >(q,b,m,n,r,iters); ok &= run_with_field >(q,b,m,n,r,iters); ok &= run_with_field >(q,b,m,n,r,iters); ok &= run_with_field >(q,b,m,n,r,iters); ok &= run_with_field >(q,b,m,n,r,iters); // ok &= run_with_field > >(q,b,m,n,r,iters); // BUG: not available yet (missing division in the field ok &= run_with_field >(q,b,m,n,r,iters); ok &= run_with_field >(q,(b?b:128_ui64),m/8+1,n/8+1,r/8+1,iters); } while (loop && ok); return !ok ; } fflas-ffpack-2.2.2/tests/test-echelon_old.C000066400000000000000000000124411274716147400205550ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for the echelon factorisation //-------------------------------------------------------------------------- // usage: test-echelon p A n, for n lsp factorization // of A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 1 // Debug option 0: no debug // 1: check A = LQUP //------------------------------------------------------------------------- using namespace std; //#define __LUDIVINE_CUTOFF 1 #include #include #include "Matio.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef Givaro::Modular Field; int main(int argc, char** argv){ cerr< "<(n); size_t *Q = FFLAS::fflas_new(m); // size_t cutoff = atoi(argv[3]); nbf = atoi(argv[3]); FFLAS::Timer tim,timc; timc.clear(); for ( i=0;i(m*n); Field::Element * U = FFLAS::fflas_new(n*n); Field::Element * X = FFLAS::fflas_new(m*n); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); for (int i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 
FFLAS-FFPACK * Written by : * Brice Boyer (briceboyer) * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ // #define SIMD_INT #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "Matio.h" #include "test-utils.h" #include "assert.h" template bool test_fadd(const Field & F, size_t m, size_t k, size_t n, bool timing) { typedef typename Field::Element T ; T * A = FFLAS::fflas_new(m*n); T * B = FFLAS::fflas_new(m*n); T * C = FFLAS::fflas_new(m*n); T * D = FFLAS::fflas_new(m*n); if (timing) std::cout << ">>>" << std::endl ; size_t iter = 3 ; FFLAS::Timer tim, tom, tam ; tim.clear() ; tom.clear() ; if (timing) F.write(std::cout << "Field ") << std::endl; for (size_t b = 0 ; b < iter ; ++b) { FFPACK::RandomMatrix(F,A,m,k,n); FFPACK::RandomMatrix(F,B,m,k,n); FFPACK::RandomMatrix(F,C,m,k,n); FFLAS::fassign(F,m,k,C,n,D,n); tam.clear();tam.start(); for (size_t i = 0 ; i < m ; ++i) for (size_t j = 0 ; j < k ; ++j) F.add(D[i*n+j],A[i*n+j],B[i*n+j]); tam.stop(); tim += tam ; tam.clear();tam.start(); 
FFLAS::fadd(F,m,k,A,n,B,n,C,n); tam.stop(); tom += tam ; #if 1 for (size_t i =0 ; i < m ; ++i) for (size_t j =0 ; j < k ; ++j) if (! F.areEqual(C[i*n+j],D[i*n+j])) { if (timing) std::cout << i << ',' << j << " : " << C[i*n+j] << "!= (ref)" << D[i*n+j] << std::endl; return false ; } #endif } if (timing) std::cout << "fadd (___): " << tim.usertime()/(double)iter << 's' << std::endl; if (timing) std::cout << "fadd (AVX): " << tom.usertime()/(double)iter << 's'<< std::endl; if (timing) std::cout << "<<<" << std::endl; FFLAS::fflas_delete( A ); FFLAS::fflas_delete( B); FFLAS::fflas_delete( C ); FFLAS::fflas_delete( D ); return true; } template bool test_faddin(const Field & F, size_t m, size_t k, size_t n, bool timing) { typedef typename Field::Element T ; T * A = FFLAS::fflas_new(m*n); T * C = FFLAS::fflas_new(m*n); T * D = FFLAS::fflas_new(m*n); if (timing) std::cout << ">>>" << std::endl ; if (timing) F.write(std::cout << "Field ") << std::endl; size_t iter = 3 ; FFLAS::Timer tim, tom, tam ; tim.clear() ; tom.clear() ; for (size_t b = 0 ; b < iter ; ++b) { FFPACK::RandomMatrix(F,A,m,k,n); FFPACK::RandomMatrix(F,C,m,k,n); FFLAS::fassign(F,m,k,C,n,D,n); tam.clear();tam.start(); for (size_t i = 0 ; i < m ; ++i) for (size_t j = 0 ; j < k ; ++j) F.addin(D[i*n+j],A[i*n+j]); tam.stop(); tim += tam ; tam.clear();tam.start(); FFLAS::faddin(F,m,k,A,n,C,n); tam.stop(); tom += tam ; #if 1 for (size_t i =0 ; i < m ; ++i) for (size_t j =0 ; j < k ; ++j) if (! 
F.areEqual(C[i*n+j],D[i*n+j])) { if (timing) std::cout << i << ',' << j << " : " << C[i*n+j] << "!= (ref)" << D[i*n+j] << std::endl; return false ; } #endif } if (timing) std::cout << "faddin (___): " << tim.usertime()/(double)iter << 's' << std::endl; if (timing) std::cout << "faddin (AVX): " << tom.usertime()/(double)iter << 's'<< std::endl; if (timing) std::cout << "<<<" << std::endl; FFLAS::fflas_delete( A ); FFLAS::fflas_delete( C ); FFLAS::fflas_delete( D ); return true; } template bool test_fsub(const Field & F, size_t m, size_t k, size_t n, bool timing) { typedef typename Field::Element T ; T * A = FFLAS::fflas_new(m*n); T * B = FFLAS::fflas_new(m*n); T * C = FFLAS::fflas_new(m*n); T * D = FFLAS::fflas_new(m*n); if (timing) std::cout << ">>>" << std::endl ; size_t iter = 3 ; FFLAS::Timer tim, tom, tam ; tim.clear() ; tom.clear() ; if (timing) F.write(std::cout << "Field ") << std::endl; for (size_t b = 0 ; b < iter ; ++b) { FFPACK::RandomMatrix(F,A,m,k,n); FFPACK::RandomMatrix(F,B,m,k,n); FFPACK::RandomMatrix(F,C,m,k,n); FFLAS::fassign(F,m,k,C,n,D,n); tam.clear();tam.start(); for (size_t i = 0 ; i < m ; ++i) for (size_t j = 0 ; j < k ; ++j) F.sub(D[i*n+j],A[i*n+j],B[i*n+j]); tam.stop(); tim += tam ; tam.clear();tam.start(); FFLAS::fsub(F,m,k,A,n,B,n,C,n); tam.stop(); tom += tam ; #if 1 for (size_t i =0 ; i < m ; ++i) for (size_t j =0 ; j < k ; ++j) if (! 
F.areEqual(C[i*n+j],D[i*n+j])) { if (timing) std::cout << i << ',' << j << " : " << C[i*n+j] << "!= (ref)" << D[i*n+j] << std::endl; return false ; } #endif } if (timing) std::cout << "fsub (___): " << tim.usertime()/(double)iter << 's' << std::endl; if (timing) std::cout << "fsub (AVX): " << tom.usertime()/(double)iter << 's'<< std::endl; if (timing) std::cout << "<<<" << std::endl; FFLAS::fflas_delete( A ); FFLAS::fflas_delete( B); FFLAS::fflas_delete( C ); FFLAS::fflas_delete( D ); return true; } template bool test_fsubin(const Field & F, size_t m, size_t k, size_t n, bool timing) { typedef typename Field::Element T ; T * A = FFLAS::fflas_new(m*n); T * C = FFLAS::fflas_new(m*n); T * D = FFLAS::fflas_new(m*n); if (timing) std::cout << ">>>" << std::endl ; if (timing) F.write(std::cout << "Field ") << std::endl; size_t iter = 3 ; FFLAS::Timer tim, tom, tam ; tim.clear() ; tom.clear() ; for (size_t b = 0 ; b < iter ; ++b) { FFPACK::RandomMatrix(F,A,m,k,n); FFPACK::RandomMatrix(F,C,m,k,n); FFLAS::fassign(F,m,k,C,n,D,n); tam.clear();tam.start(); for (size_t i = 0 ; i < m ; ++i) for (size_t j = 0 ; j < k ; ++j) F.subin(D[i*n+j],A[i*n+j]); tam.stop(); tim += tam ; tam.clear();tam.start(); FFLAS::fsubin(F,m,k,A,n,C,n); tam.stop(); tom += tam ; #if 1 for (size_t i =0 ; i < m ; ++i) for (size_t j =0 ; j < k ; ++j) if (! 
F.areEqual(C[i*n+j],D[i*n+j])) { if (timing) std::cout << i << ',' << j << " : " << C[i*n+j] << "!= (ref)" << D[i*n+j] << std::endl; return false ; } #endif } if (timing) std::cout << "fsubin (___): " << tim.usertime()/(double)iter << 's' << std::endl; if (timing) std::cout << "fsubin (AVX): " << tom.usertime()/(double)iter << 's'<< std::endl; if (timing) std::cout << "<<<" << std::endl; FFLAS::fflas_delete( A ); FFLAS::fflas_delete( C ); FFLAS::fflas_delete( D ); return true; } int main(int ac, char **av) { static size_t m = 300 ; static size_t n = 301 ; static size_t k = 300 ; static uint64_t p = 7; int seed = (int) time(NULL); static bool timing = false ; static Argument as[] = { { 'p', "-p P", "Set the field characteristic.", TYPE_INT , &p }, { 'n', "-n N", "Set the number of cols in C.", TYPE_INT , &n }, { 'm', "-m N", "Set the number of rows in C.", TYPE_INT , &m }, { 'k', "-k N", "Set the number of rows in B.", TYPE_INT , &k }, { 's', "-s N", "Set the seed .", TYPE_INT , &seed }, { 't', "-timing", "Output timings" , TYPE_NONE, &timing}, END_OF_ARGUMENTS }; FFLAS::parseArguments(ac,av,as); if (n < k) { std::cout << "Usage : m k n ; matrix of size m x k, lda is n" << std::endl; return -1 ; } srand(seed); srand48(seed); // std::cout << seed << std::endl; bool pass = true ; { /* fadd */ { Givaro::Modular F(p) ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::Modular F( (int32_t)p ) ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::ModularBalanced F((int32_t)p) ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fadd(F,m,k,n,timing); } #if 1 { Givaro::ZRing F ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fadd(F,m,k,n,timing); } { Givaro::ZRing F; 
pass &= test_fadd(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fadd(F,m,k,n,timing); } #endif } { /* faddin */ { Givaro::Modular F(p) ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::Modular F((int32_t)p) ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::ModularBalanced F((int32_t)p) ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_faddin(F,m,k,n,timing); } #if 1 { Givaro::ZRing F ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_faddin(F,m,k,n,timing); } { Givaro::ZRing F; pass &= test_faddin(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_faddin(F,m,k,n,timing); } #endif } { /* fsub */ { Givaro::Modular F(p) ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::Modular F((int32_t)p) ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::ModularBalanced F((int32_t)p) ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fsub(F,m,k,n,timing); } #if 1 { Givaro::ZRing F ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fsub(F,m,k,n,timing); } { Givaro::ZRing F; pass &= test_fsub(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fsub(F,m,k,n,timing); } #endif } { /* fsubin */ { Givaro::Modular F(p) ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= 
test_fsubin(F,m,k,n,timing); } { Givaro::Modular F((int32_t)p) ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::ModularBalanced F((int32_t)p) ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fsubin(F,m,k,n,timing); } #if 1 { Givaro::ZRing F ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::ZRing F; pass &= test_fsubin(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fsubin(F,m,k,n,timing); } #endif } return (pass?0:1) ; } /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s fflas-ffpack-2.2.2/tests/test-fgemm-check.C000066400000000000000000000064021274716147400204500ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Ashley Lesdalons * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ //-------------------------------------------------------------------------- // Test for Checker_fgemm //-------------------------------------------------------------------------- #define ENABLE_ALL_CHECKINGS 1 #include #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/args-parser.h" int main(int argc, char** argv) { srand (time(NULL)); typedef Givaro::Modular Field; Givaro::Integer q = 131071; size_t iter = 3; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); Field F(q); Field::RandIter Rand(F); FFLAS::FFLAS_TRANSPOSE ta,tb; size_t pass = 0; for (size_t i=0; i checker(F,m,n,k,beta,C,ldc); FFLAS::fgemm(F,ta,tb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); try { checker.check(ta,tb,alpha,A,lda,B,ldb,C); std::cout << "Verification successful\n"; pass++; } catch (FailureFgemmCheck &e) { std::cout << "Verification failed!\n"; } FFLAS::fflas_delete(A,B,C); } std::cout << pass << "/" << iter << " tests were successful.\n"; return 0; } fflas-ffpack-2.2.2/tests/test-fgemm.C000066400000000000000000000325711274716147400174030ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) the FFLAS-FFPACK group * Written by Clément Pernet * Brice Boyer (briceboyer) * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ // #ifndef NEWINO // #define NEWWINO // #endif // #define WINOTHRESHOLD 100 // #define OLD_DYNAMIC_PEELING //#define DEBUG 1 #define ENABLE_CHECKER_fgemm 1 #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "test-utils.h" #include "fflas-ffpack/utils/Matio.h" using namespace std; using namespace FFPACK; using Givaro::Modular; using Givaro::ModularBalanced; // checks that D = alpha . C + beta . 
A ^ta * B ^tb template bool check_MM(const Field & F, const typename Field::Element_ptr Cd, // c0 enum FFLAS::FFLAS_TRANSPOSE & ta, enum FFLAS::FFLAS_TRANSPOSE & tb, const size_t m, const size_t n, const size_t k, const typename Field::Element & alpha, const typename Field::Element_ptr A, const size_t lda, const typename Field::Element_ptr B, const size_t ldb, const typename Field::Element & beta, const typename Field::Element_ptr C, // res const size_t ldc ) { bool wrong = false; typedef typename Field::Element Element; typedef typename Field::Element_ptr Element_ptr; typedef typename Field::ConstElement_ptr ConstElement_ptr; Element tmp; ConstElement_ptr ail,blj; Element_ptr D = FFLAS::fflas_new (F,m,n); FFLAS::fassign(F,m,n,Cd,n,D,n); for (size_t i = 0; i < m; ++i) for (size_t j = 0; j < n; ++j){ F.mulin(*(D+i*n+j),beta); F.assign (tmp, F.zero); for ( size_t l = 0; l < k ; ++l ){ if ( ta == FFLAS::FflasNoTrans ) ail = A+i*lda+l; else ail = A+l*lda+i; if ( tb == FFLAS::FflasNoTrans ) blj = B+l*ldb+j; else blj = B+j*ldb+l; F.axpyin (tmp, *ail, *blj); } F.axpyin (*(D+i*n+j), alpha, tmp); if ( !F.areEqual( *(D+i*n+j), *(C+i*ldc+j) ) ) { wrong = true; } } if ( wrong ){ size_t ici = 20 ; std::cerr<<"FAIL"< bool launch_MM(const Field & F, const size_t m, const size_t n, const size_t k, const typename Field::Element alpha, const typename Field::Element beta, const size_t ldc, const size_t lda, enum FFLAS::FFLAS_TRANSPOSE ta, const size_t ldb, enum FFLAS::FFLAS_TRANSPOSE tb, size_t iters, int nbw, bool par, size_t b) { bool ok = true; typedef typename Field::Element_ptr Element_ptr; Element_ptr A ; Element_ptr B ; Element_ptr C = FFLAS::fflas_new (F,m,ldc); FFLASFFPACK_check(ldc >= n); FFLAS::fzero(F,m,n,C,ldc); Element_ptr D = FFLAS::fflas_new (F, m, n); for(size_t i = 0;i= k); A = FFLAS::fflas_new (F, m, lda); FFLAS::fzero(F,m,lda,A,lda); RandomMatrix(F,A,m,k,lda,b); } else { FFLASFFPACK_check(lda >= m); A = FFLAS::fflas_new (F, k, lda); FFLAS::fzero(F,k,lda,A,lda); 
RandomMatrix(F,A,k,m,lda,b); } if (tb == FFLAS::FflasNoTrans) { FFLASFFPACK_check(ldb >= n); B = FFLAS::fflas_new (F,k,ldb); FFLAS::fzero(F,k,ldb,B,ldb); RandomMatrix(F,B,k,n,ldb,b); } else { FFLASFFPACK_check(ldb >= k); B = FFLAS::fflas_new (F,n,ldb); FFLAS::fzero(F,n,ldb,B,ldb); RandomMatrix(F,B,n,k,ldb,b); } RandomMatrix(F,C,m,n,ldc,b); FFLAS::fassign(F,m,n,C,ldc,D,n); if (par){ FFLAS::MMHelper::value, FFLAS::ParSeqHelper::Parallel > WH (F, nbw); PAR_BLOCK{ FFLAS::fgemm (F, ta, tb,m,n,k,alpha, A,lda, B,ldb, beta,C,ldc,WH); } }else{ FFLAS::MMHelper::value> WH(F,nbw,FFLAS::ParSeqHelper::Sequential()); FFLAS::fgemm (F, ta, tb,m,n,k,alpha, A,lda, B,ldb, beta,C,ldc,WH); } ok &= check_MM(F, D, ta, tb,m,n,k,alpha, A,lda, B,ldb, beta,C,ldc); FFLAS::fflas_delete(A); FFLAS::fflas_delete(B); if (!ok) break; } FFLAS::fflas_delete (C); FFLAS::fflas_delete (D); return ok ; } template bool launch_MM_dispatch(const Field &F, const int mm, const int nn, const int kk, const typename Field::Element alpha, const typename Field::Element beta, const size_t iters, const int nbw, const bool par, const size_t b) { bool ok = true; size_t m,n,k; size_t lda,ldb,ldc; //!@bug test for ldX equal //!@bug test for transpo //!@todo does nbw actually do nbw recursive calls and then call blas (check ?) ? size_t ld = 13 ; { FFLAS::FFLAS_TRANSPOSE ta = FFLAS::FflasNoTrans ; FFLAS::FFLAS_TRANSPOSE tb = FFLAS::FflasNoTrans ; if (! 
par) { if (random()%2) ta = FFLAS::FflasTrans ; if (random()%2) tb = FFLAS::FflasTrans ; } if (mm<0) m = 1+(size_t)random() % -mm; else m = mm; if (nn<0) n = 1+(size_t)random() % -nn; else n = nn; if (kk<0) k = 1+(size_t)random() % -kk; else k = kk; int logdim = (int)floor(log(std::min(std::min(m,k),n))/log(2.)); int nw = std::min (logdim,nbw); lda = std::max(k,m)+(size_t)random()%ld; ldb = std::max(n,k)+(size_t)random()%ld; ldc = n+(size_t)random()%ld; #ifdef DEBUG std::cerr <<"q = "<(F,m,n,k, alpha,beta, ldc, lda, ta, ldb, tb, iters,nw, par, b); #ifdef DEBUG std::cerr<<(ok?" -> ok ":" -> KO")< bool run_with_field (Givaro::Integer q, uint64_t b, int m, int n, int k, int nbw, size_t iters, bool par ){ bool ok = true ; int nbit=(int)iters; while (ok && nbit){ typedef typename Field::Element Element ; // choose Field Field* F= chooseField(q,b); if (F==nullptr) return true; std::ostringstream oss; F->write(oss); std::cout.fill('.'); std::cout<<"Checking "; std::cout.width(40); std::cout<write(std::cerr) << std::endl; #endif typedef typename Field::RandIter Randiter ; typedef typename Field::Element Element ; Randiter R1(*F,b); Givaro::GeneralRingNonZeroRandIter R(R1); //size_t k = 0 ; //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->one,F->zero,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->zero,F->zero,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->mOne,F->zero,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->one ,F->one,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->zero,F->one,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->mOne,F->one,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->one ,F->mOne,iters,nbw, par, 
b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->zero,F->mOne,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->mOne,F->mOne,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; Element alpha,beta ; R.random(alpha); ok &= launch_MM_dispatch(*F,m,n,k,F->one ,alpha,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->zero,alpha,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,F->mOne,alpha,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,alpha,F->one ,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,alpha,F->zero,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_MM_dispatch(*F,m,n,k,alpha,F->mOne,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; for (size_t j = 0 ; j < 3 ; ++j) { R.random(alpha); R.random(beta); ok &= launch_MM_dispatch(*F,m,n,k,alpha,beta,iters,nbw, par, b); //std::cout << k << "/24" << std::endl; ++k; } //std::cout< >(q,b,m,n,k,nbw,iters,p); ok &= run_with_field >(q,b,m,n,k,nbw,iters,p); ok &= run_with_field >(q,b,m,n,k,nbw,iters,p); ok &= run_with_field >(q,b,m,n,k,nbw,iters,p); ok &= run_with_field >(q,b,m,n,k,nbw,iters,p); ok &= run_with_field >(q,b,m,n,k,nbw,iters,p); ok &= run_with_field >(q,b,m,n,k,nbw,iters, p); ok &= run_with_field >(q,b,m,n,k,nbw,iters, p); ok &= run_with_field > >(q,b?b:63_ui64,m,n,k,nbw,iters, p); ok &= run_with_field > >(q,b?b:127_ui64,m,n,k,nbw,iters, p); ok &= run_with_field >(q,(b?b:512_ui64),m,n,k,nbw,iters,p); ok &= run_with_field >(0,(b?b:512_ui64),m,n,k,nbw,iters,p); } while (loop && ok); return !ok ; } fflas-ffpack-2.2.2/tests/test-fgemv.C000066400000000000000000000104151274716147400174050ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; 
c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for fgemv : 1 computation // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- #define DEBUG 1 #define TIME 1 #if not defined(STD_RECINT_SIZE) #define STD_RECINT_SIZE 8 #endif #include #include "recint/recint.h" #include #include #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/fflas/fflas.h" using namespace std; using namespace FFPACK; // typedef Givaro::Modular Field; typedef RecInt::ruint Ints; typedef Givaro::Modular Field; int main(int argc, char** argv){ int m,n,k; int nbit=atoi(argv[4]); // number of times the product is performed cerr< " <<" "<s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by JG Dumas * This file is Free Software and part of FFLAS-FFPACK. 
* * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for fger : 1 computation // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- // #define DEBUG #define TIME 1 #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "test-utils.h" #include "Matio.h" using namespace std; using namespace FFPACK; using Givaro::Modular; using Givaro::ModularBalanced; // checks that D = alpha . x . 
y^T + C // WARNING template bool check_fger(const Field & F, const typename Field::Element_ptr Cd, // c0 const size_t m, const size_t n, const typename Field::Element & alpha, const typename Field::Element_ptr x, const size_t incx, const typename Field::Element_ptr y, const size_t incy, const typename Field::Element_ptr C, // res const size_t ldc ) { bool wrong = false; typedef typename Field::Element Element; typedef typename Field::Element_ptr Element_ptr; // std::cerr << "with(LinearAlgebra):" << std::endl; // write_field(F,std::cerr <<"X:=",x, m, 1, incx, true) << ';' << std::endl; // write_field(F,std::cerr <<"Y:=Transpose(",y, n, 1, incy, true) << ");" << std::endl; // write_field(F,std::cerr <<"A:=",Cd, m, n, ldc, true) << ';' << std::endl; // F.write(std::cerr << "a:=", alpha) << ';' << std::endl; // std::cerr << "q:=" << F.characteristic() << ';' << std::endl; Element_ptr D = FFLAS::fflas_new (F,m,n); FFLAS::fassign(F,m,n,Cd,n,D,n); for(size_t i=0; i(F,m,n, alpha, ldc, inca, incb, iters); #ifdef DEBUG std::cout<<(ok?" 
-> ok ":" -> KO")< bool run_with_field (int64_t q, uint64_t b, size_t n, size_t iters){ bool ok = true ; int nbit=(int)iters; while (ok && nbit){ typedef typename Field::Element Element ; typedef typename Field::RandIter Randiter ; typedef typename Field::Element Element ; Field* F= chooseField(q,b); #ifdef DEBUG F->write(std::cout) << std::endl; #endif Randiter R1(*F); Givaro::GeneralRingNonZeroRandIter R(R1); //size_t k = 0 ; //std::cout << k << "/24" << std::endl; ++k; ok &= launch_fger_dispatch(*F,n,F->one,iters); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_fger_dispatch(*F,n,F->zero,iters); //std::cout << k << "/24" << std::endl; ++k; ok &= launch_fger_dispatch(*F,n,F->mOne,iters); //std::cout << k << "/24" << std::endl; ++k; Element alpha ; R.random(alpha); ok &= launch_fger_dispatch(*F,n,alpha,iters); //std::cout< >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); } while (loop && ok); return !ok ; } fflas-ffpack-2.2.2/tests/test-fgesv.C000066400000000000000000000121601274716147400174120ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for fgesv : 1 computation // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- //#define DEBUG 1 #define TIME 1 #include #include using namespace std; #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef Givaro::Modular Field; int main(int argc, char** argv){ int n,m,mb,nb; cerr< " <(n*nb); ldx = nb; } else { X = FFLAS::fflas_new(mb*m); ldx = m; } } if ( ((side == FFLAS::FflasRight) && (n != nb)) || ((side == FFLAS::FflasLeft)&&(m != mb)) ) { cerr<<"Error in the dimensions of the input matrices"< 0){ std::cerr<<"System inconsistent"<(mb*nb); if (m==n) if (side == FFLAS::FflasLeft) FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m, nb, n, one, A, n, B, nb, zero, B2, nb); else FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, mb, n, m, one, B, nb, A, n, zero, B2, nb); else if (side == FFLAS::FflasLeft) FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m, nb, n, one, A, n, X, ldx, zero, B2, nb); else FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, mb, n, m, one, X, ldx, A, n, zero, B2, nb); FFLAS::fflas_delete( B); FFLAS::fflas_delete( X); B = read_field(F,argv[3],&mb,&nb); bool wrong = false; for (int 
i=0;is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK * Written by : * Brice Boyer (briceboyer) * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ // #define SIMD_INT #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/utils/args-parser.h" #include "Matio.h" #include "test-utils.h" #include "assert.h" template bool test_freduce (const Field & F, size_t m, size_t k, size_t n, bool timing) { typedef typename Field::Element T ; size_t repet = 3 ; T * A = FFLAS::fflas_new(m*n); T * B = FFLAS::fflas_new(m*n); Givaro::ModularBalanced E(101); if (timing) std::cout << ">>>" << std::endl ; if (timing) std::cout << "=== inc == 1 ===" << std::endl ; FFLAS::Timer chrono, tim, tom ; tim.clear(); tom.clear(); if (timing) F.write(std::cout << "Field ") << std::endl; for (size_t b = 0 ; b < repet ; ++b) { FFPACK::RandomMatrix(E,A,m,k,n); // RandomMatrix(E,B,m,k,n); FFLAS::fassign(E,m,k,A,n,B,n); chrono.clear();chrono.start(); for (size_t i = 0 ; i < m ; ++i) for (size_t j = 0 ; j 
< k ; ++j) F.init(A[i*n+j],A[i*n+j]); chrono.stop(); tim += chrono ; chrono.clear();chrono.start(); FFLAS::freduce (F,m,k,B,n); chrono.stop(); tom += chrono ; #if 1 for (size_t i =0 ; i < m ; ++i) for (size_t j =0 ; j < k ; ++j) if (! F.areEqual(B[i*n+j],A[i*n+j])) { F.write(std::cout) << std::endl << i << ',' << j << " : "; F.write(std::cout, B[i*n+j]) << "!= (ref)"; F.write(std::cout, A[i*n+j]) << std::endl; return false ; } #endif } if (timing) std::cout << " freduce (___): " << tim.usertime()/(double)repet << 's' << std::endl; if (timing) std::cout << " freduce (AVX): " << tom.usertime()/(double)repet << 's'<< std::endl << std::endl; if (timing) std::cout << "=== inc != 1 ===" << std::endl ; tim.clear() ; tom.clear(); if (timing) F.write(std::cout << "Modular ") << std::endl; for (size_t b = 0 ; b < repet ; ++b) { FFPACK::RandomMatrix(E,A,m,n,n); FFLAS::fassign(E,m,n,A,n,B,n); size_t incX = 2 ; chrono.clear();chrono.start(); for (size_t i = 1 ; i < m*n ; i += incX) { F.init(A[i],A[i]); } chrono.stop(); tim += chrono ; size_t cnt = (size_t)floor((double)(m*n)/(double)incX) ; chrono.clear();chrono.start(); FFLAS::freduce (F,cnt,B+1,incX); chrono.stop(); tom += chrono ; #if 1 for (size_t i =1 ; i < m*n ; i+=incX) if (! 
F.areEqual(B[i],A[i])) { F.write(std::cout) << std::endl << i << " : "; F.write(std::cout, B[i]) << "!= (ref)"; F.write(std::cout, A[i]) << std::endl; return false ; } #endif } if (timing) std::cout << " freduce (___): " << tim.usertime()/(double)repet << 's' << std::endl; if (timing) std::cout << " freduce (AVX): " << tom.usertime()/(double)repet << 's'<< std::endl << std::endl; if (timing) std::cout << "<<<" << std::endl; FFLAS::fflas_delete( A ); FFLAS::fflas_delete( B); return true; } int main(int ac, char **av) { static size_t m = 297 ; static size_t n = 301 ; static size_t k = 299 ; static uint64_t p = 7; int seed = (int) time(NULL); static bool timing = false ; static Argument as[] = { { 'p', "-p P", "Set the field characteristic.", TYPE_INT , &p }, { 'n', "-n N", "Set the number of cols in C.", TYPE_INT , &n }, { 'm', "-m N", "Set the number of rows in C.", TYPE_INT , &m }, { 'k', "-k N", "Set the number of rows in B.", TYPE_INT , &k }, { 's', "-s N", "Set the seed .", TYPE_INT , &seed }, { 't', "-timing", "Output timings" , TYPE_NONE, &timing}, END_OF_ARGUMENTS }; FFLAS::parseArguments(ac,av,as); if (n < k) { std::cout << "Usage : m k n ; matrix of size m x k, lda is n" << std::endl; return -1 ; } srand(seed); srand48(seed); bool pass = true ; { /* freduce */ { Givaro::Modular F(p) ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::Modular F((int32_t)p) ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::ModularBalanced F((int32_t)p) ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_freduce (F,m,k,n,timing); } #if 1 { Givaro::ZRing F ; pass &= test_freduce (F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_freduce (F,m,k,n,timing); } { 
Givaro::ZRing F; pass &= test_freduce (F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_freduce (F,m,k,n,timing); } #endif } return (pass?0:1) ; } /* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s fflas-ffpack-2.2.2/tests/test-frobenius.C000066400000000000000000000056471274716147400203100ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for the krylov-elimination //-------------------------------------------------------------------------- // usage: test-krylov-elim p A, to compute the rank profile of the (n+m)xn matrix B // formed by the n identity vectors and the mxn matrix A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 0 #include #include #include #include #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/timer.h" using namespace std; #include "givaro/modular.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef Givaro::Modular Field; template std::ostream& printvect(std::ostream& o, vector& vect){ for(size_t i=0; i < vect.size()-1; ++i) o << vect[i] << " " ; return o << vect[vect.size()-1] << std::endl; } int main(int argc, char** argv){ int m,n; cout< "< (F,argv[2],&m,&n); size_t c = atoi(argv[3]); std::list > frobForm; FFLAS::Timer tim; tim.clear(); tim.start(); FFPACK::CharpolyArithProg (F, frobForm, n, A, n, c); tim.stop(); std::list >::iterator it = frobForm.begin(); while(it != frobForm.end()){ printvect (cout, *(it++)); } cerr<s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK * Written by : * Brice Boyer (briceboyer) * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "Matio.h" #include "test-utils.h" #include "assert.h" // using namespace FFPACK; using FFPACK::RandomMatrix ; using Givaro::ModularBalanced ; template bool test_fscal(const Field & F, const typename Field::Element & alpha, size_t m, size_t k, size_t n, bool timing) { typedef typename Field::Element T ; T * A = FFLAS::fflas_new(m*n); T * C = FFLAS::fflas_new(m*n); T * D = FFLAS::fflas_new(m*n); if (timing) std::cout << ">>>" << std::endl ; size_t iter = 3 ; FFLAS::Timer tim, tom, tam ; tim.clear() ; tom.clear() ; if (timing) F.write(std::cout << "Field ") << std::endl; for (size_t b = 0 ; b < iter ; ++b) { RandomMatrix(F,A,m,k,n); RandomMatrix(F,C,m,k,n); FFLAS::fassign(F,m,k,C,n,D,n); tam.clear();tam.start(); for (size_t i = 0 ; i < m ; ++i) for (size_t j = 0 ; j < k ; ++j) F.mul(D[i*n+j],A[i*n+j],alpha); tam.stop(); tim += tam ; tam.clear();tam.start(); FFLAS::fscal(F,m,k,alpha,A,n,C,n); tam.stop(); tom += tam ; #if 1 for (size_t i =0 ; i < m ; ++i) for (size_t j =0 ; j < k ; ++j) if (! 
F.areEqual(C[i*n+j],D[i*n+j])) { if (timing) std::cout << i << ',' << j << " : " << C[i*n+j] << "!= (ref)" << D[i*n+j] << std::endl; return false ; } #endif } if (timing) std::cout << "fscal(___): " << tim.usertime()/(double)iter << 's' << std::endl; if (timing) std::cout << "fscal (AVX): " << tom.usertime()/(double)iter << 's'<< std::endl; if (timing) std::cout << "<<<" << std::endl; FFLAS::fflas_delete( A ); FFLAS::fflas_delete( C ); FFLAS::fflas_delete( D ); return true; } template bool test_fscal(const Field & F, size_t m, size_t k, size_t n, bool timing) { ModularBalanced G(1234); // for alpha bool pass = true ; typename Field::Element alpha; F.init(alpha,F.one); pass &= test_fscal(F,alpha,m,k,n,timing); F.init(alpha,F.mOne); pass &= test_fscal(F,alpha,m,k,n,timing); F.init(alpha,F.zero); pass &= test_fscal(F,alpha,m,k,n,timing); typename ModularBalanced::RandIter RValue( G ); F.init(alpha,RValue.random(alpha)); pass &= test_fscal(F,alpha,m,k,n,timing); F.init(alpha,RValue.random(alpha)); pass &= test_fscal(F,alpha,m,k,n,timing); return pass ; } template bool test_fscalin(const Field & F, const typename Field::Element & alpha, size_t m, size_t k, size_t n, bool timing) { typedef typename Field::Element T ; T * C = FFLAS::fflas_new(m*n); T * D = FFLAS::fflas_new(m*n); if (timing) std::cout << ">>>" << std::endl ; size_t iter = 3 ; FFLAS::Timer tim, tom, tam ; tim.clear() ; tom.clear() ; if (timing) F.write(std::cout << "Field ") << std::endl; for (size_t b = 0 ; b < iter ; ++b) { RandomMatrix(F,C,m,k,n); FFLAS::fassign(F,m,k,C,n,D,n); tam.clear();tam.start(); for (size_t i = 0 ; i < m ; ++i) for (size_t j = 0 ; j < k ; ++j) F.mulin(D[i*n+j],alpha); tam.stop(); tim += tam ; tam.clear();tam.start(); FFLAS::fscalin(F,m,k,alpha,C,n); tam.stop(); tom += tam ; #if 1 for (size_t i =0 ; i < m ; ++i) for (size_t j =0 ; j < k ; ++j) if (! 
F.areEqual(C[i*n+j],D[i*n+j])) { if (timing) std::cout << i << ',' << j << " : " << C[i*n+j] << "!= (ref)" << D[i*n+j] << std::endl; return false ; } #endif } if (timing) std::cout << "fscalin(___): " << tim.usertime()/(double)iter << 's' << std::endl; if (timing) std::cout << "fscalin (AVX): " << tom.usertime()/(double)iter << 's'<< std::endl; if (timing) std::cout << "<<<" << std::endl; FFLAS::fflas_delete( C ); FFLAS::fflas_delete( D ); return true; } template bool test_fscalin(const Field & F, size_t m, size_t k, size_t n, bool timing) { ModularBalanced G(1234); // for alpha bool pass = true ; typename Field::Element alpha; F.init(alpha,F.one); pass &= test_fscalin(F,alpha,m,k,n,timing); F.init(alpha,F.mOne); pass &= test_fscalin(F,alpha,m,k,n,timing); F.init(alpha,F.zero); pass &= test_fscalin(F,alpha,m,k,n,timing); typename ModularBalanced::RandIter RValue( G ); F.init(alpha,RValue.random(alpha)); pass &= test_fscalin(F,alpha,m,k,n,timing); F.init(alpha,RValue.random(alpha)); pass &= test_fscalin(F,alpha,m,k,n,timing); return pass ; } int main(int ac, char **av) { static size_t m = 300 ; static size_t n = 301 ; static size_t k = 300 ; static uint64_t p = 7; int seed = (int) time(NULL); static bool timing = false ; static Argument as[] = { { 'p', "-p P", "Set the field characteristic.", TYPE_INT , &p }, { 'n', "-n N", "Set the number of cols in C." , TYPE_INT , &n }, { 'm', "-m N", "Set the number of rows in C." , TYPE_INT , &m }, { 'k', "-k N", "Set the number of rows in B." , TYPE_INT , &k }, { 's', "-s N", "Set the seed." 
, TYPE_INT , &seed }, { 't', "-timing", "Output timings" , TYPE_NONE, &timing}, END_OF_ARGUMENTS }; FFLAS::parseArguments(ac,av,as); if (n < k) { std::cout << "Usage : m k n ; matrix of size m x k, lda is n" << std::endl; return -1 ; } srand(seed); srand48(seed); // std::cout << seed << std::endl; bool pass = true ; { /* fscal */ { Givaro::Modular F(p) ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::Modular F((int32_t)p) ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::ModularBalanced F((int32_t)p) ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fscal(F,m,k,n,timing); } #if 1 { Givaro::ZRing F ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fscal(F,m,k,n,timing); } { Givaro::ZRing F; pass &= test_fscal(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fscal(F,m,k,n,timing); } #endif } { /* fscalin */ { Givaro::Modular F(p) ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::Modular F((int32_t)p) ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::ModularBalanced F((int32_t)p) ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::Modular F(p) ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::ModularBalanced F(p) ; pass &= test_fscalin(F,m,k,n,timing); } #if 1 { Givaro::ZRing F ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::ZRing F; pass &= test_fscalin(F,m,k,n,timing); } { Givaro::ZRing F ; pass &= test_fscalin(F,m,k,n,timing); } #endif } return (pass?0:1) ; } /* 
-*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s fflas-ffpack-2.2.2/tests/test-fspmm-dlp.C000066400000000000000000000270051274716147400202030ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Bastien Vialla * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #define __DLP_CHALLENGE #include #include #include #include #include #include "gmpxx.h" #include #include #include #include #include #include #include "fflas-ffpack/fflas/fflas_sparse.h" #include "fflas-ffpack/utils/args-parser.h" #include "fflas-ffpack/field/rns-integer-mod.h" #include "fflas-ffpack/fflas/fflas_sparse/read_sparse.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/flimits.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; using namespace FFLAS; using namespace Givaro; using Data = std::vector>>; using Coo = typename Data::value_type; /******************************************************************************************************************* * * Utility functions: sms reader and random field * *******************************************************************************************************************/ void readMat(string path, index_t *& row, index_t *& col, double *&val, index_t &rowdim, index_t &coldim, uint64_t & nnz){ std::ifstream file(path, std::ios::out); std::string line, nnz_c; std::getline(file, line); std::istringstream(line) >> rowdim >> coldim >> nnz_c; Data mat; int64_t r, c, v; while(std::getline(file, line)){ std::istringstream(line) >> r >> c >> v; if(r!=0) mat.emplace_back(v, r-1,c-1); } std::sort(mat.begin(), mat.end(), [](Coo &a, Coo &b){ return (a.row < b.row) || ((a.row == b.row) && (a.col < b.col)); ;}); mat.shrink_to_fit(); nnz = mat.size(); val = FFLAS::fflas_new(nnz, Alignment::CACHE_LINE); col = FFLAS::fflas_new(nnz, Alignment::CACHE_LINE); row = FFLAS::fflas_new(nnz, Alignment::CACHE_LINE); for(size_t i = 0 ; i < nnz ; ++i){ val[i] = mat[i].val; col[i] = mat[i].col; 
row[i] = mat[i].row; } } template size_t bitSize(T n){ return sizeof(T)*4-__builtin_clz(n); } template Givaro::Integer maxFieldElt() {return (Givaro::Integer)Field::maxCardinality();} template<> Givaro::Integer maxFieldElt>() {return (Givaro::Integer)-1;} /*** Field chooser for test according to characteristic q and bitsize b ***/ /* if q=-1 -> field is chosen randomly with a charateristic of b bits if b=0 -> bitsize is chosen randomly according to maxFieldElt */ template Field* chooseField(Givaro::Integer q, uint64_t b){ Givaro::Integer maxV= maxFieldElt(); auto seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); std::mt19937 mt_rand(seed); if (maxV>0 && (q> maxV || b> maxV.bitsize())) return nullptr; if (b<=1){ //srand((double)std::chrono::high_resolution_clock::now()); auto bitrand = std::bind(std::uniform_int_distribution(2,maxV.bitsize()-1), mt_rand); b = bitrand(); } Givaro::IntPrimeDom IPD; Givaro::Integer tmp,p; if (q==-1){ // Choose characteristic as a random prime of b bits do{ Givaro::Integer _p; Givaro::Integer::seeding(Givaro::Integer(mt_rand())); Givaro::Integer::random_exact_2exp(_p,b); IPD.prevprime( tmp, _p+1 ); p = tmp; }while( (p < 2) ); } else p=q; return new Field(p); } /*************************************************************************************************************/ int main(int argc, char **argv) { using Field = Modular; using FieldMat = ZRing; using FieldComp = FFPACK::RNSIntegerMod; using SparseMatrix = FFLAS::Sparse; Integer q = -1; int b = 128; int blockSize = 1; std::string matrixFile = ""; int nIter = 100; static Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'b', "-b B", "Set the bitsize of the random characteristic.", TYPE_INT , &b }, { 'k', "-k K", "Set the size of the block (1 by default).", TYPE_INT, &blockSize }, { 'n', "-n N", "Number of iterations (1 by default).", TYPE_INT, &nIter }, { 'f', "-f FILE", "Set matrix file.", TYPE_STR, 
&matrixFile }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc, argv, as); // Construct Givaro::Integer field Field *F= chooseField(q,b); if (F==nullptr) exit(0); Integer p; F->cardinality(p); cout << "Prime p: " << p << endl; // Pointers for the matrix index_t *row = nullptr, *col = nullptr; typename FieldMat::Element_ptr dat; index_t rowdim, coldim; uint64_t nnz; // Field associate to the matrix FieldMat Fword; // Read the matrix readMat(matrixFile, row, col, dat, rowdim, coldim, nnz); vector rows(rowdim, 0); for(size_t i = 0 ; i < nnz ; ++i) rows[row[i]]++; for(size_t i = 0 ; i < 20 ; ++i) cout << "#rows with "< x(coldim, 1), y(rowdim, 0); cout.precision(20); // Compute the bigger row FFLAS::fspmv(Fword, A, x.data(), 0, y.data()); for(auto &x: y){ if(x < 0){ x = -x; } } double maxSum = *(std::max_element(y.begin(), y.end())); cout << "maxSum: " << maxSum << endl; // Compute the bitsize of the RNS primes size_t primeBitsize = 53 - Integer(maxSum).bitsize()-1; cout << "primeBitsize: " << primeBitsize << endl; // construct RNS // primeBitsize = 23; FFPACK::rns_double_extended RNS(Integer(maxSum)*p, primeBitsize, true, 0); size_t rnsSize = RNS._size; cout << "M: " << RNS._M << endl; cout << "RNS basis size: " << rnsSize << endl; cout << "Rns basis: "; for(auto&x:RNS._basis){ cout << x << " "; } cout << endl; cout << "RNS Mi: " << endl; for(auto &x : RNS._Mi){ cout << x << " "; } cout << endl; cout << "RNS MMi: " << endl; for(auto &x : RNS._MMi){ cout << x << " "; } cout << endl; // construct RNS field FieldComp Frns(p,RNS); std::vector X(coldim*blockSize), Y(rowdim*blockSize, 0); // Fill X with random values for(auto &x: X){ Givaro::Integer::random_exact_2exp(x,b); F->init(x, x); } size_t ld = 0; Integer maxRep = Integer(maxSum)*rnsSize*p; while(maxRep.bitsize() < RNS._M.bitsize()){ maxRep *= Integer(maxSum); ld++; } ld -= 1; cout << "Spmm by modp: " << ld << endl; double* Xrns = fflas_new(coldim*blockSize*rnsSize, Alignment::CACHE_LINE); double* Yrns = 
fflas_new(rowdim*blockSize*rnsSize, Alignment::CACHE_LINE); // Transform X in RNS RNS.init(coldim*blockSize, Xrns, X.data(), 1); cout << endl; TTimer Tspmm; TTimer Tmodp; TTimer Ttotal; double spmmTime = 0, modpTime = 0; bool bb = true; Ttotal.start(); for(size_t kk = 1 ; kk <= nIter ; ++kk){ // perform Yrns = A.Xrns + beta.Yrns over ZZ Tspmm.start(); if(bb){ pfspmm(Fword, A, blockSize*rnsSize, Xrns, blockSize*rnsSize, 0, Yrns, blockSize*rnsSize); RNS.reduce(rowdim*blockSize, Yrns, 1, true); // reduce Yrns wrt the RNS basis Tspmm.stop(); spmmTime += Tspmm.usertime(); cout << "after spmm:" << endl; for(size_t i = 0, end = (Y.size()>20)?20:Y.size() ; i < end ; ++i){ cout << Yrns[i] << " "; } cout << endl; bb = !bb; // if(kk%ld == 0){ Tmodp.start(); Frns.reduce_modp_rnsmajor_scal_quad(rowdim*blockSize, FFPACK::rns_double_elt_ptr(Yrns, 1)); Tmodp.stop(); modpTime += Tmodp.usertime(); cout << "after modp:" << endl; for(size_t i = 0, end = (Y.size()>20)?20:Y.size() ; i < end ; ++i){ cout << Yrns[i] << " "; } cout << endl; // } }else{ fspmm(Fword, A, blockSize*rnsSize, Yrns, blockSize*rnsSize, 0, Xrns, blockSize*rnsSize); RNS.reduce(rowdim*blockSize, Xrns, 1, true); // reduce Yrns wrt the RNS basis Tspmm.stop(); spmmTime += Tspmm.usertime(); bb = !bb; for(size_t i = 0, end = (Y.size()>20)?20:Y.size() ; i < end ; ++i){ cout << Xrns[i] << " "; } cout << endl; // if(kk%ld == 0){ Tmodp.start(); Frns.reduce_modp_rnsmajor_scal_quad(rowdim*blockSize, FFPACK::rns_double_elt_ptr(Xrns, 1)); Tmodp.stop(); modpTime += Tmodp.usertime(); // } cout << "after modp:" << endl; for(size_t i = 0, end = (Y.size()>20)?20:Y.size() ; i < end ; ++i){ cout << Xrns[i] << " "; } cout << endl; } } // if(bb && nIter%ld != 0){ // Tmodp.start(); // Frns.reduce_modp_rnsmajor_scal_quad(rowdim*blockSize, FFPACK::rns_double_elt_ptr(Yrns, 1)); // Tmodp.stop(); // modpTime += Tmodp.usertime(); // }else if(!bb && nIter%ld != 0){ // Tmodp.start(); // Frns.reduce_modp_rnsmajor_scal_quad(rowdim*blockSize, 
FFPACK::rns_double_elt_ptr(Xrns, 1)); // Tmodp.stop(); // modpTime += Tmodp.usertime(); // } // Reconstruct Y from Yrns RNS.convert(rowdim*blockSize, Y.data(), Yrns); Ttotal.stop(); for(size_t i = 0 ; i < rowdim*blockSize ; ++i){ if(Y[i] < 0){ Integer q = -Y[i] / p; Y[i] = p - (-Y[i] - p*q); } Y[i] %= p; } cout << "Y res:" << endl; for(size_t i = 0, end = (Y.size()>20)?20:Y.size() ; i < end ; ++i){ cout << Y[i] << " "; } cout << endl; cout << nIter << " iterations in " << Ttotal << endl; cout << "spmm: " << spmmTime << endl; cout << "modp: " << modpTime << endl; FFLAS::fflas_delete(Xrns); FFLAS::fflas_delete(Yrns); return 0; } fflas-ffpack-2.2.2/tests/test-fspmm-recint.C000066400000000000000000000134151274716147400207100ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (c) FFLAS-FFPACK * Written by Bastien Vialla * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #define __DLP_CHALLENGE #include #include #include #include #include #include "gmpxx.h" #include #include #include #include #include #include using namespace RecInt; #include "fflas-ffpack/fflas/fflas_sparse.h" #include "fflas-ffpack/utils/args-parser.h" #include "fflas-ffpack/field/rns-integer-mod.h" #include "fflas-ffpack/fflas/fflas_sparse/read_sparse.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/flimits.h" #ifdef __FFLASFFPACK_USE_OPENMP typedef FFLAS::OMPTimer TTimer; #else typedef FFLAS::Timer TTimer; #endif using namespace std; using namespace FFLAS; using namespace Givaro; /******************************************************************************************************************* * * Utility functions: sms reader and random field * *******************************************************************************************************************/ template size_t bitSize(T n){ return sizeof(T)*4-__builtin_clz(n); } template Givaro::Integer maxFieldElt() {return (Givaro::Integer)Field::maxCardinality();} template<> Givaro::Integer maxFieldElt>() {return (Givaro::Integer)-1;} /*** Field chooser for test according to characteristic q and bitsize b ***/ /* if q=-1 -> field is chosen randomly with a charateristic of b bits if b=0 -> bitsize is chosen randomly according to maxFieldElt */ template Field* chooseField(Givaro::Integer q, uint64_t b){ Givaro::Integer maxV= maxFieldElt(); auto seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); std::mt19937 mt_rand(seed); if (maxV>0 && (q> maxV || b> maxV.bitsize())) return nullptr; if (b<=1){ //srand((double)std::chrono::high_resolution_clock::now()); auto bitrand = 
std::bind(std::uniform_int_distribution(2,maxV.bitsize()-1), mt_rand); b = bitrand(); } Givaro::IntPrimeDom IPD; Givaro::Integer tmp,p; if (q==-1){ // Choose characteristic as a random prime of b bits do{ Givaro::Integer _p; Givaro::Integer::seeding(Givaro::Integer(mt_rand())); Givaro::Integer::random_exact_2exp(_p,b); IPD.prevprime( tmp, _p+1 ); p = tmp; }while( (p < 2) ); } else p=q; return new Field(p); } /*************************************************************************************************************/ int main(int argc, char **argv) { using Field = Modular; using FieldMat = ZRing; using FieldComp = FFPACK::RNSIntegerMod; using FieldElement = RecInt::rmint<7>; using FieldRec = ZRing; using SparseMatrix = FFLAS::Sparse; Integer q = -1; int b = 128; int blockSize = 1; std::string matrixFile = ""; int nIter = 100; static Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'b', "-b B", "Set the bitsize of the random characteristic.", TYPE_INT , &b }, { 'k', "-k K", "Set the size of the block (1 by default).", TYPE_INT, &blockSize }, { 'n', "-n N", "Set the size of the block (1 by default).", TYPE_INT, &nIter }, { 'f', "-f FILE", "Set matrix file.", TYPE_STR, &matrixFile }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc, argv, as); // Construct Givaro::Integer field Field *F= chooseField(q,b); if (F==nullptr) exit(0); Integer p; F->cardinality(p); cout << "Prime p: " << p << endl; RecInt::ruint<7> pRec; // RecInt::mpz_to_ruint(pRec, FieldElement(p)); FieldElement::init_module(ruint<7>(p)); FieldRec Frec; // Pointers for the matrix index_t *row = nullptr, *col = nullptr; typename FieldRec::Element_ptr dat; index_t rowdim, coldim; uint64_t nnz; // Read the matrix readSmsFormat(matrixFile, Frec, row, col, dat, rowdim, coldim, nnz); vector rowCoo(nnz, 0); for(size_t i = 0 ; i < rowdim ; ++i){ for(size_t j = row[i] ; j < row[i+1] ; ++j){ rowCoo[j] = i; } } // Build the matrix SparseMatrix A; 
FFLAS::sparse_init(Frec, A, rowCoo.data(), col, dat, rowdim, coldim, nnz); FFLAS::fflas_delete(row); FFLAS::fflas_delete(col); FFLAS::fflas_delete(dat); rowCoo.resize(0); vector x(coldim*blockSize, 1), y(rowdim*blockSize, 0); pfspmm(Frec, A, blockSize, x.data(), blockSize, 0, y.data(), blockSize); return 0; } fflas-ffpack-2.2.2/tests/test-fsquare.C000066400000000000000000000055171274716147400177560ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for fsquare : 1 computation // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- //#define DEBUG 0 #define TIME 1 #include #include #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/fflas/fflas.h" using namespace FFPACK; using namespace std; typedef Givaro::Modular Field; int main(int argc, char** argv){ int n; cerr< " <<" " <<" to do i computations of C <- AA" <(n*n); FFLAS::Timer tim,t; t.clear();tim.clear(); for(int i = 0;is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Ashley Lesdalons * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ //-------------------------------------------------------------------------- // Test for Checker_ftrsm //-------------------------------------------------------------------------- #define ENABLE_ALL_CHECKINGS 1 #include #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/args-parser.h" int main(int argc, char** argv) { srand (time(NULL)); typedef Givaro::Modular Field; Givaro::Integer q = 131071; size_t iter = 3; size_t MAXN = 100; size_t seed(0); Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 'n', "-n N", "Set the size of the matrix.", TYPE_INT , &MAXN }, { 's', "-s N", "Set the seed.", TYPE_INT , &seed }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); Field F(q); Field::RandIter G(F,0,seed); srandom(seed); typename Field::Element alpha,tmp; Field::RandIter Rand(F); Field::NonZeroRandIter NZRand(Rand); size_t pass = 0; for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Pascal Giorgi * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #define __FFLASFFPACK_SEQUENTIAL #define ENABLE_ALL_CHECKINGS 1 #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "test-utils.h" #include #include using namespace std; using namespace FFPACK; using Givaro::Modular; using Givaro::ModularBalanced; template void write_matrix(Givaro::Integer p, size_t m, size_t n, T* C, size_t ldc){ size_t www=(p.bitsize()*log(2.))/log(10.); for (size_t i=0;i bool check_ftrsm (const Field &F, size_t m, size_t n, const typename Field::Element &alpha, FFLAS::FFLAS_SIDE side, FFLAS::FFLAS_UPLO uplo, FFLAS::FFLAS_TRANSPOSE trans, FFLAS::FFLAS_DIAG diag){ typedef typename Field::Element Element; Element * A, *B, *B2, *C, tmp; size_t k = (side==FFLAS::FflasLeft?m:n); size_t lda,ldb,ldc; lda=k+13; ldb=n+14; ldc=n+15; A = FFLAS::fflas_new(F,k,lda); B = FFLAS::fflas_new(F,m,ldb); B2 = FFLAS::fflas_new(F,m,ldb); C = FFLAS::fflas_new(F,m,ldc); typename Field::RandIter Rand(F); typename Field::NonZeroRandIter NZRand(Rand); for (size_t i=0;i bool run_with_field (Givaro::Integer q, size_t b, size_t m, size_t n, int s, size_t iters){ bool ok = true ; int nbit=(int)iters; while (ok && nbit){ //typedef typename Field::Element Element ; // choose Field Field* F= chooseField(q,b); if (F==nullptr) return true; typename Field::Element alpha; F->init (alpha, (typename Field::Element)s); cout<<"Checking with ";F->write(cout)< >(q,b,m,n,s,iters); ok &= run_with_field >(q,b,m,n,s,iters); ok &= run_with_field >(q,b,m,n,s,iters); ok &= run_with_field >(q,b,m,n,s,iters); ok &= run_with_field >(q,b,m,n,s,iters); ok &= run_with_field >(q,b,m,n,s,iters); ok &= 
run_with_field >(q,b,m,n,s,iters); ok &= run_with_field >(q,b,m,n,s,iters); ok &= run_with_field >(q,5,m/4+1,n/4+1,s,iters); ok &= run_with_field >(q,(b?b:512),m/4+1,n/4+1,s,iters); } while (loop && ok); return !ok ; } fflas-ffpack-2.2.2/tests/test-ftrtri.C000066400000000000000000000070071274716147400176160ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for ftrtri : 1 computation // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- #define DEBUG 1 #define TIME 1 #include #include #include "givaro/modular-balanced.h" #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace std; using namespace FFPACK; typedef Givaro::ModularBalanced Field; int main(int argc, char** argv) { int n; int nbit=atoi(argv[3]); // number of times the product is performed cerr< <" <(n*n); for (int i=0; i(n*n); FFLAS::Timer tim,t; t.clear();tim.clear(); for(int i = 0;is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for rank // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- #include #include #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace std; using namespace FFPACK; typedef Givaro::Modular Field; int main(int argc, char** argv){ int n,m; cerr< <" < // COL_MAJOR true not supported in test. To be updated. #define COL_MAJOR false #define LEAD_GEN true #define DISPLAY false #define TRUST_FGEMM false using namespace FFLAS; int test_igemm(size_t m, size_t n, size_t k, enum FFLAS_TRANSPOSE tA, enum FFLAS_TRANSPOSE tB, int a_scal, int b_scal, bool timing) { FFLAS::Timer tim; srand((unsigned int)time(NULL)); typedef Givaro::Modular IField ; IField Z(1_ui64<<63); size_t ra = (tA==FflasNoTrans) ? m : k ; size_t ca = (tA==FflasNoTrans) ? k : m ; size_t rb = (tB==FflasNoTrans) ? k : n; size_t cb = (tB==FflasNoTrans) ? 
n : k; size_t lda = ca ; size_t ldb = cb ; // n size_t ldc = n ; // n #if COL_MAJOR size_t ldA = m;//+rand() % 3 ; // m size_t ldB = k;//+rand() % 3 ; // k size_t ldC = m;//+rand() % 3 ; // m #else size_t ldA = ca ; // k size_t ldB = cb ; // n size_t ldC = n ; // n #endif #if LEAD_GEN lda += rand() % 5; ldb += rand() % 5; ldc += rand() % 5; ldA += rand() % 5; ldB += rand() % 5; ldC += rand() % 5; #endif int seed=0; typename IField::RandIter Rand(Z,seed); // typename IField::RandIter Rand(Z,seed); IField::Element_ptr A,B,C,D; C= FFLAS::fflas_new(Z,m,ldc); D= FFLAS::fflas_new(Z,m,n); A= FFLAS::fflas_new(Z,ra,lda); B= FFLAS::fflas_new(Z,rb,ldb) ; for (size_t i=0;i FField ; FField F ; FField::Element_ptr Ci,Ai,Bi; #if COL_MAJOR Ci= FFLAS::fflas_new(F,ldC,n); Ai= FFLAS::fflas_new(F,ldA,k); Bi= FFLAS::fflas_new(F,ldB,n); for (size_t i=0;i G(65537); Givaro::ZRing G; double af, bf ; G.init(af,alpha); G.init(bf,beta); double *Cf,*Af,*Bf; Cf= FFLAS::fflas_new(G,m,ldC); Af= FFLAS::fflas_new(G,ra,ldA); Bf= FFLAS::fflas_new(G,rb,ldB); for (size_t i=0;is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include #include #include #include int main() { double * A = (double*)malloc(4*sizeof(double)); A[0] = A[2] = 1 ; A[1] = A[3] = 0 ; size_t * P = (size_t*) malloc(2*sizeof(size_t)); size_t * Qt = (size_t*) malloc(2*sizeof(size_t)); size_t r = RowEchelonForm_modular_double(101,2,2,A,2,P,Qt,false,FfpackSlabRecursive,true); freducein_2_modular_double(101,2,2,A,2,false); freducein_1_modular_double(101,4,A,1,false); fsquare_3_modular_double(101,FflasNoTrans,2,1,A,2,1,A,1,true); return !(r==1); } fflas-ffpack-2.2.2/tests/test-invert-check.C000066400000000000000000000056271274716147400206740ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Ashley Lesdalons * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ //-------------------------------------------------------------------------- // Test for Checker_invert //-------------------------------------------------------------------------- #define ENABLE_ALL_CHECKINGS 1 #include #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include "fflas-ffpack/utils/fflas_randommatrix.h" int main(int argc, char** argv) { srand (time(NULL)); typedef Givaro::Modular Field; Givaro::Integer q = 131071; size_t iter = 3; size_t MAXM = 1000; size_t seed( (int) time(NULL) ); Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'n', "-n N", "Set the maximal size of the matrix.", TYPE_INT , &MAXM }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 's', "-s N", "Set the seed.", TYPE_INT , &seed }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); FFLAS::writeCommandString(std::cout, as) << std::endl; Field F(q); Field::RandIter Rand(F,0,seed); Field::NonZeroRandIter NZRand(Rand); srandom(seed); int nullity; size_t m = MAXM, pass = 0; for (size_t i=0; i checker(Rand,m,A,m<<1); FFPACK::Invert(F,m,A,m<<1,nullity); try { checker.check(A,nullity); std::cout << "Verification successful\n"; pass++; } catch (FailureInvertCheck &e) { std::cout << "Verification failed!\n"; } FFLAS::fflas_delete(A); } std::cout << pass << "/" << iter << " tests were successful.\n"; return 0; } fflas-ffpack-2.2.2/tests/test-invert.C000066400000000000000000000100301274716147400176010ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) the FFLAS-FFPACK group * Written 
by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #define ENABLE_ALL_CHECKINGS 1 #define __FFLASFFPACK_SEQUENTIAL #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include "test-utils.h" #include #include using namespace std; using namespace FFLAS; using namespace FFPACK; using Givaro::Modular; using Givaro::ModularBalanced; template bool run_with_field (Givaro::Integer q, size_t b, size_t n, size_t iters){ bool ok = true ; int nbit=(int)iters; while (ok && nbit){ Field* F= chooseField(q,b); if (F==nullptr) return true; cout<<"Checking with ";F->write(cout)<one, A, lda, X, ldx, F->mOne, Y, n); if (! 
fiszero(*F,n,n,Y,n)){ write_field(*F, std::cerr<<"Y = "< >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,b,n,iters); ok &= run_with_field >(q,(b?b:512),n/4+1,iters); } while (loop && ok); return !ok ; } fflas-ffpack-2.2.2/tests/test-krylov-elim.C000066400000000000000000000067651274716147400205700ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for the krylov-elimination //-------------------------------------------------------------------------- // usage: test-krylov-elim p A, to compute the rank profile of the (n+m)xn matrix B // formed by the n identity vectors and the mxn matrix A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 0 #include #include "Matio.h" #include "fflas-ffpack/utils/timer.h" using namespace std; #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef Givaro::Modular Field; template std::ostream& printvect(std::ostream& o, T* vect, size_t dim) { for(size_t i=0; i "< (F,argv[2],(int*)&m,(int*)&n); Field::Element * B = FFLAS::fflas_new((m+n)*n); for (size_t i=0; i<(n+m)*n;++i) *(B+i)=0; size_t deg = (n-1)/m+1; size_t curr_row = 0; size_t it_idx = 0; size_t bk_idx = 0; for (size_t i=0; i #include #include Givaro::Timer tperm, tgemm, tBC, ttrsm,trest,timtot; size_t mvcnt = 0; #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "test-utils.h" #include "fflas-ffpack/utils/args-parser.h" using namespace std; using namespace FFPACK; /*! Tests the LUdivine routine. 
* @tparam Field Field * @tparam Diag Unit diagonal in U * @tparam Trans * @param F field * @param A Matrix (preallocated) * @param r rank of A * @param m rows * @param n cols * @param lda leading dim of A * @return 0 iff correct, 1 otherwise */ template bool test_LUdivine(const Field & F, typename Field::ConstElement_ptr A, size_t lda, size_t r, size_t m, size_t n) { bool fail = false; typedef typename Field::Element_ptr Element_ptr ; typedef typename Field::Element Element ; Element_ptr B = FFLAS::fflas_new(F,m,lda) ; FFLAS::fassign(F,m,n,A,lda,B,lda); size_t maxP, maxQ ; if (trans == FFLAS::FflasTrans){ maxP = m; maxQ = n; } else{ // trans == FFLAS::FflasNoTrans maxP = n; maxQ = m; } size_t * P = FFLAS::fflas_new(maxP) ; size_t * Q = FFLAS::fflas_new(maxQ) ; size_t R = FFPACK::LUdivine (F, diag, trans, m, n, B, lda, P, Q); if (R != r) { std::cout << "rank is wrong (expecting " << r << " but got " << R << ")" << std::endl; FFLAS::fflas_delete( B ); FFLAS::fflas_delete( P ); FFLAS::fflas_delete( Q ); return fail = true; } Element_ptr X = FFLAS::fflas_new(F, m, n); // compute X=CUP and check X == A /* Build L,U */ Element_ptr L, U; if (trans == FFLAS::FflasNoTrans){ L = FFLAS::fflas_new(F, m, m); U = FFLAS::fflas_new(F, m, n); Element zero,one; F.init(zero,0.0); F.init(one,1.0); /* build U */ for (size_t i=0; i checker (G,m,n,A,n); size_t R = FFPACK::PLUQ (F, diag, m, n, B, lda, P, Q); // write_field(F,std::cerr<<"\n PLUQ = \n",B,m,n,lda); try { checker.check(A,n,R,P,Q); } catch(FailurePLUQCheck &e) { std::cout << m << 'x' << n << " pluq verification failed!\n"; } if (R != r) { std::cout << "rank is wrong (expected " << r << " but got " << R << ")" << std::endl; FFLAS::fflas_delete (B); FFLAS::fflas_delete (P); FFLAS::fflas_delete (Q); return fail = true; } fail |= verifPLUQ (F,A, lda, B, lda, P, Q, m, n, r); FFLAS::fflas_delete (B); FFLAS::fflas_delete(P); FFLAS::fflas_delete(Q); return fail; } /*! Tests the LUpdate routine. 
* @tparam Field Field * @tparam Diag Unit diagonal in L ? * @tparam Trans ? * @param F field * @param A Matrix (preallocated) * @param r rank of A * @param B Matrix (preallocated) * @param m rows in A * @param n cols in A (and B) * @param k rows in B * @param lda leading dim of A (and B) * @return 0 iff correct, 1 otherwise */ // template // bool test_lu_append(const Field & F, // const typename Field::Element_ptr A, // const typename Field::Element_ptr B, // size_t m, size_t n, size_t k, size_t lda) // { // FFLASFFPACK_check(n<=lda); // bool fail = false; // size_t M = m + k ; // typedef typename Field::Element Element ; // Element_ptr Acop = FFLAS::fflas_new(F, m, lda) ; // FFLAS::fassign(F,m,n,A,lda,Acop,lda) ; // Element_ptr Bcop = FFLAS::fflas_new(F, k, lda) ; // FFLAS::fassign(F,k,n,B,lda,Bcop,lda) ; // Element_ptr Append = FFLAS::fflas_new (F, M, lda); // FFLAS::fassign(F,m,n,A,lda,Append,lda) ; // FFLAS::fassign(F,k,n,B,lda,Append+m*lda,lda) ; // #if 0 /* paranoid check */ // for (size_t i = 0 ; i < m ; ++i) { // for (size_t j = 0 ; j < n ; ++j) { // FFLASFFPACK_check(Append[i*lda+j]==A[i*lda+j]); // } // } // for (size_t i = 0 ; i < k ; ++i) { // for (size_t j = 0 ; j < n ; ++j) { // FFLASFFPACK_check(Append[(i+m)*lda+j]==B[i*lda+j]); // } // } // #endif // Element_ptr Afull = FFLAS::fflas_new(F, M, lda); // FFLAS::fassign(F,M,n,Append,lda,Afull,lda) ; // // FFLAS::fassign(F,m,n,A,lda,Afull,lda) ; // // FFLAS::fassign(F,k,n,B,lda,Afull+m*lda,lda) ; // #if 0 // std::cout << "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" << std::endl; // for (size_t i = 0 ; i < m ; ++i) { // for (size_t j = 0 ; j < n ; ++j) { // std::cout << Append[i*lda+j] << "(" << A[i*lda+j] << ") " ; // } std::cout << std::endl; // } // std::cout << "-----------------------------------" << std::endl; // for (size_t i = 0 ; i < k ; ++i) { // for (size_t j = 0 ; j < n ; ++j) { // std::cout << Append[(i+m)*lda+j] ; // std::cout << "(" << B[i*lda+j] << ") " ; // }std::cout << std::endl; // } // 
std::cout << "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" << std::flush << std::endl; // #endif // #if 0 // for (size_t i = 0 ; i < m ; ++i) // for (size_t j = 0 ; j < n ; ++j) // FFLASFFPACK_check(Acop[i*lda+j]==A[i*lda+j]); // for (size_t i = 0 ; i < k ; ++i) // for (size_t j = 0 ; j < n ; ++j) // FFLASFFPACK_check(Bcop[i*lda+j]==B[i*lda+j]); // for (size_t i = 0 ; i < M ; ++i) // for (size_t j = 0 ; j < n ; ++j) // if (i < m) // FFLASFFPACK_check(Afull[i*lda+j]==A[i*lda+j]); // else // FFLASFFPACK_check(Afull[i*lda+j]==B[(i-m)*lda+j]); // #endif // size_t maxP, maxQ ; // if (trans == FFLAS::FflasTrans){ // maxP = M; // maxQ = n; // } // else{ // trans == FFLAS::FflasNoTrans // maxP = n; // maxQ = M; // } // size_t * P = FFLAS::fflas_new(maxP) ; // size_t * Q = FFLAS::fflas_new(maxQ) ; // size_t * PP = FFLAS::fflas_new(maxP) ; // size_t * QQ = FFLAS::fflas_new(maxQ) ; // /* valgrind says the following leaks. Just incroyable. */ // size_t R = FFPACK::LUdivine (F, diag, trans, M, n, Append, lda, PP, QQ); // size_t R1 = FFPACK::LUdivine (F, diag, trans, m, n, Acop, lda, P, Q); // size_t R2 = FFPACK::LUpdate (F,diag,trans,m,n,Acop,lda,R1,k,Bcop,lda,P,Q, // FFPACK::FfpackLQUP); // #if 0 // std::cout << "P := [ " ; // for (size_t i = 0 ; i < maxP ; ++i) // std::cout << P[i] << " " ; // std::cout << ']' << std::endl; // std::cout << "Q := [ "; // for (size_t i = 0 ; i < maxQ ; ++i) // std::cout << Q[i] << " " ; // std::cout << ']' << std::endl; // std::cout << "PP := [ "; // for (size_t i = 0 ; i < maxP ; ++i) // std::cout << PP[i] << " " ; // std::cout << ']' << std::endl; // std::cout << "QQ := [ "; // for (size_t i = 0 ; i < maxQ ; ++i) // std::cout << QQ[i] << " " ; // std::cout << ']' << std::endl; // #endif // if (R2 != R) { // std::cout << "error, bad rank " << R2 << " <> " << R << " (expected) " << std::endl; // FFLAS::fflas_delete( Bcop ); // FFLAS::fflas_delete( Acop ); // FFLAS::fflas_delete( Append ); // FFLAS::fflas_delete( PP); // FFLAS::fflas_delete( QQ); // 
FFLAS::fflas_delete( P ); // FFLAS::fflas_delete( Q ); // return fail=true; // } // // compute C=LQUP and check C == A // Element_ptr C = FFLAS::fflas_new (F, M, lda); // /* Build L,U */ // Element_ptr L, U; // if (trans == FFLAS::FflasNoTrans){ // L = FFLAS::fflas_new(F, M, M); // U = FFLAS::fflas_new(F, M, n); // typename Field::Element zero,one; // F.init(zero,0.0); // F.init(one,1.0); // /* build U */ // for (size_t i=0; i(F,A,lda,R,m,n); RandomMatrixWithRankandRandomRPM(F,A,lda,R,m,n); fail |= test_pluq(F,A,R,m,n,lda); if (fail) std::cout << "failed at big lda max rank" << std::endl; FFLAS::fflas_delete( A ); } { /* user given and lda bigger. Rank is min */ size_t lda = n+10 ; size_t R = 0; Element_ptr A = FFLAS::fflas_new (F, m, lda); RandomMatrixWithRankandRandomRPM(F,A,lda,R,m,n); fail |= test_LUdivine(F,A,lda,R,m,n); RandomMatrixWithRankandRandomRPM(F,A,lda,R,m,n); fail |= test_pluq(F,A,R,m,n,lda); if (fail) std::cout << "failed at big lda, rank 0" << std::endl; FFLAS::fflas_delete( A ); } { /* square */ size_t M = std::max(m,n); size_t N = M ; size_t R = M/2 ; size_t lda = N+10 ; Element_ptr A = FFLAS::fflas_new (F, M, lda); RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N); fail |= test_LUdivine(F,A,lda,R,M,N); RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N); fail |= test_pluq(F,A,R,M,N,lda); if (fail) std::cout << "failed at square" << std::endl; FFLAS::fflas_delete( A ); } { /* wide */ size_t M = std::max(m,n); size_t N = 2*M ; size_t R = 3*M/4 ; size_t lda = N+5 ; Element_ptr A = FFLAS::fflas_new (F, M, lda); RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N); fail |= test_LUdivine(F,A,lda,R,M,N); RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N); fail |= test_pluq(F,A,R,M,N,lda); if (fail) std::cout << "failed at wide" << std::endl; FFLAS::fflas_delete( A ); } { /* narrow */ size_t M = std::max(m,n); size_t N = M/2 ; size_t R = 3*M/8 ; size_t lda = N+5 ; Element_ptr A = FFLAS::fflas_new (F, M, lda); RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N); fail |= 
test_LUdivine(F,A,lda,R,M,N); RandomMatrixWithRankandRandomRPM(F,A,lda,R,M,N); fail |= test_pluq(F,A,R,M,N,lda); if (fail) std::cout << "failed at narrow" << std::endl; FFLAS::fflas_delete( A ); } return !fail; } // template // bool launch_test_append(const Field & F, // size_t r, // size_t m, size_t n) // { // typedef typename Field::Element Element ; // bool fail = false ; // { /* user given and lda bigger */ // size_t lda = n+10 ; // size_t k = m/2+1 ; // Element_ptr A = FFLAS::fflas_new (F, m, lda); // Element_ptr B = FFLAS::fflas_new (F, k, lda); // RandomMatrixWithRank(F,A,lda,r,m,n); // RandomMatrixWithRank(F,B,lda,k/2+1,k,n); // fail |= test_lu_append(F,A,B,m,n,k,lda); // if (fail) std::cout << "failed" << std::endl; // FFLAS::fflas_delete( A ); // FFLAS::fflas_delete( B ); // } // { /* user given and lda bigger. Rank is max */ // size_t lda = n+10 ; // size_t R = std::min(m,n); // size_t k = m/2+1 ; // Element_ptr A = FFLAS::fflas_new (F, m, lda); // Element_ptr B = FFLAS::fflas_new (F, k, lda); // RandomMatrixWithRank(F,A,lda,R,m,n); // RandomMatrixWithRank(F,B,lda,k/2+1,k,n); // fail |= test_lu_append(F,A,B,m,n,k,lda); // if (fail) std::cout << "failed" << std::endl; // FFLAS::fflas_delete( A ); // FFLAS::fflas_delete( B ); // } // { /* user given and lda bigger. Appended Rank is min */ // size_t lda = n+10 ; // size_t R = std::min(m,n); // size_t k = m/2+1 ; // Element_ptr A = FFLAS::fflas_new (F, m, lda); // Element_ptr B = FFLAS::fflas_new (F, k, lda); // RandomMatrixWithRank(F,A,lda,R,m,n); // RandomMatrixWithRank(F,B,lda,0,k,n); // fail |= test_lu_append(F,A,B,m,n,k,lda); // if (fail) std::cout << "failed" << std::endl; // FFLAS::fflas_delete( A ); // FFLAS::fflas_delete( B ); // } // { /* user given and lda bigger. 
Rank is min */ // size_t lda = n+10 ; // size_t R = 0; // size_t k = m/2+1 ; // Element_ptr A = FFLAS::fflas_new (F, m, lda); // Element_ptr B = FFLAS::fflas_new (F, k, lda); // RandomMatrixWithRank(F,A,lda,R,m,n); // RandomMatrixWithRank(F,B,lda,k/2+1,k,n); // fail |= test_lu_append(F,A,B,m,n,k,lda); // if (fail) std::cout << "failed" << std::endl; // FFLAS::fflas_delete( A ); // FFLAS::fflas_delete( B ); // } // { /* square */ // size_t M = std::max(m,n); // size_t N = M ; // size_t R = M/2 ; // size_t lda = N+10 ; // size_t k = R ; // Element_ptr A = FFLAS::fflas_new (F, M, lda); // Element_ptr B = FFLAS::fflas_new (F, k, lda); // RandomMatrixWithRank(F,A,lda,R,M,N); // RandomMatrixWithRank(F,B,lda,R/2,k,N); // fail |= test_lu_append(F,A,B,M,N,k,lda); // if (fail) std::cout << "failed" << std::endl; // FFLAS::fflas_delete( A ); // FFLAS::fflas_delete( B ); // } // { /* wide */ // size_t M = std::max(m,n); // size_t N = 2*M ; // size_t R = M/2 ; // size_t k = R ; // size_t lda = N+10 ; // Element_ptr A = FFLAS::fflas_new (F, M, lda); // Element_ptr B = FFLAS::fflas_new (F, k, lda); // RandomMatrixWithRank(F,A,lda,R,M,N); // RandomMatrixWithRank(F,B,lda,k/2,k,N); // fail |= test_lu_append(F,A,B,M,N,k,lda); // if (fail) std::cout << "failed" << std::endl; // FFLAS::fflas_delete( A ); // FFLAS::fflas_delete( B ); // } // //! 
@bug leaks : // #if 0 /* leak here */ // { /* narrow */ // size_t M = std::max(m,n); // size_t N = M/2 ; // size_t R = M/3 ; // size_t k = N ; // size_t lda = N+10 ; // Element_ptr A = FFLAS::fflas_new (F, M, lda); // Element_ptr B = FFLAS::fflas_new (F, k, lda); // RandomMatrixWithRank(F,A,lda,R,M,N); // RandomMatrixWithRank(F,A,lda,std::min(k/2,M/2),k,N); // fail |= test_lu_append(F,A,B,M,N,k,lda); // if (fail) std::cout << "failed" << std::endl; // FFLAS::fflas_delete( A ); // FFLAS::fflas_delete( B ); // } // #endif // return fail; // } template bool run_with_field(Givaro::Integer q, uint64_t b, size_t m, size_t n, size_t r, size_t iters){ bool ok = true ; int nbit=(int)iters; while (ok && nbit){ // choose Field Field* F= chooseField(q,b); if (F==nullptr) return true; std::ostringstream oss; F->write(oss); std::cout.fill('.'); std::cout<<"Checking "; std::cout.width(40); std::cout< (*F,r,m,n); ok&= launch_test (*F,r,m,n); ok&= launch_test (*F,r,m,n); ok&= launch_test (*F,r,m,n); #if 0 /* may be bogus */ ok&= launch_test_append (*F,r,m,n); ok&= launch_test_append(*F,r,m,n); ok&= launch_test_append (*F,r,m,n); ok&= launch_test_append (*F,r,m,n); #endif nbit--; if ( !ok ) //std::cout << "\033[1;31mFAILED\033[0m "< std::min (m,n)) r = std::min (m, n); bool ok=true; do{ ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,5,m/6,n/6,r/6,iters); ok&=run_with_field > (q,(b?b:512),m/6,n/6,r/6,iters); } while (loop && ok); return !ok; } fflas-ffpack-2.2.2/tests/test-matrix-io.h000066400000000000000000000024761274716147400202670ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // 
vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Brice Boyer (briceboyer) * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/utils/args-parser.h" fflas-ffpack-2.2.2/tests/test-maxdelayeddim.C000066400000000000000000000073271274716147400211200ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include #include #include "fflas-ffpack/fflas-ffpack.h" #include #include template bool test (Givaro::Integer p, size_t kmax){ Field F(p); FFLAS::MMHelper MMH(F, 0); size_t k = MMH.MaxDelayedDim(0); if (kmax!=k) F.write(std::cerr)<<": expected: "< >(17,35184372088831); // ok &= test >(65521,2098176); // ok &= test >(67108859,2); // // kmax = floor(2^53 / ((p-1)/2)^2) // ok &= test >(17,140737488355327); // ok &= test >(65521,8392705); // ok &= test >(67108859,8); // // kmax = floor(2^24 / (p-1)^2) // ok &= test > (17,65535); // ok &= test > (2039,4); // // kmax = floor(2^24 / ((p-1)/2)^2) // ok &= test >(17,262143); // ok &= test > (2039,16); // // kmax = floor(2^53 / (p-1)^2) // ok &= test >(17,36028797018963967); // ok &= test >(65521,2148532608); // ok &= test >(1147482977,7); // // kmax = floor(2^53 / ((p-1)/2)^2) // ok &= test >(17,144115188075855871); // ok &= test >(65521,8594130432); // ok &= test >(1147482977,28); // // kmax = floor(2^31 / (p-1)^2) // ok &= test >(17,8388607); // ok &= test >(24571,3); // // kmax = floor(2^31 / ((p-1)/2)^2) // ok &= test >(17,33554431); // ok &= test >(24571,14); // // kmax = maxsize_t // ok &= test >(17, std::numeric_limits::max()); // ok &= test >(Givaro::Integer("46768052394588893382517914646921056628989841375373"),std::numeric_limits::max()); // // kmax = maxsize_t ok &= test > >(17, std::numeric_limits::max()); ok &= test > >(Givaro::Integer("166153499473114484112975882535042793"),2097152); return !ok; } 
fflas-ffpack-2.2.2/tests/test-multifile1.C000066400000000000000000000002351274716147400203530ustar00rootroot00000000000000#include "fflas-ffpack/fflas-ffpack.h" // See test-multifile2.C - it is a test // to confirm that the lib is *really* header-only and full inline. fflas-ffpack-2.2.2/tests/test-multifile2.C000066400000000000000000000001521274716147400203520ustar00rootroot00000000000000#include "fflas-ffpack/fflas-ffpack.h" int main(void) { // If it compiles, it is OK. return 0; } fflas-ffpack-2.2.2/tests/test-nullspace.C000066400000000000000000000074201274716147400202710ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for nullspace // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- //#define DEBUG 1 #define TIME 1 using namespace std; #include #include #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef ModularBalanced Field; int main(int argc, char** argv){ int n,m; int nbit=atoi(argv[3]); // number of times the product is performed cerr< <" <(NSdim*n); FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m, NSdim, n, 1.0, Ab, n, NS, ldn, 0.0, C, NSdim); // FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, NSdim, n, m, // 1.0, NS, ldn, Ab, n, 0.0, C, n); bool wrong = false; for (int i=0;is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) the FFLAS-FFPACK group * Written by Ziad Sultan * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //#define DEBUG 1 //#define __FFLASFFPACK_FORCE_SEQ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include #include #include #include #include #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" #include "test-utils.h" #include "fflas-ffpack/utils/Matio.h" typedef Givaro::ModularBalanced Field; template bool tmain(int argc, char** argv, std::string printStrat) { std::cerr << "tmain: " << printStrat << std::endl; size_t n = 2000; bool p = true; size_t iters = 3; int64_t q = 131071 ; bool dataPar = true; int proc = MAX_THREADS; int strat = 1; Argument as[] = { { 'n', "-n N", "Set the dimension of the matrix.", TYPE_INT , &n }, { 'i', "-i N", "Set number of repetitions.", TYPE_INT , &iters }, { 't', "-t N", "Set number of processors.", TYPE_INT , &proc }, { 's', "-s N", "Set the strategy parameter using t: 1 for (t, BLOCK, THREADS), 2 for (t, BLOCK, GRAIN), 3 for (t, BLOCK, FIXED), 4 for (t, ROW, THREADS), 5 for (t, ROW, GRAIN), 6 for (t, ROW, FIXED), 7 for (t, COLUMN, THREADS), 8 for (t, COLUMN, GRAIN), 9 for (t, COLUMN, FIXED), 10 for SINGLE strategy.", TYPE_INT , &strat }, { 'p', "-p Y/N", "run the parallel program using Parallel(Y)/Sequential(N).", TYPE_BOOL , &p }, { 'd', "-d Y/N", "run the parallel program using data parallelism(Y)/task parallelism(N).", TYPE_BOOL , &dataPar }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); size_t m = n; // matrices are square in this test Field F(q); Field::RandIter G(F); // Allocate matrices typename Field::Element_ptr A = FFLAS::fflas_new (F, m, n); typename Field::Element_ptr B = FFLAS::fflas_new (F, m, n); typename Field::Element_ptr C = FFLAS::fflas_new (F, m, n); typename Field::Element_ptr Acop = FFLAS::fflas_new (F, m, n); auto CUTTER = SPLITTER(proc, CutStrat, StratParam); // initialize if(dataPar){ PARFOR1D(i, m, CUTTER, for (size_t j=0; j<(size_t)n; ++j) G.random (*(A+i*n+j)); ); PARFOR1D(i, m, CUTTER, for (size_t j=0; 
j<(size_t)n; ++j) G.random (*(B+i*n+j)); ); PARFOR1D(i, m, CUTTER, for (size_t j=0; j<(size_t)n; ++j) G.random (*(C+i*n+j)); ); } else{ // initialize with tasks using FORBLOCK1D PAR_BLOCK{ SYNCH_GROUP( FORBLOCK1D(itt, m*n, CUTTER, TASK(MODE(WRITE(A)), for(size_t i=itt.begin(); i!=itt.end(); ++i) G.random (*(A+i));); TASK(MODE(WRITE(B)), for(size_t i=itt.begin(); i!=itt.end(); ++i) G.random (*(B+i));); TASK(MODE(WRITE(C)), for(size_t i=itt.begin(); i!=itt.end(); ++i) G.random (*(C+i));); );// end of FORBLOCK1D );// end of SYNCH_GROUP }// end of PAR_BLOCK } // copy A for verification FFLAS::fassign(F,m,n,A,n,Acop,n); // time FFLAS::Timer chrono; double *time=new double[iters]; // parallel add using PARFOR1D for (size_t it=0;it<=iters;++it){ chrono.clear(); if (it) chrono.start(); if(dataPar){ PARFOR1D(i, m, CUTTER, for (size_t j=0; j<(size_t)n; ++j) A[i*n+j] = B[i*n+j] + C[i*n+j]; ); } else{ PAR_BLOCK{ FORBLOCK1D(itt, m*n, CUTTER, TASK(MODE(READ(B,C) WRITE(A)), for(size_t i=itt.begin(); i!=itt.end(); ++i) A[i] = B[i] + C[i]; ); ); } } if (it) {chrono.stop(); time[it-1]=chrono.realtime();} } std::sort(time, time+iters); double meantime = time[iters/2]; delete[] time; // sequential add chrono.clear(); chrono.start(); for(size_t i=0; i(argc,argv,std::string("FFLAS::BLOCK, FFLAS::THREADS")); case 2: fail |= tmain(argc,argv,std::string("FFLAS::BLOCK, FFLAS::GRAIN")); case 3: fail |= tmain(argc,argv,std::string("FFLAS::BLOCK, FFLAS::FIXED")); case 4: fail |= tmain(argc,argv,std::string("FFLAS::ROW, FFLAS::THREADS")); case 5: fail |= tmain(argc,argv,std::string("FFLAS::ROW, FFLAS::GRAIN")); case 6: fail |= tmain(argc,argv,std::string("FFLAS::ROW, FFLAS::FIXED")); case 7: fail |= tmain(argc,argv,std::string("FFLAS::COLUMN, FFLAS::THREADS")); case 8: fail |= tmain(argc,argv,std::string("FFLAS::COLUMN, FFLAS::GRAIN")); case 9: fail |= tmain(argc,argv,std::string("FFLAS::COLUMN, FFLAS::FIXED")); case 10: fail |= tmain(argc,argv,std::string("FFLAS::SINGLE, FFLAS::THREADS")); } 
return fail; } fflas-ffpack-2.2.2/tests/test-paladin-task.C000066400000000000000000000077061274716147400206620ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ // vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) the FFLAS-FFPACK group * Written by Ziad Sultan * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ #undef __FFLASFFPACK_USE_OPENMP #define __FFLASFFPACK_USE_TBB #include #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/utils/args-parser.h" size_t add(const size_t x, const size_t y) { return x+y; } size_t seq_fib(size_t n) { if (n < 2) return n; else return seq_fib(n-1) + seq_fib(n-2); } size_t par_fib(const size_t n, const size_t cutoff) { size_t x=0, y=0, z=0; // if (n < 2) // return n; if (n < cutoff) // The bigger the cutoff the bigger is the parallel speed-up return seq_fib(n); else{ SYNCH_GROUP( TASK(MODE(READ(n) WRITE(x) CONSTREFERENCE(x)), x = par_fib(n-1, cutoff); ); TASK(MODE(READ(n) WRITE(y) CONSTREFERENCE(y)), y = par_fib(n-2, cutoff); ); CHECK_DEPENDENCIES; TASK(MODE(READ(x,y) WRITE(z) CONSTREFERENCE(z,x,y)), z=add(x,y); ); );//end SYNCH_GROUP return z; } } int main(int argc, char** argv) { size_t n = 20; bool p = true; size_t iters = 3; size_t cutoff = 2; // int64_t q = 131071 ; // int proc = MAX_THREADS; Argument as[] = { { 'n', "-n N", "Set the nth number of fibonacci to compute", TYPE_INT , &n }, { 'c', "-c N", "Set the Cutoff at which the sequential base case is called (the bigger the cuttof is the better is the parallel speed-up)", TYPE_INT , &cutoff }, { 'i', "-i N", "Set number of repetitions.", TYPE_INT , &iters }, { 'p', "-p Y/N", "run the parallel program using Parallel(Y)/Sequential(N).", TYPE_BOOL , &p }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); // { 't', "-t N", "Set number of processors.", TYPE_INT , &proc }, size_t f=0; // time FFLAS::Timer chrono; double *time=new double[iters]; // parallel add using PARFOR1D for (size_t it=0;it<=iters;++it){ chrono.clear(); if (it) chrono.start(); if(p){ PAR_BLOCK{ f=par_fib(n, cutoff); }// end of PAR_BLOCK } else f=seq_fib(n); if (it) {chrono.stop(); time[it-1]=chrono.realtime();} } std::sort(time, time+iters); double meantime = time[iters/2]; delete[] time; // sequential add chrono.clear(); chrono.start(); size_t l=seq_fib(n); 
chrono.stop(); double timeseq = chrono.realtime(); // verification of the parallel result if (f!=l) std::cout<<"FAIL: Par_Fib("<s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) the FFLAS-FFPACK group * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include "fflas-ffpack/fflas-ffpack-config.h" #include #include Givaro::Timer tperm, tgemm, tBC, ttrsm,trest,timtot; #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/Matio.h" using namespace std; using namespace FFLAS; using namespace FFPACK; using Givaro::Modular; bool checkMonotonicApplyP(FFLAS_SIDE Side, FFLAS_TRANSPOSE trans, size_t * P, size_t N, size_t R){ bool ok = true; typedef Modular Field; Field F(101); size_t M = 2; size_t lda = (Side == FflasLeft)? M : N; size_t ldb = lda; Field::Element_ptr A = fflas_new(F, M, N); Field::Element_ptr B = fflas_new(F, M, N); if (Side == FflasLeft) for (size_t i = 0; i= A[(i+1)*lda]){ std::cerr<<"ERROR: A["<= "<s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Ziad Sultan * This file is Free Software and part of FFLAS-FFPACK. 
* * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // DSL test for pfgemm // //-------------------------------------------------------------------------- // Ziad Sultan //------------------------------------------------------------------------- /* #ifndef DEBUG #define DEBUG 0 #endif */ #define NEWWINO #ifndef TIME #define TIME 1 #endif #define DEBUG 1 #include #include using namespace std; #define __FFLASFFPACK_USE_OPENMP //#define __FFLASFFPACK_USE_KAAPI //#define __FFLASFFPACK_FORCE_SEQ #include "fflas-ffpack/field/modular-positive.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/fflas/fflas.h" #include "time.h" /* #ifdef __FFLASFFPACK_USE_KAAPI #include #endif #ifdef __FFLASFFPACK_USE_OPENMP #include #endif */ using namespace FFPACK; typedef Givaro::Modular Field; //typedef Givaro::Modular Field; //typedef ModularBalanced Field; //typedef ModularBalanced Field; //typedef Givaro::Modular Field; BEGIN_PARALLEL_MAIN(int argc, char** argv) { if (argc != 8) { cerr<<"Testing pfgemm with : test-fgemm-DSL

    " <::value, FFLAS::ParSeqHelper::Parallel> pWH (F, nbw,FFLAS::ParSeqHelper::Parallel(MAX_THREADS,meth,strat)); for(int i = 0;i(m*n); clock_gettime(CLOCK_REALTIME, &t0); PAR_INSTR{ FFLAS::fgemm(F, ta, tb,m,n,k,alpha, A,lda, B,ldb, beta,C,n, pWH); } BARRIER; clock_gettime(CLOCK_REALTIME, &t1); delay = (double)(t1.tv_sec-t0.tv_sec)+(double)(t1.tv_nsec-t0.tv_nsec)/1000000000; if (i) t_total+=delay; } avrg = t_total/(nbit-1); #if TIME double mflops = (2.0*(m*k-((!F.isZero(beta))?m:0))/1000000.0)*n/avrg; cerr<(m*n); for (int i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by Ashley Lesdalons * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ //-------------------------------------------------------------------------- // Test for Checker_PLUQ //-------------------------------------------------------------------------- #define ENABLE_ALL_CHECKINGS 1 #include #include #include #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/utils/args-parser.h" int main(int argc, char** argv) { size_t iter = 3 ; Givaro::Integer q = 131071; size_t MAXM = 1000; size_t MAXN = 1000; size_t m=0,n=0; size_t seed(0); bool random_dim = false; Argument as[] = { { 'q', "-q Q", "Set the field characteristic (-1 for random).", TYPE_INTEGER , &q }, { 'm', "-m M", "Set the row dimension of A.", TYPE_INT , &m }, { 'n', "-n N", "Set the col dimension of A.", TYPE_INT , &n }, { 'i', "-i R", "Set number of repetitions.", TYPE_INT , &iter }, { 's', "-s N", "Set the seed.", TYPE_INT , &seed }, END_OF_ARGUMENTS }; FFLAS::parseArguments(argc,argv,as); if (m == 0 || n == 0) random_dim = true; srandom ( seed?seed:time(NULL) ); typedef Givaro::Modular Field; Field F(q); Field::RandIter Rand(F,0,seed); srandom(seed); size_t pass = 0; // number of tests that have successfully passed for(size_t it=0; it(m); size_t *Q = FFLAS::fflas_new(n); // generate a random matrix A PAR_BLOCK { FFLAS::pfrand(F,Rand, m,n,A,m/MAX_THREADS); } // FFPACK::Checker_PLUQ checker (RValue,m,n,A,n); // size_t R = FFPACK::PLUQ(F, FFLAS::FflasNonUnit, m, n, A, n, P, Q); FFPACK::PLUQ(F, FFLAS::FflasNonUnit, m, n, A, n, P, Q); try { // checker.check(A,n,R,P,Q); std::cout << m << 'x' << n << " pluq verification successful\n"; pass++; } catch(FailurePLUQCheck &e) { std::cout << m << 'x' << n << " pluq verification failed!\n"; } FFLAS::fflas_delete(A,P,Q); } std::cout << pass << "/" << iter << " tests were successful.\n"; 
return 0; } fflas-ffpack-2.2.2/tests/test-pluq.C000066400000000000000000000164051274716147400172670ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ //-------------------------------------------------------------------------- // Test for the lqup factorisation //-------------------------------------------------------------------------- // usage: test-lqup p A n, for n lqup factorization // of A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 0 #define __FFLAS__TRSM_READONLY // Debug option 0: no debug // 1: check A = LQUP //------------------------------------------------------------------------- #define ENABLE_ALL_CHECKINGS 1 #define __FFPACK_LUDIVINE_CUTOFF 60 #include #include #include #include "fflas-ffpack/utils/Matio.h" #include "fflas-ffpack/utils/timer.h" #include 
"givaro/modular-integer.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "test-utils.h" using namespace std; using namespace FFPACK; typedef Givaro::Modular Field; int main(int argc, char** argv){ //cerr< "<(maxP); size_t *Q = FFLAS::fflas_new(maxQ); //write_field (F,cerr<<"A = "<(R); CRP = FFLAS::fflas_new(R); // RankProfilesFromPLUQ(RRP, CRP, P, Q, m, n, R); } // cerr<<"Row Rank Profile = "; // for (size_t i=0;i(m*n); Field::Element * L, *U; L = FFLAS::fflas_new(m*R); U = FFLAS::fflas_new(R*n); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); for (size_t i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2015 the FFLAS-FFPACK group * Written by * * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== * */ /* ******************************************************* Parallel PLUQ quad recurisve with OpenMP ******************************************************* g++ -D__FFLASFFPACK_HAVE_CBLAS -Wall -g -fopenmp -O3 -march=native -mavx -I/home/sultan/soft/fflas-ffpack/ -I/usr/local/soft/givaro-3.7.1/include test-ppluq.C -L/home/pernet/Logiciels/ATLAS_1TH/lib -lcblas -latlas -L/usr/local/soft/givaro-3.7.1/lib -lgivaro -lm -lrt -Wl,-rpath -Wl,/usr/local/soft/givaro-3.7.1/lib -o test-ppluq */ #include #include #include #include //#include "omp.h" #define __FFLASFFPACK_USE_OPENMP #define __FFLAS__TRSM_READONLY #define __PFTRSM_FOR_PLUQ #include "fflas-ffpack/utils/Matio.h" #include //#include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/fflas-ffpack.h" #include "fflas-ffpack/fflas/fflas.h" #include "sys/time.h" //#define BASECASE_K 256 //#include "fflas-ffpack/ffpack/parallel.h" using namespace std; using namespace FFLAS; using namespace FFPACK; #ifndef MODULO #define MODULO 1 #endif #if(MODULO==1) typedef Givaro::Modular Field; #else typedef Givaro::ZRing Field; #endif #ifndef DEBUG #define DEBUG 1 #endif #ifndef SEQ #define SEQ 1 #endif void verification_PLUQ(const Field & F, typename Field::Element * B, typename Field::Element * A, size_t * P, size_t * Q, size_t m, size_t n, size_t R) { Field::Element * X = FFLAS::fflas_new(m*n); Field::Element * L, *U; L = FFLAS::fflas_new(m*R); U = FFLAS::fflas_new(R*n); ParSeqHelper::Parallel H; PARFOR1D (i,m*R, H, F.init(L[i], 0.0); ); PARFOR1D (i,m*R, H, F.init(U[i], 0.0); ); PARFOR1D (i,m*n, H, F.init(X[i], 0.0); ); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); PARFOR1D (i,R, H, for (size_t j=0; j 6){ std::cerr<<"usage : 
PLUQ-rec-omp

    "<

    "<1 ? atoi( argv[1] ) : 1009); m = (argc>2 ? atoi( argv[2] ) : 1024); n = (argc>3 ? atoi( argv[3] ) : 1024); // r = atoi( argv[4] ); nbf = (argc>4 ? atoi( argv[4] ) : 1); // size_t lda = n; // random seed // ifstream f("/dev/urandom"); // size_t seed1, seed2, seed3,seed4; // f.read(reinterpret_cast(&seed1), sizeof(seed1)); // f.read(reinterpret_cast(&seed2), sizeof(seed2)); // f.read(reinterpret_cast(&seed3), sizeof(seed3)); // f.read(reinterpret_cast(&seed4), sizeof(seed4)); // seed1=10;seed2=12; // seed3=13;seed4=14; enum FFLAS::FFLAS_DIAG diag = FFLAS::FflasNonUnit; size_t R; const Field F((double)p); // Field::RandIter G(F, seed1); Field::Element alpha, beta; F.init(alpha,1.0); F.init(beta,0.0); // Field::Element * U = FFLAS::fflas_new(n*n); ParSeqHelper::Parallel H; typename Field::Element* Acop; if (argc > 5) { Acop = read_field(F,argv[5],&m,&n); } else { Field::RandIter G(F); Acop = FFLAS::fflas_new(m*n); PARFOR1D(i,(size_t)m, H, for (size_t j=0; j<(size_t)n; ++j) G.random (*(Acop+i*n+j)); ); } // FFLAS::fflas_new(n*m); Field::Element* A = FFLAS::fflas_new(n*m); #if(DEBUG==1) Field::Element* Adebug = FFLAS::fflas_new(n*m); #endif // std::vector Index_P(r); // U = construct_U(F,G, n, r, Index_P, seed4, seed3); // A = construct_L(F,G, m, r, Index_P, seed2); // M_randgen(F, A, U, r, m, n); // size_t taille=m*n; // for(size_t i=0; i(maxP); size_t *Q = FFLAS::fflas_new(maxQ); PARFOR1D(i, (size_t)m, H, for (size_t j=0; j<(size_t)n; ++j) { *(A+i*n+j) = *(Acop+i*n+j) ; #if(DEBUG==1) *(Adebug+i*n+j) = *(Acop+i*n+j) ; #endif } ); for ( int i=0;i(maxP); size_t * QQ = FFLAS::fflas_new(maxQ); for (size_t j=0;js,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for rank // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- #include #include #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace std; using namespace FFPACK; typedef ModularBalanced Field; int main(int argc, char** argv){ int n,m; int nbit=atoi(argv[3]); // number of times the product is performed cerr< <" <s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for the computations of rank profiles //-------------------------------------------------------------------------- #define __FFLASFFPACK_SEQUENTIAL #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/args-parser.h" #include #include #include #include "test-utils.h" #include "Matio.h" using namespace FFPACK; template bool run_with_field(Givaro::Integer q, uint64_t b, size_t m, size_t n, size_t r, size_t iters){ bool ok = true ; int nbit=(int)iters; while (ok && nbit){ // choose Field Field* F= chooseField(q,b); if (F==nullptr) return true; std::ostringstream oss; F->write(oss); std::cout.fill('.'); std::cout<<"Checking "; std::cout.width(40); std::cout<(m); size_t * Q = FFLAS::fflas_new(n); FFLAS::fassign (*F, m, n, B, lda, A, lda); PLUQ(*F, FFLAS::FflasNonUnit, m, n, A, lda, P, Q); for (size_t i=0; i<1;i++){ size_t mm = 1 + (rand() % m); size_t nn = 1 + (rand() % n); FFLAS::fassign (*F, m, n, B, lda, A, lda); size_t rr = FFPACK::ColumnRankProfile (*F, mm, nn, A, lda, RP1, FFPACK::FfpackSlabRecursive); FFLAS::fassign (*F, m, n, B, lda, A, lda); FFPACK::RowRankProfile (*F, mm, nn, A, lda, RP2, FFPACK::FfpackSlabRecursive); size_t* RRP = FFLAS::fflas_new(r); size_t* CRP = FFLAS::fflas_new(r); LeadingSubmatrixRankProfiles (m,n,r,mm,nn,P,Q,RRP,CRP); for (size_t ii=0; ii(r); size_t* CRP = FFLAS::fflas_new(r); size_t* RRPLUD, * RRPPLUQ, 
*CRPLUD, *CRPPLUQ; RandomRankProfile (m, r, RRP); RandomRankProfile (n, r, CRP); RandomMatrixWithRankandRPM(*F,A,lda,r,m,n, RRP, CRP); FFLAS::fassign (*F, m, n, A, lda, B, lda); size_t cs = FFPACK::ColumnRankProfile (*F, m, n, A, lda, CRPLUD, FFPACK::FfpackSlabRecursive); FFLAS::fassign (*F, m, n, B, lda, A, lda); size_t ct = FFPACK::ColumnRankProfile (*F, m, n, A, lda, CRPPLUQ, FFPACK::FfpackTileRecursive); FFLAS::fassign (*F, m, n, B, lda, A, lda); size_t rs = FFPACK::RowRankProfile (*F, m, n, A, lda, RRPLUD, FFPACK::FfpackSlabRecursive); FFLAS::fassign (*F, m, n, B, lda, A, lda); size_t rt = FFPACK::RowRankProfile (*F, m, n, A, lda, RRPPLUQ, FFPACK::FfpackTileRecursive); // write_perm (std::cout<<"RRP = ", RRP, r); // write_perm (std::cout<<"CRP = ", CRP, r); std::sort(CRP,CRP+r); std::sort(RRP,RRP+r); ok &= (cs==ct)&(cs==rs)&(cs==rt)&(cs==r); for (size_t i=0; i std::min (m,n)) r = std::min (m, n); bool ok=true; do{ ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field > (q,b,m,n,r,iters); ok&=run_with_field >(q,(b?b:128),m/4+1,n/4+1,r/4+1,iters); } while (loop && ok); return !ok; } fflas-ffpack-2.2.2/tests/test-redcolechelon.C000066400000000000000000000126721274716147400211160ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for the reduced column echelon factorisation //-------------------------------------------------------------------------- // usage: test-redcolechelon p A n, for n reduced column echelon computations // of A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 1 // Debug option 0: no debug // 1: check A = LQUP //------------------------------------------------------------------------- using namespace std; //#define __LUDIVINE_CUTOFF 1 #include #include #include "Matio.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef Givaro::Modular Field; int main(int argc, char** argv){ //cerr< "<(n); size_t *Q = FFLAS::fflas_new(m); // size_t cutoff = atoi(argv[3]); nbf = atoi(argv[3]); FFLAS::Timer tim,timc; timc.clear(); for ( i=0;i(m*n); Field::Element * U = FFLAS::fflas_new(n*n); Field::Element * X = FFLAS::fflas_new(m*n); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); for (int i=0; 
is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for the reduced echelon factorisation //-------------------------------------------------------------------------- // usage: test-redechelon p A n, for n reduced echelon computations // of A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 1 // Debug option 0: no debug // 1: check A = LQUP //------------------------------------------------------------------------- using namespace std; //#define __LUDIVINE_CUTOFF 1 #include #include #include "Matio.h" #include "fflas-ffpack/utils/timer.h" #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef Givaro::Modular Field; int main(int argc, char** argv){ //cerr< "<(n); size_t *Q = FFLAS::fflas_new(m); // size_t cutoff = atoi(argv[3]); nbf = atoi(argv[3]); FFLAS::Timer tim,timc; 
timc.clear(); for ( i=0;i(m*n); Field::Element * U = FFLAS::fflas_new(n*n); Field::Element * X = FFLAS::fflas_new(m*n); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); for (int i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. 
*/ //-------------------------------------------------------------------------- // Test for the reduced row echelon factorisation //-------------------------------------------------------------------------- // usage: test-redrowechelon p A n, for n reduced row echelon computations // of A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 1 // Debug option 0: no debug // 1: check A = LQUP //------------------------------------------------------------------------- using namespace std; //#define __LUDIVINE_CUTOFF 1 #include #include #include "Matio.h" #include "fflas-ffpack/utils/timer.h" //#include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/field/modular-positive.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; typedef Givaro::Modular Field; int main(int argc, char** argv){ //cerr< "<(n); size_t *Q = FFLAS::fflas_new(m); // size_t cutoff = atoi(argv[3]); nbf = atoi(argv[3]); FFLAS::Timer tim,timc; timc.clear(); for ( i=0;i(m*m); Field::Element * U = FFLAS::fflas_new(m*n); Field::Element * X = FFLAS::fflas_new(m*n); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); for (int i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ //-------------------------------------------------------------------------- // Test for the row echelon factorisation //-------------------------------------------------------------------------- // usage: test-row p A n, for n computations // of A over Z/pZ //------------------------------------------------------------------------- //------------------------------------------------------------------------- //#define DEBUG 1 // Debug option 0: no debug // 1: check A = LQUP //------------------------------------------------------------------------- using namespace std; //#define __LUDIVINE_CUTOFF 1 #include #include #include "Matio.h" #include "fflas-ffpack/utils/timer.h" //#include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/field/modular-positive.h" #include "fflas-ffpack/ffpack/ffpack.h" using namespace FFPACK; // typedef Givaro::Modular Field; typedef Givaro::Modular Field; // typedef Givaro::Modular Field; int main(int argc, char** argv){ cerr< "<(m); size_t *Q = FFLAS::fflas_new(n); // size_t cutoff = atoi(argv[3]); nbf = atoi(argv[3]); FFLAS::Timer tim,timc; timc.clear(); for ( i=0;i(m*m); Field::Element * U = FFLAS::fflas_new(m*n); Field::Element * X = FFLAS::fflas_new(m*n); Field::Element zero,one; F.init(zero,0.0); F.init(one,1.0); for (int i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) 2014 FFLAS-FFPACK * Written by : * Bastien Vialla * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. 
* * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include "givaro/givinteger.h" #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/fflas/fflas_simd.h" #include "fflas-ffpack/utils/args-parser.h" #include "fflas-ffpack/utils/align-allocator.h" #include #include #include #include #include #include #include #include #include #include typedef Givaro::Integer integer; /********************************************************************************** * * Random generators * ***********************************************************************************/ template typename std::enable_if::value>::type generate_random (std::vector &a, std::mt19937 &generator) { std::uniform_int_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); std::generate(a.begin(), a.end(), [&](){return dist(generator);}); } template typename std::enable_if::value>::type generate_random (std::vector &a, std::mt19937 &generator) { std::uniform_real_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); std::generate(a.begin(), a.end(), [&](){return dist(generator);}); } /********************************************************************************** * * Function Traits * 
***********************************************************************************/ template struct function_traits; // function pointer template struct function_traits : public function_traits {}; template struct function_traits { using return_type = R; static constexpr std::size_t arity = sizeof...(Args); template struct argument { static_assert(N < arity, "error: invalid parameter index."); using type = typename std::tuple_element >::type; }; }; // member function pointer template struct function_traits : public function_traits {}; // const member function pointer template struct function_traits : public function_traits {}; // member object pointer template struct function_traits : public function_traits {}; template void print_arity (SimdFunc f) { std::cout << "Arity of function is " << (function_traits::arity) << std::endl; } /**************************************************************************************/ template inline typename std::enable_if< (function_traits::arity == 0) && !(std::is_same::return_type, void>::value) , bool>::type test_op(SimdFunc && fsimd, ScalFunc && fscal, size_t seed, size_t vectorSize, Element max, std::string name){ using vect_t = typename simd::vect_t; std::vector> c1(vectorSize), c2(vectorSize); std::transform(c1.begin(), c1.end(), c1.begin(), fscal); vect_t vc2; for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){ c2 = fsimd(); simd::store(c2.data()+i, c2); } bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;}); if(!res) { std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << " on " << (sizeof(Element) * 8) << "bits." 
<< std::endl; std::copy(c1.begin(), c1.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl; std::copy(c2.begin(), c2.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl ; } return res; } template inline typename std::enable_if< (function_traits::arity == 1) && !(std::is_same::return_type, void>::value) , bool>::type test_op(SimdFunc fsimd, ScalFunc fscal, size_t seed, size_t vectorSize, Element max, std::string name){ using vect_t = typename simd::vect_t; std::mt19937 generator(seed); std::vector> a1(vectorSize), c1(vectorSize), a2(vectorSize), c2(vectorSize), c3(vectorSize); generate_random(a1, generator); a2 = a1; std::transform(a1.begin(), a1.end(), c1.begin(), fscal); vect_t va2, vc2, vc3; for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){ va2 = simd::load(a2.data()+i); vc3 = simd::load(c1.data()+i); vc2 = fsimd(va2); vc3 = simd::sub(vc3,vc2); simd::store(c2.data()+i, vc2); simd::store(c3.data()+i, vc3); } bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;}); if(!res) { std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << " on " << (sizeof(Element) * 8) << "bits." 
<< std::endl; std::cout << "a2: "; std::copy(a2.begin(), a2.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl; std::cout << "c1: "; std::copy(c1.begin(), c1.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl; std::cout << "c2: "; std::copy(c2.begin(), c2.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl << std::endl; std::cout << "c1-c2: "; std::copy(c3.begin(), c3.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl << std::endl; } return res; } template inline typename std::enable_if< (function_traits::arity == 2) && !(std::is_same::return_type, void>::value) , bool>::type test_op(SimdFunc fsimd, ScalFunc fscal, size_t seed, size_t vectorSize, Element max, std::string name){ using vect_t = typename simd::vect_t; std::mt19937 generator(seed); std::vector> a1(vectorSize), b1(vectorSize), c1(vectorSize), a2(vectorSize), b2(vectorSize), c2(vectorSize), c3(vectorSize); generate_random(a1, generator); generate_random(b1, generator); a2 = a1; b2 = b1; std::transform(a1.begin(), a1.end(), b1.begin(), c1.begin(), fscal); vect_t va2, vb2, vc2, vc3; for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){ va2 = simd::load(a2.data()+i); vb2 = simd::load(b2.data()+i); vc3 = simd::load(c1.data()+i); vc2 = fsimd(va2, vb2); vc3 = simd::sub(vc3,vc2); simd::store(c2.data()+i, vc2); simd::store(c3.data()+i, vc3); } bool res = std::equal(c1.begin(), c1.end(), c2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;}); if(!res) { std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << " on " << (sizeof(Element) * 8) << "bits." 
<< std::endl; std::cout << "a2: "; std::copy(a2.begin(), a2.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl; std::cout << "b2: "; std::copy(b2.begin(), b2.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl; std::cout << "c1: "; std::copy(c1.begin(), c1.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl; std::cout << "c2: "; std::copy(c2.begin(), c2.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl << std::endl; std::cout << "c1-c2: "; std::copy(c3.begin(), c3.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl << std::endl; } return res; } template inline typename std::enable_if< (function_traits::arity == 3) && !(std::is_same::return_type, void>::value) , bool>::type test_op(SimdFunc fsimd, ScalFunc fscal, size_t seed, size_t vectorSize, Element max, std::string name){ using vect_t = typename simd::vect_t; std::mt19937 generator(seed); std::vector> a1(vectorSize), b1(vectorSize), c1(vectorSize), d1(vectorSize), a2(vectorSize), b2(vectorSize), c2(vectorSize), d2(vectorSize); generate_random(a1, generator); generate_random(b1, generator); generate_random(c1, generator); a2 = a1; b2 = b1; c2 = c1; for(size_t i = 0 ; i < vectorSize ; ++i){ d1[i] = fscal(c1[i], a1[i], b1[i]); } vect_t va2, vb2, vc2; for(size_t i = 0 ; i < vectorSize ; i+=simd::vect_size){ va2 = simd::load(a2.data()+i); vb2 = simd::load(b2.data()+i); vc2 = simd::load(c2.data()+i); simd::store(d2.data()+i, fsimd(vc2, va2, vb2)); } bool res = std::equal(d1.begin(), d1.end(), d2.begin(), [](Element x1, Element x2){return (std::isnan(x1) && std::isnan(x2)) || x1 == x2;}); if(!res) { std::cout << "Error Simd" << sizeof(typename simd::scalar_t)*simd::vect_size*8 << "::" << name << " on " << (sizeof(Element) * 8) << "bits." 
<< std::endl; std::transform(d1.begin(), d1.end(), d2.begin(), d2.begin(), [](Element x1, Element x2){return x1-x2;}); //std::copy(d1.begin(), d1.end(), std::ostream_iterator(std::cout, " ")); //std::cout << std::endl; std::copy(d2.begin(), d2.end(), std::ostream_iterator(std::cout, " ")); std::cout << std::endl; } return res; } template bool test_float_impl(size_t seed, size_t vectorSize, Element max){ bool btest = true; btest &= test_op(simd::ceil, [](Element x){return std::ceil(x);}, seed, vectorSize, max, "ceil"); btest &= test_op(simd::floor, [](Element x){return std::floor(x);}, seed, vectorSize, max,"floor"); btest &= test_op(simd::round, [](Element x){return std::round(x);}, seed, vectorSize, max, "round"); btest &= test_op(simd::add, [](Element x1, Element x2){return x1+x2;}, seed, vectorSize, max, "add"); btest &= test_op(simd::sub, [](Element x1, Element x2){return x1-x2;}, seed, vectorSize, max, "sub"); btest &= test_op(simd::mul, [](Element x1, Element x2){return x1*x2;}, seed, vectorSize, max, "mul"); btest &= test_op(simd::fmadd, [](Element x1, Element x2, Element x3){return std::fma(x3,x2,x1);}, seed, vectorSize, max, "fmadd"); btest &= test_op(simd::fmsub, [](Element x1, Element x2, Element x3){return std::fma(x3,x2,-x1);}, seed, vectorSize, max, "fmsub"); btest &= test_op(simd::fnmadd, [](Element x1, Element x2, Element x3){return std::fma(-x3,x2,x1);}, seed, vectorSize, max, "fnmadd"); btest &= test_op(simd::lesser, [](Element x1, Element x2){return (x1(simd::lesser_eq, [](Element x1, Element x2){return (x1<=x2)?NAN:0;}, seed, vectorSize, max, "lesser_eq"); btest &= test_op(simd::greater, [](Element x1, Element x2){return (x1>x2)?NAN:0;}, seed, vectorSize, max, "greater"); btest &= test_op(simd::greater_eq, [](Element x1, Element x2){return (x1>=x2)?NAN:0;}, seed, vectorSize, max, "greater_eq"); btest &= test_op(simd::eq, [](Element x1, Element x2){return (x1==x2)?NAN:0;}, seed, vectorSize, max, "eq"); return btest; } template typename 
simd::vect_t mysra (typename simd::vect_t x1){return simd::sra(x1, int(2));} template bool test_integer_impl(size_t seed, size_t vectorSize, Element max){ bool btest = true; btest &= test_op(simd::add, [](Element x1, Element x2){return x1+x2;}, seed, vectorSize, max, "add"); btest &= test_op(simd::sub, [](Element x1, Element x2){return x1-x2;}, seed, vectorSize, max, "sub"); btest &= test_op(simd::mullo, [](Element x1, Element x2){return x1*x2;}, seed, vectorSize, max, "mullo"); btest &= test_op(simd::mul, [](Element x1, Element x2){return x1*x2;}, seed, vectorSize, max, "mullo"); btest &= test_op(simd::fmadd, [](Element x1, Element x2, Element x3){return x1+x3*x2;}, seed, vectorSize, max, "fmadd"); btest &= test_op(simd::fmsub, [](Element x1, Element x2, Element x3){return -x1+x3*x2;}, seed, vectorSize, max, "fmsub"); btest &= test_op(simd::fnmadd, [](Element x1, Element x2, Element x3){return x1-x3*x2;}, seed, vectorSize, max, "fnmadd"); btest &= test_op(simd::lesser, [](Element x1, Element x2){return (x1(simd::lesser_eq, [](Element x1, Element x2){return (x1<=x2)?-1:0;}, seed, vectorSize, max, "lesser_eq"); btest &= test_op(simd::greater, [](Element x1, Element x2){return (x1>x2)?-1:0;}, seed, vectorSize, max, "greater"); btest &= test_op(simd::greater_eq, [](Element x1, Element x2){return (x1>=x2)?-1:0;}, seed, vectorSize, max, "greater_eq"); btest &= test_op(simd::eq, [](Element x1, Element x2){return (x1==x2)?-1:0;}, seed, vectorSize, max, "eq"); // print_arity(mysra); btest &= test_op(mysra, //std::bind(simd::sra,std::placeholders::_1,int(sizeof(Element)*4)), [](Element x1){ integer h = integer (1) << 2; integer r = integer(x1) / h; r -= ((integer(x1)-h*r) < 0)?1:0; return Element(r); // return Element(std::floor(double(x1)/double(h))); }, seed, vectorSize, max, "sra"); btest &= test_op(simd::mulhi, [](Element x1, Element x2){ integer q,r; integer a = (integer(x1)*integer(x2)); integer b = integer(1) << uint64_t(sizeof(Element)*8); Givaro::IntegerDom Z; 
Z.divmod(q, r, a, b); return Element(q); }, seed, vectorSize, max, "mulhi"); btest &= test_op(simd::mulx, [](Element x1, Element x2){ Element h = Element(1) << (sizeof(Element)*4); /* Representative r of x1 modulo h with -h/2 <= r < h/2*/ if (std::is_signed::value) { x1 = (x1+h/2)%h; x1 += (x1 < 0)?h/2:-h/2; x2 = (x2+h/2)%h; x2 += (x2 < 0)?h/2:-h/2; } else { x1 = x1 % h; x2 = x2 % h; } return x1*x2; }, seed, vectorSize, max, "mulx"); return btest; } template bool test_float(size_t seed, size_t vectorSize, size_t max_){ bool sse = true, avx = true; #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS sse = test_float_impl>(seed, vectorSize, (Element)max_); if(!sse) std::cout << "bug sse" << std::endl; else std::cout << "SSE OK" << std::endl; #endif #ifdef __FFLASFFPACK_HAVE_AVX_INSTRUCTIONS avx = test_float_impl>(seed, vectorSize, (Element)max_); if(!avx) std::cout << "bug avx" << std::endl; else std::cout << "AVX OK" << std::endl; #endif return sse && avx; } template bool test_integer(size_t seed, size_t vectorSize, size_t max_){ bool sse = true, avx = true; #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS sse = test_integer_impl>(seed, vectorSize, (Element)max_); if(!sse) std::cout << "bug sse" << std::endl; else std::cout << "SSE OK" << std::endl; #endif #ifdef __FFLASFFPACK_HAVE_AVX2_INSTRUCTIONS avx = test_integer_impl>(seed, vectorSize, (Element)max_); if(!avx) std::cout << "bug avx" << std::endl; else std::cout << "AVX OK" << std::endl; #endif return sse && avx; } int main(int ac, char **av) { int seed = (int) time(NULL); int vectorSize = 32; int max = 100; int loop = false; static Argument as[] = { { 's', "-s N", "Set the seed .", TYPE_INT , &seed }, { 'l', "-l N", "Set the loop execution .", TYPE_INT , &loop }, END_OF_ARGUMENTS }; FFLAS::parseArguments(ac,av,as); srand(seed); srand48(seed); bool pass = true ; { do{ { pass &= test_float(seed, vectorSize, max); } { pass &= test_float(seed, vectorSize, max); } { pass &= test_integer(seed, vectorSize, max); } { pass &= 
test_integer(seed, vectorSize, max); } { pass &= test_integer(seed, vectorSize, max); } { pass &= test_integer(seed, vectorSize, max); } { pass &= test_integer(seed, vectorSize, max); } { pass &= test_integer(seed, vectorSize, max); } }while(loop); } std::cout << std::boolalpha << pass << std::endl; return (pass?0:1) ; } fflas-ffpack-2.2.2/tests/test-sparse.C000066400000000000000000000303161274716147400176000ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* Copyright (C) 2014 FFLAS-FFPACK * Written by : Bastien Vialla * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== */ #include "fflas-ffpack/fflas/fflas.h" #include "fflas-ffpack/fflas/fflas_sparse.h" #include "fflas-ffpack/utils/args-parser.h" #include "givaro/modular-double.h" #include "givaro/zring.h" #include #include #include #include #include #include #include #include // #include #include using namespace FFLAS; using namespace FFPACK; using namespace std; using namespace Givaro; template T from_string(std::string const & s) { std::stringstream ss(s); T result; ss >> result; // TODO handle errors return result; } template void testEq(PtrT y1, PtrT y2, uint64_t n) { for (uint64_t i = 0; i < n; ++i) { if (y1[i] != y2[i]) { cout << "Error " << i << endl; cout << y1[i] << " != " << y2[i] << endl; break; } } } template void test_spmv(const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, typename Field::Element_ptr x, typename Field::Element_ptr y, typename Field::Element beta) { MatT matrix; sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); fspmv(F, matrix, x, 1, y); sparse_delete(matrix); } template void test_spmv_sell(const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, Sparse &matrix, typename Field::Element_ptr x, typename Field::Element_ptr y, typename Field::Element beta) { sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); fspmv(F, matrix, x, 1, y); auto tmp = fflas_new(F, rowdim, 1); for (size_t i = 0; i < rowdim; ++i) { tmp[i] = y[matrix.perm[i]]; } for (size_t i = 0; i < rowdim; ++i) { y[i] = tmp[i]; } sparse_delete(matrix); fflas_delete(tmp); } template void test_spmm(const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, 
uint64_t nnz, int blockSize, typename Field::Element_ptr x, int ldx, typename Field::Element_ptr y, int ldy, typename Field::Element beta) { MatT matrix; sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); fspmm(F, matrix, blockSize, x, ldx, beta, y, ldy); sparse_delete(matrix); } #if 0 template void test_pspmm(const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, int blockSize, typename Field::Element_ptr x, int ldx, typename Field::Element_ptr y, int ldy, typename Field::Element beta) { MatT matrix; sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); pfspmm(F, matrix, blockSize, x, ldx, beta, y, ldy); sparse_delete(matrix); } template void test_pspmv(const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, typename Field::Element_ptr x, typename Field::Element_ptr y, typename Field::Element beta) { MatT matrix; sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); pfspmv(F, matrix, x, 1, y); sparse_delete(matrix); } template void test_pspmv_sell(const Field &F, IndexT *row, IndexT *col, typename Field::Element_ptr dat, index_t rowdim, index_t coldim, uint64_t nnz, Sparse &matrix, typename Field::Element_ptr x, typename Field::Element_ptr y, typename Field::Element beta) { sparse_init(F, matrix, row, col, dat, rowdim, coldim, nnz); pfspmv(F, matrix, x, 1, y); auto tmp = fflas_new(F, rowdim, 1); for (size_t i = 0; i < rowdim; ++i) { tmp[i] = y[matrix.perm[i]]; } for (size_t i = 0; i < rowdim; ++i) { y[i] = tmp[i]; } sparse_delete(matrix); fflas_delete(tmp); } #endif int main(int argc, char **argv) { using Field = Modular; Field F(101); int nbTests = 25; std::string path; index_t *row = nullptr, *col = nullptr; typename Field::Element_ptr dat; index_t rowdim, coldim; uint64_t nnz; if(argc > 1) path = argv[1]; // path = "data/mat11.sms"; index_t * st = nullptr ; readSmsFormat(path, F, row, col, dat, rowdim, coldim, 
nnz); row = fflas_new(nnz); for (index_t j = 0 ; j < rowdim ; ++j) { for (index_t k = st[j] ; k < st[j+1] ; ++k) row[k] = j ; } auto x = fflas_new(F, coldim, 1, Alignment::CACHE_LINE); auto y = fflas_new(F, rowdim, 1, Alignment::CACHE_LINE); auto y1 = fflas_new(F, rowdim, 1, Alignment::CACHE_LINE); for (size_t i = 0; i < coldim; ++i) { x[i] = 1; } for (size_t i = 0; i < rowdim; ++i) { y[i] = 0; y1[i] = 0; } /************************************************************************************ * * SPMV * *************************************************************************************/ cout << "=== spmv ===" << endl; test_spmv>(F, row, col, dat, rowdim, coldim, nnz, x, y, 1); cout << "CSR: OK" << endl; test_spmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); // for(size_t i = 0 ; i < 10 ; ++i) // { // cout << y[i] << " "; // } // cout << endl; // for(size_t i = 0 ; i < 10 ; ++i) // { // cout << y1[i] << " "; // } // cout << endl; cout << "CSR_ZO: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_spmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "COO: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_spmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "ELL: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_spmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "ELL_simd: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_spmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "CSR_HYB: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_spmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "HYB_ZO: " << ((std::equal(y, y + rowdim, y1)) ? 
"OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } Sparse A; test_spmv_sell(F, row, col, dat, rowdim, coldim, nnz, A, x, y1, 1); cout << "SELL: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } /************************************************************************************ * * pSPMV * *************************************************************************************/ #if 0 cout << "=== pspmv ===" << endl; test_pspmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "CSR: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_pspmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "ELL: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_pspmv>( F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "ELL_simd: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_pspmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "CSR_HYB: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } test_pspmv>(F, row, col, dat, rowdim, coldim, nnz, x, y1, 1); cout << "HYB_ZO: " << ((std::equal(y, y + rowdim, y1)) ? "OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } Sparse A1; test_pspmv_sell(F, row, col, dat, rowdim, coldim, nnz, A1, x, y1, 1); cout << "SELL: " << ((std::equal(y, y + rowdim, y1)) ? 
"OK" : "ERROR") << endl; for (size_t i = 0; i < rowdim; ++i) { y1[i] = 0; } #endif // // test_spmm>(F, row, col, dat, // rowdim, // coldim, nnz, 1, x, 1, y, 1, 1); // // test_pspmm>(F, row, col, dat, // rowdim, // coldim, nnz, 1, x, 1, y, 1, 1); // // test_spmm>(F, row, col, dat, // rowdim, coldim, nnz, 1, x, 1, y, 1, 1); // // test_spmm>(F, row, col, dat, // rowdim, // coldim, nnz, 1, x, 1, y, 1, 1); // // test_spmv>(F, row, col, dat, // rowdim, coldim, nnz, x, y1, 1); // for(size_t i = 0 ; i < 11 ; ++i) // { // cout << y[i] << " "; // } // cout << endl; // for(size_t i = 0 ; i < 11 ; ++i) // { // cout << y1[i] << " "; // } // cout << endl; // auto bb = std::equal(y, y+rowdim, y1); // cout << ((bb) ? "CORRECT" : "ERROR") << endl; // if(!bb) // testEq(y, y1, rowdim); fflas_delete(x); fflas_delete(y); fflas_delete(y1); return 0; } fflas-ffpack-2.2.2/tests/test-utils.h000066400000000000000000000063341274716147400175130ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ // vim:sts=8:sw=8:ts=8:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s /* * Copyright (C) FFLAS-FFPACK * Written by Brice Boyer (briceboyer) * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ /*! @file tests/test-utils.h * @ingroup tests * @brief Utilities to create matrices with prescribed shapes, properties,... * To be used in the tests */ #ifndef __FFLASFFPACK_tests_test_utils_H #define __FFLASFFPACK_tests_test_utils_H #include "fflas-ffpack/fflas-ffpack-config.h" #include "fflas-ffpack/utils/debug.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "fflas-ffpack/utils/fflas_randommatrix.h" #include #include #include #include #include namespace FFPACK { /*! Random integer in range. * @param a min bound * @param b max bound * @return a random integer in [a,b[ */ int RandInt(int a, int b) { int x = a ; x += rand()%(b-a+1); FFLASFFPACK_check(x=a); return x ; } template Givaro::Integer maxFieldElt() {return (Givaro::Integer)Field::maxCardinality();} template<> Givaro::Integer maxFieldElt>() {return (Givaro::Integer)-1;} /*** Field chooser for test according to characteristic q and bitsize b ***/ /* if q=-1 -> field is chosen randomly with a charateristic of b bits if b=0 -> bitsize is chosen randomly according to maxFieldElt */ template Field* chooseField(Givaro::Integer q, uint64_t b){ Givaro::Integer maxV= maxFieldElt(); auto seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); std::mt19937 mt_rand(seed); if (maxV>0 && (q> maxV || b> maxV.bitsize())) return nullptr; if (b<=1){ //srand((double)std::chrono::high_resolution_clock::now()); auto bitrand = std::bind(std::uniform_int_distribution(2,maxV.bitsize()-1), mt_rand); b = bitrand(); } Givaro::IntPrimeDom IPD; Givaro::Integer tmp,p; if (q==-1){ // Choose characteristic as a random prime of b bits do{ Givaro::Integer _p; Givaro::Integer::seeding(Givaro::Integer(mt_rand())); Givaro::Integer::random_exact_2exp(_p,b); IPD.prevprime( 
tmp, _p+1 ); p = tmp; }while( (p < 2) ); } else p=q; return new Field(p); } } // FFPACK #endif fflas-ffpack-2.2.2/tests/testeur_fgemm.C000066400000000000000000000151221274716147400201720ustar00rootroot00000000000000/* -*- mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ //-------------------------------------------------------------------------- // Test for the fgemm winograd // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- //#define NEWWINO #include #include using namespace std; //#include "fflas-ffpack/modular-int.h" //#include "fflas-ffpack/modular-positive.h" #include "fflas-ffpack/field/modular-positive.h" //#include "timer.h" #include "Matio.h" #include "fflas-ffpack/fflas/fflas.h" #include "givaro/givintprime.h" #include "givaro/modular.h" #include "givaro/gfq.h" using namespace FFPACK; using namespace Givaro; //typedef ModularBalanced Field; //typedef ModularBalanced Field; typedef Givaro::Modular Field; //typedef Givaro::Modular Field; //typedef Givaro::Modular Field; //-> bug avec w>=1 (olddynamic pealing) //typedef Givaro::Modular Field; //typedef GFqDom Field; int main(int argc, char** argv){ FFLAS::Timer tim; IntPrimeDom IPD; Field::Element alpha, beta; long p; size_t M, K, N, Wino; bool keepon = true; Integer _p,tmp; cerr< 1 ) TMAXM = atoi(argv[1]); if (argc > 2 ) PRIMESIZE = atoi(argv[2]); if (argc > 3 ) WINOMAX = atoi(argv[3]); if (argc > 4 ) TMAXK = atoi(argv[4]); else TMAXK = TMAXM; if (argc > 5 ) TMAXN = atoi(argv[5]); else TMAXN = TMAXM; enum FFLAS::FFLAS_TRANSPOSE ta, tb; size_t lda,ldb; Field::Element * A; Field::Element * B; Field::Element * C, *Cbis, *Cter; while (keepon){ srandom(_p); do{ // max = Integer::random(2); _p = random();//max % (2<<30); IPD.prevprime( tmp, (_p% (1< RnValue( F, RValue ); do{ M = (size_t) random() % TMAXM; K = (size_t) random() % TMAXK; N = (size_t) random() % TMAXN; 
Wino = random() % WINOMAX; } while (!( (K>>Wino > 0) && (M>>Wino > 0) && (N>>Wino > 0) )); if (random()%2){ ta = FFLAS::FflasTrans; lda = M; } else{ ta = FFLAS::FflasNoTrans; lda = K; } if (random()%2){ tb = FFLAS::FflasTrans; ldb = K; } else{ tb = FFLAS::FflasNoTrans; ldb = N; } A = FFLAS::fflas_new(M*K); B = FFLAS::fflas_new(K*N); C = FFLAS::fflas_new(M*N); Cbis = FFLAS::fflas_new(M*N); Cter = FFLAS::fflas_new(M*N); for( size_t i = 0; i < M*K; ++i ) RValue.random( *(A+i) ); for( size_t i = 0; i < K*N; ++i ) RValue.random( *(B+i) ); for( size_t i = 0; i < M*N; ++i ) *(Cter+i) = *(Cbis+i)= RValue.random( *(C+i) ); RValue.random( alpha ); RValue.random( beta ); cout <<"p = "<<(size_t)p<<" M = "<::value> WH (F,Wino,FFLAS::ParSeqHelper::Sequential()); FFLAS::fgemm (F, ta, tb, M, N, K, alpha, A, lda, B, ldb, beta, C, N, WH); tim.stop(); // for (int j = 0; j < n; ++j ){ // FFLAS::fgemv( F, FFLAS::FflasNoTrans, m, k, alpha, A, k, B+j, n, beta, Cbis+j, n); // for (int i=0; is,f0,{0,g0,(0,\:0,t0,+0,=s //-------------------------------------------------------------------------- // Sanity check for ftrsm and ftrmm // //-------------------------------------------------------------------------- /* * Copyright (C) 2007 FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include #include #include "fflas-ffpack/field/modular-balanced.h" //#include "fflas-ffpack/field/modular-int.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/fflas/fflas.h" #include "givaro/givintprime.h" using namespace std; using namespace FFPACK; //typedef Givaro::Modular Field; //typedef Givaro::Modular Field; typedef ModularBalanced Field; int main(int argc, char** argv){ FFLAS::Timer tim; Givaro::IntPrimeDom IPD; uint64_t p; size_t M, N, K ; bool keepon = true; Givaro::Integer _p,tmp; Field::Element zero,one; cerr< 1 ) TMAX = atoi(argv[1]); if (argc > 2 ) PRIMESIZE = atoi(argv[2]); FFLAS::FFLAS_TRANSPOSE trans; FFLAS::FFLAS_SIDE side; FFLAS::FFLAS_UPLO uplo; FFLAS::FFLAS_DIAG diag; size_t lda, ldb; Field::Element * A, *Abis, *B,* Bbis; Field::Element alpha; while (keepon){ srandom(_p); do{ // max = Integer::random(2); _p = random();//max % (2<<30); IPD.prevprime( tmp, (_p% (1<(K*K); B = FFLAS::fflas_new(M*N); Abis = FFLAS::fflas_new(K*K); Bbis = FFLAS::fflas_new(M*N); for (size_t i = 0; i < M; ++i) for (size_t j = 0; j < N; ++j){ RValue.random (*(B + i*N + j)); *(Bbis + i*N + j) = *(B + i*N + j); } for (size_t i = 0; i < K; ++i) for (size_t j = 0; j < K; ++j) *(Abis + i*K + j) = RValue.random (*(A + i*K + j)); for (size_t i = 0; i < K; ++i){ while (F.isZero(RValue.random (*(A + i*(K+1))))); *(Abis + i*(K +1)) = *(A + i*(K+1)); } cout <<"p = "<<(size_t)p <<" M = "<s,f0,{0,g0,(0,\:0,t0,+0,=s //-------------------------------------------------------------------------- // Test for the lqup decomposition // //-------------------------------------------------------------------------- // Clement Pernet //------------------------------------------------------------------------- /* * 
Copyright (C) FFLAS-FFPACK * Written by Clément Pernet * This file is Free Software and part of FFLAS-FFPACK. * * ========LICENCE======== * This file is part of the library FFLAS-FFPACK. * * FFLAS-FFPACK is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * ========LICENCE======== *. */ #include #include using namespace std; //#include "fflas-ffpack/field/modular-int.h" //#include "fflas-ffpack/field/modular-positive.h" #include "fflas-ffpack/field/modular-balanced.h" #include "fflas-ffpack/utils/timer.h" #include "Matio.h" #include "fflas-ffpack/ffpack/ffpack.h" #include "givaro/givintprime.h" using namespace FFPACK; //typedef Givaro::Modular Field; typedef ModularBalanced Field; //typedef Givaro::Modular Field; //typedef ModularBalanced Field; //typedef Givaro::Modular Field; //typedef GivaroZpz Field; //typedef GivaroGfq Field; int main(int argc, char** argv){ FFLAS::Timer tim; Givaro::IntPrimeDom IPD; uint64_t p; size_t M, N ; bool keepon = true; Givaro::Integer _p,tmp; Field::Element zero,one; cerr< 1 ) TMAX = atoi(argv[1]); if (argc > 2 ) PRIMESIZE = atoi(argv[2]); FFLAS::FFLAS_TRANSPOSE ta; FFLAS::FFLAS_DIAG diag; size_t lda; Field::Element * A, *Abis, *X,* U, *L; size_t *P, *Q; while (keepon){ srandom(_p); do{ // max = Integer::random(2); _p = random();//max % (2<<30); IPD.prevprime( tmp, (_p% (1<(M*N); U = 
FFLAS::fflas_new(N*N); P = FFLAS::fflas_new(M); Q = FFLAS::fflas_new(N); for (size_t i=0; i(M*M); U = FFLAS::fflas_new(M*N); P = FFLAS::fflas_new(N); Q = FFLAS::fflas_new(M); for (size_t i=0; i(M*M); Field::Element * H = FFLAS::fflas_new(M*N); size_t t; do{ t = (size_t) random() % 10; } while ((!t)||(t==1)); for (size_t i=0; i