pax_global_header00006660000000000000000000000064131733504260014516gustar00rootroot0000000000000052 comment=086df7e5709f5c09c79647abced914388c80de2b criu-3.6/000077500000000000000000000000001317335042600123305ustar00rootroot00000000000000criu-3.6/.gitignore000066400000000000000000000012451317335042600143220ustar00rootroot00000000000000.config *.o *.d *.a *.img *.bin *.elf *.out *.swp *.swo *.so .git-ignore *.patch *.pyc cscope* tags TAGS Makefile.local compel/compel compel/compel-host-bin images/*.c images/*.h images/google/protobuf/*.c images/google/protobuf/*.h .gitid criu/criu criu/arch/*/sys-exec-tbl*.c # x86 syscalls-table is not generated !criu/arch/x86/sys-exec-tbl.c criu/arch/*/syscalls*.S criu/include/config.h criu/include/syscall-codes*.h criu/include/syscall*.h soccr/config.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h criu/protobuf-desc-gen.h lib/build/ lib/c/criu.pc scripts/build/qemu-user-static/* lib/.crit-setup.files compel/include/asm include/common/asm criu-3.6/.mailmap000066400000000000000000000005651317335042600137570ustar00rootroot00000000000000Stanislav Kinsbursky Pavel Emelyanov Andrey Vagin Andrey Vagin Andrey Vagin Andrew Vagin Cyrill Gorcunov criu-3.6/.travis.yml000066400000000000000000000014571317335042600144500ustar00rootroot00000000000000language: c sudo: required dist: trusty cache: ccache services: - docker env: - TR_ARCH=local GCOV=1 - TR_ARCH=local CLANG=1 - TR_ARCH=alpine - TR_ARCH=fedora-asan - TR_ARCH=x86_64 - TR_ARCH=x86_64 CLANG=1 - TR_ARCH=armv7hf - TR_ARCH=aarch64 - TR_ARCH=ppc64le - TR_ARCH=s390x - TR_ARCH=armv7hf CLANG=1 - TR_ARCH=aarch64 CLANG=1 - TR_ARCH=ppc64le CLANG=1 - TR_ARCH=alpine CLANG=1 - TR_ARCH=docker-test - TR_ARCH=fedora-rawhide - TR_ARCH=fedora-rawhide-aarch64 matrix: allow_failures: - env: TR_ARCH=docker-test - env: TR_ARCH=fedora-rawhide - env: TR_ARCH=fedora-rawhide-aarch64 script: - sudo make CCACHE=1 -C scripts/travis $TR_ARCH after_success: - ccache -s - make -C 
scripts/travis after_success group: deprecated-2017Q2 criu-3.6/COPYING000066400000000000000000001305421317335042600133700ustar00rootroot00000000000000This software is licensed under the GNU GENERAL PUBLIC LICENCE Version 2. Except that any software in the lib/ directory is for the creation of a linkable library to the tools and is licensed under the GNU LESSER GENERAL PUBLIC LICENCE Version 2.1. Contributing Authors agree that their code is submitted under the licence appropriate for its location within the source tree (GPL except for LGPL in lib/) and agree that any future patches, provided they are accepted into the project, may change the licence of their code from GPL to LGPL by moving pieces of it into lib/ or LGPL to GPL by moving pieces of it out of lib/ Note that the only valid version of the GPL is THIS particular version of the license (ie v2, not v2.2 or v3.x or whatever), unless explicitly otherwise stated. ---------------------------------------- GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. 
This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. 
b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. --------------------------------------- GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. 
By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. 
To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. 
For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. 
The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. 
Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. 
A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. 
As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. 
c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. 
b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. 
The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. , 1 April 1990 Ty Coon, President of Vice That's all there is to it! criu-3.6/CREDITS000066400000000000000000000004701317335042600133510ustar00rootroot00000000000000The following people provided invaluable help to CRIU project (in alphabetical order) ------------------------------------------------------------------- Andrew Morton David Miller Eric Dumazet Eric W. Biederman H. Peter Anvin Kees Cook KOSAKI Motohiro Li Yu Linus Torvalds Oleg Nesterov Serge Hallyn Tejun Heo criu-3.6/Documentation/000077500000000000000000000000001317335042600151415ustar00rootroot00000000000000criu-3.6/Documentation/.gitattributes000066400000000000000000000000211317335042600200250ustar00rootroot00000000000000*.txt whitespace criu-3.6/Documentation/.gitignore000066400000000000000000000000531317335042600171270ustar00rootroot00000000000000*.xml *.html *.[1-8] *.pdf *.ps footer.txt criu-3.6/Documentation/HOWTO.cross-compile000066400000000000000000000026461317335042600205520ustar00rootroot00000000000000This HOWTO explains how to cross-compile CRIU on x86 1. Download the protobuf sources. 2. Apply the patch http://16918.selcdn.ru/crtools/aarch64/0001-protobuf-added-the-support-for-the-acrchitecture-AAr.patch 3. Configure protobuf to be compiled for the target architecture: ./configure --prefix=$X86_PREFIX --disable-shared --enable-static 4. Compile protobuf. 5. Download protobuf-c sources. 6. 
Configure protobuf-c for the architecture x86: export PATH=$PATH:$X86_PREFIX/bin export PKG_CONFIG_PATH=$X86_PREFIX/lib/pkgconfig CPPFLAGS=`pkg-config --cflags protobuf` LDFLAGS=`pkg-config --libs protobuf` ./configure --prefix=$X86_PREFIX --disable-shared --enable-static 7. Compile and install protobuf-c. 8. Configure protobuf to be compiled for the target architecture: ./configure --prefix=$ARCH_PREFIX --disable-shared --enable-static --with-protoc=protoc --host=$TARGET 9. Compile and install protobuf. 10. Let PKG_CONFIG_PATH=$ARCH_PREFIX/lib/pkgconfig. 11. Configure protobuf-c to be compiled for the target architecture: CPPFLAGS=`pkg-config --cflags protobuf` LDFLAGS=`pkg-config --libs protobuf` ./configure --prefix=$ARCH_PREFIX --disable-shared --enable-static --disable-protoc --host=$TARGET 12. Compile and install protobuf-c. 13. Compile CRIU: ARCH= CROSS_COMPILE=$TARGET- CFLAGS=`pkg-config --cflags libprotobuf-c` LDFLAGS="`pkg-config --libs libprotobuf-c`" make criu-3.6/Documentation/Makefile000066400000000000000000000043611317335042600166050ustar00rootroot00000000000000__nmk_dir ?= ../scripts/nmk/scripts/ include $(__nmk_dir)include.mk include $(__nmk_dir)macro.mk ASCIIDOC := asciidoc A2X := a2x XMLTO := xmlto FOOTER := footer.txt SRC1 += crit.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) MAN1S := $(patsubst %.txt,%.1,$(SRC1)) MAN8S := $(patsubst %.txt,%.8,$(SRC8)) MANS := $(MAN1S) $(MAN8S) MAN1DIR := $(MANDIR)/man1 MAN8DIR := $(MANDIR)/man8 GROFF :=groff PAPER :=$(shell paperconf 2>/dev/null || echo letter) GROFF_OPTS := -Tps -t -dpaper=$(PAPER) -P-p$(PAPER) -man -msafer -rC1 -rD1 -rS11 PSS := $(patsubst %,%.ps,$(basename $(MANS))) PDFS := $(patsubst %,%.pdf,$(basename $(MANS))) all: check $(MANS) ps: $(PSS) pdf: $(PDFS) .PHONY: all ps pdf check check: $(Q) for B in $(ASCIIDOC) $(A2X) $(XMLTO); do \ $$B --version > /dev/null || exit 1; \ done ifeq ($(CRIU_VERSION),) include ../Makefile.versions endif $(FOOTER): 
../Makefile.versions $(call msg-gen, $@) $(Q) echo ":doctype: manpage" > $@ $(Q) echo ":man source: criu" >> $@ $(Q) echo ":man version: $(CRIU_VERSION)" >> $@ $(Q) echo ":man manual: CRIU Manual" >> $@ %.1: %.txt $(FOOTER) custom.xsl $(call msg-gen, $@) $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.1,%.xml,$@) $< $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.1,%.xml,$@) 2>/dev/null %.8: %.txt $(FOOTER) custom.xsl $(call msg-gen, $@) $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.8,%.xml,$@) 2>/dev/null %.ps: %.1 $(call msg-gen, $@) $(Q) $(GROFF) $(GROFF_OPTS) $^ > $@ %.ps: %.8 $(call msg-gen, $@) $(Q) $(GROFF) $(GROFF_OPTS) $^ > $@ %.pdf: %.ps $(call msg-gen, $@) $(Q) ps2pdf $< $@ clean: $(call msg-clean, "Documentation") $(Q) rm -f $(XMLS) $(MANS) $(PSS) $(PDFS) $(FOOTER) install: $(MANS) $(E) " INSTALL " $(MAN8S) $(Q) mkdir -p $(DESTDIR)$(MAN8DIR) $(Q) install -m 644 $(MAN8S) $(DESTDIR)$(MAN8DIR) $(E) " INSTALL " $(MAN1S) $(Q) mkdir -p $(DESTDIR)$(MAN1DIR) $(Q) install -m 644 $(MAN1S) $(DESTDIR)$(MAN1DIR) uninstall: $(E) " UNINSTALL" $(MAN1S) $(Q) $(RM) $(addprefix $(DESTDIR)$(MAN1DIR)/,$(MAN1S)) $(E) " UNINSTALL" $(MAN8S) $(Q) $(RM) $(addprefix $(DESTDIR)$(MAN8DIR)/,$(MAN8S)) .PHONY: clean install uninstall criu-3.6/Documentation/asciidoc.conf000066400000000000000000000000011317335042600175550ustar00rootroot00000000000000 criu-3.6/Documentation/crit.txt000066400000000000000000000014601317335042600166440ustar00rootroot00000000000000CRIT(1) ======= include::footer.txt[] NAME ---- crit - CRiu Image Tool SYNOPSIS -------- *crit* 'decode' [-h] [-i IN] [-o OUT] [--pretty] *crit* 'encode' [-h] [-i IN] [-o OUT] *crit* 'info' [-h] in *crit* 'x' [-h] dir {ps,fds,mems} *crit* 'show' [-h] in DESCRIPTION ----------- *crit* is a feature-rich replacement for existing *criu* show. 
ARGUMENTS --------- Positional Arguments ~~~~~~~~~~~~~~~~~~~~ *decode*:: convert *criu* image from binary type to JSON *encode*:: convert *criu* image from JSON type to binary *info*:: show info about image *x*:: explore image directory *show*:: convert *criu* image from binary to human-readable JSON Optional Arguments ~~~~~~~~~~~~~~~~~~ *-h*, *--help*:: Print some help and exit SEE ALSO -------- criu(8) AUTHOR ------ The CRIU team criu-3.6/Documentation/criu.txt000066400000000000000000000545261317335042600166600ustar00rootroot00000000000000CRIU(8) ======= include::footer.txt[] NAME ---- criu - checkpoint/restore in userspace SYNOPSIS -------- *criu* 'command' ['option' ...] DESCRIPTION ----------- *criu* is a tool for checkpointing and restoring running applications. It does this by saving their state as a collection of files (see the *dump* command) and creating equivalent processes from those files (see the *restore* command). The restore operation can be performed at a later time, on a different system, or both. OPTIONS ------- Most of the true / false long options (the ones without arguments) can be prefixed with *--no-* to negate the option (example: *--display-stats* and *--no-display-stats*). Common options ~~~~~~~~~~~~~~ Common options are applicable to any 'command'. *-v*[*v*...], *--verbosity*:: Increase verbosity up from the default level. Multiple *v* can be used, each increasing verbosity by one level. Using long option without argument increases verbosity by one level. *-v*'num', *--verbosity*='num':: Set verbosity level to 'num'. The higher the level, the more output is produced. + The following levels are available: * *-v0* no output; * *-v1* only errors; * *-v2* above plus warnings (this is the default level); * *-v3* above plus information messages and timestamps; * *-v4* above plus lots of debug. *--pidfile* 'file':: Write root task, service or page-server pid into a 'file'. *-o*, *--log-file* 'file':: Write logging messages to 'file'.
*--log-pid*:: Write separate logging files for each pid. *--display-stats*:: During dump as well as during restore *criu* collects information like the time required to dump or restore the process or the number of pages dumped or restored. This information is always written to the files 'stats-dump' and 'stats-restore' and can be easily displayed using *crit*. The option *--display-stats* additionally prints out this information on the console at the end of a dump or a restore. *-D*, *--images-dir* 'path':: Use 'path' as a base directory where to look for sets of image files. *--prev-images-dir* 'path':: Use 'path' as a parent directory where to look for sets of image files. This option makes sense in case of incremental dumps. *-W*, *--work-dir* 'dir':: Use directory 'dir' for putting logs, pidfiles and statistics. If not specified, 'path' from *-D* option is taken. *--close* 'fd':: Close file descriptor 'fd' before performing any actions. *-L*, *--libdir* 'path':: Path to plugins directory. *--action-script* 'script':: Add an external action script to be executed at certain stages. The environment variable *CRTOOLS_SCRIPT_ACTION* is available to the script to find out which action is being executed, and its value can be one of the following: *pre-dump*::: run prior to beginning a *dump* *post-dump*::: run upon *dump* completion *pre-restore*::: run prior to beginning a *restore* *pre-resume*::: run when all processes and resources are restored but tasks are stopped waiting for final kick to run. Must not fail. *post-restore*::: run upon *restore* completion *network-lock*::: run to lock network in a target network namespace *network-unlock*::: run to unlock network in a target network namespace *setup-namespaces*::: run once the root task has just been created with required namespaces. Note it is an early stage of restore, when nothing is restored yet except for namespaces themselves *-V*, *--version*:: Print program version and exit.
*-h*, *--help*:: Print some help and exit. *pre-dump* ~~~~~~~~~~ Performs the pre-dump procedure, during which *criu* creates a snapshot of memory changes since the previous *pre-dump*. Note that during this *criu* also creates the fsnotify cache which speeds up the *restore* procedure. *pre-dump* requires at least *-t* option (see *dump* below). In addition, *page-server* options may be specified. *--track-mem*:: Turn on memory changes tracker in the kernel. If the option is not passed the memory tracker gets turned on implicitly. *dump* ~~~~~~ Performs a checkpoint procedure. *-t*, *--tree* 'pid':: Checkpoint the whole process tree starting from 'pid'. *-R*, *--leave-running*:: Leave tasks in running state after checkpoint, instead of killing. This option is pretty dangerous and should be used only if you understand what you are doing. + Note if task is about to run after being checkpointed, it can modify TCP connections, delete files and do other dangerous actions. Therefore, *criu* can not guarantee that the next *restore* action will succeed. Most likely if this option is used, at least the file system snapshot must be made with the help of *post-dump* action script. + In other words, do not use it unless really needed. *-s*, *--leave-stopped*:: Leave tasks in stopped state after checkpoint, instead of killing. *--external* 'type'*[*'id'*]:*'value':: Dump an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). The following resource types are currently supported: *mnt*, *dev*, *file*, *tty*, *unix*. Syntax depends on type. Note to restore external resources, either *--external* or *--inherit-fd* is used, depending on resource type. *--external mnt[*'mountpoint'*]:*'name':: Dump an external bind mount referenced by 'mountpoint', saving it to image under the identifier 'name'.
*--external mnt[]:*'flags':: Dump all external bind mounts, autodetecting those. Optional 'flags' can contain *m* to also dump external master mounts, *s* to also dump external shared mounts (default behavior is to abort dumping if such mounts are found). If 'flags' are not provided, colon is optional. *--external dev[*'major'*/*'minor'*]:*'name':: Allow to dump a mount namespace having a real block device mounted. A block device is identified by its 'major' and 'minor' numbers, and *criu* saves its information to image under the identifier 'name'. *--external file[*'mnt_id'*:*'inode'*]*:: Dump an external file, i.e. an opened file that can not be resolved from the current mount namespace, which can not be dumped without using this option. The file is identified by 'mnt_id' (a field obtained from */proc/*'pid'*/fdinfo/*'N') and 'inode' (as returned by *stat*(2)). *--external tty[*'rdev'*:*'dev'*]*:: Dump an external TTY, identified by *st_rdev* and *st_dev* fields returned by *stat*(2). *--external unix[*'id'*]*:: Tell *criu* that one end of a pair of UNIX sockets (created by *socketpair*(2)) with 'id' is OK to be disconnected. *--freeze-cgroup*:: Use cgroup freezer to collect processes. *--manage-cgroups*:: Collect cgroups into the image so that they will be restored later. Without this option, *criu* will not save cgroups configuration associated with a task. *--cgroup-props* 'spec':: Specify controllers and their properties to be saved into the image file. *criu* predefines specifications for common controllers, but since the kernel can add new controllers and modify their properties, there should be a way to specify ones matching the kernel.
+ 'spec' argument describes the controller and properties specification in a simplified YAML form: + ---------- "c1": - "strategy": "merge" - "properties": ["a", "b"] "c2": - "strategy": "replace" - "properties": ["c", "d"] ---------- + where 'c1' and 'c2' are controllers names, and 'a', 'b', 'c', 'd' are their properties. + Note the format: double quotes, spaces and new lines are required. The 'strategy' specifies what to do if a controller specified already exists as a built-in one: *criu* can either *merge* or *replace* such. + For example, the command line for the above example should look like this: + ---------- --cgroup-props "\"c1\":\n - \"strategy\": \"merge\"\n - \"properties\": [\"a\", \"b\"]\n \"c2\":\n - \"strategy\": \"replace\"\n - \"properties\": [\"c\", \"d\"]" ---------- *--cgroup-props-file* 'file':: Same as *--cgroup-props*, except the specification is read from the 'file'. *--cgroup-dump-controller* 'name':: Dump a controller with 'name' only, skipping anything else that was discovered automatically (usually via */proc*). This option is useful when one needs *criu* to skip some controllers. *--cgroup-props-ignore-default*:: When combined with *--cgroup-props*, makes *criu* substitute a predefined controller property with the new one shipped. If the option is not used, the predefined properties are merged with the provided ones. *--tcp-established*:: Checkpoint established TCP connections. *--skip-in-flight*:: This option skips in-flight TCP connections. If any TCP connections that are not yet completely established are found, *criu* ignores these connections, rather than errors out. The TCP stack on the client side is expected to handle the re-connect gracefully. *--tcp-close*:: Restore connected TCP sockets in closed state. *--evasive-devices*:: Use any path to a device file if the original one is inaccessible. *--page-server*:: Send pages to a page server (see the *page-server* command). 
*--force-irmap*:: Force resolving names for inotify and fsnotify watches. *--auto-dedup*:: Deduplicate "old" data in pages images of previous *dump*. This option implies incremental *dump* mode (see the *pre-dump* command). *-l*, *--file-locks*:: Dump file locks. It is necessary to make sure that all file lock users are taken into dump, so it is only safe to use this for enclosed containers where locks are not held by any processes outside of dumped process tree. *--link-remap*:: Allows to link unlinked files back, if possible (modifies filesystem during *restore*). *--ghost-limit* 'size':: Set the maximum size of deleted file to be carried inside image. By default, up to 1M file is allowed. Using this option allows to not put big deleted files inside images. Argument 'size' may be postfixed with a *K*, *M* or *G*, which stands for kilo-, mega-, and gigabytes, accordingly. *-j*, *--shell-job*:: Allow one to dump shell jobs. This implies the restored task will inherit session and process group ID from the *criu* itself. This option also allows to migrate a single external tty connection, to migrate applications like *top*. If used with *dump* command, it must be specified with *restore* as well. *--cpu-cap* ['cap'[,'cap'...]]:: Specify CPU capabilities to write to an image file. The argument is a comma-separated list of *none*, *fpu*, *cpu*, *ins*, *all*. If the argument is omitted or set to *none*, capabilities will not be written, which is the default behavior. *--cgroup-root* ['controller':]/'newroot':: Change the root for the controller that will be dumped. By default, *criu* simply dumps everything below where any of the tasks live. However, if a container moves all of its tasks into a cgroup directory below the container engine's default directory for tasks, permissions will not be preserved on the upper directories with no tasks in them, which may cause problems.
*--lazy-pages*:: Perform the dump procedure without writing memory pages into the image files and prepare to service page requests over the network. When *dump* runs in this mode it presumes that *lazy-pages* daemon will connect to it and fetch memory pages to lazily inject them into the restored process address space. This option is intended for post-copy (lazy) migration and should be used in conjunction with *restore* with appropriate options. *restore* ~~~~~~~~~ Restores previously checkpointed processes. *--inherit-fd* *fd[*'N'*]:*'resource':: Inherit a file descriptor. This option lets *criu* use an already opened file descriptor 'N' for restoring a file identified by 'resource'. This option can be used to restore an external resource dumped with the help of *--external* *file*, *tty*, and *unix* options. + The 'resource' argument can be one of the following: + - *tty[*'rdev'*:*'dev'*]* - *pipe[*'inode'*]* - *socket[*'inode'*]* - *file[*'mnt_id'*:*'inode'*]* - 'path/to/file' + Note that square brackets used in this option arguments are literals and usually need to be escaped from shell. *-d*, *--restore-detached*:: Detach *criu* itself once restore is complete. *-s*, *--leave-stopped*:: Leave tasks in stopped state after restore (rather than resuming their execution). *-S*, *--restore-sibling*:: Restore root task as a sibling (makes sense only with *--restore-detached*). *-r*, *--root* 'path':: Change the root filesystem to 'path' (when run in a mount namespace). *--external* 'type'*[*'id'*]:*'value':: Restore an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). The following resource types are currently supported: *mnt*, *dev*, *veth*, *macvlan*. Syntax depends on type.
Note that to restore external resources dealing with opened file descriptors (such as dumped with the help of *--external* *file*, *tty*, and *unix* options), option *--inherit-fd* should be used. *--external mnt[*'name'*]:*'mountpoint':: Restore an external bind mount referenced in the image by 'name', bind-mounting it from the host 'mountpoint' to a proper mount point. *--external mnt[]*:: Restore all external bind mounts (dumped with the help of *--external mnt[]* auto-detection). *--external dev[*'name'*]:*'/dev/path':: Restore an external mount device, identified in the image by 'name', using the existing block device '/dev/path'. *--external veth[*'inner_dev'*]:*'outer_dev'*@*'bridge':: Set the outer VETH device name (corresponding to 'inner_dev' being restored) to 'outer_dev'. If optional *@*'bridge' is specified, 'outer_dev' is added to that bridge. If the option is not used, 'outer_dev' will be autogenerated by the kernel. *--external macvlan[*'inner_dev'*]:*'outer_dev':: When restoring an image that has a MacVLAN device in it, this option must be used to specify to which 'outer_dev' (an existing network device in CRIU namespace) the restored 'inner_dev' should be bound to. *--manage-cgroups* ['mode']:: Restore cgroups configuration associated with a task from the image. Controllers are always restored in an optimistic way -- if already present in system, *criu* reuses it, otherwise it will be created. The 'mode' may be one of the following: *none*::: Do not restore cgroup properties but require cgroup to pre-exist at the moment of *restore* procedure. *props*::: Restore cgroup properties and require cgroup to pre-exist. *soft*::: Restore cgroup properties only if the cgroup has been created by *criu*, otherwise do not restore properties. This is the default if mode is unspecified. *full*::: Always restore all cgroups and their properties. *strict*::: Restore all cgroups and their properties from scratch, requiring them to not be present in the system.
*--cgroup-root* ['controller'*:*]/'newroot':: Change the root cgroup the controller will be installed into. No controller means that root is the default for all controllers not specified. *--tcp-established*:: Restore previously dumped established TCP connections. This implies that the network has been locked between *dump* and *restore* phases so the other side of a connection simply notices a kind of lag. *--veth-pair* 'IN'*=*'OUT':: Correspondence between outside and inside names of veth devices. *-l*, *--file-locks*:: Restore file locks from the image. *--auto-dedup*:: As soon as a page is restored it gets punched out from the image. *-j*, *--shell-job*:: Restore shell jobs, in other words inherit session and process group ID from the criu itself. *--cpu-cap* ['cap'[,'cap'...]]:: Specify CPU capabilities to be present on the CPU the process is restoring. To invert a capability, prefix it with *^*. This option implies that *--cpu-cap* has been passed on *dump* as well, except *fpu* option case. The 'cap' argument can be the following (or a set of comma-separated values): *all*::: Require all capabilities. This is *default* mode if *--cpu-cap* is passed without arguments. Most safe mode. *cpu*::: Require the CPU to have all capabilities in image to match runtime CPU. *fpu*::: Require the CPU to have compatible FPU. For example the process might be dumped with xsave capability but attempted to restore without it present on target CPU. In such case we refuse to proceed. This is *default* mode if *--cpu-cap* is not present in command line. Note this argument might be passed even if on the *dump* no *--cpu-cap* has been specified because FPU frames are always encoded into images. *ins*::: Require CPU compatibility on instructions level. *none*::: Ignore capabilities. Most dangerous mode. The behaviour is implementation dependent. Try to not use it unless really required.
+ For example, this option can be used in case *--cpu-cap=cpu* was used during *dump*, and images are migrated to a less capable CPU and are to be restored. By default, *criu* shows an error that CPU capabilities are not adequate, but this can be suppressed by using *--cpu-cap=none*. *--weak-sysctls*:: Silently skip restoring sysctls that are not available. This allows to restore on an older kernel, or a kernel configured without some options. *--lazy-pages*:: Restore the processes without filling out the entire memory contents. When this option is used, *restore* sets up the infrastructure required to fill memory pages either on demand when the process accesses them or in the background without stopping the restored process. This option requires running *lazy-pages* daemon. *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to dump and restore a process tree. There are three categories of kernel support, as described below. *criu check* always checks Category 1 features unless *--feature* is specified which only checks a specified feature. *Category 1*::: Absolutely required. These are features like support for */proc/PID/map_files*, *NETLINK_SOCK_DIAG* socket monitoring, */proc/sys/kernel/ns_last_pid* etc. *Category 2*::: Required only for specific cases. These are features like AIO remap, */dev/net/tun* and others that are only required if a process being dumped or restored is using those. *Category 3*::: Experimental. These are features like *task-diag* that are used for experimental purposes (mostly during development). If there are no errors or warnings, *criu* prints "Looks good." and its exit code is 0. A missing Category 1 feature causes *criu* to print "Does not look good." and its exit code is non-zero. Missing Category 2 and 3 features cause *criu* to print "Looks good but ..." and its exit code is non-zero. Without any options, *criu check* checks Category 1 features.
This behavior can be changed by using the following options: *--extra*:: Check kernel support for Category 2 features. *--experimental*:: Check kernel support for Category 3 features. *--all*:: Check kernel support for Category 1, 2, and 3 features. *--feature* 'name':: Check a specific feature. If 'name' is *list*, a list of valid kernel feature names that can be checked will be printed. *page-server* ~~~~~~~~~~~~~ Launches *criu* in page server mode. *--daemon*:: Runs page server as a daemon (background process). *--status-fd*:: Write \\0 to the FD and close it once page-server is ready to handle requests. The status-fd allows to not daemonize a process and get its exit code at the end. The --daemon and --status-fd options are not supposed to be used together. *--address* 'address':: Page server IP address. *--port* 'number':: Page server port number. *--lazy-pages*:: Serve local memory dump to a remote *lazy-pages* daemon. In this mode the *page-server* reads local memory dump and allows the remote *lazy-pages* daemon to request memory pages in random order. *lazy-pages* ~~~~~~~~~~~~ Launches *criu* in lazy-pages daemon mode. The *lazy-pages* daemon is responsible for managing user-level demand paging for the restored processes. It gets information required to fill the process memory pages from the *restore* and from the checkpoint directory. When a restored process accesses a certain memory page for the first time, the *lazy-pages* daemon injects its contents into the process address space. The memory pages that are not yet requested by the restored processes are injected in the background. *exec* ~~~~~~ Executes a system call inside a destination task\'s context. This functionality is deprecated; please use *Compel* instead. *service* ~~~~~~~~~ Launches *criu* in RPC daemon mode, where *criu* is listening for RPC commands over socket to perform. This is convenient for a case where daemon itself is running in a privileged (superuser) mode but clients are not.
dedup ~~~~~ Starts pagemap data deduplication procedure, where *criu* scans over all pagemap files and tries to minimize the number of pagemap entries by obtaining the references from a parent pagemap image. cpuinfo dump ~~~~~~~~~~~~ Fetches current CPU features and write them into an image file. cpuinfo check ~~~~~~~~~~~~~ Fetches current CPU features (i.e. CPU the *criu* is running on) and test if they are compatible with the ones present in an image file. EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into directory *checkpoint*: ---------- criu dump -D checkpoint -t 1234 ---------- To restore this program detaching criu itself: ---------- criu restore -d -D checkpoint ---------- AUTHOR ------ The CRIU team. COPYRIGHT --------- Copyright \(C) 2011-2016, Parallels Holdings, Inc. criu-3.6/Documentation/custom.xsl000066400000000000000000000004241317335042600172030ustar00rootroot00000000000000 1 1 1 criu-3.6/INSTALL.md000066400000000000000000000023231317335042600137600ustar00rootroot00000000000000## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing make install this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); * **PREFIX**, to specify additional prefix for path of every component installed (`/usr/local` by default); * **BINDIR**, to specify where to put CRIT tool (`$(PREFIX)/bin` by default); * **SBINDIR**, to specify where to put CRIU executable (`$(PREFIX)/sbin` by default); * **MANDIR**, to specify directory for manual pages (`$(PREFIX)/share/man` by default); * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type make DESTDIR=/some/new/place install and get everything installed under `/some/new/place`. 
## Uninstalling CRIU To clean up previously installed CRIU instance one can type make uninstall and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. criu-3.6/Makefile000066400000000000000000000225641317335042600140010ustar00rootroot00000000000000__nmk_dir=$(CURDIR)/scripts/nmk/scripts/ export __nmk_dir # # No need to try to remake our Makefiles Makefile: ; Makefile.%: ; scripts/%.mak: ; $(__nmk_dir)%.mk: ; # # Import the build engine include $(__nmk_dir)include.mk include $(__nmk_dir)macro.mk ifeq ($(origin HOSTCFLAGS), undefined) HOSTCFLAGS := $(CFLAGS) $(USERCFLAGS) endif UNAME-M := $(shell uname -m) # # Supported Architectures ifneq ($(filter-out x86 arm aarch64 ppc64 s390,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif # The PowerPC 64 bits architecture could be big or little endian. # They are handled in the same way. ifeq ($(UNAME-M),ppc64) error := $(error ppc64 big endian is not yet supported) endif # # Architecture specific options. ifeq ($(ARCH),arm) ARMV := $(shell echo $(UNAME-M) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') DEFINES := -DCONFIG_ARMV$(ARMV) ifeq ($(ARMV),6) USERCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) USERCFLAGS += -march=armv7-a endif PROTOUFIX := y endif ifeq ($(ARCH),aarch64) VDSO := y DEFINES := -DCONFIG_AARCH64 endif ifeq ($(ARCH),ppc64) LDARCH := powerpc:common64 VDSO := y DEFINES := -DCONFIG_PPC64 -D__SANE_USERSPACE_TYPES__ endif ifeq ($(ARCH),x86) LDARCH := i386:x86-64 VDSO := y DEFINES := -DCONFIG_X86_64 endif # # CFLAGS_PIE: # # Ensure with -fno-optimize-sibling-calls that we don't create GOT # (Global Offset Table) relocations with gcc compilers that don't have # commit "S/390: Fix 64 bit sibcall". 
ifeq ($(ARCH),s390)
        ARCH            := s390
        SRCARCH         := s390
        VDSO            := y
        DEFINES         := -DCONFIG_S390
        CFLAGS_PIE      := -fno-optimize-sibling-calls
endif
export CFLAGS_PIE

# LDARCH defaults to SRCARCH unless an architecture block set it explicitly.
LDARCH ?= $(SRCARCH)
export LDARCH VDSO
export PROTOUFIX DEFINES

#
# Independent options for all tools.
DEFINES                 += -D_FILE_OFFSET_BITS=64
DEFINES                 += -D_GNU_SOURCE

WARNINGS                := -Wall -Wformat-security

CFLAGS-GCOV             := --coverage -fno-exceptions -fno-inline
export CFLAGS-GCOV

# GCOV=<anything non-empty> enables coverage instrumentation.
ifneq ($(GCOV),)
        LDFLAGS         += -lgcov
        CFLAGS          += $(CFLAGS-GCOV)
endif

# ASAN=1 builds with the address sanitizer.
ifeq ($(ASAN),1)
        CFLAGS-ASAN     := -fsanitize=address
        export CFLAGS-ASAN
        CFLAGS          += $(CFLAGS-ASAN)
endif

# Warnings are fatal unless WERROR=0 is passed.
ifneq ($(WERROR),0)
        WARNINGS        += -Werror
endif

# DEBUG=1 selects an unoptimized build with CR_DEBUG defined.
ifeq ($(DEBUG),1)
        DEFINES         += -DCR_DEBUG
        CFLAGS          += -O0 -ggdb3
else
        CFLAGS          += -O2 -g
endif

# GMON=1 adds -pg to both compile and link options.
ifeq ($(GMON),1)
        CFLAGS          += -pg
        GMONLDOPT       += -pg
export GMON GMONLDOPT
endif

CFLAGS                  += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/
HOSTCFLAGS              += $(WARNINGS) $(DEFINES) -iquote include/
# The variable is USERCFLAGS; the former "USERCLFAGS" spelling exported
# a name nothing else in this Makefile sets or reads.
export CFLAGS USERCFLAGS HOSTCFLAGS

# Default target
all: criu lib
.PHONY: all

#
# Version headers.
include Makefile.versions

VERSION_HEADER		:= criu/include/version.h
GITID_FILE		:= .gitid
# GITID is the output of `git describe --always` when a .git directory
# exists in the current directory, and empty otherwise.
GITID		:= $(shell if [ -d ".git" ]; then git describe --always; fi)

# No git id available: fall back to 0. Otherwise compare the id stored
# in $(GITID_FILE) with the current one; unless they match, the file is
# declared phony so that its rule below is run again.
ifeq ($(GITID),)
        GITID := 0
else
        GITID_FILE_VALUE := $(shell if [ -f '$(GITID_FILE)' ]; then if [ `cat '$(GITID_FILE)'` = $(GITID) ]; then echo y; fi; fi)
        ifneq ($(GITID_FILE_VALUE),y)
                .PHONY: $(GITID_FILE)
        endif
endif

# Store the current GITID in $(GITID_FILE).
$(GITID_FILE):
	$(call msg-gen, $@)
	$(Q) echo "$(GITID)" > $(GITID_FILE)

# Generate the version header from the Makefile.versions values and GITID.
# The SUBLEVEL and EXTRA defines are emitted only when those values are
# non-empty.
$(VERSION_HEADER): Makefile.versions $(GITID_FILE)
	$(call msg-gen, $@)
	$(Q) echo "/* Autogenerated, do not edit */" > $@
	$(Q) echo "#ifndef __CR_VERSION_H__" >> $@
	$(Q) echo "#define __CR_VERSION_H__" >> $@
	$(Q) echo "#define CRIU_VERSION \"$(CRIU_VERSION)\"" >> $@
	$(Q) echo "#define CRIU_VERSION_MAJOR " $(CRIU_VERSION_MAJOR) >> $@
	$(Q) echo "#define CRIU_VERSION_MINOR " $(CRIU_VERSION_MINOR) >> $@
ifneq ($(CRIU_VERSION_SUBLEVEL),)
	$(Q) echo "#define CRIU_VERSION_SUBLEVEL " $(CRIU_VERSION_SUBLEVEL) >> $@
endif
ifneq ($(CRIU_VERSION_EXTRA),)
	$(Q) echo "#define CRIU_VERSION_EXTRA " $(CRIU_VERSION_EXTRA) >> $@
endif
	$(Q) echo "#define CRIU_GITID \"$(GITID)\"" >> $@
	$(Q) echo "#endif /* __CR_VERSION_H__ */" >> $@
criu-deps	+= $(VERSION_HEADER)

#
# Setup proper link for asm headers in common code:
# include/common/asm becomes a symlink to ./arch/$(ARCH)/asm.
include/common/asm: include/common/arch/$(ARCH)/asm
	$(call msg-gen, $@)
	$(Q) ln -s ./arch/$(ARCH)/asm $@
criu-deps	+= include/common/asm

#
# Configure variables.
# Makefile.config is included only when none of the tags, etags, cscope,
# clean or mrproper goals was requested on the command line.
export CONFIG_HEADER := criu/include/config.h
ifeq ($(filter tags etags cscope clean mrproper,$(MAKECMDGOALS)),)
include Makefile.config
else
# To clean all files, enable make/build options here
export CONFIG_COMPAT := y
endif

#
# Protobuf images first, they are not depending
# on anything else.
$(eval $(call gen-built-in,images))
criu-deps	+= images/built-in.o

.PHONY: .FORCE

#
# Compel gets used by CRIU, build it earlier
include Makefile.compel

#
# Next the socket CR library
#
SOCCR_A := soccr/libsoccr.a
SOCCR_CONFIG := soccr/config.h

# soccr/config.h is a symlink to the main config header; it is created
# only when no file of that name exists yet.
$(SOCCR_CONFIG): $(CONFIG_HEADER)
	$(Q) test -f $@ || ln -s ../$(CONFIG_HEADER) $@

soccr/Makefile: ;
soccr/%: $(SOCCR_CONFIG) .FORCE
	$(Q) $(MAKE) $(build)=soccr $@
soccr/built-in.o: $(SOCCR_CONFIG) .FORCE
	$(Q) $(MAKE) $(build)=soccr all
$(SOCCR_A): |soccr/built-in.o
criu-deps	+= $(SOCCR_A)

#
# CRIU building done in own directory
# with slightly different rules so we
# can't use nmk engine directly (we
# build syscalls library and such).
#
# But note that we've already included
# the nmk so we can reuse it there.
criu/Makefile: ;
criu/Makefile.packages: ;
criu/Makefile.crtools: ;
criu/%: $(criu-deps) .FORCE
	$(Q) $(MAKE) $(build)=criu $@
criu: $(criu-deps)
	$(Q) $(MAKE) $(build)=criu all
.PHONY: criu

#
# Libraries next once criu is ready
# (we might generate headers and such
# when building criu itself).
lib/Makefile: ;
lib/%: criu .FORCE
	$(Q) $(MAKE) $(build)=lib $@
lib: criu
	$(Q) $(MAKE) $(build)=lib all
.PHONY: lib

# Forward the requested goal (clean or mrproper) to every sub-build.
# Each directory is visited once; lib used to be listed twice.
clean mrproper:
	$(Q) $(MAKE) $(build)=images $@
	$(Q) $(MAKE) $(build)=criu $@
	$(Q) $(MAKE) $(build)=soccr $@
	$(Q) $(MAKE) $(build)=lib $@
	$(Q) $(MAKE) $(build)=compel $@
	$(Q) $(MAKE) $(build)=compel/plugins $@
.PHONY: clean mrproper

# Top-level cleanup done on top of the per-directory clean.
clean-top:
	$(Q) $(MAKE) -C Documentation clean
	$(Q) $(MAKE) $(build)=test/compel clean
	$(Q) $(RM) .gitid
.PHONY: clean-top

clean: clean-top

# Additionally remove generated headers, symlinks and tag databases.
mrproper-top: clean-top
	$(Q) $(RM) $(CONFIG_HEADER)
	$(Q) $(RM) $(SOCCR_CONFIG)
	$(Q) $(RM) $(VERSION_HEADER)
	$(Q) $(RM) $(COMPEL_VERSION_HEADER)
	$(Q) $(RM) include/common/asm
	$(Q) $(RM) compel/include/asm
	$(Q) $(RM) cscope.*
	$(Q) $(RM) tags TAGS
.PHONY: mrproper-top

mrproper: mrproper-top

#
# Non-CRIU stuff.
#

# Build the documentation in Documentation/ (sub-make run with -s).
docs:
	$(Q) $(MAKE) -s -C Documentation all
.PHONY: docs

# Build the zdtm tests after the main build; MAKEFLAGS is cleared for
# the sub-make invocation.
zdtm: all
	$(Q) MAKEFLAGS= $(MAKE) -C test/zdtm all
.PHONY: zdtm

# Run the default goal of the test/ Makefile, again with MAKEFLAGS cleared.
test: zdtm
	$(Q) MAKEFLAGS= $(MAKE) -C test
.PHONY: test

#
# Generating tar requires tag matched CRIU_VERSION.
# If not found then simply use GIT's describe with
# "v" prefix stripped.
head-name := $(shell git tag -l v$(CRIU_VERSION))
ifeq ($(head-name),)
        head-name := $(shell git describe 2>/dev/null)
endif
# If no git tag could describe current commit,
# use pre-defined CRIU_VERSION with GITID (if any).
# NOTE(review): GITID appears to be forced to 0 earlier in this Makefile
# when git gives no id, so the else branch below looks unreachable -- confirm.
ifeq ($(head-name),)
        ifneq ($(GITID),)
                head-name := $(CRIU_VERSION)-$(GITID)
        else
                head-name := $(CRIU_VERSION)
        endif
endif
# tar-name is head-name with a leading "v" removed.
tar-name := $(shell echo $(head-name) | sed -e 's/^v//g')
criu-$(tar-name).tar.bz2:
	git archive --format tar --prefix 'criu-$(tar-name)/' $(head-name) | bzip2 > $@
dist tar: criu-$(tar-name).tar.bz2 ;
.PHONY: dist tar

# find(1) expression shared by tags, etags and cscope: *.h, *.c and *.S
# files, skipping paths under dot-directories and everything under test/
# except test/zdtm/lib/.
TAGS_FILES_REGEXP := . -name '*.[hcS]' ! -path './.*' \( ! -path './test/*' -o -path './test/zdtm/lib/*' \)
tags:
	$(call msg-gen, $@)
	$(Q) $(RM) tags
	$(Q) $(FIND) $(TAGS_FILES_REGEXP) -print | xargs $(CTAGS) -a
.PHONY: tags

etags:
	$(call msg-gen, $@)
	$(Q) $(RM) TAGS
	$(Q) $(FIND) $(TAGS_FILES_REGEXP) -print | xargs $(ETAGS) -a
.PHONY: etags

# Symlinks are excluded from the cscope file list.
cscope:
	$(call msg-gen, $@)
	$(Q) $(FIND) $(TAGS_FILES_REGEXP) ! -type l -print > cscope.files
	$(Q) $(CSCOPE) -bkqu
.PHONY: cscope

# Create gcov/ if it does not exist, run geninfo on criu/ and then
# genhtml inside gcov/ to produce the HTML report in gcov/html/.
gcov:
	$(E) " GCOV"
	$(Q) test -d gcov || mkdir gcov && \
	geninfo --output-filename gcov/criu.info --no-recursion criu/ && \
	cd gcov && \
	genhtml --rc lcov_branch_coverage=1 --output-directory html criu.info
	@echo "Code coverage report is in `pwd`/gcov/html/ directory."
.PHONY: gcov

# Run the x86_64 goal of the Makefile in scripts/build/.
docker-build:
	$(MAKE) -C scripts/build/ x86_64
.PHONY: docker-build

# Run the zdtm test suite inside a privileged criu-x86_64 container,
# excluding the tcp6, tcpbuf6, static/rtc and cgroup tests.
docker-test:
	docker run --rm -it --privileged criu-x86_64 ./test/zdtm.py run -a -x tcp6 -x tcpbuf6 -x static/rtc -x cgroup
.PHONY: docker-test

# Print a summary of the main targets.
help:
	@echo ' Targets:'
	@echo ' all - Build all [*] targets'
	@echo ' * criu - Build criu'
	@echo ' zdtm - Build zdtm test-suite'
	@echo ' docs - Build documentation'
	@echo ' install - Install CRIU (see INSTALL.md)'
	@echo ' uninstall - Uninstall CRIU'
	@echo ' dist - Create a source tarball'
	@echo ' clean - Clean most, but leave enough to navigate'
	@echo ' mrproper - Delete all compiled/generated files'
	@echo ' tags - Generate tags file (ctags)'
	@echo ' etags - Generate TAGS file (etags)'
	@echo ' cscope - Generate cscope database'
	@echo ' test - Run zdtm test-suite'
	@echo ' gcov - Make code coverage report'
.PHONY: help

# Check test/zdtm.py with flake8 using the scripts/flake8.cfg configuration.
lint:
	flake8 --config=scripts/flake8.cfg test/zdtm.py

include Makefile.install

.DEFAULT_GOAL := all

# Disable implicit rules in _this_ Makefile.
.SUFFIXES:

#
# Optional local include.
-include Makefile.local criu-3.6/Makefile.compel000066400000000000000000000052121317335042600152460ustar00rootroot00000000000000COMPEL_BIN := ./compel/compel-host export COMPEL_BIN COMPEL_VERSION_HEADER := compel/include/version.h $(COMPEL_VERSION_HEADER): Makefile.versions $(call msg-gen, $(COMPEL_VERSION_HEADER)) $(Q) echo "/* Autogenerated, do not edit */" > $(COMPEL_VERSION_HEADER) $(Q) echo "#ifndef COMPEL_SO_VERSION_H__" >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_H__" >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION \"$(COMPEL_SO_VERSION)\"" >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_MAJOR " $(COMPEL_SO_VERSION_MAJOR) >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_MINOR " $(COMPEL_SO_VERSION_MINOR) >> $(COMPEL_VERSION_HEADER) $(Q) echo "#define COMPEL_SO_VERSION_SUBLEVEL " $(COMPEL_SO_VERSION_SUBLEVEL) >> $(COMPEL_VERSION_HEADER) $(Q) echo "#endif /* COMPEL_SO_VERSION_H__ */" >> $(COMPEL_VERSION_HEADER) compel/include/asm: $(call msg-gen, $@) $(Q) ln -s ../arch/$(ARCH)/src/lib/include $@ compel-deps += compel/include/asm compel-deps += $(COMPEL_VERSION_HEADER) compel-deps += $(CONFIG_HEADER) compel-deps += include/common/asm compel-plugins += compel/plugins/std.lib.a compel/plugins/fds.lib.a LIBCOMPEL_SO := libcompel.so LIBCOMPEL_A := libcompel.a export LIBCOMPEL_SO LIBCOMPEL_A criu-deps += compel/$(LIBCOMPEL_A) # # Compel itself. compel/Makefile: ; compel/%: $(compel-deps) $(compel-plugins) .FORCE $(Q) $(MAKE) $(build)=compel $@ criu-deps += compel/compel-host-bin # # Plugins compel/plugins/Makefile: ; compel/plugins/%: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ # # GNU make 4.x supports targets matching via wide # match targeting, where GNU make 3.x series (used on # Travis) is not, so we have to write them here explicitly. 
compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ compel/plugins/shmem.lib.a: $(compel-deps) compel/plugins/std.lib.a .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ compel/plugins/fds.lib.a: $(compel-deps) compel/plugins/std.lib.a .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ compel/compel: compel/built-in.o compel/$(LIBCOMPEL_A) | $(compel-deps) $(call msg-link, $@) $(Q) $(CC) $(CFLAGS) $^ $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ # # And compel library. LIBCOMPEL_SO_CFLAGS += $(CFLAGS) -rdynamic -Wl,-soname,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR) compel/$(LIBCOMPEL_SO): compel/$(LIBCOMPEL_A) $(call msg-link, $@) $(Q) $(CC) -shared $(LIBCOMPEL_SO_CFLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(LDFLAGS) compel-install-targets += compel/$(LIBCOMPEL_SO) compel-install-targets += compel/compel compel-install-targets += $(compel-plugins) criu-3.6/Makefile.config000066400000000000000000000036451317335042600152440ustar00rootroot00000000000000include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD endif ifeq ($(call pkg-config-check,libselinux),y) LIBS_FEATURES += -lselinux FEATURE_DEFINES += -DCONFIG_HAS_SELINUX endif export LIBS += $(LIBS_FEATURES) CONFIG_FILE = .config $(CONFIG_FILE): touch $(CONFIG_FILE) ifeq ($(SRCARCH),x86) # CONFIG_COMPAT is only for x86 now, no need for compile-test other archs ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else $(info Note: Building without ia32 C/R, missed ia32 support in gcc) $(info $(info) That may be related to missing gcc-multilib in your) $(info $(info) distribution or you may have Debian with buggy toolchain) $(info $(info) (issue https://github.com/xemul/criu/issues/315)) endif endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += 
$(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW # $1 - config name define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) @echo '#define CONFIG_HAS_$(1)' >> $$@ $(Q) @echo '' >> $$@ endif endef define config-header-rule $(CONFIG_HEADER): scripts/feature-tests.mak $(CONFIG_FILE) $$(call msg-gen, $$@) $(Q) @echo '#ifndef __CR_CONFIG_H__' > $$@ $(Q) @echo '#define __CR_CONFIG_H__' >> $$@ $(Q) @echo '' >> $$@ $(call map,gen-feature-test,$(FEATURES_LIST)) $(Q) @cat $(CONFIG_FILE) | sed -n -e '/^[^#]/s/^/#define CONFIG_/p' >> $$@ ifeq ($$(VDSO),y) $(Q) @echo '#define CONFIG_VDSO' >> $$@ $(Q) @echo '' >> $$@ endif $(Q) @echo '#endif /* __CR_CONFIG_H__ */' >> $$@ endef $(eval $(config-header-rule)) criu-3.6/Makefile.install000066400000000000000000000025541317335042600154430ustar00rootroot00000000000000# # Installation paths. PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin SBINDIR ?= $(PREFIX)/sbin MANDIR ?= $(PREFIX)/share/man INCLUDEDIR ?= $(PREFIX)/include LIBEXECDIR ?= $(PREFIX)/libexec RUNDIR ?= /run # # For recent Debian/Ubuntu with multiarch support. DEB_HOST_MULTIARCH := $(shell dpkg-architecture -qDEB_HOST_MULTIARCH 2>/dev/null) ifneq "$(DEB_HOST_MULTIARCH)" "" LIBDIR ?= $(PREFIX)/lib/$(DEB_HOST_MULTIARCH) else # # For most other systems ifeq "$(shell uname -m)" "x86_64" LIBDIR ?= $(PREFIX)/lib64 endif endif # # LIBDIR falls back to the standard path. 
LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR export LIBDIR INCLUDEDIR LIBEXECDIR install-man: $(Q) $(MAKE) -C Documentation install .PHONY: install-man install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel install: install-man install-lib install-criu install-compel ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ .PHONY: uninstall criu-3.6/Makefile.versions000066400000000000000000000016111317335042600156360ustar00rootroot00000000000000# # CRIU version. CRIU_VERSION_MAJOR := 3 CRIU_VERSION_MINOR := 6 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := CRIU_VERSION_NAME := Alabaster Finch CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL export CRIU_VERSION_EXTRA CRIU_VERSION_NAME CRIU_VERSION # # C library for CRIU. CRIU_SO_VERSION_MAJOR := 1 CRIU_SO_VERSION_MINOR := 0 export CRIU_SO_VERSION_MAJOR CRIU_SO_VERSION_MINOR # # SOCCR library. 
SOCCR_SO_VERSION_MAJOR := 1 SOCCR_SO_VERSION_MINOR := 0 export SOCCR_SO_VERSION_MAJOR SOCCR_SO_VERSION_MINOR COMPEL_SO_VERSION_MAJOR := 1 COMPEL_SO_VERSION_MINOR := 0 COMPEL_SO_VERSION_SUBLEVEL := 0 export COMPEL_SO_VERSION_MAJOR COMPEL_SO_VERSION_MINOR COMPEL_SO_VERSION_SUBLEVEL criu-3.6/README.md000066400000000000000000000116431317335042600136140ustar00rootroot00000000000000[![master](https://travis-ci.org/xemul/criu.svg?branch=master)](https://travis-ci.org/xemul/criu) [![development](https://travis-ci.org/xemul/criu.svg?branch=criu-dev)](https://travis-ci.org/xemul/criu) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/55251ec7db28421da4481fc7c1cb0cee)](https://www.codacy.com/app/xemul/criu?utm_source=github.com&utm_medium=referral&utm_content=xemul/criu&utm_campaign=Badge_Grade)

## CRIU -- A project to implement checkpoint/restore functionality for Linux CRIU (stands for Checkpoint and Restore in Userspace) is a utility to checkpoint/restore Linux tasks. Using this tool, you can freeze a running application (or part of it) and checkpoint it to a hard drive as a collection of files. You can then use the files to restore and run the application from the point it was frozen at. The distinctive feature of the CRIU project is that it is mainly implemented in user space. There are some more projects doing C/R for Linux, and so far CRIU [appears to be](https://criu.org/Comparison_to_other_CR_projects) the most feature-rich and up-to-date with the kernel. The project [started](https://criu.org/History) as the way to do live migration for OpenVZ Linux containers, but later grew into a more sophisticated and flexible tool. It is currently used by (integrated into) OpenVZ, LXC/LXD, Docker, and other software; the project gets tremendous help from the community, and its packages are included into many Linux distributions. The project home is at http://criu.org. This wiki contains all the knowledge base for CRIU we have. Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced usage](https://criu.org/Category:HOWTO) - Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) ### A video tour on basic CRIU features [![CRIU introduction](https://asciinema.org/a/7fnt2prsumvxiwf3ng61fgct3.png)](https://asciinema.org/a/7fnt2prsumvxiwf3ng61fgct3) ## Advanced features As main usage for CRIU is live migration, there's a library for it called P.Haul. Also the project exposes two cool core features as standalone libraries. These are libcompel for parasite code injection and libsoccr for TCP connections checkpoint-restore.
### Live migration True [live migration](https://criu.org/Live_migration) using CRIU is possible, but doing all the steps by hand might be complicated. The [phaul sub-project](https://criu.org/P.Haul) provides a Go library that encapsulates most of the complexity. ### Parasite code injection In order to get state of the running process CRIU needs to make this process execute some code that would fetch the required information. To make this happen without killing the application itself, CRIU uses the [parasite code injection](https://criu.org/Parasite_code) technique, which is also available as a standalone library called [libcompel](https://criu.org/Compel). ### TCP sockets checkpoint-restore One of the CRIU features is the ability to save and restore state of a TCP socket without breaking the connection. This functionality is considered to be useful by itself, and we have it available as the [libsoccr library](https://criu.org/Libsoccr). ## How to contribute The CRIU project is (almost) a never-ending story, because we have to always keep up with the Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. Here are some useful hints to get involved.
* We have both -- [very simple](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; * Feedback is expected on the github issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); * For historical reasons we do not accept PRs, instead [patches are welcome](http://criu.org/How_to_submit_patches); * Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); ## Licence The project is licensed under GPLv2 (though files sitting in the lib/ directory are LGPLv2.1). 
criu-3.6/compel/000077500000000000000000000000001317335042600136075ustar00rootroot00000000000000criu-3.6/compel/.gitignore000066400000000000000000000010401317335042600155720ustar00rootroot00000000000000arch/x86/plugins/std/sys-exec-tbl-64.c arch/x86/plugins/std/syscalls-64.S arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h plugins/include/uapi/std/syscall-codes-64.h plugins/include/uapi/std/syscall-codes.h plugins/include/uapi/std/syscall.h plugins/include/uapi/std/syscall-aux.h plugins/include/uapi/std/syscall-aux.S criu-3.6/compel/Makefile000066400000000000000000000065701317335042600152570ustar00rootroot00000000000000include Makefile.versions COMPEL_SO_VERSION := $(COMPEL_SO_VERSION_MAJOR)$(if $(COMPEL_SO_VERSION_MINOR),.$(COMPEL_SO_VERSION_MINOR))$(if $(COMPEL_SO_VERSION_SUBLEVEL),.$(COMPEL_SO_VERSION_SUBLEVEL)) COMPEL_SO_VERSION_CODE := $(shell expr $(COMPEL_SO_VERSION_MAJOR) \* 65536 \+ $(COMPEL_SO_VERSION_MINOR) \* 256 \+ $(COMPEL_SO_VERSION_SUBLEVEL)) ccflags-y += -DINCLUDEDIR=\"$(INCLUDEDIR)\" ccflags-y += -DLIBEXECDIR=\"$(LIBEXECDIR)\" ccflags-y += -DLIBDIR=\"$(LIBDIR)\" ccflags-y += -DSTATIC_LIB=\"$(LIBCOMPEL_A)\" ccflags-y += -DDYN_LIB=\"$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR)\" ccflags-y += -iquote compel/arch/$(ARCH)/src/lib/include ccflags-y += -iquote compel/include ccflags-y += -fno-strict-aliasing ccflags-y += -fPIC ccflags-y += $(CFLAGS_PIE) ldflags-y += -r # # UAPI inclusion, referred as ccflags-y += -I compel/include/uapi lib-name := $(LIBCOMPEL_A) lib-y += src/lib/log.o host-lib-y += src/lib/log.o lib-y += arch/$(ARCH)/src/lib/cpu.o lib-y += arch/$(ARCH)/src/lib/infect.o lib-y += src/lib/infect-rpc.o lib-y += src/lib/infect-util.o lib-y += src/lib/infect.o lib-y += src/lib/ptrace.o # handle_elf() has no support of ELF 
relocations on ARM (yet?) ifneq ($(filter arm aarch64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif obj-y += src/main.o obj-y += arch/$(ARCH)/src/lib/handle-elf.o obj-y += src/lib/handle-elf.o host-ccflags-y += $(ccflags-y) hostprogs-y += compel-host-bin compel-host-bin-objs := $(patsubst %.o,%-host.o,$(obj-y) $(host-lib-y)) cleanup-y += compel/compel cleanup-y += compel/compel-host-bin cleanup-y += compel/libcompel.so install: compel/compel compel/$(LIBCOMPEL_SO) compel/$(LIBCOMPEL_A) $(E) " INSTALL " compel $(Q) mkdir -p $(DESTDIR)$(BINDIR) $(Q) install -m 755 compel/compel $(DESTDIR)$(BINDIR) $(E) " INSTALL " $(LIBCOMPEL_SO) $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 0644 compel/$(LIBCOMPEL_SO) $(DESTDIR)$(LIBDIR) $(Q) install -m 755 compel/$(LIBCOMPEL_SO) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(Q) ln -fns $(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR) $(Q) ln -fns $(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO) $(E) " INSTALL " $(LIBCOMPEL_A) $(Q) install -m 0644 compel/$(LIBCOMPEL_A) $(DESTDIR)$(LIBDIR) $(E) " INSTALL " compel uapi $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/compel/asm $(Q) cp -fr compel/include/uapi/*.h $(DESTDIR)$(INCLUDEDIR)/compel/ $(Q) cp -fr compel/include/uapi/asm/*.h $(DESTDIR)$(INCLUDEDIR)/compel/asm/ $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/compel/common/asm $(Q) cp -fr include/common/compiler.h $(DESTDIR)$(INCLUDEDIR)/compel/common/ .PHONY: install uninstall: $(E) " UNINSTALL" compel $(Q) $(RM) $(addprefix $(DESTDIR)$(BINDIR)/,compel) $(E) " UNINSTALL" $(LIBCOMPEL_SO) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO)) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR)) $(Q) $(RM) $(addprefix 
$(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR)) $(E) " UNINSTALL" $(LIBCOMPEL_A) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_A)) $(E) " UNINSTALL" compel uapi $(Q) $(RM) -rf $(addprefix $(DESTDIR)$(INCLUDEDIR)/,compel/*) .PHONY: uninstall criu-3.6/compel/arch/000077500000000000000000000000001317335042600145245ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/000077500000000000000000000000001317335042600157545ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/000077500000000000000000000000001317335042600174355ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/include/000077500000000000000000000000001317335042600210605ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/include/asm/000077500000000000000000000000001317335042600216405ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/include/asm/prologue.h000077700000000000000000000000001317335042600343622../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/include/asm/syscall-types.h000066400000000000000000000011711317335042600246250ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000 typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ criu-3.6/compel/arch/aarch64/plugins/include/features.h000066400000000000000000000001511317335042600230440ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H 
#define __COMPEL_ARCH_FEATURES_H #endif /* __COMPEL_ARCH_FEATURES_H */ criu-3.6/compel/arch/aarch64/plugins/std/000077500000000000000000000000001317335042600202275ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/std/parasite-head.S000066400000000000000000000007631317335042600230700ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text, "ax" ENTRY(__export_parasite_head_start) adr x2, __export_parasite_head_start // get the address of this instruction ldr x0, __export_parasite_cmd ldr x1, parasite_args_ptr add x1, x1, x2 // fixup __export_parasite_args bl parasite_service brk #0 // the instruction BRK #0 generates the signal SIGTRAP in Linux parasite_args_ptr: .quad __export_parasite_args __export_parasite_cmd: .quad 0 END(__export_parasite_head_start) criu-3.6/compel/arch/aarch64/plugins/std/syscalls/000077500000000000000000000000001317335042600220645ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/std/syscalls/Makefile.syscalls000077700000000000000000000000001317335042600366122../../../../arm/plugins/std/syscalls/Makefile.syscallsustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/std/syscalls/gen-sys-exec-tbl.pl000077700000000000000000000000001317335042600370702../../../../arm/plugins/std/syscalls/gen-sys-exec-tbl.plustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/std/syscalls/gen-syscalls.pl000077700000000000000000000000001317335042600357242../../../../arm/plugins/std/syscalls/gen-syscalls.plustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.S000066400000000000000000000010311317335042600244500ustar00rootroot00000000000000/** * This source contains emulation of syscalls * that are not implemented in the AArch64 Linux kernel */ ENTRY(sys_open) mov x3, x2 mov x2, x1 mov x1, x0 mov x0, #-100 b sys_openat END(sys_open) ENTRY(sys_mkdir) mov x3, x2 mov x2, x1 mov x1, x0 mov x0, #-100 b sys_mkdirat END(sys_mkdir) ENTRY(sys_rmdir) mov x2, 
#0x200 // flags = AT_REMOVEDIR mov x1, x0 mov x0, #-100 b sys_unlinkat END(sys_rmdir) ENTRY(sys_unlink) mov x2, #0 // flags = 0 mov x1, x0 mov x0, #-100 b sys_unlinkat END(sys_unlink) criu-3.6/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h000066400000000000000000000000631317335042600245010ustar00rootroot00000000000000#ifndef __NR_openat # define __NR_openat 56 #endif criu-3.6/compel/arch/aarch64/plugins/std/syscalls/syscall-common.S000066400000000000000000000003521317335042600251500ustar00rootroot00000000000000#include "common/asm/linkage.h" syscall_common: svc #0 ret .macro syscall name, nr ENTRY(\name) mov x8, \nr b syscall_common END(\name) .endm ENTRY(__cr_restore_rt) mov x8, __NR_rt_sigreturn svc #0 END(__cr_restore_rt) criu-3.6/compel/arch/aarch64/plugins/std/syscalls/syscall.def000077700000000000000000000000001317335042600343062../../../../arm/plugins/std/syscalls/syscall.defustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/scripts/000077500000000000000000000000001317335042600174435ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/scripts/compel-pack.lds.S000066400000000000000000000011361317335042600225440ustar00rootroot00000000000000OUTPUT_ARCH(aarch64) EXTERN(__export_parasite_head_start) SECTIONS { .crblob 0x0 : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) . = ALIGN(32); *(.data*) . = ALIGN(32); *(.rodata*) . = ALIGN(32); *(.bss*) . = ALIGN(32); *(.got*) . = ALIGN(32); *(.toc*) . = ALIGN(32); } =0x00000000, /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) *(*) } /* Parasite args should have 4 bytes align, as we have futex inside. */ . 
= ALIGN(4); __export_parasite_args = .; } criu-3.6/compel/arch/aarch64/src/000077500000000000000000000000001317335042600165435ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/000077500000000000000000000000001317335042600173115ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/cpu.c000066400000000000000000000012651317335042600202500ustar00rootroot00000000000000#include #include #include "compel-cpu.h" #include "common/bitops.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *info) { return 0; } bool compel_cpu_has_feature(unsigned int feature) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } return compel_test_cpu_cap(&rt_info, feature); } criu-3.6/compel/arch/aarch64/src/lib/handle-elf-host.c000077700000000000000000000000001317335042600245652handle-elf.custar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/handle-elf.c000066400000000000000000000014401317335042600214530ustar00rootroot00000000000000#include #include "uapi/compel.h" #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { const unsigned char *elf_ident = #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ elf_ident_64_le; #else elf_ident_64_be; #endif if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) return 
handle_elf_aarch64(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } criu-3.6/compel/arch/aarch64/src/lib/include/000077500000000000000000000000001317335042600207345ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/include/cpu.h000066400000000000000000000000001317335042600216620ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/include/handle-elf.h000066400000000000000000000004471317335042600231110ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define __handle_elf handle_elf_aarch64 #define arch_is_machine_supported(e_machine) (e_machine == EM_AARCH64) extern int handle_elf_aarch64(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ criu-3.6/compel/arch/aarch64/src/lib/include/syscall.h000066400000000000000000000001561317335042600225610ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, compat) __NR_##syscall #endif criu-3.6/compel/arch/aarch64/src/lib/include/uapi/000077500000000000000000000000001317335042600216725ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/000077500000000000000000000000001317335042600224525ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001317335042600244300ustar00rootroot00000000000000criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003771317335042600251530ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT static inline int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } static inline int ptrace_flush_breakpoints(pid_t pid) { return 0; } #endif criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002141317335042600234070ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define 
UAPI_COMPEL_ASM_CPU_H__ typedef struct { } compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211317335042600234070ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000013131317335042600252330ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #include #define SIGMAX 64 #define SIGMAX_OLD 31 /* * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h * * A thread ARM CPU context */ typedef struct user_pt_regs user_regs_struct_t; typedef struct user_fpsimd_state user_fpregs_struct_t; #define REG_RES(r) ((uint64_t)(r).regs[0]) #define REG_IP(r) ((uint64_t)(r).pc) #define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) #define user_regs_native(pregs) true #define ARCH_SI_TRAP TRAP_BRKPT #define __NR(syscall, compat) __NR_##syscall #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000002121317335042600257270ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ criu-3.6/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000036441317335042600244270ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ #define FPSIMD_MAGIC 0x46508001 typedef struct fpsimd_context fpu_state_t; struct aux_context { struct fpsimd_context fpsimd; /* additional context to be added before "end" */ struct _aarch64_ctx end; }; // XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code 
#define rt_sigcontext sigcontext #include /* Copied from the kernel source arch/arm64/kernel/signal.c */ struct rt_sigframe { siginfo_t info; ucontext_t uc; uint64_t fp; uint64_t lr; }; #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mov sp, %0 \n" \ "mov x8, #"__stringify(__NR_rt_sigreturn)" \n" \ "svc #0 \n" \ : \ : "r"(new_sp) \ : "sp", "x8", "memory") /* cr_sigcontext is copied from arch/arm64/include/uapi/asm/sigcontext.h */ struct cr_sigcontext { __u64 fault_address; /* AArch64 registers */ __u64 regs[31]; __u64 sp; __u64 pc; __u64 pstate; /* 4K reserved for FP/SIMD state and future expansion */ __u8 __reserved[4096] __attribute__((__aligned__(16))); }; #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) #define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.pc) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context*)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ criu-3.6/compel/arch/aarch64/src/lib/infect.c000066400000000000000000000075411317335042600207340ustar00rootroot00000000000000#include #include #include #include #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" #include "log.h" #include "errno.h" #include "infect.h" #include "infect-priv.h" /* * Injected syscall instruction */ const char code_syscall[] = { 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */ 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline void __always_unused __check_code_syscall(void) { BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } int 
sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, sizeof(regs->regs)); sigframe->uc.uc_mcontext.sp = regs->sp; sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); fpsimd->fpsr = fpregs->fpsr; fpsimd->fpcr = fpregs->fpcr; fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, void *arg) { struct iovec iov; user_fpregs_struct_t fpsimd; int ret; pr_info("Dumping GP/FPU registers for %d\n", pid); iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { pr_perror("Failed to obtain CPU registers for %d", pid); goto err; } iov.iov_base = &fpsimd; iov.iov_len = sizeof(fpsimd); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } ret = save(arg, regs, &fpsimd); err: return ret; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; regs.regs[8] = (unsigned long)nr; regs.regs[0] = arg1; regs.regs[1] = arg2; regs.regs[2] = arg3; regs.regs[3] = arg4; regs.regs[4] = arg5; regs.regs[5] = arg6; regs.regs[6] = 0; regs.regs[7] = 0; err = compel_execute_syscall(ctl, ®s, code_syscall); *ret = regs.regs[0]; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; err = compel_syscall(ctl, 
__NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0 || (long)map < 0) map = 0; return (void *)map; } void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->pc = new_ip; if (stack) regs->sp = (unsigned long)stack; } bool arch_can_dump_task(struct parasite_ctl *ctl) { /* * TODO: Add proper check here */ return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } /* * Range for task size calculated from the following Linux kernel files: * arch/arm64/include/asm/memory.h * arch/arm64/Kconfig * * TODO: handle 32 bit tasks */ #define TASK_SIZE_MIN (1UL << 39) #define TASK_SIZE_MAX (1UL << 48) unsigned long compel_task_size(void) { unsigned long task_size; for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) if (munmap((void *)task_size, page_size())) break; return task_size; } criu-3.6/compel/arch/arm/000077500000000000000000000000001317335042600153035ustar00rootroot00000000000000criu-3.6/compel/arch/arm/plugins/000077500000000000000000000000001317335042600167645ustar00rootroot00000000000000criu-3.6/compel/arch/arm/plugins/include/000077500000000000000000000000001317335042600204075ustar00rootroot00000000000000criu-3.6/compel/arch/arm/plugins/include/asm/000077500000000000000000000000001317335042600211675ustar00rootroot00000000000000criu-3.6/compel/arch/arm/plugins/include/asm/prologue.h000077700000000000000000000000001317335042600337112../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000criu-3.6/compel/arch/arm/plugins/include/asm/syscall-types.h000066400000000000000000000011711317335042600241540ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000 typedef void rt_signalfn_t(int, siginfo_t *, void *); 
typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 32 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ criu-3.6/compel/arch/arm/plugins/include/features.h000066400000000000000000000001511317335042600223730ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #endif /* __COMPEL_ARCH_FEATURES_H */ criu-3.6/compel/arch/arm/plugins/std/000077500000000000000000000000001317335042600175565ustar00rootroot00000000000000criu-3.6/compel/arch/arm/plugins/std/parasite-head.S000066400000000000000000000010251317335042600224070ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text, "ax" ENTRY(__export_parasite_head_start) sub r2, pc, #8 @ get the address of this instruction adr r0, __export_parasite_cmd ldr r0, [r0] adr r1, parasite_args_ptr ldr r1, [r1] add r1, r1, r2 @ fixup __export_parasite_args bl parasite_service .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux parasite_args_ptr: .long __export_parasite_args __export_parasite_cmd: .long 0 END(__export_parasite_head_start) criu-3.6/compel/arch/arm/plugins/std/syscalls/000077500000000000000000000000001317335042600214135ustar00rootroot00000000000000criu-3.6/compel/arch/arm/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000036721317335042600247170ustar00rootroot00000000000000ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ sys-types := $(obj)/include/uapi/std/syscall-types.h sys-codes := $(obj)/include/uapi/std/syscall-codes.h sys-proto := $(obj)/include/uapi/std/syscall.h sys-def := 
$(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def sys-asm-common-name := std/syscalls/syscall-common.S sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S std-lib-y += $(sys-asm:.S=).o ifeq ($(ARCH),arm) arch_bits := 32 else arch_bits := 64 endif sys-exec-tbl := sys-exec-tbl.c $(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) $(E) " GEN " $@ $(Q) perl \ $(sys-gen) \ $(sys-def) \ $(sys-codes) \ $(sys-proto) \ $(sys-asm) \ $(sys-asm-common-name) \ $(sys-types) \ $(arch_bits) $(sys-asm:.S=).o: $(sys-asm) $(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) $(E) " GEN " $@ $(Q) perl \ $(sys-gen-tbl) \ $(sys-def) \ $(sys-exec-tbl) \ $(arch_bits) $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) mrproper-y += $(std-headers-deps) mrproper-y += $(obj)/include/uapi/std/syscall-aux.S mrproper-y += $(obj)/include/uapi/std/syscall-aux.h criu-3.6/compel/arch/arm/plugins/std/syscalls/gen-sys-exec-tbl.pl000077500000000000000000000015741317335042600250500ustar00rootroot00000000000000#!/usr/bin/perl use strict; use warnings; my $in = $ARGV[0]; my $tblout = $ARGV[1]; my $bits = $ARGV[2]; my $code = "code$bits"; open TBLOUT, ">", $tblout or die $!; open IN, "<", $in or die $!; print TBLOUT "/* Autogenerated, don't edit */\n"; 
print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; for () { if ($_ =~ /\#/) { next; } my $sys_name; my $sys_num; if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $sys_name = $+{alias}; } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $sys_name = $+{name}; } else { unlink $tblout; die "Invalid syscall definition file: invalid entry $_\n"; } $sys_num = $+{$code}; if ($sys_num ne "!") { print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; } } print TBLOUT " { }, /* terminator */"; print TBLOUT "};" criu-3.6/compel/arch/arm/plugins/std/syscalls/gen-syscalls.pl000077500000000000000000000044621317335042600243650ustar00rootroot00000000000000#!/usr/bin/perl use strict; use warnings; my $in = $ARGV[0]; my $codesout = $ARGV[1]; my $codes = $ARGV[1]; $codes =~ s/.*include\/uapi\//compel\/plugins\//g; my $protosout = $ARGV[2]; my $protos = $ARGV[2]; $protos =~ s/.*include\/uapi\//compel\/plugins\//g; my $asmout = $ARGV[3]; my $asmcommon = $ARGV[4]; my $prototypes = $ARGV[5]; $prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; my $bits = $ARGV[6]; my $codesdef = $codes; $codesdef =~ tr/.\-\//_/; my $protosdef = $protos; $protosdef =~ tr/.\-\//_/; my $code = "code$bits"; my $need_aux = 0; unlink $codesout; unlink $protosout; unlink $asmout; open CODESOUT, ">", $codesout or die $!; open PROTOSOUT, ">", $protosout or die $!; open ASMOUT, ">", $asmout or die $!; open IN, "<", $in or die $!; print CODESOUT <<"END"; /* Autogenerated, don't edit */ #ifndef $codesdef #define $codesdef END print PROTOSOUT <<"END"; /* Autogenerated, don't edit */ #ifndef $protosdef #define $protosdef #include <$prototypes> #include <$codes> END print ASMOUT <<"END"; /* Autogenerated, don't edit */ #include <$codes> #include "$asmcommon" END for () { if ($_ =~ /\#/) { next; } my $code_macro; my $sys_macro; my $sys_name; if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $code_macro = "__NR_$+{name}"; $sys_macro = "SYS_$+{name}"; $sys_name = 
"sys_$+{alias}"; } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { $code_macro = "__NR_$+{name}"; $sys_macro = "SYS_$+{name}"; $sys_name = "sys_$+{name}"; } else { unlink $codesout; unlink $protosout; unlink $asmout; die "Invalid syscall definition file: invalid entry $_\n"; } if ($+{$code} ne "!") { print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; print ASMOUT "syscall $sys_name, $code_macro\n"; } else { $need_aux = 1; } print PROTOSOUT "extern long $sys_name($+{args});\n"; } if ($need_aux == 1) { print ASMOUT "#include \n"; print CODESOUT "#include \n"; } print CODESOUT "#endif /* $codesdef */"; print PROTOSOUT "#endif /* $protosdef */"; criu-3.6/compel/arch/arm/plugins/std/syscalls/syscall-aux.S000066400000000000000000000003211317335042600240000ustar00rootroot00000000000000nr_sys_mmap: .long 192 ENTRY(sys_mmap) push {r4, r5, r7, lr} ldr r4, [sp, #16] ldr r5, [sp, #20] lsr r5, #12 adr r7, nr_sys_mmap ldr r7, [r7] svc 0x00000000 pop {r4, r5, r7, pc} END(sys_mmap) criu-3.6/compel/arch/arm/plugins/std/syscalls/syscall-aux.h000066400000000000000000000011201317335042600240230ustar00rootroot00000000000000#ifndef __NR_mmap2 # define __NR_mmap2 192 #endif #ifndef __ARM_NR_BASE # define __ARM_NR_BASE 0x0f0000 #endif #ifndef __ARM_NR_breakpoint # define __ARM_NR_breakpoint (__ARM_NR_BASE+1) #endif #ifndef __ARM_NR_cacheflush # define __ARM_NR_cacheflush (__ARM_NR_BASE+2) #endif #ifndef __ARM_NR_usr26 # define __ARM_NR_usr26 (__ARM_NR_BASE+3) #endif #ifndef __ARM_NR_usr32 # define __ARM_NR_usr32 (__ARM_NR_BASE+4) #endif #ifndef __ARM_NR_set_tls # define __ARM_NR_set_tls (__ARM_NR_BASE+5) #endif criu-3.6/compel/arch/arm/plugins/std/syscalls/syscall-common.S000066400000000000000000000013121317335042600244740ustar00rootroot00000000000000#include "common/asm/linkage.h" @ We use the register R8 unlike libc that uses R12. 
@ This avoids corruption of the register by the stub @ for the syscall sys_munmap() when syscalls are hooked @ by ptrace(). However we have to make sure that @ the compiler doesn't use the register on the route @ between parasite_service() and sys_munmap(). syscall_common: ldr r7, [r7] add r8, sp, #24 ldm r8, {r4, r5, r6} svc 0x00000000 pop {r4, r5, r6, r7, r8, pc} .macro syscall name, nr .nr_\name : .long \nr ENTRY(\name) push {r4, r5, r6, r7, r8, lr} adr r7, .nr_\name b syscall_common END(\name) .endm ENTRY(__cr_restore_rt) adr r7, .nr_sys_rt_sigreturn ldr r7, [r7] svc #0 END(__cr_restore_rt) criu-3.6/compel/arch/arm/plugins/std/syscalls/syscall.def000066400000000000000000000155151317335042600235540ustar00rootroot00000000000000# # System calls table, please make sure the table consists of only the syscalls # really used somewhere in the project. # # The template is (name and arguments are optional if you need only __NR_x # defined, but no real entry point in syscalls lib). # # name/alias code64 code32 arguments # ----------------------------------------------------------------------- # read 63 3 (int fd, void *buf, unsigned long count) write 64 4 (int fd, const void *buf, unsigned long count) open ! 5 (const char *filename, unsigned long flags, unsigned long mode) close 57 6 (int fd) lseek 62 19 (int fd, unsigned long offset, unsigned long origin) mmap 222 ! 
(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) munmap 215 91 (void *addr, unsigned long len) brk 214 45 (void *addr) rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) rt_sigreturn 139 173 (void) ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) ptrace 117 26 (long request, pid_t pid, void *addr, void *data) mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) madvise 233 220 (unsigned long start, size_t len, int behavior) shmat 196 305 (int shmid, void *shmaddr, int shmflag) pause 1061 29 (void) nanosleep 101 162 (struct timespec *req, struct timespec *rem) getitimer 102 105 (int which, const struct itimerval *val) setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) getpid 172 20 (void) socket 198 281 (int domain, int type, int protocol) connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void 
*optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) exit 93 1 (unsigned long error_code) wait4 260 114 (int pid, int *status, int options, struct rusage *ru) waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) kill 129 37 (long pid, int sig) fcntl 25 55 (int fd, int type, long arg) flock 32 143 (int fd, unsigned long cmd) mkdir ! 39 (const char *name, int mode) rmdir ! 40 (const char *name) unlink ! 10 (char *pathname) readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) umask 166 60 (int mask) getgroups 158 205 (int gsize, unsigned int *groups) setgroups 159 206 (int gsize, unsigned int *groups) setresuid 147 164 (int uid, int euid, int suid) getresuid 148 165 (int *uid, int *euid, int *suid) setresgid 149 170 (int gid, int egid, int sgid) getresgid 150 171 (int *gid, int *egid, int *sgid) getpgid 155 132 (pid_t pid) setfsuid 151 138 (int fsuid) setfsgid 152 139 (int fsgid) getsid 156 147 (void) capget 90 184 (struct cap_header *h, struct cap_data *d) capset 91 185 (struct cap_header *h, struct cap_data *d) rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) setpriority 140 97 (int which, int who, int nice) sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) sigaltstack 132 186 (const void *uss, void *uoss) personality 92 136 (unsigned int personality) prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) arch_prctl ! 
17 (int option, unsigned long addr) setrlimit 164 75 (int resource, struct krlimit *rlim) mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) umount2 39 52 (char *name, int flags) gettid 178 224 (void) futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) set_tid_address 96 256 (int *tid_addr) restart_syscall 128 0 (void) timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) setns 268 375 (int fd, int nstype) kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) unlinkat 35 328 (int dirfd, const char *pathname, int 
flags) memfd_create 279 385 (const char *name, unsigned int flags) io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) preadv 69 361 (int fd, struct iovec *iov, unsigned long nr, loff_t off) userfaultfd 282 388 (int flags) criu-3.6/compel/arch/arm/scripts/000077500000000000000000000000001317335042600167725ustar00rootroot00000000000000criu-3.6/compel/arch/arm/scripts/compel-pack.lds.S000066400000000000000000000011321317335042600220670ustar00rootroot00000000000000OUTPUT_ARCH(arm) EXTERN(__export_parasite_head_start) SECTIONS { .crblob 0x0 : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) . = ALIGN(32); *(.data*) . = ALIGN(32); *(.rodata*) . = ALIGN(32); *(.bss*) . = ALIGN(32); *(.got*) . = ALIGN(32); *(.toc*) . = ALIGN(32); } =0x00000000, /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) *(*) } /* Parasite args should have 4 bytes align, as we have futex inside. */ . 
= ALIGN(4); __export_parasite_args = .; } criu-3.6/compel/arch/arm/src/000077500000000000000000000000001317335042600160725ustar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/000077500000000000000000000000001317335042600166405ustar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/cpu.c000066400000000000000000000012651317335042600175770ustar00rootroot00000000000000#include #include #include "compel-cpu.h" #include "common/bitops.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *info) { return 0; } bool compel_cpu_has_feature(unsigned int feature) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } return compel_test_cpu_cap(&rt_info, feature); } criu-3.6/compel/arch/arm/src/lib/handle-elf-host.c000077700000000000000000000000001317335042600241142handle-elf.custar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/handle-elf.c000066400000000000000000000007461317335042600210120ustar00rootroot00000000000000#include #include "uapi/compel.h" #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_32[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_32, sizeof(elf_ident_32)) == 0) return handle_elf_arm(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } 
criu-3.6/compel/arch/arm/src/lib/include/000077500000000000000000000000001317335042600202635ustar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/include/cpu.h000066400000000000000000000000001317335042600212110ustar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/include/handle-elf.h000066400000000000000000000004331317335042600224330ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf32-types.h" #define __handle_elf handle_elf_arm #define arch_is_machine_supported(e_machine) (e_machine == EM_ARM) extern int handle_elf_arm(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ criu-3.6/compel/arch/arm/src/lib/include/syscall.h000066400000000000000000000001561317335042600221100ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, compat) __NR_##syscall #endif criu-3.6/compel/arch/arm/src/lib/include/uapi/000077500000000000000000000000001317335042600212215ustar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/000077500000000000000000000000001317335042600220015ustar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001317335042600237570ustar00rootroot00000000000000criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003771317335042600245020ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT static inline int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } static inline int ptrace_flush_breakpoints(pid_t pid) { return 0; } #endif criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002141317335042600227360ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ typedef struct { } compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ 
criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211317335042600227360ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000025171317335042600245710ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #define SIGMAX 64 #define SIGMAX_OLD 31 /* * Copied from the Linux kernel header arch/arm/include/asm/ptrace.h * * A thread ARM CPU context */ typedef struct { long uregs[18]; } user_regs_struct_t; typedef struct user_vfp user_fpregs_struct_t; #define ARM_cpsr uregs[16] #define ARM_pc uregs[15] #define ARM_lr uregs[14] #define ARM_sp uregs[13] #define ARM_ip uregs[12] #define ARM_fp uregs[11] #define ARM_r10 uregs[10] #define ARM_r9 uregs[9] #define ARM_r8 uregs[8] #define ARM_r7 uregs[7] #define ARM_r6 uregs[6] #define ARM_r5 uregs[5] #define ARM_r4 uregs[4] #define ARM_r3 uregs[3] #define ARM_r2 uregs[2] #define ARM_r1 uregs[1] #define ARM_r0 uregs[0] #define ARM_ORIG_r0 uregs[17] /* Copied from arch/arm/include/asm/user.h */ struct user_vfp { unsigned long long fpregs[32]; unsigned long fpscr; }; struct user_vfp_exc { unsigned long fpexc; unsigned long fpinst; unsigned long fpinst2; }; #define REG_RES(regs) ((regs).ARM_r0) #define REG_IP(regs) ((regs).ARM_pc) #define REG_SYSCALL_NR(regs) ((regs).ARM_r7) #define user_regs_native(pregs) true #define ARCH_SI_TRAP TRAP_BRKPT #define __NR(syscall, compat) __NR_##syscall #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000024361317335042600252700ustar00rootroot00000000000000#ifndef __CR_PROCESSOR_FLAGS_H__ #define __CR_PROCESSOR_FLAGS_H__ /* Copied from the Linux kernel header arch/arm/include/uapi/asm/ptrace.h */ /* * PSR bits */ #define USR26_MODE 0x00000000 #define FIQ26_MODE 0x00000001 #define 
IRQ26_MODE 0x00000002 #define SVC26_MODE 0x00000003 #define USR_MODE 0x00000010 #define FIQ_MODE 0x00000011 #define IRQ_MODE 0x00000012 #define SVC_MODE 0x00000013 #define ABT_MODE 0x00000017 #define UND_MODE 0x0000001b #define SYSTEM_MODE 0x0000001f #define MODE32_BIT 0x00000010 #define MODE_MASK 0x0000001f #define PSR_T_BIT 0x00000020 #define PSR_F_BIT 0x00000040 #define PSR_I_BIT 0x00000080 #define PSR_A_BIT 0x00000100 #define PSR_E_BIT 0x00000200 #define PSR_J_BIT 0x01000000 #define PSR_Q_BIT 0x08000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 #define PSR_Z_BIT 0x40000000 #define PSR_N_BIT 0x80000000 /* * Groups of PSR bits */ #define PSR_f 0xff000000 /* Flags */ #define PSR_s 0x00ff0000 /* Status */ #define PSR_x 0x0000ff00 /* Extension */ #define PSR_c 0x000000ff /* Control */ #endif criu-3.6/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000041501317335042600237470ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include /* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */ struct rt_sigcontext { unsigned long trap_no; unsigned long error_code; unsigned long oldmask; unsigned long arm_r0; unsigned long arm_r1; unsigned long arm_r2; unsigned long arm_r3; unsigned long arm_r4; unsigned long arm_r5; unsigned long arm_r6; unsigned long arm_r7; unsigned long arm_r8; unsigned long arm_r9; unsigned long arm_r10; unsigned long arm_fp; unsigned long arm_ip; unsigned long arm_sp; unsigned long arm_lr; unsigned long arm_pc; unsigned long arm_cpsr; unsigned long fault_address; }; /* Copied from the Linux kernel header arch/arm/include/asm/ucontext.h */ #define VFP_MAGIC 0x56465001 #define VFP_STORAGE_SIZE sizeof(struct vfp_sigframe) struct vfp_sigframe { unsigned long magic; unsigned long size; struct user_vfp ufp; struct user_vfp_exc ufp_exc; }; typedef struct vfp_sigframe fpu_state_t; struct aux_sigframe { /* struct crunch_sigframe crunch; struct 
iwmmxt_sigframe iwmmxt; */ struct vfp_sigframe vfp; unsigned long end_magic; } __attribute__((aligned(8))); #include struct sigframe { struct rt_ucontext uc; unsigned long retcode[2]; }; struct rt_sigframe { struct rt_siginfo info; struct sigframe sig; }; #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mov sp, %0 \n" \ "mov r7, #"__stringify(__NR_rt_sigreturn)" \n" \ "svc #0 \n" \ : \ : "r"(new_sp) \ : "sp","memory") #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->sig.uc) #define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->sig.uc.uc_mcontext.arm_ip #define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 #define RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe) ((struct aux_sigframe *)&(rt_sigframe)->sig.uc.uc_regspace) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe)->vfp) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ criu-3.6/compel/arch/arm/src/lib/infect.c000066400000000000000000000115041317335042600202550ustar00rootroot00000000000000#include #include #include #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" #include "log.h" #include "errno.h" #include "infect.h" #include "infect-priv.h" /* * Injected syscall instruction */ const char code_syscall[] = { 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline __always_unused void __check_code_syscall(void) { BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct aux_sigframe *aux = (struct aux_sigframe *)(void *)&sigframe->sig.uc.uc_regspace; sigframe->sig.uc.uc_mcontext.arm_r0 = regs->ARM_r0; sigframe->sig.uc.uc_mcontext.arm_r1 = regs->ARM_r1; sigframe->sig.uc.uc_mcontext.arm_r2 = regs->ARM_r2; sigframe->sig.uc.uc_mcontext.arm_r3 
= regs->ARM_r3; sigframe->sig.uc.uc_mcontext.arm_r4 = regs->ARM_r4; sigframe->sig.uc.uc_mcontext.arm_r5 = regs->ARM_r5; sigframe->sig.uc.uc_mcontext.arm_r6 = regs->ARM_r6; sigframe->sig.uc.uc_mcontext.arm_r7 = regs->ARM_r7; sigframe->sig.uc.uc_mcontext.arm_r8 = regs->ARM_r8; sigframe->sig.uc.uc_mcontext.arm_r9 = regs->ARM_r9; sigframe->sig.uc.uc_mcontext.arm_r10 = regs->ARM_r10; sigframe->sig.uc.uc_mcontext.arm_fp = regs->ARM_fp; sigframe->sig.uc.uc_mcontext.arm_ip = regs->ARM_ip; sigframe->sig.uc.uc_mcontext.arm_sp = regs->ARM_sp; sigframe->sig.uc.uc_mcontext.arm_lr = regs->ARM_lr; sigframe->sig.uc.uc_mcontext.arm_pc = regs->ARM_pc; sigframe->sig.uc.uc_mcontext.arm_cpsr = regs->ARM_cpsr; memcpy(&aux->vfp.ufp.fpregs, &fpregs->fpregs, sizeof(aux->vfp.ufp.fpregs)); aux->vfp.ufp.fpscr = fpregs->fpscr; aux->vfp.magic = VFP_MAGIC; aux->vfp.size = VFP_STORAGE_SIZE; return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } #define PTRACE_GETVFPREGS 27 int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, void *arg) { user_fpregs_struct_t vfp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); if (ptrace(PTRACE_GETVFPREGS, pid, NULL, &vfp)) { pr_perror("Can't obtain FPU registers for %d", pid); goto err; } /* Did we come from a system call? 
*/ if ((int)regs->ARM_ORIG_r0 >= 0) { /* Restart the system call */ switch ((long)(int)regs->ARM_r0) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ARM_r0 = regs->ARM_ORIG_r0; regs->ARM_pc -= 4; break; case -ERESTART_RESTARTBLOCK: regs->ARM_r0 = __NR_restart_syscall; regs->ARM_pc -= 4; break; } } ret = save(arg, regs, &vfp); err: return ret; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; regs.ARM_r7 = (unsigned long)nr; regs.ARM_r0 = arg1; regs.ARM_r1 = arg2; regs.ARM_r2 = arg3; regs.ARM_r3 = arg4; regs.ARM_r4 = arg5; regs.ARM_r5 = arg6; err = compel_execute_syscall(ctl, ®s, code_syscall); *ret = regs.ARM_r0; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; if (offset & ~PAGE_MASK) return 0; err = compel_syscall(ctl, __NR_mmap2, &map, (unsigned long)addr, length, prot, flags, fd, offset >> 12); if (err < 0 || map > ctl->ictx.task_size) map = 0; return (void *)map; } void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->ARM_pc = new_ip; if (stack) regs->ARM_sp = (unsigned long)stack; /* Make sure flags are in known state */ regs->ARM_cpsr &= PSR_f | PSR_s | PSR_x | MODE32_BIT; } bool arch_can_dump_task(struct parasite_ctl *ctl) { /* * TODO: Add proper check here */ return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->sig.uc.uc_stack, 0, 0, 0, 0); return err ? 
err : ret; } /* * Range for task size calculated from the following Linux kernel files: * arch/arm/include/asm/memory.h * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section) */ #define TASK_SIZE_MIN 0x3f000000 #define TASK_SIZE_MAX 0xbf000000 #define SZ_1G 0x40000000 unsigned long compel_task_size(void) { unsigned long task_size; for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size += SZ_1G) if (munmap((void *)task_size, page_size())) break; return task_size; } criu-3.6/compel/arch/ppc64/000077500000000000000000000000001317335042600154605ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/plugins/000077500000000000000000000000001317335042600171415ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/plugins/include/000077500000000000000000000000001317335042600205645ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/plugins/include/asm/000077500000000000000000000000001317335042600213445ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/plugins/include/asm/prologue.h000077700000000000000000000000001317335042600340662../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/plugins/include/asm/syscall-types.h000066400000000000000000000011741317335042600243340ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000U typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ 
criu-3.6/compel/arch/ppc64/plugins/include/features.h000066400000000000000000000002321317335042600225500ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #define ARCH_HAS_MEMCPY #define ARCH_HAS_MEMCMP #endif /* __COMPEL_ARCH_FEATURES_H */ criu-3.6/compel/arch/ppc64/plugins/std/000077500000000000000000000000001317335042600177335ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/plugins/std/memcmp.S000066400000000000000000000056561317335042600213510ustar00rootroot00000000000000/* * Author: Anton Blanchard * Copyright 2015 IBM Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * -- * Copied form the linux file arch/powerpc/lib/memcmp_64.S */ #include "common/asm/linkage.h" #define off8 r6 #define off16 r7 #define off24 r8 #define rA r9 #define rB r10 #define rC r11 #define rD r27 #define rE r28 #define rF r29 #define rG r30 #define rH r31 #ifdef __LITTLE_ENDIAN__ #define LD ldbrx #else #define LD ldx #endif ENTRY(memcmp) cmpdi cr1,r5,0 /* Use the short loop if both strings are not 8B aligned */ or r6,r3,r4 andi. r6,r6,7 /* Use the short loop if length is less than 32B */ cmpdi cr6,r5,31 beq cr1,.Lzero bne .Lshort bgt cr6,.Llong .Lshort: mtctr r5 1: lbz rA,0(r3) lbz rB,0(r4) subf. rC,rB,rA bne .Lnon_zero bdz .Lzero lbz rA,1(r3) lbz rB,1(r4) subf. rC,rB,rA bne .Lnon_zero bdz .Lzero lbz rA,2(r3) lbz rB,2(r4) subf. rC,rB,rA bne .Lnon_zero bdz .Lzero lbz rA,3(r3) lbz rB,3(r4) subf. rC,rB,rA bne .Lnon_zero addi r3,r3,4 addi r4,r4,4 bdnz 1b .Lzero: li r3,0 blr .Lnon_zero: mr r3,rC blr .Llong: li off8,8 li off16,16 li off24,24 std r31,-8(r1) std r30,-16(r1) std r29,-24(r1) std r28,-32(r1) std r27,-40(r1) srdi r0,r5,5 mtctr r0 andi. 
r5,r5,31 LD rA,0,r3 LD rB,0,r4 LD rC,off8,r3 LD rD,off8,r4 LD rE,off16,r3 LD rF,off16,r4 LD rG,off24,r3 LD rH,off24,r4 cmpld cr0,rA,rB addi r3,r3,32 addi r4,r4,32 bdz .Lfirst32 LD rA,0,r3 LD rB,0,r4 cmpld cr1,rC,rD LD rC,off8,r3 LD rD,off8,r4 cmpld cr6,rE,rF LD rE,off16,r3 LD rF,off16,r4 cmpld cr7,rG,rH bne cr0,.LcmpAB LD rG,off24,r3 LD rH,off24,r4 cmpld cr0,rA,rB bne cr1,.LcmpCD addi r3,r3,32 addi r4,r4,32 bdz .Lsecond32 .balign 16 1: LD rA,0,r3 LD rB,0,r4 cmpld cr1,rC,rD bne cr6,.LcmpEF LD rC,off8,r3 LD rD,off8,r4 cmpld cr6,rE,rF bne cr7,.LcmpGH LD rE,off16,r3 LD rF,off16,r4 cmpld cr7,rG,rH bne cr0,.LcmpAB LD rG,off24,r3 LD rH,off24,r4 cmpld cr0,rA,rB bne cr1,.LcmpCD addi r3,r3,32 addi r4,r4,32 bdnz 1b .Lsecond32: cmpld cr1,rC,rD bne cr6,.LcmpEF cmpld cr6,rE,rF bne cr7,.LcmpGH cmpld cr7,rG,rH bne cr0,.LcmpAB bne cr1,.LcmpCD bne cr6,.LcmpEF bne cr7,.LcmpGH .Ltail: ld r31,-8(r1) ld r30,-16(r1) ld r29,-24(r1) ld r28,-32(r1) ld r27,-40(r1) cmpdi r5,0 beq .Lzero b .Lshort .Lfirst32: cmpld cr1,rC,rD cmpld cr6,rE,rF cmpld cr7,rG,rH bne cr0,.LcmpAB bne cr1,.LcmpCD bne cr6,.LcmpEF bne cr7,.LcmpGH b .Ltail .LcmpAB: li r3,1 bgt cr0,.Lout li r3,-1 b .Lout .LcmpCD: li r3,1 bgt cr1,.Lout li r3,-1 b .Lout .LcmpEF: li r3,1 bgt cr6,.Lout li r3,-1 b .Lout .LcmpGH: li r3,1 bgt cr7,.Lout li r3,-1 .Lout: ld r31,-8(r1) ld r30,-16(r1) ld r29,-24(r1) ld r28,-32(r1) ld r27,-40(r1) blr criu-3.6/compel/arch/ppc64/plugins/std/memcpy.S000066400000000000000000000076061317335042600213620ustar00rootroot00000000000000/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2012 * * Author: Anton Blanchard * * -- * Copied from the kernel file arch/powerpc/lib/memcpy_power7.S * Altivec support has been removed so we don't taint restored process. */ #include "common/asm/linkage.h" /* * When building the parasite code, the compiler may rely on the C library * service memcpy to initialise big local variable in the stack. */ ENTRY(memcpy) cmpldi r5,16 std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) blt .Lshort_copy .Lnonvmx_copy: /* Get the source 8B aligned */ neg r6,r4 mtocrf 0x01,r6 clrldi r6,r6,(64-3) bf cr7*4+3,1f lbz r0,0(r4) addi r4,r4,1 stb r0,0(r3) addi r3,r3,1 1: bf cr7*4+2,2f lhz r0,0(r4) addi r4,r4,2 sth r0,0(r3) addi r3,r3,2 2: bf cr7*4+1,3f lwz r0,0(r4) addi r4,r4,4 stw r0,0(r3) addi r3,r3,4 3: sub r5,r5,r6 cmpldi r5,128 blt 5f mflr r0 stdu r1,-STACKFRAMESIZE(r1) std r14,STK_REG(R14)(r1) std r15,STK_REG(R15)(r1) std r16,STK_REG(R16)(r1) std r17,STK_REG(R17)(r1) std r18,STK_REG(R18)(r1) std r19,STK_REG(R19)(r1) std r20,STK_REG(R20)(r1) std r21,STK_REG(R21)(r1) std r22,STK_REG(R22)(r1) std r0,STACKFRAMESIZE+16(r1) srdi r6,r5,7 mtctr r6 /* Now do cacheline (128B) sized loads and stores. 
*/ .align 5 4: ld r0,0(r4) ld r6,8(r4) ld r7,16(r4) ld r8,24(r4) ld r9,32(r4) ld r10,40(r4) ld r11,48(r4) ld r12,56(r4) ld r14,64(r4) ld r15,72(r4) ld r16,80(r4) ld r17,88(r4) ld r18,96(r4) ld r19,104(r4) ld r20,112(r4) ld r21,120(r4) addi r4,r4,128 std r0,0(r3) std r6,8(r3) std r7,16(r3) std r8,24(r3) std r9,32(r3) std r10,40(r3) std r11,48(r3) std r12,56(r3) std r14,64(r3) std r15,72(r3) std r16,80(r3) std r17,88(r3) std r18,96(r3) std r19,104(r3) std r20,112(r3) std r21,120(r3) addi r3,r3,128 bdnz 4b clrldi r5,r5,(64-7) ld r14,STK_REG(R14)(r1) ld r15,STK_REG(R15)(r1) ld r16,STK_REG(R16)(r1) ld r17,STK_REG(R17)(r1) ld r18,STK_REG(R18)(r1) ld r19,STK_REG(R19)(r1) ld r20,STK_REG(R20)(r1) ld r21,STK_REG(R21)(r1) ld r22,STK_REG(R22)(r1) addi r1,r1,STACKFRAMESIZE /* Up to 127B to go */ 5: srdi r6,r5,4 mtocrf 0x01,r6 6: bf cr7*4+1,7f ld r0,0(r4) ld r6,8(r4) ld r7,16(r4) ld r8,24(r4) ld r9,32(r4) ld r10,40(r4) ld r11,48(r4) ld r12,56(r4) addi r4,r4,64 std r0,0(r3) std r6,8(r3) std r7,16(r3) std r8,24(r3) std r9,32(r3) std r10,40(r3) std r11,48(r3) std r12,56(r3) addi r3,r3,64 /* Up to 63B to go */ 7: bf cr7*4+2,8f ld r0,0(r4) ld r6,8(r4) ld r7,16(r4) ld r8,24(r4) addi r4,r4,32 std r0,0(r3) std r6,8(r3) std r7,16(r3) std r8,24(r3) addi r3,r3,32 /* Up to 31B to go */ 8: bf cr7*4+3,9f ld r0,0(r4) ld r6,8(r4) addi r4,r4,16 std r0,0(r3) std r6,8(r3) addi r3,r3,16 9: clrldi r5,r5,(64-4) /* Up to 15B to go */ .Lshort_copy: mtocrf 0x01,r5 bf cr7*4+0,12f lwz r0,0(r4) /* Less chance of a reject with word ops */ lwz r6,4(r4) addi r4,r4,8 stw r0,0(r3) stw r6,4(r3) addi r3,r3,8 12: bf cr7*4+1,13f lwz r0,0(r4) addi r4,r4,4 stw r0,0(r3) addi r3,r3,4 13: bf cr7*4+2,14f lhz r0,0(r4) addi r4,r4,2 sth r0,0(r3) addi r3,r3,2 14: bf cr7*4+3,15f lbz r0,0(r4) stb r0,0(r3) 15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) blr .Lunwind_stack_nonvmx_copy: addi r1,r1,STACKFRAMESIZE b .Lnonvmx_copy 
criu-3.6/compel/arch/ppc64/plugins/std/parasite-head.S000066400000000000000000000020661317335042600225720ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text .align 8 ENTRY(__export_parasite_head_start) // int __used parasite_service(unsigned int cmd, void *args) // cmd = r3 = *__export_parasite_cmd (u32 ?) // args = r4 = @parasite_args_ptr + @pc bl 0f 0: mflr r2 #define LOAD_REG_ADDR(reg, name) \ addis reg,r2,(name - 0b)@ha; \ addi reg,r2,(name - 0b)@l; LOAD_REG_ADDR(r3,__export_parasite_cmd) lwz r3,0(r3) LOAD_REG_ADDR(r4,parasite_args_ptr) ld r4,0(r4) LOAD_REG_ADDR(r12,parasite_service_ptr) ld r12,0(r12) mtctr r12 bctrl // call parasite_service twi 31,0,0 // Should generate SIGTRAP parasite_args_ptr: .quad __export_parasite_args parasite_service_ptr: // We want to run the function prototype to set r2. // Since the relocation will prefer the local entry // point, we force it to the global one which is 2 // instructions above the local one. // FIXME: There should be a way to specify the global entry here. 
.quad parasite_service - 8 __export_parasite_cmd: .long 0 END(__export_parasite_head_start) criu-3.6/compel/arch/ppc64/plugins/std/syscalls/000077500000000000000000000000001317335042600215705ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000052101317335042600250620ustar00rootroot00000000000000ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ sys-types := $(obj)/include/uapi/std/syscall-types.h sys-codes := $(obj)/include/uapi/std/syscall-codes.h sys-proto := $(obj)/include/uapi/std/syscall.h sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall-ppc64.tbl sys-asm-common-name := std/syscalls/syscall-common-ppc64.S sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S std-lib-y += $(sys-asm:.S=).o $(sys-codes): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) cat $< | awk '/^__NR/{SYSN=$$1; sub("^__NR", "SYS", SYSN);'\ 'print "\n#ifndef ", $$1, "\n#define", $$1, $$2, "\n#endif";'\ 'print "#ifndef ", SYSN, "\n#define ", SYSN, $$1, "\n#endif"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ $(sys-proto): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#include " >> $@ $(Q) echo "#include " >> $@ $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) 
echo "#include " >> $@ $(Q) echo "#include \"$(sys-asm-common-name)\"" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@ $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "static struct syscall_exec_desc sc_exec_table[] = {" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@ $(Q) echo " { }, /* terminator */" >> $@ $(Q) echo "};" >> $@ $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) mrproper-y += $(std-headers-deps) criu-3.6/compel/arch/ppc64/plugins/std/syscalls/syscall-common-ppc64.S000066400000000000000000000006641317335042600256140ustar00rootroot00000000000000#include "common/asm/linkage.h" #include /* for __NR_ipc */ #define SYSCALL(name, opcode) \ ENTRY(name); \ li r0, opcode; \ b __syscall_common; \ END(name) .text .align 4 ENTRY(__syscall_common) sc bnslr+ /* if no error return to LR */ neg r3,r3 /* r3 = -r3 to return -errno value */ blr END(__syscall_common) ENTRY(__cr_restore_rt) li r0, __NR_rt_sigreturn b __syscall_common END(__cr_restore_rt) criu-3.6/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl000066400000000000000000000202121317335042600246740ustar00rootroot00000000000000# # System calls table, please make sure the table consists of only the syscalls # really used somewhere in the project. # # The template is (name and arguments are optional if you need only __NR_x # defined, but no real entry point in syscalls lib). 
# # name code name arguments # ----------------------------------------------------------------------- # __NR_read 3 sys_read (int fd, void *buf, unsigned long count) __NR_write 4 sys_write (int fd, const void *buf, unsigned long count) __NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode) __NR_close 6 sys_close (int fd) __NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_mmap 90 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) __NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_munmap 91 sys_munmap (void *addr, unsigned long len) __NR_brk 45 sys_brk (void *addr) __NR_rt_sigaction 173 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 174 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigreturn 172 sys_rt_sigreturn (void) __NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_pread64 179 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_mincore 206 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 205 sys_madvise (unsigned long start, size_t len, int behavior) __NR_pause 29 sys_pause (void) __NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_getpid 20 sys_getpid (void) __NR_socket 326 sys_socket (int domain, int type, int protocol) __NR_connect 328 sys_connect (int sockfd, struct sockaddr *addr, int 
addrlen) __NR_sendto 335 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 337 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_sendmsg 341 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 342 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_shutdown 338 sys_shutdown (int sockfd, int how) __NR_bind 327 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_setsockopt 339 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 340 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) __NR_exit 1 sys_exit (unsigned long error_code) __NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_kill 37 sys_kill (long pid, int sig) __NR_fcntl 55 sys_fcntl (int fd, int type, long arg) __NR_flock 143 sys_flock (int fd, unsigned long cmd) __NR_mkdir 39 sys_mkdir (const char *name, int mode) __NR_rmdir 40 sys_rmdir (const char *name) __NR_unlink 10 sys_unlink (char *pathname) __NR_readlinkat 296 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_umask 60 sys_umask (int mask) __NR_getgroups 80 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 81 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid 164 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 165 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 169 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid 170 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 132 sys_getpgid (pid_t pid) __NR_setfsuid 138 sys_setfsuid (int fsuid) __NR_setfsgid 139 sys_setfsgid (int fsgid) __NR_getsid 147 sys_getsid (void) __NR_capget 183 sys_capget 
(struct cap_header *h, struct cap_data *d) __NR_capset 184 sys_capset (struct cap_header *h, struct cap_data *d) __NR_rt_sigqueueinfo 177 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_sigaltstack 185 sys_sigaltstack (const void *uss, void *uoss) __NR_personality 136 sys_personality (unsigned int personality) __NR_setpriority 97 sys_setpriority (int which, int who, int nice) __NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_prctl 171 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim) __NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_umount2 52 sys_umount2 (char *name, int flags) __NR_gettid 207 sys_gettid (void) __NR_futex 221 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_tid_address 232 sys_set_tid_address (int *tid_addr) __NR_restart_syscall 0 sys_restart_syscall (void) __NR_sys_timer_create 240 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) __NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 234 sys_exit_group (int error_code) __NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 299 
sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_fanotify_init 323 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 324 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 346 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 350 sys_setns (int fd, int nstype) __NR_kcmp 354 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags) __NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp) __NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_submit 230 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_preadv 320 sys_preadv (int fd, struct iovec *iov, unsigned long nr, loff_t off) __NR_userfaultfd 364 sys_userfaultfd (int flags) 
criu-3.6/compel/arch/ppc64/scripts/000077500000000000000000000000001317335042600171475ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/scripts/compel-pack.lds.S000066400000000000000000000011021317335042600222410ustar00rootroot00000000000000OUTPUT_ARCH(powerpc:common64) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) } .data : { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } /* Parasite args should have 4 bytes align, as we have futex inside. */ . = ALIGN(4); __export_parasite_args = .; } criu-3.6/compel/arch/ppc64/src/000077500000000000000000000000001317335042600162475ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/000077500000000000000000000000001317335042600170155ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/cpu.c000066400000000000000000000016461317335042600177570ustar00rootroot00000000000000#include #include #include #include #include "compel-cpu.h" #include "common/bitops.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *info) { info->hwcap[0] = getauxval(AT_HWCAP); info->hwcap[1] = getauxval(AT_HWCAP2); if (!info->hwcap[0] || !info->hwcap[1]) { pr_err("Can't read the hardware capabilities\n"); return -1; } return 0; } bool compel_cpu_has_feature(unsigned int feature) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } return compel_test_cpu_cap(&rt_info, feature); } 
criu-3.6/compel/arch/ppc64/src/lib/handle-elf-host.c000077700000000000000000000000001317335042600242712handle-elf.custar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/handle-elf.c000066400000000000000000000014361317335042600211640ustar00rootroot00000000000000#include #include "uapi/compel.h" #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { const unsigned char *elf_ident = #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ elf_ident_64_le; #else elf_ident_64_be; #endif if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) return handle_elf_ppc64(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } criu-3.6/compel/arch/ppc64/src/lib/include/000077500000000000000000000000001317335042600204405ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/include/cpu.h000066400000000000000000000000001317335042600213660ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/include/handle-elf.h000066400000000000000000000004621317335042600226120ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define ELF_PPC64 #define __handle_elf handle_elf_ppc64 #define arch_is_machine_supported(e_machine) (e_machine == EM_PPC64) extern int handle_elf_ppc64(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ criu-3.6/compel/arch/ppc64/src/lib/include/syscall.h000066400000000000000000000001561317335042600222650ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, compat) __NR_##syscall #endif 
criu-3.6/compel/arch/ppc64/src/lib/include/uapi/000077500000000000000000000000001317335042600213765ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/000077500000000000000000000000001317335042600221565ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001317335042600241340ustar00rootroot00000000000000criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000004151317335042600246500ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT static inline int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } static inline int ptrace_flush_breakpoints(pid_t pid) { return 0; } #endif criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002651317335042600231210ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ #include typedef struct { uint64_t hwcap[2]; } compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/fpu.h000066400000000000000000000001211317335042600231130ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #endif /* __CR_ASM_FPU_H__ */ criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000045311317335042600247440ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #define SIGMAX_OLD 31 #define SIGMAX 64 /* * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h */ typedef struct { unsigned long gpr[32]; unsigned long nip; unsigned long msr; unsigned long orig_gpr3; /* Used for restarting system calls */ unsigned long ctr; unsigned long link; unsigned long xer; unsigned long ccr; unsigned long softe; /* Soft enabled/disabled */ unsigned long trap; /* Reason for being here */ /* * N.B. 
for critical exceptions on 4xx, the dar and dsisr * fields are overloaded to hold srr0 and srr1. */ unsigned long dar; /* Fault registers */ unsigned long dsisr; /* on 4xx/Book-E used for ESR */ unsigned long result; /* Result of a system call */ } user_regs_struct_t; #define NVSXREG 32 #define USER_FPREGS_FL_FP 0x00001 #define USER_FPREGS_FL_ALTIVEC 0x00002 #define USER_FPREGS_FL_VSX 0x00004 #define USER_FPREGS_FL_TM 0x00010 #ifndef NT_PPC_TM_SPR # define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ # define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ # define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ # define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ # define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ #endif #define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */ #define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */ #define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */ #define MSR_VEC (1UL<<25) #define MSR_VSX (1UL<<23) #define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && ((x)&(MSR_TMA|MSR_TMS))) != 0) typedef struct { uint64_t fpregs[NFPREG]; __vector128 vrregs[NVRREG]; uint64_t vsxregs[NVSXREG]; int flags; struct tm_regs { int flags; struct { uint64_t tfhar, texasr, tfiar; } tm_spr_regs; user_regs_struct_t regs; uint64_t fpregs[NFPREG]; __vector128 vrregs[NVRREG]; uint64_t vsxregs[NVSXREG]; } tm; } user_fpregs_struct_t; #define REG_RES(regs) ((uint64_t)(regs).gpr[3]) #define REG_IP(regs) ((uint64_t)(regs).nip) #define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) #define user_regs_native(pregs) true #define ARCH_SI_TRAP TRAP_BRKPT #define __NR(syscall, compat) __NR_##syscall #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000002121317335042600254330ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ #endif /* 
UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/processor.h000066400000000000000000000001701317335042600243440ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_PROCESSOR_H__ #define UAPI_COMPEL_ASM_PROCESSOR_H__ #endif /* UAPI_COMPEL_ASM_PROCESSOR_H__ */ criu-3.6/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000044511317335042600241300ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include /* * sigcontext structure defined in file * /usr/include/powerpc64le-linux-gnu/bits/sigcontext.h, * included from /usr/include/signal.h * * Kernel definition can be found in arch/powerpc/include/uapi/asm/sigcontext.h */ #include // XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ #define USER_REDZONE_SIZE 512 /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ #define TRAMP_SIZE 6 /* * ucontext_t defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h */ struct rt_sigframe { /* sys_rt_sigreturn requires the ucontext be the first field */ ucontext_t uc; ucontext_t uc_transact; /* Transactional state */ unsigned long _unused[2]; unsigned int tramp[TRAMP_SIZE]; struct rt_siginfo *pinfo; void *puc; struct rt_siginfo info; /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */ char abigap[USER_REDZONE_SIZE]; } __attribute__((aligned(16))); #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mr 1, %0 \n" \ "li 0, "__stringify(__NR_rt_sigreturn)" \n" \ "sc \n" \ : \ : "r"(new_sp) \ : "1", "memory") #if _CALL_ELF != 2 # error Only supporting ABIv2. 
#else # define FRAME_MIN_SIZE_PARM 96 #endif #define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe)->uc) #define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP]) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) #define RT_SIGFRAME_FPU(rt_sigframe) (&(rt_sigframe)->uc.uc_mcontext) #define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */ #define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */ #define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */ #define MSR_VEC (1UL<<25) #define MSR_VSX (1UL<<23) #define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && ((x)&(MSR_TMA|MSR_TMS))) != 0) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ criu-3.6/compel/arch/ppc64/src/lib/infect.c000066400000000000000000000331251317335042600204350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "uapi/compel/asm/infect-types.h" #include "errno.h" #include "log.h" #include "common/bug.h" #include "common/page.h" #include "infect.h" #include "infect-priv.h" #ifndef NT_PPC_TM_SPR #define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ #define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ #define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ #define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ #define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ #endif /* * Injected syscall instruction */ const uint32_t code_syscall[] = { 0x44000002, /* sc */ 0x0fe00000 /* twi 31,0,0 */ }; static inline __always_unused void __check_code_syscall(void) { BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } static void prep_gp_regs(mcontext_t *dst, user_regs_struct_t *regs) { memcpy(dst->gp_regs, regs->gpr, sizeof(regs->gpr)); dst->gp_regs[PT_NIP] = regs->nip; dst->gp_regs[PT_MSR] = regs->msr; dst->gp_regs[PT_ORIG_R3] = regs->orig_gpr3; dst->gp_regs[PT_CTR] = regs->ctr; dst->gp_regs[PT_LNK] = 
regs->link; dst->gp_regs[PT_XER] = regs->xer; dst->gp_regs[PT_CCR] = regs->ccr; dst->gp_regs[PT_TRAP] = regs->trap; } static void put_fpu_regs(mcontext_t *mc, uint64_t *fpregs) { uint64_t *mcfp = (uint64_t *)mc->fp_regs; memcpy(mcfp, fpregs, sizeof(*fpregs) * NFPREG); } static void put_altivec_regs(mcontext_t *mc, __vector128 *vrregs) { vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful); memcpy(&v_regs->vrregs[0][0], vrregs, sizeof(uint64_t) * 2 * (NVRREG - 1)); v_regs->vrsave = *((uint32_t *)&vrregs[NVRREG - 1]); mc->v_regs = v_regs; } static void put_vsx_regs(mcontext_t *mc, uint64_t *vsxregs) { memcpy((uint64_t *)(mc->v_regs + 1), vsxregs, sizeof(*vsxregs) * NVSXREG); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { mcontext_t *dst_tc = &sigframe->uc_transact.uc_mcontext; mcontext_t *dst = &sigframe->uc.uc_mcontext; if (fpregs->flags & USER_FPREGS_FL_TM) { prep_gp_regs(&sigframe->uc_transact.uc_mcontext, &fpregs->tm.regs); prep_gp_regs(&sigframe->uc.uc_mcontext, &fpregs->tm.regs); } else { prep_gp_regs(&sigframe->uc.uc_mcontext, regs); } if (fpregs->flags & USER_FPREGS_FL_TM) sigframe->uc.uc_link = &sigframe->uc_transact; if (fpregs->flags & USER_FPREGS_FL_FP) { if (fpregs->flags & USER_FPREGS_FL_TM) { put_fpu_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.fpregs); put_fpu_regs(&sigframe->uc.uc_mcontext, fpregs->tm.fpregs); } else { put_fpu_regs(&sigframe->uc.uc_mcontext, fpregs->fpregs); } } if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { if (fpregs->flags & USER_FPREGS_FL_TM) { put_altivec_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.vrregs); put_altivec_regs(&sigframe->uc.uc_mcontext, fpregs->tm.vrregs); dst_tc->gp_regs[PT_MSR] |= MSR_VEC; } else { put_altivec_regs(&sigframe->uc.uc_mcontext, fpregs->vrregs); } dst->gp_regs[PT_MSR] |= MSR_VEC; if (fpregs->flags & USER_FPREGS_FL_VSX) { if (fpregs->flags & USER_FPREGS_FL_TM) { 
put_vsx_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.vsxregs); put_vsx_regs(&sigframe->uc.uc_mcontext, fpregs->tm.vsxregs); dst_tc->gp_regs[PT_MSR] |= MSR_VSX; } else { put_vsx_regs(&sigframe->uc.uc_mcontext, fpregs->vsxregs); } dst->gp_regs[PT_MSR] |= MSR_VSX; } } return 0; } static void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext) { if (lcontext->v_regs) { uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext; lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset); pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", (unsigned long long)lcontext->v_regs, (unsigned long long)rcontext); } } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *frame, struct rt_sigframe *rframe) { uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR]; update_vregs(&frame->uc.uc_mcontext, &rframe->uc.uc_mcontext); /* Sanity check: If TM so uc_link should be set, otherwise not */ if (MSR_TM_ACTIVE(msr) ^ (!!(frame->uc.uc_link))) { BUG(); return -1; } /* Updating the transactional state address if any */ if (frame->uc.uc_link) { update_vregs(&frame->uc_transact.uc_mcontext, &rframe->uc_transact.uc_mcontext); frame->uc.uc_link = &rframe->uc_transact; } return 0; } /* This is the layout of the POWER7 VSX registers and the way they * overlap with the existing FPR and VMX registers. * * VSR doubleword 0 VSR doubleword 1 * ---------------------------------------------------------------- * VSR[0] | FPR[0] | | * ---------------------------------------------------------------- * VSR[1] | FPR[1] | | * ---------------------------------------------------------------- * | ... 
| | * ---------------------------------------------------------------- * VSR[30] | FPR[30] | | * ---------------------------------------------------------------- * VSR[31] | FPR[31] | | * ---------------------------------------------------------------- * VSR[32] | VR[0] | * ---------------------------------------------------------------- * VSR[33] | VR[1] | * ---------------------------------------------------------------- * | ... | * ---------------------------------------------------------------- * VSR[62] | VR[30] | * ---------------------------------------------------------------- * VSR[63] | VR[31] | * ---------------------------------------------------------------- * * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE * PTRACE_GETVSRREGS returns VSR[0..31] * * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need * to save FPSCR too. * * There 32 VSX double word registers to save since the 32 first VSX double * word registers are saved through FPR[0..32] and the remaining registers * are saved when saving the Altivec registers VR[0..32]. */ static int get_fpu_regs(pid_t pid, user_fpregs_struct_t *fp) { if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fp->fpregs) < 0) { pr_perror("Couldn't get floating-point registers"); return -1; } fp->flags |= USER_FPREGS_FL_FP; return 0; } static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp) { if (ptrace(PTRACE_GETVRREGS, pid, 0, (void*)&fp->vrregs) < 0) { /* PTRACE_GETVRREGS returns EIO if Altivec is not supported. * This should not happen if msr_vec is set. 
*/ if (errno != EIO) { pr_perror("Couldn't get Altivec registers"); return -1; } pr_debug("Altivec not supported\n"); } else { pr_debug("Dumping Altivec registers\n"); fp->flags |= USER_FPREGS_FL_ALTIVEC; } return 0; } /* * Since the FPR[0-31] is stored in the first double word of VSR[0-31] and * FPR are saved through the FP state, there is no need to save the upper part * of the first 32 VSX registers. * Furthermore, the 32 last VSX registers are also the 32 Altivec registers * already saved, so no need to save them. * As a consequence, only the doubleword 1 of the 32 first VSX registers have * to be saved (the ones are returned by PTRACE_GETVSRREGS). */ static int get_vsx_regs(pid_t pid, user_fpregs_struct_t *fp) { if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void*)fp->vsxregs) < 0) { /* * EIO is returned in the case PTRACE_GETVRREGS is not * supported. */ if (errno != EIO) { pr_perror("Couldn't get VSX registers"); return -1; } pr_debug("VSX register's dump not supported.\n"); } else { pr_debug("Dumping VSX registers\n"); fp->flags |= USER_FPREGS_FL_VSX; } return 0; } static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; pr_debug("Dumping TM registers\n"); #define TM_REQUIRED 0 #define TM_OPTIONAL 1 #define PTRACE_GET_TM(s,n,c,u) do { \ iov.iov_base = &s; \ iov.iov_len = sizeof(s); \ if (ptrace(PTRACE_GETREGSET, pid, c, &iov)) { \ if (!u || errno != EIO) { \ pr_perror("Couldn't get TM "n); \ pr_err("Your kernel seems to not support the " \ "new TM ptrace API (>= 4.8)\n"); \ goto out_free; \ } \ pr_debug("TM "n" not supported.\n"); \ iov.iov_base = NULL; \ } \ } while(0) /* Get special registers */ PTRACE_GET_TM(fpregs->tm.tm_spr_regs, "SPR", NT_PPC_TM_SPR, TM_REQUIRED); /* Get checkpointed regular registers */ PTRACE_GET_TM(fpregs->tm.regs, "GPR", NT_PPC_TM_CGPR, TM_REQUIRED); /* Get checkpointed FP registers */ PTRACE_GET_TM(fpregs->tm.fpregs, "FPR", NT_PPC_TM_CFPR, TM_OPTIONAL); if (iov.iov_base) fpregs->tm.flags |= USER_FPREGS_FL_FP; 
/* Get checkpointed VMX (Altivec) registers */ PTRACE_GET_TM(fpregs->tm.vrregs, "VMX", NT_PPC_TM_CVMX, TM_OPTIONAL); if (iov.iov_base) fpregs->tm.flags |= USER_FPREGS_FL_ALTIVEC; /* Get checkpointed VSX registers */ PTRACE_GET_TM(fpregs->tm.vsxregs, "VSX", NT_PPC_TM_CVSX, TM_OPTIONAL); if (iov.iov_base) fpregs->tm.flags |= USER_FPREGS_FL_VSX; return 0; out_free: return -1; /* still failing the checkpoint */ } static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { pr_info("Dumping GP/FPU registers for %d\n", pid); /* * This is inspired by kernel function check_syscall_restart in * arch/powerpc/kernel/signal.c */ #ifndef TRAP #define TRAP(r) ((r).trap & ~0xF) #endif if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { /* Restart the system call */ switch (regs->gpr[3]) { case ERESTARTNOHAND: case ERESTARTSYS: case ERESTARTNOINTR: regs->gpr[3] = regs->orig_gpr3; regs->nip -= 4; break; case ERESTART_RESTARTBLOCK: regs->gpr[0] = __NR_restart_syscall; regs->nip -= 4; break; } } /* Resetting trap since we are now coming from user space. */ regs->trap = 0; fpregs->flags = 0; /* * Check for Transactional Memory operation in progress. * Until we have support of TM register's state through the ptrace API, * we can't checkpoint process with TM operation in progress (almost * impossible) or suspended (easy to get). */ if (MSR_TM_ACTIVE(regs->msr)) { pr_debug("Task %d has %s TM operation at 0x%lx\n", pid, (regs->msr & MSR_TMS) ? 
"a suspended" : "an active", regs->nip); if (get_tm_regs(pid, fpregs)) return -1; fpregs->flags = USER_FPREGS_FL_TM; } if (get_fpu_regs(pid, fpregs)) return -1; if (get_altivec_regs(pid, fpregs)) return -1; if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { /* * Save the VSX registers if Altivec registers are supported */ if (get_vsx_regs(pid, fpregs)) return -1; } return 0; } int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, void *arg) { user_fpregs_struct_t fpregs; int ret; ret = __get_task_regs(pid, regs, &fpregs); if (ret) return ret; return save(arg, regs, &fpregs); } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; regs.gpr[0] = (unsigned long)nr; regs.gpr[3] = arg1; regs.gpr[4] = arg2; regs.gpr[5] = arg3; regs.gpr[6] = arg4; regs.gpr[7] = arg5; regs.gpr[8] = arg6; err = compel_execute_syscall(ctl, ®s, (char*)code_syscall); *ret = regs.gpr[3]; return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map = 0; int err; err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0 || (long)map < 0) map = 0; return (void *)map; } void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* * OpenPOWER ABI requires that r12 is set to the calling function addressi * to compute the TOC pointer. */ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) regs->gpr[1] = (unsigned long) stack; regs->trap = 0; } bool arch_can_dump_task(struct parasite_ctl *ctl) { /* * TODO: We should detect 32bit task when BE support is done. 
*/ return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } /* * Copied for the Linux kernel arch/powerpc/include/asm/processor.h * * NOTE: 32bit tasks are not supported. */ #define TASK_SIZE_64TB (0x0000400000000000UL) #define TASK_SIZE_512TB (0x0002000000000000UL) #define TASK_SIZE_MIN TASK_SIZE_64TB #define TASK_SIZE_MAX TASK_SIZE_512TB unsigned long compel_task_size(void) { unsigned long task_size; for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) if (munmap((void *)task_size, page_size())) break; return task_size; } criu-3.6/compel/arch/s390/000077500000000000000000000000001317335042600152225ustar00rootroot00000000000000criu-3.6/compel/arch/s390/plugins/000077500000000000000000000000001317335042600167035ustar00rootroot00000000000000criu-3.6/compel/arch/s390/plugins/include/000077500000000000000000000000001317335042600203265ustar00rootroot00000000000000criu-3.6/compel/arch/s390/plugins/include/asm/000077500000000000000000000000001317335042600211065ustar00rootroot00000000000000criu-3.6/compel/arch/s390/plugins/include/asm/prologue.h000077700000000000000000000000001317335042600336302../../../../../arch/x86/plugins/include/asm/prologue.hustar00rootroot00000000000000criu-3.6/compel/arch/s390/plugins/include/asm/syscall-types.h000066400000000000000000000014011317335042600240670ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ #define SA_RESTORER 0x04000000U typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; /* * Used for rt_sigaction() system call - see 
kernel "struct sigaction" in * include/linux/signal.h. */ typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; struct mmap_arg_struct; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ criu-3.6/compel/arch/s390/plugins/std/000077500000000000000000000000001317335042600174755ustar00rootroot00000000000000criu-3.6/compel/arch/s390/plugins/std/parasite-head.S000066400000000000000000000014531317335042600223330ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text, "ax" /* * Entry point for parasite_service() * * Addresses of symbols are exported in auto-generated criu/pie/parasite-blob.h * * Function is called via parasite_run(). The command for parasite_service() * is stored in global variable __export_parasite_cmd. * * Load parameters for parasite_service(unsigned int cmd, void *args): * * - Parameter 1 (cmd) : %r2 = *(uint32 *)(__export_parasite_cmd + pc) * - Parameter 2 (args): %r3 = __export_parasite_args + pc */ ENTRY(__export_parasite_head_start) larl %r14,__export_parasite_cmd llgf %r2,0(%r14) larl %r3,__export_parasite_args brasl %r14,parasite_service .long 0x00010001 /* S390_BREAKPOINT_U16: Generates SIGTRAP */ __export_parasite_cmd: .long 0 END(__export_parasite_head_start) criu-3.6/compel/arch/s390/plugins/std/syscalls/000077500000000000000000000000001317335042600213325ustar00rootroot00000000000000criu-3.6/compel/arch/s390/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000053061317335042600246320ustar00rootroot00000000000000ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ sys-types := $(obj)/include/uapi/std/syscall-types.h sys-codes := $(obj)/include/uapi/std/syscall-codes.h sys-proto := $(obj)/include/uapi/std/syscall.h sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall-s390.tbl sys-asm-common-name := std/syscalls/syscall-common-s390.S sys-asm-common := 
$(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S std-lib-y += $(sys-asm:.S=).o std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls-s390.o $(sys-codes): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) cat $< | awk '/^__NR/{SYSN=$$1; sub("^__NR", "SYS", SYSN);'\ 'print "\n#ifndef ", $$1, "\n#define", $$1, $$2, "\n#endif";'\ 'print "#ifndef ", SYSN, "\n#define ", SYSN, $$1, "\n#endif"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ $(sys-proto): $(sys-def) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#include " >> $@ $(Q) echo "#include " >> $@ $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#include " >> $@ $(Q) echo "#include \"$(sys-asm-common-name)\"" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@ $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(E) " GEN " $@ $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "static struct syscall_exec_desc sc_exec_table[] = {" >> $@ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@ $(Q) echo " { }, /* terminator */" >> $@ $(Q) echo "};" >> $@ $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) std-headers-deps += $(sys-asm) 
$(sys-codes) $(sys-proto) $(sys-asm-types) mrproper-y += $(std-headers-deps) criu-3.6/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S000066400000000000000000000017211317335042600251130ustar00rootroot00000000000000#include "common/asm/linkage.h" /* * Define a system call * * C-ABI on s390: * - Parameters 1-5 are passed in %r2-%r6 * - Parameter 6 is passed on the stack 160(%r15) * - Return value is in %r2 * - Return address is in %r14 * - Registers %r0-%r6,%r14 are call-clobbered * - Registers %r7-%r13,%r15 are call-saved * * SVC ABI on s390: * - For SVC 0 the system call number is passed in %r1 * - Parameters 1-6 are passed in %r2-%r7 * - Return value is passed in %r2 * - Besides of %r2 all registers are call-saved */ #define SYSCALL(name, opcode) \ ENTRY(name); \ lgr %r0,%r7; /* Save %r7 */ \ lg %r7,160(%r15); /* Load 6th parameter */ \ lghi %r1,opcode; /* Load SVC number */ \ svc 0; /* Issue SVC 0 */ \ lgr %r7,%r0; /* Restore %r7 */ \ br %r14; /* Return to caller */ \ END(name) \ /* * Issue rt_sigreturn system call for sa_restorer */ ENTRY(__cr_restore_rt) lghi %r1,__NR_rt_sigreturn svc 0 END(__cr_restore_rt) criu-3.6/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl000066400000000000000000000200571317335042600242070ustar00rootroot00000000000000# # System calls table, please make sure the table consists of only the syscalls # really used somewhere in the project. # # The template is (name and arguments are optional if you need only __NR_x # defined, but no real entry point in syscalls lib). 
# # name code name arguments # ----------------------------------------------------------------------- # __NR_read 3 sys_read (int fd, void *buf, unsigned long count) __NR_write 4 sys_write (int fd, const void *buf, unsigned long count) __NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode) __NR_close 6 sys_close (int fd) __NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_mmap 90 sys_old_mmap (struct mmap_arg_struct *) __NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_munmap 91 sys_munmap (void *addr, unsigned long len) __NR_brk 45 sys_brk (void *addr) __NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigreturn 173 sys_rt_sigreturn (void) __NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_pread64 180 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) __NR_pause 29 sys_pause (void) __NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_getpid 20 sys_getpid (void) __NR_socket 359 sys_socket (int domain, int type, int protocol) __NR_connect 362 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) __NR_sendto 369 sys_sendto (int sockfd, void *buff, size_t len, unsigned 
int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 371 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_sendmsg 370 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 372 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_shutdown 373 sys_shutdown (int sockfd, int how) __NR_bind 361 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_setsockopt 366 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 365 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, void *tls) __NR_exit 1 sys_exit (unsigned long error_code) __NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_kill 37 sys_kill (long pid, int sig) __NR_fcntl 55 sys_fcntl (int fd, int type, long arg) __NR_flock 143 sys_flock (int fd, unsigned long cmd) __NR_mkdir 39 sys_mkdir (const char *name, int mode) __NR_rmdir 40 sys_rmdir (const char *name) __NR_unlink 10 sys_unlink (char *pathname) __NR_readlinkat 298 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_umask 60 sys_umask (int mask) __NR_getgroups 205 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 206 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid 208 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 209 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 210 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid 211 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 132 sys_getpgid (pid_t pid) __NR_setfsuid 215 sys_setfsuid (int fsuid) __NR_setfsgid 216 sys_setfsgid (int fsgid) __NR_getsid 147 sys_getsid (void) __NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 185 sys_capset (struct cap_header *h, 
struct cap_data *d) __NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_sigaltstack 186 sys_sigaltstack (const void *uss, void *uoss) __NR_personality 136 sys_personality (unsigned int personality) __NR_setpriority 97 sys_setpriority (int which, int who, int nice) __NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim) __NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_umount2 52 sys_umount2 (char *name, int flags) __NR_gettid 236 sys_gettid (void) __NR_futex 238 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_tid_address 252 sys_set_tid_address (int *tid_addr) __NR_restart_syscall 7 sys_restart_syscall (void) __NR_sys_timer_create 254 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) __NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 260 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 248 sys_exit_group (int error_code) __NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 305 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_vmsplice 309 
sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_openat 288 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_timerfd_settime 320 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_signalfd4 322 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_rt_tgsigqueueinfo 330 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_fanotify_init 332 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 333 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 336 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 339 sys_setns (int fd, int nstype) __NR_kcmp 343 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_seccomp 348 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 350 sys_memfd_create (const char *name, unsigned int flags) __NR_io_setup 243 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp) __NR_io_getevents 245 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_submit 246 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth) __NR_userfaultfd 355 sys_userfaultfd (int flags) __NR_preadv 328 sys_preadv (int fd, struct iovec *iov, unsigned long nr, loff_t off) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) criu-3.6/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c000066400000000000000000000012641317335042600240320ustar00rootroot00000000000000#include "asm/infect-types.h" /* * Define prototype because of compile error if we include uapi/std/syscall.h */ 
long sys_old_mmap (struct mmap_arg_struct *); /* * On s390 we have defined __ARCH_WANT_SYS_OLD_MMAP - Therefore implement * system call with one parameter "mmap_arg_struct". */ unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) { struct mmap_arg_struct arg_struct; arg_struct.addr = (unsigned long)addr; arg_struct.len = len; arg_struct.prot = prot; arg_struct.flags = flags; arg_struct.fd = fd; arg_struct.offset = offset; return sys_old_mmap(&arg_struct); } criu-3.6/compel/arch/s390/scripts/000077500000000000000000000000001317335042600167115ustar00rootroot00000000000000criu-3.6/compel/arch/s390/scripts/compel-pack.lds.S000066400000000000000000000010751317335042600220140ustar00rootroot00000000000000OUTPUT_ARCH(s390:64-bit) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) } .data : { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } /* Parasite args should have 4 bytes align, as we have futex inside. */ . 
= ALIGN(4); __export_parasite_args = .; } criu-3.6/compel/arch/s390/src/000077500000000000000000000000001317335042600160115ustar00rootroot00000000000000criu-3.6/compel/arch/s390/src/lib/000077500000000000000000000000001317335042600165575ustar00rootroot00000000000000criu-3.6/compel/arch/s390/src/lib/cpu.c000066400000000000000000000016051317335042600175140ustar00rootroot00000000000000#include #include #include #include "compel-cpu.h" #include "common/bitops.h" #include "common/compiler.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { return 0; } int compel_cpuid(compel_cpuinfo_t *info) { info->hwcap[0] = getauxval(AT_HWCAP); info->hwcap[1] = getauxval(AT_HWCAP2); if (!info->hwcap[0]) { pr_err("Can't read the hardware capabilities"); return -1; } return 0; } bool cpu_has_feature(unsigned int feature) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } return compel_test_cpu_cap(&rt_info, feature); } criu-3.6/compel/arch/s390/src/lib/handle-elf-host.c000077700000000000000000000000001317335042600240332handle-elf.custar00rootroot00000000000000criu-3.6/compel/arch/s390/src/lib/handle-elf.c000066400000000000000000000007471317335042600207320ustar00rootroot00000000000000#include #include "uapi/compel.h" #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_64, sizeof(elf_ident_64)) == 0) return handle_elf_s390(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } 
criu-3.6/compel/arch/s390/src/lib/include/000077500000000000000000000000001317335042600202025ustar00rootroot00000000000000criu-3.6/compel/arch/s390/src/lib/include/handle-elf.h000066400000000000000000000004471317335042600223570ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define ELF_S390 #define __handle_elf handle_elf_s390 #define arch_is_machine_supported(e_machine) (e_machine == EM_S390) int handle_elf_s390(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ criu-3.6/compel/arch/s390/src/lib/include/syscall.h000066400000000000000000000003351317335042600220260ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset); #endif criu-3.6/compel/arch/s390/src/lib/include/uapi/000077500000000000000000000000001317335042600211405ustar00rootroot00000000000000criu-3.6/compel/arch/s390/src/lib/include/uapi/asm/000077500000000000000000000000001317335042600217205ustar00rootroot00000000000000criu-3.6/compel/arch/s390/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003771317335042600244210ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT static inline int ptrace_set_breakpoint(pid_t pid, void *addr) { return 0; } static inline int ptrace_flush_breakpoints(pid_t pid) { return 0; } #endif criu-3.6/compel/arch/s390/src/lib/include/uapi/asm/cpu.h000066400000000000000000000002571317335042600226640ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ #include typedef struct { uint64_t hwcap[2]; } compel_cpuinfo_t; #endif /* __CR_ASM_CPU_H__ */ criu-3.6/compel/arch/s390/src/lib/include/uapi/asm/fpu.h000066400000000000000000000003251317335042600226630ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define 
__CR_ASM_FPU_H__ #include #include /* * This one is used in restorer */ typedef struct { bool has_fpu; } fpu_state_t; #endif /* __CR_ASM_FPU_H__ */ criu-3.6/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000034731317335042600245120ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #include #include "common/page.h" #define SIGMAX 64 #define SIGMAX_OLD 31 /* * Definitions from /usr/include/asm/ptrace.h: * * typedef struct * { * __u32 fpc; * freg_t fprs[NUM_FPRS]; * } s390_fp_regs; * * typedef struct * { * psw_t psw; * unsigned long gprs[NUM_GPRS]; * unsigned int acrs[NUM_ACRS]; * unsigned long orig_gpr2; * } s390_regs; */ typedef struct { uint64_t part1; uint64_t part2; } vector128_t; struct prfpreg { uint32_t fpc; uint64_t fprs[16]; }; #define USER_FPREGS_VXRS 0x000000001 /* Guarded-storage control block */ #define USER_GS_CB 0x000000002 /* Guarded-storage broadcast control block */ #define USER_GS_BC 0x000000004 /* Runtime-instrumentation control block */ #define USER_RI_CB 0x000000008 /* Runtime-instrumentation bit set */ #define USER_RI_ON 0x000000010 typedef struct { uint32_t flags; struct prfpreg prfpreg; uint64_t vxrs_low[16]; vector128_t vxrs_high[16]; uint64_t gs_cb[4]; uint64_t gs_bc[4]; uint64_t ri_cb[8]; } user_fpregs_struct_t; typedef struct { s390_regs prstatus; uint32_t system_call; } user_regs_struct_t; #define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) #define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) /* * We assume that REG_SYSCALL_NR() is only used for pie code where we * always use svc 0 with opcode in %r1. 
*/ #define REG_SYSCALL_NR(r) ((uint64_t)(r).prstatus.gprs[1]) #define user_regs_native(pregs) true #define __NR(syscall, compat) __NR_##syscall struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ criu-3.6/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000032241317335042600236670ustar00rootroot00000000000000 #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include #include // XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 /* * From /usr/include/asm/sigcontext.h * * Redefine _sigregs_ext to be able to compile on older systems */ #ifndef __NUM_VXRS_LOW typedef struct { __u32 u[4]; } __vector128; typedef struct { unsigned long long vxrs_low[16]; __vector128 vxrs_high[16]; unsigned char __reserved[128]; } _sigregs_ext; #endif /* * From /usr/include/uapi/asm/ucontext.h */ struct ucontext_extended { unsigned long uc_flags; ucontext_t *uc_link; stack_t uc_stack; _sigregs uc_mcontext; sigset_t uc_sigmask; /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. 
*/ unsigned char __unused[128 - sizeof(sigset_t)]; _sigregs_ext uc_mcontext_ext; }; /* * Signal stack frame for RT sigreturn */ struct rt_sigframe { uint8_t callee_used_stack[160]; uint8_t retcode[2]; siginfo_t info; struct ucontext_extended uc; }; /* * Do rt_sigreturn SVC */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "lgr %%r15,%0\n" \ "lghi %%r1,173\n" \ "svc 0\n" \ : \ : "d" (new_sp) \ : "15", "memory") #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) #define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.regs.psw.addr #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ criu-3.6/compel/arch/s390/src/lib/infect.c000066400000000000000000000442611317335042600202020ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "uapi/compel/asm/infect-types.h" #include "errno.h" #include "log.h" #include "common/bug.h" #include "infect.h" #include "ptrace.h" #include "infect-priv.h" #define NT_PRFPREG 2 #define NT_S390_VXRS_LOW 0x309 #define NT_S390_VXRS_HIGH 0x30a #define NT_S390_GS_CB 0x30b #define NT_S390_GS_BC 0x30c #define NT_S390_RI_CB 0x30d /* * Print general purpose and access registers */ static void print_user_regs_struct(const char *msg, int pid, user_regs_struct_t *regs) { int i; pr_debug("%s: Registers for pid=%d\n", msg, pid); pr_debug("system_call %08lx\n", (unsigned long) regs->system_call); pr_debug(" psw %016lx %016lx\n", regs->prstatus.psw.mask, regs->prstatus.psw.addr); pr_debug(" orig_gpr2 %016lx\n", regs->prstatus.orig_gpr2); for (i = 0; i < 16; i++) pr_debug(" g%02d %016lx\n", i, regs->prstatus.gprs[i]); for (i = 0; i < 16; i++) pr_debug(" a%02d %08x\n", i, regs->prstatus.acrs[i]); } /* * Print vector registers */ static void print_vxrs(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_FPREGS_VXRS)) { pr_debug(" No VXRS\n"); return; } for (i = 0; i < 16; i++) pr_debug(" vx_low%02d 
%016lx\n", i, fpregs->vxrs_low[i]); for (i = 0; i < 16; i++) pr_debug(" vx_high%02d %016lx %016lx\n", i, fpregs->vxrs_high[i].part1, fpregs->vxrs_high[i].part2); } /* * Print guarded-storage control block */ static void print_gs_cb(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_GS_CB)) { pr_debug(" No GS_CB\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_cb%02d %016lx\n", i, fpregs->gs_cb[i]); } /* * Print guarded-storage broadcast control block */ static void print_gs_bc(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_GS_BC)) { pr_debug(" No GS_BC\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_bc%02d %016lx\n", i, fpregs->gs_bc[i]); } /* * Print runtime-instrumentation control block */ static void print_ri_cb(user_fpregs_struct_t *fpregs) { int i; if (!(fpregs->flags & USER_RI_CB)) { pr_debug(" No RI_CB\n"); return; } for (i = 0; i < 8; i++) pr_debug(" ri_cb%02d %016lx\n", i, fpregs->ri_cb[i]); } /* * Print FP registers, VX registers, guarded-storage, and * runtime-instrumentation */ static void print_user_fpregs_struct(const char *msg, int pid, user_fpregs_struct_t *fpregs) { int i; pr_debug("%s: FP registers for pid=%d\n", msg, pid); pr_debug(" fpc %08x\n", fpregs->prfpreg.fpc); for (i = 0; i < 16; i++) pr_debug(" f%02d %016lx\n", i, fpregs->prfpreg.fprs[i]); print_vxrs(fpregs); print_gs_cb(fpregs); print_gs_bc(fpregs); print_ri_cb(fpregs); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { _sigregs_ext *dst_ext = &sigframe->uc.uc_mcontext_ext; _sigregs *dst = &sigframe->uc.uc_mcontext; memcpy(dst->regs.gprs, regs->prstatus.gprs, sizeof(regs->prstatus.gprs)); memcpy(dst->regs.acrs, regs->prstatus.acrs, sizeof(regs->prstatus.acrs)); memcpy(&dst->regs.psw, ®s->prstatus.psw, sizeof(regs->prstatus.psw)); memcpy(&dst->fpregs.fpc, &fpregs->prfpreg.fpc, sizeof(fpregs->prfpreg.fpc)); memcpy(&dst->fpregs.fprs, &fpregs->prfpreg.fprs, 
sizeof(fpregs->prfpreg.fprs)); if (fpregs->flags & USER_FPREGS_VXRS) { memcpy(&dst_ext->vxrs_low, &fpregs->vxrs_low, sizeof(fpregs->vxrs_low)); memcpy(&dst_ext->vxrs_high, &fpregs->vxrs_high, sizeof(fpregs->vxrs_high)); } else { memset(&dst_ext->vxrs_low, 0, sizeof(sizeof(fpregs->vxrs_low))); memset(&dst_ext->vxrs_high, 0, sizeof(sizeof(fpregs->vxrs_high))); } return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } /* * Rewind the psw for 'bytes' bytes */ static inline void rewind_psw(psw_t *psw, unsigned long bytes) { unsigned long mask; pr_debug("Rewind psw: %016lx bytes=%lu\n", psw->addr, bytes); mask = (psw->mask & PSW_MASK_EA) ? -1UL : (psw->mask & PSW_MASK_BA) ? (1UL << 31) - 1 : (1UL << 24) - 1; psw->addr = (psw->addr - bytes) & mask; } /* * Get vector registers */ int get_vx_regs(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; fpregs->flags &= ~USER_FPREGS_VXRS; iov.iov_base = &fpregs->vxrs_low; iov.iov_len = sizeof(fpregs->vxrs_low); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_VXRS_LOW, &iov) < 0) { /* * If the kernel does not support vector registers, we get * EINVAL. With kernel support and old hardware, we get ENODEV. 
*/ if (errno == EINVAL || errno == ENODEV) { memset(fpregs->vxrs_low, 0, sizeof(fpregs->vxrs_low)); memset(fpregs->vxrs_high, 0, sizeof(fpregs->vxrs_high)); pr_debug("VXRS registers not supported\n"); return 0; } pr_perror("Couldn't get VXRS_LOW\n"); return -1; } iov.iov_base = &fpregs->vxrs_high; iov.iov_len = sizeof(fpregs->vxrs_high); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_VXRS_HIGH, &iov) < 0) { pr_perror("Couldn't get VXRS_HIGH\n"); return -1; } fpregs->flags |= USER_FPREGS_VXRS; return 0; } /* * Get guarded-storage control block */ int get_gs_cb(pid_t pid, user_fpregs_struct_t *fpregs) { struct iovec iov; fpregs->flags &= ~(USER_GS_CB | USER_GS_BC); iov.iov_base = &fpregs->gs_cb; iov.iov_len = sizeof(fpregs->gs_cb); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_GS_CB, &iov) < 0) { switch (errno) { case EINVAL: case ENODEV: memset(&fpregs->gs_cb, 0, sizeof(fpregs->gs_cb)); memset(&fpregs->gs_bc, 0, sizeof(fpregs->gs_bc)); pr_debug("GS_CB not supported\n"); return 0; case ENODATA: pr_debug("GS_CB not set\n"); break; default: return -1; } } else { fpregs->flags |= USER_GS_CB; } iov.iov_base = &fpregs->gs_bc; iov.iov_len = sizeof(fpregs->gs_bc); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_GS_BC, &iov) < 0) { if (errno == ENODATA) { pr_debug("GS_BC not set\n"); return 0; } pr_perror("Couldn't get GS_BC\n"); return -1; } fpregs->flags |= USER_GS_BC; return 0; } /* * Get runtime-instrumentation control block */ int get_ri_cb(pid_t pid, user_fpregs_struct_t *fpregs) { user_regs_struct_t regs; struct iovec iov; psw_t *psw; fpregs->flags &= ~(USER_RI_CB | USER_RI_ON); iov.iov_base = &fpregs->ri_cb; iov.iov_len = sizeof(fpregs->ri_cb); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_RI_CB, &iov) < 0) { switch (errno) { case EINVAL: case ENODEV: memset(&fpregs->ri_cb, 0, sizeof(fpregs->ri_cb)); pr_debug("RI_CB not supported\n"); return 0; case ENODATA: pr_debug("RI_CB not set\n"); return 0; default: pr_perror("Couldn't get RI_CB\n"); return -1; } } fpregs->flags |= USER_RI_CB; /* 
Get PSW and check if runtime-instrumentation bit is enabled */ iov.iov_base = ®s.prstatus; iov.iov_len = sizeof(regs.prstatus); if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) < 0) return -1; psw = ®s.prstatus.psw; if (psw->mask & PSW_MASK_RI) fpregs->flags |= USER_RI_ON; return 0; } /* * Disable runtime-instrumentation bit */ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) { struct iovec iov; psw_t *psw; iov.iov_base = ®s->prstatus; iov.iov_len = sizeof(regs->prstatus); psw = ®s->prstatus.psw; psw->mask &= ~PSW_MASK_RI; return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } /* * Prepare task registers for restart */ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, void *arg) { user_fpregs_struct_t fpregs; struct iovec iov; int rewind; print_user_regs_struct("get_task_regs", pid, regs); memset(&fpregs, 0, sizeof(fpregs)); iov.iov_base = &fpregs.prfpreg; iov.iov_len = sizeof(fpregs.prfpreg); if (ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov) < 0) { pr_perror("Couldn't get floating-point registers"); return -1; } if (get_vx_regs(pid, &fpregs)) { pr_perror("Couldn't get vector registers"); return -1; } if (get_gs_cb(pid, &fpregs)) { pr_perror("Couldn't get guarded-storage"); return -1; } if (get_ri_cb(pid, &fpregs)) { pr_perror("Couldn't get runtime-instrumentation"); return -1; } /* * If the runtime-instrumentation bit is set, we have to disable it * before we execute parasite code. Otherwise parasite operations * would be recorded. */ if (fpregs.flags & USER_RI_ON) s390_disable_ri_bit(pid, regs); print_user_fpregs_struct("get_task_regs", pid, &fpregs); /* Check for system call restarting. 
*/ if (regs->system_call) { rewind = regs->system_call >> 16; /* see arch/s390/kernel/signal.c: do_signal() */ switch ((long)regs->prstatus.gprs[2]) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->prstatus.gprs[2] = regs->prstatus.orig_gpr2; rewind_psw(®s->prstatus.psw, rewind); pr_debug("New gpr2: %016lx\n", regs->prstatus.gprs[2]); break; case -ERESTART_RESTARTBLOCK: pr_warn("Will restore %d with interrupted system call\n", pid); regs->prstatus.gprs[2] = -EINTR; break; } } /* Call save_task_regs() */ return save(arg, regs, &fpregs); } /* * Injected syscall instruction */ const char code_syscall[] = { 0x0a, 0x00, /* sc 0 */ 0x00, 0x01, /* S390_BREAKPOINT_U16 */ 0x00, 0x01, /* S390_BREAKPOINT_U16 */ 0x00, 0x01, /* S390_BREAKPOINT_U16 */ }; static inline void __check_code_syscall(void) { BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } /* * Issue s390 system call */ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; /* Load syscall number into %r1 */ regs.prstatus.gprs[1] = (unsigned long) nr; /* Load parameter registers %r2-%r7 */ regs.prstatus.gprs[2] = arg1; regs.prstatus.gprs[3] = arg2; regs.prstatus.gprs[4] = arg3; regs.prstatus.gprs[5] = arg4; regs.prstatus.gprs[6] = arg5; regs.prstatus.gprs[7] = arg6; err = compel_execute_syscall(ctl, ®s, (char *) code_syscall); /* Return code from system is in %r2 */ if (ret) *ret = regs.prstatus.gprs[2]; return err; } /* * Issue s390 mmap call */ void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; struct mmap_arg_struct arg_struct; pid_t pid = ctl->rpid; long map = 0; int err; /* Setup s390 mmap data */ arg_struct.addr = (unsigned 
long)addr; arg_struct.len = length; arg_struct.prot = prot; arg_struct.flags = flags; arg_struct.fd = fd; arg_struct.offset = offset; /* Move args to process */ if (ptrace_swap_area(pid, where, &arg_struct, sizeof(arg_struct))) { pr_err("Can't inject memfd args (pid: %d)\n", pid); return NULL; } /* Do syscall */ err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long) where, 0, 0, 0, 0, 0); if (err < 0 || (long)map < 0) map = 0; /* Restore data */ if (ptrace_poke_area(pid, &arg_struct, where, sizeof(arg_struct))) { pr_err("Can't restore mmap args (pid: %d)\n", pid); if (map != 0) { err = compel_syscall(ctl, __NR_munmap, NULL, map, length, 0, 0, 0, 0); map = 0; } } return (void *)map; } /* * Setup registers for parasite call */ void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->prstatus.psw.addr = new_ip; if (!stack) return; regs->prstatus.gprs[15] = ((unsigned long) stack) - STACK_FRAME_OVERHEAD; } /* * Check if we have all kernel and CRIU features to dump the task */ bool arch_can_dump_task(struct parasite_ctl *ctl) { user_fpregs_struct_t fpregs; user_regs_struct_t regs; pid_t pid = ctl->rpid; char str[8]; psw_t *psw; if (ptrace_get_regs(pid, ®s)) return false; psw = ®s.prstatus.psw; /* Check if the kernel supports RI ptrace interface */ if (psw->mask & PSW_MASK_RI) { if (get_ri_cb(pid, &fpregs) < 0) { pr_perror("Can't dump process with RI bit active"); return -1; } } /* We don't support 24 and 31 bit mode - only 64 bit */ if (psw->mask & PSW_MASK_EA) { if (psw->mask & PSW_MASK_BA) return true; else sprintf(str, "??"); } else { if (psw->mask & PSW_MASK_BA) sprintf(str, "31"); else sprintf(str, "24"); } pr_err("Pid %d is %s bit: Only 64 bit tasks are supported\n", pid, str); return false; } /* * Return current alternate signal stack */ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { long ret; int err; err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); 
return err ? err : ret; } /* * Find last mapped address of current process */ static unsigned long max_mapped_addr(void) { unsigned long addr_end, addr_max = 0; char line[128]; FILE *fp; fp = fopen("/proc/self/maps", "r"); if (!fp) goto out; /* Parse lines like: 3fff415f000-3fff4180000 rw-p 00000000 00:00 0 */ while (fgets(line, sizeof(line), fp)) { char *ptr; /* First skip start address */ strtoul(&line[0], &ptr, 16); addr_end = strtoul(ptr + 1, NULL, 16); addr_max = max(addr_max, addr_end); } fclose(fp); out: return addr_max - 1; } /* * Kernel task size level * * We have (dynamic) 4 level page tables for 64 bit since linux 2.6.25: * * 5a216a2083 ("[S390] Add four level page tables for CONFIG_64BIT=y.") * 6252d702c5 ("[S390] dynamic page tables.") * * The code below is already prepared for future (dynamic) 5 level page tables. * * Besides that there is one problematic kernel bug that has been fixed for * linux 4.11 by the following commit: * * ee71d16d22 ("s390/mm: make TASK_SIZE independent from the number * of page table levels") * * A 64 bit process on s390x always starts with 3 levels and upgrades to 4 * levels for mmap(> 4 TB) and to 5 levels for mmap(> 16 EB). * * Unfortunately before fix ee71d16d22 for a 3 level process munmap() * and mremap() fail for addresses > 4 TB. CRIU uses the task size, * to unmap() all memory from a starting point to task size to get rid of * unwanted mappings. CRIU uses mremap() to establish the final mappings * which also fails if we want to restore mappings > 4 TB and the initial * restore process still runs with 3 levels. * * To support the current CRIU design on s390 we return task size = 4 TB when * a kernel without fix ee71d16d22 is detected. In this case we can dump at * least processes with < 4 TB which is the most likely case anyway. * * For kernels with fix ee71d16d22 we are fully functional. 
*/ enum kernel_ts_level { /* Kernel with 4 level page tables without fix ee71d16d22 */ KERNEL_TS_LEVEL_4_FIX_NO, /* Kernel with 4 level page tables with fix ee71d16d22 */ KERNEL_TS_LEVEL_4_FIX_YES, /* Kernel with 4 level page tables with or without fix ee71d16d22 */ KERNEL_TS_LEVEL_4_FIX_UNKN, /* Kernel with 5 level page tables */ KERNEL_TS_LEVEL_5, }; /* See arch/s390/include/asm/processor.h */ #define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ #define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ #define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ /* * Return detected kernel version regarding task size level * * We use unmap() to probe the maximum possible page table level of kernel */ static enum kernel_ts_level get_kernel_ts_level(void) { unsigned long criu_end_addr = max_mapped_addr(); /* Check for 5 levels */ if (criu_end_addr >= TASK_SIZE_LEVEL_4) return KERNEL_TS_LEVEL_5; else if (munmap((void *) TASK_SIZE_LEVEL_4, 0x1000) == 0) return KERNEL_TS_LEVEL_5; if (criu_end_addr < TASK_SIZE_LEVEL_3) { /* Check for 4 level kernel with fix */ if (munmap((void *) TASK_SIZE_LEVEL_3, 0x1000) == 0) return KERNEL_TS_LEVEL_4_FIX_YES; else return KERNEL_TS_LEVEL_4_FIX_NO; } /* We can't find out if kernel has the fix */ return KERNEL_TS_LEVEL_4_FIX_UNKN; } /* * Log detected level */ static void pr_levels(const char *str) { pr_debug("Max user page table levels (task size): %s\n", str); } /* * Return last address (+1) of biggest possible user address space for * current kernel */ unsigned long compel_task_size(void) { switch (get_kernel_ts_level()) { case KERNEL_TS_LEVEL_4_FIX_NO: pr_levels("KERNEL_TS_LEVEL_4_FIX_NO"); return TASK_SIZE_LEVEL_3; case KERNEL_TS_LEVEL_4_FIX_YES: pr_levels("KERNEL_TS_LEVEL_4_FIX_YES"); return TASK_SIZE_LEVEL_4; case KERNEL_TS_LEVEL_4_FIX_UNKN: pr_levels("KERNEL_TS_LEVEL_4_FIX_UNKN"); return TASK_SIZE_LEVEL_3; default: /* KERNEL_TS_LEVEL_5 */ pr_levels("KERNEL_TS_LEVEL_5"); return TASK_SIZE_LEVEL_5; } } /* * Get task registers 
(overwrites weak function) * * We don't store floating point and vector registers here because we * assue that compel/pie code does not change them. * * For verification issue: * * $ objdump -S criu/pie/parasite.built-in.bin.o | grep "%f" * $ objdump -S criu/pie/restorer.built-in.bin.o | grep "%f" */ int ptrace_get_regs(int pid, user_regs_struct_t *regs) { struct iovec iov; int rc; pr_debug("ptrace_get_regs: pid=%d\n", pid); iov.iov_base = ®s->prstatus; iov.iov_len = sizeof(regs->prstatus); rc = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); if (rc != 0) return rc; iov.iov_base = ®s->system_call; iov.iov_len = sizeof(regs->system_call); return ptrace(PTRACE_GETREGSET, pid, NT_S390_SYSTEM_CALL, &iov); } /* * Set task registers (overwrites weak function) */ int ptrace_set_regs(int pid, user_regs_struct_t *regs) { uint32_t system_call = 0; struct iovec iov; int rc; pr_debug("ptrace_set_regs: pid=%d\n", pid); iov.iov_base = ®s->prstatus; iov.iov_len = sizeof(regs->prstatus); rc = ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); if (rc) return rc; /* * If we attached to an inferior that is sleeping in a restarting * system call like futex_wait(), we have to reset the system_call * to 0. Otherwise the kernel would try to finish the interrupted * system call after PTRACE_CONT and we could not run the * parasite code. 
*/ iov.iov_base = &system_call; iov.iov_len = sizeof(system_call); return ptrace(PTRACE_SETREGSET, pid, NT_S390_SYSTEM_CALL, &iov); } criu-3.6/compel/arch/x86/000077500000000000000000000000001317335042600151515ustar00rootroot00000000000000criu-3.6/compel/arch/x86/plugins/000077500000000000000000000000001317335042600166325ustar00rootroot00000000000000criu-3.6/compel/arch/x86/plugins/include/000077500000000000000000000000001317335042600202555ustar00rootroot00000000000000criu-3.6/compel/arch/x86/plugins/include/asm/000077500000000000000000000000001317335042600210355ustar00rootroot00000000000000criu-3.6/compel/arch/x86/plugins/include/asm/prologue.h000066400000000000000000000012671317335042600230500ustar00rootroot00000000000000#ifndef __ASM_PROLOGUE_H__ #define __ASM_PROLOGUE_H__ #ifndef __ASSEMBLY__ #include #include #include #include #define sys_recv(sockfd, ubuf, size, flags) \ sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) typedef struct prologue_init_args { struct sockaddr_un ctl_sock_addr; unsigned int ctl_sock_addr_len; unsigned int arg_s; void *arg_p; void *sigframe; } prologue_init_args_t; #endif /* __ASSEMBLY__ */ /* * Reserve enough space for sigframe. * * FIXME It is rather should be taken from sigframe header. 
*/ #define PROLOGUE_SGFRAME_SIZE 4096 #define PROLOGUE_INIT_ARGS_SIZE 1024 #endif /* __ASM_PROLOGUE_H__ */ criu-3.6/compel/arch/x86/plugins/include/asm/syscall-types.h000066400000000000000000000030621317335042600240230ustar00rootroot00000000000000#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ /* Types for sigaction, sigprocmask syscalls */ typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; #define SA_RESTORER 0x04000000 #define _KNSIG 64 #define _NSIG_BPW 64 #define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) /* * Note: as k_rtsigset_t is the same size for 32-bit and 64-bit, * sig defined as uint64_t rather than (unsigned long) - for the * purpose if we ever going to support native 32-bit compilation. */ typedef struct { uint64_t sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { rt_sighandler_t rt_sa_handler; unsigned long rt_sa_flags; rt_sigrestore_t rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t; /* * Note: there is unaligned access on x86_64 and it's fine. * However, when porting this code -- keep in mind about possible issues * with unaligned rt_sa_mask. 
*/ typedef struct __attribute__((packed)) { unsigned int rt_sa_handler; unsigned int rt_sa_flags; unsigned int rt_sa_restorer; k_rtsigset_t rt_sa_mask; } rt_sigaction_t_compat; /* Types for set_thread_area, get_thread_area syscalls */ typedef struct { unsigned int entry_number; unsigned int base_addr; unsigned int limit; unsigned int seg_32bit:1; unsigned int contents:2; unsigned int read_exec_only:1; unsigned int limit_in_pages:1; unsigned int seg_not_present:1; unsigned int useable:1; unsigned int lm:1; } user_desc_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ criu-3.6/compel/arch/x86/plugins/include/features.h000066400000000000000000000002021317335042600222360ustar00rootroot00000000000000#ifndef __COMPEL_ARCH_FEATURES_H #define __COMPEL_ARCH_FEATURES_H #define ARCH_HAS_MEMCPY #endif /* __COMPEL_ARCH_FEATURES_H */ criu-3.6/compel/arch/x86/plugins/std/000077500000000000000000000000001317335042600174245ustar00rootroot00000000000000criu-3.6/compel/arch/x86/plugins/std/call32.S000077700000000000000000000000001317335042600256222../../../../../criu/arch/x86/call32.Sustar00rootroot00000000000000criu-3.6/compel/arch/x86/plugins/std/memcpy.S000066400000000000000000000010061317335042600210370ustar00rootroot00000000000000#include "common/asm/linkage.h" /* The following code is taken from Linux kernel (arch/x86/lib/memcpy_64.S). * There are 3 implementations in there, we use the one that relies on * X86_FEATURE_REP_GOOD ("rep microcode works well"). */ /* * memcpy - Copy a memory block. 
* * Input: * rdi destination * rsi source * rdx count * * Output: * rax original destination */ ENTRY(memcpy) movq %rdi, %rax movq %rdx, %rcx shrq $3, %rcx andl $7, %edx rep movsq movl %edx, %ecx rep movsb ret END(memcpy) criu-3.6/compel/arch/x86/plugins/std/parasite-head.S000066400000000000000000000021231317335042600222550ustar00rootroot00000000000000#include "common/asm/linkage.h" .section .head.text, "ax" #ifndef CONFIG_X86_64 # error 64-bit parasite should compile with CONFIG_X86_64 #endif .macro PARASITE_ENTRY num subq $16, %rsp andq $~15, %rsp pushq $\num movq %rsp, %rbp movl __export_parasite_cmd(%rip), %edi leaq __export_parasite_args(%rip), %rsi call parasite_service .endm #ifdef CONFIG_COMPAT .code32 ENTRY(__export_parasite_head_start_compat) /* A long jump to 64-bit parasite. */ jmp $__USER_CS,$1f 1: .code64 PARASITE_ENTRY 0 pushq $__USER32_CS pushq $2f lretq 2: .code32 /* * parasite_service() can run commands in non-daemon mode * with parasite_trap_cmd(): it waits that after return there * is a software break. * compel_run_in_thread() uses this and after hitting the break, * it restores register set - that's the reason, why we should * stop in 32-bit mode for compat tasks here. 
*/ int $0x03 END(__export_parasite_head_start_compat) .code64 #endif ENTRY(__export_parasite_head_start) PARASITE_ENTRY 0 int $0x03 END(__export_parasite_head_start) .align 8 GLOBAL(__export_parasite_cmd) .long 0 criu-3.6/compel/arch/x86/plugins/std/prologue.S000066400000000000000000000015161317335042600214070ustar00rootroot00000000000000#include "common/asm/linkage.h" #include "asm/prologue.h" #include "uapi/std/syscall-codes.h" .section .compel.prologue.text, "ax" ENTRY(__export_std_prologue_start) push %rsp leaq __export_std_prologue_init_args(%rip), %rdi movq __export_std_plugin_begin(%rip), %rsi movq __export_std_plugin_size(%rip), %rdx call __export_std_compel_start do_rt_sigreturn: leaq __export_std_prologue_sigframe(%rip), %rax addq $8, %rax movq %rax, %rsp # we can't use sys_rt_sigreturn here mov $__NR_rt_sigreturn, %eax # because we're adjusting stack syscall GLOBAL(__export_std_prologue_init_args) .space PROLOGUE_INIT_ARGS_SIZE, 0 GLOBAL(__export_std_plugin_begin) .space 8, 0 GLOBAL(__export_std_plugin_size) .space 8, 0 .align 64 GLOBAL(__export_std_prologue_sigframe) .space PROLOGUE_SGFRAME_SIZE, 0 END(__export_std_prologue_start) criu-3.6/compel/arch/x86/plugins/std/syscalls/000077500000000000000000000000001317335042600212615ustar00rootroot00000000000000criu-3.6/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls000066400000000000000000000117101317335042600245550ustar00rootroot00000000000000std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o sys-proto-types := $(obj)/include/uapi/std/syscall-types.h sys-proto-generic := $(obj)/include/uapi/std/syscall.h sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h sys-proto = $(obj)/include/uapi/std/syscall-$(1).h sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S sys-asm-common-name = std/syscalls/syscall-common-x86-$(1).S sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) 
sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c sys-bits := 64 AV := $$$$ define gen-rule-sys-codes $(sys-codes): $(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ sub("^__NR", "SYS", SYSN); \ print "\n#ifndef ", $(AV)1; \ print "#define", $(AV)1, $(AV)2; \ print "#endif"; \ print "\n#ifndef ", SYSN; \ print "#define ", SYSN, $(AV)1; \ print "#endif";}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ endef define gen-rule-sys-proto $(sys-proto): $(sys-def) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include ' >> $$@ ifeq ($(1),32) $(Q) echo '#include "asm/syscall32.h"' >> $$@ endif $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ endef define gen-rule-sys-asm $(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ endef define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(sys-codes-generic): $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_32.tbl $(sys-proto-types) $(call msg-gen, $@) 
$(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo '#include ' >> $@ $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ sub("^__NR", "__NR32", NR32); \ print "\n#ifndef ", NR32; \ print "#define ", NR32, $$2; \ print "#endif";}' >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ mrproper-y += $(sys-codes-generic) $(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) $(call msg-gen, $@) $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ $(Q) echo "" >> $@ $(Q) echo "#ifdef CONFIG_X86_32" >> $@ $(Q) echo '#include ' >> $@ $(Q) echo "#else" >> $@ $(Q) echo '#include ' >> $@ $(Q) echo "#endif /* CONFIG_X86_32 */" >> $@ $(Q) echo "" >> $@ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ mrproper-y += $(sys-proto-generic) define gen-rule-sys-exec-tbl $(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(call msg-gen, $$@) $(Q) echo "/* Autogenerated, don't edit */" > $$@ $(Q) cat $$< | awk '/^__NR/{print \ "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ endef $(eval $(call map,gen-rule-sys-codes,$(sys-bits))) $(eval $(call map,gen-rule-sys-proto,$(sys-bits))) $(eval $(call map,gen-rule-sys-asm,$(sys-bits))) $(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) $(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(call msg-gen, $@) $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) std-headers-deps += $(call sys-codes,$(sys-bits)) std-headers-deps += $(call sys-proto,$(sys-bits)) std-headers-deps += $(call sys-asm,$(sys-bits)) std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) std-headers-deps += $(sys-codes-generic) std-headers-deps += $(sys-proto-generic) std-headers-deps += $(sys-asm-types) mrproper-y += $(std-headers-deps) 
criu-3.6/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-32.S000066400000000000000000000010731317335042600252130ustar00rootroot00000000000000#include "common/asm/linkage.h" #define SYSCALL(name, opcode) \ ENTRY(name); \ movl $opcode, %eax; \ jmp __syscall_common; \ END(name) ENTRY(__syscall_common) pushl %ebx pushl %esi pushl %edi pushl %ebp #define __arg(n) (4 * (n) + 20)(%esp) movl __arg(0),%ebx movl __arg(1),%ecx movl __arg(2),%edx movl __arg(3),%esi movl __arg(4),%edi movl __arg(5),%ebp #undef __arg int $0x80 popl %ebp popl %edi popl %esi popl %ebx ret END(__syscall_common) ENTRY(__cr_restore_rt) movl $__NR_rt_sigreturn, %eax jmp __syscall_common END(__cr_restore_rt) criu-3.6/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-64.S000066400000000000000000000005051317335042600252170ustar00rootroot00000000000000#include "common/asm/linkage.h" #define SYSCALL(name, opcode) \ ENTRY(name); \ movl $opcode, %eax; \ jmp __syscall_common; \ END(name) .text .align 4 ENTRY(__syscall_common) movq %rcx, %r10 syscall ret END(__syscall_common) ENTRY(__cr_restore_rt) movq $__NR_rt_sigreturn, %rax syscall END(__cr_restore_rt) criu-3.6/compel/arch/x86/plugins/std/syscalls/syscall32.c000066400000000000000000000057721317335042600232570ustar00rootroot00000000000000#include "asm/types.h" #include "syscall-32.h" #define SYS_SOCKET 1 /* sys_socket(2) */ #define SYS_BIND 2 /* sys_bind(2) */ #define SYS_CONNECT 3 /* sys_connect(2) */ #define SYS_SENDTO 11 /* sys_sendto(2) */ #define SYS_RECVFROM 12 /* sys_recvfrom(2) */ #define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ #define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */ #define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ #define SYS_SENDMSG 16 /* sys_sendmsg(2) */ #define SYS_RECVMSG 17 /* sys_recvmsg(2) */ long sys_socket(int domain, int type, int protocol) { uint32_t a[] = { (uint32_t)domain, (uint32_t)type, (uint32_t)protocol }; return sys_socketcall(SYS_SOCKET, (unsigned long *)a); } long sys_connect(int sockfd, struct sockaddr 
*addr, int addrlen) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen}; return sys_socketcall(SYS_CONNECT, (unsigned long *)a); } long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)buff, (uint32_t)len, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len}; return sys_socketcall(SYS_SENDTO, (unsigned long *)a); } long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)ubuf, (uint32_t)size, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len}; return sys_socketcall(SYS_RECVFROM, (unsigned long *)a); } long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags}; return sys_socketcall(SYS_SENDMSG, (unsigned long *)a); } long sys_recvmsg(int sockfd, struct msghdr *msg, int flags) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags}; return sys_socketcall(SYS_RECVMSG, (unsigned long *)a); } long sys_shutdown(int sockfd, int how) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)how}; return sys_socketcall(SYS_SHUTDOWN, (unsigned long *)a); } long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen}; return sys_socketcall(SYS_BIND, (unsigned long *)a); } long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen}; return sys_socketcall(SYS_SETSOCKOPT, (unsigned long *)a); } long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen) { uint32_t a[] = {(uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen}; return sys_socketcall(SYS_GETSOCKOPT, (unsigned long *)a); } #define 
SHMAT 21 long sys_shmat(int shmid, void *shmaddr, int shmflag) { return sys_ipc(SHMAT, shmid, shmflag, 0, shmaddr, 0); } long sys_pread(unsigned int fd, char *ubuf, uint32_t count, uint64_t pos) { return sys_pread64(fd, ubuf, count, (uint32_t)(pos & 0xffffffffu), (uint32_t)(pos >> 32)); } criu-3.6/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl000066400000000000000000000165371317335042600237560ustar00rootroot00000000000000# # System calls table, please make sure the table consist only the syscalls # really used somewhere in project. # # code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- __NR_restart_syscall 0 sys_restart_syscall (void) __NR_exit 1 sys_exit (unsigned long error_code) __NR_read 3 sys_read (int fd, void *buf, unsigned long count) __NR_write 4 sys_write (int fd, const void *buf, unsigned long count) __NR_open 5 sys_open (const char *filename, int flags, unsigned int mode) __NR_close 6 sys_close (int fd) __NR_unlink 10 sys_unlink (char *pathname) __NR_lseek 19 sys_lseek (int fd, int32_t offset, unsigned int origin) __NR_getpid 20 sys_getpid (void) __NR_mount 21 sys_mount (const char *dev_name, const char *dir_name, const char *type, unsigned long flags, const void *data) __NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_kill 37 sys_kill (long pid, int sig) __NR_mkdir 39 sys_mkdir (const char *name, int mode) __NR_rmdir 40 sys_rmdir (const char *name) __NR_brk 45 sys_brk (void *addr) __NR_umount2 52 sys_umount2 (char *name, int flags) __NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_fcntl 55 sys_fcntl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_umask 60 sys_umask (int mask) __NR_setrlimit 75 sys_setrlimit (unsigned int resource, struct krlimit *rlim) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_munmap 91 
sys_munmap (void *addr, unsigned long len) __NR_setpriority 97 sys_setpriority (int which, int who, int nice) __NR_socketcall 102 sys_socketcall (int call, unsigned long *args) __NR_setitimer 104 sys_setitimer (int which, struct itimerval *in, struct itimerval *out) __NR_getitimer 105 sys_getitimer (int which, struct itimerval *it) __NR_wait4 114 sys_wait4 (pid_t pid, int *stat_addr, int options, struct rusage *ru) __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, void *ptr, long fifth) __NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) __NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_getpgid 132 sys_getpgid (pid_t pid) __NR_personality 136 sys_personality (unsigned int personality) __NR_flock 143 sys_flock (int fd, unsigned long cmd) __NR_getsid 147 sys_getsid (void) __NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_nanosleep 162 sys_nanosleep (struct timespec *rqtp, struct timespec *rmtp) __NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_rt_sigreturn 173 sys_rt_sigreturn (void) __NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *oset, size_t sigsetsize) __NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *uinfo) __NR_pread64 180 sys_pread64 (unsigned int fd, char *ubuf, uint32_t count, uint32_t poslo, uint32_t poshi) __NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 185 sys_capset (struct cap_header *h, struct cap_data *d) __NR_sigaltstack 186 
sys_sigaltstack (const void *uss_ptr, void *uoss_ptr) __NR_mmap2 192 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) __NR_getgroups32 205 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups32 206 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid32 208 sys_setresuid (int uid, int euid, int suid) __NR_getresuid32 209 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid32 210 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid32 211 sys_getresgid (int *gid, int *egid, int *sgid) __NR_setfsuid32 215 sys_setfsuid (int fsuid) __NR_setfsgid32 216 sys_setfsgid (int fsgid) __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) __NR_gettid 224 sys_gettid (void) __NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info) __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) __NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_submit 248 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) __NR_exit_group 252 sys_exit_group (int error_code) __NR_set_tid_address 258 sys_set_tid_address (int *tid_addr) __NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_timer_settime 260 sys_timer_settime (kernel_timer_t timer_id, int flags, struct itimerspec *new, struct itimerspec *old) __NR_timer_gettime 261 sys_timer_gettime (int timer_id, struct itimerspec *setting) __NR_timer_getoverrun 262 sys_timer_getoverrun (int timer_id) __NR_timer_delete 263 sys_timer_delete (kernel_timer_t 
timer_id) __NR_clock_gettime 265 sys_clock_gettime (int which_clock, struct timespec *tp) __NR_waitid 284 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_openat 295 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_readlinkat 305 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags) __NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize) __NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_preadv 333 sys_preadv (int fd, struct iovec *iov, unsigned long nr, loff_t off) __NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo) __NR_fanotify_init 338 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 339 sys_fanotify_mark (int fanotify_fd, unsigned int flag, uint32_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 342 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 346 sys_setns (int fd, int nstype) __NR_kcmp 349 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 374 sys_userfaultfd (int flags) criu-3.6/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl000066400000000000000000000204701317335042600237520ustar00rootroot00000000000000# # System calls table, please make sure the table consist only the syscalls # really used somewhere in project. 
# # __NR_name code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- __NR_read 0 sys_read (int fd, void *buf, unsigned long count) __NR_write 1 sys_write (int fd, const void *buf, unsigned long count) __NR_open 2 sys_open (const char *filename, unsigned long flags, unsigned long mode) __NR_close 3 sys_close (int fd) __NR_lseek 8 sys_lseek (int fd, unsigned long offset, unsigned long origin) __NR_mmap 9 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) __NR_mprotect 10 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) __NR_munmap 11 sys_munmap (void *addr, unsigned long len) __NR_brk 12 sys_brk (void *addr) __NR_rt_sigaction 13 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) __NR_rt_sigprocmask 14 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) __NR_rt_sigreturn 15 sys_rt_sigreturn (void) __NR_ioctl 16 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) __NR_pread64 17 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) __NR_mremap 25 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) __NR_mincore 27 sys_mincore (void *addr, unsigned long size, unsigned char *vec) __NR_madvise 28 sys_madvise (unsigned long start, size_t len, int behavior) __NR_shmat 30 sys_shmat (int shmid, void *shmaddr, int shmflag) __NR_dup2 33 sys_dup2 (int oldfd, int newfd) __NR_nanosleep 35 sys_nanosleep (struct timespec *req, struct timespec *rem) __NR_getitimer 36 sys_getitimer (int which, const struct itimerval *val) __NR_setitimer 38 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) __NR_getpid 39 sys_getpid (void) __NR_socket 41 sys_socket (int domain, int type, int 
protocol) __NR_connect 42 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) __NR_sendto 44 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) __NR_recvfrom 45 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) __NR_sendmsg 46 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) __NR_recvmsg 47 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) __NR_shutdown 48 sys_shutdown (int sockfd, int how) __NR_bind 49 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) __NR_setsockopt 54 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) __NR_getsockopt 55 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) __NR_clone 56 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) __NR_exit 60 sys_exit (unsigned long error_code) __NR_wait4 61 sys_wait4 (int pid, int *status, int options, struct rusage *ru) __NR_kill 62 sys_kill (long pid, int sig) __NR_fcntl 72 sys_fcntl (int fd, int type, long arg) __NR_flock 73 sys_flock (int fd, unsigned long cmd) __NR_mkdir 83 sys_mkdir (const char *name, int mode) __NR_rmdir 84 sys_rmdir (const char *name) __NR_unlink 87 sys_unlink (char *pathname) __NR_umask 95 sys_umask (int mask) __NR_gettimeofday 96 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_ptrace 101 sys_ptrace (long request, pid_t pid, void *addr, void *data) __NR_getgroups 115 sys_getgroups (int gsize, unsigned int *groups) __NR_setgroups 116 sys_setgroups (int gsize, unsigned int *groups) __NR_setresuid 117 sys_setresuid (int uid, int euid, int suid) __NR_getresuid 118 sys_getresuid (int *uid, int *euid, int *suid) __NR_setresgid 119 sys_setresgid (int gid, int egid, int sgid) __NR_getresgid 120 sys_getresgid (int *gid, int *egid, int *sgid) __NR_getpgid 121 sys_getpgid (pid_t pid) 
__NR_setfsuid 122 sys_setfsuid (int fsuid) __NR_setfsgid 123 sys_setfsgid (int fsgid) __NR_getsid 124 sys_getsid (void) __NR_capget 125 sys_capget (struct cap_header *h, struct cap_data *d) __NR_capset 126 sys_capset (struct cap_header *h, struct cap_data *d) __NR_rt_sigqueueinfo 129 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) __NR_sigaltstack 131 sys_sigaltstack (const void *uss, void *uoss) __NR_personality 135 sys_personality (unsigned int personality) __NR_setpriority 141 sys_setpriority (int which, int who, int nice) __NR_sched_setscheduler 144 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_prctl 157 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) __NR_arch_prctl 158 sys_arch_prctl (int option, unsigned long addr) __NR_setrlimit 160 sys_setrlimit (int resource, struct krlimit *rlim) __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) __NR_umount2 166 sys_umount2 (char *name, int flags) __NR_gettid 186 sys_gettid (void) __NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info) __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) __NR_io_submit 209 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) __NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info) __NR_set_tid_address 218 sys_set_tid_address (int *tid_addr) __NR_restart_syscall 219 sys_restart_syscall (void) __NR_sys_timer_create 222 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct 
itimerspec *old_setting) __NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) __NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 231 sys_exit_group (int error_code) __NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_readlinkat 267 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) __NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, size_t len) __NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_preadv 295 sys_preadv (int fd, struct iovec *iov, unsigned long nr, loff_t off) __NR_rt_tgsigqueueinfo 297 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_fanotify_init 300 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) __NR_fanotify_mark 301 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) __NR_open_by_handle_at 304 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) __NR_setns 308 sys_setns (int fd, int nstype) __NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) __NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int 
flags) __NR_userfaultfd 323 sys_userfaultfd (int flags) criu-3.6/compel/arch/x86/scripts/000077500000000000000000000000001317335042600166405ustar00rootroot00000000000000criu-3.6/compel/arch/x86/scripts/compel-pack-compat.lds.S000066400000000000000000000011111317335042600232130ustar00rootroot00000000000000OUTPUT_ARCH(i386) TARGET(elf32-i386) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) } .data : { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } /* Parasite args should have 4 bytes align, as we have futex inside. */ . = ALIGN(4); __export_parasite_args = .; } criu-3.6/compel/arch/x86/scripts/compel-pack.lds.S000066400000000000000000000011221317335042600217340ustar00rootroot00000000000000OUTPUT_ARCH(i386:x86-64) TARGET(elf64-x86-64) EXTERN(__export_parasite_head_start) SECTIONS { .text : { *(.head.text) ASSERT(DEFINED(__export_parasite_head_start), "Symbol __export_parasite_head_start is missing"); *(.text*) *(.compel.exit) *(.compel.init) } .data : { *(.data*) *(.bss*) } .rodata : { *(.rodata*) *(.got*) } .toc : ALIGN(8) { *(.toc*) } /DISCARD/ : { *(.debug*) *(.comment*) *(.note*) *(.group*) *(.eh_frame*) } /* Parasite args should have 4 bytes align, as we have futex inside. */ . 
= ALIGN(4); __export_parasite_args = .; } criu-3.6/compel/arch/x86/src/000077500000000000000000000000001317335042600157405ustar00rootroot00000000000000criu-3.6/compel/arch/x86/src/lib/000077500000000000000000000000001317335042600165065ustar00rootroot00000000000000criu-3.6/compel/arch/x86/src/lib/cpu.c000066400000000000000000000113051317335042600174410ustar00rootroot00000000000000#include #include #include "compel-cpu.h" #include "common/bitops.h" #include "common/compiler.h" #include "log.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; static bool rt_info_done = false; void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { if (likely(feature < NCAPINTS_BITS)) set_bit(feature, (unsigned long *)c->x86_capability); } void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { if (likely(feature < NCAPINTS_BITS)) clear_bit(feature, (unsigned long *)c->x86_capability); } int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { if (likely(feature < NCAPINTS_BITS)) return test_bit(feature, (unsigned long *)c->x86_capability); return 0; } int compel_cpuid(compel_cpuinfo_t *c) { /* * See cpu_detect() in the kernel, also * read cpuid specs not only from general * SDM but for extended instructions set * reference. 
*/ /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); if (!strcmp(c->x86_vendor_id, "GenuineIntel")) { c->x86_vendor = X86_VENDOR_INTEL; } else if (!strcmp(c->x86_vendor_id, "AuthenticAMD")) { c->x86_vendor = X86_VENDOR_AMD; } else { pr_err("Unsupported CPU vendor %s\n", c->x86_vendor_id); return -1; } c->x86_family = 4; /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { uint32_t eax, ebx, ecx, edx; cpuid(0x00000001, &eax, &ebx, &ecx, &edx); c->x86_family = (eax >> 8) & 0xf; c->x86_model = (eax >> 4) & 0xf; c->x86_mask = eax & 0xf; if (c->x86_family == 0xf) c->x86_family += (eax >> 20) & 0xff; if (c->x86_family >= 0x6) c->x86_model += ((eax >> 16) & 0xf) << 4; c->x86_capability[0] = edx; c->x86_capability[4] = ecx; } /* Additional Intel-defined flags: level 0x00000007 */ if (c->cpuid_level >= 0x00000007) { uint32_t eax, ebx, ecx, edx; cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[9] = ebx; c->x86_capability[11] = ecx; } /* Extended state features: level 0x0000000d */ if (c->cpuid_level >= 0x0000000d) { uint32_t eax, ebx, ecx, edx; cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); c->x86_capability[10] = eax; } /* AMD-defined flags: level 0x80000001 */ c->extended_cpuid_level = cpuid_eax(0x80000000); if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000) { if (c->extended_cpuid_level >= 0x80000001) { c->x86_capability[1] = cpuid_edx(0x80000001); c->x86_capability[6] = cpuid_ecx(0x80000001); } } /* * We're don't care about scattered features for now, * otherwise look into init_scattered_cpuid_features() * in kernel. 
*/ if (c->extended_cpuid_level >= 0x80000004) { unsigned int *v; char *p, *q; v = (unsigned int *)c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; /* * Intel chips right-justify this string for some dumb reason; * undo that brain damage: */ p = q = &c->x86_model_id[0]; while (*p == ' ') p++; if (p != q) { while (*p) *q++ = *p++; while (q <= &c->x86_model_id[48]) *q++ = '\0'; /* Zero-pad the rest */ } } /* On x86-64 NOP is always present */ compel_set_cpu_cap(c, X86_FEATURE_NOPL); switch (c->x86_vendor) { case X86_VENDOR_INTEL: /* * Strictly speaking we need to read MSR_IA32_MISC_ENABLE * here but on ring3 it's impossible. */ if (c->x86_family == 15) { compel_clear_cpu_cap(c, X86_FEATURE_REP_GOOD); compel_clear_cpu_cap(c, X86_FEATURE_ERMS); } else if (c->x86_family == 6) { /* On x86-64 rep is fine */ compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); } /* See filter_cpuid_features in kernel */ if ((int32_t)c->cpuid_level < (int32_t)0x0000000d) compel_clear_cpu_cap(c, X86_FEATURE_XSAVE); break; case X86_VENDOR_AMD: /* * Bit 31 in normal CPUID used for nonstandard 3DNow ID; * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ compel_clear_cpu_cap(c, 0 * 32 + 31); if (c->x86_family >= 0x10) compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); if (c->x86_family == 0xf) { uint32_t level; /* On C+ stepping K8 rep microcode works well for copy/memset */ level = cpuid_eax(1); if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); } break; } return 0; } bool compel_cpu_has_feature(unsigned int feature) { if (!rt_info_done) { compel_cpuid(&rt_info); rt_info_done = true; } return compel_test_cpu_cap(&rt_info, feature); } 
criu-3.6/compel/arch/x86/src/lib/handle-elf-host.c000077700000000000000000000000001317335042600237622handle-elf.custar00rootroot00000000000000criu-3.6/compel/arch/x86/src/lib/handle-elf.c000066400000000000000000000007621317335042600206560ustar00rootroot00000000000000#include #include "uapi/compel.h" #include "handle-elf.h" #include "piegen.h" #include "log.h" static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; int handle_binary(void *mem, size_t size) { if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) return handle_elf_x86_64(mem, size); pr_err("Unsupported Elf format detected\n"); return -EINVAL; } criu-3.6/compel/arch/x86/src/lib/include/000077500000000000000000000000001317335042600201315ustar00rootroot00000000000000criu-3.6/compel/arch/x86/src/lib/include/cpu.h000066400000000000000000000023071317335042600210730ustar00rootroot00000000000000#ifndef __COMPEL_ASM_CPU_H__ #define __COMPEL_ASM_CPU_H__ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. 
*/ asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx) : "memory"); } static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = 0; native_cpuid(eax, ebx, ecx, edx); } static inline void cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = count; native_cpuid(eax, ebx, ecx, edx); } static inline unsigned int cpuid_eax(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ecx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return edx; } #endif criu-3.6/compel/arch/x86/src/lib/include/handle-elf.h000066400000000000000000000007741317335042600223110ustar00rootroot00000000000000#ifndef COMPEL_HANDLE_ELF_H__ #define COMPEL_HANDLE_ELF_H__ #include "elf64-types.h" #define ELF_X86_64 #ifndef R_X86_64_GOTPCRELX # define R_X86_64_GOTPCRELX 41 #endif #ifndef R_X86_64_REX_GOTPCRELX # define R_X86_64_REX_GOTPCRELX 42 #endif #define __handle_elf handle_elf_x86_64 #define arch_is_machine_supported(e_machine) (e_machine == EM_X86_64) extern int handle_elf_x86_32(void *mem, size_t size); extern int handle_elf_x86_64(void *mem, size_t size); #endif /* COMPEL_HANDLE_ELF_H__ */ criu-3.6/compel/arch/x86/src/lib/include/syscall.h000066400000000000000000000006511317335042600217560ustar00rootroot00000000000000#ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ #define __NR(syscall, compat) ((compat) ? 
__NR32_##syscall : __NR_##syscall) /* * For x86_32 __NR_mmap inside the kernel represents old_mmap system * call, but since we didn't use it yet lets go further and simply * define own alias for __NR_mmap2 which would allow us to unify code * between 32 and 64 bits version. */ #define __NR32_mmap __NR32_mmap2 #endif criu-3.6/compel/arch/x86/src/lib/include/uapi/000077500000000000000000000000001317335042600210675ustar00rootroot00000000000000criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/000077500000000000000000000000001317335042600216475ustar00rootroot00000000000000criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/.gitignore000066400000000000000000000000001317335042600236250ustar00rootroot00000000000000criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h000066400000000000000000000003211317335042600243350ustar00rootroot00000000000000#ifndef __COMPEL_BREAKPOINTS_H__ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP SI_KERNEL extern int ptrace_set_breakpoint(pid_t pid, void *addr); extern int ptrace_flush_breakpoints(pid_t pid); #endif criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/cpu.h000066400000000000000000000163311317335042600226130ustar00rootroot00000000000000#ifndef __CR_ASM_CPU_H__ #define __CR_ASM_CPU_H__ #include /* * Adopted from linux kernel and enhanced from Intel/AMD manuals. 
*/ #define NCAPINTS (12) /* N 32-bit words worth of info */ #define NCAPINTS_BITS (NCAPINTS * 32) #define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_VME (0*32+ 1) /* Virtual 8086 Mode Enhancements */ #define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */ #define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extension */ #define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ #define X86_FEATURE_MSR (0*32+ 5) /* Model Specific Registers RDMSR and WRMSR Instructions */ #define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extension */ #define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ #define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ #define X86_FEATURE_APIC (0*32+ 9) /* APIC On-Chip */ #define X86_FEATURE_SEP (0*32+11) /* SYSENTER and SYSEXIT Instructions */ #define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */ #define X86_FEATURE_PGE (0*32+13) /* PTE Global Bit */ #define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */ #define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ #define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ #define X86_FEATURE_PSE36 (0*32+17) /* 36-Bit Page Size Extension */ #define X86_FEATURE_PSN (0*32+18) /* Processor Serial Number */ #define X86_FEATURE_DS (0*32+21) /* Debug Store */ #define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */ #define X86_FEATURE_ACPI (0*32+22) /* Thermal Monitor and Software Controlled Clock Facilities */ #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ #define X86_FEATURE_XMM (0*32+25) /* "sse" */ #define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */ #define X86_FEATURE_SS (0*32+27) /* Self Snoop */ #define X86_FEATURE_HTT (0*32+28) /* Multi-Threading */ #define X86_FEATURE_TM (0*32+29) /* Thermal Monitor */ #define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ #define X86_FEATURE_SYSCALL (1*32+11) /* 
SYSCALL/SYSRET */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ #define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ #define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */ #define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */ #define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ #define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */ #define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit DS Area */ #define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */ #define X86_FEATURE_DSCPL (4*32+ 4) /* CPL Qualified Debug Store */ #define X86_FEATURE_VMX (4*32+ 5) /* Virtual Machine Extensions */ #define X86_FEATURE_SMX (4*32+ 6) /* Safer Mode Extensions */ #define X86_FEATURE_EST (4*32+ 7) /* Enhanced Intel SpeedStep technology */ #define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CNXTID (4*32+10) /* L1 Context ID */ #define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */ #define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR_UCTL (4*32+14) /* xTPR Update Control */ #define X86_FEATURE_PDCM (4*32+15) /* Perfmon and Debug Capability */ #define X86_FEATURE_PCID (4*32+17) /* Process-context identifiers */ #define X86_FEATURE_DCA (4*32+18) /* Ability to prefetch data from a memory mapped device */ #define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ #define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ #define X86_FEATURE_TSCDL (4*32+24) /* Local APIC timer supports one-shot operation using a TSC deadline value */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define X86_FEATURE_XSAVE (4*32+26) /* 
XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ #define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */ #define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ #define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ #define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* Supports RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 Foundation */ #define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ #define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_IPT (9*32+25) /* Intel Processor Trace */ #define X86_FEATURE_AVX512PF (9*32+26) 
/* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */ #define X86_FEATURE_SHA (9*32+29) /* Intel SHA extensions */ #define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 */ #define X86_FEATURE_AVXVL (9*32+31) /* AVX-512 */ #define X86_FEATURE_XSAVEOPT (10*32+0) /* XSAVEOPT */ #define X86_FEATURE_XSAVEC (10*32+1) /* XSAVEC */ #define X86_FEATURE_XGETBV1 (10*32+2) /* XGETBV with ECX = 1 */ #define X86_FEATURE_XSAVES (10*32+3) /* XSAVES/XRSTORS */ /* * Node 11 is our own, kernel has not such entry. */ #define X86_FEATURE_PREFETCHWT1 (11*32+0) /* The PREFETCHWT1 instruction */ enum { X86_VENDOR_INTEL = 0, X86_VENDOR_AMD = 1, X86_VENDOR_MAX }; struct cpuinfo_x86 { uint8_t x86_family; uint8_t x86_vendor; uint8_t x86_model; uint8_t x86_mask; uint32_t x86_capability[NCAPINTS]; uint32_t extended_cpuid_level; int cpuid_level; char x86_vendor_id[16]; char x86_model_id[64]; }; typedef struct cpuinfo_x86 compel_cpuinfo_t; #endif /* __CR_ASM_CPU_H__ */ criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/fpu.h000066400000000000000000000064211317335042600226150ustar00rootroot00000000000000#ifndef __CR_ASM_FPU_H__ #define __CR_ASM_FPU_H__ #include #include #include #include #define FP_MIN_ALIGN_BYTES 64 #define FP_XSTATE_MAGIC1 0x46505853U #define FP_XSTATE_MAGIC2 0x46505845U #define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 #define XSAVE_SIZE 832 struct fpx_sw_bytes { uint32_t magic1; uint32_t extended_size; uint64_t xstate_bv; uint32_t xstate_size; uint32_t padding[7]; }; struct i387_fxsave_struct { uint16_t cwd; /* Control Word */ uint16_t swd; /* Status Word */ uint16_t twd; /* Tag Word */ uint16_t fop; /* Last Instruction Opcode */ union { struct { uint64_t rip; /* Instruction Pointer */ uint64_t rdp; /* Data Pointer */ }; struct { uint32_t fip; /* FPU IP Offset */ 
uint32_t fcs; /* FPU IP Selector */ uint32_t foo; /* FPU Operand Offset */ uint32_t fos; /* FPU Operand Selector */ }; }; uint32_t mxcsr; /* MXCSR Register State */ uint32_t mxcsr_mask; /* MXCSR Mask */ /* 8*16 bytes for each FP-reg = 128 bytes */ uint32_t st_space[32]; /* 16*16 bytes for each XMM-reg = 256 bytes */ uint32_t xmm_space[64]; uint32_t padding[12]; union { uint32_t padding1[12]; uint32_t sw_reserved[12]; }; } __aligned(16); struct xsave_hdr_struct { uint64_t xstate_bv; uint64_t reserved1[2]; uint64_t reserved2[5]; } __packed; struct ymmh_struct { uint32_t ymmh_space[64]; } __packed; /* * cpu requires it to be 64 byte aligned */ struct xsave_struct { struct i387_fxsave_struct i387; struct xsave_hdr_struct xsave_hdr; struct ymmh_struct ymmh; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { struct i387_fxsave_struct i387; struct xsave_hdr_struct xsave_hdr; struct ymmh_struct ymmh; } __packed; typedef struct { /* * The FPU xsave area must be continious and FP_MIN_ALIGN_BYTES * aligned, thus make sure the compiler won't insert any hole here. */ union { struct xsave_struct xsave; uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; }; uint8_t has_fpu; } fpu_state_64_t; struct user_i387_ia32_struct { uint32_t cwd; /* FPU Control Word */ uint32_t swd; /* FPU Status Word */ uint32_t twd; /* FPU Tag Word */ uint32_t fip; /* FPU IP Offset */ uint32_t fcs; /* FPU IP Selector */ uint32_t foo; /* FPU Operand Pointer Offset */ uint32_t fos; /* FPU Operand Pointer Selector */ uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ } __packed; typedef struct { struct { struct user_i387_ia32_struct i387_ia32; /* Software status information [not touched by FSAVE]: */ uint32_t status; } __packed fregs_state; union { struct xsave_struct_ia32 xsave; uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; } __packed; } __packed fpu_state_ia32_t; /* * This one is used in restorer. 
*/ typedef struct { union { fpu_state_64_t fpu_state_64; fpu_state_ia32_t fpu_state_ia32; }; uint8_t has_fpu; } fpu_state_t; extern void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, struct i387_fxsave_struct *fxsave); #endif /* __CR_ASM_FPU_H__ */ criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h000066400000000000000000000054611317335042600244400ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_TYPES_H__ #define UAPI_COMPEL_ASM_TYPES_H__ #include #include #include #include #define SIGMAX 64 #define SIGMAX_OLD 31 typedef struct { uint64_t r15; uint64_t r14; uint64_t r13; uint64_t r12; uint64_t bp; uint64_t bx; uint64_t r11; uint64_t r10; uint64_t r9; uint64_t r8; uint64_t ax; uint64_t cx; uint64_t dx; uint64_t si; uint64_t di; uint64_t orig_ax; uint64_t ip; uint64_t cs; uint64_t flags; uint64_t sp; uint64_t ss; uint64_t fs_base; uint64_t gs_base; uint64_t ds; uint64_t es; uint64_t fs; uint64_t gs; } user_regs_struct64; typedef struct { uint32_t bx; uint32_t cx; uint32_t dx; uint32_t si; uint32_t di; uint32_t bp; uint32_t ax; uint32_t ds; uint32_t es; uint32_t fs; uint32_t gs; uint32_t orig_ax; uint32_t ip; uint32_t cs; uint32_t flags; uint32_t sp; uint32_t ss; } user_regs_struct32; /* * To be sure that we rely on inited reg->__is_native, this member * is (short int) instead of initial (bool). The right way to * check if regs are native or compat is to use user_regs_native() macro. * This should cost nothing, as *usually* sizeof(bool) == sizeof(short) */ typedef struct { union { user_regs_struct64 native; user_regs_struct32 compat; }; short __is_native; /* use user_regs_native macro to check it */ } user_regs_struct_t; #define NATIVE_MAGIC 0x0A #define COMPAT_MAGIC 0x0C static inline bool user_regs_native(user_regs_struct_t *pregs) { return pregs->__is_native == NATIVE_MAGIC; } #define get_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? 
\ ((pregs)->native.name) : \ ((pregs)->compat.name)) #define set_user_reg(pregs, name, val) \ ((user_regs_native(pregs)) ? \ ((pregs)->native.name = (val)) : \ ((pregs)->compat.name = (val))) #if 0 typedef struct { unsigned short cwd; unsigned short swd; unsigned short twd; /* Note this is not the same as the 32bit/x87/FSAVE twd */ unsigned short fop; u64 rip; u64 rdp; u32 mxcsr; u32 mxcsr_mask; u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ u32 padding[24]; } user_fpregs_struct_t; #endif typedef struct xsave_struct user_fpregs_struct_t; #define REG_RES(regs) get_user_reg(®s, ax) #define REG_IP(regs) get_user_reg(®s, ip) #define REG_SYSCALL_NR(regs) get_user_reg(®s, orig_ax) #define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) /* * For x86_32 __NR_mmap inside the kernel represents old_mmap system * call, but since we didn't use it yet lets go further and simply * define own alias for __NR_mmap2 which would allow us to unify code * between 32 and 64 bits version. 
*/ #define __NR32_mmap __NR32_mmap2 #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h000066400000000000000000000021701317335042600251310ustar00rootroot00000000000000#ifndef __CR_PROCESSOR_FLAGS_H__ #define __CR_PROCESSOR_FLAGS_H__ /* Taken from linux kernel headers */ /* * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ #define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ #define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ #define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ #define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ #define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ #define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ #define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ #define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ #define X86_EFLAGS_NT 0x00004000 /* Nested Task */ #define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ #define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ #define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ #define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ #endif /* __CR_PROCESSOR_FLAGS_H__ */ criu-3.6/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h000066400000000000000000000106511317335042600236200ustar00rootroot00000000000000#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ #include #include #include #include #define SIGFRAME_MAX_OFFSET 8 struct rt_sigcontext { unsigned long r8; unsigned long r9; unsigned long r10; unsigned long r11; unsigned long r12; unsigned long r13; unsigned long r14; unsigned long r15; unsigned long rdi; unsigned long rsi; unsigned long rbp; unsigned long rbx; unsigned long rdx; unsigned long rax; unsigned long rcx; unsigned long rsp; unsigned long rip; unsigned long 
eflags; unsigned short cs; unsigned short gs; unsigned short fs; unsigned short ss; unsigned long err; unsigned long trapno; unsigned long oldmask; unsigned long cr2; void *fpstate; unsigned long reserved1[8]; }; struct rt_sigcontext_32 { uint32_t gs; uint32_t fs; uint32_t es; uint32_t ds; uint32_t di; uint32_t si; uint32_t bp; uint32_t sp; uint32_t bx; uint32_t dx; uint32_t cx; uint32_t ax; uint32_t trapno; uint32_t err; uint32_t ip; uint32_t cs; uint32_t flags; uint32_t sp_at_signal; uint32_t ss; uint32_t fpstate; uint32_t oldmask; uint32_t cr2; }; #include /* * XXX: move declarations to generic sigframe.h or sigframe-compat.h * when (if) other architectures will support compatible C/R */ typedef uint32_t compat_uptr_t; typedef uint32_t compat_size_t; typedef struct compat_siginfo { int si_signo; int si_errno; int si_code; int _pad[128/sizeof(int) - 3]; } compat_siginfo_t; typedef struct compat_sigaltstack { compat_uptr_t ss_sp; int ss_flags; compat_size_t ss_size; } compat_stack_t; struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; struct rt_sigcontext_32 uc_mcontext; k_rtsigset_t uc_sigmask; /* mask last for extensibility */ } __packed; struct rt_sigframe_ia32 { uint32_t pretcode; int32_t sig; uint32_t pinfo; uint32_t puc; compat_siginfo_t info; struct ucontext_ia32 uc; char retcode[8]; /* fp state follows here */ fpu_state_t fpu_state; }; struct rt_sigframe_64 { char *pretcode; struct rt_ucontext uc; struct rt_siginfo info; /* fp state follows here */ fpu_state_t fpu_state; }; struct rt_sigframe { union { struct rt_sigframe_ia32 compat; struct rt_sigframe_64 native; }; bool is_native; }; #define RT_SIGFRAME_UC_SIGMASK(rt_sigframe) \ ((rt_sigframe->is_native) ? \ (&rt_sigframe->native.uc.uc_sigmask) : \ ((k_rtsigset_t *)(void *)&rt_sigframe->compat.uc.uc_sigmask)) #define RT_SIGFRAME_REGIP(rt_sigframe) \ ((rt_sigframe->is_native) ? 
\ (rt_sigframe)->native.uc.uc_mcontext.rip : \ (rt_sigframe)->compat.uc.uc_mcontext.ip) #define RT_SIGFRAME_FPU(rt_sigframe) \ ((rt_sigframe->is_native) ? \ (&(rt_sigframe)->native.fpu_state) : \ (&(rt_sigframe)->compat.fpu_state)) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (RT_SIGFRAME_FPU(rt_sigframe)->has_fpu) /* * Sigframe offset is different for native/compat tasks. * Offsets calculations one may see at kernel: * - compatible is in sys32_rt_sigreturn at arch/x86/ia32/ia32_signal.c * - native is in sys_rt_sigreturn at arch/x86/kernel/signal.c */ #define RT_SIGFRAME_OFFSET(rt_sigframe) (((rt_sigframe)->is_native) ? 8 : 4 ) #define USER32_CS 0x23 #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ "movq %0, %%rax \n" \ "movq %%rax, %%rsp \n" \ "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n" \ "syscall \n" \ : \ : "r"(new_sp) \ : "rax","rsp","memory") #define ARCH_RT_SIGRETURN_COMPAT(new_sp) \ asm volatile( \ "pushq $"__stringify(USER32_CS)" \n" \ "pushq $1f \n" \ "lretq \n" \ "1: \n" \ ".code32 \n" \ "movl %%edi, %%esp \n" \ "movl $"__stringify(__NR32_rt_sigreturn)",%%eax \n" \ "int $0x80 \n" \ ".code64 \n" \ : \ : "rdi"(new_sp) \ : "eax","esp", "r8", "r9", "r10", "r11", "memory") #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ do { \ if ((rt_sigframe)->is_native) \ ARCH_RT_SIGRETURN_NATIVE(new_sp); \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ criu-3.6/compel/arch/x86/src/lib/infect.c000066400000000000000000000321171317335042600201260ustar00rootroot00000000000000#include #include #include #include #include #include #include "asm/cpu.h" #include #include #include "errno.h" #include #include #include "common/err.h" #include "asm/infect-types.h" #include "ptrace.h" #include "infect.h" #include "infect-priv.h" #include "log.h" /* * Injected syscall instruction */ const char code_syscall[] = { 0x0f, 0x05, /* syscall 
*/ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; const char code_int_80[] = { 0xcd, 0x80, /* int $0x80 */ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; /* Blob sizes passed through round_up() with sizeof(long); each constant is derived from its own array so the build-time checks below really cover both blobs. */ static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static const int code_int_80_aligned = round_up(sizeof(code_int_80), sizeof(long)); /* Marked __always_unused: only hosts the BUILD_BUG_ON() checks that tie both blobs to BUILTIN_SYSCALL_SIZE. */ static inline __always_unused void __check_code_syscall(void) { BUILD_BUG_ON(code_int_80_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); BUILD_BUG_ON(!is_log2(sizeof(code_int_80))); } /* 10-byte legacy floating point register */ struct fpreg { uint16_t significand[4]; uint16_t exponent; }; /* 16-byte floating point register */ struct fpxreg { uint16_t significand[4]; uint16_t exponent; uint16_t padding[3]; }; #define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) #define FP_EXP_TAG_VALID 0 #define FP_EXP_TAG_ZERO 1 #define FP_EXP_TAG_SPECIAL 2 #define FP_EXP_TAG_EMPTY 3 /* Build a 32-bit i387-style tag word from the FXSAVE image: each of the 8 low bits of fxsave->twd expands to a 2-bit tag (EMPTY when the bit is clear, otherwise classified from the register's exponent and significand); the upper 16 bits of the result are always set. */ static inline uint32_t twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) { struct fpxreg *st; uint32_t tos = (fxsave->swd >> 11) & 7; uint32_t twd = (unsigned long)fxsave->twd; uint32_t tag; uint32_t ret = 0xffff0000u; int i; for (i = 0; i < 8; i++, twd >>= 1) { if (twd & 0x1) { st = FPREG_ADDR(fxsave, (i - tos) & 7); switch (st->exponent & 0x7fff) { case 0x7fff: tag = FP_EXP_TAG_SPECIAL; break; case 0x0000: if (!st->significand[0] && !st->significand[1] && !st->significand[2] && !st->significand[3]) tag = FP_EXP_TAG_ZERO; else tag = FP_EXP_TAG_SPECIAL; break; default: if (st->significand[3] & 0x8000) tag = FP_EXP_TAG_VALID; else tag = FP_EXP_TAG_SPECIAL; break; } } else { tag = FP_EXP_TAG_EMPTY; } ret |= tag << (2 * i); } return ret; } void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, struct i387_fxsave_struct *fxsave) { struct fpxreg *from = (struct fpxreg *)&fxsave->st_space[0]; struct fpreg *to = (struct fpreg *)&env->st_space[0]; int i; env->cwd = fxsave->cwd | 0xffff0000u; env->swd = 
fxsave->swd | 0xffff0000u; env->twd = twd_fxsr_to_i387(fxsave); env->fip = fxsave->rip; env->foo = fxsave->rdp; /* * should be actually ds/cs at fpu exception time, but * that information is not available in 64bit mode. */ env->fcs = 0x23; /* __USER32_CS */ env->fos = 0x2b; /* __USER32_DS */ env->fos |= 0xffff0000; for (i = 0; i < 8; ++i) memcpy(&to[i], &from[i], sizeof(to[0])); } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { bool is_native = user_regs_native(regs); fpu_state_t *fpu_state = is_native ? &sigframe->native.fpu_state : &sigframe->compat.fpu_state; if (is_native) { #define cpreg64_native(d, s) sigframe->native.uc.uc_mcontext.d = regs->native.s cpreg64_native(rdi, di); cpreg64_native(rsi, si); cpreg64_native(rbp, bp); cpreg64_native(rsp, sp); cpreg64_native(rbx, bx); cpreg64_native(rdx, dx); cpreg64_native(rcx, cx); cpreg64_native(rip, ip); cpreg64_native(rax, ax); cpreg64_native(r8, r8); cpreg64_native(r9, r9); cpreg64_native(r10, r10); cpreg64_native(r11, r11); cpreg64_native(r12, r12); cpreg64_native(r13, r13); cpreg64_native(r14, r14); cpreg64_native(r15, r15); cpreg64_native(cs, cs); cpreg64_native(eflags, flags); sigframe->is_native = true; #undef cpreg64_native } else { #define cpreg32_compat(d) sigframe->compat.uc.uc_mcontext.d = regs->compat.d cpreg32_compat(gs); cpreg32_compat(fs); cpreg32_compat(es); cpreg32_compat(ds); cpreg32_compat(di); cpreg32_compat(si); cpreg32_compat(bp); cpreg32_compat(sp); cpreg32_compat(bx); cpreg32_compat(dx); cpreg32_compat(cx); cpreg32_compat(ip); cpreg32_compat(ax); cpreg32_compat(cs); cpreg32_compat(ss); cpreg32_compat(flags); #undef cpreg32_compat sigframe->is_native = false; } fpu_state->has_fpu = true; if (is_native) { memcpy(&fpu_state->fpu_state_64.xsave, fpregs, sizeof(*fpregs)); } else { memcpy(&fpu_state->fpu_state_ia32.xsave, fpregs, sizeof(*fpregs)); compel_convert_from_fxsr(&fpu_state->fpu_state_ia32.fregs_state.i387_ia32, 
&fpu_state->fpu_state_ia32.xsave.i387); } return 0; } int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { fpu_state_t *fpu_state = (sigframe->is_native) ? &rsigframe->native.fpu_state : &rsigframe->compat.fpu_state; if (sigframe->is_native) { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; if ((addr % 64ul)) { pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); return -1; } sigframe->native.uc.uc_mcontext.fpstate = (void *)addr; } else if (!sigframe->is_native) { sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; } return 0; } #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : \ (int32_t)((pregs)->compat.name)) int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, void *arg) { user_fpregs_struct_t xsave = { }, *xs = NULL; struct iovec iov; int ret = -1; pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? "native" : "compat"); /* Did we come from a system call? */ if (get_signed_user_reg(regs, orig_ax) >= 0) { /* Restart the system call */ switch (get_signed_user_reg(regs, ax)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: set_user_reg(regs, ax, get_user_reg(regs, orig_ax)); set_user_reg(regs, ip, get_user_reg(regs, ip) - 2); break; case -ERESTART_RESTARTBLOCK: pr_warn("Will restore %d with interrupted system call\n", pid); set_user_reg(regs, ax, -EINTR); break; } } if (!compel_cpu_has_feature(X86_FEATURE_FPU)) goto out; /* * FPU fetched either via fxsave or via xsave, * thus decode it accrodingly. 
*/ pr_info("Dumping GP/FPU registers for %d\n", pid); if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { iov.iov_base = &xsave; iov.iov_len = sizeof(xsave); if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { pr_perror("Can't obtain FPU registers for %d", pid); goto err; } } else { if (ptrace(PTRACE_GETFPREGS, pid, NULL, &xsave)) { pr_perror("Can't obtain FPU registers for %d", pid); goto err; } } xs = &xsave; out: ret = save(arg, regs, xs); err: return ret; } int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; if (user_regs_native(®s)) { user_regs_struct64 *r = ®s.native; r->ax = (uint64_t)nr; r->di = arg1; r->si = arg2; r->dx = arg3; r->r10 = arg4; r->r8 = arg5; r->r9 = arg6; err = compel_execute_syscall(ctl, ®s, code_syscall); } else { user_regs_struct32 *r = ®s.compat; r->ax = (uint32_t)nr; r->bx = arg1; r->cx = arg2; r->dx = arg3; r->si = arg4; r->di = arg5; r->bp = arg6; err = compel_execute_syscall(ctl, ®s, code_int_80); } *ret = get_user_reg(®s, ax); return err; } void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; bool compat_task = !user_regs_native(&ctl->orig.regs); err = compel_syscall(ctl, __NR(mmap, compat_task), &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0) return NULL; if (IS_ERR_VALUE(map)) { if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC)) pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, " "check selinux execmem policy\n", ctl->rpid); return NULL; } return (void *)map; } /* * regs must be inited when calling this function from original context */ void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { set_user_reg(regs, ip, new_ip); if (stack) set_user_reg(regs, sp, (unsigned long) 
stack); /* Avoid end of syscall processing */ set_user_reg(regs, orig_ax, -1); /* Make sure flags are in known state */ set_user_reg(regs, flags, get_user_reg(regs, flags) & ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF)); } #define USER32_CS 0x23 #define USER_CS 0x33 static bool ldt_task_selectors(pid_t pid) { unsigned long cs; errno = 0; /* * Offset of register must be from 64-bit set even for * compatible tasks. Fix this to support native i386 tasks */ cs = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct64, cs), 0); if (errno != 0) { pr_perror("Can't get CS register for %d", pid); return -1; } return cs != USER_CS && cs != USER32_CS; } static int arch_task_compatible(pid_t pid) { user_regs_struct_t r; int ret = ptrace_get_regs(pid, &r); if (ret) return -1; return !user_regs_native(&r); } bool arch_can_dump_task(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; int ret; ret = arch_task_compatible(pid); if (ret < 0) return false; if (ret && !(ctl->ictx.flags & INFECT_COMPATIBLE)) { pr_err("Can't dump task %d running in 32-bit mode\n", pid); return false; } if (ldt_task_selectors(pid)) { pr_err("Can't dump task %d with LDT descriptors\n", pid); return false; } return true; } int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { int native = compel_mode_native(ctl); void *where = native ? (void *)&s->native.uc.uc_stack : (void *)&s->compat.uc.uc_stack; long ret; int err; err = compel_syscall(ctl, __NR(sigaltstack, !native), &ret, 0, (unsigned long)where, 0, 0, 0, 0); return err ? err : ret; } /* Copied from the gdb header gdb/nat/x86-dregs.h */ /* Debug registers' indices. */ #define DR_FIRSTADDR 0 #define DR_LASTADDR 3 #define DR_NADDR 4 /* The number of debug address registers. */ #define DR_STATUS 6 /* Index of debug status register (DR6). */ #define DR_CONTROL 7 /* Index of debug control register (DR7). */ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. 
*/ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ #define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */ /* Locally enable the break/watchpoint in the I'th debug register. */ #define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i))) int ptrace_set_breakpoint(pid_t pid, void *addr) { int ret; /* Set a breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_FIRSTADDR]), addr)) { pr_perror("Unable to setup a breakpoint into %d", pid); return -1; } /* Enable the breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { pr_perror("Unable to enable the breakpoint for %d", pid); return -1; } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); return -1; } return 1; } int ptrace_flush_breakpoints(pid_t pid) { /* Disable the breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), 0)) { pr_perror("Unable to disable the breakpoint for %d", pid); return -1; } return 0; } int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs) { struct iovec iov; int ret; iov.iov_base = ®s->native; iov.iov_len = sizeof(user_regs_struct64); ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); if (ret == -1) { pr_perror("PTRACE_GETREGSET failed"); return -1; } if (iov.iov_len == sizeof(regs->native)) { regs->__is_native = NATIVE_MAGIC; return ret; } if (iov.iov_len == sizeof(regs->compat)) { regs->__is_native = COMPAT_MAGIC; return ret; } pr_err("PTRACE_GETREGSET read %zu bytes for pid %d, but native/compat regs sizes are %zu/%zu bytes\n", iov.iov_len, pid, sizeof(regs->native), sizeof(regs->compat)); return -1; } int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs) { struct iovec iov; if (user_regs_native(regs)) { iov.iov_base = ®s->native; iov.iov_len = sizeof(user_regs_struct64); } else { iov.iov_base = 
®s->compat; iov.iov_len = sizeof(user_regs_struct32); } return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } #define TASK_SIZE ((1UL << 47) - PAGE_SIZE) /* * Task size may be limited to 3G but we need a * higher limit, because it's backward compatible. */ #define TASK_SIZE_IA32 (0xffffe000) unsigned long compel_task_size(void) { return TASK_SIZE; } criu-3.6/compel/compel-host000077500000000000000000000003641317335042600157720ustar00rootroot00000000000000#!/bin/sh # # A wrapper to use compel-host right from the source dir # (i.e. when it is not yet installed). COMPEL_UNINSTALLED_ROOTDIR=$(dirname "$0") export COMPEL_UNINSTALLED_ROOTDIR exec "${COMPEL_UNINSTALLED_ROOTDIR}/compel-host-bin" "$@" criu-3.6/compel/include/000077500000000000000000000000001317335042600152325ustar00rootroot00000000000000criu-3.6/compel/include/compel-cpu.h000066400000000000000000000005251317335042600174510ustar00rootroot00000000000000#ifndef __COMPEL_CPU_H__ #define __COMPEL_CPU_H__ #include #include "asm/cpu.h" extern void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); extern void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); extern int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); #endif criu-3.6/compel/include/elf32-types.h000066400000000000000000000005621317335042600174630ustar00rootroot00000000000000#ifndef COMPEL_ELF32_TYPES_H__ #define COMPEL_ELF32_TYPES_H__ #define Elf_Ehdr Elf32_Ehdr #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Rel Elf32_Rel #define Elf_Rela Elf32_Rela #define ELF_ST_TYPE ELF32_ST_TYPE #define ELF_ST_BIND ELF32_ST_BIND #define ELF_R_SYM ELF32_R_SYM #define ELF_R_TYPE ELF32_R_TYPE #endif /* COMPEL_ELF32_TYPES_H__ */ criu-3.6/compel/include/elf64-types.h000066400000000000000000000005621317335042600174700ustar00rootroot00000000000000#ifndef COMPEL_ELF64_TYPES_H__ #define COMPEL_ELF64_TYPES_H__ #define Elf_Ehdr Elf64_Ehdr #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define 
Elf_Rel Elf64_Rel #define Elf_Rela Elf64_Rela #define ELF_ST_TYPE ELF64_ST_TYPE #define ELF_ST_BIND ELF64_ST_BIND #define ELF_R_SYM ELF64_R_SYM #define ELF_R_TYPE ELF64_R_TYPE #endif /* COMPEL_ELF64_TYPES_H__ */ criu-3.6/compel/include/errno.h000066400000000000000000000003071317335042600165300ustar00rootroot00000000000000#ifndef __COMPEL_ERRNO_H__ #define __COMPEL_ERRNO_H__ #define ERESTARTSYS 512 #define ERESTARTNOINTR 513 #define ERESTARTNOHAND 514 #define ERESTART_RESTARTBLOCK 516 #endif /* __COMPEL_ERRNO_H__ */ criu-3.6/compel/include/infect-priv.h000066400000000000000000000040621317335042600176330ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_PRIV_H__ #define __COMPEL_INFECT_PRIV_H__ #include #define BUILTIN_SYSCALL_SIZE 8 struct thread_ctx { k_rtsigset_t sigmask; user_regs_struct_t regs; }; /* parasite control block */ struct parasite_ctl { int rpid; /* Real pid of the victim */ void *remote_map; void *local_map; void *sigreturn_addr; /* A place for the breakpoint */ unsigned long map_length; struct infect_ctx ictx; /* thread leader data */ bool daemonized; struct thread_ctx orig; void *rstack; /* thread leader stack */ struct rt_sigframe *sigframe; struct rt_sigframe *rsigframe; /* address in a parasite */ void *r_thread_stack; /* stack for non-leader threads */ unsigned long parasite_ip; /* service routine start ip */ unsigned int *addr_cmd; /* addr for command */ void *addr_args; /* address for arguments */ unsigned long args_size; int tsock; /* transport socket for transferring fds */ struct parasite_blob_desc pblob; }; struct parasite_thread_ctl { int tid; struct parasite_ctl *ctl; struct thread_ctx th; }; #define MEMFD_FNAME "CRIUMFD" #define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME) struct ctl_msg; int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m); extern void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs); extern void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, 
int fd, off_t offset); extern bool arch_can_dump_task(struct parasite_ctl *ctl); extern int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, void *arg); extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs); extern int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); extern int compel_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs, const char *code_syscall); #endif criu-3.6/compel/include/log.h000066400000000000000000000031101317335042600161570ustar00rootroot00000000000000#ifndef COMPEL_LOG_H__ #define COMPEL_LOG_H__ #include "uapi/compel/compel.h" #include "uapi/compel/loglevels.h" #ifndef LOG_PREFIX # define LOG_PREFIX #endif static inline int pr_quelled(unsigned int loglevel) { return compel_log_get_loglevel() < loglevel && loglevel != COMPEL_LOG_MSG; } extern void compel_print_on_level(unsigned int loglevel, const char *format, ...) __attribute__ ((__format__ (__printf__, 2, 3))); #define pr_msg(fmt, ...) \ compel_print_on_level(COMPEL_LOG_MSG, \ fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) \ compel_print_on_level(COMPEL_LOG_INFO, \ LOG_PREFIX fmt, ##__VA_ARGS__) #define pr_err(fmt, ...) \ compel_print_on_level(COMPEL_LOG_ERROR, \ "Error (%s:%d): " LOG_PREFIX fmt, \ __FILE__, __LINE__, ##__VA_ARGS__) #define pr_err_once(fmt, ...) \ do { \ static bool __printed; \ if (!__printed) { \ pr_err(fmt, ##__VA_ARGS__); \ __printed = 1; \ } \ } while (0) #define pr_warn(fmt, ...) \ compel_print_on_level(COMPEL_LOG_WARN, \ "Warn (%s:%d): " LOG_PREFIX fmt, \ __FILE__, __LINE__, ##__VA_ARGS__) #define pr_warn_once(fmt, ...) \ do { \ static bool __printed; \ if (!__printed) { \ pr_warn(fmt, ##__VA_ARGS__); \ __printed = 1; \ } \ } while (0) #define pr_debug(fmt, ...) 
\ compel_print_on_level(COMPEL_LOG_DEBUG, \ LOG_PREFIX fmt, ##__VA_ARGS__) #define pr_perror(fmt, ...) \ pr_err(fmt ": %m\n", ##__VA_ARGS__) #endif /* COMPEL_LOG_H__ */ criu-3.6/compel/include/piegen.h000066400000000000000000000007611317335042600166560ustar00rootroot00000000000000#ifndef COMPEL_PIEGEN_H__ #define COMPEL_PIEGEN_H__ #include #include #include #include "common/compiler.h" typedef struct { char *input_filename; char *output_filename; char *prefix; FILE *fout; } piegen_opt_t; extern piegen_opt_t opts; #define pr_out(fmt, ...) \ do { \ if (opts.fout) \ fprintf(opts.fout, fmt, ##__VA_ARGS__); \ } while (0) extern int handle_binary(void *mem, size_t size); #endif /* COMPEL_PIEGEN_H__ */ criu-3.6/compel/include/ptrace.h000066400000000000000000000005611317335042600166630ustar00rootroot00000000000000#ifndef COMPEL_PTRACE_H__ #define COMPEL_PTRACE_H__ #include #include #include #define PTRACE_SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); extern int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs); #endif /* COMPEL_PTRACE_H__ */ criu-3.6/compel/include/rpc-pie-priv.h000066400000000000000000000017631317335042600177270ustar00rootroot00000000000000#ifndef __COMPEL_RPC_H__ #define __COMPEL_RPC_H__ struct ctl_msg { uint32_t cmd; /* command itself */ uint32_t ack; /* ack on command */ int32_t err; /* error code on reply */ }; #define ctl_msg_cmd(_cmd) \ (struct ctl_msg){.cmd = _cmd, } #define ctl_msg_ack(_cmd, _err) \ (struct ctl_msg){.cmd = _cmd, .ack = _cmd, .err = _err, } /* * NOTE: each command's args should be arch-independed sized. * If you want to use one of the standard types, declare * alternative type for compatible tasks in parasite-compat.h */ enum { PARASITE_CMD_IDLE = 0, PARASITE_CMD_ACK, PARASITE_CMD_INIT_DAEMON, /* * This must be greater than INITs. 
*/ PARASITE_CMD_FINI, __PARASITE_END_CMDS, }; struct parasite_init_args { int32_t h_addr_len; struct sockaddr_un h_addr; int32_t log_level; uint64_t sigreturn_addr; uint64_t sigframe; /* pointer to sigframe */ futex_t daemon_connected; }; struct parasite_unmap_args { uint64_t parasite_start; uint64_t parasite_len; }; #endif criu-3.6/compel/include/shmem.h000066400000000000000000000003041317335042600165110ustar00rootroot00000000000000#ifndef __COMPEL_PLUGIN_SHMEM_PRIV_H__ #define __COMPEL_PLUGIN_SHMEM_PRIV_H__ struct shmem_plugin_msg { unsigned long start; unsigned long len; }; #endif /* __COMPEL_PLUGIN_SHMEM_PRIV_H__ */ criu-3.6/compel/include/uapi/000077500000000000000000000000001317335042600161705ustar00rootroot00000000000000criu-3.6/compel/include/uapi/asm000077700000000000000000000000001317335042600213222../asm/uapi/asmustar00rootroot00000000000000criu-3.6/compel/include/uapi/common000077700000000000000000000000001317335042600230752../../../include/commonustar00rootroot00000000000000criu-3.6/compel/include/uapi/compel000077700000000000000000000000001317335042600174452.ustar00rootroot00000000000000criu-3.6/compel/include/uapi/compel.h000066400000000000000000000004551317335042600176240ustar00rootroot00000000000000#ifndef UAPI_COMPEL_H__ #define UAPI_COMPEL_H__ #include #include #include #include #include #include #include #include #endif /* UAPI_COMPEL_H__ */ criu-3.6/compel/include/uapi/cpu.h000066400000000000000000000003711317335042600171310ustar00rootroot00000000000000#ifndef UAPI_COMPEL_CPU_H__ #define UAPI_COMPEL_CPU_H__ #include #include extern int compel_cpuid(compel_cpuinfo_t *info); extern bool compel_cpu_has_feature(unsigned int feature); #endif /* UAPI_COMPEL_CPU_H__ */ criu-3.6/compel/include/uapi/handle-elf.h000066400000000000000000000004471317335042600203450ustar00rootroot00000000000000#ifndef __COMPEL_UAPI_HANDLE_ELF__ #define __COMPEL_UAPI_HANDLE_ELF__ #define COMPEL_TYPE_INT (1u << 0) #define COMPEL_TYPE_LONG (1u << 1) #define 
COMPEL_TYPE_GOTPCREL (1u << 2) typedef struct { unsigned int offset; unsigned int type; long addend; long value; } compel_reloc_t; #endif criu-3.6/compel/include/uapi/infect-rpc.h000066400000000000000000000007221317335042600203740ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_RPC_H__ #define __COMPEL_INFECT_RPC_H__ #include #include #include struct parasite_ctl; extern int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); extern int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); extern int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); extern int compel_rpc_sock(struct parasite_ctl *ctl); #define PARASITE_USER_CMDS 64 #endif criu-3.6/compel/include/uapi/infect-util.h000066400000000000000000000003441317335042600205650ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_UTIL_H__ #define __COMPEL_INFECT_UTIL_H__ struct parasite_ctl; extern int compel_util_send_fd(struct parasite_ctl *ctl, int fd); extern int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd); #endif criu-3.6/compel/include/uapi/infect.h000066400000000000000000000115631317335042600176170ustar00rootroot00000000000000#ifndef __COMPEL_INFECT_H__ #define __COMPEL_INFECT_H__ #include #include #include #include #include #include #include "common/compiler.h" #define PARASITE_START_AREA_MIN (4096) extern int compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; unsigned long long shdpnd; char state; int ppid; int seccomp_mode; }; extern int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_task_status *, void *data), void (*free_status)(int pid, struct seize_task_status *, void *data), struct seize_task_status *st, void *data); extern int compel_stop_task(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); struct parasite_ctl; struct parasite_thread_ctl; extern struct parasite_ctl *compel_prepare(int pid); extern struct parasite_ctl *compel_prepare_noctx(int pid); extern 
int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); extern struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); extern int compel_stop_daemon(struct parasite_ctl *ctl); extern int compel_cure_remote(struct parasite_ctl *ctl); extern int compel_cure_local(struct parasite_ctl *ctl); extern int compel_cure(struct parasite_ctl *ctl); #define PARASITE_ARG_SIZE_MIN ( 1 << 12) #define compel_parasite_args(ctl, type) \ ({ \ void *___ret; \ BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \ ___ret = compel_parasite_args_p(ctl); \ ___ret; \ }) extern void *compel_parasite_args_p(struct parasite_ctl *ctl); extern void *compel_parasite_args_s(struct parasite_ctl *ctl, int args_size); extern int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6); extern int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); extern int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); /* * The PTRACE_SYSCALL will trap task twice -- on * enter into and on exit from syscall. If we trace * a single task, we may skip half of all getregs * calls -- on exit we don't need them. 
*/ enum trace_flags { TRACE_ALL, TRACE_ENTER, TRACE_EXIT, }; extern int compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat, enum trace_flags trace); extern int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); extern int compel_unmap(struct parasite_ctl *ctl, unsigned long addr); extern int compel_mode_native(struct parasite_ctl *ctl); extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) __attribute__ ((__format__ (__printf__, 3, 4))); struct infect_ctx { int sock; /* * Regs manipulation context. */ int (*save_regs)(void *, user_regs_struct_t *, user_fpregs_struct_t *); int (*make_sigframe)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); void *regs_arg; unsigned long task_size; unsigned long syscall_ip; /* entry point of infection */ unsigned long flags; /* fine-tune (e.g. faults) */ void (*child_handler)(int, siginfo_t *, void *); /* hander for SIGCHLD deaths */ struct sigaction orig_handler; open_proc_fn open_proc; int log_fd; /* fd for parasite code to send messages to */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); #define INFECT_NO_MEMFD 0x1 /* don't use memfd() */ #define INFECT_FAIL_CONNECT 0x2 /* make parasite connect() fail */ #define INFECT_NO_BREAKPOINTS 0x4 /* no breakpoints in pie tracking */ #define INFECT_COMPATIBLE 0x8 /* can run parasite inside compat tasks */ /* * There are several ways to describe a blob to compel * library. The simplest one derived from criu is to * provide it from .h files. 
*/ #define COMPEL_BLOB_CHEADER 0x1 struct parasite_blob_desc { unsigned parasite_type; union { struct { const void *mem; size_t bsize; size_t nr_gotpcrel; unsigned long parasite_ip_off; unsigned long addr_cmd_off; unsigned long addr_arg_off; compel_reloc_t *relocs; unsigned int nr_relocs; } hdr; }; }; extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *); typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); extern void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs); extern unsigned long compel_task_size(void); #endif criu-3.6/compel/include/uapi/ksigset.h000066400000000000000000000007761317335042600200240ustar00rootroot00000000000000#ifndef __COMPEL_KSIGSET_H__ #define __COMPEL_KSIGSET_H__ #include static inline void ksigfillset(k_rtsigset_t *set) { int i; for (i = 0; i < _KNSIG_WORDS; i++) set->sig[i] = (unsigned long)-1; } static inline void ksigemptyset(k_rtsigset_t *set) { int i; for (i = 0; i < _KNSIG_WORDS; i++) set->sig[i] = 0; } static inline void ksigaddset(k_rtsigset_t *set, int _sig) { int sig = _sig - 1; set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } #endif criu-3.6/compel/include/uapi/log.h000066400000000000000000000005031317335042600171200ustar00rootroot00000000000000#ifndef __COMPEL_UAPI_LOG_H__ #define __COMPEL_UAPI_LOG_H__ #include #include typedef void (*compel_log_fn)(unsigned int lvl, const char *fmt, va_list parms); extern void compel_log_init(compel_log_fn log_fn, unsigned int level); extern unsigned int compel_log_get_loglevel(void); #endif criu-3.6/compel/include/uapi/loglevels.h000066400000000000000000000010371317335042600203360ustar00rootroot00000000000000#ifndef UAPI_COMPEL_LOGLEVELS_H__ #define UAPI_COMPEL_LOGLEVELS_H__ /* * Log levels used by compel itself (see compel_log_init()), * also by log functions in the std plugin. 
*/ enum __compel_log_levels { COMPEL_LOG_MSG, /* Print message regardless of log level */ COMPEL_LOG_ERROR, /* Errors only, when we're in trouble */ COMPEL_LOG_WARN, /* Warnings */ COMPEL_LOG_INFO, /* Informative, everything is fine */ COMPEL_LOG_DEBUG, /* Debug only */ COMPEL_DEFAULT_LOGLEVEL = COMPEL_LOG_WARN }; #endif /* UAPI_COMPEL_LOGLEVELS_H__ */ criu-3.6/compel/include/uapi/plugins000077700000000000000000000000001317335042600242022../../plugins/include/uapiustar00rootroot00000000000000criu-3.6/compel/include/uapi/plugins.h000066400000000000000000000016731317335042600200310ustar00rootroot00000000000000#ifndef UAPI_COMPEL_PLUGIN_H__ #define UAPI_COMPEL_PLUGIN_H__ #define __init __attribute__((__used__)) __attribute__ ((__section__(".compel.init"))) #define __exit __attribute__((__used__)) __attribute__ ((__section__(".compel.exit"))) #ifndef __ASSEMBLY__ typedef struct { const char *name; int (*init)(void); void (*exit)(void); } plugin_init_t; #define plugin_register(___desc) \ static const plugin_init_t * const \ ___ptr__##___desc __init = &___desc; #define PLUGIN_REGISTER(___id, ___name, ___init, ___exit) \ static const plugin_init_t __plugin_desc_##___id = { \ .name = ___name, \ .init = ___init, \ .exit = ___exit, \ }; \ plugin_register(__plugin_desc_##___id); #define PLUGIN_REGISTER_DUMMY(___id) \ static const plugin_init_t __plugin_desc_##___id = { \ .name = #___id, \ }; \ plugin_register(__plugin_desc_##___id); #endif /* __ASSEMBLY__ */ #endif /* UAPI_COMPEL_PLUGIN_H__ */ criu-3.6/compel/include/uapi/ptrace.h000066400000000000000000000032741317335042600176250ustar00rootroot00000000000000#ifndef UAPI_COMPEL_PTRACE_H__ #define UAPI_COMPEL_PTRACE_H__ /* * We'd want to include both sys/ptrace.h and linux/ptrace.h, * hoping that most definitions come from either one or another. * Alas, on Alpine/musl both files declare struct ptrace_peeksiginfo_args, * so there is no way they can be used together. Let's rely on libc one. 
*/ #include #include /* * Some constants for ptrace that might be missing from the * standard library includes due to being (relatively) new. */ #ifndef PTRACE_SEIZE # define PTRACE_SEIZE 0x4206 #endif #ifndef PTRACE_O_SUSPEND_SECCOMP # define PTRACE_O_SUSPEND_SECCOMP (1 << 21) #endif #ifndef PTRACE_INTERRUPT # define PTRACE_INTERRUPT 0x4207 #endif #ifndef PTRACE_PEEKSIGINFO #define PTRACE_PEEKSIGINFO 0x4209 /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) #endif #ifndef PTRACE_GETREGSET # define PTRACE_GETREGSET 0x4204 # define PTRACE_SETREGSET 0x4205 #endif #ifndef PTRACE_GETSIGMASK # define PTRACE_GETSIGMASK 0x420a # define PTRACE_SETSIGMASK 0x420b #endif #ifndef PTRACE_SECCOMP_GET_FILTER #define PTRACE_SECCOMP_GET_FILTER 0x420c #endif #ifdef PTRACE_EVENT_STOP # if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ # undef PTRACE_EVENT_STOP # endif #endif #ifndef PTRACE_EVENT_STOP # define PTRACE_EVENT_STOP 128 #endif extern int ptrace_suspend_seccomp(pid_t pid); extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); #endif /* UAPI_COMPEL_PTRACE_H__ */ criu-3.6/compel/include/uapi/sigframe-common.h000066400000000000000000000031341317335042600214250ustar00rootroot00000000000000/* * Don't include it directly but use "arch-sigframe.h" instead. 
*/ #ifndef UAPI_COMPEL_SIGFRAME_COMMON_H__ #define UAPI_COMPEL_SIGFRAME_COMMON_H__ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ # error "Direct inclusion is forbidden, use instead" #endif #include #include struct rt_sigframe; #ifndef SIGFRAME_MAX_OFFSET # define SIGFRAME_MAX_OFFSET RT_SIGFRAME_OFFSET(0) #endif #define RESTORE_STACK_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) /* sigframe should be aligned on 64 byte for x86 and 8 bytes for arm */ #define RESTORE_STACK_SIGFRAME \ RESTORE_STACK_ALIGN(sizeof(struct rt_sigframe) + SIGFRAME_MAX_OFFSET, 64) #ifndef __ARCH_SI_PREAMBLE_SIZE # define __ARCH_SI_PREAMBLE_SIZE (3 * sizeof(int)) #endif #define SI_MAX_SIZE 128 #ifndef SI_PAD_SIZE # define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) #endif typedef struct rt_siginfo { int si_signo; int si_errno; int si_code; int _pad[SI_PAD_SIZE]; } rt_siginfo_t; typedef struct rt_sigaltstack { void *ss_sp; int ss_flags; size_t ss_size; } rt_stack_t; struct rt_ucontext { unsigned long uc_flags; struct rt_ucontext *uc_link; rt_stack_t uc_stack; struct rt_sigcontext uc_mcontext; k_rtsigset_t uc_sigmask; /* mask last for extensibility */ int __unused[32 - (sizeof (k_rtsigset_t) / sizeof (int))]; unsigned long uc_regspace[128] __attribute__((aligned(8))); }; extern int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, struct rt_sigframe *rframe); #endif /* UAPI_COMPEL_SIGFRAME_COMMON_H__ */ criu-3.6/compel/include/uapi/task-state.h000066400000000000000000000007101317335042600204170ustar00rootroot00000000000000#ifndef __COMPEL_UAPI_TASK_STATE_H__ #define __COMPEL_UAPI_TASK_STATE_H__ /* * Task state, as returned by compel_wait_task() * and used in arguments to compel_resume_task(). */ enum __compel_task_state { COMPEL_TASK_ALIVE = 0x01, COMPEL_TASK_DEAD = 0x02, COMPEL_TASK_STOPPED = 0x03, COMPEL_TASK_ZOMBIE = 0x06, /* Don't ever change the above values, they are used by CRIU! 
*/ COMPEL_TASK_MAX = 0x7f }; #endif /* __COMPEL_UAPI_TASK_STATE_H__ */ criu-3.6/compel/plugins/000077500000000000000000000000001317335042600152705ustar00rootroot00000000000000criu-3.6/compel/plugins/Makefile000066400000000000000000000056201317335042600167330ustar00rootroot00000000000000CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) CFLAGS += -DCR_NOGLIBC -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 PLUGIN_ARCH_DIR := compel/arch/$(ARCH)/plugins # # CFLAGS, ASFLAGS, LDFLAGS # Required for pie code ccflags-y += $(CFLAGS_PIE) # UAPI inclusion, referred as ccflags-y += -I compel/include/uapi asflags-y += -I compel/include/uapi # General compel includes ccflags-y += -iquote compel/include ccflags-y += -fpie -fno-stack-protector # General compel/plugins includes ccflags-y += -iquote $(obj)/include asflags-y += -iquote $(obj)/include # Arch compel/plugins includes ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/include asflags-y += -iquote $(PLUGIN_ARCH_DIR)/include asflags-y += -iquote $(PLUGIN_ARCH_DIR) # General flags for assembly asflags-y += -fpie -Wstrict-prototypes asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer asflags-y += -fno-stack-protector ldflags-y += -z noexecstack # # Shmem plugin target += shmem shmem-lib-y += shmem/shmem.o # # STD plugin target += std std-lib-y += std/std.o std-lib-y += std/fds.o std-lib-y += std/log.o std-lib-y += std/string.o std-lib-y += std/infect.o std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/parasite-head.o # # FDS plugin target += fds fds-lib-y += fds/fds.o ifeq ($(SRCARCH),x86) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o endif ifeq ($(SRCARCH),ppc64) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcmp.o endif include ./$(PLUGIN_ARCH_DIR)/std/syscalls/Makefile.syscalls define syscall-priority $(addprefix $(obj)/,$($(1):%.o=%.d)): | $($(2)) $(addprefix $(obj)/,$($(1):%.o=%.i)): | $($(2)) $(addprefix $(obj)/,$($(1):%.o=%.s)): 
| $($(2)) $(addprefix $(obj)/,$($(1))): | $($(2)) endef # # Almost all plugins depen on syscall headers # and definitions so we have to order their # generation manually. $(foreach t,$(target),$(eval $(call syscall-priority,$(t)-lib-y,std-headers-deps))) # # FIXME syscall-types.h should be setup earlier # install: compel/plugins/std.lib.a compel/plugins/fds.lib.a $(E) " INSTALL " compel plugins $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/compel/ $(Q) install -m 0644 $^ $(DESTDIR)$(LIBEXECDIR)/compel/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/compel/scripts $(Q) install -m 0644 compel/arch/$(ARCH)/scripts/compel-pack.lds.S $(DESTDIR)$(LIBEXECDIR)/compel/scripts $(E) " INSTALL " compel plugins uapi $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/compel/plugins $(Q) cp -frL compel/plugins/include/uapi/* $(DESTDIR)$(INCLUDEDIR)/compel/plugins/ .PHONY: install uninstall: $(E) " UNINSTALL" compel plugins $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/compel/,*.lib.a) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/compel/scripts/,compel-pack.lds.S) $(E) " UNINSTALL" compel and plugins uapi $(Q) $(RM) -rf $(addprefix $(DESTDIR)$(INCLUDEDIR)/,compel/plugins) .PHONY: uninstall criu-3.6/compel/plugins/fds/000077500000000000000000000000001317335042600160445ustar00rootroot00000000000000criu-3.6/compel/plugins/fds/fds.c000066400000000000000000000006561317335042600167730ustar00rootroot00000000000000#include #include "uapi/plugins.h" #include "uapi/plugins/std.h" #include #define pr_err(fmt, ...) 
#include "common/compiler.h" #include "common/bug.h" #define __sys(foo) sys_##foo #define __sys_err(ret) ret #include "common/scm.h" int fds_send_fd(int fd) { return send_fd(parasite_get_rpc_sock(), NULL, 0, fd); } int fds_recv_fd(void) { return recv_fd(parasite_get_rpc_sock()); } criu-3.6/compel/plugins/include/000077500000000000000000000000001317335042600167135ustar00rootroot00000000000000criu-3.6/compel/plugins/include/std-priv.h000066400000000000000000000002251317335042600206330ustar00rootroot00000000000000#ifndef __COMPEL_PLUGIN_STD_PRIV_H__ #define __COMPEL_PLUGIN_STD_PRIV_H__ extern int std_ctl_sock(void); #endif /* __COMPEL_PLUGIN_STD_PRIV_H__ */ criu-3.6/compel/plugins/include/uapi/000077500000000000000000000000001317335042600176515ustar00rootroot00000000000000criu-3.6/compel/plugins/include/uapi/plugin-fds.h000066400000000000000000000002531317335042600220720ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_STD_H__ #define COMPEL_PLUGIN_STD_STD_H__ extern int fds_send_fd(int fd); extern int fds_recv_fd(void); #endif /* COMPEL_PLUGIN_STD_STD_H__ */ criu-3.6/compel/plugins/include/uapi/shmem.h000066400000000000000000000007271317335042600211410ustar00rootroot00000000000000#ifndef __COMPEL_PLUGIN_SHMEM_H__ #define __COMPEL_PLUGIN_SHMEM_H__ /* * Creates local shmem mapping and announces it * to the peer. Peer can later "receive" one. The * local area should be munmap()-ed at the end. */ extern void *shmem_create(unsigned long size); /* * "Receives" shmem from peer and maps it. 
The * locally mapped area should be munmap()-ed at * the end */ extern void *shmem_receive(unsigned long *size); #endif /* __COMPEL_PLUGIN_SHMEM_H__ */ criu-3.6/compel/plugins/include/uapi/std.h000066400000000000000000000005071317335042600206160ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_STD_H__ #define COMPEL_PLUGIN_STD_STD_H__ #include #include #include #include #include #include #endif /* COMPEL_PLUGIN_STD_STD_H__ */ criu-3.6/compel/plugins/include/uapi/std/000077500000000000000000000000001317335042600204435ustar00rootroot00000000000000criu-3.6/compel/plugins/include/uapi/std/asm/000077500000000000000000000000001317335042600212235ustar00rootroot00000000000000criu-3.6/compel/plugins/include/uapi/std/asm/.gitignore000066400000000000000000000000001317335042600232010ustar00rootroot00000000000000criu-3.6/compel/plugins/include/uapi/std/fds.h000066400000000000000000000002401317335042600213640ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_FDS_H__ #define COMPEL_PLUGIN_STD_FDS_H__ #include #include #endif /* COMPEL_PLUGIN_STD_FDS_H__ */ criu-3.6/compel/plugins/include/uapi/std/infect.h000066400000000000000000000010301317335042600220560ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_INFECT_H__ #define COMPEL_PLUGIN_STD_INFECT_H__ extern int parasite_get_rpc_sock(void); extern int parasite_service(unsigned int cmd, void *args); /* * Must be supplied by user plugins. */ extern int parasite_daemon_cmd(int cmd, void *args); extern int parasite_trap_cmd(int cmd, void *args); extern void parasite_cleanup(void); /* * FIXME: Should be supplied by log module. 
*/ extern void log_set_fd(int fd); extern void log_set_loglevel(unsigned int level); #endif /* COMPEL_PLUGIN_STD_INFECT_H__ */ criu-3.6/compel/plugins/include/uapi/std/log.h000066400000000000000000000011421317335042600213730ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_LOG_H__ #define COMPEL_PLUGIN_STD_LOG_H__ #define STD_LOG_SIMPLE_CHUNK 79 extern void std_log_set_fd(int fd); extern void std_log_set_loglevel(unsigned int level); extern void std_log_set_start(struct timeval *tv); extern int std_vprint_num(char *buf, int blen, int num, char **ps); extern void std_sprintf(char output[STD_LOG_SIMPLE_CHUNK], const char *format, ...) __attribute__ ((__format__ (__printf__, 2, 3))); extern void print_on_level(unsigned int loglevel, const char *format, ...) __attribute__ ((__format__ (__printf__, 2, 3))); #endif /* COMPEL_PLUGIN_STD_LOG_H__ */ criu-3.6/compel/plugins/include/uapi/std/string.h000066400000000000000000000022251317335042600221230ustar00rootroot00000000000000#ifndef COMPEL_PLUGIN_STD_STRING_H__ #define COMPEL_PLUGIN_STD_STRING_H__ #include #include #include /* Standard file descriptors. */ #define STDIN_FILENO 0 /* Standard input. */ #define STDOUT_FILENO 1 /* Standard output. */ #define STDERR_FILENO 2 /* Standard error output. */ extern void std_dputc(int fd, char c); extern void std_dputs(int fd, const char *s); extern void std_vdprintf(int fd, const char *format, va_list args); extern void std_dprintf(int fd, const char *format, ...) __attribute__ ((__format__ (__printf__, 2, 3))); #define std_printf(fmt, ...) 
std_dprintf(STDOUT_FILENO, fmt, ##__VA_ARGS__) #define std_puts(s) std_dputs(STDOUT_FILENO, s) #define std_putchar(c) std_dputc(STDOUT_FILENO, c) extern unsigned long std_strtoul(const char *nptr, char **endptr, int base); extern int std_strcmp(const char *cs, const char *ct); extern int std_strncmp(const char *cs, const char *ct, size_t n); extern void *memcpy(void *dest, const void *src, size_t n); extern int memcmp(const void *s1, const void *s2, size_t n); extern void *memset(void *s, int c, size_t n); #endif /* COMPEL_PLUGIN_STD_STRING_H__ */ criu-3.6/compel/plugins/include/uapi/std/syscall-types.h000066400000000000000000000016201317335042600234270ustar00rootroot00000000000000/* * Please add here type definitions if * syscall prototypes need them. */ #ifndef COMPEL_SYSCALL_TYPES_H__ #define COMPEL_SYSCALL_TYPES_H__ #include #include #include #include #include #include #include #include struct cap_header { uint32_t version; int pid; }; struct cap_data { uint32_t eff; uint32_t prm; uint32_t inh; }; struct robust_list_head; struct file_handle; struct itimerspec; struct io_event; struct sockaddr; struct timespec; struct siginfo; struct msghdr; struct rusage; struct iocb; typedef unsigned long aio_context_t; #ifndef F_GETFD # define F_GETFD 1 #endif struct krlimit { unsigned long rlim_cur; unsigned long rlim_max; }; /* Type of timers in the kernel. 
*/ typedef int kernel_timer_t; #include #endif /* COMPEL_SYSCALL_TYPES_H__ */ criu-3.6/compel/plugins/shmem/000077500000000000000000000000001317335042600164015ustar00rootroot00000000000000criu-3.6/compel/plugins/shmem/shmem.c000066400000000000000000000013201317335042600176520ustar00rootroot00000000000000#include #include #include #include #include "shmem.h" #include "std-priv.h" void *shmem_create(unsigned long size) { int ret; void *mem; struct shmem_plugin_msg spi; mem = (void *)sys_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0); if (mem == MAP_FAILED) return NULL; spi.start = (unsigned long)mem; spi.len = size; ret = sys_write(std_ctl_sock(), &spi, sizeof(spi)); if (ret != sizeof(spi)) { sys_munmap(mem, size); return NULL; } return mem; } void *shmem_receive(unsigned long *size) { /* master -> parasite not implemented yet */ return NULL; } PLUGIN_REGISTER_DUMMY(shmem) criu-3.6/compel/plugins/std/000077500000000000000000000000001317335042600160625ustar00rootroot00000000000000criu-3.6/compel/plugins/std/fds.c000066400000000000000000000004161317335042600170030ustar00rootroot00000000000000#include #include #include #include "std-priv.h" #define pr_err(fmt, ...) #include "common/compiler.h" #include "common/bug.h" #define __sys(foo) sys_##foo #define __sys_err(ret) ret #include "common/scm-code.c" criu-3.6/compel/plugins/std/infect.c000066400000000000000000000070671317335042600175100ustar00rootroot00000000000000#include #include "common/scm.h" #include "common/compiler.h" #include "common/lock.h" #define pr_err(fmt, ...) print_on_level(1, fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) print_on_level(3, fmt, ##__VA_ARGS__) #define pr_debug(fmt, ...) 
print_on_level(4, fmt, ##__VA_ARGS__) #include "common/bug.h" #include "uapi/compel/asm/sigframe.h" #include "uapi/compel/infect-rpc.h" #include "rpc-pie-priv.h" static int tsock = -1; static struct rt_sigframe *sigframe; int parasite_get_rpc_sock(void) { return tsock; } /* RPC helpers */ static int __parasite_daemon_reply_ack(unsigned int cmd, int err) { struct ctl_msg m; int ret; m = ctl_msg_ack(cmd, err); ret = sys_sendto(tsock, &m, sizeof(m), 0, NULL, 0); if (ret != sizeof(m)) { pr_err("Sent only %d bytes while %zu expected\n", ret, sizeof(m)); return -1; } pr_debug("__sent ack msg: %d %d %d\n", m.cmd, m.ack, m.err); return 0; } static int __parasite_daemon_wait_msg(struct ctl_msg *m) { int ret; pr_debug("Daemon waits for command\n"); while (1) { *m = (struct ctl_msg){ }; ret = sys_recvfrom(tsock, m, sizeof(*m), MSG_WAITALL, NULL, 0); if (ret != sizeof(*m)) { pr_err("Trimmed message received (%d/%d)\n", (int)sizeof(*m), ret); return -1; } pr_debug("__fetched msg: %d %d %d\n", m->cmd, m->ack, m->err); return 0; } return -1; } /* Core infect code */ static noinline void fini_sigreturn(unsigned long new_sp) { ARCH_RT_SIGRETURN(new_sp, sigframe); } static int fini(void) { unsigned long new_sp; parasite_cleanup(); new_sp = (long)sigframe + RT_SIGFRAME_OFFSET(sigframe); pr_debug("%ld: new_sp=%lx ip %lx\n", sys_gettid(), new_sp, RT_SIGFRAME_REGIP(sigframe)); sys_close(tsock); std_log_set_fd(-1); fini_sigreturn(new_sp); BUG(); return -1; } static noinline __used int noinline parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; pr_debug("Running daemon thread leader\n"); /* Reply we're alive */ if (__parasite_daemon_reply_ack(PARASITE_CMD_INIT_DAEMON, 0)) goto out; ret = 0; while (1) { if (__parasite_daemon_wait_msg(&m)) break; if (ret && m.cmd != PARASITE_CMD_FINI) { pr_err("Command rejected\n"); continue; } if (m.cmd == PARASITE_CMD_FINI) goto out; ret = parasite_daemon_cmd(m.cmd, args); if (__parasite_daemon_reply_ack(m.cmd, ret)) break; if (ret) { 
pr_err("Close the control socket for writing\n"); sys_shutdown(tsock, SHUT_WR); } } out: fini(); return 0; } static noinline __used int parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; args->sigreturn_addr = (uint64_t)(uintptr_t)fini_sigreturn; sigframe = (void*)(uintptr_t)args->sigframe; ret = tsock = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0); if (tsock < 0) { pr_err("Can't create socket: %d\n", tsock); goto err; } ret = sys_connect(tsock, (struct sockaddr *)&args->h_addr, args->h_addr_len); if (ret < 0) { pr_err("Can't connect the control socket\n"); goto err; } futex_set_and_wake(&args->daemon_connected, 1); ret = recv_fd(tsock); if (ret >= 0) { std_log_set_fd(ret); std_log_set_loglevel(args->log_level); ret = 0; } else goto err; parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); fini(); BUG(); return -1; } #ifndef __parasite_entry # define __parasite_entry #endif int __used __parasite_entry parasite_service(unsigned int cmd, void *args) { pr_info("Parasite cmd %d/%x process\n", cmd, cmd); if (cmd == PARASITE_CMD_INIT_DAEMON) return parasite_init_daemon(args); return parasite_trap_cmd(cmd, args); } criu-3.6/compel/plugins/std/log.c000066400000000000000000000142051317335042600170110ustar00rootroot00000000000000#include #include "common/bitsperlong.h" #include #include #include #include struct simple_buf { char buf[STD_LOG_SIMPLE_CHUNK]; char *bp; int prefix_len; void (*flush)(struct simple_buf *b); }; static int logfd = -1; static int cur_loglevel = COMPEL_DEFAULT_LOGLEVEL; static struct timeval start; static void sbuf_log_flush(struct simple_buf *b); static inline void timediff(struct timeval *from, struct timeval *to) { to->tv_sec -= from->tv_sec; if (to->tv_usec >= from->tv_usec) to->tv_usec -= from->tv_usec; else { to->tv_sec--; to->tv_usec += 1000000 - from->tv_usec; } } static inline void pad_num(char **s, int *n, int nr) { while (*n < nr) { (*s)--; (*n)++; **s = '0'; } } static void 
sbuf_log_init(struct simple_buf *b) { char pbuf[12], *s; int n; /* * Format: * * (time)pie: pid: string-itself */ b->bp = b->buf; if (start.tv_sec != 0) { struct timeval now; sys_gettimeofday(&now, NULL); timediff(&start, &now); /* Seconds */ n = std_vprint_num(pbuf, sizeof(pbuf), (unsigned)now.tv_sec, &s); pad_num(&s, &n, 2); b->bp[0] = '('; memcpy(b->bp + 1, s, n); b->bp[n + 1] = '.'; b->bp += n + 2; /* Mu-seconds */ n = std_vprint_num(pbuf, sizeof(pbuf), (unsigned)now.tv_usec, &s); pad_num(&s, &n, 6); memcpy(b->bp, s, n); b->bp[n] = ')'; b->bp += n + 1; } n = std_vprint_num(pbuf, sizeof(pbuf), sys_gettid(), &s); b->bp[0] = 'p'; b->bp[1] = 'i'; b->bp[2] = 'e'; b->bp[3] = ':'; b->bp[4] = ' '; memcpy(b->bp + 5, s, n); b->bp[n + 5] = ':'; b->bp[n + 6] = ' '; b->bp += n + 7; b->prefix_len = b->bp - b->buf; b->flush = sbuf_log_flush; } static void sbuf_log_flush(struct simple_buf *b) { if (b->bp == b->buf + b->prefix_len) return; sys_write(logfd, b->buf, b->bp - b->buf); b->bp = b->buf + b->prefix_len; } static void sbuf_putc(struct simple_buf *b, char c) { /* TODO: maybe some warning or error here? 
*/ if (b->bp - b->buf >= STD_LOG_SIMPLE_CHUNK) return; *b->bp = c; b->bp++; if (b->bp - b->buf >= STD_LOG_SIMPLE_CHUNK - 2) { b->bp[0] = '>'; b->bp[1] = '\n'; b->bp += 2; if (b->flush) b->flush(b); } } void std_log_set_fd(int fd) { sys_close(logfd); logfd = fd; } void std_log_set_loglevel(unsigned int level) { cur_loglevel = level; } void std_log_set_start(struct timeval *s) { start = *s; } static void print_string(const char *msg, struct simple_buf *b) { while (*msg) { sbuf_putc(b, *msg); msg++; } } int std_vprint_num(char *buf, int blen, int num, char **ps) { int neg = 0; char *s; s = &buf[blen - 1]; if (num < 0) { neg = 1; num = -num; } else if (num == 0) { *s = '0'; s--; goto done; } while (num > 0) { *s = (num % 10) + '0'; s--; num /= 10; } if (neg) { *s = '-'; s--; } done: s++; *ps = s; return blen - (s - buf); } static void print_num(int num, struct simple_buf *b) { char buf[12], *s; buf[11] = '\0'; std_vprint_num(buf, sizeof(buf) - 1, num, &s); print_string(s, b); } static void print_num_l(long num, struct simple_buf *b) { int neg = 0; char buf[22], *s; buf[21] = '\0'; s = &buf[20]; if (num < 0) { neg = 1; num = -num; } else if (num == 0) { *s = '0'; s--; goto done; } while (num > 0) { *s = (num % 10) + '0'; s--; num /= 10; } if (neg) { *s = '-'; s--; } done: s++; print_string(s, b); } static void hexdigit(unsigned int v, char *to, char **z) { *to = "0123456789abcdef"[v & 0xf]; if (*to != '0') *z = to; } static void print_hex(unsigned int num, struct simple_buf *b) { char buf[11], *z = &buf[9]; buf[10] = '\0'; hexdigit(num >> 0, &buf[9], &z); hexdigit(num >> 4, &buf[8], &z); hexdigit(num >> 8, &buf[7], &z); hexdigit(num >> 12, &buf[6], &z); hexdigit(num >> 16, &buf[5], &z); hexdigit(num >> 20, &buf[4], &z); hexdigit(num >> 24, &buf[3], &z); hexdigit(num >> 28, &buf[2], &z); z -= 2; z[0] = '0'; z[1] = 'x'; print_string(z, b); } static void print_hex_l(unsigned long num, struct simple_buf *b) { char buf[19], *z = &buf[17]; buf[18] = '\0'; hexdigit(num >> 0, 
&buf[17], &z); hexdigit(num >> 4, &buf[16], &z); hexdigit(num >> 8, &buf[15], &z); hexdigit(num >> 12, &buf[14], &z); hexdigit(num >> 16, &buf[13], &z); hexdigit(num >> 20, &buf[12], &z); hexdigit(num >> 24, &buf[11], &z); hexdigit(num >> 28, &buf[10], &z); #if BITS_PER_LONG == 64 hexdigit(num >> 32, &buf[9], &z); hexdigit(num >> 36, &buf[8], &z); hexdigit(num >> 40, &buf[7], &z); hexdigit(num >> 44, &buf[6], &z); hexdigit(num >> 48, &buf[5], &z); hexdigit(num >> 52, &buf[4], &z); hexdigit(num >> 56, &buf[3], &z); hexdigit(num >> 60, &buf[2], &z); #endif z -= 2; z[0] = '0'; z[1] = 'x'; print_string(z, b); } static void sbuf_printf(struct simple_buf *b, const char *format, va_list args) { const char *s = format; while (1) { int along = 0; if (*s == '\0') break; if (*s != '%') { sbuf_putc(b, *s); s++; continue; } s++; if (*s == 'l') { along = 1; s++; if (*s == 'l') s++; } else if (*s == 'z') { along = (sizeof(size_t) > sizeof(int)); s++; } switch (*s) { case 's': print_string(va_arg(args, char *), b); break; case 'd': if (along) print_num_l(va_arg(args, long), b); else print_num(va_arg(args, int), b); break; case 'x': if (along) print_hex_l(va_arg(args, long), b); else print_hex(va_arg(args, unsigned int), b); break; case 'p': print_hex_l((unsigned long)va_arg(args, void *), b); break; default: print_string("UNKNOWN FORMAT ", b); sbuf_putc(b, *s); break; } s++; } } void print_on_level(unsigned int loglevel, const char *format, ...) { va_list args; struct simple_buf b; if (loglevel > cur_loglevel) return; sbuf_log_init(&b); va_start(args, format); sbuf_printf(&b, format, args); va_end(args); sbuf_log_flush(&b); } void std_sprintf(char output[STD_LOG_SIMPLE_CHUNK], const char *format, ...) 
{ va_list args; struct simple_buf b; char *p; b.bp = b.buf; b.flush = NULL; va_start(args, format); sbuf_printf(&b, format, args); va_end(args); *b.bp = 0; for (p = b.buf; p <= b.bp; p++) output[p - b.buf] = *p; } criu-3.6/compel/plugins/std/std.c000066400000000000000000000026671317335042600170330ustar00rootroot00000000000000#include #include #include #include "asm/prologue.h" static struct prologue_init_args *init_args; static int ctl_socket = -1; int std_ctl_sock(void) { return ctl_socket; } static int init_socket(struct prologue_init_args *args) { int ret; ctl_socket = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0); if (ctl_socket < 0) return ctl_socket; ret = sys_connect(ctl_socket, (struct sockaddr *)&args->ctl_sock_addr, args->ctl_sock_addr_len); if (ret < 0) return ret; return 0; } static int fini_socket(void) { char buf[32]; int ret = 0; ret = sys_shutdown(ctl_socket, SHUT_WR); if (ret) goto err; ret = sys_recv(ctl_socket, buf, sizeof(buf), MSG_WAITALL); if (ret) goto err; err: sys_close(ctl_socket); ctl_socket = -1; return ret; } #define plugin_init_count(size) ((size) / (sizeof(plugin_init_t *))) int __export_std_compel_start(struct prologue_init_args *args, const plugin_init_t * const *init_array, size_t init_size) { unsigned int i; int ret = 0; init_args = args; ret = init_socket(args); if (ret) return ret; for (i = 0; i < plugin_init_count(init_size); i++) { const plugin_init_t *d = init_array[i]; if (d && d->init) { ret = d->init(); if (ret) break; } } for (; i > 0; i--) { const plugin_init_t *d = init_array[i - 1]; if (d && d->exit) d->exit(); } fini_socket(); return ret; } PLUGIN_REGISTER_DUMMY(std) criu-3.6/compel/plugins/std/string.c000066400000000000000000000117011317335042600175340ustar00rootroot00000000000000#include #include #include #include #include #include "features.h" static const char conv_tab[] = "0123456789abcdefghijklmnopqrstuvwxyz"; void std_dputc(int fd, char c) { sys_write(fd, &c, 1); } void std_dputs(int fd, const char *s) { for (; *s; 
s++) std_dputc(fd, *s); } static size_t __std_vprint_long_hex(char *buf, size_t blen, unsigned long num, char **ps) { char *s = &buf[blen - 2]; buf[blen - 1] = '\0'; if (num == 0) { *s = '0', s--; goto done; } while (num > 0) { *s = conv_tab[num % 16], s--; num /= 16; } done: s++; *ps = s; return blen - (s - buf); } static size_t __std_vprint_long(char *buf, size_t blen, long num, char **ps) { char *s = &buf[blen - 2]; int neg = 0; buf[blen - 1] = '\0'; if (num < 0) { neg = 1; num = -num; } else if (num == 0) { *s = '0'; s--; goto done; } while (num > 0) { *s = (num % 10) + '0'; s--; num /= 10; } if (neg) { *s = '-'; s--; } done: s++; *ps = s; return blen - (s - buf); } void std_vdprintf(int fd, const char *format, va_list args) { const char *s = format; for (; *s != '\0'; s++) { char buf[32], *t; int along = 0; if (*s != '%') { std_dputc(fd, *s); continue; } s++; if (*s == 'l') { along = 1; s++; if (*s == 'l') s++; } switch (*s) { case 's': std_dputs(fd, va_arg(args, char *)); break; case 'd': __std_vprint_long(buf, sizeof(buf), along ? va_arg(args, long) : (long)va_arg(args, int), &t); std_dputs(fd, t); break; case 'x': __std_vprint_long_hex(buf, sizeof(buf), along ? va_arg(args, long) : (long)va_arg(args, int), &t); std_dputs(fd, t); break; } } } void std_dprintf(int fd, const char *format, ...) { va_list args; va_start(args, format); std_vdprintf(fd, format, args); va_end(args); } static inline bool __isspace(unsigned char c) { return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v'; } static unsigned char __tolower(unsigned char c) { return (c <= 'Z' && c >= 'A') ? 
c - 'A' + 'a' : c; } static inline bool __isalpha(unsigned char c) { return ((c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a')); } static inline bool __isdigit(unsigned char c) { return (c <= '9' && c >= '0'); } static inline bool __isalnum(unsigned char c) { return (__isalpha(c) || __isdigit(c)); } static unsigned int __conv_val(unsigned char c) { if (__isdigit(c)) return c - '0'; else if (__isalpha(c)) return &conv_tab[__tolower(c)] - conv_tab; return -1u; } unsigned long std_strtoul(const char *nptr, char **endptr, int base) { const char *s = nptr; bool neg = false; unsigned int v; long num = 0; if (base < 0 || base == 1 || base > 36) goto fin; while (__isspace(*s)) s++; if (!*s) goto fin; if (*s == '-') neg = true, s++; if (base == 0) { if (s[0] == '0') { unsigned char p = __tolower(s[1]); switch (p) { case 'b': base = 2, s += 2; break; case 'x': base = 16, s += 2; break; default: base = 8, s += 1; break; } } else base = 10; } else if (base == 16) { if (s[0] == '0' && __tolower(s[1]) == 'x') s += 2; } for (; *s; s++) { if (__isspace(*s)) continue; if (!__isalnum(*s)) goto fin; v = __conv_val(*s); if (v == -1u || v > base) goto fin; num *= base; num += v; } fin: if (endptr) *endptr = (char *)s; return neg ? (unsigned long)-num : (unsigned long)num; } /* * C compiler is free to insert implicit calls to memcmp, memset, * memcpy and memmove, assuming they are available during linking. * As the parasite code is not linked with libc, it must provide * our own implementations of the above functions. * Surely, these functions can also be called explicitly. * * Note: for now, not having memmove() seems OK for both gcc and clang. 
*/ #ifndef ARCH_HAS_MEMCPY void *memcpy(void *to, const void *from, size_t n) { size_t i; unsigned char *cto = to; const unsigned char *cfrom = from; for (i = 0; i < n; ++i, ++cto, ++cfrom) *cto = *cfrom; return to; } #endif #ifndef ARCH_HAS_MEMCMP int memcmp(const void *cs, const void *ct, size_t count) { const unsigned char *su1, *su2; int res = 0; for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) if ((res = *su1 - *su2) != 0) break; return res; } #endif #ifndef ARCH_HAS_MEMSET void *memset(void *s, const int c, size_t count) { volatile char *dest = s; size_t i = 0; while (i < count) dest[i++] = (char) c; return s; } #endif int std_strcmp(const char *cs, const char *ct) { unsigned char c1, c2; while (1) { c1 = *cs++; c2 = *ct++; if (c1 != c2) return c1 < c2 ? -1 : 1; if (!c1) break; } return 0; } int std_strncmp(const char *cs, const char *ct, size_t count) { size_t i; for (i = 0; i < count; i++) { if (cs[i] != ct[i]) return cs[i] < ct[i] ? -1 : 1; if (!cs[i]) break; } return 0; } criu-3.6/compel/src/000077500000000000000000000000001317335042600143765ustar00rootroot00000000000000criu-3.6/compel/src/lib/000077500000000000000000000000001317335042600151445ustar00rootroot00000000000000criu-3.6/compel/src/lib/handle-elf-host.c000077700000000000000000000000001317335042600224202handle-elf.custar00rootroot00000000000000criu-3.6/compel/src/lib/handle-elf.c000066400000000000000000000472761317335042600173270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "uapi/compel.h" #include "handle-elf.h" #include "piegen.h" #include "log.h" /* Check if pointer is out-of-bound */ static bool __ptr_oob(const uintptr_t ptr, const uintptr_t start, const size_t size) { uintptr_t end = start + size; return ptr >= end || ptr < start; } /* Check if pointed structure's end is out-of-bound */ static bool __ptr_struct_end_oob(const uintptr_t ptr, const size_t struct_size, const uintptr_t start, 
#ifdef ELF_PPC64
/*
 * Patch the 16-bit field at @location with a TOC-relative @value.
 *
 * Only the bits selected by @mask are replaced; the other bits of
 * *@location are preserved. When @complain_signed is non-zero the value
 * is first checked against the upper bound of a signed 16-bit field.
 *
 * Returns 0 on success, -1 after logging an error.
 *
 * NOTE(review): @value is a signed long, so "value + 0x8000 > 0xffff"
 * only trips for values above 0x7fff; values below -0x8000 are not
 * reported. Confirm whether callers rely on that before tightening it.
 */
static int do_relative_toc(long value, uint16_t *location,
			   unsigned long mask, int complain_signed)
{
	if (complain_signed && (value + 0x8000 > 0xffff)) {
		pr_err("TOC16 relocation overflows (%ld)\n", value);
		return -1;
	}

	/* low-16 bits of @value that @mask would drop make it unencodable */
	if ((~mask & 0xffff) & value) {
		pr_err("bad TOC16 relocation (%ld) (0x%lx)\n",
		       value, (~mask & 0xffff) & value);
		return -1;
	}

	*location = (*location & ~mask) | (value & mask);
	return 0;
}
#endif
section header's offset in section headers table is * (size of section header * index of string section header) */ addr = sec_table + ((size_t) hdr->e_shentsize) * hdr->e_shstrndx; if (__ptr_struct_oob(addr, sizeof(Elf_Shdr), sec_table, sec_table_size)) { pr_err("String section header @%#zx is out of [%#zx, %#zx)\n", addr, sec_table, sec_table + sec_table_size); return NULL; } secstrings_hdr = (void*)addr; addr = mem + secstrings_hdr->sh_offset; if (__ptr_struct_oob(addr, secstrings_hdr->sh_size, mem, size)) { pr_err("String section @%#zx size %#lx is out of [%#zx, %#zx)\n", addr, (unsigned long)secstrings_hdr->sh_size, mem, mem + size); return NULL; } return (void*)addr; } /* * This name @__handle_elf get renamed into * @handle_elf_ppc64 or say @handle_elf_x86_64 * depending on the architecture it's compiled * under. */ int __handle_elf(void *mem, size_t size) { const char *symstrings = NULL; Elf_Shdr *symtab_hdr = NULL; Elf_Sym *symbols = NULL; Elf_Ehdr *hdr = mem; Elf_Shdr *strtab_hdr = NULL; Elf_Shdr **sec_hdrs = NULL; const char *secstrings; size_t i, k, nr_gotpcrel = 0; #ifdef ELF_PPC64 int64_t toc_offset = 0; #endif int ret = -EINVAL; pr_debug("Header\n"); pr_debug("------------\n"); pr_debug("\ttype 0x%x machine 0x%x version 0x%x\n", (unsigned)hdr->e_type, (unsigned)hdr->e_machine, (unsigned)hdr->e_version); if (!is_header_supported(hdr)) { pr_err("Unsupported header detected\n"); goto err; } sec_hdrs = malloc(sizeof(*sec_hdrs) * hdr->e_shnum); if (!sec_hdrs) { pr_err("No memory for section headers\n"); ret = -ENOMEM; goto err; } secstrings = get_strings_section(hdr, (uintptr_t)mem, size); if (!secstrings) goto err; pr_debug("Sections\n"); pr_debug("------------\n"); for (i = 0; i < hdr->e_shnum; i++) { Elf_Shdr *sh = mem + hdr->e_shoff + hdr->e_shentsize * i; ptr_func_exit(sh); if (sh->sh_type == SHT_SYMTAB) symtab_hdr = sh; ptr_func_exit(&secstrings[sh->sh_name]); pr_debug("\t index %-2zd type 0x%-2x name %s\n", i, (unsigned)sh->sh_type, 
&secstrings[sh->sh_name]); sec_hdrs[i] = sh; #ifdef ELF_PPC64 if (!strcmp(&secstrings[sh->sh_name], ".toc")) { toc_offset = sh->sh_addr + 0x8000; pr_debug("\t\tTOC offset 0x%lx\n", toc_offset); } #endif } if (!symtab_hdr) { pr_err("No symbol table present\n"); goto err; } if (!symtab_hdr->sh_link || symtab_hdr->sh_link >= hdr->e_shnum) { pr_err("Corrupted symtab header\n"); goto err; } pr_debug("Symbols\n"); pr_debug("------------\n"); strtab_hdr = sec_hdrs[symtab_hdr->sh_link]; ptr_func_exit(strtab_hdr); symbols = mem + symtab_hdr->sh_offset; ptr_func_exit(symbols); symstrings = mem + strtab_hdr->sh_offset; ptr_func_exit(symstrings); if (sizeof(*symbols) != symtab_hdr->sh_entsize) { pr_err("Symbol table align differ\n"); goto err; } pr_out("/* Autogenerated from %s */\n", opts.input_filename); pr_out("#include \n"); for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { Elf_Sym *sym = &symbols[i]; const char *name; Elf_Shdr *sh_src; ptr_func_exit(sym); name = &symstrings[sym->st_name]; ptr_func_exit(name); if (!*name) continue; pr_debug("\ttype 0x%-2x bind 0x%-2x shndx 0x%-4x value 0x%-2lx name %s\n", (unsigned)ELF_ST_TYPE(sym->st_info), (unsigned)ELF_ST_BIND(sym->st_info), (unsigned)sym->st_shndx, (unsigned long)sym->st_value, name); #ifdef ELF_PPC64 if (!sym->st_value && !strncmp(name, ".TOC.", 6)) { if (!toc_offset) { pr_err("No TOC pointer\n"); goto err; } sym->st_value = toc_offset; continue; } #endif if (strncmp(name, "__export", 8)) continue; if ((sym->st_shndx && sym->st_shndx < hdr->e_shnum) || sym->st_shndx == SHN_ABS) { if (sym->st_shndx == SHN_ABS) { sh_src = NULL; } else { sh_src = sec_hdrs[sym->st_shndx]; ptr_func_exit(sh_src); } pr_out("#define %s_sym%s 0x%lx\n", opts.prefix, name, (unsigned long)(sym->st_value + (sh_src ? 
sh_src->sh_addr : 0))); } } pr_out("static __maybe_unused compel_reloc_t %s_relocs[] = {\n", opts.prefix); #ifndef NO_RELOCS pr_debug("Relocations\n"); pr_debug("------------\n"); for (i = 0; i < hdr->e_shnum; i++) { Elf_Shdr *sh = sec_hdrs[i]; Elf_Shdr *sh_rel; if (sh->sh_type != SHT_REL && sh->sh_type != SHT_RELA) continue; sh_rel = sec_hdrs[sh->sh_info]; ptr_func_exit(sh_rel); pr_debug("\tsection %2zd type 0x%-2x link 0x%-2x info 0x%-2x name %s\n", i, (unsigned)sh->sh_type, (unsigned)sh->sh_link, (unsigned)sh->sh_info, &secstrings[sh->sh_name]); for (k = 0; k < sh->sh_size / sh->sh_entsize; k++) { int64_t __maybe_unused addend64, __maybe_unused value64; int32_t __maybe_unused addend32, __maybe_unused value32; unsigned long place; const char *name; void *where; Elf_Sym *sym; union { Elf_Rel rel; Elf_Rela rela; } *r = mem + sh->sh_offset + sh->sh_entsize * k; ptr_func_exit(r); sym = &symbols[ELF_R_SYM(r->rel.r_info)]; ptr_func_exit(sym); name = &symstrings[sym->st_name]; ptr_func_exit(name); where = mem + sh_rel->sh_offset + r->rel.r_offset; ptr_func_exit(where); pr_debug("\t\tr_offset 0x%-4lx r_info 0x%-4lx / sym 0x%-2lx type 0x%-2lx symsecoff 0x%-4lx\n", (unsigned long)r->rel.r_offset, (unsigned long)r->rel.r_info, (unsigned long)ELF_R_SYM(r->rel.r_info), (unsigned long)ELF_R_TYPE(r->rel.r_info), (unsigned long)sh_rel->sh_addr); if (sym->st_shndx == SHN_UNDEF) { #ifdef ELF_PPC64 /* On PowerPC, TOC symbols appear to be * undefined but should be processed as well. * Their type is STT_NOTYPE, so report any * other one. */ if (ELF32_ST_TYPE(sym->st_info) != STT_NOTYPE || strncmp(name, ".TOC.", 6)) { pr_err("Unexpected undefined symbol:%s\n", name); goto err; } #else pr_err("Unexpected undefined symbol: `%s'. 
External symbol in PIE?\n", name); goto err; #endif } if (sh->sh_type == SHT_REL) { addend32 = *(int32_t *)where; addend64 = *(int64_t *)where; } else { addend32 = (int32_t)r->rela.r_addend; addend64 = (int64_t)r->rela.r_addend; } place = sh_rel->sh_addr + r->rel.r_offset; pr_debug("\t\t\tvalue 0x%-8lx addend32 %-4d addend64 %-8ld place %-8lx symname %s\n", (unsigned long)sym->st_value, addend32, (long)addend64, (long)place, name); if (sym->st_shndx == SHN_ABS) { value32 = (int32_t)sym->st_value; value64 = (int64_t)sym->st_value; } else { Elf_Shdr *sh_src; if ((unsigned)sym->st_shndx > (unsigned)hdr->e_shnum) { pr_err("Unexpected symbol section index %u/%u\n", (unsigned)sym->st_shndx, hdr->e_shnum); goto err; } sh_src = sec_hdrs[sym->st_shndx]; ptr_func_exit(sh_src); value32 = (int32_t)sh_src->sh_addr + (int32_t)sym->st_value; value64 = (int64_t)sh_src->sh_addr + (int64_t)sym->st_value; } #ifdef ELF_PPC64 /* * Snippet from the OpenPOWER ABI for Linux Supplement: * * The OpenPOWER ABI uses the three most-significant bits in the symbol * st_other field specifies the number of instructions between a function's * global entry point and local entry point. The global entry point is used * when it is necessary to set up the TOC pointer (r2) for the function. The * local entry point is used when r2 is known to already be valid for the * function. A value of zero in these bits asserts that the function does * not use r2. * * The st_other values have the following meanings: * 0 and 1, the local and global entry points are the same. * 2, the local entry point is at 1 instruction past the global entry point. * 3, the local entry point is at 2 instructions past the global entry point. * 4, the local entry point is at 4 instructions past the global entry point. * 5, the local entry point is at 8 instructions past the global entry point. * 6, the local entry point is at 16 instructions past the global entry point. * 7, reserved. 
* * Here we are only handle the case '3' which is the most commonly seen. */ #define LOCAL_OFFSET(s) ((s->st_other >> 5) & 0x7) if (LOCAL_OFFSET(sym)) { if (LOCAL_OFFSET(sym) != 3) { pr_err("Unexpected local offset value %d\n", LOCAL_OFFSET(sym)); goto err; } pr_debug("\t\t\tUsing local offset\n"); value64 += 8; value32 += 8; } #endif switch (ELF_R_TYPE(r->rel.r_info)) { #ifdef ELF_PPC64 case R_PPC64_REL24: /* Update PC relative offset, linker has not done this yet */ pr_debug("\t\t\tR_PPC64_REL24 at 0x%-4lx val 0x%lx\n", place, value64); /* Convert value to relative */ value64 -= place; if (value64 + 0x2000000 > 0x3ffffff || (value64 & 3) != 0) { pr_err("REL24 %li out of range!\n", (long int)value64); goto err; } /* Only replace bits 2 through 26 */ *(uint32_t *)where = (*(uint32_t *)where & ~0x03fffffc) | (value64 & 0x03fffffc); break; case R_PPC64_ADDR32: case R_PPC64_REL32: pr_debug("\t\t\tR_PPC64_ADDR32 at 0x%-4lx val 0x%x\n", place, (unsigned int)(value32 + addend32)); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " " .addend = %-8d, .value = 0x%-16x, " "}, /* R_PPC64_ADDR32 */\n", (unsigned int) place, addend32, value32); break; case R_PPC64_ADDR64: case R_PPC64_REL64: pr_debug("\t\t\tR_PPC64_ADDR64 at 0x%-4lx val 0x%lx\n", place, value64 + addend64); pr_out("\t{ .offset = 0x%-8x, .type = COMPEL_TYPE_LONG," " .addend = %-8ld, .value = 0x%-16lx, " "}, /* R_PPC64_ADDR64 */\n", (unsigned int) place, (long)addend64, (long)value64); break; case R_PPC64_TOC16_HA: pr_debug("\t\t\tR_PPC64_TOC16_HA at 0x%-4lx val 0x%lx\n", place, value64 + addend64 - toc_offset + 0x8000); if (do_relative_toc((value64 + addend64 - toc_offset + 0x8000) >> 16, where, 0xffff, 1)) goto err; break; case R_PPC64_TOC16_LO: pr_debug("\t\t\tR_PPC64_TOC16_LO at 0x%-4lx val 0x%lx\n", place, value64 + addend64 - toc_offset); if (do_relative_toc(value64 + addend64 - toc_offset, where, 0xffff, 1)) goto err; break; case R_PPC64_TOC16_LO_DS: pr_debug("\t\t\tR_PPC64_TOC16_LO_DS at 0x%-4lx val 
0x%lx\n", place, value64 + addend64 - toc_offset); if (do_relative_toc(value64 + addend64 - toc_offset, where, 0xfffc, 0)) goto err; break; case R_PPC64_REL16_HA: value64 += addend64 - place; pr_debug("\t\t\tR_PPC64_REL16_HA at 0x%-4lx val 0x%lx\n", place, value64); /* check that we are dealing with the addis 2,12 instruction */ if (((*(uint32_t*)where) & 0xffff0000) != 0x3c4c0000) { pr_err("Unexpected instruction for R_PPC64_REL16_HA\n"); goto err; } *(uint16_t *)where = ((value64 + 0x8000) >> 16) & 0xffff; break; case R_PPC64_REL16_LO: value64 += addend64 - place; pr_debug("\t\t\tR_PPC64_REL16_LO at 0x%-4lx val 0x%lx\n", place, value64); /* check that we are dealing with the addi 2,2 instruction */ if (((*(uint32_t*)where) & 0xffff0000) != 0x38420000) { pr_err("Unexpected instruction for R_PPC64_REL16_LO\n"); goto err; } *(uint16_t *)where = value64 & 0xffff; break; #endif /* ELF_PPC64 */ #ifdef ELF_X86_64 case R_X86_64_32: /* Symbol + Addend (4 bytes) */ case R_X86_64_32S: /* Symbol + Addend (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_32 */\n", (unsigned int)place, addend32, value32); break; case R_X86_64_64: /* Symbol + Addend (8 bytes) */ pr_debug("\t\t\t\tR_X86_64_64 at 0x%-4lx val 0x%lx\n", place, (long)value64); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG, " ".addend = %-8ld, .value = 0x%-16lx, }, /* R_X86_64_64 */\n", (unsigned int)place, (long)addend64, (long)value64); break; case R_X86_64_PC32: /* Symbol + Addend - Place (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); /* * R_X86_64_PC32 are relative, patch them inplace. 
*/ *((int32_t *)where) = value32 + addend32 - place; break; case R_X86_64_PLT32: /* ProcLinkage + Addend - Place (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_PLT32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); /* * R_X86_64_PLT32 are relative, patch them inplace. */ *((int32_t *)where) = value32 + addend32 - place; break; case R_X86_64_GOTPCRELX: case R_X86_64_REX_GOTPCRELX: case R_X86_64_GOTPCREL: /* SymbolOffsetInGot + GOT + Addend - Place (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_GOTPCREL at 0x%-4lx val 0x%x\n", place, value32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG | COMPEL_TYPE_GOTPCREL, " ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_GOTPCREL */\n", (unsigned int)place, addend32, value32); nr_gotpcrel++; break; #endif #ifdef ELF_X86_32 case R_386_32: /* Symbol + Addend */ pr_debug("\t\t\t\tR_386_32 at 0x%-4lx val 0x%x\n", place, value32 + addend32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " ".addend = %-4d, .value = 0x%x, },\n", (unsigned int)place, addend32, value32); break; case R_386_PC32: /* Symbol + Addend - Place */ pr_debug("\t\t\t\tR_386_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); /* * R_386_PC32 are relative, patch them inplace. */ *((int32_t *)where) = value32 + addend32 - place; break; #endif #ifdef ELF_S390 /* * See also arch/s390/kernel/module.c/apply_rela(): * A PLT reads the GOT (global offest table). We can handle it like * R_390_PC32DBL because we have linked statically. 
*/ case R_390_PLT32DBL: /* PC relative on a PLT (predure link table) */ pr_debug("\t\t\t\tR_390_PLT32DBL at 0x%-4lx val 0x%x\n", place, value32 + addend32); *((int32_t *)where) = (value64 + addend64 - place) >> 1; break; case R_390_PC32DBL: /* PC relative on a symbol */ pr_debug("\t\t\t\tR_390_PC32DBL at 0x%-4lx val 0x%x\n", place, value32 + addend32); *((int32_t *)where) = (value64 + addend64 - place) >> 1; break; case R_390_64: /* 64 bit absolute address */ pr_debug("\t\t\t\tR_390_64 at 0x%-4lx val 0x%lx\n", place, (long)value64); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG, " ".addend = %-8ld, .value = 0x%-16lx, }, /* R_390_64 */\n", (unsigned int)place, (long)addend64, (long)value64); break; case R_390_PC64: /* 64 bit relative address */ *((int64_t *)where) = value64 + addend64 - place; pr_debug("\t\t\t\tR_390_PC64 at 0x%-4lx val 0x%lx\n", place, (long)value64); break; #endif default: pr_err("Unsupported relocation of type %lu\n", (unsigned long)ELF_R_TYPE(r->rel.r_info)); goto err; } } } #endif /* !NO_RELOCS */ pr_out("};\n"); pr_out("static __maybe_unused size_t %s_nr_gotpcrel = %zd;\n", opts.prefix, nr_gotpcrel); pr_out("static __maybe_unused const char %s_blob[] = {\n\t", opts.prefix); for (i = 0, k = 0; i < hdr->e_shnum; i++) { Elf_Shdr *sh = sec_hdrs[i]; unsigned char *shdata; size_t j; if (!(sh->sh_flags & SHF_ALLOC) || !sh->sh_size) continue; shdata = mem + sh->sh_offset; pr_debug("Copying section '%s'\n" "\tstart:0x%lx (gap:0x%lx) size:0x%lx\n", &secstrings[sh->sh_name], (unsigned long) sh->sh_addr, (unsigned long)(sh->sh_addr - k), (unsigned long) sh->sh_size); /* write 0 in the gap between the 2 sections */ for (; k < sh->sh_addr; k++) { if (k && (k % 8) == 0) pr_out("\n\t"); pr_out("0x00,"); } for (j = 0; j < sh->sh_size; j++, k++) { if (k && (k % 8) == 0) pr_out("\n\t"); pr_out("0x%02x,", shdata[j]); } } pr_out("};\n"); pr_out("\n"); pr_out("static void __maybe_unused %s_setup_c_header(struct parasite_ctl *ctl)\n", opts.prefix); pr_out( 
"{\n" " struct parasite_blob_desc *pbd;\n" "\n" " pbd = compel_parasite_blob_desc(ctl);\n" " pbd->parasite_type = COMPEL_BLOB_CHEADER;\n" ); pr_out("\tpbd->hdr.mem = %s_blob;\n", opts.prefix); pr_out("\tpbd->hdr.bsize = sizeof(%s_blob);\n", opts.prefix); pr_out("\tpbd->hdr.nr_gotpcrel = %s_nr_gotpcrel;\n", opts.prefix); pr_out("\tif (compel_mode_native(ctl))\n"); pr_out("\t\tpbd->hdr.parasite_ip_off = " "%s_sym__export_parasite_head_start;\n", opts.prefix); pr_out("#ifdef CONFIG_COMPAT\n"); pr_out("\telse\n"); pr_out("\t\tpbd->hdr.parasite_ip_off = " "%s_sym__export_parasite_head_start_compat;\n", opts.prefix); pr_out("#endif /* CONFIG_COMPAT */\n"); pr_out("\tpbd->hdr.addr_cmd_off = " "%s_sym__export_parasite_cmd;\n", opts.prefix); pr_out("\tpbd->hdr.addr_arg_off = " "%s_sym__export_parasite_args;\n", opts.prefix); pr_out("\tpbd->hdr.relocs = %s_relocs;\n", opts.prefix); pr_out("\tpbd->hdr.nr_relocs = " "sizeof(%s_relocs) / sizeof(%s_relocs[0]);\n", opts.prefix, opts.prefix); pr_out("}\n"); ret = 0; err: free(sec_hdrs); return ret; } criu-3.6/compel/src/lib/infect-rpc.c000066400000000000000000000037511317335042600173500ustar00rootroot00000000000000#include "log.h" #include "common/bug.h" #include "common/xmalloc.h" #include "common/lock.h" #include "infect.h" #include "infect-priv.h" #include "infect-rpc.h" #include "rpc-pie-priv.h" static int __parasite_send_cmd(int sockfd, struct ctl_msg *m) { int ret; BUILD_BUG_ON(PARASITE_USER_CMDS < __PARASITE_END_CMDS); ret = send(sockfd, m, sizeof(*m), 0); if (ret == -1) { pr_perror("Failed to send command %d to daemon", m->cmd); return -1; } else if (ret != sizeof(*m)) { pr_err("Message to daemon is trimmed (%d/%d)\n", (int)sizeof(*m), ret); return -1; } pr_debug("Sent msg to daemon %d %d %d\n", m->cmd, m->ack, m->err); return 0; } int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m) { int ret; pr_debug("Wait for ack %d on daemon socket\n", cmd); while (1) { memzero(m, sizeof(*m)); ret = recv(sockfd, m, 
/*
 * Receive a file descriptor over the parasite RPC socket.
 *
 * @ctl: parasite control block whose RPC socket is used
 * @pfd: receives the result of recv_fd() (negative on failure)
 *
 * Returns 0 on success, -1 when recv_fd() reports an error.
 */
int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd)
{
	int sk;

	sk = compel_rpc_sock(ctl);
	if ((*pfd = recv_fd(sk)) < 0) {
		/* was "Can't send ...", copied from compel_util_send_fd() */
		pr_perror("Can't receive file descriptor");
		return -1;
	}
	return 0;
}
"common/lock.h" #include "common/page.h" #include #include #include "uapi/compel/plugins/std/syscall.h" #include "asm/infect-types.h" #include "asm/sigframe.h" #include "infect.h" #include "ptrace.h" #include "infect-rpc.h" #include "infect-priv.h" #include "infect-util.h" #include "rpc-pie-priv.h" #include "infect-util.h" #define __sys(foo) foo #define __sys_err(ret) (-errno) #include "common/scm.h" #include "common/scm-code.c" #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \ (size_t)((struct sockaddr_un *) 0)->sun_path) #define PARASITE_STACK_SIZE (16 << 10) #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif static int prepare_thread(int pid, struct thread_ctx *ctx); static inline void close_safe(int *pfd) { if (*pfd > -1) { close(*pfd); *pfd = -1; } } static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) { char aux[128]; FILE *f; sprintf(aux, "/proc/%d/status", pid); f = fopen(aux, "r"); if (!f) return -1; ss->ppid = -1; /* Not needed at this point */ ss->seccomp_mode = SECCOMP_MODE_DISABLED; while (fgets(aux, sizeof(aux), f)) { if (!strncmp(aux, "State:", 6)) { ss->state = aux[7]; continue; } if (!strncmp(aux, "Seccomp:", 8)) { if (sscanf(aux + 9, "%d", &ss->seccomp_mode) != 1) goto err_parse; continue; } if (!strncmp(aux, "ShdPnd:", 7)) { if (sscanf(aux + 7, "%llx", &ss->shdpnd) != 1) goto err_parse; continue; } if (!strncmp(aux, "SigPnd:", 7)) { if (sscanf(aux + 7, "%llx", &ss->sigpnd) != 1) goto err_parse; continue; } } fclose(f); return 0; err_parse: fclose(f); return -1; } int compel_stop_task(int pid) { int ret; struct seize_task_status ss; ret = compel_interrupt_task(pid); if (ret == 0) ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &ss, NULL); return ret; } int compel_interrupt_task(int pid) { int ret; ret = ptrace(PTRACE_SEIZE, pid, NULL, 0); if (ret) { /* * ptrace API doesn't allow to distinguish * attaching to zombie from other errors. * All errors will be handled in compel_wait_task(). 
*/ pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno)); return ret; } /* * If we SEIZE-d the task stop it before going * and reading its stat from proc. Otherwise task * may die _while_ we're doing it and we'll have * inconsistent seize/state pair. * * If task dies after we seize it but before we * do this interrupt, we'll notice it via proc. */ ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL); if (ret < 0) { pr_warn("SEIZE %d: can't interrupt task: %s", pid, strerror(errno)); if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); } return ret; } static int skip_sigstop(int pid, int nr_signals) { int i, status, ret; /* * 1) SIGSTOP is queued, but isn't handled yet: * SGISTOP can't be blocked, so we need to wait when the kernel * handles this signal. * * Otherwise the process will be stopped immediately after * starting it. * * 2) A seized task was stopped: * PTRACE_SEIZE doesn't affect signal or group stop state. * Currently ptrace reported that task is in stopped state. * We need to start task again, and it will be trapped * immediately, because we sent PTRACE_INTERRUPT to it. */ for (i = 0; i < nr_signals; i++) { ret = ptrace(PTRACE_CONT, pid, 0, 0); if (ret) { pr_perror("Unable to start process"); return -1; } ret = wait4(pid, &status, __WALL, NULL); if (ret < 0) { pr_perror("SEIZE %d: can't wait task", pid); return -1; } if (!WIFSTOPPED(status)) { pr_err("SEIZE %d: task not stopped after seize\n", pid); return -1; } } return 0; } /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace * interface, and finally we can detach ptrace out of * of it so the task would not know if it was saddled * up with someone else. 
*/ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_task_status *, void *), void (*free_status)(int pid, struct seize_task_status *, void *), struct seize_task_status *ss, void *data) { siginfo_t si; int status, nr_sigstop; int ret = 0, ret2, wait_errno = 0; /* * It's ugly, but the ptrace API doesn't allow to distinguish * attaching to zombie from other errors. Thus we have to parse * the target's /proc/pid/stat. Sad, but parse whatever else * we might need at that early point. */ try_again: ret = wait4(pid, &status, __WALL, NULL); if (ret < 0) { /* * wait4() can expectedly fail only in a first time * if a task is zombie. If we are here from try_again, * this means that we are tracing this task. * * So here we can be only once in this function. */ wait_errno = errno; } ret2 = get_status(pid, ss, data); if (ret2) goto err; if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) { if (ss->state != 'Z') { if (pid == getpid()) pr_err("The criu itself is within dumped tree.\n"); else pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n", pid, ss->state, ret, wait_errno); return -1; } if (ret < 0) return COMPEL_TASK_ZOMBIE; else return COMPEL_TASK_DEAD; } if ((ppid != -1) && (ss->ppid != ppid)) { pr_err("Task pid reused while suspending (%d: %d -> %d)\n", pid, ppid, ss->ppid); goto err; } if (!WIFSTOPPED(status)) { pr_err("SEIZE %d: task not stopped after seize\n", pid); goto err; } ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si); if (ret < 0) { pr_perror("SEIZE %d: can't read signfo", pid); goto err; } if (PTRACE_SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) { /* * Kernel notifies us about the task being seized received some * event other than the STOP, i.e. -- a signal. Let the task * handle one and repeat. 
*/ if (ptrace(PTRACE_CONT, pid, NULL, (void *)(unsigned long)si.si_signo)) { pr_perror("Can't continue signal handling, aborting"); goto err; } ret = 0; if (free_status) free_status(pid, ss, data); goto try_again; } if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; nr_sigstop = 0; if (ss->sigpnd & (1 << (SIGSTOP - 1))) nr_sigstop++; if (ss->shdpnd & (1 << (SIGSTOP - 1))) nr_sigstop++; if (si.si_signo == SIGSTOP) nr_sigstop++; if (nr_sigstop) { if (skip_sigstop(pid, nr_sigstop)) goto err_stop; return COMPEL_TASK_STOPPED; } if (si.si_signo == SIGTRAP) return COMPEL_TASK_ALIVE; else { pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo); goto err; } err_stop: kill(pid, SIGSTOP); err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); return -1; } int compel_resume_task(pid_t pid, int orig_st, int st) { pr_debug("\tUnseizing %d into %d\n", pid, st); if (st == COMPEL_TASK_DEAD) { kill(pid, SIGKILL); return 0; } else if (st == COMPEL_TASK_STOPPED) { /* * Task might have had STOP in queue. We detected such * guy as COMPEL_TASK_STOPPED, but cleared signal to run * the parasite code. Thus after detach the task will become * running. That said -- STOP everyone regardless of * the initial state. */ kill(pid, SIGSTOP); } else if (st == COMPEL_TASK_ALIVE) { /* * Same as in the comment above -- there might be a * task with STOP in queue that would get lost after * detach, so stop it again. 
*/ if (orig_st == COMPEL_TASK_STOPPED) kill(pid, SIGSTOP); } else pr_err("Unknown final state %d\n", st); if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) { pr_perror("Unable to detach from %d", pid); return -1; } return 0; } static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) { int sun_len; saddr->sun_family = AF_UNIX; snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d", key); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; return sun_len; } static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid, struct parasite_init_args *args) { int ssock = -1; socklen_t sk_len; struct sockaddr_un addr; pr_info("Putting tsock into pid %d\n", pid); args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid()); ssock = ctl->ictx.sock; sk_len = sizeof(addr); if (ssock == -1) { pr_err("No socket in ictx\n"); goto err; } if (getsockname(ssock, (struct sockaddr *) &addr, &sk_len) < 0) { pr_perror("Unable to get name for a socket"); return -1; } if (sk_len == sizeof(addr.sun_family)) { if (bind(ssock, (struct sockaddr *)&args->h_addr, args->h_addr_len) < 0) { pr_perror("Can't bind socket"); goto err; } if (listen(ssock, 1)) { pr_perror("Can't listen on transport socket"); goto err; } } /* Check a case when parasite can't initialize a command socket */ if (ctl->ictx.flags & INFECT_FAIL_CONNECT) args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid() + 1); /* * Set to -1 to prevent any accidental misuse. The * only valid user of it is accept_tsock(). 
*/ ctl->tsock = -ssock; return 0; err: close_safe(&ssock); return -1; } static int setup_child_handler(struct parasite_ctl *ctl) { struct sigaction sa = { .sa_sigaction = ctl->ictx.child_handler, .sa_flags = SA_SIGINFO | SA_RESTART, }; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGCHLD); if (sigaction(SIGCHLD, &sa, NULL)) { pr_perror("Unable to setup SIGCHLD handler"); return -1; } return 0; } static int restore_child_handler(struct parasite_ctl *ctl) { if (sigaction(SIGCHLD, &ctl->ictx.orig_handler, NULL)) { pr_perror("Unable to setup SIGCHLD handler"); return -1; } return 0; } static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, user_regs_struct_t *regs, struct thread_ctx *octx) { k_rtsigset_t block; ksigfillset(&block); if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { pr_perror("Can't block signals for %d", pid); goto err_sig; } parasite_setup_regs(ip, stack, regs); if (ptrace_set_regs(pid, regs)) { pr_perror("Can't set registers for %d", pid); goto err_regs; } if (ptrace(cmd, pid, NULL, NULL)) { pr_perror("Can't run parasite at %d", pid); goto err_cont; } return 0; err_cont: if (ptrace_set_regs(pid, &octx->regs)) pr_perror("Can't restore regs for %d", pid); err_regs: if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask)) pr_perror("Can't restore sigmask for %d", pid); err_sig: return -1; } static int restore_thread_ctx(int pid, struct thread_ctx *ctx) { int ret = 0; if (ptrace_set_regs(pid, &ctx->regs)) { pr_perror("Can't restore registers (pid: %d)", pid); ret = -1; } if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { pr_perror("Can't block signals"); ret = -1; } return ret; } /* we run at @regs->ip */ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t *regs, struct thread_ctx *octx) { siginfo_t siginfo; int status; int ret = -1; /* * Most ideas are taken from Tejun Heo's parasite thread * https://code.google.com/p/ptrace-parasite/ */ if 
(wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("Waited pid mismatch (pid: %d)", pid); goto err; } if (!WIFSTOPPED(status)) { pr_err("Task is still running (pid: %d)\n", pid); goto err; } if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) { pr_perror("Can't get siginfo (pid: %d)", pid); goto err; } if (ptrace_get_regs(pid, regs)) { pr_perror("Can't obtain registers (pid: %d)", pid); goto err; } if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) { pr_debug("** delivering signal %d si_code=%d\n", siginfo.si_signo, siginfo.si_code); pr_err("Unexpected %d task interruption, aborting\n", pid); goto err; } /* * We've reached this point if int3 is triggered inside our * parasite code. So we're done. */ ret = 0; err: if (restore_thread_ctx(pid, octx)) ret = -1; return ret; } int compel_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs, const char *code_syscall) { pid_t pid = ctl->rpid; int err; uint8_t code_orig[BUILTIN_SYSCALL_SIZE]; /* * Inject syscall instruction and remember original code, * we will need it to restore original program content. */ memcpy(code_orig, code_syscall, sizeof(code_orig)); if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip, (void *)code_orig, sizeof(code_orig))) { pr_err("Can't inject syscall blob (pid: %d)\n", pid); return -1; } err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); if (!err) err = parasite_trap(ctl, pid, regs, &ctl->orig); if (ptrace_poke_area(pid, (void *)code_orig, (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) { pr_err("Can't restore syscall blob (pid: %d)\n", ctl->rpid); err = -1; } return err; } int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs) { user_regs_struct_t regs = ctl->orig.regs; int ret; ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig); if (!ret) ret = parasite_trap(ctl, ctl->rpid, ret_regs ? 
ret_regs : ®s, &ctl->orig); return ret; } static int accept_tsock(struct parasite_ctl *ctl) { int sock; int ask = -ctl->tsock; /* this '-' is explained above */ sock = accept(ask, NULL, 0); if (sock < 0) { pr_perror("Can't accept connection to the transport socket"); close(ask); return -1; } ctl->tsock = sock; return 0; } static int parasite_init_daemon(struct parasite_ctl *ctl) { struct parasite_init_args *args; pid_t pid = ctl->rpid; user_regs_struct_t regs; struct ctl_msg m = { }; *ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON; args = compel_parasite_args(ctl, struct parasite_init_args); args->sigframe = (uintptr_t)ctl->rsigframe; args->log_level = compel_log_get_loglevel(); futex_set(&args->daemon_connected, 0); if (prepare_tsock(ctl, pid, args)) goto err; /* after this we can catch parasite errors in chld handler */ if (setup_child_handler(ctl)) goto err; regs = ctl->orig.regs; if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig)) goto err; futex_wait_while_eq(&args->daemon_connected, 0); if (futex_get(&args->daemon_connected) != 1) { errno = -(int)futex_get(&args->daemon_connected); pr_perror("Unable to connect a transport socket"); goto err; } if (accept_tsock(ctl) < 0) goto err; if (compel_util_send_fd(ctl, ctl->ictx.log_fd)) goto err; pr_info("Wait for parasite being daemonized...\n"); if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) { pr_err("Can't switch parasite %d to daemon mode %d\n", pid, m.err); goto err; } ctl->sigreturn_addr = (void*)(uintptr_t)args->sigreturn_addr; ctl->daemonized = true; pr_info("Parasite %d has been switched to daemon mode\n", pid); return 0; err: return -1; } static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; /* * Get task registers before going daemon, since the * compel_get_task_regs needs to call ptrace on _stopped_ task, * while in daemon it is not such. 
*/

	if (get_task_regs(pid, &ctl->orig.regs, ictx->save_regs, ictx->regs_arg)) {
		pr_err("Can't obtain regs for thread %d\n", pid);
		return -1;
	}

	if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask))
		return -1;

	if (parasite_init_daemon(ctl))
		return -1;

	return 0;
}

/*
 * Legacy way to share memory with the tracee: create an anonymous shared
 * mapping of @size bytes in the tracee, then map the same pages locally
 * through the tracee's /proc "map_files/<start>-<end>" entry.
 *
 * On success ctl->remote_map, ctl->local_map and ctl->map_length are set
 * and 0 is returned; -1 is returned on failure.
 */
static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size)
{
	int fd;

	ctl->remote_map = remote_mmap(ctl, NULL, size,
				      PROT_READ | PROT_WRITE | PROT_EXEC,
				      MAP_ANONYMOUS | MAP_SHARED, -1, 0);
	if (!ctl->remote_map) {
		pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->rpid);
		return -1;
	}

	ctl->map_length = round_up(size, page_size());

	fd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "map_files/%p-%p",
		 ctl->remote_map, ctl->remote_map + ctl->map_length);
	if (fd < 0)
		return -1;

	ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE,
			      MAP_SHARED | MAP_FILE, fd, 0);
	/* the descriptor is not needed once mmap() has been attempted */
	close(fd);

	if (ctl->local_map == MAP_FAILED) {
		ctl->local_map = NULL;
		pr_perror("Can't map remote parasite map");
		return -1;
	}

	return 0;
}

/*
 * Preferred way to share memory with the tracee: create a memfd inside the
 * tracee and map it on both sides.
 *
 * Returns 0 on success, 1 when memfd cannot be used (INFECT_NO_MEMFD is
 * set or the syscall result is -ENOSYS) so the caller may fall back, and
 * a negative value on error.
 */
static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size)
{
	/* the memfd name is placed right after the syscall blob area */
	void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE;
	uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME;
	pid_t pid = ctl->rpid;
	long sret = -ENOSYS;
	int ret, fd, lfd;
	bool __maybe_unused compat_task = !compel_mode_native(ctl);

	if (ctl->ictx.flags & INFECT_NO_MEMFD)
		return 1;

	BUILD_BUG_ON(sizeof(orig_code) < sizeof(long));

	/* after the swap orig_code holds the tracee's original bytes */
	if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) {
		pr_err("Can't inject memfd args (pid: %d)\n", pid);
		return -1;
	}

	ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret,
			     (unsigned long)where, 0, 0, 0, 0, 0);

	if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) {
		/* don't leave a stray memfd behind in the tracee */
		fd = (int)(long)sret;
		if (fd >= 0)
			compel_syscall(ctl, __NR(close, compat_task), &sret,
					fd, 0, 0, 0, 0, 0);
		pr_err("Can't restore memfd args (pid: %d)\n", pid);
		return -1;
	}

	if (ret < 0)
		return ret;

	fd = (int)(long)sret;
	if (fd == -ENOSYS)
return 1; if (fd < 0) { errno = -fd; pr_perror("Can't create memfd in victim"); return fd; } ctl->map_length = round_up(size, page_size()); lfd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "fd/%d", fd); if (lfd < 0) goto err_cure; if (ftruncate(lfd, ctl->map_length) < 0) { pr_perror("Fail to truncate memfd for parasite"); goto err_cure; } ctl->remote_map = remote_mmap(ctl, NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FILE | MAP_SHARED, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; } ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FILE, lfd, 0); if (ctl->local_map == MAP_FAILED) { ctl->local_map = NULL; pr_perror("Can't lmap memfd for parasite blob"); goto err_curef; } compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); close(lfd); pr_info("Set up parasite blob using memfd\n"); return 0; err_curef: close(lfd); err_cure: compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); return -1; } void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs) { size_t i, j; for (i = 0, j = 0; i < nr_relocs; i++) { if (elf_relocs[i].type & COMPEL_TYPE_LONG) { long *where = mem + elf_relocs[i].offset; long *p = mem + size; if (elf_relocs[i].type & COMPEL_TYPE_GOTPCREL) { int *value = (int *)where; int rel; p[j] = (long)vbase + elf_relocs[i].value; rel = (unsigned)((void *)&p[j] - (void *)mem) - elf_relocs[i].offset + elf_relocs[i].addend; *value = rel; j++; } else *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase; } else if (elf_relocs[i].type & COMPEL_TYPE_INT) { int *where = (mem + elf_relocs[i].offset); *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase; } else BUG(); } } static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) { int ret; ret = parasite_memfd_exchange(ctl, size); if (ret == 1) { pr_info("MemFD parasite doesn't work, goto legacy 
mmap\n");
		ret = parasite_mmap_exchange(ctl, size);
	}
	return ret;
}

/* Size of the parasite blob rounded up to a whole number of pages. */
static inline unsigned long total_pie_size(size_t blob_size)
{
	return round_up(blob_size, page_size());
}

/*
 * Inject the parasite blob described by ctl->pblob into the tracee and
 * start it as a daemon.
 *
 * The shared area is laid out as: blob (page-rounded), arguments
 * (@args_size rounded up to PAGE_SIZE), sigframe, stack and, when
 * @nr_threads > 1, one more stack for threads.
 *
 * Returns 0 on success, -1 on failure.
 */
int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size)
{
	int ret;
	unsigned long p, map_exchange_size, parasite_size = 0;

	if (ctl->pblob.parasite_type != COMPEL_BLOB_CHEADER)
		goto err;

	if (ctl->ictx.log_fd < 0)
		goto err;

	if (!arch_can_dump_task(ctl))
		goto err;

	/*
	 * Inject a parasite engine. Ie allocate memory inside alien
	 * space and copy engine code there. Then re-map the engine
	 * locally, so we will get an easy way to access engine memory
	 * without using ptrace at all.
	 */

	parasite_size = total_pie_size(ctl->pblob.hdr.bsize);

	ctl->args_size = round_up(args_size, PAGE_SIZE);
	parasite_size += ctl->args_size;

	map_exchange_size = parasite_size;
	map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE;
	if (nr_threads > 1)
		map_exchange_size += PARASITE_STACK_SIZE;

	ret = compel_map_exchange(ctl, map_exchange_size);
	if (ret)
		goto err;

	pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map);

	/* entry point is a tracee address; cmd/args are accessed locally */
	ctl->parasite_ip = (unsigned long)(ctl->remote_map + ctl->pblob.hdr.parasite_ip_off);
	ctl->addr_cmd = ctl->local_map + ctl->pblob.hdr.addr_cmd_off;
	ctl->addr_args = ctl->local_map + ctl->pblob.hdr.addr_arg_off;

	memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize);
	if (ctl->pblob.hdr.nr_relocs)
		compel_relocs_apply(ctl->local_map, ctl->remote_map, ctl->pblob.hdr.bsize,
				    ctl->pblob.hdr.relocs, ctl->pblob.hdr.nr_relocs);

	p = parasite_size;

	ctl->rsigframe	= ctl->remote_map + p;
	ctl->sigframe	= ctl->local_map  + p;

	/* rstack is set past the stack area, i.e. at its highest address */
	p += RESTORE_STACK_SIGFRAME;
	p += PARASITE_STACK_SIZE;
	ctl->rstack = ctl->remote_map + p;

	if (nr_threads > 1) {
		p += PARASITE_STACK_SIZE;
		ctl->r_thread_stack = ctl->remote_map + p;
	}

	ret = arch_fetch_sas(ctl, ctl->rsigframe);
	if (ret) {
		pr_err("Can't fetch sigaltstack for task %d (ret %d)\n",
		       ctl->rpid, ret);
		goto err;
	}

	if
(parasite_start_daemon(ctl))
		goto err;

	return 0;

err:
	return -1;
}

/*
 * Allocate a per-thread control block for thread @pid of the task
 * controlled by @ctl and save the thread's current sigmask and registers
 * in it. Returns NULL on allocation or ptrace failure.
 */
struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid)
{
	struct parasite_thread_ctl *tctl;

	tctl = xmalloc(sizeof(*tctl));
	if (tctl) {
		if (prepare_thread(pid, &tctl->th)) {
			xfree(tctl);
			tctl = NULL;
		} else {
			tctl->tid = pid;
			tctl->ctl = ctl;
		}
	}

	return tctl;
}

/*
 * Save the signal mask and the registers of task @pid into @ctx.
 * Returns 0 on success, -1 if either ptrace request fails.
 */
static int prepare_thread(int pid, struct thread_ctx *ctx)
{
	if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) {
		pr_perror("can't get signal blocking mask for %d", pid);
		return -1;
	}

	if (ptrace_get_regs(pid, &ctx->regs)) {
		pr_perror("Can't obtain registers (pid: %d)", pid);
		return -1;
	}

	return 0;
}

/* Free a control block obtained from compel_prepare_thread(). */
void compel_release_thread(struct parasite_thread_ctl *tctl)
{
	/*
	 * No stuff to cure in thread here, all routines leave the
	 * guy intact (for now)
	 */
	xfree(tctl);
}

/*
 * Allocate a zeroed control block for task @pid and save the task's
 * sigmask and registers in it. The infect context is left for the caller
 * to fill in, apart from log_fd which is set to -1.
 *
 * Returns the block, or NULL on failure.
 */
struct parasite_ctl *compel_prepare_noctx(int pid)
{
	struct parasite_ctl *ctl = NULL;

	/*
	 * Control block early setup.
	 */
	ctl = xzalloc(sizeof(*ctl));
	if (!ctl) {
		pr_err("Parasite control block allocation failed (pid: %d)\n", pid);
		goto err;
	}

	ctl->tsock = -1;
	ctl->ictx.log_fd = -1;

	if (prepare_thread(pid, &ctl->orig))
		goto err;

	ctl->rpid = pid;

	/* the start area must hold the syscall blob plus the memfd name */
	BUILD_BUG_ON(PARASITE_START_AREA_MIN < BUILTIN_SYSCALL_SIZE + MEMFD_FNAME_SZ);

	return ctl;

err:
	xfree(ctl);
	return NULL;
}

/*
 * Find first executable VMA that would fit the initial
 * syscall injection.
*/ static unsigned long find_executable_area(int pid) { char aux[128]; FILE *f; unsigned long ret = (unsigned long)MAP_FAILED; sprintf(aux, "/proc/%d/maps", pid); f = fopen(aux, "r"); if (!f) goto out; while (fgets(aux, sizeof(aux), f)) { unsigned long start, end; char *f; start = strtoul(aux, &f, 16); end = strtoul(f + 1, &f, 16); /* f now points at " rwx" (yes, with space) part */ if (f[3] == 'x') { BUG_ON(end - start < PARASITE_START_AREA_MIN); ret = start; break; } } fclose(f); out: return ret; } /* * This routine is to create PF_UNIX/SOCK_SEQPACKET socket * in the target net namespace */ static int make_sock_for(int pid) { int ret, mfd, fd, sk = -1; char p[32]; pr_debug("Preparing seqsk for %d\n", pid); sprintf(p, "/proc/%d/ns/net", pid); fd = open(p, O_RDONLY); if (fd < 0) { pr_perror("Can't open %p", p); goto out; } mfd = open("/proc/self/ns/net", O_RDONLY); if (mfd < 0) { pr_perror("Can't open self netns"); goto out_c; } if (setns(fd, CLONE_NEWNET)) { pr_perror("Can't setup target netns"); goto out_cm; } sk = socket(PF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK, 0); if (sk < 0) pr_perror("Can't create seqsk"); ret = setns(mfd, CLONE_NEWNET); if (ret) { pr_perror("Can't restore former netns"); if (sk >= 0) close(sk); sk = -1; } out_cm: close(mfd); out_c: close(fd); out: return sk; } static int simple_open_proc(int pid, int mode, const char *fmt, ...) 
{ int l; char path[128]; va_list args; l = sprintf(path, "/proc/%d/", pid); va_start(args, fmt); vsnprintf(path + l, sizeof(path) - l, fmt, args); va_end(args); return open(path, mode); } static void handle_sigchld(int signal, siginfo_t *siginfo, void *data) { int pid, status; pid = waitpid(-1, &status, WNOHANG); if (pid <= 0) return; pr_err("si_code=%d si_pid=%d si_status=%d\n", siginfo->si_code, siginfo->si_pid, siginfo->si_status); if (WIFEXITED(status)) pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status)); else if (WIFSIGNALED(status)) pr_err("%d was killed by %d unexpectedly: %s\n", pid, WTERMSIG(status), strsignal(WTERMSIG(status))); else if (WIFSTOPPED(status)) pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status)); /* FIXME Should we exit? */ /* exit(1); */ } struct plain_regs_struct { user_regs_struct_t regs; user_fpregs_struct_t fpregs; }; static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; prs->regs = *r; prs->fpregs = *f; return 0; } #ifndef RT_SIGFRAME_UC_SIGMASK #define RT_SIGFRAME_UC_SIGMASK(sigframe) \ (k_rtsigset_t*)(void *)&RT_SIGFRAME_UC(sigframe)->uc_sigmask #endif static int make_sigframe_plain(void *from, struct rt_sigframe *f, struct rt_sigframe *rtf, k_rtsigset_t *b) { struct plain_regs_struct *prs = from; k_rtsigset_t *blk_sigset; /* * Make sure it's zeroified. */ memset(f, 0, sizeof(*f)); if (sigreturn_prep_regs_plain(f, &prs->regs, &prs->fpregs)) return -1; blk_sigset = RT_SIGFRAME_UC_SIGMASK(f); if (b) memcpy(blk_sigset, b, sizeof(k_rtsigset_t)); else memset(blk_sigset, 0, sizeof(k_rtsigset_t)); if (RT_SIGFRAME_HAS_FPU(f)) { if (sigreturn_prep_fpu_frame_plain(f, rtf)) return -1; } /* * FIXME What about sas? 
* setup_sas(sigframe, core->thread_core->sas); */ return 0; } struct parasite_ctl *compel_prepare(int pid) { struct parasite_ctl *ctl; struct infect_ctx *ictx; ctl = compel_prepare_noctx(pid); if (ctl == NULL) goto out; ictx = &ctl->ictx; ictx->task_size = compel_task_size(); ictx->open_proc = simple_open_proc; ictx->syscall_ip = find_executable_area(pid); ictx->child_handler = handle_sigchld; sigaction(SIGCHLD, NULL, &ictx->orig_handler); ictx->save_regs = save_regs_plain; ictx->make_sigframe = make_sigframe_plain; ictx->regs_arg = xmalloc(sizeof(struct plain_regs_struct)); if (ictx->regs_arg == NULL) goto err; if (ictx->syscall_ip == (unsigned long)MAP_FAILED) goto err; ictx->sock = make_sock_for(pid); if (ictx->sock < 0) goto err; out: return ctl; err: xfree(ictx->regs_arg); xfree(ctl); ctl = NULL; goto out; } static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) { void *addr = (void *) REG_IP(*regs); return addr >= ctl->remote_map && addr < ctl->remote_map + ctl->map_length; } static int parasite_fini_seized(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 0; enum trace_flags flag; /* stop getting chld from parasite -- we're about to step-by-step it */ if (restore_child_handler(ctl)) return -1; /* Start to trace syscalls for each thread */ if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) { pr_perror("Unable to interrupt the process"); return -1; } pr_debug("Waiting for %d to trap\n", pid); if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("Waited pid mismatch (pid: %d)", pid); return -1; } pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { pr_err("Task is still running (pid: %d)\n", pid); return -1; } ret = ptrace_get_regs(pid, ®s); if (ret) { pr_perror("Unable to get registers"); return -1; } if (!task_in_parasite(ctl, ®s)) { pr_err("The task is not in parasite code\n"); return -1; } ret = compel_rpc_call(PARASITE_CMD_FINI, ctl); close_safe(&ctl->tsock); if 
(ret) return -1; /* Go to sigreturn as closer as we can */ ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag)) return -1; if (ptrace_flush_breakpoints(pid)) return -1; /* * All signals are unblocked now. The kernel notifies about leaving * syscall before starting to deliver signals. All parasite code are * executed with blocked signals, so we can sefly unmap a parasite blob. */ return 0; } int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { /* * Looks like a previous attempt failed, we should do * nothing in this case. parasite will try to cure itself. */ if (ctl->tsock < 0) return -1; if (parasite_fini_seized(ctl)) { close_safe(&ctl->tsock); return -1; } } ctl->daemonized = false; return 0; } int compel_cure_remote(struct parasite_ctl *ctl) { long ret; if (compel_stop_daemon(ctl)) return -1; if (!ctl->remote_map) return 0; compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, (unsigned long)ctl->remote_map, ctl->map_length, 0, 0, 0, 0); if (ret) { pr_err("munmap for remote map %p, %lu returned %lu\n", ctl->remote_map, ctl->map_length, ret); return -1; } return 0; } int compel_cure_local(struct parasite_ctl *ctl) { int ret = 0; if (ctl->local_map) { if (munmap(ctl->local_map, ctl->map_length)) { pr_err("munmap failed (pid: %d)\n", ctl->rpid); ret = -1; } } free(ctl); return ret; } int compel_cure(struct parasite_ctl *ctl) { int ret; ret = compel_cure_remote(ctl); if (!ret) ret = compel_cure_local(ctl); return ret; } void *compel_parasite_args_p(struct parasite_ctl *ctl) { return ctl->addr_args; } void *compel_parasite_args_s(struct parasite_ctl *ctl, int args_size) { BUG_ON(args_size > ctl->args_size); return compel_parasite_args_p(ctl); } int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) { int pid = tctl->tid; struct parasite_ctl *ctl = tctl->ctl; struct 
thread_ctx *octx = &tctl->th; void *stack = ctl->r_thread_stack; user_regs_struct_t regs = octx->regs; int ret; *ctl->addr_cmd = cmd; ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx); if (ret == 0) ret = parasite_trap(ctl, pid, ®s, octx); if (ret == 0) ret = (int)REG_RES(regs); if (ret) pr_err("Parasite exited with %d\n", ret); return ret; } /* * compel_unmap() is used for unmapping parasite and restorer blobs. * A blob can contain code for unmapping itself, so the porcess is * trapped on the exit from the munmap syscall. */ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) { user_regs_struct_t regs = ctl->orig.regs; pid_t pid = ctl->rpid; int ret = -1; ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig); if (ret) goto err; ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1), TRACE_ENTER); if (restore_thread_ctx(pid, &ctl->orig)) ret = -1; err: return ret; } int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) { int ret; if (no_bp) { pr_debug("Force no-breakpoints restore\n"); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); if (ret < 0) return ret; if (ret > 0) { /* * PIE will stop on a breakpoint, next * stop after that will be syscall enter. */ *tf = TRACE_EXIT; return 0; } /* * No breakpoints available -- start tracing it * in a per-syscall manner. 
*/ ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the %d process", pid); return -1; } *tf = TRACE_ENTER; return 0; } static bool task_is_trapped(int status, pid_t pid) { if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); if (WIFEXITED(status)) pr_err("Task exited with %d\n", WEXITSTATUS(status)); if (WIFSIGNALED(status)) pr_err("Task signaled with %d: %s\n", WTERMSIG(status), strsignal(WTERMSIG(status))); if (WIFSTOPPED(status)) pr_err("Task stopped with %d: %s\n", WSTOPSIG(status), strsignal(WSTOPSIG(status))); if (WIFCONTINUED(status)) pr_err("Task continued\n"); return false; } static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const int sys_nr, const int sys_nr_compat) { const char *mode = user_regs_native(regs) ? "native" : "compat"; int req_sysnr = user_regs_native(regs) ? sys_nr : sys_nr_compat; pr_debug("%d (%s) is going to execute the syscall %lu, required is %d\n", pid, mode, REG_SYSCALL_NR(*regs), req_sysnr); return (REG_SYSCALL_NR(*regs) == req_sysnr); } /* * Trap tasks on the exit from the specified syscall * * tasks - number of processes, which should be trapped * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, enum trace_flags trace) { user_regs_struct_t regs; int status, ret; pid_t pid; if (tasks > 1) trace = TRACE_ALL; /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); if (pid == -1) { pr_perror("wait4 failed"); return -1; } if (!task_is_trapped(status, pid)) return -1; pr_debug("%d was trapped\n", pid); if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); goto goon; } if (trace == TRACE_ENTER) trace = TRACE_EXIT; ret = ptrace_get_regs(pid, ®s); if (ret) { pr_perror("ptrace"); return -1; 
} if (is_required_syscall(®s, pid, sys_nr, sys_nr_compat)) { /* * The process is going to execute the required syscall, * the next stop will be on the exit from this syscall */ ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); if (ret) { pr_perror("ptrace"); return -1; } pid = wait4(pid, &status, __WALL, NULL); if (pid == -1) { pr_perror("wait4 failed"); return -1; } if (!task_is_trapped(status, pid)) return -1; pr_debug("%d was stopped\n", pid); tasks--; continue; } goon: ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); if (ret) { pr_perror("ptrace"); return -1; } } return 0; } int compel_mode_native(struct parasite_ctl *ctl) { return user_regs_native(&ctl->orig.regs); } static inline k_rtsigset_t *thread_ctx_sigmask(struct thread_ctx *tctx) { return &tctx->sigmask; } k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl) { return thread_ctx_sigmask(&tctl->th); } k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl) { return thread_ctx_sigmask(&ctl->orig); } int compel_get_thread_regs(struct parasite_thread_ctl *tctl, save_regs_t save, void * arg) { return get_task_regs(tctl->tid, &tctl->th.regs, save, arg); } struct infect_ctx *compel_infect_ctx(struct parasite_ctl *ctl) { return &ctl->ictx; } struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *ctl) { return &ctl->pblob; } criu-3.6/compel/src/lib/log-host.c000077700000000000000000000000001317335042600177642log.custar00rootroot00000000000000criu-3.6/compel/src/lib/log.c000066400000000000000000000013111317335042600160650ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "log.h" static unsigned int current_loglevel = COMPEL_DEFAULT_LOGLEVEL; static compel_log_fn logfn; void compel_log_init(compel_log_fn log_fn, unsigned int level) { logfn = log_fn; current_loglevel = level; } unsigned int compel_log_get_loglevel(void) { return current_loglevel; } void compel_print_on_level(unsigned int loglevel, const char *format, ...) 
{ va_list params; compel_log_fn fn = logfn; if (fn != NULL && !pr_quelled(loglevel)) { va_start(params, format); fn(loglevel, format, params); va_end(params); } } criu-3.6/compel/src/lib/ptrace.c000066400000000000000000000040431317335042600165670ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/compiler.h" #include "uapi/compel/asm/infect-types.h" #include "ptrace.h" #include "log.h" int ptrace_suspend_seccomp(pid_t pid) { if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { pr_perror("suspending seccomp failed"); return -1; } return 0; } int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes) { unsigned long w; if (bytes & (sizeof(long) - 1)) return -1; for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *d = dst, *a = addr; d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); if (d[w] == -1U && errno) goto err; } return 0; err: return -2; } int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes) { unsigned long w; if (bytes & (sizeof(long) - 1)) return -1; for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *s = src, *a = addr; if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) goto err; } return 0; err: return -2; } /* don't swap big space, it might overflow the stack */ int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes) { void *t = alloca(bytes); if (ptrace_peek_area(pid, t, dst, bytes)) return -1; if (ptrace_poke_area(pid, src, dst, bytes)) { if (ptrace_poke_area(pid, t, dst, bytes)) return -2; return -1; } memcpy(src, t, bytes); return 0; } int __attribute__((weak)) ptrace_get_regs(int pid, user_regs_struct_t *regs) { struct iovec iov; iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); } int __attribute__((weak)) ptrace_set_regs(int pid, user_regs_struct_t *regs) { struct iovec iov; iov.iov_base = 
regs; iov.iov_len = sizeof(user_regs_struct_t); return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } criu-3.6/compel/src/main-host.c000077700000000000000000000000001317335042600175242main.custar00rootroot00000000000000criu-3.6/compel/src/main.c000066400000000000000000000214321317335042600154700ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "uapi/compel/compel.h" #include "version.h" #include "piegen.h" #include "log.h" #define CFLAGS_DEFAULT_SET \ "-Wstrict-prototypes " \ "-fno-stack-protector -nostdlib -fomit-frame-pointer " #define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" #define COMPEL_CFLAGS_NOPIC CFLAGS_DEFAULT_SET "-fno-pic" #ifdef NO_RELOCS #define COMPEL_LDFLAGS_COMMON "-z noexecstack -T " #else #define COMPEL_LDFLAGS_COMMON "-r -z noexecstack -T " #endif typedef struct { const char *arch; // dir name under arch/ const char *cflags; const char *cflags_compat; } flags_t; static const flags_t flags = { #if defined CONFIG_X86_64 .arch = "x86", .cflags = COMPEL_CFLAGS_PIE, .cflags_compat = COMPEL_CFLAGS_NOPIC, #elif defined CONFIG_AARCH64 .arch = "aarch64", .cflags = COMPEL_CFLAGS_PIE, #elif defined(CONFIG_ARMV6) || defined(CONFIG_ARMV7) .arch = "arm", .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_PPC64 .arch = "ppc64", .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_S390 .arch = "s390", .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif }; piegen_opt_t opts = {}; const char *uninst_root; static int piegen(void) { struct stat st; void *mem; int fd, ret = -1; fd = open(opts.input_filename, O_RDONLY); if (fd < 0) { pr_perror("Can't open file %s", opts.input_filename); return -1; } if (fstat(fd, &st)) { pr_perror("Can't stat file %s", opts.input_filename); goto err; } opts.fout = fopen(opts.output_filename, "w"); if (opts.fout == NULL) { pr_perror("Can't open %s", opts.output_filename); goto err; } mem = 
mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0);
	if (mem == MAP_FAILED) {
		pr_perror("Can't mmap file %s", opts.input_filename);
		goto err;
	}

	if (handle_binary(mem, st.st_size)) {
		/* don't leave a partial header behind */
		close(fd), fd = -1;
		unlink(opts.output_filename);
		goto err;
	}

	ret = 0;

err:
	if (fd >= 0)
		close(fd);
	if (opts.fout)
		fclose(opts.fout);
	if (!ret)
		pr_info("%s generated successfully.\n", opts.output_filename);
	return ret;
}

/*
 * Log callback for the command-line tool: errors and warnings go to
 * stderr, everything else to stdout; quelled levels are dropped.
 */
static void cli_log(unsigned int lvl, const char *fmt, va_list parms)
{
	FILE *f = stdout;

	if (pr_quelled(lvl))
		return;

	if ((lvl == COMPEL_LOG_ERROR) || (lvl == COMPEL_LOG_WARN))
		f = stderr;

	vfprintf(f, fmt, parms);
}

/*
 * Print the usage text (to stdout when @rc is 0, to stderr otherwise)
 * and return @rc so callers can write "return usage(rc);".
 */
static int usage(int rc) {
	FILE *out = (rc == 0) ? stdout : stderr;

	fprintf(out,
"Usage:\n"
" compel [--compat] includes | cflags | ldflags\n"
" compel plugins [PLUGIN_NAME ...]\n"
" compel [--compat] [--static] libs\n"
" compel -f FILE -o FILE [-p NAME] [-l N] hgen\n"
" -f, --file FILE input (parasite object) file name\n"
" -o, --output FILE output (header) file name\n"
" -p, --prefix NAME prefix for var names\n"
" -l, --log-level NUM log level (default: %d)\n"
" compel -h|--help\n"
" compel -V|--version\n"
, COMPEL_DEFAULT_LOGLEVEL
);

	return rc;
}

/*
 * Print the -I option needed to find compel headers, or nothing when
 * INCLUDEDIR is one of the standard include directories.
 */
static void print_includes(void)
{
	int i;
	/* list of standard include dirs (built into C preprocessor) */
	const char *standard_includes[] = {
		"/usr/include",
		"/usr/local/include",
	};

	/* I am not installed, called via a wrapper */
	if (uninst_root) {
		printf("-I %s/include/uapi\n", uninst_root);
		return;
	}

	/* I am installed
	 * Make sure to not print banalities */
	for (i = 0; i < ARRAY_SIZE(standard_includes); i++)
		if (strcmp(INCLUDEDIR, standard_includes[i]) == 0)
			return;

	/* Finally, print our non-standard include path */
	printf("%s\n", "-I " INCLUDEDIR);
}

/*
 * Print the compiler flags followed by the include option.
 * NOTE(review): flags.cflags_compat is only set in the x86 entry of
 * `flags`, so @compat on another architecture would pass NULL to "%s" --
 * confirm whether callers can request that.
 */
static void print_cflags(bool compat)
{
	printf("%s\n", compat ? flags.cflags_compat : flags.cflags);
	print_includes();
}

static void print_ldflags(bool compat)
{
	const char *compat_str = (compat) ?
"-compat" : ""; printf("%s", COMPEL_LDFLAGS_COMMON); if (uninst_root) { printf("%s/arch/%s/scripts/compel-pack%s.lds.S\n", uninst_root, flags.arch, compat_str); } else { printf("%s/compel/scripts/compel-pack%s.lds.S\n", LIBEXECDIR, compat_str); } } static void print_plugin(const char *name) { const char suffix[] = ".lib.a"; if (uninst_root) printf("%s/plugins/%s%s\n", uninst_root, name, suffix); else printf("%s/compel/%s%s\n", LIBEXECDIR, name, suffix); } static void print_plugins(char *const list[]) { char *builtin_list[] = { "std", NULL }; char **p = builtin_list; while (*p != NULL) print_plugin(*p++); while (*list != NULL) print_plugin(*list++); } static int print_libs(bool is_static) { if (uninst_root) { if (!is_static) { fprintf(stderr, "Compel is not installed, can " "only link with static libraries " "(use --static)\n"); return 1; } printf("%s/%s\n", uninst_root, STATIC_LIB); } else { printf("%s/%s\n", LIBDIR, (is_static) ? STATIC_LIB : DYN_LIB); } return 0; } /* Extracts the file name (removing directory path and suffix, * and checks the result for being a valid C identifier * (replacing - with _ along the way). * * If everything went fine, return the resulting string, * otherwise NULL. * * Example: get_prefix("./some/path/to/file.c") ==> "file" */ static char *gen_prefix(const char *path) { const char *p1 = NULL, *p2 = NULL; size_t len; int i; char *p, *ret; len = strlen(path); if (len == 0) return NULL; // Find the last slash (p1) // and the first dot after it (p2) for (i = len - 1; i >= 0; i--) { if (!p1 && path[i] == '.') { p2 = path + i - 1; } else if (!p1 && path[i] == '/') { p1 = path + i + 1; break; } } if (!p1) // no slash in path p1 = path; if (!p2) // no dot (after slash) p2 = path + len; len = p2 - p1 + 1; if (len < 1) return NULL; ret = strndup(p1, len); // Now, check if we got a valid C identifier. We don't need to care // about C reserved keywords, as this is only used as a prefix. 
for (p = ret; *p != '\0'; p++) { if (isalpha(*p)) continue; // digit is fine, except the first character if (isdigit(*p) && p > ret) continue; // only allowed special character is _ if (*p == '_') continue; // as a courtesy, replace - with _ if (*p == '-') { *p = '_'; continue; } // invalid character! free(ret); return NULL; } return ret; } int main(int argc, char *argv[]) { int log_level = COMPEL_DEFAULT_LOGLEVEL; bool compat = false; bool is_static = false; int opt, idx; char *action; static const char short_opts[] = "csf:o:p:hVl:"; static struct option long_opts[] = { { "compat", no_argument, 0, 'c' }, { "static", no_argument, 0, 's' }, { "file", required_argument, 0, 'f' }, { "output", required_argument, 0, 'o' }, { "prefix", required_argument, 0, 'p' }, { "help", no_argument, 0, 'h' }, { "version", no_argument, 0, 'V' }, { "log-level", required_argument, 0, 'l' }, { }, }; uninst_root = getenv("COMPEL_UNINSTALLED_ROOTDIR"); while (1) { idx = -1; opt = getopt_long(argc, argv, short_opts, long_opts, &idx); if (opt == -1) break; switch (opt) { case 'c': compat = true; break; case 's': is_static = true; break; case 'f': opts.input_filename = optarg; break; case 'o': opts.output_filename = optarg; break; case 'p': opts.prefix = optarg; break; case 'l': log_level = atoi(optarg); break; case 'h': return usage(0); case 'V': printf("Version: %d.%d.%d\n", COMPEL_SO_VERSION_MAJOR, COMPEL_SO_VERSION_MINOR, COMPEL_SO_VERSION_SUBLEVEL); exit(0); break; default: // '?' 
// error message already printed by getopt_long() return usage(1); break; } } if (optind >= argc) { fprintf(stderr, "Error: action argument required\n"); return usage(1); } action = argv[optind++]; if (!strcmp(action, "includes")) { print_includes(); return 0; } if (!strcmp(action, "cflags")) { print_cflags(compat); return 0; } if (!strcmp(action, "ldflags")) { print_ldflags(compat); return 0; } if (!strcmp(action, "plugins")) { print_plugins(argv + optind); return 0; } if (!strcmp(action, "libs")) { return print_libs(is_static); } if (!strcmp(action, "hgen")) { if (!opts.input_filename) { fprintf(stderr, "Error: option --file required\n"); return usage(1); } if (!opts.output_filename) { fprintf(stderr, "Error: option --output required\n"); return usage(1); } if (!opts.prefix) { // prefix not provided, let's autogenerate opts.prefix = gen_prefix(opts.input_filename); if (!opts.prefix) opts.prefix = gen_prefix(opts.output_filename); if (!opts.prefix) { fprintf(stderr, "Error: can't autogenerate " "prefix (supply --prefix)"); return 2; } } compel_log_init(&cli_log, log_level); return piegen(); } fprintf(stderr, "Error: unknown action '%s'\n", action); return usage(1); } criu-3.6/compel/test/000077500000000000000000000000001317335042600145665ustar00rootroot00000000000000criu-3.6/compel/test/fdspy/000077500000000000000000000000001317335042600157135ustar00rootroot00000000000000criu-3.6/compel/test/fdspy/.gitignore000066400000000000000000000000421317335042600176770ustar00rootroot00000000000000parasite.h parasite.po spy victim criu-3.6/compel/test/fdspy/Makefile000066400000000000000000000010701317335042600173510ustar00rootroot00000000000000CC := gcc CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host all: victim spy clean: rm -f victim rm -f spy rm -f parasite.h rm -f parasite.po rm -f parasite.o victim: victim.c $(CC) $(CFLAGS) -o $@ $^ spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) parasite.h: 
parasite.po $(COMPEL) hgen -o $@ -f $< parasite.po: parasite.o ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins fds) parasite.o: parasite.c $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ criu-3.6/compel/test/fdspy/parasite.c000066400000000000000000000006701317335042600176720ustar00rootroot00000000000000#include #include #include /* * Stubs for std compel plugin. */ int compel_main(void *arg_p, unsigned int arg_s) { return 0; } int parasite_trap_cmd(int cmd, void *args) { return 0; } void parasite_cleanup(void) { } #define PARASITE_CMD_GETFD PARASITE_USER_CMDS int parasite_daemon_cmd(int cmd, void *args) { if (cmd == PARASITE_CMD_GETFD) fds_send_fd(2); return 0; } criu-3.6/compel/test/fdspy/spy.c000066400000000000000000000067571317335042600167110ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "parasite.h" #define PARASITE_CMD_GETFD PARASITE_USER_CMDS static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); vprintf(fmt, parms); } static int do_infection(int pid, int *stolen_fd) { #define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) int state; struct parasite_ctl *ctl; struct infect_ctx *ictx; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); printf("Stopping task\n"); state = compel_stop_task(pid); if (state < 0) err_and_ret("Can't stop task"); printf("Preparing parasite ctl\n"); ctl = compel_prepare(pid); if (!ctl) err_and_ret("Can't prepare for infection"); printf("Configuring contexts\n"); /* * First -- the infection context. Most of the stuff * is already filled by compel_prepare(), just set the * log descriptor for parasite side, library cannot * live w/o it. 
*/ ictx = compel_infect_ctx(ctl); ictx->log_fd = STDERR_FILENO; parasite_setup_c_header(ctl); printf("Infecting\n"); if (compel_infect(ctl, 1, sizeof(int))) err_and_ret("Can't infect victim"); printf("Stealing fd\n"); if (compel_rpc_call(PARASITE_CMD_GETFD, ctl)) err_and_ret("Can't run cmd"); if (compel_util_recv_fd(ctl, stolen_fd)) err_and_ret("Can't recv fd"); if (compel_rpc_sync(PARASITE_CMD_GETFD, ctl)) err_and_ret("Con't finalize cmd"); printf("Stole %d fd\n", *stolen_fd); /* * Done. Cure and resume the task. */ printf("Curing\n"); if (compel_cure(ctl)) err_and_ret("Can't cure victim"); if (compel_resume_task(pid, state, state)) err_and_ret("Can't unseize task"); printf("Done\n"); return 0; } static int check_pipe_ends(int wfd, int rfd) { struct stat r, w; char aux[4] = "0000"; printf("Check pipe ends are at hands\n"); if (fstat(wfd, &w) < 0) { perror("Can't stat wfd"); return 0; } if (fstat(rfd, &r) < 0) { perror("Can't stat rfd"); return 0; } if (w.st_dev != r.st_dev || w.st_ino != r.st_ino) { perror("Pipe's not the same"); return 0; } printf("Check pipe ends are connected\n"); write(wfd, "1", 2); read(rfd, aux, sizeof(aux)); if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); return 0; } return 1; } int main(int argc, char **argv) { int p_in[2], p_out[2], p_err[2], pid, pass = 1, stolen_fd = -1; /* * Prepare IO-s and fork the victim binary */ if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { perror("Can't make pipe"); return -1; } printf("Run the victim\n"); pid = vfork(); if (pid == 0) { close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } close(p_in[0]); close(p_out[1]); close(p_err[1]); /* * Now do the infection with parasite.c */ printf("Infecting the victim\n"); if (do_infection(pid, &stolen_fd)) return 1; /* * Stop the victim and check the infection went well */ 
printf("Closing victim stdin\n"); close(p_in[1]); printf("Waiting for victim to die\n"); wait(NULL); printf("Checking the result\n"); /* * Stolen fd is the stderr of the task * Check these are the ends of the same pipe * and message passing works OK */ pass = check_pipe_ends(stolen_fd, p_err[0]); if (pass) printf("All OK\n"); else printf("Something went WRONG\n"); return 0; } criu-3.6/compel/test/fdspy/victim.c000066400000000000000000000002031317335042600173450ustar00rootroot00000000000000#include int main(int argc, char **argv) { int i, aux; do { i = read(0, &aux, 1); } while (i > 0); return 0; } criu-3.6/compel/test/infect/000077500000000000000000000000001317335042600160365ustar00rootroot00000000000000criu-3.6/compel/test/infect/.gitignore000066400000000000000000000000421317335042600200220ustar00rootroot00000000000000parasite.h parasite.po spy victim criu-3.6/compel/test/infect/Makefile000066400000000000000000000010641317335042600174770ustar00rootroot00000000000000CC := gcc CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host all: victim spy clean: rm -f victim rm -f spy rm -f parasite.h rm -f parasite.po rm -f parasite.o victim: victim.c $(CC) $(CFLAGS) -o $@ $^ spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) parasite.h: parasite.po $(COMPEL) hgen -o $@ -f $< parasite.po: parasite.o ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) parasite.o: parasite.c $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ criu-3.6/compel/test/infect/parasite.c000066400000000000000000000010671317335042600200160ustar00rootroot00000000000000#include #include #include /* * Stubs for std compel plugin. 
*/ int parasite_trap_cmd(int cmd, void *args) { return 0; } void parasite_cleanup(void) { } #define PARASITE_CMD_INC PARASITE_USER_CMDS #define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 int parasite_daemon_cmd(int cmd, void *args) { int v; switch (cmd) { case PARASITE_CMD_INC: v = (*(int *)args) + 1; break; case PARASITE_CMD_DEC: v = (*(int *)args) - 1; break; default: v = -1; break; } sys_write(1, &v, sizeof(int)); return 0; } criu-3.6/compel/test/infect/spy.c000066400000000000000000000071761317335042600170300ustar00rootroot00000000000000#include #include #include #include #include #include "parasite.h" #define PARASITE_CMD_INC PARASITE_USER_CMDS #define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); vprintf(fmt, parms); } static int do_infection(int pid) { #define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) int state; struct parasite_ctl *ctl; struct infect_ctx *ictx; int *arg; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); printf("Stopping task\n"); state = compel_stop_task(pid); if (state < 0) err_and_ret("Can't stop task"); printf("Preparing parasite ctl\n"); ctl = compel_prepare(pid); if (!ctl) err_and_ret("Can't prepare for infection"); printf("Configuring contexts\n"); /* * First -- the infection context. Most of the stuff * is already filled by compel_prepare(), just set the * log descriptor for parasite side, library cannot * live w/o it. */ ictx = compel_infect_ctx(ctl); ictx->log_fd = STDERR_FILENO; parasite_setup_c_header(ctl); printf("Infecting\n"); if (compel_infect(ctl, 1, sizeof(int))) err_and_ret("Can't infect victim"); /* * Now get the area with arguments and run two * commands one by one. 
*/ arg = compel_parasite_args(ctl, int); printf("Running cmd 1\n"); *arg = 137; if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) err_and_ret("Can't run parasite command 1"); printf("Running cmd 2\n"); *arg = 404; if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) err_and_ret("Can't run parasite command 2"); /* * Done. Cure and resume the task. */ printf("Curing\n"); if (compel_cure(ctl)) err_and_ret("Can't cure victim"); if (compel_resume_task(pid, state, state)) err_and_ret("Can't unseize task"); printf("Done\n"); return 0; } static inline int chk(int fd, int val) { int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) return 0; printf("%d, want %d\n", v, val); return v == val; } int main(int argc, char **argv) { int p_in[2], p_out[2], p_err[2], pid, i, pass = 1; /* * Prepare IO-s and fork the victim binary */ if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { perror("Can't make pipe"); return -1; } pid = vfork(); if (pid == 0) { close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } close(p_in[0]); close(p_out[1]); close(p_err[1]); /* * Tell the little guy some numbers */ i = 1; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; i = 42; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; printf("Checking the victim alive\n"); pass = chk(p_out[0], 1); pass = chk(p_out[0], 42); if (!pass) return 1; /* * Now do the infection with parasite.c */ printf("Infecting the victim\n"); if (do_infection(pid)) return 1; /* * Tell the victim some more stuff to check it's alive */ i = 1234; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; i = 4096; if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) return 1; /* * Stop the victim and check the infection went well */ printf("Closing victim stdin\n"); close(p_in[1]); printf("Waiting for victim to die\n"); wait(NULL); printf("Checking the result\n"); /* These two came from 
parasite */ pass = chk(p_out[0], 138); pass = chk(p_out[0], 403); /* These two came from post-infect */ pass = chk(p_out[0], 1234); pass = chk(p_out[0], 4096); if (pass) printf("All OK\n"); else printf("Something went WRONG\n"); return 0; } criu-3.6/compel/test/infect/victim.c000066400000000000000000000003121317335042600174710ustar00rootroot00000000000000#include int main(int argc, char **argv) { int i; while (1) { if (read(0, &i, sizeof(i)) != sizeof(i)) break; if (write(1, &i, sizeof(i)) != sizeof(i)) break; } return 0; } criu-3.6/compel/test/rsys/000077500000000000000000000000001317335042600155665ustar00rootroot00000000000000criu-3.6/compel/test/rsys/.gitignore000066400000000000000000000000131317335042600175500ustar00rootroot00000000000000spy victim criu-3.6/compel/test/rsys/Makefile000066400000000000000000000004221317335042600172240ustar00rootroot00000000000000CC := gcc CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host all: victim spy clean: rm -f victim rm -f spy victim: victim.c $(CC) $(CFLAGS) -o $@ $^ spy: spy.c $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $^ $(shell $(COMPEL) --static libs) criu-3.6/compel/test/rsys/spy.c000066400000000000000000000053011317335042600165440ustar00rootroot00000000000000#include #include #include #include #include #include static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); vprintf(fmt, parms); } static int do_rsetsid(int pid) { #define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) int state; long ret; struct parasite_ctl *ctl; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); printf("Stopping task\n"); state = compel_stop_task(pid); if (state < 0) err_and_ret("Can't stop task"); printf("Preparing parasite ctl\n"); ctl = compel_prepare(pid); if (!ctl) err_and_ret("Can't prepare for infection"); ret = -1000; if (compel_syscall(ctl, __NR_getpid, &ret, 0, 0, 0, 0, 0, 0) < 0) err_and_ret("Can't run rgetpid"); printf("Remote getpid returned 
%ld\n", ret); if (ret != pid) err_and_ret("Pid mismatch!"); ret = -1000; if (compel_syscall(ctl, __NR_setsid, &ret, 0, 0, 0, 0, 0, 0) < 0) err_and_ret("Can't run rsetsid"); printf("Remote setsid returned %ld\n", ret); /* * Done. Cure and resume the task. */ printf("Curing\n"); if (compel_cure(ctl)) err_and_ret("Can't cure victim"); if (compel_resume_task(pid, state, state)) err_and_ret("Can't unseize task"); printf("Done\n"); return 0; } static inline int chk(int fd, int val) { int v = 0; read(fd, &v, sizeof(v)); printf("%d, want %d\n", v, val); return v == val; } int main(int argc, char **argv) { int p_in[2], p_out[2], p_err[2], pid, i, pass = 1, sid; /* * Prepare IO-s and fork the victim binary */ if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { perror("Can't make pipe"); return -1; } pid = vfork(); if (pid == 0) { close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } close(p_in[0]); close(p_out[1]); close(p_err[1]); sid = getsid(0); /* * Kick the victim once */ i = 0; write(p_in[1], &i, sizeof(i)); printf("Checking the victim session to be %d\n", sid); pass = chk(p_out[0], sid); if (!pass) return 1; /* * Now do the infection with parasite.c */ printf("Setsid() the victim\n"); if (do_rsetsid(pid)) return 1; /* * Kick the victim again so it tells new session */ write(p_in[1], &i, sizeof(i)); /* * Stop the victim and check the intrusion went well */ printf("Closing victim stdin\n"); close(p_in[1]); printf("Waiting for victim to die\n"); wait(NULL); printf("Checking the new session to be %d\n", pid); pass = chk(p_out[0], pid); if (pass) printf("All OK\n"); else printf("Something went WRONG\n"); return 0; } criu-3.6/compel/test/rsys/victim.c000066400000000000000000000003001317335042600172160ustar00rootroot00000000000000#include int main(int argc, char **argv) { int i; while (1) { if (read(0, &i, sizeof(i)) != sizeof(i)) 
break;

		i = getsid(0);
		write(1, &i, sizeof(i));
	}

	return 0;
}
criu-3.6/contrib/000077500000000000000000000000001317335042600137705ustar00rootroot00000000000000
criu-3.6/contrib/debian/000077500000000000000000000000001317335042600152125ustar00rootroot00000000000000
criu-3.6/contrib/debian/dev-packages.lst000066400000000000000000000005301317335042600202660ustar00rootroot00000000000000
# Required packages for development in Debian
build-essential
libprotobuf-dev
libprotobuf-c0-dev
protobuf-c-compiler
protobuf-compiler
python-protobuf
libnet-dev

# Extra packages, required for testing and building other tools
pkg-config
libnl-3-dev
python-ipaddr
libbsd0
libbsd-dev
iproute2
libcap-dev
libaio-dev
python-yaml
libnl-route-3-dev
criu-3.6/contrib/docker_cr.sh000077500000000000000000000273771317335042600163020ustar00rootroot00000000000000
#!/bin/bash

#
# A convenience shell script to call criu for checkpointing and restoring
# a Docker container.
#
# This script saves the user from having to remember all the command
# line options, some of which are very long.  Note that once Docker
# has native support for checkpoint and restore, there will no longer
# be a need for this particular shell script.
#

set -o errexit
set -o nounset
set -o pipefail

#
# These can be set in the environment to override their defaults.
# Note that while the default value of CRIU_IMG_DIR in this script
# is a directory in DOCKER_HOME, it doesn't have to be tied to
# DOCKER_HOME.  For example, it can be /var/spool/criu_img.
#
: ${DOCKER_HOME=/var/lib/docker}
: ${DOCKER_BINARY=docker}
: ${CRIU_IMG_DIR=${DOCKER_HOME}/criu_img}
: ${CRIU_BINARY=criu}
: ${DOCKERINIT_BINARY=}

#
# Patterns for different filesystem types in dump.log.
#
readonly AUFS_PATTERN='/sys/fs/aufs/si_'
readonly OVERLAYFS_PATTERN='type.*source.*options.*lowerdir=.*upperdir=.*workdir='
readonly UNIONFS_PATTERN='type.*source.*options.*dirs='

#
# These globals will be set by init_container_vars()
#
declare CID
declare CONTAINER_IMG_DIR
declare CONTAINER_DUMP_LOG

# Container path -> "docker inspect" field holding the host-side file.
declare -A BIND_MOUNT
BIND_MOUNT[/etc/resolv.conf]=.ResolvConfPath
BIND_MOUNT[/etc/hosts]=.HostsPath
BIND_MOUNT[/etc/hostname]=.HostnamePath
MOUNT_MAP_ARGS=()

#
# The default mode is non-verbose, printing only a short message
# saying if the command succeeded or failed.  For the verbose mode,
# we could have used set -o xtrace but this option would have
# generated excessive output suitable for debugging, not normal
# usage.  So we set ${ECHO} to echo in the verbose mode to print
# selected messages.
#
VERBOSE=""
ECHO=":"
CMD=""
PGNAME=$(basename "$0")

#
# Print the help text and exit.  With an argument, print it as an
# error message first and exit with status 1; otherwise exit with 0.
#
usage() {
	local rv=0

	if [[ -n "${1-}" ]]; then
		rv=1
		echo -e "${PGNAME}: $1\n" >&2
	fi

	# NOTE(review): the heredoc opener and the first usage line were lost
	# in this copy (angle-bracketed text stripped).  They are reconstructed
	# here from parse_args(); confirm the wording against upstream.
	cat <<EOF
Usage:
	${PGNAME} -c|-r [-hv] [<container_id>]
	-c, --checkpoint	checkpoint container
	-h, --help		print help message
	-r, --restore		restore container
	-v, --verbose		enable verbose mode

Environment:
	DOCKER_HOME		(default ${DOCKER_HOME})
	CRIU_IMG_DIR		(default ${CRIU_IMG_DIR})
	DOCKER_BINARY		(default ${DOCKER_BINARY})
	DOCKERINIT_BINARY	(default \${DOCKER_HOME}/init/dockerinit--dev)
	CRIU_BINARY		(default ${CRIU_BINARY})
EOF
	exit ${rv}
}

#
# If the user has not specified a bind mount file for the container's
# /.dockerinit, try to determine it from the Docker version.
#
find_dockerinit() {
	local v

	if [[ -z "${DOCKERINIT_BINARY}" ]]; then
		v=$("${DOCKER_BINARY}" --version | sed -e 's/.*version \(.*\),.*/\1/')
		DOCKERINIT_BINARY="${DOCKER_HOME}/init/dockerinit-${v}"
	elif [[ "${DOCKERINIT_BINARY}" != /* ]]; then
		DOCKERINIT_BINARY="${DOCKER_HOME}/init/${DOCKERINIT_BINARY}"
	fi

	if [[ ! -x "${DOCKERINIT_BINARY}" ]]; then
		echo "${DOCKERINIT_BINARY} does not exist"
		exit 1
	fi

	BIND_MOUNT[/.dockerinit]="${DOCKERINIT_BINARY}"
}

#
# Parse the command line into CMD, VERBOSE/ECHO and CID.  If no
# container id is given, list the containers and prompt for one.
#
parse_args() {
	local args
	local flags

	# With errexit set, a failing command substitution in a plain
	# assignment terminates the script before a later "$?" test could
	# run, so the failure has to be handled on the assignment itself.
	args=$(getopt --options 'chrv' \
		--longoptions 'checkpoint help restore verbose' -- "$@") || usage
	eval set -- "${args}"

	while :; do
		arg="${1}"
		shift
		case "${arg}" in
		-c|--checkpoint) CMD="dump" ;;
		-h|--help) usage ;;
		-r|--restore) CMD="restore" ;;
		-v|--verbose) VERBOSE="-v"; ECHO="echo" ;;
		--) break ;;
		*) usage "internal error parsing arguments!" ;;
		esac
	done

	[[ "${CMD}" == "" ]] && usage "need either -c or -r"
	[[ $# -gt 1 ]] && usage "$# too many arguments"

	# if no container id in args, prompt the user
	if [[ $# -eq 1 ]]; then
		CID="$1"
	else
		if [[ "${CMD}" == "dump" ]]; then
			flags=""
		else
			# we need -a only for restore
			flags="-a"
		fi
		"${DOCKER_BINARY}" ps ${flags}
		read -rp $'\nContainer ID: ' CID
	fi
}

#
# Run the given command, echoing it first in verbose mode.
#
execute() {
	# since commands are pretty long and can wrap around
	# several lines, print a blank line to make it visually
	# easier to see
	${ECHO} -e "\n$*"
	"$@"
}

#
# Resolve the full container id and derive CONTAINER_ROOT_DIR,
# CONTAINER_IMG_DIR and CONTAINER_DUMP_LOG from the storage driver.
#
init_container_vars() {
	local d

	CID=$(get_container_conf .Id)

	d=$("${DOCKER_BINARY}" info 2> /dev/null | awk '/Storage Driver:/ { print $3 }')
	if [[ "${d}" == "vfs" ]]; then
		CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/dir/${CID}"
	elif [[ "${d}" == "aufs" || "${d}" == "unionfs" ]]; then
		CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/mnt/${CID}"
	elif [[ "${d}" == "overlay" ]]; then
		CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/${CID}/merged"
	else
		echo "${d}: unknown filesystem type"
		return 1
	fi

	CONTAINER_IMG_DIR="${CRIU_IMG_DIR}/${CID}"
	CONTAINER_DUMP_LOG="${CONTAINER_IMG_DIR}/dump.log"
}

#
# Print the value of a "docker inspect" field for the container;
# exit if the field is empty.
#
get_container_conf() {
	local val

	val=$("${DOCKER_BINARY}" inspect --format "{{$1}}" "${CID}")
	[[ "${val}" == "" ]] && exit 1
	# NOTE(review): the pattern of this substitution appears to have been
	# lost in this copy; as written the expansion is a no-op.  Confirm
	# against upstream.
	echo "${val//}"
}

#
# Fill MOUNT_MAP_ARGS with one --ext-mount-map option per bind mount.
# For dump the mount maps to itself; for restore it maps to the
# host-side path.
#
setup_mount_map() {
	local key

	if [[ "$1" == "dump" ]]; then
		for key in "${!BIND_MOUNT[@]}"; do
			MOUNT_MAP_ARGS+=(--ext-mount-map "${key}:${key}")
		done
	else
		for key in "${!BIND_MOUNT[@]}"; do
			if [[ "${key}" == "/.dockerinit" ]]; then
				MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:${BIND_MOUNT[$key]}")
			else
				MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:$(get_container_conf "${BIND_MOUNT[$key]}")")
			fi
		done
	fi
}

#
# Succeed if the given path appears as a word in /proc/self/mountinfo.
#
fs_mounted() {
	if grep -wq "$1" /proc/self/mountinfo; then
		${ECHO} "container root directory already mounted"
		return 0
	fi
	${ECHO} "container root directory not mounted"
	return 1
}

#
# Pretty print the mount command in verbose mode by putting each branch
# pathname on a single line for easier visual inspection.
#
pp_mount() {
	${ECHO} -e "\nmount -t $1 -o"
	${ECHO} "${2}" | tr ':,' '\n'
	${ECHO} "${3}"
	${ECHO} "${4}"
}

#
# Reconstruct the AUFS filesystem from information in CRIU's dump log.
# The dump log has a series of branch entries for each process in the
# entire process tree in the following form:
#
#    (00.014075) /sys/fs/aufs/si_f598876b0855b883/br0 : /var/lib/docker/aufs/diff/
#
# Note that this script assumes that all processes in the process
# tree have the same AUFS filesystem.  This assumption is fairly
# safe for typical Docker containers.
#
setup_aufs() {
	local -r tmpf="${CONTAINER_IMG_DIR}/aufs.br"
	local br
	local branches

	# nothing to do if filesystem already mounted
	fs_mounted "${CONTAINER_ROOT_DIR}" && return

	# create a temporary file with branches listed in
	# ascending order (line 1 is branch 0)
	awk '/aufs.si_/ { print $2, $4 }' "${CONTAINER_DUMP_LOG}" | \
		sort | uniq | awk '{ print $2 }' > "${tmpf}"

	# construct the mount option string from branches
	# (-r: branch paths are data, backslashes must survive)
	branches=""
	while read -r br; do
		branches+="${branches:+:}${br}"
	done < "${tmpf}"

	# mount the container's filesystem
	pp_mount "aufs" "${branches}" "none" "${CONTAINER_ROOT_DIR}"
	mount -t aufs -o br="${branches}" none "${CONTAINER_ROOT_DIR}"
	rm -f "${tmpf}"
}

#
# Reconstruct the overlay mount from the lowerdir/upperdir/workdir
# options recorded in CRIU's dump log.
#
setup_overlayfs() {
	local lowerdir
	local upperdir
	local workdir
	local ovlydirs
	local -r f="${CONTAINER_DUMP_LOG}"

	# nothing to do if filesystem already mounted
	fs_mounted "${CONTAINER_ROOT_DIR}" && return

	lowerdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*lowerdir=\([^,]*\).*/\1/p')
	upperdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*upperdir=\([^,]*\).*/\1/p')
	workdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*workdir=\([^,]*\).*/\1/p')
	ovlydirs="lowerdir=${lowerdir},upperdir=${upperdir},workdir=${workdir}"

	# mount the container's filesystem
	pp_mount "overlay" "${ovlydirs}" "overlay" "${CONTAINER_ROOT_DIR}"
	mount -t overlay -o "${ovlydirs}" overlay "${CONTAINER_ROOT_DIR}"
}

#
# Reconstruct the UnionFS filesystem from information in CRIU's dump log.
# The dump log has the mountinfo root entry for the filesystem.  The
# options field contains the list of directories that make up the UnionFS.
#
# Note that this script assumes that all processes in the process
# tree have the same UnionFS filesystem.  This assumption is fairly
# safe for typical Docker containers.
#
# XXX If /dev/null was manually created by Docker (i.e., it's not in
#     a branch), create it.  Although this has worked so far, it needs
#     a deeper look as I am not sure if /dev/null should be created as
#     a regular file to be the target of a bind mount or created as a
#     device file by mknod.
#
setup_unionfs() {
	local dirs

	# nothing to do if filesystem already mounted
	fs_mounted "${CONTAINER_ROOT_DIR}" && return

	dirs=$(sed -n -e 's/.*type.*dirs=/dirs=/p' "${CONTAINER_DUMP_LOG}")
	[[ "${dirs}" = "" ]] && echo "do not have branch information" && exit 1

	# mount the container's filesystem
	pp_mount "unionfs" "${dirs}" "none" "${CONTAINER_ROOT_DIR}"
	mount -t unionfs -o "${dirs}" none "${CONTAINER_ROOT_DIR}"

	# see comment at the beginning of the function
	if [[ ! -e "${CONTAINER_ROOT_DIR}/dev/null" ]]; then
		execute touch "${CONTAINER_ROOT_DIR}/dev/null"
	fi
}

#
# Prepare for "criu dump": find the container's PID, clean the image
# directory and set CMD_ARGS.
#
prep_dump() {
	local pid

	pid=$(get_container_conf .State.Pid)

	# docker returns 0 for containers it thinks have exited
	# (i.e., dumping a restored container again)
	if [[ ${pid} -eq 0 ]]; then
		echo -e "\nCheckpointing a restored container?"
		read -rp "Process ID: " pid
	fi

	# remove files previously created by criu but not other files (if any)
	mkdir -p "${CONTAINER_IMG_DIR}"
	rm -f "${CONTAINER_IMG_DIR}"/*.{img,log,pid} "${CONTAINER_IMG_DIR}"/stats-restore

	CMD_ARGS=("-t" "${pid}")

	# we need --root only for aufs to compensate for the
	# erroneous information in /proc//map_files
	if [[ "${CONTAINER_ROOT_DIR}" == *aufs* ]]; then
		CMD_ARGS+=("--root" "${CONTAINER_ROOT_DIR}")
	fi
}

#
# Set up container's root filesystem if not already set up.
#
prep_restore() {
	local -r f="${CONTAINER_DUMP_LOG}"

	if [[ ! -f "${f}" ]]; then
		echo "${f} does not exist"
		return 1
	fi

	if grep -q "${AUFS_PATTERN}" "${f}"; then
		setup_aufs
	elif grep -q "${OVERLAYFS_PATTERN}" "${f}"; then
		setup_overlayfs
	elif grep -q "${UNIONFS_PATTERN}" "${f}"; then
		setup_unionfs
	fi

	# criu requires this (due to container using pivot_root)
	if ! grep -qw "${CONTAINER_ROOT_DIR}" /proc/self/mountinfo; then
		execute mount --rbind "${CONTAINER_ROOT_DIR}" "${CONTAINER_ROOT_DIR}"
		MOUNTED=1
	else
		MOUNTED=0
	fi

	CMD_ARGS=("-d" "--root" "${CONTAINER_ROOT_DIR}" "--pidfile" "${CONTAINER_IMG_DIR}/restore.pid")
}

#
# Run criu with the common options, the mount map and the
# command-specific arguments.  The exit status is criu's; the caller
# captures it with "|| rv=$?" so errexit does not abort the script.
#
run_criu() {
	local -a common_args=("-v4" "-D" "${CONTAINER_IMG_DIR}" \
		"-o" "${CMD}.log" \
		"--manage-cgroups" \
		"--evasive-devices")

	setup_mount_map "${CMD}"
	common_args+=("${MOUNT_MAP_ARGS[@]}")

	# we do not want to exit if there's an error
	execute "${CRIU_BINARY}" "${CMD}" "${common_args[@]}" "${CMD_ARGS[@]}"
}

#
# Report the outcome given criu's exit status in $1.  In verbose mode
# show the relevant log lines; after a restore undo our bind mount and
# show the restored process.
#
wrap_up() {
	local -r logf="${CONTAINER_IMG_DIR}/${CMD}.log"
	local -r pidf="${CONTAINER_IMG_DIR}/restore.pid"

	if [[ $1 -eq 0 ]]; then
		${ECHO} -e "\n"
		echo "${CMD} successful"
	else
		${ECHO} -e "\n"
		echo "${CMD} failed"
	fi

	if [[ "${VERBOSE}" == "-v" && -e "${logf}" ]]; then
		if ! grep "finished successfully" "${logf}"; then
			grep Error "${logf}"
		fi
	fi

	if [[ "${CMD}" == "restore" ]]; then
		if [[ ${MOUNTED} -eq 1 ]]; then
			execute umount "${CONTAINER_ROOT_DIR}"
		fi

		if [[ -e "${pidf}" ]]; then
			${ECHO} -e "\n$(ps -f -p "$(cat "${pidf}")" --no-headers)"
		fi
	fi
}

#
# Verbose helper: print "<label>: <path>", canonicalized with realpath
# when that tool is available.
#
resolve_path() {
	local p

	p="${2}"
	if which realpath > /dev/null; then
		p=$(realpath "${p}")
	fi
	${ECHO} "${1}: ${p}"
}

#
# Verbose helper: look up a command in PATH and print its resolved path.
#
resolve_cmd() {
	local cpath

	cpath=$(which "${2}")
	resolve_path "${1}" "${cpath}"
}

main() {
	local rv=0

	if [[ $(id -u) -ne 0 ]]; then
		echo "not running as root"
		exit 1
	fi

	parse_args "$@"
	find_dockerinit
	init_container_vars

	if [[ "${VERBOSE}" == "-v" ]]; then
		echo
		resolve_cmd "docker binary" "${DOCKER_BINARY}"
		resolve_cmd "dockerinit binary" "${DOCKERINIT_BINARY}"
		resolve_cmd "criu binary" "${CRIU_BINARY}"
		resolve_path "image directory" "${CONTAINER_IMG_DIR}"
		resolve_path "container root directory" "${CONTAINER_ROOT_DIR}"
	fi

	if [[ "${CMD}" == "dump" ]]; then
		prep_dump
	else
		prep_restore
	fi

	run_criu || rv=$?
wrap_up ${rv}
	exit ${rv}
}

main "$@"
criu-3.6/coredump/000077500000000000000000000000001317335042600141465ustar00rootroot00000000000000
criu-3.6/coredump/criu-coredump000077500000000000000000000016701317335042600166560ustar00rootroot00000000000000
#!/usr/bin/env python2
# Command line front end: generates ELF core dumps from a directory of
# criu images, one "core.<pid>" file per process.
import argparse
import os

import criu_coredump

def coredump(opts):
	# opts is the dict of parsed arguments: 'in', 'out' and 'pid'.
	generator = criu_coredump.coredump_generator()
	cores = generator(os.path.realpath(opts['in']))
	for pid in cores:
		# With --pid given, write only that process' core dump.
		if opts['pid'] and pid != opts['pid']:
			continue
		with open(os.path.realpath(opts['out'])+"/core."+str(pid), 'w+') as f:
			cores[pid].write(f)

def main():
	desc = 'CRIU core dump'
	parser = argparse.ArgumentParser(description=desc,
			formatter_class=argparse.RawTextHelpFormatter)

	parser.add_argument('-i',
			'--in',
			default = '.',
			help = 'directory where to get images from')
	parser.add_argument('-p',
			'--pid',
			type = int,
			help = 'generate coredump for specific pid(all pids py default)')
	parser.add_argument('-o',
			'--out',
			default = '.',
			help = 'directory to write coredumps to')

	opts = vars(parser.parse_args())

	coredump(opts)

if __name__ == '__main__':
	main()
criu-3.6/coredump/criu_coredump/000077500000000000000000000000001317335042600170065ustar00rootroot00000000000000
criu-3.6/coredump/criu_coredump/.gitignore000066400000000000000000000000061317335042600207720ustar00rootroot00000000000000
*.pyc
criu-3.6/coredump/criu_coredump/__init__.py000066400000000000000000000000421317335042600211130ustar00rootroot00000000000000
from coredump import *
import elf
criu-3.6/coredump/criu_coredump/coredump.py000066400000000000000000000517641317335042600212130ustar00rootroot00000000000000
# Functions and classes for creating core dump from criu images.
# Code is inspired by outdated google coredumper(RIP) [1] and
# fs/binfmt_elf.h from Linux kernel [2].
#
# [1] https://code.google.com/p/google-coredumper/
#     probably already dead, so consider trying:
#     https://github.com/efiop/google-coredumper/
# [2] https://www.kernel.org/
#
# On my x86_64 systems with fresh kernel ~3.17 core dump looks like:
#
#	1) Elf file header;
#	2) PT_NOTE program header describing notes section;
#	3) PT_LOAD program headers for (almost?) each vma;
#	4) NT_PRPSINFO note with elf_prpsinfo inside;
#	5) An array of notes for each thread of the process:
#		NT_PRSTATUS note with elf_prstatus inside;
#		NT_FPREGSET note with elf_fpregset inside;
#		NT_X86_XSTATE note with x86 extended state using xsave;
#		NT_SIGINFO note with siginfo_t inside;
#	6) NT_AUXV note with auxv;
#	7) NT_FILE note with mapped files;
#	8) VMAs themselves;
#
# Or, you can represent it in less details as:
#	1) Elf file header;
#	2) Program table;
#	3) Notes;
#	4) VMAs contents;
#
import io
import elf
import ctypes
from pycriu import images

# Some memory-related constants
PAGESIZE = 4096
# VMA status flags, keyed by their symbolic names.
status = {
	"VMA_AREA_NONE"		: 0 << 0,
	"VMA_AREA_REGULAR"	: 1 << 0,
	"VMA_AREA_STACK"	: 1 << 1,
	"VMA_AREA_VSYSCALL"	: 1 << 2,
	"VMA_AREA_VDSO"		: 1 << 3,
	"VMA_FORCE_READ"	: 1 << 4,
	"VMA_AREA_HEAP"		: 1 << 5,
	"VMA_FILE_PRIVATE"	: 1 << 6,
	"VMA_FILE_SHARED"	: 1 << 7,
	"VMA_ANON_SHARED"	: 1 << 8,
	"VMA_ANON_PRIVATE"	: 1 << 9,
	"VMA_AREA_SYSVIPC"	: 1 << 10,
	"VMA_AREA_SOCKET"	: 1 << 11,
	"VMA_AREA_VVAR"		: 1 << 12,
	"VMA_AREA_AIORING"	: 1 << 13,
	"VMA_AREA_UNSUPP"	: 1 << 31
}

# Memory protection bits, keyed by their symbolic names.
prot = {
	"PROT_READ"	: 0x1,
	"PROT_WRITE"	: 0x2,
	"PROT_EXEC"	: 0x4
}

class elf_note:
	# Plain container for one ELF note.
	nhdr	= None	# Elf_Nhdr;
	owner	= None	# i.e. CORE or LINUX;
	data	= None	# Ctypes structure with note data;


class coredump:
	"""
	Holds the components of an ELF core dump and knows how to
	write them to a file in the right order.
	"""
	# NOTE(review): these are class-level defaults (the lists are shared
	# between instances until reassigned); _gen_coredump() assigns fresh
	# values on every instance it creates.
	ehdr	= None	# Elf ehdr;
	phdrs	= []	# Array of Phdrs;
	notes	= []	# Array of elf_notes;
	vmas	= []	# Array of BytesIO with memory content;
			# FIXME keeping all vmas in memory is a bad idea;

	def write(self, f):
		"""
		Write core dump to file f.
		"""
		buf = io.BytesIO()
		buf.write(self.ehdr)

		for phdr in self.phdrs:
			buf.write(phdr)

		for note in self.notes:
			buf.write(note.nhdr)
			buf.write(note.owner)
			# pad the owner name to 8 bytes
			buf.write("\0"*(8-len(note.owner)))
			buf.write(note.data)

		# Recompute where the notes end (same arithmetic as in
		# coredump_generator._gen_phdrs) to pad up to a page boundary.
		offset = ctypes.sizeof(elf.Elf64_Ehdr())
		offset += (len(self.vmas) + 1)*ctypes.sizeof(elf.Elf64_Phdr())

		filesz = 0
		for note in self.notes:
			filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8

		note_align = PAGESIZE - ((offset + filesz) % PAGESIZE)

		if note_align == PAGESIZE:
			note_align = 0

		if note_align != 0:
			scratch = (ctypes.c_char * note_align)()
			ctypes.memset(ctypes.addressof(scratch), 0, ctypes.sizeof(scratch))
			buf.write(scratch)

		for vma in self.vmas:
			buf.write(vma.data)

		buf.seek(0)
		f.write(buf.read())


class coredump_generator:
	"""
	Generate core dump from criu images.
	"""
	# NOTE(review): class-level dicts, shared by all instances.
	coredumps	= {} # coredumps by pid;

	pstree		= {} # process info by pid;
	cores		= {} # cores by pid;
	mms		= {} # mm by pid;
	reg_files	= None # reg-files;
	pagemaps	= {} # pagemap by pid;

	def _img_open_and_strip(self, name, single = False, pid = None):
		"""
		Load the criu image "<name>[-<pid>].img" from the images
		directory and return its entries: the first entry only when
		single is true, the whole list otherwise.
		"""
		path = self._imgs_dir + "/" + name
		if pid:
			path += "-"+str(pid)
		path += ".img"

		with open(path) as f:
			img = images.load(f)

		if single:
			return img["entries"][0]
		else:
			return img["entries"]


	def __call__(self, imgs_dir):
		"""
		Parse criu images stored in directory imgs_dir to fill core dumps.
		Returns the dict of coredump objects keyed by pid.
		"""
		self._imgs_dir	= imgs_dir
		pstree		= self._img_open_and_strip("pstree")

		for p in pstree:
			pid = p['pid']

			self.pstree[pid] = p
			for tid in p['threads']:
				self.cores[tid] = self._img_open_and_strip("core", True, tid)
			self.mms[pid] = self._img_open_and_strip("mm", True, pid)
			self.pagemaps[pid] = self._img_open_and_strip("pagemap", False, pid)

		self.reg_files = self._img_open_and_strip("reg-files", False)

		for pid in self.pstree:
			self.coredumps[pid] = self._gen_coredump(pid)

		return self.coredumps


	def write(self, coredumps_dir, pid = None):
		"""
		Write core dumps to the coredumps_dir directory. Specify pid to
		choose the core dump of only one process.
		"""
		for p in self.coredumps:
			if pid and p != pid:
				continue
			with open(coredumps_dir+"/"+"core."+str(p), 'w+') as f:
				self.coredumps[p].write(f)

	def _gen_coredump(self, pid):
		"""
		Generate core dump for pid.
		"""
		cd = coredump()

		# Generate everything backwards so it is easier to calculate offset.
		cd.vmas		= self._gen_vmas(pid)
		cd.notes	= self._gen_notes(pid)
		cd.phdrs	= self._gen_phdrs(pid, cd.notes, cd.vmas)
		cd.ehdr		= self._gen_ehdr(pid, cd.phdrs)

		return cd

	def _gen_ehdr(self, pid, phdrs):
		"""
		Generate elf header for process pid with program headers phdrs.
		"""
		ehdr = elf.Elf64_Ehdr()

		ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr))
		ehdr.e_ident[elf.EI_MAG0]	= elf.ELFMAG0
		ehdr.e_ident[elf.EI_MAG1]	= elf.ELFMAG1
		ehdr.e_ident[elf.EI_MAG2]	= elf.ELFMAG2
		ehdr.e_ident[elf.EI_MAG3]	= elf.ELFMAG3
		ehdr.e_ident[elf.EI_CLASS]	= elf.ELFCLASS64
		ehdr.e_ident[elf.EI_DATA]	= elf.ELFDATA2LSB
		ehdr.e_ident[elf.EI_VERSION]	= elf.EV_CURRENT

		ehdr.e_type		= elf.ET_CORE
		ehdr.e_machine		= elf.EM_X86_64
		ehdr.e_version		= elf.EV_CURRENT
		# the program header table directly follows the ELF header
		ehdr.e_phoff		= ctypes.sizeof(elf.Elf64_Ehdr())
		ehdr.e_ehsize		= ctypes.sizeof(elf.Elf64_Ehdr())
		ehdr.e_phentsize	= ctypes.sizeof(elf.Elf64_Phdr())
		#FIXME Case len(phdrs) > PN_XNUM should be handled properly.
		# See fs/binfmt_elf.c from linux kernel.
		ehdr.e_phnum		= len(phdrs)

		return ehdr

	def _gen_phdrs(self, pid, notes, vmas):
		"""
		Generate program headers for process pid: one PT_NOTE header
		covering all notes, then one PT_LOAD header per vma.
		"""
		phdrs = []

		# Notes start after the ELF header and the program header
		# table (one entry per vma plus the PT_NOTE entry).
		offset = ctypes.sizeof(elf.Elf64_Ehdr())
		offset += (len(vmas) + 1)*ctypes.sizeof(elf.Elf64_Phdr())

		filesz = 0
		for note in notes:
			filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8

		# PT_NOTE
		phdr = elf.Elf64_Phdr()
		ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr))
		phdr.p_type	= elf.PT_NOTE
		phdr.p_offset	= offset
		phdr.p_filesz	= filesz

		phdrs.append(phdr)

		# padding that page-aligns the first vma's contents
		note_align = PAGESIZE - ((offset + filesz) % PAGESIZE)

		if note_align == PAGESIZE:
			note_align = 0

		offset += note_align

		# VMA phdrs

		for vma in vmas:
			# filesz still holds the size of the previous segment
			# (the notes on the first iteration)
			offset += filesz
			filesz = vma.filesz
			phdr = elf.Elf64_Phdr()
			ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr))
			phdr.p_type	= elf.PT_LOAD
			phdr.p_align	= PAGESIZE
			phdr.p_paddr	= 0
			phdr.p_offset	= offset
			phdr.p_vaddr	= vma.start
			phdr.p_memsz	= vma.memsz
			phdr.p_filesz	= vma.filesz
			phdr.p_flags	= vma.flags

			phdrs.append(phdr)

		return phdrs

	def _gen_prpsinfo(self, pid):
		"""
		Generate NT_PRPSINFO note for process pid.
		"""
		pstree	= self.pstree[pid]
		core	= self.cores[pid]

		prpsinfo = elf.elf_prpsinfo()
		ctypes.memset(ctypes.addressof(prpsinfo), 0, ctypes.sizeof(prpsinfo))

		# FIXME TASK_ALIVE means that it is either running or sleeping, need to
		# teach criu to distinguish them.
		TASK_ALIVE	= 0x1
		# XXX A bit of confusion here, as in ps "dead" and "zombie"
		# state are two separate states, and we use TASK_DEAD for zombies.
		TASK_DEAD	= 0x2
		TASK_STOPPED	= 0x3
		if core["tc"]["task_state"] == TASK_ALIVE:
			prpsinfo.pr_state	= 0
		if core["tc"]["task_state"] == TASK_DEAD:
			prpsinfo.pr_state	= 4
		if core["tc"]["task_state"] == TASK_STOPPED:
			prpsinfo.pr_state	= 3
		# Don't even ask me why it is so, just borrowed from linux
		# source and made pr_state match.
		prpsinfo.pr_sname	= '.'
if prpsinfo.pr_state > 5 else "RSDTZW"[prpsinfo.pr_state] prpsinfo.pr_zomb = 1 if prpsinfo.pr_state == 4 else 0 prpsinfo.pr_nice = core["thread_core"]["sched_prio"] if "sched_prio" in core["thread_core"] else 0 prpsinfo.pr_flag = core["tc"]["flags"] prpsinfo.pr_uid = core["thread_core"]["creds"]["uid"] prpsinfo.pr_gid = core["thread_core"]["creds"]["gid"] prpsinfo.pr_pid = pid prpsinfo.pr_ppid = pstree["ppid"] prpsinfo.pr_pgrp = pstree["pgid"] prpsinfo.pr_sid = pstree["sid"] prpsinfo.pr_fname = core["tc"]["comm"] prpsinfo.pr_psargs = self._gen_cmdline(pid) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO note = elf_note() note.data = prpsinfo note.owner = "CORE" note.nhdr = nhdr return note def _gen_prstatus(self, pid, tid): """ Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] regs = core["thread_info"]["gpregs"] pstree = self.pstree[pid] prstatus = elf.elf_prstatus() ctypes.memset(ctypes.addressof(prstatus), 0, ctypes.sizeof(prstatus)) #FIXME setting only some of the fields for now. Revisit later. 
prstatus.pr_pid = tid prstatus.pr_ppid = pstree["ppid"] prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] prstatus.pr_reg.r15 = regs["r15"] prstatus.pr_reg.r14 = regs["r14"] prstatus.pr_reg.r13 = regs["r13"] prstatus.pr_reg.r12 = regs["r12"] prstatus.pr_reg.rbp = regs["bp"] prstatus.pr_reg.rbx = regs["bx"] prstatus.pr_reg.r11 = regs["r11"] prstatus.pr_reg.r10 = regs["r10"] prstatus.pr_reg.r9 = regs["r9"] prstatus.pr_reg.r8 = regs["r8"] prstatus.pr_reg.rax = regs["ax"] prstatus.pr_reg.rcx = regs["cx"] prstatus.pr_reg.rdx = regs["dx"] prstatus.pr_reg.rsi = regs["si"] prstatus.pr_reg.rdi = regs["di"] prstatus.pr_reg.orig_rax = regs["orig_ax"] prstatus.pr_reg.rip = regs["ip"] prstatus.pr_reg.cs = regs["cs"] prstatus.pr_reg.eflags = regs["flags"] prstatus.pr_reg.rsp = regs["sp"] prstatus.pr_reg.ss = regs["ss"] prstatus.pr_reg.fs_base = regs["fs_base"] prstatus.pr_reg.gs_base = regs["gs_base"] prstatus.pr_reg.ds = regs["ds"] prstatus.pr_reg.es = regs["es"] prstatus.pr_reg.fs = regs["fs"] prstatus.pr_reg.gs = regs["gs"] nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS note = elf_note() note.data = prstatus note.owner = "CORE" note.nhdr = nhdr return note def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. 
""" core = self.cores[tid] regs = core["thread_info"]["fpregs"] fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) fpregset.cwd = regs["cwd"] fpregset.swd = regs["swd"] fpregset.ftw = regs["twd"] fpregset.fop = regs["fop"] fpregset.rip = regs["rip"] fpregset.rdp = regs["rdp"] fpregset.mxcsr = regs["mxcsr"] fpregset.mxcr_mask = regs["mxcsr_mask"] fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))(*regs["st_space"]) fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))(*regs["xmm_space"]) #fpregset.padding = regs["padding"] unused nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_fpregset_t()) nhdr.n_type = elf.NT_FPREGSET note = elf_note() note.data = fpregset note.owner = "CORE" note.nhdr = nhdr return note def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. """ core = self.cores[tid] fpregs = core["thread_info"]["fpregs"] data = elf.elf_xsave_struct() ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) data.i387.cwd = fpregs["cwd"] data.i387.swd = fpregs["swd"] data.i387.twd = fpregs["twd"] data.i387.fop = fpregs["fop"] data.i387.rip = fpregs["rip"] data.i387.rdp = fpregs["rdp"] data.i387.mxcsr = fpregs["mxcsr"] data.i387.mxcsr_mask = fpregs["mxcsr_mask"] data.i387.st_space = (ctypes.c_uint * len(fpregs["st_space"]))(*fpregs["st_space"]) data.i387.xmm_space = (ctypes.c_uint * len(fpregs["xmm_space"]))(*fpregs["xmm_space"]) if "xsave" in fpregs: data.xsave_hdr.xstate_bv = fpregs["xsave"]["xstate_bv"] data.ymmh.ymmh_space = (ctypes.c_uint * len(fpregs["xsave"]["ymmh_space"]))(*fpregs["xsave"]["ymmh_space"]) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 6 nhdr.n_descsz = ctypes.sizeof(data) nhdr.n_type = elf.NT_X86_XSTATE note = elf_note() note.data = data note.owner = "LINUX" note.nhdr = nhdr return note def _gen_siginfo(self, pid, tid): """ Generate NT_SIGINFO note for thread tid of process pid. 
""" siginfo = elf.siginfo_t() # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO note = elf_note() note.data = siginfo note.owner = "CORE" note.nhdr = nhdr return note def _gen_auxv(self, pid): """ Generate NT_AUXV note for thread tid of process pid. """ mm = self.mms[pid] num_auxv = len(mm["mm_saved_auxv"])/2 class elf_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t*num_auxv)] auxv = elf_auxv() for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i+1] nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf_auxv()) nhdr.n_type = elf.NT_AUXV note = elf_note() note.data = auxv note.owner = "CORE" note.nhdr = nhdr return note def _gen_files(self, pid): """ Generate NT_FILE note for process pid. """ mm = self.mms[pid] class mmaped_file_info: start = None end = None file_ofs = None name = None infos = [] for vma in mm["vmas"]: if vma["shmid"] == 0: # shmid == 0 means that it is not a file continue shmid = vma["shmid"] size = vma["end"] - vma["start"] off = vma["pgoff"]/PAGESIZE files = self.reg_files fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] info = mmaped_file_info() info.start = vma["start"] info.end = vma["end"] info.file_ofs = off info.name = fname infos.append(info) # /* # * Format of NT_FILE note: # * # * long count -- how many files are mapped # * long page_size -- units for file_ofs # * array of [COUNT] elements of # * long start # * long end # * long file_ofs # * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... 
# */ fields = [] fields.append(("count", ctypes.c_long)) fields.append(("page_size", ctypes.c_long)) for i in range(len(infos)): fields.append(("start"+str(i), ctypes.c_long)) fields.append(("end"+str(i), ctypes.c_long)) fields.append(("file_ofs"+str(i), ctypes.c_long)) for i in range(len(infos)): fields.append(("name"+str(i), ctypes.c_char*(len(infos[i].name)+1))) class elf_files(ctypes.Structure): _fields_ = fields data = elf_files() data.count = len(infos) data.page_size = PAGESIZE for i in range(len(infos)): info = infos[i] setattr(data, "start"+str(i), info.start) setattr(data, "end"+str(i), info.end) setattr(data, "file_ofs"+str(i), info.file_ofs) setattr(data, "name"+str(i), info.name) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5#XXX strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) nhdr.n_type = elf.NT_FILE note = elf_note() note.nhdr = nhdr note.owner = "CORE" note.data = data return note def _gen_thread_notes(self, pid, tid): notes = [] notes.append(self._gen_prstatus(pid, tid)) notes.append(self._gen_fpregset(pid, tid)) notes.append(self._gen_x86_xstate(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) return notes def _gen_notes(self, pid): """ Generate notes for core dump of process pid. """ notes = [] notes.append(self._gen_prpsinfo(pid)) threads = self.pstree[pid]["threads"] # Main thread first notes += self._gen_thread_notes(pid, pid) # Then other threads for tid in threads: if tid == pid: continue notes += self._gen_thread_notes(pid, tid) notes.append(self._gen_auxv(pid)) notes.append(self._gen_files(pid)) return notes def _get_page(self, pid, page_no): """ Try to find memory page page_no in pages.img image for process pid. """ pagemap = self.pagemaps[pid] # First entry is pagemap_head, we will need it later to open # proper pages.img. 
pages_id = pagemap[0]["pages_id"] off = 0# in pages for m in pagemap[1:]: found = False for i in xrange(m["nr_pages"]): if m["vaddr"] + i*PAGESIZE == page_no*PAGESIZE: found = True break off += 1 if not found: continue if "in_parent" in m and m["in_parent"] == True: ppid = self.pstree[pid]["ppid"] return self._get_page(ppid, page_no) else: with open(self._imgs_dir+"/"+"pages-"+str(pages_id)+".img") as f: f.seek(off*PAGESIZE) return f.read(PAGESIZE) return None def _gen_mem_chunk(self, pid, vma, size): """ Obtain vma contents for process pid. """ f = None if size == 0: return "" if vma["status"] & status["VMA_AREA_VVAR"]: #FIXME this is what gdb does, as vvar vma # is not readable from userspace? return "\0"*size elif vma["status"] & status["VMA_AREA_VSYSCALL"]: #FIXME need to dump it with criu or read from # current process. return "\0"*size if vma["status"] & status["VMA_FILE_SHARED"] or \ vma["status"] & status["VMA_FILE_PRIVATE"]: # Open file before iterating vma pages shmid = vma["shmid"] off = vma["pgoff"] files = self.reg_files fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] f = open(fname) f.seek(off) start = vma["start"] end = vma["start"] + size # Split requested memory chunk into pages, so it could be # pictured as: # # "----" -- part of page with memory outside of our vma; # "XXXX" -- memory from our vma; # # Start page Pages in the middle End page # [-----XXXXX]...[XXXXXXXXXX][XXXXXXXXXX]...[XXX-------] # # Each page could be found in pages.img or in a standalone # file described by shmid field in vma entry and # corresponding entry in reg-files.img. # For VMA_FILE_PRIVATE vma, unchanged pages are taken from # a file, and changed ones -- from pages.img. # Finally, if no page is found neither in pages.img nor # in file, hole in inserted -- a page filled with zeroes. 
start_page = start/PAGESIZE end_page = end/PAGESIZE buf = "" for page_no in range(start_page, end_page+1): page = None # Search for needed page in pages.img and reg-files.img # and choose appropriate. page_mem = self._get_page(pid, page_no) if f != None: page = f.read(PAGESIZE) if page_mem != None: # Page from pages.img has higher priority # than one from maped file on disk. page = page_mem if page == None: # Hole page = PAGESIZE*"\0" # If it is a start or end page, we need to read # only part of it. if page_no == start_page: n_skip = start - page_no*PAGESIZE if start_page == end_page: n_read = size else: n_read = PAGESIZE - n_skip elif page_no == end_page: n_skip = 0 n_read = end - page_no*PAGESIZE else: n_skip = 0 n_read = PAGESIZE buf += page[n_skip : n_skip + n_read] # Don't forget to close file. if f != None: f.close() return buf def _gen_cmdline(self, pid): """ Generate full command with arguments. """ mm = self.mms[pid] vma = {} vma["start"] = mm["mm_arg_start"] vma["end"] = mm["mm_arg_end"] # Dummy flags and status. vma["flags"] = 0 vma["status"] = 0 size = vma["end"] - vma["start"] chunk = self._gen_mem_chunk(pid, vma, size) # Replace all '\0's with spaces. return chunk.replace('\0', ' ') def _get_vma_dump_size(self, vma): """ Calculate amount of vma to put into core dump. """ if vma["status"] & status["VMA_AREA_VVAR"] or \ vma["status"] & status["VMA_AREA_VSYSCALL"] or \ vma["status"] & status["VMA_AREA_VDSO"]: size = vma["end"] - vma["start"] elif vma["prot"] == 0: size = 0 elif vma["prot"] & prot["PROT_READ"] and \ vma["prot"] & prot["PROT_EXEC"]: size = PAGESIZE elif vma["status"] & status["VMA_ANON_SHARED"] or \ vma["status"] & status["VMA_FILE_SHARED"] or \ vma["status"] & status["VMA_ANON_PRIVATE"] or \ vma["status"] & status["VMA_FILE_PRIVATE"]: size = vma["end"] - vma["start"] else: size = 0 return size def _get_vma_flags(self, vma): """ Convert vma flags int elf flags. 
""" flags = 0 if vma['prot'] & prot["PROT_READ"]: flags = flags | elf.PF_R if vma['prot'] & prot["PROT_WRITE"]: flags = flags | elf.PF_W if vma['prot'] & prot["PROT_EXEC"]: flags = flags | elf.PF_X return flags def _gen_vmas(self, pid): """ Generate vma contents for core dump for process pid. """ mm = self.mms[pid] class vma_class: data = None filesz = None memsz = None flags = None start = None vmas = [] for vma in mm["vmas"]: size = self._get_vma_dump_size(vma) chunk = self._gen_mem_chunk(pid, vma, size) v = vma_class() v.filesz = self._get_vma_dump_size(vma) v.data = self._gen_mem_chunk(pid, vma, v.filesz) v.memsz = vma["end"] - vma["start"] v.start = vma["start"] v.flags = self._get_vma_flags(vma) vmas.append(v) return vmas criu-3.6/coredump/criu_coredump/elf.py000066400000000000000000000574211317335042600201370ustar00rootroot00000000000000# Define structures and constants for generating elf file. from ctypes import * Elf64_Half = c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = c_uint32 # typedef uint32_t Elf64_Word; Elf64_Addr = c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = c_uint64 # typedef uint64_t Elf64_Xword; # Elf64_Ehdr related constants. # e_ident size. 
# Define structures and constants for generating elf file.
from ctypes import *

Elf64_Half = c_uint16   # typedef uint16_t Elf64_Half;
Elf64_Word = c_uint32   # typedef uint32_t Elf64_Word;
Elf64_Addr = c_uint64   # typedef uint64_t Elf64_Addr;
Elf64_Off = c_uint64    # typedef uint64_t Elf64_Off;
Elf64_Xword = c_uint64  # typedef uint64_t Elf64_Xword;

# Elf64_Ehdr related constants.

# e_ident size.
EI_NIDENT = 16          # #define EI_NIDENT (16)

EI_MAG0 = 0             # File identification byte 0 index
ELFMAG0 = 0x7f          # Magic number byte 0

EI_MAG1 = 1             # File identification byte 1 index
ELFMAG1 = ord('E')      # Magic number byte 1

EI_MAG2 = 2             # File identification byte 2 index
ELFMAG2 = ord('L')      # Magic number byte 2

EI_MAG3 = 3             # File identification byte 3 index
ELFMAG3 = ord('F')      # Magic number byte 3

EI_CLASS = 4            # File class byte index

EI_DATA = 5             # Data encoding byte index

EI_VERSION = 6          # File version byte index

ELFDATA2LSB = 1         # 2's complement, little endian

ELFCLASS64 = 2          # 64-bit objects

# Legal values for e_type (object file type).
ET_CORE = 4             # Core file

# Legal values for e_machine (architecture).
EM_X86_64 = 62          # AMD x86-64 architecture

# Legal values for e_version (version).
EV_CURRENT = 1          # Current version


class Elf64_Ehdr(Structure):
    """ELF file header (C: Elf64_Ehdr)."""
    _fields_ = [
        ("e_ident", c_ubyte * EI_NIDENT),  # unsigned char e_ident[EI_NIDENT];
        ("e_type", Elf64_Half),            # Elf64_Half e_type;
        ("e_machine", Elf64_Half),         # Elf64_Half e_machine;
        ("e_version", Elf64_Word),         # Elf64_Word e_version;
        ("e_entry", Elf64_Addr),           # Elf64_Addr e_entry;
        ("e_phoff", Elf64_Off),            # Elf64_Off e_phoff;
        ("e_shoff", Elf64_Off),            # Elf64_Off e_shoff;
        ("e_flags", Elf64_Word),           # Elf64_Word e_flags;
        ("e_ehsize", Elf64_Half),          # Elf64_Half e_ehsize;
        ("e_phentsize", Elf64_Half),       # Elf64_Half e_phentsize;
        ("e_phnum", Elf64_Half),           # Elf64_Half e_phnum;
        ("e_shentsize", Elf64_Half),       # Elf64_Half e_shentsize;
        ("e_shnum", Elf64_Half),           # Elf64_Half e_shnum;
        ("e_shstrndx", Elf64_Half)         # Elf64_Half e_shstrndx;
    ]


# Elf64_Phdr related constants.

# Legal values for p_type (segment type).
PT_LOAD = 1             # Loadable program segment
PT_NOTE = 4             # Auxiliary information

# Legal values for p_flags (segment flags).
PF_X = 1                # (1 << 0) Segment is executable
PF_W = 1 << 1           # (1 << 1) Segment is writable
PF_R = 1 << 2           # (1 << 2) Segment is readable


class Elf64_Phdr(Structure):
    """Program segment header (C: Elf64_Phdr)."""
    _fields_ = [
        ("p_type", Elf64_Word),     # Elf64_Word p_type;
        ("p_flags", Elf64_Word),    # Elf64_Word p_flags;
        ("p_offset", Elf64_Off),    # Elf64_Off p_offset;
        ("p_vaddr", Elf64_Addr),    # Elf64_Addr p_vaddr;
        ("p_paddr", Elf64_Addr),    # Elf64_Addr p_paddr;
        ("p_filesz", Elf64_Xword),  # Elf64_Xword p_filesz;
        ("p_memsz", Elf64_Xword),   # Elf64_Xword p_memsz;
        ("p_align", Elf64_Xword),   # Elf64_Xword p_align;
    ]


# Elf64_auxv_t related constants.

class _Elf64_auxv_t_U(Union):
    """The a_un union of Elf64_auxv_t; it only holds the integer value."""
    _fields_ = [
        ("a_val", c_uint64)  # uint64_t a_val; /* Integer value */
    ]


class Elf64_auxv_t(Structure):
    """Auxiliary vector entry (C: Elf64_auxv_t)."""
    _fields_ = [
        ("a_type", c_uint64),       # uint64_t a_type; /* Entry type */
        # /* We use to have pointer elements added here. We cannot do that,
        #    though, since it does not work when using 32-bit definitions
        #    on 64-bit platforms and vice versa. */
        ("a_un", _Elf64_auxv_t_U)   # union { uint64_t a_val; } a_un;
    ]


# Elf64_Nhdr related constants.

NT_PRSTATUS = 1             # Contains copy of prstatus struct
NT_FPREGSET = 2             # Contains copy of fpregset struct
NT_PRPSINFO = 3             # Contains copy of prpsinfo struct
NT_AUXV = 6                 # Contains copy of auxv array
NT_SIGINFO = 0x53494749     # Contains copy of siginfo_t, size might increase
NT_FILE = 0x46494c45        # Contains information about mapped files
NT_X86_XSTATE = 0x202       # x86 extended state using xsave


class Elf64_Nhdr(Structure):
    """Note header (C: Elf64_Nhdr)."""
    _fields_ = [
        ("n_namesz", Elf64_Word),  # Length of the note's name.
        ("n_descsz", Elf64_Word),  # Length of the note's descriptor.
        ("n_type", Elf64_Word),    # Type of the note.
    ]


# Elf64_Shdr related constants.

class Elf64_Shdr(Structure):
    """Section header (C: Elf64_Shdr)."""
    _fields_ = [
        ("sh_name", Elf64_Word),        # Section name (string tbl index)
        ("sh_type", Elf64_Word),        # Section type
        ("sh_flags", Elf64_Xword),      # Section flags
        ("sh_addr", Elf64_Addr),        # Section virtual addr at execution
        ("sh_offset", Elf64_Off),       # Section file offset
        ("sh_size", Elf64_Xword),       # Section size in bytes
        ("sh_link", Elf64_Word),        # Link to another section
        ("sh_info", Elf64_Word),        # Additional section information
        ("sh_addralign", Elf64_Xword),  # Section alignment
        ("sh_entsize", Elf64_Xword)     # Entry size if section holds table
    ]


# elf_prstatus related constants.

# Signal info.
class elf_siginfo(Structure):
    """C: struct elf_siginfo."""
    _fields_ = [
        ("si_signo", c_int),  # int si_signo; /* Signal number. */
        ("si_code", c_int),   # int si_code;  /* Extra code. */
        ("si_errno", c_int)   # int si_errno; /* Errno. */
    ]


# A time value that is accurate to the nearest
# microsecond but also has a range of years.
class timeval(Structure):
    """C: struct timeval."""
    _fields_ = [
        ("tv_sec", c_long),   # __time_t tv_sec;       /* Seconds. */
        ("tv_usec", c_long)   # __suseconds_t tv_usec; /* Microseconds. */
    ]


class user_regs_struct(Structure):
    """
    C: struct user_regs_struct. Every member is declared as
    "__extension__ unsigned long long int <name>;", in this order.
    """
    _fields_ = [(_name, c_ulonglong) for _name in (
        "r15", "r14", "r13", "r12", "rbp", "rbx", "r11", "r10",
        "r9", "r8", "rax", "rcx", "rdx", "rsi", "rdi", "orig_rax",
        "rip", "cs", "eflags", "rsp", "ss", "fs_base", "gs_base",
        "ds", "es", "fs", "gs",
    )]


# In C elf_gregset_t is an array of elf_greg_t (unsigned long long) with
# sizeof(user_regs_struct)/sizeof(elf_greg_t) elements; the structure is
# used here instead so that registers can be addressed by name.
elf_gregset_t = user_regs_struct


class elf_prstatus(Structure):
    """C: struct elf_prstatus."""
    _fields_ = [
        ("pr_info", elf_siginfo),   # Info associated with signal.
        ("pr_cursig", c_short),     # Current signal.
        ("pr_sigpend", c_ulong),    # Set of pending signals.
        ("pr_sighold", c_ulong),    # Set of held signals.
        ("pr_pid", c_int),          # __pid_t pr_pid;
        ("pr_ppid", c_int),         # __pid_t pr_ppid;
        ("pr_pgrp", c_int),         # __pid_t pr_pgrp;
        ("pr_sid", c_int),          # __pid_t pr_sid;
        ("pr_utime", timeval),      # User time.
        ("pr_stime", timeval),      # System time.
        ("pr_cutime", timeval),     # Cumulative user time.
        ("pr_cstime", timeval),     # Cumulative system time.
        ("pr_reg", elf_gregset_t),  # GP registers.
        ("pr_fpvalid", c_int)       # True if math copro being used.
    ]


# elf_prpsinfo related constants.

ELF_PRARGSZ = 80  # #define ELF_PRARGSZ (80) /* Number of chars for args. */


class elf_prpsinfo(Structure):
    """C: struct elf_prpsinfo (the __WORDSIZE != 32 variant of uid/gid)."""
    _fields_ = [
        ("pr_state", c_byte),    # char pr_state; /* Numeric process state. */
        ("pr_sname", c_char),    # char pr_sname; /* Char for pr_state. */
        ("pr_zomb", c_byte),     # char pr_zomb;  /* Zombie. */
        ("pr_nice", c_byte),     # char pr_nice;  /* Nice val. */
        ("pr_flag", c_ulong),    # unsigned long int pr_flag; /* Flags. */
        ("pr_uid", c_uint),      # unsigned int pr_uid;
        ("pr_gid", c_uint),      # unsigned int pr_gid;
        ("pr_pid", c_int),       # int pr_pid, pr_ppid, pr_pgrp, pr_sid;
        ("pr_ppid", c_int),
        ("pr_pgrp", c_int),
        ("pr_sid", c_int),
        # /* Lots missing */
        ("pr_fname", c_char * 16),           # Filename of executable.
        ("pr_psargs", c_char * ELF_PRARGSZ)  # Initial part of arg list.
    ]


class user_fpregs_struct(Structure):
    """C: struct user_fpregs_struct."""
    _fields_ = [
        ("cwd", c_ushort),          # unsigned short int cwd;
        ("swd", c_ushort),          # unsigned short int swd;
        ("ftw", c_ushort),          # unsigned short int ftw;
        ("fop", c_ushort),          # unsigned short int fop;
        ("rip", c_ulonglong),       # unsigned long long int rip;
        ("rdp", c_ulonglong),       # unsigned long long int rdp;
        ("mxcsr", c_uint),          # unsigned int mxcsr;
        ("mxcr_mask", c_uint),      # unsigned int mxcr_mask;
        ("st_space", c_uint * 32),  # 8*16 bytes for each FP-reg = 128 bytes
        ("xmm_space", c_uint * 64), # 16*16 bytes for each XMM-reg = 256 bytes
        ("padding", c_uint * 24),   # unsigned int padding[24];
    ]


elf_fpregset_t = user_fpregs_struct


# siginfo_t related constants.

_SI_MAX_SIZE = 128
# Floor division keeps this an int so that it can be used as an array
# length below (true division would produce a float).
_SI_PAD_SIZE = (_SI_MAX_SIZE // sizeof(c_int)) - 4


# /* kill(). */
class _siginfo_t_U_kill(Structure):
    _fields_ = [
        ("si_pid", c_int),   # __pid_t si_pid; /* Sending process ID. */
        ("si_uid", c_uint)   # __uid_t si_uid; /* Real user ID of sending process. */
    ]


# Type for data associated with a signal.
class sigval_t(Union):
    """C: union sigval."""
    _fields_ = [
        ("sival_int", c_int),     # int sival_int;
        ("sival_ptr", c_void_p),  # void *sival_ptr;
        # Misspelled name this member used to have; kept as an alias for
        # backward compatibility (union members overlap, so the size of
        # the union is not affected).
        ("sical_ptr", c_void_p),
    ]


# /* POSIX.1b timers. */
class _siginfo_t_U_timer(Structure):
    _fields_ = [
        ("si_tid", c_int),       # int si_tid;          /* Timer ID. */
        ("si_overrun", c_int),   # int si_overrun;      /* Overrun count. */
        ("si_sigval", sigval_t)  # sigval_t si_sigval;  /* Signal value. */
    ]


# /* POSIX.1b signals. */
class _siginfo_t_U_rt(Structure):
    _fields_ = [
        ("si_pid", c_int),       # __pid_t si_pid;      /* Sending process ID. */
        ("si_uid", c_uint),      # __uid_t si_uid;      /* Real user ID of sending process. */
        ("si_sigval", sigval_t)  # sigval_t si_sigval;  /* Signal value. */
    ]


# /* SIGCHLD. */
class _siginfo_t_U_sigchld(Structure):
    _fields_ = [
        ("si_pid", c_int),     # __pid_t si_pid;  /* Which child. */
        ("si_uid", c_uint),    # __uid_t si_uid;  /* Real user ID of sending process. */
        ("si_status", c_int),  # int si_status;   /* Exit value or signal. */
        ("si_utime", c_long),  # __sigchld_clock_t si_utime;
        ("si_stime", c_long)   # __sigchld_clock_t si_stime;
    ]


# /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */
class _siginfo_t_U_sigfault(Structure):
    _fields_ = [
        ("si_addr", c_void_p),    # void *si_addr; /* Faulting insn/memory ref. */
        ("si_addr_lsb", c_short)  # short int si_addr_lsb; /* Valid LSB of the reported address. */
    ]


# /* SIGPOLL. */
class _siginfo_t_U_sigpoll(Structure):
    _fields_ = [
        ("si_band", c_long),  # long int si_band; /* Band event for SIGPOLL. */
        ("si_fd", c_int)      # int si_fd;
    ]


# /* SIGSYS. */
class _siginfo_t_U_sigsys(Structure):
    _fields_ = [
        ("_call_addr", c_void_p),  # void *_call_addr; /* Calling user insn. */
        ("_syscall", c_int),       # int _syscall; /* Triggering system call number. */
        ("_arch", c_uint)          # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */
    ]


class _siginfo_t_U(Union):
    """The _sifields union of siginfo_t: one member per signal family."""
    _fields_ = [
        ("_pad", c_int * _SI_PAD_SIZE),        # int _pad[__SI_PAD_SIZE];
        ("_kill", _siginfo_t_U_kill),          # kill().
        ("_timer", _siginfo_t_U_timer),        # POSIX.1b timers.
        ("_rt", _siginfo_t_U_rt),              # POSIX.1b signals.
        ("_sigchld", _siginfo_t_U_sigchld),    # SIGCHLD.
        ("_sigfault", _siginfo_t_U_sigfault),  # SIGILL, SIGFPE, SIGSEGV, SIGBUS.
        ("_sigpoll", _siginfo_t_U_sigpoll),    # SIGPOLL.
        ("_sigsys", _siginfo_t_U_sigsys)       # SIGSYS.
    ]


class siginfo_t(Structure):
    """C: siginfo_t."""
    _fields_ = [
        ("si_signo", c_int),          # int si_signo; /* Signal number. */
        ("si_errno", c_int),          # int si_errno; /* If non-zero, an errno value
                                      #    associated with this signal. */
        ("si_code", c_int),           # int si_code;  /* Signal code. */
        ("_sifields", _siginfo_t_U)   # union { ... } _sifields;
    ]


# xsave related.

class ymmh_struct(Structure):
    """C: struct ymmh_struct."""
    _fields_ = [
        ("ymmh_space", 64 * c_uint)  # u32 ymmh_space[64];
    ]


class xsave_hdr_struct(Structure):
    """C: struct xsave_hdr_struct."""
    _fields_ = [
        ("xstate_bv", c_ulonglong),      # u64 xstate_bv;
        ("reserved1", c_ulonglong * 2),  # u64 reserved1[2];
        ("reserved2", c_ulonglong * 5)   # u64 reserved2[5];
    ]


class i387_fxsave_struct(Structure):
    """C: struct i387_fxsave_struct."""
    _fields_ = [
        ("cwd", c_ushort),           # u16 cwd; /* Control Word */
        ("swd", c_ushort),           # u16 swd; /* Status Word */
        ("twd", c_ushort),           # u16 twd; /* Tag Word */
        ("fop", c_ushort),           # u16 fop; /* Last Instruction Opcode */
        # In C the next two members are one arm of an anonymous union, the
        # other arm being { u32 fip; u32 fcs; u32 foo; u32 fos; }.
        ("rip", c_ulonglong),        # u64 rip; /* Instruction Pointer */
        ("rdp", c_ulonglong),        # u64 rdp; /* Data Pointer */
        ("mxcsr", c_uint),           # u32 mxcsr; /* MXCSR Register State */
        ("mxcsr_mask", c_uint),      # u32 mxcsr_mask; /* MXCSR Mask */
        ("st_space", c_uint * 32),   # 8*16 bytes for each FP-reg = 128 bytes
        ("xmm_space", c_uint * 64),  # 16*16 bytes for each XMM-reg = 256 bytes
        ("padding", c_uint * 12),    # u32 padding[12];
        ("padding1", c_uint * 12)    # union { u32 padding1[12]; u32 sw_reserved[12]; };
    ]


class elf_xsave_struct(Structure):
    """C: struct xsave_struct."""
    _fields_ = [
        ("i387", i387_fxsave_struct),     # struct i387_fxsave_struct i387;
        ("xsave_hdr", xsave_hdr_struct),  # struct xsave_hdr_struct xsave_hdr;
        ("ymmh", ymmh_struct)             # struct ymmh_struct ymmh;
    ]
% exc.magic sys.exit(1) if opts['pretty']: indent = 4 f = outf(opts) json.dump(img, f, indent=indent) if f == sys.stdout: f.write("\n") def encode(opts): img = json.load(inf(opts)) pycriu.images.dump(img, outf(opts)) def info(opts): infs = pycriu.images.info(inf(opts)) json.dump(infs, sys.stdout, indent = 4) print # # Explorers # class ps_item: def __init__(self, p, core): self.pid = p['pid'] self.ppid = p['ppid'] self.p = p self.core = core self.kids = [] def show_ps(p, opts, depth = 0): print "%7d%7d%7d %s%s" % (p.pid, p.p['pgid'], p.p['sid'], ' ' * (4 * depth), p.core['tc']['comm']) for kid in p.kids: show_ps(kid, opts, depth + 1) def explore_ps(opts): pss = { } ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) for p in ps_img['entries']: core = pycriu.images.load(dinf(opts, 'core-%d.img' % p['pid'])) ps = ps_item(p, core['entries'][0]) pss[ps.pid] = ps # Build tree psr = None for pid in pss: p = pss[pid] if p.ppid == 0: psr = p continue pp = pss[p.ppid] pp.kids.append(p) print "%7s%7s%7s %s" % ('PID', 'PGID', 'SID', 'COMM') show_ps(psr, opts) files_img = None def ftype_find_in_files(opts, ft, fid): global files_img if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] except: files_img = [] if len(files_img) == 0: return None for f in files_img: if f['id'] == fid: return f return None def ftype_find_in_image(opts, ft, fid, img): f = ftype_find_in_files(opts, ft, fid) if f: return f[ft['field']] if ft['img'] == None: ft['img'] = pycriu.images.load(dinf(opts, img))['entries'] for f in ft['img']: if f['id'] == fid: return f return None def ftype_reg(opts, ft, fid): rf = ftype_find_in_image(opts, ft, fid, 'reg-files.img') return rf and rf['name'] or 'unknown path' def ftype_pipe(opts, ft, fid): p = ftype_find_in_image(opts, ft, fid, 'pipes.img') return p and 'pipe[%d]' % p['pipe_id'] or 'pipe[?]' def ftype_unix(opts, ft, fid): ux = ftype_find_in_image(opts, ft, fid, 'unixsk.img') if not ux: return 'unix[?]' n = ux['name'] 
and ' %s' % ux['name'] or '' return 'unix[%d (%d)%s]' % (ux['ino'], ux['peer'], n) file_types = { 'REG': {'get': ftype_reg, 'img': None, 'field': 'reg'}, 'PIPE': {'get': ftype_pipe, 'img': None, 'field': 'pipe'}, 'UNIXSK': {'get': ftype_unix, 'img': None, 'field': 'usk'}, } def ftype_gen(opts, ft, fid): return '%s.%d' % (ft['typ'], fid) files_cache = { } def get_file_str(opts, fd): key = (fd['type'], fd['id']) f = files_cache.get(key, None) if not f: ft = file_types.get(fd['type'], {'get': ftype_gen, 'typ': fd['type']}) f = ft['get'](opts, ft, fd['id']) files_cache[key] = f return f def explore_fds(opts): ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) for p in ps_img['entries']: pid = p['pid'] idi = pycriu.images.load(dinf(opts, 'ids-%s.img' % pid)) fdt = idi['entries'][0]['files_id'] fdi = pycriu.images.load(dinf(opts, 'fdinfo-%d.img' % fdt)) print "%d" % pid for fd in fdi['entries']: print "\t%7d: %s" % (fd['fd'], get_file_str(opts, fd)) fdi = pycriu.images.load(dinf(opts, 'fs-%d.img' % pid))['entries'][0] print "\t%7s: %s" % ('cwd', get_file_str(opts, {'type': 'REG', 'id': fdi['cwd_id']})) print "\t%7s: %s" % ('root', get_file_str(opts, {'type': 'REG', 'id': fdi['root_id']})) class vma_id: def __init__(self): self.__ids = {} self.__last = 1 def get(self, iid): ret = self.__ids.get(iid, None) if not ret: ret = self.__last self.__last += 1 self.__ids[iid] = ret return ret def explore_mems(opts): ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) vids = vma_id() for p in ps_img['entries']: pid = p['pid'] mmi = pycriu.images.load(dinf(opts, 'mm-%d.img' % pid))['entries'][0] print "%d" % pid print "\t%-36s %s" % ('exe', get_file_str(opts, {'type': 'REG', 'id': mmi['exe_file_id']})) for vma in mmi['vmas']: st = vma['status'] if st & (1 << 10): fn = ' ' + 'ips[%lx]' % vids.get(vma['shmid']) elif st & (1 << 8): fn = ' ' + 'shmem[%lx]' % vids.get(vma['shmid']) elif st & (1 << 11): fn = ' ' + 'packet[%lx]' % vids.get(vma['shmid']) elif st & ((1 << 6) | (1 << 
7)): fn = ' ' + get_file_str(opts, {'type': 'REG', 'id': vma['shmid']}) if vma['pgoff']: fn += ' + %#lx' % vma['pgoff'] if st & (1 << 7): fn += ' (s)' elif st & (1 << 1): fn = ' [stack]' elif st & (1 << 2): fn = ' [vsyscall]' elif st & (1 << 3): fn = ' [vdso]' elif vma['flags'] & 0x0100: # growsdown fn = ' [stack?]' else: fn = '' if not st & (1 << 0): fn += ' *' prot = vma['prot'] & 0x1 and 'r' or '-' prot += vma['prot'] & 0x2 and 'w' or '-' prot += vma['prot'] & 0x4 and 'x' or '-' astr = '%08lx-%08lx' % (vma['start'], vma['end']) print "\t%-36s%s%s" % (astr, prot, fn) def explore_rss(opts): ps_img = pycriu.images.load(dinf(opts, 'pstree.img')) for p in ps_img['entries']: pid = p['pid'] vmas = pycriu.images.load(dinf(opts, 'mm-%d.img' % pid))['entries'][0]['vmas'] pms = pycriu.images.load(dinf(opts, 'pagemap-%d.img' % pid))['entries'] print "%d" % pid vmi = 0 pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) while vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' while vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' else: vstr += ' %08lx / %-8d' % (vma['start'], (vma['end'] - vma['start'])>>12) if vma['status'] & ((1 << 6) | (1 << 7)): vstr += ' ' + get_file_str(opts, {'type': 'REG', 'id': vma['shmid']}) pvmi = vmi vstr += '\n\t%23s' % '' vmi += 1 vmi -= 1 print '%-24s%s' % (pstr, vstr) explorers = { 'ps': explore_ps, 'fds': explore_fds, 'mems': explore_mems, 'rss': explore_rss } def explore(opts): explorers[opts['what']](opts) def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser(description=desc, formatter_class=argparse.RawTextHelpFormatter) subparsers = parser.add_subparsers(help='Use crit CMD --help for command-specific help') # Decode decode_parser = subparsers.add_parser('decode', help = 'convert criu image from binary type to json') decode_parser.add_argument('--pretty', help = 'Multiline with indents and some numerical fields in 
field-specific format', action = 'store_true') decode_parser.add_argument('-i', '--in', help = 'criu image in binary format to be decoded (stdin by default)') decode_parser.add_argument('-o', '--out', help = 'where to put criu image in json format (stdout by default)') decode_parser.set_defaults(func=decode, nopl=False) # Encode encode_parser = subparsers.add_parser('encode', help = 'convert criu image from json type to binary') encode_parser.add_argument('-i', '--in', help = 'criu image in json format to be encoded (stdin by default)') encode_parser.add_argument('-o', '--out', help = 'where to put criu image in binary format (stdout by default)') encode_parser.set_defaults(func=encode) # Info info_parser = subparsers.add_parser('info', help = 'show info about image') info_parser.add_argument("in") info_parser.set_defaults(func=info) # Explore x_parser = subparsers.add_parser('x', help = 'explore image dir') x_parser.add_argument('dir') x_parser.add_argument('what', choices = [ 'ps', 'fds', 'mems', 'rss']) x_parser.set_defaults(func=explore) # Show show_parser = subparsers.add_parser('show', help = "convert criu image from binary to human-readable json") show_parser.add_argument("in") show_parser.add_argument('--nopl', help = 'do not show entry payload (if exists)', action = 'store_true') show_parser.set_defaults(func=decode, pretty=True, out=None) opts = vars(parser.parse_args()) opts["func"](opts) if __name__ == '__main__': main() criu-3.6/crit/pycriu000077700000000000000000000000001317335042600161352../lib/py/ustar00rootroot00000000000000criu-3.6/criu/000077500000000000000000000000001317335042600132725ustar00rootroot00000000000000criu-3.6/criu/Makefile000066400000000000000000000066451317335042600147450ustar00rootroot00000000000000.PHONY: .FORCE # here is a workaround for a bug in libnl-3: # 6a8d90f5fec4 "attr: Allow attribute type 0" WRAPFLAGS += -Wl,--wrap=nla_parse,--wrap=nlmsg_parse ARCH_DIR := criu/arch/$(SRCARCH) PIE_DIR := criu/pie export ARCH_DIR PIE_DIR 
ifeq ($(filter clean mrproper,$(MAKECMDGOALS)),) COMPEL_UAPI_INCLUDES := $(shell $(COMPEL_BIN) includes) export COMPEL_UAPI_INCLUDES COMPEL_LIBS := $(shell $(COMPEL_BIN) --static libs) endif # # General flags. ccflags-y += -fno-strict-aliasing ccflags-y += -iquote criu/include ccflags-y += -iquote include ccflags-y += -iquote images ccflags-y += -iquote $(ARCH_DIR)/include ccflags-y += -iquote . ccflags-y += -I/usr/include/libnl3 ccflags-y += $(COMPEL_UAPI_INCLUDES) export ccflags-y ifeq ($(GMON),1) CFLAGS += -pg GMONLDOPT := -pg endif # msg-* printing include $(__nmk_dir)msg.mk # # Needed libraries checks include criu/Makefile.packages # # Architecture dependent part. ARCH-LIB := $(ARCH_DIR)/crtools.built-in.o $(ARCH-LIB): .FORCE $(Q) $(MAKE) $(build)=$(ARCH_DIR) all # # PIE library code. criu/pie/pie.lib.a: $(ARCH-LIB) .FORCE $(Q) $(MAKE) $(call build-as,Makefile.library,criu/pie) all # # PIE code blobs themseves. pie: criu/pie/pie.lib.a $(Q) $(MAKE) $(build)=criu/pie all .PHONY: pie criu/pie/Makefile: ; criu/pie/Makefile.library: ; criu/pie/%: pie ; # # CRIU executable PROGRAM-BUILTINS += criu/pie/pie.lib.a PROGRAM-BUILTINS += images/built-in.o PROGRAM-BUILTINS += $(obj)/built-in.o PROGRAM-BUILTINS += $(ARCH-LIB) PROGRAM-BUILTINS += soccr/libsoccr.a PROGRAM-BUILTINS += $(COMPEL_LIBS) $(obj)/built-in.o: pie $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) all $(obj)/Makefile: ; $(obj)/Makefile.crtools: ; $(obj)/Makefile.packages: ; $(obj)/%: pie $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) $@ $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ # # Clean the most, except generated c files subclean: $(Q) $(RM) $(obj)/*.{gcda,gcno,gcov} $(Q) $(RM) $(obj)/pie/*.{gcda,gcno,gcov} $(Q) $(RM) -r $(obj)/gcov $(Q) $(MAKE) $(build)=$(ARCH_DIR) clean $(Q) $(MAKE) $(call build-as,Makefile.library,$(PIE_DIR)) clean $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) clean $(Q) 
$(MAKE) $(build)=$(PIE_DIR) clean .PHONY: subclean cleanup-y += $(obj)/criu clean: subclean # # Delete all generated files subproper: $(Q) $(MAKE) $(build)=$(ARCH_DIR) mrproper $(Q) $(MAKE) $(call build-as,Makefile.library,$(PIE_DIR)) mrproper $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) mrproper $(Q) $(MAKE) $(build)=$(PIE_DIR) mrproper .PHONY: subproper mrproper: subproper UAPI_HEADERS := criu/include/criu-plugin.h UAPI_HEADERS += criu/include/criu-log.h install: $(obj)/criu $(E) " INSTALL " $(obj)/criu $(Q) mkdir -p $(DESTDIR)$(SBINDIR) $(Q) install -m 755 $(obj)/criu $(DESTDIR)$(SBINDIR) $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts .PHONY: install uninstall: $(E) " UNINSTALL" criu $(Q) $(RM) $(addprefix $(DESTDIR)$(SBINDIR)/,criu) $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/criu/scripts/,systemd-autofs-restart.sh) .PHONY: uninstall all-y += check-packages $(obj)/criu criu-3.6/criu/Makefile.crtools000066400000000000000000000046321317335042600164230ustar00rootroot00000000000000ccflags-y += -iquote criu/$(ARCH) ccflags-y += $(COMPEL_UAPI_INCLUDES) CFLAGS_REMOVE_clone-noasan.o += $(CFLAGS-ASAN) CFLAGS_kerndat.o += -DKDAT_MAGIC_2=${shell echo $${SOURCE_DATE_EPOCH:-$$(date +%s)}} -DKDAT_RUNDIR=\"$(RUNDIR)\" ldflags-y += -r obj-y += action-scripts.o obj-y += external.o obj-y += aio.o obj-y += bfd.o obj-y += bitmap.o obj-y += cgroup.o obj-y += cgroup-props.o obj-y += clone-noasan.o obj-y += cr-check.o obj-y += cr-dedup.o obj-y += cr-dump.o obj-y += cr-errno.o obj-y += cr-restore.o obj-y += cr-service.o obj-y += crtools.o obj-y += eventfd.o obj-y += eventpoll.o obj-y += fault-injection.o obj-y += fifo.o obj-y += file-ids.o obj-y += file-lock.o obj-y += files-ext.o obj-y += 
files.o obj-y += files-reg.o obj-y += fsnotify.o obj-y += image-desc.o obj-y += image.o obj-y += ipc_ns.o obj-y += irmap.o obj-y += kcmp-ids.o obj-y += kerndat.o obj-y += libnetlink.o obj-y += log.o obj-y += lsm.o obj-y += mem.o obj-y += mount.o obj-y += filesystems.o obj-y += namespaces.o obj-y += netfilter.o obj-y += net.o obj-y += pagemap-cache.o obj-y += page-pipe.o obj-y += pagemap.o obj-y += page-xfer.o obj-y += parasite-syscall.o obj-y += pie-util.o obj-y += pipes.o obj-y += plugin.o obj-y += proc_parse.o obj-y += protobuf-desc.o obj-y += protobuf.o obj-y += pstree.o obj-y += rbtree.o obj-y += rst-malloc.o obj-y += seccomp.o obj-y += seize.o obj-y += shmem.o obj-y += sigframe.o obj-y += signalfd.o obj-y += sk-inet.o obj-y += sk-netlink.o obj-y += sk-packet.o obj-y += sk-queue.o obj-y += sk-tcp.o obj-y += sk-unix.o obj-y += sockets.o obj-y += stats.o obj-y += string.o obj-y += sysctl.o obj-y += sysfs_parse.o obj-y += timerfd.o obj-y += tty.o obj-y += tun.o obj-y += util.o obj-y += uts_ns.o obj-y += path.o obj-y += autofs.o obj-y += fdstore.o obj-y += uffd.o ifeq ($(VDSO),y) obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) endif PROTOBUF_GEN := scripts/protobuf-gen.sh $(obj)/protobuf-desc.d: $(obj)/protobuf-desc-gen.h $(obj)/protobuf-desc-gen.h: $(PROTOBUF_GEN) criu/include/protobuf-desc.h $(call msg-gen, $@) $(Q) $(SH) $(PROTOBUF_GEN) > $@ mrproper-y += $(obj)/protobuf-desc-gen.h criu-3.6/criu/Makefile.packages000066400000000000000000000026221317335042600165110ustar00rootroot00000000000000REQ-RPM-PKG-NAMES += protobuf REQ-RPM-PKG-NAMES += protobuf-c REQ-RPM-PKG-NAMES += protobuf-c-devel REQ-RPM-PKG-NAMES += protobuf-compiler REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel REQ-RPM-PKG-TEST-NAMES += 
libaio-devel REQ-DEB-PKG-NAMES += libprotobuf-dev REQ-DEB-PKG-NAMES += libprotobuf-c0-dev REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += python-protobuf REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev REQ-DEB-PKG-TEST-NAMES += libaio-dev export LIBS += -lrt -lpthread -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet check-packages-failed: $(warning Can not find some of the required libraries) $(warning Make sure the following packages are installed) $(warning RPM based distros: $(REQ-RPM-PKG-NAMES)) $(warning DEB based distros: $(REQ-DEB-PKG-NAMES)) $(warning To run tests the following packages are needed) $(warning RPM based distros: $(REQ-RPM-PKG-TEST-NAMES)) $(warning DEB based distros: $(REQ-DEB-PKG-TEST-NAMES)) $(error Compilation aborted) # # Make sure all required libs are installed PROGRAM_STUB := int main(int argc, char **argv) { return 0; } check-packages: $(Q) $(call try-cc,$(PROGRAM_STUB),$(LIBS)) \ || $(MAKE) -f $(obj)/Makefile.packages check-packages-failed .PHONY: check-packages-failed check-packages criu-3.6/criu/action-scripts.c000066400000000000000000000067011317335042600164040ustar00rootroot00000000000000#include #include #include #include #include "cr_options.h" #include "common/list.h" #include "xmalloc.h" #include "log.h" #include "servicefd.h" #include "cr-service.h" #include "action-scripts.h" #include "pstree.h" #include "common/bug.h" #include "util.h" #include #include #include "common/scm.h" static const char *action_names[ACT_MAX] = { [ ACT_PRE_DUMP ] = "pre-dump", [ ACT_POST_DUMP ] = "post-dump", [ ACT_PRE_RESTORE ] = "pre-restore", [ ACT_POST_RESTORE ] = "post-restore", [ ACT_NET_LOCK ] = "network-lock", [ ACT_NET_UNLOCK ] = "network-unlock", [ ACT_SETUP_NS ] = "setup-namespaces", [ ACT_POST_SETUP_NS ] = "post-setup-namespaces", [ ACT_PRE_RESUME ] = "pre-resume", [ ACT_POST_RESUME ] = "post-resume", [ ACT_ORPHAN_PTS_MASTER ] = "orphan-pts-master", }; struct script { 
struct list_head node; char *path; }; enum { SCRIPTS_NONE, SCRIPTS_SHELL, SCRIPTS_RPC }; static int scripts_mode = SCRIPTS_NONE; static int rpc_sk; static LIST_HEAD(scripts); static int run_shell_scripts(const char *action) { int retval = 0; struct script *script; char image_dir[PATH_MAX]; static unsigned env_set = 0; #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; } if (!(env_set & ENV_IMGDIR)) { sprintf(image_dir, "/proc/%ld/fd/%d", (long) getpid(), get_service_fd(IMG_FD_OFF)); if (setenv("CRTOOLS_IMAGE_DIR", image_dir, 1)) { pr_perror("Can't set CRTOOLS_IMAGE_DIR=%s", image_dir); return -1; } env_set |= ENV_IMGDIR; } if (!(env_set & ENV_ROOTPID) && root_item) { int pid; char root_item_pid[16]; pid = root_item->pid->real; if (pid != -1) { snprintf(root_item_pid, sizeof(root_item_pid), "%d", pid); if (setenv("CRTOOLS_INIT_PID", root_item_pid, 1)) { pr_perror("Can't set CRTOOLS_INIT_PID=%s", root_item_pid); return -1; } env_set |= ENV_ROOTPID; } } list_for_each_entry(script, &scripts, node) { int err; pr_debug("\t[%s]\n", script->path); err = cr_system(-1, -1, -1, script->path, (char *[]) { script->path, NULL }, 0); if (err) pr_err("Script %s exited with %d\n", script->path, err); retval |= err; } unsetenv("CRTOOLS_SCRIPT_ACTION"); return retval; } int rpc_send_fd(enum script_actions act, int fd) { const char *action = action_names[act]; if (scripts_mode != SCRIPTS_RPC) return -1; pr_debug("\tRPC\n"); return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } int run_scripts(enum script_actions act) { int ret = 0; const char *action = action_names[act]; pr_debug("Running %s scripts\n", action); if (scripts_mode == SCRIPTS_NONE) return 0; if (scripts_mode == SCRIPTS_RPC) { pr_debug("\tRPC\n"); ret = send_criu_rpc_script(act, (char *)action, rpc_sk, -1); goto out; } if (scripts_mode == SCRIPTS_SHELL) { ret = run_shell_scripts(action); goto out; } 
BUG(); out: if (ret) pr_err("One of more action scripts failed\n"); return ret; } int add_script(char *path) { struct script *script; BUG_ON(scripts_mode == SCRIPTS_RPC); scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) return 1; script->path = path; list_add(&script->node, &scripts); return 0; } int add_rpc_notify(int sk) { BUG_ON(scripts_mode == SCRIPTS_SHELL); scripts_mode = SCRIPTS_RPC; rpc_sk = install_service_fd(RPC_SK_OFF, sk); return 0; } criu-3.6/criu/aio.c000066400000000000000000000064331317335042600142140ustar00rootroot00000000000000#include #include #include #include "vma.h" #include "xmalloc.h" #include "pstree.h" #include "restorer.h" #include "aio.h" #include "rst_info.h" #include "rst-malloc.h" #include "parasite.h" #include "parasite-syscall.h" #include "images/mm.pb-c.h" #include #define NR_IOEVENTS_IN_NPAGES(npages) ((PAGE_SIZE * npages - sizeof(struct aio_ring)) / sizeof(struct io_event)) int dump_aio_ring(MmEntry *mme, struct vma_area *vma) { int nr = mme->n_aios; AioRingEntry *re; mme->aios = xrealloc(mme->aios, (nr + 1) * sizeof(re)); if (!mme->aios) return -1; re = xmalloc(sizeof(*re)); if (!re) return -1; aio_ring_entry__init(re); re->id = vma->e->start; re->ring_len = vma->e->end - vma->e->start; re->nr_req = aio_estimate_nr_reqs(re->ring_len); if (!re->nr_req) { xfree(re); return -1; } mme->aios[nr] = re; mme->n_aios = nr + 1; pr_info("Dumping AIO ring @%"PRIx64"-%"PRIx64"\n", vma->e->start, vma->e->end); return 0; } void free_aios(MmEntry *mme) { int i; if (mme->aios) { for (i = 0; i < mme->n_aios; i++) xfree(mme->aios[i]); xfree(mme->aios); } } unsigned int aio_estimate_nr_reqs(unsigned int size) { unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE); if (size & ~PAGE_MASK) { pr_err("Ring size is not aligned\n"); return 0; } /* * Kernel does * * nr_reqs = max(nr_reqs, nr_cpus * 4) * nr_reqs *= 2 * nr_reqs += 2 * ring = roundup(sizeof(head) + nr_reqs * sizeof(req)) * nr_reqs = (ring - 
sizeof(head)) / sizeof(req) * * And the k_max_reqs here is the resulting value. * * We need to get the initial nr_reqs that would grow * up back to the k_max_reqs. */ return (k_max_reqs - 2) / 2; } unsigned long aio_rings_args_size(struct vm_area_list *vmas) { return sizeof(struct parasite_check_aios_args) + vmas->nr_aios * sizeof(struct parasite_aio); } int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas) { struct vma_area *vma; struct parasite_check_aios_args *aa; struct parasite_aio *pa; if (!vmas->nr_aios) return 0; pr_info("Checking AIO rings\n"); /* * Go to parasite and * a) check that no requests are currently pengind * b) get the maximum number of requests kernel handles * to estimate what was the user request on ring * creation. */ aa = compel_parasite_args_s(ctl, aio_rings_args_size(vmas)); pa = &aa->ring[0]; list_for_each_entry(vma, &vmas->h, list) { if (!vma_area_is(vma, VMA_AREA_AIORING)) continue; pr_debug(" `- Ring #%ld @%"PRIx64"\n", (long)(pa - &aa->ring[0]), vma->e->start); pa->ctx = vma->e->start; pa->size = vma->e->end - vma->e->start; pa++; } aa->nr_rings = vmas->nr_aios; if (compel_rpc_call_sync(PARASITE_CMD_CHECK_AIOS, ctl)) return -1; return 0; } int prepare_aios(struct pstree_item *t, struct task_restore_args *ta) { int i; MmEntry *mm = rsti(t)->mm; /* * Put info about AIO rings, they will get remapped */ ta->rings = (struct rst_aio_ring *)rst_mem_align_cpos(RM_PRIVATE); ta->rings_n = mm->n_aios; for (i = 0; i < mm->n_aios; i++) { struct rst_aio_ring *raio; raio = rst_mem_alloc(sizeof(*raio), RM_PRIVATE); if (!raio) return -1; raio->addr = mm->aios[i]->id; raio->nr_req = mm->aios[i]->nr_req; raio->len = mm->aios[i]->ring_len; } return 0; } 
criu-3.6/criu/arch/000077500000000000000000000000001317335042600142075ustar00rootroot00000000000000criu-3.6/criu/arch/aarch64/000077500000000000000000000000001317335042600154375ustar00rootroot00000000000000criu-3.6/criu/arch/aarch64/Makefile000066400000000000000000000004421317335042600170770ustar00rootroot00000000000000builtin-name := crtools.built-in.o ccflags-y += -iquote $(obj)/include -iquote criu/include ccflags-y += -iquote include ccflags-y += $(COMPEL_UAPI_INCLUDES) asflags-y += -D__ASSEMBLY__ ldflags-y += -r obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o criu-3.6/criu/arch/aarch64/bitops.S000066400000000000000000000004721317335042600170660ustar00rootroot00000000000000#include "common/asm/linkage.h" .text ENTRY(test_and_set_bit) and w3, w0, #63 eor w0, w0, w3 mov x2, #1 add x1, x1, x0, lsr #3 lsl x4, x2, x3 1: ldaxr x2, [x1] lsr x0, x2, x3 orr x2, x2, x4 stlxr w5, x2, [x1] cbnz w5, 1b and x0, x0, #1 3: ret END(test_and_set_bit) criu-3.6/criu/arch/aarch64/cpu.c000066400000000000000000000006501317335042600163730ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include #include "cpu.h" int cpu_init(void) { return 0; } int cpu_dump_cpuinfo(void) { return 0; } int cpu_validate_cpuinfo(void) { return 0; } int cpu_dump_cpuinfo_single(void) { return -ENOTSUP; } int cpu_validate_image_cpuinfo_single(void) { return -ENOTSUP; } int cpuinfo_dump(void) { return -ENOTSUP; } int cpuinfo_check(void) { return -ENOTSUP; } criu-3.6/criu/arch/aarch64/crtools.c000066400000000000000000000063101317335042600172700ustar00rootroot00000000000000#include #include #include #include "types.h" #include #include #include "asm/restorer.h" #include "common/compiler.h" #include #include "asm/dump.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" #include "parasite-syscall.h" #include "log.h" #include "util.h" #include "cpu.h" #include "restorer.h" #include #define assign_reg(dst, src, e) dst->e = 
(__typeof__(dst->e))(src)->e int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; // Save the Aarch64 CPU state for (i = 0; i < 31; ++i) assign_reg(core->ti_aarch64->gpregs, regs, regs[i]); assign_reg(core->ti_aarch64->gpregs, regs, sp); assign_reg(core->ti_aarch64->gpregs, regs, pc); assign_reg(core->ti_aarch64->gpregs, regs, pstate); // Save the FP/SIMD state for (i = 0; i < 32; ++i) { core->ti_aarch64->fpsimd->vregs[2*i] = fpsimd->vregs[i]; core->ti_aarch64->fpsimd->vregs[2*i + 1] = fpsimd->vregs[i] >> 64; } assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); return 0; } int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoAarch64 *ti_aarch64; UserAarch64RegsEntry *gpregs; UserAarch64FpsimdContextEntry *fpsimd; ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); if (!ti_aarch64) goto err; thread_info_aarch64__init(ti_aarch64); core->ti_aarch64 = ti_aarch64; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) goto err; user_aarch64_regs_entry__init(gpregs); gpregs->regs = xmalloc(31*sizeof(uint64_t)); if (!gpregs->regs) goto err; gpregs->n_regs = 31; ti_aarch64->gpregs = gpregs; fpsimd = xmalloc(sizeof(*fpsimd)); if (!fpsimd) goto err; user_aarch64_fpsimd_context_entry__init(fpsimd); ti_aarch64->fpsimd = fpsimd; fpsimd->vregs = xmalloc(64*sizeof(fpsimd->vregs[0])); fpsimd->n_vregs = 64; if (!fpsimd->vregs) goto err; return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (CORE_THREAD_ARCH_INFO(core)) { if (CORE_THREAD_ARCH_INFO(core)->fpsimd) { xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int i; struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); if 
(core->ti_aarch64->fpsimd->n_vregs != 64) return 1; for (i = 0; i < 32; ++i) fpsimd->vregs[i] = (__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i] | ((__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i + 1] << 64); assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpsr); assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpcr); fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); return 0; } int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) { #define CPREG1(d) f->uc.uc_mcontext.d = r->d int i; for (i = 0; i < 31; ++i) CPREG1(regs[i]); CPREG1(sp); CPREG1(pc); CPREG1(pstate); #undef CPREG1 return 0; } criu-3.6/criu/arch/aarch64/include/000077500000000000000000000000001317335042600170625ustar00rootroot00000000000000criu-3.6/criu/arch/aarch64/include/asm/000077500000000000000000000000001317335042600176425ustar00rootroot00000000000000criu-3.6/criu/arch/aarch64/include/asm/dump.h000066400000000000000000000006171317335042600207640ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_aarch64->tls = tls; } #define get_task_futex_robust_list_compat(pid, info) -1 #endif criu-3.6/criu/arch/aarch64/include/asm/int.h000066400000000000000000000001571317335042600206100ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ criu-3.6/criu/arch/aarch64/include/asm/parasite-syscall.h000066400000000000000000000001521317335042600232710ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif criu-3.6/criu/arch/aarch64/include/asm/parasite.h000066400000000000000000000002621317335042600216230ustar00rootroot00000000000000#ifndef 
__ASM_PARASITE_H__ #define __ASM_PARASITE_H__ static inline void arch_get_tls(tls_t *ptls) { tls_t tls; asm("mrs %0, tpidr_el0" : "=r" (tls)); *ptls = tls; } #endif criu-3.6/criu/arch/aarch64/include/asm/restore.h000066400000000000000000000011271317335042600214770ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "and sp, %0, #~15 \n" \ "mov x0, %2 \n" \ "br %1 \n" \ : \ : "r"(new_sp), \ "r"(restore_task_exec_start), \ "r"(task_args) \ : "sp", "x0", "memory") static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_aarch64->tls; } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif criu-3.6/criu/arch/aarch64/include/asm/restorer.h000066400000000000000000000036271317335042600216700ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include #include #include "asm/types.h" #include "images/core.pb-c.h" #include #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "clone_emul: \n" \ "ldr x1, %2 \n" \ "and x1, x1, #~15 \n" \ "sub x1, x1, #16 \n" \ "stp %5, %6, [x1] \n" \ "mov x0, %1 \n" \ "mov x2, %3 \n" \ "mov x3, %4 \n" \ "mov x8, #"__stringify(__NR_clone)" \n" \ "svc #0 \n" \ \ "cbz x0, thread_run \n" \ \ "mov %0, x0 \n" \ "b clone_end \n" \ \ "thread_run: \n" \ "ldp x1, x0, [sp] \n" \ "br x1 \n" \ \ "clone_end: \n" \ : "=r"(ret) \ : "r"(clone_flags), \ "m"(new_sp), \ "r"(&parent_tid), \ "r"(&thread_args[i].pid), \ "r"(clone_restore_fn), \ "r"(&thread_args[i]) \ : "x0", "x1", "x2", "x3", "x8", "memory") #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ "mov x0, #0 \n" \ "b x0 \n" \ : \ : "r"(ret) \ : "sp", "x0", "memory") #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #define arch_map_vdso(map, compat) -1 int 
restore_gpregs(struct rt_sigframe *f, UserAarch64RegsEntry *r); int restore_nonsigframe_gpregs(UserAarch64RegsEntry *r); static inline void restore_tls(tls_t *ptls) { asm("msr tpidr_el0, %0" : : "r" (*ptls)); } static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif criu-3.6/criu/arch/aarch64/include/asm/types.h000066400000000000000000000013631317335042600211620ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include #include "images/core.pb-c.h" #include "page.h" #include "bitops.h" #include "asm/int.h" #include #define core_is_compat(core) false typedef UserAarch64RegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__AARCH64 #define CORE_THREAD_ARCH_INFO(core) core->ti_aarch64 #define TI_SP(core) ((core)->ti_aarch64->gpregs->sp) static inline void *decode_pointer(uint64_t v) { return (void*)v; } static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } #define AT_VECTOR_SIZE 40 typedef uint64_t auxv_t; typedef uint64_t tls_t; #endif /* __CR_ASM_TYPES_H__ */ criu-3.6/criu/arch/aarch64/include/asm/vdso.h000066400000000000000000000016321317335042600207700ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "common/compiler.h" #include "asm-generic/vdso.h" /* * This is a minimal amount of symbols * we should support at the moment. */ #define VDSO_SYMBOL_MAX 4 /* * Workaround for VDSO array symbol table's relocation. * XXX: remove when compel/piegen will support aarch64. 
*/ static const char* __maybe_unused aarch_vdso_symbol1 = "__kernel_clock_getres"; static const char* __maybe_unused aarch_vdso_symbol2 = "__kernel_clock_gettime"; static const char* __maybe_unused aarch_vdso_symbol3 = "__kernel_gettimeofday"; static const char* __maybe_unused aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, \ aarch_vdso_symbol2, \ aarch_vdso_symbol3, \ aarch_vdso_symbol4 extern void write_intraprocedure_branch(unsigned long to, unsigned long from); #endif /* __CR_ASM_VDSO_H__ */ criu-3.6/criu/arch/aarch64/intraprocedure.S000066400000000000000000000010071317335042600206070ustar00rootroot00000000000000.global write_intraprocedure_branch /* to is x0, from is x1 */ write_intraprocedure_branch: /* load two 32-bit instructions */ ldr x2, loadbranch /* store 64 bits of instructions and 64 bits of destination address */ stp x2, x0, [x1] /* perform required cache maintenance and synronization operations */ dc cvau, x1 dsb ish ic ivau, x1 dsb ish isb ret /* intraprocedure trampoline instructions */ loadbranch: ldr x16, =destination br x16 /* label to get relative position of literal pool */ destination: criu-3.6/criu/arch/aarch64/restorer.c000066400000000000000000000003551317335042600174530ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include "log.h" #include #include "cpu.h" int restore_nonsigframe_gpregs(UserRegsEntry *r) { return 0; } criu-3.6/criu/arch/aarch64/sigframe.c000066400000000000000000000003101317335042600173720ustar00rootroot00000000000000#include "asm/types.h" #include #include "asm/sigframe.h" int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } criu-3.6/criu/arch/aarch64/vdso-pie.c000066400000000000000000000014121317335042600173270ustar00rootroot00000000000000#include #include "asm/types.h" #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX # undef 
LOG_PREFIX #endif #define LOG_PREFIX "vdso: " int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { if (vdso_symbol_empty(&from->symbols[i])) continue; pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, from->symbols[i].offset, base_to, to->symbols[i].offset, i); write_intraprocedure_branch(base_to + to->symbols[i].offset, base_from + from->symbols[i].offset); } return 0; } criu-3.6/criu/arch/arm/000077500000000000000000000000001317335042600147665ustar00rootroot00000000000000criu-3.6/criu/arch/arm/Makefile000066400000000000000000000004621317335042600164300ustar00rootroot00000000000000builtin-name := crtools.built-in.o ccflags-y += -iquote $(obj)/include ccflags-y += -iquote criu/include -iquote include ccflags-y += $(COMPEL_UAPI_INCLUDES) asflags-y += -D__ASSEMBLY__ ldflags-y += -r -z noexecstack obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o criu-3.6/criu/arch/arm/bitops.S000066400000000000000000000011331317335042600164100ustar00rootroot00000000000000#include "common/asm/linkage.h" .syntax unified ENTRY(test_and_set_bit) ands ip, r1, #3 strbne r1, [ip] @ assert word-aligned mov r2, #1 and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 add r1, r1, r0, lsl #2 @ Get word offset mov r3, r2, lsl r3 @ create mask dmb ish 1: ldrex r2, [r1] ands r0, r2, r3 @ save old value of bit orreq r2, r2, r3 @ toggle bit strex ip, r2, [r1] cmp ip, #0 bne 1b dmb ish cmp r0, #0 movne r0, #1 2: bx lr END(test_and_set_bit) criu-3.6/criu/arch/arm/cpu.c000066400000000000000000000006501317335042600157220ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include #include "cpu.h" int cpu_init(void) { return 0; } int cpu_dump_cpuinfo(void) { return 0; } int cpu_validate_cpuinfo(void) { return 0; } int cpu_dump_cpuinfo_single(void) { return -ENOTSUP; } int 
cpu_validate_image_cpuinfo_single(void) { return -ENOTSUP; } int cpuinfo_dump(void) { return -ENOTSUP; } int cpuinfo_check(void) { return -ENOTSUP; } criu-3.6/criu/arch/arm/crtools.c000066400000000000000000000067661317335042600166360ustar00rootroot00000000000000#include #include #include "types.h" #include #include #include "asm/restorer.h" #include "common/compiler.h" #include "asm/dump.h" #include #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" #include "log.h" #include "util.h" #include "cpu.h" #include "elf.h" #include "parasite-syscall.h" #include "restorer.h" #include #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; // Save the ARM CPU state assign_reg(core->ti_arm->gpregs, regs, r0); assign_reg(core->ti_arm->gpregs, regs, r1); assign_reg(core->ti_arm->gpregs, regs, r2); assign_reg(core->ti_arm->gpregs, regs, r3); assign_reg(core->ti_arm->gpregs, regs, r4); assign_reg(core->ti_arm->gpregs, regs, r5); assign_reg(core->ti_arm->gpregs, regs, r6); assign_reg(core->ti_arm->gpregs, regs, r7); assign_reg(core->ti_arm->gpregs, regs, r8); assign_reg(core->ti_arm->gpregs, regs, r9); assign_reg(core->ti_arm->gpregs, regs, r10); assign_reg(core->ti_arm->gpregs, regs, fp); assign_reg(core->ti_arm->gpregs, regs, ip); assign_reg(core->ti_arm->gpregs, regs, sp); assign_reg(core->ti_arm->gpregs, regs, lr); assign_reg(core->ti_arm->gpregs, regs, pc); assign_reg(core->ti_arm->gpregs, regs, cpsr); core->ti_arm->gpregs->orig_r0 = regs->ARM_ORIG_r0; // Save the VFP state memcpy(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, &fpregs->fpregs, sizeof(fpregs->fpregs)); CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr = fpregs->fpscr; return 0; } int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoArm *ti_arm; UserArmRegsEntry *gpregs; UserArmVfpstateEntry *fpstate; ti_arm = xmalloc(sizeof(*ti_arm)); if (!ti_arm) goto 
err; thread_info_arm__init(ti_arm); core->ti_arm = ti_arm; gpregs = xmalloc(sizeof(*gpregs)); user_arm_regs_entry__init(gpregs); ti_arm->gpregs = gpregs; fpstate = xmalloc(sizeof(*fpstate)); if (!fpstate) goto err; user_arm_vfpstate_entry__init(fpstate); ti_arm->fpstate = fpstate; fpstate->vfp_regs = xmalloc(32*sizeof(unsigned long long)); fpstate->n_vfp_regs = 32; if (!fpstate->vfp_regs) goto err; return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (CORE_THREAD_ARCH_INFO(core)) { if (CORE_THREAD_ARCH_INFO(core)->fpstate) { xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs); xfree(CORE_THREAD_ARCH_INFO(core)->fpstate); } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { struct aux_sigframe *aux = (struct aux_sigframe *)&sigframe->sig.uc.uc_regspace; memcpy(&aux->vfp.ufp.fpregs, CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, sizeof(aux->vfp.ufp.fpregs)); aux->vfp.ufp.fpscr = CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr; aux->vfp.magic = VFP_MAGIC; aux->vfp.size = VFP_STORAGE_SIZE; return 0; } int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r) { #define CPREG1(d) f->sig.uc.uc_mcontext.arm_##d = r->d #define CPREG2(d, s) f->sig.uc.uc_mcontext.arm_##d = r->s CPREG1(r0); CPREG1(r1); CPREG1(r2); CPREG1(r3); CPREG1(r4); CPREG1(r5); CPREG1(r6); CPREG1(r7); CPREG1(r8); CPREG1(r9); CPREG1(r10); CPREG1(fp); CPREG1(ip); CPREG1(sp); CPREG1(lr); CPREG1(pc); CPREG1(cpsr); #undef CPREG1 #undef CPREG2 return 0; } criu-3.6/criu/arch/arm/include/000077500000000000000000000000001317335042600164115ustar00rootroot00000000000000criu-3.6/criu/arch/arm/include/asm/000077500000000000000000000000001317335042600171715ustar00rootroot00000000000000criu-3.6/criu/arch/arm/include/asm/dump.h000066400000000000000000000006131317335042600203070ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ 
extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_arm->tls = tls; } #define get_task_futex_robust_list_compat(pid, info) -1 #endif criu-3.6/criu/arch/arm/include/asm/int.h000066400000000000000000000001571317335042600201370ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ criu-3.6/criu/arch/arm/include/asm/parasite-syscall.h000066400000000000000000000001521317335042600226200ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif criu-3.6/criu/arch/arm/include/asm/parasite.h000066400000000000000000000002321317335042600211470ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ static inline void arch_get_tls(tls_t *ptls) { *ptls = ((tls_t (*)(void))0xffff0fe0)(); } #endif criu-3.6/criu/arch/arm/include/asm/restore.h000066400000000000000000000012521317335042600210250ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "mov sp, %0 \n" \ "mov r1, %1 \n" \ "mov r0, %2 \n" \ "bx r1 \n" \ : \ : "r"(new_sp), \ "r"(restore_task_exec_start), \ "r"(task_args) \ : "sp", "r0", "r1", "memory") static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_arm->tls; } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif criu-3.6/criu/arch/arm/include/asm/restorer.h000066400000000000000000000044531317335042600212150ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include "asm/types.h" #include "images/core.pb-c.h" #include #define 
RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "clone_emul: \n" \ "ldr r1, %2 \n" \ "sub r1, #16 \n" \ "mov r0, %6 \n" \ "str r0, [r1, #4] \n" \ "mov r0, %5 \n" \ "str r0, [r1] \n" \ "mov r0, %1 \n" \ "mov r2, %3 \n" \ "mov r3, %4 \n" \ "mov r7, #"__stringify(__NR_clone)" \n" \ "svc #0 \n" \ \ "cmp r0, #0 \n" \ "beq thread_run \n" \ \ "mov %0, r0 \n" \ "b clone_end \n" \ \ "thread_run: \n" \ "pop { r1 } \n" \ "pop { r0 } \n" \ "bx r1 \n" \ \ "clone_end: \n" \ : "=r"(ret) \ : "r"(clone_flags), \ "m"(new_sp), \ "r"(&parent_tid), \ "r"(&thread_args[i].pid), \ "r"(clone_restore_fn), \ "r"(&thread_args[i]) \ : "r0", "r1", "r2", "r3", "r7", "memory") #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ "mov r0, #0 \n" \ "bx r0 \n" \ : \ : "r"(ret) \ : "memory") #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r); int restore_nonsigframe_gpregs(UserArmRegsEntry *r); #define ARCH_HAS_SHMAT_HOOK unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size); static inline void restore_tls(tls_t *ptls) { asm ( "mov r7, #15 \n" "lsl r7, #16 \n" "mov r0, #5 \n" "add r7, r0 \n" /* r7 = 0xF005 */ "ldr r0, [%0] \n" "svc #0 \n" : : "r"(ptls) : "r0", "r7" ); } static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif criu-3.6/criu/arch/arm/include/asm/types.h000066400000000000000000000013021317335042600205020ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include "images/core.pb-c.h" #include "page.h" #include "bitops.h" #include "asm/int.h" #include #define 
core_is_compat(core) false typedef UserArmRegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__ARM #define CORE_THREAD_ARCH_INFO(core) core->ti_arm #define TI_SP(core) ((core)->ti_arm->gpregs->sp) static inline void *decode_pointer(u64 v) { return (void*)(u32)v; } static inline u64 encode_pointer(void *p) { return (u32)p; } #define AT_VECTOR_SIZE 40 typedef uint32_t auxv_t; typedef uint32_t tls_t; #endif /* __CR_ASM_TYPES_H__ */ criu-3.6/criu/arch/arm/restorer.c000066400000000000000000000050151317335042600170000ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include "log.h" #include #include "cpu.h" #include "page.h" #include "common/err.h" int restore_nonsigframe_gpregs(UserArmRegsEntry *r) { return 0; } /* * On ARMv6 CPUs with VIPT caches there are aliasing issues: * if two different cache line indexes correspond to the same physical * address, then changes made to one of the alias might be lost or they * can overwrite each other. To overcome aliasing issues, page coloring * with 4 pages align for shared mappings was introduced (SHMLBA) in kernel. * Which resulted in unique physical address after any tag in cache * (because two upper bits corresponding to page address get unused in tags). * * The problem here is in shmat() syscall: * 1. if shmaddr is NULL then do_shmat() uses arch_get_unmapped_area() * to allocate shared mapping. Which checks if CPU cache is VIPT * and only then use SHMLBA alignment. * 2. if shmaddr is specified then do_shmat() checks that address has * SHMLBA alignment regardless to CPU cache aliasing. * * All above means that on non-VIPT CPU (like any ARMv7) we can get * non-SHMLBA, but page-aligned address with shmat(shmid, NULL, shmflg), * but we can't restore it with shmat(shmid, shmaddr, shmflg). * Which results that we can dump e.g., application with shmem aligned * on 2 pages, but can't restore it on the same ARMv7 CPU. 
* * To workaround this kernel feature, use mremap() on shmem mapping, * allocated with shmat(shmid, NULL, shmflg). */ #define SHMLBA (4UL * PAGE_SIZE) unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size) { unsigned long smap; /* SHMLBA-aligned, direct call shmat() */ if (!((unsigned long)shmaddr & (SHMLBA - 1))) return sys_shmat(shmid, shmaddr, shmflg); smap = sys_shmat(shmid, NULL, shmflg); if (IS_ERR_VALUE(smap)) { pr_err("shmat() with NULL shmaddr failed: %d\n", (int)smap); return smap; } /* We're lucky! */ if (smap == (unsigned long)shmaddr) return smap; /* Warn ALOUD */ pr_warn("Restoring shmem %p unaligned to SHMLBA.\n", shmaddr); pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached (e.g., ARMv7 -> ARMv6)\n"); pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n"); smap = sys_mremap(smap, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr); if (IS_ERR_VALUE(smap)) pr_err("mremap() for shmem failed: %d\n", (int)smap); return smap; } criu-3.6/criu/arch/arm/sigframe.c000066400000000000000000000003101317335042600167210ustar00rootroot00000000000000#include "asm/types.h" #include #include "asm/sigframe.h" int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } criu-3.6/criu/arch/arm/uidiv.S000066400000000000000000000101041317335042600162260ustar00rootroot00000000000000.globl __aeabi_uidiv work .req r4 @ XXXX is this safe ? dividend .req r0 divisor .req r1 overdone .req r2 result .req r2 curbit .req r3 #define LSYM(x) x .macro THUMB_DIV_MOD_BODY modulo @ Load the constant 0x10000000 into our work register. mov work, #1 lsl work, #28 LSYM(Loop1): @ Unless the divisor is very big, shift it up in multiples of @ four bits, since this is the amount of unwinding in the main @ division loop. Continue shifting until the divisor is @ larger than the dividend. 
cmp divisor, work bhs LSYM(Lbignum) cmp divisor, dividend bhs LSYM(Lbignum) lsl divisor, #4 lsl curbit, #4 b LSYM(Loop1) LSYM(Lbignum): @ Set work to 0x80000000 lsl work, #3 LSYM(Loop2): @ For very big divisors, we must shift it a bit at a time, or @ we will be in danger of overflowing. cmp divisor, work bhs LSYM(Loop3) cmp divisor, dividend bhs LSYM(Loop3) lsl divisor, #1 lsl curbit, #1 b LSYM(Loop2) LSYM(Loop3): @ Test for possible subtractions ... .if \modulo @ ... On the final pass, this may subtract too much from the dividend, @ so keep track of which subtractions are done, we can fix them up @ afterwards. mov overdone, #0 cmp dividend, divisor blo LSYM(Lover1) sub dividend, dividend, divisor LSYM(Lover1): lsr work, divisor, #1 cmp dividend, work blo LSYM(Lover2) sub dividend, dividend, work mov ip, curbit mov work, #1 ror curbit, work orr overdone, curbit mov curbit, ip LSYM(Lover2): lsr work, divisor, #2 cmp dividend, work blo LSYM(Lover3) sub dividend, dividend, work mov ip, curbit mov work, #2 ror curbit, work orr overdone, curbit mov curbit, ip LSYM(Lover3): lsr work, divisor, #3 cmp dividend, work blo LSYM(Lover4) sub dividend, dividend, work mov ip, curbit mov work, #3 ror curbit, work orr overdone, curbit mov curbit, ip LSYM(Lover4): mov ip, curbit .else @ ... and note which bits are done in the result. On the final pass, @ this may subtract too much from the dividend, but the result will be ok, @ since the "bit" will have been shifted out at the bottom. 
cmp dividend, divisor blo LSYM(Lover1) sub dividend, dividend, divisor orr result, result, curbit LSYM(Lover1): lsr work, divisor, #1 cmp dividend, work blo LSYM(Lover2) sub dividend, dividend, work lsr work, curbit, #1 orr result, work LSYM(Lover2): lsr work, divisor, #2 cmp dividend, work blo LSYM(Lover3) sub dividend, dividend, work lsr work, curbit, #2 orr result, work LSYM(Lover3): lsr work, divisor, #3 cmp dividend, work blo LSYM(Lover4) sub dividend, dividend, work lsr work, curbit, #3 orr result, work LSYM(Lover4): .endif cmp dividend, #0 @ Early termination? beq LSYM(Lover5) lsr curbit, #4 @ No, any more bits to do? beq LSYM(Lover5) lsr divisor, #4 b LSYM(Loop3) LSYM(Lover5): .if \modulo @ Any subtractions that we should not have done will be recorded in @ the top three bits of "overdone". Exactly which were not needed @ are governed by the position of the bit, stored in ip. mov work, #0xe lsl work, #28 and overdone, work beq LSYM(Lgot_result) @ If we terminated early, because dividend became zero, then the @ bit in ip will not be in the bottom nibble, and we should not @ perform the additions below. We must test for this though @ (rather relying upon the TSTs to prevent the additions) since @ the bit in ip could be in the top two bits which might then match @ with one of the smaller RORs. 
mov curbit, ip mov work, #0x7 tst curbit, work beq LSYM(Lgot_result) mov curbit, ip mov work, #3 ror curbit, work tst overdone, curbit beq LSYM(Lover6) lsr work, divisor, #3 add dividend, work LSYM(Lover6): mov curbit, ip mov work, #2 ror curbit, work tst overdone, curbit beq LSYM(Lover7) lsr work, divisor, #2 add dividend, work LSYM(Lover7): mov curbit, ip mov work, #1 ror curbit, work tst overdone, curbit beq LSYM(Lgot_result) lsr work, divisor, #1 add dividend, work .endif LSYM(Lgot_result): .endm .thumb .text __aeabi_uidiv: mov curbit, #1 mov result, #0 push { work } cmp dividend, divisor blo LSYM(Lgot_result) THUMB_DIV_MOD_BODY 0 mov r0, result pop { work } bx lr criu-3.6/criu/arch/ppc64/000077500000000000000000000000001317335042600151435ustar00rootroot00000000000000criu-3.6/criu/arch/ppc64/Makefile000066400000000000000000000003611317335042600166030ustar00rootroot00000000000000builtin-name := crtools.built-in.o ccflags-y += -iquote $(obj)/include ccflags-y += -iquote criu/include -iquote include ccflags-y += $(COMPEL_UAPI_INCLUDES) ldflags-y += -r obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o criu-3.6/criu/arch/ppc64/cpu.c000066400000000000000000000056401317335042600161030ustar00rootroot00000000000000#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include #include #include #include "asm/types.h" #include "cr_options.h" #include "image.h" #include "util.h" #include "log.h" #include "cpu.h" #include "protobuf.h" #include "images/cpuinfo.pb-c.h" static compel_cpuinfo_t rt_cpuinfo; #ifdef __LITTLE_ENDIAN__ #define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANNESS__LITTLEENDIAN #else #define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANESS__BIGENDIAN #endif int cpu_init(void) { return compel_cpuid(&rt_cpuinfo); } int cpu_dump_cpuinfo(void) { CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT; CpuinfoPpc64Entry cpu_ppc64_info = CPUINFO_PPC64_ENTRY__INIT; CpuinfoPpc64Entry *cpu_ppc64_info_ptr = &cpu_ppc64_info; struct cr_img *img; int ret = -1; img = 
open_image(CR_FD_CPUINFO, O_DUMP); if (!img) return -1; cpu_info.ppc64_entry = &cpu_ppc64_info_ptr; cpu_info.n_ppc64_entry = 1; cpu_ppc64_info.endian = CURRENT_ENDIANNESS; cpu_ppc64_info.n_hwcap = 2; cpu_ppc64_info.hwcap = rt_cpuinfo.hwcap; ret = pb_write_one(img, &cpu_info, PB_CPUINFO); close_image(img); return ret; } int cpu_validate_cpuinfo(void) { CpuinfoEntry *cpu_info; CpuinfoPpc64Entry *cpu_ppc64_entry; struct cr_img *img; int ret = -1; img = open_image(CR_FD_CPUINFO, O_RSTR); if (!img) return -1; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; if (cpu_info->n_ppc64_entry != 1) { pr_err("No PPC64 related entry in image\n"); goto error; } cpu_ppc64_entry = cpu_info->ppc64_entry[0]; if (cpu_ppc64_entry->endian != CURRENT_ENDIANNESS) { pr_err("Bad endianness\n"); goto error; } if (cpu_ppc64_entry->n_hwcap != 2) { pr_err("Hardware capabilities information missing\n"); goto error; } #define CHECK_FEATURE(s,f) do { \ if ((cpu_ppc64_entry->hwcap[s] & f) && \ !(rt_cpuinfo.hwcap[s] & f)) { \ pr_err("CPU Feature %s required by image " \ "is not supported on host.\n", #f); \ goto error; \ } \ } while(0) #define REQUIRE_FEATURE(s,f) do { \ if (!(cpu_ppc64_entry->hwcap[s] & f)) { \ pr_err("CPU Feature %s missing in image.\n", #f); \ goto error; \ } \ } while(0) REQUIRE_FEATURE(0, PPC_FEATURE_64); REQUIRE_FEATURE(0, PPC_FEATURE_HAS_FPU); REQUIRE_FEATURE(0, PPC_FEATURE_HAS_MMU); REQUIRE_FEATURE(0, PPC_FEATURE_HAS_VSX); REQUIRE_FEATURE(1, PPC_FEATURE2_ARCH_2_07); CHECK_FEATURE(0, PPC_FEATURE_TRUE_LE); CHECK_FEATURE(1, PPC_FEATURE2_HTM); CHECK_FEATURE(1, PPC_FEATURE2_DSCR); CHECK_FEATURE(1, PPC_FEATURE2_EBB); CHECK_FEATURE(1, PPC_FEATURE2_ISEL); CHECK_FEATURE(1, PPC_FEATURE2_TAR); CHECK_FEATURE(1, PPC_FEATURE2_VEC_CRYPTO); ret = 0; error: close_image(img); return ret; } int cpuinfo_dump(void) { if (cpu_init()) return -1; if (cpu_dump_cpuinfo()) return -1; return 0; } int cpuinfo_check(void) { if (cpu_init()) return -1; if (cpu_validate_cpuinfo()) return 1; 
return 0; } criu-3.6/criu/arch/ppc64/crtools.c000066400000000000000000000302641317335042600170010ustar00rootroot00000000000000#include #include #include #include #include #include #include "types.h" #include #include "asm/restorer.h" #include "asm/dump.h" #include "cr_options.h" #include "common/compiler.h" #include #include "parasite-syscall.h" #include "log.h" #include "util.h" #include "cpu.h" #include #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" static UserPpc64FpstateEntry *copy_fp_regs(uint64_t *fpregs) { UserPpc64FpstateEntry *fpe; int i; fpe = xmalloc(sizeof(UserPpc64FpstateEntry)); if (!fpe) return NULL; user_ppc64_fpstate_entry__init(fpe); fpe->n_fpregs = NFPREG; fpe->fpregs = xmalloc(fpe->n_fpregs * sizeof(fpe->fpregs[0])); if (!fpe->fpregs) { xfree(fpe); return NULL; } /* FPSRC is the last (33th) register in the set */ for (i = 0; i < NFPREG; i++) fpe->fpregs[i] = fpregs[i]; return fpe; } static void put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe) { uint64_t *mcfp = (uint64_t *)mc->fp_regs; size_t i; for (i = 0; i < fpe->n_fpregs; i++) mcfp[i] = fpe->fpregs[i]; } static UserPpc64VrstateEntry *copy_altivec_regs(__vector128 *vrregs) { UserPpc64VrstateEntry *vse; uint64_t *p64; uint32_t *p32; int i; vse = xmalloc(sizeof(*vse)); if (!vse) return NULL; user_ppc64_vrstate_entry__init(vse); /* protocol buffer store only 64bit entries and we need 128bit */ vse->n_vrregs = (NVRREG-1) * 2; vse->vrregs = xmalloc(vse->n_vrregs * sizeof(vse->vrregs[0])); if (!vse->vrregs) { xfree(vse); return NULL; } /* Vectors are 2*64bits entries */ for (i = 0; i < (NVRREG-1); i++) { p64 = (uint64_t*) &vrregs[i]; vse->vrregs[i*2] = p64[0]; vse->vrregs[i*2 + 1] = p64[1]; } p32 = (uint32_t*) &vrregs[NVRREG-1]; vse->vrsave = *p32; return vse; } static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse) { vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful); pr_debug("Restoring Altivec 
registers\n"); if (vse->n_vrregs != (NVRREG-1)*2) { pr_err("Corrupted Altivec dump data\n"); return -1; } /* Note that this should only be done in the case MSR_VEC is set but * this is not a big deal to do that in all cases. */ memcpy(&v_regs->vrregs[0][0], vse->vrregs, sizeof(uint64_t) * 2 * (NVRREG-1)); /* vscr has been restored with the previous memcpy which copied 32 * 128bits registers + a 128bits field containing the vscr value in * the low part. */ v_regs->vrsave = vse->vrsave; mc->v_regs = v_regs; return 0; } static UserPpc64VsxstateEntry* copy_vsx_regs(uint64_t *vsregs) { UserPpc64VsxstateEntry *vse; int i; vse = xmalloc(sizeof(*vse)); if (!vse) return NULL; user_ppc64_vsxstate_entry__init(vse); vse->n_vsxregs = NVSXREG; vse->vsxregs = xmalloc(vse->n_vsxregs*sizeof(vse->vsxregs[0])); if (!vse->vsxregs) { xfree(vse); return NULL; } for (i = 0; i < vse->n_vsxregs; i++) vse->vsxregs[i] = vsregs[i]; return vse; } static int put_vsx_regs(mcontext_t *mc, UserPpc64VsxstateEntry *vse) { uint64_t *buf; int i; pr_debug("Restoring VSX registers\n"); if (!mc->v_regs) { /* VSX implies Altivec so v_regs should be set */ pr_err("Internal error\n"); return -1; } /* point after the Altivec registers */ buf = (uint64_t*) (mc->v_regs + 1); /* Copy the value saved by get_vsx_regs in the sigframe */ for (i=0; i < vse->n_vsxregs; i++) buf[i] = vse->vsxregs[i]; return 0; } static void copy_gp_regs(UserPpc64RegsEntry *dst, user_regs_struct_t *src) { int i; #define assign_reg(e) do { \ dst->e = (__typeof__(dst->e))src->e; \ } while (0) for (i=0; i<32; i++) assign_reg(gpr[i]); assign_reg(nip); assign_reg(msr); assign_reg(orig_gpr3); assign_reg(ctr); assign_reg(link); assign_reg(xer); assign_reg(ccr); assign_reg(trap); #undef assign_reg } static void restore_gp_regs(mcontext_t *dst, UserPpc64RegsEntry *src) { int i; /* r0 to r31 */ for (i=0; i<32; i++) dst->gp_regs[i] = src->gpr[i]; dst->gp_regs[PT_NIP] = src->nip; dst->gp_regs[PT_MSR] = src->msr; dst->gp_regs[PT_ORIG_R3] = 
src->orig_gpr3; dst->gp_regs[PT_CTR] = src->ctr; dst->gp_regs[PT_LNK] = src->link; dst->gp_regs[PT_XER] = src->xer; dst->gp_regs[PT_CCR] = src->ccr; dst->gp_regs[PT_TRAP] = src->trap; } static UserPpc64RegsEntry *allocate_gp_regs(void) { UserPpc64RegsEntry *gpregs; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) return NULL; user_ppc64_regs_entry__init(gpregs); gpregs->n_gpr = 32; gpregs->gpr = xmalloc(32 * sizeof(uint64_t)); if (!gpregs->gpr) { xfree(gpregs); return NULL; } return gpregs; } /**************************************************************************** * TRANSACTIONAL MEMORY SUPPORT */ static void xfree_tm_state(UserPpc64TmRegsEntry *tme) { if (tme) { if (tme->fpstate) { xfree(tme->fpstate->fpregs); xfree(tme->fpstate); } if (tme->vrstate) { xfree(tme->vrstate->vrregs); xfree(tme->vrstate); } if (tme->vsxstate) { xfree(tme->vsxstate->vsxregs); xfree(tme->vsxstate); } if (tme->gpregs) { if (tme->gpregs->gpr) xfree(tme->gpregs->gpr); xfree(tme->gpregs); } xfree(tme); } } static int put_tm_regs(struct rt_sigframe *f, UserPpc64TmRegsEntry *tme) { /* * WARNING: As stated in kernel's restore_tm_sigcontexts, TEXASR has to be * restored by the process itself : * TEXASR was set by the signal delivery reclaim, as was TFIAR. * Users doing anything abhorrent like thread-switching w/ signals for * TM-Suspended code will have to back TEXASR/TFIAR up themselves. * For the case of getting a signal and simply returning from it, * we don't need to re-copy them here. 
*/ ucontext_t *tm_uc = &f->uc_transact; pr_debug("Restoring TM registers FP:%d VR:%d VSX:%d\n", !!(tme->fpstate), !!(tme->vrstate), !!(tme->vsxstate)); restore_gp_regs(&tm_uc->uc_mcontext, tme->gpregs); if (tme->fpstate) put_fpu_regs(&tm_uc->uc_mcontext, tme->fpstate); if (tme->vrstate && put_altivec_regs(&tm_uc->uc_mcontext, tme->vrstate)) return -1; if (tme->vsxstate && put_vsx_regs(&tm_uc->uc_mcontext, tme->vsxstate)) return -1; f->uc.uc_link = tm_uc; return 0; } /****************************************************************************/ static int copy_tm_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, CoreEntry *core) { UserPpc64TmRegsEntry *tme; UserPpc64RegsEntry *gpregs = core->ti_ppc64->gpregs; pr_debug("Copying TM registers\n"); tme = xmalloc(sizeof(*tme)); if (!tme) return -1; user_ppc64_tm_regs_entry__init(tme); tme->gpregs = allocate_gp_regs(); if (!tme->gpregs) goto out_free; gpregs->has_tfhar = true; gpregs->tfhar = fpregs->tm.tm_spr_regs.tfhar; gpregs->has_texasr = true; gpregs->texasr = fpregs->tm.tm_spr_regs.texasr; gpregs->has_tfiar = true; gpregs->tfiar = fpregs->tm.tm_spr_regs.tfiar; /* This is the checkpointed state, we must save it in place of the * current state because the signal handler is made in this way. * We invert the 2 states instead of when building the signal frame, * because we can't modify the gpregs manipulated by the common layer. */ copy_gp_regs(gpregs, &fpregs->tm.regs); if (fpregs->tm.flags & USER_FPREGS_FL_FP) { core->ti_ppc64->fpstate = copy_fp_regs(fpregs->tm.fpregs); if (!core->ti_ppc64->fpstate) goto out_free; } if (fpregs->tm.flags & USER_FPREGS_FL_ALTIVEC) { core->ti_ppc64->vrstate = copy_altivec_regs(fpregs->tm.vrregs); if (!core->ti_ppc64->vrstate) goto out_free; /* * Force the MSR_VEC bit of the restored MSR otherwise the * kernel will not restore them from the signal frame. 
*/ gpregs->msr |= MSR_VEC; if (fpregs->tm.flags & USER_FPREGS_FL_VSX) { core->ti_ppc64->vsxstate = copy_vsx_regs(fpregs->tm.vsxregs); if (!core->ti_ppc64->vsxstate) goto out_free; /* * Force the MSR_VSX bit of the restored MSR otherwise * the kernel will not restore them from the signal * frame. */ gpregs->msr |= MSR_VSX; } } core->ti_ppc64->tmstate = tme; return 0; out_free: xfree_tm_state(tme); return -1; } static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, CoreEntry *core) { UserPpc64RegsEntry *gpregs; UserPpc64FpstateEntry **fpstate; UserPpc64VrstateEntry **vrstate; UserPpc64VsxstateEntry **vsxstate; /* Copy retrieved registers in the proto data * If TM is in the loop we switch the saved register set because * the signal frame is built with checkpointed registers on top to not * confused TM unaware process, while ptrace is retrieving the * checkpointed set through the TM specific ELF notes. */ if (fpregs->flags & USER_FPREGS_FL_TM) { if (copy_tm_regs(regs, fpregs, core)) return -1; gpregs = core->ti_ppc64->tmstate->gpregs; fpstate = &(core->ti_ppc64->tmstate->fpstate); vrstate = &(core->ti_ppc64->tmstate->vrstate); vsxstate = &(core->ti_ppc64->tmstate->vsxstate); } else { gpregs = core->ti_ppc64->gpregs; fpstate = &(core->ti_ppc64->fpstate); vrstate = &(core->ti_ppc64->vrstate); vsxstate = &(core->ti_ppc64->vsxstate); } copy_gp_regs(gpregs, regs); if (fpregs->flags & USER_FPREGS_FL_FP) { *fpstate = copy_fp_regs(fpregs->fpregs); if (!*fpstate) return -1; } if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { *vrstate = copy_altivec_regs(fpregs->vrregs); if (!*vrstate) return -1; /* * Force the MSR_VEC bit of the restored MSR otherwise the * kernel will not restore them from the signal frame. 
*/ gpregs->msr |= MSR_VEC; if (fpregs->flags & USER_FPREGS_FL_VSX) { *vsxstate = copy_vsx_regs(fpregs->vsxregs); if (!*vsxstate) return -1; /* * Force the MSR_VSX bit of the restored MSR otherwise * the kernel will not restore them from the signal * frame. */ gpregs->msr |= MSR_VSX; } } return 0; } int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } /****************************************************************************/ int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoPpc64 *ti_ppc64; ti_ppc64 = xmalloc(sizeof(*ti_ppc64)); if(!ti_ppc64) return -1; thread_info_ppc64__init(ti_ppc64); ti_ppc64->gpregs = allocate_gp_regs(); if (!ti_ppc64->gpregs) { xfree(ti_ppc64); return -1; } CORE_THREAD_ARCH_INFO(core) = ti_ppc64; return 0; } void arch_free_thread_info(CoreEntry *core) { if (CORE_THREAD_ARCH_INFO(core)) { if (CORE_THREAD_ARCH_INFO(core)->fpstate) { xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->fpregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpstate); } if (CORE_THREAD_ARCH_INFO(core)->vrstate) { xfree(CORE_THREAD_ARCH_INFO(core)->vrstate->vrregs); xfree(CORE_THREAD_ARCH_INFO(core)->vrstate); } if (CORE_THREAD_ARCH_INFO(core)->vsxstate) { xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate->vsxregs); xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate); } xfree_tm_state(CORE_THREAD_ARCH_INFO(core)->tmstate); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->gpr); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int ret = 0; if (CORE_THREAD_ARCH_INFO(core)->fpstate) put_fpu_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->fpstate); if (CORE_THREAD_ARCH_INFO(core)->vrstate) ret = put_altivec_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->vrstate); else if (core->ti_ppc64->gpregs->msr & MSR_VEC) { pr_err("Register's data mismatch, corrupted image 
?\n"); ret = -1; } if (!ret && CORE_THREAD_ARCH_INFO(core)->vsxstate) ret = put_vsx_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->vsxstate); else if (core->ti_ppc64->gpregs->msr & MSR_VSX) { pr_err("VSX register's data mismatch, corrupted image ?\n"); ret = -1; } if (!ret && CORE_THREAD_ARCH_INFO(core)->tmstate) ret = put_tm_regs(sigframe, CORE_THREAD_ARCH_INFO(core)->tmstate); else if (MSR_TM_ACTIVE(core->ti_ppc64->gpregs->msr)) { pr_err("TM register's data mismatch, corrupted image ?\n"); ret = -1; } return ret; } int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r) { restore_gp_regs(&f->uc.uc_mcontext, r); return 0; } criu-3.6/criu/arch/ppc64/include/000077500000000000000000000000001317335042600165665ustar00rootroot00000000000000criu-3.6/criu/arch/ppc64/include/asm/000077500000000000000000000000001317335042600173465ustar00rootroot00000000000000criu-3.6/criu/arch/ppc64/include/asm/dump.h000066400000000000000000000005211317335042600204620ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); #define core_put_tls(core, tls) #define get_task_futex_robust_list_compat(pid, info) -1 #endif criu-3.6/criu/arch/ppc64/include/asm/int.h000066400000000000000000000001571317335042600203140ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ criu-3.6/criu/arch/ppc64/include/asm/parasite-syscall.h000066400000000000000000000001521317335042600227750ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif criu-3.6/criu/arch/ppc64/include/asm/parasite.h000066400000000000000000000002721317335042600213300ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ /* TLS 
is accessed through r13, which is already processed */
/* Intentionally empty: ptls is not read or written. */
static inline void arch_get_tls(tls_t *ptls) { (void)ptls; }

#endif
criu-3.6/criu/arch/ppc64/include/asm/restore.h000066400000000000000000000014451317335042600212060ustar00rootroot00000000000000
#ifndef __CR_ASM_RESTORE_H__
#define __CR_ASM_RESTORE_H__

#include "asm/restorer.h"

#include "images/core.pb-c.h"

/*
 * Set R2 to blob + 8000 which is the default value
 * Jump to restore_task_exec_start + 8 since R2 is already set (local call)
 *
 * As written, the sequence loads new_sp into r1, the entry address into r12
 * and CTR, task_args into r3, then branches through CTR.
 * NOTE(review): no "+ 8" adjustment is visible in this macro; confirm
 * whether the sentence above is still accurate.
 */
#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
			      task_args) \
	asm volatile( \
		"mr 1,%0 \n" \
		"mr 12,%1 \n" \
		"mtctr 12 \n" \
		"mr 3,%2 \n" \
		"bctr \n" \
		: \
		: "r"(new_sp), \
		  "r"((unsigned long)restore_task_exec_start), \
		  "r"(task_args) \
		: "1", "3", "12")

/* There is nothing to do since TLS is accessed through r13 */
#define core_get_tls(pcore, ptls)

int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
#endif /* __CR_ASM_RESTORE_H__ */
criu-3.6/criu/arch/ppc64/include/asm/restorer.h000066400000000000000000000044401317335042600213660ustar00rootroot00000000000000
#ifndef __CR_ASM_RESTORER_H__
#define __CR_ASM_RESTORER_H__

/*
 * NOTE(review): the header names of the bare #include lines below are
 * missing in this copy of the file; restore them from the upstream tree.
 */
#include
#include
#include
#include "asm/types.h"

#include
#include

/*
 * Clone trampoline
 *
 * See glibc sysdeps/powerpc/powerpc64/sysdep.h for FRAME_MIN_SIZE defines
 *
 * Issues the __NR_clone system call with clone_flags in r3, new_sp in r4,
 * &parent_tid in r5, 0 in r6 and &thread_args[i].pid in r7.
 * In the child, control jumps to clone_restore_fn + 8 with &thread_args[i]
 * in r3; otherwise r3 is stored into ret.
 *
 * The expansion refers to a variable named `i` that must exist at the
 * place where the macro is used.
 */
#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
			     thread_args, clone_restore_fn) \
	asm volatile( \
		"clone_emul: \n" \
		"/* Save fn, args, stack across syscall. */ \n" \
		"mr 14, %5 /* clone_restore_fn in r14 */ \n" \
		"mr 15, %6 /* &thread_args[i] in r15 */ \n" \
		"mr 3, %1 /* clone_flags */ \n" \
		"ld 4, %2 /* new_sp */ \n" \
		"mr 5, %3 /* &parent_tid */ \n" \
		"li 6, 0 /* tls = 0 ? */ \n" \
		"mr 7, %4 /* &thread_args[i].pid */ \n" \
		"li 0,"__stringify(__NR_clone)" \n" \
		"sc \n" \
		"/* Check for child process. */ \n" \
		"cmpdi cr1,3,0 \n" \
		"crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \
		"bne- cr1,clone_end \n" \
		"/* child */ \n" \
		"addi 14, 14, 8 /* jump over r2 fixup */ \n" \
		"mtctr 14 \n" \
		"mr 3,15 \n" \
		"bctr \n" \
		"clone_end: \n" \
		"mr %0,3 \n" \
		: "=r"(ret) /* %0 */ \
		: "r"(clone_flags), /* %1 */ \
		  "m"(new_sp), /* %2 */ \
		  "r"(&parent_tid), /* %3 */ \
		  "r"(&thread_args[i].pid), /* %4 */ \
		  "r"(clone_restore_fn), /* %5 */ \
		  "r"(&thread_args[i]) /* %6 */ \
		: "memory","0","3","4","5","6","7","14","15")

/* The three compat/vDSO-mapping hooks below are constant on ppc64. */
#define kdat_compatible_cr() 0
#define kdat_can_map_vdso() 0
#define arch_map_vdso(map, compat) -1

int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r);
int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r);

/* Nothing to do, TLS is accessed through r13 */
static inline void restore_tls(tls_t *ptls) { (void)ptls; }

/*
 * Defined in arch/ppc64/syscall-common-ppc64.S
 * NOTE(review): a C definition of sys_shmat() also appears in
 * criu/arch/ppc64/restorer.c of this tree; confirm which one this refers to.
 */

unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg);

/* Compat (32-bit) helpers: stubs that return NULL / -1 or do nothing. */
static inline void *alloc_compat_syscall_stack(void) { return NULL; }
static inline void free_compat_syscall_stack(void *stack32) { }
static inline int
arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; }
static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
{
	return -1;
}

#endif /*__CR_ASM_RESTORER_H__*/
criu-3.6/criu/arch/ppc64/include/asm/types.h000066400000000000000000000020351317335042600206630ustar00rootroot00000000000000
#ifndef __CR_ASM_TYPES_H__
#define __CR_ASM_TYPES_H__

/*
 * NOTE(review): the header names of the bare #include lines below are
 * missing in this copy of the file; restore them from the upstream tree.
 */
#include
#include
#include "images/core.pb-c.h"

#include "page.h"
#include "bitops.h"
#include "asm/int.h"

#include

typedef UserPpc64RegsEntry UserRegsEntry;

#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64

#define core_is_compat(core) false

/* Expands to an lvalue: callers both read it and assign to it. */
#define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64

/* Plain casts between a 64-bit image value and a pointer. */
static inline void *decode_pointer(uint64_t v) { return (void*)v; }
static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; }

/*
 * Copied from the following kernel header files :
 *	include/linux/auxvec.h
 *	arch/powerpc/include/uapi/asm/auxvec.h
 *	include/linux/mm_types.h
 */
#define AT_VECTOR_SIZE_BASE 20
#if !defined AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 6
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))

typedef uint64_t auxv_t;

/* Not used but the structure parasite_dump_thread needs a tls_t field */
typedef uint64_t tls_t;

#endif /* __CR_ASM_TYPES_H__ */
criu-3.6/criu/arch/ppc64/include/asm/vdso.h000066400000000000000000000014511317335042600204730ustar00rootroot00000000000000
#ifndef __CR_ASM_VDSO_H__
#define __CR_ASM_VDSO_H__

#include "asm/int.h"
#include "asm-generic/vdso.h"

/* This definition is used in pie/util-vdso.c to initialize the vdso symbol
 * name string table 'vdso_symbols'
 *
 * Poke from kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S
 *
 * Note that '__kernel_datapage_offset' is not a service but mostly a data
 * inside the text page which should not be used as is from user space.
 */
/* VDSO_SYMBOL_MAX equals the number of names listed in ARCH_VDSO_SYMBOLS. */
#define VDSO_SYMBOL_MAX 10
#define ARCH_VDSO_SYMBOLS \
	"__kernel_clock_getres", \
	"__kernel_clock_gettime", \
	"__kernel_get_syscall_map", \
	"__kernel_get_tbfreq", \
	"__kernel_getcpu", \
	"__kernel_gettimeofday", \
	"__kernel_sigtramp_rt64", \
	"__kernel_sync_dicache", \
	"__kernel_sync_dicache_p5", \
	"__kernel_time"

#endif /* __CR_ASM_VDSO_H__ */
criu-3.6/criu/arch/ppc64/misc.S000066400000000000000000000104031317335042600162200ustar00rootroot00000000000000
/*
 * This is from linux/arch/powerpc/lib/crtsavres.S:
 *
 * Special support for eabi and SVR4
 *
 * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc.
 * Copyright 2008 Freescale Semiconductor, Inc.
 * Written By Michael Meissner
 *
 * Based on gcc/config/rs6000/crtsavres.asm from gcc
 * 64 bit additions from reading the PPC elf64abi document.
* * This file is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any * later version. * * In addition to the permissions in the GNU General Public License, the * Free Software Foundation gives you unlimited permission to link the * compiled version of this file with other programs, and to distribute * those programs without any restriction coming from the use of this * file. (The General Public License restrictions do apply in other * respects; for example, they cover modification of the file, and * distribution when not linked into another program.) * * This file is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. If not, write to * the Free Software Foundation, 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * * As a special exception, if you link this library with files * compiled with GCC to produce an executable, this does not cause * the resulting executable to be covered by the GNU General Public License. * This exception does not however invalidate any other reasons why * the executable file might be covered by the GNU General Public License. 
*/

/* Map the rN register names used below onto plain register numbers. */
#define r0 0
#define r1 1
#define r2 2
#define r3 3
#define r4 4
#define r5 5
#define r6 6
#define r7 7
#define r8 8
#define r9 9
#define r10 10
#define r11 11
#define r12 12
#define r13 13
#define r14 14
#define r15 15
#define r16 16
#define r17 17
#define r18 18
#define r19 19
#define r20 20
#define r21 21
#define r22 22
#define r23 23
#define r24 24
#define r25 25
#define r26 26
#define r27 27
#define r28 28
#define r29 29
#define r30 30
#define r31 31

	.text

/*
 * _savegpr0_N: store rN..r31 at negative offsets from r1, then r0 at
 * 16(r1), and return. Each entry point falls through into the next one,
 * so entering at _savegpr0_N saves every register from rN upwards.
 */
	.globl _savegpr0_14
_savegpr0_14: std r14,-144(r1)
	.globl _savegpr0_15
_savegpr0_15: std r15,-136(r1)
	.globl _savegpr0_16
_savegpr0_16: std r16,-128(r1)
	.globl _savegpr0_17
_savegpr0_17: std r17,-120(r1)
	.globl _savegpr0_18
_savegpr0_18: std r18,-112(r1)
	.globl _savegpr0_19
_savegpr0_19: std r19,-104(r1)
	.globl _savegpr0_20
_savegpr0_20: std r20,-96(r1)
	.globl _savegpr0_21
_savegpr0_21: std r21,-88(r1)
	.globl _savegpr0_22
_savegpr0_22: std r22,-80(r1)
	.globl _savegpr0_23
_savegpr0_23: std r23,-72(r1)
	.globl _savegpr0_24
_savegpr0_24: std r24,-64(r1)
	.globl _savegpr0_25
_savegpr0_25: std r25,-56(r1)
	.globl _savegpr0_26
_savegpr0_26: std r26,-48(r1)
	.globl _savegpr0_27
_savegpr0_27: std r27,-40(r1)
	.globl _savegpr0_28
_savegpr0_28: std r28,-32(r1)
	.globl _savegpr0_29
_savegpr0_29: std r29,-24(r1)
	.globl _savegpr0_30
_savegpr0_30: std r30,-16(r1)
	.globl _savegpr0_31
_savegpr0_31: std r31,-8(r1)
	std r0,16(r1)
	blr

/*
 * _restgpr0_N: reload rN..r31 from the same offsets from r1. The entry
 * points fall through in the same way as the save stubs above.
 */
	.globl _restgpr0_14
_restgpr0_14: ld r14,-144(r1)
	.globl _restgpr0_15
_restgpr0_15: ld r15,-136(r1)
	.globl _restgpr0_16
_restgpr0_16: ld r16,-128(r1)
	.globl _restgpr0_17
_restgpr0_17: ld r17,-120(r1)
	.globl _restgpr0_18
_restgpr0_18: ld r18,-112(r1)
	.globl _restgpr0_19
_restgpr0_19: ld r19,-104(r1)
	.globl _restgpr0_20
_restgpr0_20: ld r20,-96(r1)
	.globl _restgpr0_21
_restgpr0_21: ld r21,-88(r1)
	.globl _restgpr0_22
_restgpr0_22: ld r22,-80(r1)
	.globl _restgpr0_23
_restgpr0_23: ld r23,-72(r1)
	.globl _restgpr0_24
_restgpr0_24: ld r24,-64(r1)
	.globl _restgpr0_25
_restgpr0_25: ld r25,-56(r1)
	.globl
_restgpr0_26 _restgpr0_26: ld r26,-48(r1) .globl _restgpr0_27 _restgpr0_27: ld r27,-40(r1) .globl _restgpr0_28 _restgpr0_28: ld r28,-32(r1) .globl _restgpr0_29 _restgpr0_29: ld r0,16(r1) ld r29,-24(r1) mtlr r0 ld r30,-16(r1) ld r31,-8(r1) blr .globl _restgpr0_30 _restgpr0_30: ld r30,-16(r1) .globl _restgpr0_31 _restgpr0_31: ld r0,16(r1) ld r31,-8(r1) mtlr r0 blr criu-3.6/criu/arch/ppc64/restorer.c000066400000000000000000000021671317335042600171620ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include #include "log.h" int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r) { #define SPRN_TFHAR 128 #define SPRN_TFIAR 129 #define SPRN_TEXASR 130 if (r->has_tfhar) { asm __volatile__ ( "ld 3, %[value] ;" "mtspr %[sprn],3 ;" : [value]"=m"(r->tfhar) : [sprn]"i"(SPRN_TFHAR) : "r3"); } if (r->has_tfiar) { asm __volatile__ ( "ld 3, %[value] ;" "mtspr %[sprn],3 ;" : [value]"=m"(r->tfiar) : [sprn]"i"(SPRN_TFIAR) : "r3"); } if (r->has_texasr) { asm __volatile__ ( "ld 3, %[value] ;" "mtspr %[sprn],3 ;" : [value]"=m"(r->texasr) : [sprn]"i"(SPRN_TEXASR) : "r3"); } return 0; } unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) { unsigned long raddr; int ret; ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ shmflg, /* second */ (unsigned long)&raddr, /* third */ shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) raddr = (unsigned long) ret; return raddr; } criu-3.6/criu/arch/ppc64/sigframe.c000066400000000000000000000024751317335042600171140ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" #include "common/bug.h" /* * The signal frame has been built using local addresses. Since it has to be * used in the context of the checkpointed process, the v_regs pointer in the * signal frame must be updated to match the address in the remote stack. 
*/ static inline void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext) { if (lcontext->v_regs) { uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext; lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset); pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", (unsigned long long) lcontext->v_regs, (unsigned long long) rcontext); } } int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, struct rt_sigframe *rframe) { uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR]; update_vregs(&frame->uc.uc_mcontext, &rframe->uc.uc_mcontext); /* Sanity check: If TM so uc_link should be set, otherwise not */ if (MSR_TM_ACTIVE(msr) ^ (!!(frame->uc.uc_link))) { BUG(); return 1; } /* Updating the transactional state address if any */ if (frame->uc.uc_link) { update_vregs(&frame->uc_transact.uc_mcontext, &rframe->uc_transact.uc_mcontext); frame->uc.uc_link = &rframe->uc_transact; } return 0; } criu-3.6/criu/arch/ppc64/vdso-pie.c000066400000000000000000000100741317335042600170370ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX # undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " /* This symbols are defined in vdso-trampoline.S */ extern char *vdso_trampoline, *vdso_trampoline_end; static inline void invalidate_caches(unsigned long at) { asm volatile("isync \n" \ "li 3,0 \n" \ "dcbf 3,%0 \n" \ "sync \n" \ "icbi 3,%0 \n" \ "isync \n" \ : /* no output */ \ : "r"(at) \ :"memory", "r3"); } /* This is the size of the trampoline call : * mlfr r0 * bl trampoline * <64 bit address> */ #define TRAMP_CALL_SIZE (2*sizeof(uint32_t) + sizeof(uint64_t)) /* * put_trampoline does 2 things : * * 1. it looks for a place in the checkpointed vDSO where to put the * trampoline code (see vdso-trampoline.S). * * 2. 
for each symbol from the checkpointed vDSO, it checks that there is
 *    enough room to put the call to the vDSO trampoline (see
 *    TRAMP_CALL_SIZE's comment above).
 *    This is done by checking that there is no interesting symbol in the range
 *    of current one's offset -> (current one's offset + TRAMP_CALL_SIZE).
 *    Unfortunately the symbols are not sorted by address so we have to look
 *    for the complete table all the time. Since the vDSO is small, this is
 *    not a big issue.
 *
 * Returns the address where the trampoline code was copied, or 0 when a
 * symbol is too small for the call sequence or no suitable gap was found.
 */
static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym)
{
	int i,j;
	unsigned long size;
	unsigned long trampoline = 0;

	/* First of all we have to find a place where to put the trampoline
	 * code.
	 */
	size = (unsigned long)&vdso_trampoline_end
		- (unsigned long)&vdso_trampoline;

	for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) {
		if (vdso_symbol_empty(&sym->symbols[i]))
			continue;

		pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name,
			 sym->symbols[i].offset);

		/* look at every following symbol we are interested in */
		for (j=0; j < ARRAY_SIZE(sym->symbols); j++) {
			if (i==j || vdso_symbol_empty(&sym->symbols[j]))
				continue;

			if (sym->symbols[j].offset <= sym->symbols[i].offset)
				/* this symbol does not follow the current one */
				continue;

			if ((sym->symbols[i].offset+TRAMP_CALL_SIZE) >
			    sym->symbols[j].offset) {
				/* we have a major issue here since we cannot
				 * even put the trampoline call for this symbol
				 */
				pr_err("Can't handle small vDSO symbol %s\n",
				       sym->symbols[i].name);
				return 0;
			}

			if (trampoline)
				/* no need to put it twice */
				continue;

			if ((sym->symbols[j].offset -
			     (sym->symbols[i].offset+TRAMP_CALL_SIZE)) <= size)
				/* not enough room */
				continue;

			/* We can put the trampoline there, right after the call stub */
			trampoline = at + sym->symbols[i].offset;
			trampoline += TRAMP_CALL_SIZE;

			pr_debug("Putting vDSO trampoline in %s at %lx\n",
				 sym->symbols[i].name, trampoline);
			memcpy((void *)trampoline, &vdso_trampoline,
			       size);
			invalidate_caches(trampoline);
		}
	}

	return trampoline;
}

/*
 * Write the call stub described by TRAMP_CALL_SIZE at `at`: an mflr, a bl to
 * the trampoline at `tr`, then the 64-bit target address `to`.
 * The branch displacement is tr-at-4 because the bl is the second
 * instruction of the stub.
 */
static inline void put_trampoline_call(unsigned long at, unsigned long to,
				       unsigned long tr)
{
	uint32_t *addr = (uint32_t *)at;;

	*addr++ = 0x7C0802a6;					/* mflr	r0 */
	*addr++ = 0x48000001 | ((long)(tr-at-4) & 0x3fffffc);	/* bl tr */
	*(uint64_t *)addr = to;	/* the address to read by the trampoline */

	invalidate_caches(at);
}

/*
 * Install the trampoline in the vDSO mapped at base_from, then overwrite the
 * start of every non-empty symbol of `from` with a call stub targeting the
 * symbol with the same index in `to` (relative to base_to).
 *
 * Returns 0 on success, 1 when the trampoline could not be placed.
 */
int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
			struct vdso_symtable *to,
			struct vdso_symtable *from,
			bool __always_unused compat_vdso)
{
	unsigned int i;
	unsigned long trampoline;

	trampoline = (unsigned long)put_trampoline(base_from, from);
	if (!trampoline)
		return 1;

	for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
		if (vdso_symbol_empty(&from->symbols[i]))
			continue;

		pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n",
			 base_from, from->symbols[i].offset,
			 base_to, to->symbols[i].offset, i,
			 from->symbols[i].name);

		put_trampoline_call(base_from + from->symbols[i].offset,
				    base_to + to->symbols[i].offset,
				    trampoline);
	}

	return 0;
}
criu-3.6/criu/arch/ppc64/vdso-trampoline.S000066400000000000000000000004041317335042600204100ustar00rootroot00000000000000
#include "common/asm/linkage.h"

	.section .text

GLOBAL(vdso_trampoline)
	mflr r12 /* r12 vdso_ptr's address */
	mtlr r0 /* restore lr */
	ld r12,0(r12) /* read value store in vdso_ptr */
	mtctr r12 /* branch to it */
	bctr
GLOBAL(vdso_trampoline_end)
criu-3.6/criu/arch/s390/000077500000000000000000000000001317335042600147055ustar00rootroot00000000000000criu-3.6/criu/arch/s390/Makefile000066400000000000000000000003611317335042600163450ustar00rootroot00000000000000
builtin-name := crtools.built-in.o

ccflags-y += -iquote $(obj)/include
ccflags-y += -iquote criu/include -iquote include
ccflags-y += $(COMPEL_UAPI_INCLUDES)
ldflags-y += -r

obj-y += cpu.o
obj-y += crtools.o
obj-y += sigframe.o
criu-3.6/criu/arch/s390/cpu.c000066400000000000000000000061341317335042600156440ustar00rootroot00000000000000
#undef LOG_PREFIX
#define LOG_PREFIX "cpu: "

#include
#include

#include "asm/types.h"

#include "cr_options.h"
#include "image.h"
#include "util.h"
#include "log.h"
#include "cpu.h"

#include "protobuf.h"
#include "images/cpuinfo.pb-c.h"

/* Capabilities of the host, filled by cpu_init(). */
static compel_cpuinfo_t rt_cpuinfo;

/* Names of the capability bits of the first hwcap word, indexed by bit. */
static const char *hwcap_str1[64] = {
	"HWCAP_S390_ESAN3",
	"HWCAP_S390_ZARCH",
	"HWCAP_S390_STFLE",
	"HWCAP_S390_MSA",
	"HWCAP_S390_LDISP",
	"HWCAP_S390_EIMM",
	"HWCAP_S390_DFP",
	"HWCAP_S390_HPAGE",
	"HWCAP_S390_ETF3EH",
	"HWCAP_S390_HIGH_GPRS",
	"HWCAP_S390_TE",
	"HWCAP_S390_VXRS",
	"HWCAP_S390_VXRS_BCD",
	"HWCAP_S390_VXRS_EXT",
};
/* No names are known for the second hwcap word. */
static const char *hwcap_str2[64] = { };

static const char **hwcap_str[2] = { hwcap_str1, hwcap_str2 };

/*
 * Log both capability words and one line per bit that is set.
 *
 * The bit mask is built from an unsigned long: shifting the int constant 1
 * by up to 63 positions is undefined behaviour.
 */
static void print_hwcaps(const char *msg, unsigned long hwcap[2])
{
	int nr, cap;

	pr_debug("%s: Capabilities: %016lx %016lx\n", msg, hwcap[0], hwcap[1]);
	for (nr = 0; nr < 2; nr++) {
		for (cap = 0; cap < 64; cap++) {
			unsigned long bit = 1UL << cap;

			if (!(hwcap[nr] & bit))
				continue;
			if (hwcap_str[nr][cap])
				pr_debug("%s\n", hwcap_str[nr][cap]);
			else
				pr_debug("Capability %d/0x%lx\n", nr, bit);
		}
	}
}

/* Query the host capabilities; returns the compel_cpuid() result. */
int cpu_init(void)
{
	int ret;

	ret = compel_cpuid(&rt_cpuinfo);
	print_hwcaps("Host (init)", rt_cpuinfo.hwcap);
	return ret;
}

/*
 * Write the host capabilities into the cpuinfo image.
 * Returns -1 when the image cannot be opened, else the pb_write_one() result.
 */
int cpu_dump_cpuinfo(void)
{
	CpuinfoS390Entry cpu_s390_info = CPUINFO_S390_ENTRY__INIT;
	CpuinfoS390Entry *cpu_s390_info_ptr = &cpu_s390_info;
	CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT;
	struct cr_img *img;
	int ret = -1;

	img = open_image(CR_FD_CPUINFO, O_DUMP);
	if (!img)
		return -1;

	cpu_info.s390_entry = &cpu_s390_info_ptr;
	cpu_info.n_s390_entry = 1;

	cpu_s390_info.n_hwcap = 2;
	cpu_s390_info.hwcap = rt_cpuinfo.hwcap;

	ret = pb_write_one(img, &cpu_info, PB_CPUINFO);

	close_image(img);
	return ret;
}

/*
 * Check that every capability recorded in the cpuinfo image is also
 * available on the host.
 *
 * Returns 0 when the host is compatible, -1 when the image cannot be opened
 * or read, when it carries no usable s390 entry, or when a capability is
 * missing on the host.
 */
int cpu_validate_cpuinfo(void)
{
	CpuinfoS390Entry *cpu_s390_entry;
	CpuinfoEntry *cpu_info;
	struct cr_img *img;
	int cap, nr, ret;

	img = open_image(CR_FD_CPUINFO, O_RSTR);
	if (!img)
		return -1;

	/* Any failure before the comparison loop must be reported as an error. */
	ret = -1;
	if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0)
		goto error;

	if (cpu_info->n_s390_entry != 1) {
		pr_err("No S390 related entry in image\n");
		goto error;
	}
	cpu_s390_entry = cpu_info->s390_entry[0];

	if (cpu_s390_entry->n_hwcap != 2) {
		pr_err("Hardware capabilities information missing\n");
		goto error;
	}

	print_hwcaps("Host", rt_cpuinfo.hwcap);
	print_hwcaps("Image", cpu_s390_entry->hwcap);

	ret = 0;
	for (nr = 0; nr < 2; nr++) {
		for (cap = 0; cap < 64; cap++) {
			unsigned long bit = 1UL << cap;

			if (!(cpu_s390_entry->hwcap[nr] & bit))
				continue;
			if (rt_cpuinfo.hwcap[nr] & bit)
				continue;
			if (hwcap_str[nr][cap])
				pr_err("CPU Feature %s not supported on host\n",
				       hwcap_str[nr][cap]);
			else
				pr_err("CPU Feature %d/%lx not supported on host\n",
				       nr, bit);
			ret = -1;
		}
	}
	if (ret == -1)
		pr_err("See also: /usr/include/bits/hwcap.h\n");
error:
	close_image(img);
	return ret;
}

/* Returns 0 on success, -1 when querying or dumping the CPU info fails. */
int cpuinfo_dump(void)
{
	if (cpu_init())
		return -1;
	if (cpu_dump_cpuinfo())
		return -1;

	return 0;
}

/* Returns 0 when the host matches the image, 1 otherwise. */
int cpuinfo_check(void)
{
	if (cpu_init())
		return 1;
	if (cpu_validate_cpuinfo())
		return 1;

	return 0;
}
criu-3.6/criu/arch/s390/crtools.c000066400000000000000000000433201317335042600165400ustar00rootroot00000000000000
/*
 * NOTE(review): the header names of the bare #include lines below are
 * missing in this copy of the file; restore them from the upstream tree.
 */
#include
#include
#include
#include
#include

#include "types.h"
#include
#include "asm/restorer.h"
#include "asm/dump.h"

#include "cr_options.h"
#include "common/compiler.h"
#include
#include "parasite-syscall.h"
#include "log.h"
#include "util.h"
#include "cpu.h"
#include

#include "protobuf.h"
#include "images/core.pb-c.h"
#include "images/creds.pb-c.h"
#include "ptrace.h"
#include "pstree.h"
#include "image.h"

#define NT_PRFPREG 2
#define NT_S390_VXRS_LOW 0x309
#define NT_S390_VXRS_HIGH 0x30a
#define NT_S390_GS_CB 0x30b
#define NT_S390_GS_BC 0x30c
#define NT_S390_RI_CB 0x30d

/*
 * Print general purpose and access registers
 */
static void print_core_gpregs(const char *msg, UserS390RegsEntry *gpregs)
{
	int i;

	pr_debug("%s: General purpose registers\n", msg);
	pr_debug(" psw %016lx %016lx\n",
		 gpregs->psw_mask, gpregs->psw_addr);
	pr_debug(" orig_gpr2 %016lx\n", gpregs->orig_gpr2);
	for (i = 0; i < 16; i++)
		pr_debug(" g%02d %016lx\n", i, gpregs->gprs[i]);
	for (i = 0; i < 16; i++)
		pr_debug(" a%02d %08x\n", i,
gpregs->acrs[i]); } /* * Print vector registers */ static void print_core_vx_regs(CoreEntry *core) { UserS390VxrsHighEntry *vxrs_high; UserS390VxrsLowEntry *vxrs_low; int i; vxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; vxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (vxrs_low == NULL) { pr_debug(" No VXRS\n"); return; } for (i = 0; i < 16; i++) pr_debug(" vx_low%02d %016lx\n", i, vxrs_low->regs[i]); for (i = 0; i < 32; i += 2) pr_debug(" vx_high%02d %016lx %016lx\n", i / 2, vxrs_high->regs[i], vxrs_high->regs[i + 1]); } /* * Print guarded-storage control block */ static void print_core_gs_cb(CoreEntry *core) { UserS390GsCbEntry *gs_cb; int i; gs_cb = CORE_THREAD_ARCH_INFO(core)->gs_cb; if (!gs_cb) { pr_debug(" No GS_CB\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_cb%d %lx\n", i, gs_cb->regs[i]); } /* * Print guarded-storage broadcast control block */ static void print_core_gs_bc(CoreEntry *core) { UserS390GsCbEntry *gs_bc; int i; gs_bc = CORE_THREAD_ARCH_INFO(core)->gs_bc; if (!gs_bc) { pr_debug(" No GS_BC\n"); return; } for (i = 0; i < 4; i++) pr_debug(" gs_bc%d %lx\n", i, gs_bc->regs[i]); } /* * Print runtime-instrumentation control block */ static void print_core_ri_cb(CoreEntry *core) { UserS390RiEntry *ri_cb; int i; ri_cb = CORE_THREAD_ARCH_INFO(core)->ri_cb; if (!ri_cb) { pr_debug(" No RI_CB\n"); return; } for (i = 0; i < 8; i++) pr_debug(" ri_cb%d %lx\n", i, ri_cb->regs[i]); } /* * Print architecture registers */ static void print_core_fp_regs(const char *msg, CoreEntry *core) { UserS390FpregsEntry *fpregs; int i; fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; pr_debug("%s: Floating point registers\n", msg); pr_debug(" fpc %08x\n", fpregs->fpc); for (i = 0; i < 16; i++) pr_debug(" f%02d %016lx\n", i, fpregs->fprs[i]); print_core_vx_regs(core); print_core_gs_cb(core); print_core_gs_bc(core); print_core_ri_cb(core); } /* * Allocate VxrsLow registers */ static UserS390VxrsLowEntry *allocate_vxrs_low_regs(void) { UserS390VxrsLowEntry 
*vxrs_low; vxrs_low = xmalloc(sizeof(*vxrs_low)); if (!vxrs_low) return NULL; user_s390_vxrs_low_entry__init(vxrs_low); vxrs_low->n_regs = 16; vxrs_low->regs = xzalloc(16 * sizeof(uint64_t)); if (!vxrs_low->regs) goto fail_free_vxrs_low; return vxrs_low; fail_free_vxrs_low: xfree(vxrs_low); return NULL; } /* * Free VxrsLow registers */ static void free_vxrs_low_regs(UserS390VxrsLowEntry *vxrs_low) { if (vxrs_low) { xfree(vxrs_low->regs); xfree(vxrs_low); } } /* * Allocate VxrsHigh registers */ static UserS390VxrsHighEntry *allocate_vxrs_high_regs(void) { UserS390VxrsHighEntry *vxrs_high; vxrs_high = xmalloc(sizeof(*vxrs_high)); if (!vxrs_high) return NULL; user_s390_vxrs_high_entry__init(vxrs_high); vxrs_high->n_regs = 32; vxrs_high->regs = xzalloc(32 * sizeof(uint64_t)); if (!vxrs_high->regs) goto fail_free_vxrs_high; return vxrs_high; fail_free_vxrs_high: xfree(vxrs_high); return NULL; } /* * Free VxrsHigh registers */ static void free_vxrs_high_regs(UserS390VxrsHighEntry *vxrs_high) { if (vxrs_high) { xfree(vxrs_high->regs); xfree(vxrs_high); } } /* * Allocate guarded-storage control block (GS_CB and GS_BC) */ static UserS390GsCbEntry *allocate_gs_cb(void) { UserS390GsCbEntry *gs_cb; gs_cb = xmalloc(sizeof(*gs_cb)); if (!gs_cb) return NULL; user_s390_gs_cb_entry__init(gs_cb); gs_cb->n_regs = 4; gs_cb->regs = xzalloc(4 * sizeof(uint64_t)); if (!gs_cb->regs) goto fail_free_gs_cb; return gs_cb; fail_free_gs_cb: xfree(gs_cb); return NULL; } /* * Free Guareded Storage control blocks */ static void free_gs_cb(UserS390GsCbEntry *gs_cb) { if (gs_cb) { xfree(gs_cb->regs); xfree(gs_cb); } } /* * Allocate runtime-instrumentation control block */ static UserS390RiEntry *allocate_ri_cb(void) { UserS390RiEntry *ri_cb; ri_cb = xmalloc(sizeof(*ri_cb)); if (!ri_cb) return NULL; user_s390_ri_entry__init(ri_cb); ri_cb->ri_on = 0; ri_cb->n_regs = 8; ri_cb->regs = xzalloc(8 * sizeof(uint64_t)); if (!ri_cb->regs) goto fail_free_ri_cb; return ri_cb; fail_free_ri_cb: xfree(ri_cb); 
return NULL; } /* * Free runtime-instrumentation control block */ static void free_ri_cb(UserS390RiEntry *ri_cb) { if (ri_cb) { xfree(ri_cb->regs); xfree(ri_cb); } } /* * Copy internal structures into Google Protocol Buffers */ int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; UserS390VxrsLowEntry *vxrs_low = NULL; UserS390FpregsEntry *fpregs = NULL; UserS390RegsEntry *gpregs = NULL; UserS390GsCbEntry *gs_cb = NULL; UserS390GsCbEntry *gs_bc = NULL; UserS390RiEntry *ri_cb = NULL; CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { vxrs_low = allocate_vxrs_low_regs(); if (!vxrs_low) return -1; vxrs_high = allocate_vxrs_high_regs(); if (!vxrs_high) goto fail_free_vxrs_low; memcpy(vxrs_low->regs, &f->vxrs_low, sizeof(f->vxrs_low)); memcpy(vxrs_high->regs, &f->vxrs_high, sizeof(f->vxrs_high)); CORE_THREAD_ARCH_INFO(core)->vxrs_low = vxrs_low; CORE_THREAD_ARCH_INFO(core)->vxrs_high = vxrs_high; } /* Guarded-storage control block */ if (f->flags & USER_GS_CB) { gs_cb = allocate_gs_cb(); if (!gs_cb) goto fail_free_gs_cb; memcpy(gs_cb->regs, &f->gs_cb, sizeof(f->gs_cb)); CORE_THREAD_ARCH_INFO(core)->gs_cb = gs_cb; } /* Guarded-storage broadcast control block */ if (f->flags & USER_GS_BC) { gs_bc = allocate_gs_cb(); if (!gs_bc) goto fail_free_gs_bc; memcpy(gs_bc->regs, &f->gs_bc, sizeof(f->gs_bc)); CORE_THREAD_ARCH_INFO(core)->gs_bc = gs_bc; } /* Runtime-instrumentation control block */ if (f->flags & USER_RI_CB) { ri_cb = allocate_ri_cb(); if (!ri_cb) goto fail_free_ri_cb; memcpy(ri_cb->regs, &f->ri_cb, sizeof(f->ri_cb)); CORE_THREAD_ARCH_INFO(core)->ri_cb = ri_cb; /* We need to remember that the RI bit was on */ if (f->flags & USER_RI_ON) ri_cb->ri_on = 1; } /* General purpose registers */ memcpy(gpregs->gprs, u->prstatus.gprs, sizeof(u->prstatus.gprs)); gpregs->psw_mask = 
u->prstatus.psw.mask; gpregs->psw_addr = u->prstatus.psw.addr; /* Access registers */ memcpy(gpregs->acrs, u->prstatus.acrs, sizeof(u->prstatus.acrs)); /* System call */ gpregs->system_call = u->system_call; /* Floating point registers */ fpregs->fpc = f->prfpreg.fpc; memcpy(fpregs->fprs, f->prfpreg.fprs, sizeof(f->prfpreg.fprs)); return 0; fail_free_ri_cb: free_ri_cb(ri_cb); fail_free_gs_cb: free_gs_cb(gs_cb); fail_free_gs_bc: free_gs_cb(gs_bc); fail_free_vxrs_low: free_vxrs_low_regs(vxrs_low); return -1; } /* * Copy general and access registers to signal frame */ int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *src) { _sigregs *dst = &f->uc.uc_mcontext; dst->regs.psw.mask = src->psw_mask; dst->regs.psw.addr = src->psw_addr; memcpy(dst->regs.gprs, src->gprs, sizeof(dst->regs.gprs)); memcpy(dst->regs.acrs, src->acrs, sizeof(dst->regs.acrs)); print_core_gpregs("restore_gpregs_regs", src); return 0; } /* * Copy floating point and vector registers to mcontext */ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) { UserS390VxrsHighEntry *vxrs_high; UserS390VxrsLowEntry *vxrs_low; UserS390FpregsEntry *fpregs; _sigregs *dst = &f->uc.uc_mcontext; _sigregs_ext *dst_ext = &f->uc.uc_mcontext_ext; fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; vxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; vxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; dst->fpregs.fpc = fpregs->fpc; memcpy(dst->fpregs.fprs, fpregs->fprs, sizeof(dst->fpregs.fprs)); if (vxrs_low) { memcpy(&dst_ext->vxrs_low, vxrs_low->regs, sizeof(dst_ext->vxrs_low)); memcpy(&dst_ext->vxrs_high, vxrs_high->regs, sizeof(dst_ext->vxrs_high)); } return 0; } /* * Allocate floating point registers */ static UserS390FpregsEntry *allocate_fp_regs(void) { UserS390FpregsEntry *fpregs; fpregs = xmalloc(sizeof(*fpregs)); if (!fpregs) return NULL; user_s390_fpregs_entry__init(fpregs); fpregs->n_fprs = 16; fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); if (!fpregs->fprs) goto fail_free_fpregs; return fpregs; 
fail_free_fpregs: xfree(fpregs); return NULL; } /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { xfree(fpregs->fprs); xfree(fpregs); } /* * Allocate general purpose and access registers */ static UserS390RegsEntry *allocate_gp_regs(void) { UserS390RegsEntry *gpregs; gpregs = xmalloc(sizeof(*gpregs)); if (!gpregs) return NULL; user_s390_regs_entry__init(gpregs); gpregs->n_gprs = 16; gpregs->gprs = xzalloc(16 * sizeof(uint64_t)); if (!gpregs->gprs) goto fail_free_gpregs; gpregs->n_acrs = 16; gpregs->acrs = xzalloc(16 * sizeof(uint32_t)); if (!gpregs->acrs) goto fail_free_gprs; return gpregs; fail_free_gprs: xfree(gpregs->gprs); fail_free_gpregs: xfree(gpregs); return NULL; } /* * Free general purpose and access registers */ static void free_gp_regs(UserS390RegsEntry *gpregs) { xfree(gpregs->gprs); xfree(gpregs->acrs); xfree(gpregs); } /* * Allocate thread info */ int arch_alloc_thread_info(CoreEntry *core) { ThreadInfoS390 *ti_s390; ti_s390 = xmalloc(sizeof(*ti_s390)); if (!ti_s390) return -1; thread_info_s390__init(ti_s390); ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; ti_s390->fpregs = allocate_fp_regs(); if (!ti_s390->fpregs) goto fail_free_gp_regs; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; fail_free_gp_regs: free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; } /* * Free thread info */ void arch_free_thread_info(CoreEntry *core) { if (!CORE_THREAD_ARCH_INFO(core)) return; free_gp_regs(CORE_THREAD_ARCH_INFO(core)->gpregs); free_fp_regs(CORE_THREAD_ARCH_INFO(core)->fpregs); free_vxrs_low_regs(CORE_THREAD_ARCH_INFO(core)->vxrs_low); free_vxrs_high_regs(CORE_THREAD_ARCH_INFO(core)->vxrs_high); free_gs_cb(CORE_THREAD_ARCH_INFO(core)->gs_cb); free_gs_cb(CORE_THREAD_ARCH_INFO(core)->gs_bc); free_ri_cb(CORE_THREAD_ARCH_INFO(core)->ri_cb); xfree(CORE_THREAD_ARCH_INFO(core)); CORE_THREAD_ARCH_INFO(core) = NULL; } /* * Set regset for pid */ static int 
setregset(int pid, int set, const char *set_str, struct iovec *iov)
{
	/* A zero return from ptrace() means the register set was written */
	if (ptrace(PTRACE_SETREGSET, pid, set, iov) == 0)
		return 0;
	/* set_str is only used to name the register set in the message */
	pr_perror("Couldn't set %s registers for pid %d", set_str, pid);
	return -1;
}

/*
 * Set floating point registers for pid from fpregs
 *
 * Returns 0 on success and -1 if the NT_PRFPREG set could not be written.
 */
static int set_fp_regs(pid_t pid, user_fpregs_struct_t *fpregs)
{
	struct iovec iov;

	iov.iov_base = &fpregs->prfpreg;
	iov.iov_len = sizeof(fpregs->prfpreg);
	return setregset(pid, NT_PRFPREG, "PRFPREG", &iov);
}

/*
 * Set vector registers
 *
 * Does nothing and returns 0 unless USER_FPREGS_VXRS is set in
 * fpregs->flags. Otherwise writes the low half and then the high half;
 * returns -1 as soon as one of the two writes fails.
 */
static int set_vx_regs(pid_t pid, user_fpregs_struct_t *fpregs)
{
	struct iovec iov;

	if (!(fpregs->flags & USER_FPREGS_VXRS))
		return 0;

	iov.iov_base = &fpregs->vxrs_low;
	iov.iov_len = sizeof(fpregs->vxrs_low);
	if (setregset(pid, NT_S390_VXRS_LOW, "S390_VXRS_LOW", &iov))
		return -1;

	iov.iov_base = &fpregs->vxrs_high;
	iov.iov_len = sizeof(fpregs->vxrs_high);
	return setregset(pid, NT_S390_VXRS_HIGH, "S390_VXRS_HIGH", &iov);
}

/*
 * Set guarded-storage control block
 *
 * Writes the guarded-storage control block if USER_GS_CB is set and the
 * guarded-storage broadcast control block if USER_GS_BC is set; the two
 * flags are independent of each other.
 *
 * Returns 0 on success (including when neither flag is set), -1 on error.
 */
static int set_gs_cb(pid_t pid, user_fpregs_struct_t *fpregs)
{
	struct iovec iov;

	if (fpregs->flags & USER_GS_CB) {
		iov.iov_base = &fpregs->gs_cb;
		iov.iov_len = sizeof(fpregs->gs_cb);
		if (setregset(pid, NT_S390_GS_CB, "S390_GS_CB", &iov))
			return -1;
	}

	if (!(fpregs->flags & USER_GS_BC))
		return 0;
	iov.iov_base = &fpregs->gs_bc;
	iov.iov_len = sizeof(fpregs->gs_bc);
	return setregset(pid, NT_S390_GS_BC, "S390_GS_BC", &iov);
}

/*
 * Set runtime-instrumentation control block
 *
 * Does nothing and returns 0 unless USER_RI_CB is set in fpregs->flags.
 * Returns -1 if the control block could not be written.
 */
static int set_ri_cb(pid_t pid, user_fpregs_struct_t *fpregs)
{
	struct iovec iov;

	if (!(fpregs->flags & USER_RI_CB))
		return 0;

	iov.iov_base = &fpregs->ri_cb;
	iov.iov_len = sizeof(fpregs->ri_cb);
	return setregset(pid, NT_S390_RI_CB, "S390_RI_CB", &iov);
}

/*
 * Set runtime-instrumentation bit
 *
 * The CPU collects information when the RI bit of the PSW is set.
 * The RI control block is not part of the signal frame. Therefore during
 * sigreturn it is not set. If the RI control block is present, the CPU
 * writes into undefined storage.
Hence, we have disabled the RI bit in * the sigreturn PSW and set this bit after sigreturn by modifying the PSW * of the task. */ static int set_ri_bit(pid_t pid) { user_regs_struct_t regs; struct iovec iov; psw_t *psw; iov.iov_base = ®s.prstatus; iov.iov_len = sizeof(regs.prstatus); if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) < 0) { pr_perror("Fail to activate RI bit"); return -1; } psw = ®s.prstatus.psw; psw->mask |= PSW_MASK_RI; return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } /* * Restore registers not present in sigreturn signal frame */ static int set_task_regs_nosigrt(pid_t pid, CoreEntry *core) { user_fpregs_struct_t fpregs; UserS390GsCbEntry *cgs_cb; UserS390GsCbEntry *cgs_bc; UserS390RiEntry *cri_cb; int ret = 0; memset(&fpregs, 0, sizeof(fpregs)); /* Guarded-storage control block (optional) */ cgs_cb = CORE_THREAD_ARCH_INFO(core)->gs_cb; if (cgs_cb != NULL) { fpregs.flags |= USER_GS_CB; memcpy(&fpregs.gs_cb, cgs_cb->regs, sizeof(fpregs.gs_cb)); } /* Guarded-storage broadcast control block (optional) */ cgs_bc = CORE_THREAD_ARCH_INFO(core)->gs_bc; if (cgs_bc != NULL) { fpregs.flags |= USER_GS_BC; memcpy(&fpregs.gs_bc, cgs_bc->regs, sizeof(fpregs.gs_bc)); } if (set_gs_cb(pid, &fpregs) < 0) return -1; /* Runtime-instrumentation control block (optional) */ cri_cb = CORE_THREAD_ARCH_INFO(core)->ri_cb; if (cri_cb != NULL) { fpregs.flags |= USER_RI_CB; memcpy(&fpregs.ri_cb, cri_cb->regs, sizeof(fpregs.ri_cb)); if (set_ri_cb(pid, &fpregs) < 0) return -1; if (cri_cb->ri_on) { fpregs.flags |= USER_RI_ON; ret = set_ri_bit(pid); } } return ret; } /* * Restore registers for pid from core */ static int set_task_regs(pid_t pid, CoreEntry *core) { UserS390VxrsHighEntry *cvxrs_high; UserS390VxrsLowEntry *cvxrs_low; UserS390FpregsEntry *cfpregs; user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); /* Floating point registers */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; if (!cfpregs) return -1; fpregs.prfpreg.fpc = cfpregs->fpc; 
memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs));
	if (set_fp_regs(pid, &fpregs) < 0)
		return -1;
	/* Vector registers (optional) */
	cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low;
	if (cvxrs_low != NULL) {
		cvxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high;
		/* A low half without the matching high half is an error */
		if (!cvxrs_high)
			return -1;
		fpregs.flags |= USER_FPREGS_VXRS;
		memcpy(&fpregs.vxrs_low, cvxrs_low->regs,
		       sizeof(fpregs.vxrs_low));
		memcpy(&fpregs.vxrs_high, cvxrs_high->regs,
		       sizeof(fpregs.vxrs_high));
		if (set_vx_regs(pid, &fpregs) < 0)
			return -1;
	}
	/* The remaining optional register sets share one helper */
	return set_task_regs_nosigrt(pid, core);
}

/*
 * Restore registers for all threads:
 *
 * - Floating point registers
 * - Vector registers
 * - Guarded-storage control block
 * - Guarded-storage broadcast control block
 * - Runtime-instrumentation control block
 *
 * Walks the process tree starting at item. Dead and zombie tasks and
 * threads are skipped. When with_threads is false only the first thread
 * (index 0) of every task is handled.
 *
 * Returns 0 on success and -1 on the first thread whose registers could
 * not be set.
 */
int arch_set_thread_regs(struct pstree_item *item, bool with_threads)
{
	int i;

	for_each_pstree_item(item) {
		if (item->pid->state == TASK_DEAD ||
		    item->pid->state == TASK_ZOMBIE)
			continue;
		for (i = 0; i < item->nr_threads; i++) {
			if (item->threads[i].state == TASK_DEAD ||
			    item->threads[i].state == TASK_ZOMBIE)
				continue;
			/* Without threads only index 0 is of interest */
			if (!with_threads && i > 0)
				continue;
			if (set_task_regs(item->threads[i].real,
					  item->core[i])) {
				pr_perror("Not set registers for task %d",
					  item->threads[i].real);
				return -1;
			}
		}
	}
	return 0;
}

/*
 * Read the core image of pid into *pcore.
 *
 * Returns 0 if pb_read_one() reported a positive result, -1 if the image
 * could not be opened or pb_read_one() returned zero or a negative value.
 */
static int open_core(int pid, CoreEntry **pcore)
{
	struct cr_img *img;
	int ret;

	img = open_image(CR_FD_CORE, O_RSTR, pid);
	if (!img) {
		pr_err("Can't open core data for %d\n", pid);
		return -1;
	}
	ret = pb_read_one(img, pcore, PB_CORE);
	/* The image is not needed any longer, whatever the read returned */
	close_image(img);

	return ret <= 0 ?
-1 : 0; } /* * Restore all registers not present in sigreturn signal frame * * - Guarded-storage control block * - Guarded-storage broadcast control block * - Runtime-instrumentation control block */ int arch_set_thread_regs_nosigrt(struct pid *pid) { CoreEntry *core; core = xmalloc(sizeof(*core)); if (open_core(pid->ns[0].virt, &core) < 0) { pr_perror("Cannot open core for virt pid %d", pid->ns[0].virt); return -1; } if (set_task_regs_nosigrt(pid->real, core) < 0) { pr_perror("Set register for pid %d", pid->real); return -1; } print_core_fp_regs("restore_fp_regs", core); return 0; } criu-3.6/criu/arch/s390/include/000077500000000000000000000000001317335042600163305ustar00rootroot00000000000000criu-3.6/criu/arch/s390/include/asm/000077500000000000000000000000001317335042600171105ustar00rootroot00000000000000criu-3.6/criu/arch/s390/include/asm/dump.h000066400000000000000000000005401317335042600202250ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); static inline void core_put_tls(CoreEntry *core, tls_t tls) { } #define get_task_futex_robust_list_compat(pid, info) -1 #endif criu-3.6/criu/arch/s390/include/asm/int.h000066400000000000000000000001571317335042600200560ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ criu-3.6/criu/arch/s390/include/asm/parasite-syscall.h000066400000000000000000000001521317335042600225370ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ struct parasite_ctl; #endif criu-3.6/criu/arch/s390/include/asm/parasite.h000066400000000000000000000002731317335042600210730ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ /* TLS is accessed through %a01, which is already processed */ 
static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } #endif criu-3.6/criu/arch/s390/include/asm/restore.h000066400000000000000000000013271317335042600207470ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include "images/core.pb-c.h" /* * Load stack to %r15, return address in %r14 and argument 1 into %r2 */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "lgr %%r15,%0\n" \ "lgr %%r14,%1\n" \ "lgr %%r2,%2\n" \ "basr %%r14,%%r14\n" \ : \ : "d" (new_sp), \ "d"((unsigned long)restore_task_exec_start), \ "d" (task_args) \ : "2", "14", "15", "memory") /* There is nothing to do since TLS is accessed through %a01 */ #define core_get_tls(pcore, ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif criu-3.6/criu/arch/s390/include/asm/restorer.h000066400000000000000000000042761317335042600211370ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include #include #include "asm/types.h" #include "sigframe.h" /* * Clone trampoline - see glibc sysdeps/unix/sysv/linux/s390/s390-64/clone.S */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "lgr %%r0,%6\n" /* Save thread_args in %r0 */ \ "lgr %%r1,%5\n" /* Save clone_restore_fn in %r1 */ \ "lgr %%r2,%2\n" /* Parm 1: new_sp (child stack) */ \ "lgr %%r3,%1\n" /* Parm 2: clone_flags */ \ "lgr %%r4,%3\n" /* Parm 3: &parent_tid */ \ "lgr %%r5,%4\n" /* Parm 4: &thread_args[i].pid */ \ "lghi %%r6,0\n" /* Parm 5: tls = 0 */ \ "svc "__stringify(__NR_clone)"\n" \ "ltgr %0,%%r2\n" /* Set and check "ret" */ \ "jnz 0f\n" /* ret != 0: Continue caller */ \ "lgr %%r2,%%r0\n" /* Parm 1: &thread_args */ \ "aghi %%r15,-160\n" /* Prepare stack frame */ \ "xc 0(8,%%r15),0(%%r15)\n" \ "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ "j .+2\n" /* BUG(): Force PGM check */ \ "0:\n" /* Continue caller */ \ 
: "=d"(ret) \ : "d"(clone_flags), \ "a"(new_sp), \ "d"(&parent_tid), \ "d"(&thread_args[i].pid), \ "d"(clone_restore_fn), \ "d"(&thread_args[i]) \ : "0", "1", "2", "3", "4", "5", "6", "cc", "memory") #define kdat_compatible_cr() 0 #define kdat_can_map_vdso() 0 #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *r); int restore_nonsigframe_gpregs(UserS390RegsEntry *r); unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg); unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset); static inline void restore_tls(tls_t *ptls) { (void)ptls; } static inline void *alloc_compat_syscall_stack(void) { return NULL; } static inline void free_compat_syscall_stack(void *stack32) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif /*__CR_ASM_RESTORER_H__*/ criu-3.6/criu/arch/s390/include/asm/types.h000066400000000000000000000015541317335042600204320ustar00rootroot00000000000000#ifndef _UAPI_S390_TYPES_H #define _UAPI_S390_TYPES_H #include #include #include "images/core.pb-c.h" #include "page.h" #include "bitops.h" #include "asm/int.h" #include typedef UserS390RegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__S390 #define core_is_compat(core) false #define CORE_THREAD_ARCH_INFO(core) core->ti_s390 static inline u64 encode_pointer(void *p) { return (u64) p; } static inline void *decode_pointer(u64 v) { return (void *) v; } /* * See also: * * arch/s390/include/uapi/asm/auxvec.h * * include/linux/auxvec.h */ #define AT_VECTOR_SIZE_BASE 20 #define AT_VECTOR_SIZE_ARCH 1 #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) typedef uint64_t auxv_t; typedef uint64_t tls_t; #endif /* _UAPI_S390_TYPES_H */ 
criu-3.6/criu/arch/s390/include/asm/vdso.h000066400000000000000000000007721317335042600202420ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* * This is a minimal amount of symbols * we should support at the moment. */ #define VDSO_SYMBOL_MAX 4 /* * This definition is used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ #define ARCH_VDSO_SYMBOLS \ "__kernel_gettimeofday", \ "__kernel_clock_gettime", \ "__kernel_clock_getres", \ "__kernel_getcpu" #endif /* __CR_ASM_VDSO_H__ */ criu-3.6/criu/arch/s390/restorer.c000066400000000000000000000013121317335042600167130ustar00rootroot00000000000000#include #include "restorer.h" #include "asm/restorer.h" #include #include #include "log.h" /* * All registers are restored by sigreturn - nothing to do here */ int restore_nonsigframe_gpregs(UserS390RegsEntry *r) { return 0; } /* * Call underlying ipc system call for shmat */ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) { unsigned long raddr; int ret; ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ shmflg, /* second */ (unsigned long)&raddr, /* third */ shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) raddr = (unsigned long) ret; return raddr; } criu-3.6/criu/arch/s390/sigframe.c000066400000000000000000000006451317335042600166530ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" /* * Nothing to do since we don't have any pointers to adjust * in the signal frame. 
* * - sigframe : Pointer to local signal frame * - rsigframe: Pointer to remote signal frame of inferior */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } criu-3.6/criu/arch/s390/vdso-pie.c000066400000000000000000000027261317335042600166060ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX # undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " /* * Trampoline instruction sequence */ typedef struct { u8 larl[6]; /* Load relative address of imm64 */ u8 lg[6]; /* Load %r1 with imm64 */ u8 br[2]; /* Branch to %r1 */ u64 addr; /* Jump address */ u32 guards; /* Guard bytes */ } __packed jmp_t; /* * Trampoline template: Use %r1 to jump */ jmp_t jmp = { /* larl %r1,e (addr) */ .larl = {0xc0, 0x10, 0x00, 0x00, 0x00, 0x07}, /* lg %r1,0(%r1) */ .lg = {0xe3, 0x10, 0x10, 0x00, 0x00, 0x04}, /* br %r1 */ .br = {0x07, 0xf1}, .guards = 0xcccccccc, }; /* * Insert trampoline code into old vdso entry points to * jump to new vdso functions. 
*/ int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { if (vdso_symbol_empty(&from->symbols[i])) continue; pr_debug("jmp: %s: %lx/%lx -> %lx/%lx (index %d)\n", from->symbols[i].name, base_from, from->symbols[i].offset, base_to, to->symbols[i].offset, i); jmp.addr = base_to + to->symbols[i].offset; memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp)); } return 0; } criu-3.6/criu/arch/x86/000077500000000000000000000000001317335042600146345ustar00rootroot00000000000000criu-3.6/criu/arch/x86/Makefile000066400000000000000000000007441317335042600163010ustar00rootroot00000000000000builtin-name := crtools.built-in.o ccflags-y += -iquote $(obj)/include ccflags-y += -iquote criu/include -iquote include ccflags-y += $(COMPEL_UAPI_INCLUDES) asflags-y += -Wstrict-prototypes asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer asflags-y += -iquote $(obj)/include ldflags-y += -r -z noexecstack obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o obj-y += call32.o endif criu-3.6/criu/arch/x86/call32.S000066400000000000000000000025511317335042600160430ustar00rootroot00000000000000/* * call32.S - assembly helpers for mixed-bitness code * From kernel selftests originally: tools/testing/selftests/x86/thunks.S * Copyright (c) 2015 Andrew Lutomirski * * This program is free software; you can redistribute it and/or modify * it under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * These are little helpers that make it easier to switch bitness on * the fly. */ #include "common/asm/linkage.h" .text /* * @rdi: Stack to use * @esi: Pointer to function for calling */ ENTRY(call32_from_64) /* Callee-saving registers due to ABI */ pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushfq /* Switch stacks */ sub $8, %rdi mov %rsp,(%rdi) mov %rdi,%rsp /* Switch into compatibility mode */ pushq $__USER32_CS pushq $1f lretq 1: .code32 /* Run function and switch back */ call *%esi jmp $__USER_CS,$1f .code64 1: /* Restore the stack */ mov (%rsp),%rsp add $8, %rdi /* Restore registers */ popfq popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret END(call32_from_64) criu-3.6/criu/arch/x86/cpu.c000066400000000000000000000176371317335042600156050ustar00rootroot00000000000000#include #include #include #include #include #include #include "bitops.h" #include "asm/types.h" #include "asm/cpu.h" #include #include #include "common/compiler.h" #include "cr_options.h" #include "image.h" #include "util.h" #include "log.h" #include "cpu.h" #include "protobuf.h" #include "images/cpuinfo.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_cpu_info; int cpu_init(void) { if (compel_cpuid(&rt_cpu_info)) return -1; BUILD_BUG_ON(sizeof(struct xsave_struct) != XSAVE_SIZE); BUILD_BUG_ON(sizeof(struct i387_fxsave_struct) != FXSAVE_SIZE); /* * Make sure that at least FPU is onboard * and fxsave is supported. 
*/ if (compel_cpu_has_feature(X86_FEATURE_FPU)) { if (!compel_cpu_has_feature(X86_FEATURE_FXSR)) { pr_err("missing support fxsave/restore insns\n"); return -1; } } pr_debug("fpu:%d fxsr:%d xsave:%d\n", !!compel_cpu_has_feature(X86_FEATURE_FPU), !!compel_cpu_has_feature(X86_FEATURE_FXSR), !!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)); return 0; } int cpu_dump_cpuinfo(void) { CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT; CpuinfoX86Entry cpu_x86_info = CPUINFO_X86_ENTRY__INIT; CpuinfoX86Entry *cpu_x86_info_ptr = &cpu_x86_info; struct cr_img *img; img = open_image(CR_FD_CPUINFO, O_DUMP); if (!img) return -1; cpu_info.x86_entry = &cpu_x86_info_ptr; cpu_info.n_x86_entry = 1; cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? CPUINFO_X86_ENTRY__VENDOR__INTEL : CPUINFO_X86_ENTRY__VENDOR__AMD; cpu_x86_info.cpu_family = rt_cpu_info.x86_family; cpu_x86_info.model = rt_cpu_info.x86_model; cpu_x86_info.stepping = rt_cpu_info.x86_mask; cpu_x86_info.capability_ver = 1; cpu_x86_info.n_capability = ARRAY_SIZE(rt_cpu_info.x86_capability); cpu_x86_info.capability = (void *)rt_cpu_info.x86_capability; if (rt_cpu_info.x86_model_id[0]) cpu_x86_info.model_id = rt_cpu_info.x86_model_id; if (pb_write_one(img, &cpu_info, PB_CPUINFO) < 0) { close_image(img); return -1; } close_image(img); return 0; } #define __ins_bit(__l, __v) (1u << ((__v) - 32u * (__l))) static u32 x86_ins_capability_mask[NCAPINTS] = { [0] = __ins_bit(0, X86_FEATURE_FPU) | __ins_bit(0, X86_FEATURE_TSC) | __ins_bit(0, X86_FEATURE_CX8) | __ins_bit(0, X86_FEATURE_SEP) | __ins_bit(0, X86_FEATURE_CMOV) | __ins_bit(0, X86_FEATURE_CLFLUSH) | __ins_bit(0, X86_FEATURE_MMX) | __ins_bit(0, X86_FEATURE_FXSR) | __ins_bit(0, X86_FEATURE_XMM) | __ins_bit(0, X86_FEATURE_XMM2), [1] = __ins_bit(1, X86_FEATURE_SYSCALL) | __ins_bit(1, X86_FEATURE_MMXEXT) | __ins_bit(1, X86_FEATURE_RDTSCP) | __ins_bit(1, X86_FEATURE_3DNOWEXT) | __ins_bit(1, X86_FEATURE_3DNOW), [3] = __ins_bit(3, X86_FEATURE_REP_GOOD) | __ins_bit(3, 
X86_FEATURE_NOPL), [4] = __ins_bit(4, X86_FEATURE_XMM3) | __ins_bit(4, X86_FEATURE_PCLMULQDQ) | __ins_bit(4, X86_FEATURE_MWAIT) | __ins_bit(4, X86_FEATURE_SSSE3) | __ins_bit(4, X86_FEATURE_CX16) | __ins_bit(4, X86_FEATURE_XMM4_1) | __ins_bit(4, X86_FEATURE_XMM4_2) | __ins_bit(4, X86_FEATURE_MOVBE) | __ins_bit(4, X86_FEATURE_POPCNT) | __ins_bit(4, X86_FEATURE_AES) | __ins_bit(4, X86_FEATURE_XSAVE) | __ins_bit(4, X86_FEATURE_OSXSAVE) | __ins_bit(4, X86_FEATURE_AVX) | __ins_bit(4, X86_FEATURE_F16C) | __ins_bit(4, X86_FEATURE_RDRAND), [6] = __ins_bit(6, X86_FEATURE_ABM) | __ins_bit(6, X86_FEATURE_SSE4A) | __ins_bit(6, X86_FEATURE_MISALIGNSSE) | __ins_bit(6, X86_FEATURE_3DNOWPREFETCH) | __ins_bit(6, X86_FEATURE_XOP) | __ins_bit(6, X86_FEATURE_FMA4) | __ins_bit(6, X86_FEATURE_TBM), [9] = __ins_bit(9, X86_FEATURE_FSGSBASE) | __ins_bit(9, X86_FEATURE_BMI1) | __ins_bit(9, X86_FEATURE_HLE) | __ins_bit(9, X86_FEATURE_AVX2) | __ins_bit(9, X86_FEATURE_BMI2) | __ins_bit(9, X86_FEATURE_ERMS) | __ins_bit(9, X86_FEATURE_RTM) | __ins_bit(9, X86_FEATURE_MPX) | __ins_bit(9, X86_FEATURE_AVX512F) | __ins_bit(9, X86_FEATURE_AVX512DQ) | __ins_bit(9, X86_FEATURE_RDSEED) | __ins_bit(9, X86_FEATURE_ADX) | __ins_bit(9, X86_FEATURE_CLFLUSHOPT) | __ins_bit(9, X86_FEATURE_AVX512PF) | __ins_bit(9, X86_FEATURE_AVX512ER) | __ins_bit(9, X86_FEATURE_AVX512CD) | __ins_bit(9, X86_FEATURE_SHA) | __ins_bit(9, X86_FEATURE_AVX512BW) | __ins_bit(9, X86_FEATURE_AVXVL), [10] = __ins_bit(10, X86_FEATURE_XSAVEOPT) | __ins_bit(10, X86_FEATURE_XSAVEC) | __ins_bit(10, X86_FEATURE_XGETBV1) | __ins_bit(10, X86_FEATURE_XSAVES), [11] = __ins_bit(11, X86_FEATURE_PREFETCHWT1), }; #undef __ins_bit static int cpu_validate_ins_features(CpuinfoX86Entry *img_x86_entry) { size_t i; for (i = 0; i < ARRAY_SIZE(rt_cpu_info.x86_capability); i++) { u32 s = img_x86_entry->capability[i] & x86_ins_capability_mask[i]; u32 d = rt_cpu_info.x86_capability[i] & x86_ins_capability_mask[i]; /* * Destination might be more feature rich * but 
not the reverse. */ if (s & ~d) { pr_err("CPU instruction capabilities do not match run time\n"); return -1; } } return 0; } static int cpu_validate_features(CpuinfoX86Entry *img_x86_entry) { if (img_x86_entry->n_capability != ARRAY_SIZE(rt_cpu_info.x86_capability)) { /* * Image carries different number of bits. * Simply reject, we can't guarantee anything * in such case. */ pr_err("Size of features in image mismatch " "one provided by run time CPU (%d:%d)\n", (unsigned)img_x86_entry->n_capability, (unsigned)ARRAY_SIZE(rt_cpu_info.x86_capability)); return -1; } if (opts.cpu_cap == CPU_CAP_FPU) { /* * If we're requested to check FPU only ignore * any other bit. It's up to a user if the * rest of mismatches won't cause problems. */ #define __mismatch_fpu_bit(__bit) \ (test_bit(__bit, (void *)img_x86_entry->capability) && \ !compel_cpu_has_feature(__bit)) if (__mismatch_fpu_bit(X86_FEATURE_FPU) || __mismatch_fpu_bit(X86_FEATURE_FXSR) || __mismatch_fpu_bit(X86_FEATURE_OSXSAVE)) { pr_err("FPU feature required by image " "is not supported on host.\n"); return -1; } else return 0; #undef __mismatch_fpu_bit } /* * Capability on instructions level only. */ if (opts.cpu_cap == CPU_CAP_INS) return cpu_validate_ins_features(img_x86_entry); /* * Strict capability mode. Everything must match. 
*/ if (memcmp(img_x86_entry->capability, rt_cpu_info.x86_capability, sizeof(rt_cpu_info.x86_capability))) { pr_err("CPU capabilites do not match run time\n"); return -1; } return 0; } int cpu_validate_cpuinfo(void) { CpuinfoX86Entry *img_x86_entry; CpuinfoEntry *img_cpu_info; struct cr_img *img; int ret = -1; img = open_image(CR_FD_CPUINFO, O_RSTR); if (!img) return -1; if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; if (img_cpu_info->n_x86_entry != 1) { pr_err("No x86 related cpuinfo in image, " "corruption (n_x86_entry = %zi)\n", img_cpu_info->n_x86_entry); goto err; } img_x86_entry = img_cpu_info->x86_entry[0]; if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL && img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) { pr_err("Unknown cpu vendor %d\n", img_x86_entry->vendor_id); goto err; } if (img_x86_entry->n_capability != ARRAY_SIZE(rt_cpu_info.x86_capability)) { pr_err("Image carries %u words while %u expected\n", (unsigned)img_x86_entry->n_capability, (unsigned)ARRAY_SIZE(rt_cpu_info.x86_capability)); goto err; } ret = cpu_validate_features(img_x86_entry); err: close_image(img); return ret; } int cpuinfo_dump(void) { if (cpu_init()) return -1; if (cpu_dump_cpuinfo()) return -1; return 0; } int cpuinfo_check(void) { if (cpu_init()) return 1; /* * Force to check all caps if empty passed, * still allow to check instructions only * and etc. 
*/ if (!opts.cpu_cap) opts.cpu_cap = CPU_CAP_ALL; if (cpu_validate_cpuinfo()) return 1; return 0; } criu-3.6/criu/arch/x86/crtools.c000066400000000000000000000400651317335042600164720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "types.h" #include "log.h" #include "asm/compat.h" #include "asm/parasite-syscall.h" #include "asm/restorer.h" #include #include "asm/dump.h" #include "cr_options.h" #include "common/compiler.h" #include "restorer.h" #include "parasite-syscall.h" #include "util.h" #include "cpu.h" #include #include "kerndat.h" #include #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" int kdat_can_map_vdso(void) { pid_t child; int stat; /* * Running under fork so if vdso_64 is disabled - don't create * it for criu accidentally. */ child = fork(); if (child < 0) return -1; if (child == 0) { int ret; ret = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_32, 0); if (ret == 0) exit(1); /* * Mapping vDSO while have not unmap it yet: * this is restricted by API if ARCH_MAP_VDSO_* is supported. */ if (ret == -1 && errno == EEXIST) exit(1); exit(0); } if (waitpid(child, &stat, 0) != child) { pr_err("Failed to wait for arch_prctl() test"); kill(child, SIGKILL); return -1; } if (!WIFEXITED(stat)) return -1; return WEXITSTATUS(stat); } #ifdef CONFIG_COMPAT void *mmap_ia32(void *addr, size_t len, int prot, int flags, int fildes, off_t off) { struct syscall_args32 s; s.nr = __NR32_mmap2; s.arg0 = (uint32_t)(uintptr_t)addr; s.arg1 = (uint32_t)len; s.arg2 = prot; s.arg3 = flags; s.arg4 = fildes; s.arg5 = (uint32_t)off; do_full_int80(&s); return (void *)(uintptr_t)s.nr; } /* * The idea of the test: * From kernel's top-down allocator we assume here that * 1. A = mmap(0, ...); munmap(A); * 2. B = mmap(0, ...); * results in A == B. * ...but if we have 32-bit mmap() bug, then A will have only lower * 4 bytes of 64-bit address allocated with mmap(). 
* That means, that the next mmap() will return B != A * (as munmap(A) hasn't really unmapped A mapping). * * As mapping with lower 4 bytes of A may really exist, we run * this test under fork(). * * Another approach to test bug's presence would be to parse * /proc/self/maps before and after 32-bit mmap(), but that would * be soo slow. */ static void mmap_bug_test(void) { void *map1, *map2; int err; map1 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); /* 32-bit error, not sign-extended - can't use IS_ERR_VALUE() here */ err = (uintptr_t)map1 % PAGE_SIZE; if (err) { pr_err("ia32 mmap() failed: %d\n", err); exit(1); } if (munmap(map1, PAGE_SIZE)) { pr_err("Failed to unmap() 32-bit mapping: %m\n"); exit(1); } map2 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); err = (uintptr_t)map2 % PAGE_SIZE; if (err) { pr_err("ia32 mmap() failed: %d\n", err); exit(1); } if (map1 != map2) exit(1); exit(0); } /* * Pre v4.12 kernels have a bug: for a process started as 64-bit * 32-bit mmap() may return 8 byte pointer. * Which is fatal for us: after 32-bit C/R a task will map 64-bit * addresses, cut upper 4 bytes and try to use lower 4 bytes. * This is a check if the bug was fixed in the kernel. 
*/ static int has_32bit_mmap_bug(void) { pid_t child = fork(); int stat; if (child == 0) mmap_bug_test(); if (waitpid(child, &stat, 0) != child) { pr_err("Failed to wait for mmap test"); kill(child, SIGKILL); return -1; } if (!WIFEXITED(stat) || WEXITSTATUS(stat) != 0) return 1; return 0; } int kdat_compatible_cr(void) { if (!kdat.can_map_vdso) return 0; if (has_32bit_mmap_bug()) return 0; return 1; } #else /* !CONFIG_COMPAT */ int kdat_compatible_cr(void) { return 0; } #endif int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; UserX86RegsEntry *gpregs = core->thread_info->gpregs; #define assign_reg(dst, src, e) do { dst->e = (__typeof__(dst->e))src.e; } while (0) #define assign_array(dst, src, e) memcpy(dst->e, &src.e, sizeof(src.e)) if (user_regs_native(regs)) { assign_reg(gpregs, regs->native, r15); assign_reg(gpregs, regs->native, r14); assign_reg(gpregs, regs->native, r13); assign_reg(gpregs, regs->native, r12); assign_reg(gpregs, regs->native, bp); assign_reg(gpregs, regs->native, bx); assign_reg(gpregs, regs->native, r11); assign_reg(gpregs, regs->native, r10); assign_reg(gpregs, regs->native, r9); assign_reg(gpregs, regs->native, r8); assign_reg(gpregs, regs->native, ax); assign_reg(gpregs, regs->native, cx); assign_reg(gpregs, regs->native, dx); assign_reg(gpregs, regs->native, si); assign_reg(gpregs, regs->native, di); assign_reg(gpregs, regs->native, orig_ax); assign_reg(gpregs, regs->native, ip); assign_reg(gpregs, regs->native, cs); assign_reg(gpregs, regs->native, flags); assign_reg(gpregs, regs->native, sp); assign_reg(gpregs, regs->native, ss); assign_reg(gpregs, regs->native, fs_base); assign_reg(gpregs, regs->native, gs_base); assign_reg(gpregs, regs->native, ds); assign_reg(gpregs, regs->native, es); assign_reg(gpregs, regs->native, fs); assign_reg(gpregs, regs->native, gs); gpregs->mode = USER_X86_REGS_MODE__NATIVE; } else { assign_reg(gpregs, regs->compat, bx); assign_reg(gpregs, 
regs->compat, cx); assign_reg(gpregs, regs->compat, dx); assign_reg(gpregs, regs->compat, si); assign_reg(gpregs, regs->compat, di); assign_reg(gpregs, regs->compat, bp); assign_reg(gpregs, regs->compat, ax); assign_reg(gpregs, regs->compat, ds); assign_reg(gpregs, regs->compat, es); assign_reg(gpregs, regs->compat, fs); assign_reg(gpregs, regs->compat, gs); assign_reg(gpregs, regs->compat, orig_ax); assign_reg(gpregs, regs->compat, ip); assign_reg(gpregs, regs->compat, cs); assign_reg(gpregs, regs->compat, flags); assign_reg(gpregs, regs->compat, sp); assign_reg(gpregs, regs->compat, ss); gpregs->mode = USER_X86_REGS_MODE__COMPAT; } gpregs->has_mode = true; if (!fpregs) return 0; assign_reg(core->thread_info->fpregs, fpregs->i387, cwd); assign_reg(core->thread_info->fpregs, fpregs->i387, swd); assign_reg(core->thread_info->fpregs, fpregs->i387, twd); assign_reg(core->thread_info->fpregs, fpregs->i387, fop); assign_reg(core->thread_info->fpregs, fpregs->i387, rip); assign_reg(core->thread_info->fpregs, fpregs->i387, rdp); assign_reg(core->thread_info->fpregs, fpregs->i387, mxcsr); assign_reg(core->thread_info->fpregs, fpregs->i387, mxcsr_mask); /* Make sure we have enough space */ BUG_ON(core->thread_info->fpregs->n_st_space != ARRAY_SIZE(fpregs->i387.st_space)); BUG_ON(core->thread_info->fpregs->n_xmm_space != ARRAY_SIZE(fpregs->i387.xmm_space)); assign_array(core->thread_info->fpregs, fpregs->i387, st_space); assign_array(core->thread_info->fpregs, fpregs->i387, xmm_space); if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { BUG_ON(core->thread_info->fpregs->xsave->n_ymmh_space != ARRAY_SIZE(fpregs->ymmh.ymmh_space)); assign_reg(core->thread_info->fpregs->xsave, fpregs->xsave_hdr, xstate_bv); assign_array(core->thread_info->fpregs->xsave, fpregs->ymmh, ymmh_space); } #undef assign_reg #undef assign_array return 0; } static void alloc_tls(ThreadInfoX86 *ti, void **mempool) { int i; ti->tls = xptr_pull_s(mempool, GDT_ENTRY_TLS_NUM*sizeof(UserDescT*)); ti->n_tls = 
GDT_ENTRY_TLS_NUM; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { ti->tls[i] = xptr_pull(mempool, UserDescT); user_desc_t__init(ti->tls[i]); } } int arch_alloc_thread_info(CoreEntry *core) { size_t sz; bool with_fpu, with_xsave = false; void *m; ThreadInfoX86 *ti = NULL; with_fpu = compel_cpu_has_feature(X86_FEATURE_FPU); sz = sizeof(ThreadInfoX86) + sizeof(UserX86RegsEntry) + GDT_ENTRY_TLS_NUM*sizeof(UserDescT) + GDT_ENTRY_TLS_NUM*sizeof(UserDescT*); if (with_fpu) { sz += sizeof(UserX86FpregsEntry); with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); } m = xmalloc(sz); if (!m) return -1; ti = core->thread_info = xptr_pull(&m, ThreadInfoX86); thread_info_x86__init(ti); ti->gpregs = xptr_pull(&m, UserX86RegsEntry); user_x86_regs_entry__init(ti->gpregs); alloc_tls(ti, &m); if (with_fpu) { UserX86FpregsEntry *fpregs; fpregs = ti->fpregs = xptr_pull(&m, UserX86FpregsEntry); user_x86_fpregs_entry__init(fpregs); /* These are numbers from kernel */ fpregs->n_st_space = 32; fpregs->n_xmm_space = 64; fpregs->st_space = xzalloc(pb_repeated_size(fpregs, st_space)); fpregs->xmm_space = xzalloc(pb_repeated_size(fpregs, xmm_space)); if (!fpregs->st_space || !fpregs->xmm_space) goto err; if (with_xsave) { UserX86XsaveEntry *xsave; xsave = fpregs->xsave = xptr_pull(&m, UserX86XsaveEntry); user_x86_xsave_entry__init(xsave); xsave->n_ymmh_space = 64; xsave->ymmh_space = xzalloc(pb_repeated_size(xsave, ymmh_space)); if (!xsave->ymmh_space) goto err; } } return 0; err: return -1; } void arch_free_thread_info(CoreEntry *core) { if (!core->thread_info) return; if (core->thread_info->fpregs->xsave) xfree(core->thread_info->fpregs->xsave->ymmh_space); xfree(core->thread_info->fpregs->st_space); xfree(core->thread_info->fpregs->xmm_space); xfree(core->thread_info); } static bool valid_xsave_frame(CoreEntry *core) { struct xsave_struct *x = NULL; if (core->thread_info->fpregs->n_st_space < ARRAY_SIZE(x->i387.st_space)) { pr_err("Corruption 
in FPU st_space area " "(got %li but %li expected)\n", (long)core->thread_info->fpregs->n_st_space, (long)ARRAY_SIZE(x->i387.st_space)); return false; } if (core->thread_info->fpregs->n_xmm_space < ARRAY_SIZE(x->i387.xmm_space)) { pr_err("Corruption in FPU xmm_space area " "(got %li but %li expected)\n", (long)core->thread_info->fpregs->n_st_space, (long)ARRAY_SIZE(x->i387.xmm_space)); return false; } if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { if (core->thread_info->fpregs->xsave && core->thread_info->fpregs->xsave->n_ymmh_space < ARRAY_SIZE(x->ymmh.ymmh_space)) { pr_err("Corruption in FPU ymmh_space area " "(got %li but %li expected)\n", (long)core->thread_info->fpregs->xsave->n_ymmh_space, (long)ARRAY_SIZE(x->ymmh.ymmh_space)); return false; } } else { /* * If the image has xsave area present then CPU we're restoring * on must have X86_FEATURE_OSXSAVE feature until explicitly * stated in options. */ if (core->thread_info->fpregs->xsave) { if (opts.cpu_cap & CPU_CAP_FPU) { pr_err("FPU xsave area present, " "but host cpu doesn't support it\n"); return false; } else pr_warn_once("FPU is about to restore ignoring ymm state!\n"); } } return true; } static void show_rt_xsave_frame(struct xsave_struct *x) { struct fpx_sw_bytes *fpx = (void *)&x->i387.sw_reserved; struct xsave_hdr_struct *xsave_hdr = &x->xsave_hdr; struct i387_fxsave_struct *i387 = &x->i387; pr_debug("xsave runtime structure\n"); pr_debug("-----------------------\n"); pr_debug("cwd:%x swd:%x twd:%x fop:%x mxcsr:%x mxcsr_mask:%x\n", (int)i387->cwd, (int)i387->swd, (int)i387->twd, (int)i387->fop, (int)i387->mxcsr, (int)i387->mxcsr_mask); pr_debug("magic1:%x extended_size:%x xstate_bv:%lx xstate_size:%x\n", fpx->magic1, fpx->extended_size, (long)fpx->xstate_bv, fpx->xstate_size); pr_debug("xstate_bv: %lx\n", (long)xsave_hdr->xstate_bv); pr_debug("-----------------------\n"); } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { fpu_state_t *fpu_state = core_is_compat(core) ? 
&sigframe->compat.fpu_state : &sigframe->native.fpu_state; struct xsave_struct *x = core_is_compat(core) ? (void *)&fpu_state->fpu_state_ia32.xsave : (void *)&fpu_state->fpu_state_64.xsave; /* * If no FPU information provided -- we're restoring * old image which has no FPU support, or the dump simply * has no FPU support at all. */ if (!core->thread_info->fpregs) { fpu_state->has_fpu = false; return 0; } if (!valid_xsave_frame(core)) return -1; fpu_state->has_fpu = true; #define assign_reg(dst, src, e) do { dst.e = (__typeof__(dst.e))src->e; } while (0) #define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) assign_reg(x->i387, core->thread_info->fpregs, cwd); assign_reg(x->i387, core->thread_info->fpregs, swd); assign_reg(x->i387, core->thread_info->fpregs, twd); assign_reg(x->i387, core->thread_info->fpregs, fop); assign_reg(x->i387, core->thread_info->fpregs, rip); assign_reg(x->i387, core->thread_info->fpregs, rdp); assign_reg(x->i387, core->thread_info->fpregs, mxcsr); assign_reg(x->i387, core->thread_info->fpregs, mxcsr_mask); assign_array(x->i387, core->thread_info->fpregs, st_space); assign_array(x->i387, core->thread_info->fpregs, xmm_space); if (core_is_compat(core)) compel_convert_from_fxsr(&fpu_state->fpu_state_ia32.fregs_state.i387_ia32, &fpu_state->fpu_state_ia32.xsave.i387); if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { struct fpx_sw_bytes *fpx_sw = (void *)&x->i387.sw_reserved; void *magic2; x->xsave_hdr.xstate_bv = XSTATE_FP | XSTATE_SSE | XSTATE_YMM; /* * fpregs->xsave pointer might not present on image so we * simply clear out all ymm registers. */ if (core->thread_info->fpregs->xsave) assign_array(x->ymmh, core->thread_info->fpregs->xsave, ymmh_space); fpx_sw->magic1 = FP_XSTATE_MAGIC1; fpx_sw->xstate_bv = XSTATE_FP | XSTATE_SSE | XSTATE_YMM; fpx_sw->xstate_size = sizeof(struct xsave_struct); fpx_sw->extended_size = sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE; /* * This should be at the end of xsave frame. 
*/ magic2 = (void *)x + sizeof(struct xsave_struct); *(u32 *)magic2 = FP_XSTATE_MAGIC2; } show_rt_xsave_frame(x); #undef assign_reg #undef assign_array return 0; } #define CPREG32(d) f->compat.uc.uc_mcontext.d = r->d static void restore_compat_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { CPREG32(gs); CPREG32(fs); CPREG32(es); CPREG32(ds); CPREG32(di); CPREG32(si); CPREG32(bp); CPREG32(sp); CPREG32(bx); CPREG32(dx); CPREG32(cx); CPREG32(ip); CPREG32(ax); CPREG32(cs); CPREG32(ss); CPREG32(flags); f->is_native = false; } #undef CPREG32 #define CPREG64(d, s) f->native.uc.uc_mcontext.d = r->s static void restore_native_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { CPREG64(rdi, di); CPREG64(rsi, si); CPREG64(rbp, bp); CPREG64(rsp, sp); CPREG64(rbx, bx); CPREG64(rdx, dx); CPREG64(rcx, cx); CPREG64(rip, ip); CPREG64(rax, ax); CPREG64(r8, r8); CPREG64(r9, r9); CPREG64(r10, r10); CPREG64(r11, r11); CPREG64(r12, r12); CPREG64(r13, r13); CPREG64(r14, r14); CPREG64(r15, r15); CPREG64(cs, cs); CPREG64(eflags, flags); f->is_native = true; } #undef CPREG64 int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { switch (r->mode) { case USER_X86_REGS_MODE__NATIVE: restore_native_gpregs(f, r); break; case USER_X86_REGS_MODE__COMPAT: restore_compat_gpregs(f, r); break; default: pr_err("Can't prepare rt_sigframe: registers mode corrupted (%d)\n", r->mode); return -1; } return 0; } static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len) { struct syscall_args32 s = { .nr = __NR32_get_robust_list, .arg0 = pid, .arg1 = (uint32_t)head, .arg2 = (uint32_t)len, }; do_full_int80(&s); return (int)s.nr; } static int set_robust_list32(uint32_t head, uint32_t len) { struct syscall_args32 s = { .nr = __NR32_set_robust_list, .arg0 = head, .arg1 = len, }; do_full_int80(&s); return (int)s.nr; } int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) { void *mmap32; int ret = -1; mmap32 = alloc_compat_syscall_stack(); if (!mmap32) return -1; ret = 
get_robust_list32(pid, (uintptr_t)mmap32, (uintptr_t)mmap32 + 4); if (ret == -ENOSYS) { /* Check native get_task_futex_robust_list() for details. */ if (set_robust_list32(0, 0) == (uint32_t)-ENOSYS) { info->futex_rla = 0; info->futex_rla_len = 0; ret = 0; } } else if (ret == 0) { uint32_t *arg1 = (uint32_t*)mmap32; info->futex_rla = *arg1; info->futex_rla_len = *(arg1 + 1); ret = 0; } free_compat_syscall_stack(mmap32); return ret; } criu-3.6/criu/arch/x86/include/000077500000000000000000000000001317335042600162575ustar00rootroot00000000000000criu-3.6/criu/arch/x86/include/asm/000077500000000000000000000000001317335042600170375ustar00rootroot00000000000000criu-3.6/criu/arch/x86/include/asm/compat.h000066400000000000000000000033211317335042600204720ustar00rootroot00000000000000#ifndef __CR_ASM_COMPAT_H__ #define __CR_ASM_COMPAT_H__ #ifdef CR_NOGLIBC # include # include #else # define sys_mmap mmap # define sys_munmap munmap #endif #include static inline void *alloc_compat_syscall_stack(void) { void *mem = (void*)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if ((uintptr_t)mem % PAGE_SIZE) { int err = (~(uint32_t)(uintptr_t)mem) + 1; pr_err("mmap() of compat syscall stack failed with %d\n", err); return 0; } return mem; } static inline void free_compat_syscall_stack(void *mem) { long int ret = sys_munmap(mem, PAGE_SIZE); if (ret) pr_err("munmap() of compat addr %p failed with %ld\n", mem, ret); } struct syscall_args32 { uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5; }; static inline void do_full_int80(struct syscall_args32 *args) { /* * r8-r11 registers are cleared during returning to userspace * from syscall - that's x86_64 ABI to avoid leaking kernel * pointers. * * Other than that - we can't use %rbp in clobbers as GCC's inline * assembly doesn't allow to do so. So, here is explicitly saving * %rbp before syscall and restoring it's value afterward. 
*/ asm volatile ("pushq %%rbp\n\t" "mov %6, %%ebp\n\t" "int $0x80\n\t" "mov %%ebp, %6\n\t" "popq %%rbp\n\t" : "+a" (args->nr), "+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2), "+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5) : : "r8", "r9", "r10", "r11"); } #ifdef CONFIG_COMPAT extern unsigned long call32_from_64(void *stack, void *func); #endif #ifndef CR_NOGLIBC # undef sys_mmap # undef sys_munmap #endif #endif criu-3.6/criu/arch/x86/include/asm/dump.h000066400000000000000000000016011317335042600201530ustar00rootroot00000000000000#ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); static inline void core_put_tls(CoreEntry *core, tls_t tls) { ThreadInfoX86 *ti = core->thread_info; int i; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *from = &tls.desc[i]; UserDescT *to = ti->tls[i]; #define COPY_TLS(field) to->field = from->field COPY_TLS(entry_number); COPY_TLS(base_addr); COPY_TLS(limit); COPY_TLS(seg_32bit); to->contents_h = from->contents & 0x2; to->contents_l = from->contents & 0x1; COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); COPY_TLS(useable); #undef COPY_TLS } } #endif criu-3.6/criu/arch/x86/include/asm/int.h000066400000000000000000000001571317335042600200050ustar00rootroot00000000000000#ifndef __CR_ASM_INT_H__ #define __CR_ASM_INT_H__ #include "asm-generic/int.h" #endif /* __CR_ASM_INT_H__ */ criu-3.6/criu/arch/x86/include/asm/parasite-syscall.h000066400000000000000000000002021317335042600224620ustar00rootroot00000000000000#ifndef __CR_ASM_PARASITE_SYSCALL_H__ #define __CR_ASM_PARASITE_SYSCALL_H__ #include "asm/types.h" struct parasite_ctl; #endif 
criu-3.6/criu/arch/x86/include/asm/parasite.h000066400000000000000000000035221317335042600210220ustar00rootroot00000000000000#ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ #include #include #include "asm/compat.h" static int arch_get_user_desc(user_desc_t *desc) { int ret = __NR32_get_thread_area; /* * For 64-bit applications, TLS (fs_base for Glibc) is * in MSR, which are dumped with the help of arch_prctl(). * * But SET_FS_BASE will update GDT if base pointer fits in 4 bytes. * Otherwise it will set only MSR, which allows for mixed 64/32-bit * code to use: 2 MSRs as TLS base _and_ 3 GDT entries. * Having in sum 5 TLS pointers, 3 of which are four bytes and * other two bigger than four bytes: * struct thread_struct { * struct desc_struct tls_array[3]; * ... * #ifdef CONFIG_X86_64 * unsigned long fsbase; * unsigned long gsbase; * #endif * ... * }; */ asm volatile ( " mov %0,%%eax \n" " mov %1,%%rbx \n" " int $0x80 \n" " mov %%eax,%0 \n" : "+m"(ret) : "m"(desc) : "rax", "rbx", "r8", "r9", "r10", "r11", "memory"); if (ret) pr_err("Failed to dump TLS descriptor #%d: %d\n", desc->entry_number, ret); return ret; } static void arch_get_tls(tls_t *ptls) { void *syscall_mem; int i; syscall_mem = alloc_compat_syscall_stack(); if (!syscall_mem) { pr_err("Failed to allocate memory <4Gb for compat syscall\n"); for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *d = &ptls->desc[i]; d->seg_not_present = 1; d->entry_number = GDT_ENTRY_TLS_MIN + i; } return; } for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *d = syscall_mem; memset(d, 0, sizeof(user_desc_t)); d->seg_not_present = 1; d->entry_number = GDT_ENTRY_TLS_MIN + i; arch_get_user_desc(d); memcpy(&ptls->desc[i], d, sizeof(user_desc_t)); } free_compat_syscall_stack(syscall_mem); } #endif criu-3.6/criu/arch/x86/include/asm/restore.h000066400000000000000000000026011317335042600206720ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORE_H__ #define __CR_ASM_RESTORE_H__ #include "asm/restorer.h" #include 
"images/core.pb-c.h" #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ "movq %0, %%rbx \n" \ "movq %1, %%rax \n" \ "movq %2, %%rdi \n" \ "movq %%rbx, %%rsp \n" \ "callq *%%rax \n" \ : \ : "g"(new_sp), \ "g"(restore_task_exec_start), \ "g"(task_args) \ : "rsp", "rdi", "rsi", "rbx", "rax", "memory") static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { ThreadInfoX86 *ti = pcore->thread_info; size_t i; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *to = &ptls->desc[i]; UserDescT *from; /* * If proto image has lesser TLS entries, * mark them as not present (and thus skip restore). */ if (i >= ti->n_tls) { to->seg_not_present = 1; continue; } from = ti->tls[i]; #define COPY_TLS(field) to->field = from->field COPY_TLS(entry_number); COPY_TLS(base_addr); COPY_TLS(limit); COPY_TLS(seg_32bit); to->contents = ((u32)from->contents_h << 1) | from->contents_l; COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); COPY_TLS(useable); #undef COPY_TLS } } int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif criu-3.6/criu/arch/x86/include/asm/restorer.h000066400000000000000000000066231317335042600210640ustar00rootroot00000000000000#ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ #include "asm/types.h" #include #include "images/core.pb-c.h" #include #include #include "asm/compat.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); extern int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act); extern int set_compat_robust_list(uint32_t head_ptr, uint32_t len); #else /* CONFIG_COMPAT */ static inline void restore_tls(tls_t *ptls) { } static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; } static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { return -1; } #endif /* !CONFIG_COMPAT */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) 
\ asm volatile( \ "clone_emul: \n" \ "movq %2, %%rsi \n" \ "subq $16, %%rsi \n" \ "movq %6, %%rdi \n" \ "movq %%rdi, 8(%%rsi) \n" \ "movq %5, %%rdi \n" \ "movq %%rdi, 0(%%rsi) \n" \ "movq %1, %%rdi \n" \ "movq %3, %%rdx \n" \ "movq %4, %%r10 \n" \ "movl $"__stringify(__NR_clone)", %%eax \n" \ "syscall \n" \ \ "testq %%rax,%%rax \n" \ "jz thread_run \n" \ \ "movq %%rax, %0 \n" \ "jmp clone_end \n" \ \ "thread_run: \n" \ "xorq %%rbp, %%rbp \n" \ "popq %%rax \n" \ "popq %%rdi \n" \ "callq *%%rax \n" \ \ "clone_end: \n" \ : "=r"(ret) \ : "g"(clone_flags), \ "g"(new_sp), \ "g"(&parent_tid), \ "g"(&thread_args[i].pid), \ "g"(clone_restore_fn), \ "g"(&thread_args[i]) \ : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "movq %0, %%rsp \n" \ "movq 0, %%rax \n" \ "jmp *%%rax \n" \ : \ : "r"(ret) \ : "memory") #ifndef ARCH_MAP_VDSO_32 # define ARCH_MAP_VDSO_32 0x2002 #endif #ifndef ARCH_MAP_VDSO_64 # define ARCH_MAP_VDSO_64 0x2003 #endif extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); static inline void __setup_sas_compat(struct ucontext_ia32* uc, ThreadSasEntry *sas) { uc->uc_stack.ss_sp = (compat_uptr_t)(sas)->ss_sp; uc->uc_stack.ss_flags = (int)(sas)->ss_flags; uc->uc_stack.ss_size = (compat_size_t)(sas)->ss_size; } static inline void __setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas) { if (sigframe->is_native) { struct rt_ucontext *uc = &sigframe->native.uc; uc->uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); uc->uc_stack.ss_flags = (int)(sas)->ss_flags; uc->uc_stack.ss_size = (size_t)(sas)->ss_size; } else { __setup_sas_compat(&sigframe->compat.uc, sas); } } static inline void _setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas) { if (sas) __setup_sas(sigframe, sas); } #define setup_sas _setup_sas int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r); int restore_nonsigframe_gpregs(UserX86RegsEntry *r); int ptrace_set_breakpoint(pid_t pid, void 
*addr); int ptrace_flush_breakpoints(pid_t pid); extern int arch_map_vdso(unsigned long map_at, bool compatible); #endif criu-3.6/criu/arch/x86/include/asm/syscall32.h000066400000000000000000000020551317335042600210310ustar00rootroot00000000000000#ifndef __CR_SYSCALL32_H__ #define __CR_SYSCALL32_H__ extern long sys_socket(int domain, int type, int protocol); extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen); extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len); extern long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len); extern long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags); extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags); extern long sys_shutdown(int sockfd, int how); extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen); extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen); extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen); extern long sys_shmat(int shmid, void *shmaddr, int shmflag); extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos); #endif /* __CR_SYSCALL32_H__ */ criu-3.6/criu/arch/x86/include/asm/types.h000066400000000000000000000022321317335042600203530ustar00rootroot00000000000000#ifndef __CR_ASM_TYPES_H__ #define __CR_ASM_TYPES_H__ #include #include #include "page.h" #include "bitops.h" #include "asm/int.h" #include #include "images/core.pb-c.h" static inline int core_is_compat(CoreEntry *c) { switch (c->thread_info->gpregs->mode) { case USER_X86_REGS_MODE__NATIVE: return 0; case USER_X86_REGS_MODE__COMPAT: return 1; default: return -1; } } #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__X86_64 #define CORE_THREAD_ARCH_INFO(core) core->thread_info typedef UserX86RegsEntry UserRegsEntry; static inline u64 encode_pointer(void *p) { 
return (u64)(long)p; } static inline void *decode_pointer(u64 v) { return (void*)(long)v; } #define AT_VECTOR_SIZE 44 typedef uint64_t auxv_t; /* * Linux preserves three TLS segments in GDT. * Offsets in GDT differ between 32-bit and 64-bit machines. * For 64-bit x86 those GDT offsets are the same * for native and compat tasks. */ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 #define GDT_ENTRY_TLS_NUM 3 typedef struct { user_desc_t desc[GDT_ENTRY_TLS_NUM]; } tls_t; #endif /* __CR_ASM_TYPES_H__ */ criu-3.6/criu/arch/x86/include/asm/vdso.h000066400000000000000000000010741317335042600201650ustar00rootroot00000000000000#ifndef __CR_ASM_VDSO_H__ #define __CR_ASM_VDSO_H__ #include "asm/int.h" #include "asm-generic/vdso.h" /* This definition is used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ /* * This is a minimal amount of symbols * we should support at the moment. */ #define VDSO_SYMBOL_MAX 7 #define ARCH_VDSO_SYMBOLS \ "__vdso_clock_gettime", \ "__vdso_getcpu", \ "__vdso_gettimeofday", \ "__vdso_time", \ "__kernel_vsyscall", \ "__kernel_sigreturn", \ "__kernel_rt_sigreturn" #endif /* __CR_ASM_VDSO_H__ */ criu-3.6/criu/arch/x86/restorer.c000066400000000000000000000047201317335042600166500ustar00rootroot00000000000000#include #include #include "types.h" #include "restorer.h" #include "asm/compat.h" #include "asm/restorer.h" #include #include #include #include #include "log.h" #include "cpu.h" int arch_map_vdso(unsigned long map_at, bool compatible) { int vdso_type = compatible ? ARCH_MAP_VDSO_32 : ARCH_MAP_VDSO_64; pr_debug("Mapping %s vDSO at %lx\n", compatible ? 
"compatible" : "native", map_at); return sys_arch_prctl(vdso_type, map_at); } int restore_nonsigframe_gpregs(UserX86RegsEntry *r) { long ret; unsigned long fsgs_base; fsgs_base = r->fs_base; ret = sys_arch_prctl(ARCH_SET_FS, fsgs_base); if (ret) { pr_info("SET_FS fail %ld\n", ret); return -1; } fsgs_base = r->gs_base; ret = sys_arch_prctl(ARCH_SET_GS, fsgs_base); if (ret) { pr_info("SET_GS fail %ld\n", ret); return -1; } return 0; } #ifdef CONFIG_COMPAT int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { struct syscall_args32 s = { .nr = __NR32_set_robust_list, .arg0 = head_ptr, .arg1 = len, }; do_full_int80(&s); return (int)s.nr; } static int prepare_stack32(void **stack32) { if (*stack32) return 0; *stack32 = alloc_compat_syscall_stack(); if (!*stack32) { pr_err("Failed to allocate stack for 32-bit TLS restore\n"); return -1; } return 0; } void restore_tls(tls_t *ptls) { /* * We need here compatible stack, because 32-bit syscalls get * 4-byte pointer and _usally_ restorer is also under 4Gb, but * it can be upper and then pointers are messed up. * (we lose high 4 bytes and... BANG!) * Nothing serious, but syscall will return -EFAULT - or if we're * lucky and lower 4 bytes points on some writeable VMA - corruption). 
*/ void *stack32 = NULL; unsigned i; for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *desc = &ptls->desc[i]; int ret; if (desc->seg_not_present) continue; if (prepare_stack32(&stack32) < 0) return; memcpy(stack32, desc, sizeof(user_desc_t)); asm volatile ( " mov %1,%%eax \n" " mov %2,%%ebx \n" " int $0x80 \n" " mov %%eax,%0 \n" : "=g"(ret) : "r"(__NR32_set_thread_area), "r"((uint32_t)(uintptr_t)stack32) : "eax", "ebx", "r8", "r9", "r10", "r11", "memory"); if (ret) pr_err("Failed to restore TLS descriptor %u in GDT: %d\n", desc->entry_number, ret); } if (stack32) free_compat_syscall_stack(stack32); } #endif criu-3.6/criu/arch/x86/restorer_unmap.S000066400000000000000000000004161317335042600200260ustar00rootroot00000000000000#include "common/asm/linkage.h" #include "compel/plugins/std/syscall-codes.h" .text ENTRY(__export_unmap_compat) .code32 mov bootstrap_start, %ebx mov bootstrap_len, %ecx sub vdso_rt_size, %ecx movl $__NR32_munmap, %eax int $0x80 int $0x03 /* Guard */ .code64 criu-3.6/criu/arch/x86/sigaction_compat.c000066400000000000000000000023661317335042600203320ustar00rootroot00000000000000#include "log.h" #include "asm/restorer.h" #include #include "asm/compat.h" #include #ifdef CR_NOGLIBC # include #endif #include "cpu.h" asm ( " .pushsection .text \n" " .global restore_rt_sigaction \n" " .code32 \n" "restore_rt_sigaction: \n" " mov %edx, %esi \n" " mov $0, %edx \n" " movl $"__stringify(__NR32_rt_sigaction)",%eax \n" " int $0x80 \n" " ret \n" " .popsection \n" " .code64"); extern char restore_rt_sigaction; /* * Call raw rt_sigaction syscall through int80 - so the ABI kernel choses * to deliver this signal would be i386. */ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) { int ret; /* * To be sure, that sigaction pointer lies under 4G, * coping it on the bottom of the stack. 
*/ memcpy(stack32, act, sizeof(rt_sigaction_t_compat)); asm volatile ("\t movl %%ebx,%%ebx\n" : :"b"(sig)); /* signum */ asm volatile ("\t movl %%ecx,%%ecx\n" : :"c"(stack32)); /* act */ asm volatile ("\t movl %%edx,%%edx\n" : :"d"(sizeof(act->rt_sa_mask))); call32_from_64(stack32 + PAGE_SIZE, &restore_rt_sigaction); asm volatile ("\t movl %%eax,%0\n" : "=r"(ret)); return ret; } criu-3.6/criu/arch/x86/sigaction_compat_pie.c000077700000000000000000000000001317335042600246422sigaction_compat.custar00rootroot00000000000000criu-3.6/criu/arch/x86/sigframe.c000066400000000000000000000015561317335042600166040ustar00rootroot00000000000000#include #include #include "asm/sigframe.h" #include "asm/types.h" #include "log.h" int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { /* * Use local sigframe to check native/compat type, * but set address for rsigframe. */ fpu_state_t *fpu_state = (sigframe->is_native) ? &rsigframe->native.fpu_state : &rsigframe->compat.fpu_state; if (sigframe->is_native) { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; if ((addr % 64ul)) { pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); return -1; } sigframe->native.uc.uc_mcontext.fpstate = (void *)addr; } else if (!sigframe->is_native) { sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; } return 0; } criu-3.6/criu/arch/x86/sys-exec-tbl.c000066400000000000000000000017741317335042600173300ustar00rootroot00000000000000#include static struct syscall_exec_desc sc_exec_table_64[] = { #include "sys-exec-tbl-64.c" { }, /* terminator */ }; #ifdef CONFIG_COMPAT static struct syscall_exec_desc sc_exec_table_32[] = { #include "sys-exec-tbl-32.c" { }, /* terminator */ }; #endif struct syscall_exec_desc; static inline struct syscall_exec_desc * find_syscall_table(char *name, struct syscall_exec_desc *tbl) { int i; for (i = 0; tbl[i].name != NULL; i++) if 
(!strcmp(tbl[i].name, name)) return &tbl[i]; return NULL; } #define ARCH_HAS_FIND_SYSCALL /* overwrite default to search in two tables above */ #ifdef CONFIG_COMPAT struct syscall_exec_desc * find_syscall(char *name, struct parasite_ctl *ctl) { if (compel_mode_native(ctl)) return find_syscall_table(name, sc_exec_table_64); else return find_syscall_table(name, sc_exec_table_32); } #else struct syscall_exec_desc * find_syscall(char *name, __always_unused struct parasite_ctl *ctl) { return find_syscall_table(name, sc_exec_table_64); } #endif criu-3.6/criu/arch/x86/vdso-pie.c000066400000000000000000000027021317335042600165270ustar00rootroot00000000000000#include #include "asm/types.h" #include #include #include "parasite-vdso.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX # undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static void insert_trampoline32(uintptr_t from, uintptr_t to) { struct { u8 movl; u32 imm32; u16 jmp_eax; u32 guards; } __packed jmp = { .movl = 0xb8, .imm32 = (uint32_t)to, .jmp_eax = 0xe0ff, .guards = 0xcccccccc, }; memcpy((void *)from, &jmp, sizeof(jmp)); } static void insert_trampoline64(uintptr_t from, uintptr_t to) { struct { u16 movabs; u64 imm64; u16 jmp_rax; u32 guards; } __packed jmp = { .movabs = 0xb848, .imm64 = to, .jmp_rax = 0xe0ff, .guards = 0xcccccccc, }; memcpy((void *)from, &jmp, sizeof(jmp)); } int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, struct vdso_symtable *sfrom, bool compat_vdso) { unsigned int i; for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { uintptr_t from, to; if (vdso_symbol_empty(&sfrom->symbols[i])) continue; pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, sto->symbols[i].offset, i); from = base_from + sfrom->symbols[i].offset; to = base_to + sto->symbols[i].offset; if (!compat_vdso) insert_trampoline64(from, to); else insert_trampoline32(from, to); } return 0; } 
criu-3.6/criu/autofs.c000066400000000000000000000601161317335042600147430ustar00rootroot00000000000000#include #include #include #include #include #include "int.h" #include "fdinfo.h" #include "autofs.h" #include "rst-malloc.h" #include "mount.h" #include "pstree.h" #include "namespaces.h" #include "protobuf.h" #include "pipes.h" #include "crtools.h" #include "util.h" #include "images/autofs.pb-c.h" #define AUTOFS_OPT_UNKNOWN INT_MIN #define AUTOFS_MODE_DIRECT 0 #define AUTOFS_MODE_INDIRECT 1 #define AUTOFS_MODE_OFFSET 2 #define AUTOFS_CATATONIC_FD -1 static int autofs_mnt_open(const char *mnt_path, dev_t devid); struct autofs_pipe_s { struct list_head list; unsigned long inode; }; struct list_head autofs_pipes = LIST_HEAD_INIT(autofs_pipes); bool is_autofs_pipe(unsigned long inode) { struct autofs_pipe_s *p; list_for_each_entry(p, &autofs_pipes, list) { if (p->inode == inode) return true; } return false; } static int autofs_gather_pipe(unsigned long inode) { struct autofs_pipe_s *pipe; pipe = xmalloc(sizeof(*pipe)); if (!pipe) return -1; pipe->inode = inode; list_add_tail(&pipe->list, &autofs_pipes); return 0; } int autofs_parse(struct mount_info *pm) { long pipe_ino = AUTOFS_OPT_UNKNOWN; char **opts; int nr_opts, i; split(pm->options, ',', &opts, &nr_opts); if (!opts) return -1; for (i = 0; i < nr_opts; i++) { if (!strncmp(opts[i], "pipe_ino=", strlen("pipe_ino="))) pipe_ino = atoi(opts[i] + strlen("pipe_ino=")); } for (i = 0; i < nr_opts; i++) xfree(opts[i]); free(opts); if (pipe_ino == AUTOFS_OPT_UNKNOWN) { pr_warn("Failed to find pipe_ino option (old kernel?)\n"); return 0; } return autofs_gather_pipe(pipe_ino); } static int autofs_check_fd_stat(struct stat *stat, int prgp, int fd, long ino, int *mode) { struct fdinfo_common fdinfo; if (!S_ISFIFO(stat->st_mode)) return 0; if (stat->st_ino != ino) return 0; if (parse_fdinfo_pid(prgp, fd, FD_TYPES__UND, &fdinfo)) return -1; *mode = fdinfo.flags & O_WRONLY; return 1; } static int autofs_kernel_pipe_alive(int 
pgrp, int fd, int ino) { struct stat buf; char *path; int ret, fd_mode; path = xsprintf("/proc/%d/fd/%d", pgrp, fd); if (!path) return -1; if (stat(path, &buf) < 0) { if (errno == ENOENT) return 0; pr_perror("Failed to stat %s", path); return -1; } xfree(path); ret = autofs_check_fd_stat(&buf, pgrp, fd, ino, &fd_mode); if (ret <= 0) return ret; return O_WRONLY == fd_mode; } static int autofs_find_pipe_read_end(int pgrp, long ino, int *read_fd) { DIR *dir; struct dirent *de; int ret = -1; dir = opendir_proc(pgrp, "fd"); if (dir == NULL) return -1; *read_fd = -1; while ((de = readdir(dir))) { struct stat buf; int found, mode, fd; if (dir_dots(de)) continue; if (fstatat(dirfd(dir), de->d_name, &buf, 0) < 0) { pr_perror("Failed to fstatat"); goto out; } ret = xatoi(de->d_name, &fd); if (ret) goto out; found = autofs_check_fd_stat(&buf, pgrp, fd, ino, &mode); if (found < 0) goto out; if (found && (mode == O_RDONLY)) { *read_fd = fd; break; } } ret = 0; out: closedir(dir); close_pid_proc(); return ret; } static int autofs_find_read_fd(int pgrp, long pipe_ino) { int read_fd, fd; /* We need to find read end and make sure, that it's empty */ if (autofs_find_pipe_read_end(pgrp, pipe_ino, &read_fd) < 0) { pr_err("Failed to find read pipe fd (ino %ld) " "in process %d\n", pipe_ino, pgrp); return -1; } if (read_fd == -1) { pr_err("Master %d doesn't have a read end of the pipe with " "inode %ld opened\n", pgrp, pipe_ino); pr_err("Abandoned mount or control was delegated to child?\n"); return -ENOENT; } /* Let's check, that read end is empty */ fd = open_proc(pgrp, "fd/%d", read_fd); if (fd < 0) return -1; if (fd_has_data(fd)) { pr_err("Process %d autofs pipe fd %d is not empty.\n", pgrp, read_fd); pr_err("Try again later.\n"); return -1; } close(fd); return read_fd; } static int parse_options(char *options, AutofsEntry *entry, long *pipe_ino) { char **opts; int nr_opts, i; entry->fd = AUTOFS_OPT_UNKNOWN; entry->timeout = AUTOFS_OPT_UNKNOWN; entry->minproto = AUTOFS_OPT_UNKNOWN; 
entry->maxproto = AUTOFS_OPT_UNKNOWN; entry->mode = AUTOFS_OPT_UNKNOWN; entry->pgrp = AUTOFS_OPT_UNKNOWN; entry->uid = AUTOFS_OPT_UNKNOWN; entry->gid = AUTOFS_OPT_UNKNOWN; *pipe_ino = AUTOFS_OPT_UNKNOWN; split(options, ',', &opts, &nr_opts); if (!opts) return -1; for (i = 0; i < nr_opts; i++) { char *opt = opts[i]; int err = 0; if (!strncmp(opt, "fd=", strlen("fd="))) err = xatoi(opt + strlen("fd="), &entry->fd); else if (!strncmp(opt, "pipe_ino=", strlen("pipe_ino="))) err = xatol(opt + strlen("pipe_ino="), pipe_ino); else if (!strncmp(opt, "pgrp=", strlen("pgrp="))) err = xatoi(opt + strlen("pgrp="), &entry->pgrp); else if (!strncmp(opt, "timeout=", strlen("timeout="))) err = xatoi(opt + strlen("timeout="), &entry->timeout); else if (!strncmp(opt, "minproto=", strlen("minproto="))) err = xatoi(opt + strlen("minproto="), &entry->minproto); else if (!strncmp(opt, "maxproto=", strlen("maxproto="))) err = xatoi(opt + strlen("maxproto="), &entry->maxproto); else if (!strcmp(opt, "indirect")) entry->mode = AUTOFS_MODE_INDIRECT; else if (!strcmp(opt, "offset")) entry->mode = AUTOFS_MODE_OFFSET; else if (!strcmp(opt, "direct")) entry->mode = AUTOFS_MODE_DIRECT; else if (!strncmp(opt, "uid=", strlen("uid="))) err = xatoi(opt + strlen("uid="), &entry->uid); else if (!strncmp(opt, "gid=", strlen("gid="))) err = xatoi(opt + strlen("gid="), &entry->gid); if (err) return -1; } for (i = 0; i < nr_opts; i++) xfree(opts[i]); xfree(opts); if (entry->fd == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find fd option\n"); return -1; } if (entry->pgrp == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find pgrp option\n"); return -1; } if (entry->timeout == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find timeout option\n"); return -1; } if (entry->minproto == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find minproto option\n"); return -1; } if (entry->maxproto == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find maxproto option\n"); return -1; } if (entry->mode == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find 
mode (direct,indirect,offset) option\n"); return -1; } if (*pipe_ino == AUTOFS_OPT_UNKNOWN) { pr_err("Failed to find pipe_ino option (old kernel?)\n"); return -1; } return 0; } static int autofs_revisit_options(struct mount_info *pm) { FILE *f; char *str; int ret = -ENOMEM; str = xmalloc(1024); if (!str) { return -ENOMEM; } f = fopen_proc(getpid(), "mountinfo"); if (!f) goto free_str; while (fgets(str, 1024, f)) { int mnt_id = -1; char *token; /* Removing '/n' */ str[strlen(str)-1] = '\0'; while ((token = strsep(&str, " ")) != NULL) { if (mnt_id == -1) { ret = xatoi(token, &mnt_id); if (ret) goto close_proc; if (mnt_id != pm->mnt_id) break; } else if (strstr(token, "pipe_ino=")) { ret = 0; free(pm->options); pm->options = xstrdup(token); if (!pm->options) pr_err("failed to duplicate string\n"); else ret = 0; goto close_proc; } } } pr_err("failed to find autofs mount with mnt_id %d\n", pm->mnt_id); ret = -ENOENT; close_proc: fclose(f); free_str: free(str); return ret; } /* * To access the mount point we have to set proper mount namespace. * But, unfortunatelly, we have to set proper pid namespace as well, * because otherwise autofs driver won't find the autofs master. */ static int access_autofs_mount(struct mount_info *pm) { const char *mnt_path = pm->mountpoint + 1; dev_t dev_id = pm->s_dev; int new_pid_ns = -1, old_pid_ns = -1; int old_mnt_ns; int autofs_mnt; int err = -1; int pid, status; /* * To be able to set proper pid namespace, we must open fd before * switching to the mount namespace. * The same applies to pid namespace fd to restore back. 
*/ new_pid_ns = open_proc(pm->nsid->ns_pid, "ns/pid"); if (new_pid_ns < 0) return -1; old_pid_ns = open_proc(PROC_SELF, "ns/pid"); if (old_pid_ns < 0) goto close_new_pid_ns; if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &old_mnt_ns)) { pr_err("failed to switch to mount namespace\n"); goto close_old_pid_ns; } err = restore_ns(new_pid_ns, &pid_ns_desc); new_pid_ns = -1; if (err) { pr_err("failed to restore pid namespace\n"); goto restore_mnt_ns; } autofs_mnt = autofs_mnt_open(mnt_path, dev_id); if (autofs_mnt < 0) goto restore_pid_ns; pid = fork(); switch (pid) { case -1: pr_err("failed to fork\n"); goto close_autofs_mnt; case 0: /* We don't care about results. * All we need is to "touch" */ openat(autofs_mnt, mnt_path, O_RDONLY|O_NONBLOCK|O_DIRECTORY); _exit(0); } /* Here we also don't care about results */ waitpid(pid, &status, 0); err = autofs_revisit_options(pm); close_autofs_mnt: close(autofs_mnt); restore_pid_ns: if (restore_ns(old_pid_ns, &pid_ns_desc)) { pr_err("failed to restore pid namespace\n"); err = -1; } old_pid_ns = -1; restore_mnt_ns: if (restore_ns(old_mnt_ns, &mnt_ns_desc)) { pr_err("failed to restore mount namespace\n"); err = -1; } close_old_pid_ns: if (old_pid_ns >= 0) close(old_pid_ns); close_new_pid_ns: if (new_pid_ns >= 0) close(new_pid_ns); return err; } static int autofs_create_entry(struct mount_info *pm, AutofsEntry *entry) { long pipe_ino; if (parse_options(pm->options, entry, &pipe_ino)) return -1; if (entry->uid != AUTOFS_OPT_UNKNOWN) entry->has_uid = true; if (entry->gid != AUTOFS_OPT_UNKNOWN) entry->has_gid = true; if (entry->fd != AUTOFS_CATATONIC_FD) { int found, read_fd, virt_pgrp; read_fd = autofs_find_read_fd(entry->pgrp, pipe_ino); if (read_fd < 0) { if (read_fd != -ENOENT) return -1; /* Ok, our read end doesn't exist. * There can be a case, when mount looks normal, but * it's a "hidden" or "abandoned" catatonic mount in * reality. 
* This can happen if: * 1) autofs master process has exited without switching * the mount to catatonic mode (or was killed). * 2) mount point was unmounted, but not propagated to * nested mount namespace with private mounts. * We can try handle these cases by accessing the mount * point. If it's catatonic, it will update it's * options, then we can read them again and dump it. */ if (access_autofs_mount(pm)) { pr_err("failed to access autofs %s\n", pm->mountpoint + 1); return -1; } if (parse_options(pm->options, entry, &pipe_ino)) return -1; if (entry->fd == AUTOFS_CATATONIC_FD) return 0; pr_err("Autofs %d is alive, but unreachable.\n", pm->mnt_id); return -1; } /* Let' check whether write end is still open */ found = autofs_kernel_pipe_alive(entry->pgrp, entry->fd, pipe_ino); if (found < 0) { pr_err("Failed to check fd %d in process %d\n", entry->fd, entry->pgrp); return -1; } /* Write end is absent. we need to carry read end to restore. */ if (!found) { entry->has_read_fd = true; entry->read_fd = read_fd; } /* We need to get virtual pgrp to restore mount */ virt_pgrp = pid_to_virt(entry->pgrp); if (!virt_pgrp) { pr_err("failed to find pstree item with pid %d\n", entry->pgrp); pr_err("Non-catatonic mount without master?\n"); return -1; } entry->pgrp = virt_pgrp; } return 0; } static int autofs_dump_entry(struct mount_info *pm, AutofsEntry *entry) { struct cr_img *img; int ret = -1; img = open_image(CR_FD_AUTOFS, O_DUMP, pm->s_dev); if (img) { ret = pb_write_one(img, entry, PB_AUTOFS); close_image(img); } return ret; } int autofs_dump(struct mount_info *pm) { AutofsEntry *entry; int err; entry = xmalloc(sizeof(*entry)); if (!entry) return -1; autofs_entry__init(entry); err = autofs_create_entry(pm, entry); if (err) goto free_entry; err = autofs_dump_entry(pm, entry); free_entry: free(entry); return err < 0 ? 
err : 0; } typedef struct autofs_info_s { struct pipe_info pi; AutofsEntry *entry; char *mnt_path; dev_t mnt_dev; struct mount_info *mi; struct pprep_head ph; } autofs_info_t; static int dup_pipe_info(struct pipe_info *pi, int flags, struct file_desc_ops *ops) { struct pipe_info *new; PipeEntry *pe; new = shmalloc(sizeof(*new)); if (!new) return -1; pe = shmalloc(sizeof(*pe)); if (!pe) return -1; pe->id = pi->pe->id; pe->pipe_id = pi->pe->pipe_id; pe->fown = pi->pe->fown; pe->flags = flags; if (collect_one_pipe_ops(new, &pe->base, ops) < 0) { pr_err("Failed to add pipe info for write end\n"); return -1; } return 0; } static int autofs_dup_pipe(struct pstree_item *task, struct fdinfo_list_entry *ple, int new_fd) { struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); unsigned flags = O_WRONLY; new_fd = find_unused_fd(task, new_fd); if (dup_pipe_info(pi, flags, pi->d.ops) < 0) { pr_err("Failed to dup pipe entry ID %#x PIPE_ID %#x\n", pi->pe->id, pi->pe->pipe_id); return -1; } if (dup_fle(task, ple, new_fd, flags) < 0) { pr_err("Failed to add fd %d to process %d\n", new_fd, vpid(task)); return -1; } pr_info("autofs: added pipe fd %d, flags %#x to %d\n", new_fd, flags, vpid(task)); return new_fd; } static int autofs_ioctl(const char *path, int fd, int cmd, const void *param) { int err; err = ioctl(fd, cmd, param); if (err) pr_perror("%s ioctl failed", path); return err; } static int autofs_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { char *path = "/dev/"AUTOFS_DEVICE_NAME; int fd, err; fd = open(path, O_RDONLY); if (fd == -1) { pr_perror("failed to open %s", path); return -1; } err = autofs_ioctl(path, fd, cmd, param); close(fd); return err; } static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) { pr_info("%s: set %s catatonic\n", __func__, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_CATATONIC, NULL); } static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { pr_info("%s: set timeout 
%ld for %s\n", __func__, timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } static int autofs_mnt_set_pipefd(const autofs_info_t *i, int mnt_fd) { struct autofs_dev_ioctl param; /* Restore pipe and pgrp only for non-cataonic mounts */ if (i->entry->fd == AUTOFS_CATATONIC_FD) return 0; pr_info("%s: set pipe fd %d (pgrp %d) for mount %s\n", __func__, i->entry->fd, getpgrp(), i->mnt_path); init_autofs_dev_ioctl(¶m); param.ioctlfd = mnt_fd; param.setpipefd.pipefd = i->entry->fd; return autofs_dev_ioctl(AUTOFS_DEV_IOCTL_SETPIPEFD, ¶m); } static int autofs_mnt_close(const char *mnt_path, int mnt_fd) { struct autofs_dev_ioctl param; pr_info("%s: closing fd %d for mount %s\n", __func__, mnt_fd, mnt_path); init_autofs_dev_ioctl(¶m); param.ioctlfd = mnt_fd; return autofs_dev_ioctl(AUTOFS_DEV_IOCTL_CLOSEMOUNT, ¶m); } static int autofs_mnt_open(const char *mnt_path, dev_t devid) { struct autofs_dev_ioctl *param; int err; size_t size, fd; pr_info("%s: open mount %s\n", __func__, mnt_path); size = sizeof(*param) + strlen(mnt_path) + 1; param = xmalloc(size); if (!param) return -1; init_autofs_dev_ioctl(param); param->size = size; strcpy(param->path, mnt_path); param->openmount.devid = devid; err = autofs_dev_ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT, param); fd = param->ioctlfd; free(param); if (err < 0) { pr_err("Failed to get %s fd (devid: %ld)\n", mnt_path, (long)devid); return -1; } return fd; } static int autofs_create_dentries(const struct mount_info *mi, char *mnt_path) { struct mount_info *c; list_for_each_entry(c, &mi->children, siblings) { char *path, *basename; basename = strrchr(c->mountpoint, '/'); if (!basename) { pr_info("%s: mount path \"%s\" doesn't have '/'\n", __func__, c->mountpoint); return -1; } path = xsprintf("%s%s", mnt_path, basename); if (!path) return -1; if (mkdir(path, 0555) < 0) { pr_perror("Failed to create autofs dentry %s", path); return -1; } free(path); } return 0; } static int autofs_populate_mount(const 
struct mount_info *mi, const AutofsEntry *entry) { if (entry->mode != AUTOFS_MODE_INDIRECT) return 0; return autofs_create_dentries(mi, mi->mountpoint); } static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout) { int mnt_fd; pr_info("%s: set timeout for %s and make it catatonic\n", __func__, mnt_path); mnt_fd = autofs_mnt_open(mnt_path, mnt_dev); if (mnt_fd < 0) { pr_err("Failed to open %s\n", mnt_path); return -1; } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { pr_err("Failed to set timeout %ld for %s\n", timeout, mnt_path); return -1; } if (autofs_mnt_make_catatonic(mnt_path, mnt_fd)) { pr_err("Failed to set %s catatonic\n", mnt_path); return -1; } if (autofs_mnt_close(mnt_path, mnt_fd) < 0) { pr_err("Failed to close %s\n", mnt_path); return -1; } return 0; } /* Here to fixup Autofs mount */ static int autofs_post_open(struct file_desc *d, int fd) { struct pipe_info *pi = container_of(d, struct pipe_info, d); autofs_info_t *i = container_of(pi, autofs_info_t, pi); int mnt_fd; pr_info("%s: restoring %s\n", __func__, i->mnt_path); mnt_fd = autofs_mnt_open(i->mnt_path, i->mnt_dev); if (mnt_fd < 0) { pr_err("Failed to open %s\n", i->mnt_path); return -1; } if (autofs_mnt_set_pipefd(i, mnt_fd)) { pr_err("Failed to set %s owner\n", i->mnt_path); return -1; } if (autofs_mnt_close(i->mnt_path, mnt_fd) < 0) { pr_err("Failed to close %s\n", i->mnt_path); return -1; } pr_info("autofs mount %s owner restored: pgrp=%d, fd=%d\n", i->mnt_path, getpgrp(), i->entry->fd); if (i->entry->has_read_fd) { pr_info("%s: pid %d, closing write end %d\n", __func__, getpid(), i->entry->fd); close(i->entry->fd); } pr_info("%s: pid %d, closing artificial pipe end %d\n", __func__, getpid(), fd); close(fd); return 0; } static autofs_info_t *autofs_create_info(const struct mount_info *mi, const struct file_desc *desc, const autofs_info_t *info) { autofs_info_t *i; i = shmalloc(sizeof(*i)); if (!i) return NULL; i->mnt_path = shmalloc(strlen(mi->ns_mountpoint) + 
1); if (!i->mnt_path) return NULL; /* Here we copy autofs dev_id and entry from private data to shared. * See autofs_mount(). */ i->entry = shmalloc(sizeof(*info->entry)); if (!i->entry) return NULL; memcpy(i->entry, info->entry, sizeof(*info->entry)); i->mnt_dev = info->mnt_dev; /* We need mountpoint to be able to opne mount in autofs_post_open() * callback. And this have to be internal path, because process cwd * will be changed already. That's why ns_mountpoint is used. */ strcpy(i->mnt_path, mi->ns_mountpoint); return i; } static struct fdinfo_list_entry *autofs_pipe_le(struct pstree_item *master, AutofsEntry *entry) { struct fdinfo_list_entry *ple; int pipe_fd = entry->fd; if (entry->has_read_fd) pipe_fd = entry->read_fd; ple = find_used_fd(master, pipe_fd); if (!ple) { pr_err("Failed to find pipe fd %d in process %d\n", pipe_fd, vpid(master)); return NULL; } if (ple->fe->type != FD_TYPES__PIPE) { pr_err("Fd %d in process %d is not a pipe: %d\n", pipe_fd, vpid(master), ple->fe->type); return NULL; } return ple; } static int autofs_open_pipefd(struct file_desc *d, int *new_fd) { struct fdinfo_list_entry *fle = file_master(d); int ret; if (fle->stage < FLE_OPEN) { ret = open_pipe(d, new_fd); if (ret != 0) return ret; set_fds_event(fle->pid); return 1; } return autofs_post_open(d, fle->fe->fd); } static int autofs_create_pipe(struct pstree_item *task, autofs_info_t *i, struct fdinfo_list_entry *ple) { struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); int fd = -1; FdinfoEntry *fe; unsigned flags = O_RDONLY; struct file_desc_ops *ops; PipeEntry *pe; fd = find_unused_fd(task, fd); ops = shmalloc(sizeof(*ops)); if (!ops) return -1; memcpy(ops, pi->d.ops, sizeof(*ops)); ops->open = autofs_open_pipefd; pe = shmalloc(sizeof(*pe)); if (!pe) return -1; pe->id = pi->pe->id; pe->pipe_id = pi->pe->pipe_id; pe->fown = pi->pe->fown; pe->flags = flags; if (collect_one_pipe_ops(&i->pi, &pe->base, ops) < 0) { pr_err("Failed to add pipe info for write end\n"); 
return -1; } fe = dup_fdinfo(ple->fe, fd, flags); if (!fe) return -1; pr_info("autofs: adding pipe fd %d, flags %#x to %d (with post_open)\n", fe->fd, fe->flags, vpid(task)); return collect_fd(vpid(task), fe, rsti(task), false); } static int autofs_add_mount_info(struct pprep_head *ph) { autofs_info_t *ai = container_of(ph, autofs_info_t, ph); struct mount_info *mi = ai->mi; autofs_info_t *info = mi->private; AutofsEntry *entry = info->entry; autofs_info_t *i; struct pstree_item *master; struct fdinfo_list_entry *ple; if (entry->fd == -1) /* Catatonic mounts have no owner. Keep them with init. */ master = pstree_item_by_virt(getpid()); else master = pstree_item_by_virt(entry->pgrp); BUG_ON(!master); ple = autofs_pipe_le(master, entry); if (!ple) return -1; if (entry->has_read_fd) { /* Original pipe write end was closed. * We need create one to be able to fixup AutoFS mount. */ entry->fd = autofs_dup_pipe(master, ple, entry->fd); if (entry->fd < 0) { pr_err("Failed to find free fd in process %d\n", vpid(master)); return -1; } } i = autofs_create_info(mi, ple->desc, info); if (!i) return -1; /* Another pipe descriptor is needed to call post_open callback */ if (autofs_create_pipe(master, i, ple)) return -1; mi->private = i; return 0; } static int autofs_restore_entry(struct mount_info *mi, AutofsEntry **entry) { struct cr_img *img; img = open_image(CR_FD_AUTOFS, O_RSTR, mi->s_dev); if (!img) return -1; if (empty_image(img)) { close_image(img); return -1; } if (pb_read_one_eof(img, entry, PB_AUTOFS) < 0) return -1; close_image(img); return 0; } int autofs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags) { AutofsEntry *entry; autofs_info_t *info; char *opts, *mode; int control_pipe[2], ret = -1; struct stat buf; if (autofs_restore_entry(mi, &entry) < 0) return -1; if (pipe(control_pipe) < 0) { pr_perror("Can't create pipe"); return -1; } mode = "direct"; if (entry->mode == AUTOFS_MODE_INDIRECT) mode = "indirect"; if 
(entry->mode == AUTOFS_MODE_OFFSET) mode = "offset"; opts = xsprintf("fd=%d,pgrp=%d,minproto=%d,maxproto=%d,%s", control_pipe[1], getpgrp(), entry->minproto, entry->maxproto, mode); if (opts && entry->has_uid) opts = xstrcat(opts, ",uid=%d", entry->uid); if (opts && entry->has_gid) opts = xstrcat(opts, ",gid=%d", entry->gid); if (!opts) { pr_err("Failed to create options string\n"); goto close_pipe; } pr_info("autofs: mounting to %s with options: \"%s\"\n", mi->mountpoint, opts); if (mount(source, mi->mountpoint, filesystemtype, mountflags, opts) < 0) { pr_perror("Failed to mount autofs to %s", mi->mountpoint); goto free_opts; } info = xmalloc(sizeof(*info)); if (!info) goto umount; info->entry = entry; /* We need autofs dev_id to be able to open direct mount point. * But we can't call stat in autofs_add_mount_info(), because autofs * mount can be overmounted. Thus we have to call it here. But shared * data is not ready yet. So, let's put in on mi->private and copy to * shared data in autofs_add_mount_info(). 
*/ if (stat(mi->mountpoint, &buf) < 0) { pr_perror("Failed to stat %s", mi->mountpoint); goto free_info; } info->mnt_dev = buf.st_dev; /* We need to create dentries for nested mounts */ ret = autofs_populate_mount(mi, entry); if (ret < 0) goto free_info; /* In case of catatonic mounts all we need as the function call below */ ret = autofs_post_mount(mi->mountpoint, buf.st_dev, entry->timeout); if (ret < 0) goto free_info; /* Otherwise we have to add shared object creation callback */ if (entry->fd != AUTOFS_CATATONIC_FD) { info->ph.actor = autofs_add_mount_info; add_post_prepare_cb(&info->ph); } info->mi = mi; mi->private = info; free_opts: free(opts); close_pipe: close(control_pipe[1]); close(control_pipe[0]); return ret; free_info: free(info); umount: if (umount(mi->mountpoint) < 0) pr_perror("Failed to umount %s", mi->mountpoint); goto close_pipe; } criu-3.6/criu/bfd.c000066400000000000000000000124111317335042600141700ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "int.h" #include "log.h" #include "common/bug.h" #include "bfd.h" #include "common/list.h" #include "util.h" #include "xmalloc.h" #include "page.h" #undef LOG_PREFIX #define LOG_PREFIX "bfd: " /* * Kernel doesn't produce more than one page of * date per one read call on proc files. 
*/ #define BUFSIZE (PAGE_SIZE) struct bfd_buf { char *mem; struct list_head l; }; static LIST_HEAD(bufs); #define BUFBATCH (16) static int buf_get(struct xbuf *xb) { struct bfd_buf *b; if (list_empty(&bufs)) { void *mem; int i; mem = mmap(NULL, BUFBATCH * BUFSIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); if (mem == MAP_FAILED) { pr_perror("No buf"); return -1; } for (i = 0; i < BUFBATCH; i++) { b = xmalloc(sizeof(*b)); if (!b) { if (i == 0) { pr_err("No buffer for bfd\n"); return -1; } pr_warn("BFD buffers partial refil!\n"); break; } b->mem = mem + i * BUFSIZE; list_add_tail(&b->l, &bufs); } } b = list_first_entry(&bufs, struct bfd_buf, l); list_del_init(&b->l); xb->mem = b->mem; xb->data = xb->mem; xb->sz = 0; xb->buf = b; return 0; } static void buf_put(struct xbuf *xb) { /* * Don't unmap buffer back, it will get reused * by next bfdopen call */ list_add(&xb->buf->l, &bufs); xb->buf = NULL; xb->mem = NULL; xb->data = NULL; } static int bfdopen(struct bfd *f, bool writable) { if (buf_get(&f->b)) { close(f->fd); return -1; } f->writable = writable; return 0; } int bfdopenr(struct bfd *f) { return bfdopen(f, false); } int bfdopenw(struct bfd *f) { return bfdopen(f, true); } static int bflush(struct bfd *bfd); static bool flush_failed = false; int bfd_flush_images(void) { return flush_failed ? -1 : 0; } void bclose(struct bfd *f) { if (bfd_buffered(f)) { if (f->writable && bflush(f) < 0) { /* * This is to propagate error up. It's * hardly possible by returning and * checking it, but setting a static * flag, failing further bfdopen-s and * checking one at the end would work. 
*/ flush_failed = true; pr_perror("Error flushing image"); } buf_put(&f->b); } close_safe(&f->fd); } static int brefill(struct bfd *f) { int ret; struct xbuf *b = &f->b; memmove(b->mem, b->data, b->sz); b->data = b->mem; ret = read(f->fd, b->mem + b->sz, BUFSIZE - b->sz); if (ret < 0) { pr_perror("Error reading file"); return -1; } if (ret == 0) return 0; b->sz += ret; return 1; } static char *strnchr(char *str, unsigned int len, char c) { while (len > 0 && *str != c) { str++; len--; } return len == 0 ? NULL : str; } char *breadline(struct bfd *f) { return breadchr(f, '\n'); } char *breadchr(struct bfd *f, char c) { struct xbuf *b = &f->b; bool refilled = false; char *n; unsigned int ss = 0; again: n = strnchr(b->data + ss, b->sz - ss, c); if (n) { char *ret; ret = b->data; b->data = n + 1; /* skip the \n found */ *n = '\0'; b->sz -= (b->data - ret); return ret; } if (refilled) { if (!b->sz) return NULL; /* * Last bytes may lack the \n at the * end, need to report this as full * line anyway */ b->data[b->sz] = '\0'; /* * The b->data still points to old data, * but we say that no bytes left there * so next call to breadline will not * "find" these bytes again. */ b->sz = 0; return b->data; } /* * small optimization -- we've scanned b->sz * symols already, no need to re-scan them after * the buffer refill. 
*/ ss = b->sz; /* no full line in the buffer -- refill one */ if (brefill(f) < 0) return ERR_PTR(-EIO); refilled = true; goto again; } static int bflush(struct bfd *bfd) { struct xbuf *b = &bfd->b; int ret; if (!b->sz) return 0; ret = write(bfd->fd, b->data, b->sz); if (ret != b->sz) return -1; b->sz = 0; return 0; } static int __bwrite(struct bfd *bfd, const void *buf, int size) { struct xbuf *b = &bfd->b; if (b->sz + size > BUFSIZE) { int ret; ret = bflush(bfd); if (ret < 0) return ret; } if (size > BUFSIZE) return write(bfd->fd, buf, size); memcpy(b->data + b->sz, buf, size); b->sz += size; return size; } int bwrite(struct bfd *bfd, const void *buf, int size) { if (!bfd_buffered(bfd)) return write(bfd->fd, buf, size); return __bwrite(bfd, buf, size); } int bwritev(struct bfd *bfd, const struct iovec *iov, int cnt) { int i, written = 0; if (!bfd_buffered(bfd)) return writev(bfd->fd, iov, cnt); for (i = 0; i < cnt; i++) { int ret; ret = __bwrite(bfd, (const void *)iov[i].iov_base, iov[i].iov_len); if (ret < 0) return ret; written += ret; if (ret < iov[i].iov_len) break; } return written; } int bread(struct bfd *bfd, void *buf, int size) { struct xbuf *b = &bfd->b; int more = 1, filled = 0; if (!bfd_buffered(bfd)) return read(bfd->fd, buf, size); while (more > 0) { int chunk; chunk = size - filled; if (chunk > b->sz) chunk = b->sz; if (chunk) { memcpy(buf + filled, b->data, chunk); b->data += chunk; b->sz -= chunk; filled += chunk; } if (filled < size) more = brefill(bfd); else { BUG_ON(filled > size); more = 0; } } return more < 0 ? more : filled; } criu-3.6/criu/bitmap.c000066400000000000000000000025241317335042600147150ustar00rootroot00000000000000#include "common/bitsperlong.h" #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define BITMAP_FIRST_WORD_MASK(start) (~0ul << ((start) % BITS_PER_LONG)) #define BITMAP_LAST_WORD_MASK(nbits) \ ( \ ((nbits) % BITS_PER_LONG) ? 
\ (1ul << ((nbits) % BITS_PER_LONG)) - 1 : ~0ul \ ) #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) void bitmap_set(unsigned long *map, int start, int nr) { unsigned long *p = map + BIT_WORD(start); const int size = start + nr; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); while (nr - bits_to_set >= 0) { *p |= mask_to_set; nr -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } if (nr) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } void bitmap_clear(unsigned long *map, int start, int nr) { unsigned long *p = map + BIT_WORD(start); const int size = start + nr; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); while (nr - bits_to_clear >= 0) { *p &= ~mask_to_clear; nr -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } if (nr) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } } criu-3.6/criu/cgroup-props.c000066400000000000000000000276321317335042600161100ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "int.h" #include "common/compiler.h" #include "cgroup-props.h" #include "cr_options.h" #include "config.h" #include "xmalloc.h" #include "string.h" #include "util.h" #include "common/list.h" #include "log.h" #include "common/bug.h" #undef LOG_PREFIX #define LOG_PREFIX "cg-prop: " enum { CGP_MERGE, CGP_REPLACE, }; static const char *____criu_global_props____[] = { "cgroup.clone_children", "notify_on_release", "cgroup.procs", "tasks", }; cgp_t cgp_global = { .name = "____criu_global_props____", .nr_props = ARRAY_SIZE(____criu_global_props____), .props = ____criu_global_props____, }; typedef struct { struct list_head list; cgp_t cgp; } cgp_list_entry_t; static LIST_HEAD(cgp_list); static void cgp_free(cgp_list_entry_t *p) { size_t i; if 
(p) { for (i = 0; i < p->cgp.nr_props; i++) xfree((void *)p->cgp.props[i]); xfree((void *)p->cgp.name); xfree((void *)p->cgp.props); xfree(p); } } static int cgp_merge_props(cgp_list_entry_t *d, cgp_list_entry_t *s) { size_t nr_props, i, j; nr_props = d->cgp.nr_props + s->cgp.nr_props; if (xrealloc_safe(&d->cgp.props, nr_props * sizeof(char *))) return -ENOMEM; /* * FIXME: Check for duplicates in propties? */ for (i = d->cgp.nr_props, j = 0; i < nr_props; i++, j++) { d->cgp.props[i] = xstrdup(s->cgp.props[j]); if (!d->cgp.props[i]) return -ENOMEM; d->cgp.nr_props++; } return 0; } static int cgp_handle_props(cgp_list_entry_t **p, int strategy) { cgp_list_entry_t *s = *p; cgp_list_entry_t *t; list_for_each_entry(t, &cgp_list, list) { if (strcmp(t->cgp.name, s->cgp.name)) continue; pr_debug("%s \"%s\" controller properties\n", strategy == CGP_MERGE ? "Merging" : "Replacing", s->cgp.name); if (strategy == CGP_MERGE) { int ret; ret = cgp_merge_props(t, s); cgp_free(s); *p = NULL; return ret; } else if (strategy == CGP_REPLACE) { /* * Simply drop out previous instance. */ list_del(&t->list); cgp_free(t); break; } else BUG(); } /* * New controller, simply add it. */ list_add(&s->list, &cgp_list); *p = NULL; return 0; } static char *skip_spaces(char **stream, size_t *len) { if (stream && *len) { char *p = *stream; while (p && *len && *p == ' ') p++, (*len)--; if (p != *stream) *stream = p; return p; } return NULL; } static bool eat_symbol(char **stream, size_t *len, char sym, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL); if (!p || *p != sym || !*len) return false; (*stream) = p + 1; (*len)--; return true; } static bool eat_symbols(char **stream, size_t *len, char *syms, size_t n_syms, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? 
*stream : NULL); size_t i; if (p && *len) { char *stream_orig = *stream; size_t len_orig = *len; for (i = 0; i < n_syms; i++) { if (!eat_symbol(stream, len, syms[i], false)) { *stream = stream_orig; *len = len_orig; goto nomatch; } } return true; } nomatch: return false; } static bool eat_word(char **stream, size_t *len, char *word, size_t word_len, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL); if (p && *len >= word_len) { if (!strncmp(p, word, word_len)) { (*stream) += word_len; (*len) -= word_len; return true; } } return false; } static char *get_quoted(char **stream, size_t *len, bool skip_ws) { char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL); char *from = p + 1; char *dst; if (!p || *p != '\"') return NULL; for (p = from, (*len)--; (*len); p++, (*len)--) { if (*p == '\"') { if (p == from) break; dst = xmalloc(p - from + 1); if (!dst) break; memcpy(dst, from, p - from); dst[p - from] = '\0'; (*stream) = p + 1; (*len)--; return dst; } } return NULL; } static int cgp_parse_stream(char *stream, size_t len) { cgp_list_entry_t *cgp_entry = NULL; int strategy; int ret = 0; char *p; /* * We expect the following format here * (very simplified YAML!) * * "cpu": * - "strategy": "replace" * - "properties": ["cpu.shares", "cpu.cfs_period_us"] * "memory": * - "strategy": "merge" * - "properties": ["memory.limit_in_bytes", "memory.memsw.limit_in_bytes"] * * and etc. */ while (len) { /* * Controller name. 
*/ p = get_quoted(&stream, &len, false); if (!p) { pr_err("Expecting controller name\n"); goto err_parse; } pr_info("Parsing controller \"%s\"\n", p); cgp_entry = xzalloc(sizeof(*cgp_entry)); if (cgp_entry) { INIT_LIST_HEAD(&cgp_entry->list); cgp_entry->cgp.name = p; } else { pr_err("Can't allocate memory for controller %s\n", p); xfree(p); return -ENOMEM; } if (!eat_symbols(&stream, &len, ":\n - ", 5, true)) { pr_err("Expected \':\\n - \' sequence controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_word(&stream, &len, "\"strategy\":", 11, true)) { pr_err("Expected \'stategy:\' keyword in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } p = get_quoted(&stream, &len, true); if (!p) { pr_err("Expected strategy in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; }; if (!strcmp(p, "merge")) { strategy = CGP_MERGE; } else if (!strcmp(p, "replace")) { strategy = CGP_REPLACE; } else { pr_err("Unknown strategy \"%s\" in controller's %s stream\n", p, cgp_entry->cgp.name); xfree(p); goto err_parse; } pr_info("\tStrategy \"%s\"\n", p); xfree(p); if (!eat_symbols(&stream, &len, "\n - ", 4, true)) { pr_err("Expected \':\\n - \' sequence controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_word(&stream, &len, "\"properties\":", 13, true)) { pr_err("Expected \"properties:\" keyword in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_symbol(&stream, &len, '[', true)) { pr_err("Expected \'[\' sequence controller's %s properties stream\n", cgp_entry->cgp.name); goto err_parse; } while ((p = get_quoted(&stream, &len, true))) { if (!p) { pr_err("Expected property name for controller %s\n", cgp_entry->cgp.name); goto err_parse; } if (xrealloc_safe(&cgp_entry->cgp.props, (cgp_entry->cgp.nr_props + 1) * sizeof(char *))) { pr_err("Can't allocate property for controller %s\n", cgp_entry->cgp.name); goto err_parse; } cgp_entry->cgp.props[cgp_entry->cgp.nr_props++] = p; 
pr_info("\tProperty \"%s\"\n", p); if (!eat_symbol(&stream, &len, ',', true)) { if (stream[0] == ']') { stream++, len--; break; } pr_err("Expected ']' in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } } if (cgp_entry->cgp.nr_props == 0 && !eat_symbol(&stream, &len, ']', true)) { pr_err("Expected ']' in empty property list for %s\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_symbol(&stream, &len, '\n', true) && len) { pr_err("Expected \'\\n\' symbol in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (cgp_handle_props(&cgp_entry, strategy)) goto err_parse; cgp_entry = NULL; } ret = 0; out: return ret; err_parse: cgp_free(cgp_entry); ret = -EINVAL; goto out; } static int cgp_parse_file(char *path) { void *mem = MAP_FAILED; int fd = -1, ret = -1; struct stat st; fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("Can't open file %s", path); goto err; } if (fstat(fd, &st)) { pr_perror("Can't stat file %s", path); goto err; } mem = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap file %s", path); goto err; } if (cgp_parse_stream(mem, st.st_size)) { pr_err("Failed to parse file `%s'\n", path); goto err; } ret = 0; err: if (mem != MAP_FAILED) munmap(mem, st.st_size); close_safe(&fd); return ret; } static int cgp_parse_builtins(void) { static const char predefined_stream[] = "\"cpu\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"cpu.shares\", " "\"cpu.cfs_period_us\", " "\"cpu.cfs_quota_us\", " "\"cpu.rt_period_us\", " "\"cpu.rt_runtime_us\" " "]\n" /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */ "\"memory\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"memory.limit_in_bytes\", " "\"memory.memsw.limit_in_bytes\", " "\"memory.swappiness\", " "\"memory.soft_limit_in_bytes\", " "\"memory.move_charge_at_immigrate\", " "\"memory.oom_control\", " "\"memory.use_hierarchy\", " 
"\"memory.kmem.limit_in_bytes\", " "\"memory.kmem.tcp.limit_in_bytes\" " "]\n" /* * cpuset.cpus and cpuset.mems must be set before the process moves * into its cgroup; they are "initialized" below to whatever the root * values are in copy_special_cg_props so as not to cause ENOSPC when * values are restored via this code. */ "\"cpuset\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"cpuset.cpus\", " "\"cpuset.mems\", " "\"cpuset.memory_migrate\", " "\"cpuset.cpu_exclusive\", " "\"cpuset.mem_exclusive\", " "\"cpuset.mem_hardwall\", " "\"cpuset.memory_spread_page\", " "\"cpuset.memory_spread_slab\", " "\"cpuset.sched_load_balance\", " "\"cpuset.sched_relax_domain_level\" " "]\n" "\"blkio\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"blkio.weight\" " "]\n" "\"freezer\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "]\n" "\"perf_event\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "]\n" "\"net_cls\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"net_cls.classid\" " "]\n" "\"net_prio\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"net_prio.ifpriomap\" " "]\n" "\"pids\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"pids.max\" " "]\n" "\"devices\":\n" " - \"strategy\": \"replace\"\n" " - \"properties\": " "[ " "\"devices.list\" " "]\n"; return cgp_parse_stream((void *)predefined_stream, strlen(predefined_stream)); } int cgp_init(char *stream, size_t len, char *path) { int ret; ret = cgp_parse_builtins(); if (ret) goto err; if (stream && len) { ret = cgp_parse_stream(stream, len); if (ret) goto err; } if (path) ret = cgp_parse_file(path); err: return ret; } static char **dump_controllers; static size_t nr_dump_controllers; bool cgp_add_dump_controller(const char *name) { if (xrealloc_safe(&dump_controllers, (nr_dump_controllers + 1) * sizeof(char *))) { pr_err("Can't add controller \"%s\" to mark\n", name); return false; } 
dump_controllers[nr_dump_controllers] = xstrdup(name); if (!dump_controllers[nr_dump_controllers]) return false; pr_debug("Mark controller \"%s\" to dump\n", name); nr_dump_controllers++; return true; } bool cgp_should_skip_controller(const char *name) { size_t i; /* * Dump all by default. */ if (!nr_dump_controllers) return false; for (i = 0; i < nr_dump_controllers; i++) { if (!strcmp(name, dump_controllers[i])) return false; } return true; } const cgp_t *cgp_get_props(const char *name) { cgp_list_entry_t *p; list_for_each_entry(p, &cgp_list, list) { if (!strcmp(p->cgp.name, name)) return &p->cgp; } return NULL; } void cgp_fini(void) { cgp_list_entry_t *p, *t; size_t i; list_for_each_entry_safe(p, t, &cgp_list, list) cgp_free(p); INIT_LIST_HEAD(&cgp_list); for (i = 0; i < nr_dump_controllers; i++) xfree(dump_controllers[i]); xfree(dump_controllers); nr_dump_controllers = 0; } criu-3.6/criu/cgroup.c000066400000000000000000001206361317335042600147450ustar00rootroot00000000000000#define LOG_PREFIX "cg: " #include #include #include #include #include #include #include #include #include #include "common/list.h" #include "xmalloc.h" #include "cgroup.h" #include "cgroup-props.h" #include "cr_options.h" #include "pstree.h" #include "criu-log.h" #include "util.h" #include "imgset.h" #include "util-pie.h" #include "namespaces.h" #include "seize.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/cgroup.pb-c.h" /* * This structure describes set of controller groups * a task lives in. The cg_ctl entries are stored in * the @ctls list sorted by the .name field and then * by the .path field. 
*/ struct cg_set { u32 id; struct list_head l; unsigned int n_ctls; struct list_head ctls; }; static LIST_HEAD(cg_sets); static unsigned int n_sets; static CgSetEntry **rst_sets; static unsigned int n_controllers; static CgControllerEntry **controllers; static char *cg_yard; static struct cg_set *root_cgset; /* Set root item lives in */ static struct cg_set *criu_cgset; /* Set criu process lives in */ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; static CgSetEntry *find_rst_set_by_id(u32 id) { int i; for (i = 0; i < n_sets; i++) if (rst_sets[i]->id == id) return rst_sets[i]; return NULL; } #define CGCMP_MATCH 1 /* check for exact match */ #define CGCMP_ISSUB 2 /* check set is subset of ctls */ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what) { struct list_head *l1 = &set->ctls, *l2 = ctls; while (1) { struct cg_ctl *c1 = NULL, *c2 = NULL; if (l1->next != &set->ctls) c1 = list_first_entry(l1, struct cg_ctl, l); if (l2->next != ctls) c2 = list_first_entry(l2, struct cg_ctl, l); if (!c1 || !c2) /* Nowhere to move next */ return !c1 && !c2; /* Both lists scanned -- match */ if (strcmp(c1->name, c2->name)) return false; switch (what) { case CGCMP_MATCH: /* must have the same cgns prefix to be considered equal */ if (c1->cgns_prefix != c2->cgns_prefix) return false; if (strcmp(c1->path, c2->path)) return false; break; case CGCMP_ISSUB: if (!strstartswith(c1->path, c2->path)) return false; break; } l1 = l1->next; l2 = l2->next; } } static int collect_cgroups(struct list_head *ctls); static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls, bool collect) { struct cg_set *cs; list_for_each_entry(cs, &cg_sets, l) if (cg_set_compare(cs, ctls, CGCMP_MATCH)) { pr_debug(" `- Existing css %d found\n", cs->id); put_ctls(ctls); return cs; } pr_debug(" `- New css ID %d\n", cg_set_ids); cs = xmalloc(sizeof(*cs)); if (cs) { cs->id = cg_set_ids++; INIT_LIST_HEAD(&cs->ctls); list_splice_init(ctls, 
&cs->ctls); cs->n_ctls = n_ctls; list_add_tail(&cs->l, &cg_sets); n_sets++; if (!pr_quelled(LOG_DEBUG)) { struct cg_ctl *ctl; list_for_each_entry(ctl, &cs->ctls, l) pr_debug(" `- [%s] -> [%s] [%u]\n", ctl->name, ctl->path, ctl->cgns_prefix); } if (collect && collect_cgroups(&cs->ctls)) { list_del(&cs->l); n_sets--; put_ctls(&cs->ctls); xfree(cs); return NULL; } } return cs; } struct cg_controller *new_controller(const char *name) { struct cg_controller *nc = xmalloc(sizeof(*nc)); if (!nc) return NULL; nc->controllers = xmalloc(sizeof(char *)); if (!nc->controllers) { xfree(nc); return NULL; } nc->controllers[0] = xstrdup(name); if (!nc->controllers[0]) { xfree(nc->controllers); xfree(nc); return NULL; } nc->n_controllers = 1; nc->n_heads = 0; INIT_LIST_HEAD(&nc->heads); return nc; } int parse_cg_info(void) { if (collect_controllers(&cgroups, &n_cgroups) < 0) return -1; return 0; } /* Check that co-mounted controllers from /proc/cgroups (e.g. cpu and cpuacct) * are contained in a comma separated string (e.g. from /proc/self/cgroup or * mount options). 
*/ static bool cgroup_contains(char **controllers, unsigned int n_controllers, char *name) { unsigned int i; bool all_match = true; for (i = 0; i < n_controllers; i++) { bool found = false; const char *loc = name; do { loc = strstr(loc, controllers[i]); if (loc) { loc += strlen(controllers[i]); switch (*loc) { case '\0': case ',': found = true; break; } } } while (loc); all_match &= found; } return all_match && n_controllers > 0; } /* This is for use in add_cgroup() as additional arguments for the ftw() * callback */ static struct cg_controller *current_controller; static unsigned int path_pref_len; #define EXACT_MATCH 0 #define PARENT_MATCH 1 #define NO_MATCH 2 static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir **rdir) { struct cgroup_dir *d; list_for_each_entry(d, dirs, siblings) { if (strcmp(d->path, path) == 0) { *rdir = d; return EXACT_MATCH; } if (strstartswith(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; return PARENT_MATCH; } return ret; } } return NO_MATCH; } /* * Strips trailing '\n' from the string */ static inline char *strip(char *str) { char *e; e = strchr(str, '\0'); if (e != str && *(e - 1) == '\n') *(e - 1) = '\0'; return str; } /* * Currently this function only supports properties that have a string value * under 1024 chars. 
*/ static int read_cgroup_prop(struct cgroup_prop *property, const char *fullpath) { char buf[1024]; int fd, ret; struct stat sb; fd = open(fullpath, O_RDONLY); if (fd == -1) { property->value = NULL; pr_perror("Failed opening %s", fullpath); return -1; } if (fstat(fd, &sb) < 0) { pr_perror("failed statting cgroup prop %s", fullpath); close(fd); return -1; } property->mode = sb.st_mode; property->uid = sb.st_uid; property->gid = sb.st_gid; /* skip dumping the value of these, since it doesn't make sense (we * just want to restore the perms) */ if (!strcmp(property->name, "cgroup.procs") || !strcmp(property->name, "tasks")) { ret = 0; /* libprotobuf segfaults if we leave a null pointer in a * string, so let's not do that */ property->value = xstrdup(""); if (!property->value) ret = -1; close(fd); return ret; } ret = read(fd, buf, sizeof(buf) - 1); if (ret == -1) { pr_err("Failed scanning %s\n", fullpath); close(fd); return -1; } close(fd); buf[ret] = 0; if (strtoll(buf, NULL, 10) == LLONG_MAX) strcpy(buf, "-1"); property->value = xstrdup(strip(buf)); if (!property->value) return -1; return 0; } static struct cgroup_prop *create_cgroup_prop(const char *name) { struct cgroup_prop *property; property = xmalloc(sizeof(*property)); if (!property) return NULL; property->name = xstrdup(name); if (!property->name) { xfree(property); return NULL; } property->value = NULL; return property; } static void free_cgroup_prop(struct cgroup_prop *prop) { xfree(prop->name); xfree(prop->value); xfree(prop); } static void free_all_cgroup_props(struct cgroup_dir *ncd) { struct cgroup_prop *prop, *t; list_for_each_entry_safe(prop, t, &ncd->properties, list) { list_del(&prop->list); free_cgroup_prop(prop); } INIT_LIST_HEAD(&ncd->properties); ncd->n_properties = 0; } static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp) { int j; char buf[PATH_MAX]; struct cgroup_prop *prop; for (j = 0; cgp && j < cgp->nr_props; j++) { if (snprintf(buf, PATH_MAX, 
"%s/%s", fpath, cgp->props[j]) >= PATH_MAX) { pr_err("snprintf output was truncated\n"); return -1; } if (access(buf, F_OK) < 0 && errno == ENOENT) { pr_info("Couldn't open %s. This cgroup property may not exist on this kernel\n", buf); continue; } prop = create_cgroup_prop(cgp->props[j]); if (!prop) { free_all_cgroup_props(ncd); return -1; } if (read_cgroup_prop(prop, buf) < 0) { free_cgroup_prop(prop); free_all_cgroup_props(ncd); return -1; } if (!strcmp("memory.oom_control", cgp->props[j])) { char *new; int disable; if (sscanf(prop->value, "oom_kill_disable %d\n", &disable) != 1) { pr_err("couldn't scan oom state from %s\n", prop->value); free_cgroup_prop(prop); free_all_cgroup_props(ncd); return -1; } if (asprintf(&new, "%d", disable) < 0) { pr_err("couldn't aloocate new oom value\n"); free_cgroup_prop(prop); free_all_cgroup_props(ncd); return -1; } xfree(prop->value); prop->value = new; } pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; } return 0; } static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, struct cg_controller *controller) { int i; for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); if (dump_cg_props_array(fpath, ncd, cgp) < 0) { pr_err("dumping known properties failed\n"); return -1; } if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { pr_err("dumping global properties failed\n"); return -1; } } return 0; } static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag) { struct cgroup_dir *ncd = NULL, *match; int exit_code = -1; if (typeflag == FTW_D) { int mtype; pr_info("adding cgroup %s\n", fpath); ncd = xmalloc(sizeof(*ncd)); if (!ncd) goto out; ncd->mode = sb->st_mode; ncd->uid = sb->st_uid; ncd->gid = sb->st_gid; /* chop off the first "/proc/self/fd/N" str */ if (fpath[path_pref_len] == '\0') ncd->path = xstrdup("/"); else ncd->path = xstrdup(fpath 
+ path_pref_len); if (!ncd->path) goto out; mtype = find_dir(ncd->path, ¤t_controller->heads, &match); switch (mtype) { /* ignore co-mounted cgroups and already dumped cgroups */ case EXACT_MATCH: exit_code = 0; goto out; case PARENT_MATCH: list_add_tail(&ncd->siblings, &match->children); match->n_children++; break; case NO_MATCH: list_add_tail(&ncd->siblings, ¤t_controller->heads); current_controller->n_heads++; break; default: BUG(); } INIT_LIST_HEAD(&ncd->children); ncd->n_children = 0; INIT_LIST_HEAD(&ncd->properties); ncd->n_properties = 0; if (add_cgroup_properties(fpath, ncd, current_controller) < 0) { list_del(&ncd->siblings); if (mtype == PARENT_MATCH) match->n_children--; else if (mtype == NO_MATCH) current_controller->n_heads--; goto out; } } return 0; out: if (ncd) xfree(ncd->path); xfree(ncd); return exit_code; } static int add_freezer_state(struct cg_controller *controller) { struct cgroup_dir *it; /* There is one more case, that cgroup namespaces might * generate "multiple" heads if nothing is actually in the * root freezer cgroup, e.g. --freeze-cgroup=/lxc/foo and all * tasks in either /lxc/foo/a or /lxc/foo/b. * * In this case */ list_for_each_entry(it, &controller->heads, siblings) { struct cgroup_dir *cg_head; struct cgroup_prop *prop; cg_head = list_first_entry(&controller->heads, struct cgroup_dir, siblings); prop = create_cgroup_prop("freezer.state"); if (!prop) return -1; prop->value = xstrdup(get_real_freezer_state()); if (!prop->value) { free_cgroup_prop(prop); return -1; } list_add_tail(&prop->list, &cg_head->properties); cg_head->n_properties++; } return 0; } static int collect_cgroups(struct list_head *ctls) { struct cg_ctl *cc; int ret = 0; int fd = -1; list_for_each_entry(cc, ctls, l) { char path[PATH_MAX], mopts[1024], *root; char prefix[] = ".criu.cgmounts.XXXXXX"; struct cg_controller *cg; struct cg_root_opt *o; current_controller = NULL; /* We should get all the "real" (i.e. 
not name=systemd type) * controller from parse_cgroups(), so find that controller if * it exists. */ list_for_each_entry(cg, &cgroups, l) { if (cgroup_contains(cg->controllers, cg->n_controllers, cc->name)) { current_controller = cg; break; } } if (!current_controller) { /* only allow "fake" controllers to be created this way */ if (!strstartswith(cc->name, "name=")) { pr_err("controller %s not found\n", cc->name); return -1; } else { struct cg_controller *nc; nc = new_controller(cc->name); if (!nc) return -1; list_add_tail(&nc->l, &cg->l); n_cgroups++; current_controller = nc; } } if (!opts.manage_cgroups) continue; if (strstartswith(cc->name, "name=")) snprintf(mopts, sizeof(mopts), "none,%s", cc->name); else snprintf(mopts, sizeof(mopts), "%s", cc->name); if (mkdtemp(prefix) == NULL) { pr_perror("can't make dir for cg mounts"); return -1; } if (mount("none", prefix, "cgroup", 0, mopts) < 0) { pr_perror("couldn't mount %s", mopts); rmdir(prefix); return -1; } fd = open_detach_mount(prefix); if (fd < 0) return -1; path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); root = cc->path; if (opts.new_global_cg_root) root = opts.new_global_cg_root; list_for_each_entry(o, &opts.new_cgroup_roots, node) { if (!strcmp(cc->name, o->controller)) root = o->newroot; } snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", root); ret = ftw(path, add_cgroup, 4); if (ret < 0) pr_perror("failed walking %s for empty cgroups", path); close_safe(&fd); if (ret < 0) return ret; if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") && add_freezer_state(current_controller)) return -1; } return 0; } int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) { int pid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; if (item) pid = item->pid->real; else pid = getpid(); pr_info("Dumping cgroups for %d\n", pid); if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) return -1; cs = get_cg_set(&ctls, n_ctls, item); if (!cs) 
return -1; if (!item) { BUG_ON(criu_cgset); criu_cgset = cs; pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { BUG_ON(root_cgset); root_cgset = cs; pr_info("Set %d is root one\n", cs->id); } else { struct cg_ctl *root, *stray; BUG_ON(!root_cgset); pr_info("Set %d is a stray\n", cs->id); /* Copy the cgns prefix from the root cgset for each * controller. This is ok because we know that there is * only one cgroup namespace. */ list_for_each_entry(root, &root_cgset->ctls, l) { list_for_each_entry(stray, &cs->ctls, l) { if (strcmp(root->name, stray->name)) continue; if (strlen(stray->path) < root->cgns_prefix) { pr_err("cg %s shorter than path prefix %d?\n", stray->path, root->cgns_prefix); return -1; } stray->cgns_prefix = root->cgns_prefix; } } } } *cg_id = cs->id; return 0; } static int dump_cg_dir_props(struct list_head *props, size_t n_props, CgroupPropEntry ***ents) { struct cgroup_prop *prop_cur; CgroupPropEntry *cpe; void *m; int i = 0; m = xmalloc(n_props * (sizeof(CgroupPropEntry *) + sizeof(CgroupPropEntry))); *ents = m; if (!m) return -1; cpe = m + n_props * sizeof(CgroupPropEntry *); list_for_each_entry(prop_cur, props, list) { cgroup_prop_entry__init(cpe); cpe->perms = xmalloc(sizeof(*cpe->perms)); if (!cpe->perms) goto error; cgroup_perms__init(cpe->perms); cpe->name = xstrdup(prop_cur->name); cpe->value = xstrdup(prop_cur->value); if (!cpe->name || !cpe->value) goto error; cpe->perms->mode = prop_cur->mode; cpe->perms->uid = prop_cur->uid; cpe->perms->gid = prop_cur->gid; (*ents)[i++] = cpe++; } return 0; error: while (i >= 0) { xfree(cpe->name); xfree(cpe->value); --cpe; --i; } xfree(*ents); return -1; } static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ***ents, int poff) { struct cgroup_dir *cur; CgroupDirEntry *cde; void *m; int i = 0; m = xmalloc(n_dirs * (sizeof(CgroupDirEntry *) + sizeof(CgroupDirEntry))); *ents = m; if (!m) return -1; cde = m + n_dirs * sizeof(CgroupDirEntry *); 
list_for_each_entry(cur, dirs, siblings) { cgroup_dir_entry__init(cde); cde->dir_perms = xmalloc(sizeof(*cde->dir_perms)); if (!cde->dir_perms) return -1; cgroup_perms__init(cde->dir_perms); cde->dir_perms->mode = cur->mode; cde->dir_perms->uid = cur->uid; cde->dir_perms->gid = cur->gid; cde->dir_name = cur->path + poff; if (poff != 1) /* parent isn't "/" */ cde->dir_name++; /* leading / */ cde->n_children = cur->n_children; if (cur->n_children > 0) if (dump_cg_dirs(&cur->children, cur->n_children, &cde->children, strlen(cur->path)) < 0) { xfree(*ents); return -1; } cde->n_properties = cur->n_properties; if (cde->n_properties > 0) { if (dump_cg_dir_props(&cur->properties, cde->n_properties, &cde->properties) < 0) { xfree(*ents); return -1; } } (*ents)[i++] = cde++; } return 0; } static int dump_controllers(CgroupEntry *cg) { struct cg_controller *cur; CgControllerEntry *ce; void *m; int i; cg->n_controllers = n_cgroups; m = xmalloc(n_cgroups * (sizeof(CgControllerEntry *) + sizeof(CgControllerEntry))); cg->controllers = m; ce = m + cg->n_controllers * sizeof(CgControllerEntry *); if (!m) return -1; i = 0; list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; ce->n_dirs = cur->n_heads; if (ce->n_dirs > 0) if (dump_cg_dirs(&cur->heads, cur->n_heads, &ce->dirs, 0) < 0) { xfree(cg->controllers); return -1; } cg->controllers[i++] = ce++; } return 0; } static void free_sets(CgroupEntry *cg, unsigned nr) { unsigned i; for (i = 0; i < nr; i++) xfree(cg->sets[i]->ctls); xfree(cg->sets); } static int dump_sets(CgroupEntry *cg) { struct cg_set *set; struct cg_ctl *ctl; unsigned s, c; void *m; CgSetEntry *se; CgMemberEntry *ce; pr_info("Dumping %d sets\n", n_sets - 1); cg->n_sets = n_sets - 1; m = xmalloc(cg->n_sets * (sizeof(CgSetEntry *) + sizeof(CgSetEntry))); cg->sets = m; se = m + cg->n_sets * sizeof(CgSetEntry *); if (!m) return -1; s = 0; list_for_each_entry(set, &cg_sets, l) { if (set == 
criu_cgset) continue; /* * Now encode them onto the image entry */ cg_set_entry__init(se); se->id = set->id; se->n_ctls = set->n_ctls; m = xmalloc(se->n_ctls * (sizeof(CgMemberEntry *) + sizeof(CgMemberEntry))); se->ctls = m; ce = m + se->n_ctls * sizeof(CgMemberEntry *); if (!m) { free_sets(cg, s); return -1; } c = 0; list_for_each_entry(ctl, &set->ctls, l) { pr_info(" `- Dumping %s of %s\n", ctl->name, ctl->path); cg_member_entry__init(ce); ce->name = ctl->name; ce->path = ctl->path; if (ctl->cgns_prefix > 0) { ce->has_cgns_prefix = true; ce->cgns_prefix = ctl->cgns_prefix; } se->ctls[c++] = ce++; } cg->sets[s++] = se++; } return 0; } int dump_cgroups(void) { CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; BUG_ON(!criu_cgset || !root_cgset); /* * Check whether root task lives in its own set as compared * to criu. If yes, we should not dump anything. Note that * list_is_singular() is slightly wrong here: if the criu cgset has * empty cgroups, those will not be restored on the target host, since * we're not dumping anything here. */ if (root_cgset == criu_cgset && list_is_singular(&cg_sets)) { pr_info("All tasks in criu's cgroups. 
Nothing to dump.\n"); return 0; } if (dump_sets(&cg)) return -1; if (dump_controllers(&cg)) { goto err; } pr_info("Writing CG image\n"); ret = pb_write_one(img_from_set(glob_imgset, CR_FD_CGROUP), &cg, PB_CGROUP); err: free_sets(&cg, cg.n_sets); xfree(cg.controllers); return ret; } static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt, int os) { int i, doff = 0, ooff = 0; bool none_opt = false; for (i = 0; i < ctl->n_cnames; i++) { char *n; n = ctl->cnames[i]; if (strstartswith(n, "name=")) { n += 5; if (opt && !none_opt) { ooff += snprintf(opt + ooff, os - ooff, "none,"); none_opt = true; } } doff += snprintf(dir + doff, ds - doff, "%s,", n); if (opt) ooff += snprintf(opt + ooff, os - ooff, "%s,", ctl->cnames[i]); } /* Chop the trailing ','-s */ dir[--doff] = '\0'; if (opt) opt[ooff - 1] = '\0'; return doff; } /* Some properties cannot be restored after the cgroup has children or tasks in * it. We restore these properties as soon as the cgroup is created. */ static const char *special_props[] = { "cpuset.cpus", "cpuset.mems", "devices.list", "memory.kmem.limit_in_bytes", "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", NULL, }; static int userns_move(void *arg, int fd, pid_t pid) { char pidbuf[32]; int cg, len, err; len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid); if (len >= sizeof(pidbuf)) { pr_err("pid printing failed: %d\n", pid); return -1; } cg = get_service_fd(CGROUP_YARD); err = fd = openat(cg, arg, O_WRONLY); if (fd >= 0) { err = write(fd, pidbuf, len); close(fd); } if (err < 0) { pr_perror("Can't move %s into %s (%d/%d)", pidbuf, (char *)arg, err, fd); return -1; } return 0; } static int prepare_cgns(CgSetEntry *se) { int i; bool do_unshare = false; for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int j, aux_off; CgMemberEntry *ce = se->ctls[i]; CgControllerEntry *ctrl = NULL; for (j = 0; j < n_controllers; j++) { CgControllerEntry *cur = controllers[j]; if (cgroup_contains(cur->cnames, cur->n_cnames, 
ce->name)) { ctrl = cur; break; } } if (!ctrl) { pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path); return -1; } aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); /* We need to do an unshare() here as unshare() pins the root * of the cgroup namespace to whatever the current cgroups are. * For example, consider a task in a cgroup (according to the * host): * * /unsprefix/insidecontainer * * If the task first moved itself into /unsprefix, then did unshare(), * when the task examines its own /proc/self/cgroup file it will see /, * but to the host it is really in /unsprefix. Then if it further enters * /insidecontainer here, the full host path will be * /unsprefix/insidecontianer. There is no way to say "set the cgroup * namespace boundary at /unsprefix" without first entering that, doing * the unshare, and then entering the rest of the path. */ if (ce->has_cgns_prefix) { char tmp = ce->path[ce->cgns_prefix]; ce->path[ce->cgns_prefix] = '\0'; pr_info("setting cgns prefix to %s\n", ce->path); snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path); ce->path[ce->cgns_prefix] = tmp; if (userns_call(userns_move, 0, aux, strlen(aux) + 1, -1) < 0) { pr_perror("couldn't set cgns prefix %s", aux); return -1; } do_unshare = true; } } if (do_unshare && unshare(CLONE_NEWCGROUP) < 0) { pr_perror("couldn't unshare cgns"); return -1; } return 0; } static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) { int i; pr_info("Move into %d\n", se->id); if (setup_cgns && prepare_cgns(se) < 0) { pr_err("failed preparing cgns\n"); return -1; } for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; CgMemberEntry *ce = se->ctls[i]; CgControllerEntry *ctrl = NULL; for (j = 0; j < n_controllers; j++) { CgControllerEntry *cur = controllers[j]; if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name)) { ctrl = cur; break; } } if (!ctrl) { pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path); return -1; 
} aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); /* Note that unshare(CLONE_NEWCGROUP) doesn't change the view * of previously mounted cgroupfses; since we're restoring via * a dirfd pointing to the cg yard set up by when criu was in * the root cgns, we still want to use the full path here when * we move into the cgroup. */ snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path); pr_debug(" `-> %s\n", aux); err = userns_call(userns_move, 0, aux, strlen(aux) + 1, -1); if (err < 0) { pr_perror("Can't move into %s (%d/%d)", aux, err, fd); return -1; } } return 0; } int prepare_task_cgroup(struct pstree_item *me) { CgSetEntry *se; u32 current_cgset; if (!rsti(me)->cg_set) return 0; if (me->parent) current_cgset = rsti(me->parent)->cg_set; else current_cgset = root_cg_set; if (rsti(me)->cg_set == current_cgset) { pr_info("Cgroups %d inherited from parent\n", current_cgset); return 0; } se = find_rst_set_by_id(rsti(me)->cg_set); if (!se) { pr_err("No set %d found\n", rsti(me)->cg_set); return -1; } /* Since don't support nesting of cgroup namespaces, let's only set up * the cgns (if it exists) in the init task. In the future, we should * just check that the cgns prefix string matches for all the entries * in the cgset, and only unshare if that's true. */ return move_in_cgroup(se, !me->parent); } void fini_cgroup(void) { if (!cg_yard) return; close_service_fd(CGROUP_YARD); umount2(cg_yard, MNT_DETACH); rmdir(cg_yard); xfree(cg_yard); cg_yard = NULL; } static int restore_perms(int fd, const char *path, CgroupPerms *perms) { struct stat sb; if (perms) { if (fstat(fd, &sb) < 0) { pr_perror("stat of property %s failed", path); return -1; } /* only chmod/chown if the perms are actually different: we aren't * allowed to chmod some cgroup props (e.g. the read only ones), so we * don't want to try if the perms already match. 
*/ if (sb.st_mode != (mode_t) perms->mode && fchmod(fd, perms->mode) < 0) { pr_perror("chmod of %s failed", path); return -1; } if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && fchown(fd, perms->uid, perms->gid)) { pr_perror("chown of %s failed", path); return -1; } } return 0; } static int restore_cgroup_prop(const CgroupPropEntry * cg_prop_entry_p, char *path, int off) { int cg, fd, len, ret = -1; CgroupPerms *perms = cg_prop_entry_p->perms; if (!cg_prop_entry_p->value) { pr_err("cg_prop_entry->value was empty when should have had a value\n"); return -1; } if (snprintf(path + off, PATH_MAX - off, "/%s", cg_prop_entry_p->name) >= PATH_MAX) { pr_err("snprintf output was truncated for %s\n", cg_prop_entry_p->name); return -1; } pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); cg = get_service_fd(CGROUP_YARD); fd = openat(cg, path, O_WRONLY); if (fd < 0) { pr_perror("bad cgroup path: %s", path); return -1; } if (restore_perms(fd, path, perms) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) { ret = 0; goto out; } len = strlen(cg_prop_entry_p->value); if (write(fd, cg_prop_entry_p->value, len) != len) { pr_perror("Failed writing %s to %s", cg_prop_entry_p->value, path); goto out; } ret = 0; out: if (close(fd) != 0) pr_perror("Failed closing %s", path); return ret; } static CgroupPropEntry *freezer_state_entry; static char freezer_path[PATH_MAX]; int restore_freezer_state(void) { size_t freezer_path_len; if (!freezer_state_entry) return 0; freezer_path_len = strlen(freezer_path); return restore_cgroup_prop(freezer_state_entry, freezer_path, freezer_path_len); } static void add_freezer_state_for_restore(CgroupPropEntry *entry, char *path, size_t path_len) { BUG_ON(path_len >= sizeof(freezer_path)); if (freezer_state_entry) { int max_len, i; max_len = strlen(freezer_path); if (max_len > 
path_len) max_len = path_len; /* If there are multiple freezer.state properties, that means they had * one common path prefix with no tasks in it. Let's find that common * prefix. */ for (i = 0; i < max_len; i++) { if (freezer_path[i] != path[i]) { freezer_path[i] = 0; return; } } } freezer_state_entry = entry; /* Path is not null terminated at path_len */ strncpy(freezer_path, path, path_len); freezer_path[path_len] = 0; } static int next_device_entry(char *buf) { char *pos = buf; while (1) { if (*pos == '\n') { *pos = '\0'; pos++; break; } else if (*pos == '\0') { break; } pos++; } return pos - buf; } static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **ents, unsigned int n_ents) { unsigned int i, j; for (i = 0; i < n_ents; i++) { CgroupDirEntry *e = ents[i]; size_t off2 = off; if (strcmp(e->dir_name, "") == 0) goto skip; /* skip root cgroups */ off2 += sprintf(path + off, "/%s", e->dir_name); if (e->n_properties > 0) { for (j = 0; j < e->n_properties; ++j) { int k; bool special = false; if (!strcmp(e->properties[j]->name, "freezer.state")) { add_freezer_state_for_restore(e->properties[j], path, off2); continue; /* skip restore now */ } /* Skip restoring special cpuset props now. * They were restored earlier, and can cause * the restore to fail if some other task has * entered the cgroup. 
*/
				for (k = 0; special_props[k]; k++) {
					if (!strcmp(e->properties[j]->name, special_props[k])) {
						special = true;
						break;
					}
				}

				/* Properties named in special_props[] are not restored here. */
				if (special)
					continue;

				if (restore_cgroup_prop(e->properties[j], path, off2) < 0) {
					return -1;
				}
			}
		}
skip:
		/* Recurse into the children of this directory. */
		if (prepare_cgroup_dir_properties(path, off2, e->children, e->n_children) < 0)
			return -1;
	}

	return 0;
}

/*
 * Restore the properties of every controller in controllers[]
 * (n_controllers entries).
 *
 * For each controller the path prefix is built by ctrl_dir_and_opt() and
 * its directory tree is handed to prepare_cgroup_dir_properties().
 *
 * Returns 0 on success, -1 if a controller has no cnames or if restoring
 * one of its directories fails.
 */
int prepare_cgroup_properties(void)
{
	char cname_path[PATH_MAX];
	unsigned int i, off;

	for (i = 0; i < n_controllers; i++) {
		CgControllerEntry *c = controllers[i];

		if (c->n_cnames < 1) {
			pr_err("Each CgControllerEntry should have at least 1 cname\n");
			return -1;
		}

		off = ctrl_dir_and_opt(c, cname_path, sizeof(cname_path), NULL, 0);
		if (prepare_cgroup_dir_properties(cname_path, off, c->dirs, c->n_dirs) < 0)
			return -1;
	}

	return 0;
}

/*
 * Restore the properties of directory @e whose names are listed in
 * special_props[].
 *
 * @paux: path buffer, @off: offset passed through to restore_cgroup_prop()
 * @e:    directory entry whose properties are scanned
 *
 * memory.swappiness == "60" and memory.oom_control == "0" are skipped,
 * devices.list is replayed through devices.deny / devices.allow, every
 * other matching property goes straight to restore_cgroup_prop().
 *
 * Returns 0 on success, -1 as soon as one restore fails.
 */
static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e)
{
	int i, j;

	pr_info("Restore special props\n");

	for (i = 0; special_props[i]; i++) {
		const char *name = special_props[i];

		for (j = 0; j < e->n_properties; j++) {
			CgroupPropEntry *prop = e->properties[j];

			if (strcmp(name, prop->name) == 0) {
				/* XXX: we can drop this hack and make
				 * memory.swappiness and memory.oom_control
				 * regular properties when we drop support for
				 * kernels < 3.16. See 3dae7fec5.
				 */
				if (!strcmp(prop->name, "memory.swappiness") && !strcmp(prop->value, "60")) {
					continue;
				} else if (!strcmp(prop->name, "memory.oom_control") && !strcmp(prop->value, "0")) {
					continue;
				}

				if (!strcmp(e->properties[j]->name, "devices.list")) {
					/* The devices cgroup must be restored in a
					 * special way: only the contents of
					 * devices.list can be read, and it is a
					 * whitelist of all the devices the cgroup is
					 * allowed to create. To re-create this
					 * whitelist, we first deny everything via
					 * devices.deny, and then write the list back
					 * into devices.allow.
					 *
					 * Further, we must have a write() call for
					 * each line, because the kernel only parses
					 * the first line of any write().
					 */
					CgroupPropEntry *pe = e->properties[j];
					char *old_val = pe->value, *old_name = pe->name;
					int ret;
					char *pos;

					/* A bit of a fudge here. These are
					 * write only by owner by default, but
					 * the container engine could have
					 * changed the perms. We should come up
					 * with a better way to restore all of
					 * this stuff.
					 */
					pe->perms->mode = 0200;

					/* Temporarily rename the entry to write "a" into devices.deny. */
					pe->name = "devices.deny";
					pe->value = "a";
					ret = restore_cgroup_prop(e->properties[j], paux, off);
					pe->name = old_name;
					pe->value = old_val;

					/* an empty string here means nothing
					 * is allowed, and the kernel disallows
					 * writing an "" to devices.allow, so
					 * let's just keep going.
					 */
					/*
					 * NOTE(review): this continue is taken before ret
					 * is checked, so a failed devices.deny write goes
					 * unreported when the saved list is empty --
					 * confirm this is intended.
					 */
					if (!strcmp(pe->value, ""))
						continue;

					if (ret < 0)
						return -1;

					/* One restore_cgroup_prop() call per device entry. */
					pe->name = "devices.allow";
					pos = pe->value;
					while (*pos) {
						int offset = next_device_entry(pos);
						pe->value = pos;
						ret = restore_cgroup_prop(pe, paux, off);
						if (ret < 0) {
							pe->name = old_name;
							pe->value = old_val;
							return -1;
						}
						pos += offset;
					}
					/* Put the original name/value back into the entry. */
					pe->value = old_val;
					pe->name = old_name;

					continue;
				}

				if (restore_cgroup_prop(prop, paux, off) < 0) {
					return -1;
				}
			}
		}
	}

	return 0;
}

/*
 * Open directory @path relative to descriptor @cg and apply @perms to it
 * through restore_perms(). Returns restore_perms()'s result, or -1 if the
 * directory can not be opened.
 */
static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms)
{
	int fd, ret;

	fd = openat(cg, path, O_DIRECTORY);
	if (fd < 0) {
		pr_perror("failed to open cg dir fd (%s) for chowning", path);
		return -1;
	}

	ret = restore_perms(fd, path, perms);
	close(fd);
	return ret;
}

/*
 * Recursively make sure every directory in @ents exists below the
 * CGROUP_YARD service descriptor.
 *
 * @controllers/@n_controllers: names of the controllers of this hierarchy
 * @paux/@off: path buffer and the offset at which "/<dir_name>" is appended
 * @ents/@n_ents: directories to process at this level
 *
 * A missing directory is created (unless the mode has CG_MODE_NONE or
 * CG_MODE_PROPS set), gets its permissions applied and, for the cpuset,
 * memory and devices controllers, its special properties restored. An
 * existing directory aborts the restore in CG_MODE_STRICT and has its
 * property list dropped in CG_MODE_SOFT / CG_MODE_NONE.
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux, size_t off,
			       CgroupDirEntry **ents, size_t n_ents)
{
	size_t i, j;
	CgroupDirEntry *e;
	int cg = get_service_fd(CGROUP_YARD);

	for (i = 0; i < n_ents; i++) {
		size_t off2 = off;
		e = ents[i];

		/* NOTE(review): unbounded sprintf(); the size of paux is not known here. */
		off2 += sprintf(paux + off, "/%s", e->dir_name);

		if (faccessat(cg, paux, F_OK, 0) < 0) {
			if (errno != ENOENT) {
				pr_perror("Failed accessing cgroup dir %s", paux);
				return -1;
			}

			if (opts.manage_cgroups & (CG_MODE_NONE | CG_MODE_PROPS)) {
				pr_err("Cgroup dir %s doesn't exist\n", paux);
				return -1;
			}

			if (mkdirpat(cg, paux, 0755)) {
				pr_perror("Can't make cgroup dir %s", paux);
				return -1;
			}
			pr_info("Created cgroup dir %s\n", paux);

			if (prepare_dir_perms(cg, paux, e->dir_perms) < 0)
				return -1;

			for (j = 0; j < n_controllers; j++) {
				if (!strcmp(controllers[j], "cpuset") || !strcmp(controllers[j], "memory") ||
				    !strcmp(controllers[j], "devices")) {
					if (restore_special_props(paux, off2, e) < 0) {
						pr_err("Restoring special cpuset props failed!\n");
						return -1;
					}
				}
			}
		} else {
			pr_info("Determined cgroup dir %s already exist\n", paux);

			if (opts.manage_cgroups & CG_MODE_STRICT) {
				pr_err("Abort restore of existing cgroups\n");
				return -1;
			}

			if (opts.manage_cgroups & (CG_MODE_SOFT | CG_MODE_NONE)) {
				pr_info("Skip restoring properties on cgroup dir %s\n", paux);
				/* Drop the property list so nothing is restored for this dir later. */
				if (e->n_properties > 0) {
					xfree(e->properties);
					e->properties = NULL;
					e->n_properties = 0;
				}
			}

			if (!(opts.manage_cgroups & CG_MODE_NONE) && prepare_dir_perms(cg, paux, e->dir_perms) < 0)
				return -1;
		}

		if (prepare_cgroup_dirs(controllers, n_controllers, paux, off2, e->children, e->n_children) < 0)
			return -1;
	}

	return 0;
}

/*
 * Prepare the CGROUP_YARD service descriptor. This guy is
 * tmpfs mount with the set of ctl->name directories each
 * one having the respective cgroup mounted.
 *
 * It's required for two reasons.
 *
 * First, if we move more than one task into cgroups it's
 * faster to have cgroup tree visible by them all in some
 * single place. Searching for this thing existing in the
 * criu's space is not nice, as parsing /proc/mounts is not
 * very fast, other than this not all cgroups may be mounted.
 *
 * Second, when we have user-namespaces support we will
 * lose the ability to mount cgroups on-demand, so prepare
 * them in advance.
*/ static int prepare_cgroup_sfd(CgroupEntry *ce) { int off, i, ret; char paux[PATH_MAX]; if (!opts.manage_cgroups) return 0; pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n", opts.manage_cgroups); off = sprintf(paux, ".criu.cgyard.XXXXXX"); if (mkdtemp(paux) == NULL) { pr_perror("Can't make temp cgyard dir"); return -1; } cg_yard = xstrdup(paux); if (!cg_yard) { rmdir(paux); return -1; } if (make_yard(cg_yard)) goto err; pr_debug("Opening %s as cg yard\n", cg_yard); i = open(cg_yard, O_DIRECTORY); if (i < 0) { pr_perror("Can't open cgyard"); goto err; } ret = install_service_fd(CGROUP_YARD, i); close(i); if (ret < 0) goto err; paux[off++] = '/'; for (i = 0; i < ce->n_controllers; i++) { int ctl_off = off, yard_off; char opt[128], *yard; CgControllerEntry *ctrl = ce->controllers[i]; if (ctrl->n_cnames < 1) { pr_err("Each cg_controller_entry must have at least 1 controller\n"); goto err; } ctl_off += ctrl_dir_and_opt(ctrl, paux + ctl_off, sizeof(paux) - ctl_off, opt, sizeof(opt)); /* Create controller if not yet present */ if (access(paux, F_OK)) { pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); return -1; } if (mount("none", paux, "cgroup", 0, opt) < 0) { pr_perror("\tCan't mount controller dir %s", paux); return -1; } } /* * Finally handle all cgroups for this controller. 
*/ yard = paux + strlen(cg_yard) + 1; yard_off = ctl_off - (strlen(cg_yard) + 1); if (opts.manage_cgroups && prepare_cgroup_dirs(ctrl->cnames, ctrl->n_cnames, yard, yard_off, ctrl->dirs, ctrl->n_dirs)) goto err; } return 0; err: fini_cgroup(); return -1; } static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot) { size_t dirlen = strlen(*dir_name); char *dir = *dir_name; char *dirnew = NULL; size_t i, j; /* * For example we may have the following in the image: * * set * name "hugetlb" * path "/300" * * controller * cnames hugetlb * dirs * dirname "300" * properties ... * * when we're switching to a new root we need to change * @path and don't forget to update the @dirname into * new state. */ for (i = 0; i < cge->n_sets; i++) { CgSetEntry *set = cge->sets[i]; for (j = 0; j < set->n_ctls; j++) { CgMemberEntry *cg = set->ctls[j]; /* * Make sure if it's same controller * and its path with stripping leading * "/" is matching to be renamed. */ if (!(cgroup_contains(controllers, n_controllers, cg->name) && strstartswith(cg->path + 1, dir))) continue; if (cg->has_cgns_prefix && cg->cgns_prefix) { char *prev = cg->path; cg->path = xsprintf("%s%s", newroot, cg->path + cg->cgns_prefix); if (!cg->path) { cg->path = prev; return -ENOMEM; } xfree(prev); if (!dirnew) { /* -1 because cgns_prefix includes leading "/" */ dirnew = xsprintf("%s%s", newroot, dir + cg->cgns_prefix - 1); if (!dirnew) return -ENOMEM; } cg->cgns_prefix = strlen(newroot); } else { char *prev = cg->path; /* * If no prefix present simply rename the * root but make sure the rest of path is * untouched. 
*/ cg->path = xsprintf("%s%s", newroot, cg->path + dirlen + 1); if (!cg->path) { cg->path = prev; return -ENOMEM; } xfree(prev); if (!dirnew) { dirnew = xstrdup(newroot); if (!dirnew) return -ENOMEM; } } } } if (dirnew) { xfree(dir); *dir_name = dirnew; } return 0; } static int rewrite_cgroup_roots(CgroupEntry *cge) { int i, j; struct cg_root_opt *o; char *newroot = NULL; for (i = 0; i < cge->n_controllers; i++) { CgControllerEntry *ctrl = cge->controllers[i]; newroot = opts.new_global_cg_root; list_for_each_entry(o, &opts.new_cgroup_roots, node) { if (cgroup_contains(ctrl->cnames, ctrl->n_cnames, o->controller)) { newroot = o->newroot; break; } } if (newroot) { for (j = 0; j < ctrl->n_dirs; j++) { CgroupDirEntry *cgde = ctrl->dirs[j]; pr_info("rewriting %s to %s\n", cgde->dir_name, newroot); if (rewrite_cgsets(cge, ctrl->cnames, ctrl->n_cnames, &cgde->dir_name, newroot)) return -1; } } } return 0; } int prepare_cgroup(void) { int ret; struct cr_img *img; CgroupEntry *ce; img = open_image(CR_FD_CGROUP, O_RSTR); if (!img) return -1; ret = pb_read_one_eof(img, &ce, PB_CGROUP); close_image(img); if (ret <= 0) /* Zero is OK -- no sets there. */ return ret; if (rewrite_cgroup_roots(ce)) return -1; n_sets = ce->n_sets; rst_sets = ce->sets; n_controllers = ce->n_controllers; controllers = ce->controllers; if (n_sets) /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. 
*/
		ret = prepare_cgroup_sfd(ce);
	else
		ret = 0;

	return ret;
}

/*
 * Record a cgroup root override.
 *
 * With a NULL @controller, @newroot becomes opts.new_global_cg_root.
 * Otherwise a cg_root_opt node holding the two pointers is allocated and
 * added to opts.new_cgroup_roots; the strings themselves are not copied.
 *
 * Returns 0 on success, -1 if the node can not be allocated.
 */
int new_cg_root_add(char *controller, char *newroot)
{
	struct cg_root_opt *o;

	if (!controller) {
		opts.new_global_cg_root = newroot;
		return 0;
	}

	o = xmalloc(sizeof(*o));
	if (!o)
		return -1;

	o->controller = controller;
	o->newroot = newroot;
	list_add(&o->node, &opts.new_cgroup_roots);

	return 0;
}

struct ns_desc cgroup_ns_desc = NS_DESC_ENTRY(CLONE_NEWCGROUP, "cgroup");
criu-3.6/criu/clone-noasan.c000066400000000000000000000020401317335042600160070ustar00rootroot00000000000000#include
#include "common/compiler.h"

/*
 * ASan doesn't play nicely with clone if we use current stack for
 * child task. ASan puts local variables on the fake stack
 * to catch use-after-return bug:
 *	https://github.com/google/sanitizers/wiki/AddressSanitizerUseAfterReturn#algorithm
 *
 * So it's become easy to overflow this fake stack frame in cloned child.
 * We need a real stack for clone().
 *
 * To workaround this we add clone_noasan() not-instrumented wrapper for
 * clone().
Unfortunately we can't use __attrbute__((no_sanitize_addresss)) * for this because of bug in GCC > 6: * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69863 * * So the only way is to put this wrapper in separate non-instrumented file */ int clone_noasan(int (*fn)(void *), int flags, void *arg) { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ char stack[128] __stack_aligned__; char *stack_ptr = &stack[sizeof(stack)]; int ret; ret = clone(fn, stack_ptr, flags, arg); return ret; } criu-3.6/criu/cr-check.c000066400000000000000000000601241317335042600151200ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../soccr/soccr.h" #include "types.h" #include "fdinfo.h" #include "sockets.h" #include "crtools.h" #include "log.h" #include "util-pie.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" #include "proc_parse.h" #include "mount.h" #include "tty.h" #include #include "ptrace-compat.h" #include "kerndat.h" #include "timerfd.h" #include "util.h" #include "tun.h" #include "namespaces.h" #include "pstree.h" #include "cr_options.h" #include "libnetlink.h" #include "net.h" #include "restorer.h" #include "uffd.h" static char *feature_name(int (*func)()); static int check_tty(void) { int master = -1, slave = -1; const int lock = 1; struct termios t; char *slavename; int ret = -1; if (ARRAY_SIZE(t.c_cc) < TERMIOS_NCC) { pr_msg("struct termios has %d @c_cc while " "at least %d expected.\n", (int)ARRAY_SIZE(t.c_cc), TERMIOS_NCC); goto out; } master = open("/dev/ptmx", O_RDWR); if (master < 0) { pr_perror("Can't open /dev/ptmx"); goto out; } if (ioctl(master, TIOCSPTLCK, &lock)) { pr_perror("Can't lock pty master"); goto out; } slavename = ptsname(master); slave = open(slavename, O_RDWR); if (slave < 0) { if (errno != EIO) { 
pr_perror("Unexpected error on locked pty"); goto out; } } else { pr_err("Managed to open locked pty.\n"); goto out; } ret = 0; out: close_safe(&master); close_safe(&slave); return ret; } static int check_map_files(void) { int ret; ret = access("/proc/self/map_files", R_OK); if (!ret) return 0; pr_perror("/proc//map_files is inaccessible"); return -1; } static int check_sock_diag(void) { int ret; struct ns_id ns; ns.ns_pid = 0; ns.type = NS_CRIU; ns.net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); if (ns.net.nlsk < 0) { pr_perror("Can't make diag socket for check"); return -1; } ret = collect_sockets(&ns); if (!ret) return 0; pr_msg("The sock diag infrastructure is incomplete.\n"); pr_msg("Make sure you have:\n"); pr_msg(" 1. *_DIAG kernel config options turned on;\n"); pr_msg(" 2. *_diag.ko modules loaded (if compiled as modules).\n"); return -1; } static int check_ns_last_pid(void) { int ret; ret = access("/proc/" LAST_PID_PATH, W_OK); if (!ret) return 0; pr_perror("%s sysctl is inaccessible", LAST_PID_PATH); return -1; } static int check_sock_peek_off(void) { int sk; int ret, off, sz; sk = socket(PF_UNIX, SOCK_DGRAM, 0); if (sk < 0) { pr_perror("Can't create unix socket for check"); return -1; } sz = sizeof(off); ret = getsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &off, (socklen_t *)&sz); close(sk); if ((ret == 0) && (off == -1) && (sz == sizeof(int))) return 0; pr_msg("SO_PEEK_OFF sockoption doesn't work.\n"); return -1; } static int check_kcmp(void) { int ret = syscall(SYS_kcmp, getpid(), -1, -1, -1, -1); if (ret < 0 && errno == ENOSYS) { pr_perror("System call kcmp is not supported"); return -1; } return 0; } static int check_prctl_cat1(void) { unsigned long user_auxv = 0; unsigned int *tid_addr; unsigned int size = 0; int ret; ret = prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0); if (ret < 0) { pr_msg("prctl: PR_GET_TID_ADDRESS is not supported: %m"); return -1; } /* * It's OK if the new interface is not supported because it's * a Category 
2 feature, but the old interface has to be supported. */ ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); if (ret < 0) { pr_msg("Info prctl: PR_SET_MM_MAP_SIZE is not supported\n"); ret = prctl(PR_SET_MM, PR_SET_MM_BRK, (unsigned long)sbrk(0), 0, 0); if (ret < 0) { if (errno == EPERM) pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n"); else pr_msg("prctl: PR_SET_MM_BRK is not supported: %m\n"); return -1; } ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0); if (ret < 0 && errno != EBADF) { pr_msg("prctl: PR_SET_MM_EXE_FILE is not supported: %m\n"); return -1; } ret = prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0); if (ret < 0) { pr_msg("prctl: PR_SET_MM_AUXV is not supported: %m\n"); return -1; } } return 0; } static int check_prctl_cat2(void) { unsigned int size = 0; int ret; ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); if (ret) { pr_warn("prctl: PR_SET_MM_MAP_SIZE is not supported\n"); return -1; } return 0; } static int check_fcntl(void) { u32 v[2]; int fd; fd = open_proc(PROC_SELF, "comm"); if (fd < 0) return -1; if (fcntl(fd, F_GETOWNER_UIDS, (long)v)) { pr_perror("Can'r fetch file owner UIDs"); close(fd); return -1; } close(fd); return 0; } static int check_proc_stat(void) { struct proc_pid_stat stat; int ret; ret = parse_pid_stat(getpid(), &stat); if (ret) { pr_msg("procfs: stat extension is not supported\n"); return -1; } return 0; } static int check_fdinfo_eventfd(void) { int fd, ret; int cnt = 13; EventfdFileEntry fe = EVENTFD_FILE_ENTRY__INIT; fd = eventfd(cnt, 0); if (fd < 0) { pr_perror("Can't make eventfd"); return -1; } ret = parse_fdinfo(fd, FD_TYPES__EVENTFD, &fe); close(fd); if (ret) { pr_err("Error parsing proc fdinfo\n"); return -1; } if (fe.counter != cnt) { pr_err("Counter mismatch (or not met) %d want %d\n", (int)fe.counter, cnt); return -1; } pr_info("Eventfd fdinfo works OK (%d vs %d)\n", cnt, (int)fe.counter); return 0; } int 
check_mnt_id(void) { struct fdinfo_common fdinfo = { .mnt_id = -1 }; int ret; ret = parse_fdinfo(get_service_fd(LOG_FD_OFF), FD_TYPES__UND, &fdinfo); if (ret < 0) return -1; if (fdinfo.mnt_id == -1) { pr_err("fdinfo doesn't contain the mnt_id field\n"); return -1; } return 0; } static int check_fdinfo_signalfd(void) { int fd, ret; sigset_t mask; SignalfdEntry sfd = SIGNALFD_ENTRY__INIT; sigemptyset(&mask); sigaddset(&mask, SIGUSR1); fd = signalfd(-1, &mask, 0); if (fd < 0) { pr_perror("Can't make signalfd"); return -1; } ret = parse_fdinfo(fd, FD_TYPES__SIGNALFD, &sfd); close(fd); if (ret) { pr_err("Error parsing proc fdinfo\n"); return -1; } return 0; } static int check_fdinfo_eventpoll(void) { int efd, pfd[2], ret = -1; struct epoll_event ev; EventpollFileEntry efe = EVENTPOLL_FILE_ENTRY__INIT; if (pipe(pfd)) { pr_perror("Can't make pipe to watch"); return -1; } efd = epoll_create(1); if (efd < 0) { pr_perror("Can't make epoll fd"); goto pipe_err; } memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN | EPOLLOUT; if (epoll_ctl(efd, EPOLL_CTL_ADD, pfd[0], &ev)) { pr_perror("Can't add epoll tfd"); goto epoll_err; } ret = parse_fdinfo(efd, FD_TYPES__EVENTPOLL, &efe); if (ret) { pr_err("Error parsing proc fdinfo\n"); goto epoll_err; } if (efe.n_tfd != 1 || efe.tfd[0]->tfd != pfd[0]) { pr_err("TFD mismatch (or not met)\n"); ret = -1; goto epoll_err; } pr_info("Epoll fdinfo works OK\n"); epoll_err: close(efd); pipe_err: close(pfd[0]); close(pfd[1]); return ret; } static int check_fdinfo_inotify(void) { int ifd, wd, ret; InotifyFileEntry ify = INOTIFY_FILE_ENTRY__INIT; ifd = inotify_init1(0); if (ifd < 0) { pr_perror("Can't make inotify fd"); return -1; } wd = inotify_add_watch(ifd, ".", IN_ALL_EVENTS); if (wd < 0) { pr_perror("Can't add watch"); close(ifd); return -1; } ret = parse_fdinfo(ifd, FD_TYPES__INOTIFY, &ify); close(ifd); if (ret < 0) { pr_err("Error parsing proc fdinfo\n"); return -1; } if (ify.n_wd != 1 || ify.wd[0]->wd != wd) { pr_err("WD mismatch (or not 
met)\n"); return -1; } pr_info("Inotify fdinfo works OK\n"); return 0; } static int check_fdinfo_ext(void) { int ret = 0; ret |= check_fdinfo_eventfd(); ret |= check_fdinfo_eventpoll(); ret |= check_fdinfo_signalfd(); ret |= check_fdinfo_inotify(); return ret; } static int check_unaligned_vmsplice(void) { int p[2], ret; char buf; /* :) */ struct iovec iov; ret = pipe(p); if (ret < 0) { pr_perror("Can't create pipe"); return ret; } iov.iov_base = &buf; iov.iov_len = sizeof(buf); ret = vmsplice(p[1], &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK); if (ret < 0) { pr_perror("Unaligned vmsplice doesn't work"); goto err; } pr_info("Unaligned vmsplice works OK\n"); ret = 0; err: close(p[0]); close(p[1]); return ret; } #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER #endif static int check_so_gets(void) { int sk, ret = -1; socklen_t len; char name[IFNAMSIZ]; sk = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); if (sk < 0) { pr_perror("No socket"); return -1; } len = 0; if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) { pr_perror("Can't get socket filter"); goto err; } len = sizeof(name); if (getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, name, &len)) { pr_perror("Can't get socket bound dev"); goto err; } ret = 0; err: close(sk); return ret; } static int check_ipc(void) { int ret; ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; pr_perror("/proc/sys/kernel/sem_next_id is inaccessible"); return -1; } static int check_sigqueuinfo() { siginfo_t info = { .si_code = 1 }; signal(SIGUSR1, SIG_IGN); if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) < 0) { pr_perror("Unable to send siginfo with positive si_code to itself"); return -1; } return 0; } static pid_t fork_and_ptrace_attach(int (*child_setup)(void)) { pid_t pid; int sk_pair[2], sk; char c = 0; if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } pid = fork(); if (pid < 0) { pr_perror("fork"); return -1; } else if (pid == 0) { sk = 
sk_pair[1];
		close(sk_pair[0]);
		if (child_setup && child_setup() != 0)
			exit(1);
		/* Tell the parent that setup is done. */
		if (write(sk, &c, 1) != 1) {
			pr_perror("write");
			exit(1);
		}

		/* Stay alive until the parent kills us. */
		while (1)
			sleep(1000);
		exit(1);
	}

	sk = sk_pair[0];
	close(sk_pair[1]);

	/* Wait for the child's readiness byte before attaching. */
	if (read(sk, &c, 1) != 1) {
		close(sk);
		kill(pid, SIGKILL);
		pr_perror("read");
		return -1;
	}

	close(sk);

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
		pr_perror("Unable to ptrace the child");
		kill(pid, SIGKILL);
		return -1;
	}

	waitpid(pid, NULL, 0);

	return pid;
}

/*
 * Check PTRACE_PEEKSIGINFO and PTRACE_GETSIGMASK on a freshly attached
 * child. The child is killed afterwards. Returns 0 if both requests work,
 * -1 otherwise.
 */
static int check_ptrace_peeksiginfo()
{
	struct ptrace_peeksiginfo_args arg;
	siginfo_t siginfo;
	pid_t pid, ret = 0;
	k_rtsigset_t mask;

	pid = fork_and_ptrace_attach(NULL);
	if (pid < 0)
		return -1;

	arg.flags = 0;
	arg.off = 0;
	arg.nr = 1;

	if (ptrace(PTRACE_PEEKSIGINFO, pid, &arg, &siginfo) != 0) {
		pr_perror("Unable to dump pending signals");
		ret = -1;
	}

	if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(mask), &mask) != 0) {
		pr_perror("Unable to dump signal blocking mask");
		ret = -1;
	}

	kill(pid, SIGKILL);
	return ret;
}

/*
 * Check that PTRACE_O_SUSPEND_SECCOMP can be set on an attached child.
 * The child is killed afterwards. Returns 0 on success, -1 otherwise.
 */
static int check_ptrace_suspend_seccomp(void)
{
	pid_t pid;
	int ret = 0;

	pid = fork_and_ptrace_attach(NULL);
	if (pid < 0)
		return -1;

	if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
		if (errno == EINVAL) {
			pr_err("Kernel doesn't support PTRACE_O_SUSPEND_SECCOMP\n");
		} else {
			pr_perror("couldn't suspend seccomp");
		}
		ret = -1;
	}

	kill(pid, SIGKILL);
	return ret;
}

/*
 * Child-side setup for check_ptrace_dump_seccomp_filters(): install a
 * seccomp filter. Returns 0 on success, -1 if prctl() fails.
 */
static int setup_seccomp_filter(void)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
		/* Allow all syscalls except ptrace */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_ptrace, 0, 1),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
	};

	struct sock_fprog bpf_prog = {
		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
		.filter = filter,
	};

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long) &bpf_prog, 0, 0) < 0)
		return -1;

	return 0;
}

/*
 * Check PTRACE_SECCOMP_GET_FILTER on a child that installed a filter with
 * setup_seccomp_filter(). The child is killed afterwards. Returns 0 on
 * success, -1 otherwise.
 */
static int check_ptrace_dump_seccomp_filters(void)
{
	pid_t pid;
	int ret = 0, len;

	pid = fork_and_ptrace_attach(setup_seccomp_filter);
	if (pid < 0)
		return -1;

	len = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL);
	if (len < 0) {
		ret = -1;
		pr_perror("Dumping seccomp filters not supported");
	}

	kill(pid, SIGKILL);
	return ret;
}

/* Report whether kdat.has_dirty_track is set. Returns 0 if so, -1 otherwise. */
static int check_mem_dirty_track(void)
{
	if (!kdat.has_dirty_track) {
		pr_warn("Dirty tracking is OFF. Memory snapshot will not work.\n");
		return -1;
	}
	return 0;
}

/* Check that /proc/self/timers is readable. Returns 0 if so, -1 otherwise. */
static int check_posix_timers(void)
{
	int ret;

	ret = access("/proc/self/timers", R_OK);
	if (!ret)
		return 0;

	pr_msg("/proc//timers file is missing.\n");
	return -1;
}

/*
 * Look up the mapping that starts at @addr in our own maps file and, if
 * it is named "/[aio] (deleted)", return its length. Returns 0 if the
 * file can not be opened or parsed, or if no such AIO mapping is found.
 */
static unsigned long get_ring_len(unsigned long addr)
{
	FILE *maps;
	char buf[256];

	maps = fopen_proc(PROC_SELF, "maps");
	if (!maps)
		return 0;

	while (fgets(buf, sizeof(buf), maps)) {
		unsigned long start, end;
		int r, tail;

		/* tail receives the offset at which the mapping name starts. */
		r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", &start, &end, &tail);
		if (r != 2) {
			fclose(maps);
			pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail);
			return 0;
		}

		if (start == addr) {
			fclose(maps);
			if (strcmp(buf + tail, "/[aio] (deleted)\n"))
				goto notfound;

			return end - start;
		}
	}

	fclose(maps);
notfound:
	pr_err("No AIO ring at expected location\n");
	return 0;
}

/*
 * Check that an AIO ring keeps working after being moved with mremap():
 * set up a context, remap its ring onto a fresh anonymous mapping and
 * call io_getevents on the new address. Returns 0 on success, -1
 * otherwise.
 *
 * NOTE(review): neither the AIO context nor the mappings are released on
 * any path.
 */
static int check_aio_remap(void)
{
	aio_context_t ctx = 0;
	unsigned long len;
	void *naddr;
	int r;

	if (syscall(SYS_io_setup, 16, &ctx) < 0) {
		pr_err("No AIO syscall: %m\n");
		return -1;
	}

	len = get_ring_len((unsigned long) ctx);
	if (!len)
		return -1;

	/* Reserve an address range to move the ring to. */
	naddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
	if (naddr == MAP_FAILED) {
		pr_perror("Can't find place for new AIO ring");
		return -1;
	}

	if (mremap((void *)ctx, len, len, MREMAP_FIXED | MREMAP_MAYMOVE, naddr) == MAP_FAILED) {
		pr_perror("Can't remap AIO ring");
		return -1;
	}

	ctx = (aio_context_t)naddr;
	r = syscall(SYS_io_getevents, ctx, 0, 1, NULL, NULL);
	if (r < 0) {
		pr_err("AIO remap doesn't work properly: %m\n");
		return -1;
	}

	return 0;
}

static int check_fdinfo_lock(void)
{
	if (!kdat.has_fdinfo_lock) {
		pr_err("fdinfo doesn't contain the lock field\n");
return -1; } return 0; } struct clone_arg { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ char stack[128] __stack_aligned__; char stack_ptr[0]; }; static int clone_cb(void *_arg) { exit(0); } static int check_clone_parent_vs_pid() { struct clone_arg ca; pid_t pid; pid = clone(clone_cb, ca.stack_ptr, CLONE_NEWPID | CLONE_PARENT, &ca); if (pid < 0) { pr_err("CLONE_PARENT | CLONE_NEWPID don't work together\n"); return -1; } return 0; } static int check_autofs_pipe_ino(void) { FILE *f; char str[1024]; int ret = -ENOENT; f = fopen_proc(PROC_SELF, "mountinfo"); if (!f) return -1; while (fgets(str, sizeof(str), f)) { if (strstr(str, " autofs ")) { if (strstr(str, "pipe_ino=")) ret = 0; else { pr_err("autofs not supported.\n"); ret = -ENOTSUP; } break; } } fclose(f); return ret; } static int check_autofs(void) { char *dir, *options, template[] = "/tmp/.criu.mnt.XXXXXX"; int ret, pfd[2]; ret = check_autofs_pipe_ino(); if (ret != -ENOENT) return ret; if (pipe(pfd) < 0) { pr_perror("failed to create pipe"); return -1; } ret = -1; options = xsprintf("fd=%d,pgrp=%d,minproto=5,maxproto=5,direct", pfd[1], getpgrp()); if (!options) { pr_err("failed to allocate autofs options\n"); goto close_pipe; } dir = mkdtemp(template); if (!dir) { pr_perror("failed to construct temporary name"); goto free_options; } if (mount("criu", dir, "autofs", 0, options) < 0) { pr_perror("failed to mount autofs"); goto unlink_dir; } ret = check_autofs_pipe_ino(); if (umount(dir)) pr_perror("failed to umount %s", dir); unlink_dir: if (rmdir(dir)) pr_perror("failed to unlink %s", dir); free_options: free(options); close_pipe: close(pfd[0]); close(pfd[1]); return ret; } static int check_cgroupns(void) { int ret; ret = access("/proc/self/ns/cgroup", F_OK); if (ret < 0) { pr_err("cgroupns not supported. 
This is not fatal.\n"); return -1; } return 0; } static int check_tcp(void) { socklen_t optlen; int sk, ret; int val; sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if (sk < 0) { pr_perror("Can't create TCP socket :("); return -1; } val = 1; ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); if (ret < 0) { pr_perror("Can't turn TCP repair mode ON"); goto out; } optlen = sizeof(val); ret = getsockopt(sk, SOL_TCP, TCP_TIMESTAMP, &val, &optlen); if (ret) pr_perror("Can't get TCP_TIMESTAMP"); out: close(sk); return ret; } static int check_tcp_halt_closed(void) { if (!kdat.has_tcp_half_closed) { pr_err("TCP_REPAIR can't be enabled for half-closed sockets\n"); return -1; } return 0; } static int kerndat_tcp_repair_window(void) { struct tcp_repair_window opt; socklen_t optlen = sizeof(opt); int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; } if (setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val))) { if (errno == EPERM) { pr_warn("TCP_REPAIR isn't available to unprivileged users\n"); goto now; } pr_perror("Unable to set TCP_REPAIR"); goto err; } if (getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &opt, &optlen)) { if (errno != ENOPROTOOPT) { pr_perror("Unable to set TCP_REPAIR_WINDOW"); goto err; } now: val = 0; } else val = 1; close(sk); return val; err: close(sk); errn: return -1; } static int check_tcp_window(void) { int ret; ret = kerndat_tcp_repair_window(); if (ret < 0) return -1; if (ret == 0) { pr_err("The TCP_REPAIR_WINDOW option isn't supported.\n"); return -1; } return 0; } static int check_userns(void) { int ret; unsigned long size = 0; ret = access("/proc/self/ns/user", F_OK); if (ret) { pr_perror("No userns proc file"); return -1; } ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); if (ret < 0) { pr_perror("prctl: PR_SET_MM_MAP_SIZE is not supported"); return -1; } return 0; } static int check_loginuid(void) { if (kdat.luid != LUID_FULL) { 
pr_warn("Loginuid restore is OFF.\n");
		return -1;
	}

	return 0;
}

/*
 * Check for compat (ia32) C/R support. Returns 0 only when built with
 * CONFIG_COMPAT and kdat_compatible_cr() is non-zero, -1 otherwise.
 */
static int check_compat_cr(void)
{
#ifdef CONFIG_COMPAT
	if (kdat_compatible_cr())
		return 0;
	pr_warn("compat_cr is not supported. Requires kernel >= v4.12\n");
#else
	pr_warn("CRIU built without CONFIG_COMPAT - can't C/R ia32\n");
#endif
	return -1;
}

/* Report whether kdat.has_uffd is set. Returns 0 if so, -1 otherwise. */
static int check_uffd(void)
{
	if (!kdat.has_uffd) {
		pr_err("UFFD is not supported\n");
		return -1;
	}

	return 0;
}

/* Like check_uffd(), but uffd_noncooperative() has to be non-zero as well. */
static int check_uffd_noncoop(void)
{
	if (check_uffd())
		return -1;

	if (!uffd_noncooperative()) {
		pr_err("Non-cooperative UFFD is not supported\n");
		return -1;
	}

	return 0;
}

/* Returns 0 if kdat_can_map_vdso() reports 1, -1 otherwise. */
static int check_can_map_vdso(void)
{
	if (kdat_can_map_vdso() == 1)
		return 0;
	pr_warn("Do not have API to map vDSO - will use mremap() to restore vDSO\n");
	return -1;
}

/* Single feature check selected by check_add_feature(); NULL means run them all. */
static int (*chk_feature)(void);

/*
 * There are three categories of kernel features:
 *
 * 1. Absolutely required (/proc/pid/map_files, ptrace PEEKSIGINFO, etc.).
 * 2. Required only for specific cases (aio remap, tun, etc.).
 *    Checked when --extra or --all is specified.
 * 3. Experimental (task-diag).
 *    Checked when --experimental or --all is specified.
 *
 * We fail if any feature in category 1 is missing but tolerate failures
 * in the other categories.  Currently, there is nothing in category 3.
 */
#define CHECK_GOOD	"Looks good."
#define CHECK_BAD	"Does not look good."
#define CHECK_MAYBE	"Looks good but some kernel features are missing\n" \
			"which, depending on your process tree, may cause\n" \
			"dump or restore failure."
/* Run one category 1 check; on failure print CHECK_BAD and return from the caller. */
#define CHECK_CAT1(fn)	do { \
				if ((ret = fn) != 0) { \
					print_on_level(DEFAULT_LOGLEVEL, "%s\n", CHECK_BAD); \
					return ret; \
				} \
			} while (0)

/*
 * Entry point of the kernel feature check.
 *
 * Requires root. If a single feature was selected through
 * check_add_feature(), only that check runs. Otherwise all category 1
 * checks run, stopping at the first failure, followed by the category 2
 * and 3 checks when enabled in opts.
 *
 * Returns 0 if everything checked is supported, non-zero otherwise.
 */
int cr_check(void)
{
	struct ns_id *ns;
	int ret = 0;

	if (!is_root_user())
		return -1;

	root_item = alloc_pstree_item();
	if (root_item == NULL)
		return -1;

	root_item->pid->real = getpid();

	if (collect_pstree_ids())
		return -1;

	ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc);
	if (ns == NULL)
		return -1;

	mntinfo = collect_mntinfo(ns, false);
	if (mntinfo == NULL)
		return -1;

	/* Single-feature mode. */
	if (chk_feature) {
		if (chk_feature())
			return -1;
		print_on_level(DEFAULT_LOGLEVEL, "%s is supported\n", feature_name(chk_feature));
		return 0;
	}

	/*
	 * Category 1 - absolutely required.
	 * So that the user can see clearly what's missing, we exit with
	 * non-zero status on the first failure because it gets very
	 * confusing when there are many warnings and error messages.
	 */
	CHECK_CAT1(check_map_files());
	CHECK_CAT1(check_sock_diag());
	CHECK_CAT1(check_ns_last_pid());
	CHECK_CAT1(check_sock_peek_off());
	CHECK_CAT1(check_kcmp());
	CHECK_CAT1(check_prctl_cat1());
	CHECK_CAT1(check_fcntl());
	CHECK_CAT1(check_proc_stat());
	CHECK_CAT1(check_tcp());
	CHECK_CAT1(check_fdinfo_ext());
	CHECK_CAT1(check_unaligned_vmsplice());
	CHECK_CAT1(check_tty());
	CHECK_CAT1(check_so_gets());
	CHECK_CAT1(check_ipc());
	CHECK_CAT1(check_sigqueuinfo());
	CHECK_CAT1(check_ptrace_peeksiginfo());

	/*
	 * Category 2 - required for specific cases.
	 * Unlike Category 1 features, we don't exit with non-zero status
	 * on a failure because CRIU may still work.
	 */
	if (opts.check_extra_features) {
		ret |= check_prctl_cat2();
		ret |= check_ptrace_suspend_seccomp();
		ret |= check_ptrace_dump_seccomp_filters();
		ret |= check_mem_dirty_track();
		ret |= check_posix_timers();
		ret |= check_tun_cr(0);
		ret |= check_timerfd();
		ret |= check_mnt_id();
		ret |= check_aio_remap();
		ret |= check_fdinfo_lock();
		ret |= check_clone_parent_vs_pid();
		ret |= check_cgroupns();
		ret |= check_tcp_window();
		ret |= check_tcp_halt_closed();
		ret |= check_userns();
		ret |= check_loginuid();
		ret |= check_can_map_vdso();
	}

	/*
	 * Category 3 - experimental.
	 */
	if (opts.check_experimental_features) {
		ret |= check_autofs();
		ret |= check_compat_cr();
	}

	print_on_level(DEFAULT_LOGLEVEL, "%s\n", ret ? CHECK_MAYBE : CHECK_GOOD);
	return ret;
}
#undef CHECK_GOOD
#undef CHECK_BAD
#undef CHECK_MAYBE
#undef CHECK_CAT1

static int check_tun(void)
{
	/*
	 * In case there's no TUN support at all we
	 * should report error. Unlike this plain criu
	 * check would report "Looks good" in this case
	 * since C/R effectively works, just not for TUN.
*/ return check_tun_cr(-1); } struct feature_list { char *name; int (*func)(); }; static struct feature_list feature_list[] = { { "mnt_id", check_mnt_id }, { "mem_dirty_track", check_mem_dirty_track }, { "aio_remap", check_aio_remap }, { "timerfd", check_timerfd }, { "tun", check_tun }, { "userns", check_userns }, { "fdinfo_lock", check_fdinfo_lock }, { "seccomp_suspend", check_ptrace_suspend_seccomp }, { "seccomp_filters", check_ptrace_dump_seccomp_filters }, { "loginuid", check_loginuid }, { "cgroupns", check_cgroupns }, { "autofs", check_autofs }, { "tcp_half_closed", check_tcp_halt_closed }, { "compat_cr", check_compat_cr }, { "uffd", check_uffd }, { "uffd-noncoop", check_uffd_noncoop }, { "can_map_vdso", check_can_map_vdso}, { NULL, NULL }, }; void pr_check_features(const char *offset, const char *sep, int width) { struct feature_list *fl; int pos = width + 1; int sep_len = strlen(sep); int offset_len = strlen(offset); for (fl = feature_list; fl->name; fl++) { int len = strlen(fl->name); if (pos + len + sep_len > width) { pr_msg("\n%s", offset); pos = offset_len; } pr_msg("%s", fl->name); pos += len; if ((fl + 1)->name) { // not the last item pr_msg("%s", sep); pos += sep_len; } } pr_msg("\n"); } int check_add_feature(char *feat) { struct feature_list *fl; for (fl = feature_list; fl->name; fl++) { if (!strcmp(feat, fl->name)) { chk_feature = fl->func; return 0; } } pr_err("Unknown feature %s\n", feat); return -1; } static char *feature_name(int (*func)()) { struct feature_list *fl; for (fl = feature_list; fl->func; fl++) { if (fl->func == func) return fl->name; } return NULL; } criu-3.6/criu/cr-dedup.c000066400000000000000000000033711317335042600151450ustar00rootroot00000000000000#include #include #include #include #include "int.h" #include "crtools.h" #include "pagemap.h" #include "restorer.h" static int cr_dedup_one_pagemap(int id, int flags); int cr_dedup(void) { int close_ret, ret = 0; int id; DIR * dirp; struct dirent *ent; dirp = opendir(CR_PARENT_LINK); 
if (dirp == NULL) { pr_perror("Can't enter previous snapshot folder, error=%d", errno); ret = -1; goto err; } while (1) { errno = 0; ent = readdir(dirp); if (ent == NULL) { if (errno) { pr_perror("Failed readdir, error=%d", errno); ret = -1; goto err; } break; } ret = sscanf(ent->d_name, "pagemap-%d.img", &id); if (ret == 1) { pr_info("pid=%d\n", id); ret = cr_dedup_one_pagemap(id, PR_TASK); if (ret < 0) break; } ret = sscanf(ent->d_name, "pagemap-shmem-%d.img", &id); if (ret == 1) { pr_info("shmid=%d\n", id); ret = cr_dedup_one_pagemap(id, PR_SHMEM); if (ret < 0) break; } } err: if (dirp) { close_ret = closedir(dirp); if (close_ret == -1) return close_ret; } if (ret < 0) return ret; pr_info("Deduplicated\n"); return 0; } static int cr_dedup_one_pagemap(int id, int flags) { int ret; struct page_read pr; struct page_read * prp; flags |= PR_MOD; ret = open_page_read(id, &pr, flags); if (ret <= 0) return -1; prp = pr.parent; if (!prp) goto exit; while (1) { ret = pr.advance(&pr); if (ret <= 0) goto exit; pr_debug("dedup iovec base=%"PRIx64", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) goto exit; } } exit: pr.close(&pr); if (ret < 0) return ret; return 0; } criu-3.6/criu/cr-dump.c000066400000000000000000001156671317335042600150250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/fs.pb-c.h" #include "images/mm.pb-c.h" #include "images/creds.pb-c.h" #include "images/core.pb-c.h" #include "images/file-lock.pb-c.h" #include "images/rlimit.pb-c.h" #include "images/siginfo.pb-c.h" #include "common/list.h" #include "imgset.h" #include "file-ids.h" #include "kcmp-ids.h" #include "common/compiler.h" #include "crtools.h" #include 
"cr_options.h" #include "servicefd.h" #include "string.h" #include "ptrace-compat.h" #include "util.h" #include "namespaces.h" #include "image.h" #include "proc_parse.h" #include "parasite.h" #include "parasite-syscall.h" #include "files.h" #include "files-reg.h" #include "shmem.h" #include "sk-inet.h" #include "pstree.h" #include "mount.h" #include "tty.h" #include "net.h" #include "sk-packet.h" #include "cpu.h" #include "elf.h" #include "cgroup.h" #include "cgroup-props.h" #include "file-lock.h" #include "page-xfer.h" #include "kerndat.h" #include "stats.h" #include "mem.h" #include "page-pipe.h" #include "posix-timer.h" #include "vdso.h" #include "vma.h" #include "cr-service.h" #include "plugin.h" #include "irmap.h" #include "sysfs_parse.h" #include "action-scripts.h" #include "aio.h" #include "lsm.h" #include "seccomp.h" #include "seize.h" #include "fault-injection.h" #include "dump.h" /* * Architectures can overwrite this function to restore register sets that * are not covered by ptrace_set/get_regs(). 
* * with_threads = false: Only the register sets of the tasks are restored * with_threads = true : The register sets of the tasks with all their threads * are restored */ int __attribute__((weak)) arch_set_thread_regs(struct pstree_item *item, bool with_threads) { return 0; } static char loc_buf[PAGE_SIZE]; void free_mappings(struct vm_area_list *vma_area_list) { struct vma_area *vma_area, *p; list_for_each_entry_safe(vma_area, p, &vma_area_list->h, list) { if (!vma_area->file_borrowed) free(vma_area->vmst); free(vma_area); } INIT_LIST_HEAD(&vma_area_list->h); vma_area_list->nr = 0; } int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_file) { int ret = -1; pr_info("\n"); pr_info("Collecting mappings (pid: %d)\n", pid); pr_info("----------------------------------------\n"); ret = parse_smaps(pid, vma_area_list, dump_file); if (ret < 0) goto err; pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->priv_longest); pr_info_vma_list(&vma_area_list->h); pr_info("----------------------------------------\n"); err: return ret; } static int dump_sched_info(int pid, ThreadCoreEntry *tc) { int ret; struct sched_param sp; BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */ /* * In musl-libc sched_getscheduler and sched_getparam don't call * syscalls and instead the always return -ENOSYS */ ret = syscall(__NR_sched_getscheduler, pid); if (ret < 0) { pr_perror("Can't get sched policy for %d", pid); return -1; } pr_info("%d has %d sched policy\n", pid, ret); tc->has_sched_policy = true; tc->sched_policy = ret; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { pr_perror("Can't get sched param for %d", pid); return -1; } pr_info("\tdumping %d prio for %d\n", sp.sched_priority, pid); tc->has_sched_prio = true; tc->sched_prio = sp.sched_priority; } /* * The nice is ignored for RT sched policies, but is stored * in kernel. 
Thus we have to take it with us in the image. */ errno = 0; ret = getpriority(PRIO_PROCESS, pid); if (ret == -1 && errno) { pr_perror("Can't get nice for %d ret %d", pid, ret); return -1; } pr_info("\tdumping %d nice for %d\n", ret, pid); tc->has_sched_nice = true; tc->sched_nice = ret; return 0; } struct cr_imgset *glob_imgset; static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) { struct dirent *de; DIR *fd_dir; int size = 0; int n; pr_info("\n"); pr_info("Collecting fds (pid: %d)\n", pid); pr_info("----------------------------------------\n"); fd_dir = opendir_proc(pid, "fd"); if (!fd_dir) return -1; n = 0; while ((de = readdir(fd_dir))) { if (dir_dots(de)) continue; if (sizeof(struct parasite_drain_fd) + sizeof(int) * (n + 1) > size) { struct parasite_drain_fd *t; size += PAGE_SIZE; t = xrealloc(*dfds, size); if (!t) return -1; *dfds = t; } (*dfds)->fds[n++] = atoi(de->d_name); } (*dfds)->nr_fds = n; pr_info("Found %d file descriptors\n", n); pr_info("----------------------------------------\n"); closedir(fd_dir); return 0; } static int fill_fd_params_special(int fd, struct fd_parms *p) { *p = FD_PARMS_INIT; if (fstat(fd, &p->stat) < 0) { pr_perror("Can't fstat exe link"); return -1; } if (get_fd_mntid(fd, &p->mnt_id)) return -1; return 0; } static long get_fs_type(int lfd) { struct statfs fst; if (fstatfs(lfd, &fst)) { pr_perror("Unable to statfs fd %d", lfd); return -1; } return fst.f_type; } static int dump_one_reg_file_cond(int lfd, u32 *id, struct fd_parms *parms) { if (fd_id_generate_special(parms, id)) { parms->fs_type = get_fs_type(lfd); if (parms->fs_type < 0) return -1; return dump_one_reg_file(lfd, *id, parms); } return 0; } static int dump_task_exe_link(pid_t pid, MmEntry *mm) { struct fd_parms params; int fd, ret = 0; fd = open_proc_path(pid, "exe"); if (fd < 0) return -1; if (fill_fd_params_special(fd, ¶ms)) return -1; ret = dump_one_reg_file_cond(fd, &mm->exe_file_id, ¶ms); close(fd); return ret; } static int dump_task_fs(pid_t pid, 
struct parasite_dump_misc *misc, struct cr_imgset *imgset) { struct fd_parms p; FsEntry fe = FS_ENTRY__INIT; int fd, ret; fe.has_umask = true; fe.umask = misc->umask; fd = open_proc_path(pid, "cwd"); if (fd < 0) return -1; if (fill_fd_params_special(fd, &p)) return -1; ret = dump_one_reg_file_cond(fd, &fe.cwd_id, &p); if (ret < 0) return ret; close(fd); fd = open_proc_path(pid, "root"); if (fd < 0) return -1; if (fill_fd_params_special(fd, &p)) return -1; ret = dump_one_reg_file_cond(fd, &fe.root_id, &p); if (ret < 0) return ret; close(fd); pr_info("Dumping task cwd id %#x root id %#x\n", fe.cwd_id, fe.root_id); return pb_write_one(img_from_set(imgset, CR_FD_FS), &fe, PB_FS); } static inline rlim_t encode_rlim(rlim_t val) { return val == RLIM_INFINITY ? -1 : val; } static int dump_task_rlimits(int pid, TaskRlimitsEntry *rls) { int res; for (res = 0; res n_rlimits ; res++) { struct rlimit64 lim; if (syscall(__NR_prlimit64, pid, res, NULL, &lim)) { pr_perror("Can't get rlimit %d", res); return -1; } rls->rlimits[res]->cur = encode_rlim(lim.rlim_cur); rls->rlimits[res]->max = encode_rlim(lim.rlim_max); } return 0; } static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc) { int ret; if (kdat.luid != LUID_NONE) { pr_info("dumping /proc/%d/loginuid\n", pid); tc->has_loginuid = true; tc->loginuid = parse_pid_loginuid(pid, &ret, false); tc->loginuid = userns_uid(tc->loginuid); /* * loginuid dumping is critical, as if not correctly * restored, you may loss ability to login via SSH to CT */ if (ret < 0) return ret; } else { tc->has_loginuid = false; } pr_info("dumping /proc/%d/oom_score_adj\n", pid); tc->oom_score_adj = parse_pid_oom_score_adj(pid, &ret); /* * oom_score_adj dumping is not very critical, as it will affect * on victim in OOM situation and one will find dumping error in log */ if (ret < 0) tc->has_oom_score_adj = false; else tc->has_oom_score_adj = true; return 0; } static int dump_filemap(struct vma_area *vma_area, int fd) { struct fd_parms p = FD_PARMS_INIT; 
VmaEntry *vma = vma_area->e; int ret = 0; u32 id; BUG_ON(!vma_area->vmst); p.stat = *vma_area->vmst; p.mnt_id = vma_area->mnt_id; /* * AUFS support to compensate for the kernel bug * exposing branch pathnames in map_files. * * If the link found in vma_get_mapfile() pointed * inside a branch, we should use the pathname * from root that was saved in vma_area->aufs_rpath. */ if (vma_area->aufs_rpath) { struct fd_link aufs_link; strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); aufs_link.len = strlen(aufs_link.name); p.link = &aufs_link; } /* Flags will be set during restore in open_filmap() */ ret = dump_one_reg_file_cond(fd, &id, &p); vma->shmid = id; return ret; } static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma) { if (root_ns_mask & CLONE_NEWIPC) return 0; pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n", pid, vma->start); return -1; } static int get_task_auxv(pid_t pid, MmEntry *mm) { auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; int fd, i, ret; pr_info("Obtaining task auvx ...\n"); fd = open_proc(pid, "auxv"); if (fd < 0) return -1; ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv)); if (ret < 0) { ret = -1; pr_perror("Error reading %d's auxv", pid); goto err; } else { mm->n_mm_saved_auxv = ret / sizeof(auxv_t); for (i = 0; i < mm->n_mm_saved_auxv; i++) mm->mm_saved_auxv[i] = (u64)mm_saved_auxv[i]; } ret = 0; err: close_safe(&fd); return ret; } static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, const struct parasite_dump_misc *misc, const struct vm_area_list *vma_area_list, const struct cr_imgset *imgset) { MmEntry mme = MM_ENTRY__INIT; struct vma_area *vma_area; int ret = -1, i = 0; pr_info("\n"); pr_info("Dumping mm (pid: %d)\n", pid); pr_info("----------------------------------------\n"); mme.n_vmas = vma_area_list->nr; mme.vmas = xmalloc(mme.n_vmas * sizeof(VmaEntry *)); if (!mme.vmas) return -1; list_for_each_entry(vma_area, &vma_area_list->h, list) { VmaEntry *vma = vma_area->e; 
pr_info_vma(vma_area); if (!vma_entry_is(vma, VMA_AREA_REGULAR)) ret = 0; else if (vma_entry_is(vma, VMA_AREA_SYSVIPC)) ret = check_sysvipc_map_dump(pid, vma); else if (vma_entry_is(vma, VMA_AREA_SOCKET)) ret = dump_socket_map(vma_area); else ret = 0; if (ret) goto err; mme.vmas[i++] = vma; if (vma_entry_is(vma, VMA_AREA_AIORING)) { ret = dump_aio_ring(&mme, vma_area); if (ret) goto err; } } mme.mm_start_code = stat->start_code; mme.mm_end_code = stat->end_code; mme.mm_start_data = stat->start_data; mme.mm_end_data = stat->end_data; mme.mm_start_stack = stat->start_stack; mme.mm_start_brk = stat->start_brk; mme.mm_arg_start = stat->arg_start; mme.mm_arg_end = stat->arg_end; mme.mm_env_start = stat->env_start; mme.mm_env_end = stat->env_end; mme.mm_brk = misc->brk; mme.dumpable = misc->dumpable; mme.has_dumpable = true; mme.thp_disabled = misc->thp_disabled; mme.has_thp_disabled = true; mme.n_mm_saved_auxv = AT_VECTOR_SIZE; mme.mm_saved_auxv = xmalloc(pb_repeated_size(&mme, mm_saved_auxv)); if (!mme.mm_saved_auxv) goto err; if (get_task_auxv(pid, &mme)) goto err; if (dump_task_exe_link(pid, &mme)) goto err; ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM); xfree(mme.mm_saved_auxv); free_aios(&mme); err: xfree(mme.vmas); return ret; } static int get_task_futex_robust_list(pid_t pid, ThreadCoreEntry *info) { struct robust_list_head *head = NULL; size_t len = 0; int ret; ret = syscall(SYS_get_robust_list, pid, &head, &len); if (ret < 0 && errno == ENOSYS) { /* * If the kernel says get_robust_list is not implemented, then * check whether set_robust_list is also not implemented, in * that case we can assume it is empty, since set_robust_list * is the only way to populate it. This case is possible when * "futex_cmpxchg_enabled" is unset in the kernel. * * The following system call should always fail, even if it is * implemented, in which case it will return -EINVAL because * len should be greater than zero. 
*/ ret = syscall(SYS_set_robust_list, NULL, 0); if (ret == 0 || (ret < 0 && errno != ENOSYS)) goto err; head = NULL; len = 0; } else if (ret) { goto err; } info->futex_rla = encode_pointer(head); info->futex_rla_len = (u32)len; return 0; err: pr_err("Failed obtaining futex robust list on %d\n", pid); return -1; } static int get_task_personality(pid_t pid, u32 *personality) { int fd, ret = -1; pr_info("Obtaining personality ... \n"); fd = open_proc(pid, "personality"); if (fd < 0) goto err; ret = read(fd, loc_buf, sizeof(loc_buf) - 1); close(fd); if (ret >= 0) { loc_buf[ret] = '\0'; *personality = atoi(loc_buf); } err: return ret; } static DECLARE_KCMP_TREE(vm_tree, KCMP_VM); static DECLARE_KCMP_TREE(fs_tree, KCMP_FS); static DECLARE_KCMP_TREE(files_tree, KCMP_FILES); static DECLARE_KCMP_TREE(sighand_tree, KCMP_SIGHAND); static int dump_task_kobj_ids(struct pstree_item *item) { int new; struct kid_elem elem; int pid = item->pid->real; TaskKobjIdsEntry *ids = item->ids; elem.pid = pid; elem.idx = 0; /* really 0 for all */ elem.genid = 0; /* FIXME optimize */ new = 0; ids->vm_id = kid_generate_gen(&vm_tree, &elem, &new); if (!ids->vm_id || !new) { pr_err("Can't make VM id for %d\n", pid); return -1; } new = 0; ids->fs_id = kid_generate_gen(&fs_tree, &elem, &new); if (!ids->fs_id || !new) { pr_err("Can't make FS id for %d\n", pid); return -1; } new = 0; ids->files_id = kid_generate_gen(&files_tree, &elem, &new); if (!ids->files_id || (!new && !shared_fdtable(item))) { pr_err("Can't make FILES id for %d\n", pid); return -1; } new = 0; ids->sighand_id = kid_generate_gen(&sighand_tree, &elem, &new); if (!ids->sighand_id || !new) { pr_err("Can't make IO id for %d\n", pid); return -1; } return 0; } int get_task_ids(struct pstree_item *item) { int ret; item->ids = xmalloc(sizeof(*item->ids)); if (!item->ids) goto err; task_kobj_ids_entry__init(item->ids); if (item->pid->state != TASK_DEAD) { ret = dump_task_kobj_ids(item); if (ret) goto err_free; ret = 
dump_task_ns_ids(item); if (ret) goto err_free; } return 0; err_free: xfree(item->ids); item->ids = NULL; err: return -1; } static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_imgset) { return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS); } int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti) { int ret; ThreadCoreEntry *tc = core->thread_core; ret = collect_lsm_profile(pid, tc->creds); if (!ret) { /* * XXX: It's possible to set two: 32-bit and 64-bit * futex list's heads. That makes about no sense, but * it's possible. Until we meet such application, dump * only one: native or compat futex's list pointer. */ if (!core_is_compat(core)) ret = get_task_futex_robust_list(pid, tc); else ret = get_task_futex_robust_list_compat(pid, tc); } if (!ret) ret = dump_sched_info(pid, tc); if (!ret) { core_put_tls(core, ti->tls); CORE_THREAD_ARCH_INFO(core)->clear_tid_addr = encode_pointer(ti->tid_addr); BUG_ON(!tc->sas); copy_sas(tc->sas, &ti->sas); if (ti->pdeath_sig) { tc->has_pdeath_sig = true; tc->pdeath_sig = ti->pdeath_sig; } } return ret; } static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item, const struct proc_pid_stat *stat, const struct cr_imgset *cr_imgset) { struct cr_img *img; CoreEntry *core = item->core[0]; pid_t pid = item->pid->real; int ret = -1; struct proc_status_creds *creds; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); pr_info("\n"); pr_info("Dumping core (pid: %d)\n", pid); pr_info("----------------------------------------\n"); ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; creds = dmpi(item)->pi_creds; if (creds->s.seccomp_mode != SECCOMP_MODE_DISABLED) { pr_info("got seccomp mode %d for %d\n", creds->s.seccomp_mode, vpid(item)); core->tc->has_seccomp_mode = true; core->tc->seccomp_mode = creds->s.seccomp_mode; if (creds->s.seccomp_mode == 
SECCOMP_MODE_FILTER) { core->tc->has_seccomp_filter = true; core->tc->seccomp_filter = creds->last_filter; } } strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); core->tc->flags = stat->flags; core->tc->task_state = item->pid->state; core->tc->exit_code = 0; ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; ret = dump_pid_misc(pid, core->tc); if (ret) goto err; ret = dump_task_rlimits(pid, core->tc->rlimits); if (ret) goto err; /* For now, we only need to dump the root task's cgroup ns, because we * know all the tasks are in the same cgroup namespace because we don't * allow nesting. */ if (item->ids->has_cgroup_ns_id && !item->parent) { info = &cgroup_args; ret = parasite_dump_cgroup(ctl, &cgroup_args); if (ret) goto err; } core->tc->has_cg_set = true; ret = dump_task_cgroup(item, &core->tc->cg_set, info); if (ret) goto err; img = img_from_set(cr_imgset, CR_FD_CORE); ret = pb_write_one(img, core, PB_CORE); if (ret < 0) goto err; err: pr_info("----------------------------------------\n"); return ret; } static int collect_pstree_ids_predump(void) { struct pstree_item *item; struct pid pid; struct { struct pstree_item i; struct dmp_info d; } crt = { .i.pid = &pid, }; /* * This thing is normally done inside * write_img_inventory(). 
*/ crt.i.pid->state = TASK_ALIVE; crt.i.pid->real = getpid(); if (predump_task_ns_ids(&crt.i)) return -1; for_each_pstree_item(item) { if (item->pid->state == TASK_DEAD) continue; if (predump_task_ns_ids(item)) return -1; } return 0; } int collect_pstree_ids(void) { struct pstree_item *item; for_each_pstree_item(item) if (get_task_ids(item)) return -1; return 0; } static int collect_file_locks(void) { return parse_file_locks(); } static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstree_item *item, int id) { struct pid *tid = &item->threads[id]; CoreEntry *core = item->core[id]; pid_t pid = tid->real; int ret = -1; struct cr_img *img; pr_info("\n"); pr_info("Dumping core for thread (pid: %d)\n", pid); pr_info("----------------------------------------\n"); ret = parasite_dump_thread_seized(parasite_ctl, id, tid, core); if (ret) { pr_err("Can't dump thread for pid %d\n", pid); goto err; } pstree_insert_pid(tid); img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt); if (!img) goto err; ret = pb_write_one(img, core, PB_CORE); close_image(img); err: pr_info("----------------------------------------\n"); return ret; } static int dump_one_zombie(const struct pstree_item *item, const struct proc_pid_stat *pps) { CoreEntry *core; int ret = -1; struct cr_img *img; core = core_entry_alloc(0, 1); if (!core) return -1; strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); core->tc->task_state = TASK_DEAD; core->tc->exit_code = pps->exit_code; img = open_image(CR_FD_CORE, O_DUMP, vpid(item)); if (!img) goto err; ret = pb_write_one(img, core, PB_CORE); close_image(img); err: core_entry_free(core); return ret; } #define SI_BATCH 32 static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group) { struct ptrace_peeksiginfo_args arg; int ret; SignalQueueEntry *queue = NULL; pr_debug("Dump %s signals of %d\n", group ? 
"shared" : "private", tid); arg.nr = SI_BATCH; arg.flags = 0; if (group) arg.flags |= PTRACE_PEEKSIGINFO_SHARED; arg.off = 0; queue = xmalloc(sizeof(*queue)); if (!queue) return -1; signal_queue_entry__init(queue); while (1) { int nr, si_pos; siginfo_t *si; si = xmalloc(SI_BATCH * sizeof(*si)); if (!si) { ret = -1; break; } nr = ret = ptrace(PTRACE_PEEKSIGINFO, tid, &arg, si); if (ret == 0) break; /* Finished */ if (ret < 0) { if (errno == EIO) { pr_warn("ptrace doesn't support PTRACE_PEEKSIGINFO\n"); ret = 0; } else pr_perror("ptrace"); break; } queue->n_signals += nr; queue->signals = xrealloc(queue->signals, sizeof(*queue->signals) * queue->n_signals); if (!queue->signals) { ret = -1; break; } for (si_pos = queue->n_signals - nr; si_pos < queue->n_signals; si_pos++) { SiginfoEntry *se; se = xmalloc(sizeof(*se)); if (!se) { ret = -1; break; } siginfo_entry__init(se); se->siginfo.len = sizeof(siginfo_t); se->siginfo.data = (void *)si++; /* XXX we don't free cores, but when * we will, this would cause problems */ queue->signals[si_pos] = se; } if (ret < 0) break; arg.off += nr; } *sqe = queue; return ret; } static int dump_task_signals(pid_t pid, struct pstree_item *item) { int i, ret; /* Dump private signals for each thread */ for (i = 0; i < item->nr_threads; i++) { ret = dump_signal_queue(item->threads[i].real, &item->core[i]->thread_core->signals_p, false); if (ret) { pr_err("Can't dump private signals for thread %d\n", item->threads[i].real); return -1; } } /* Dump shared signals */ ret = dump_signal_queue(pid, &item->core[0]->tc->signals_s, true); if (ret) { pr_err("Can't dump shared signals (pid: %d)\n", pid); return -1; } return 0; } static struct proc_pid_stat pps_buf; static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) { int i; for (i = 0; i < item->nr_threads; i++) { /* Leader is already dumped */ if (item->pid->real == item->threads[i].real) { item->threads[i].ns[0].virt = vpid(item); continue; } if 
(dump_task_thread(parasite_ctl, item, i)) return -1; } return 0; } /* * What this routine does is just reads pid-s of dead * tasks in item's children list from item's ns proc. * * It does *not* find wihch real pid corresponds to * which virtual one, but it's not required -- all we * need to dump for zombie can be found in the same * ns proc. */ static int fill_zombies_pids(struct pstree_item *item) { struct pstree_item *child; int i, nr; pid_t *ch; /* * Pids read here are virtual -- caller has set up * the proc of target pid namespace. */ if (parse_children(vpid(item), &ch, &nr) < 0) return -1; /* * Step 1 -- filter our ch's pid of alive tasks */ list_for_each_entry(child, &item->children, sibling) { if (vpid(child) < 0) continue; for (i = 0; i < nr; i++) { if (ch[i] == vpid(child)) { ch[i] = -1; break; } } } /* * Step 2 -- assign remaining pids from ch on * children's items in arbitrary order. The caller * will then re-read everything needed to dump * zombies using newly obtained virtual pids. */ i = 0; list_for_each_entry(child, &item->children, sibling) { if (vpid(child) > 0) continue; for (; i < nr; i++) { if (ch[i] < 0) continue; child->pid->ns[0].virt = ch[i]; ch[i] = -1; break; } BUG_ON(i == nr); } xfree(ch); return 0; } static int dump_zombies(void) { struct pstree_item *item; int ret = -1; int pidns = root_ns_mask & CLONE_NEWPID; if (pidns && set_proc_fd(get_service_fd(CR_PROC_FD_OFF))) return -1; /* * We dump zombies separately becase for pid-ns case * we'd have to resolve their pids w/o parasite via * target ns' proc. */ for_each_pstree_item(item) { if (item->pid->state != TASK_DEAD) continue; if (vpid(item) < 0) { if (!pidns) item->pid->ns[0].virt = item->pid->real; else if (root_item == item) { pr_err("A root task is dead\n"); goto err; } else if (fill_zombies_pids(item->parent)) goto err; } pr_info("Obtaining zombie stat ... 
\n"); if (parse_pid_stat(vpid(item), &pps_buf) < 0) goto err; item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; BUG_ON(!list_empty(&item->children)); if (dump_one_zombie(item, &pps_buf) < 0) goto err; } ret = 0; err: if (pidns) close_proc(); return ret; } static int pre_dump_one_task(struct pstree_item *item) { pid_t pid = item->pid->real; struct vm_area_list vmas; struct parasite_ctl *parasite_ctl; int ret = -1; struct parasite_dump_misc misc; struct mem_dump_ctl mdc; INIT_LIST_HEAD(&vmas.h); vmas.nr = 0; pr_info("========================================\n"); pr_info("Pre-dumping task (pid: %d)\n", pid); pr_info("========================================\n"); if (item->pid->state == TASK_STOPPED) { pr_warn("Stopped tasks are not supported\n"); return 0; } if (item->pid->state == TASK_DEAD) return 0; ret = collect_mappings(pid, &vmas, NULL); if (ret) { pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret); goto err; } ret = -1; parasite_ctl = parasite_infect_seized(pid, item, &vmas); if (!parasite_ctl) { pr_err("Can't infect (pid: %d) with parasite\n", pid); goto err_free; } ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); if (ret) { pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); goto err_cure; } ret = parasite_dump_misc_seized(parasite_ctl, &misc); if (ret) { pr_err("Can't dump misc (pid: %d)\n", pid); goto err_cure; } ret = predump_task_files(pid); if (ret) { pr_err("Pre-dumping files failed (pid: %d)\n", pid); goto err_cure; } item->pid->ns[0].virt = misc.pid; mdc.pre_dump = true; mdc.lazy = false; ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); if (ret) goto err_cure; if (compel_cure_remote(parasite_ctl)) pr_err("Can't cure (pid: %d) from parasite\n", pid); err_free: free_mappings(&vmas); err: return ret; err_cure: if (compel_cure(parasite_ctl)) pr_err("Can't cure (pid: %d) from parasite\n", pid); goto err_free; } static int dump_one_task(struct pstree_item *item) { pid_t pid = item->pid->real; struct vm_area_list vmas; 
struct parasite_ctl *parasite_ctl; int ret, exit_code = -1; struct parasite_dump_misc misc; struct cr_imgset *cr_imgset = NULL; struct parasite_drain_fd *dfds = NULL; struct proc_posix_timers_stat proc_args; struct mem_dump_ctl mdc; INIT_LIST_HEAD(&vmas.h); vmas.nr = 0; pr_info("========================================\n"); pr_info("Dumping task (pid: %d)\n", pid); pr_info("========================================\n"); if (item->pid->state == TASK_DEAD) /* * zombies are dumped separately in dump_zombies() */ return 0; pr_info("Obtaining task stat ... \n"); ret = parse_pid_stat(pid, &pps_buf); if (ret < 0) goto err; ret = collect_mappings(pid, &vmas, dump_filemap); if (ret) { pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret); goto err; } if (!shared_fdtable(item)) { dfds = xmalloc(sizeof(*dfds)); if (!dfds) goto err; ret = collect_fds(pid, &dfds); if (ret) { pr_err("Collect fds (pid: %d) failed with %d\n", pid, ret); goto err; } parasite_ensure_args_size(drain_fds_size(dfds)); } ret = parse_posix_timers(pid, &proc_args); if (ret < 0) { pr_err("Can't read posix timers file (pid: %d)\n", pid); goto err; } parasite_ensure_args_size(posix_timers_dump_size(proc_args.timer_n)); ret = dump_task_signals(pid, item); if (ret) { pr_err("Dump %d signals failed %d\n", pid, ret); goto err; } parasite_ctl = parasite_infect_seized(pid, item, &vmas); if (!parasite_ctl) { pr_err("Can't infect (pid: %d) with parasite\n", pid); goto err; } if (fault_injected(FI_DUMP_EARLY)) { pr_info("fault: CRIU sudden detach\n"); kill(getpid(), SIGKILL); } if (root_ns_mask & CLONE_NEWPID && root_item == item) { int pfd; pfd = parasite_get_proc_fd_seized(parasite_ctl); if (pfd < 0) { pr_err("Can't get proc fd (pid: %d)\n", pid); goto err_cure_imgset; } if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0) goto err_cure_imgset; close(pfd); } ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); if (ret) { pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); goto err_cure_imgset; } ret = 
parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ if (ret) { pr_err("Failed to check aio rings (pid: %d)\n", pid); goto err_cure_imgset; } ret = parasite_dump_misc_seized(parasite_ctl, &misc); if (ret) { pr_err("Can't dump misc (pid: %d)\n", pid); goto err_cure_imgset; } item->pid->ns[0].virt = misc.pid; pstree_insert_pid(item->pid); item->sid = misc.sid; item->pgid = misc.pgid; pr_info("sid=%d pgid=%d pid=%d\n", item->sid, item->pgid, vpid(item)); if (item->sid == 0) { pr_err("A session leader of %d(%d) is outside of its pid namespace\n", item->pid->real, vpid(item)); goto err_cure; } cr_imgset = cr_task_imgset_open(vpid(item), O_DUMP); if (!cr_imgset) goto err_cure; ret = dump_task_ids(item, cr_imgset); if (ret) { pr_err("Dump ids (pid: %d) failed with %d\n", pid, ret); goto err_cure; } if (dfds) { ret = dump_task_files_seized(parasite_ctl, item, dfds); if (ret) { pr_err("Dump files (pid: %d) failed with %d\n", pid, ret); goto err_cure; } } mdc.pre_dump = false; mdc.lazy = opts.lazy_pages; ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); if (ret) goto err_cure; ret = parasite_dump_sigacts_seized(parasite_ctl, item); if (ret) { pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid); goto err_cure; } ret = parasite_dump_itimers_seized(parasite_ctl, item); if (ret) { pr_err("Can't dump itimers (pid: %d)\n", pid); goto err_cure; } ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item); if (ret) { pr_err("Can't dump posix timers (pid: %d)\n", pid); goto err_cure; } ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset); if (ret) { pr_err("Dump core (pid: %d) failed with %d\n", pid, ret); goto err_cure; } ret = compel_stop_daemon(parasite_ctl); if (ret) { pr_err("Can't cure (pid: %d) from parasite\n", pid); goto err; } ret = dump_task_threads(parasite_ctl, item); if (ret) { pr_err("Can't dump threads\n"); goto err; } if (opts.lazy_pages) ret = compel_cure_remote(parasite_ctl); else ret = 
compel_cure(parasite_ctl); if (ret) { pr_err("Can't cure (pid: %d) from parasite\n", pid); goto err; } ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset); if (ret) { pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret); goto err; } ret = dump_task_fs(pid, &misc, cr_imgset); if (ret) { pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret); goto err; } close_cr_imgset(&cr_imgset); exit_code = 0; err: close_pid_proc(); free_mappings(&vmas); xfree(dfds); return exit_code; err_cure: close_cr_imgset(&cr_imgset); err_cure_imgset: compel_cure(parasite_ctl); goto err; } static int alarm_attempts = 0; bool alarm_timeouted() { return alarm_attempts > 0; } static void alarm_handler(int signo) { pr_err("Timeout reached. Try to interrupt: %d\n", alarm_attempts); if (alarm_attempts++ < 5) { alarm(1); /* A curren syscall will be exited with EINTR */ return; } pr_err("FATAL: Unable to interrupt the current operation\n"); BUG(); } static int setup_alarm_handler() { struct sigaction sa = { .sa_handler = alarm_handler, .sa_flags = 0, /* Don't restart syscalls */ }; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGALRM); if (sigaction(SIGALRM, &sa, NULL)) { pr_perror("Unable to setup SIGALRM handler"); return -1; } return 0; } static int cr_pre_dump_finish(int ret) { struct pstree_item *item; /* * Restore registers for tasks only. The threads have not been * infected. Therefore, the thread register sets have not been changed. 
*/ if (arch_set_thread_regs(root_item, false) < 0) goto err; pstree_switch_state(root_item, TASK_ALIVE); timing_stop(TIME_FROZEN); if (ret < 0) goto err; pr_info("Pre-dumping tasks' memory\n"); for_each_pstree_item(item) { struct parasite_ctl *ctl = dmpi(item)->parasite_ctl; struct page_pipe *mem_pp; struct page_xfer xfer; if (!ctl) continue; pr_info("\tPre-dumping %d\n", vpid(item)); timing_start(TIME_MEMWRITE); ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item)); if (ret < 0) goto err; mem_pp = dmpi(item)->mem_pp; ret = page_xfer_dump_pages(&xfer, mem_pp); xfer.close(&xfer); if (ret) goto err; timing_stop(TIME_MEMWRITE); destroy_page_pipe(mem_pp); compel_cure_local(ctl); } free_pstree(root_item); if (irmap_predump_run()) { ret = -1; goto err; } err: if (disconnect_from_page_server()) ret = -1; if (bfd_flush_images()) ret = -1; if (ret) pr_err("Pre-dumping FAILED.\n"); else { write_stats(DUMP_STATS); pr_info("Pre-dumping finished successfully\n"); } return ret; } int cr_pre_dump_tasks(pid_t pid) { struct pstree_item *item; int ret = -1; root_item = alloc_pstree_item(); if (!root_item) goto err; root_item->pid->real = pid; if (!opts.track_mem) { pr_info("Enforcing memory tracking for pre-dump.\n"); opts.track_mem = true; } if (opts.final_state == TASK_DEAD) { pr_info("Enforcing tasks run after pre-dump.\n"); opts.final_state = TASK_ALIVE; } if (init_stats(DUMP_STATS)) goto err; if (cr_plugin_init(CR_PLUGIN_STAGE__PRE_DUMP)) goto err; if (lsm_check_opts()) goto err; if (irmap_load_cache()) goto err; if (cpu_init()) goto err; if (vdso_init_dump()) goto err; if (connect_to_page_server_to_send() < 0) goto err; if (setup_alarm_handler()) goto err; if (collect_pstree()) goto err; if (collect_pstree_ids_predump()) goto err; if (collect_namespaces(false) < 0) goto err; for_each_pstree_item(item) if (pre_dump_one_task(item)) goto err; ret = cr_dump_shmem(); if (ret) goto err; if (irmap_predump_prep()) goto err; ret = 0; err: return cr_pre_dump_finish(ret); } static int 
cr_lazy_mem_dump(void) { struct pstree_item *item; int ret = 0; pr_info("Starting lazy pages server\n"); ret = cr_page_server(false, true, -1); for_each_pstree_item(item) { destroy_page_pipe(dmpi(item)->mem_pp); compel_cure_local(dmpi(item)->parasite_ctl); } if (ret) pr_err("Lazy pages transfer FAILED.\n"); else pr_info("Lazy pages transfer finished successfully\n"); return ret; } static int cr_dump_finish(int ret) { int post_dump_ret = 0; if (disconnect_from_page_server()) ret = -1; close_cr_imgset(&glob_imgset); if (bfd_flush_images()) ret = -1; cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { /* * It might be a migration case, where we're asked * to dump everything, then some script transfer * image on a new node and we're supposed to kill * dumpee because it continue running somewhere * else. * * Thus ask user via script if we're to break * checkpoint. */ post_dump_ret = run_scripts(ACT_POST_DUMP); if (post_dump_ret) { post_dump_ret = WEXITSTATUS(post_dump_ret); pr_info("Post dump script passed with %d\n", post_dump_ret); } } /* * Dump is complete at this stage. To choose what * to do next we need to consider the following * scenarios * * - error happened during checkpoint: just clean up * everything and continue execution of the dumpee; * * - dump successed but post-dump script returned * some ret code: same as in previous scenario -- * just clean up everything and continue execution, * we will return script ret code back to criu caller * and it's up to a caller what to do with running instance * of the dumpee -- either kill it, or continue running; * * - dump successed but -R option passed, pointing that * we're asked to continue execution of the dumpee. It's * assumed that a user will use post-dump script to keep * consistency of the FS and other resources, we simply * start rollback procedure and cleanup everyhting. 
*/
	if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) {
		network_unlock();
		delete_link_remaps();
		clean_cr_time_mounts();
	}

	/* Lazy pages are served only after an otherwise successful dump */
	if (!ret && opts.lazy_pages)
		ret = cr_lazy_mem_dump();

	/*
	 * NOTE(review): this early return skips the state switch and all of
	 * the free_*() cleanup below -- confirm that this is intended.
	 */
	if (arch_set_thread_regs(root_item, true) < 0)
		return -1;

	/* Any failure, or a non-zero script status, keeps the tasks alive */
	pstree_switch_state(root_item,
			    (ret || post_dump_ret) ?
			    TASK_ALIVE : opts.final_state);
	timing_stop(TIME_FROZEN);
	free_pstree(root_item);
	free_file_locks();
	free_link_remaps();
	free_aufs_branches();
	free_userns_maps();

	close_service_fd(CR_PROC_FD_OFF);

	if (ret) {
		pr_err("Dumping FAILED.\n");
	} else {
		write_stats(DUMP_STATS);
		pr_info("Dumping finished successfully\n");
	}

	/* A non-zero script status wins; otherwise 0 on success, 1 on failure */
	return post_dump_ret ? : (ret != 0);
}

/*
 * Entry point of the dump action for the process tree rooted at @pid.
 *
 * Runs the pre-dump scripts, initializes the subsystems used while dumping,
 * collects the tree and then dumps every item. Every exit path goes through
 * cr_dump_finish(), which receives the accumulated status and produces the
 * value returned to the caller.
 */
int cr_dump_tasks(pid_t pid)
{
	InventoryEntry he = INVENTORY_ENTRY__INIT;
	struct pstree_item *item;
	int pre_dump_ret = 0;
	int ret = -1;

	pr_info("========================================\n");
	pr_info("Dumping processes (pid: %d)\n", pid);
	pr_info("========================================\n");

	root_item = alloc_pstree_item();
	if (!root_item)
		goto err;
	root_item->pid->real = pid;

	pre_dump_ret = run_scripts(ACT_PRE_DUMP);
	if (pre_dump_ret != 0) {
		pr_err("Pre dump script failed with %d!\n", pre_dump_ret);
		goto err;
	}
	if (init_stats(DUMP_STATS))
		goto err;

	if (cr_plugin_init(CR_PLUGIN_STAGE__DUMP))
		goto err;

	if (lsm_check_opts())
		goto err;

	if (irmap_load_cache())
		goto err;

	if (cpu_init())
		goto err;

	if (vdso_init_dump())
		goto err;

	if (cgp_init(opts.cgroup_props,
		     opts.cgroup_props ?
		     strlen(opts.cgroup_props) : 0,
		     opts.cgroup_props_file))
		goto err;

	if (parse_cg_info())
		goto err;

	if (prepare_inventory(&he))
		goto err;

	if (opts.cpu_cap & (CPU_CAP_CPU | CPU_CAP_INS)) {
		if (cpu_dump_cpuinfo())
			goto err;
	}

	if (connect_to_page_server_to_send() < 0)
		goto err;

	if (setup_alarm_handler())
		goto err;

	/*
	 * The collect_pstree will also stop (PTRACE_SEIZE) the tasks
	 * thus ensuring that they don't modify anything we collect
	 * afterwards.
*/
	if (collect_pstree())
		goto err;

	if (collect_pstree_ids())
		goto err;

	if (network_lock())
		goto err;

	if (collect_file_locks())
		goto err;

	if (collect_namespaces(true) < 0)
		goto err;

	/* Global image set, opened for dumping before any task is dumped */
	glob_imgset = cr_glob_imgset_open(O_DUMP);
	if (!glob_imgset)
		goto err;

	if (collect_seccomp_filters() < 0)
		goto err;

	/* Dump the tasks one by one; the first failure aborts the dump */
	for_each_pstree_item(item) {
		if (dump_one_task(item))
			goto err;
	}

	/*
	 * It may happen that a process has completed but its files in
	 * /proc/PID/ are still open by another process. If the PID has been
	 * given to some newer thread since then, we may be unable to dump
	 * all this.
	 */
	if (dead_pid_conflict())
		goto err;

	/* MNT namespaces are dumped after files to save remapped links */
	if (dump_mnt_namespaces() < 0)
		goto err;

	if (dump_file_locks())
		goto err;

	if (dump_verify_tty_sids())
		goto err;

	if (dump_zombies())
		goto err;

	if (dump_pstree(root_item))
		goto err;

	/*
	 * TODO: cr_dump_shmem has to be called before dump_namespaces(),
	 * because page_ids is a global variable and it is used to dump
	 * ipc shared memory, but an ipc namespace is dumped in a child
	 * process.
*/ ret = cr_dump_shmem(); if (ret) goto err; if (root_ns_mask) if (dump_namespaces(root_item, root_ns_mask) < 0) goto err; ret = dump_cgroups(); if (ret) goto err; ret = fix_external_unix_sockets(); if (ret) goto err; ret = tty_post_actions(); if (ret) goto err; ret = write_img_inventory(&he); if (ret) goto err; err: return cr_dump_finish(ret); } criu-3.6/criu/cr-errno.c000066400000000000000000000002151317335042600151630ustar00rootroot00000000000000static int cr_errno; int get_cr_errno(void) { return cr_errno; } void set_cr_errno(int new_err) { if (!cr_errno) cr_errno = new_err; } criu-3.6/criu/cr-restore.c000066400000000000000000002262441317335042600155350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include #include "common/compiler.h" #include "clone-noasan.h" #include "cr_options.h" #include "servicefd.h" #include "image.h" #include "util.h" #include "util-pie.h" #include "criu-log.h" #include "restorer.h" #include "sockets.h" #include "sk-packet.h" #include "common/lock.h" #include "files.h" #include "files-reg.h" #include "pipes.h" #include "fifo.h" #include "sk-inet.h" #include "eventfd.h" #include "eventpoll.h" #include "signalfd.h" #include "proc_parse.h" #include "pie/restorer-blob.h" #include "crtools.h" #include "uffd.h" #include "namespaces.h" #include "mem.h" #include "mount.h" #include "fsnotify.h" #include "pstree.h" #include "net.h" #include "tty.h" #include "cpu.h" #include "file-lock.h" #include "vdso.h" #include "stats.h" #include "tun.h" #include "vma.h" #include "kerndat.h" #include "rst-malloc.h" #include "plugin.h" #include "cgroup.h" #include "timerfd.h" #include "file-lock.h" #include "action-scripts.h" #include "shmem.h" #include #include "aio.h" #include "lsm.h" #include "seccomp.h" #include "fault-injection.h" #include "sk-queue.h" #include 
"sigframe.h"
#include "fdstore.h"
#include "parasite-syscall.h"
#include "files-reg.h"
/* NOTE(review): the header name of the next directive is missing here -- restore it from the upstream file */
#include
#include "compel/include/asm/syscall.h"

#include "protobuf.h"
#include "images/sa.pb-c.h"
#include "images/timer.pb-c.h"
#include "images/vma.pb-c.h"
#include "images/rlimit.pb-c.h"
#include "images/pagemap.pb-c.h"
#include "images/siginfo.pb-c.h"

#include "restore.h"

#include "cr-errno.h"

#include "pie/pie-relocs.h"

/*
 * Default names of the restorer blob entry points; each one is used only
 * when no definition of the arch_export_* name is already in effect.
 */
#ifndef arch_export_restore_thread
#define arch_export_restore_thread __export_restore_thread
#endif

#ifndef arch_export_restore_task
#define arch_export_restore_task __export_restore_task
#endif

#ifndef arch_export_unmap
#define arch_export_unmap __export_unmap
#define arch_export_unmap_compat __export_unmap_compat
#endif

/* The pstree item being restored by the running process */
struct pstree_item *current;

/* Forward declarations of helpers defined further down in this file */
static int restore_task_with_children(void *);
static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core);
static int prepare_restorer_blob(void);
static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core);
static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core);
static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core);

/*
 * Architectures can overwrite this function to restore registers that are not
 * present in the sigreturn signal frame.
*/
int __attribute__((weak)) arch_set_thread_regs_nosigrt(struct pid *pid)
{
	return 0;
}

/*
 * Number of participants expected to pass @next_stage; this is the value
 * loaded into task_entries->nr_in_progress when switching to that stage.
 * An unknown stage hits BUG().
 */
static inline int stage_participants(int next_stage)
{
	switch (next_stage) {
	case CR_STATE_FAIL:
		return 0;
	case CR_STATE_ROOT_TASK:
	case CR_STATE_PREPARE_NAMESPACES:
		return 1;
	case CR_STATE_FORKING:
		return task_entries->nr_tasks + task_entries->nr_helpers;
	case CR_STATE_RESTORE:
		return task_entries->nr_threads + task_entries->nr_helpers;
	case CR_STATE_RESTORE_SIGCHLD:
		return task_entries->nr_threads;
	case CR_STATE_RESTORE_CREDS:
		return task_entries->nr_threads;
	}

	BUG();
	return -1;
}

/*
 * How many of the participants of @next_stage belong to the current task
 * itself. Only CR_STATE_FORKING and CR_STATE_RESTORE are handled; any other
 * stage hits BUG().
 */
static inline int stage_current_participants(int next_stage)
{
	switch (next_stage) {
	case CR_STATE_FORKING:
		return 1;
	case CR_STATE_RESTORE:
		/*
		 * Each thread has to be reported about this stage,
		 * so if we want to wait for all other tasks, we have to
		 * exclude all threads of the current process.
		 * It is supposed that we will wait for other tasks
		 * before creating threads of the current task.
		 */
		return current->nr_threads;
	}

	BUG();
	return -1;
}

/*
 * Sleep while the in-progress counter is greater than @participants.
 * A negative counter afterwards is treated as failure: the task error is
 * copied into cr_errno and the negative value is returned. Otherwise 0.
 */
static int __restore_wait_inprogress_tasks(int participants)
{
	int ret;
	futex_t *np = &task_entries->nr_in_progress;

	futex_wait_while_gt(np, participants);
	ret = (int)futex_get(np);
	if (ret < 0) {
		set_cr_errno(get_task_cr_err());
		return ret;
	}

	return 0;
}

/* Wait until nobody is left in progress on the current stage */
static int restore_wait_inprogress_tasks()
{
	return __restore_wait_inprogress_tasks(0);
}

/* Wait all tasks except the current one */
static int restore_wait_other_tasks()
{
	int participants, stage;

	stage = futex_get(&task_entries->start);
	participants = stage_current_participants(stage);

	return __restore_wait_inprogress_tasks(participants);
}

/* Switch to @next_stage without waking anybody ("nw") */
static inline void __restore_switch_stage_nw(int next_stage)
{
	futex_set(&task_entries->nr_in_progress,
		  stage_participants(next_stage));
	futex_set(&task_entries->start, next_stage);
}

/*
 * Switch to @next_stage and wake the waiters on task_entries->start.
 * The in-progress counter is left untouched for CR_STATE_COMPLETE.
 */
static inline void __restore_switch_stage(int next_stage)
{
	if (next_stage != CR_STATE_COMPLETE)
		futex_set(&task_entries->nr_in_progress,
			  stage_participants(next_stage));
	futex_set_and_wake(&task_entries->start, next_stage);
}

/* Switch to @next_stage, then wait until all its participants are done */
static int restore_switch_stage(int next_stage)
{
	__restore_switch_stage(next_stage);
	return restore_wait_inprogress_tasks();
}

/*
 * Leave the @from stage. With a non-zero root_ns_mask the stage is finished
 * through restore_finish_stage(); otherwise the switch to @to is made
 * directly and without a wake-up.
 */
static int restore_finish_ns_stage(int from, int to)
{
	if (root_ns_mask)
		return restore_finish_stage(task_entries, from);

	/* Nobody waits for this stage change, just go ahead */
	__restore_switch_stage_nw(to);
	return 0;
}

/* Runs the preparation steps below in order; -1 on the first failure */
static int crtools_prepare_shared(void)
{
	if (prepare_files())
		return -1;

	/* We might want to remove ghost files on failed restore */
	if (collect_remaps_and_regfiles())
		return -1;

	/* Connections are unlocked from criu */
	if (!files_collected() && collect_image(&inet_sk_cinfo))
		return -1;

	if (collect_binfmt_misc())
		return -1;

	if (tty_prep_fds())
		return -1;

	if (prepare_cgroup())
		return -1;

	return 0;
}

/*
 * Collect order information:
 * - reg_file should be before remap, as the latter needs
 *   to find file_desc objects
 * - per-pid collects (mm and fd) should be after remap and
 *   reg_file since both per-pid ones need to get fdesc-s
 *   and bump counters on remaps if they exist
 */

static struct collect_image_info *cinfos[] = {
	&file_locks_cinfo,
	&pipe_data_cinfo,
	&fifo_data_cinfo,
	&sk_queues_cinfo,
};

/* Collected by root_prepare_shared() only when files_collected() is false */
static struct collect_image_info *cinfos_files[] = {
	&unix_sk_cinfo,
	&fifo_cinfo,
	&pipe_cinfo,
	&nsfile_cinfo,
	&packet_sk_cinfo,
	&netlink_sk_cinfo,
	&eventfd_cinfo,
	&epoll_cinfo,
	&epoll_tfd_cinfo,
	&signalfd_cinfo,
	&tunfile_cinfo,
	&timerfd_cinfo,
	&inotify_cinfo,
	&inotify_mark_cinfo,
	&fanotify_cinfo,
	&fanotify_mark_cinfo,
	&ext_file_cinfo,
};

/* These images are required to restore namespaces */
static struct collect_image_info *before_ns_cinfos[] = {
	&tty_info_cinfo, /* Restore devpts content */
	&tty_cdata,
};

/* Singly linked list of post-prepare callbacks, newest first */
static struct pprep_head *post_prepare_heads = NULL;

/* Push @ph onto the head of the post-prepare callback list */
void add_post_prepare_cb(struct pprep_head *ph)
{
	ph->next = post_prepare_heads;
	post_prepare_heads = ph;
}

/* Invoke every registered callback; stop with -1 at the first non-zero result */
static int run_post_prepare(void)
{
	struct pprep_head *ph;

	for (ph = post_prepare_heads; ph != NULL; ph = ph->next)
		if (ph->actor(ph))
			return -1;

	return 0;
}

/*
 * Collects the shared images and prepares the per-task mm, fd and fs
 * information for every non-helper item of the tree, then prepares the
 * restorer blob and SCMs and finally runs the post-prepare callbacks.
 * Returns 0 on success and a non-zero value on failure.
 */
static int root_prepare_shared(void)
{
	int ret = 0;
	struct pstree_item *pi;

	pr_info("Preparing info about shared resources\n");

	if (prepare_remaps())
		return -1;

	if (prepare_seccomp_filters())
		return -1;

	if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
		return -1;

	if (!files_collected() &&
			collect_images(cinfos_files, ARRAY_SIZE(cinfos_files)))
		return -1;

	for_each_pstree_item(pi) {
		if (pi->pid->state == TASK_HELPER)
			continue;

		ret = prepare_mm_pid(pi);
		if (ret < 0)
			break;

		ret = prepare_fd_pid(pi);
		if (ret < 0)
			break;

		ret = prepare_fs_pid(pi);
		if (ret < 0)
			break;
	}

	if (ret < 0)
		goto err;

	prepare_cow_vmas();

	ret = prepare_restorer_blob();
	if (ret)
		goto err;

	/*
	 * This should be called with all packets collected AND all
	 * fdescs and fles prepared BUT post-prep-s not run.
	 */
	ret = prepare_scms();
	if (ret)
		goto err;

	ret = run_post_prepare();
	if (ret)
		goto err;

	show_saved_files();
err:
	return ret;
}

/* SIGCHLD action read from the image; stored here instead of being installed */
static rt_sigaction_t sigchld_act;
/*
 * If parent's sigaction has blocked SIGKILL (which is nonsense),
 * this parent action is non-valid and shouldn't be inherited.
 * Used to mark parent_act* no more valid.
 */
static rt_sigaction_t parent_act[SIGMAX];
#ifdef CONFIG_COMPAT
static rt_sigaction_t_compat parent_act_compat[SIGMAX];
#endif

/*
 * True when the action recorded in parent_act[@sig] is valid and equal to
 * @sa (mask words, handler, flags and restorer). @sig indexes parent_act[]
 * directly, so callers pass the signal number minus one.
 */
static bool sa_inherited(int sig, rt_sigaction_t *sa)
{
	rt_sigaction_t *pa;
	int i;

	if (current == root_item)
		return false; /* XXX -- inherit from CRIU?
*/

	pa = &parent_act[sig];

	/* Omitting non-valid sigaction */
	if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL))
		return false;

	for (i = 0; i < _KNSIG_WORDS; i++)
		if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i])
			return false;

	return pa->rt_sa_handler == sa->rt_sa_handler &&
		pa->rt_sa_flags == sa->rt_sa_flags &&
		pa->rt_sa_restorer == sa->rt_sa_restorer;
}

/*
 * Install the native sigaction described by @e for signal @sig.
 *
 * Returns 0 for SIGCHLD (the action is only remembered in sigchld_act),
 * 1 when the action was installed or is already inherited, and a negative
 * value when the rt_sigaction syscall fails.
 */
static int restore_native_sigaction(int sig, SaEntry *e)
{
	rt_sigaction_t act;
	int ret;

	ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction));
	ASSIGN_TYPED(act.rt_sa_flags, e->flags);
	ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer));
	BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig));
	memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig));

	if (sig == SIGCHLD) {
		sigchld_act = act;
		return 0;
	}

	if (sa_inherited(sig - 1, &act))
		return 1;

	/*
	 * A pure syscall is used, because glibc
	 * sigaction overwrites se_restorer.
	 */
	ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t));
	if (ret < 0) {
		pr_perror("Can't restore sigaction");
		return ret;
	}

	parent_act[sig - 1] = act;
	/* Mark SIGKILL blocked which makes compat sigaction non-valid */
#ifdef CONFIG_COMPAT
	parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL;
#endif

	return 1;
}

/* Stack for compat syscalls; allocated on first use, freed in prepare_sigactions() */
static void *stack32;

#ifdef CONFIG_COMPAT
/* Compat counterpart of sa_inherited(), working on parent_act_compat[] */
static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa)
{
	rt_sigaction_t_compat *pa;
	int i;

	if (current == root_item)
		return false;

	pa = &parent_act_compat[sig];

	/* Omitting non-valid sigaction */
	if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL))
		return false;

	for (i = 0; i < _KNSIG_WORDS; i++)
		if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i])
			return false;

	return pa->rt_sa_handler == sa->rt_sa_handler &&
		pa->rt_sa_flags == sa->rt_sa_flags &&
		pa->rt_sa_restorer == sa->rt_sa_restorer;
}

/*
 * Compat counterpart of restore_native_sigaction(): same return convention,
 * but the action is installed with arch_compat_rt_sigaction().
 */
static int restore_compat_sigaction(int sig, SaEntry *e)
{
	rt_sigaction_t_compat act;
	int ret;

	ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction);
	ASSIGN_TYPED(act.rt_sa_flags, e->flags);
	ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer);
	BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig));
	memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig));

	if (sig == SIGCHLD) {
		memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat));
		return 0;
	}

	if (sa_compat_inherited(sig - 1, &act))
		return 1;

	/* The compat syscall stack is allocated lazily, on first need */
	if (!stack32) {
		stack32 = alloc_compat_syscall_stack();
		if (!stack32)
			return -1;
	}

	ret = arch_compat_rt_sigaction(stack32, sig, &act);
	if (ret < 0) {
		pr_err("Can't restore compat sigaction: %d\n", ret);
		return ret;
	}

	parent_act_compat[sig - 1] = act;
	/* Mark SIGKILL blocked which makes native sigaction non-valid */
	parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL;

	return 1;
}
#else
/* Without CONFIG_COMPAT a compat sigaction cannot be restored */
static int restore_compat_sigaction(int sig, SaEntry *e)
{
	return -1;
}
#endif

/*
 * Restore the sigactions stored in the task core entry @tc.
 * The image must carry exactly SIGMAX - 2 entries: one per signal from 1 to
 * SIGMAX except SIGKILL and SIGSTOP, in ascending signal order.
 * Returns 0 on success and a negative value on failure.
 */
static int prepare_sigactions_from_core(TaskCoreEntry *tc)
{
	int sig, i;

	if (tc->n_sigactions != SIGMAX - 2) {
		pr_err("Bad number of sigactions in the image (%d, want %d)\n",
				(int)tc->n_sigactions, SIGMAX - 2);
		return -1;
	}

	pr_info("Restore on-core sigactions for %d\n", vpid(current));

	for (sig = 1, i = 0; sig <= SIGMAX; sig++) {
		int ret;
		SaEntry *e;
		bool sigaction_is_compat;

		if (sig == SIGKILL || sig == SIGSTOP)
			continue;

		e = tc->sigactions[i++];
		sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction;
		if (sigaction_is_compat)
			ret = restore_compat_sigaction(sig, e);
		else
			ret = restore_native_sigaction(sig, e);

		if (ret < 0)
			return ret;
	}

	return 0;
}

/* Returns number of restored signals, -1 or negative errno on fail */
static int restore_one_sigaction(int sig, struct cr_img *img, int pid)
{
	bool sigaction_is_compat;
	SaEntry *e;
	int ret = 0;

	BUG_ON(sig == SIGKILL || sig == SIGSTOP);

	ret = pb_read_one_eof(img, &e, PB_SIGACT);
	if (ret == 0) {
		/* End of image: both cases below are reported as failure */
		if (sig != SIGMAX_OLD + 1) { /* backward compatibility */
			pr_err("Unexpected EOF %d\n", sig);
			return -1;
		}
		pr_warn("This format of sigacts-%d.img is deprecated\n", pid);
		return -1;
	}
	if (ret < 0)
		return ret;

	sigaction_is_compat =
		e->has_compat_sigaction && e->compat_sigaction;
	if (sigaction_is_compat)
		ret = restore_compat_sigaction(sig, e);
	else
		ret = restore_native_sigaction(sig, e);

	sa_entry__free_unpacked(e, NULL);

	return ret;
}

/*
 * Restore sigactions from the CR_FD_SIGACT image of the current task,
 * reading one entry per signal from 1 to SIGMAX except SIGKILL and SIGSTOP.
 * Returns the result of the last restore_one_sigaction() call, or -1 when
 * the image cannot be opened.
 */
static int prepare_sigactions_from_image(void)
{
	int pid = vpid(current);
	struct cr_img *img;
	int sig, rst = 0;
	int ret = 0;

	pr_info("Restore sigacts for %d\n", pid);

	img = open_image(CR_FD_SIGACT, O_RSTR, pid);
	if (!img)
		return -1;

	for (sig = 1; sig <= SIGMAX; sig++) {
		if (sig == SIGKILL || sig == SIGSTOP)
			continue;

		ret = restore_one_sigaction(sig, img, pid);
		if (ret < 0)
			break;
		/* A positive result means one action was restored */
		if (ret)
			rst++;
	}

	pr_info("Restored %d/%d sigacts\n", rst,
			SIGMAX - 3 /* KILL, STOP and CHLD */);

	close_image(img);
	return ret;
}

/*
 * Restore the signal actions of the current task, taking them from the core
 * entry when it carries any and from the separate image otherwise. Does
 * nothing for tasks that are not alive. The compat syscall stack, if one
 * was allocated along the way, is released before returning.
 */
static int prepare_sigactions(CoreEntry *core)
{
	int ret;

	if (!task_alive(current))
		return 0;

	if (core->tc->n_sigactions != 0)
		ret = prepare_sigactions_from_core(core->tc);
	else
		ret = prepare_sigactions_from_image();

	if (stack32) {
		free_compat_syscall_stack(stack32);
		stack32 = NULL;
	}

	return ret;
}

/*
 * Append the virtual pid of every child of @p that is in @state to the
 * private rst memory and bump *@n for each. Returns -1 when the allocation
 * fails, 0 otherwise.
 */
static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n)
{
	struct pstree_item *pi;

	list_for_each_entry(pi, &p->children, sibling) {
		pid_t *child;

		if (pi->pid->state != state)
			continue;

		child = rst_mem_alloc(sizeof(*child), RM_PRIVATE);
		if (!child)
			return -1;

		(*n)++;
		*child = vpid(pi);
	}

	return 0;
}

/*
 * Collect the pids of the current task's children that are in @state;
 * *@n is reset to zero first and holds the number of pids on return.
 */
static int collect_child_pids(int state, unsigned int *n)
{
	struct pstree_item *pi;

	*n = 0;

	/*
	 * All children of helpers and zombies will be reparented to the init
	 * process and they have to be collected too.
*/ if (current == root_item) { for_each_pstree_item(pi) { if (pi->pid->state != TASK_HELPER && pi->pid->state != TASK_DEAD) continue; if (__collect_child_pids(pi, state, n)) return -1; } } return __collect_child_pids(current, state, n); } static int collect_helper_pids(struct task_restore_args *ta) { ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); return collect_child_pids(TASK_HELPER, &ta->helpers_n); } static int collect_zombie_pids(struct task_restore_args *ta) { ta->zombies = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); return collect_child_pids(TASK_DEAD, &ta->zombies_n); } static int open_core(int pid, CoreEntry **pcore) { int ret; struct cr_img *img; img = open_image(CR_FD_CORE, O_RSTR, pid); if (!img) { pr_err("Can't open core data for %d\n", pid); return -1; } ret = pb_read_one(img, pcore, PB_CORE); close_image(img); return ret <= 0 ? -1 : 0; } static int open_cores(int pid, CoreEntry *leader_core) { int i, tpid; CoreEntry **cores = NULL; cores = xmalloc(sizeof(*cores)*current->nr_threads); if (!cores) goto err; for (i = 0; i < current->nr_threads; i++) { tpid = current->threads[i].ns[0].virt; if (tpid == pid) cores[i] = leader_core; else if (open_core(tpid, &cores[i])) goto err; } current->core = cores; return 0; err: xfree(cores); return -1; } static int prepare_oom_score_adj(int value) { int fd, ret = 0; char buf[11]; fd = open_proc_rw(PROC_SELF, "oom_score_adj"); if (fd < 0) return -1; snprintf(buf, 11, "%d", value); if (write(fd, buf, 11) < 0) { pr_perror("Write %s to /proc/self/oom_score_adj failed", buf); ret = -1; } close(fd); return ret; } static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc) { int ret; /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid, LOG_ERROR); if (ret < 0) return ret; } /* oom_score_adj is not critical: only log errors */ if (tc->has_oom_score_adj && tc->oom_score_adj != 0) 
prepare_oom_score_adj(tc->oom_score_adj); return 0; } static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) { unsigned args_len; struct task_restore_args *ta; pr_info("Restoring resources\n"); rst_mem_switch_to_private(); args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size()); ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0); if (!ta) return -1; memzero(ta, args_len); if (prepare_fds(current)) return -1; if (prepare_file_locks(pid)) return -1; if (open_vmas(current)) return -1; if (prepare_aios(current, ta)) return -1; if (fixup_sysv_shmems()) return -1; if (open_cores(pid, core)) return -1; if (prepare_signals(pid, ta, core)) return -1; if (prepare_posix_timers(pid, ta, core)) return -1; if (prepare_rlimits(pid, ta, core) < 0) return -1; if (collect_helper_pids(ta) < 0) return -1; if (collect_zombie_pids(ta) < 0) return -1; if (inherit_fd_fini() < 0) return -1; if (prepare_proc_misc(pid, core->tc)) return -1; /* * Get all the tcp sockets fds into rst memory -- restorer * will turn repair off before going sigreturn */ if (prepare_tcp_socks(ta)) return -1; /* * Copy timerfd params for restorer args, we need to proceed * timer setting at the very late. 
*/ if (prepare_timerfds(ta)) return -1; if (seccomp_filters_get_rst_pos(core, ta) < 0) return -1; if (prepare_itimers(pid, ta, core) < 0) return -1; if (prepare_mm(pid, ta)) return -1; if (prepare_vmas(current, ta)) return -1; if (setup_uffd(pid, ta)) return -1; return sigreturn_restore(pid, ta, args_len, core); } static void zombie_prepare_signals(void) { sigset_t blockmask; int sig; struct sigaction act; sigfillset(&blockmask); sigprocmask(SIG_UNBLOCK, &blockmask, NULL); memset(&act, 0, sizeof(act)); act.sa_handler = SIG_DFL; for (sig = 1; sig <= SIGMAX; sig++) sigaction(sig, &act, NULL); } #define SIG_FATAL_MASK ( \ (1 << SIGHUP) |\ (1 << SIGINT) |\ (1 << SIGQUIT) |\ (1 << SIGILL) |\ (1 << SIGTRAP) |\ (1 << SIGABRT) |\ (1 << SIGIOT) |\ (1 << SIGBUS) |\ (1 << SIGFPE) |\ (1 << SIGKILL) |\ (1 << SIGUSR1) |\ (1 << SIGSEGV) |\ (1 << SIGUSR2) |\ (1 << SIGPIPE) |\ (1 << SIGALRM) |\ (1 << SIGTERM) |\ (1 << SIGXCPU) |\ (1 << SIGXFSZ) |\ (1 << SIGVTALRM)|\ (1 << SIGPROF) |\ (1 << SIGPOLL) |\ (1 << SIGIO) |\ (1 << SIGSYS) |\ (1 << SIGSTKFLT)|\ (1 << SIGPWR) \ ) static inline int sig_fatal(int sig) { return (sig > 0) && (sig < SIGMAX) && (SIG_FATAL_MASK & (1UL << sig)); } struct task_entries *task_entries; static unsigned long task_entries_pos; static int wait_on_helpers_zombies(void) { struct pstree_item *pi; list_for_each_entry(pi, ¤t->children, sibling) { pid_t pid = vpid(pi); int status; switch (pi->pid->state) { case TASK_DEAD: if (waitid(P_PID, pid, NULL, WNOWAIT | WEXITED) < 0) { pr_perror("Wait on %d zombie failed", pid); return -1; } break; case TASK_HELPER: if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid for helper %d failed", pid); return -1; } break; } } return 0; } static int restore_one_zombie(CoreEntry *core) { int exit_code = core->tc->exit_code; pr_info("Restoring zombie with %d code\n", exit_code); if (inherit_fd_fini() < 0) return -1; if (lazy_pages_setup_zombie(vpid(current))) return -1; prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 
0); if (task_entries != NULL) { restore_finish_stage(task_entries, CR_STATE_RESTORE); zombie_prepare_signals(); } if (exit_code & 0x7f) { int signr; /* prevent generating core files */ if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0)) pr_perror("Can't drop the dumpable flag"); signr = exit_code & 0x7F; if (!sig_fatal(signr)) { pr_warn("Exit with non fatal signal ignored\n"); signr = SIGABRT; } if (kill(vpid(current), signr) < 0) pr_perror("Can't kill myself, will just exit"); exit_code = 0; } exit((exit_code >> 8) & 0x7f); /* never reached */ BUG_ON(1); return -1; } static int check_core(CoreEntry *core, struct pstree_item *me) { int ret = -1; if (core->mtype != CORE_ENTRY__MARCH) { pr_err("Core march mismatch %d\n", (int)core->mtype); goto out; } if (!core->tc) { pr_err("Core task state data missed\n"); goto out; } if (core->tc->task_state != TASK_DEAD) { if (!core->ids && !me->ids) { pr_err("Core IDS data missed for non-zombie\n"); goto out; } if (!CORE_THREAD_ARCH_INFO(core)) { pr_err("Core info data missed for non-zombie\n"); goto out; } } ret = 0; out: return ret; } /* * Find if there are children which are zombies or helpers - processes * which are expected to die during the restore. */ static bool child_death_expected(void) { struct pstree_item *pi; list_for_each_entry(pi, ¤t->children, sibling) { switch (pi->pid->state) { case TASK_DEAD: case TASK_HELPER: return true; } } return false; } /* * Restore a helper process - artificially created by criu * to restore attributes of process tree. * - sessions for each leaders are dead * - process groups with dead leaders * - dead tasks for which /proc//... is opened by restoring task * - whatnot */ static int restore_one_helper(void) { siginfo_t info; if (!child_death_expected()) { /* * Restoree has no children that should die, during restore, * wait for the next stage on futex. * The default SIGCHLD handler will handle an unexpected * child's death and abort the restore if someone dies. 
*/ restore_finish_stage(task_entries, CR_STATE_RESTORE); return 0; } /* * The restoree has children which will die - decrement itself from * nr. of tasks processing the stage and wait for anyone to die. * Tasks may die only when they're on the following stage. * If one dies earlier - that's unexpected - treat it as an error * and abort the restore. */ if (block_sigmask(NULL, SIGCHLD)) return -1; /* Finish CR_STATE_RESTORE, but do not wait for the next stage. */ futex_dec_and_wake(&task_entries->nr_in_progress); if (waitid(P_ALL, 0, &info, WEXITED | WNOWAIT)) { pr_perror("Failed to wait\n"); return -1; } if (futex_get(&task_entries->start) == CR_STATE_RESTORE) { pr_err("Child %d died too early\n", info.si_pid); return -1; } if (wait_on_helpers_zombies()) { pr_err("Failed to wait on helpers and zombies\n"); return -1; } return 0; } static int restore_one_task(int pid, CoreEntry *core) { int ret; /* No more fork()-s => no more per-pid logs */ if (task_alive(current)) ret = restore_one_alive_task(pid, core); else if (current->pid->state == TASK_DEAD) ret = restore_one_zombie(core); else if (current->pid->state == TASK_HELPER) { ret = restore_one_helper(); } else { pr_err("Unknown state in code %d\n", (int)core->tc->task_state); ret = -1; } if (core) core_entry__free_unpacked(core, NULL); return ret; } /* All arguments should be above stack, because it grows down */ struct cr_clone_arg { struct pstree_item *item; unsigned long clone_flags; int fd; CoreEntry *core; }; static void maybe_clone_parent(struct pstree_item *item, struct cr_clone_arg *ca) { /* * zdtm runs in kernel 3.11, which has the problem described below. We * avoid this by including the pdeath_sig test. Once users/zdtm migrate * off of 3.11, this condition can be simplified to just test the * options and not have the pdeath_sig test. */ if (opts.restore_sibling) { /* * This means we're called from lib's criu_restore_child(). * In that case create the root task as the child one to+ * the caller. 
This is the only way to correctly restore the * pdeath_sig of the root task. But also looks nice. * * Alternatively, if we are --restore-detached, a similar trick is * needed to correctly restore pdeath_sig and prevent processes from * dying once restored. * * There were a problem in kernel 3.11 -- CLONE_PARENT can't be * set together with CLONE_NEWPID, which has been solved in further * versions of the kernels, but we treat 3.11 as a base, so at * least warn a user about potential problems. */ rsti(item)->clone_flags |= CLONE_PARENT; if (rsti(item)->clone_flags & CLONE_NEWPID) pr_warn("Set CLONE_PARENT | CLONE_NEWPID but it might cause restore problem," "because not all kernels support such clone flags combinations!\n"); } else if (opts.restore_detach) { if (ca->core->thread_core->pdeath_sig) pr_warn("Root task has pdeath_sig configured, so it will receive one _right_" "after restore on CRIU exit\n"); } } static inline int fork_with_pid(struct pstree_item *item) { struct cr_clone_arg ca; int ret = -1; pid_t pid = vpid(item); if (item->pid->state != TASK_HELPER) { if (open_core(pid, &ca.core)) return -1; if (check_core(ca.core, item)) return -1; item->pid->state = ca.core->tc->task_state; rsti(item)->cg_set = ca.core->tc->cg_set; rsti(item)->has_seccomp = ca.core->tc->seccomp_mode != SECCOMP_MODE_DISABLED; if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); return -1; } if (unlikely(item == root_item)) maybe_clone_parent(item, &ca); } else { /* * Helper entry will not get moved around and thus * will live in the parent's cgset. 
*/
		rsti(item)->cg_set = rsti(item->parent)->cg_set;
		ca.core = NULL;
	}

	ret = -1;

	ca.item = item;
	ca.clone_flags = rsti(item)->clone_flags;

	BUG_ON(ca.clone_flags & CLONE_VM);

	pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags);

	if (!(ca.clone_flags & CLONE_NEWPID)) {
		char buf[32];
		int len;

		/*
		 * Take an exclusive lock on the LAST_PID_PATH file and write
		 * pid - 1 into it right before cloning.
		 */
		ca.fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
		if (ca.fd < 0)
			goto err;

		if (flock(ca.fd, LOCK_EX)) {
			close(ca.fd);
			pr_perror("%d: Can't lock %s", pid, LAST_PID_PATH);
			goto err;
		}

		len = snprintf(buf, sizeof(buf), "%d", pid - 1);
		if (write(ca.fd, buf, len) != len) {
			pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH);
			goto err_unlock;
		}
	} else {
		/* With CLONE_NEWPID the item must have the INIT_PID pid */
		ca.fd = -1;
		BUG_ON(pid != INIT_PID);
	}

	/*
	 * Some kernel modules, such as network packet generator
	 * run kernel thread upon net-namespace creation taking
	 * the @pid we've been requesting via LAST_PID_PATH interface
	 * so that we can't restore a task with the pid needed.
	 *
	 * Here is an idea -- unshare net namespace in callee instead.
	 */
	/*
	 * The cgroup namespace is also unshared explicitly in the
	 * move_in_cgroup(), so drop this flag here as well.
	 */
	ret = clone_noasan(restore_task_with_children,
			(ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD, &ca);
	if (ret < 0) {
		pr_perror("Can't fork for %d", pid);
		goto err_unlock;
	}


	if (item == root_item) {
		item->pid->real = ret;
		pr_debug("PID: real %d virt %d\n",
				item->pid->real, vpid(item));
	}

err_unlock:
	if (ca.fd >= 0) {
		if (flock(ca.fd, LOCK_UN))
			pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);

		close(ca.fd);
	}
err:
	if (ca.core)
		core_entry__free_unpacked(ca.core, NULL);
	return ret;
}

/*
 * SIGCHLD handler used during restore: a child that exited or was killed is
 * reported and the restore is aborted through the in-progress futex.
 */
static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
{
	int status, pid, exit;

	while (1) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid <= 0)
			return;

		if (!current && WIFSTOPPED(status) &&
					WSTOPSIG(status) == SIGCHLD) {
			/* The root task is ptraced.
Allow it to handle SIGCHLD */
			ptrace(PTRACE_CONT, siginfo->si_pid, 0, SIGCHLD);
			return;
		}

		/* Reduce the wait status to an exit code or a signal number */
		exit = WIFEXITED(status);
		status = exit ? WEXITSTATUS(status) : WTERMSIG(status);

		break;
	}

	if (exit)
		pr_err("%d exited, status=%d\n", pid, status);
	else
		pr_err("%d killed by signal %d: %s\n",
			pid, status, strsignal(status));

	futex_abort_and_wake(&task_entries->nr_in_progress);
}

/*
 * Install sigchld_handler() for SIGCHLD and block every signal except
 * SIGCHLD. Returns 0 on success and -1 on failure.
 */
static int criu_signals_setup(void)
{
	int ret;
	struct sigaction act;
	sigset_t blockmask;

	/* Start from the current SIGCHLD action and adjust it */
	ret = sigaction(SIGCHLD, NULL, &act);
	if (ret < 0) {
		pr_perror("sigaction() failed");
		return -1;
	}

	act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART;
	act.sa_sigaction = sigchld_handler;
	sigemptyset(&act.sa_mask);
	sigaddset(&act.sa_mask, SIGCHLD);

	ret = sigaction(SIGCHLD, &act, NULL);
	if (ret < 0) {
		pr_perror("sigaction() failed");
		return -1;
	}

	/*
	 * The block mask will be restored in sigreturn.
	 *
	 * TODO: This code should be removed, when a freezer will be added.
	 */
	sigfillset(&blockmask);
	sigdelset(&blockmask, SIGCHLD);

	/*
	 * Here we use SIG_SETMASK instead of SIG_BLOCK to avoid the case where
	 * we've been forked from a parent who had blocked SIGCHLD. If SIGCHLD
	 * is blocked when a task dies (e.g. if the task fails to restore
	 * somehow), we hang because our SIGCHLD handler is never run. Since we
	 * depend on SIGCHLD being unblocked, let's set the mask explicitly.
	 */
	ret = sigprocmask(SIG_SETMASK, &blockmask, NULL);
	if (ret < 0) {
		pr_perror("Can't block signals");
		return -1;
	}

	return 0;
}

/* Restore the session id of the current task; exits the process on mismatch */
static void restore_sid(void)
{
	pid_t sid;

	/*
	 * SID can only be reset to pid or inherited from parent.
	 * Thus we restore it right here to let our kids inherit
	 * one in case they need it.
	 *
	 * PGIDs are restored late when all tasks are forked and
	 * we can call setpgid() on custom values.
*/

	if (vpid(current) == current->sid) {
		/* The task leads its own session: create it */
		pr_info("Restoring %d to %d sid\n", vpid(current), current->sid);
		sid = setsid();
		if (sid != current->sid) {
			pr_perror("Can't restore sid (%d)", sid);
			exit(1);
		}
	} else {
		/* Otherwise the inherited session must already be the right one */
		sid = getsid(getpid());
		if (sid != current->sid) {
			/* Skip the root task if it's not init */
			if (current == root_item && vpid(root_item) != INIT_PID)
				return;
			pr_err("Requested sid %d doesn't match inherited %d\n",
					current->sid, sid);
			exit(1);
		}
	}
}

/* Restore the process group id of the current task; exits the process on failure */
static void restore_pgid(void)
{
	/*
	 * Unlike sessions, process groups (a.k.a. pgids) can be joined
	 * by any task, provided the task with pid == pgid (group leader)
	 * exists. Thus, in order to restore pgid we must make sure that
	 * group leader was born and created the group, then join one.
	 *
	 * We do this _before_ finishing the forking stage to make sure
	 * helpers are still with us.
	 */

	pid_t pgid, my_pgid = current->pgid;

	pr_info("Restoring %d to %d pgid\n", vpid(current), my_pgid);

	/* Nothing to do when the task is already in the requested group */
	pgid = getpgrp();
	if (my_pgid == pgid)
		return;

	if (my_pgid != vpid(current)) {
		struct pstree_item *leader;

		/*
		 * Wait for leader to become such.
		 * Missing leader means we're going to crtools
		 * group (-j option).
*/ leader = rsti(current)->pgrp_leader; if (leader) { BUG_ON(my_pgid != vpid(leader)); futex_wait_until(&rsti(leader)->pgrp_set, 1); } } pr_info("\twill call setpgid, mine pgid is %d\n", pgid); if (setpgid(0, my_pgid) != 0) { pr_perror("Can't restore pgid (%d/%d->%d)", vpid(current), pgid, current->pgid); exit(1); } if (my_pgid == vpid(current)) futex_set_and_wake(&rsti(current)->pgrp_set, 1); } static int mount_proc(void) { int fd, ret; char proc_mountpoint[] = "crtools-proc.XXXXXX"; if (root_ns_mask == 0) fd = ret = open("/proc", O_DIRECTORY); else { if (mkdtemp(proc_mountpoint) == NULL) { pr_perror("mkdtemp failed %s", proc_mountpoint); return -1; } pr_info("Mount procfs in %s\n", proc_mountpoint); if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { pr_perror("mount failed"); rmdir(proc_mountpoint); return -1; } ret = fd = open_detach_mount(proc_mountpoint); } if (fd >= 0) { ret = set_proc_fd(fd); close(fd); } return ret; } /* * Tasks cannot change sid (session id) arbitrary, but can either * inherit one from ancestor, or create a new one with id equal to * their pid. Thus sid-s restore is tied with children creation. 
*/ static int create_children_and_session(void) { int ret; struct pstree_item *child; pr_info("Restoring children in alien sessions:\n"); list_for_each_entry(child, ¤t->children, sibling) { if (!restore_before_setsid(child)) continue; BUG_ON(child->born_sid != -1 && getsid(getpid()) != child->born_sid); ret = fork_with_pid(child); if (ret < 0) return ret; } if (current->parent) restore_sid(); pr_info("Restoring children in our session:\n"); list_for_each_entry(child, ¤t->children, sibling) { if (restore_before_setsid(child)) continue; ret = fork_with_pid(child); if (ret < 0) return ret; } return 0; } static int restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; int ret; current = ca->item; if (current != root_item) { char buf[12]; int fd; /* Determine PID in CRIU's namespace */ fd = get_service_fd(CR_PROC_FD_OFF); if (fd < 0) goto err; ret = readlinkat(fd, "self", buf, sizeof(buf) - 1); if (ret < 0) { pr_perror("Unable to read the /proc/self link"); goto err; } buf[ret] = '\0'; current->pid->real = atoi(buf); pr_debug("PID: real %d virt %d\n", current->pid->real, vpid(current)); } if ( !(ca->clone_flags & CLONE_FILES)) close_safe(&ca->fd); if (current->pid->state != TASK_HELPER) { ret = clone_service_fd(rsti(current)->service_fd_id); if (ret) goto err; } pid = getpid(); if (vpid(current) != pid) { pr_err("Pid %d do not match expected %d\n", pid, vpid(current)); set_task_cr_err(EEXIST); goto err; } ret = log_init_by_pid(); if (ret < 0) goto err; if (ca->clone_flags & CLONE_NEWNET) { ret = unshare(CLONE_NEWNET); if (ret) { pr_perror("Can't unshare net-namespace"); goto err; } } if (!(ca->clone_flags & CLONE_FILES)) { ret = close_old_fds(); if (ret) goto err; } /* Wait prepare_userns */ if (current->parent == NULL && restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; /* * Call this _before_ forking to optimize cgroups * restore -- if all tasks live in one set of cgroups * we will only move the root 
one there, others will * just have it inherited. */ if (prepare_task_cgroup(current) < 0) goto err; /* Restore root task */ if (current->parent == NULL) { if (fdstore_init()) goto err; if (join_namespaces()) { pr_perror("Join namespaces failed"); goto err; } pr_info("Calling restore_sid() for init\n"); restore_sid(); /* * We need non /proc proc mount for restoring pid and mount * namespaces and do not care for the rest of the cases. * Thus -- mount proc at custom location for any new namespace */ if (mount_proc()) goto err; if (!files_collected() && collect_image(&tty_cinfo)) goto err; if (collect_images(before_ns_cinfos, ARRAY_SIZE(before_ns_cinfos))) goto err; if (prepare_namespace(current, ca->clone_flags)) goto err; if (restore_finish_ns_stage(CR_STATE_PREPARE_NAMESPACES, CR_STATE_FORKING) < 0) goto err; if (root_prepare_shared()) goto err; } if (restore_task_mnt_ns(current)) goto err; if (prepare_mappings(current)) goto err; if (prepare_sigactions(ca->core) < 0) goto err; if (fault_injected(FI_RESTORE_ROOT_ONLY)) { pr_info("fault: Restore root task failure!\n"); kill(getpid(), SIGKILL); } timing_start(TIME_FORK); if (create_children_and_session()) goto err; timing_stop(TIME_FORK); if (unmap_guard_pages(current)) goto err; restore_pgid(); if (open_transport_socket()) return -1; if (current->parent == NULL) { /* * Wait when all tasks passed the CR_STATE_FORKING stage. * The stage was started by criu, but now it waits for * the CR_STATE_RESTORE to finish. See comment near the * CR_STATE_FORKING macro for details. * * It means that all tasks entered into their namespaces. 
*/ if (restore_wait_other_tasks()) goto err; fini_restore_mntns(); __restore_switch_stage(CR_STATE_RESTORE); } else { if (restore_finish_stage(task_entries, CR_STATE_FORKING) < 0) goto err; } if (restore_one_task(vpid(current), ca->core)) goto err; return 0; err: if (current->parent == NULL) futex_abort_and_wake(&task_entries->nr_in_progress); exit(1); } static int attach_to_tasks(bool root_seized) { struct pstree_item *item; for_each_pstree_item(item) { int status, i; if (!task_alive(item)) continue; if (item->nr_threads == 1) { item->threads[0].real = item->pid->real; } else { if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) return -1; } for (i = 0; i < item->nr_threads; i++) { pid_t pid = item->threads[i].real; if (item != root_item || !root_seized || i != 0) { if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { pr_perror("Can't attach to %d", pid); return -1; } } if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { pr_perror("Can't interrupt the %d task", pid); return -1; } if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("waitpid(%d) failed", pid); return -1; } /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the * restorer blob (and the final sigreturn is ok), here we're * doing an munmap in the process, which may be blocked by * seccomp and cause the task to be killed. 
*/ if (rsti(item)->has_seccomp && ptrace_suspend_seccomp(pid) < 0) pr_err("failed to suspend seccomp, restore will probably fail...\n"); if (ptrace(PTRACE_CONT, pid, NULL, NULL) ) { pr_perror("Unable to resume %d", pid); return -1; } } } return 0; } static int catch_tasks(bool root_seized, enum trace_flags *flag) { struct pstree_item *item; for_each_pstree_item(item) { int status, i, ret; if (!task_alive(item)) continue; if (item->nr_threads == 1) { item->threads[0].real = item->pid->real; } else { if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) return -1; } for (i = 0; i < item->nr_threads; i++) { pid_t pid = item->threads[i].real; if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { pr_perror("Can't interrupt the %d task", pid); return -1; } if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("waitpid(%d) failed", pid); return -1; } ret = compel_stop_pie(pid, rsti(item)->breakpoint, flag, fault_injected(FI_NO_BREAKPOINTS)); if (ret < 0) return -1; } } return 0; } static int clear_breakpoints() { struct pstree_item *item; int ret = 0, i; if (fault_injected(FI_NO_BREAKPOINTS)) return 0; for_each_pstree_item(item) { if (!task_alive(item)) continue; for (i = 0; i < item->nr_threads; i++) ret |= ptrace_flush_breakpoints(item->threads[i].real); } return ret; } static void finalize_restore(void) { struct pstree_item *item; for_each_pstree_item(item) { pid_t pid = item->pid->real; struct parasite_ctl *ctl; if (!task_alive(item)) continue; /* Unmap the restorer blob */ ctl = compel_prepare_noctx(pid); if (ctl == NULL) continue; compel_unmap(ctl, (unsigned long)rsti(item)->munmap_restorer); xfree(ctl); if ((item->pid->state == TASK_STOPPED) || (opts.final_state == TASK_STOPPED)) kill(item->pid->real, SIGSTOP); } } static void finalize_restore_detach(int status) { struct pstree_item *item; for_each_pstree_item(item) { pid_t pid; int i; if (!task_alive(item)) continue; for (i = 0; i < item->nr_threads; i++) { pid = item->threads[i].real; if (pid < 0) { 
BUG_ON(status >= 0); break; } if (arch_set_thread_regs_nosigrt(&item->threads[i])) pr_perror("Restoring regs for %d failed", pid); if (ptrace(PTRACE_DETACH, pid, NULL, 0)) pr_perror("Unable to execute %d", pid); } } } static void ignore_kids(void) { struct sigaction sa = { .sa_handler = SIG_DFL }; if (sigaction(SIGCHLD, &sa, NULL) < 0) pr_perror("Restoring CHLD sigaction failed"); } static unsigned int saved_loginuid; static int prepare_userns_hook(void) { int ret; if (kdat.luid != LUID_FULL) return 0; /* * Save old loginuid and set it to INVALID_UID: * this value means that loginuid is unset and it will be inherited. * After you set some value to /proc/<>/loginuid it can't be changed * inside container due to permissions. * But you still can set this value if it was unset. */ saved_loginuid = parse_pid_loginuid(getpid(), &ret, false); if (ret < 0) return -1; if (prepare_loginuid(INVALID_UID, LOG_ERROR) < 0) { pr_err("Setting loginuid for CT init task failed, CAP_AUDIT_CONTROL?\n"); return -1; } return 0; } static void restore_origin_ns_hook(void) { if (kdat.luid != LUID_FULL) return; /* not critical: it does not affect CT in any way */ if (prepare_loginuid(saved_loginuid, LOG_ERROR) < 0) pr_err("Restore original /proc/self/loginuid failed\n"); } static int write_restored_pid(void) { int pid; if (!opts.pidfile) return 0; pid = root_item->pid->real; if (write_pidfile(pid) < 0) { pr_perror("Can't write pidfile"); return -1; } return 0; } static int restore_root_task(struct pstree_item *init) { enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item; ret = run_scripts(ACT_PRE_RESTORE); if (ret != 0) { pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); return -1; } fd = open("/proc", O_DIRECTORY | O_RDONLY); if (fd < 0) { pr_perror("Unable to open /proc"); return -1; } ret = install_service_fd(CR_PROC_FD_OFF, fd); close(fd); if (ret < 0) return -1; /* * FIXME -- currently we assume that all the 
tasks live * in the same set of namespaces. This is done to debug * the ns contents dumping/restoring. Need to revisit * this later. */ if (vpid(init) == INIT_PID) { if (!(root_ns_mask & CLONE_NEWPID)) { pr_err("This process tree can only be restored " "in a new pid namespace.\n" "criu should be re-executed with the " "\"--namespace pid\" option.\n"); return -1; } } else if (root_ns_mask & CLONE_NEWPID) { pr_err("Can't restore pid namespace without the process init\n"); return -1; } if (prepare_userns_hook()) return -1; if (prepare_namespace_before_tasks()) return -1; __restore_switch_stage_nw(CR_STATE_ROOT_TASK); ret = fork_with_pid(init); if (ret < 0) goto out; restore_origin_ns_hook(); if (rsti(init)->clone_flags & CLONE_PARENT) { struct sigaction act; root_seized = 1; /* * Root task will be our sibling. This means, that * we will not notice when (if) it dies in SIGCHLD * handler, but we should. To do this -- attach to * the guy with ptrace (below) and (!) make the kernel * deliver us the signal when it will get stopped. * It will in case of e.g. segfault before handling * the signal. */ sigaction(SIGCHLD, NULL, &act); act.sa_flags &= ~SA_NOCLDSTOP; sigaction(SIGCHLD, &act, NULL); if (ptrace(PTRACE_SEIZE, init->pid->real, 0, 0)) { pr_perror("Can't attach to init"); goto out_kill; } } if (!root_ns_mask) goto skip_ns_bouncing; /* * uid_map and gid_map must be filled from a parent user namespace. * prepare_userns_creds() must be called after filling mappings. 
*/ if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init)) goto out_kill; pr_info("Wait until namespaces are created\n"); ret = restore_wait_inprogress_tasks(); if (ret) goto out_kill; ret = run_scripts(ACT_SETUP_NS); if (ret) goto out_kill; ret = restore_switch_stage(CR_STATE_PREPARE_NAMESPACES); if (ret) goto out_kill; if (root_ns_mask & CLONE_NEWNS) { mnt_ns_fd = open_proc(init->pid->real, "ns/mnt"); if (mnt_ns_fd < 0) goto out_kill; } if (opts.empty_ns & CLONE_NEWNET) { /* * Local TCP connections were locked by network_lock_internal() * on dump and normally should have been C/R-ed by respectively * dump_iptables() and restore_iptables() in net.c. However in * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. */ ret = network_lock_internal(); if (ret) goto out_kill; } ret = run_scripts(ACT_POST_SETUP_NS); if (ret) goto out_kill; __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: ret = restore_wait_inprogress_tasks(); if (ret < 0) goto out_kill; /* * Zombies die after CR_STATE_RESTORE which is switched * by root task, not by us. See comment before CR_STATE_FORKING * in the header for details. */ for_each_pstree_item(item) { if (item->pid->state == TASK_DEAD) task_entries->nr_threads--; } ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD); if (ret < 0) goto out_kill; ret = stop_usernsd(); if (ret < 0) goto out_kill; ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; ret = prepare_cgroup_properties(); if (ret < 0) goto out_kill; if (fault_injected(FI_POST_RESTORE)) goto out_kill; ret = run_scripts(ACT_POST_RESTORE); if (ret != 0) { pr_err("Aborting restore due to post-restore script ret code %d\n", ret); timing_stop(TIME_RESTORE); write_stats(RESTORE_STATS); goto out_kill; } /* * There is no need to call try_clean_remaps() after this point, * as restore went OK and all ghosts were removed by the openers. 
*/ if (depopulate_roots_yard(mnt_ns_fd, false)) goto out_kill; close_safe(&mnt_ns_fd); if (write_restored_pid()) goto out_kill; /* Unlock network before disabling repair mode on sockets */ network_unlock(); /* * Stop getting sigchld, after we resume the tasks they * may start to exit poking criu in vain. */ ignore_kids(); /* * ------------------------------------------------------------- * Below this line nothing should fail, because network is unlocked */ attach_to_tasks(root_seized); ret = restore_switch_stage(CR_STATE_RESTORE_CREDS); BUG_ON(ret); timing_stop(TIME_RESTORE); ret = catch_tasks(root_seized, &flag); pr_info("Restore finished successfully. Resuming tasks.\n"); __restore_switch_stage(CR_STATE_COMPLETE); if (ret == 0) ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); if (clear_breakpoints()) pr_err("Unable to flush breakpoints\n"); if (ret == 0) finalize_restore(); ret = run_scripts(ACT_PRE_RESUME); if (ret) pr_err("Pre-resume script ret code %d\n", ret); if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); fini_cgroup(); /* Detaches from processes and they continue run through sigreturn. */ finalize_restore_detach(ret); write_stats(RESTORE_STATS); ret = run_scripts(ACT_POST_RESUME); if (ret != 0) pr_err("Post-resume script ret code %d\n", ret); if (!opts.restore_detach && !opts.exec_cmd) wait(NULL); return 0; out_kill: /* * The processes can be killed only when all of them have been created, * otherwise an external proccesses can be killed. 
*/ if (root_ns_mask & CLONE_NEWPID) { int status; /* Kill init */ if (root_item->pid->real > 0) kill(root_item->pid->real, SIGKILL); if (waitpid(root_item->pid->real, &status, 0) < 0) pr_warn("Unable to wait %d: %s", root_item->pid->real, strerror(errno)); } else { struct pstree_item *pi; for_each_pstree_item(pi) if (vpid(pi) > 0) kill(vpid(pi), SIGKILL); } out: fini_cgroup(); depopulate_roots_yard(mnt_ns_fd, true); stop_usernsd(); __restore_switch_stage(CR_STATE_FAIL); pr_err("Restoring FAILED.\n"); return -1; } int prepare_task_entries(void) { task_entries_pos = rst_mem_align_cpos(RM_SHREMAP); task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP); if (!task_entries) { pr_perror("Can't map shmem"); return -1; } task_entries->nr_threads = 0; task_entries->nr_tasks = 0; task_entries->nr_helpers = 0; futex_set(&task_entries->start, CR_STATE_FAIL); mutex_init(&task_entries->userns_sync_lock); return 0; } int prepare_dummy_task_state(struct pstree_item *pi) { CoreEntry *core; if (open_core(vpid(pi), &core)) return -1; pi->pid->state = core->tc->task_state; core_entry__free_unpacked(core, NULL); return 0; } int cr_restore_tasks(void) { int ret = -1; if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) return -1; if (check_img_inventory() < 0) goto err; if (init_stats(RESTORE_STATS)) goto err; if (lsm_check_opts()) goto err; timing_start(TIME_RESTORE); if (cpu_init() < 0) goto err; if (vdso_init_restore()) goto err; if (opts.cpu_cap & (CPU_CAP_INS | CPU_CAP_CPU)) { if (cpu_validate_cpuinfo()) goto err; } if (prepare_task_entries() < 0) goto err; if (prepare_pstree() < 0) goto err; if (crtools_prepare_shared() < 0) goto err; if (criu_signals_setup() < 0) goto err; if (prepare_lazy_pages_socket() < 0) goto err; ret = restore_root_task(root_item); err: cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); return ret; } static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len) { struct vma_area *t_vma, *s_vma; long 
prev_vma_end = 0; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; prev_vma_end = kdat.mmap_min_addr; s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { if (s_vma->list.next == self_vma_list) { s_vma = &end_vma; continue; } if (s_vma == &end_vma) break; if (prev_vma_end < s_vma->e->end) prev_vma_end = s_vma->e->end; s_vma = vma_next(s_vma); continue; } if (prev_vma_end + vma_len > t_vma->e->start) { if (t_vma->list.next == tgt_vma_list) { t_vma = &end_vma; continue; } if (t_vma == &end_vma) break; if (prev_vma_end < t_vma->e->end) prev_vma_end = t_vma->e->end; t_vma = vma_next(t_vma); continue; } return prev_vma_end; } return -1; } static inline int timeval_valid(struct timeval *tv) { return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); } static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) { if (ie->isec == 0 && ie->iusec == 0) { memzero_p(val); return 0; } val->it_interval.tv_sec = ie->isec; val->it_interval.tv_usec = ie->iusec; if (!timeval_valid(&val->it_interval)) { pr_err("Invalid timer interval\n"); return -1; } if (ie->vsec == 0 && ie->vusec == 0) { /* * Remaining time was too short. Set it to * interval to make the timer armed and work. 
*/ val->it_value.tv_sec = ie->isec; val->it_value.tv_usec = ie->iusec; } else { val->it_value.tv_sec = ie->vsec; val->it_value.tv_usec = ie->vusec; } if (!timeval_valid(&val->it_value)) { pr_err("Invalid timer value\n"); return -1; } pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, val->it_interval.tv_sec, val->it_interval.tv_usec); return 0; } /* * Legacy itimers restore from CR_FD_ITIMERS */ static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) { int ret = -1; struct cr_img *img; ItimerEntry *ie; if (!deprecated_ok("Itimers")) return -1; img = open_image(CR_FD_ITIMERS, O_RSTR, pid); if (!img) return -1; ret = pb_read_one(img, &ie, PB_ITIMER); if (ret < 0) goto out; ret = decode_itimer("real", ie, &args->itimers[0]); itimer_entry__free_unpacked(ie, NULL); if (ret < 0) goto out; ret = pb_read_one(img, &ie, PB_ITIMER); if (ret < 0) goto out; ret = decode_itimer("virt", ie, &args->itimers[1]); itimer_entry__free_unpacked(ie, NULL); if (ret < 0) goto out; ret = pb_read_one(img, &ie, PB_ITIMER); if (ret < 0) goto out; ret = decode_itimer("prof", ie, &args->itimers[2]); itimer_entry__free_unpacked(ie, NULL); if (ret < 0) goto out; out: close_image(img); return ret; } static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) { int ret = 0; TaskTimersEntry *tte = core->tc->timers; if (!tte) return prepare_itimers_from_fd(pid, args); ret |= decode_itimer("real", tte->real, &args->itimers[0]); ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); return ret; } static inline int timespec_valid(struct timespec *ts) { return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); } static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) { pt->val.it_interval.tv_sec = pte->isec; pt->val.it_interval.tv_nsec = pte->insec; if (!timespec_valid(&pt->val.it_interval)) { 
pr_err("Invalid timer interval(posix)\n"); return -1; } if (pte->vsec == 0 && pte->vnsec == 0) { // Remaining time was too short. Set it to // interval to make the timer armed and work. pt->val.it_value.tv_sec = pte->isec; pt->val.it_value.tv_nsec = pte->insec; } else { pt->val.it_value.tv_sec = pte->vsec; pt->val.it_value.tv_nsec = pte->vnsec; } if (!timespec_valid(&pt->val.it_value)) { pr_err("Invalid timer value(posix)\n"); return -1; } pt->spt.it_id = pte->it_id; pt->spt.clock_id = pte->clock_id; pt->spt.si_signo = pte->si_signo; pt->spt.it_sigev_notify = pte->it_sigev_notify; pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); pt->overrun = pte->overrun; return 0; } static int cmp_posix_timer_proc_id(const void *p1, const void *p2) { return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; } static void sort_posix_timers(struct task_restore_args *ta) { void *tmem; /* * This is required for restorer's create_posix_timers(), * it will probe them one-by-one for the desired ID, since * kernel doesn't provide another API for timer creation * with given ID. 
*/ if (ta->posix_timers_n > 0) { tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); } } /* * Legacy posix timers restoration from CR_FD_POSIX_TIMERS */ static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) { struct cr_img *img; int ret = -1; struct restore_posix_timer *t; if (!deprecated_ok("Posix timers")) return -1; img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); if (!img) return -1; ta->posix_timers_n = 0; while (1) { PosixTimerEntry *pte; ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); if (ret <= 0) break; t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) break; ret = decode_posix_timer(pte, t); if (ret < 0) break; posix_timer_entry__free_unpacked(pte, NULL); ta->posix_timers_n++; } close_image(img); if (!ret) sort_posix_timers(ta); return ret; } static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) { int i, ret = -1; TaskTimersEntry *tte = core->tc->timers; struct restore_posix_timer *t; ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); if (!tte) return prepare_posix_timers_from_fd(pid, ta); ta->posix_timers_n = tte->n_posix; for (i = 0; i < ta->posix_timers_n; i++) { t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) goto out; if (decode_posix_timer(tte->posix[i], t)) goto out; } ret = 0; sort_posix_timers(ta); out: return ret; } static inline int verify_cap_size(CredsEntry *ce) { return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && (ce->n_cap_prm == CR_CAP_SIZE) && (ce->n_cap_bnd == CR_CAP_SIZE)); } static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; MmEntry *mm = rsti(current)->mm; args->mm = *mm; args->mm.n_mm_saved_auxv = 0; args->mm.mm_saved_auxv = NULL; if (mm->n_mm_saved_auxv > AT_VECTOR_SIZE) { pr_err("Image corrupted on pid %d\n", 
pid); goto out; } args->mm_saved_auxv_size = mm->n_mm_saved_auxv*sizeof(auxv_t); for (i = 0; i < mm->n_mm_saved_auxv; ++i) { args->mm_saved_auxv[i] = (auxv_t)mm->mm_saved_auxv[i]; } exe_fd = open_reg_by_id(mm->exe_file_id); if (exe_fd < 0) goto out; args->fd_exe_link = exe_fd; args->has_thp_enabled = rsti(current)->has_thp_enabled; ret = 0; out: return ret; } static void *restorer; static unsigned long restorer_len; static int prepare_restorer_blob(void) { /* * We map anonymous mapping, not mremap the restorer itself later. * Otherwise the restorer vma would be tied to criu binary which * in turn will lead to set-exe-file prctl to fail with EBUSY. */ restorer_len = pie_size(restorer); restorer = mmap(NULL, restorer_len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, 0, 0); if (restorer == MAP_FAILED) { pr_perror("Can't map restorer code"); return -1; } memcpy(restorer, &restorer_blob, sizeof(restorer_blob)); return 0; } static int remap_restorer_blob(void *addr) { void *mem; mem = mremap(restorer, restorer_len, restorer_len, MREMAP_FIXED | MREMAP_MAYMOVE, addr); if (mem != addr) { pr_perror("Can't remap restorer blob"); return -1; } compel_relocs_apply(addr, addr, sizeof(restorer_blob), restorer_relocs, ARRAY_SIZE(restorer_relocs)); return 0; } static int validate_sched_parm(struct rst_sched_param *sp) { if ((sp->nice < -20) || (sp->nice > 19)) return 0; switch (sp->policy) { case SCHED_RR: case SCHED_FIFO: return ((sp->prio > 0) && (sp->prio < 100)); case SCHED_IDLE: case SCHED_OTHER: case SCHED_BATCH: return sp->prio == 0; } return 0; } static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc) { if (!tc->has_sched_policy) { sp->policy = SCHED_OTHER; sp->nice = 0; return 0; } sp->policy = tc->sched_policy; sp->nice = tc->sched_nice; sp->prio = tc->sched_prio; if (!validate_sched_parm(sp)) { pr_err("Inconsistent sched params received (%d.%d.%d)\n", sp->policy, sp->nice, sp->prio); return -1; } return 0; } static rlim_t 
decode_rlim(rlim_t ival) { return ival == -1 ? RLIM_INFINITY : ival; } /* * Legacy rlimits restore from CR_FD_RLIMIT */ static int prepare_rlimits_from_fd(int pid, struct task_restore_args *ta) { struct rlimit *r; int ret; struct cr_img *img; if (!deprecated_ok("Rlimits")) return -1; /* * Old image -- read from the file. */ img = open_image(CR_FD_RLIMIT, O_RSTR, pid); if (!img) return -1; ta->rlims_n = 0; while (1) { RlimitEntry *re; ret = pb_read_one_eof(img, &re, PB_RLIMIT); if (ret <= 0) break; r = rst_mem_alloc(sizeof(*r), RM_PRIVATE); if (!r) { pr_err("Can't allocate memory for resource %d\n", ta->rlims_n); return -1; } r->rlim_cur = decode_rlim(re->cur); r->rlim_max = decode_rlim(re->max); if (r->rlim_cur > r->rlim_max) { pr_err("Can't restore cur > max for %d.%d\n", pid, ta->rlims_n); r->rlim_cur = r->rlim_max; } rlimit_entry__free_unpacked(re, NULL); ta->rlims_n++; } close_image(img); return 0; } static int prepare_rlimits(int pid, struct task_restore_args *ta, CoreEntry *core) { int i; TaskRlimitsEntry *rls = core->tc->rlimits; struct rlimit64 *r; ta->rlims = (struct rlimit64 *)rst_mem_align_cpos(RM_PRIVATE); if (!rls) return prepare_rlimits_from_fd(pid, ta); for (i = 0; i < rls->n_rlimits; i++) { r = rst_mem_alloc(sizeof(*r), RM_PRIVATE); if (!r) { pr_err("Can't allocate memory for resource %d\n", i); return -1; } r->rlim_cur = decode_rlim(rls->rlimits[i]->cur); r->rlim_max = decode_rlim(rls->rlimits[i]->max); if (r->rlim_cur > r->rlim_max) { pr_warn("Can't restore cur > max for %d.%d\n", pid, i); r->rlim_cur = r->rlim_max; } } ta->rlims_n = rls->n_rlimits; return 0; } static int signal_to_mem(SiginfoEntry *sie) { siginfo_t *info, *t; info = (siginfo_t *) sie->siginfo.data; t = rst_mem_alloc(sizeof(siginfo_t), RM_PRIVATE); if (!t) return -1; memcpy(t, info, sizeof(*info)); return 0; } static int open_signal_image(int type, pid_t pid, unsigned int *nr) { int ret; struct cr_img *img; img = open_image(type, O_RSTR, pid); if (!img) return -1; *nr = 0; while 
(1) { SiginfoEntry *sie; ret = pb_read_one_eof(img, &sie, PB_SIGINFO); if (ret <= 0) break; if (sie->siginfo.len != sizeof(siginfo_t)) { pr_err("Unknown image format\n"); ret = -1; break; } ret = signal_to_mem(sie); if (ret) break; (*nr)++; siginfo_entry__free_unpacked(sie, NULL); } close_image(img); return ret ? : 0; } static int prepare_one_signal_queue(SignalQueueEntry *sqe, unsigned int *nr) { int i; for (i = 0; i < sqe->n_signals; i++) if (signal_to_mem(sqe->signals[i])) return -1; *nr = sqe->n_signals; return 0; } static unsigned int *siginfo_priv_nr; /* FIXME -- put directly on thread_args */ static int prepare_signals(int pid, struct task_restore_args *ta, CoreEntry *leader_core) { int ret = -1, i; ta->siginfo = (siginfo_t *)rst_mem_align_cpos(RM_PRIVATE); siginfo_priv_nr = xmalloc(sizeof(int) * current->nr_threads); if (siginfo_priv_nr == NULL) goto out; /* Prepare shared signals */ if (!leader_core->tc->signals_s)/*backward compatibility*/ ret = open_signal_image(CR_FD_SIGNAL, pid, &ta->siginfo_n); else ret = prepare_one_signal_queue(leader_core->tc->signals_s, &ta->siginfo_n); if (ret < 0) goto out; for (i = 0; i < current->nr_threads; i++) { if (!current->core[i]->thread_core->signals_p)/*backward compatibility*/ ret = open_signal_image(CR_FD_PSIGNAL, current->threads[i].ns[0].virt, &siginfo_priv_nr[i]); else ret = prepare_one_signal_queue(current->core[i]->thread_core->signals_p, &siginfo_priv_nr[i]); if (ret < 0) goto out; } out: return ret; } extern void __gcov_flush(void) __attribute__((weak)); void __gcov_flush(void) {} static void rst_reloc_creds(struct thread_restore_args *thread_args, unsigned long *creds_pos_next) { struct thread_creds_args *args; if (unlikely(!*creds_pos_next)) return; args = rst_mem_remap_ptr(*creds_pos_next, RM_PRIVATE); if (args->lsm_profile) args->lsm_profile = rst_mem_remap_ptr(args->mem_lsm_profile_pos, RM_PRIVATE); if (args->groups) args->groups = rst_mem_remap_ptr(args->mem_groups_pos, RM_PRIVATE); *creds_pos_next = 
args->mem_pos_next; thread_args->creds_args = args; } static struct thread_creds_args * rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args *args; if (!verify_cap_size(ce)) { pr_err("Caps size mismatch %d %d %d %d\n", (int)ce->n_cap_inh, (int)ce->n_cap_eff, (int)ce->n_cap_prm, (int)ce->n_cap_bnd); return ERR_PTR(-EINVAL); } this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); if (!args) return ERR_PTR(-ENOMEM); args->cap_last_cap = kdat.last_cap; memcpy(&args->creds, ce, sizeof(args->creds)); if (ce->lsm_profile || opts.lsm_supplied) { char *rendered = NULL, *profile; profile = ce->lsm_profile; if (opts.lsm_supplied) profile = opts.lsm_profile; if (validate_lsm(profile) < 0) return ERR_PTR(-EINVAL); if (profile && render_lsm_profile(profile, &rendered)) { return ERR_PTR(-EINVAL); } if (rendered) { size_t lsm_profile_len; char *lsm_profile; args->mem_lsm_profile_pos = rst_mem_align_cpos(RM_PRIVATE); lsm_profile_len = strlen(rendered); lsm_profile = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE); if (!lsm_profile) { xfree(rendered); return ERR_PTR(-ENOMEM); } args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_profile = lsm_profile; strncpy(args->lsm_profile, rendered, lsm_profile_len); xfree(rendered); } } else { args->lsm_profile = NULL; args->mem_lsm_profile_pos = 0; } /* * Zap fields which we can't use. 
*/ args->creds.cap_inh = NULL; args->creds.cap_eff = NULL; args->creds.cap_prm = NULL; args->creds.cap_bnd = NULL; args->creds.groups = NULL; args->creds.lsm_profile = NULL; memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); if (ce->n_groups) { unsigned int *groups; args->mem_groups_pos = rst_mem_align_cpos(RM_PRIVATE); groups = rst_mem_alloc(ce->n_groups * sizeof(u32), RM_PRIVATE); if (!groups) return ERR_PTR(-ENOMEM); args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->groups = groups; memcpy(args->groups, ce->groups, ce->n_groups * sizeof(u32)); } else { args->groups = NULL; args->mem_groups_pos = 0; } args->mem_pos_next = 0; if (prev_pos) { if (*prev_pos) { struct thread_creds_args *prev; prev = rst_mem_remap_ptr(*prev_pos, RM_PRIVATE); prev->mem_pos_next = this_pos; } *prev_pos = this_pos; } return args; } static int rst_prep_creds_from_img(pid_t pid) { CredsEntry *ce = NULL; struct cr_img *img; int ret; img = open_image(CR_FD_CREDS, O_RSTR, pid); if (!img) return -ENOENT; ret = pb_read_one(img, &ce, PB_CREDS); close_image(img); if (ret > 0) { struct thread_creds_args *args; args = rst_prep_creds_args(ce, NULL); if (IS_ERR(args)) ret = PTR_ERR(args); else ret = 0; } creds_entry__free_unpacked(ce, NULL); return ret; } static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos) { struct thread_creds_args *args = NULL; unsigned long this_pos = 0; size_t i; /* * This is _really_ very old image * format where @thread_core were not * present. It means we don't have * creds either, just ignore and exit * early. */ if (unlikely(!core->thread_core)) { *creds_pos = 0; return 0; } *creds_pos = rst_mem_align_cpos(RM_PRIVATE); /* * Old format: one Creds per task carried in own image file. 
*/ if (!core->thread_core->creds) return rst_prep_creds_from_img(pid); for (i = 0; i < current->nr_threads; i++) { CredsEntry *ce = current->core[i]->thread_core->creds; args = rst_prep_creds_args(ce, &this_pos); if (IS_ERR(args)) return PTR_ERR(args); } return 0; } static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) { #ifdef CONFIG_COMPAT if (core_is_compat(core)) return restorer_sym(restorer_blob, arch_export_unmap_compat); #endif return restorer_sym(restorer_blob, arch_export_unmap); } static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) { void *mem = MAP_FAILED; void *restore_task_exec_start; long new_sp; long ret; long rst_mem_size; long memzone_size; struct thread_restore_args *thread_args; struct restore_mem_zone *mz; #ifdef CONFIG_VDSO struct vdso_maps vdso_maps_rt; unsigned long vdso_rt_size = 0; #endif struct vm_area_list self_vmas; struct vm_area_list *vmas = &rsti(current)->vmas; int i, siginfo_n; unsigned long creds_pos = 0; unsigned long creds_pos_next; sigset_t blockmask; pr_info("Restore via sigreturn\n"); /* pr_info_vma_list(&self_vma_list); */ BUILD_BUG_ON(sizeof(struct task_restore_args) & 1); BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1); /* * Read creds info for every thread and allocate memory * needed so we can use this data inside restorer. */ if (rst_prep_creds(pid, core, &creds_pos)) goto err_nv; /* * We're about to search for free VM area and inject the restorer blob * into it. No irrelevant mmaps/mremaps beyond this point, otherwise * this unwanted mapping might get overlapped by the restorer. 
*/ ret = parse_self_maps_lite(&self_vmas); if (ret < 0) goto err; rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); #ifdef CONFIG_VDSO if (core_is_compat(core)) vdso_maps_rt = vdso_maps_compat; else vdso_maps_rt = vdso_maps; /* * Figure out how much memory runtime vdso and vvar will need. */ vdso_rt_size = vdso_maps_rt.sym.vdso_size; if (vdso_rt_size && vdso_maps_rt.sym.vvar_size) vdso_rt_size += ALIGN(vdso_maps_rt.sym.vvar_size, PAGE_SIZE); task_args->bootstrap_len += vdso_rt_size; #endif /* * Restorer is a blob (code + args) that will get mapped in some * place, that should _not_ intersect with both -- current mappings * and mappings of the task we're restoring here. The subsequent * call finds the start address for the restorer. * * After the start address is found we populate it with the restorer * parts one by one (some are remap-ed, some are mmap-ed and copied * or inited from scratch). */ mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; } pr_info("Found bootstrap VMA hint at: %p (needs ~%ldK)\n", mem, KBYTES(task_args->bootstrap_len)); ret = remap_restorer_blob(mem); if (ret < 0) goto err; /* * Prepare a memory map for restorer. Note a thread space * might be completely unused so it's here just for convenience. 
*/ task_args->clone_restore_fn = restorer_sym(mem, arch_export_restore_thread); restore_task_exec_start = restorer_sym(mem, arch_export_restore_task); rsti(current)->munmap_restorer = restorer_munmap_addr(core, mem); task_args->bootstrap_start = mem; mem += restorer_len; /* VMA we need for stacks and sigframes for threads */ if (mmap(mem, memzone_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_FIXED, 0, 0) != mem) { pr_err("Can't mmap section for restore code\n"); goto err; } memzero(mem, memzone_size); mz = mem; mem += memzone_size; /* New home for task_restore_args and thread_restore_args */ task_args = mremap(task_args, alen, alen, MREMAP_MAYMOVE|MREMAP_FIXED, mem); if (task_args != mem) { pr_perror("Can't move task args"); goto err; } task_args->rst_mem = mem; task_args->rst_mem_size = rst_mem_size + alen; thread_args = (struct thread_restore_args *)(task_args + 1); /* * And finally -- the rest arguments referenced by task_ and * thread_restore_args. Pointers will get remapped below. */ mem += alen; if (rst_mem_remap(mem)) goto err; /* * At this point we've found a gap in VM that fits in both -- current * and target tasks' mappings -- and its structure is * * | restorer code | memzone (stacks and sigframes) | arguments | * * Arguments is task_restore_args, thread_restore_args-s and all * the bunch of objects allocated with rst_mem_alloc(). * Note, that the task_args itself is inside the 3rd section and (!) 
* it gets unmapped at the very end of __export_restore_task */ task_args->proc_fd = dup(get_service_fd(PROC_FD_OFF)); if (task_args->proc_fd < 0) { pr_perror("can't dup proc fd"); goto err; } task_args->breakpoint = &rsti(current)->breakpoint; task_args->fault_strategy = fi_strategy; sigemptyset(&blockmask); sigaddset(&blockmask, SIGCHLD); if (sigprocmask(SIG_BLOCK, &blockmask, NULL) == -1) { pr_perror("Can not set mask of blocked signals"); return -1; } task_args->task_entries = rst_mem_remap_ptr(task_entries_pos, RM_SHREMAP); task_args->premmapped_addr = (unsigned long)rsti(current)->premmapped_addr; task_args->premmapped_len = rsti(current)->premmapped_len; task_args->task_size = kdat.task_size; RST_MEM_FIXUP_PPTR(task_args->vmas); RST_MEM_FIXUP_PPTR(task_args->rings); RST_MEM_FIXUP_PPTR(task_args->tcp_socks); RST_MEM_FIXUP_PPTR(task_args->timerfd); RST_MEM_FIXUP_PPTR(task_args->posix_timers); RST_MEM_FIXUP_PPTR(task_args->siginfo); RST_MEM_FIXUP_PPTR(task_args->rlims); RST_MEM_FIXUP_PPTR(task_args->helpers); RST_MEM_FIXUP_PPTR(task_args->zombies); RST_MEM_FIXUP_PPTR(task_args->seccomp_filters); RST_MEM_FIXUP_PPTR(task_args->vma_ios); if (core->tc->has_seccomp_mode) task_args->seccomp_mode = core->tc->seccomp_mode; task_args->compatible_mode = core_is_compat(core); /* * Arguments for task restoration. */ BUG_ON(core->mtype != CORE_ENTRY__MARCH); task_args->logfd = log_get_fd(); task_args->loglevel = log_get_loglevel(); log_get_logstart(&task_args->logstart); task_args->sigchld_act = sigchld_act; strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm)); /* * Fill up per-thread data. 
*/ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; k_rtsigset_t *blkset = NULL; thread_args[i].pid = current->threads[i].ns[0].virt; thread_args[i].siginfo_n = siginfo_priv_nr[i]; thread_args[i].siginfo = task_args->siginfo; thread_args[i].siginfo += siginfo_n; siginfo_n += thread_args[i].siginfo_n; /* skip self */ if (thread_args[i].pid == pid) { task_args->t = thread_args + i; tcore = core; blkset = (void *)&tcore->tc->blk_sigset; } else { tcore = current->core[i]; if (tcore->thread_core->has_blk_sigset) blkset = (void *)&tcore->thread_core->blk_sigset; } if ((tcore->tc || tcore->ids) && thread_args[i].pid != pid) { pr_err("Thread has optional fields present %d\n", thread_args[i].pid); ret = -1; } if (ret < 0) { pr_err("Can't read core data for thread %d\n", thread_args[i].pid); goto err; } thread_args[i].ta = task_args; thread_args[i].gpregs = *CORE_THREAD_ARCH_INFO(tcore)->gpregs; thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); rst_reloc_creds(&thread_args[i], &creds_pos_next); thread_args[i].futex_rla = tcore->thread_core->futex_rla; thread_args[i].futex_rla_len = tcore->thread_core->futex_rla_len; thread_args[i].pdeath_sig = tcore->thread_core->pdeath_sig; if (tcore->thread_core->pdeath_sig > _KNSIG) { pr_err("Pdeath signal is too big\n"); goto err; } ret = prep_sched_info(&thread_args[i].sp, tcore->thread_core); if (ret) goto err; thread_args[i].mz = mz + i; sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe; if (construct_sigframe(sigframe, sigframe, blkset, tcore)) goto err; if (thread_args[i].pid != pid) core_entry__free_unpacked(tcore, NULL); pr_info("Thread %4d stack %8p rt_sigframe %8p\n", i, mz[i].stack, mz[i].rt_sigframe); } #ifdef CONFIG_VDSO /* * Restorer needs own copy of vdso parameters. 
Runtime * vdso must be kept non intersecting with anything else, * since we need it being accessible even when own * self-vmas are unmaped. */ mem += rst_mem_size; task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; task_args->can_map_vdso = kdat.can_map_vdso; #endif new_sp = restorer_stack(task_args->t->mz); /* No longer need it */ core_entry__free_unpacked(core, NULL); xfree(current->core); /* * Now prepare run-time data for threads restore. */ task_args->nr_threads = current->nr_threads; task_args->thread_args = thread_args; /* * Make root and cwd restore _that_ late not to break any * attempts to open files by paths above (e.g. /proc). */ if (restore_fs(current)) goto err; close_image_dir(); close_proc(); close_service_fd(ROOT_FD_OFF); close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); __gcov_flush(); pr_info("task_args: %p\n" "task_args->pid: %d\n" "task_args->nr_threads: %d\n" "task_args->clone_restore_fn: %p\n" "task_args->thread_args: %p\n", task_args, task_args->t->pid, task_args->nr_threads, task_args->clone_restore_fn, task_args->thread_args); /* * An indirect call to task_restore, note it never returns * and restoring core is extremely destructive. 
*/ JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args); err: free_mappings(&self_vmas); err_nv: /* Just to be sure */ exit(1); return -1; } criu-3.6/criu/cr-service.c000066400000000000000000000603461317335042600155110ustar00rootroot00000000000000#ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "version.h" #include "crtools.h" #include "cr_options.h" #include "external.h" #include "util.h" #include "criu-log.h" #include "cpu.h" #include "files.h" #include "pstree.h" #include "cr-service.h" #include "cr-service-const.h" #include "page-xfer.h" #include "net.h" #include "mount.h" #include "filesystems.h" #include "cgroup.h" #include "cgroup-props.h" #include "action-scripts.h" #include "sockets.h" #include "irmap.h" #include "kerndat.h" #include "proc_parse.h" #include #include #include "common/scm.h" #include "uffd.h" #include "setproctitle.h" #include "cr-errno.h" #include "namespaces.h" unsigned int service_sk_ino = -1; static int recv_criu_msg(int socket_fd, CriuReq **req) { unsigned char *buf; int len; len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK); if (len == -1) { pr_perror("Can't read request"); return -1; } buf = xmalloc(len); if (!buf) return -ENOMEM; len = recv(socket_fd, buf, len, MSG_TRUNC); if (len == -1) { pr_perror("Can't read request"); goto err; } if (len == 0) { pr_info("Client exited unexpectedly\n"); errno = ECONNRESET; goto err; } *req = criu_req__unpack(NULL, len, buf); if (!*req) { pr_perror("Failed unpacking request"); goto err; } xfree(buf); return 0; err: xfree(buf); return -1; } static int send_criu_msg_with_fd(int socket_fd, CriuResp *msg, int fd) { unsigned char *buf; int len, ret; len = criu_resp__get_packed_size(msg); buf = xmalloc(len); if (!buf) return -ENOMEM; if (criu_resp__pack(msg, buf) != len) { pr_perror("Failed packing response"); goto err; } if (fd >= 0) { ret = send_fds(socket_fd, NULL, 0, &fd, 
1, buf, len); } else ret = write(socket_fd, buf, len); if (ret < 0) { pr_perror("Can't send response"); goto err; } xfree(buf); return 0; err: xfree(buf); return -1; } static int send_criu_msg(int socket_fd, CriuResp *msg) { return send_criu_msg_with_fd(socket_fd, msg, -1); } static void set_resp_err(CriuResp *resp) { resp->cr_errno = get_cr_errno(); resp->has_cr_errno = resp->cr_errno ? true : false; resp->cr_errmsg = log_first_err(); } static void send_criu_err(int sk, char *msg) { CriuResp resp = CRIU_RESP__INIT; pr_perror("RPC error: %s", msg); resp.type = CRIU_REQ_TYPE__EMPTY; resp.success = false; set_resp_err(&resp); send_criu_msg(sk, &resp); } int send_criu_dump_resp(int socket_fd, bool success, bool restored) { CriuResp msg = CRIU_RESP__INIT; CriuDumpResp resp = CRIU_DUMP_RESP__INIT; msg.type = CRIU_REQ_TYPE__DUMP; msg.success = success; set_resp_err(&msg); msg.dump = &resp; resp.has_restored = true; resp.restored = restored; return send_criu_msg(socket_fd, &msg); } static int send_criu_pre_dump_resp(int socket_fd, bool success) { CriuResp msg = CRIU_RESP__INIT; msg.type = CRIU_REQ_TYPE__PRE_DUMP; msg.success = success; set_resp_err(&msg); return send_criu_msg(socket_fd, &msg); } int send_criu_restore_resp(int socket_fd, bool success, int pid) { CriuResp msg = CRIU_RESP__INIT; CriuRestoreResp resp = CRIU_RESTORE_RESP__INIT; msg.type = CRIU_REQ_TYPE__RESTORE; msg.success = success; set_resp_err(&msg); msg.restore = &resp; resp.pid = pid; return send_criu_msg(socket_fd, &msg); } int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) { int ret; CriuResp msg = CRIU_RESP__INIT; CriuReq *req; CriuNotify cn = CRIU_NOTIFY__INIT; msg.type = CRIU_REQ_TYPE__NOTIFY; msg.success = true; msg.notify = &cn; cn.script = name; switch (act) { case ACT_SETUP_NS: case ACT_POST_RESTORE: /* * FIXME pid is required only once on * restore. Need some more sane way of * checking this. 
*/ cn.has_pid = true; cn.pid = root_item->pid->real; break; default: break; } ret = send_criu_msg_with_fd(sk, &msg, fd); if (ret < 0) return ret; ret = recv_criu_msg(sk, &req); if (ret < 0) return ret; if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { pr_err("RPC client reported script error\n"); return -1; } criu_req__free_unpacked(req, NULL); return 0; } static char images_dir[PATH_MAX]; static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); char images_dir_path[PATH_MAX]; char work_dir_path[PATH_MAX]; char status_fd[PATH_MAX]; int i; if (getsockopt(sk, SOL_SOCKET, SO_PEERCRED, &ids, &ids_len)) { pr_perror("Can't get socket options"); goto err; } if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; } BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; /* open images_dir */ sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); if (req->parent_img) opts.img_parent = req->parent_img; if (open_image_dir(images_dir_path) < 0) { pr_perror("Can't open images directory"); goto err; } /* get full path to images_dir to use in process title */ if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { pr_perror("Can't readlink %s", images_dir_path); goto err; } /* chdir to work dir */ if (req->has_work_dir_fd) sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); else strcpy(work_dir_path, images_dir_path); if (chdir(work_dir_path)) { pr_perror("Can't chdir to work_dir"); goto err; } /* initiate log file in work dir */ if (req->log_file) { if (strchr(req->log_file, '/')) { pr_perror("No subdirs are allowed in log_file name"); goto err; } opts.output = req->log_file; } else opts.output = DEFAULT_LOG_FILENAME; log_set_loglevel(req->log_level); if (log_init(opts.output) == -1) { pr_perror("Can't initiate log"); goto err; } if (log_keep_err()) { pr_perror("Can't tune log"); goto err; } /* checking flags from client */ if 
(req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; } if (req->has_ext_unix_sk) { opts.ext_unix_sk = req->ext_unix_sk; for (i = 0; i < req->n_unix_sk_ino; i++) { if (unix_sk_id_add((unsigned int)req->unix_sk_ino[i]->inode) < 0) goto err; } } if (req->root) opts.root = req->root; if (req->has_rst_sibling) { if (!opts.swrk_restore) { pr_err("rst_sibling is not allowed in standalone service\n"); goto err; } opts.restore_sibling = req->rst_sibling; } if (req->has_tcp_established) opts.tcp_established_ok = req->tcp_established; if (req->has_tcp_skip_in_flight) opts.tcp_skip_in_flight = req->tcp_skip_in_flight; if (req->has_weak_sysctls) opts.weak_sysctls = req->weak_sysctls; if (req->has_evasive_devices) opts.evasive_devices = req->evasive_devices; if (req->has_shell_job) opts.shell_job = req->shell_job; if (req->has_file_locks) opts.handle_file_locks = req->file_locks; if (req->has_track_mem) opts.track_mem = req->track_mem; if (req->has_link_remap) opts.link_remap_ok = req->link_remap; if (req->has_auto_dedup) opts.auto_dedup = req->auto_dedup; if (req->has_force_irmap) opts.force_irmap = req->force_irmap; if (req->n_exec_cmd > 0) { opts.exec_cmd = xmalloc((req->n_exec_cmd + 1) * sizeof(char *)); memcpy(opts.exec_cmd, req->exec_cmd, req->n_exec_cmd * sizeof(char *)); opts.exec_cmd[req->n_exec_cmd] = NULL; } if (req->has_lazy_pages) { opts.lazy_pages = req->lazy_pages; } if (req->ps) { opts.port = htons((short)req->ps->port); if (!opts.lazy_pages) { opts.use_page_server = true; opts.addr = req->ps->address; if (req->ps->has_fd) { if (!opts.swrk_restore) goto err; opts.ps_socket = req->ps->fd; } } } if (req->notify_scripts && add_rpc_notify(sk)) goto err; for (i = 0; i < req->n_veths; i++) { if (veth_pair_add(req->veths[i]->if_in, req->veths[i]->if_out)) goto err; } for (i = 0; i < req->n_ext_mnt; i++) { if (ext_mount_add(req->ext_mnt[i]->key, req->ext_mnt[i]->val)) goto err; } for 
(i = 0; i < req->n_join_ns; i++) { if (join_ns_add(req->join_ns[i]->ns, req->join_ns[i]->ns_file, req->join_ns[i]->extra_opt)) goto err; } if (req->n_inherit_fd && !opts.swrk_restore) { pr_err("inherit_fd is not allowed in standalone service\n"); goto err; } for (i = 0; i < req->n_inherit_fd; i++) { if (inherit_fd_add(req->inherit_fd[i]->fd, req->inherit_fd[i]->key)) goto err; } for (i = 0; i < req->n_external; i++) if (add_external(req->external[i])) goto err; for (i = 0; i < req->n_cg_root; i++) { if (new_cg_root_add(req->cg_root[i]->ctrl, req->cg_root[i]->path)) goto err; } for (i = 0; i < req->n_enable_fs; i++) { if (!add_fsname_auto(req->enable_fs[i])) goto err; } for (i = 0; i < req->n_skip_mnt; i++) { if (!add_skip_mount(req->skip_mnt[i])) goto err; } if (req->has_cpu_cap) opts.cpu_cap = req->cpu_cap; /* * FIXME: For backward compatibility we setup * soft mode here, need to enhance to support * other modes as well via separate option * probably. */ if (req->has_manage_cgroups) opts.manage_cgroups = req->manage_cgroups ? 
CG_MODE_SOFT : CG_MODE_IGNORE; /* Override the manage_cgroup if mode is set explicitly */ if (req->has_manage_cgroups_mode) { unsigned int mode; switch (req->manage_cgroups_mode) { case CRIU_CG_MODE__IGNORE: mode = CG_MODE_IGNORE; break; case CRIU_CG_MODE__CG_NONE: mode = CG_MODE_NONE; break; case CRIU_CG_MODE__PROPS: mode = CG_MODE_PROPS; break; case CRIU_CG_MODE__SOFT: mode = CG_MODE_SOFT; break; case CRIU_CG_MODE__FULL: mode = CG_MODE_FULL; break; case CRIU_CG_MODE__STRICT: mode = CG_MODE_STRICT; break; case CRIU_CG_MODE__DEFAULT: mode = CG_MODE_DEFAULT; break; default: goto err; } opts.manage_cgroups = mode; } if (req->freeze_cgroup) opts.freeze_cgroup = req->freeze_cgroup; if (req->has_timeout) opts.timeout = req->timeout; if (req->cgroup_props) opts.cgroup_props = req->cgroup_props; if (req->cgroup_props_file) opts.cgroup_props_file = req->cgroup_props_file; for (i = 0; i < req->n_cgroup_dump_controller; i++) { if (!cgp_add_dump_controller(req->cgroup_dump_controller[i])) goto err; } if (req->has_auto_ext_mnt) opts.autodetect_ext_mounts = req->auto_ext_mnt; if (req->has_ext_sharing) opts.enable_external_sharing = req->ext_sharing; if (req->has_ext_masters) opts.enable_external_masters = req->ext_masters; if (req->has_ghost_limit) opts.ghost_limit = req->ghost_limit; if (req->has_empty_ns) { opts.empty_ns = req->empty_ns; if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } if (req->n_irmap_scan_paths) { for (i = 0; i < req->n_irmap_scan_paths; i++) { if (irmap_scan_path_add(req->irmap_scan_paths[i])) goto err; } } if (req->has_status_fd) { sprintf(status_fd, "/proc/%d/fd/%d", ids.pid, req->status_fd); opts.status_fd = open(status_fd, O_WRONLY); if (opts.status_fd < 0) goto err; } if (req->orphan_pts_master) opts.orphan_pts_master = true; if (check_namespace_opts()) goto err; return 0; err: set_cr_errno(EBADRQC); return -1; } static int dump_using_req(int sk, CriuOpts *req) { bool success = false; bool self_dump = !req->pid; if (setup_opts_from_req(sk, req)) goto 
exit; setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); /* * FIXME -- cr_dump_tasks() may return code from custom * scripts, that can be positive. However, right now we * don't have ability to push scripts via RPC, so psitive * ret values are impossible here. */ if (cr_dump_tasks(req->pid)) goto exit; success = true; exit: if (req->leave_running || !self_dump || !success) { if (send_criu_dump_resp(sk, success, false) == -1) { pr_perror("Can't send response"); success = false; } } return success ? 0 : 1; } static int restore_using_req(int sk, CriuOpts *req) { bool success = false; /* * We can't restore processes under arbitrary task yet. * Thus for now we force the detached restore under the * cr service task. */ opts.restore_detach = true; if (setup_opts_from_req(sk, req)) goto exit; setproctitle("restore --rpc -D %s", images_dir); if (cr_restore_tasks()) goto exit; success = true; exit: if (send_criu_restore_resp(sk, success, root_item ? root_item->pid->real : -1) == -1) { pr_perror("Can't send response"); success = false; } if (success && opts.exec_cmd) { int logfd; logfd = log_get_fd(); if (dup2(logfd, STDOUT_FILENO) == -1 || dup2(logfd, STDERR_FILENO) == -1) { pr_perror("Failed to redirect stdout and stderr to the logfile"); return 1; } close_pid_proc(); close(sk); execvp(opts.exec_cmd[0], opts.exec_cmd); pr_perror("Failed to exec cmd %s", opts.exec_cmd[0]); success = false; } return success ? 
0 : 1; } static int check(int sk) { CriuResp resp = CRIU_RESP__INIT; resp.type = CRIU_REQ_TYPE__CHECK; setproctitle("check --rpc"); if (!cr_check()) resp.success = true; return send_criu_msg(sk, &resp); } static int pre_dump_using_req(int sk, CriuOpts *req) { int pid, status; bool success = false; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } if (pid == 0) { int ret = 1; if (setup_opts_from_req(sk, req)) goto cout; setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); if (cr_pre_dump_tasks(req->pid)) goto cout; ret = 0; cout: exit(ret); } wait(&status); if (!WIFEXITED(status)) goto out; if (WEXITSTATUS(status) != 0) goto out; success = true; out: if (send_criu_pre_dump_resp(sk, success) == -1) { pr_perror("Can't send pre-dump resp"); success = false; } return success ? 0 : -1; } static int pre_dump_loop(int sk, CriuReq *msg) { int ret; do { ret = pre_dump_using_req(sk, msg->opts); if (ret < 0) return ret; criu_req__free_unpacked(msg, NULL); if (recv_criu_msg(sk, &msg) == -1) { pr_perror("Can't recv request"); return -1; } } while (msg->type == CRIU_REQ_TYPE__PRE_DUMP); if (msg->type != CRIU_REQ_TYPE__DUMP) { send_criu_err(sk, "Bad req seq"); return -1; } return dump_using_req(sk, msg->opts); } struct ps_info { int pid; unsigned short port; }; static int start_page_server_req(int sk, CriuOpts *req) { int ret = -1, pid, start_pipe[2]; ssize_t count; bool success = false; CriuResp resp = CRIU_RESP__INIT; CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; } pid = fork(); if (pid == 0) { close(start_pipe[0]); if (setup_opts_from_req(sk, req)) goto out_ch; setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); pr_debug("Starting page server\n"); pid = cr_page_server(true, false, start_pipe[1]); if (pid < 0) goto out_ch; info.pid = pid; info.port = opts.port; count = write(start_pipe[1], &info, sizeof(info)); if (count != 
sizeof(info)) goto out_ch; ret = 0; out_ch: if (ret < 0 && pid > 0) kill(pid, SIGKILL); close(start_pipe[1]); exit(ret); } close(start_pipe[1]); wait(&ret); if (WIFEXITED(ret)) { if (WEXITSTATUS(ret)) { pr_err("Child exited with an error\n"); goto out; } } else { pr_err("Child wasn't terminated normally\n"); goto out; } count = read(start_pipe[0], &info, sizeof(info)); close(start_pipe[0]); if (count != sizeof(info)) goto out; success = true; ps.has_pid = true; ps.pid = info.pid; ps.has_port = true; ps.port = info.port; resp.ps = &ps; pr_debug("Page server started\n"); out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; return send_criu_msg(sk, &resp); } static int chk_keepopen_req(CriuReq *msg) { if (!msg->keep_open) return 0; /* * Service may (well, it will) leave some * resources leaked after processing e.g. * dump or restore requests. Before we audit * the code for this, let's first enable * mreq RPCs for those requests we know do * good work */ if (msg->type == CRIU_REQ_TYPE__PAGE_SERVER) /* This just fork()-s so no leaks */ return 0; else if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP || msg->type == CRIU_REQ_TYPE__CPUINFO_CHECK) return 0; else if (msg->type == CRIU_REQ_TYPE__FEATURE_CHECK) return 0; else if (msg->type == CRIU_REQ_TYPE__VERSION) return 0; return -1; } /* * Return the version information, depending on the information * available in version.h */ static int handle_version(int sk, CriuReq * msg) { CriuResp resp = CRIU_RESP__INIT; CriuVersion version = CRIU_VERSION__INIT; /* This assumes we will always have a major and minor version */ version.major = CRIU_VERSION_MAJOR; version.minor = CRIU_VERSION_MINOR; if (strcmp(CRIU_GITID, "0")) { version.gitid = CRIU_GITID; } #ifdef CRIU_VERSION_SUBLEVEL version.has_sublevel = 1; version.sublevel = CRIU_VERSION_SUBLEVEL; #endif #ifdef CRIU_VERSION_EXTRA version.has_extra = 1; version.extra = CRIU_VERSION_EXTRA; #endif #ifdef CRIU_VERSION_NAME /* This is not actually exported in version.h */ 
version.name = CRIU_VERSION_NAME; #endif resp.type = msg->type; resp.success = true; resp.version = &version; return send_criu_msg(sk, &resp); } /* * Generic function to handle CRIU_REQ_TYPE__FEATURE_CHECK. * * The function will have resp.success = true for most cases * and the actual result will be in resp.features. * * For each feature which has been requested in msg->features * the corresponding parameter will be set in resp.features. */ static int handle_feature_check(int sk, CriuReq * msg) { CriuResp resp = CRIU_RESP__INIT; CriuFeatures feat = CRIU_FEATURES__INIT; int pid, status; int ret; /* enable setting of an optional message */ feat.has_mem_track = 1; feat.mem_track = false; feat.has_lazy_pages = 1; feat.lazy_pages = false; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } if (pid == 0) { setproctitle("feature-check --rpc"); if ((msg->features->has_mem_track == 1) && (msg->features->mem_track == true)) feat.mem_track = kdat.has_dirty_track; if ((msg->features->has_lazy_pages == 1) && (msg->features->lazy_pages == true)) feat.lazy_pages = kdat.has_uffd && uffd_noncooperative(); resp.features = &feat; resp.type = msg->type; /* The feature check is working, actual results are in resp.features */ resp.success = true; /* * If this point is reached the information about the features * is transmitted from the forked CRIU process (here). * If an error occured earlier, the feature check response will be * be send from the parent process. */ ret = send_criu_msg(sk, &resp); exit(ret); } ret = waitpid(pid, &status, 0); if (ret == -1) goto out; if (WIFEXITED(status) && !WEXITSTATUS(status)) /* * The child process exited was able to send the answer. * Nothing more to do here. */ return 0; /* * The child process was not able to send an answer. Tell * the RPC client that something did not work as expected. 
*/ out: resp.type = msg->type; resp.success = false; return send_criu_msg(sk, &resp); } static int handle_cpuinfo(int sk, CriuReq *msg) { CriuResp resp = CRIU_RESP__INIT; bool success = false; int pid, status; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } if (pid == 0) { int ret = 1; if (setup_opts_from_req(sk, msg->opts)) goto cout; setproctitle("cpuinfo %s --rpc -D %s", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", images_dir); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); else ret = cpuinfo_check(); cout: exit(ret); } wait(&status); if (!WIFEXITED(status)) goto out; switch (WEXITSTATUS(status)) { case (-ENOTSUP & 0xff): resp.has_cr_errno = 1; /* * Let's return the actual error code and * not just (-ENOTSUP & 0xff) */ resp.cr_errno = ENOTSUP; break; case 0: success = true; break; default: break; } out: resp.type = msg->type; resp.success = success; return send_criu_msg(sk, &resp); } int cr_service_work(int sk) { int ret = -1; CriuReq *msg = 0; more: if (recv_criu_msg(sk, &msg) == -1) { pr_perror("Can't recv request"); goto err; } if (chk_keepopen_req(msg)) goto err; switch (msg->type) { case CRIU_REQ_TYPE__DUMP: ret = dump_using_req(sk, msg->opts); break; case CRIU_REQ_TYPE__RESTORE: ret = restore_using_req(sk, msg->opts); break; case CRIU_REQ_TYPE__CHECK: ret = check(sk); break; case CRIU_REQ_TYPE__PRE_DUMP: ret = pre_dump_loop(sk, msg); break; case CRIU_REQ_TYPE__PAGE_SERVER: ret = start_page_server_req(sk, msg->opts); break; case CRIU_REQ_TYPE__CPUINFO_DUMP: case CRIU_REQ_TYPE__CPUINFO_CHECK: ret = handle_cpuinfo(sk, msg); break; case CRIU_REQ_TYPE__FEATURE_CHECK: ret = handle_feature_check(sk, msg); break; case CRIU_REQ_TYPE__VERSION: ret = handle_version(sk, msg); break; default: send_criu_err(sk, "Invalid req"); break; } if (!ret && msg->keep_open) { criu_req__free_unpacked(msg, NULL); ret = -1; goto more; } err: return ret; } static void reap_worker(int signo) { int saved_errno; int status; pid_t 
pid; saved_errno = errno; /* * As we block SIGCHLD, lets wait for every child that has * already changed state. */ while (1) { pid = waitpid(-1, &status, WNOHANG); if (pid <= 0) { errno = saved_errno; return; } if (WIFEXITED(status)) pr_info("Worker(pid %d) exited with %d\n", pid, WEXITSTATUS(status)); else if (WIFSIGNALED(status)) pr_info("Worker(pid %d) was killed by %d: %s\n", pid, WTERMSIG(status), strsignal(WTERMSIG(status))); } } static int setup_sigchld_handler() { struct sigaction action; sigemptyset(&action.sa_mask); sigaddset(&action.sa_mask, SIGCHLD); action.sa_handler = reap_worker; action.sa_flags = SA_RESTART; if (sigaction(SIGCHLD, &action, NULL)) { pr_perror("Can't setup SIGCHLD handler"); return -1; } return 0; } static int restore_sigchld_handler() { struct sigaction action; sigemptyset(&action.sa_mask); sigaddset(&action.sa_mask, SIGCHLD); action.sa_handler = SIG_DFL; action.sa_flags = SA_RESTART; if (sigaction(SIGCHLD, &action, NULL)) { pr_perror("Can't restore SIGCHLD handler"); return -1; } return 0; } int cr_service(bool daemon_mode) { int server_fd = -1; int child_pid; struct sockaddr_un client_addr; socklen_t client_addr_len; { struct sockaddr_un server_addr; socklen_t server_addr_len; server_fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0); if (server_fd == -1) { pr_perror("Can't initialize service socket"); goto err; } memset(&server_addr, 0, sizeof(server_addr)); memset(&client_addr, 0, sizeof(client_addr)); server_addr.sun_family = AF_LOCAL; if (opts.addr == NULL) { pr_warn("Binding to local dir address!\n"); opts.addr = CR_DEFAULT_SERVICE_ADDRESS; } strcpy(server_addr.sun_path, opts.addr); server_addr_len = strlen(server_addr.sun_path) + sizeof(server_addr.sun_family); client_addr_len = sizeof(client_addr); unlink(server_addr.sun_path); if (bind(server_fd, (struct sockaddr *) &server_addr, server_addr_len) == -1) { pr_perror("Can't bind"); goto err; } pr_info("The service socket is bound to %s\n", server_addr.sun_path); /* change service 
socket permissions, so anyone can connect to it */ if (chmod(server_addr.sun_path, 0666)) { pr_perror("Can't change permissions of the service socket"); goto err; } if (listen(server_fd, 16) == -1) { pr_perror("Can't listen for socket connections"); goto err; } } if (daemon_mode) { if (daemon(1, 0) == -1) { pr_perror("Can't run service server in the background"); goto err; } } if (opts.pidfile) { if (write_pidfile(getpid()) == -1) { pr_perror("Can't write pidfile"); goto err; } } if (setup_sigchld_handler()) goto err; while (1) { int sk; pr_info("Waiting for connection...\n"); sk = accept(server_fd, (struct sockaddr *)&client_addr, &client_addr_len); if (sk == -1) { pr_perror("Can't accept connection"); goto err; } pr_info("Connected.\n"); child_pid = fork(); if (child_pid == 0) { int ret; if (restore_sigchld_handler()) exit(1); close(server_fd); init_opts(); ret = cr_service_work(sk); close(sk); exit(ret != 0); } if (child_pid < 0) pr_perror("Can't fork a child"); close(sk); } err: close_safe(&server_fd); return 1; } criu-3.6/criu/crtools.c000066400000000000000000000656361317335042600151430ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "int.h" #include "page.h" #include "common/compiler.h" #include "crtools.h" #include "cr_options.h" #include "external.h" #include "files.h" #include "sk-inet.h" #include "net.h" #include "version.h" #include "page-xfer.h" #include "tty.h" #include "file-lock.h" #include "cr-service.h" #include "plugin.h" #include "criu-log.h" #include "util.h" #include "mount.h" #include "filesystems.h" #include "namespaces.h" #include "cgroup.h" #include "cgroup-props.h" #include "cpu.h" #include "action-scripts.h" #include "irmap.h" #include "fault-injection.h" #include "lsm.h" #include "proc_parse.h" #include "kerndat.h" #include "setproctitle.h" #include "sysctl.h" #include "../soccr/soccr.h" struct cr_options 
opts; void init_opts(void) { memset(&opts, 0, sizeof(opts)); /* Default options */ opts.final_state = TASK_DEAD; INIT_LIST_HEAD(&opts.ext_mounts); INIT_LIST_HEAD(&opts.inherit_fds); INIT_LIST_HEAD(&opts.external); INIT_LIST_HEAD(&opts.join_ns); INIT_LIST_HEAD(&opts.new_cgroup_roots); INIT_LIST_HEAD(&opts.irmap_scan_paths); opts.cpu_cap = CPU_CAP_DEFAULT; opts.manage_cgroups = CG_MODE_DEFAULT; opts.ps_socket = -1; opts.ghost_limit = DEFAULT_GHOST_LIMIT; opts.timeout = DEFAULT_TIMEOUT; opts.empty_ns = 0; opts.status_fd = -1; } static int parse_join_ns(const char *ptr) { char *aux, *ns_file, *extra_opts = NULL; aux = strchr(ptr, ':'); if (aux == NULL) return -1; *aux = '\0'; ns_file = aux + 1; aux = strchr(ns_file, ','); if (aux != NULL) { *aux = '\0'; extra_opts = aux + 1; } else { extra_opts = NULL; } if (join_ns_add(ptr, ns_file, extra_opts)) return -1; return 0; } static int parse_cpu_cap(struct cr_options *opts, const char *optarg) { bool inverse = false; #define ____cpu_set_cap(__opts, __cap, __inverse) \ do { \ if ((__inverse)) \ (__opts)->cpu_cap &= ~(__cap); \ else \ (__opts)->cpu_cap |= (__cap); \ } while (0) if (!optarg) { ____cpu_set_cap(opts, CPU_CAP_ALL, false); return 0; } while (*optarg) { if (optarg[0] == '^') { inverse = !inverse; optarg++; continue; } else if (optarg[0] == ',') { inverse = false; optarg++; continue; } if (!strncmp(optarg, "fpu", 3)) { ____cpu_set_cap(opts, CPU_CAP_FPU, inverse); optarg += 3; } else if (!strncmp(optarg, "all", 3)) { ____cpu_set_cap(opts, CPU_CAP_ALL, inverse); optarg += 3; } else if (!strncmp(optarg, "none", 4)) { if (inverse) opts->cpu_cap = CPU_CAP_ALL; else opts->cpu_cap = CPU_CAP_NONE; optarg += 4; } else if (!strncmp(optarg, "cpu", 3)) { ____cpu_set_cap(opts, CPU_CAP_CPU, inverse); optarg += 3; } else if (!strncmp(optarg, "ins", 3)) { ____cpu_set_cap(opts, CPU_CAP_INS, inverse); optarg += 3; } else goto Esyntax; } #undef ____cpu_set_cap return 0; Esyntax: pr_err("Unknown FPU mode `%s' selected\n", optarg); 
return -1; } static int parse_manage_cgroups(struct cr_options *opts, const char *optarg) { if (!optarg) { opts->manage_cgroups = CG_MODE_SOFT; return 0; } if (!strcmp(optarg, "none")) { opts->manage_cgroups = CG_MODE_NONE; } else if (!strcmp(optarg, "props")) { opts->manage_cgroups = CG_MODE_PROPS; } else if (!strcmp(optarg, "soft")) { opts->manage_cgroups = CG_MODE_SOFT; } else if (!strcmp(optarg, "full")) { opts->manage_cgroups = CG_MODE_FULL; } else if (!strcmp(optarg, "strict")) { opts->manage_cgroups = CG_MODE_STRICT; } else goto Esyntax; return 0; Esyntax: pr_err("Unknown cgroups mode `%s' selected\n", optarg); return -1; } static size_t parse_size(char *optarg) { if (index(optarg, 'K')) return (size_t)KILO(atol(optarg)); else if (index(optarg, 'M')) return (size_t)MEGA(atol(optarg)); else if (index(optarg, 'G')) return (size_t)GIGA(atol(optarg)); return (size_t)atol(optarg); } bool deprecated_ok(char *what) { if (opts.deprecated_ok) return true; pr_err("Deprecated functionality (%s) rejected.\n", what); pr_err("Use the --deprecated option or set CRIU_DEPRECATED environment.\n"); pr_err("For details visit https://criu.org/Deprecation\n"); return false; } int main(int argc, char *argv[], char *envp[]) { #define BOOL_OPT(OPT_NAME, SAVE_TO) \ {OPT_NAME, no_argument, SAVE_TO, true},\ {"no-" OPT_NAME, no_argument, SAVE_TO, false} pid_t pid = 0, tree_id = 0; int ret = -1; bool usage_error = true; bool has_exec_cmd = false; bool has_sub_command; int opt, idx; int log_level = DEFAULT_LOGLEVEL; char *imgs_dir = "."; static const char short_opts[] = "dSsRf:F:t:p:hcD:o:v::x::Vr:jJ:lW:L:M:"; static struct option long_opts[] = { { "tree", required_argument, 0, 't' }, { "pid", required_argument, 0, 'p' }, { "leave-stopped", no_argument, 0, 's' }, { "leave-running", no_argument, 0, 'R' }, BOOL_OPT("restore-detached", &opts.restore_detach), BOOL_OPT("restore-sibling", &opts.restore_sibling), BOOL_OPT("daemon", &opts.restore_detach), { "contents", no_argument, 0, 'c' }, { 
"file", required_argument, 0, 'f' }, { "fields", required_argument, 0, 'F' }, { "images-dir", required_argument, 0, 'D' }, { "work-dir", required_argument, 0, 'W' }, { "log-file", required_argument, 0, 'o' }, { "join-ns", required_argument, 0, 'J' }, { "root", required_argument, 0, 'r' }, { USK_EXT_PARAM, optional_argument, 0, 'x' }, { "help", no_argument, 0, 'h' }, BOOL_OPT(SK_EST_PARAM, &opts.tcp_established_ok), { "close", required_argument, 0, 1043 }, BOOL_OPT("log-pid", &opts.log_file_per_pid), { "version", no_argument, 0, 'V' }, BOOL_OPT("evasive-devices", &opts.evasive_devices), { "pidfile", required_argument, 0, 1046 }, { "veth-pair", required_argument, 0, 1047 }, { "action-script", required_argument, 0, 1049 }, BOOL_OPT(LREMAP_PARAM, &opts.link_remap_ok), BOOL_OPT(OPT_SHELL_JOB, &opts.shell_job), BOOL_OPT(OPT_FILE_LOCKS, &opts.handle_file_locks), BOOL_OPT("page-server", &opts.use_page_server), { "address", required_argument, 0, 1051 }, { "port", required_argument, 0, 1052 }, { "prev-images-dir", required_argument, 0, 1053 }, { "ms", no_argument, 0, 1054 }, BOOL_OPT("track-mem", &opts.track_mem), BOOL_OPT("auto-dedup", &opts.auto_dedup), { "libdir", required_argument, 0, 'L' }, { "cpu-cap", optional_argument, 0, 1057 }, BOOL_OPT("force-irmap", &opts.force_irmap), { "ext-mount-map", required_argument, 0, 'M' }, { "exec-cmd", no_argument, 0, 1059 }, { "manage-cgroups", optional_argument, 0, 1060 }, { "cgroup-root", required_argument, 0, 1061 }, { "inherit-fd", required_argument, 0, 1062 }, { "feature", required_argument, 0, 1063 }, { "skip-mnt", required_argument, 0, 1064 }, { "enable-fs", required_argument, 0, 1065 }, { "enable-external-sharing", no_argument, &opts.enable_external_sharing, true }, { "enable-external-masters", no_argument, &opts.enable_external_masters, true }, { "freeze-cgroup", required_argument, 0, 1068 }, { "ghost-limit", required_argument, 0, 1069 }, { "irmap-scan-path", required_argument, 0, 1070 }, { "lsm-profile", required_argument, 
0, 1071 }, { "timeout", required_argument, 0, 1072 }, { "external", required_argument, 0, 1073 }, { "empty-ns", required_argument, 0, 1074 }, { "lazy-pages", no_argument, 0, 1076 }, BOOL_OPT("extra", &opts.check_extra_features), BOOL_OPT("experimental", &opts.check_experimental_features), { "all", no_argument, 0, 1079 }, { "cgroup-props", required_argument, 0, 1080 }, { "cgroup-props-file", required_argument, 0, 1081 }, { "cgroup-dump-controller", required_argument, 0, 1082 }, BOOL_OPT(SK_INFLIGHT_PARAM, &opts.tcp_skip_in_flight), BOOL_OPT("deprecated", &opts.deprecated_ok), BOOL_OPT("display-stats", &opts.display_stats), BOOL_OPT("weak-sysctls", &opts.weak_sysctls), { "status-fd", required_argument, 0, 1088 }, BOOL_OPT(SK_CLOSE_PARAM, &opts.tcp_close), { "verbosity", optional_argument, 0, 'v' }, { }, }; #undef BOOL_OPT BUILD_BUG_ON(PAGE_SIZE != PAGE_IMAGE_SIZE); BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); if (fault_injection_init()) return 1; cr_pb_init(); setproctitle_init(argc, argv, envp); if (argc < 2) goto usage; init_opts(); if (init_service_fd()) return 1; if (kerndat_init()) return 1; if (!strcmp(argv[1], "swrk")) { if (argc < 3) goto usage; /* * This is to start criu service worker from libcriu calls. * The usage is "criu swrk " and is not for CLI/scripts. * The arguments semantics can change at any time with the * corresponding lib call change. 
*/ opts.swrk_restore = true; return cr_service_work(atoi(argv[2])); } while (1) { idx = -1; opt = getopt_long(argc, argv, short_opts, long_opts, &idx); if (opt == -1) break; if (!opt) continue; switch (opt) { case 's': opts.final_state = TASK_STOPPED; break; case 'R': opts.final_state = TASK_ALIVE; break; case 'x': if (optarg && unix_sk_ids_parse(optarg) < 0) return 1; opts.ext_unix_sk = true; break; case 'p': pid = atoi(optarg); if (pid <= 0) goto bad_arg; break; case 't': tree_id = atoi(optarg); if (tree_id <= 0) goto bad_arg; break; case 'c': opts.show_pages_content = true; break; case 'f': opts.show_dump_file = optarg; break; case 'F': opts.show_fmt = optarg; break; case 'r': opts.root = optarg; break; case 'd': opts.restore_detach = true; break; case 'S': opts.restore_sibling = true; break; case 'D': imgs_dir = optarg; break; case 'W': opts.work_dir = optarg; break; case 'o': opts.output = optarg; break; case 'J': if (parse_join_ns(optarg)) goto bad_arg; break; case 'v': if (optarg) { if (optarg[0] == 'v') /* handle -vvvvv */ log_level += strlen(optarg) + 1; else log_level = atoi(optarg); } else log_level++; break; case 1043: { int fd; fd = atoi(optarg); pr_info("Closing fd %d\n", fd); close(fd); break; } case 1046: opts.pidfile = optarg; break; case 1047: { char *aux; aux = strchr(optarg, '='); if (aux == NULL) goto bad_arg; *aux = '\0'; if (veth_pair_add(optarg, aux + 1)) return 1; } break; case 1049: if (add_script(optarg)) return 1; break; case 1051: opts.addr = optarg; break; case 1052: opts.port = htons(atoi(optarg)); if (!opts.port) goto bad_arg; break; case 'j': opts.shell_job = true; break; case 'l': opts.handle_file_locks = true; break; case 1053: opts.img_parent = optarg; break; case 1057: if (parse_cpu_cap(&opts, optarg)) goto usage; break; case 1058: opts.force_irmap = true; break; case 1054: pr_err("--ms is deprecated; see \"Check options\" of criu --help\n"); return 1; case 'L': opts.libdir = optarg; break; case 1059: has_exec_cmd = true; break; 
case 1060: if (parse_manage_cgroups(&opts, optarg)) goto usage; break; case 1061: { char *path, *ctl; path = strchr(optarg, ':'); if (path) { *path = '\0'; path++; ctl = optarg; } else { path = optarg; ctl = NULL; } if (new_cg_root_add(ctl, path)) return -1; } break; case 1062: if (inherit_fd_parse(optarg) < 0) return 1; break; case 1063: ret = check_add_feature(optarg); if (ret < 0) /* invalid kernel feature name */ return 1; if (ret > 0) /* list kernel features and exit */ return 0; break; case 1064: if (!add_skip_mount(optarg)) return 1; break; case 1065: if (!add_fsname_auto(optarg)) return 1; break; case 1068: opts.freeze_cgroup = optarg; break; case 1069: opts.ghost_limit = parse_size(optarg); break; case 1070: if (irmap_scan_path_add(optarg)) return -1; break; case 1071: opts.lsm_profile = optarg; opts.lsm_supplied = true; break; case 1072: opts.timeout = atoi(optarg); break; case 1076: opts.lazy_pages = true; break; case 'M': { char *aux; if (strcmp(optarg, "auto") == 0) { opts.autodetect_ext_mounts = true; break; } aux = strchr(optarg, ':'); if (aux == NULL) goto bad_arg; *aux = '\0'; if (ext_mount_add(optarg, aux + 1)) return 1; } break; case 1073: if (add_external(optarg)) return 1; break; case 1074: if (!strcmp("net", optarg)) opts.empty_ns |= CLONE_NEWNET; else { pr_err("Unsupported empty namespace: %s\n", optarg); return 1; } break; case 1079: opts.check_extra_features = true; opts.check_experimental_features = true; break; case 1080: opts.cgroup_props = optarg; break; case 1081: opts.cgroup_props_file = optarg; break; case 1082: if (!cgp_add_dump_controller(optarg)) return 1; break; case 1088: if (sscanf(optarg, "%d", &opts.status_fd) != 1) { pr_err("Unable to parse a value of --status-fd\n"); return 1; } break; case 'V': pr_msg("Version: %s\n", CRIU_VERSION); if (strcmp(CRIU_GITID, "0")) pr_msg("GitID: %s\n", CRIU_GITID); return 0; case 'h': usage_error = false; goto usage; default: goto usage; } } if (opts.deprecated_ok) pr_msg("Turn deprecated 
stuff ON\n"); if (opts.tcp_skip_in_flight) pr_msg("Will skip in-flight TCP connections\n"); if (opts.tcp_established_ok) pr_info("Will dump TCP connections\n"); if (opts.link_remap_ok) pr_info("Will allow link remaps on FS\n"); if (opts.weak_sysctls) pr_msg("Will skip non-existant sysctls on restore\n"); if (getenv("CRIU_DEPRECATED")) { pr_msg("Turn deprecated stuff ON via env\n"); opts.deprecated_ok = true; } if (check_namespace_opts()) { pr_msg("Error: namespace flags conflict\n"); return 1; } if (!opts.restore_detach && opts.restore_sibling) { pr_msg("--restore-sibling only makes sense with --restore-detach\n"); return 1; } if (opts.work_dir == NULL) opts.work_dir = imgs_dir; if (optind >= argc) { pr_msg("Error: command is required\n"); goto usage; } if (!strcmp(argv[optind], "exec")) { pr_msg("The \"exec\" action is deprecated by the Compel library.\n"); return -1; } has_sub_command = (argc - optind) > 1; if (has_exec_cmd) { if (!has_sub_command) { pr_msg("Error: --exec-cmd requires a command\n"); goto usage; } if (strcmp(argv[optind], "restore")) { pr_msg("Error: --exec-cmd is available for the restore command only\n"); goto usage; } if (opts.restore_detach) { pr_msg("Error: --restore-detached and --exec-cmd cannot be used together\n"); goto usage; } opts.exec_cmd = xmalloc((argc - optind) * sizeof(char *)); if (!opts.exec_cmd) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; } else { /* No subcommands except for cpuinfo and restore --exec-cmd */ if (strcmp(argv[optind], "cpuinfo") && has_sub_command) { pr_msg("Error: excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", argv[optind]); goto usage; } } /* We must not open imgs dir, if service is called */ if (strcmp(argv[optind], "service")) { ret = open_image_dir(imgs_dir); if (ret < 0) return 1; } /* * When a process group becomes an orphan, * its processes are sent a SIGHUP signal */ if (!strcmp(argv[optind], "restore") && opts.restore_detach && opts.final_state == TASK_STOPPED && opts.shell_job) pr_warn("Stopped and detached shell job will get SIGHUP from OS."); if (chdir(opts.work_dir)) { pr_perror("Can't change directory to %s", opts.work_dir); return 1; } log_set_loglevel(log_level); if (log_init(opts.output)) return 1; libsoccr_set_log(log_level, print_on_level); compel_log_init(vprint_on_level, log_get_loglevel()); pr_debug("Version: %s (gitid %s)\n", CRIU_VERSION, CRIU_GITID); if (opts.deprecated_ok) pr_debug("DEPRECATED ON\n"); if (!list_empty(&opts.inherit_fds)) { if (strcmp(argv[optind], "restore")) { pr_err("--inherit-fd is restore-only option\n"); return 1; } /* now that log file is set up, print inherit fd list */ inherit_fd_log(); } if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); if (!strcmp(argv[optind], "dump")) { if (!tree_id) goto opt_pid_missing; return cr_dump_tasks(tree_id); } if (!strcmp(argv[optind], "pre-dump")) { if (!tree_id) goto opt_pid_missing; return cr_pre_dump_tasks(tree_id) != 0; } if (!strcmp(argv[optind], "restore")) { if (tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); ret = cr_restore_tasks(); if (ret == 0 && opts.exec_cmd) { close_pid_proc(); execvp(opts.exec_cmd[0], opts.exec_cmd); pr_perror("Failed to exec command %s", opts.exec_cmd[0]); ret = 1; } return ret != 0; } if (!strcmp(argv[optind], "show")) { pr_msg("The \"show\" action is deprecated by the CRIT utility.\n"); pr_msg("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; } if (!strcmp(argv[optind], "lazy-pages")) return cr_lazy_pages(opts.daemon_mode) != 0; if (!strcmp(argv[optind], "check")) 
return cr_check() != 0; if (!strcmp(argv[optind], "page-server")) return cr_page_server(opts.daemon_mode, false, -1) != 0; if (!strcmp(argv[optind], "service")) return cr_service(opts.daemon_mode); if (!strcmp(argv[optind], "dedup")) return cr_dedup() != 0; if (!strcmp(argv[optind], "cpuinfo")) { if (!argv[optind + 1]) { pr_msg("Error: cpuinfo requires an action: dump or check\n"); goto usage; } if (!strcmp(argv[optind + 1], "dump")) return cpuinfo_dump(); else if (!strcmp(argv[optind + 1], "check")) return cpuinfo_check(); } pr_msg("Error: unknown command: %s\n", argv[optind]); usage: pr_msg("\n" "Usage:\n" " criu dump|pre-dump -t PID []\n" " criu restore []\n" " criu check [--feature FEAT]\n" " criu page-server\n" " criu service []\n" " criu dedup\n" " criu lazy-pages -D DIR []\n" "\n" "Commands:\n" " dump checkpoint a process/tree identified by pid\n" " pre-dump pre-dump task(s) minimizing their frozen time\n" " restore restore a process/tree\n" " check checks whether the kernel support is up-to-date\n" " page-server launch page server\n" " service launch service\n" " dedup remove duplicates in memory dump\n" " cpuinfo dump writes cpu information into image file\n" " cpuinfo check validates cpu information read from image file\n" ); if (usage_error) { pr_msg("\nTry -h|--help for more info\n"); return 1; } pr_msg("\n" "Most of the true / false long options (the ones without arguments) can be\n" "prefixed with --no- to negate the option (example: --display-stats and\n" "--no-display-stats).\n" "\n" "Dump/Restore options:\n" "\n" "* Generic:\n" " -t|--tree PID checkpoint a process tree identified by PID\n" " -d|--restore-detached detach after restore\n" " -S|--restore-sibling restore root task as sibling\n" " -s|--leave-stopped leave tasks in stopped state after checkpoint\n" " -R|--leave-running leave tasks in running state after checkpoint\n" " -D|--images-dir DIR directory for image files\n" " --pidfile FILE write root task, service or page-server pid to FILE\n" 
" -W|--work-dir DIR directory to cd and write logs/pidfiles/stats to\n" " (if not specified, value of --images-dir is used)\n" " --cpu-cap [CAP] CPU capabilities to write/check. CAP is comma-separated\n" " list of: cpu, fpu, all, ins, none. To disable\n" " a capability, use ^CAP. Empty argument implies all\n" " --exec-cmd execute the command specified after '--' on successful\n" " restore making it the parent of the restored process\n" " --freeze-cgroup use cgroup freezer to collect processes\n" " --weak-sysctls skip restoring sysctls that are not available\n" " --lazy-pages restore pages on demand\n" " this requires running a second instance of criu\n" " in lazy-pages mode: 'criu lazy-pages -D DIR'\n" " --lazy-pages and lazy-pages mode require userfaultfd\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" " Formats of RES on dump:\n" " tty[rdev:dev]\n" " file[mnt_id:inode]\n" " dev[major/minor]:NAME\n" " unix[ino]\n" " mnt[MOUNTPOINT]:COOKIE\n" " mnt[]{:AUTO_OPTIONS}\n" " Formats of RES on restore:\n" " dev[NAME]:DEVPATH\n" " veth[IFNAME]:OUTNAME{@BRIDGE}\n" " macvlan[IFNAME]:OUTNAME\n" " mnt[COOKIE]:ROOT\n" "\n" "* Special resources support:\n" " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" " --" SK_INFLIGHT_PARAM " skip (ignore) in-flight TCP connections\n" " --" SK_CLOSE_PARAM " restore connected TCP sockets in closed state\n" " -r|--root PATH change the root filesystem (when run in mount namespace)\n" " --evasive-devices use any path to a device file if the original one\n" " is inaccessible\n" " --link-remap allow one to link unlinked files back when possible\n" " --ghost-limit size limit max size of deleted file contents inside image\n" " --action-script FILE add an external action script\n" " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" " -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" " -L|--libdir path to a plugin 
directory (by default " CR_PLUGIN_DEFAULT ")\n" " --force-irmap force resolving names for inotify/fsnotify watches\n" " --irmap-scan-path FILE\n" " add a path the irmap hints to scan\n" " --manage-cgroups [m] dump/restore process' cgroups; argument can be one of\n" " 'none', 'props', 'soft' (default), 'full' or 'strict'\n" " --cgroup-root [controller:]/newroot\n" " on dump: change the root for the controller that will\n" " be dumped. By default, only the paths with tasks in\n" " them and below will be dumped.\n" " on restore: change the root cgroup the controller will\n" " be installed into. No controller means that root is the\n" " default for all controllers not specified\n" " --cgroup-props STRING\n" " define cgroup controllers and properties\n" " to be checkpointed, which are described\n" " via STRING using simplified YAML format\n" " --cgroup-props-file FILE\n" " same as --cgroup-props, but taking description\n" " from the path specified\n" " --cgroup-dump-controller NAME\n" " define cgroup controller to be dumped\n" " and skip anything else present in system\n" " --skip-mnt PATH ignore this mountpoint when dumping the mount namespace\n" " --enable-fs FSNAMES a comma separated list of filesystem names or \"all\"\n" " force criu to (try to) dump/restore these filesystem's\n" " mountpoints even if fs is not supported\n" " --inherit-fd fd[NUM]:RES\n" " Inherit file descriptors, treating fd NUM as being\n" " already opened via an existing RES, which can be:\n" " tty[rdev:dev]\n" " pipe[inode]\n" " socket[inode]\n" " file[mnt_id:inode]\n" " path/to/file\n" " --empty-ns net Create a namespace, but don't restore its properties\n" " (assuming it will be restored by action scripts)\n" " -J|--join-ns NS:{PID|NS_FILE}[,OPTIONS]\n" " Join existing namespace and restore process in it.\n" " Namespace can be specified as either pid or file path.\n" " OPTIONS can be used to specify parameters for userns:\n" " user:PID,UID,GID\n" "\n" "Check options:\n" " Without options, 
\"criu check\" checks availability of absolutely required\n" " kernel features, critical for performing dump and restore.\n" " --extra add check for extra kernel features\n" " --experimental add check for experimental kernel features\n" " --all same as --extra --experimental\n" " --feature FEAT only check a particular feature, one of:" ); pr_check_features(" ", ", ", 80); pr_msg( "\n" "* Logging:\n" " -o|--log-file FILE log file name\n" " --log-pid enable per-process logging to separate FILE.pid files\n" " -v[v...]|--verbosity increase verbosity (can use multiple v)\n" " -vNUM|--verbosity=NUM set verbosity to NUM (higher level means more output):\n" " -v1 - only errors and messages\n" " -v2 - also warnings (default level)\n" " -v3 - also information messages and timestamps\n" " -v4 - lots of debug\n" " --display-stats print out dump/restore stats\n" "\n" "* Memory dumping options:\n" " --track-mem turn on memory changes tracker in kernel\n" " --prev-images-dir DIR path to images from previous dump (relative to -D)\n" " --page-server send pages to page server (see options below as well)\n" " --auto-dedup when used on dump it will deduplicate \"old\" data in\n" " pages images of previous dump\n" " when used on restore, as soon as page is restored, it\n" " will be punched from the image\n" "\n" "Page/Service server options:\n" " --address ADDR address of server or service\n" " --port PORT port of page server\n" " -d|--daemon run in the background after creating socket\n" " --status-fd FD write \\0 to the FD and close it once process is ready\n" " to handle requests\n" "\n" "Other options:\n" " -h|--help show this text\n" " -V|--version show version\n" ); return 0; opt_pid_missing: pr_msg("Error: pid not specified\n"); return 1; bad_arg: if (idx < 0) /* short option */ pr_msg("Error: invalid argument for -%c: %s\n", opt, optarg); else /* long option */ pr_msg("Error: invalid argument for --%s: %s\n", long_opts[idx].name, optarg); return 1; } 
criu-3.6/criu/eventfd.c000066400000000000000000000050231317335042600150710ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/compiler.h" #include "imgset.h" #include "eventfd.h" #include "fdinfo.h" #include "image.h" #include "util.h" #include "log.h" #include "protobuf.h" #include "images/eventfd.pb-c.h"
#undef LOG_PREFIX
#define LOG_PREFIX "eventfd: "

/* Restore-side state of one eventfd: its image entry plus the generic file descriptor. */
struct eventfd_file_info {
	EventfdFileEntry		*efe;
	struct file_desc		d;
};

/* Checks whether the link string @link is of the anonymous "[eventfd]" type */
int is_eventfd_link(char *link)
{
	return is_anon_link_type(link, "[eventfd]");
}

/* Log id, flags and counter of @efe, prefixed with @action. */
static void pr_info_eventfd(char *action, EventfdFileEntry *efe)
{
	pr_info("%s: id %#08x flags %#04x counter %#016"PRIx64"\n",
		action, efe->id, efe->flags, efe->counter);
}

/*
 * Dump one eventfd: fill an EventfdFileEntry from the fdinfo of @lfd,
 * stamp it with @id and the flags/owner from @p, and write it wrapped
 * in a FileEntry to the CR_FD_FILES image.
 *
 * Returns the pb_write_one() result, or -1 if parsing fdinfo fails.
 */
static int dump_one_eventfd(int lfd, u32 id, const struct fd_parms *p)
{
	EventfdFileEntry efd = EVENTFD_FILE_ENTRY__INIT;
	FileEntry fe = FILE_ENTRY__INIT;

	if (parse_fdinfo(lfd, FD_TYPES__EVENTFD, &efd))
		return -1;

	efd.id = id;
	efd.flags = p->flags;
	/* The entry only borrows the owner data; the cast drops const. */
	efd.fown = (FownEntry *)&p->fown;

	fe.type = FD_TYPES__EVENTFD;
	fe.id = efd.id;
	fe.efd = &efd;

	pr_info_eventfd("Dumping ", &efd);
	return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE);
}

const struct fdtype_ops eventfd_dump_ops = {
	.type		= FD_TYPES__EVENTFD,
	.dump		= dump_one_eventfd,
};

/*
 * Re-create an eventfd with the saved counter, then apply the saved
 * owner and flags. On success the new descriptor is stored in @new_fd
 * and 0 is returned; on failure -1 is returned and nothing is left open.
 */
static int eventfd_open(struct file_desc *d, int *new_fd)
{
	struct eventfd_file_info *info;
	int tmp;

	info = container_of(d, struct eventfd_file_info, d);

	/* Created with flags 0; the saved flags are applied by rst_file_params(). */
	tmp = eventfd(info->efe->counter, 0);
	if (tmp < 0) {
		pr_perror("Can't create eventfd %#08x",
			  info->efe->id);
		return -1;
	}

	if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) {
		pr_perror("Can't restore params on eventfd %#08x",
			  info->efe->id);
		goto err_close;
	}

	*new_fd = tmp;
	return 0;

err_close:
	close(tmp);
	return -1;
}

static struct file_desc_ops eventfd_desc_ops = {
	.type = FD_TYPES__EVENTFD,
	.open = eventfd_open,
};

/* Image collect callback: attach the decoded entry to @obj and register the file descriptor. */
static int collect_one_efd(void *obj, ProtobufCMessage *msg, struct cr_img *i)
{
	struct eventfd_file_info *info = obj;

	info->efe = pb_msg(msg, EventfdFileEntry);
	pr_info_eventfd("Collected ", info->efe);
	return file_desc_add(&info->d, info->efe->id, &eventfd_desc_ops);
}

struct collect_image_info eventfd_cinfo = {
	.fd_type = CR_FD_EVENTFD_FILE,
	.pb_type = PB_EVENTFD_FILE,
	.priv_size = sizeof(struct eventfd_file_info),
	.collect = collect_one_efd,
};
criu-3.6/criu/eventpoll.c000066400000000000000000000122631317335042600154520ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "crtools.h" #include "common/compiler.h" #include "imgset.h" #include "rst_info.h" #include "eventpoll.h" #include "fdinfo.h" #include "image.h" #include "util.h" #include "log.h" #include "pstree.h" #include "protobuf.h" #include "images/eventpoll.pb-c.h"
#undef LOG_PREFIX
#define LOG_PREFIX "epoll: "

/* Restore-side state of one epoll file: its image entry plus the generic file descriptor. */
struct eventpoll_file_info {
	EventpollFileEntry		*efe;
	struct file_desc		d;
};

/* Checks whether the link string @link is of the anonymous "[eventpoll]" type */
int is_eventpoll_link(char *link)
{
	return is_anon_link_type(link, "[eventpoll]");
}

/* Log one watched-descriptor entry, prefixed with @action. */
static void pr_info_eventpoll_tfd(char *action, EventpollTfdEntry *e)
{
	pr_info("%seventpoll-tfd: id %#08x tfd %#08x events %#08x data %#016"PRIx64"\n",
		action, e->id, e->tfd, e->events, e->data);
}

/* Log id and flags of an epoll file entry, prefixed with @action. */
static void pr_info_eventpoll(char *action, EventpollFileEntry *e)
{
	pr_info("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags);
}

/*
 * Dump one epoll file: collect its entry (including the tfd array) via
 * parse_fdinfo(), write it to the CR_FD_FILES image, then release the
 * tfd entries and the array.
 *
 * Returns the pb_write_one() result, or -1 if parsing fdinfo fails.
 */
static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p)
{
	FileEntry fe = FILE_ENTRY__INIT;
	EventpollFileEntry e = EVENTPOLL_FILE_ENTRY__INIT;
	int i, ret = -1;

	e.id = id;
	e.flags = p->flags;
	e.fown = (FownEntry *)&p->fown;

	if (parse_fdinfo(lfd, FD_TYPES__EVENTPOLL, &e))
		goto out;

	fe.type = FD_TYPES__EVENTPOLL;
	fe.id = e.id;
	fe.epfd = &e;

	pr_info_eventpoll("Dumping ", &e);
	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE);
out:
	/* Runs on both paths; entries are only logged when the write succeeded. */
	for (i = 0; i < e.n_tfd; i++) {
		if (!ret)
			pr_info_eventpoll_tfd("Dumping: ", e.tfd[i]);
		eventpoll_tfd_entry__free_unpacked(e.tfd[i], NULL);
	}
	xfree(e.tfd);

	return ret;
}

const struct fdtype_ops eventpoll_dump_ops = {
	.type		= FD_TYPES__EVENTPOLL,
	.dump		= dump_one_eventpoll,
};

static int eventpoll_post_open(struct file_desc *d, int fd);

/*
 * Open callback for an epoll file. If the master fle has already reached
 * FLE_OPEN, the call is forwarded to eventpoll_post_open(). Otherwise the
 * epoll descriptor is created and its owner/flags restored.
 *
 * Returns 1 after creating the descriptor (stored in @new_fd), -1 on
 * error, or whatever eventpoll_post_open() returns.
 */
static int eventpoll_open(struct file_desc *d, int *new_fd)
{
	struct fdinfo_list_entry *fle = file_master(d);
	struct eventpoll_file_info *info;
	int tmp;

	info = container_of(d, struct eventpoll_file_info, d);

	if (fle->stage >= FLE_OPEN)
		return eventpoll_post_open(d, fle->fe->fd);

	pr_info_eventpoll("Restore ", info->efe);

	tmp = epoll_create(1);
	if (tmp < 0) {
		pr_perror("Can't create epoll %#08x",
			  info->efe->id);
		return -1;
	}

	if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) {
		pr_perror("Can't restore file params on epoll %#08x",
			  info->efe->id);
		goto err_close;
	}

	*new_fd = tmp;
	/* NOTE(review): 1 presumably tells the caller to call ->open again later — confirm against the files engine. */
	return 1;
err_close:
	close(tmp);
	return -1;
}

/*
 * Returns non-zero while the descriptor watched by @tdefe cannot be added
 * yet: a nested epoll must have reached FLE_OPEN, any other file must be
 * FLE_RESTORED.
 */
static int epoll_not_ready_tfd(EventpollTfdEntry *tdefe)
{
	struct fdinfo_list_entry *fle;

	list_for_each_entry(fle, &rsti(current)->fds, ps_list) {
		if (tdefe->tfd != fle->fe->fd)
			continue;

		if (fle->desc->ops->type == FD_TYPES__EVENTPOLL)
			return (fle->stage < FLE_OPEN);
		else
			return (fle->stage != FLE_RESTORED);
	}

	/*
	 * If tgt fle is not on the fds list, it's already
	 * restored (see open_fdinfos), so we're ready.
	 */
	return 0;
}

/* Add one saved watch to epoll descriptor @fd; @id is only used in the error message. */
static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe)
{
	struct epoll_event event;

	pr_info_eventpoll_tfd("Restore ", tdefe);

	event.events	= tdefe->events;
	event.data.u64	= tdefe->data;
	if (epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &event)) {
		pr_perror("Can't add event on %#08x", id);
		return -1;
	}

	return 0;
}

/*
 * Second stage of epoll restore: once every watched descriptor is ready,
 * add all saved watches to @fd.
 *
 * Returns 1 if some watched descriptor is not ready yet, -1 if adding a
 * watch fails, 0 when all watches are in place.
 */
static int eventpoll_post_open(struct file_desc *d, int fd)
{
	struct eventpoll_file_info *info;
	int i;

	info = container_of(d, struct eventpoll_file_info, d);

	/* Check everything first so that no watch is added until all targets are ready. */
	for (i = 0; i < info->efe->n_tfd; i++) {
		if (epoll_not_ready_tfd(info->efe->tfd[i]))
			return 1;
	}
	for (i = 0; i < info->efe->n_tfd; i++) {
		if (eventpoll_retore_tfd(fd, info->efe->id, info->efe->tfd[i]))
			return -1;
	}

	return 0;
}

static struct file_desc_ops desc_ops = {
	.type = FD_TYPES__EVENTPOLL,
	.open = eventpoll_open,
};

/*
 * Collect callback for the separate epoll-tfd image, accepted only when
 * deprecated functionality is allowed: append the entry to the tfd array
 * of the epoll file it refers to.
 */
static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg, struct cr_img *i)
{
	EventpollTfdEntry *tfde;
	struct file_desc *d;
	struct eventpoll_file_info *ef;
	EventpollFileEntry *efe;
	int n_tfd;

	if (!deprecated_ok("Epoll TFD image"))
		return -1;

	tfde = pb_msg(msg, EventpollTfdEntry);
	d = find_file_desc_raw(FD_TYPES__EVENTPOLL, tfde->id);
	if (!d) {
		pr_err("No epoll FD for %u\n", tfde->id);
		return -1;
	}

	ef = container_of(d, struct eventpoll_file_info, d);
	efe = ef->efe;

	/* Grow the pointer array by one slot and store the new entry last. */
	n_tfd = efe->n_tfd + 1;
	if (xrealloc_safe(&efe->tfd, n_tfd * sizeof(EventpollTfdEntry *)))
		return -1;

	efe->tfd[efe->n_tfd] = tfde;
	efe->n_tfd = n_tfd;

	return 0;
}

struct collect_image_info epoll_tfd_cinfo = {
	.fd_type = CR_FD_EVENTPOLL_TFD,
	.pb_type = PB_EVENTPOLL_TFD,
	.collect = collect_one_epoll_tfd,
	.flags = COLLECT_NOFREE,
};

/* Image collect callback: attach the decoded entry to @o and register the file descriptor. */
static int collect_one_epoll(void *o, ProtobufCMessage *msg, struct cr_img *i)
{
	struct eventpoll_file_info *info = o;

	info->efe = pb_msg(msg, EventpollFileEntry);
	pr_info_eventpoll("Collected ", info->efe);
	return file_desc_add(&info->d, info->efe->id, &desc_ops);
}

struct collect_image_info epoll_cinfo = {
	.fd_type = CR_FD_EVENTPOLL_FILE,
	.pb_type =
PB_EVENTPOLL_FILE, .priv_size = sizeof(struct eventpoll_file_info), .collect = collect_one_epoll, }; criu-3.6/criu/external.c000066400000000000000000000031321317335042600152570ustar00rootroot00000000000000#include "common/err.h" #include "common/list.h" #include "cr_options.h" #include "xmalloc.h" #include "mount.h" #include "external.h" #include "util.h" #include "net.h" int add_external(char *key) { struct external *ext; ext = xmalloc(sizeof(*ext)); if (!ext) return -1; ext->id = key; if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) { xfree(ext); return -1; } if (strstartswith(key, "mnt[]")) { xfree(ext); return ext_mount_parse_auto(key + 5); } list_add(&ext->node, &opts.external); return 0; } bool external_lookup_id(char *id) { struct external *ext; list_for_each_entry(ext, &opts.external, node) if (!strcmp(ext->id, id)) return true; return false; } void *external_lookup_data(char *key) { struct external *ext; int len = strlen(key); list_for_each_entry(ext, &opts.external, node) { if (strncmp(ext->id, key, len)) continue; return ext->data; } return ERR_PTR(-ENOENT); } char *external_lookup_by_key(char *key) { struct external *ext; int len = strlen(key); list_for_each_entry(ext, &opts.external, node) { if (strncmp(ext->id, key, len)) continue; if (ext->id[len] == ':') return ext->id + len + 1; else if (ext->id[len] == '\0') return NULL; } return ERR_PTR(-ENOENT); } int external_for_each_type(char *type, int (*cb)(struct external *, void *), void *arg) { struct external *ext; int ln = strlen(type); int ret = 0; list_for_each_entry(ext, &opts.external, node) { if (strncmp(ext->id, type, ln)) continue; if (ext->id[ln] != '[') continue; ret = cb(ext, arg); if (ret) break; } return ret; } criu-3.6/criu/fault-injection.c000066400000000000000000000004501317335042600165300ustar00rootroot00000000000000#include #include "fault-injection.h" enum faults fi_strategy; int fault_injection_init() { char *val; int strat; val = getenv("CRIU_FAULT"); if (val == NULL) 
return 0; strat = atoi(val); if (strat <= 0 || strat >= FI_MAX) return -1; fi_strategy = strat; return 0; } criu-3.6/criu/fdstore.c000066400000000000000000000045641317335042600151150ustar00rootroot00000000000000#include #include #include #include #include #include #include "common/scm.h" #include "common/lock.h" #include "servicefd.h" #include "fdstore.h" #include "xmalloc.h" #include "rst-malloc.h" #include "log.h" static struct fdstore_desc { int next_id; mutex_t lock; /* to protect a peek offset */ } *desc; int fdstore_init(void) { struct sockaddr_un addr; unsigned int addrlen; struct stat st; int sk, ret; desc = shmalloc(sizeof(*desc)); if (!desc) return -1; desc->next_id = 0; mutex_init(&desc->lock); sk = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0); if (sk < 0) { pr_perror("Unable to create a socket"); return -1; } if (fstat(sk, &st)) { pr_perror("Unable to stat a file descriptor"); close(sk); return -1; } addr.sun_family = AF_UNIX; addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%"PRIx64, st.st_ino); addrlen += sizeof(addr.sun_family); addr.sun_path[0] = 0; /* * This socket is connected to itself, so all messages are queued to * its receive queue. Here we are going to use this socket to store * file descriptors. For that we need to send a file descriptor in * a queue and remeber its sequence number. Then we can set SO_PEEK_OFF * to get a file descriptor without dequeuing it. 
*/ if (bind(sk, (struct sockaddr *) &addr, addrlen)) { pr_perror("Unable to bind a socket"); close(sk); return -1; } if (connect(sk, (struct sockaddr *) &addr, addrlen)) { pr_perror("Unable to connect a socket"); close(sk); return -1; } ret = install_service_fd(FDSTORE_SK_OFF, sk); close(sk); if (ret < 0) return -1; return 0; } int fdstore_add(int fd) { int sk = get_service_fd(FDSTORE_SK_OFF); int id; mutex_lock(&desc->lock); if (send_fd(sk, NULL, 0, fd)) { mutex_unlock(&desc->lock); return -1; } id = desc->next_id++; mutex_unlock(&desc->lock); return id; } int fdstore_get(int id) { int sk = get_service_fd(FDSTORE_SK_OFF); int fd; mutex_lock(&desc->lock); if (setsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &id, sizeof(id))) { mutex_unlock(&desc->lock); pr_perror("Unable to a peek offset"); return -1; } if (__recv_fds(sk, &fd, 1, NULL, 0, MSG_PEEK) < 0) { mutex_unlock(&desc->lock); pr_perror("Unable to get a file descriptor with the %d id", id); return -1; } mutex_unlock(&desc->lock); return fd; } criu-3.6/criu/fifo.c000066400000000000000000000104221317335042600143600ustar00rootroot00000000000000#include #include #include #include #include #include "imgset.h" #include "image.h" #include "files.h" #include "files-reg.h" #include "file-ids.h" #include "pipes.h" #include "fifo.h" #include "protobuf.h" #include "images/regfile.pb-c.h" #include "images/fifo.pb-c.h" /* * FIFO checkpoint and restore is done in a bit unusual manner. * We use files-reg.c engine to save fifo path and flags, * thus regular files image will contain fifo descriptors which * are useless for reg-files engine itself but needed for our fifo * engine. * * In particular we dump fifo-entry automatically and appropriate * reg-file entry manually, thus on restore we need to ask reg-file * engine to restore fifo path and flags via direct call. 
*/ struct fifo_info { struct list_head list; struct file_desc d; FifoEntry *fe; bool restore_data; }; static LIST_HEAD(fifo_head); static struct pipe_data_dump pd_fifo = { .img_type = CR_FD_FIFO_DATA, }; static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p) { struct cr_img *img = img_from_set(glob_imgset, CR_FD_FILES); FileEntry fe = FILE_ENTRY__INIT; FifoEntry e = FIFO_ENTRY__INIT; u32 rf_id; fd_id_generate_special(NULL, &rf_id); /* * It's a trick here, we use regular files dumping * code to save path to a fifo, then we reuse it * on restore. */ if (dump_one_reg_file(lfd, rf_id, p)) return -1; pr_info("Dumping fifo %d with id %#x pipe_id %#x\n", lfd, id, pipe_id(p)); e.id = id; e.pipe_id = pipe_id(p); e.has_regf_id = true; e.regf_id = rf_id; fe.type = FD_TYPES__FIFO; fe.id = e.id; fe.fifo = &e; if (pb_write_one(img, &fe, PB_FILE)) return -1; return dump_one_pipe_data(&pd_fifo, lfd, p); } const struct fdtype_ops fifo_dump_ops = { .type = FD_TYPES__FIFO, .dump = dump_one_fifo, }; static struct pipe_data_rst *pd_hash_fifo[PIPE_DATA_HASH_SIZE]; static int do_open_fifo(int ns_root_fd, struct reg_file_info *rfi, void *arg) { struct fifo_info *info = arg; int new_fifo, fake_fifo = -1; /* * The fifos (except read-write fifos) do wait until * another pipe-end get connected, so to be able to * proceed the restoration procedure we open a fake * fifo here. 
*/ fake_fifo = openat(ns_root_fd, rfi->path, O_RDWR); if (fake_fifo < 0) { pr_perror("Can't open fake fifo %#x [%s]", info->fe->id, rfi->path); return -1; } new_fifo = openat(ns_root_fd, rfi->path, rfi->rfe->flags); if (new_fifo < 0) { pr_perror("Can't open fifo %#x [%s]", info->fe->id, rfi->path); goto out; } if (info->restore_data) if (restore_pipe_data(CR_FD_FIFO_DATA, fake_fifo, info->fe->pipe_id, pd_hash_fifo)) { close(new_fifo); new_fifo = -1; } out: close(fake_fifo); return new_fifo; } static int open_fifo_fd(struct file_desc *d, int *new_fd) { struct fifo_info *info = container_of(d, struct fifo_info, d); struct file_desc *reg_d; int fd; reg_d = collect_special_file(info->fe->has_regf_id ? info->fe->regf_id : info->fe->id); if (!reg_d) return -1; fd = open_path(reg_d, do_open_fifo, info); if (fd < 0) return -1; *new_fd = fd; return 0; } static struct file_desc_ops fifo_desc_ops = { .type = FD_TYPES__FIFO, .open = open_fifo_fd, }; static int collect_one_fifo(void *o, ProtobufCMessage *base, struct cr_img *i) { struct fifo_info *info = o, *f; info->fe = pb_msg(base, FifoEntry); pr_info("Collected fifo entry ID %#x PIPE ID %#x\n", info->fe->id, info->fe->pipe_id); /* check who will restore the fifo data */ list_for_each_entry(f, &fifo_head, list) if (f->fe->pipe_id == info->fe->pipe_id) break; if (&f->list == &fifo_head) { list_add(&info->list, &fifo_head); info->restore_data = true; } else { INIT_LIST_HEAD(&info->list); info->restore_data = false; } return file_desc_add(&info->d, info->fe->id, &fifo_desc_ops); } struct collect_image_info fifo_cinfo = { .fd_type = CR_FD_FIFO, .pb_type = PB_FIFO, .priv_size = sizeof(struct fifo_info), .collect = collect_one_fifo, }; static int collect_fifo_data(void *obj, ProtobufCMessage *msg, struct cr_img *img) { return do_collect_pipe_data(obj, msg, img, pd_hash_fifo); } struct collect_image_info fifo_data_cinfo = { .fd_type = CR_FD_FIFO_DATA, .pb_type = PB_PIPE_DATA, .priv_size = sizeof(struct pipe_data_rst), .collect = 
collect_fifo_data, }; criu-3.6/criu/file-ids.c000066400000000000000000000037131317335042600151360ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "int.h" #include "file-ids.h" #include "rbtree.h" #include "kcmp-ids.h" #include "common/compiler.h" #include "image.h" #include "util.h" #include "irmap.h" #include "files.h" static DECLARE_KCMP_TREE(fd_tree, KCMP_FILE); #define FDID_BITS 5 #define FDID_SIZE (1 << FDID_BITS) #define FDID_MASK (FDID_SIZE - 1) static inline int fdid_hashfn(unsigned int s_dev, unsigned long i_ino) { return (s_dev + i_ino) & FDID_MASK; } struct fd_id { int mnt_id; unsigned int dev; unsigned long ino; u32 id; struct fd_id *n; }; static struct fd_id *fd_id_cache[FDID_SIZE]; static void fd_id_cache_one(u32 id, struct fd_parms *p) { struct fd_id *fi; unsigned hv; fi = xmalloc(sizeof(*fi)); if (fi) { fi->dev = p->stat.st_dev; fi->ino = p->stat.st_ino; fi->mnt_id = p->mnt_id; fi->id = id; hv = fdid_hashfn(p->stat.st_dev, p->stat.st_ino); fi->n = fd_id_cache[hv]; fd_id_cache[hv] = fi; } } static struct fd_id *fd_id_cache_lookup(struct fd_parms *p) { struct stat *st = &p->stat; struct fd_id *fi; for (fi = fd_id_cache[fdid_hashfn(st->st_dev, st->st_ino)]; fi; fi = fi->n) if (fi->dev == st->st_dev && fi->ino == st->st_ino && fi->mnt_id == p->mnt_id) return fi; return NULL; } int fd_id_generate_special(struct fd_parms *p, u32 *id) { if (p) { struct fd_id *fi; fi = fd_id_cache_lookup(p); if (fi) { *id = fi->id; return 0; } } *id = fd_tree.subid++; if (p) fd_id_cache_one(*id, p); return 1; } int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p) { u32 id; struct kid_elem e; int new_id = 0; e.pid = pid; e.genid = fe->id; e.idx = fe->fd; id = kid_generate_gen(&fd_tree, &e, &new_id); if (!id) return -ENOMEM; if (new_id) fd_id_cache_one(id, p); fe->id = id; return new_id; } 
criu-3.6/criu/file-lock.c000066400000000000000000000217301317335042600153060ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "cr_options.h" #include "imgset.h" #include "files.h" #include "fs-magic.h" #include "kerndat.h" #include "image.h" #include "util.h" #include "mount.h" #include "proc_parse.h" #include "servicefd.h" #include "file-lock.h" struct file_lock_rst { FileLockEntry *fle; struct list_head l; }; struct list_head file_lock_list = LIST_HEAD_INIT(file_lock_list); static int collect_one_file_lock(void *o, ProtobufCMessage *m, struct cr_img *i) { struct file_lock_rst *lr = o; lr->fle = pb_msg(m, FileLockEntry); list_add_tail(&lr->l, &file_lock_list); return 0; } struct collect_image_info file_locks_cinfo = { .fd_type = CR_FD_FILE_LOCKS, .pb_type = PB_FILE_LOCK, .priv_size = sizeof(struct file_lock_rst), .collect = collect_one_file_lock, }; struct file_lock *alloc_file_lock(void) { struct file_lock *flock; flock = xzalloc(sizeof(*flock)); if (!flock) return NULL; INIT_LIST_HEAD(&flock->list); flock->real_owner = -1; flock->owners_fd = -1; return flock; } void free_file_locks(void) { struct file_lock *flock, *tmp; list_for_each_entry_safe(flock, tmp, &file_lock_list, list) { xfree(flock); } INIT_LIST_HEAD(&file_lock_list); } static int dump_one_file_lock(FileLockEntry *fle) { pr_info("LOCK flag: %d,type: %d,pid: %d,fd: %d,start: %8"PRIx64",len: %8"PRIx64"\n", fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); return pb_write_one(img_from_set(glob_imgset, CR_FD_FILE_LOCKS), fle, PB_FILE_LOCK); } static void fill_flock_entry(FileLockEntry *fle, int fl_kind, int fl_ltype) { fle->flag |= fl_kind; fle->type = fl_ltype; } int dump_file_locks(void) { FileLockEntry fle; struct file_lock *fl; int ret = 0; pr_info("Dumping file-locks\n"); list_for_each_entry(fl, &file_lock_list, list) { if (fl->real_owner == -1) { if (fl->fl_kind == FL_POSIX) { pr_err("Unresolved lock found pid %d ino %ld\n", 
fl->fl_owner, fl->i_no); return -1; } continue; } file_lock_entry__init(&fle); fle.pid = fl->real_owner; fle.fd = fl->owners_fd; fill_flock_entry(&fle, fl->fl_kind, fl->fl_ltype); fle.start = fl->start; if (!strncmp(fl->end, "EOF", 3)) fle.len = 0; else fle.len = (atoll(fl->end) + 1) - fl->start; ret = dump_one_file_lock(&fle); if (ret) { pr_err("Dump file lock failed!\n"); goto err; } } err: return ret; } static int lock_btrfs_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p) { int phys_dev = MKKDEV(fl->maj, fl->min); char link[PATH_MAX], t[32]; struct ns_id *ns; int ret; snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd); ret = readlink(t, link, sizeof(link)) - 1; if (ret < 0) { pr_perror("Can't read link of fd %d", fd); return -1; } else if ((size_t)ret == sizeof(link)) { pr_err("Buffer for read link of fd %d is too small\n", fd); return -1; } link[ret] = 0; ns = lookup_nsid_by_mnt_id(p->mnt_id); return phys_stat_dev_match(p->stat.st_dev, phys_dev, ns, link); } static inline int lock_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p) { dev_t dev = p->stat.st_dev; if (fl->i_no != p->stat.st_ino) return 0; /* * Get the right devices for BTRFS. Look at phys_stat_resolve_dev() * for more details. */ if (p->fs_type == BTRFS_SUPER_MAGIC) { if (p->mnt_id != -1) { struct mount_info *m; m = lookup_mnt_id(p->mnt_id); BUG_ON(m == NULL); dev = kdev_to_odev(m->s_dev); } else /* old kernel */ return lock_btrfs_file_match(pid, fd, fl, p); } return makedev(fl->maj, fl->min) == dev; } static int lock_check_fd(int lfd, struct file_lock *fl) { int ret; if (fl->fl_ltype & LOCK_MAND) ret = flock(lfd, LOCK_MAND | LOCK_RW); else ret = flock(lfd, LOCK_EX | LOCK_NB); pr_debug(" `- %d/%d\n", ret, errno); if (ret != 0) { if (errno != EAGAIN) { pr_err("Bogus lock test result %d\n", ret); return -1; } return 0; } else { /* * The ret == 0 means, that new lock doesn't conflict * with any others on the file. 
But since we do know, * that there should be some other one (file is found * in /proc/locks), it means that the lock is already * on file pointed by fd. */ pr_debug(" `- downgrading lock back\n"); if (fl->fl_ltype & LOCK_MAND) ret = flock(lfd, fl->fl_ltype); else if (fl->fl_ltype == F_RDLCK) ret = flock(lfd, LOCK_SH); if (ret) { pr_err("Can't downgrade lock back %d\n", ret); return -1; } } return 1; } static int lock_ofd_check_fd(int lfd, struct file_lock *fl) { int ret; struct flock lck = { .l_whence = SEEK_SET, .l_type = F_WRLCK, .l_start = fl->start }; if (strcmp(fl->end, "EOF")) { unsigned long end; ret = sscanf(fl->end, "%lu", &end); if (ret <= 0) { pr_err("Invalid lock entry\n"); return -1; } lck.l_len = end - fl->start + 1; } else { lck.l_len = 0; } ret = fcntl(lfd, F_OFD_SETLK, &lck); pr_debug(" `- %d/%d\n", ret, errno); if (ret != 0) { if (errno != EAGAIN) { pr_err("Bogus lock test result %d\n", ret); return -1; } return 0; } else { /* * The ret == 0 means, that new lock doesn't conflict * with any others on the file. But since we do know, * that there should be some other one (file is found * in /proc/locks), it means that the lock is already * on file pointed by fd. */ pr_debug(" `- downgrading lock back\n"); if (fl->fl_ltype & LOCK_WRITE) lck.l_type = F_WRLCK; else lck.l_type = F_RDLCK; ret = fcntl(lfd, F_OFD_SETLK, &lck); if (ret) { pr_err("Can't downgrade lock back %d\n", ret); return -1; } } return 1; } int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p) { struct file_lock *fl; int ret; if (kdat.has_fdinfo_lock) return 0; list_for_each_entry(fl, &file_lock_list, list) { ret = lock_file_match(pid->real, fd, fl, p); if (ret < 0) return -1; if (ret == 0) continue; if (!opts.handle_file_locks) { pr_err("Some file locks are hold by dumping tasks!" "You can try --" OPT_FILE_LOCKS " to dump them.\n"); return -1; } if (fl->fl_kind == FL_POSIX) { /* * POSIX locks cannot belong to anyone * but creator. 
*/ if (fl->fl_owner != pid->real) continue; } else /* fl->fl_kind == FL_FLOCK || fl->fl_kind == FL_OFD */ { int ret; /* * OFD locks & FLOCKs can be inherited across fork, * thus we can have any task as lock * owner. But the creator is preferred * anyway. */ if (fl->fl_owner != pid->real && fl->real_owner != -1) continue; pr_debug("Checking lock holder %d:%d\n", pid->real, fd); if (fl->fl_kind == FL_FLOCK) ret = lock_check_fd(lfd, fl); else ret = lock_ofd_check_fd(lfd, fl); if (ret < 0) return ret; if (ret == 0) continue; } fl->real_owner = pid->ns[0].virt; fl->owners_fd = fd; pr_info("Found lock entry %d.%d %d vs %d\n", pid->real, pid->ns[0].virt, fd, fl->fl_owner); } return 0; } static int restore_file_lock(FileLockEntry *fle) { int ret = -1; unsigned int cmd; if (fle->flag & FL_FLOCK) { if (fle->type & LOCK_MAND) { cmd = fle->type; } else if (fle->type == F_RDLCK) { cmd = LOCK_SH; } else if (fle->type == F_WRLCK) { cmd = LOCK_EX; } else if (fle->type == F_UNLCK) { cmd = LOCK_UN; } else { pr_err("Unknown flock type!\n"); goto err; } pr_info("(flock)flag: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", fle->flag, fle->type, cmd, fle->pid, fle->fd); ret = flock(fle->fd, cmd); if (ret < 0) { pr_err("Can not set flock!\n"); goto err; } } else if (fle->flag & FL_POSIX) { struct flock flk; memset(&flk, 0, sizeof(flk)); flk.l_whence = SEEK_SET; flk.l_start = fle->start; flk.l_len = fle->len; flk.l_pid = fle->pid; flk.l_type = fle->type; pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " "start: %8"PRIx64", len: %8"PRIx64"\n", fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); ret = fcntl(fle->fd, F_SETLKW, &flk); if (ret < 0) { pr_err("Can not set posix lock!\n"); goto err; } } else if (fle->flag & FL_OFD) { struct flock flk = { .l_whence = SEEK_SET, .l_start = fle->start, .l_len = fle->len, .l_pid = 0, .l_type = fle->type }; pr_info("(ofd)flag: %d, type: %d, pid: %d, fd: %d, " "start: %8"PRIx64", len: %8"PRIx64"\n", fle->flag, fle->type, fle->pid, fle->fd, 
fle->start, fle->len); ret = fcntl(fle->fd, F_OFD_SETLK, &flk); if (ret < 0) { pr_err("Can not set ofd lock!\n"); goto err; } } else { pr_err("Unknown file lock style!\n"); goto err; } return 0; err: return ret; } static int restore_file_locks(int pid) { int ret = 0; struct file_lock_rst *lr; list_for_each_entry(lr, &file_lock_list, l) { if (lr->fle->pid == pid) { ret = restore_file_lock(lr->fle); if (ret) break; } } return ret; } int prepare_file_locks(int pid) { if (!opts.handle_file_locks) return 0; return restore_file_locks(pid); } criu-3.6/criu/files-ext.c000066400000000000000000000040151317335042600153360ustar00rootroot00000000000000/* An external file is a file, which is dumped with help a plugin */ #include #include "imgset.h" #include "files.h" #include "plugin.h" #include "protobuf.h" #include "images/ext-file.pb-c.h" static int dump_one_ext_file(int lfd, u32 id, const struct fd_parms *p) { int ret; struct cr_img *rimg; FileEntry fe = FILE_ENTRY__INIT; ExtFileEntry xfe = EXT_FILE_ENTRY__INIT; ret = run_plugins(DUMP_EXT_FILE, lfd, id); if (ret < 0) return ret; xfe.id = id; xfe.fown = (FownEntry *)&p->fown; fe.type = FD_TYPES__EXT; fe.id = xfe.id; fe.ext = &xfe; rimg = img_from_set(glob_imgset, CR_FD_FILES); return pb_write_one(rimg, &fe, PB_FILE); } const struct fdtype_ops ext_dump_ops = { .type = FD_TYPES__EXT, .dump = dump_one_ext_file, }; struct ext_file_info { struct file_desc d; ExtFileEntry *xfe; }; static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; xfi = container_of(d, struct ext_file_info, d); fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; } if (restore_fown(fd, xfi->xfe->fown)) return -1; *new_fd = fd; return 0; } static struct file_desc_ops ext_desc_ops = { .type = FD_TYPES__EXT, .open = open_fd, }; static int collect_one_ext(void *o, ProtobufCMessage *base, struct cr_img *i) { struct ext_file_info *xfi = o; xfi->xfe = pb_msg(base, 
ExtFileEntry); pr_info("Collected external file with ID %#x\n", xfi->xfe->id); return file_desc_add(&xfi->d, xfi->xfe->id, &ext_desc_ops); } struct collect_image_info ext_file_cinfo = { .fd_type = CR_FD_EXT_FILES, .pb_type = PB_EXT_FILE, .priv_size = sizeof(struct ext_file_info), .collect = collect_one_ext, }; int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *e) { int ret; ret = do_dump_gen_file(p, lfd, &ext_dump_ops, e); if (ret == 0) return 0; if (ret == -ENOTSUP) pr_err("Can't dump file %d of that type [%o] (%s %s)\n", p->fd, p->stat.st_mode, more, info); return -1; } criu-3.6/criu/files-reg.c000066400000000000000000001270411317335042600153200ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef SEEK_DATA #define SEEK_DATA 3 #define SEEK_HOLE 4 #endif /* Stolen from kernel/fs/nfs/unlink.c */ #define SILLYNAME_PREF ".nfs" #define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1)) #include "cr_options.h" #include "imgset.h" #include "file-ids.h" #include "mount.h" #include "files.h" #include "common/list.h" #include "rst-malloc.h" #include "fs-magic.h" #include "namespaces.h" #include "proc_parse.h" #include "pstree.h" #include "fault-injection.h" #include "external.h" #include "protobuf.h" #include "util.h" #include "images/regfile.pb-c.h" #include "images/remap-file-path.pb-c.h" #include "files-reg.h" #include "plugin.h" int setfsuid(uid_t fsuid); int setfsgid(gid_t fsuid); /* * Ghost files are those not visible from the FS. Dumping them is * nasty and the only way we have -- just carry its contents with * us. Any brave soul to implement link unlinked file back? 
*/ struct ghost_file { struct list_head list; u32 id; u32 dev; u32 ino; struct file_remap remap; }; static u32 ghost_file_ids = 1; static LIST_HEAD(ghost_files); /* * When opening remaps we first create a link on the remap * target, then open one, then unlink. In case the remap * source has more than one instance, these tree steps * should be serialized with each other. */ static mutex_t *remap_open_lock; static inline int init_remap_lock(void) { remap_open_lock = shmalloc(sizeof(*remap_open_lock)); if (!remap_open_lock) return -1; mutex_init(remap_open_lock); return 0; } static LIST_HEAD(remaps); /* * Remember the name to delete it if needed on error or * rollback action. Note we don't expect that there will * be a HUGE number of link remaps, so in a sake of speed * we keep all data in memory. */ struct link_remap_rlb { struct list_head list; struct ns_id *mnt_ns; char *path; }; static int note_link_remap(char *path, struct ns_id *nsid) { struct link_remap_rlb *rlb; rlb = xmalloc(sizeof(*rlb)); if (!rlb) goto err; rlb->path = xstrdup(path); if (!rlb->path) goto err2; rlb->mnt_ns = nsid; list_add(&rlb->list, &remaps); return 0; err2: xfree(rlb); err: pr_err("Can't note link remap for %s\n", path); return -1; } /* Trim "a/b/c/d" to "a/b/d" */ static int trim_last_parent(char *path) { char *fname, *p; p = strrchr(path, '/'); fname = p + 1; if (!p || *fname == '\0') return -1; while (p >= path && *p == '/') p--; if (p < path) return -1; while (p >= path && *p != '/') p--; p++; while (*fname != '\0') *p++ = *fname++; *p = '\0'; return 0; } #define BUFSIZE (4096) static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) { char *buf = NULL; int ret; while (len > 0) { ret = sendfile(img, fd, &off, len); if (ret <= 0) { pr_perror("Can't send ghost to image"); return -1; } len -= ret; } xfree(buf); return 0; } static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) { GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT; off_t data, hole = 0; while 
(hole < file_size) { data = lseek(fd, hole, SEEK_DATA); if (data < 0) { if (errno == ENXIO) /* No data */ break; else if (hole == 0) { /* No SEEK_HOLE/DATA by FS */ data = 0; hole = file_size; } else { pr_perror("Can't seek file data"); return -1; } } else { hole = lseek(fd, data, SEEK_HOLE); if (hole < 0) { pr_perror("Can't seek file hole"); return -1; } } ce.len = hole - data; ce.off = data; if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) return -1; if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) return -1; } return 0; } static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { char *buf = NULL; int ret; while (len > 0) { if (lseek(fd, off, SEEK_SET) < 0) { pr_perror("Can't seek file"); return -1; } ret = sendfile(fd, img, NULL, len); if (ret < 0) { pr_perror("Can't send data"); return -1; } off += ret; len -= ret; } xfree(buf); return 0; } static int copy_file_from_chunks(struct cr_img *img, int fd, size_t file_size) { if (ftruncate(fd, file_size) < 0) { pr_perror("Can't make file size"); return -1; } while (1) { int ret; GhostChunkEntry *ce; ret = pb_read_one_eof(img, &ce, PB_GHOST_CHUNK); if (ret <= 0) return ret; if (copy_chunk_to_file(img_raw_fd(img), fd, ce->off, ce->len)) return -1; ghost_chunk_entry__free_unpacked(ce, NULL); } } static int mkreg_ghost(char *path, GhostFileEntry *gfe, struct cr_img *img) { int gfd, ret; gfd = open(path, O_WRONLY | O_CREAT | O_EXCL, gfe->mode); if (gfd < 0) return -1; if (gfe->chunks) { if (!gfe->has_size) { pr_err("Corrupted ghost image -> no size\n"); return -1; } ret = copy_file_from_chunks(img, gfd, gfe->size); } else ret = copy_file(img_raw_fd(img), gfd, 0); if (ret < 0) unlink(path); close(gfd); return ret; } static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; int ret = -1; if (chown(path, gfe->uid, gfe->gid) < 0) { pr_perror("Can't reset user/group on ghost %s", path); goto err; } if (chmod(path, gfe->mode)) { pr_perror("Can't set perms %o on ghost 
%s", gfe->mode, path); goto err; } if (gfe->atim) { tv[0].tv_sec = gfe->atim->tv_sec; tv[0].tv_usec = gfe->atim->tv_usec; tv[1].tv_sec = gfe->mtim->tv_sec; tv[1].tv_usec = gfe->mtim->tv_usec; if (lutimes(path, tv)) { pr_perror("Can't set access and modufication times on ghost %s", path); goto err; } } ret = 0; err: return ret; } static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) { char path[PATH_MAX]; int ret, root_len; char *msg; root_len = ret = rst_get_mnt_root(gf->remap.rmnt_id, path, sizeof(path)); if (ret < 0) { pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id); goto err; } snprintf(path + ret, sizeof(path) - ret, "/%s", gf->remap.rpath); ret = -1; again: if (S_ISFIFO(gfe->mode)) { if ((ret = mknod(path, gfe->mode, 0)) < 0) msg = "Can't create node for ghost file"; } else if (S_ISCHR(gfe->mode) || S_ISBLK(gfe->mode)) { if (!gfe->has_rdev) { pr_err("No rdev for ghost device\n"); goto err; } if ((ret = mknod(path, gfe->mode, gfe->rdev)) < 0) msg = "Can't create node for ghost dev"; } else if (S_ISDIR(gfe->mode)) { if ((ret = mkdirpat(AT_FDCWD, path, gfe->mode)) < 0) msg = "Can't make ghost dir"; } else { if ((ret = mkreg_ghost(path, gfe, img)) < 0) msg = "Can't create ghost regfile"; } if (ret < 0) { /* Use grand parent, if parent directory does not exist */ if (errno == ENOENT) { if (trim_last_parent(path) < 0) { pr_err("trim failed: @%s@\n", path); goto err; } goto again; } pr_perror("%s", msg); goto err; } strcpy(gf->remap.rpath, path + root_len + 1); pr_debug("Remap rpath is %s\n", gf->remap.rpath); ret = -1; if (ghost_apply_metadata(path, gfe)) goto err; ret = 0; err: return ret; } static inline void ghost_path(char *path, int plen, struct reg_file_info *rfi, RemapFilePathEntry *rfe) { snprintf(path, plen, "%s.cr.%x.ghost", rfi->path, rfe->remap_id); } static int collect_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rfe) { struct ghost_file *gf; list_for_each_entry(gf, &ghost_files, list) 
if (gf->id == rfe->remap_id) goto gf_found; /* * Ghost not found. We will create one in the same dir * as the very first client of it thus resolving any * issues with cross-device links. */ pr_info("Opening ghost file %#x for %s\n", rfe->remap_id, rfi->path); gf = shmalloc(sizeof(*gf)); if (!gf) return -1; /* * The rpath is shmalloc-ed because we create the ghost * file in root task context and generate its path there. * However the path should be visible by the criu task * in order to remove the ghost files from root FS (see * try_clean_remaps()). */ gf->remap.rpath = shmalloc(PATH_MAX); if (!gf->remap.rpath) return -1; gf->remap.rpath[0] = 0; gf->id = rfe->remap_id; list_add_tail(&gf->list, &ghost_files); gf_found: rfi->is_dir = gf->remap.is_dir; rfi->remap = &gf->remap; return 0; } static int open_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rfe) { struct ghost_file *gf = container_of(rfi->remap, struct ghost_file, remap); GhostFileEntry *gfe = NULL; struct cr_img *img; if (rfi->remap->rpath[0]) return 0; img = open_image(CR_FD_GHOST_FILE, O_RSTR, rfe->remap_id); if (!img) goto err; if (pb_read_one(img, &gfe, PB_GHOST_FILE) < 0) goto close_ifd; /* * For old formats where optional has_[dev|ino] is * not present we will have zeros here which is quite * a sign for "absent" fields. 
*/ gf->dev = gfe->dev; gf->ino = gfe->ino; gf->remap.rmnt_id = rfi->rfe->mnt_id; if (S_ISDIR(gfe->mode)) strncpy(gf->remap.rpath, rfi->path, PATH_MAX); else ghost_path(gf->remap.rpath, PATH_MAX, rfi, rfe); if (create_ghost(gf, gfe, img)) goto close_ifd; close_image(img); gf->remap.is_dir = S_ISDIR(gfe->mode); gf->remap.uid = gfe->uid; gf->remap.gid = gfe->gid; ghost_file_entry__free_unpacked(gfe, NULL); return 0; close_ifd: close_image(img); err: if (gfe) ghost_file_entry__free_unpacked(gfe, NULL); xfree(gf->remap.rpath); shfree_last(gf); return -1; } static int collect_remap_linked(struct reg_file_info *rfi, RemapFilePathEntry *rfe) { struct file_remap *rm; struct file_desc *rdesc; struct reg_file_info *rrfi; rdesc = find_file_desc_raw(FD_TYPES__REG, rfe->remap_id); if (!rdesc) { pr_err("Can't find target file %x\n", rfe->remap_id); return -1; } rm = xmalloc(sizeof(*rm)); if (!rm) return -1; rrfi = container_of(rdesc, struct reg_file_info, d); pr_info("Remapped %s -> %s\n", rfi->path, rrfi->path); rm->rpath = rrfi->path; rm->is_dir = false; rm->uid = -1; rm->gid = -1; rm->rmnt_id = rfi->rfe->mnt_id; rfi->remap = rm; return 0; } static int open_remap_linked(struct reg_file_info *rfi, RemapFilePathEntry *rfe) { if (root_ns_mask & CLONE_NEWUSER) { int rfd; struct stat st; rfd = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id); if (fstatat(rfd, rfi->remap->rpath, &st, AT_SYMLINK_NOFOLLOW)) { pr_perror("Can't get owner of link remap %s", rfi->remap->rpath); return -1; } rfi->remap->uid = st.st_uid; rfi->remap->gid = st.st_gid; } return 0; } static int collect_remap_dead_process(struct reg_file_info *rfi, RemapFilePathEntry *rfe) { struct pstree_item *helper; helper = lookup_create_item(rfe->remap_id); if (!helper) return -1; if (helper->pid->state != TASK_UNDEF) { pr_info("Skipping helper for restoring /proc/%d; pid exists\n", rfe->remap_id); return 0; } init_pstree_helper(helper); helper->sid = root_item->sid; helper->pgid = root_item->pgid; helper->pid->ns[0].virt = 
rfe->remap_id; helper->parent = root_item; helper->ids = root_item->ids; list_add_tail(&helper->sibling, &root_item->children); pr_info("Added a helper for restoring /proc/%d\n", vpid(helper)); return 0; } struct remap_info { struct list_head list; RemapFilePathEntry *rfe; struct reg_file_info *rfi; }; static int collect_one_remap(void *obj, ProtobufCMessage *msg, struct cr_img *i) { struct remap_info *ri = obj; RemapFilePathEntry *rfe; struct file_desc *fdesc; ri->rfe = rfe = pb_msg(msg, RemapFilePathEntry); if (!rfe->has_remap_type) { rfe->has_remap_type = true; /* backward compatibility with images */ if (rfe->remap_id & REMAP_GHOST) { rfe->remap_id &= ~REMAP_GHOST; rfe->remap_type = REMAP_TYPE__GHOST; } else rfe->remap_type = REMAP_TYPE__LINKED; } fdesc = find_file_desc_raw(FD_TYPES__REG, rfe->orig_id); if (fdesc == NULL) { pr_err("Remap for non existing file %#x\n", rfe->orig_id); return -1; } ri->rfi = container_of(fdesc, struct reg_file_info, d); switch (rfe->remap_type) { case REMAP_TYPE__GHOST: if (collect_remap_ghost(ri->rfi, ri->rfe)) return -1; break; case REMAP_TYPE__LINKED: if (collect_remap_linked(ri->rfi, ri->rfe)) return -1; break; case REMAP_TYPE__PROCFS: if (collect_remap_dead_process(ri->rfi, rfe) < 0) return -1; break; default: break; } list_add_tail(&ri->list, &remaps); return 0; } static int prepare_one_remap(struct remap_info *ri) { int ret = -1; RemapFilePathEntry *rfe = ri->rfe; struct reg_file_info *rfi = ri->rfi; pr_info("Configuring remap %#x -> %#x\n", rfi->rfe->id, rfe->remap_id); switch (rfe->remap_type) { case REMAP_TYPE__LINKED: ret = open_remap_linked(rfi, rfe); break; case REMAP_TYPE__GHOST: ret = open_remap_ghost(rfi, rfe); break; case REMAP_TYPE__PROCFS: /* handled earlier by collect_remap_dead_process */ ret = 0; break; default: pr_err("unknown remap type %u\n", rfe->remap_type); goto out; } out: return ret; } int prepare_remaps(void) { struct remap_info *ri; int ret = 0; ret = init_remap_lock(); if (ret) return ret; 
list_for_each_entry(ri, &remaps, list) { ret = prepare_one_remap(ri); if (ret) break; } return ret; } static int clean_one_remap(struct remap_info *ri) { char path[PATH_MAX]; int mnt_id, ret, rmntns_root; struct file_remap *remap = ri->rfi->remap; if (remap->rpath[0] == 0) return 0; mnt_id = ri->rfi->rfe->mnt_id; /* rirfirfe %) */ ret = rst_get_mnt_root(mnt_id, path, sizeof(path)); if (ret < 0) return -1; if (ret >= sizeof(path) - 1) { pr_err("The path buffer is too small\n"); return -1; } rmntns_root = open(path, O_RDONLY); if (rmntns_root < 0) { pr_perror("Unbale to open %s", path); return -1; } pr_info("Unlink remap %s\n", remap->rpath); ret = unlinkat(rmntns_root, remap->rpath, remap->is_dir ? AT_REMOVEDIR : 0); if (ret < 0) { close(rmntns_root); pr_perror("Couldn't unlink remap %d %s", rmntns_root, remap->rpath); return -1; } close(rmntns_root); remap->rpath[0] = 0; return 0; } int try_clean_remaps(bool only_ghosts) { struct remap_info *ri; int ret = 0; list_for_each_entry(ri, &remaps, list) { if (ri->rfe->remap_type == REMAP_TYPE__GHOST) ret |= clean_one_remap(ri); else if (only_ghosts) continue; else if (ri->rfe->remap_type == REMAP_TYPE__LINKED) ret |= clean_one_remap(ri); } return ret; } static struct collect_image_info remap_cinfo = { .fd_type = CR_FD_REMAP_FPATH, .pb_type = PB_REMAP_FPATH, .priv_size = sizeof(struct remap_info), .collect = collect_one_remap, }; /* Tiny files don't need to generate chunks in ghost image. 
*/
#define GHOST_CHUNKS_THRESH	(3 * 4096)

/*
 * Write the ghost image for an unlinked file.
 *
 * @_fd:      descriptor the file is currently reachable through
 * @id:       ghost file id, used to name the image
 * @st:       stat of the file
 * @phys_dev: device number stored in the image
 *
 * The image gets a GhostFileEntry header (owner, mode, times, dev/ino,
 * rdev for device nodes) followed, for regular files, by the contents.
 * Regular files of at least GHOST_CHUNKS_THRESH bytes are stored in
 * chunks, smaller ones are copied as is.
 *
 * The image is closed on every exit path.  Returns 0 on success, -1 on
 * failure.
 */
static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_dev)
{
	struct cr_img *img;
	int exit_code = -1;
	GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT;
	Timeval atim = TIMEVAL__INIT, mtim = TIMEVAL__INIT;

	pr_info("Dumping ghost file contents (id %#x)\n", id);

	img = open_image(CR_FD_GHOST_FILE, O_DUMP, id);
	if (!img)
		return -1;

	gfe.uid = userns_uid(st->st_uid);
	gfe.gid = userns_gid(st->st_gid);
	gfe.mode = st->st_mode;

	gfe.atim = &atim;
	gfe.mtim = &mtim;
	gfe.atim->tv_sec = st->st_atim.tv_sec;
	gfe.atim->tv_usec = st->st_atim.tv_nsec / 1000;
	gfe.mtim->tv_sec = st->st_mtim.tv_sec;
	gfe.mtim->tv_usec = st->st_mtim.tv_nsec / 1000;

	gfe.has_dev = gfe.has_ino = true;
	gfe.dev = phys_dev;
	gfe.ino = st->st_ino;

	if (S_ISCHR(st->st_mode) || S_ISBLK(st->st_mode)) {
		gfe.has_rdev = true;
		gfe.rdev = st->st_rdev;
	}

	if (S_ISREG(st->st_mode) && (st->st_size >= GHOST_CHUNKS_THRESH)) {
		gfe.has_chunks = gfe.chunks = true;
		gfe.has_size = true;
		gfe.size = st->st_size;
	}

	if (pb_write_one(img, &gfe, PB_GHOST_FILE))
		goto err_out;

	if (S_ISREG(st->st_mode)) {
		int fd, ret;
		char lpath[PSFDS];

		/*
		 * Reopen file locally since it may have no read
		 * permissions when drained
		 */
		snprintf(lpath, sizeof(lpath), "/proc/self/fd/%d", _fd);
		fd = open(lpath, O_RDONLY);
		if (fd < 0) {
			pr_perror("Can't open ghost original file");
			goto err_out;
		}

		if (gfe.chunks)
			ret = copy_file_to_chunks(fd, img, st->st_size);
		else
			ret = copy_file(fd, img_raw_fd(img), st->st_size);
		close(fd);
		if (ret)
			goto err_out;
	}

	exit_code = 0;
err_out:
	/* single exit point so the image is never leaked */
	close_image(img);
	return exit_code;
}

/*
 * Find the remap of an already known ghost file by device and inode
 * number.  Returns NULL when no ghost file matches.
 */
struct file_remap *lookup_ghost_remap(u32 dev, u32 ino)
{
	struct ghost_file *gf;

	list_for_each_entry(gf, &ghost_files, list) {
		if (gf->ino == ino && (gf->dev == dev)) {
			return &gf->remap;
		}
	}

	return NULL;
}

/*
 * Dump a ghost remap for the unlinked file @path open at @lfd: write
 * the ghost image unless this dev/inode pair was dumped already, then
 * record a remap entry tying file @id to the ghost id.
 */
static int dump_ghost_remap(char *path, const struct stat *st,
				int lfd, u32 id, struct ns_id *nsid)
{
	struct ghost_file *gf;
	RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
	dev_t phys_dev;

	pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id);

	if (st->st_size >
opts.ghost_limit) {
		pr_err("Can't dump ghost file %s of %"PRIu64" size, increase limit\n",
				path, st->st_size);
		return -1;
	}

	phys_dev = phys_stat_resolve_dev(nsid, st->st_dev, path);
	/* the same inode may be open several times: dump its contents only once */
	list_for_each_entry(gf, &ghost_files, list)
		if ((gf->dev == phys_dev) && (gf->ino == st->st_ino))
			goto dump_entry;

	gf = xmalloc(sizeof(*gf));
	if (gf == NULL)
		return -1;

	gf->dev = phys_dev;
	gf->ino = st->st_ino;
	gf->id = ghost_file_ids++;
	list_add_tail(&gf->list, &ghost_files);

	if (dump_ghost_file(lfd, gf->id, st, phys_dev))
		return -1;

dump_entry:
	rpe.orig_id = id;
	rpe.remap_id = gf->id;
	rpe.has_remap_type = true;
	rpe.remap_type = REMAP_TYPE__GHOST;

	return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
			&rpe, PB_REMAP_FPATH);
}

/*
 * Drop every recorded link remap.  With @do_unlink set the link is also
 * removed from the filesystem (relative to the root of its mount
 * namespace); the unlinkat() result itself is not checked.  The list
 * entry and its path are freed in both cases.
 */
static void __rollback_link_remaps(bool do_unlink)
{
	struct link_remap_rlb *rlb, *tmp;
	int mntns_root;

	list_for_each_entry_safe(rlb, tmp, &remaps, list) {
		if (do_unlink) {
			mntns_root = mntns_get_root_fd(rlb->mnt_ns);
			if (mntns_root >= 0)
				unlinkat(mntns_root, rlb->path, 0);
			else
				pr_err("Failed to clenaup %s link remap\n", rlb->path);
		}

		list_del(&rlb->list);
		xfree(rlb->path);
		xfree(rlb);
	}
}

/* Forget all link remaps and remove the links from the filesystem. */
void delete_link_remaps(void)
{
	__rollback_link_remaps(true);
}

/* Forget all link remaps but leave the links on the filesystem. */
void free_link_remaps(void)
{
	__rollback_link_remaps(false);
}

static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags);

/*
 * Create a temporary hard link for the file open at @lfd and dump a
 * regular file entry describing that link.
 *
 * @path/@len: path the file was opened by and its length
 * @idp:       receives the id generated for the link's file entry
 * @nsid:      mount namespace the link is created in
 * @st:        stat of the file; its uid/gid are passed to linkat_hard()
 *
 * Fails unless link remaps are allowed by the options.
 */
static int create_link_remap(char *path, int len, int lfd,
				u32 *idp, struct ns_id *nsid,
				const struct stat *st)
{
	char link_name[PATH_MAX], *tmp;
	FileEntry fe = FILE_ENTRY__INIT;
	RegFileEntry rfe = REG_FILE_ENTRY__INIT;
	FownEntry fwn = FOWN_ENTRY__INIT;
	int mntns_root;
	int ret;

	if (!opts.link_remap_ok) {
		pr_err("Can't create link remap for %s. "
				"Use " LREMAP_PARAM " option.\n", path);
		return -1;
	}

	/*
	 * Linked remapping -- we create a hard link on a removed file
	 * in the directory original file used to sit.
	 *
	 * Bad news is than we can't easily open lfd's parent dir. Thus
	 * we have to just generate an absolute path and use it.
The linkat * will fail if we chose the bad one. */ link_name[0] = '.'; memcpy(link_name + 1, path, len); tmp = link_name + len; while (*tmp != '/') { BUG_ON(tmp == link_name); tmp--; } fd_id_generate_special(NULL, idp); rfe.id = *idp; rfe.flags = 0; rfe.pos = 0; rfe.fown = &fwn; rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. */ snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); again: ret = linkat_hard(lfd, "", mntns_root, link_name, st->st_uid, st->st_gid, AT_EMPTY_PATH); if (ret < 0 && errno == ENOENT) { /* Use grand parent, if parent directory does not exist. */ if (trim_last_parent(link_name) < 0) { pr_err("trim failed: @%s@\n", link_name); return -1; } goto again; } else if (ret < 0) { pr_perror("Can't link remap to %s", path); return -1; } if (note_link_remap(link_name, nsid)) return -1; fe.type = FD_TYPES__REG; fe.id = rfe.id; fe.reg = &rfe; return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } static int dump_linked_remap(char *path, int len, const struct stat *ost, int lfd, u32 id, struct ns_id *nsid) { u32 lid; RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; if (create_link_remap(path, len, lfd, &lid, nsid, ost)) return -1; rpe.orig_id = id; rpe.remap_id = lid; rpe.has_remap_type = true; rpe.remap_type = REMAP_TYPE__LINKED; return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), &rpe, PB_REMAP_FPATH); } static pid_t *dead_pids; static int n_dead_pids; int dead_pid_conflict(void) { int i; for (i = 0; i < n_dead_pids; i++) { struct pid *node; pid_t pid = dead_pids[i]; node = pstree_pid_by_virt(pid); if (!node) continue; if (node->state != TASK_THREAD) { struct pstree_item *item; /* * If the dead PID was given to a main thread of another * process, this is handled during restore. 
*/
			item = node->item;
			/*
			 * NOTE(review): "i" is the index of the enclosing loop
			 * over dead_pids, yet it is used here to index
			 * item->threads -- looks suspicious, confirm the intent.
			 */
			if (item->pid->real == item->threads[i].real ||
			    item->threads[i].ns[0].virt != pid)
				continue;
		}

		pr_err("Conflict with a dead task with the same PID as of this thread (virt %d, real %d).\n",
			node->ns[0].virt, node->real);
		return -1;
	}

	return 0;
}

/*
 * Check whether @pid is already recorded as dead and record it if not.
 *
 * Returns 1 if the pid was seen before, 0 if it has just been added,
 * -1 if growing the array failed.
 */
static int have_seen_dead_pid(pid_t pid)
{
	int i;

	for (i = 0; i < n_dead_pids; i++) {
		if (dead_pids[i] == pid)
			return 1;
	}

	if (xrealloc_safe(&dead_pids, sizeof(*dead_pids) * (n_dead_pids + 1)))
		return -1;
	dead_pids[n_dead_pids++] = pid;

	return 0;
}

/*
 * Write a procfs remap entry for file @id pointing at dead process
 * @pid.  Only the first file seen for a given pid produces an entry;
 * later ones are skipped and reported as success.
 */
static int dump_dead_process_remap(pid_t pid, u32 id)
{
	RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
	int ret;

	ret = have_seen_dead_pid(pid);
	if (ret < 0)
		return -1;
	if (ret) {
		pr_info("Found dead pid %d already, skipping remap\n", pid);
		return 0;
	}

	rpe.orig_id = id;
	rpe.remap_id = pid;
	rpe.has_remap_type = true;
	rpe.remap_type = REMAP_TYPE__PROCFS;

	return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
			&rpe, PB_REMAP_FPATH);
}

/*
 * Tell whether the last component of @name has the shape of an NFS
 * silly-rename name: the SILLYNAME_PREF prefix followed by
 * SILLYNAME_SUFF_LEN hex digits.  @name must contain a '/'.
 */
static bool is_sillyrename_name(char *name)
{
	int i;

	name = strrchr(name, '/');
	BUG_ON(name == NULL); /* see check in dump_one_reg_file */
	name++;

	/*
	 * Strictly speaking this check is not bullet-proof. User
	 * can create file with this name by hands and we have no
	 * API to distinguish really-silly-renamed files from those
	 * fake names :(
	 *
	 * But since NFS people expect .nfsXXX files to be unstable,
	 * we treat them as such too.
*/ if (strncmp(name, SILLYNAME_PREF, sizeof(SILLYNAME_PREF) - 1)) return false; name += sizeof(SILLYNAME_PREF) - 1; for (i = 0; i < SILLYNAME_SUFF_LEN; i++) if (!isxdigit(name[i])) return false; return true; } static inline bool nfs_silly_rename(char *rpath, const struct fd_parms *parms) { return (parms->fs_type == NFS_SUPER_MAGIC) && is_sillyrename_name(rpath); } int strip_deleted(struct fd_link *link) { struct dcache_prepends { const char *str; size_t len; } static const prepends[] = { { .str = " (deleted)", .len = 10, }, { .str = "//deleted", .len = 9, } }; size_t i; for (i = 0; i < ARRAY_SIZE(prepends); i++) { size_t at; if (link->len <= prepends[i].len) continue; at = link->len - prepends[i].len; if (!strcmp(&link->name[at], prepends[i].str)) { pr_debug("Strip '%s' tag from '%s'\n", prepends[i].str, link->name); link->name[at] = '\0'; link->len -= prepends[i].len; return 1; } } return 0; } static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid) { char *rpath = link->name; int plen = link->len; int ret, mntns_root; struct stat pst; const struct stat *ost = &parms->stat; if (parms->fs_type == PROC_SUPER_MAGIC) { /* The file points to /proc/pid/ where pid is a dead * process. We remap this file by adding this pid to be * fork()ed into a TASK_HELPER state so that we can point to it * on restore. */ pid_t pid; char *start, *end; /* skip "./proc/" */ start = strstr(rpath, "/"); if (!start) return -1; start = strstr(start + 1, "/"); if (!start) /* it's /proc */ return 0; pid = strtol(start + 1, &end, 10); /* If strtol didn't convert anything, then we are looking at * something like /proc/kmsg, which we shouldn't mess with. * Anything under /proc/ (including that directory itself) * can be c/r'd with a dead pid remap, so let's allow all such * cases. 
*/ if (pid != 0) { bool is_dead = strip_deleted(link); /* /proc/ will be "/proc/1 (deleted)" when it is * dead, but a path like /proc/1/mountinfo won't have * the suffix, since it isn't actually deleted (still * exists, but the parent dir is deleted). So, if we * have a path like /proc/1/mountinfo, test if /proc/1 * exists instead, since this is what CRIU will need to * open on restore. */ if (!is_dead) { *end = 0; is_dead = access(rpath, F_OK); *end = '/'; } if (is_dead) { pr_info("Dumping dead process remap of %d\n", pid); return dump_dead_process_remap(pid, id); } } return 0; } else if (parms->fs_type == DEVPTS_SUPER_MAGIC) { /* * It's safe to call stripping here because * file paths are having predefined format for * this FS and can't have a valid " (deleted)" * postfix as a part of not deleted filename. */ strip_deleted(link); /* * Devpts devices/files are generated by the * kernel itself so we should not try to generate * any kind of ghost files here even if file is * no longer exist. */ return 0; } if (ost->st_nlink == 0) { /* * Unpleasant, but easy case. File is completely invisible * from the FS. Just dump its contents and that's it. But * be careful whether anybody still has any of its hardlinks * also open. */ strip_deleted(link); return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); } if (nfs_silly_rename(rpath, parms)) { /* * If this is NFS silly-rename file the path we have at hands * will be accessible by fstat(), but once we kill the dumping * tasks it will disappear. So we just go ahead an dump it as * linked-remap file (NFS will allow us to create more hard * links on it) to have some persistent name at hands. */ pr_debug("Dump silly-rename linked remap for %x\n", id); return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); } mntns_root = mntns_get_root_fd(nsid); if (mntns_root < 0) return -1; ret = fstatat(mntns_root, rpath, &pst, 0); if (ret < 0) { /* * Linked file, but path is not accessible (unless any * other error occurred). 
We can create a temporary link to it
		 * using linkat with AT_EMPTY_PATH flag and remap it to this
		 * name.
		 */

		if (errno == ENOENT)
			return dump_linked_remap(rpath + 1, plen - 1,
							ost, lfd, id, nsid);

		pr_perror("Can't stat path");
		return -1;
	}

	/* the path resolves, but does it still name the file we have open? */
	if ((pst.st_ino != ost->st_ino) || (pst.st_dev != ost->st_dev)) {
		/*
		 * With the evasive_devices option a device node of the
		 * same rdev is accepted even though the inode differs.
		 */
		if (opts.evasive_devices &&
		    (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) &&
		    pst.st_rdev == ost->st_rdev)
			return 0;
		/*
		 * FIXME linked file, but the name we see it by is reused
		 * by somebody else. We can dump it with linked remaps, but
		 * we'll have difficulties on restore -- we will have to
		 * move the existing file aside, then restore this one,
		 * unlink, then move the original file back. It's fairly
		 * easy to do, but we don't do it now, since unlinked files
		 * have the "(deleted)" suffix in proc and name conflict
		 * is unlikely :)
		 */
		pr_err("Unaccessible path opened %u:%u, need %u:%u\n",
				(int)pst.st_dev, (int)pst.st_ino,
				(int)ost->st_dev, (int)ost->st_ino);
		return -1;
	}

	/*
	 * File is linked and visible by the name it is opened by
	 * this task. Go ahead and dump it.
	 */
	return 0;
}

/*
 * Tell whether the file size should be recorded in the image (and thus
 * verified on restore) for a file opened with @flags.
 */
static bool should_check_size(int flags)
{
	/* Skip size if file has O_APPEND and O_WRONLY flags (e.g. log file).
*/
	if (((flags & O_ACCMODE) == O_WRONLY) &&
			(flags & O_APPEND))
		return false;

	return true;
}

/*
 * Dump a regular file descriptor into the files image.
 *
 * @lfd: local descriptor the file is reachable through
 * @id:  id assigned to the file
 * @p:   parameters collected for the descriptor (flags, pos, stat, ...)
 *
 * Files registered as external are dumped under their "file[mnt:ino]"
 * id instead of a path.  For all others the path is validated and, if
 * needed, a remap is dumped by check_path_remap().
 *
 * Returns 0 on success, non-zero on failure.
 */
int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p)
{
	struct fd_link _link, *link;
	struct ns_id *nsid;
	struct cr_img *rimg;
	char ext_id[64];
	FileEntry fe = FILE_ENTRY__INIT;
	RegFileEntry rfe = REG_FILE_ENTRY__INIT;

	if (!p->link) {
		if (fill_fdlink(lfd, p, &_link))
			return -1;
		link = &_link;
	} else
		link = p->link;

	snprintf(ext_id, sizeof(ext_id), "file[%x:%"PRIx64"]", p->mnt_id, p->stat.st_ino);
	if (external_lookup_id(ext_id)) {
		/* the first symbol will be cut on restore to get an relative path*/
		/*
		 * ext_id lives until this function returns and the entry
		 * is written out before that, so it can be referenced
		 * directly: no copy that could fail or leak is needed.
		 */
		rfe.name = ext_id;
		rfe.ext = true;
		rfe.has_ext = true;
		goto ext;
	}

	nsid = lookup_nsid_by_mnt_id(p->mnt_id);
	if (nsid == NULL) {
		pr_err("Can't lookup mount=%d for fd=%d path=%s\n",
			p->mnt_id, p->fd, link->name + 1);
		return -1;
	}

	if (p->mnt_id >= 0 && (root_ns_mask & CLONE_NEWNS)) {
		rfe.mnt_id = p->mnt_id;
		rfe.has_mnt_id = true;
	}

	pr_info("Dumping path for %d fd via self %d [%s]\n",
			p->fd, lfd, &link->name[1]);

	/*
	 * The regular path we can handle should start with slash.
*/
	if (link->name[1] != '/') {
		pr_err("The path [%s] is not supported\n", &link->name[1]);
		return -1;
	}

	if (check_path_remap(link, p, lfd, id, nsid))
		return -1;
	/* name[0] is skipped: the stored name starts at the '/' */
	rfe.name = &link->name[1];
ext:
	rfe.id = id;
	rfe.flags = p->flags;
	rfe.pos = p->pos;
	rfe.fown = (FownEntry *)&p->fown;
	rfe.has_mode = true;
	rfe.mode = p->stat.st_mode;

	if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags)) {
		rfe.has_size = true;
		rfe.size = p->stat.st_size;
	}

	fe.type = FD_TYPES__REG;
	fe.id = rfe.id;
	fe.reg = &rfe;

	rimg = img_from_set(glob_imgset, CR_FD_FILES);
	return pb_write_one(rimg, &fe, PB_FILE);
}

/* Dump operations for regular files. */
const struct fdtype_ops regfile_dump_ops = {
	.type = FD_TYPES__REG,
	.dump = dump_one_reg_file,
};

/*
 * Re-express @src, a path that goes through mount @smi, as a path
 * through mount @dmi and store it in @dst (at most @dlen bytes).
 *
 * The root of @smi must not be shorter than the root of @dmi.
 */
static void convert_path_from_another_mp(char *src, char *dst, int dlen,
					struct mount_info *smi,
					struct mount_info *dmi)
{
	int off;

	/*
	 * mi->mountpoint	./foo/bar
	 * mi->ns_mountpoint	/foo/bar
	 * rfi->path		foo/bar/baz
	 */
	off = strlen(smi->ns_mountpoint + 1);
	BUG_ON(strlen(smi->root) < strlen(dmi->root));

	/*
	 * Create paths relative to this mount.
	 * Absolute path to the mount point + difference between source
	 * and destination roots + path relative to the mountpoint.
	 */
	snprintf(dst, dlen, "%s/%s/%s",
				dmi->ns_mountpoint + 1,
				smi->root + strlen(dmi->root),
				src + off);
}

/*
 * linkat() wrapper.
 *
 * A plain linkat() is tried first.  Only when it fails with EPERM or
 * EOVERFLOW while user namespaces are in use, the call is retried with
 * fsuid/fsgid switched to @uid/@gid (see the comment in the body).
 *
 * Returns 0 on success; on failure returns the linkat() result with
 * errno preserved.
 */
static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags)
{
	struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];
	struct __user_cap_header_struct hdr;
	int ret, old_fsuid = -1, old_fsgid = -1;
	int errno_save;

	ret = linkat(odir, opath, ndir, npath, flags);
	if (ret == 0)
		return 0;

	if (!( (errno == EPERM || errno == EOVERFLOW) && (root_ns_mask & CLONE_NEWUSER) )) {
		/* keep errno intact for the caller across the logging call */
		errno_save = errno;
		pr_perror("Can't link %s -> %s", opath, npath);
		errno = errno_save;
		return ret;
	}

	/*
	 * Kernel before 4.3 has strange security restrictions about
	 * linkat.
If the fsuid of the caller doesn't equals
	 * the uid of the file and the file is not "safe"
	 * one, then only global CAP_CHOWN will be allowed
	 * to link().
	 *
	 * Next, when we're in user namespace we're ns root,
	 * but not global CAP_CHOWN. Thus, even though we
	 * ARE ns root, we will not be allowed to link() at
	 * files that belong to regular users %)
	 *
	 * Fortunately, the setfsuid() requires ns-level
	 * CAP_SETUID which we have.
	 *
	 * Starting with 4.8 the kernel doesn't allow to create inodes
	 * with a uid or gid unknown to an user namespace.
	 * 036d523641c66 ("vfs: Don't create inodes with a uid or gid unknown to the vfs")
	 */

	/* remember the previous ids so they can be put back at "out" */
	old_fsuid = setfsuid(uid);
	old_fsgid = setfsgid(gid);

	/* AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH */
	if (flags & AT_EMPTY_PATH) {
		hdr.version = _LINUX_CAPABILITY_VERSION_3;
		hdr.pid = 0;

		if (capget(&hdr, data) < 0) {
			errno_save = errno;
			pr_perror("capget");
			goto out;
		}
		/* raise every permitted capability into the effective set */
		data[0].effective = data[0].permitted;
		data[1].effective = data[1].permitted;
		if (capset(&hdr, data) < 0) {
			errno_save = errno;
			pr_perror("capset");
			goto out;
		}
	}

	ret = linkat(odir, opath, ndir, npath, flags);
	/* saved right away: the calls below may change errno */
	errno_save = errno;
	if (ret < 0)
		pr_perror("Can't link %s -> %s", opath, npath);

out:
	setfsuid(old_fsuid);
	setfsgid(old_fsgid);
	/* setfsuid(-1) is used here only to read back the current fsuid */
	if (setfsuid(-1) != old_fsuid) {
		pr_warn("Failed to restore old fsuid!\n");
		/*
		 * Don't fail here. We still have chances to run till
		 * the pie/restorer, and if _this_ guy fails to set
		 * the proper fsuid, then we'll abort the restore.
*/
	}

	/*
	 * Restoring PR_SET_DUMPABLE flag is required after setfsuid,
	 * as if it not set, proc inode will be created with root cred
	 * (see proc_pid_make_inode), which will result in permission
	 * check fail when trying to access files in /proc/self/
	 */
	prctl(PR_SET_DUMPABLE, 1, 0);

	errno = errno_save;

	return ret;
}

/*
 * Remove up to @count parent directories of @path, innermost first,
 * relative to @mntns_root.  Failures are logged and do not stop the
 * walk.  @path is cut at '/' separators while working and the last cut
 * is undone before returning.
 */
static void rm_parent_dirs(int mntns_root, char *path, int count)
{
	char *p, *prev = NULL;

	if (!count)
		return;

	while (count > 0) {
		count -= 1;
		/* cut the last component, then undo the cut of the previous round */
		p = strrchr(path, '/');
		if (p)
			*p = '\0';
		if (prev)
			*prev = '/';

		if (unlinkat(mntns_root, path, AT_REMOVEDIR))
			pr_perror("Can't remove %s AT %d", path, mntns_root);
		else
			pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root);
		prev = p;
	}

	if (prev)
		*prev = '/';
}

/*
 * Construct parent dir name and mkdir parent/grandparents if they're not exist
 *
 * Returns the number of directories that were created (0 when the
 * parent already exists) or -1 on error.  @path is modified while
 * working; the last '/' is put back before returning.
 */
static int make_parent_dirs_if_need(int mntns_root, char *path)
{
	char *p, *last_delim;
	int err, count = 0;
	struct stat st;

	p = last_delim = strrchr(path, '/');
	if (!p) {
		pr_err("Path %s has no parent dir\n", path);
		return -1;
	}
	/* from here on "path" names the parent directory */
	*p = '\0';

	if (fstatat(mntns_root, path, &st, AT_EMPTY_PATH) == 0)
		goto out;
	if (errno != ENOENT) {
		pr_perror("Can't stat %s", path);
		count = -1;
		goto out;
	}

	/* walk the components left to right, creating the missing ones */
	p = path;
	do {
		p = strchr(p, '/');
		if (p)
			*p = '\0';

		err = mkdirat(mntns_root, path, 0777);
		if (err && errno != EEXIST) {
			pr_perror("Can't create dir: %s AT %d", path, mntns_root);
			/* roll back what has been created so far */
			rm_parent_dirs(mntns_root, path, count);
			count = -1;
			goto out;
		} else if (!err) {
			pr_debug("Created parent dir: %s AT %d\n", path, mntns_root);
			count++;
		}

		if (p)
			*p++ = '/';
	} while (p);
out:
	*last_delim = '/';
	return count;
}

/*
 * This routine properly resolves d's path handling ghost/link-remaps.
 * The open_cb is a routine that does actual open, it differs for
 * files, directories, fifos, etc.
*/ static int rfi_remap(struct reg_file_info *rfi, int *level) { struct mount_info *mi, *rmi, *tmi; char _path[PATH_MAX], *path = _path; char _rpath[PATH_MAX], *rpath = _rpath; int mntns_root; if (rfi->rfe->mnt_id == -1) { /* Know nothing about mountpoints */ mntns_root = mntns_get_root_by_mnt_id(-1); path = rfi->path; rpath = rfi->remap->rpath; goto out_root; } mi = lookup_mnt_id(rfi->rfe->mnt_id); if (rfi->rfe->mnt_id == rfi->remap->rmnt_id) { /* Both links on the same mount point */ tmi = mi; path = rfi->path; rpath = rfi->remap->rpath; goto out; } rmi = lookup_mnt_id(rfi->remap->rmnt_id); /* * Find the common bind-mount. We know that one mount point was * really mounted and all other were bind-mounted from it, so the * lowest mount must contains all bind-mounts. */ for (tmi = mi; tmi->bind; tmi = tmi->bind) ; BUG_ON(tmi->s_dev != rmi->s_dev); BUG_ON(tmi->s_dev != mi->s_dev); /* Calcalate paths on the device (root mount) */ convert_path_from_another_mp(rfi->path, path, sizeof(_path), mi, tmi); convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi); out: pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path); mntns_root = mntns_get_root_fd(tmi->nsid); out_root: *level = make_parent_dirs_if_need(mntns_root, path); if (*level < 0) return -1; if (linkat_hard(mntns_root, rpath, mntns_root, path, rfi->remap->uid, rfi->remap->gid, 0) < 0) { int errno_saved = errno; rm_parent_dirs(mntns_root, path, *level); errno = errno_saved; return -1; } return 0; } int open_path(struct file_desc *d, int(*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) { int tmp, mntns_root, level = 0; struct reg_file_info *rfi; char *orig_path = NULL; char path[PATH_MAX]; if (inherited_fd(d, &tmp)) return tmp; rfi = container_of(d, struct reg_file_info, d); if (rfi->rfe->ext) { tmp = inherit_fd_lookup_id(rfi->rfe->name); if (tmp >= 0) { mntns_root = open_pid_proc(PROC_SELF); snprintf(path, sizeof(path), "fd/%d", tmp); orig_path = rfi->path; rfi->path 
= path; goto ext; } } if (rfi->remap) { if (fault_injected(FI_RESTORE_OPEN_LINK_REMAP)) { pr_info("fault: Open link-remap failure!\n"); kill(getpid(), SIGKILL); } mutex_lock(remap_open_lock); if (rfi->remap->is_dir) { /* * FIXME Can't make directory under new name. * Will have to open it under the ghost one :( */ orig_path = rfi->path; rfi->path = rfi->remap->rpath; } else if (rfi_remap(rfi, &level) < 0) { static char tmp_path[PATH_MAX]; if (errno != EEXIST) { pr_perror("Can't link %s -> %s", rfi->path, rfi->remap->rpath); return -1; } /* * The file whose name we're trying to create * exists. Need to pick some other one, we're * going to remove it anyway. * * Strictly speaking, this is cheating, file * name shouldn't change. But since NFS with * its silly-rename doesn't care, why should we? */ orig_path = rfi->path; rfi->path = tmp_path; snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path); pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->rpath); if (rfi_remap(rfi, &level) < 0) { pr_perror("Can't create even fake link!"); return -1; } } } mntns_root = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id); ext: tmp = open_cb(mntns_root, rfi, arg); if (tmp < 0) { pr_perror("Can't open file %s", rfi->path); return -1; } if ((rfi->rfe->has_size || rfi->rfe->has_mode) && !rfi->size_mode_checked) { struct stat st; if (fstat(tmp, &st) < 0) { pr_perror("Can't fstat opened file"); return -1; } if (rfi->rfe->has_size && (st.st_size != rfi->rfe->size)) { pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n", rfi->path, st.st_size, rfi->rfe->size); return -1; } if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { if (st.st_mode != rfi->rfe->mode) { pr_err("File %s has bad mode 0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); return -1; } } /* * This is only visible in the current process, so * change w/o locks. Other tasks sharing the same * file will get one via unix sockets. 
*/ rfi->size_mode_checked = true; } if (rfi->remap) { if (!rfi->remap->is_dir) { unlinkat(mntns_root, rfi->path, 0); rm_parent_dirs(mntns_root, rfi->path, level); } mutex_unlock(remap_open_lock); } if (orig_path) rfi->path = orig_path; if (restore_fown(tmp, rfi->rfe->fown)) return -1; return tmp; } int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg) { u32 flags = *(u32 *)arg; int fd; fd = openat(ns_root_fd, rfi->path, flags); if (fd < 0) { pr_perror("Can't open file %s on restore", rfi->path); return fd; } return fd; } static int do_open_reg_noseek(int ns_root_fd, struct reg_file_info *rfi, void *arg) { return do_open_reg_noseek_flags(ns_root_fd, rfi, &rfi->rfe->flags); } static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) { int fd; fd = do_open_reg_noseek(ns_root_fd, rfi, arg); if (fd < 0) return fd; if ((rfi->rfe->pos != -1ULL) && lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { pr_perror("Can't restore file pos"); close(fd); return -1; } return fd; } int open_reg_fd(struct file_desc *fd) { return open_path(fd, do_open_reg_noseek, NULL); } int open_reg_by_id(u32 id) { struct file_desc *fd; /* * This one gets called by exe link, chroot and cwd * restoring code. No need in calling lseek on either * of them. */ fd = find_file_desc_raw(FD_TYPES__REG, id); if (fd == NULL) { pr_err("Can't find regfile for %#x\n", id); return -1; } return open_reg_fd(fd); } struct filemap_ctx { u32 flags; struct file_desc *desc; int fd; /* * Whether or not to close the fd when we're about to * put a new one into ctx. * * True is used by premap, so that it just calls vm_open * in sequence, immediatelly mmap()s the file, then it * can be closed. * * False is used by open_vmas() which pre-opens the files * for restorer, and the latter mmap()s them and closes. * * ... */ bool close; /* ... 
*
	 * but closing all vmas won't work, as some of them share
	 * the descriptor, so only the ones that terminate the
	 * fd-sharing chain are marked with VMA_CLOSE flag, saying
	 * restorer to close the vma's fd.
	 *
	 * Said that, this vma pointer references the previously
	 * seen vma, so that once fd changes, this one gets the
	 * closing flag.
	 */
	struct vma_area *vma;
};

/* The single context used by open_filemap(). */
static struct filemap_ctx ctx;

/*
 * Reset the file-mapping context.  @auto_close selects what happens to
 * the previous descriptor when a new file is opened, see ->close.
 */
void filemap_ctx_init(bool auto_close)
{
	ctx.desc = NULL;	/* to fail the first comparison in open_ */
	ctx.fd = -1;		/* not to close random fd in _fini */
	ctx.vma = NULL;		/* not to put spurious VMA_CLOSE in _fini */
				/* flags may remain any */
	ctx.close = auto_close;
}

/*
 * Finish with the descriptor currently held in the context: close it in
 * auto-close mode, otherwise mark the last seen vma with VMA_CLOSE.
 */
void filemap_ctx_fini(void)
{
	if (ctx.close) {
		if (ctx.fd >= 0)
			close(ctx.fd);
	} else {
		if (ctx.vma)
			ctx.vma->e->status |= VMA_CLOSE;
	}
}

/*
 * vm_open callback for file mappings.
 *
 * The descriptor of the previous call is reused when both the file and
 * the open flags are the same; otherwise the file is opened anew and
 * the old context is finished.  The resulting fd is stored in
 * vma->e->fd.  @pid is unused.  Returns 0 on success or the negative
 * result of open_path().
 */
static int open_filemap(int pid, struct vma_area *vma)
{
	u32 flags;
	int ret;

	/*
	 * The vma->fd should have been assigned in collect_filemap
	 *
	 * We open file w/o lseek, as mappings don't care about it
	 */

	BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags);
	flags = vma->e->fdflags;

	if (ctx.flags != flags || ctx.desc != vma->vmfd) {
		ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags);
		if (ret < 0)
			return ret;

		/* done with the previous file before switching to the new one */
		filemap_ctx_fini();

		ctx.flags = flags;
		ctx.desc = vma->vmfd;
		ctx.fd = ret;
	}

	ctx.vma = vma;
	vma->e->fd = ctx.fd;
	return 0;
}

/*
 * Bind a file-mapped vma to its file descriptor entry and install
 * open_filemap() as its open callback.  Returns 0 on success, -1 when
 * the file cannot be found.
 */
int collect_filemap(struct vma_area *vma)
{
	struct file_desc *fd;

	if (!vma->e->has_fdflags) {
		/* Make a wild guess for the fdflags */
		vma->e->has_fdflags = true;
		if ((vma->e->prot & PROT_WRITE) &&
				vma_area_is(vma, VMA_FILE_SHARED))
			vma->e->fdflags = O_RDWR;
		else
			vma->e->fdflags = O_RDONLY;
	}

	fd = collect_special_file(vma->e->shmid);
	if (!fd)
		return -1;

	vma->vmfd = fd;
	vma->vm_open = open_filemap;
	return 0;
}

/*
 * ->open callback of regular files: open the file, restoring its
 * position, and hand the descriptor back through @new_fd.
 */
static int open_fe_fd(struct file_desc *fd, int *new_fd)
{
	int tmp;

	tmp = open_path(fd, do_open_reg, NULL);
	if (tmp < 0)
		return -1;

	*new_fd = tmp;
	return 0;
}

/* ->name callback of regular files; @buf and @s are not used. */
static char *reg_file_path(struct file_desc *d, char *buf, size_t s)
{
	struct
reg_file_info *rfi; rfi = container_of(d, struct reg_file_info, d); return rfi->path; } static struct file_desc_ops reg_desc_ops = { .type = FD_TYPES__REG, .open = open_fe_fd, .name = reg_file_path, }; struct file_desc *try_collect_special_file(u32 id, int optional) { struct file_desc *fdesc; /* * Files dumped for vmas/exe links can have remaps * configured. Need to bump-up users for them, otherwise * the open_path() would unlink the remap file after * the very first open. */ fdesc = find_file_desc_raw(FD_TYPES__REG, id); if (fdesc == NULL) { if (!optional) pr_err("No entry for reg-file-ID %#x\n", id); return NULL; } return fdesc; } static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i) { struct reg_file_info *rfi = o; static char dot[] = "."; rfi->rfe = pb_msg(base, RegFileEntry); /* change "/foo" into "foo" and "/" into "." */ if (rfi->rfe->name[1] == '\0') rfi->path = dot; else rfi->path = rfi->rfe->name + 1; rfi->remap = NULL; rfi->size_mode_checked = false; pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id); return file_desc_add(&rfi->d, rfi->rfe->id, ®_desc_ops); } struct collect_image_info reg_file_cinfo = { .fd_type = CR_FD_REG_FILES, .pb_type = PB_REG_FILE, .priv_size = sizeof(struct reg_file_info), .collect = collect_one_regfile, .flags = COLLECT_SHARED, }; int collect_remaps_and_regfiles(void) { if (!files_collected() && collect_image(®_file_cinfo)) return -1; if (collect_image(&remap_cinfo)) return -1; return 0; } criu-3.6/criu/files.c000066400000000000000000001175201317335042600145460ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "files.h" #include "file-ids.h" #include "files-reg.h" #include "file-lock.h" #include "image.h" #include "common/list.h" #include "rst-malloc.h" #include "util-pie.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" #include "tty.h" #include "pipes.h" 
#include "fifo.h"
#include "eventfd.h"
#include "eventpoll.h"
#include "fsnotify.h"
#include "sk-packet.h"
#include "mount.h"
#include "signalfd.h"
#include "namespaces.h"
#include "tun.h"
#include "timerfd.h"
#include "imgset.h"
#include "fs-magic.h"
#include "fdinfo.h"
#include "cr_options.h"
#include "autofs.h"
#include "parasite.h"
#include "parasite-syscall.h"

#include "protobuf.h"
#include "util.h"
#include "images/fs.pb-c.h"
#include "images/ext-file.pb-c.h"

#include "plugin.h"

/* Number of buckets in file_desc_hash; a description lives in bucket id % size */
#define FDESC_HASH_SIZE	64
static struct hlist_head file_desc_hash[FDESC_HASH_SIZE];
/* file_desc's, which fle is not owned by a process, that is able to open them */
static LIST_HEAD(fake_master_head);

/* Initialize every bucket of file_desc_hash as an empty list */
static void init_fdesc_hash(void)
{
	int i;

	for (i = 0; i < FDESC_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&file_desc_hash[i]);
}

/*
 * Prepare @d for use: empty fd_info_head / fake_master_list lists, an
 * unhashed hash node, and the given @id and @ops. The description is
 * NOT inserted into file_desc_hash here.
 */
void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops)
{
	INIT_LIST_HEAD(&d->fd_info_head);
	INIT_LIST_HEAD(&d->fake_master_list);
	INIT_HLIST_NODE(&d->hash);

	d->id	= id;
	d->ops	= ops;
}

/*
 * Initialize @d (see file_desc_init()) and insert it at the head of its
 * file_desc_hash bucket. Always returns 0.
 */
int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops)
{
	file_desc_init(d, id, ops);
	hlist_add_head(&d->hash, &file_desc_hash[id % FDESC_HASH_SIZE]);
	return 0; /* this is to make tail-calls in collect_one_foo look nice */
}

/*
 * Find a description by @id in its hash bucket. @type must match the
 * description's ops->type unless it is FD_TYPES__UND, which matches
 * any type.
 */
struct file_desc *find_file_desc_raw(int type, u32 id)
{
	struct file_desc *d;
	struct hlist_head *chain;

	chain = &file_desc_hash[id % FDESC_HASH_SIZE];
	hlist_for_each_entry(d, chain, hash)
		if ((d->id == id) &&
				(d->ops->type == type || type == FD_TYPES__UND))
			/*
			 * Warning -- old CRIU might generate matching IDs
			 * for different file types! So any code that uses
			 * FD_TYPES__UND for fdesc search MUST make sure it's
			 * dealing with the merged files images where all
			 * descs are forced to have different IDs.
*/ return d; return NULL; } static inline struct file_desc *find_file_desc(FdinfoEntry *fe) { return find_file_desc_raw(fe->type, fe->id); } struct fdinfo_list_entry *find_used_fd(struct pstree_item *task, int fd) { struct list_head *head; struct fdinfo_list_entry *fle; head = &rsti(task)->fds; list_for_each_entry_reverse(fle, head, ps_list) { if (fle->fe->fd == fd) return fle; /* List is ordered, so let's stop */ if (fle->fe->fd < fd) break; } return NULL; } void collect_task_fd(struct fdinfo_list_entry *new_fle, struct rst_info *ri) { struct fdinfo_list_entry *fle; /* fles in fds list are ordered by fd */ list_for_each_entry(fle, &ri->fds, ps_list) { if (new_fle->fe->fd < fle->fe->fd) break; } list_add_tail(&new_fle->ps_list, &fle->ps_list); } unsigned int find_unused_fd(struct pstree_item *task, int hint_fd) { struct list_head *head; struct fdinfo_list_entry *fle; int fd = 0, prev_fd; if ((hint_fd >= 0) && (!find_used_fd(task, hint_fd))) { fd = hint_fd; goto out; } prev_fd = service_fd_min_fd() - 1; head = &rsti(task)->fds; list_for_each_entry_reverse(fle, head, ps_list) { fd = fle->fe->fd; if (prev_fd > fd) { fd++; goto out; } prev_fd = fd - 1; } BUG(); out: return fd; } int set_fds_event(pid_t virt) { struct pstree_item *item; bool is_set; item = pstree_item_by_virt(virt); BUG_ON(!item); is_set = !!test_and_set_bit_le(FDS_EVENT_BIT, &item->task_st_le_bits); if (!is_set) futex_wake(&item->task_st); return 0; } void clear_fds_event(void) { clear_bit_le(FDS_EVENT_BIT, ¤t->task_st_le_bits); } void wait_fds_event(void) { futex_t *f = ¤t->task_st; int value; value = htole32(FDS_EVENT); futex_wait_if_cond(f, value, &); clear_fds_event(); } struct fdinfo_list_entry *try_file_master(struct file_desc *d) { if (list_empty(&d->fd_info_head)) return NULL; return list_first_entry(&d->fd_info_head, struct fdinfo_list_entry, desc_list); } struct fdinfo_list_entry *file_master(struct file_desc *d) { struct fdinfo_list_entry *fle; fle = try_file_master(d); if (!fle) { 
pr_err("Empty list on file desc id %#x(%d)\n", d->id, d->ops ? d->ops->type : -1); BUG(); } return fle; } void show_saved_files(void) { int i; struct file_desc *fd; pr_info("File descs:\n"); for (i = 0; i < FDESC_HASH_SIZE; i++) hlist_for_each_entry(fd, &file_desc_hash[i], hash) { struct fdinfo_list_entry *le; pr_info(" `- type %d ID %#x\n", fd->ops->type, fd->id); list_for_each_entry(le, &fd->fd_info_head, desc_list) pr_info(" `- FD %d pid %d\n", le->fe->fd, le->pid); } } /* * Workaround for the OverlayFS bug present before Kernel 4.2 * * This is here only to support the Linux Kernel between versions * 3.18 and 4.2. After that, this workaround is not needed anymore, * but it will work properly on both a kernel with and withouth the bug. * * When a process has a file open in an OverlayFS directory, * the information in /proc//fd/ and /proc//fdinfo/ * is wrong. We can't even rely on stat()-ing /proc//fd/ since * this will show us the wrong filesystem type. * * So we grab that information from the mountinfo table instead. This is done * every time fill_fdlink is called. See lookup_overlayfs for more details. * */ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) { struct mount_info *m; if (!link) return 0; m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id); if (IS_ERR(m)) return -1; if (!m) return 0; p->mnt_id = m->mnt_id; /* * If the bug is present, the file path from /proc//fd * does not include the mountpoint, so we prepend it ourselves. */ if (strcmp("./", m->mountpoint) != 0) { char buf[PATH_MAX]; int n; strncpy(buf, link->name, PATH_MAX - 1); n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); return -1; } } return 0; } /* * The gen_id thing is used to optimize the comparison of shared files. * If two files have different gen_ids, then they are different for sure. * If it matches, we don't know it and have to call sys_kcmp(). 
*
 * The kcmp-ids.c engine does this trick, see comments in it for more info.
 */

/* gen_id of a file: XOR of the low 32 bits of its device, inode and position */
static u32 make_gen_id(const struct fd_parms *p)
{
	return ((u32)p->stat.st_dev) ^ ((u32)p->stat.st_ino) ^ ((u32)p->pos);
}

/*
 * Fill the fdinfo entry @e for the file described by @p and, if
 * fd_id_generate() reports that a new ID was generated (returns 1),
 * dump the file itself through @ops->dump().
 *
 * Returns the result of ->dump() in that case, otherwise whatever
 * fd_id_generate() returned.
 */
int do_dump_gen_file(struct fd_parms *p, int lfd,
		const struct fdtype_ops *ops, FdinfoEntry *e)
{
	int ret = -1;

	e->type	= ops->type;
	e->id	= make_gen_id(p);
	e->fd	= p->fd;
	e->flags = p->fd_flags;

	ret = fd_id_generate(p->pid, e, p);
	if (ret == 1) /* new ID generated */
		ret = ops->dump(lfd, e->id, p);

	return ret;
}

/*
 * Read the link target of @lfd into @link->name, prefixed with a '.',
 * and set @link->len accordingly. With opts.overlayfs set the result is
 * additionally passed through fixup_overlayfs(), which may also update
 * p->mnt_id (hence the cast that drops const from @p).
 *
 * Returns 0 on success, -1 on failure.
 */
int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link)
{
	int len;

	link->name[0] = '.';

	len = read_fd_link(lfd, &link->name[1], sizeof(link->name) - 1);
	if (len < 0) {
		pr_err("Can't read link for pid %d fd %d\n", p->pid, p->fd);
		return -1;
	}

	/* + 1 accounts for the '.' put in front */
	link->len = len + 1;

	if (opts.overlayfs)
		if (fixup_overlayfs((struct fd_parms *)p, link) < 0)
			return -1;
	return 0;
}

/*
 * Gather the generic parameters of a file into @p.
 *
 * @owner_pid: owner of the descriptor
 * @fd:        descriptor number inside the owner
 * @lfd:       our local descriptor for the same file, used for
 *             fstat()/fstatfs()/fcntl()
 * @opts:      per-fd options (fd flags and file owner data)
 *
 * The fown pid/uid fields are only copied when opts->fown.pid is set.
 * Returns 0 on success, -1 on failure.
 */
static int fill_fd_params(struct pid *owner_pid, int fd, int lfd,
				struct fd_opts *opts, struct fd_parms *p)
{
	int ret;
	struct statfs fsbuf;
	struct fdinfo_common fdinfo = { .mnt_id = -1, .owner = owner_pid->ns[0].virt };

	if (fstat(lfd, &p->stat) < 0) {
		pr_perror("Can't stat fd %d", lfd);
		return -1;
	}

	if (fstatfs(lfd, &fsbuf) < 0) {
		pr_perror("Can't statfs fd %d", lfd);
		return -1;
	}

	if (parse_fdinfo_pid(owner_pid->real, fd, FD_TYPES__UND, &fdinfo))
		return -1;

	p->fs_type	= fsbuf.f_type;
	p->fd		= fd;
	p->pos		= fdinfo.pos;
	p->flags	= fdinfo.flags;
	p->mnt_id	= fdinfo.mnt_id;
	p->pid		= owner_pid->real;
	p->fd_flags	= opts->flags;

	fown_entry__init(&p->fown);

	pr_info("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n",
			owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags);

	ret = fcntl(lfd, F_GETSIG, 0);
	if (ret < 0) {
		pr_perror("Can't get owner signum on %d", lfd);
		return -1;
	}
	p->fown.signum = ret;

	/* no owner set on this file -- leave the rest of fown as initialized */
	if (opts->fown.pid == 0)
		return 0;

	p->fown.pid	 = opts->fown.pid;
	p->fown.pid_type = opts->fown.pid_type;
	p->fown.uid	 = opts->fown.uid;
	p->fown.euid	 = opts->fown.euid;

	return 0;
}

static const struct
fdtype_ops *get_misc_dev_ops(int minor) { switch (minor) { case TUN_MINOR: return &tunfile_dump_ops; case AUTOFS_MINOR: return ®file_dump_ops; }; return NULL; } static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) { const struct fdtype_ops *ops = NULL; switch (minor) { case 11: /* * If /dev/kmsg is opened in write-only mode the file position * should not be set up upon restore, kernel doesn't allow that. */ if ((p->flags & O_ACCMODE) == O_WRONLY && p->pos == 0) p->pos = -1ULL; /* * Fallthrough. */ default: ops = ®file_dump_ops; break; }; return ops; } static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) { struct fd_link *link_old = p->link; int maj = major(p->stat.st_rdev); const struct fdtype_ops *ops; struct fd_link link; int err; switch (maj) { case MEM_MAJOR: ops = get_mem_dev_ops(p, minor(p->stat.st_rdev)); break; case MISC_MAJOR: ops = get_misc_dev_ops(minor(p->stat.st_rdev)); if (ops) break; /* fallthrough */ default: { char more[32]; if (is_tty(p->stat.st_rdev, p->stat.st_dev)) { if (fill_fdlink(lfd, p, &link)) return -1; p->link = &link; ops = &tty_dump_ops; break; } sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); err = dump_unsupp_fd(p, lfd, "chr", more, e); p->link = link_old; return err; } } err = do_dump_gen_file(p, lfd, ops, e); p->link = link_old; return err; } static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, struct parasite_ctl *ctl, FdinfoEntry *e) { struct fd_parms p = FD_PARMS_INIT; const struct fdtype_ops *ops; struct fd_link link; if (fill_fd_params(pid, fd, lfd, opts, &p) < 0) { pr_err("Can't get stat on %d\n", fd); return -1; } if (note_file_lock(pid, fd, lfd, &p)) return -1; p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ if (S_ISSOCK(p.stat.st_mode)) return dump_socket(&p, lfd, e); if (S_ISCHR(p.stat.st_mode)) return dump_chrdev(&p, lfd, e); if (p.fs_type == ANON_INODE_FS_MAGIC) { char link[32]; if (read_fd_link(lfd, link, sizeof(link)) < 0) 
return -1; if (is_eventfd_link(link)) ops = &eventfd_dump_ops; else if (is_eventpoll_link(link)) ops = &eventpoll_dump_ops; else if (is_inotify_link(link)) ops = &inotify_dump_ops; else if (is_fanotify_link(link)) ops = &fanotify_dump_ops; else if (is_signalfd_link(link)) ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; else return dump_unsupp_fd(&p, lfd, "anon", link, e); return do_dump_gen_file(&p, lfd, ops, e); } if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; p.link = &link; if (link.name[1] == '/') return do_dump_gen_file(&p, lfd, ®file_dump_ops, e); if (check_ns_proc(&link)) return do_dump_gen_file(&p, lfd, &nsfile_dump_ops, e); return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); } if (S_ISFIFO(p.stat.st_mode)) { if (p.fs_type == PIPEFS_MAGIC) ops = &pipe_dump_ops; else ops = &fifo_dump_ops; return do_dump_gen_file(&p, lfd, ops, e); } /* * For debug purpose -- at least show the link * file pointing to when reporting unsupported file. * On error simply empty string here. 
*/ if (fill_fdlink(lfd, &p, &link)) memzero(&link, sizeof(link)); return dump_unsupp_fd(&p, lfd, "unknown", link.name + 1, e); } int dump_my_file(int lfd, u32 *id, int *type) { struct pid me = {}; struct fd_opts fo = {}; FdinfoEntry e = FDINFO_ENTRY__INIT; me.real = getpid(); me.ns[0].virt = -1; /* FIXME */ if (dump_one_file(&me, lfd, lfd, &fo, NULL, &e)) return -1; *id = e.id; *type = e.type; return 0; } int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) { int *lfds = NULL; struct cr_img *img = NULL; struct fd_opts *opts = NULL; int i, ret = -1; int off, nr_fds = min((int) PARASITE_MAX_FDS, dfds->nr_fds); pr_info("\n"); pr_info("Dumping opened files (pid: %d)\n", item->pid->real); pr_info("----------------------------------------\n"); lfds = xmalloc(nr_fds * sizeof(int)); if (!lfds) goto err; opts = xmalloc(nr_fds * sizeof(struct fd_opts)); if (!opts) goto err; img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); if (!img) goto err; ret = 0; /* Don't fail if nr_fds == 0 */ for (off = 0; off < dfds->nr_fds; off += nr_fds) { if (nr_fds + off > dfds->nr_fds) nr_fds = dfds->nr_fds - off; ret = parasite_drain_fds_seized(ctl, dfds, nr_fds, off, lfds, opts); if (ret) goto err; for (i = 0; i < nr_fds; i++) { FdinfoEntry e = FDINFO_ENTRY__INIT; ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e); close(lfds[i]); if (ret) break; ret = pb_write_one(img, &e, PB_FDINFO); if (ret) break; } } pr_info("----------------------------------------\n"); err: if (img) close_image(img); xfree(opts); xfree(lfds); return ret; } static int predump_one_fd(int pid, int fd) { const struct fdtype_ops *ops; char link[PATH_MAX], t[32]; int ret = 0; snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd); ret = readlink(t, link, sizeof(link)); if (ret < 0) { pr_perror("Can't read link of fd %d", fd); return -1; } else if ((size_t)ret == sizeof(link)) { pr_err("Buffer for read link of fd %d is too small\n", 
fd); return -1; } link[ret] = 0; ret = 0; if (is_inotify_link(link)) ops = &inotify_dump_ops; else if (is_fanotify_link(link)) ops = &fanotify_dump_ops; else goto out; pr_debug("Pre-dumping %d's %d fd\n", pid, fd); ret = ops->pre_dump(pid, fd); out: return ret; } int predump_task_files(int pid) { struct dirent *de; DIR *fd_dir; int ret = -1; pr_info("Pre-dump fds for %d)\n", pid); fd_dir = opendir_proc(pid, "fd"); if (!fd_dir) return -1; while ((de = readdir(fd_dir))) { if (dir_dots(de)) continue; if (predump_one_fd(pid, atoi(de->d_name))) goto out; } ret = 0; out: closedir(fd_dir); return ret; } int restore_fown(int fd, FownEntry *fown) { struct f_owner_ex owner; uid_t uids[3]; if (fown->signum) { if (fcntl(fd, F_SETSIG, fown->signum)) { pr_perror("Can't set signal"); return -1; } } /* May be untouched */ if (!fown->pid) return 0; if (getresuid(&uids[0], &uids[1], &uids[2])) { pr_perror("Can't get current UIDs"); return -1; } if (setresuid(fown->uid, fown->euid, uids[2])) { pr_perror("Can't set UIDs"); return -1; } owner.type = fown->pid_type; owner.pid = fown->pid; if (fcntl(fd, F_SETOWN_EX, &owner)) { pr_perror("Can't setup %d file owner pid", fd); return -1; } if (setresuid(uids[0], uids[1], uids[2])) { pr_perror("Can't revert UIDs back"); return -1; } if (prctl(PR_SET_DUMPABLE, 1, 0)) pr_perror("Unable to set PR_SET_DUMPABLE"); return 0; } int rst_file_params(int fd, FownEntry *fown, int flags) { if (set_fd_flags(fd, flags) < 0) return -1; if (restore_fown(fd, fown) < 0) return -1; return 0; } static struct fdinfo_list_entry *alloc_fle(int pid, FdinfoEntry *fe) { struct fdinfo_list_entry *fle; fle = shmalloc(sizeof(*fle)); if (!fle) return NULL; fle->pid = pid; fle->fe = fe; fle->received = 0; fle->fake = 0; fle->stage = FLE_INITIALIZED; fle->task = pstree_item_by_virt(pid); if (!fle->task) { pr_err("Can't find task with pid %d\n", pid); shfree_last(fle); return NULL; } return fle; } static void collect_desc_fle(struct fdinfo_list_entry *new_le, struct 
file_desc *fdesc) { struct fdinfo_list_entry *le; new_le->desc = fdesc; list_for_each_entry(le, &fdesc->fd_info_head, desc_list) if (pid_rst_prio(new_le->pid, le->pid)) break; list_add_tail(&new_le->desc_list, &le->desc_list); } struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info *rst_info, struct file_desc *fdesc, bool fake) { struct fdinfo_list_entry *new_le; new_le = alloc_fle(pid, e); if (new_le) { new_le->fake = (!!fake); collect_desc_fle(new_le, fdesc); collect_task_fd(new_le, rst_info); } return new_le; } int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) { struct file_desc *fdesc; pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", pid, e->fd, e->id); fdesc = find_file_desc(e); if (fdesc == NULL) { pr_err("No file for fd %d id %#x\n", e->fd, e->id); return -1; } if (!collect_fd_to(pid, e, rst_info, fdesc, fake)) return -1; return 0; } FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags) { FdinfoEntry *e; e = shmalloc(sizeof(*e)); if (!e) return NULL; fdinfo_entry__init(e); e->id = old->id; e->type = old->type; e->fd = fd; e->flags = flags; return e; } int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, int fd, unsigned flags) { FdinfoEntry *e; e = dup_fdinfo(ple->fe, fd, flags); if (!e) return -1; return collect_fd(vpid(task), e, rsti(task), false); } int prepare_ctl_tty(int pid, struct rst_info *rst_info, u32 ctl_tty_id) { FdinfoEntry *e; if (!ctl_tty_id) return 0; pr_info("Requesting for ctl tty %#x into service fd\n", ctl_tty_id); e = xmalloc(sizeof(*e)); if (!e) return -1; fdinfo_entry__init(e); e->id = ctl_tty_id; e->fd = reserve_service_fd(CTL_TTY_OFF); e->type = FD_TYPES__TTY; if (collect_fd(pid, e, rst_info, false)) { xfree(e); return -1; } return 0; } int prepare_fd_pid(struct pstree_item *item) { int ret = 0; struct cr_img *img; pid_t pid = vpid(item); struct rst_info *rst_info = rsti(item); INIT_LIST_HEAD(&rst_info->fds); if (item->ids == NULL) /* zombie */ return 
0; if (rsti(item)->fdt && rsti(item)->fdt->pid != vpid(item)) return 0; img = open_image(CR_FD_FDINFO, O_RSTR, item->ids->files_id); if (!img) return -1; while (1) { FdinfoEntry *e; ret = pb_read_one_eof(img, &e, PB_FDINFO); if (ret <= 0) break; if (e->fd >= service_fd_min_fd()) { ret = -1; pr_err("Too big FD number to restore %d\n", e->fd); break; } ret = collect_fd(pid, e, rst_info, false); if (ret < 0) { fdinfo_entry__free_unpacked(e, NULL); break; } } close_image(img); return ret; } #define SETFL_MASK (O_APPEND | O_ASYNC | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) int set_fd_flags(int fd, int flags) { int ret; ret = fcntl(fd, F_GETFL, 0); if (ret < 0) goto err; flags = (SETFL_MASK & flags) | (ret & ~SETFL_MASK); ret = fcntl(fd, F_SETFL, flags); if (ret < 0) goto err; /* Let's check, that now actual flags contains those we need */ ret = fcntl(fd, F_GETFL, 0); if (ret < 0) goto err; if (ret != flags) { pr_err("fcntl call on fd %d (flags %#o) succeeded, " "but some flags were dropped: %#o\n", fd, flags, ret); return -1; } return 0; err: pr_perror("fcntl call on fd %d (flags %x) failed", fd, flags); return -1; } struct fd_open_state { char *name; int (*cb)(int, struct fdinfo_list_entry *); }; static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d", pid); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } static bool task_fle(struct pstree_item *task, struct fdinfo_list_entry *fle) { struct fdinfo_list_entry *tmp; list_for_each_entry(tmp, &rsti(task)->fds, ps_list) if (fle == tmp) return true; return false; } static int plant_fd(struct fdinfo_list_entry *fle, int fd) { BUG_ON(fle->received); fle->received = 1; return reopen_fd_as(fle->fe->fd, fd); } static int recv_fd_from_peer(struct fdinfo_list_entry *fle) { struct fdinfo_list_entry *tmp; int fd, ret, tsock; if (fle->received) return 0; tsock = 
get_service_fd(TRANSPORT_FD_OFF);
	do {
		/* each message carries an fd plus the fle pointer it is meant for */
		ret = __recv_fds(tsock, &fd, 1, (void *)&tmp, sizeof(struct fdinfo_list_entry *), MSG_DONTWAIT);
		if (ret == -EAGAIN || ret == -EWOULDBLOCK)
			return 1;
		else if (ret)
			return -1;

		pr_info("Further fle=%p, pid=%d\n", tmp, fle->pid);
		if (!task_fle(current, tmp)) {
			pr_err("Unexpected fle %p, pid=%d\n", tmp, vpid(current));
			return -1;
		}
		if (plant_fd(tmp, fd))
			return -1;
	} while (tmp != fle);

	return 0;
}

/*
 * Send @fd, together with the @fle pointer as payload, to the transport
 * socket of the task owning @fle and raise that task's fds event.
 * Returns 0 on success, -1 if sending fails.
 */
static int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle)
{
	struct sockaddr_un saddr;
	int len, sock, ret;

	sock = get_service_fd(TRANSPORT_FD_OFF);

	transport_name_gen(&saddr, &len, fle->pid);
	/* sun_path + 1 skips the NUL that transport_name_gen() put first */
	pr_info("\t\tSend fd %d to %s\n", fd, saddr.sun_path + 1);
	ret = send_fds(sock, &saddr, len, &fd, 1, (void *)&fle, sizeof(struct fdinfo_list_entry *));
	if (ret < 0)
		return -1;
	return set_fds_event(fle->pid);
}

/*
 * Helpers to scatter file_desc across users for those files, that
 * create two descriptors from a single system call at once (e.g.
 * ... or better i.e. -- pipes, socketpairs and ttys)
 */

/*
 * Receive the descriptor meant for the master entry of @d. The number
 * the master wants is stored in @fd before receiving; the return value
 * is that of recv_fd_from_peer() (0 -- done, 1 -- try again, -1 -- error).
 */
int recv_desc_from_peer(struct file_desc *d, int *fd)
{
	struct fdinfo_list_entry *fle;

	fle = file_master(d);
	*fd = fle->fe->fd;
	return recv_fd_from_peer(fle);
}

/* Send @fd to the task that owns the master entry of @d */
int send_desc_to_peer(int fd, struct file_desc *d)
{
	return send_fd_to_peer(fd, file_master(d));
}

/*
 * Install @fd at the number @fle asks for inside the current task:
 * dup2() it there, apply the descriptor flags from the image and mark
 * @fle as received. Nothing is done when @fd already is that number.
 * An inherit fd occupying the target number is moved away first.
 *
 * Returns 0 on success, -1 on failure.
 */
static int send_fd_to_self(int fd, struct fdinfo_list_entry *fle)
{
	int dfd = fle->fe->fd;

	if (fd == dfd)
		return 0;

	/* make sure we won't clash with an inherit fd */
	if (inherit_fd_resolve_clash(dfd) < 0)
		return -1;

	BUG_ON(dfd == get_service_fd(TRANSPORT_FD_OFF));

	pr_info("\t\t\tGoing to dup %d into %d\n", fd, dfd);
	if (dup2(fd, dfd) != dfd) {
		pr_perror("Can't dup local fd %d -> %d", fd, dfd);
		return -1;
	}

	if (fcntl(dfd, F_SETFD, fle->fe->flags) == -1) {
		pr_perror("Unable to set file descriptor flags");
		return -1;
	}

	fle->received = 1;

	return 0;
}

/*
 * Hand the opened descriptor @fd out to every user of description @d:
 * users living in task @pid get it locally, the rest over the
 * transport socket.
 */
static int serve_out_fd(int pid, int fd, struct file_desc *d)
{
	int ret;
	struct fdinfo_list_entry *fle;

	pr_info("\t\tCreate fd for %d\n", fd);
list_for_each_entry(fle, &d->fd_info_head, desc_list) {
		if (pid == fle->pid)
			ret = send_fd_to_self(fd, fle);
		else
			ret = send_fd_to_peer(fd, fle);

		if (ret) {
			pr_err("Can't sent fd %d to %d\n", fd, fle->pid);
			goto out;
		}
	}

	/* also covers a description with no users at all */
	ret = 0;
out:
	return ret;
}

/*
 * Called once the master entry @fle got its file opened as @new_fd:
 * move the descriptor to the number recorded in the image, apply the
 * descriptor flags, switch the entry to FLE_OPEN and serve the
 * descriptor out to all users of the description.
 *
 * The entry must still be in the FLE_INITIALIZED stage.
 * Returns 0 on success, -1 on failure.
 */
static int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd)
{
	struct file_desc *d = fle->desc;
	pid_t pid = fle->pid;

	if (reopen_fd_as(fle->fe->fd, new_fd))
		return -1;

	if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) {
		pr_perror("Unable to set file descriptor flags");
		return -1;
	}

	BUG_ON(fle->stage != FLE_INITIALIZED);
	fle->stage = FLE_OPEN;

	if (serve_out_fd(pid, fle->fe->fd, d))
		return -1;
	return 0;
}

/*
 * Make one attempt to restore the descriptor described by @fle.
 *
 * A non-master entry only receives its descriptor from the master; the
 * master entry calls the ->open method of the description. Returns 0
 * when the entry is restored, 1 when the call has to be repeated later
 * and -1 on failure.
 */
static int open_fd(struct fdinfo_list_entry *fle)
{
	struct file_desc *d = fle->desc;
	struct fdinfo_list_entry *flem;
	int new_fd = -1, ret;

	flem = file_master(d);
	if (fle != flem) {
		BUG_ON (fle->stage != FLE_INITIALIZED);
		ret = receive_fd(fle);
		if (ret != 0)
			return ret;

		goto fixup_ctty;
	}

	/*
	 * Open method returns the following values:
	 * 0  -- restore is successfully finished;
	 * 1  -- restore is in process or can't be started
	 *       yet, because of it depends on another fles,
	 *       so the method should be called once again;
	 * -1 -- restore failed.
	 * In case of 0 and 1 return values, new_fd may
	 * be not negative. In this case it contains newly
	 * opened file descriptor, which may be served out.
	 * For every fle, new_fd is populated only once.
	 * See setup_and_serve_out() BUG_ON for the details.
*/
	ret = d->ops->open(d, &new_fd);
	if (ret != -1 && new_fd >= 0) {
		if (setup_and_serve_out(fle, new_fd) < 0)
			return -1;
	}
fixup_ctty:
	if (ret == 0) {
		/* the entry sitting in the CTL_TTY_OFF service fd is the ctty request */
		if (fle->fe->fd == get_service_fd(CTL_TTY_OFF)) {
			ret = tty_restore_ctl_terminal(fle->desc, fle->fe->fd);
			if (ret == -1)
				return ret;
		}

		fle->stage = FLE_RESTORED;
	}

	return ret;
}

/*
 * Get the descriptor for the non-master entry @fle from the transport
 * socket and apply the descriptor flags from the image.
 *
 * Returns 0 on success, 1 when the descriptor has not arrived yet and
 * -1 on error.
 */
static int receive_fd(struct fdinfo_list_entry *fle)
{
	int ret;

	pr_info("\tReceive fd for %d\n", fle->fe->fd);

	ret = recv_fd_from_peer(fle);
	if (ret != 0) {
		/* 1 only means "not yet", so it is not reported as an error */
		if (ret != 1)
			pr_err("Can't get fd=%d, pid=%d\n", fle->fe->fd, fle->pid);
		return ret;
	}

	if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) {
		pr_perror("Unable to set file descriptor flags");
		return -1;
	}

	return 0;
}

/* Close the descriptor of every entry on @list; close() errors are ignored */
static void close_fdinfos(struct list_head *list)
{
	struct fdinfo_list_entry *fle;

	list_for_each_entry(fle, list, ps_list)
		close(fle->fe->fd);
}

/*
 * Restore all descriptors of task @me.
 *
 * open_fd() is called for every entry still on the task's fds list
 * until all of them are restored. Restored entries are moved to the
 * local "completed" or "fake" list. When a full pass makes no progress
 * but some entries asked to be called again, the task sleeps in
 * wait_fds_event() before the next pass.
 *
 * On exit both local lists are spliced back into the task's list; the
 * descriptors of fake entries are closed first on the success path.
 * Returns 0 on success, -1 on failure.
 */
static int open_fdinfos(struct pstree_item *me)
{
	struct list_head *list = &rsti(me)->fds;
	struct fdinfo_list_entry *fle, *tmp;
	LIST_HEAD(completed);
	LIST_HEAD(fake);
	bool progress, again;
	int st, ret = 0;

	do {
		progress = again = false;
		/* cleared before the pass, so events raised during it are not lost */
		clear_fds_event();

		list_for_each_entry_safe(fle, tmp, list, ps_list) {
			st = fle->stage;
			BUG_ON(st == FLE_RESTORED);
			ret = open_fd(fle);
			if (ret == -1) {
				pr_err("Unable to open fd=%d id=%#x\n",
					fle->fe->fd, fle->fe->id);
				goto splice;
			}
			/* a stage change counts as progress even if not finished */
			if (st != fle->stage || ret == 0)
				progress = true;
			if (ret == 0) {
				/*
				 * We delete restored items from fds list,
				 * so open() methods may base on this feature
				 * and reduce number of fles in their checks.
				 */
				list_del(&fle->ps_list);
				if (!fle->fake)
					list_add(&fle->ps_list, &completed);
				else
					list_add(&fle->ps_list, &fake);
			}
			if (ret == 1)
			       again = true;
		}
		if (!progress && again)
			wait_fds_event();
	} while (again || progress);

	BUG_ON(!list_empty(list));
	/*
	 * Fake fles may be used for restore other
	 * file types, so their closing is delayed.
*/ close_fdinfos(&fake); splice: list_splice(&fake, list); list_splice(&completed, list); return ret; } static struct inherit_fd *inherit_fd_lookup_fd(int fd, const char *caller); int close_old_fds(void) { DIR *dir; struct dirent *de; int fd, ret; dir = opendir_proc(PROC_SELF, "fd"); if (dir == NULL) return -1; while ((de = readdir(dir))) { if (dir_dots(de)) continue; ret = sscanf(de->d_name, "%d", &fd); if (ret != 1) { pr_err("Can't parse %s\n", de->d_name); return -1; } if ((!is_any_service_fd(fd)) && (dirfd(dir) != fd) && !inherit_fd_lookup_fd(fd, __FUNCTION__)) close_safe(&fd); } closedir(dir); close_pid_proc(); return 0; } int prepare_fds(struct pstree_item *me) { u32 ret = 0; pr_info("Opening fdinfo-s\n"); /* * This must be done after forking to allow child * to get the cgroup fd so it can move into the * correct /tasks file if it is in a different cgroup * set than its parent */ close_service_fd(CGROUP_YARD); close_pid_proc(); /* flush any proc cached fds we may have */ if (rsti(me)->fdt) { struct fdt *fdt = rsti(me)->fdt; /* * Wait all tasks, who share a current fd table. * We should be sure, that nobody use any file * descriptor while fdtable is being restored. */ futex_inc_and_wake(&fdt->fdt_lock); futex_wait_while_lt(&fdt->fdt_lock, fdt->nr); if (fdt->pid != vpid(me)) { pr_info("File descriptor table is shared with %d\n", fdt->pid); futex_wait_until(&fdt->fdt_lock, fdt->nr + 1); goto out; } } ret = open_fdinfos(me); close_service_fd(TRANSPORT_FD_OFF); if (rsti(me)->fdt) futex_inc_and_wake(&rsti(me)->fdt->fdt_lock); out: close_service_fd(CR_PROC_FD_OFF); tty_fini_fds(); return ret; } static int fchroot(int fd) { /* * There's no such thing in syscalls. 
We can emulate * it using fchdir() */ if (fchdir(fd) < 0) { pr_perror("Can't chdir to proc"); return -1; } pr_debug("Going to chroot into /proc/self/fd/%d\n", fd); return chroot("."); } int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); /* * First -- open both descriptors. We will not * be able to open the cwd one after we chroot. */ dd_root = open_reg_fd(ri->root); if (dd_root < 0) { pr_err("Can't open root\n"); goto out; } dd_cwd = open_reg_fd(ri->cwd); if (dd_cwd < 0) { pr_err("Can't open cwd\n"); goto out; } /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. */ ret = fchroot(dd_root); if (ret < 0) { pr_perror("Can't change root"); goto out; } ret = fchdir(dd_cwd); if (ret < 0) { pr_perror("Can't change cwd"); goto out; } if (ri->has_umask) { pr_info("Restoring umask to %o\n", ri->umask); umask(ri->umask); } err = 0; out: if (dd_cwd >= 0) close(dd_cwd); if (dd_root >= 0) close(dd_root); return err; } int prepare_fs_pid(struct pstree_item *item) { pid_t pid = vpid(item); struct rst_info *ri = rsti(item); struct cr_img *img; FsEntry *fe; int ret = -1; img = open_image(CR_FD_FS, O_RSTR, pid); if (!img) goto out; ret = pb_read_one_eof(img, &fe, PB_FS); close_image(img); if (ret <= 0) goto out; ri->cwd = collect_special_file(fe->cwd_id); if (!ri->cwd) { pr_err("Can't find task cwd file\n"); goto out_f; } ri->root = collect_special_file(fe->root_id); if (!ri->root) { pr_err("Can't find task root file\n"); goto out_f; } ri->has_umask = fe->has_umask; ri->umask = fe->umask; ret = 0; out_f: fs_entry__free_unpacked(fe, NULL); out: return ret; } int shared_fdt_prepare(struct pstree_item *item) { struct pstree_item *parent = item->parent; struct fdt *fdt; if (!rsti(parent)->fdt) { fdt = shmalloc(sizeof(*rsti(item)->fdt)); if (fdt == NULL) return -1; rsti(parent)->fdt = fdt; futex_init(&fdt->fdt_lock); fdt->nr = 1; fdt->pid = vpid(parent); 
} else
		fdt = rsti(parent)->fdt;

	rsti(item)->fdt = fdt;
	/* users are numbered in the order they join the table */
	rsti(item)->service_fd_id = fdt->nr;
	fdt->nr++;
	/* fdt->pid ends up naming the sharer that pid_rst_prio() prefers */
	if (pid_rst_prio(vpid(item), fdt->pid))
		fdt->pid = vpid(item);

	return 0;
}

/*
 * Inherit fd support.
 *
 * There are cases where a process's file descriptor cannot be restored
 * from the checkpointed image.  For example, a pipe file descriptor with
 * one end in the checkpointed process and the other end in a separate
 * process (that was not part of the checkpointed process tree) cannot be
 * restored because after checkpoint the pipe would be broken and removed.
 *
 * There are also cases where the user wants to use a new file during
 * restore instead of the original file in the checkpointed image.  For
 * example, the user wants to change the log file of a process from
 * /path/to/oldlog to /path/to/newlog.
 *
 * In these cases, criu's caller should set up a new file descriptor to be
 * inherited by the restored process and specify it with the --inherit-fd
 * command line option.  The argument of --inherit-fd has the format
 * fd[%d]:%s, where %d tells criu which of its own file descriptor to use
 * for restoring file identified by %s.
 *
 * As a debugging aid, if the argument has the format debug[%d]:%s, it tells
 * criu to write out the string after colon to the file descriptor %d.  This
 * can be used to leave a "restore marker" in the output stream of the process.
 *
 * It's important to note that inherit fd support breaks applications
 * that depend on the state of the file descriptor being inherited.  So,
 * consider inherit fd only for specific use cases that you know for sure
 * won't break the application.
 *
 * For examples please visit http://criu.org/Category:HOWTO.
 */

/*
 * One --inherit-fd request. The inh_dev/inh_ino/inh_mode/inh_rdev
 * fields hold the fstat() data of the descriptor taken when the request
 * was added; they are compared against a fresh fstat() later to detect
 * that the descriptor number has been reused for another file.
 */
struct inherit_fd {
	struct list_head inh_list;
	char *inh_id;		/* file identifier */
	int inh_fd;		/* criu's descriptor to inherit */
	dev_t inh_dev;
	ino_t inh_ino;
	mode_t inh_mode;
	dev_t inh_rdev;
};

/*
 * Return 1 if inherit fd has been closed or reused, 0 otherwise,
 * or -1 if the fd can't be fstat()-ed for a reason other than EBADF.
*
 * Some parts of the file restore engine can close an inherit fd
 * explicitly by close() or implicitly by dup2() to reuse that descriptor.
 * In some specific functions (for example, send_fd_to_self()), we
 * check for clashes at the beginning of the function and, therefore,
 * these specific functions will not reuse an inherit fd.  However, to
 * avoid adding a ton of clash detect and resolve code everywhere we close()
 * and/or dup2(), we just make sure that when we're dup()ing or close()ing
 * our inherit fd we're still dealing with the same fd that we inherited.
 */
static int inherit_fd_reused(struct inherit_fd *inh)
{
	struct stat sbuf;

	if (fstat(inh->inh_fd, &sbuf) == -1) {
		/* EBADF means the descriptor number is not open any more */
		if (errno == EBADF) {
			pr_debug("Inherit fd %s -> %d has been closed\n",
				inh->inh_id, inh->inh_fd);
			return 1;
		}
		pr_perror("Can't fstat inherit fd %d", inh->inh_fd);
		return -1;
	}

	/* any difference from the recorded identity means another file */
	if (inh->inh_dev != sbuf.st_dev || inh->inh_ino != sbuf.st_ino ||
	    inh->inh_mode != sbuf.st_mode || inh->inh_rdev != sbuf.st_rdev) {
		pr_info("Inherit fd %s -> %d has been reused\n",
			inh->inh_id, inh->inh_fd);
		return 1;
	}
	return 0;
}

/*
 * We can't print diagnostics messages in this function because the
 * log file isn't initialized yet.
 *
 * Parses one --inherit-fd argument of the form fd[N]:ID or debug[N]:TEXT.
 * Returns 0 on success, -1 on a malformed argument or a failed action.
 */
int inherit_fd_parse(char *optarg)
{
	char *cp = NULL;
	int n = -1;
	int fd = -1;
	int dbg = 0;

	/*
	 * Parse the argument.
	 */
	if (!strncmp(optarg, "fd", 2))
		cp = &optarg[2];
	else if (!strncmp(optarg, "debug", 5)) {
		cp = &optarg[5];
		dbg = 1;
	}
	if (cp) {
		n = sscanf(cp, "[%d]:", &fd);
		/* from here on cp points at the ':' (or is NULL without one) */
		cp = strchr(optarg, ':');
	}
	/* reject: no number, negative fd, no ':' or nothing after the ':' */
	if (n != 1 || fd < 0 || !cp || !cp[1]) {
		pr_err("Invalid inherit fd argument: %s\n", optarg);
		return -1;
	}

	/*
	 * If the argument is a debug string, write it to fd.
	 * Otherwise, add it to the inherit fd list.
*/ cp++; if (dbg) { n = strlen(cp); if (write(fd, cp, n) != n) { pr_err("Can't write debug message %s to inherit fd %d\n", cp, fd); return -1; } return 0; } return inherit_fd_add(fd, cp); } int inherit_fd_add(int fd, char *key) { struct inherit_fd *inh; struct stat sbuf; if (fstat(fd, &sbuf) == -1) { pr_perror("Can't fstat inherit fd %d", fd); return -1; } inh = xmalloc(sizeof *inh); if (inh == NULL) return -1; inh->inh_id = key; inh->inh_fd = fd; inh->inh_dev = sbuf.st_dev; inh->inh_ino = sbuf.st_ino; inh->inh_mode = sbuf.st_mode; inh->inh_rdev = sbuf.st_rdev; list_add_tail(&inh->inh_list, &opts.inherit_fds); return 0; } /* * Log the inherit fd list. Called for diagnostics purposes * after the log file is initialized. */ void inherit_fd_log(void) { struct inherit_fd *inh; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { pr_info("File %s will be restored from inherit fd %d\n", inh->inh_id, inh->inh_fd); } } /* * Look up the inherit fd list by a file identifier. */ int inherit_fd_lookup_id(char *id) { int ret; struct inherit_fd *inh; ret = -1; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { if (!strcmp(inh->inh_id, id)) { if (!inherit_fd_reused(inh)) { ret = inh->inh_fd; pr_debug("Found id %s (fd %d) in inherit fd list\n", id, ret); } break; } } return ret; } bool inherited_fd(struct file_desc *d, int *fd_p) { char buf[32], *id_str; int i_fd; if (!d->ops->name) return false; id_str = d->ops->name(d, buf, sizeof(buf)); i_fd = inherit_fd_lookup_id(id_str); if (i_fd < 0) return false; if (fd_p == NULL) return true; *fd_p = dup(i_fd); if (*fd_p < 0) pr_perror("Inherit fd DUP failed"); else pr_info("File %s will be restored from fd %d dumped " "from inherit fd %d\n", id_str, *fd_p, i_fd); return true; } /* * Look up the inherit fd list by a file descriptor. 
*/ static struct inherit_fd *inherit_fd_lookup_fd(int fd, const char *caller) { struct inherit_fd *ret; struct inherit_fd *inh; ret = NULL; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { if (inh->inh_fd == fd) { if (!inherit_fd_reused(inh)) { ret = inh; pr_debug("Found fd %d (id %s) in inherit fd list (caller %s)\n", fd, inh->inh_id, caller); } break; } } return ret; } /* * If the specified fd clashes with an inherit fd, * move the inherit fd. */ int inherit_fd_resolve_clash(int fd) { int newfd; struct inherit_fd *inh; inh = inherit_fd_lookup_fd(fd, __FUNCTION__); if (inh == NULL) return 0; newfd = dup(fd); if (newfd == -1) { pr_perror("Can't dup inherit fd %d", fd); return -1; } if (close(fd) == -1) { close(newfd); pr_perror("Can't close inherit fd %d", fd); return -1; } inh->inh_fd = newfd; pr_debug("Inherit fd %d moved to %d to resolve clash\n", fd, inh->inh_fd); return 0; } /* * Close all inherit fds. */ int inherit_fd_fini() { int reused; struct inherit_fd *inh; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { if (inh->inh_fd < 0) { pr_err("File %s in inherit fd list has invalid fd %d\n", inh->inh_id, inh->inh_fd); return -1; } reused = inherit_fd_reused(inh); if (reused < 0) return -1; if (!reused) { pr_debug("Closing inherit fd %d -> %s\n", inh->inh_fd, inh->inh_id); if (close_safe(&inh->inh_fd) < 0) return -1; } } return 0; } int open_transport_socket(void) { struct fdt *fdt = rsti(current)->fdt; pid_t pid = vpid(current); struct sockaddr_un saddr; int sock, slen; if (!task_alive(current) || (fdt && fdt->pid != pid)) return 0; sock = socket(PF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); if (sock < 0) { pr_perror("Can't create socket"); return -1; } transport_name_gen(&saddr, &slen, pid); if (bind(sock, (struct sockaddr *)&saddr, slen) < 0) { pr_perror("Can't bind transport socket %s", saddr.sun_path + 1); close(sock); return -1; } if (install_service_fd(TRANSPORT_FD_OFF, sock) < 0) { close(sock); return -1; } close(sock); return 0; } static int 
collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base,
		       struct collect_image_info *cinfo)
{
	/* The type-specific payload must carry the same ID as its FileEntry. */
	if (fe->id != id) {
		pr_err("ID mismatch %u != %u\n", fe->id, id);
		return -1;
	}

	return collect_entry(base, cinfo);
}

/*
 * Collect callback for the files image: hand the type-specific part
 * of a FileEntry to the collector registered for that file type.
 * Returns -1 for an unknown type or an ID mismatch, otherwise whatever
 * collect_entry() returns.
 */
static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i)
{
	int ret = 0;
	FileEntry *fe;

	fe = pb_msg(base, FileEntry);
	switch (fe->type) {
	default:
		pr_err("Unknown file type %d\n", fe->type);
		return -1;
	case FD_TYPES__REG:
		/* Was "®_file_cinfo": the "&reg" prefix had been mangled into a character. */
		ret = collect_one_file_entry(fe, fe->reg->id, &fe->reg->base, &reg_file_cinfo);
		break;
	case FD_TYPES__INETSK:
		ret = collect_one_file_entry(fe, fe->isk->id, &fe->isk->base, &inet_sk_cinfo);
		break;
	case FD_TYPES__NS:
		ret = collect_one_file_entry(fe, fe->nsf->id, &fe->nsf->base, &nsfile_cinfo);
		break;
	case FD_TYPES__PACKETSK:
		ret = collect_one_file_entry(fe, fe->psk->id, &fe->psk->base, &packet_sk_cinfo);
		break;
	case FD_TYPES__NETLINKSK:
		ret = collect_one_file_entry(fe, fe->nlsk->id, &fe->nlsk->base, &netlink_sk_cinfo);
		break;
	case FD_TYPES__EVENTFD:
		ret = collect_one_file_entry(fe, fe->efd->id, &fe->efd->base, &eventfd_cinfo);
		break;
	case FD_TYPES__EVENTPOLL:
		ret = collect_one_file_entry(fe, fe->epfd->id, &fe->epfd->base, &epoll_cinfo);
		break;
	case FD_TYPES__SIGNALFD:
		ret = collect_one_file_entry(fe, fe->sgfd->id, &fe->sgfd->base, &signalfd_cinfo);
		break;
	case FD_TYPES__TUNF:
		ret = collect_one_file_entry(fe, fe->tunf->id, &fe->tunf->base, &tunfile_cinfo);
		break;
	case FD_TYPES__TIMERFD:
		ret = collect_one_file_entry(fe, fe->tfd->id, &fe->tfd->base, &timerfd_cinfo);
		break;
	case FD_TYPES__INOTIFY:
		ret = collect_one_file_entry(fe, fe->ify->id, &fe->ify->base, &inotify_cinfo);
		break;
	case FD_TYPES__FANOTIFY:
		ret = collect_one_file_entry(fe, fe->ffy->id, &fe->ffy->base, &fanotify_cinfo);
		break;
	case FD_TYPES__EXT:
		ret = collect_one_file_entry(fe, fe->ext->id, &fe->ext->base, &ext_file_cinfo);
		break;
	case FD_TYPES__UNIXSK:
		ret = collect_one_file_entry(fe, fe->usk->id, &fe->usk->base, &unix_sk_cinfo);
		break;
	case
FD_TYPES__FIFO:
		ret = collect_one_file_entry(fe, fe->fifo->id, &fe->fifo->base, &fifo_cinfo);
		break;
	case FD_TYPES__PIPE:
		ret = collect_one_file_entry(fe, fe->pipe->id, &fe->pipe->base, &pipe_cinfo);
		break;
	case FD_TYPES__TTY:
		ret = collect_one_file_entry(fe, fe->tty->id, &fe->tty->base, &tty_cinfo);
		break;
	}

	return ret;
}

/* Collector description for the files image; entries go through collect_one_file(). */
struct collect_image_info files_cinfo = {
	.fd_type = CR_FD_FILES,
	.pb_type = PB_FILE,
	.priv_size = 0,
	.collect = collect_one_file,
	.flags = COLLECT_NOFREE,
};

/* Initialize the file descriptor hash, then collect the files image. */
int prepare_files(void)
{
	init_fdesc_hash();
	return collect_image(&files_cinfo);
}
criu-3.6/criu/filesystems.c000066400000000000000000000416011317335042600160070ustar00rootroot00000000000000#include #include #include #include #include
#include "config.h"
#include "int.h"
#include "common/compiler.h"
#include "xmalloc.h"
#include "cr_options.h"
#include "filesystems.h"
#include "namespaces.h"
#include "mount.h"
#include "pstree.h"
#include "kerndat.h"
#include "protobuf.h"
#include "autofs.h"
#include "util.h"
#include "fs-magic.h"
#include "tty.h"

#include "images/mnt.pb-c.h"
#include "images/binfmt-misc.pb-c.h"

/*
 * Append @opt to the comma-separated option string of @pm; the comma
 * is omitted when the string is still empty.  Returns 0 on success and
 * -1 when xstrcat() yields NULL.
 */
static int attach_option(struct mount_info *pm, char *opt)
{
	if (pm->options[0] == '\0')
		pm->options = xstrcat(pm->options, "%s", opt);
	else
		pm->options = xstrcat(pm->options, ",%s", opt);
	return pm->options ?
0 : -1; } #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED struct binfmt_misc_info { BinfmtMiscEntry *bme; struct list_head list; }; LIST_HEAD(binfmt_misc_list); static int binfmt_misc_parse_or_collect(struct mount_info *pm) { opts.has_binfmt_misc = true; return 0; } static int binfmt_misc_virtual(struct mount_info *pm) { return kerndat_fs_virtualized(KERNDAT_FS_STAT_BINFMT_MISC, pm->s_dev); } static int parse_binfmt_misc_entry(struct bfd *f, BinfmtMiscEntry *bme) { while (1) { char *str; str = breadline(f); if (IS_ERR(str)) return -1; if (!str) break; if (!strncmp(str, "enabled", 7)) { bme->enabled = true; continue; } if (!strncmp(str, "disabled", 8)) continue; if (!strncmp(str, "offset ", 7)) { if (sscanf(str + 7, "%i", &bme->offset) != 1) return -1; bme->has_offset = true; continue; } #define DUP_EQUAL_AS(key, member) \ if (!strncmp(str, key, strlen(key))) { \ bme->member = xstrdup(str + strlen(key)); \ if (!bme->member) \ return -1; \ continue; \ } DUP_EQUAL_AS("interpreter ", interpreter) DUP_EQUAL_AS("flags: ", flags) DUP_EQUAL_AS("extension .", extension) DUP_EQUAL_AS("magic ", magic) DUP_EQUAL_AS("mask ", mask) #undef DUP_EQUAL_AS pr_perror("binfmt_misc: unsupported feature %s", str); return -1; } return 0; } static int dump_binfmt_misc_entry(int dfd, char *name, struct cr_img *img) { BinfmtMiscEntry bme = BINFMT_MISC_ENTRY__INIT; struct bfd f; int ret = -1; f.fd = openat(dfd, name, O_RDONLY); if (f.fd < 0) { pr_perror("binfmt_misc: can't open %s", name); return -1; } if (bfdopenr(&f)) return -1; if (parse_binfmt_misc_entry(&f, &bme)) goto err; bme.name = name; if (pb_write_one(img, &bme, PB_BINFMT_MISC)) goto err; ret = 0; err: free(bme.interpreter); free(bme.flags); free(bme.extension); free(bme.magic); free(bme.mask); bclose(&f); return ret; } static int binfmt_misc_dump(struct mount_info *pm) { static bool dumped = false; struct cr_img *img = NULL; struct dirent *de; DIR *fdir = NULL; int fd, ret; ret = binfmt_misc_virtual(pm); if (ret <= 0) return ret; if 
(dumped) {
		pr_err("Second binfmt_misc superblock\n");
		return -1;
	}
	dumped = true;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return fd;

	fdir = fdopendir(fd);
	if (fdir == NULL) {
		close(fd);
		return -1;
	}

	ret = -1;
	while ((de = readdir(fdir))) {
		if (dir_dots(de))
			continue;
		/* "register" and "status" are skipped; everything else is dumped as an entry. */
		if (!strcmp(de->d_name, "register"))
			continue;
		if (!strcmp(de->d_name, "status"))
			continue;

		if (!img) {
			/* Create image only if an entry exists, i.e. here */
			img = open_image(CR_FD_BINFMT_MISC, O_DUMP);
			if (!img)
				goto out;
		}

		if (dump_binfmt_misc_entry(fd, de->d_name, img))
			goto out;
	}

	ret = 0;
out:
	if (img)
		close_image(img);
	/* closedir() also releases the descriptor handed to fdopendir(). */
	closedir(fdir);
	return ret;
}

/*
 * Register one entry by writing @buf to "<mp>/register".  If the entry
 * is not enabled, additionally write "0" to "<mp>/<name>".
 * Returns 0 on success, -1 on any open or write failure.
 */
static int write_binfmt_misc_entry(char *mp, char *buf, BinfmtMiscEntry *bme)
{
	int fd, len, ret = -1;
	char path[PATH_MAX+1];

	/* NOTE(review): a truncated path is not detected here. */
	snprintf(path, PATH_MAX, "%s/register", mp);

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		pr_perror("binfmt_misc: can't open %s", path);
		return -1;
	}

	len = strlen(buf);

	if (write(fd, buf, len) != len) {
		pr_perror("binfmt_misc: can't write to %s", path);
		goto close;
	}

	if (!bme->enabled) {
		/* The register file is done with; reuse fd for the entry file. */
		close(fd);
		snprintf(path, PATH_MAX, "%s/%s", mp, bme->name);

		fd = open(path, O_WRONLY);
		if (fd < 0) {
			pr_perror("binfmt_misc: can't open %s", path);
			/* Nothing is open at this point, so skip the close label. */
			goto out;
		}
		if (write(fd, "0", 1) != 1) {
			pr_perror("binfmt_misc: can't write to %s", path);
			goto close;
		}
	}

	ret = 0;
close:
	close(fd);
out:
	return ret;
}

/* Size of the buffer the registration string is built in (including the NUL). */
#define BINFMT_MISC_STR	(1920 + 1)
/*
 * Build the registration string for a magic-type entry into @buf,
 * which binfmt_misc_restore() allocates with BINFMT_MISC_STR bytes.
 */
static int make_bfmtm_magic_str(char *buf, BinfmtMiscEntry *bme)
{
	int i, len;

	/*
	 * Format is ":name:type(M):offset:magic:mask:interpreter:flags".
	 * Magic and mask are special fields. Kernel outputs them as
	 * a sequence of hexadecimal numbers (abc -> 616263), and we
	 * dump them without changes. But for registering a new entry
	 * it expects every byte is prepended with \x, i.e. \x61\x62\x63.
	 */
	/* Upper bound of the output length: each hex pair becomes "\xNN". */
	len = strlen(bme->name) + 3 /* offset < 128 */ + 2 * strlen(bme->magic)
		+ (bme->mask ? 2 * strlen(bme->mask) : 0) + strlen(bme->interpreter)
		+ (bme->flags ?
strlen(bme->flags) : 0) + strlen(":::::::"); if ((len > BINFMT_MISC_STR - 1) || bme->offset > 128) return -1; buf += sprintf(buf, ":%s:M:%d:", bme->name, bme->offset); len = strlen(bme->magic); for (i = 0; i < len; i += 2) buf += sprintf(buf, "\\x%c%c", bme->magic[i], bme->magic[i + 1]); buf += sprintf(buf, ":"); if (bme->mask) { len = strlen(bme->mask); for (i = 0; i < len; i += 2) buf += sprintf(buf, "\\x%c%c", bme->mask[i], bme->mask[i + 1]); } sprintf(buf, ":%s:%s", bme->interpreter, bme->flags ? : "\0"); return 1; } static int binfmt_misc_restore_bme(struct mount_info *mi, BinfmtMiscEntry *bme, char *buf) { int ret; if (!bme->name || !bme->interpreter) goto bad_dump; /* Either magic or extension should be there */ if (bme->magic) { ret = make_bfmtm_magic_str(buf, bme); } else if (bme->extension) { /* :name:E::extension::interpreter:flags */ ret = snprintf(buf, BINFMT_MISC_STR, ":%s:E::%s::%s:%s", bme->name, bme->extension, bme->interpreter, bme->flags ? : "\0"); if (ret >= BINFMT_MISC_STR) /* output truncated */ ret = -1; } else ret = -1; if (ret < 0) goto bad_dump; pr_debug("binfmt_misc_pattern=%s\n", buf); ret = write_binfmt_misc_entry(mi->mountpoint, buf, bme); return ret; bad_dump: pr_perror("binfmt_misc: bad dump"); return -1; } static int binfmt_misc_restore(struct mount_info *mi) { struct cr_img *img; char *buf; int ret = -1; buf = xmalloc(BINFMT_MISC_STR); if (!buf) return -1; if (!list_empty(&binfmt_misc_list)) { struct binfmt_misc_info *bmi; list_for_each_entry(bmi, &binfmt_misc_list, list) { ret = binfmt_misc_restore_bme(mi, bmi->bme, buf); if (ret) break; } goto free_buf; } img = open_image(CR_FD_BINFMT_MISC_OLD, O_RSTR, mi->s_dev); if (!img) { pr_err("Can't open binfmt_misc_old image\n"); goto free_buf; } else if (empty_image(img)) { close_image(img); ret = 0; goto free_buf; } ret = 0; while (ret == 0) { BinfmtMiscEntry *bme; ret = pb_read_one_eof(img, &bme, PB_BINFMT_MISC); if (ret <= 0) break; ret = binfmt_misc_restore_bme(mi, bme, buf); 
binfmt_misc_entry__free_unpacked(bme, NULL);
	}

	close_image(img);
free_buf:
	free(buf);
	return ret;
}

/*
 * Collect callback: @o is the private area sized by
 * binfmt_misc_cinfo.priv_size; it is linked into binfmt_misc_list
 * together with the decoded message.
 */
static int collect_one_binfmt_misc_entry(void *o, ProtobufCMessage *msg, struct cr_img *img)
{
	struct binfmt_misc_info *bmi = o;

	bmi->bme = pb_msg(msg, BinfmtMiscEntry);
	list_add_tail(&bmi->list, &binfmt_misc_list);

	return 0;
}

/* Collector description for the binfmt_misc image. */
struct collect_image_info binfmt_misc_cinfo = {
	.fd_type = CR_FD_BINFMT_MISC,
	.pb_type = PB_BINFMT_MISC,
	.priv_size = sizeof(struct binfmt_misc_info),
	.collect = collect_one_binfmt_misc_entry,
};

/* Collect the binfmt_misc image into binfmt_misc_list. */
int collect_binfmt_misc(void)
{
	return collect_image(&binfmt_misc_cinfo);
}
#else
/* Without CONFIG_BINFMT_MISC_VIRTUALIZED the fstype hooks are simply absent. */
#define binfmt_misc_dump	NULL
#define binfmt_misc_restore	NULL
#define binfmt_misc_parse_or_collect	NULL
#endif

/*
 * Dump the contents of a tmpfs mount by running tar against
 * /proc/self/fd/<fd> of the opened mountpoint, with its output going
 * to the CR_FD_TMPFS_DEV image.
 */
static int tmpfs_dump(struct mount_info *pm)
{
	int ret = -1, fd = -1, userns_pid = -1;
	char tmpfs_path[PSFDS];
	struct cr_img *img;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return fd;

	/* if fd happens to be 0 here, we need to move it to something
	 * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
	 * interested in passing stdin to tar.
*/ if (move_fd_from(&fd, STDIN_FILENO) < 0) goto out; if (fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) & ~FD_CLOEXEC) == -1) { pr_perror("Can not drop FD_CLOEXEC"); goto out; } img = open_image(CR_FD_TMPFS_DEV, O_DUMP, pm->s_dev); if (!img) goto out; sprintf(tmpfs_path, "/proc/self/fd/%d", fd); if (root_ns_mask & CLONE_NEWUSER) userns_pid = root_item->pid->real; ret = cr_system_userns(-1, img_raw_fd(img), -1, "tar", (char *[]) { "tar", "--create", "--gzip", "--no-unquote", "--no-wildcards", "--one-file-system", "--check-links", "--preserve-permissions", "--sparse", "--numeric-owner", "--directory", tmpfs_path, ".", NULL }, 0, userns_pid); if (ret) pr_err("Can't dump tmpfs content\n"); close_image(img); out: close_safe(&fd); return ret; } static int tmpfs_restore(struct mount_info *pm) { int ret; struct cr_img *img; img = open_image(CR_FD_TMPFS_DEV, O_RSTR, pm->s_dev); if (empty_image(img)) { close_image(img); img = open_image(CR_FD_TMPFS_IMG, O_RSTR, pm->mnt_id); } if (!img) return -1; if (empty_image(img)) { close_image(img); return -1; } ret = cr_system(img_raw_fd(img), -1, -1, "tar", (char *[]) {"tar", "--extract", "--gzip", "--no-unquote", "--no-wildcards", "--directory", pm->mountpoint, NULL}, 0); close_image(img); if (ret) { pr_err("Can't restore tmpfs content\n"); return -1; } return 0; } /* * Virtualized devtmpfs on any side (dump or restore) * means, that we should try to handle it as a plain * tmpfs. * * Interesting case -- shared on dump and virtual on * restore -- will fail, since no tarball with the fs * contents will be found. 
*/ static int devtmpfs_virtual(struct mount_info *pm) { return kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVTMPFS, pm->s_dev); } static int devtmpfs_dump(struct mount_info *pm) { int ret; ret = devtmpfs_virtual(pm); if (ret == 1) ret = tmpfs_dump(pm); return ret; } static int devtmpfs_restore(struct mount_info *pm) { int ret; ret = devtmpfs_virtual(pm); if (ret == 1) ret = tmpfs_restore(pm); return ret; } /* Is it mounted w or w/o the newinstance option */ static int devpts_parse(struct mount_info *pm) { int ret; ret = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVPTS, pm->s_dev); if (ret <= 0) return ret; /* * Kernel hides this option, but if the fs instance * is new (virtualized) we know that it was created * with -o newinstance. */ return attach_option(pm, "newinstance"); } static int fusectl_dump(struct mount_info *pm) { int fd, ret = -1; struct dirent *de; DIR *fdir = NULL; fd = open_mountpoint(pm); if (fd < 0) return fd; fdir = fdopendir(fd); if (fdir == NULL) { close(fd); return -1; } while ((de = readdir(fdir))) { int id; struct mount_info *it; if (dir_dots(de)) continue; if (sscanf(de->d_name, "%d", &id) != 1) { pr_err("wrong number of items scanned in fusectl dump\n"); goto out; } for (it = mntinfo; it; it = it->next) { if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && !it->external) { pr_err("%s is a fuse mount but not external\n", it->mountpoint); goto out; } } } ret = 0; out: closedir(fdir); return ret; } static int debugfs_parse(struct mount_info *pm) { /* tracefs is automounted underneath debugfs sometimes, and the * kernel's overmounting protection prevents us from mounting debugfs * first without tracefs, so let's always mount debugfs MS_REC. 
*/ pm->flags |= MS_REC; return 0; } static int tracefs_parse(struct mount_info *pm) { return 1; } static bool cgroup_sb_equal(struct mount_info *a, struct mount_info *b) { if (a->private && b->private && strcmp(a->private, b->private)) return false; if (strcmp(a->options, b->options)) return false; return true; } static int cgroup_parse(struct mount_info *pm) { if (!(root_ns_mask & CLONE_NEWCGROUP)) return 0; /* cgroup namespaced mounts don't look rooted to CRIU, so let's fake it * here. */ pm->private = pm->root; pm->root = xstrdup("/"); if (!pm->root) return -1; return 0; } static bool btrfs_sb_equal(struct mount_info *a, struct mount_info *b) { /* There is a btrfs bug where it doesn't emit subvol= correctly when * files are bind mounted, so let's ignore it for now. * https://marc.info/?l=linux-btrfs&m=145857372803614&w=2 */ char *posa = strstr(a->options, "subvol="), *posb = strstr(b->options, "subvol="); bool equal; if (!posa || !posb) { pr_err("invalid btrfs options, no subvol argument\n"); return false; } *posa = *posb = 0; equal = !strcmp(a->options, b->options); *posa = *posb = 's'; if (!equal) return false; posa = strchr(posa, ','); posb = strchr(posb, ','); if ((posa && !posb) || (!posa && posb)) return false; if (posa && strcmp(posa, posb)) return false; return true; } static int dump_empty_fs(struct mount_info *pm) { int fd, ret = -1; fd = open_mountpoint(pm); if (fd < 0) return fd; ret = is_empty_dir(fd); close(fd); if (ret < 0) { pr_err("%s isn't empty\n", pm->fstype->name); return -1; } return ret ? 0 : -1; } /* * Some fses (fuse) cannot be dumped, so we should always fail on dump/restore * of these fses. 
*/
static int always_fail(struct mount_info *pm)
{
	pr_err("failed to dump fs %s (%s): always fail\n", pm->mountpoint,
	       pm->fstype->name);
	return -1;
}

/*
 * Table of known filesystem types and their hooks.
 *
 * The order matters to the code in this file: index 0 is the
 * "unsupported" fallback, index 1 is the "auto" type returned by
 * fstype_auto(), and name/code lookups start scanning at index 1.
 */
static struct fstype fstypes[] = {
	{
		.name = "unsupported",
		.code = FSTYPE__UNSUPPORTED,
	}, {
		.name = "auto_cr",
		.code = FSTYPE__AUTO,
	}, {
		.name = "proc",
		.code = FSTYPE__PROC,
	}, {
		.name = "sysfs",
		.code = FSTYPE__SYSFS,
	}, {
		.name = "devtmpfs",
		.code = FSTYPE__DEVTMPFS,
		.dump = devtmpfs_dump,
		.restore = devtmpfs_restore,
	}, {
		.name = "binfmt_misc",
		.parse = binfmt_misc_parse_or_collect,
		.collect = binfmt_misc_parse_or_collect,
		.code = FSTYPE__BINFMT_MISC,
		.dump = binfmt_misc_dump,
		.restore = binfmt_misc_restore,
	}, {
		.name = "tmpfs",
		.code = FSTYPE__TMPFS,
		.dump = tmpfs_dump,
		.restore = tmpfs_restore,
	}, {
		.name = "devpts",
		.parse = devpts_parse,
		.code = FSTYPE__DEVPTS,
		.restore = devpts_restore,
		.check_bindmount = devpts_check_bindmount,
	}, {
		.name = "simfs",
		.code = FSTYPE__SIMFS,
	}, {
		/* Matched by name, but carries the "unsupported" code. */
		.name = "btrfs",
		.code = FSTYPE__UNSUPPORTED,
		.sb_equal = btrfs_sb_equal,
	}, {
		.name = "pstore",
		.dump = dump_empty_fs,
		.code = FSTYPE__PSTORE,
	}, {
		.name = "mqueue",
		.dump = dump_empty_fs,
		.code = FSTYPE__MQUEUE,
	}, {
		.name = "securityfs",
		.code = FSTYPE__SECURITYFS,
	}, {
		.name = "fusectl",
		.dump = fusectl_dump,
		.code = FSTYPE__FUSECTL,
	}, {
		.name = "debugfs",
		.code = FSTYPE__DEBUGFS,
		.parse = debugfs_parse,
	}, {
		.name = "tracefs",
		.code = FSTYPE__TRACEFS,
		.parse = tracefs_parse,
	}, {
		.name = "cgroup",
		.code = FSTYPE__CGROUP,
		.parse = cgroup_parse,
		.sb_equal = cgroup_sb_equal,
	}, {
		.name = "aufs",
		.code = FSTYPE__AUFS,
		.parse = aufs_parse,
	}, {
		.name = "fuse",
		.code = FSTYPE__FUSE,
		.dump = always_fail,
		.restore = always_fail,
	}, {
		.name = "overlay",
		.code = FSTYPE__OVERLAYFS,
		.parse = overlayfs_parse,
	}, {
		.name = "autofs",
		.code = FSTYPE__AUTOFS,
		.parse = autofs_parse,
		.dump = autofs_dump,
		.mount = autofs_mount,
	},
};

/* Return the "auto_cr" entry of the table. */
struct fstype *fstype_auto(void)
{
	return &fstypes[1];
}

/* Both the literal name and, by address, the marker for "every fs is auto". */
static char fsauto_all[] = "all";
static char
*fsauto_names; static bool css_contains(const char *css, const char *str) { int len = strlen(str); const char *cur; if (!len) return false; for (cur = css; (cur = strstr(cur, str)); cur += len) { if (cur > css && cur[-1] != ',') continue; if (cur[len] && cur[len] != ',') continue; return true; } return false; } static bool fsname_is_auto(const char *name) { if (!fsauto_names) return false; if (fsauto_names == fsauto_all) return true; return css_contains(fsauto_names, name); } bool add_fsname_auto(const char *names) { char *old = fsauto_names; if (old == fsauto_all) return true; if (css_contains(names, fsauto_all)) fsauto_names = fsauto_all; else if (!old) fsauto_names = xstrdup(names); else { if (asprintf(&fsauto_names, "%s,%s", old, names) < 0) fsauto_names = NULL; } xfree(old); return fsauto_names != NULL; } struct fstype *find_fstype_by_name(char *fst) { int i; /* * This fn is required for two things. * 1st -- to check supported filesystems (as just mounting * anything is wrong, almost every fs has its own features) * 2nd -- save some space in the image (since we scan all * names anyway) */ for (i = 1; i < ARRAY_SIZE(fstypes); i++) { struct fstype *fstype = fstypes + i; if (!strcmp(fstype->name, fst)) return fstype; } if (fsname_is_auto(fst)) return &fstypes[1]; return &fstypes[0]; } struct fstype *decode_fstype(u32 fst) { int i; if (fst == FSTYPE__UNSUPPORTED) goto uns; for (i = 1; i < ARRAY_SIZE(fstypes); i++) { struct fstype *fstype = fstypes + i; if (!fstype->name) break; if (fstype->code == fst) return fstype; } uns: return &fstypes[0]; } criu-3.6/criu/fsnotify.c000066400000000000000000000522161317335042600153050ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/compiler.h" #include "imgset.h" #include "fsnotify.h" #include "fdinfo.h" #include "mount.h" #include 
"filesystems.h" #include "image.h" #include "util.h" #include "crtools.h" #include "files.h" #include "files-reg.h" #include "file-ids.h" #include "criu-log.h" #include "common/list.h" #include "common/lock.h" #include "irmap.h" #include "cr_options.h" #include "namespaces.h" #include "pstree.h" #include "fault-injection.h" #include #include "protobuf.h" #include "images/fsnotify.pb-c.h" #include "images/mnt.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "fsnotify: " struct fsnotify_mark_info { struct list_head list; union { InotifyWdEntry *iwe; FanotifyMarkEntry *fme; }; struct pprep_head prep; /* XXX union with remap */ struct file_remap *remap; }; struct fsnotify_file_info { union { InotifyFileEntry *ife; FanotifyFileEntry *ffe; }; struct list_head marks; struct file_desc d; }; /* File handle */ typedef struct { u32 bytes; u32 type; u64 __handle[16]; } fh_t; /* Checks if file descriptor @lfd is inotify */ int is_inotify_link(char *link) { return is_anon_link_type(link, "inotify"); } /* Checks if file descriptor @lfd is fanotify */ int is_fanotify_link(char *link) { return is_anon_link_type(link, "[fanotify]"); } static void decode_handle(fh_t *handle, FhEntry *img) { memzero(handle, sizeof(*handle)); handle->type = img->type; handle->bytes = img->bytes; memcpy(handle->__handle, img->handle, min(pb_repeated_size(img, handle), sizeof(handle->__handle))); } static int open_by_handle(void *arg, int fd, int pid) { return syscall(__NR_open_by_handle_at, fd, arg, O_PATH); } static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { struct mount_info *m; fh_t handle; int fd = -1; char *path; decode_handle(&handle, f_handle); /* * We gonna try to open the handle and then * depending on command line options and type * of the filesystem (tmpfs/devtmpfs do not * preserve their inodes between mounts) we * might need to find out an openable path * get used on restore as a watch destination. 
*/ for (m = mntinfo; m; m = m->next) { char buf[PATH_MAX], *__path; int mntfd, openable_fd; struct stat st; if (m->s_dev != s_dev) continue; if (!mnt_is_dir(m)) continue; mntfd = __open_mountpoint(m, -1); pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n", m->mnt_id, m->root, m->ns_mountpoint, mntfd); if (mntfd < 0) continue; fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd); close(mntfd); if (fd < 0) continue; if (read_fd_link(fd, buf, sizeof(buf)) < 0) { close(fd); goto err; } close(fd); /* * Convert into a relative path. */ __path = (buf[1] != '\0') ? buf + 1 : "."; pr_debug("\t\t\tlink as %s\n", __path); mntfd = mntns_get_root_fd(m->nsid); if (mntfd < 0) goto err; openable_fd = openat(mntfd, __path, O_PATH); if (openable_fd >= 0) { if (fstat(openable_fd, &st)) { pr_perror("Can't stat on %s", __path); close(openable_fd); return ERR_PTR(-errno); } close(openable_fd); pr_debug("\t\t\topenable (inode %s) as %s\n", st.st_ino == i_ino ? "match" : "don't match", __path); if (st.st_ino == i_ino) { path = xstrdup(buf); if (path == NULL) goto err; if (root_ns_mask & CLONE_NEWNS) { f_handle->has_mnt_id = true; f_handle->mnt_id = m->mnt_id; } return path; } } else pr_debug("\t\t\tnot openable as %s (%m)\n", __path); } return ERR_PTR(-ENOENT); err: return ERR_PTR(-1); } static int open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { int mntfd, fd = -1; fh_t handle; decode_handle(&handle, f_handle); pr_debug("Opening fhandle %x:%Lx...\n", s_dev, (unsigned long long)handle.__handle[0]); mntfd = open_mount(s_dev); if (mntfd < 0) { pr_err("Mount root for %#08x not found\n", s_dev); goto out; } fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd); if (fd < 0) { pr_perror("Can't open file handle for %#08x:%#016lx", s_dev, i_ino); } close(mntfd); out: return fd; } int check_open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { int fd = -1; char *path; if 
(fault_injected(FI_CHECK_OPEN_HANDLE)) {
		/* Fault injection: behave as if the handle could not be opened. */
		fd = -1;
		goto fault;
	}

	fd = open_handle(s_dev, i_ino, f_handle);
fault:
	if (fd >= 0) {
		struct mount_info *mi;

		pr_debug("\tHandle 0x%x:0x%lx is openable\n", s_dev, i_ino);

		mi = lookup_mnt_sdev(s_dev);
		if (mi == NULL) {
			pr_err("Unable to lookup a mount by dev 0x%x\n", s_dev);
			goto err;
		}

		/*
		 * Always try to fetch watchee path first. There are several reasons:
		 *
		 *  - tmpfs/devtmpfs do not save inode numbers between mounts,
		 *    so it is critical to have the complete path under our
		 *    hands for restore purpose;
		 *
		 *  - in case of migration the inodes might be changed as well
		 *    so the only portable solution is to carry the whole path
		 *    to the watchee inside image.
		 */
		path = alloc_openable(s_dev, i_ino, f_handle);
		if (!IS_ERR_OR_NULL(path))
			goto out;

		/* No path was found; this is fatal only on tmpfs and devtmpfs. */
		if ((mi->fstype->code == FSTYPE__TMPFS) ||
		    (mi->fstype->code == FSTYPE__DEVTMPFS)) {
			pr_err("Can't find suitable path for handle (dev %#x ino %#lx): %d\n",
			       s_dev, i_ino, (int)PTR_ERR(path));
			goto err;
		}

		if (!opts.force_irmap)
			/*
			 * If we're not forced to do irmap, then
			 * say we have no path for watch. Otherwise
			 * do irmap scan even if the handle is
			 * working.
* * FIXME -- no need to open-by-handle if * we are in force-irmap and not on tempfs */ goto out_nopath; } pr_warn("\tHandle 0x%x:0x%lx cannot be opened\n", s_dev, i_ino); path = irmap_lookup(s_dev, i_ino); if (!path) { pr_err("\tCan't dump that handle\n"); return -1; } out: pr_debug("\tDumping %s as path for handle\n", path); f_handle->path = path; out_nopath: close_safe(&fd); return 0; err: close_safe(&fd); return -1; } static int check_one_wd(InotifyWdEntry *we) { pr_info("wd: wd %#08x s_dev %#08x i_ino %#16"PRIx64" mask %#08x\n", we->wd, we->s_dev, we->i_ino, we->mask); pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016"PRIx64":%#016"PRIx64"\n", we->f_handle->bytes, we->f_handle->type, we->f_handle->handle[0], we->f_handle->handle[1]); if (we->mask & KERNEL_FS_EVENT_ON_CHILD) pr_warn_once("\t\tDetected FS_EVENT_ON_CHILD bit " "in mask (will be ignored on restore)\n"); if (check_open_handle(we->s_dev, we->i_ino, we->f_handle)) return -1; return 0; } static int dump_one_inotify(int lfd, u32 id, const struct fd_parms *p) { FileEntry fe = FILE_ENTRY__INIT; InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT; int exit_code = -1, i, ret; ret = fd_has_data(lfd); if (ret < 0) return -1; else if (ret > 0) pr_warn("The %#08x inotify events will be dropped\n", id); ie.id = id; ie.flags = p->flags; ie.fown = (FownEntry *)&p->fown; if (parse_fdinfo(lfd, FD_TYPES__INOTIFY, &ie)) goto free; for (i = 0; i < ie.n_wd; i++) if (check_one_wd(ie.wd[i])) goto free; fe.type = FD_TYPES__INOTIFY; fe.id = ie.id; fe.ify = &ie; pr_info("id %#08x flags %#08x\n", ie.id, ie.flags); if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) goto free; exit_code = 0; free: for (i = 0; i < ie.n_wd; i++) xfree(ie.wd[i]); xfree(ie.wd); return exit_code; } static int pre_dump_one_inotify(int pid, int lfd) { InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT; int i; if (parse_fdinfo_pid(pid, lfd, FD_TYPES__INOTIFY, &ie)) return -1; for (i = 0; i < ie.n_wd; i++) { InotifyWdEntry *we = 
ie.wd[i]; if (irmap_queue_cache(we->s_dev, we->i_ino, we->f_handle)) return -1; xfree(we); } return 0; } const struct fdtype_ops inotify_dump_ops = { .type = FD_TYPES__INOTIFY, .dump = dump_one_inotify, .pre_dump = pre_dump_one_inotify, }; static int check_one_mark(FanotifyMarkEntry *fme) { if (fme->type == MARK_TYPE__INODE) { BUG_ON(!fme->ie); pr_info("mark: s_dev %#08x i_ino %#016"PRIx64" mask %#08x\n", fme->s_dev, fme->ie->i_ino, fme->mask); pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016"PRIx64":%#016"PRIx64"\n", fme->ie->f_handle->bytes, fme->ie->f_handle->type, fme->ie->f_handle->handle[0], fme->ie->f_handle->handle[1]); if (check_open_handle(fme->s_dev, fme->ie->i_ino, fme->ie->f_handle)) return -1; } if (fme->type == MARK_TYPE__MOUNT) { struct mount_info *m; BUG_ON(!fme->me); m = lookup_mnt_id(fme->me->mnt_id); if (!m) { pr_err("Can't find mnt_id 0x%x\n", fme->me->mnt_id); return -1; } if (!(root_ns_mask & CLONE_NEWNS)) fme->me->path = m->mountpoint + 1; fme->s_dev = m->s_dev; pr_info("mark: s_dev %#08x mnt_id %#08x mask %#08x\n", fme->s_dev, fme->me->mnt_id, fme->mask); } return 0; } static int dump_one_fanotify(int lfd, u32 id, const struct fd_parms *p) { FileEntry fle = FILE_ENTRY__INIT; FanotifyFileEntry fe = FANOTIFY_FILE_ENTRY__INIT; int ret = -1, i; ret = fd_has_data(lfd); if (ret < 0) return -1; else if (ret > 0) pr_warn("The %#08x fanotify events will be dropped\n", id); ret = -1; fe.id = id; fe.flags = p->flags; fe.fown = (FownEntry *)&p->fown; if (parse_fdinfo(lfd, FD_TYPES__FANOTIFY, &fe) < 0) goto free; for (i = 0; i < fe.n_mark; i++) if (check_one_mark(fe.mark[i])) goto free; pr_info("id %#08x flags %#08x\n", fe.id, fe.flags); fle.type = FD_TYPES__FANOTIFY; fle.id = fe.id; fle.ffy = &fe; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fle, PB_FILE); free: for (i = 0; i < fe.n_mark; i++) xfree(fe.mark[i]); xfree(fe.mark); return ret; } static int pre_dump_one_fanotify(int pid, int lfd) { FanotifyFileEntry fe = 
FANOTIFY_FILE_ENTRY__INIT;
	int i;

	if (parse_fdinfo_pid(pid, lfd, FD_TYPES__FANOTIFY, &fe))
		return -1;

	for (i = 0; i < fe.n_mark; i++) {
		FanotifyMarkEntry *me = fe.mark[i];

		/* Only inode marks carry a file handle to cache. */
		if (me->type == MARK_TYPE__INODE &&
				irmap_queue_cache(me->s_dev, me->ie->i_ino,
					me->ie->f_handle))
			return -1;

		xfree(me);
	}
	xfree(fe.mark);
	return 0;
}

/* Dump operations for fanotify descriptors. */
const struct fdtype_ops fanotify_dump_ops = {
	.type		= FD_TYPES__FANOTIFY,
	.dump		= dump_one_fanotify,
	.pre_dump	= pre_dump_one_fanotify,
};

/*
 * Open the watch target and return a "/proc/self/fd/N" path to it.
 *
 * The target is opened, in order of preference, via the ghost remap,
 * via the path hint stored in the handle, or via the handle itself.
 * The descriptor is stored in *target (the caller closes it) and the
 * path is formatted into @buf.  Returns @buf, or NULL when the target
 * cannot be opened.
 */
static char *get_mark_path(const char *who, struct file_remap *remap,
			   FhEntry *f_handle, unsigned long i_ino,
			   unsigned int s_dev, char *buf, int *target)
{
	char *path = NULL;

	if (remap) {
		int mntns_root;

		mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id);

		pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s)\n",
			 who, s_dev, i_ino, remap->rpath);
		*target = openat(mntns_root, remap->rpath, O_PATH);
	} else if (f_handle->path) {
		int  mntns_root;
		/* Shadows the outer path on purpose; used only for this open. */
		char *path = ".";
		uint32_t mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1;


		/* irmap cache is collected in the root namespaces. */
		mntns_root = mntns_get_root_by_mnt_id(mnt_id);

		/* change "/foo" into "foo" and "/" into "." */
		if (f_handle->path[1] != '\0')
			path = f_handle->path + 1;

		pr_debug("\t\tRestore with path hint %d:%s\n", mnt_id, path);
		*target = openat(mntns_root, path, O_PATH);
	} else
		*target = open_handle(s_dev, i_ino, f_handle);

	if (*target < 0) {
		/* NOTE(review): f_handle->path may be NULL on the remap and handle branches. */
		pr_perror("Unable to open %s", f_handle->path);
		goto err;
	}

	/*
	 * fanotify/inotify open syscalls want path to attach
	 * watch to. But the only thing we have is an FD obtained
	 * via fhandle. Fortunately, when trying to attach the
	 * /proc/pid/fd/ link, we will watch the inode the link
	 * points to, i.e. -- just what we want.
*/ sprintf(buf, "/proc/self/fd/%d", *target); path = buf; if (!pr_quelled(LOG_DEBUG)) { char link[PATH_MAX]; if (read_fd_link(*target, link, sizeof(link)) < 0) link[0] = '\0'; pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s -> %s)\n", who, s_dev, i_ino, path, link); } err: return path; } static int restore_one_inotify(int inotify_fd, struct fsnotify_mark_info *info) { InotifyWdEntry *iwe = info->iwe; int ret = -1, target = -1; char buf[PSFDS], *path; uint32_t mask; path = get_mark_path("inotify", info->remap, iwe->f_handle, iwe->i_ino, iwe->s_dev, buf, &target); if (!path) goto err; mask = iwe->mask & IN_ALL_EVENTS; if (iwe->mask & ~IN_ALL_EVENTS) { pr_info("\t\tfilter event mask %#x -> %#x\n", iwe->mask, mask); } /* * FIXME The kernel allocates wd-s sequentially, * this is suboptimal, but the kernel doesn't * provide and API for this yet :( */ while (1) { int wd; wd = inotify_add_watch(inotify_fd, path, mask); if (wd < 0) { pr_perror("Can't add watch for 0x%x with 0x%x", inotify_fd, iwe->wd); break; } else if (wd == iwe->wd) { ret = 0; break; } else if (wd > iwe->wd) { pr_err("Unsorted watch 0x%x found for 0x%x with 0x%x\n", wd, inotify_fd, iwe->wd); break; } pr_debug("\t\tWatch got 0x%x but 0x%x expected\n", wd, iwe->wd); inotify_rm_watch(inotify_fd, wd); } err: close_safe(&target); return ret; } static int restore_one_fanotify(int fd, struct fsnotify_mark_info *mark) { FanotifyMarkEntry *fme = mark->fme; unsigned int flags = FAN_MARK_ADD; int ret = -1, target = -1; char buf[PSFDS], *path = NULL; if (fme->type == MARK_TYPE__MOUNT) { struct mount_info *m; int mntns_root; char *p = fme->me->path; struct ns_id *nsid = NULL; if (root_ns_mask & CLONE_NEWNS) { m = lookup_mnt_id(fme->me->mnt_id); if (!m) { pr_err("Can't find mount mnt_id 0x%x\n", fme->me->mnt_id); return -1; } nsid = m->nsid; p = m->ns_mountpoint; } mntns_root = mntns_get_root_fd(nsid); target = openat(mntns_root, p, O_PATH); if (target == -1) { pr_perror("Unable to open %s", p); goto err; } 
flags |= FAN_MARK_MOUNT; snprintf(buf, sizeof(buf), "/proc/self/fd/%d", target); path = buf; } else if (fme->type == MARK_TYPE__INODE) { path = get_mark_path("fanotify", mark->remap, fme->ie->f_handle, fme->ie->i_ino, fme->s_dev, buf, &target); if (!path) goto err; } else { pr_err("Bad fsnotify mark type 0x%x\n", fme->type); goto err; } flags |= fme->mflags; if (mark->fme->mask) { ret = fanotify_mark(fd, flags, fme->mask, AT_FDCWD, path); if (ret) { pr_err("Adding fanotify mask 0x%x on 0x%x/%s failed (%d)\n", fme->mask, fme->id, path, ret); goto err; } } if (fme->ignored_mask) { ret = fanotify_mark(fd, flags | FAN_MARK_IGNORED_MASK, fme->ignored_mask, AT_FDCWD, path); if (ret) { pr_err("Adding fanotify ignored-mask 0x%x on 0x%x/%s failed (%d)\n", fme->ignored_mask, fme->id, path, ret); goto err; } } err: close_safe(&target); return ret; } static int open_inotify_fd(struct file_desc *d, int *new_fd) { struct fsnotify_file_info *info; struct fsnotify_mark_info *wd_info; int tmp; info = container_of(d, struct fsnotify_file_info, d); tmp = inotify_init1(info->ife->flags); if (tmp < 0) { pr_perror("Can't create inotify for %#08x", info->ife->id); return -1; } list_for_each_entry(wd_info, &info->marks, list) { pr_info("\tRestore 0x%x wd for %#08x\n", wd_info->iwe->wd, wd_info->iwe->id); if (restore_one_inotify(tmp, wd_info)) { close_safe(&tmp); break; } } if (restore_fown(tmp, info->ife->fown)) close_safe(&tmp); *new_fd = tmp; return 0; } static int open_fanotify_fd(struct file_desc *d, int *new_fd) { struct fsnotify_file_info *info; struct fsnotify_mark_info *mark; unsigned int flags = 0; int ret; info = container_of(d, struct fsnotify_file_info, d); flags = info->ffe->faflags; if (info->ffe->flags & O_CLOEXEC) flags |= FAN_CLOEXEC; if (info->ffe->flags & O_NONBLOCK) flags |= FAN_NONBLOCK; ret = fanotify_init(flags, info->ffe->evflags); if (ret < 0) { pr_perror("Can't init fanotify mark (%d)", ret); return -1; } list_for_each_entry(mark, &info->marks, list) { 
pr_info("\tRestore fanotify for %#08x\n", mark->fme->id); if (restore_one_fanotify(ret, mark)) { close_safe(&ret); break; } } if (restore_fown(ret, info->ffe->fown)) close_safe(&ret); *new_fd = ret; return 0; } static struct file_desc_ops inotify_desc_ops = { .type = FD_TYPES__INOTIFY, .open = open_inotify_fd, }; static struct file_desc_ops fanotify_desc_ops = { .type = FD_TYPES__FANOTIFY, .open = open_fanotify_fd, }; static int inotify_resolve_remap(struct pprep_head *ph) { struct fsnotify_mark_info *m; m = container_of(ph, struct fsnotify_mark_info, prep); m->remap = lookup_ghost_remap(m->iwe->s_dev, m->iwe->i_ino); return 0; } static int fanotify_resolve_remap(struct pprep_head *ph) { struct fsnotify_mark_info *m; m = container_of(ph, struct fsnotify_mark_info, prep); m->remap = lookup_ghost_remap(m->fme->s_dev, m->fme->ie->i_ino); return 0; } static int __collect_inotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark) { struct fsnotify_mark_info *m; /* * We should put marks in wd ascending order. See comment * in restore_one_inotify() for explanation. 
*/ list_for_each_entry(m, &p->marks, list) if (m->iwe->wd > mark->iwe->wd) break; list_add_tail(&mark->list, &m->list); mark->prep.actor = inotify_resolve_remap; add_post_prepare_cb(&mark->prep); return 0; } static int __collect_fanotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark) { list_add(&mark->list, &p->marks); if (mark->fme->type == MARK_TYPE__INODE) { mark->prep.actor = fanotify_resolve_remap; add_post_prepare_cb(&mark->prep); } return 0; } static int collect_one_inotify(void *o, ProtobufCMessage *msg, struct cr_img *img) { struct fsnotify_file_info *info = o; int i; info->ife = pb_msg(msg, InotifyFileEntry); INIT_LIST_HEAD(&info->marks); pr_info("Collected id %#08x flags %#08x\n", info->ife->id, info->ife->flags); for (i = 0; i < info->ife->n_wd; i++) { struct fsnotify_mark_info *mark; mark = xmalloc(sizeof(*mark)); if (!mark) return -1; mark->iwe = info->ife->wd[i]; INIT_LIST_HEAD(&mark->list); mark->remap = NULL; if (__collect_inotify_mark(info, mark)) return -1; } return file_desc_add(&info->d, info->ife->id, &inotify_desc_ops); } struct collect_image_info inotify_cinfo = { .fd_type = CR_FD_INOTIFY_FILE, .pb_type = PB_INOTIFY_FILE, .priv_size = sizeof(struct fsnotify_file_info), .collect = collect_one_inotify, }; static int collect_one_fanotify(void *o, ProtobufCMessage *msg, struct cr_img *img) { struct fsnotify_file_info *info = o; int i; info->ffe = pb_msg(msg, FanotifyFileEntry); INIT_LIST_HEAD(&info->marks); pr_info("Collected id %#08x flags %#08x\n", info->ffe->id, info->ffe->flags); for (i = 0; i < info->ffe->n_mark; i++) { struct fsnotify_mark_info *mark; mark = xmalloc(sizeof(*mark)); if (!mark) return -1; mark->fme = info->ffe->mark[i]; INIT_LIST_HEAD(&mark->list); mark->remap = NULL; if (__collect_fanotify_mark(info, mark)) return -1; } return file_desc_add(&info->d, info->ffe->id, &fanotify_desc_ops); } struct collect_image_info fanotify_cinfo = { .fd_type = CR_FD_FANOTIFY_FILE, .pb_type = PB_FANOTIFY_FILE, .priv_size 
= sizeof(struct fsnotify_file_info), .collect = collect_one_fanotify, }; static int collect_one_inotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) { struct fsnotify_mark_info *mark = o; struct file_desc *d; if (!deprecated_ok("separate images for fsnotify marks")) return -1; mark->iwe = pb_msg(msg, InotifyWdEntry); INIT_LIST_HEAD(&mark->list); mark->remap = NULL; /* * The kernel prior 4.3 might export internal event * mask bits which are not part of user-space API. It * is fixed in kernel but we have to keep backward * compatibility with old images. So mask out * inappropriate bits (in particular fdinfo might * have FS_EVENT_ON_CHILD bit set). */ mark->iwe->mask &= ~KERNEL_FS_EVENT_ON_CHILD; d = find_file_desc_raw(FD_TYPES__INOTIFY, mark->iwe->id); if (!d) { pr_err("Can't find inotify with id %#08x\n", mark->iwe->id); return -1; } return __collect_inotify_mark(container_of(d, struct fsnotify_file_info, d), mark); } struct collect_image_info inotify_mark_cinfo = { .fd_type = CR_FD_INOTIFY_WD, .pb_type = PB_INOTIFY_WD, .priv_size = sizeof(struct fsnotify_mark_info), .collect = collect_one_inotify_mark, }; static int collect_one_fanotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) { struct fsnotify_mark_info *mark = o; struct file_desc *d; if (!deprecated_ok("separate images for fsnotify marks")) return -1; mark->fme = pb_msg(msg, FanotifyMarkEntry); INIT_LIST_HEAD(&mark->list); mark->remap = NULL; d = find_file_desc_raw(FD_TYPES__FANOTIFY, mark->fme->id); if (!d) { pr_err("Can't find fanotify with id %#08x\n", mark->fme->id); return -1; } return __collect_fanotify_mark(container_of(d, struct fsnotify_file_info, d), mark); } struct collect_image_info fanotify_mark_cinfo = { .fd_type = CR_FD_FANOTIFY_MARK, .pb_type = PB_FANOTIFY_MARK, .priv_size = sizeof(struct fsnotify_mark_info), .collect = collect_one_fanotify_mark, }; criu-3.6/criu/image-desc.c000066400000000000000000000072561317335042600154460ustar00rootroot00000000000000#include #include 
"image-desc.h" #include "magic.h" #include "image.h" /* * The cr fd set is the set of files where the information * about dumped processes is stored. Each file carries some * small portion of info about the whole picture, see below * for more details. */ #define FD_ENTRY(_name, _fmt) \ [CR_FD_##_name] = { \ .fmt = _fmt ".img", \ .magic = _name##_MAGIC, \ } #define FD_ENTRY_F(_name, _fmt, _f) \ [CR_FD_##_name] = { \ .fmt = _fmt ".img", \ .magic = _name##_MAGIC, \ .oflags = _f, \ } struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY(INVENTORY, "inventory"), FD_ENTRY(FDINFO, "fdinfo-%d"), FD_ENTRY(PAGEMAP, "pagemap-%ld"), FD_ENTRY(SHMEM_PAGEMAP, "pagemap-shmem-%ld"), FD_ENTRY(REG_FILES, "reg-files"), FD_ENTRY(EXT_FILES, "ext-files"), FD_ENTRY(NS_FILES, "ns-files"), FD_ENTRY(EVENTFD_FILE, "eventfd"), FD_ENTRY(EVENTPOLL_FILE,"eventpoll"), FD_ENTRY(EVENTPOLL_TFD, "eventpoll-tfd"), FD_ENTRY(SIGNALFD, "signalfd"), FD_ENTRY(INOTIFY_FILE, "inotify"), FD_ENTRY(INOTIFY_WD, "inotify-wd"), FD_ENTRY(FANOTIFY_FILE, "fanotify"), FD_ENTRY(FANOTIFY_MARK, "fanotify-mark"), FD_ENTRY(CORE, "core-%d"), FD_ENTRY(IDS, "ids-%d"), FD_ENTRY(MM, "mm-%d"), FD_ENTRY(VMAS, "vmas-%d"), FD_ENTRY(PIPES, "pipes"), FD_ENTRY_F(PIPES_DATA, "pipes-data", O_NOBUF), /* splices data */ FD_ENTRY(FIFO, "fifo"), FD_ENTRY_F(FIFO_DATA, "fifo-data", O_NOBUF), /* the same */ FD_ENTRY(PSTREE, "pstree"), FD_ENTRY(SIGACT, "sigacts-%d"), FD_ENTRY(UNIXSK, "unixsk"), FD_ENTRY(INETSK, "inetsk"), FD_ENTRY(PACKETSK, "packetsk"), FD_ENTRY(NETLINK_SK, "netlinksk"), FD_ENTRY_F(SK_QUEUES, "sk-queues", O_NOBUF), /* lseeks the image */ FD_ENTRY(ITIMERS, "itimers-%d"), FD_ENTRY(POSIX_TIMERS, "posix-timers-%d"), FD_ENTRY(CREDS, "creds-%d"), FD_ENTRY(UTSNS, "utsns-%d"), FD_ENTRY(IPC_VAR, "ipcns-var-%d"), FD_ENTRY_F(IPCNS_SHM, "ipcns-shm-%d", O_NOBUF), /* writes segments of data */ FD_ENTRY(IPCNS_MSG, "ipcns-msg-%d"), FD_ENTRY(IPCNS_SEM, "ipcns-sem-%d"), FD_ENTRY(FS, "fs-%d"), FD_ENTRY(REMAP_FPATH, "remap-fpath"), 
FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF), FD_ENTRY(TCP_STREAM, "tcp-stream-%x"), FD_ENTRY(MNTS, "mountpoints-%d"), FD_ENTRY(NETDEV, "netdev-%d"), FD_ENTRY(NETNS, "netns-%d"), FD_ENTRY_F(IFADDR, "ifaddr-%d", O_NOBUF), FD_ENTRY_F(ROUTE, "route-%d", O_NOBUF), FD_ENTRY_F(ROUTE6, "route6-%d", O_NOBUF), FD_ENTRY_F(RULE, "rule-%d", O_NOBUF), FD_ENTRY_F(IPTABLES, "iptables-%d", O_NOBUF), FD_ENTRY_F(IP6TABLES, "ip6tables-%d", O_NOBUF), FD_ENTRY_F(TMPFS_IMG, "tmpfs-%d.tar.gz", O_NOBUF), FD_ENTRY_F(TMPFS_DEV, "tmpfs-dev-%d.tar.gz", O_NOBUF), FD_ENTRY_F(AUTOFS, "autofs-%d", O_NOBUF), FD_ENTRY(BINFMT_MISC_OLD, "binfmt-misc-%d"), FD_ENTRY(BINFMT_MISC, "binfmt-misc"), FD_ENTRY(TTY_FILES, "tty"), FD_ENTRY(TTY_INFO, "tty-info"), FD_ENTRY_F(TTY_DATA, "tty-data", O_NOBUF), FD_ENTRY(FILE_LOCKS, "filelocks"), FD_ENTRY(RLIMIT, "rlimit-%d"), FD_ENTRY_F(PAGES, "pages-%u", O_NOBUF), FD_ENTRY_F(PAGES_OLD, "pages-%d", O_NOBUF), FD_ENTRY_F(SHM_PAGES_OLD, "pages-shmem-%ld", O_NOBUF), FD_ENTRY(SIGNAL, "signal-s-%d"), FD_ENTRY(PSIGNAL, "signal-p-%d"), FD_ENTRY(TUNFILE, "tunfile"), FD_ENTRY(CGROUP, "cgroup"), FD_ENTRY(TIMERFD, "timerfd"), FD_ENTRY(CPUINFO, "cpuinfo"), FD_ENTRY(SECCOMP, "seccomp"), FD_ENTRY(USERNS, "userns-%d"), FD_ENTRY(NETNF_CT, "netns-ct-%d"), FD_ENTRY(NETNF_EXP, "netns-exp-%d"), FD_ENTRY(FILES, "files"), [CR_FD_STATS] = { .fmt = "stats-%s", .magic = STATS_MAGIC, .oflags = O_SERVICE, }, [CR_FD_IRMAP_CACHE] = { .fmt = "irmap-cache", .magic = IRMAP_CACHE_MAGIC, .oflags = O_SERVICE, }, [CR_FD_FILE_LOCKS_PID] = { .fmt = "filelocks-%d.img", .magic = FILE_LOCKS_MAGIC, }, }; criu-3.6/criu/image.c000066400000000000000000000253541317335042600145310ustar00rootroot00000000000000#include #include #include #include #include #include #include "crtools.h" #include "cr_options.h" #include "imgset.h" #include "image.h" #include "pstree.h" #include "stats.h" #include "cgroup.h" #include "lsm.h" #include "protobuf.h" #include "xmalloc.h" #include "images/inventory.pb-c.h" #include 
"images/pagemap.pb-c.h" bool ns_per_id = false; bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; int check_img_inventory(void) { int ret = -1; struct cr_img *img; InventoryEntry *he; img = open_image(CR_FD_INVENTORY, O_RSTR); if (!img) return -1; if (pb_read_one(img, &he, PB_INVENTORY) < 0) goto out_close; if (!he->has_fdinfo_per_id || !he->fdinfo_per_id) { pr_err("Too old image, no longer supported\n"); goto out_close; } ns_per_id = he->has_ns_per_id ? he->ns_per_id : false; if (he->root_ids) { root_ids = xmalloc(sizeof(*root_ids)); if (!root_ids) goto out_err; memcpy(root_ids, he->root_ids, sizeof(*root_ids)); } if (he->has_root_cg_set) { if (he->root_cg_set == 0) { pr_err("Corrupted root cgset\n"); goto out_err; } root_cg_set = he->root_cg_set; } if (he->has_lsmtype) image_lsm = he->lsmtype; else image_lsm = LSMTYPE__NO_LSM; switch (he->img_version) { case CRTOOLS_IMAGES_V1: /* good old images. OK */ img_common_magic = false; break; case CRTOOLS_IMAGES_V1_1: /* newer images with extra magic in the head */ break; default: pr_err("Not supported images version %u\n", he->img_version); goto out_err; } ret = 0; out_err: inventory_entry__free_unpacked(he, NULL); out_close: close_image(img); return ret; } int write_img_inventory(InventoryEntry *he) { struct cr_img *img; pr_info("Writing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); img = open_image(CR_FD_INVENTORY, O_DUMP); if (!img) return -1; if (pb_write_one(img, he, PB_INVENTORY) < 0) return -1; xfree(he->root_ids); close_image(img); return 0; } int prepare_inventory(InventoryEntry *he) { struct pid pid; struct { struct pstree_item i; struct dmp_info d; } crt = { .i.pid = &pid }; pr_info("Perparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); he->img_version = CRTOOLS_IMAGES_V1_1; he->fdinfo_per_id = true; he->has_fdinfo_per_id = true; he->ns_per_id = true; he->has_ns_per_id = true; he->has_lsmtype = true; he->lsmtype = host_lsm_type(); crt.i.pid->state = 
TASK_ALIVE; crt.i.pid->real = getpid(); if (get_task_ids(&crt.i)) return -1; he->has_root_cg_set = true; if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) return -1; he->root_ids = crt.i.ids; return 0; } static struct cr_imgset *alloc_cr_imgset(int nr) { struct cr_imgset *cr_imgset; unsigned int i; cr_imgset = xmalloc(sizeof(*cr_imgset)); if (cr_imgset == NULL) return NULL; cr_imgset->_imgs = xmalloc(nr * sizeof(struct cr_img *)); if (cr_imgset->_imgs == NULL) { xfree(cr_imgset); return NULL; } for (i = 0; i < nr; i++) cr_imgset->_imgs[i] = NULL; cr_imgset->fd_nr = nr; return cr_imgset; } static void __close_cr_imgset(struct cr_imgset *cr_imgset) { unsigned int i; if (!cr_imgset) return; for (i = 0; i < cr_imgset->fd_nr; i++) { if (!cr_imgset->_imgs[i]) continue; close_image(cr_imgset->_imgs[i]); cr_imgset->_imgs[i] = NULL; } } void close_cr_imgset(struct cr_imgset **cr_imgset) { if (!cr_imgset || !*cr_imgset) return; __close_cr_imgset(*cr_imgset); xfree((*cr_imgset)->_imgs); xfree(*cr_imgset); *cr_imgset = NULL; } struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, unsigned long flags) { struct cr_imgset *imgset; unsigned int i; imgset = alloc_cr_imgset(to - from); if (!imgset) goto err; from++; imgset->fd_off = from; for (i = from; i < to; i++) { struct cr_img *img; img = open_image(i, flags, pid); if (!img) { if (!(flags & O_CREAT)) /* caller should check himself */ continue; goto err; } imgset->_imgs[i - from] = img; } return imgset; err: close_cr_imgset(&imgset); return NULL; } struct cr_imgset *cr_task_imgset_open(int pid, int mode) { return cr_imgset_open(pid, TASK, mode); } struct cr_imgset *cr_glob_imgset_open(int mode) { return cr_imgset_open(-1 /* ignored */, GLOB, mode); } static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long flags, char *path); struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...) 
{ struct cr_img *img; unsigned long oflags; char path[PATH_MAX]; va_list args; bool lazy = false; if (dfd == -1) { dfd = get_service_fd(IMG_FD_OFF); lazy = (flags & O_CREAT); } img = xmalloc(sizeof(*img)); if (!img) return NULL; oflags = flags | imgset_template[type].oflags; va_start(args, flags); vsnprintf(path, PATH_MAX, imgset_template[type].fmt, args); va_end(args); if (lazy) { img->fd = LAZY_IMG_FD; img->type = type; img->oflags = oflags; img->path = xstrdup(path); return img; } else img->fd = EMPTY_IMG_FD; if (do_open_image(img, dfd, type, oflags, path)) { close_image(img); return NULL; } return img; } static inline u32 head_magic(int oflags) { return oflags & O_SERVICE ? IMG_SERVICE_MAGIC : IMG_COMMON_MAGIC; } static int img_check_magic(struct cr_img *img, int oflags, int type, char *path) { u32 magic; if (read_img(img, &magic) < 0) return -1; if (img_common_magic && (type != CR_FD_INVENTORY)) { if (magic != head_magic(oflags)) { pr_err("Head magic doesn't match for %s\n", path); return -1; } if (read_img(img, &magic) < 0) return -1; } if (magic != imgset_template[type].magic) { pr_err("Magic doesn't match for %s\n", path); return -1; } return 0; } static int img_write_magic(struct cr_img *img, int oflags, int type) { if (img_common_magic && (type != CR_FD_INVENTORY)) { u32 cmagic; cmagic = head_magic(oflags); if (write_img(img, &cmagic)) return -1; } return write_img(img, &imgset_template[type].magic); } static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long oflags, char *path) { int ret, flags; flags = oflags & ~(O_NOBUF | O_SERVICE); ret = openat(dfd, path, flags, CR_FD_PERM); if (ret < 0) { if (!(flags & O_CREAT) && (errno == ENOENT)) { pr_info("No %s image\n", path); img->_x.fd = EMPTY_IMG_FD; goto skip_magic; } pr_perror("Unable to open %s", path); goto err; } img->_x.fd = ret; if (oflags & O_NOBUF) bfd_setraw(&img->_x); else { if (flags == O_RDONLY) ret = bfdopenr(&img->_x); else ret = bfdopenw(&img->_x); if (ret) goto err; } if 
(imgset_template[type].magic == RAW_IMAGE_MAGIC) goto skip_magic; if (flags == O_RDONLY) ret = img_check_magic(img, oflags, type, path); else ret = img_write_magic(img, oflags, type); if (ret) goto err; skip_magic: return 0; err: return -1; } int open_image_lazy(struct cr_img *img) { int dfd; char *path = img->path; img->path = NULL; dfd = get_service_fd(IMG_FD_OFF); if (do_open_image(img, dfd, img->type, img->oflags, path)) { xfree(path); return -1; } xfree(path); return 0; } void close_image(struct cr_img *img) { if (lazy_image(img)) { /* * Remove the image file if it's there so that * subsequent restore doesn't read wrong or fake * data from it. */ unlinkat(get_service_fd(IMG_FD_OFF), img->path, 0); xfree(img->path); } else if (!empty_image(img)) bclose(&img->_x); xfree(img); } struct cr_img *img_from_fd(int fd) { struct cr_img *img; img = xmalloc(sizeof(*img)); if (img) { img->_x.fd = fd; bfd_setraw(&img->_x); } return img; } int open_image_dir(char *dir) { int fd, ret; fd = open(dir, O_RDONLY); if (fd < 0) { pr_perror("Can't open dir %s", dir); return -1; } ret = install_service_fd(IMG_FD_OFF, fd); close(fd); fd = ret; if (opts.img_parent) { ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK); if (ret < 0 && errno != EEXIST) { pr_perror("Can't link parent snapshot"); goto err; } if (opts.img_parent[0] == '/') pr_warn("Absolute paths for parent links " "may not work on restore!\n"); } return 0; err: close_image_dir(); return -1; } void close_image_dir(void) { close_service_fd(IMG_FD_OFF); } static unsigned long page_ids = 1; void up_page_ids_base(void) { /* * When page server and criu dump work on * the same dir, the shmem pagemaps and regular * pagemaps may have IDs conflicts. Fix this by * making page server produce page images with * higher IDs. 
*/ BUG_ON(page_ids != 1); page_ids += 0x10000; } struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi, u32 *id) { if (flags == O_RDONLY || flags == O_RDWR) { PagemapHead *h; if (pb_read_one(pmi, &h, PB_PAGEMAP_HEAD) < 0) return NULL; *id = h->pages_id; pagemap_head__free_unpacked(h, NULL); } else { PagemapHead h = PAGEMAP_HEAD__INIT; *id = h.pages_id = page_ids++; if (pb_write_one(pmi, &h, PB_PAGEMAP_HEAD) < 0) return NULL; } return open_image_at(dfd, CR_FD_PAGES, flags, *id); } struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi, u32 *id) { return open_pages_image_at(get_service_fd(IMG_FD_OFF), flags, pmi, id); } /* * Write buffer @ptr of @size bytes into @fd file * Returns * 0 on success * -1 on error (error message is printed) */ int write_img_buf(struct cr_img *img, const void *ptr, int size) { int ret; ret = bwrite(&img->_x, ptr, size); if (ret == size) return 0; if (ret < 0) pr_perror("Can't write img file"); else pr_err("Img trimmed %d/%d\n", ret, size); return -1; } /* * Read buffer @ptr of @size bytes from @fd file * Returns * 1 on success * 0 on EOF (silently) * -1 on error (error message is printed) */ int read_img_buf_eof(struct cr_img *img, void *ptr, int size) { int ret; ret = bread(&img->_x, ptr, size); if (ret == size) return 1; if (ret == 0) return 0; if (ret < 0) pr_perror("Can't read img file"); else pr_err("Img trimmed %d/%d\n", ret, size); return -1; } /* * Read buffer @ptr of @size bytes from @fd file * Returns * 1 on success * -1 on error or EOF (error message is printed) */ int read_img_buf(struct cr_img *img, void *ptr, int size) { int ret; ret = read_img_buf_eof(img, ptr, size); if (ret == 0) { pr_err("Unexpected EOF\n"); ret = -1; } return ret; } /* * read_img_str -- same as read_img_buf, but allocates memory for * the buffer and puts the '\0' at the end */ int read_img_str(struct cr_img *img, char **pstr, int size) { int ret; char *str; str = xmalloc(size + 1); if (!str) return -1; ret = 
read_img_buf(img, str, size); if (ret < 0) { xfree(str); return -1; } str[size] = '\0'; *pstr = str; return 0; } off_t img_raw_size(struct cr_img *img) { struct stat stat; if (fstat(img->_x.fd, &stat)) { pr_perror("Failed to get image stats"); return -1; } return stat.st_size; } criu-3.6/criu/include/000077500000000000000000000000001317335042600147155ustar00rootroot00000000000000criu-3.6/criu/include/action-scripts.h000066400000000000000000000011411317335042600200250ustar00rootroot00000000000000#ifndef __CR_ACTION_SCRIPTS_H__ #define __CR_ACTION_SCRIPTS_H__ #include "asm/int.h" enum script_actions { ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, ACT_POST_RESTORE, ACT_NET_LOCK, ACT_NET_UNLOCK, ACT_SETUP_NS, ACT_POST_SETUP_NS, ACT_POST_RESUME, ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_MAX }; extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ criu-3.6/criu/include/aio.h000066400000000000000000000020451317335042600156370ustar00rootroot00000000000000#ifndef __CR_AIO_H__ #define __CR_AIO_H__ #include #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); void free_aios(MmEntry *mme); struct parasite_ctl; int parasite_collect_aios(struct parasite_ctl *, struct vm_area_list *); unsigned long aio_rings_args_size(struct vm_area_list *); struct task_restore_args; int prepare_aios(struct pstree_item *t, struct task_restore_args *ta); struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). 
*/ unsigned tail; unsigned magic; unsigned compat_features; unsigned incompat_features; unsigned header_length; /* size of aio_ring */ struct io_event io_events[0]; }; struct rst_aio_ring { unsigned long addr; unsigned long len; unsigned int nr_req; }; #endif /* __CR_AIO_H__ */ criu-3.6/criu/include/asm-generic/000077500000000000000000000000001317335042600171075ustar00rootroot00000000000000criu-3.6/criu/include/asm-generic/int.h000066400000000000000000000004101317335042600200450ustar00rootroot00000000000000#ifndef __CR_INT_H__ #define __CR_INT_H__ #include typedef uint64_t u64; typedef int64_t s64; typedef uint32_t u32; typedef int32_t s32; typedef uint16_t u16; typedef int16_t s16; typedef uint8_t u8; typedef int8_t s8; #endif /* __CR_INT_H__ */ criu-3.6/criu/include/asm-generic/vdso.h000066400000000000000000000006641317335042600202410ustar00rootroot00000000000000#ifndef __CR_ASM_GENERIC_VDSO_H__ #define __CR_ASM_GENERIC_VDSO_H__ #define VDSO_PROT (PROT_READ | PROT_EXEC) #define VVAR_PROT (PROT_READ) /* Just in case of LPAE system PFN is u64. 
*/ #define VDSO_BAD_PFN (-1ull) #define VVAR_BAD_PFN (-1ull) #define VDSO_BAD_ADDR (-1ul) #define VVAR_BAD_ADDR (-1ul) #define VDSO_BAD_SIZE (-1ul) #define VVAR_BAD_SIZE (-1ul) #endif /* __CR_ASM_GENERIC_VDSO_H__ */ criu-3.6/criu/include/atomic.h000066400000000000000000000001361317335042600163420ustar00rootroot00000000000000#ifndef __CR_INC_ATOMIC_H__ #define __CR_INC_ATOMIC_H__ #include "common/asm/atomic.h" #endif criu-3.6/criu/include/autofs.h000066400000000000000000000122071317335042600163710ustar00rootroot00000000000000#ifndef __CR_AUTOFS_H__ #define __CR_AUTOFS_H__ #ifndef AUTOFS_MINOR #define AUTOFS_MINOR 235 #endif #include bool is_autofs_pipe(unsigned long inode); struct mount_info; int autofs_parse(struct mount_info *pm); int autofs_dump(struct mount_info *pm); int autofs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags); #include #include #include #define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 #define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 #define AUTOFS_DEVID_LEN 16 #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) /* * An ioctl interface for autofs mount point control. */ struct args_protover { __u32 version; }; struct args_protosubver { __u32 sub_version; }; struct args_openmount { __u32 devid; }; struct args_ready { __u32 token; }; struct args_fail { __u32 token; __s32 status; }; struct args_setpipefd { __s32 pipefd; }; struct args_timeout { __u64 timeout; }; struct args_requester { __u32 uid; __u32 gid; }; struct args_expire { __u32 how; }; struct args_askumount { __u32 may_umount; }; struct args_ismountpoint { union { struct args_in { __u32 type; } in; struct args_out { __u32 devid; __u32 magic; } out; }; }; /* * All the ioctls use this structure. * When sending a path size must account for the total length * of the chunk of memory otherwise is is the size of the * structure. 
*/ struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; __u32 size; /* total size of data passed in * including this struct */ __s32 ioctlfd; /* automount command fd */ /* Command parameters */ union { struct args_protover protover; struct args_protosubver protosubver; struct args_openmount openmount; struct args_ready ready; struct args_fail fail; struct args_setpipefd setpipefd; struct args_timeout timeout; struct args_requester requester; struct args_expire expire; struct args_askumount askumount; struct args_ismountpoint ismountpoint; }; char path[0]; }; static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) { memset(in, 0, sizeof(struct autofs_dev_ioctl)); in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; in->size = sizeof(struct autofs_dev_ioctl); in->ioctlfd = -1; return; } /* * If you change this make sure you make the corresponding change * to autofs-dev-ioctl.c:lookup_ioctl() */ enum { /* Get various version info */ AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, AUTOFS_DEV_IOCTL_PROTOVER_CMD, AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, /* Open mount ioctl fd */ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, /* Close mount ioctl fd */ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, /* Mount/expire status returns */ AUTOFS_DEV_IOCTL_READY_CMD, AUTOFS_DEV_IOCTL_FAIL_CMD, /* Activate/deactivate autofs mount */ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, AUTOFS_DEV_IOCTL_CATATONIC_CMD, /* Expiry timeout */ AUTOFS_DEV_IOCTL_TIMEOUT_CMD, /* Get mount last requesting uid and gid */ AUTOFS_DEV_IOCTL_REQUESTER_CMD, /* Check for eligible expire candidates */ AUTOFS_DEV_IOCTL_EXPIRE_CMD, /* Request busy status */ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, /* Check if path is a mountpoint */ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, }; #define AUTOFS_IOCTL 0x93 #define AUTOFS_DEV_IOCTL_VERSION \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_PROTOVER \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct 
autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_PROTOSUBVER \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_OPENMOUNT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_READY \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_FAIL \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_SETPIPEFD \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_CATATONIC \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_TIMEOUT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_REQUESTER \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_EXPIRE \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_ASKUMOUNT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) #endif criu-3.6/criu/include/bfd.h000066400000000000000000000014571317335042600156300ustar00rootroot00000000000000#ifndef __CR_BFD_H__ #define __CR_BFD_H__ #include "common/err.h" struct bfd_buf; struct xbuf { char *mem; /* buffer */ char *data; /* position we see bytes at */ unsigned int sz; /* bytes sitting after b->pos */ struct bfd_buf *buf; }; struct bfd { int fd; bool writable; struct xbuf b; }; static inline bool bfd_buffered(struct bfd *b) { return b->b.mem != NULL; } static inline void bfd_setraw(struct bfd *b) { b->b.mem = NULL; } int bfdopenr(struct bfd *f); int bfdopenw(struct bfd *f); 
void bclose(struct bfd *f); char *breadline(struct bfd *f); char *breadchr(struct bfd *f, char c); int bwrite(struct bfd *f, const void *buf, int sz); struct iovec; int bwritev(struct bfd *f, const struct iovec *iov, int cnt); int bread(struct bfd *f, void *buf, int sz); int bfd_flush_images(void); #endif criu-3.6/criu/include/bitmap.h000066400000000000000000000003171317335042600163430ustar00rootroot00000000000000#ifndef __CR_BITMAP_H__ #define __CR_BITMAP_H__ extern void bitmap_set(unsigned long *map, int start, int nr); extern void bitmap_clear(unsigned long *map, int start, int nr); #endif /* __CR_BITMAP_H__ */ criu-3.6/criu/include/bitops.h000066400000000000000000000001321317335042600163620ustar00rootroot00000000000000#ifndef __CR_INC_BITOPS_H__ #define __CR_INC_BITOPS_H__ #include "common/bitops.h" #endif criu-3.6/criu/include/bitsperlong.h000066400000000000000000000001511317335042600174130ustar00rootroot00000000000000#ifndef __CR_INC_BITSPERLONG_H__ #define __CR_INC_BITSPERLONG_H__ #include "common/bitsperlong.h" #endif criu-3.6/criu/include/cgroup-props.h000066400000000000000000000007441317335042600175330ustar00rootroot00000000000000#ifndef __CR_CGROUP_PROPS_H__ #define __CR_CGROUP_PROPS_H__ #include typedef struct { const char *name; size_t nr_props; const char **props; } cgp_t; extern cgp_t cgp_global; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); extern int cgp_init(char *stream, size_t len, char *path); extern void cgp_fini(void); #endif /* __CR_CGROUP_PROPS_H__ */ criu-3.6/criu/include/cgroup.h000066400000000000000000000044031317335042600163660ustar00rootroot00000000000000#ifndef __CR_CGROUP_H__ #define __CR_CGROUP_H__ #include "int.h" #include "images/core.pb-c.h" struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); int 
dump_cgroups(void); int prepare_task_cgroup(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. */ int prepare_cgroup_properties(void); int restore_freezer_state(void); void fini_cgroup(void); struct cg_controller; struct cgroup_prop { char *name; char *value; mode_t mode; uid_t uid; gid_t gid; struct list_head list; }; /* This describes a particular cgroup path, e.g. the '/lxc/u1' part of * 'blkio/lxc/u1' and any properties it has. */ struct cgroup_dir { char *path; mode_t mode; uid_t uid; gid_t gid; struct list_head properties; unsigned int n_properties; /* this is how children are linked together */ struct list_head siblings; /* more cgroup_dirs */ struct list_head children; unsigned int n_children; }; /* This describes a particular cgroup controller, e.g. blkio or cpuset. * The heads are subdirectories organized in their tree format. */ struct cg_controller { unsigned int n_controllers; char **controllers; /* cgroup_dirs */ struct list_head heads; unsigned int n_heads; /* for cgroup list in cgroup.c */ struct list_head l; }; struct cg_controller *new_controller(const char *name); /* parse all global cgroup information into structures */ int parse_cg_info(void); int new_cg_root_add(char *controller, char *newroot); extern struct ns_desc cgroup_ns_desc; /* * This struct describes a group controlled by one controller. * The @name is the controller name or 'name=...' for named cgroups. * The @path is the path from the hierarchy root. 
*/ struct cg_ctl { struct list_head l; char *name; char *path; u32 cgns_prefix; }; /* * Returns the list of cg_ctl-s sorted by name */ struct list_head; struct parasite_dump_cgroup_args; extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); #endif /* __CR_CGROUP_H__ */ criu-3.6/criu/include/clone-noasan.h000066400000000000000000000002341317335042600174420ustar00rootroot00000000000000#ifndef __CR_CLONE_NOASAN_H__ #define __CR_CLONE_NOASAN_H__ int clone_noasan(int (*fn)(void *), int flags, void *arg); #endif /* __CR_CLONE_NOASAN_H__ */ criu-3.6/criu/include/cpu.h000066400000000000000000000004031317335042600156520ustar00rootroot00000000000000#ifndef __CR_CPU_H__ #define __CR_CPU_H__ #include extern int cpu_init(void); extern int cpu_dump_cpuinfo(void); extern int cpu_validate_cpuinfo(void); extern int cpuinfo_dump(void); extern int cpuinfo_check(void); #endif /* __CR_CPU_H__ */ criu-3.6/criu/include/cr-errno.h000066400000000000000000000007171317335042600166220ustar00rootroot00000000000000#ifndef __CR_ERRNO_H__ #define __CR_ERRNO_H__ void set_cr_errno(int err); int get_cr_errno(void); /* * List of symbolic error names: * ESRCH - no process can be found corresponding to that specified by pid * EEXIST - process with such pid already exists * EBADRQC - bad options */ #define set_task_cr_err(new_err) atomic_cmpxchg(&task_entries->cr_err, 0, new_err) #define get_task_cr_err() atomic_read(&task_entries->cr_err) #endif /* __CR_ERRNO_H__ */ criu-3.6/criu/include/cr-service-const.h000066400000000000000000000002371317335042600202560ustar00rootroot00000000000000#ifndef __CR_SERVICE_CONST_H__ #define __CR_SERVICE_CONST_H__ #define CR_DEFAULT_SERVICE_ADDRESS "./criu_service.socket" #endif /* __CR_SERVICE_CONST_H__ */ 
criu-3.6/criu/include/cr-service.h000066400000000000000000000005351317335042600171330ustar00rootroot00000000000000#ifndef __CR_SERVICE_H__ #define __CR_SERVICE_H__ #include "images/rpc.pb-c.h" extern int cr_service(bool deamon_mode); int cr_service_work(int sk); extern int send_criu_dump_resp(int socket_fd, bool success, bool restored); extern struct _cr_service_client *cr_service_client; extern unsigned int service_sk_ino; #endif /* __CR_SERVICE_H__ */ criu-3.6/criu/include/cr_options.h000066400000000000000000000056271317335042600172570ustar00rootroot00000000000000#ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ #include #include "config.h" #include "common/list.h" /* * CPU capability options. */ #define CPU_CAP_NONE (0u) #define CPU_CAP_ALL (-1u) #define CPU_CAP_FPU (1u) /* Only FPU capability required */ #define CPU_CAP_CPU (2u) /* Strict CPU capability required */ #define CPU_CAP_INS (4u) /* Instructions CPU capatibility */ #define CPU_CAP_DEFAULT (CPU_CAP_FPU) struct cg_root_opt { struct list_head node; char *controller; char *newroot; }; /* * Cgroup management options. */ #define CG_MODE_IGNORE (0u << 0) /* Zero is important here */ #define CG_MODE_NONE (1u << 0) #define CG_MODE_PROPS (1u << 1) #define CG_MODE_SOFT (1u << 2) #define CG_MODE_FULL (1u << 3) #define CG_MODE_STRICT (1u << 4) #define CG_MODE_DEFAULT (CG_MODE_SOFT) /* * Ghost file size we allow to carry by default. 
*/ #define DEFAULT_GHOST_LIMIT (1 << 20) #define DEFAULT_TIMEOUT 10 struct irmap; struct irmap_path_opt { struct list_head node; struct irmap *ir; }; struct cr_options { int final_state; char *show_dump_file; char *show_fmt; int check_extra_features; int check_experimental_features; bool show_pages_content; union { int restore_detach; bool daemon_mode; }; int restore_sibling; bool ext_unix_sk; int shell_job; int handle_file_locks; int tcp_established_ok; int tcp_close; int evasive_devices; int link_remap_ok; int log_file_per_pid; bool swrk_restore; char *output; char *root; char *pidfile; char *freeze_cgroup; struct list_head ext_mounts; struct list_head inherit_fds; struct list_head external; struct list_head join_ns; char *libdir; int use_page_server; unsigned short port; char *addr; int ps_socket; int track_mem; char *img_parent; int auto_dedup; unsigned int cpu_cap; int force_irmap; char **exec_cmd; unsigned int manage_cgroups; char *new_global_cg_root; char *cgroup_props; char *cgroup_props_file; struct list_head new_cgroup_roots; bool autodetect_ext_mounts; int enable_external_sharing; int enable_external_masters; bool aufs; /* auto-deteced, not via cli */ bool overlayfs; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED bool has_binfmt_misc; /* auto-detected */ #endif size_t ghost_limit; struct list_head irmap_scan_paths; bool lsm_supplied; char *lsm_profile; unsigned int timeout; unsigned int empty_ns; int tcp_skip_in_flight; bool lazy_pages; char *work_dir; /* * When we scheduler for removal some functionality we first * deprecate it and it sits in criu for some time. By default * the deprecated stuff is not working, but it's still possible * to turn one ON while the code is in. 
*/ int deprecated_ok; int display_stats; int weak_sysctls; int status_fd; bool orphan_pts_master; }; extern struct cr_options opts; extern void init_opts(void); #endif /* __CR_OPTIONS_H__ */ criu-3.6/criu/include/criu-log.h000066400000000000000000000027671317335042600166230ustar00rootroot00000000000000/* This file defines types and macros for CRIU plugins. Copyright (C) 2013 Parallels, Inc This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __CRIU_LOG_H__ #define __CRIU_LOG_H__ #include "log.h" extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(void); extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); extern void log_set_fd(int fd); extern int log_get_fd(void); extern void log_set_loglevel(unsigned int loglevel); extern unsigned int log_get_loglevel(void); extern void log_get_logstart(struct timeval *); extern int write_pidfile(int pid); #define DEFAULT_LOG_FILENAME "criu.log" static inline int pr_quelled(unsigned int loglevel) { return log_get_loglevel() < loglevel && loglevel != LOG_MSG; } #endif /* __CR_LOG_LEVELS_H__ */ criu-3.6/criu/include/criu-plugin.h000066400000000000000000000103321317335042600173230ustar00rootroot00000000000000/* * This file defines types and macros for CRIU plugins. 
* Copyright (C) 2013-2014 Parallels, Inc * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __CRIU_PLUGIN_H__ #define __CRIU_PLUGIN_H__ #include #include #define CRIU_PLUGIN_GEN_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) #define CRIU_PLUGIN_VERSION_MAJOR 0 #define CRIU_PLUGIN_VERSION_MINOR 2 #define CRIU_PLUGIN_VERSION_SUBLEVEL 0 #define CRIU_PLUGIN_VERSION_OLD CRIU_PLUGIN_GEN_VERSION(0,1,0) #define CRIU_PLUGIN_VERSION \ CRIU_PLUGIN_GEN_VERSION(CRIU_PLUGIN_VERSION_MAJOR, \ CRIU_PLUGIN_VERSION_MINOR, \ CRIU_PLUGIN_VERSION_SUBLEVEL) /* * Plugin hook points and their arguments in hooks. */ enum { CR_PLUGIN_HOOK__DUMP_UNIX_SK = 0, CR_PLUGIN_HOOK__RESTORE_UNIX_SK = 1, CR_PLUGIN_HOOK__DUMP_EXT_FILE = 2, CR_PLUGIN_HOOK__RESTORE_EXT_FILE = 3, CR_PLUGIN_HOOK__DUMP_EXT_MOUNT = 4, CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT = 5, CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6, CR_PLUGIN_HOOK__MAX }; #define DECLARE_PLUGIN_HOOK_ARGS(__hook, ...) 
\ typedef int (__hook ##_t)(__VA_ARGS__) DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); enum { CR_PLUGIN_STAGE__DUMP, CR_PLUGIN_STAGE__PRE_DUMP, CR_PLUGIN_STAGE__RESTORE, CR_PLUGIN_STAGE_MAX }; /* * Plugin descriptor. */ typedef struct { const char *name; int (*init)(int stage); void (*exit)(int stage, int ret); unsigned int version; unsigned int max_hooks; void *hooks[CR_PLUGIN_HOOK__MAX]; } cr_plugin_desc_t; extern cr_plugin_desc_t CR_PLUGIN_DESC; #define CR_PLUGIN_REGISTER(___name, ___init, ___exit) \ cr_plugin_desc_t CR_PLUGIN_DESC = { \ .name = ___name, \ .init = ___init, \ .exit = ___exit, \ .version = CRIU_PLUGIN_VERSION, \ .max_hooks = CR_PLUGIN_HOOK__MAX, \ }; static inline int cr_plugin_dummy_init(int stage) { return 0; } static inline void cr_plugin_dummy_exit(int stage, int ret) { } #define CR_PLUGIN_REGISTER_DUMMY(___name) \ cr_plugin_desc_t CR_PLUGIN_DESC = { \ .name = ___name, \ .init = cr_plugin_dummy_init, \ .exit = cr_plugin_dummy_exit, \ .version = CRIU_PLUGIN_VERSION, \ .max_hooks = CR_PLUGIN_HOOK__MAX, \ }; #define CR_PLUGIN_REGISTER_HOOK(__hook, __func) \ static void __attribute__((constructor)) cr_plugin_register_hook_##__func (void) \ { \ CR_PLUGIN_DESC.hooks[__hook] = (void *)__func; \ } /* Public API */ extern int criu_get_image_dir(void); /* * Deprecated, will be removed in next version. 
*/ typedef int (cr_plugin_init_t)(void); typedef void (cr_plugin_fini_t)(void); typedef int (cr_plugin_dump_unix_sk_t)(int fd, int id); typedef int (cr_plugin_restore_unix_sk_t)(int id); typedef int (cr_plugin_dump_file_t)(int fd, int id); typedef int (cr_plugin_restore_file_t)(int id); typedef int (cr_plugin_dump_ext_mount_t)(char *mountpoint, int id); typedef int (cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file); typedef int (cr_plugin_dump_ext_link_t)(int index, int type, char *kind); #endif /* __CRIU_PLUGIN_H__ */ criu-3.6/criu/include/crtools.h000066400000000000000000000023731317335042600165600ustar00rootroot00000000000000#ifndef __CR_CRTOOLS_H__ #define __CR_CRTOOLS_H__ #include #include "common/list.h" #include "servicefd.h" #include "images/inventory.pb-c.h" #define CR_FD_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) extern int check_img_inventory(void); extern int write_img_inventory(InventoryEntry *he); extern int prepare_inventory(InventoryEntry *he); struct pprep_head { int (*actor)(struct pprep_head *); struct pprep_head *next; }; extern void add_post_prepare_cb(struct pprep_head *); extern bool deprecated_ok(char *what); extern int cr_dump_tasks(pid_t pid); extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); extern int check_add_feature(char *arg); extern void pr_check_features(const char *offset, const char *sep, int width); #define PPREP_HEAD_INACTIVE ((struct pprep_head *)-1) #define add_post_prepare_cb_once(phead) do { \ if ((phead)->next == PPREP_HEAD_INACTIVE)\ add_post_prepare_cb(phead); \ } while (0) #define MAKE_PPREP_HEAD(name) struct pprep_head name = { \ .next = PPREP_HEAD_INACTIVE, \ .actor = name##_cb, \ } #endif /* __CR_CRTOOLS_H__ */ 
criu-3.6/criu/include/dump.h000066400000000000000000000002411317335042600160300ustar00rootroot00000000000000#ifndef __CR_INC_DUMP_H__ #define __CR_INC_DUMP_H__ #include "asm/dump.h" extern int arch_set_thread_regs(struct pstree_item *item, bool with_threads); #endif criu-3.6/criu/include/eventfd.h000066400000000000000000000003571317335042600165260ustar00rootroot00000000000000#ifndef __CR_EVENTFD_H__ #define __CR_EVENTFD_H__ #include "files.h" extern int is_eventfd_link(char *link); extern const struct fdtype_ops eventfd_dump_ops; extern struct collect_image_info eventfd_cinfo; #endif /* __CR_EVENTFD_H__ */ criu-3.6/criu/include/eventpoll.h000066400000000000000000000004511317335042600170760ustar00rootroot00000000000000#ifndef __CR_EVENTPOLL_H__ #define __CR_EVENTPOLL_H__ #include "files.h" extern int is_eventpoll_link(char *link); extern const struct fdtype_ops eventpoll_dump_ops; extern struct collect_image_info epoll_tfd_cinfo; extern struct collect_image_info epoll_cinfo; #endif /* __CR_EVENTPOLL_H__ */ criu-3.6/criu/include/external.h000066400000000000000000000011301317335042600167030ustar00rootroot00000000000000#ifndef __CR_EXTERNAL_H__ #define __CR_EXTERNAL_H__ struct external { struct list_head node; char *id; void *data; }; extern int add_external(char *key); extern bool external_lookup_id(char *id); extern char *external_lookup_by_key(char *id); extern void *external_lookup_data(char *id); extern int external_for_each_type(char *type, int (*cb)(struct external *, void *), void *arg); static inline char *external_val(struct external *e) { char *aux; aux = strchr(e->id, '['); if (aux) { aux = strchr(aux + 1, ']'); if (aux && aux[1] == ':') return aux + 2; } return NULL; } #endif criu-3.6/criu/include/fault-injection.h000066400000000000000000000016521317335042600201650ustar00rootroot00000000000000#ifndef __CR_FAULT_INJECTION_H__ #define __CR_FAULT_INJECTION_H__ #include enum faults { FI_NONE = 0, FI_DUMP_EARLY, FI_RESTORE_ROOT_ONLY, FI_DUMP_PAGES, 
FI_RESTORE_OPEN_LINK_REMAP, FI_PARASITE_CONNECT, FI_POST_RESTORE, /* not fatal */ FI_VDSO_TRAMPOLINES = 127, FI_CHECK_OPEN_HANDLE = 128, FI_NO_MEMFD = 129, FI_NO_BREAKPOINTS = 130, FI_PARTIAL_PAGES = 131, FI_MAX, }; static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { /* * Temporary workaround for Xen guests. Breakpoints degrade * performance linearly, so until we find out the reason, * let's disable them. */ if (f == FI_NO_BREAKPOINTS) return true; return fi_strategy == f; } #ifndef CR_NOGLIBC extern enum faults fi_strategy; #define fault_injected(f) __fault_injected(f, fi_strategy) extern int fault_injection_init(void); #else /* CR_NOGLIBC */ extern bool fault_injected(enum faults f); #endif #endif criu-3.6/criu/include/fcntl.h000066400000000000000000000012441317335042600161750ustar00rootroot00000000000000#ifndef __CR_ASM_GENERIC_FCNTL_H__ #define __CR_ASM_GENERIC_FCNTL_H__ #include #include #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 #define F_GETOWN_EX 16 struct f_owner_ex { int type; pid_t pid; }; #endif #ifndef F_GETOWNER_UIDS #define F_GETOWNER_UIDS 17 #endif /* * These things are required to compile on CentOS-6 */ #ifndef F_LINUX_SPECIFIC_BASE # define F_LINUX_SPECIFIC_BASE 1024 #endif #ifndef F_SETPIPE_SZ # define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) #endif #ifndef F_GETPIPE_SZ # define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) #endif #ifndef O_PATH # define O_PATH 010000000 #endif #endif /* __CR_ASM_GENERIC_FCNTL_H__ */ criu-3.6/criu/include/fdinfo.h000066400000000000000000000007041317335042600163340ustar00rootroot00000000000000#ifndef __CR_FDINFO_H__ #define __CR_FDINFO_H__ #include "common/list.h" #include "images/eventfd.pb-c.h" #include "images/eventpoll.pb-c.h" #include "images/signalfd.pb-c.h" #include "images/fsnotify.pb-c.h" #include "images/timerfd.pb-c.h" struct fdinfo_common { off64_t pos; int flags; int mnt_id; int owner; }; extern int parse_fdinfo(int fd, int type, void *arg); extern int parse_fdinfo_pid(int pid, 
int fd, int type, void *arg); #endif criu-3.6/criu/include/fdstore.h000066400000000000000000000005241317335042600165350ustar00rootroot00000000000000#ifndef __CRIU_FDSTORE_H__ #define __CRIU_FDSTORE_H__ /* * fdstore is a storage for file descriptors which is shared * between processes. */ int fdstore_init(void); /* Add a file descriptor to the storage and return its id */ int fdstore_add(int fd); /* Get a file descriptor from a storage by id */ int fdstore_get(int id); #endif criu-3.6/criu/include/fifo.h000066400000000000000000000003721317335042600160130ustar00rootroot00000000000000#ifndef __CR_FIFO_H__ #define __CR_FIFO_H__ struct fd_parms; struct cr_imgset; extern const struct fdtype_ops fifo_dump_ops; extern struct collect_image_info fifo_cinfo; extern struct collect_image_info fifo_data_cinfo; #endif /* __CR_FIFO_H__ */ criu-3.6/criu/include/file-ids.h000066400000000000000000000006451317335042600165670ustar00rootroot00000000000000#ifndef __CR_FILE_IDS_H__ #define __CR_FILE_IDS_H__ #include "common/compiler.h" #include "rbtree.h" #include "images/fdinfo.pb-c.h" #define FD_PID_INVALID (-2U) #define FD_DESC_INVALID (-3U) struct fdinfo_entry; struct stat; struct fd_parms; extern int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p); extern int fd_id_generate_special(struct fd_parms *p, u32 *id); #endif /* __CR_FILE_IDS_H__ */ criu-3.6/criu/include/file-lock.h000066400000000000000000000031761317335042600167420ustar00rootroot00000000000000#ifndef __FILE_LOCK_H__ #define __FILE_LOCK_H__ #include "common/list.h" #include "protobuf.h" #include "images/file-lock.pb-c.h" #define FL_UNKNOWN -1 #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_OFD 4 /* for posix fcntl() and lockf() */ #ifndef F_RDLCK #define F_RDLCK 0 #define F_WRLCK 1 #define F_UNLCK 2 #endif /* for OFD locks fcntl() */ #ifndef F_OFD_GETLK #define F_OFD_GETLK 36 #define F_OFD_SETLK 37 #define F_OFD_SETLKW 38 #endif /* operations for bsd flock(), also used by the kernel implementation */ #define 
LOCK_SH 1 /* shared lock */ #define LOCK_EX 2 /* exclusive lock */ #define LOCK_NB 4 /* or'd with one of the above to prevent blocking */ #define LOCK_UN 8 /* remove lock */ #define LOCK_MAND 32 /* This is a mandatory flock ... */ #define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ #define LOCK_RW 192 /* which allows concurrent read & write ops */ struct file_lock { long long fl_id; int fl_kind; int fl_ltype; pid_t fl_owner; int maj, min; unsigned long i_no; long long start; char end[32]; struct list_head list; /* list of all file locks */ int real_owner; int owners_fd; }; extern struct list_head file_lock_list; extern struct file_lock *alloc_file_lock(void); extern void free_file_locks(void); extern int prepare_file_locks(int pid); extern struct collect_image_info file_locks_cinfo; struct pid; struct fd_parms; extern int note_file_lock(struct pid *, int fd, int lfd, struct fd_parms *); extern int dump_file_locks(void); #define OPT_FILE_LOCKS "file-locks" #endif /* __FILE_LOCK_H__ */ criu-3.6/criu/include/files-reg.h000066400000000000000000000030541317335042600167450ustar00rootroot00000000000000#ifndef __CR_FILES_REG_H__ #define __CR_FILES_REG_H__ #include "files.h" #include "images/regfile.pb-c.h" #include "images/ghost-file.pb-c.h" struct cr_imgset; struct fd_parms; struct file_remap { char *rpath; bool is_dir; int rmnt_id; uid_t uid; gid_t gid; }; struct reg_file_info { struct file_desc d; RegFileEntry *rfe; struct file_remap *remap; bool size_mode_checked; bool is_dir; char *path; }; extern int open_reg_by_id(u32 id); extern int open_reg_fd(struct file_desc *); extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, struct reg_file_info *, void *), void *arg); extern void clear_ghost_files(void); extern const struct fdtype_ops regfile_dump_ops; extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); extern int 
dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p); extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino); extern struct file_desc *try_collect_special_file(u32 id, int optional); #define collect_special_file(id) try_collect_special_file(id, 0) extern int collect_filemap(struct vma_area *); extern void filemap_ctx_init(bool auto_close); extern void filemap_ctx_fini(void); extern struct collect_image_info reg_file_cinfo; extern int collect_remaps_and_regfiles(void); extern void delete_link_remaps(void); extern void free_link_remaps(void); extern int prepare_remaps(void); extern int try_clean_remaps(bool only_ghosts); extern int strip_deleted(struct fd_link *link); extern int dead_pid_conflict(void); #endif /* __CR_FILES_REG_H__ */ criu-3.6/criu/include/files.h000066400000000000000000000133011317335042600161660ustar00rootroot00000000000000#ifndef __CR_FILES_H__ #define __CR_FILES_H__ #include #include "int.h" #include "common/compiler.h" #include "fcntl.h" #include "common/lock.h" #include "common/list.h" #include "pid.h" #include "rst_info.h" #include "images/fdinfo.pb-c.h" #include "images/fown.pb-c.h" #include "images/vma.pb-c.h" struct pstree_item; struct file_desc; struct cr_imgset; struct rst_info; struct parasite_ctl; struct fd_link { union { /* Link info for generic file (path) */ struct { char name[PATH_MAX + 1]; size_t len; }; /* Link info for proc-ns file */ struct { struct ns_desc *ns_d; unsigned int ns_kid; }; }; }; struct fd_parms { int fd; off_t pos; unsigned int flags; char fd_flags; struct stat stat; pid_t pid; FownEntry fown; struct fd_link *link; long fs_type; int mnt_id; struct parasite_ctl *fd_ctl; }; #define FD_PARMS_INIT \ (struct fd_parms) { \ .fd = FD_DESC_INVALID, \ .fown = FOWN_ENTRY__INIT, \ .link = NULL, \ .mnt_id = -1, \ } extern int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link); struct file_desc; enum { FLE_INITIALIZED, /* * FLE is open (via open() or socket() or etc syscalls), and * common file 
setting are set up (type-specific are not yet). * Most possible, the master was already served out. */ FLE_OPEN, /* * File-type specific settings and preparations are finished, * and FLE is completely restored. */ FLE_RESTORED, }; struct fdinfo_list_entry { struct list_head desc_list; /* To chain on @fd_info_head */ struct file_desc *desc; /* Associated file descriptor */ struct list_head ps_list; /* To chain per-task files */ struct pstree_item *task; FdinfoEntry *fe; int pid; u8 received:1; u8 stage:3; u8 fake:1; }; /* reports whether fd_a takes prio over fd_b */ static inline int fdinfo_rst_prio(struct fdinfo_list_entry *fd_a, struct fdinfo_list_entry *fd_b) { return pid_rst_prio(fd_a->pid, fd_b->pid) || ((fd_a->pid == fd_b->pid) && (fd_a->fe->fd < fd_b->fe->fd)); } struct file_desc_ops { /* fd_types from images/fdinfo.proto */ unsigned int type; /* * Opens a file by whatever syscall is required for that. * The returned descriptor may be closed (dup2-ed to another) * so it shouldn't be saved for any post-actions. 
*/ int (*open)(struct file_desc *d, int *new_fd); char * (*name)(struct file_desc *, char *b, size_t s); }; int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool ghost); void collect_task_fd(struct fdinfo_list_entry *new_fle, struct rst_info *ri); struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info *rst_info, struct file_desc *fdesc, bool fake); unsigned int find_unused_fd(struct pstree_item *, int hint_fd); struct fdinfo_list_entry *find_used_fd(struct pstree_item *, int fd); struct file_desc { u32 id; /* File id, unique */ struct hlist_node hash; /* Descriptor hashing and lookup */ struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ struct file_desc_ops *ops; /* Associated operations */ struct list_head fake_master_list;/* To chain in the list of file_desc, which don't have a fle in a task, that having permissions */ }; struct fdtype_ops { unsigned int type; int (*dump)(int lfd, u32 id, const struct fd_parms *p); int (*pre_dump)(int pid, int lfd); }; struct cr_img; extern int dump_my_file(int lfd, u32 *, int *type); extern int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e); struct parasite_drain_fd; int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds); int predump_task_files(int pid); extern void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops); extern int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops); extern struct fdinfo_list_entry *try_file_master(struct file_desc *d); extern struct fdinfo_list_entry *file_master(struct file_desc *d); extern struct file_desc *find_file_desc_raw(int type, u32 id); extern int recv_desc_from_peer(struct file_desc *d, int *fd); extern int send_desc_to_peer(int fd, struct file_desc *d); extern int restore_fown(int fd, FownEntry *fown); extern int rst_file_params(int fd, FownEntry 
*fown, int flags); extern void show_saved_files(void); extern int prepare_fds(struct pstree_item *me); extern int prepare_fd_pid(struct pstree_item *me); extern int prepare_ctl_tty(int pid, struct rst_info *rst_info, u32 ctl_tty_id); extern int prepare_files(void); extern int restore_fs(struct pstree_item *); extern int prepare_fs_pid(struct pstree_item *); extern int set_fd_flags(int fd, int flags); extern struct collect_image_info files_cinfo; #define files_collected() (files_cinfo.flags & COLLECT_HAPPENED) extern int close_old_fds(void); #ifndef AT_EMPTY_PATH #define AT_EMPTY_PATH 0x1000 #endif #define LREMAP_PARAM "link-remap" extern int shared_fdt_prepare(struct pstree_item *item); extern struct collect_image_info ext_file_cinfo; extern int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *); extern int inherit_fd_parse(char *optarg); extern int inherit_fd_add(int fd, char *key); extern void inherit_fd_log(void); extern int inherit_fd_resolve_clash(int fd); extern int inherit_fd_fini(void); extern int inherit_fd_lookup_id(char *id); extern bool inherited_fd(struct file_desc *, int *fdp); extern FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags); int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, int fd, unsigned flags); extern int open_transport_socket(void); extern int set_fds_event(pid_t virt); extern void wait_fds_event(void); #endif /* __CR_FILES_H__ */ criu-3.6/criu/include/filesystems.h000066400000000000000000000016731317335042600174440ustar00rootroot00000000000000#ifndef __CR_FILESYSTEMS_H__ #define __CR_FILESYSTEMS_H__ extern struct fstype *find_fstype_by_name(char *fst); extern struct fstype *decode_fstype(u32 fst); extern bool add_fsname_auto(const char *names); struct mount_info; typedef int (*mount_fn_t)(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags); struct fstype { char *name; int code; int (*dump)(struct mount_info *pm); int (*restore)(struct 
mount_info *pm); int (*check_bindmount)(struct mount_info *pm); int (*parse)(struct mount_info *pm); int (*collect)(struct mount_info *pm); bool (*sb_equal)(struct mount_info *a, struct mount_info *b); mount_fn_t mount; }; extern struct fstype *fstype_auto(void); /* callback for AUFS support */ extern int aufs_parse(struct mount_info *mi); /* callback for OverlayFS support */ extern int overlayfs_parse(struct mount_info *mi); /* FIXME -- remove */ extern struct list_head binfmt_misc_list; #endif criu-3.6/criu/include/fs-magic.h000066400000000000000000000016661317335042600165650ustar00rootroot00000000000000#ifndef __CR_FS_MAGIC_H__ #define __CR_FS_MAGIC_H__ #include /* * Gather magic numbers in case if distros * do not provide appropriate entry in * linux/magic.h. */ #ifndef NFS_SUPER_MAGIC # define NFS_SUPER_MAGIC 0x6969 #endif #ifndef PIPEFS_MAGIC # define PIPEFS_MAGIC 0x50495045 #endif #ifndef ANON_INODE_FS_MAGIC # define ANON_INODE_FS_MAGIC 0x09041934 #endif #ifndef TMPFS_MAGIC # define TMPFS_MAGIC 0x01021994 #endif #ifndef SOCKFS_MAGIC # define SOCKFS_MAGIC 0x534f434b #endif #ifndef DEVPTS_SUPER_MAGIC #define DEVPTS_SUPER_MAGIC 0x1cd1 #endif #ifndef BTRFS_SUPER_MAGIC #define BTRFS_SUPER_MAGIC 0x9123683E #endif #ifndef AUFS_SUPER_MAGIC #define AUFS_SUPER_MAGIC 0x61756673 #endif #ifndef PROC_SUPER_MAGIC #define PROC_SUPER_MAGIC 0x9fa0 #endif #ifndef BINFMTFS_MAGIC #define BINFMTFS_MAGIC 0x42494e4d #endif #ifndef AUTOFS_SUPER_MAGIC #define AUTOFS_SUPER_MAGIC 0x0187 #endif #endif /* __CR_FS_MAGIC_H__ */ criu-3.6/criu/include/fsnotify.h000066400000000000000000000011171317335042600167270ustar00rootroot00000000000000#ifndef __CR_FSNOTIFY_H__ #define __CR_FSNOTIFY_H__ #include "files.h" #include "protobuf.h" #include "images/fsnotify.pb-c.h" #define KERNEL_FS_EVENT_ON_CHILD 0x08000000 extern int is_inotify_link(char *link); extern int is_fanotify_link(char *link); extern const struct fdtype_ops inotify_dump_ops; extern const struct fdtype_ops fanotify_dump_ops; extern 
/*
 * Image type identifiers.
 *
 * The enumerators are implicitly numbered, so their order defines the
 * numeric id of every image type; CR_FD_MAX, being last, is the number
 * of ids and sizes imgset_template[].
 *
 * The _CR_FD_<NAME>_FROM / _CR_FD_<NAME>_TO pairs are not image types
 * themselves: they bracket a contiguous group of types so that the
 * group can be addressed as a range (see the cr_imgset_open() macro,
 * which pastes these names together).
 */
enum {
	CR_FD_INVENTORY,
	CR_FD_STATS,
	/*
	 * Task entries
	 */

	_CR_FD_TASK_FROM,
	CR_FD_CORE,
	CR_FD_IDS,
	CR_FD_MM,
	CR_FD_CREDS,
	CR_FD_FS,
	_CR_FD_TASK_TO,

	CR_FD_PAGEMAP,

	/*
	 * NS entries
	 */
	CR_FD_UTSNS,
	CR_FD_MNTS,
	CR_FD_USERNS,

	/* IPC namespace group */
	_CR_FD_IPCNS_FROM,
	CR_FD_IPC_VAR,
	CR_FD_IPCNS_SHM,
	CR_FD_IPCNS_MSG,
	CR_FD_IPCNS_SEM,
	_CR_FD_IPCNS_TO,

	/* Network namespace group */
	_CR_FD_NETNS_FROM,
	CR_FD_NETDEV,
	CR_FD_IFADDR,
	CR_FD_ROUTE,
	CR_FD_ROUTE6,
	CR_FD_RULE,
	CR_FD_IPTABLES,
	CR_FD_IP6TABLES,
	CR_FD_NETNS,
	CR_FD_NETNF_CT,
	CR_FD_NETNF_EXP,
	_CR_FD_NETNS_TO,

	CR_FD_PSTREE,
	CR_FD_SHMEM_PAGEMAP,
	CR_FD_GHOST_FILE,
	CR_FD_TCP_STREAM,
	CR_FD_FDINFO,

	/* "Glob" group, opened as one set */
	_CR_FD_GLOB_FROM,
	CR_FD_FILES,
	CR_FD_SK_QUEUES,
	CR_FD_PIPES_DATA,
	CR_FD_FIFO_DATA,
	CR_FD_TTY_INFO,
	CR_FD_TTY_DATA,
	CR_FD_REMAP_FPATH,
	CR_FD_CGROUP,
	CR_FD_FILE_LOCKS,
	CR_FD_SECCOMP,
	_CR_FD_GLOB_TO,

	CR_FD_TMPFS_IMG,
	CR_FD_TMPFS_DEV,
	CR_FD_BINFMT_MISC,
	CR_FD_BINFMT_MISC_OLD,
	CR_FD_PAGES,

	CR_FD_SIGACT,
	CR_FD_VMAS,
	CR_FD_PAGES_OLD,
	CR_FD_SHM_PAGES_OLD,
	CR_FD_RLIMIT,
	CR_FD_ITIMERS,
	CR_FD_POSIX_TIMERS,
	CR_FD_FILE_LOCKS_PID,

	CR_FD_IRMAP_CACHE,
	CR_FD_CPUINFO,

	CR_FD_SIGNAL,
	CR_FD_PSIGNAL,
	CR_FD_INOTIFY_WD,
	CR_FD_FANOTIFY_MARK,
	CR_FD_EVENTPOLL_TFD,
	CR_FD_REG_FILES,
	CR_FD_INETSK,
	CR_FD_NS_FILES,
	CR_FD_PACKETSK,
	CR_FD_NETLINK_SK,
	CR_FD_EVENTFD_FILE,
	CR_FD_EVENTPOLL_FILE,
	CR_FD_SIGNALFD,
	CR_FD_TUNFILE,
	CR_FD_TIMERFD,
	CR_FD_INOTIFY_FILE,
	CR_FD_FANOTIFY_FILE,
	CR_FD_EXT_FILES,
	CR_FD_UNIXSK,
	CR_FD_FIFO,
	CR_FD_PIPES,
	CR_FD_TTY_FILES,

	CR_FD_AUTOFS,

	CR_FD_MAX
};
/* Size of one page as stored in the image files. */
#ifdef _ARCH_PPC64
#define PAGE_IMAGE_SIZE	65536
#else
#define PAGE_IMAGE_SIZE	4096
#endif /* _ARCH_PPC64 */

#define PAGE_RSS	1
#define PAGE_ANON	2

/*
 * Top bit set in the tgt id means we've remapped
 * to a ghost file.
 *
 * The constant is unsigned on purpose: shifting a signed 1 into bit 31
 * is undefined behaviour for a 32-bit int, and the resulting negative
 * value would sign-extend when combined with a 64-bit operand.
 */
#define REMAP_GHOST	(1U << 31)

/*
 * VMA_AREA status:
 *
 *  - none
 *	VmaEntry is just allocated and has not been used
 *	for anything yet
 *  - regular
 *	VmaEntry represents some memory area which should be
 *	dumped and restored; this is a general sign that we
 *	should not skip the area content from processing, in
 *	contrast to special areas such as vsyscall
 *  - stack
 *	the memory area is used as application stack, so we
 *	should be careful about the guard page here
 *  - vsyscall
 *	special memory area injected into the task memory
 *	space by the kernel itself; it represents the virtual
 *	syscall implementation and is specific to every kernel
 *	version, its contents should not be dumped ever
 *  - vdso,vvar
 *	the vDSO area, it might require additional memory
 *	contents modification, especially when tasks are
 *	migrating between different kernel versions
 *  - heap
 *	"heap" area in application, currently for information only
 *  - file private
 *	stands for privately memory mapped files
 *  - file shared
 *	stands for shared memory mapped files
 *  - anon shared
 *	represents shared anonymous memory areas
 *  - anon private
 *	represents private anonymous memory areas
 *  - SysV IPC
 *	IPC shared memory area
 *  - socket
 *	memory map for socket
 *  - AIO ring
 *	memory area that serves AIO buffers
 *  - unsupported
 *	stands for any unknown memory areas, usually means
 *	we don't know how to work with it and should stop
 *	processing, exiting with an error; while the rest of the
 *	bits are part of the image ABI, this particular one must
 *	never be used in an image.
 */
#define VMA_AREA_NONE		(0 <<  0)
#define VMA_AREA_REGULAR	(1 <<  0)
#define VMA_AREA_STACK		(1 <<  1)
#define VMA_AREA_VSYSCALL	(1 <<  2)
#define VMA_AREA_VDSO		(1 <<  3)
/* bit 4 is not assigned in this list */
#define VMA_AREA_HEAP		(1 <<  5)

#define VMA_FILE_PRIVATE	(1 <<  6)
#define VMA_FILE_SHARED		(1 <<  7)
#define VMA_ANON_SHARED		(1 <<  8)
#define VMA_ANON_PRIVATE	(1 <<  9)

#define VMA_AREA_SYSVIPC	(1 <<  10)
#define VMA_AREA_SOCKET		(1 <<  11)
#define VMA_AREA_VVAR		(1 <<  12)
#define VMA_AREA_AIORING	(1 <<  13)

#define VMA_CLOSE		(1 <<  28)
#define VMA_NO_PROT_WRITE	(1 <<  29)
#define VMA_PREMMAPED		(1 <<  30)
/* Bit 31 must be built from an unsigned 1, see REMAP_GHOST above. */
#define VMA_UNSUPP		(1U << 31)
/*
 * A group of images opened together.
 *
 * @fd_off: image type that lives in _imgs[0]; img_from_set() subtracts
 *          it from the requested type to get the slot index.
 * @fd_nr:  presumably the number of slots in @_imgs -- confirm against
 *          the allocator.
 * @_imgs:  the image handles, indexed by (type - fd_off).
 */
struct cr_imgset {
	int fd_off;
	int fd_nr;
	struct cr_img **_imgs;
};

/*
 * Look up the image of the given @type inside @imgset.
 *
 * Trips BUG_ON() when @type does not map to a slot of the set: both a
 * type below fd_off (negative index) and an index at or past fd_nr are
 * rejected. The previous "idx > fd_nr" test let idx == fd_nr and all
 * negative indices through to the array access.
 */
static inline struct cr_img *img_from_set(const struct cr_imgset *imgset, int type)
{
	int idx = type - imgset->fd_off;

	BUG_ON(idx < 0 || idx >= imgset->fd_nr);

	return imgset->_imgs[idx];
}
/* Bytecode is sequence of 4 byte commands followed by variable arguments.
 * All the commands identified by "code" are conditional jumps forward:
 * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be
 * length of the command and its arguments.
 */
struct inet_diag_bc_op {
	unsigned char	code;	/* one of INET_DIAG_BC_* below */
	unsigned char	yes;	/* forward offset taken on match */
	unsigned short	no;	/* forward offset taken otherwise */
};

/* Values for inet_diag_bc_op.code */
enum {
	INET_DIAG_BC_NOP,
	INET_DIAG_BC_JMP,
	INET_DIAG_BC_S_GE,
	INET_DIAG_BC_S_LE,
	INET_DIAG_BC_D_GE,
	INET_DIAG_BC_D_LE,
	INET_DIAG_BC_AUTO,
	INET_DIAG_BC_S_COND,
	INET_DIAG_BC_D_COND,
};

/*
 * Host condition with a trailing, variable-length address.
 *
 * addr is a C99 flexible array member rather than the GNU zero-length
 * array it used to be; size and member offsets of the structure are
 * unchanged.
 */
struct inet_diag_hostcond {
	__u8	family;
	__u8	prefix_len;
	int	port;
	__be32	addr[];
};
/*
 * Extensions
 *
 * Implicitly numbered from 0; INET_DIAG_MAX below aliases the last
 * enumerator and has to be moved whenever a new one is appended.
 */
enum {
	INET_DIAG_NONE,
	INET_DIAG_MEMINFO,
	INET_DIAG_INFO,
	INET_DIAG_VEGASINFO,
	INET_DIAG_CONG,
	INET_DIAG_TOS,
	INET_DIAG_TCLASS,
	INET_DIAG_SKMEMINFO,
	INET_DIAG_SHUTDOWN,
};

#define INET_DIAG_MAX INET_DIAG_SHUTDOWN

/* INET_DIAG_MEMINFO */

struct inet_diag_meminfo {
	__u32	idiag_rmem;
	__u32	idiag_wmem;
	__u32	idiag_fmem;
	__u32	idiag_tmem;
};

/* INET_DIAG_VEGASINFO */

struct tcpvegas_info {
	__u32	tcpv_enabled;
	__u32	tcpv_rttcnt;
	__u32	tcpv_rtt;
	__u32	tcpv_minrtt;
};
/*
 * Kinds of resources that can be compared.
 *
 * NOTE(review): the names look like the kernel's kcmp(2) "type"
 * argument, in which case the values are ABI -- confirm against
 * linux/kcmp.h before reordering. The numbers are spelled out so that
 * an accidental insertion cannot shift them silently.
 */
enum kcmp_type {
	KCMP_FILE	= 0,
	KCMP_VM		= 1,
	KCMP_FILES	= 2,
	KCMP_FS		= 3,
	KCMP_SIGHAND	= 4,
	KCMP_IO		= 5,
	KCMP_SYSVSEM	= 6,

	/* Last enumerator: equals the number of kinds listed above. */
	KCMP_TYPES	= 7,
};
long mmap_min_addr; bool has_tcp_half_closed; bool stack_guard_gap_hidden; int lsm; bool has_uffd; unsigned long uffd_features; bool has_thp_disable; bool can_map_vdso; bool vdso_hint_reliable; #ifdef CONFIG_VDSO struct vdso_symtable vdso_sym; #ifdef CONFIG_COMPAT struct vdso_symtable vdso_sym_compat; #endif #endif }; extern struct kerndat_s kdat; enum { KERNDAT_FS_STAT_DEVPTS, KERNDAT_FS_STAT_DEVTMPFS, KERNDAT_FS_STAT_BINFMT_MISC, KERNDAT_FS_STAT_MAX }; /* * Check whether the fs @which with kdevice @kdev * is the same as host's. If yes, this means that * the fs mount is shared with host, if no -- it's * a new (likely virtuzlized) fs instance. */ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_tcp_repair(); extern int kerndat_uffd(void); #endif /* __CR_KERNDAT_H__ */ criu-3.6/criu/include/libnetlink.h000066400000000000000000000011541317335042600172220ustar00rootroot00000000000000#ifndef __CR_LIBNETLINK_H__ #define __CR_LIBNETLINK_H__ #define CR_NLMSG_SEQ 24680 /* arbitrary chosen */ extern int do_rtnl_req(int nl, void *req, int size, int (*receive_callback)(struct nlmsghdr *h, void *), int (*error_callback)(int err, void *), void *); extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, int alen); #define NLMSG_TAIL(nmsg) \ ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) #ifndef NETNS_RTA #define NETNS_RTA(r) \ ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg)))) #endif #endif /* __CR_LIBNETLINK_H__ */ criu-3.6/criu/include/linux/000077500000000000000000000000001317335042600160545ustar00rootroot00000000000000criu-3.6/criu/include/linux/userfaultfd.h000066400000000000000000000141421317335042600205530ustar00rootroot00000000000000/* * include/linux/userfaultfd.h * * Copyright (C) 2007 Davide Libenzi * Copyright (C) 2015 Red Hat, Inc. 
* */ #ifndef _LINUX_USERFAULTFD_H #define _LINUX_USERFAULTFD_H #include /* * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ (__u64)1 << _UFFDIO_API) #define UFFD_API_RANGE_IOCTLS \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to * 0x3F. UFFDIO_API is the fixed number, everything else can be * changed by implementing a different UFFD_API. If sticking to the * same UFFD_API more ioctl can be added and userland will be aware of * which ioctl the running kernel implements through the ioctl command * bitmask written by the UFFDIO_API. 
*/ #define _UFFDIO_REGISTER (0x00) #define _UFFDIO_UNREGISTER (0x01) #define _UFFDIO_WAKE (0x02) #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ #define UFFDIO 0xAA #define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ struct uffdio_api) #define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ struct uffdio_register) #define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ struct uffdio_range) #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ struct uffdio_range) #define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ struct uffdio_copy) #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ struct uffdio_zeropage) /* read() structure */ struct uffd_msg { __u8 event; __u8 reserved1; __u16 reserved2; __u32 reserved3; union { struct { __u64 flags; __u64 address; } pagefault; struct { __u32 ufd; } fork; struct { __u64 from; __u64 to; __u64 len; } remap; struct { __u64 start; __u64 end; } remove; struct { /* unused reserved fields */ __u64 reserved1; __u64 reserved2; __u64 reserved3; } reserved; } arg; } __packed; /* * Start at 0x12 and not at 0 to be more strict against bugs. */ #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ struct uffdio_api { /* userland asks for an API number and the features to enable */ __u64 api; /* * Kernel answers below with the all available features for * the API, this notifies userland of which events and/or * which flags for each event are enabled in the current * kernel. * * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. 
* * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on * hugetlbfs virtual memory ranges. Adding or not adding * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has * no real functional effect after UFFDIO_API returns, but * it's only useful for an initial feature set probe at * UFFDIO_API time. There are two ways to use it: * * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the * uffdio_api.features before calling UFFDIO_API, an error * will be returned by UFFDIO_API on a kernel without * hugetlbfs missing support * * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in * uffdio_api.features and instead it will be set by the * kernel in the uffdio_api.features if the kernel supports * it, so userland can later check if the feature flag is * present in uffdio_api.features after UFFDIO_API * succeeded. * * UFFD_FEATURE_MISSING_SHMEM works the same as * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem * (i.e. tmpfs and other shmem based APIs). */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) __u64 features; __u64 ioctls; }; struct uffdio_range { __u64 start; __u64 len; }; struct uffdio_register { struct uffdio_range range; #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) __u64 mode; /* * kernel answers which ioctl commands are available for the * range, keep at the end as the last 8 bytes aren't read. */ __u64 ioctls; }; struct uffdio_copy { __u64 dst; __u64 src; __u64 len; /* * There will be a wrprotection flag later that allows to map * pages wrprotected on the fly. 
/*
 * Log levels, ordered from least to most verbose.
 */
#define LOG_UNSET	(-1)
#define LOG_MSG		(0) /* Print message regardless of log level */
#define LOG_ERROR	(1) /* Errors only, when we're in trouble */
#define LOG_WARN	(2) /* Warnings, dazed and confused but trying to continue */
#define LOG_INFO	(3) /* Informative, everything is fine */
#define LOG_DEBUG	(4) /* Debug only */

#define DEFAULT_LOGLEVEL	LOG_WARN

/* printf-like sink behind all pr_*() macros; the format is argument 2. */
extern void print_on_level(unsigned int loglevel, const char *format, ...)
	__attribute__ ((__format__ (__printf__, 2, 3)));

/*
 * A source file may define LOG_PREFIX before including this header.
 * It is pasted in front of the format by string-literal concatenation,
 * so both LOG_PREFIX and the fmt argument of the macros that use it
 * have to be string literals. The default is an empty prefix.
 */
#ifndef LOG_PREFIX
# define LOG_PREFIX
#endif

/*
 * Emit the message only the first time this particular expansion is
 * executed: the guard flag is a static local of the expanded block, so
 * every use of the macro gets its own flag.
 */
#define print_once(loglevel, fmt, ...)					\
	do {								\
		static bool __printed;					\
		if (!__printed) {					\
			print_on_level(loglevel, fmt, ##__VA_ARGS__);	\
			__printed = 1;					\
		}							\
	} while (0)

/* Unconditional message; note that LOG_PREFIX is not applied here. */
#define pr_msg(fmt, ...)						\
	print_on_level(LOG_MSG,						\
		       fmt, ##__VA_ARGS__)

#define pr_info(fmt, ...)						\
	print_on_level(LOG_INFO,					\
		       LOG_PREFIX fmt, ##__VA_ARGS__)

/* Errors are tagged with the file and line of the call site. */
#define pr_err(fmt, ...)						\
	print_on_level(LOG_ERROR,					\
		       "Error (%s:%d): " LOG_PREFIX fmt,		\
		       __FILE__, __LINE__, ##__VA_ARGS__)

/*
 * The *_once variants hand fmt to print_once() unchanged, i.e. without
 * the file/line tag and without LOG_PREFIX.
 */
#define pr_err_once(fmt, ...)						\
	print_once(LOG_ERROR, fmt, ##__VA_ARGS__)

/* Warnings are tagged with the file and line of the call site. */
#define pr_warn(fmt, ...)						\
	print_on_level(LOG_WARN,					\
		       "Warn  (%s:%d): " LOG_PREFIX fmt,		\
		       __FILE__, __LINE__, ##__VA_ARGS__)

#define pr_warn_once(fmt, ...)						\
	print_once(LOG_WARN, fmt, ##__VA_ARGS__)

#define pr_debug(fmt, ...)						\
	print_on_level(LOG_DEBUG,					\
		       LOG_PREFIX fmt, ##__VA_ARGS__)

#ifndef CR_NOGLIBC

/*
 * pr_err() with ": <strerror(errno)>\n" appended, so fmt must not end
 * with a newline of its own. errno is read when the argument list is
 * evaluated; call this right after the failing operation.
 */
#define pr_perror(fmt, ...)						\
	pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno))

#endif /* CR_NOGLIBC */
*/ #define IMG_COMMON_MAGIC 0x54564319 /* Sarov (a.k.a. Arzamas-16) */ #define IMG_SERVICE_MAGIC 0x55105940 /* Zlatoust */ /* * The magic-s below correspond to coordinates * of various Russian towns in the NNNNEEEE form. */ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ #define PSTREE_MAGIC 0x50273030 /* Kyiv */ #define FDINFO_MAGIC 0x56213732 /* Dmitrov */ #define PAGEMAP_MAGIC 0x56084025 /* Vladimir */ #define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC #define PAGES_MAGIC RAW_IMAGE_MAGIC #define CORE_MAGIC 0x55053847 /* Kolomna */ #define IDS_MAGIC 0x54432030 /* Konigsberg */ #define VMAS_MAGIC 0x54123737 /* Tula */ #define PIPES_MAGIC 0x56513555 /* Tver */ #define PIPES_DATA_MAGIC 0x56453709 /* Dubna */ #define FIFO_MAGIC 0x58364939 /* Kirov */ #define FIFO_DATA_MAGIC 0x59333054 /* Tosno */ #define SIGACT_MAGIC 0x55344201 /* Murom */ #define UNIXSK_MAGIC 0x54373943 /* Ryazan */ #define INETSK_MAGIC 0x56443851 /* Pereslavl */ #define PACKETSK_MAGIC 0x60454618 /* Veliky Ustyug */ #define ITIMERS_MAGIC 0x57464056 /* Kostroma */ #define POSIX_TIMERS_MAGIC 0x52603957 /* Lipetsk */ #define SK_QUEUES_MAGIC 0x56264026 /* Suzdal */ #define UTSNS_MAGIC 0x54473203 /* Smolensk */ #define CREDS_MAGIC 0x54023547 /* Kozelsk */ #define IPC_VAR_MAGIC 0x53115007 /* Samara */ #define IPCNS_SHM_MAGIC 0x46283044 /* Odessa */ #define IPCNS_MSG_MAGIC 0x55453737 /* Moscow */ #define IPCNS_SEM_MAGIC 0x59573019 /* St. 
Petersburg */ #define REG_FILES_MAGIC 0x50363636 /* Belgorod */ #define EXT_FILES_MAGIC 0x59255641 /* Usolye */ #define FS_MAGIC 0x51403912 /* Voronezh */ #define MM_MAGIC 0x57492820 /* Pskov */ #define REMAP_FPATH_MAGIC 0x59133954 /* Vologda */ #define GHOST_FILE_MAGIC 0x52583605 /* Oryol */ #define TCP_STREAM_MAGIC 0x51465506 /* Orenburg */ #define EVENTFD_FILE_MAGIC 0x44523722 /* Anapa */ #define EVENTPOLL_FILE_MAGIC 0x45023858 /* Krasnodar */ #define EVENTPOLL_TFD_MAGIC 0x44433746 /* Novorossiysk */ #define SIGNALFD_MAGIC 0x57323820 /* Uglich */ #define INOTIFY_FILE_MAGIC 0x48424431 /* Volgograd */ #define INOTIFY_WD_MAGIC 0x54562009 /* Svetlogorsk (Rauschen) */ #define MNTS_MAGIC 0x55563928 /* Petushki */ #define NETDEV_MAGIC 0x57373951 /* Yaroslavl */ #define NETNS_MAGIC 0x55933752 /* Dolgoprudny */ #define TTY_FILES_MAGIC 0x59433025 /* Pushkin */ #define TTY_INFO_MAGIC 0x59453036 /* Kolpino */ #define TTY_DATA_MAGIC 0x59413026 /* Pavlovsk */ #define FILE_LOCKS_MAGIC 0x54323616 /* Kaluga */ #define RLIMIT_MAGIC 0x57113925 /* Rostov */ #define FANOTIFY_FILE_MAGIC 0x55096122 /* Chelyabinsk */ #define FANOTIFY_MARK_MAGIC 0x56506035 /* Yekaterinburg */ #define SIGNAL_MAGIC 0x59255647 /* Berezniki */ #define PSIGNAL_MAGIC SIGNAL_MAGIC #define NETLINK_SK_MAGIC 0x58005614 /* Perm */ #define NS_FILES_MAGIC 0x61394011 /* Nyandoma */ #define TUNFILE_MAGIC 0x57143751 /* Kalyazin */ #define CGROUP_MAGIC 0x59383330 /* Tikhvin */ #define TIMERFD_MAGIC 0x50493712 /* Korocha */ #define CPUINFO_MAGIC 0x61404013 /* Nyandoma */ #define USERNS_MAGIC 0x55474906 /* Kazan */ #define SECCOMP_MAGIC 0x64413049 /* Kostomuksha */ #define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */ #define AUTOFS_MAGIC 0x49353943 /* Sochi */ #define FILES_MAGIC 0x56303138 /* Toropets */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC #define ROUTE6_MAGIC RAW_IMAGE_MAGIC #define RULE_MAGIC RAW_IMAGE_MAGIC #define TMPFS_IMG_MAGIC RAW_IMAGE_MAGIC #define TMPFS_DEV_MAGIC 
/*
 * Layout of a 64-bit "pme" value (presumably one /proc/<pid>/pagemap
 * entry -- confirm against the kernel pagemap documentation).
 *
 * The top PME_STATUS_BITS bits carry status flags, the next
 * PME_PSHIFT_BITS bits sit right below them, and everything under
 * PME_PSHIFT_OFFSET is the page frame part extracted by PME_PFRAME().
 */
#define PME_STATUS_BITS		(3)
#define PME_PSHIFT_BITS		(6)
#define PME_STATUS_OFFSET	(64 - PME_STATUS_BITS)
#define PME_PSHIFT_OFFSET	(PME_STATUS_OFFSET - PME_PSHIFT_BITS)

/* Individual flag bits. */
#define PME_PRESENT		(1ULL << 63)
#define PME_SWAP		(1ULL << 62)
#define PME_FILE		(1ULL << 61)
#define PME_SOFT_DIRTY		(1ULL << 55)

/* All ones in bits [0, PME_PSHIFT_OFFSET). */
#define PME_PFRAME_MASK		(~0ULL >> (64 - PME_PSHIFT_OFFSET))
#define PME_PFRAME(x)		((x) & PME_PFRAME_MASK)
#ifndef __CR_MMAN_H__
#define __CR_MMAN_H__

/*
 * Fallback values for mmap()/madvise() constants. Each one is defined
 * only when the headers included before this file did not already
 * provide it.
 */

#if !defined(MAP_HUGETLB)
#define MAP_HUGETLB	0x40000
#endif

#if !defined(MADV_HUGEPAGE)
#define MADV_HUGEPAGE	14
#endif

#if !defined(MADV_NOHUGEPAGE)
#define MADV_NOHUGEPAGE	15
#endif

#if !defined(MADV_DONTDUMP)
#define MADV_DONTDUMP	16
#endif

#endif /* __CR_MMAN_H__ */
extern struct mount_info *mntinfo;
extern struct ns_desc mnt_ns_desc;

/*
 * Without CONFIG_BINFMT_MISC_VIRTUALIZED there is nothing to collect,
 * so the call collapses into an inline stub that reports success.
 */
#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED
extern int collect_binfmt_misc(void);
#else
static inline int collect_binfmt_misc(void)
{
	return 0;
}
#endif

/*
 * Allocation / release of a mount_info entry.
 *
 * mnt_entry_alloc is declared with (void): an empty parameter list is
 * not a prototype in C, so calls with stray arguments would otherwise
 * compile unnoticed.
 */
extern struct mount_info *mnt_entry_alloc(void);
extern void mnt_entry_free(struct mount_info *mi);
int s_dev); extern dev_t phys_stat_resolve_dev(struct ns_id *, dev_t st_dev, const char *path); extern bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, struct ns_id *, const char *path); extern int restore_task_mnt_ns(struct pstree_item *current); extern void fini_restore_mntns(void); extern int depopulate_roots_yard(int mntns_root, bool clean_remaps); extern int rst_get_mnt_root(int mnt_id, char *path, int plen); extern int ext_mount_add(char *key, char *val); extern int ext_mount_parse_auto(char *key); extern int mntns_maybe_create_roots(void); extern int read_mnt_ns_img(void); extern void cleanup_mnt_ns(void); extern void clean_cr_time_mounts(void); extern bool add_skip_mount(const char *mountpoint); struct ns_id; extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump); extern int check_mnt_id(void); #endif /* __CR_MOUNT_H__ */ criu-3.6/criu/include/namespaces.h000066400000000000000000000115061317335042600172100ustar00rootroot00000000000000#ifndef __CR_NS_H__ #define __CR_NS_H__ #include "common/compiler.h" #include "files.h" #include "common/list.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif #ifndef CLONE_NEWPID #define CLONE_NEWPID 0x20000000 #endif #ifndef CLONE_NEWUTS #define CLONE_NEWUTS 0x04000000 #endif #ifndef CLONE_NEWIPC #define CLONE_NEWIPC 0x08000000 #endif #ifndef CLONE_NEWNET #define CLONE_NEWNET 0x40000000 #endif #ifndef CLONE_NEWUSER #define CLONE_NEWUSER 0x10000000 #endif #ifndef CLONE_NEWCGROUP #define CLONE_NEWCGROUP 0x02000000 #endif #define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP) /* Nested namespaces are supported only for these types */ #define CLONE_SUBNS (CLONE_NEWNS) #define EXTRA_SIZE 20 struct ns_desc { unsigned int cflag; char *str; size_t len; }; struct user_ns_extra { char *uid; char *gid; }; /* struct join_ns is used for storing parameters specified by --join-ns */ struct join_ns { struct list_head 
list; char *ns_file; struct ns_desc *nd; /* namespace descriptor */ int ns_fd; /* extra options of --join-ns, like uid&gid in user namespace */ union { struct user_ns_extra user_extra; char *common_extra; } extra_opts; }; enum ns_type { NS_UNKNOWN = 0, NS_CRIU, NS_ROOT, NS_OTHER, }; struct ns_id { unsigned int kid; unsigned int id; pid_t ns_pid; struct ns_desc *nd; struct ns_id *next; enum ns_type type; /* * For mount namespaces on restore -- indicates that * the namespace in question is created (all mounts * are mounted) and other tasks may do setns on it * and proceed. */ bool ns_populated; union { struct { struct mount_info *mntinfo_list; struct mount_info *mntinfo_tree; int ns_fd; int root_fd; } mnt; struct { int nlsk; /* for sockets collection */ int seqsk; /* to talk to parasite daemons */ } net; }; }; extern struct ns_id *ns_ids; #define NS_DESC_ENTRY(_cflag, _str) \ { \ .cflag = _cflag, \ .str = _str, \ .len = sizeof(_str) - 1, \ } extern bool check_ns_proc(struct fd_link *link); extern struct ns_desc pid_ns_desc; extern struct ns_desc user_ns_desc; extern unsigned long root_ns_mask; extern const struct fdtype_ops nsfile_dump_ops; extern struct collect_image_info nsfile_cinfo; extern int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg); extern int collect_namespaces(bool for_dump); extern int collect_mnt_namespaces(bool for_dump); extern int dump_mnt_namespaces(void); extern int dump_namespaces(struct pstree_item *item, unsigned int ns_flags); extern int prepare_namespace_before_tasks(void); extern int prepare_namespace(struct pstree_item *item, unsigned long clone_flags); extern int switch_ns(int pid, struct ns_desc *nd, int *rst); extern int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst); extern int restore_ns(int rst, struct ns_desc *nd); extern int dump_task_ns_ids(struct pstree_item *); extern int predump_task_ns_ids(struct pstree_item *); extern struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct 
ns_desc *nd, enum ns_type t); extern int rst_add_ns_id(unsigned int id, struct pstree_item *, struct ns_desc *nd); extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd); extern int collect_user_namespaces(bool for_dump); extern int prepare_userns(struct pstree_item *item); extern int stop_usernsd(void); extern uid_t userns_uid(uid_t uid); extern gid_t userns_gid(gid_t gid); extern int dump_user_ns(pid_t pid, int ns_id); extern void free_userns_maps(void); extern int join_ns_add(const char *type, char *ns_file, char *extra_opts); extern int check_namespace_opts(void); extern int join_namespaces(void); typedef int (*uns_call_t)(void *arg, int fd, pid_t pid); /* * Async call -- The call is guaranteed to be done till the * CR_STATE_COMPLETE happens. The function may return even * before the call starts. * W/o flag the call is synchronous -- this function returns * strictly after the call finishes. */ #define UNS_ASYNC 0x1 /* * The call returns an FD which should be sent back. Conflicts * with UNS_ASYNC. */ #define UNS_FDOUT 0x2 #define MAX_UNSFD_MSG_SIZE 4096 /* * When we're restoring inside user namespace, some things are * not allowed to be done there due to insufficient capabilities. * If the operation in question can be offloaded to another process, * this call allows to do that. * * In case we're not in userns, just call the callback immediately * in the context of calling task. 
*/ extern int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, size_t arg_size, int fd); #define userns_call(__call, __flags, __arg, __arg_size, __fd) \ __userns_call(__stringify(__call), __call, __flags, \ __arg, __arg_size, __fd) extern int add_ns_shared_cb(int (*actor)(void *data), void *data); #endif /* __CR_NS_H__ */ criu-3.6/criu/include/net.h000066400000000000000000000017361317335042600156630ustar00rootroot00000000000000#ifndef __CR_NET_H__ #define __CR_NET_H__ #include #include "common/list.h" #include "external.h" #ifndef RTM_GETNSID #define RTM_GETNSID 90 #endif struct cr_imgset; extern int dump_net_ns(int ns_id); extern int prepare_net_ns(int pid); extern int netns_keep_nsfd(void); struct veth_pair { struct list_head node; char *inside; char *outside; char *bridge; }; extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); extern int network_lock_internal(); extern struct ns_desc net_ns_desc; #include "images/netdev.pb-c.h" extern int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info); extern int read_ns_sys_file(char *path, char *buf, int len); extern int restore_link_parms(NetDeviceEntry *nde, int nlsk); extern int veth_pair_add(char *in, char *out); extern int macvlan_ext_add(struct external *ext); extern int move_veth_to_bridge(void); #endif /* __CR_NET_H__ */ criu-3.6/criu/include/netfilter.h000066400000000000000000000005341317335042600170640ustar00rootroot00000000000000#ifndef __CR_NETFILTER_H__ #define __CR_NETFILTER_H__ struct inet_sk_desc; extern int nf_lock_connection(struct inet_sk_desc *); extern int nf_unlock_connection(struct inet_sk_desc *); struct inet_sk_info; extern int nf_unlock_connection_info(struct inet_sk_info *); extern void preload_netfilter_modules(void); #endif /* __CR_NETFILTER_H__ */ criu-3.6/criu/include/netlink_diag.h000066400000000000000000000014451317335042600175220ustar00rootroot00000000000000#ifndef 
__CR_NETLINK_DIAG_H__ #define __CR_NETLINK_DIAG_H__ #include struct netlink_diag_req { __u8 sdiag_family; __u8 sdiag_protocol; __u16 pad; __u32 ndiag_ino; __u32 ndiag_show; __u32 ndiag_cookie[2]; }; struct netlink_diag_msg { __u8 ndiag_family; __u8 ndiag_type; __u8 ndiag_protocol; __u8 ndiag_state; __u32 ndiag_portid; __u32 ndiag_dst_portid; __u32 ndiag_dst_group; __u32 ndiag_ino; __u32 ndiag_cookie[2]; }; enum { NETLINK_DIAG_MEMINFO, NETLINK_DIAG_GROUPS, __NETLINK_DIAG_MAX, }; #define NETLINK_DIAG_MAX (__NETLINK_DIAG_MAX - 1) #define NDIAG_PROTO_ALL ((__u8) ~0) #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ #endif /* __CR_NETLINK_DIAG_H__ */ criu-3.6/criu/include/packet_diag.h000066400000000000000000000027521317335042600173270ustar00rootroot00000000000000#ifndef __CR_PACKET_DIAG_H__ #define __CR_PACKET_DIAG_H__ #include struct packet_diag_req { __u8 sdiag_family; __u8 sdiag_protocol; __u16 pad; __u32 pdiag_ino; __u32 pdiag_show; __u32 pdiag_cookie[2]; }; #define PACKET_SHOW_INFO 0x00000001 /* Basic packet_sk information */ #define PACKET_SHOW_MCLIST 0x00000002 /* A set of packet_diag_mclist-s */ #define PACKET_SHOW_RING_CFG 0x00000004 /* Rings configuration parameters */ #define PACKET_SHOW_FANOUT 0x00000008 struct packet_diag_msg { __u8 pdiag_family; __u8 pdiag_type; __u16 pdiag_num; __u32 pdiag_ino; __u32 pdiag_cookie[2]; }; enum { PACKET_DIAG_INFO, PACKET_DIAG_MCLIST, PACKET_DIAG_RX_RING, PACKET_DIAG_TX_RING, PACKET_DIAG_FANOUT, PACKET_DIAG_MAX, }; struct packet_diag_info { __u32 pdi_index; __u32 pdi_version; __u32 pdi_reserve; __u32 pdi_copy_thresh; __u32 pdi_tstamp; __u32 pdi_flags; #define PDI_RUNNING 0x1 #define PDI_AUXDATA 0x2 #define PDI_ORIGDEV 0x4 #define PDI_VNETHDR 0x8 #define PDI_LOSS 0x10 }; #ifndef MAX_ADDR_LEN #define MAX_ADDR_LEN 32 #endif struct packet_diag_mclist { __u32 pdmc_index; __u32 pdmc_count; __u16 pdmc_type; __u16 pdmc_alen; __u8 
pdmc_addr[MAX_ADDR_LEN]; }; struct packet_diag_ring { __u32 pdr_block_size; __u32 pdr_block_nr; __u32 pdr_frame_size; __u32 pdr_frame_nr; __u32 pdr_retire_tmo; __u32 pdr_sizeof_priv; __u32 pdr_features; }; #endif /* __CR_PACKET_DIAG_H__ */ criu-3.6/criu/include/page-pipe.h000066400000000000000000000107141317335042600167400ustar00rootroot00000000000000#ifndef __CR_PAGE_PIPE_H__ #define __CR_PAGE_PIPE_H__ #include #include "common/list.h" #define PAGE_ALLOC_COSTLY_ORDER 3 /* from the kernel source code */ struct kernel_pipe_buffer { struct page *page; unsigned int offset, len; const struct pipe_buf_operations *ops; unsigned int flags; unsigned long private; }; /* * The kernel allocates the linear chunk of memory for pipe buffers. * Allocation of chunks with size more than PAGE_ALLOC_COSTLY_ORDER * fails very often, so we need to restrict the pipe capacity to not * allocate big chunks. */ #define PIPE_MAX_SIZE ((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / \ sizeof(struct kernel_pipe_buffer)) /* The number of pipes for one chunk */ #define NR_PIPES_PER_CHUNK 8 /* * page_pipe is a descriptor of task's virtual memory * with pipes, containing pages. * * A page-pipe may contain holes -- these are pagemap * entries without pages. Holes are stored in separate * array to optimize paged iovs feed into vmsplice -- * they will be sent there in one go. * * A hole is a pagemap entry that doesn't have pages * in it, since they are present in previous (parent) * snapshot. * * * This page-pipe vs holes vs task vmem vs image layout * is described below. * * Task memory: (+ present, - not present pages) * 0 0 0 0 1 1 1 * 0 3 6 B 1 8 C * ---+++-----++++++-------++++---- * * Page-pipe iovs: * * bufs = 03:3,0B:6,18:4 * holes = * * The pagemap.img would purely contain page-pipe bufs. * * Pages image will contain pages at * * 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B * * stored one by one. 
* * Not let's imagine task touches some pages and its mem * looks like: (+ present, = old present, - non present) * * 0 0 0 0 11 11 1 * 0 3 6 B 12 78 C * ---==+-----====+++-----++===---- * * (not new pages at 11 and 17 vaddrs) * * The new --snapshot'ed page-pipe would look like * * bufs = 05:1,0F:3,17:2 * holes = 03:2,0B:4,19:3 * * So the pagemap.img would look like * * 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P * * (the page_xfer_dump_pages generates one) * * where P means "in parent", i.e. respective pages should * be looked up in the parent pagemap (not pages.img, but * the pagemap, and then the offset in previous pages.img * should be calculated, see the read_pagemap_page routine). * * New pages.img file would contain only pages for * * 05,0F,10,11,17,18 */ struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many iov-s are busy */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ struct list_head l; /* links into page_pipe->bufs */ }; #define PP_HOLE_PARENT (1 << 0) struct page_pipe { unsigned int nr_pipes; /* how many page_pipe_bufs in there */ struct list_head bufs; /* list of bufs */ struct list_head free_bufs; /* list of bufs */ unsigned int nr_iovs; /* number of iovs */ unsigned int free_iov; /* first free iov */ struct iovec *iovs; /* iovs. 
They are provided into create_page_pipe and all bufs have their iov-s in there */ unsigned int nr_holes; /* number of holes allocated */ unsigned int free_hole; /* number of holes in use */ struct iovec *holes; /* holes */ unsigned int *hole_flags; unsigned flags; /* PP_FOO flags below */ }; #define PP_CHUNK_MODE 0x1 /* Restrict the maximum buffer size of pipes and dump memory for a few iterations */ #define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags); extern void destroy_page_pipe(struct page_pipe *p); extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr, unsigned int flags); extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr, unsigned int flags); extern void debug_show_page_pipe(struct page_pipe *pp); void page_pipe_reinit(struct page_pipe *pp); extern void page_pipe_destroy_ppb(struct page_pipe_buf *ppb); struct pipe_read_dest { int p[2]; int sink_fd; }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ criu-3.6/criu/include/page-xfer.h000066400000000000000000000041551317335042600167510ustar00rootroot00000000000000#ifndef __CR_PAGE_XFER__H__ #define __CR_PAGE_XFER__H__ #include "pagemap.h" extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); /* * page_xfer -- transfer pages into image file. * Two images backends are implemented -- local image file * and page-server image file. 
*/ struct page_xfer { /* transfers one vaddr:len entry */ int (*write_pagemap)(struct page_xfer *self, struct iovec *iov, u32 flags); /* transfers pages related to previous pagemap */ int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len); void (*close)(struct page_xfer *self); /* * In case we need to dump pagemaps not as-is, but * relative to some address. Used, e.g. by shmem. */ unsigned long offset; bool transfer_lazy; /* private data for every page-xfer engine */ union { struct /* local */ { struct cr_img *pmi; /* pagemaps */ struct cr_img *pi; /* pages */ }; struct /* page-server */ { int sk; u64 dst_id; }; }; struct page_read *parent; }; extern int open_page_xfer(struct page_xfer *xfer, int fd_type, long id); struct page_pipe; extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *); extern int connect_to_page_server_to_send(void); extern int connect_to_page_server_to_recv(int epfd); extern int disconnect_from_page_server(void); extern int check_parent_page_xfer(int fd_type, long id); /* * The post-copy migration makes it necessary to receive pages from * remote dump. 
The protocol we use for that is quite simple: * - lazy-pages sedns request containing PS_IOV_GET(nr_pages, vaddr, pid) * - dump-side page server responds with PS_IOV_ADD(nr_pages, vaddr, pid) or PS_IOV_ADD(0, 0, 0) if it failed to locate the required pages * - dump-side page server sends the raw page data */ /* async request/receive of remote pages */ extern int request_remote_pages(int pid, unsigned long addr, int nr_pages); typedef int (*ps_async_read_complete)(int pid, unsigned long vaddr, int nr_pages, void *); extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ criu-3.6/criu/include/page.h000066400000000000000000000001241317335042600157770ustar00rootroot00000000000000#ifndef __CR_INC_PAGE_H__ #define __CR_INC_PAGE_H__ #include "common/page.h" #endif criu-3.6/criu/include/pagemap-cache.h000066400000000000000000000014331317335042600175420ustar00rootroot00000000000000#ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ #include #include "int.h" #include "common/list.h" struct vma_area; #define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64)) typedef struct { pid_t pid; /* which process it belongs */ unsigned long start; /* start of area */ unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ u64 *map; /* local buffer */ size_t map_len; /* length of a buffer */ int fd; /* file to read PMs from */ } pmc_t; #define PMC_INIT (pmc_t){ } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); extern void pmc_fini(pmc_t *pmc); #endif /* __CR_PAGEMAP_H__ */ criu-3.6/criu/include/pagemap.h000066400000000000000000000104461317335042600165050ustar00rootroot00000000000000#ifndef __CR_PAGE_READ_H__ #define __CR_PAGE_READ_H__ #include "common/list.h" #include "images/pagemap.pb-c.h" #include "page.h" /* * page_read -- 
engine, that reads pages from image file(s) * * Several page-read's can be arranged in a chain to read * pages from a series of snapshot. * * A task's address space vs pagemaps+page image pairs can * look like this (taken from comment in page-pipe.h): * * task: * * 0 0 0 0 1 1 1 * 0 3 6 B 2 7 C * ---+++-----+++++++-----+++++---- * pm1: ---+++-----++++++-------++++---- * pm2: ---==+-----====+++-----++===---- * * Here + is present page, - is non prsent, = is present, * but is not modified from last snapshot. * * Thus pagemap.img and pages.img entries are * * pm1: 03:3,0B:6,18:4 * pm2: 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P * * where P means "page is in parent pagemap". * * pg1: 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B * pg2: 05,0F,10,11,17,18 * * When trying to restore from these 4 files we'd have * to carefully scan pagemap.img's one by one and read or * skip pages from pages.img where appropriate. * * All this is implemented in read_pagemap_page. */ struct page_read { /* reads page from current pagemap */ int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); void (*close)(struct page_read *); void (*skip_pages)(struct page_read *, unsigned long len); int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; u32 pages_img_id; PagemapEntry *pe; /* current pagemap we are on */ struct page_read *parent; /* parent pagemap (if ->in_parent pagemap is met in image, then go to this guy for page, see read_pagemap_page */ unsigned long cvaddr; /* vaddr we are on */ off_t pi_off; /* current 
offset in pages file */ struct iovec bunch; /* record consequent neighbour iovecs to punch together */ unsigned id; /* for logging */ int pid; /* PID of the process */ PagemapEntry **pmes; int nr_pmes; int curr_pme; struct list_head async; }; /* flags for ->read_pages */ #define PR_ASYNC 0x1 /* may exit w/o data in the buffer */ #define PR_ASAP 0x2 /* PR_ASYNC, but start the IO right now */ /* flags for open_page_read */ #define PR_SHMEM 0x1 #define PR_TASK 0x2 #define PR_TYPE_MASK 0x3 #define PR_MOD 0x4 /* Will need to modify */ #define PR_REMOTE 0x8 /* * -1 -- error * 0 -- no images * 1 -- opened */ extern int open_page_read(int pid, struct page_read *, int pr_flags); extern int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags); struct task_restore_args; int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, struct list_head *to); int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); /* * Create a shallow copy of page_read object. * The new object shares the pagemap structures with the original, but * maintains its own set of references to those structures. 
*/ extern void dup_page_read(struct page_read *src, struct page_read *dst); extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) { return pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) { return pr->parent != NULL; } /* Pagemap flags */ #define PE_PARENT (1 << 0) /* pages are in parent snapshot */ #define PE_LAZY (1 << 1) /* pages can be lazily restored */ #define PE_PRESENT (1 << 2) /* pages are present in pages*img */ static inline bool pagemap_in_parent(PagemapEntry *pe) { return !!(pe->flags & PE_PARENT); } static inline bool pagemap_lazy(PagemapEntry *pe) { return !!(pe->flags & PE_LAZY); } static inline bool pagemap_present(PagemapEntry *pe) { return !!(pe->flags & PE_PRESENT); } #endif /* __CR_PAGE_READ_H__ */ criu-3.6/criu/include/parasite-syscall.h000066400000000000000000000042451317335042600203530ustar00rootroot00000000000000#ifndef __CR_PARASITE_SYSCALL_H__ #define __CR_PARASITE_SYSCALL_H__ #include "pid.h" #include "common/list.h" #include "config.h" #include "asm/parasite-syscall.h" struct parasite_dump_thread; struct parasite_dump_misc; struct parasite_drain_fd; struct vm_area_list; struct pstree_item; struct _CredsEntry; struct _CoreEntry; struct list_head; struct cr_imgset; struct fd_opts; struct pid; struct parasite_dump_cgroup_args; struct rt_sigframe; struct parasite_ctl; extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); struct proc_posix_timers_stat; extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, struct pstree_item *); extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, struct _CredsEntry *ce); extern int 
parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, struct _CoreEntry *core); extern int parasite_dump_thread_seized(struct parasite_ctl *ctl, int id, struct pid *tid, struct _CoreEntry *core); extern int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *dt); extern int parasite_drain_fds_seized(struct parasite_ctl *ctl, struct parasite_drain_fd *dfds, int nr_fds, int off, int *lfds, struct fd_opts *flags); extern int parasite_get_proc_fd_seized(struct parasite_ctl *ctl); extern struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, struct vm_area_list *vma_area_list); extern void parasite_ensure_args_size(unsigned long sz); extern unsigned long get_exec_start(struct vm_area_list *); extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_args *cgroup); extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type); extern int parasite_init_threads_seized(struct parasite_ctl *ctl, struct pstree_item *item); extern int parasite_fini_threads_seized(struct parasite_ctl *ctl); #endif /* __CR_PARASITE_SYSCALL_H__ */ criu-3.6/criu/include/parasite-vdso.h000066400000000000000000000060571317335042600176570ustar00rootroot00000000000000#ifndef __CR_PARASITE_VDSO_H__ #define __CR_PARASITE_VDSO_H__ #include "config.h" #ifdef CONFIG_VDSO #include "util-vdso.h" #include "images/vma.pb-c.h" struct parasite_ctl; struct vm_area_list; /* Check if symbol present in symtable */ static inline bool vdso_symbol_empty(struct vdso_symbol *s) { return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0'; } /* * Special mark which allows to identify runtime vdso (rt-vdso) where * calls from proxy (original) vdso are redirected. This mark usually * placed at the start of vdso area where Elf header lives. 
* Since such runtime vdso is solely used by the proxy and * nobody else is supposed to access it, it's more-less * safe to screw the Elf header with @signature and * vvar/vdso addresses for next dumping. * * The @orig_addr deserves a few comments. When we redirect the calls * from the original vdso to runtime vdso, on next checkpoint it won't * be possible to find original vdso/vvar pair, thus we save their * addresses in the member. * * As on the following dumps we need to drop rt-{vvar,vdso} pair * from list of VMAs to save in images, we save rt-vvar address also. */ struct vdso_mark { u64 signature; unsigned long orig_vdso_addr; unsigned long version; unsigned long orig_vvar_addr; unsigned long rt_vvar_addr; }; #define VDSO_MARK_SIGNATURE_V1 (0x6f73647675697263ULL) /* Magic number (criuvdso) */ #define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */ #define VDSO_MARK_SIGNATURE_V3 (0x4f53447655495243ULL) /* Magic number (CRIUvDSO) */ #define VDSO_MARK_CUR_VERSION (3) static inline void vdso_put_mark(void *where, unsigned long rt_vvar_addr, unsigned long orig_vdso_addr, unsigned long orig_vvar_addr) { struct vdso_mark *m = where; m->signature = VDSO_MARK_SIGNATURE_V3; m->orig_vdso_addr = orig_vdso_addr; m->version = VDSO_MARK_CUR_VERSION; m->orig_vvar_addr = orig_vvar_addr; m->rt_vvar_addr = rt_vvar_addr; } static inline bool is_vdso_mark(void *addr) { struct vdso_mark *m = addr; switch (m->signature) { case VDSO_MARK_SIGNATURE_V3: return true; /* * Old formats -- simply extend the mark up * to the version we support. 
*/ case VDSO_MARK_SIGNATURE_V2: vdso_put_mark(m, VVAR_BAD_ADDR, m->orig_vdso_addr, m->orig_vvar_addr); return true; case VDSO_MARK_SIGNATURE_V1: vdso_put_mark(m, VVAR_BAD_ADDR, m->orig_vdso_addr, VVAR_BAD_ADDR); return true; } return false; } extern int vdso_do_park(struct vdso_maps *rt, unsigned long park_at, unsigned long park_size); extern int vdso_map_compat(unsigned long map_at); extern int vdso_proxify(struct vdso_symtable *sym_rt, unsigned long vdso_rt_parked_at, VmaEntry *vmas, size_t nr_vmas, bool compat_vdso, bool force_trampolines); extern int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, struct vdso_symtable *from, bool compat_vdso); #else /* CONFIG_VDSO */ #define vdso_do_park(sym_rt, park_at, park_size) (0) #define vdso_map_compat(map_at) (0) #endif /* CONFIG_VDSO */ #endif /* __CR_PARASITE_VDSO_H__ */ criu-3.6/criu/include/parasite.h000066400000000000000000000116501317335042600167010ustar00rootroot00000000000000#ifndef __CR_PARASITE_H__ #define __CR_PARASITE_H__ #define PARASITE_MAX_SIZE (64 << 10) #ifndef __ASSEMBLY__ #include #include #include #include #include "image.h" #include "util-pie.h" #include "common/lock.h" #include "infect-rpc.h" #include "images/vma.pb-c.h" #include "images/tty.pb-c.h" #define __head __used __section(.head.text) enum { PARASITE_CMD_DUMP_THREAD = PARASITE_USER_CMDS, PARASITE_CMD_MPROTECT_VMAS, PARASITE_CMD_DUMPPAGES, PARASITE_CMD_DUMP_SIGACTS, PARASITE_CMD_DUMP_ITIMERS, PARASITE_CMD_DUMP_POSIX_TIMERS, PARASITE_CMD_DUMP_MISC, PARASITE_CMD_DRAIN_FDS, PARASITE_CMD_GET_PROC_FD, PARASITE_CMD_DUMP_TTY, PARASITE_CMD_CHECK_VDSO_MARK, PARASITE_CMD_CHECK_AIOS, PARASITE_CMD_DUMP_CGROUP, PARASITE_CMD_MAX, }; struct parasite_vma_entry { unsigned long start; unsigned long len; int prot; }; struct parasite_vdso_vma_entry { unsigned long start; unsigned long len; unsigned long orig_vdso_addr; unsigned long orig_vvar_addr; unsigned long rt_vvar_addr; int is_marked; bool try_fill_symtable; 
bool is_vdso; }; struct parasite_dump_pages_args { unsigned int nr_vmas; unsigned int add_prot; unsigned int off; unsigned int nr_segs; unsigned int nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) { return (struct parasite_vma_entry *)(a + 1); } static inline struct iovec *pargs_iovs(struct parasite_dump_pages_args *a) { return (struct iovec *)(pargs_vmas(a) + a->nr_vmas); } struct parasite_dump_sa_args { rt_sigaction_t sas[SIGMAX]; }; struct parasite_dump_itimers_args { struct itimerval real; struct itimerval virt; struct itimerval prof; }; struct posix_timer { int it_id; struct itimerspec val; int overrun; }; struct parasite_dump_posix_timers_args { int timer_n; struct posix_timer timer[0]; }; struct parasite_aio { unsigned long ctx; unsigned int size; }; struct parasite_check_aios_args { unsigned nr_rings; struct parasite_aio ring[0]; }; static inline int posix_timers_dump_size(int timer_n) { return sizeof(int) + sizeof(struct posix_timer) * timer_n; } /* * Misc sfuff, that is too small for separate file, but cannot * be read w/o using parasite */ struct parasite_dump_misc { unsigned long brk; u32 pid; u32 sid; u32 pgid; u32 umask; int dumpable; int thp_disabled; }; /* * Calculate how long we can make the groups array in parasite_dump_creds * and still fit the struct in one page */ #define PARASITE_MAX_GROUPS \ ((PAGE_SIZE - sizeof(struct parasite_dump_thread) - \ offsetof(struct parasite_dump_creds, groups)) / sizeof(unsigned int)) /* groups */ struct parasite_dump_creds { unsigned int cap_last_cap; u32 cap_inh[CR_CAP_SIZE]; u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; int uids[4]; int gids[4]; unsigned int secbits; unsigned int ngroups; /* * FIXME -- this structure is passed to parasite code * through parasite args area so in parasite_dump_creds() * call we check for size of this data fits the size of * the area. 
Unfortunatelly, we _actually_ use more bytes * than the sizeof() -- we put PARASITE_MAX_GROUPS int-s * in there, so the size check is not correct. * * However, all this works simply because we make sure * the PARASITE_MAX_GROUPS is so, that the total amount * of memory in use doesn't exceed the PAGE_SIZE and the * args area is at least one page (PARASITE_ARG_SIZE_MIN). */ unsigned int groups[0]; }; struct parasite_dump_thread { unsigned int *tid_addr; pid_t tid; tls_t tls; stack_t sas; int pdeath_sig; struct parasite_dump_creds creds[0]; }; static inline void copy_sas(ThreadSasEntry *dst, const stack_t *src) { dst->ss_sp = encode_pointer(src->ss_sp); dst->ss_size = (u64)src->ss_size; dst->ss_flags = src->ss_flags; } /* * How many descriptors can be transferred from parasite: * * 1) struct parasite_drain_fd + all descriptors should fit into one page * 2) The value should be a multiple of CR_SCM_MAX_FD, because descriptors * are transferred with help of send_fds and recv_fds. * 3) criu should work with a defaul value of the file limit (1024) */ #define PARASITE_MAX_FDS CR_SCM_MAX_FD * 3 struct parasite_drain_fd { int nr_fds; int fds[0]; }; struct fd_opts { char flags; struct { uint32_t uid; uint32_t euid; uint32_t signum; uint32_t pid_type; uint32_t pid; } fown; }; static inline int drain_fds_size(struct parasite_drain_fd *dfds) { int nr_fds = min((int)PARASITE_MAX_FDS, dfds->nr_fds); return sizeof(*dfds) + nr_fds * (sizeof(dfds->fds[0]) + sizeof(struct fd_opts)); } struct parasite_tty_args { int fd; int type; int sid; int pgrp; bool hangup; int st_pckt; int st_lock; int st_excl; }; struct parasite_dump_cgroup_args { /* * 4K should be enough for most cases. * * The string is null terminated. 
*/ char contents[1 << 12]; }; #endif /* !__ASSEMBLY__ */ #endif /* __CR_PARASITE_H__ */ criu-3.6/criu/include/path.h000066400000000000000000000020441317335042600160220ustar00rootroot00000000000000#ifndef __CR_PATH_H__ #define __CR_PATH_H__ #include "namespaces.h" #include "pstree.h" /* Asolute paths are used on dump and relative paths are used on restore */ static inline int is_root(char *p) { return (!strcmp(p, "/")); } /* True for the root mount (the topmost one) */ static inline int is_root_mount(struct mount_info *mi) { return mi->parent == NULL && mi->nsid->id == root_item->ids->mnt_ns_id; } /* * True if the mountpoint target is root on its FS. * * This is used to determine whether we need to postpone * mounting. E.g. one can bind mount some subdir from a * disk, and in this case we'll have to get the root disk * mount first, then bind-mount it. See do_mount_one(). */ static inline int fsroot_mounted(struct mount_info *mi) { return is_root(mi->root); } char *cut_root_for_bind(char *target_root, char *source_root); /* * Get a mount point for a sibling of m if m->parent and p are in the same * shared group. */ char *mnt_get_sibling_path(struct mount_info *m, struct mount_info *p, char *buf, int len); #endif criu-3.6/criu/include/pid.h000066400000000000000000000025661317335042600156530ustar00rootroot00000000000000#ifndef __CR_PID_H__ #define __CR_PID_H__ #include #include "stdbool.h" #include "rbtree.h" /* * Task states, used in e.g. struct pid's state. */ enum __criu_task_state { /* Values shared with compel */ TASK_ALIVE = COMPEL_TASK_ALIVE, TASK_DEAD = COMPEL_TASK_DEAD, TASK_STOPPED = COMPEL_TASK_STOPPED, TASK_ZOMBIE = COMPEL_TASK_ZOMBIE, /* Own internal states */ TASK_HELPER = COMPEL_TASK_MAX + 1, TASK_THREAD, /* new values are to be added before this line */ TASK_UNDEF = 0xff }; struct pid { struct pstree_item *item; /* * The @real pid is used to fetch tasks during dumping stage, * This is a global pid seen from the context where the dumping * is running. 
*/ pid_t real; int state; /* TASK_XXX constants */ /* * The @virt pid is one which used in the image itself and keeps * the pid value to be restored. This pid fetched from the * dumpee context, because the dumpee might have own pid namespace. */ struct { pid_t virt; struct rb_node node; } ns[1]; /* Must be at the end of struct pid */ }; /* * When we have to restore a shared resource, we mush select which * task should do it, and make other(s) wait for it. In order to * avoid deadlocks, always make task with lower pid be the restorer. */ static inline bool pid_rst_prio(unsigned pid_a, unsigned pid_b) { return pid_a < pid_b; } #endif /* __CR_PID_H__ */ criu-3.6/criu/include/pipes.h000066400000000000000000000032401317335042600162050ustar00rootroot00000000000000#ifndef __CR_PIPES_H__ #define __CR_PIPES_H__ #include "images/pipe-data.pb-c.h" #include "images/pipe.pb-c.h" extern struct collect_image_info pipe_cinfo; extern struct collect_image_info pipe_data_cinfo; extern const struct fdtype_ops pipe_dump_ops; static inline u32 pipe_id(const struct fd_parms *p) { return p->stat.st_ino; } #define NR_PIPES_WITH_DATA 1024 struct pipe_data_dump { int img_type; unsigned int nr; u32 ids[NR_PIPES_WITH_DATA]; }; extern int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p); struct pipe_data_rst { PipeDataEntry *pde; void *data; struct pipe_data_rst *next; }; #define PIPE_DATA_HASH_BITS 5 #define PIPE_DATA_HASH_SIZE (1 << PIPE_DATA_HASH_BITS) #define PIPE_DATA_HASH_MASK (PIPE_DATA_HASH_SIZE - 1) extern int do_collect_pipe_data(struct pipe_data_rst *, ProtobufCMessage *, struct cr_img *, struct pipe_data_rst **hash); extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash); /* * The sequence of objects which should be restored: * pipe -> files struct-s -> fd-s. * pipe_entry describes pipe's file structs-s. * A pipe doesn't have own properties, so it has no object. 
*/ #include "images/pipe.pb-c.h" struct pipe_info { PipeEntry *pe; struct list_head pipe_list; /* All pipe_info with the same pipe_id * This is pure circular list without head */ struct list_head list; /* global list of pipes */ struct file_desc d; unsigned int create : 1, reopen : 1; }; extern int collect_one_pipe_ops(void *o, ProtobufCMessage *base, struct file_desc_ops *ops); extern int open_pipe(struct file_desc *d, int *new_fd); #endif /* __CR_PIPES_H__ */ criu-3.6/criu/include/plugin.h000066400000000000000000000022621317335042600163660ustar00rootroot00000000000000#ifndef __CR_PLUGIN_H__ #define __CR_PLUGIN_H__ #include "criu-plugin.h" #include "common/compiler.h" #include "common/list.h" #define CR_PLUGIN_DEFAULT "/var/lib/criu/" void cr_plugin_fini(int stage, int err); int cr_plugin_init(int stage); typedef struct { struct list_head head; struct list_head hook_chain[CR_PLUGIN_HOOK__MAX]; } cr_plugin_ctl_t; extern cr_plugin_ctl_t cr_plugin_ctl; typedef struct { cr_plugin_desc_t *d; struct list_head list; void *dlhandle; struct list_head link[CR_PLUGIN_HOOK__MAX]; } plugin_desc_t; #define run_plugins(__hook, ...) 
\ ({ \ plugin_desc_t *this; \ int __ret = -ENOTSUP; \ \ list_for_each_entry(this, &cr_plugin_ctl.hook_chain[CR_PLUGIN_HOOK__ ##__hook], \ link[CR_PLUGIN_HOOK__ ##__hook]) { \ pr_debug("plugin: `%s' hook %u -> %p\n", \ this->d->name, CR_PLUGIN_HOOK__ ##__hook, \ this->d->hooks[CR_PLUGIN_HOOK__ ##__hook]); \ __ret = ((CR_PLUGIN_HOOK__ ##__hook ##_t *) \ this->d->hooks[CR_PLUGIN_HOOK__ ##__hook])(__VA_ARGS__); \ if (__ret == -ENOTSUP) \ continue; \ break; \ } \ __ret; \ }) #endif criu-3.6/criu/include/posix-timer.h000066400000000000000000000010421317335042600173430ustar00rootroot00000000000000#ifndef __CR_PROC_POSIX_TIMER_H__ #define __CR_PROC_POSIX_TIMER_H__ #include "common/list.h" struct str_posix_timer { long it_id; int clock_id; int si_signo; int it_sigev_notify; void * sival_ptr; }; struct proc_posix_timer { struct list_head list; struct str_posix_timer spt; }; struct proc_posix_timers_stat { int timer_n; struct list_head timers; }; extern int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat * args); void free_posix_timers(struct proc_posix_timers_stat *st); #endif /* __CR_PROC_POSIX_TIMER_H__ */ criu-3.6/criu/include/prctl.h000066400000000000000000000030731317335042600162150ustar00rootroot00000000000000#ifndef __CR_PRCTL_H__ #define __CR_PRCTL_H__ #include "int.h" #ifndef PR_SET_NAME # define PR_SET_NAME 15 #endif #ifndef PR_GET_NAME # define PR_GET_NAME 16 #endif #ifndef PR_SET_SECCOMP # define PR_SET_SECCOMP 22 #endif #ifndef PR_CAPBSET_READ # define PR_CAPBSET_READ 23 #endif #ifndef PR_CAPBSET_DROP # define PR_CAPBSET_DROP 24 #endif #ifndef PR_GET_SECUREBITS # define PR_GET_SECUREBITS 27 #endif #ifndef PR_SET_SECUREBITS # define PR_SET_SECUREBITS 28 #endif #ifndef PR_GET_DUMPABLE # define PR_GET_DUMPABLE 3 #endif #ifndef PR_SET_DUMPABLE # define PR_SET_DUMPABLE 4 #endif #ifndef PR_SET_MM #define PR_SET_MM 35 # define PR_SET_MM_START_CODE 1 # define PR_SET_MM_END_CODE 2 # define PR_SET_MM_START_DATA 3 # define PR_SET_MM_END_DATA 4 # define 
PR_SET_MM_START_STACK 5 # define PR_SET_MM_START_BRK 6 # define PR_SET_MM_BRK 7 # define PR_SET_MM_ARG_START 8 # define PR_SET_MM_ARG_END 9 # define PR_SET_MM_ENV_START 10 # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 # define PR_SET_MM_EXE_FILE 13 #endif #ifndef PR_SET_MM_MAP # define PR_SET_MM_MAP 14 # define PR_SET_MM_MAP_SIZE 15 struct prctl_mm_map { u64 start_code; u64 end_code; u64 start_data; u64 end_data; u64 start_brk; u64 brk; u64 start_stack; u64 arg_start; u64 arg_end; u64 env_start; u64 env_end; u64 *auxv; u32 auxv_size; u32 exe_fd; }; #endif #ifndef PR_GET_TID_ADDRESS # define PR_GET_TID_ADDRESS 40 #endif #ifndef PR_SET_THP_DISABLE # define PR_SET_THP_DISABLE 41 #endif #ifndef PR_GET_THP_DISABLE # define PR_GET_THP_DISABLE 42 #endif #endif /* __CR_PRCTL_H__ */ criu-3.6/criu/include/proc_parse.h000066400000000000000000000050521317335042600172250ustar00rootroot00000000000000#ifndef __CR_PROC_PARSE_H__ #define __CR_PROC_PARSE_H__ #include #include #include "images/seccomp.pb-c.h" #define PROC_TASK_COMM_LEN 32 #define PROC_TASK_COMM_LEN_FMT "(%31s" struct proc_pid_stat { int pid; char comm[PROC_TASK_COMM_LEN]; char state; int ppid; int pgid; int sid; int tty_nr; int tty_pgrp; unsigned int flags; unsigned long min_flt; unsigned long cmin_flt; unsigned long maj_flt; unsigned long cmaj_flt; unsigned long utime; unsigned long stime; long cutime; long cstime; long priority; long nice; int num_threads; int zero0; unsigned long long start_time; unsigned long vsize; long mm_rss; unsigned long rsslim; unsigned long start_code; unsigned long end_code; unsigned long start_stack; unsigned long esp; unsigned long eip; unsigned long sig_pending; unsigned long sig_blocked; unsigned long sig_ignored; unsigned long sig_handled; unsigned long wchan; unsigned long zero1; unsigned long zero2; int exit_signal; int task_cpu; unsigned int rt_priority; unsigned int policy; unsigned long long delayacct_blkio_ticks; unsigned long gtime; long cgtime; unsigned long 
start_data; unsigned long end_data; unsigned long start_brk; unsigned long arg_start; unsigned long arg_end; unsigned long env_start; unsigned long env_end; int exit_code; }; struct seccomp_info { SeccompFilter filter; int id; struct seccomp_info *prev; }; #define PROC_CAP_SIZE 2 struct proc_status_creds { struct seize_task_status s; unsigned int uids[4]; unsigned int gids[4]; u32 last_filter; /* * Keep them at the end of structure * for fast comparison reason. */ u32 cap_inh[PROC_CAP_SIZE]; u32 cap_prm[PROC_CAP_SIZE]; u32 cap_eff[PROC_CAP_SIZE]; u32 cap_bnd[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) extern int parse_pid_stat(pid_t pid, struct proc_pid_stat *s); extern unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent); extern int parse_pid_oom_score_adj(pid_t pid, int *err); extern int prepare_loginuid(unsigned int value, unsigned int loglevel); extern int parse_pid_status(pid_t pid, struct seize_task_status *, void *data); extern int parse_file_locks(void); extern int get_fd_mntid(int fd, int *mnt_id); struct pid; extern int parse_threads(int pid, struct pid **_t, int *_n); int parse_children(pid_t pid, pid_t **_c, int *_n); extern bool is_vma_range_fmt(char *line); extern void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf); #endif /* __CR_PROC_PARSE_H__ */ criu-3.6/criu/include/protobuf-desc.h000066400000000000000000000031531317335042600176440ustar00rootroot00000000000000#ifndef __CR_PROTOBUF_DESC_H__ #define __CR_PROTOBUF_DESC_H__ #include #include enum { /* PB_AUTOGEN_START */ PB_INVENTORY, /* 0 */ PB_STATS, PB_FDINFO, PB_CORE, PB_MM, PB_VMA, PB_ITIMER, PB_POSIX_TIMER, PB_CREDS, PB_FS, PB_UTSNS, /* 10 */ PB_IPC_VAR, PB_IPC_SHM, PB_IPC_SEM, PB_MNT, PB_PSTREE, PB_GHOST_FILE, PB_TCP_STREAM, PB_REG_FILE, PB_EXT_FILE, PB_NS_FILE, /* 20 */ PB_INET_SK, PB_UNIX_SK, PB_PACKET_SOCK, PB_NETLINK_SK, PB_PIPE, PB_FIFO, PB_PIPE_DATA, PB_EVENTFD_FILE, PB_EVENTPOLL_FILE, PB_EVENTPOLL_TFD, /* 30 */ PB_SIGNALFD, PB_INOTIFY_FILE, 
PB_INOTIFY_WD, PB_FANOTIFY_FILE, PB_FANOTIFY_MARK, PB_TTY_FILE, PB_TTY_INFO, PB_FILE_LOCK, PB_RLIMIT, PB_PAGEMAP, /* 40 */ PB_SIGINFO, PB_TUNFILE, PB_IRMAP_CACHE, PB_CGROUP, PB_SECCOMP, PB_TIMERFD, PB_CPUINFO, PB_USERNS, PB_NETNS, PB_BINFMT_MISC, /* 50 */ PB_TTY_DATA, PB_AUTOFS, PB_GHOST_CHUNK, PB_FILE, /* PB_AUTOGEN_STOP */ PB_PAGEMAP_HEAD, PB_IDS, PB_SIGACT, PB_NETDEV, PB_REMAP_FPATH, PB_SK_QUEUES, PB_IPCNS_MSG, PB_IPCNS_MSG_ENT, PB_MAX, }; typedef size_t (*pb_getpksize_t)(void *obj); typedef size_t (*pb_pack_t)(void *obj, void *where); typedef void *(*pb_unpack_t)(void *allocator, size_t size, void *from); typedef void (*pb_free_t)(void *obj, void *allocator); struct cr_pb_message_desc { pb_getpksize_t getpksize; pb_pack_t pack; pb_unpack_t unpack; pb_free_t free; const ProtobufCMessageDescriptor *pb_desc; }; extern void cr_pb_init(void); extern struct cr_pb_message_desc cr_pb_descs[PB_MAX]; #endif /* __CR_PROTOBUF_DESC_H__ */ criu-3.6/criu/include/protobuf.h000066400000000000000000000030071317335042600167260ustar00rootroot00000000000000#ifndef __CR_PROTOBUF_H__ #define __CR_PROTOBUF_H__ #include #include "protobuf-desc.h" #include "common/compiler.h" #include "util.h" struct cr_img; extern int do_pb_read_one(struct cr_img *, void **objp, int type, bool eof); #define pb_read_one(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, false) #define pb_read_one_eof(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, true) extern int pb_write_one(struct cr_img *, void *obj, int type); #define pb_pksize(__obj, __proto_message_name) \ (__proto_message_name ##__get_packed_size(__obj) + sizeof(u32)) #define pb_repeated_size(__obj, __member) \ ((size_t)(sizeof(*(__obj)->__member) * (__obj)->n_ ##__member)) #define pb_msg(__base, __type) \ container_of(__base, __type, base) #include struct collect_image_info { int fd_type; int pb_type; unsigned int priv_size; int (*collect)(void *, ProtobufCMessage *, struct cr_img *); unsigned flags; }; #define COLLECT_SHARED 0x1 
/* use shared memory for obj-s */ #define COLLECT_NOFREE 0x2 /* don't free entry after callback */ #define COLLECT_HAPPENED 0x4 /* image was opened and collected */ extern int collect_image(struct collect_image_info *); extern int collect_entry(ProtobufCMessage *base, struct collect_image_info *cinfo); static inline int collect_images(struct collect_image_info **array, unsigned size) { int i; for (i = 0; i < size; i++) { if (collect_image(array[i])) return -1; } return 0; } #endif /* __CR_PROTOBUF_H__ */ criu-3.6/criu/include/pstree.h000066400000000000000000000065751317335042600164050ustar00rootroot00000000000000#ifndef __CR_PSTREE_H__ #define __CR_PSTREE_H__ #include "common/list.h" #include "common/lock.h" #include "pid.h" #include "images/core.pb-c.h" /* * That's the init process which usually inherit * all orphaned children in the system. */ #define INIT_PID (1) struct pstree_item { struct pstree_item *parent; struct list_head children; /* list of my children */ struct list_head sibling; /* linkage in my parent's children list */ struct pid *pid; pid_t pgid; pid_t sid; pid_t born_sid; int nr_threads; /* number of threads */ struct pid *threads; /* array of threads */ CoreEntry **core; TaskKobjIdsEntry *ids; union { futex_t task_st; unsigned long task_st_le_bits; }; }; static inline pid_t vpid(const struct pstree_item *i) { return i->pid->ns[0].virt; } enum { FDS_EVENT_BIT = 0, }; #define FDS_EVENT (1 << FDS_EVENT_BIT) struct pstree_item *current; struct rst_info; /* See alloc_pstree_item() for details */ static inline struct rst_info *rsti(struct pstree_item *i) { return (struct rst_info *)(i + 1); } struct ns_id; struct dmp_info { struct ns_id *netns; /* * We keep the creds here so that we can compare creds while seizing * threads. Dumping tasks with different creds is not supported. 
*/ struct proc_status_creds *pi_creds; struct page_pipe *mem_pp; struct parasite_ctl *parasite_ctl; }; static inline struct dmp_info *dmpi(const struct pstree_item *i) { return (struct dmp_info *)(i + 1); } /* ids is alocated and initialized for all alive tasks */ static inline int shared_fdtable(struct pstree_item *item) { return (item->parent && item->ids->files_id == item->parent->ids->files_id); } static inline bool is_alive_state(int state) { return (state == TASK_ALIVE) || (state == TASK_STOPPED); } static inline bool task_alive(struct pstree_item *i) { return is_alive_state(i->pid->state); } extern void free_pstree(struct pstree_item *root_item); extern struct pstree_item *__alloc_pstree_item(bool rst); #define alloc_pstree_item() __alloc_pstree_item(false) extern void init_pstree_helper(struct pstree_item *ret); extern struct pstree_item *lookup_create_item(pid_t pid); extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) \ for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) extern bool restore_before_setsid(struct pstree_item *child); extern int prepare_pstree(void); extern int prepare_dummy_pstree(void); extern int dump_pstree(struct pstree_item *root_item); struct pstree_item *pstree_item_by_real(pid_t virt); struct pstree_item *pstree_item_by_virt(pid_t virt); extern int pid_to_virt(pid_t pid); struct task_entries; extern struct task_entries *task_entries; extern int prepare_task_entries(void); extern int prepare_dummy_task_state(struct pstree_item *pi); extern int get_task_ids(struct pstree_item *); extern struct _TaskKobjIdsEntry *root_ids; extern void core_entry_free(CoreEntry *core); extern CoreEntry *core_entry_alloc(int alloc_thread_info, int alloc_tc); extern int pstree_alloc_cores(struct pstree_item *item); extern void pstree_free_cores(struct 
pstree_item *item); extern int collect_pstree_ids(void); extern int preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *)); #endif /* __CR_PSTREE_H__ */ criu-3.6/criu/include/ptrace-compat.h000066400000000000000000000005441317335042600176300ustar00rootroot00000000000000#ifndef __CR_PTRACE_H__ #define __CR_PTRACE_H__ #include #include #include "config.h" #ifndef CONFIG_HAS_PTRACE_PEEKSIGINFO struct ptrace_peeksiginfo_args { __u64 off; /* from which siginfo to start */ __u32 flags; __u32 nr; /* how may siginfos to take */ }; #endif #endif /* __CR_PTRACE_H__ */ criu-3.6/criu/include/rbtree.h000066400000000000000000000047751317335042600163660ustar00rootroot00000000000000/* * RBtree implementation adopted from the Linux kernel sources. */ #ifndef __CR_RBTREE_H__ #define __CR_RBTREE_H__ #include #include "common/compiler.h" #define RB_RED 0 #define RB_BLACK 1 #define RB_MASK 3 struct rb_node { unsigned long rb_parent_color; /* Keeps both parent anc color */ struct rb_node *rb_right; struct rb_node *rb_left; } __aligned(sizeof(long)); struct rb_root { struct rb_node *rb_node; }; #define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_MASK)) #define rb_color(r) ((r)->rb_parent_color & RB_BLACK) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) (rb_color(r)) #define rb_set_red(r) do { (r)->rb_parent_color &= ~RB_BLACK; } while (0) #define rb_set_black(r) do { (r)->rb_parent_color |= RB_BLACK; } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->rb_parent_color = (rb->rb_parent_color & RB_MASK) | (unsigned long)p; } static inline void rb_set_color(struct rb_node *rb, int color) { rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color; } #define RB_ROOT (struct rb_root){ NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) #define RB_EMPTY_NODE(node) (rb_parent(node) == node) #define RB_CLEAR_NODE(node) 
(rb_set_parent(node, node)) static inline void rb_init_node(struct rb_node *node) { *node = (struct rb_node){ }; RB_CLEAR_NODE(node); } extern void rb_insert_color(struct rb_node *node, struct rb_root *root); extern void rb_erase(struct rb_node *node, struct rb_root *root); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_first(const struct rb_root *root); extern struct rb_node *rb_last(const struct rb_root *root); extern struct rb_node *rb_next(const struct rb_node *node); extern struct rb_node *rb_prev(const struct rb_node *node); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_and_balance(struct rb_root *root, struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { rb_link_node(node, parent, rb_link); rb_insert_color(node, root); } #endif /* __CR_RBTREE_H__ */ criu-3.6/criu/include/restore.h000066400000000000000000000002731317335042600165530ustar00rootroot00000000000000#ifndef __CR_INC_RESTORE_H__ #define __CR_INC_RESTORE_H__ #include "pid.h" #include "types.h" #include "asm/restore.h" extern int arch_set_thread_regs_nosigrt(struct pid *pid); #endif criu-3.6/criu/include/restorer.h000066400000000000000000000161201317335042600167330ustar00rootroot00000000000000#ifndef __CR_RESTORER_H__ #define __CR_RESTORER_H__ #include #include #include #include "types.h" #include "int.h" #include "types.h" #include "common/compiler.h" #include #include "common/lock.h" #include "util.h" #include "asm/restorer.h" #include "config.h" #include "posix-timer.h" #include "timerfd.h" #include "shmem.h" #include "parasite-vdso.h" #include "fault-injection.h" #include #include 
"images/mm.pb-c.h" /* * These *must* be power of two values. */ #define RESTORE_ARGS_SIZE (512) #define RESTORE_STACK_REDZONE (128) #define RESTORE_STACK_SIZE (KILO(32)) struct restore_mem_zone { u8 redzone[RESTORE_STACK_REDZONE]; u8 stack[RESTORE_STACK_SIZE]; u8 rt_sigframe[RESTORE_STACK_SIGFRAME]; } __stack_aligned__; struct rst_sched_param { int policy; int nice; int prio; }; struct restore_posix_timer { struct str_posix_timer spt; struct itimerspec val; int overrun; }; /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things * simpler, force both _args alignment be 64 bytes. */ struct thread_creds_args { CredsEntry creds; unsigned int cap_last_cap; u32 cap_inh[CR_CAP_SIZE]; u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; unsigned int secbits; char *lsm_profile; unsigned int *groups; unsigned long mem_lsm_profile_pos; unsigned long mem_groups_pos; unsigned long mem_pos_next; }; struct thread_restore_args { struct restore_mem_zone *mz; int pid; UserRegsEntry gpregs; u64 clear_tid_addr; u64 futex_rla; u32 futex_rla_len; struct rst_sched_param sp; struct task_restore_args *ta; tls_t tls; siginfo_t *siginfo; unsigned int siginfo_n; int pdeath_sig; struct thread_creds_args *creds_args; } __aligned(64); typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args); struct restore_vma_io { int nr_iovs; loff_t off; struct iovec iovs[0]; }; #define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) struct task_restore_args { struct thread_restore_args *t; /* thread group leader */ int fd_exe_link; /* opened self->exe file */ int logfd; unsigned int loglevel; struct timeval logstart; int uffd; bool has_thp_enabled; /* threads restoration */ int nr_threads; /* number of threads */ thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */ struct thread_restore_args *thread_args; /* array of 
thread arguments */ struct task_entries *task_entries; void *rst_mem; unsigned long rst_mem_size; /* Below arrays get remapped from RM_PRIVATE in sigreturn_restore */ VmaEntry *vmas; unsigned int vmas_n; int vma_ios_fd; struct restore_vma_io *vma_ios; unsigned int vma_ios_n; struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; struct restore_timerfd *timerfd; unsigned int timerfd_n; siginfo_t *siginfo; unsigned int siginfo_n; struct rst_tcp_sock *tcp_socks; unsigned int tcp_socks_n; struct rst_aio_ring *rings; unsigned int rings_n; struct rlimit64 *rlims; unsigned int rlims_n; pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */; unsigned int helpers_n; pid_t *zombies; unsigned int zombies_n; struct sock_fprog *seccomp_filters; unsigned int seccomp_filters_n; /* * * * * * * * * * * * * * * * * * * * */ unsigned long task_size; unsigned long premmapped_addr; unsigned long premmapped_len; rt_sigaction_t sigchld_act; void *bootstrap_start; unsigned long bootstrap_len; struct itimerval itimers[3]; MmEntry mm; auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; u32 mm_saved_auxv_size; char comm[TASK_COMM_LEN]; /* * proc_fd is a handle to /proc that the restorer blob can use to open * files there, because some of them can't be opened before the * restorer blob is called. */ int proc_fd; int seccomp_mode; bool compatible_mode; bool can_map_vdso; #ifdef CONFIG_VDSO unsigned long vdso_rt_size; struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ unsigned long vdso_rt_parked_at; /* safe place to keep vdso */ #endif void **breakpoint; enum faults fault_strategy; } __aligned(64); /* * For arm64 stack needs to aligned to 16 bytes. * Hence align to 16 bytes for all */ #define RESTORE_ALIGN_STACK(start, size) \ (ALIGN((start) + (size) - 16, 16)) static inline unsigned long restorer_stack(struct restore_mem_zone *mz) { return RESTORE_ALIGN_STACK((long)&mz->stack, RESTORE_STACK_SIZE); } enum { /* * Restore stages. 
The stage is started by criu process, then * confirmed by all tasks involved in it. Then criu does some * actions and starts the next stage. * * The first stated stage is CR_STATE_ROOT_TASK which is started * right before calling fork_with_pid() for the root_item. */ CR_STATE_FAIL = -1, /* * Root task is created and does some pre-checks. * After the stage ACT_SETUP_NS scripts are performed. */ CR_STATE_ROOT_TASK = 0, /* * The prepare_namespace() is called. * After the stage criu opens root task's mntns and * calls ACT_POST_SETUP_NS scripts. */ CR_STATE_PREPARE_NAMESPACES, /* * All tasks fork and call open_transport_socket(). * Stage is needed to make sure they all have the socket. * Also this stage is a sync point after which the * fini_restore_mntns() can be called. * * This stage is a little bit special. Normally all stages * are controlled by criu process, but when this stage * starts criu process starts waiting for the tasks to * finish it, but by the time it gets woken up the stage * finished is CR_STATE_RESTORE. The forking stage is * barrier-ed by the root task, this task is also the one * that switches the stage (into restoring). * * The above is done to lower the amount of context * switches from root task to criu and back, since the * separate forking stage is not needed by criu, it's * purely to make sure all tasks be in sync. */ CR_STATE_FORKING, /* * Main restore stage. By the end of it all tasks are * almost ready and what's left is: * pick up zombies and helpers * restore sigchild handlers used to detect restore errors * restore credentials */ CR_STATE_RESTORE, /* * Tasks restore sigchild handlers. * Stage is needed to synchronize the change in error * propagation via sigchild. */ CR_STATE_RESTORE_SIGCHLD, /* * Final stage. * For security reason processes can be resumed only when all * credentials are restored. Otherwise someone can attach to a * process, which are not restored credentials yet and execute * some code. 
*/ CR_STATE_RESTORE_CREDS, CR_STATE_COMPLETE }; #define restore_finish_stage(__v, __stage) ({ \ futex_dec_and_wake(&(__v)->nr_in_progress); \ futex_wait_while(&(__v)->start, __stage); \ (s32) futex_get(&(__v)->start); \ }) #define __r_sym(name) restorer_sym ## name #define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name)) #endif /* __CR_RESTORER_H__ */ criu-3.6/criu/include/rst-malloc.h000066400000000000000000000050271317335042600171470ustar00rootroot00000000000000#ifndef __CR_RST_MALLOC__H__ #define __CR_RST_MALLOC__H__ /* * On restore we need differetn types of memory allocation. * Here's an engine that tries to generalize them all. The * main difference is in how the buffer with objects is being * grown up. * * Buffers, that are to be used by restorer will be remapped * into restorer address space with rst_mem_remap() call. Thus * we have to either keep track of all the buffers and objects, * or keep objects one-by-one in a plain linear buffer. The * engine uses the 2nd approach. */ enum { /* * Shared non-remapable allocations. These can happen only * in "global" context, i.e. when objects are allocated to * be used by any process to be restored. The objects are * not going to be used in restorer blob, thus allocation * engine grows buffers in a simple manner. */ RM_SHARED, /* * Shared objects, that are about to be used in restorer * blob. For these the *_remap_* stuff below is used to get * the actual pointer on any object. Growing a buffer is * done with mremap, so that we don't have to keep track * of all the buffer chunks and can remap them in restorer * in one call. */ RM_SHREMAP, /* * Privately used objects. Buffer grow and remap is the * same as for SHREMAP, but memory regions are MAP_PRIVATE. 
*/ RM_PRIVATE, RST_MEM_TYPES, }; /* * Disables SHARED and SHREMAP allocations, turns on PRIVATE */ extern void rst_mem_switch_to_private(void); /* * Reports a cookie of a current shared buffer position, that * can later be used in rst_mem_remap_ptr() to find out the object * pointer in the restorer blob. */ extern unsigned long rst_mem_align_cpos(int type); extern void *rst_mem_remap_ptr(unsigned long pos, int type); #define RST_MEM_FIXUP_PPTR(ptr) do { \ ptr = rst_mem_remap_ptr((unsigned long)ptr, RM_PRIVATE);\ } while (0) /* * Allocate and free objects. We don't need to free arbitrary * object, thus allocation is simple (linear) and only the * last object can be freed (pop-ed from buffer). */ extern void *rst_mem_alloc(unsigned long size, int type); extern void rst_mem_free_last(int type); /* Word-align the current freelist pointer for the next allocation. If we don't * align pointers, some futex and atomic operations can fail. */ extern void rst_mem_align(int type); /* * Routines to remap SHREMAP and PRIVATE into restorer address space */ extern unsigned long rst_mem_lock(void); extern int rst_mem_remap(void *to); extern void *shmalloc(size_t bytes); extern void shfree_last(void *ptr); #endif /* __CR_RST_MALLOC__H__ */ criu-3.6/criu/include/rst_info.h000066400000000000000000000024121317335042600167100ustar00rootroot00000000000000#ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ #include "common/lock.h" #include "common/list.h" #include "vma.h" struct task_entries { int nr_threads, nr_tasks, nr_helpers; futex_t nr_in_progress; futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; }; struct fdt { int nr; /* How many tasks share this fd table */ pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr * The fdt table was restrored, if fdt_lock is equal to nr + 1 */ futex_t fdt_lock; }; struct _MmEntry; struct rst_info { struct list_head fds; void *premmapped_addr; unsigned long premmapped_len; 
unsigned long clone_flags; void *munmap_restorer; int service_fd_id; struct fdt *fdt; struct vm_area_list vmas; struct _MmEntry *mm; struct list_head vma_io; unsigned int pages_img_id; u32 cg_set; union { struct pstree_item *pgrp_leader; futex_t pgrp_set; }; struct file_desc *cwd; struct file_desc *root; bool has_umask; u32 umask; /* * We set this flag when process has seccomp filters * so that we know to suspend them before we unmap the * restorer blob. */ bool has_seccomp; bool has_thp_enabled; void *breakpoint; }; #endif /* __CR_RST_INFO_H__ */ criu-3.6/criu/include/seccomp.h000066400000000000000000000012641317335042600165220ustar00rootroot00000000000000#ifndef __CR_SECCOMP_H__ #define __CR_SECCOMP_H__ #include #include #include "images/core.pb-c.h" #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif #ifndef SECCOMP_MODE_STRICT #define SECCOMP_MODE_STRICT 1 #endif #ifndef SECCOMP_MODE_FILTER #define SECCOMP_MODE_FILTER 2 #endif #ifndef SECCOMP_SET_MODE_FILTER #define SECCOMP_SET_MODE_FILTER 1 #endif #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif extern int collect_seccomp_filters(void); extern int prepare_seccomp_filters(void); struct task_restore_args; extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *); #endif criu-3.6/criu/include/seize.h000066400000000000000000000003641317335042600162100ustar00rootroot00000000000000#ifndef __CR_SEIZE_H__ #define __CR_SEIZE_H__ extern int collect_pstree(void); extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); #endif criu-3.6/criu/include/servicefd.h000066400000000000000000000021341317335042600170400ustar00rootroot00000000000000#ifndef __CR_SERVICE_FD_H__ #define __CR_SERVICE_FD_H__ #include enum sfd_type { SERVICE_FD_MIN, LOG_FD_OFF, IMG_FD_OFF, PROC_FD_OFF, /* fd with /proc for all proc_ calls */ CTL_TTY_OFF, SELF_STDIN_OFF, CR_PROC_FD_OFF, /* 
some other's proc fd. * For dump -- target ns' proc * For restore -- CRIU ns' proc */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, USERNSD_SK, /* Socket for usernsd */ NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ RPC_SK_OFF, FDSTORE_SK_OFF, LAZY_PAGES_SK_OFF, /* socket for communication with lazy-pages daemon */ SERVICE_FD_MAX }; extern int clone_service_fd(int id); extern int init_service_fd(void); extern int get_service_fd(enum sfd_type type); extern int reserve_service_fd(enum sfd_type type); extern int install_service_fd(enum sfd_type type, int fd); extern int close_service_fd(enum sfd_type type); extern bool is_service_fd(int fd, enum sfd_type type); extern bool is_any_service_fd(int fd); extern int service_fd_min_fd(void); #endif /* __CR_SERVICE_FD_H__ */ criu-3.6/criu/include/setproctitle.h000066400000000000000000000006131317335042600176070ustar00rootroot00000000000000#ifndef __CR_SETPROCTITLE_H__ #define __CR_SETPROCTITLE_H__ #ifdef CONFIG_HAS_LIBBSD #include #else /* * setproctitle_init is in the libbsd since v0.6.0. This macro allows to * compile criu with libbsd<0.6.0. */ #ifndef CONFIG_HAS_SETPROCTITLE_INIT #define setproctitle_init(argc, argv, envp) #endif #define setproctitle(fmt, ...) 
#endif #endif /* __CR_SETPROCTITLE_H__ */ criu-3.6/criu/include/shmem.h000066400000000000000000000012261317335042600162000ustar00rootroot00000000000000#ifndef __CR_SHMEM_H__ #define __CR_SHMEM_H__ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" struct _VmaEntry; struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); extern int fixup_sysv_shmems(void); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); extern int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shmid); #define SYSV_SHMEM_SKIP_FD (0x7fffffff) #endif /* __CR_SHMEM_H__ */ criu-3.6/criu/include/sigframe.h000066400000000000000000000005251317335042600166650ustar00rootroot00000000000000/* * Generic sigframe bits. */ #ifndef __CR_SIGFRAME_H__ #define __CR_SIGFRAME_H__ #include #include "images/core.pb-c.h" extern int construct_sigframe(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe, k_rtsigset_t *blkset, CoreEntry *core); #endif /* __CR_SIGFRAME_H__ */ criu-3.6/criu/include/signalfd.h000066400000000000000000000004041317335042600166530ustar00rootroot00000000000000#ifndef __CR_SIGNALFD_H__ #define __CR_SIGNALFD_H__ struct cr_imgset; struct fd_parms; extern int is_signalfd_link(char *link); extern const struct fdtype_ops signalfd_dump_ops; extern struct collect_image_info signalfd_cinfo; #endif /* __CR_SIGNALFD_H__ */ criu-3.6/criu/include/sk-inet.h000066400000000000000000000042341317335042600164430ustar00rootroot00000000000000#ifndef __CR_SK_INET_H__ #define __CR_SK_INET_H__ #include #include "sockets.h" #include "files.h" #include "common/list.h" #include "images/sk-inet.pb-c.h" #define INET_ADDR_LEN 48 /* max of INET_ADDRSTRLEN and INET6_ADDRSTRLEN */ #ifndef TCP_REPAIR #define TCP_REPAIR 19 /* TCP sock is under repair right now */ 
#define TCP_REPAIR_QUEUE 20 #define TCP_QUEUE_SEQ 21 #define TCP_REPAIR_OPTIONS 22 #endif struct inet_sk_desc { struct socket_desc sd; unsigned int type; unsigned int src_port; unsigned int dst_port; unsigned int state; unsigned int rqlen; unsigned int wqlen; /* sent + unsent data */ unsigned int uwqlen; /* unsent data */ unsigned int src_addr[4]; unsigned int dst_addr[4]; unsigned short shutdown; int rfd; int cpt_reuseaddr; struct list_head rlist; void *priv; }; struct inet_port; struct inet_sk_info { InetSkEntry *ie; struct file_desc d; struct inet_port *port; struct list_head port_list; /* * This is an fd by which the socket is opened. * It will be carried down to restorer code to * repair-off the socket at the very end. */ int sk_fd; struct list_head rlist; }; extern int inet_bind(int sk, struct inet_sk_info *); extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt #endif static inline void tcp_repair_off(int fd) { int aux = 0, ret; ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) pr_err("Failed to turn off repair mode on socket: %m\n"); } extern void tcp_locked_conn_add(struct inet_sk_info *); extern void rst_unlock_tcp_connections(void); extern void cpt_unlock_tcp_connections(void); extern int dump_one_tcp(int sk, struct inet_sk_desc *sd); extern int restore_one_tcp(int sk, struct inet_sk_info *si); #define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" #define SK_CLOSE_PARAM "tcp-close" struct task_restore_args; int prepare_tcp_socks(struct task_restore_args *); struct rst_tcp_sock { int sk; bool reuseaddr; }; union libsoccr_addr; int restore_sockaddr(union libsoccr_addr *sa, int family, u32 pb_port, u32 *pb_addr, u32 ifindex); #endif /* __CR_SK_INET_H__ */ criu-3.6/criu/include/sk-packet.h000066400000000000000000000014561317335042600167560ustar00rootroot00000000000000#ifndef __CR_SK_PACKET_H__ #define __CR_SK_PACKET_H__ #ifndef PACKET_TIMESTAMP #define 
PACKET_TIMESTAMP 17 #endif struct cr_imgset; struct fd_parms; struct vma_area; extern struct collect_image_info packet_sk_cinfo; extern int dump_socket_map(struct vma_area *vma); extern int collect_socket_map(struct vma_area *); struct nlmsghdr; extern int packet_receive_one(struct nlmsghdr *h, void *arg); #ifndef PACKET_VNET_HDR #define PACKET_VNET_HDR 15 #endif #ifndef PACKET_FANOUT #define PACKET_FANOUT 18 #endif #ifndef TPACKET3_HDRLEN struct tpacket_req3 { unsigned int tp_block_size; unsigned int tp_block_nr; unsigned int tp_frame_size; unsigned int tp_frame_nr; unsigned int tp_retire_blk_tov; unsigned int tp_sizeof_priv; unsigned int tp_feature_req_word; }; #endif #endif /* __CR_SK_PACKET_H__ */ criu-3.6/criu/include/sk-queue.h000066400000000000000000000003661317335042600166320ustar00rootroot00000000000000#ifndef __CR_SK_QUEUE_H__ #define __CR_SK_QUEUE_H__ extern struct collect_image_info sk_queues_cinfo; extern int dump_sk_queue(int sock_fd, int sock_id); extern int restore_sk_queue(int fd, unsigned int peer_id); #endif /* __CR_SK_QUEUE_H__ */ criu-3.6/criu/include/sockets.h000066400000000000000000000051751317335042600165510ustar00rootroot00000000000000#ifndef __CR_SOCKETS_H__ #define __CR_SOCKETS_H__ #include #include #include "images/sk-opts.pb-c.h" #include "images/fdinfo.pb-c.h" struct fdinfo_list_entry; struct sk_opts_entry; struct file_desc; struct fd_parms; struct cr_imgset; struct nlmsghdr; struct cr_img; struct socket_desc { unsigned int family; unsigned int ino; struct socket_desc *next; int already_dumped; }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); extern bool socket_test_collect_bit(unsigned int family, unsigned int proto); extern int sk_collect_one(unsigned ino, int family, struct 
socket_desc *d); struct ns_id; extern int collect_sockets(struct ns_id *); extern struct collect_image_info inet_sk_cinfo; extern struct collect_image_info unix_sk_cinfo; extern int fix_external_unix_sockets(void); extern int prepare_scms(void); extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids); extern struct collect_image_info netlink_sk_cinfo; extern struct socket_desc *lookup_socket(unsigned ino, int family, int proto); extern const struct fdtype_ops unix_dump_ops; extern const struct fdtype_ops inet_dump_ops; extern const struct fdtype_ops inet6_dump_ops; extern const struct fdtype_ops netlink_dump_ops; extern const struct fdtype_ops packet_dump_ops; extern int inet_collect_one(struct nlmsghdr *h, int family, int type); extern int unix_receive_one(struct nlmsghdr *h, void *); extern int netlink_receive_one(struct nlmsghdr *hdr, void *arg); extern int unix_sk_id_add(unsigned int ino); extern int unix_sk_ids_parse(char *optarg); extern int do_dump_opt(int sk, int level, int name, void *val, int len); #define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f)) extern int do_restore_opt(int sk, int level, int name, void *val, int len); #define restore_opt(s, l, n, f) do_restore_opt(s, l, n, f, sizeof(*f)) #define sk_encode_shutdown(img, mask) do { \ /* \ * protobuf SK_SHUTDOWN__ bits match those \ * reported by kernel \ */ \ (img)->shutdown = mask; \ if ((img)->shutdown != SK_SHUTDOWN__NONE) \ (img)->has_shutdown = true; \ } while (0) static inline int sk_decode_shutdown(int val) { static const int hows[] = {-1, SHUT_RD, SHUT_WR, SHUT_RDWR}; return hows[val]; } #define USK_EXT_PARAM "ext-unix-sk" #ifndef NETLINK_SOCK_DIAG #define NETLINK_SOCK_DIAG NETLINK_INET_DIAG #endif #endif /* __CR_SOCKETS_H__ */ criu-3.6/criu/include/stats.h000066400000000000000000000012731317335042600162270ustar00rootroot00000000000000#ifndef __CR_STATS_H__ #define __CR_STATS_H__ enum { TIME_FREEZING, TIME_FROZEN, TIME_MEMDUMP, TIME_MEMWRITE, 
TIME_IRMAP_RESOLVE, DUMP_TIME_NR_STATS, }; enum { TIME_FORK, TIME_RESTORE, RESTORE_TIME_NS_STATS, }; extern void timing_start(int t); extern void timing_stop(int t); enum { CNT_PAGES_SCANNED, CNT_PAGES_SKIPPED_PARENT, CNT_PAGES_WRITTEN, CNT_PAGES_LAZY, DUMP_CNT_NR_STATS, }; enum { CNT_PAGES_COMPARED, CNT_PAGES_SKIPPED_COW, CNT_PAGES_RESTORED, RESTORE_CNT_NR_STATS, }; extern void cnt_add(int c, unsigned long val); #define DUMP_STATS 1 #define RESTORE_STATS 2 extern int init_stats(int what); extern void write_stats(int what); #endif /* __CR_STATS_H__ */ criu-3.6/criu/include/string.h000066400000000000000000000005761317335042600164040ustar00rootroot00000000000000#ifndef __CR_STRING_H__ #define __CR_STRING_H__ #include #ifdef CONFIG_HAS_LIBBSD # include #endif #include "config.h" #ifndef CONFIG_HAS_STRLCPY extern size_t strlcpy(char *dest, const char *src, size_t size); #endif #ifndef CONFIG_HAS_STRLCAT extern size_t strlcat(char *dest, const char *src, size_t count); #endif #endif /* __CR_STRING_H__ */ criu-3.6/criu/include/sysctl.h000066400000000000000000000016621317335042600164140ustar00rootroot00000000000000#ifndef __CR_SYSCTL_H__ #define __CR_SYSCTL_H__ struct sysctl_req { char *name; void *arg; int type; int flags; }; extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns); enum { CTL_READ, CTL_WRITE, }; #define CTL_SHIFT 4 /* Up to 16 types */ #define CTL_U32 1 /* Single u32 */ #define CTL_U64 2 /* Single u64 */ #define __CTL_U32A 3 /* Array of u32 */ #define __CTL_U64A 4 /* Array of u64 */ #define __CTL_STR 5 /* String */ #define CTL_32 6 /* Single s32 */ #define CTL_U32A(n) (__CTL_U32A | ((n) << CTL_SHIFT)) #define CTL_U64A(n) (__CTL_U64A | ((n) << CTL_SHIFT)) #define CTL_STR(len) (__CTL_STR | ((len) << CTL_SHIFT)) #define CTL_LEN(t) ((t) >> CTL_SHIFT) #define CTL_TYPE(t) ((t) & ((1 << CTL_SHIFT) - 1)) /* * Some entries might be missing mark them as optional. 
*/ #define CTL_FLAGS_OPTIONAL 1 #define CTL_FLAGS_HAS 2 #define CTL_FLAGS_READ_EIO_SKIP 4 #endif /* __CR_SYSCTL_H__ */ criu-3.6/criu/include/sysfs_parse.h000066400000000000000000000010411317335042600174230ustar00rootroot00000000000000#ifndef __CR_SYSFS_PARSE_H__ #define __CR_SYSFS_PARSE_H__ #define SYSFS_AUFS "/sys/fs/aufs/" #define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ #define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/ */ #define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ struct mount_info; struct vma_area; extern int parse_aufs_branches(struct mount_info *mi); extern int fixup_aufs_vma_fd(struct vma_area *vma, int vm_file_fd); extern void free_aufs_branches(void); #endif /* __CR_SYSFS_PARSE_H__ */ criu-3.6/criu/include/timerfd.h000066400000000000000000000017531317335042600165260ustar00rootroot00000000000000#ifndef __CR_TIMERFD_H__ #define __CR_TIMERFD_H__ #include #include #include "files.h" #include "images/timerfd.pb-c.h" struct pstree_item; struct restore_timerfd { int id; int fd; int clockid; int settime_flags; unsigned long ticks; struct itimerspec val; }; extern const struct fdtype_ops timerfd_dump_ops; extern struct collect_image_info timerfd_cinfo; struct task_restore_args; int prepare_timerfds(struct task_restore_args *); extern int check_timerfd(void); extern int is_timerfd_link(char *link); #ifndef TFD_TIMER_ABSTIME # define TFD_TIMER_ABSTIME (1 << 0) #endif #ifndef TFD_IOC_SET_TICKS # define TFD_IOC_SET_TICKS _IOW('T', 0, u64) #endif static inline int verify_timerfd(TimerfdEntry *tfe) { if (tfe->clockid != CLOCK_REALTIME && tfe->clockid != CLOCK_BOOTTIME && tfe->clockid != CLOCK_MONOTONIC) { pr_err("Unknown clock type %d for %#x\n", tfe->clockid, tfe->id); return -1; } return 0; } #endif /* __CR_TIMERFD_H__ */ criu-3.6/criu/include/tty.h000066400000000000000000000020301317335042600157010ustar00rootroot00000000000000#ifndef __CR_TTY_H__ #define __CR_TTY_H__ #include #include #include "files.h" /* Kernel's 
limit */ #define TERMIOS_NCC 19 /* Popular serial console's majors, which not defined in */ #define USB_SERIAL_MAJOR 188 #define LOW_DENSE_SERIAL_MAJOR 204 extern const struct fdtype_ops tty_dump_ops; struct tty_driver; struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev); static inline int is_tty(dev_t rdev, dev_t dev) { return get_tty_driver(rdev, dev) != NULL; } extern int tty_post_actions(void); extern int dump_verify_tty_sids(void); extern struct collect_image_info tty_info_cinfo; extern struct collect_image_info tty_cinfo; extern struct collect_image_info tty_cdata; struct mount_info; extern int devpts_restore(struct mount_info *pm); extern int tty_prep_fds(void); extern void tty_fini_fds(void); extern int tty_restore_ctl_terminal(struct file_desc *d, int fd); extern int devpts_check_bindmount(struct mount_info *m); #define OPT_SHELL_JOB "shell-job" #endif /* __CR_TTY_H__ */ criu-3.6/criu/include/tun.h000066400000000000000000000007241317335042600156770ustar00rootroot00000000000000#ifndef __CR_TUN_H__ #define __CR_TUN_H__ #ifndef TUN_MINOR #define TUN_MINOR 200 #endif #include #include "images/netdev.pb-c.h" extern const struct fdtype_ops tunfile_dump_ops; extern int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info); extern int restore_one_tun(NetDeviceEntry *nde, int nlsk); extern struct collect_image_info tunfile_cinfo; extern int check_tun_cr(int no_tun_err); #endif /* __CR_TUN_H__ */ criu-3.6/criu/include/types.h000066400000000000000000000001711317335042600162310ustar00rootroot00000000000000#ifndef __CR_INC_TYPES_H__ #define __CR_INC_TYPES_H__ #include #include "asm/types.h" #endif criu-3.6/criu/include/uffd.h000066400000000000000000000005401317335042600160110ustar00rootroot00000000000000#ifndef __CR_UFFD_H_ #define __CR_UFFD_H_ struct task_restore_args; extern int uffd_open(int flags, unsigned long *features); extern bool uffd_noncooperative(void); extern int setup_uffd(int pid, struct task_restore_args *task_args); 
extern int lazy_pages_setup_zombie(int pid); extern int prepare_lazy_pages_socket(void); #endif /* __CR_UFFD_H_ */ criu-3.6/criu/include/unix_diag.h000066400000000000000000000023231317335042600170350ustar00rootroot00000000000000#ifndef __CR_UNIX_DIAG_H__ #define __CR_UNIX_DIAG_H__ struct unix_diag_req { u8 sdiag_family; u8 sdiag_protocol; u16 pad; u32 udiag_states; u32 udiag_ino; u32 udiag_show; u32 udiag_cookie[2]; }; #define UDIAG_SHOW_NAME 0x00000001 /* show name (not path) */ #define UDIAG_SHOW_VFS 0x00000002 /* show VFS inode info */ #define UDIAG_SHOW_PEER 0x00000004 /* show peer socket info */ #define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */ #define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */ #define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */ struct unix_diag_msg { u8 udiag_family; u8 udiag_type; u8 udiag_state; u8 pad; u32 udiag_ino; u32 udiag_cookie[2]; }; enum { SK_MEMINFO_RMEM_ALLOC, SK_MEMINFO_RCVBUF, SK_MEMINFO_WMEM_ALLOC, SK_MEMINFO_SNDBUF, SK_MEMINFO_FWD_ALLOC, SK_MEMINFO_WMEM_QUEUED, SK_MEMINFO_OPTMEM, SK_MEMINFO_VARS, }; enum { UNIX_DIAG_NAME, UNIX_DIAG_VFS, UNIX_DIAG_PEER, UNIX_DIAG_ICONS, UNIX_DIAG_RQLEN, UNIX_DIAG_MEMINFO, UNIX_DIAG_SHUTDOWN, UNIX_DIAG_MAX, }; struct unix_diag_vfs { u32 udiag_vfs_ino; u32 udiag_vfs_dev; }; struct unix_diag_rqlen { u32 udiag_rqueue; u32 udiag_wqueue; }; #endif /* __CR_UNIX_DIAG_H__ */ criu-3.6/criu/include/util-pie.h000066400000000000000000000005521317335042600166200ustar00rootroot00000000000000#ifndef __CR_UTIL_NET_H__ #define __CR_UTIL_NET_H__ #include #include #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \ (size_t)((struct sockaddr_un *) 0)->sun_path) #ifndef SO_PEEK_OFF #define SO_PEEK_OFF 42 #endif #include "common/scm.h" extern int open_detach_mount(char *dir); #endif /* __CR_UTIL_NET_H__ */ criu-3.6/criu/include/util-vdso.h000066400000000000000000000046211317335042600170170ustar00rootroot00000000000000#ifndef __CR_UTIL_VDSO_H__ #define 
__CR_UTIL_VDSO_H__ /* * VDSO management common definitions. * * This header file is included by the criu main code and the parasite code. * It contains definitions shared by these 2 parts. * * This file should not be included except in pie/util-vdso.c, include/vdso.h * and include/parasite-vdso.h */ #include /* * Each architecture must export: * VDSO_SYMBOL_MAX, the number of vDSO symbols to manage * ARCH_VDSO_SYMBOLS, a table of string containing the vDSO symbol names * vdso_redirect_calls, a service called to redirect the vDSO symbols in * the parasite code. */ #include "asm/vdso.h" struct vdso_symbol { char name[32]; unsigned long offset; }; struct vdso_symtable { unsigned long vdso_size; unsigned long vvar_size; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; bool vdso_before_vvar; /* order of vdso/vvar pair */ }; struct vdso_maps { unsigned long vdso_start; unsigned long vvar_start; struct vdso_symtable sym; }; #define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, } #define VDSO_SYMTABLE_INIT \ { \ .vdso_size = VDSO_BAD_SIZE, \ .vvar_size = VVAR_BAD_SIZE, \ .symbols = { \ [0 ... 
VDSO_SYMBOL_MAX - 1] = \ (struct vdso_symbol)VDSO_SYMBOL_INIT, \ }, \ .vdso_before_vvar = false, \ } #define VDSO_MAPS_INIT \ { \ .vdso_start = VDSO_BAD_ADDR, \ .vvar_start = VVAR_BAD_ADDR, \ .sym = VDSO_SYMTABLE_INIT, \ } #ifdef CONFIG_VDSO_32 #define Ehdr_t Elf32_Ehdr #define Sym_t Elf32_Sym #define Phdr_t Elf32_Phdr #define Word_t Elf32_Word #define Dyn_t Elf32_Dyn #define ELF_ST_TYPE ELF32_ST_TYPE #define ELF_ST_BIND ELF32_ST_BIND #else /* CONFIG_VDSO_32 */ #define Ehdr_t Elf64_Ehdr #define Sym_t Elf64_Sym #define Phdr_t Elf64_Phdr #define Word_t Elf64_Word #define Dyn_t Elf64_Dyn #ifndef ELF_ST_TYPE #define ELF_ST_TYPE ELF64_ST_TYPE #endif #ifndef ELF_ST_BIND #define ELF_ST_BIND ELF64_ST_BIND #endif #endif /* CONFIG_VDSO_32 */ #if defined(CONFIG_VDSO_32) # define vdso_fill_symtable vdso_fill_symtable_compat #endif extern int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t); #if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) #ifndef ARCH_MAP_VDSO_32 # define ARCH_MAP_VDSO_32 0x2002 #endif extern int vdso_fill_symtable_compat(uintptr_t mem, size_t size, struct vdso_symtable *t); #endif #endif /* __CR_UTIL_VDSO_H__ */ criu-3.6/criu/include/util.h000066400000000000000000000215701317335042600160500ustar00rootroot00000000000000#ifndef __CR_UTIL_H__ #define __CR_UTIL_H__ /* * Some bits are stolen from perf and kvm tools */ #include #include #include #include #include #include #include #include #include #include "int.h" #include "common/compiler.h" #include "xmalloc.h" #include "common/bug.h" #include "log.h" #include "common/err.h" #define PREF_SHIFT_OP(pref, op, size) ((size) op (pref ##BYTES_SHIFT)) #define KBYTES_SHIFT 10 #define MBYTES_SHIFT 20 #define GBYTES_SHIFT 30 #define KBYTES(size) PREF_SHIFT_OP(K, >>, size) #define MBYTES(size) PREF_SHIFT_OP(M, >>, size) #define GBYTES(size) PREF_SHIFT_OP(G, >>, size) #define KILO(size) PREF_SHIFT_OP(K, <<, size) #define MEGA(size) PREF_SHIFT_OP(M, <<, size) #define GIGA(size) PREF_SHIFT_OP(G, <<, 
size) struct vma_area; struct list_head; extern void pr_vma(unsigned int loglevel, const struct vma_area *vma_area); #define pr_info_vma(vma_area) pr_vma(LOG_INFO, vma_area) #define pr_vma_list(level, head) \ do { \ struct vma_area *vma; \ list_for_each_entry(vma, head, list) \ pr_vma(level, vma); \ } while (0) #define pr_info_vma_list(head) pr_vma_list(LOG_INFO, head) extern int move_fd_from(int *img_fd, int want_fd); extern int close_safe(int *fd); extern int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd); #define reopen_fd_as(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, false) #define reopen_fd_as_nocheck(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, true) extern void close_proc(void); extern int open_pid_proc(pid_t pid); extern int close_pid_proc(void); extern int set_proc_fd(int fd); /* * Values for pid argument of the proc opening routines below. * SELF would open file under /proc/self * GEN would open a file under /proc itself * NONE is internal, don't use it ;) */ #define PROC_SELF 0 #define PROC_GEN -1 #define PROC_NONE -2 extern int do_open_proc(pid_t pid, int flags, const char *fmt, ...) __attribute__ ((__format__ (__printf__, 3, 4))); #define __open_proc(pid, ier, flags, fmt, ...) \ ({ \ int __fd = do_open_proc(pid, flags, \ fmt, ##__VA_ARGS__); \ if (__fd < 0 && (errno != (ier))) \ pr_perror("Can't open %d/" fmt " on procfs", \ pid, ##__VA_ARGS__); \ \ __fd; \ }) /* int open_proc(pid_t pid, const char *fmt, ...); */ #define open_proc(pid, fmt, ...) \ __open_proc(pid, 0, O_RDONLY, fmt, ##__VA_ARGS__) /* int open_proc_rw(pid_t pid, const char *fmt, ...); */ #define open_proc_rw(pid, fmt, ...) \ __open_proc(pid, 0, O_RDWR, fmt, ##__VA_ARGS__) #define open_proc_path(pid, fmt, ...) \ __open_proc(pid, 0, O_PATH, fmt, ##__VA_ARGS__) /* DIR *opendir_proc(pid_t pid, const char *fmt, ...); */ #define opendir_proc(pid, fmt, ...) 
\ ({ \ int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ DIR *__d = NULL; \ \ if (__fd >= 0) { \ __d = fdopendir(__fd); \ if (__d == NULL) \ pr_perror("Can't fdopendir %d " \ "(%d/" fmt " on procfs)", \ __fd, pid, ##__VA_ARGS__); \ } \ __d; \ }) /* FILE *fopen_proc(pid_t pid, const char *fmt, ...); */ #define fopen_proc(pid, fmt, ...) \ ({ \ int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ FILE *__f = NULL; \ \ if (__fd >= 0) { \ __f = fdopen(__fd, "r"); \ if (__f == NULL) \ pr_perror("Can't fdopen %d " \ "(%d/" fmt " on procfs)", \ __fd, pid, ##__VA_ARGS__); \ } \ __f; \ }) #define DEVZERO (makedev(1, 5)) #define KDEV_MINORBITS 20 #define KDEV_MINORMASK ((1UL << KDEV_MINORBITS) - 1) #define MKKDEV(ma, mi) (((ma) << KDEV_MINORBITS) | (mi)) static inline u32 kdev_major(u32 kdev) { return kdev >> KDEV_MINORBITS; } static inline u32 kdev_minor(u32 kdev) { return kdev & KDEV_MINORMASK; } static inline dev_t kdev_to_odev(u32 kdev) { /* * New kernels encode devices in a new form. * See kernel's fs/stat.c for details, there * choose_32_64 helpers which are the key. 
*/ unsigned major = kdev_major(kdev); unsigned minor = kdev_minor(kdev); return makedev(major, minor); } extern int copy_file(int fd_in, int fd_out, size_t bytes); extern int is_anon_link_type(char *link, char *type); #define is_hex_digit(c) \ (((c) >= '0' && (c) <= '9') || \ ((c) >= 'a' && (c) <= 'f') || \ ((c) >= 'A' && (c) <= 'F')) #define CRS_CAN_FAIL 0x1 /* cmd can validly exit with non zero code */ extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags); extern int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid); extern int cr_daemon(int nochdir, int noclose, int *keep_fd, int close_fd); extern int close_status_fd(void); extern int is_root_user(void); static inline bool dir_dots(const struct dirent *de) { return !strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."); } extern int is_empty_dir(int dirfd); /* * Size of buffer to carry the worst case or /proc/self/fd/N * path. Since fd is an integer, we can easily estimate one :) */ #define PSFDS (sizeof("/proc/self/fd/2147483647")) extern int read_fd_link(int lfd, char *buf, size_t size); #define USEC_PER_SEC 1000000L #define NSEC_PER_SEC 1000000000L int vaddr_to_pfn(int fd, unsigned long vaddr, u64 *pfn); /* * Check whether @str starts with @sub and report the * next character of @str in @end */ static inline bool strstartswith2(const char *str, const char *sub, char *end) { const char *osub = sub; while (1) { if (*sub == '\0') /* end of sub -- match */ { if (end) { if (sub == osub + 1) /* pure root */ *end = '/'; else *end = *str; } return true; } if (*str == '\0') /* end of str, sub is NOT ended -- miss */ return false; if (*str != *sub) return false; str++; sub++; } } static inline bool strstartswith(const char *str, const char *sub) { return strstartswith2(str, sub, NULL); } /* * Checks whether the @path has @sub_path as a sub path, i.e. 
* sub_path is the beginning of path and the last component * match is full (next character terminates path component). * * Paths shouldn't contain excessive /-s, i.e. only one slash * between path components and no slash at the end (except for * the "/" path. This is pretty good assumption to what paths * are used by criu. */ static inline bool issubpath(const char *path, const char *sub_path) { char end; return strstartswith2(path, sub_path, &end) && (end == '/' || end == '\0'); } /* * mkdir -p */ int mkdirpat(int fd, const char *path, int mode); /* * Tests whether a path is a prefix of another path. This is different than * strstartswith because "/foo" is _not_ a path prefix of "/foobar", since they * refer to different directories. */ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); int fd_has_data(int lfd); int make_yard(char *path); static inline int sk_wait_data(int sk) { struct pollfd pfd = {sk, POLLIN, 0}; return poll(&pfd, 1, -1); } void tcp_nodelay(int sk, bool on); void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); int xatol(const char *string, long *number); int xatoi(const char *string, int *number); char *xstrcat(char *str, const char *fmt, ...) __attribute__ ((__format__ (__printf__, 2, 3))); char *xsprintf(const char *fmt, ...) 
__attribute__ ((__format__ (__printf__, 1, 2))); void print_data(unsigned long addr, unsigned char *data, size_t size); int setup_tcp_server(char *type); int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk); int setup_tcp_client(char *addr); #define LAST_PID_PATH "sys/kernel/ns_last_pid" #define PID_MAX_PATH "sys/kernel/pid_max" #define block_sigmask(saved_mask, sig_mask) ({ \ sigset_t ___blocked_mask; \ int ___ret = 0; \ sigemptyset(&___blocked_mask); \ sigaddset(&___blocked_mask, sig_mask); \ if (sigprocmask(SIG_BLOCK, &___blocked_mask, saved_mask) == -1) { \ pr_perror("Can not set mask of blocked signals"); \ ___ret = -1; \ } \ ___ret; \ }) #define restore_sigmask(saved_mask) ({ \ int ___ret = 0; \ if (sigprocmask(SIG_SETMASK, saved_mask, NULL) == -1) { \ pr_perror("Can not unset mask of blocked signals"); \ ___ret = -1; \ } \ ___ret; \ }) /* * Helpers to organize asynchronous reading from a bunch * of file descriptors. */ #include struct epoll_rfd { int fd; int (*revent)(struct epoll_rfd *); }; extern int epoll_add_rfd(int epfd, struct epoll_rfd *); extern int epoll_del_rfd(int epfd, struct epoll_rfd *rfd); extern int epoll_run_rfds(int epfd, struct epoll_event *evs, int nr_fds, int tmo); extern int epoll_prepare(int nr_events, struct epoll_event **evs); #endif /* __CR_UTIL_H__ */ criu-3.6/criu/include/uts_ns.h000066400000000000000000000002711317335042600164010ustar00rootroot00000000000000#ifndef __CR_UTS_NS_H__ #define __CR_UTS_NS_H__ extern int dump_uts_ns(int ns_id); extern int prepare_utsns(int pid); extern struct ns_desc uts_ns_desc; #endif /* __CR_UTS_NS_H__ */ criu-3.6/criu/include/vdso.h000066400000000000000000000016671317335042600160530ustar00rootroot00000000000000#ifndef __CR_VDSO_H__ #define __CR_VDSO_H__ #include #include #include "config.h" #ifdef CONFIG_VDSO #include "util-vdso.h" extern struct vdso_maps vdso_maps; extern struct vdso_maps vdso_maps_compat; extern int vdso_init_dump(void); extern int vdso_init_restore(void); extern int 
kerndat_vdso_fill_symtable(void); extern int kerndat_vdso_preserves_hint(void); extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list *vma_area_list); #ifdef CONFIG_COMPAT extern void compat_vdso_helper(struct vdso_maps *native, int pipe_fd, int err_fd, void *vdso_buf, size_t buf_size); #endif #else /* CONFIG_VDSO */ #define vdso_init_dump() (0) #define vdso_init_restore() (0) #define kerndat_vdso_fill_symtable() (0) #define kerndat_vdso_preserves_hint() (0) #define parasite_fixup_vdso(ctl, pid, vma_area_list) (0) #endif /* CONFIG_VDSO */ #endif /* __CR_VDSO_H__ */ criu-3.6/criu/include/vma.h000066400000000000000000000075301317335042600156560ustar00rootroot00000000000000#ifndef __CR_VMA_H__ #define __CR_VMA_H__ #include "image.h" #include "common/list.h" #include "images/vma.pb-c.h" #include struct vm_area_list { struct list_head h; unsigned nr; unsigned int nr_aios; unsigned long priv_size; /* nr of pages in private VMAs */ unsigned long priv_longest; /* nr of pages in longest private VMA */ unsigned long shared_longest; /* nr of pages in longest shared VMA */ }; #define VM_AREA_LIST(name) struct vm_area_list name = { .h = LIST_HEAD_INIT(name.h), .nr = 0, } static inline void vm_area_list_init(struct vm_area_list *vml) { INIT_LIST_HEAD(&vml->h); vml->nr = 0; vml->priv_size = 0; vml->priv_longest = 0; vml->shared_longest = 0; } struct file_desc; struct vma_area { struct list_head list; VmaEntry *e; union { struct /* for dump */ { int vm_socket_id; char *aufs_rpath; /* path from aufs root */ char *aufs_fpath; /* full path from global root */ /* * When several subsequent vmas have the same * dev:ino pair all 'tail' ones set this to true * and the vmst points to the head's stat buf. 
*/ bool file_borrowed; struct stat *vmst; int mnt_id; }; struct /* for restore */ { int (*vm_open)(int pid, struct vma_area *vma); struct file_desc *vmfd; struct vma_area *pvma; /* parent for inherited VMAs */ unsigned long *page_bitmap; /* existent pages */ unsigned long premmaped_addr; /* restore only */ /* * Some notes about pvma, page_bitmap and premmaped_addr bits * above. * * The pvma is set in prepare_cow_vmas() when we resolve which * VMAs _may_ inherit pages from each other. * The page_bitmap and premmaped_addr are set in prepare_mappings() * when the respective VMAs get mmap-ed or mremap-ed. * These VMAs are then inherited during fork_with_pid()-s * called from create_children_and_session(). */ }; }; }; #define VMA_COW_ROOT ((struct vma_area *)1) typedef int (*dump_filemap_t)(struct vma_area *vma_area, int fd); extern struct vma_area *alloc_vma_area(void); extern int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t cb); extern void free_mappings(struct vm_area_list *vma_area_list); extern int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t cb); extern int parse_self_maps_lite(struct vm_area_list *vms); #define vma_area_is(vma_area, s) vma_entry_is((vma_area)->e, s) #define vma_area_len(vma_area) vma_entry_len((vma_area)->e) #define vma_entry_is(vma, s) (((vma)->status & (s)) == (s)) #define vma_entry_len(vma) ((vma)->end - (vma)->start) /* * vma_premmaped_start() can be used only in restorer. * In other cases vma_area->premmaped_addr must be used. * This hack is required, because vma_area isn't tranfered in restorer and * shmid is used to determing which vma-s are cowed. 
*/ #define vma_premmaped_start(vma) ((vma)->shmid) static inline int in_vma_area(struct vma_area *vma, unsigned long addr) { return addr >= (unsigned long)vma->e->start && addr < (unsigned long)vma->e->end; } static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size) { return (vma_entry_is(entry, VMA_AREA_REGULAR) && (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && (entry->end <= task_size)) || vma_entry_is(entry, VMA_AREA_AIORING); } static inline bool vma_area_is_private(struct vma_area *vma, unsigned long task_size) { return vma_entry_is_private(vma->e, task_size); } static inline struct vma_area *vma_next(struct vma_area *vma) { return list_entry(vma->list.next, struct vma_area, list); } static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL))); } #endif /* __CR_VMA_H__ */ criu-3.6/criu/include/xmalloc.h000066400000000000000000000000551317335042600165250ustar00rootroot00000000000000#include "log.h" #include "common/xmalloc.h" criu-3.6/criu/ipc_ns.c000066400000000000000000000501331317335042600147130ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "util.h" #include "cr_options.h" #include "imgset.h" #include "namespaces.h" #include "sysctl.h" #include "ipc_ns.h" #include "shmem.h" #include "protobuf.h" #include "images/ipc-var.pb-c.h" #include "images/ipc-shm.pb-c.h" #include "images/ipc-sem.pb-c.h" #include "images/ipc-msg.pb-c.h" #if defined (__GLIBC__) && __GLIBC__ >= 2 #define KEY __key #else #define KEY key #endif #ifndef MSGMAX #define MSGMAX 8192 #endif #ifndef MSG_COPY #define MSG_COPY 040000 #endif static void pr_ipc_desc_entry(unsigned int loglevel, const IpcDescEntry *desc) { print_on_level(loglevel, "id: %-10d key: %#08x uid: %-10d gid: %-10d " "cuid: %-10d 
cgid: %-10d mode: %-10o ", desc->id, desc->key, desc->uid, desc->gid, desc->cuid, desc->cgid, desc->mode); } static void fill_ipc_desc(int id, IpcDescEntry *desc, const struct ipc_perm *ipcp) { desc->id = id; desc->key = ipcp->KEY; desc->uid = userns_uid(ipcp->uid); desc->gid = userns_gid(ipcp->gid); desc->cuid = userns_uid(ipcp->cuid); desc->cgid = userns_gid(ipcp->cgid); desc->mode = ipcp->mode; } static void pr_ipc_sem_array(unsigned int loglevel, int nr, u16 *values) { while (nr--) print_on_level(loglevel, " %-5d", values[nr]); print_on_level(loglevel, "\n"); } #define pr_info_ipc_sem_array(nr, values) pr_ipc_sem_array(LOG_INFO, nr, values) static void pr_info_ipc_sem_entry(const IpcSemEntry *sem) { pr_ipc_desc_entry(LOG_INFO, sem->desc); print_on_level(LOG_INFO, "nsems: %-10d\n", sem->nsems); } static int dump_ipc_sem_set(struct cr_img *img, const IpcSemEntry *sem) { size_t rounded; int ret, size; u16 *values; size = sizeof(u16) * sem->nsems; rounded = round_up(size, sizeof(u64)); values = xmalloc(rounded); if (values == NULL) { pr_err("Failed to allocate memory for semaphore set values\n"); ret = -ENOMEM; goto out; } ret = semctl(sem->desc->id, 0, GETALL, values); if (ret < 0) { pr_perror("Failed to get semaphore set values"); ret = -errno; goto out; } pr_info_ipc_sem_array(sem->nsems, values); memzero((void *)values + size, rounded - size); ret = write_img_buf(img, values, rounded); if (ret < 0) { pr_err("Failed to write IPC message data\n"); goto out; } out: xfree(values); return ret; } static int dump_ipc_sem_desc(struct cr_img *img, int id, const struct semid_ds *ds) { IpcSemEntry sem = IPC_SEM_ENTRY__INIT; IpcDescEntry desc = IPC_DESC_ENTRY__INIT; int ret; sem.desc = &desc; sem.nsems = ds->sem_nsems; fill_ipc_desc(id, sem.desc, &ds->sem_perm); pr_info_ipc_sem_entry(&sem); ret = pb_write_one(img, &sem, PB_IPC_SEM); if (ret < 0) { pr_err("Failed to write IPC semaphores set\n"); return ret; } return dump_ipc_sem_set(img, &sem); } static int 
dump_ipc_sem(struct cr_img *img) { int i, maxid; struct seminfo info; int slot; maxid = semctl(0, 0, SEM_INFO, &info); if (maxid < 0) { pr_perror("semctl failed"); return -errno; } pr_info("IPC semaphore sets: %d\n", info.semusz); for (i = 0, slot = 0; i <= maxid; i++) { struct semid_ds ds; int id, ret; id = semctl(i, 0, SEM_STAT, &ds); if (id < 0) { if (errno == EINVAL) continue; pr_perror("Failed to get stats for IPC semaphore set"); break; } ret = dump_ipc_sem_desc(img, id, &ds); if (!ret) slot++; } if (slot != info.semusz) { pr_err("Failed to collect %d (only %d succeeded)\n", info.semusz, slot); return -EFAULT; } return info.semusz; } static void pr_info_ipc_msg(int nr, const IpcMsg *msg) { print_on_level(LOG_INFO, " %-5d: type: %-20"PRId64" size: %-10d\n", nr++, msg->mtype, msg->msize); } static void pr_info_ipc_msg_entry(const IpcMsgEntry *msg) { pr_ipc_desc_entry(LOG_INFO, msg->desc); print_on_level(LOG_INFO, "qbytes: %-10d qnum: %-10d\n", msg->qbytes, msg->qnum); } static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq, unsigned int msg_nr) { struct msgbuf *message = NULL; unsigned int msgmax; int ret, msg_cnt = 0; struct sysctl_req req[] = { { "kernel/msgmax", &msgmax, CTL_U32 }, }; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to read max IPC message size\n"); goto err; } msgmax += sizeof(struct msgbuf); message = xmalloc(round_up(msgmax, sizeof(u64))); if (message == NULL) { pr_err("Failed to allocate memory for IPC message\n"); return -ENOMEM; } for (msg_cnt = 0; msg_cnt < msg_nr; msg_cnt++) { IpcMsg msg = IPC_MSG__INIT; size_t rounded; ret = msgrcv(msq->desc->id, message, msgmax, msg_cnt, IPC_NOWAIT | MSG_COPY); if (ret < 0) { pr_perror("Failed to copy IPC message"); goto err; } msg.msize = ret; msg.mtype = message->mtype; pr_info_ipc_msg(msg_cnt, &msg); ret = pb_write_one(img, &msg, PB_IPCNS_MSG); if (ret < 0) { pr_err("Failed to write IPC message header\n"); break; } rounded = 
round_up(msg.msize, sizeof(u64)); memzero(((void *)message->mtext + msg.msize), rounded - msg.msize); ret = write_img_buf(img, message->mtext, rounded); if (ret < 0) { pr_err("Failed to write IPC message data\n"); break; } } ret = 0; err: xfree(message); return ret; } static int dump_ipc_msg_queue(struct cr_img *img, int id, const struct msqid_ds *ds) { IpcMsgEntry msg = IPC_MSG_ENTRY__INIT; IpcDescEntry desc = IPC_DESC_ENTRY__INIT; int ret; msg.desc = &desc; fill_ipc_desc(id, msg.desc, &ds->msg_perm); msg.qbytes = ds->msg_qbytes; msg.qnum = ds->msg_qnum; pr_info_ipc_msg_entry(&msg); ret = pb_write_one(img, &msg, PB_IPCNS_MSG_ENT); if (ret < 0) { pr_err("Failed to write IPC message queue\n"); return ret; } return dump_ipc_msg_queue_messages(img, &msg, ds->msg_qnum); } static int dump_ipc_msg(struct cr_img *img) { int i, maxid; struct msginfo info; int slot; maxid = msgctl(0, MSG_INFO, (struct msqid_ds *)&info); if (maxid < 0) { pr_perror("msgctl failed"); return -errno; } pr_info("IPC message queues: %d\n", info.msgpool); for (i = 0, slot = 0; i <= maxid; i++) { struct msqid_ds ds; int id, ret; id = msgctl(i, MSG_STAT, &ds); if (id < 0) { if (errno == EINVAL) continue; pr_perror("Failed to get stats for IPC message queue"); break; } ret = dump_ipc_msg_queue(img, id, &ds); if (!ret) slot++; } if (slot != info.msgpool) { pr_err("Failed to collect %d message queues (only %d succeeded)\n", info.msgpool, slot); return -EFAULT; } return info.msgpool; } static void pr_info_ipc_shm(const IpcShmEntry *shm) { pr_ipc_desc_entry(LOG_INFO, shm->desc); print_on_level(LOG_INFO, "size: %-10"PRIu64"\n", shm->size); } #define NR_MANDATORY_IPC_SYSCTLS 9 static int ipc_sysctl_req(IpcVarEntry *e, int op) { struct sysctl_req req[] = { { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, { "kernel/msgmnb", &e->msg_ctlmnb, CTL_U32 }, { "kernel/auto_msgmni", &e->auto_msgmni, CTL_U32 }, { "kernel/msgmni", &e->msg_ctlmni, CTL_U32 }, { 
"kernel/shmmax", &e->shm_ctlmax, CTL_U64 }, { "kernel/shmall", &e->shm_ctlall, CTL_U64 }, { "kernel/shmmni", &e->shm_ctlmni, CTL_U32 }, { "kernel/shm_rmid_forced", &e->shm_rmid_forced, CTL_U32 }, /* We have 9 mandatory sysctls above and 8 optional below */ { "fs/mqueue/queues_max", &e->mq_queues_max, CTL_U32 }, { "fs/mqueue/msg_max", &e->mq_msg_max, CTL_U32 }, { "fs/mqueue/msgsize_max", &e->mq_msgsize_max, CTL_U32 }, { "fs/mqueue/msg_default", &e->mq_msg_default, CTL_U32 }, { "fs/mqueue/msgsize_default", &e->mq_msgsize_default, CTL_U32 }, { "kernel/msg_next_id", &e->msg_next_id, CTL_U32 }, { "kernel/sem_next_id", &e->sem_next_id, CTL_U32 }, { "kernel/shm_next_id", &e->shm_next_id, CTL_U32 }, }; int nr = NR_MANDATORY_IPC_SYSCTLS; /* Skip sysctls which can't be set or haven't existed on dump */ if (access("/proc/sys/fs/mqueue", X_OK)) pr_info("Mqueue sysctls are missing\n"); else { nr += 3; if (e->has_mq_msg_default) { req[nr++] = req[12]; req[nr++] = req[13]; } } if (e->has_msg_next_id) req[nr++] = req[14]; if (e->has_sem_next_id) req[nr++] = req[15]; if (e->has_shm_next_id) req[nr++] = req[16]; return sysctl_op(req, nr, op, CLONE_NEWIPC); } static int dump_ipc_shm_pages(const IpcShmEntry *shm) { int ret; void *data; data = shmat(shm->desc->id, NULL, SHM_RDONLY); if (data == (void *)-1) { pr_perror("Failed to attach IPC shared memory"); return -errno; } ret = dump_one_sysv_shmem(data, shm->size, shm->desc->id); if (shmdt(data)) { pr_perror("Failed to detach IPC shared memory"); return -errno; } return ret; } static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *ds) { IpcShmEntry shm = IPC_SHM_ENTRY__INIT; IpcDescEntry desc = IPC_DESC_ENTRY__INIT; int ret; shm.desc = &desc; shm.size = ds->shm_segsz; shm.has_in_pagemaps = true; shm.in_pagemaps = true; fill_ipc_desc(id, shm.desc, &ds->shm_perm); pr_info_ipc_shm(&shm); ret = pb_write_one(img, &shm, PB_IPC_SHM); if (ret < 0) { pr_err("Failed to write IPC shared memory segment\n"); return ret; } 
return dump_ipc_shm_pages(&shm); } static int dump_ipc_shm(struct cr_img *img) { int i, maxid, slot; struct shm_info info; maxid = shmctl(0, SHM_INFO, (void *)&info); if (maxid < 0) { pr_perror("shmctl(SHM_INFO) failed"); return -errno; } pr_info("IPC shared memory segments: %d\n", info.used_ids); for (i = 0, slot = 0; i <= maxid; i++) { struct shmid_ds ds; int id, ret; id = shmctl(i, SHM_STAT, &ds); if (id < 0) { if (errno == EINVAL) continue; pr_perror("Failed to get stats for IPC shared memory"); break; } ret = dump_ipc_shm_seg(img, id, &ds); if (ret < 0) return ret; slot++; } if (slot != info.used_ids) { pr_err("Failed to collect %d (only %d succeeded)\n", info.used_ids, slot); return -EFAULT; } return 0; } static int dump_ipc_var(struct cr_img *img) { IpcVarEntry var = IPC_VAR_ENTRY__INIT; int ret = -1; var.n_sem_ctls = 4; var.sem_ctls = xmalloc(pb_repeated_size(&var, sem_ctls)); if (!var.sem_ctls) goto err; var.has_mq_msg_default = true; var.has_mq_msgsize_default = true; var.has_msg_next_id = true; var.has_sem_next_id = true; var.has_shm_next_id = true; ret = ipc_sysctl_req(&var, CTL_READ); if (ret < 0) { pr_err("Failed to read IPC variables\n"); goto err; } /* * One can not write to msg_next_xxx sysctls -1, * which is their initial value */ if (var.msg_next_id == -1) var.has_msg_next_id = false; if (var.sem_next_id == -1) var.has_sem_next_id = false; if (var.shm_next_id == -1) var.has_shm_next_id = false; ret = pb_write_one(img, &var, PB_IPC_VAR); if (ret < 0) { pr_err("Failed to write IPC variables\n"); goto err; } err: xfree(var.sem_ctls); return ret; } static int dump_ipc_data(const struct cr_imgset *imgset) { int ret; ret = dump_ipc_var(img_from_set(imgset, CR_FD_IPC_VAR)); if (ret < 0) return ret; ret = dump_ipc_shm(img_from_set(imgset, CR_FD_IPCNS_SHM)); if (ret < 0) return ret; ret = dump_ipc_msg(img_from_set(imgset, CR_FD_IPCNS_MSG)); if (ret < 0) return ret; ret = dump_ipc_sem(img_from_set(imgset, CR_FD_IPCNS_SEM)); if (ret < 0) return ret; return 
0; } int dump_ipc_ns(int ns_id) { int ret; struct cr_imgset *imgset; imgset = cr_imgset_open(ns_id, IPCNS, O_DUMP); if (imgset == NULL) return -1; ret = dump_ipc_data(imgset); if (ret < 0) { pr_err("Failed to write IPC namespace data\n"); goto err; } err: close_cr_imgset(&imgset); return ret < 0 ? -1 : 0; } static int prepare_ipc_sem_values(struct cr_img *img, const IpcSemEntry *sem) { int ret, size; u16 *values; size = round_up(sizeof(u16) * sem->nsems, sizeof(u64)); values = xmalloc(size); if (values == NULL) { pr_err("Failed to allocate memory for semaphores set values\n"); ret = -ENOMEM; goto out; } ret = read_img_buf(img, values, size); if (ret < 0) { pr_err("Failed to allocate memory for semaphores set values\n"); ret = -ENOMEM; goto out; } pr_info_ipc_sem_array(sem->nsems, values); ret = semctl(sem->desc->id, 0, SETALL, values); if (ret < 0) { pr_perror("Failed to set semaphores set values"); ret = -errno; } out: xfree(values); return ret; } static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { { "kernel/sem_next_id", &sem->desc->id, CTL_U32 }, }; struct semid_ds semid; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to set desired IPC sem ID\n"); return ret; } id = semget(sem->desc->key, sem->nsems, sem->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create sem set"); return -errno; } if (id != sem->desc->id) { pr_err("Failed to restore sem id (%d instead of %d)\n", id, sem->desc->id); return -EFAULT; } ret = semctl(id, sem->nsems, IPC_STAT, &semid); if (ret == -1) { pr_err("Failed to get sem stat structure\n"); return -EFAULT; } semid.sem_perm.uid = sem->desc->uid; semid.sem_perm.gid = sem->desc->gid; ret = semctl(id, sem->nsems, IPC_SET, &semid); if (ret == -1) { pr_err("Failed to set sem uid and gid\n"); return -EFAULT; } ret = prepare_ipc_sem_values(img, sem); if (ret < 0) { pr_err("Failed to update sem pages\n"); 
return ret; } return 0; } static int prepare_ipc_sem(int pid) { int ret; struct cr_img *img; pr_info("Restoring IPC semaphores sets\n"); img = open_image(CR_FD_IPCNS_SEM, O_RSTR, pid); if (!img) return -1; while (1) { IpcSemEntry *sem; ret = pb_read_one_eof(img, &sem, PB_IPC_SEM); if (ret < 0) { ret = -EIO; goto err; } if (ret == 0) break; pr_info_ipc_sem_entry(sem); ret = prepare_ipc_sem_desc(img, sem); ipc_sem_entry__free_unpacked(sem, NULL); if (ret < 0) { pr_err("Failed to prepare semaphores set\n"); goto err; } } close_image(img); return 0; err: close_image(img); return ret; } static int prepare_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq) { IpcMsg *msg = NULL; int msg_nr = 0; int ret = 0; while (msg_nr < msq->qnum) { struct msgbuf { long mtype; char mtext[MSGMAX]; } data; ret = pb_read_one(img, &msg, PB_IPCNS_MSG); if (ret <= 0) return -EIO; pr_info_ipc_msg(msg_nr, msg); if (msg->msize > MSGMAX) { ret = -1; pr_err("Unsupported message size: %d (MAX: %d)\n", msg->msize, MSGMAX); break; } ret = read_img_buf(img, data.mtext, round_up(msg->msize, sizeof(u64))); if (ret < 0) { pr_err("Failed to read IPC message data\n"); break; } data.mtype = msg->mtype; ret = msgsnd(msq->desc->id, &data, msg->msize, IPC_NOWAIT); if (ret < 0) { pr_perror("Failed to send IPC message"); ret = -errno; break; } msg_nr++; } if (msg) ipc_msg__free_unpacked(msg, NULL); return ret; } static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { { "kernel/msg_next_id", &msq->desc->id, CTL_U32 }, }; struct msqid_ds msqid; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to set desired IPC msg ID\n"); return ret; } id = msgget(msq->desc->key, msq->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create msg set"); return -errno; } if (id != msq->desc->id) { pr_err("Failed to restore msg id (%d instead of %d)\n", id, msq->desc->id); return 
-EFAULT; } ret = msgctl(id, IPC_STAT, &msqid); if (ret == -1) { pr_err("Failed to get msq stat structure\n"); return -EFAULT; } msqid.msg_perm.uid = msq->desc->uid; msqid.msg_perm.gid = msq->desc->gid; ret = msgctl(id, IPC_SET, &msqid); if (ret == -1) { pr_err("Failed to set msq queue uid and gid\n"); return -EFAULT; } ret = prepare_ipc_msg_queue_messages(img, msq); if (ret < 0) { pr_err("Failed to update message queue messages\n"); return ret; } return 0; } static int prepare_ipc_msg(int pid) { int ret; struct cr_img *img; pr_info("Restoring IPC message queues\n"); img = open_image(CR_FD_IPCNS_MSG, O_RSTR, pid); if (!img) return -1; while (1) { IpcMsgEntry *msq; ret = pb_read_one_eof(img, &msq, PB_IPCNS_MSG_ENT); if (ret < 0) { pr_err("Failed to read IPC messages queue\n"); ret = -EIO; goto err; } if (ret == 0) break; pr_info_ipc_msg_entry(msq); ret = prepare_ipc_msg_queue(img, msq); ipc_msg_entry__free_unpacked(msq, NULL); if (ret < 0) { pr_err("Failed to prepare messages queue\n"); goto err; } } close_image(img); return 0; err: close_image(img); return ret; } static int restore_content(void *data, struct cr_img *img, const IpcShmEntry *shm) { int ifd; ssize_t size, off; ifd = img_raw_fd(img); size = round_up(shm->size, sizeof(u32)); off = 0; do { ssize_t ret; ret = read(ifd, data + off, size - off); if (ret <= 0) { pr_perror("Failed to write IPC shared memory data"); return (int)ret; } off += ret; } while (off < size); return 0; } static int prepare_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm) { int ret; void *data; data = shmat(shm->desc->id, NULL, 0); if (data == (void *)-1) { pr_perror("Failed to attach IPC shared memory"); return -errno; } if (shm->has_in_pagemaps && shm->in_pagemaps) ret = restore_sysv_shmem_content(data, shm->size, shm->desc->id); else ret = restore_content(data, img, shm); if (shmdt(data)) { pr_perror("Failed to detach IPC shared memory"); return -errno; } return ret; } static int prepare_ipc_shm_seg(struct cr_img *img, const 
IpcShmEntry *shm) { int ret, id; struct sysctl_req req[] = { { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, }; struct shmid_ds shmid; if (collect_sysv_shmem(shm->desc->id, shm->size)) return -1; ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); if (ret < 0) { pr_err("Failed to set desired IPC shm ID\n"); return ret; } id = shmget(shm->desc->key, shm->size, shm->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create shm set"); return -errno; } if (id != shm->desc->id) { pr_err("Failed to restore shm id (%d instead of %d)\n", id, shm->desc->id); return -EFAULT; } ret = shmctl(id, IPC_STAT, &shmid); if (ret == -1) { pr_err("Failed to get shm stat structure\n"); return -EFAULT; } shmid.shm_perm.uid = shm->desc->uid; shmid.shm_perm.gid = shm->desc->gid; ret = shmctl(id, IPC_SET, &shmid); if (ret == -1) { pr_err("Failed to set shm uid and gid\n"); return -EFAULT; } ret = prepare_ipc_shm_pages(img, shm); if (ret < 0) { pr_err("Failed to update shm pages\n"); return ret; } return 0; } static int prepare_ipc_shm(int pid) { int ret; struct cr_img *img; pr_info("Restoring IPC shared memory\n"); img = open_image(CR_FD_IPCNS_SHM, O_RSTR, pid); if (!img) return -1; while (1) { IpcShmEntry *shm; ret = pb_read_one_eof(img, &shm, PB_IPC_SHM); if (ret < 0) { pr_err("Failed to read IPC shared memory segment\n"); ret = -EIO; goto err; } if (ret == 0) break; pr_info_ipc_shm(shm); ret = prepare_ipc_shm_seg(img, shm); ipc_shm_entry__free_unpacked(shm, NULL); if (ret < 0) { pr_err("Failed to prepare shm segment\n"); goto err; } } close_image(img); return 0; err: close_image(img); return ret; } static int prepare_ipc_var(int pid) { int ret; struct cr_img *img; IpcVarEntry *var; pr_info("Restoring IPC variables\n"); img = open_image(CR_FD_IPC_VAR, O_RSTR, pid); if (!img) return -1; ret = pb_read_one(img, &var, PB_IPC_VAR); close_image(img); if (ret <= 0) { pr_err("Failed to read IPC namespace variables\n"); return -EFAULT; } ret = 
ipc_sysctl_req(var, CTL_WRITE); ipc_var_entry__free_unpacked(var, NULL); if (ret < 0) { pr_err("Failed to prepare IPC namespace variables\n"); return -EFAULT; } return 0; } int prepare_ipc_ns(int pid) { int ret; pr_info("Restoring IPC namespace\n"); ret = prepare_ipc_var(pid); if (ret < 0) return ret; ret = prepare_ipc_shm(pid); if (ret < 0) return ret; ret = prepare_ipc_msg(pid); if (ret < 0) return ret; ret = prepare_ipc_sem(pid); if (ret < 0) return ret; return 0; } struct ns_desc ipc_ns_desc = NS_DESC_ENTRY(CLONE_NEWIPC, "ipc"); criu-3.6/criu/irmap.c000066400000000000000000000230721317335042600145520ustar00rootroot00000000000000/* * IRMAP -- inode reverse mapping. * * Helps us to map inode number (and device) back to path * so that we can restore inotify/fanotify-s. * * Scanning _is_ slow, so we limit it with hints, which are * heurisitical known places where notifies are typically put. */ #include #include #include #include #include #include #include #include "xmalloc.h" #include "irmap.h" #include "mount.h" #include "log.h" #include "util.h" #include "image.h" #include "stats.h" #include "pstree.h" #include "cr_options.h" #include "protobuf.h" #include "images/fsnotify.pb-c.h" #include "images/fh.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "irmap: " #define IRMAP_CACHE_BITS 5 #define IRMAP_CACHE_SIZE (1 << IRMAP_CACHE_BITS) #define IRMAP_CACHE_MASK (IRMAP_CACHE_SIZE - 1) static inline int irmap_hashfn(unsigned int s_dev, unsigned long i_ino) { return (s_dev + i_ino) & IRMAP_CACHE_MASK; } struct irmap { unsigned int dev; unsigned long ino; char *path; struct irmap *next; bool revalidate; int nr_kids; struct irmap *kids; }; static struct irmap *cache[IRMAP_CACHE_SIZE]; static struct irmap hints[] = { { .path = "/etc", .nr_kids = -1, }, { .path = "/var/spool", .nr_kids = -1, }, { .path = "/var/log", .nr_kids = -1, }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = 
"/usr/share/polkit-1/actions", .nr_kids = -1 }, { .path = "/lib/udev", .nr_kids = -1, }, { .path = "/.", .nr_kids = 0, }, { .path = "/no-such-path", .nr_kids = -1, }, { }, }; /* * Update inode (and device) number and cache the entry */ static int irmap_update_stat(struct irmap *i) { struct stat st; int mntns_root; unsigned hv; if (i->ino) return 0; mntns_root = get_service_fd(ROOT_FD_OFF); pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { pr_perror("Can't stat %s", i->path); return -1; } i->revalidate = false; i->dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); i->ino = st.st_ino; if (!S_ISDIR(st.st_mode)) i->nr_kids = 0; /* don't irmap_update_dir */ hv = irmap_hashfn(i->dev, i->ino); i->next = cache[hv]; cache[hv] = i; return 0; } /* * Update list of children, but don't cache any. Later * we'll scan them one-by-one and cache. */ static int irmap_update_dir(struct irmap *t) { int fd, nr = 0, mntns_root; DIR *dfd; struct dirent *de; if (t->nr_kids >= 0) return 0; mntns_root = get_service_fd(ROOT_FD_OFF); pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { pr_perror("Can't open %s", t->path); return -1; } dfd = fdopendir(fd); if (!dfd) { pr_perror("Can't opendir %s", t->path); return -1; } errno = 0; while ((de = readdir(dfd)) != NULL) { struct irmap *k; if (dir_dots(de)) continue; nr++; if (xrealloc_safe(&t->kids, nr * sizeof(struct irmap))) goto out_err; k = &t->kids[nr - 1]; k->kids = NULL; /* for xrealloc above */ k->ino = 0; /* for irmap_update_stat */ k->nr_kids = -1; /* for irmap_update_dir */ k->path = xsprintf("%s/%s", t->path, de->d_name); if (!k->path) goto out_err; } if (errno) { pr_perror("Readdir failed"); goto out_err; } closedir(dfd); close(fd); t->nr_kids = nr; return 0; out_err: xfree(t->kids); closedir(dfd); close(fd); return -1; } static struct irmap *irmap_scan(struct irmap *t, unsigned int dev, unsigned long ino) { struct irmap *c; int 
i; if (irmap_update_stat(t)) return NULL; if (t->dev == dev && t->ino == ino) return t; if (irmap_update_dir(t)) return NULL; for (i = 0; i < t->nr_kids; i++) { c = irmap_scan(&t->kids[i], dev, ino); if (c) return c; } return NULL; } static int irmap_revalidate(struct irmap *c, struct irmap **p) { struct stat st; int mntns_root; mntns_root = get_service_fd(ROOT_FD_OFF); pr_debug("Revalidate stat for %s\n", c->path); if (fstatat(mntns_root, c->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { /* File can be (re)moved, so just treat it as invalid */ pr_perror("Can't stat %s", c->path); goto invalid; } if (c->dev != MKKDEV(major(st.st_dev), minor(st.st_dev))) goto invalid; if (c->ino != st.st_ino) goto invalid; c->revalidate = false; return 0; invalid: pr_debug("\t%x:%lx is invalid\n", c->dev, c->ino); *p = c->next; xfree(c->path); xfree(c); return 1; } static bool doing_predump = false; char *irmap_lookup(unsigned int s_dev, unsigned long i_ino) { struct irmap *c, *h, **p; char *path = NULL; int hv; struct irmap_path_opt *o; pr_debug("Resolving %x:%lx path\n", s_dev, i_ino); /* * If we're in predump, then processes already run * and the root_item is already freed by that time. * But the root service fd is already set by the * irmap_predump_prep, so we just go ahead and scan. */ if (!doing_predump && __mntns_get_root_fd(root_item->pid->real) < 0) goto out; timing_start(TIME_IRMAP_RESOLVE); hv = irmap_hashfn(s_dev, i_ino); for (p = &cache[hv]; *p; ) { c = *p; if (!(c->dev == s_dev && c->ino == i_ino)) { p = &(*p)->next; continue; } if (c->revalidate && irmap_revalidate(c, p)) continue; pr_debug("\tFound %s in cache\n", c->path); path = c->path; goto out; } /* Let's scan any user provided paths first; since the user told us * about them, hopefully they're more interesting than our hints. 
*/ list_for_each_entry(o, &opts.irmap_scan_paths, node) { c = irmap_scan(o->ir, s_dev, i_ino); if (c) { pr_debug("\tScanned %s\n", c->path); path = c->path; goto out; } } for (h = hints; h->path; h++) { pr_debug("Scanning %s hint\n", h->path); c = irmap_scan(h, s_dev, i_ino); if (c) { pr_debug("\tScanned %s\n", c->path); path = c->path; goto out; } } out: timing_stop(TIME_IRMAP_RESOLVE); return path; } /* * IRMAP pre-cache -- do early irmap scan on pre-dump to reduce * the freeze time on dump */ struct irmap_predump { unsigned int dev; unsigned long ino; FhEntry fh; struct irmap_predump *next; }; static struct irmap_predump *predump_queue; int irmap_queue_cache(unsigned int dev, unsigned long ino, FhEntry *fh) { struct irmap_predump *ip; ip = xmalloc(sizeof(*ip)); if (!ip) return -1; ip->dev = dev; ip->ino = ino; ip->fh = *fh; ip->fh.handle = xmemdup(fh->handle, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); if (!ip->fh.handle) { xfree(ip); return -1; } pr_debug("Queue %x:%lx for pre-dump\n", dev, ino); ip->next = predump_queue; predump_queue = ip; return 0; } int irmap_predump_prep(void) { /* * Tasks are about to get released soon, but * we'll need to do FS scan for irmaps. In this * scan we will need to know the root dir tasks * live in. Need to make sure the respective fd * (service) is set to that root, so that the * scan works and doesn't race with the tasks * dying or changind root. */ doing_predump = true; return __mntns_get_root_fd(root_item->pid->real) < 0 ? 
-1 : 0; } int irmap_predump_run(void) { int ret = 0; struct cr_img *img; struct irmap_predump *ip; img = open_image_at(AT_FDCWD, CR_FD_IRMAP_CACHE, O_DUMP); if (!img) return -1; pr_info("Running irmap pre-dump\n"); for (ip = predump_queue; ip; ip = ip->next) { pr_debug("\tchecking %x:%lx\n", ip->dev, ip->ino); ret = check_open_handle(ip->dev, ip->ino, &ip->fh); if (ret) { pr_err("Failed to resolve %x:%lx\n", ip->dev, ip->ino); break; } if (ip->fh.path) { IrmapCacheEntry ic = IRMAP_CACHE_ENTRY__INIT; pr_info("Irmap cache %x:%lx -> %s\n", ip->dev, ip->ino, ip->fh.path); ic.dev = ip->dev; ic.inode = ip->ino; ic.path = ip->fh.path; ret = pb_write_one(img, &ic, PB_IRMAP_CACHE); if (ret) break; } } close_image(img); return ret; } static int irmap_cache_one(IrmapCacheEntry *ie) { struct irmap *ic; unsigned hv; ic = xmalloc(sizeof(*ic)); if (!ic) return -1; ic->dev = ie->dev; ic->ino = ie->inode; ic->path = xstrdup(ie->path); if (!ie->path) { xfree(ic); return -1; } ic->nr_kids = 0; /* * We've loaded entry from cache, thus we'll need to check * whether it's still valid when find it in cache. */ ic->revalidate = true; pr_debug("Pre-cache %x:%lx -> %s\n", ic->dev, ic->ino, ic->path); hv = irmap_hashfn(ic->dev, ic->ino); ic->next = cache[hv]; cache[hv] = ic; return 0; } static int open_irmap_cache(struct cr_img **img) { int dir = AT_FDCWD; pr_info("Searching irmap cache in work dir\n"); in: *img = open_image_at(dir, CR_FD_IRMAP_CACHE, O_RSTR); if (dir != AT_FDCWD) close(dir); if (empty_image(*img)) { close_image(*img); if (dir == AT_FDCWD) { pr_info("Searching irmap cache in parent\n"); dir = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); if (dir >= 0) goto in; if (errno != ENOENT) return -1; } pr_info("No irmap cache\n"); return 0; } if (!*img) return -1; pr_info("... 
done\n"); return 1; } int irmap_load_cache(void) { int ret; struct cr_img *img; ret = open_irmap_cache(&img); if (ret <= 0) return ret; pr_info("Loading irmap cache\n"); while (1) { IrmapCacheEntry *ic; ret = pb_read_one_eof(img, &ic, PB_IRMAP_CACHE); if (ret <= 0) break; ret = irmap_cache_one(ic); if (ret < 0) break; irmap_cache_entry__free_unpacked(ic, NULL); } close_image(img); return ret; } int irmap_scan_path_add(char *path) { struct irmap_path_opt *o; o = xzalloc(sizeof(*o)); if (!o) return -1; o->ir = xzalloc(sizeof(*o->ir)); if (!o->ir) { xfree(o); return -1; } o->ir->path = path; o->ir->nr_kids = -1; list_add(&o->node, &opts.irmap_scan_paths); return 0; } criu-3.6/criu/kcmp-ids.c000066400000000000000000000076011317335042600151510ustar00rootroot00000000000000#include #include #include #include "rbtree.h" #include "util.h" #include "kcmp-ids.h" /* * We track shared files by global rbtree, where each node might * be a root for subtree. The reason for that is the nature of data * we obtain from operating system. * * Basically OS provides us two ways to distinguish files * * - information obtained from fstat call * - shiny new sys_kcmp system call (which may compare the file descriptor * pointers inside the kernel and provide us order info) * * So, to speedup procedure of searching for shared file descriptors * we use both techniques. From fstat call we get that named general file * IDs (genid) which are carried in the main rbtree. * * In case if two genid are the same -- we need to use a second way and * call for sys_kcmp. Thus, if kernel tells us that files have identical * genid but in real they are different from kernel point of view -- we assign * a second unique key (subid) to such file descriptor and put it into a subtree. 
* * So the tree will look like * * (root) * genid-1 * / \ * genid-2 genid-3 * / \ / \ * * Where each genid node might be a sub-rbtree as well * * (genid-N) * / \ * subid-1 subid-2 * / \ / \ * * Carrying two rbtree at once allow us to minimize the number * of sys_kcmp syscalls, also to collect and dump file descriptors * in one pass. */ struct kid_entry { struct rb_node node; struct rb_root subtree_root; struct rb_node subtree_node; u32 subid; /* subid is always unique */ struct kid_elem elem; } __aligned(sizeof(long)); static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem *elem) { struct kid_entry *e; e = xmalloc(sizeof(*e)); if (!e) goto err; e->subid = tree->subid++; e->elem = *elem; /* Make sure no overflow here */ BUG_ON(!e->subid); rb_init_node(&e->node); rb_init_node(&e->subtree_node); e->subtree_root = RB_ROOT; rb_link_and_balance(&e->subtree_root, &e->subtree_node, NULL, &e->subtree_root.rb_node); err: return e; } static u32 kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, struct kid_elem *elem, int *new_id) { struct rb_node *node = e->subtree_root.rb_node; struct kid_entry *sub = NULL; struct rb_node **new = &e->subtree_root.rb_node; struct rb_node *parent = NULL; BUG_ON(!node); while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node); int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, tree->kcmp_type, this->elem.idx, elem->idx); parent = *new; if (ret == 1) node = node->rb_left, new = &((*new)->rb_left); else if (ret == 2) node = node->rb_right, new = &((*new)->rb_right); else if (ret == 0) return this->subid; else { pr_perror("kcmp failed: pid (%d %d) type %u idx (%u %u)", this->elem.pid, elem->pid, tree->kcmp_type, this->elem.idx, elem->idx); return 0; } } sub = alloc_kid_entry(tree, elem); if (!sub) return 0; rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new); *new_id = 1; return sub->subid; } u32 kid_generate_gen(struct kid_tree *tree, struct kid_elem *elem, 
int *new_id) { struct rb_node *node = tree->root.rb_node; struct kid_entry *e = NULL; struct rb_node **new = &tree->root.rb_node; struct rb_node *parent = NULL; while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, node); parent = *new; if (elem->genid < this->elem.genid) node = node->rb_left, new = &((*new)->rb_left); else if (elem->genid > this->elem.genid) node = node->rb_right, new = &((*new)->rb_right); else return kid_generate_sub(tree, this, elem, new_id); } e = alloc_kid_entry(tree, elem); if (!e) return 0; rb_link_and_balance(&tree->root, &e->node, parent, new); *new_id = 1; return e->subid; } criu-3.6/criu/kerndat.c000066400000000000000000000432361317335042600150760ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for sockaddr_in and inet_ntoa() */ #include #include "int.h" #include "log.h" #include "restorer.h" #include "kerndat.h" #include "fs-magic.h" #include "mem.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" #include "util.h" #include "lsm.h" #include "proc_parse.h" #include "config.h" #include "sk-inet.h" #include #include #include "netfilter.h" #include "linux/userfaultfd.h" #include "prctl.h" #include "uffd.h" #include "vdso.h" struct kerndat_s kdat = { }; static int check_pagemap(void) { int ret, fd; u64 pfn = 0; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { if (errno == EPERM) { pr_info("Pagemap disabled"); kdat.pmap = PM_DISABLED; return 0; } return -1; } /* Get the PFN of some present page. 
Stack is here, so try it :) */ ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); if (ret != sizeof(pfn)) { pr_perror("Can't read pagemap"); return -1; } close(fd); if ((pfn & PME_PFRAME_MASK) == 0) { pr_info("Pagemap provides flags only\n"); kdat.pmap = PM_FLAGS_ONLY; } else { pr_info("Pagemap is fully functional\n"); kdat.pmap = PM_FULL; } return 0; } /* * Anonymous shared mappings are backed by hidden tmpfs * mount. Find out its dev to distinguish such mappings * from real tmpfs files maps. */ static int parse_self_maps(unsigned long vm_start, dev_t *device) { FILE *maps; char buf[1024]; maps = fopen_proc(PROC_SELF, "maps"); if (maps == NULL) return -1; while (fgets(buf, sizeof(buf), maps) != NULL) { char *end, *aux; unsigned long start; int maj, min; start = strtoul(buf, &end, 16); if (vm_start > start) continue; if (vm_start < start) break; /* It's ours */ aux = strchr(end + 1, ' '); /* end prot */ aux = strchr(aux + 1, ' '); /* prot pgoff */ aux = strchr(aux + 1, ' '); /* pgoff dev */ maj = strtoul(aux + 1, &end, 16); min = strtoul(end + 1, NULL, 16); *device = makedev(maj, min); fclose(maps); return 0; } fclose(maps); return -1; } static void kerndat_mmap_min_addr(void) { /* From kernel's default CONFIG_LSM_MMAP_MIN_ADDR */ static const unsigned long default_mmap_min_addr = 65536; uint64_t value; struct sysctl_req req[] = { { .name = "vm/mmap_min_addr", .arg = &value, .type = CTL_U64, }, }; if (sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0)) { pr_warn("Can't fetch %s value, use default %#lx\n", req[0].name, (unsigned long)default_mmap_min_addr); kdat.mmap_min_addr = default_mmap_min_addr; return; } if (value < default_mmap_min_addr) { pr_debug("Adjust mmap_min_addr %#lx -> %#lx\n", (unsigned long)value, (unsigned long)default_mmap_min_addr); kdat.mmap_min_addr = default_mmap_min_addr; } else kdat.mmap_min_addr = value; pr_debug("Found mmap_min_addr %#lx\n", (unsigned long)kdat.mmap_min_addr); } static int 
kerndat_get_shmemdev(void) { void *map; char maps[128]; struct stat buf; dev_t dev; map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (map == MAP_FAILED) { pr_perror("Can't mmap memory for shmemdev test"); return -1; } sprintf(maps, "/proc/self/map_files/%lx-%lx", (unsigned long)map, (unsigned long)map + page_size()); if (stat(maps, &buf) < 0) { int e = errno; if (errno == EPERM) { /* * Kernel disables messing with map_files. * OK, let's go the slower route. */ if (parse_self_maps((unsigned long)map, &dev) < 0) { pr_err("Can't read self maps\n"); goto err; } } else { pr_perror("Can't stat self map_files %d", e); goto err; } } else dev = buf.st_dev; munmap(map, PAGE_SIZE); kdat.shmem_dev = dev; pr_info("Found anon-shmem device at %"PRIx64"\n", kdat.shmem_dev); return 0; err: munmap(map, PAGE_SIZE); return -1; } static dev_t get_host_dev(unsigned int which) { static struct kst { const char *name; const char *path; unsigned int magic; dev_t fs_dev; } kstat[KERNDAT_FS_STAT_MAX] = { [KERNDAT_FS_STAT_DEVPTS] = { .name = "devpts", .path = "/dev/pts", .magic = DEVPTS_SUPER_MAGIC, }, [KERNDAT_FS_STAT_DEVTMPFS] = { .name = "devtmpfs", .path = "/dev", .magic = TMPFS_MAGIC, }, [KERNDAT_FS_STAT_BINFMT_MISC] = { .name = "binfmt_misc", .path = "/proc/sys/fs/binfmt_misc", .magic = BINFMTFS_MAGIC, }, }; if (which >= KERNDAT_FS_STAT_MAX) { pr_err("Wrong fs type %u passed\n", which); return 0; } if (kstat[which].fs_dev == 0) { struct statfs fst; struct stat st; if (statfs(kstat[which].path, &fst)) { pr_perror("Unable to statefs %s", kstat[which].path); return 0; } /* * XXX: If the fs we need is not there, it still * may mean that it's virtualized, but just not * mounted on the host. 
*/ if (fst.f_type != kstat[which].magic) { pr_err("%s isn't mount on the host\n", kstat[which].name); return 0; } if (stat(kstat[which].path, &st)) { pr_perror("Unable to stat %s", kstat[which].path); return 0; } BUG_ON(st.st_dev == 0); kstat[which].fs_dev = st.st_dev; } return kstat[which].fs_dev; } int kerndat_fs_virtualized(unsigned int which, u32 kdev) { dev_t host_fs_dev; host_fs_dev = get_host_dev(which); if (host_fs_dev == 0) return -1; return (kdev_to_odev(kdev) == host_fs_dev) ? 0 : 1; } /* * Check whether pagemap reports soft dirty bit. Kernel has * this functionality under CONFIG_MEM_SOFT_DIRTY option. */ int kerndat_get_dirty_track(void) { char *map; int pm2; u64 pmap = 0; int ret = -1; map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (map == MAP_FAILED) { pr_perror("Can't mmap memory for pagemap test"); return ret; } /* * Kernel shows soft-dirty bits only if this soft-dirty * was at least once re-set. (this is to be removed in * a couple of kernel releases) */ ret = do_task_reset_dirty_track(getpid()); if (ret < 0) return ret; if (ret == 1) goto no_dt; ret = -1; pm2 = open_proc(PROC_SELF, "pagemap"); if (pm2 < 0) { munmap(map, PAGE_SIZE); return ret; } map[0] = '\0'; lseek(pm2, (unsigned long)map / PAGE_SIZE * sizeof(u64), SEEK_SET); ret = read(pm2, &pmap, sizeof(pmap)); if (ret < 0) pr_perror("Read pmap err!"); close(pm2); munmap(map, PAGE_SIZE); if (pmap & PME_SOFT_DIRTY) { pr_info("Dirty track supported on kernel\n"); kdat.has_dirty_track = true; } else { no_dt: pr_info("Dirty tracking support is OFF\n"); if (opts.track_mem) { pr_err("Tracking memory is not available\n"); return -1; } } return 0; } /* The page frame number (PFN) is constant for the zero page */ static int init_zero_page_pfn() { void *addr; int ret = 0; kdat.zero_page_pfn = -1; if (kdat.pmap != PM_FULL) { pr_info("Zero page detection failed, optimization turns off.\n"); return 0; } addr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | 
MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { pr_perror("Unable to map zero page"); return 0; } if (*((int *) addr) != 0) { BUG(); return -1; } ret = vaddr_to_pfn(-1, (unsigned long)addr, &kdat.zero_page_pfn); munmap(addr, PAGE_SIZE); if (kdat.zero_page_pfn == 0) ret = -1; return ret; } static int get_last_cap(void) { struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); } static bool kerndat_has_memfd_create(void) { int ret; ret = syscall(SYS_memfd_create, NULL, 0); if (ret == -1 && errno == ENOSYS) kdat.has_memfd = false; else if (ret == -1 && errno == EFAULT) kdat.has_memfd = true; else { pr_err("Unexpected error from memfd_create(NULL, 0): %m\n"); return -1; } return 0; } static int get_task_size(void) { kdat.task_size = compel_task_size(); pr_debug("Found task size of %lx\n", kdat.task_size); return 0; } int kerndat_fdinfo_has_lock() { int fd, pfd = -1, exit_code = -1, len; char buf[PAGE_SIZE]; fd = open_proc(PROC_GEN, "locks"); if (fd < 0) return -1; if (flock(fd, LOCK_SH)) { pr_perror("Can't take a lock"); goto out; } pfd = open_proc(PROC_SELF, "fdinfo/%d", fd); if (pfd < 0) goto out; len = read(pfd, buf, sizeof(buf) - 1); if (len < 0) { pr_perror("Unable to read"); goto out; } buf[len] = 0; kdat.has_fdinfo_lock = (strstr(buf, "lock:") != NULL); exit_code = 0; out: close(pfd); close(fd); return exit_code; } static int get_ipv6() { if (access("/proc/sys/net/ipv6", F_OK) < 0) { if (errno == ENOENT) { pr_debug("ipv6 is disabled\n"); kdat.ipv6 = false; return 0; } pr_perror("Unable to access /proc/sys/net/ipv6"); return -1; } kdat.ipv6 = true; return 0; } int kerndat_loginuid(void) { unsigned int saved_loginuid; int ret; kdat.luid = LUID_NONE; /* No such file: CONFIG_AUDITSYSCALL disabled */ saved_loginuid = parse_pid_loginuid(PROC_SELF, &ret, true); if (ret < 0) return 0; kdat.luid = LUID_READ; /* * From kernel v3.13-rc2 it's possible to unset loginuid value, * on that rely 
dump/restore code. * See also: marc.info/?l=git-commits-head&m=138509506407067 */ if (prepare_loginuid(INVALID_UID, LOG_WARN) < 0) return 0; /* Cleaning value back as it was */ if (prepare_loginuid(saved_loginuid, LOG_WARN) < 0) return 0; kdat.luid = LUID_FULL; return 0; } static int kerndat_iptables_has_xtlocks(void) { int fd; char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { fd = -1; pr_perror("failed to open /dev/null, using log fd for xtlocks check"); } kdat.has_xtlocks = 1; if (cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL) == -1) kdat.has_xtlocks = 0; close_safe(&fd); return 0; } int kerndat_tcp_repair(void) { int sock, clnt = -1, yes = 1, exit_code = -1; struct sockaddr_in addr; socklen_t aux; memset(&addr,0,sizeof(addr)); addr.sin_family = AF_INET; inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); addr.sin_port = 0; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } if (bind(sock, (struct sockaddr *) &addr, sizeof(addr))) { pr_perror("Unable to bind a socket"); goto err; } aux = sizeof(addr); if (getsockname(sock, (struct sockaddr *) &addr, &aux)) { pr_perror("Unable to get a socket name"); goto err; } if (listen(sock, 1)) { pr_perror("Unable to listen a socket"); goto err; } clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } if (connect(clnt, (struct sockaddr *) &addr, sizeof(addr))) { pr_perror("Unable to connect a socket"); goto err; } if (shutdown(clnt, SHUT_WR)) { pr_perror("Unable to shutdown a socket"); goto err; } if (setsockopt(clnt, SOL_TCP, TCP_REPAIR, &yes, sizeof(yes))) { if (errno != EPERM) goto err; kdat.has_tcp_half_closed = false; } else kdat.has_tcp_half_closed = true; exit_code = 0; err: close_safe(&clnt); close(sock); return exit_code; } static int kerndat_compat_restore(void) { int ret; ret = kdat_can_map_vdso(); if (ret < 0) return ret; 
kdat.can_map_vdso = !!ret; /* depends on kdat.can_map_vdso result */ kdat.compat_cr = kdat_compatible_cr(); return 0; } static int kerndat_detect_stack_guard_gap(void) { int num, ret = -1, detected = 0; unsigned long start, end; char r, w, x, s; char buf[1024]; FILE *maps; void *mem; mem = mmap(NULL, (3ul << 20), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap stack area"); return -1; } munmap(mem, (3ul << 20)); mem = mmap(mem + (2ul << 20), (1ul << 20), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_GROWSDOWN, -1, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap stack area"); return -1; } maps = fopen("/proc/self/maps", "r"); if (maps == NULL) { munmap(mem, 4096); return -1; } while (fgets(buf, sizeof(buf), maps)) { num = sscanf(buf, "%lx-%lx %c%c%c%c", &start, &end, &r, &w, &x, &s); if (num < 6) { pr_err("Can't parse: %s\n", buf); goto err; } /* * When reading /proc/$pid/[s]maps the * start/end addresses migh be cutted off * with PAGE_SIZE on kernels prior 4.12 * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete * patch released which hitted a number * of repos (Ubuntu, Fedora) where instead * of PAGE_SIZE the 1M gap is cutted off. 
*/ if (start == (unsigned long)mem) { kdat.stack_guard_gap_hidden = false; detected = 1; break; } else if (start == ((unsigned long)mem + (1ul << 20))) { pr_warn("Unsupported stack guard detected, confused but continue\n"); kdat.stack_guard_gap_hidden = true; detected = 1; break; } else if (start == ((unsigned long)mem + PAGE_SIZE)) { kdat.stack_guard_gap_hidden = true; detected = 1; break; } } if (detected) ret = 0; err: munmap(mem, (1ul << 20)); fclose(maps); return ret; } #define KERNDAT_CACHE_FILE KDAT_RUNDIR"/criu.kdat" #define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR"/.criu.kdat" static int kerndat_try_load_cache(void) { int fd, ret; fd = open(KERNDAT_CACHE_FILE, O_RDONLY); if (fd < 0) { pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); return 1; } ret = read(fd, &kdat, sizeof(kdat)); if (ret < 0) { pr_perror("Can't read kdat cache"); return -1; } close(fd); if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); unlink(KERNDAT_CACHE_FILE); return 1; } pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); return 0; } static void kerndat_save_cache(void) { int fd, ret; struct statfs s; fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some other criu * instance. That's OK, just ignore this error and * proceed. */ return; if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); goto unl; } /* * One magic to make sure we're reading the kdat file. 
* One more magic to make somehow sure we don't read kdat * from some other criu */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); else { ret = -1; errno = EIO; } if (ret < 0) { pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); unl: unlink(KERNDAT_CACHE_FILE_TMP); } } int kerndat_uffd(void) { int uffd; kdat.uffd_features = 0; uffd = uffd_open(0, &kdat.uffd_features); /* * uffd == -ENOSYS means userfaultfd is not supported on this * system and we just happily return with kdat.has_uffd = false. * Error other than -ENOSYS would mean "Houston, Houston, we * have a problem!" */ if (uffd < 0) { if (uffd == -ENOSYS) return 0; pr_err("Lazy pages are not available\n"); return -1; } kdat.has_uffd = true; /* * we have to close the uffd and reopen in later in restorer * to enable non-cooperative features */ close(uffd); return 0; } int kerndat_has_thp_disable(void) { struct bfd f; void *addr; char *str; int ret = -1; bool vma_match = false; if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { if (errno != EINVAL) return -1; pr_info("PR_SET_THP_DISABLE is not available\n"); return 0; } addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (addr == MAP_FAILED) { pr_perror("Can't mmap memory for THP disable test"); return -1; } if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) return -1; f.fd = open("/proc/self/smaps", O_RDONLY); if (f.fd < 0) { pr_perror("Can't open /proc/self/smaps"); goto out_unmap; } if (bfdopenr(&f)) goto out_unmap; while ((str = breadline(&f)) != NULL) { if (IS_ERR(str)) goto out_close; if (is_vma_range_fmt(str)) { unsigned long vma_addr; if (sscanf(str, "%lx-", &vma_addr) != 1) { pr_err("Can't parse: %s\n", str); goto out_close; } if (vma_addr == (unsigned long)addr) vma_match = true; } if (vma_match && !strncmp(str, "VmFlags: ", 9)) { u32 flags = 0; u64 madv = 0; int io_pf = 0; parse_vmflags(str, 
&flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); break; } } ret = 0; out_close: bclose(&f); out_unmap: munmap(addr, PAGE_SIZE); return ret; } int kerndat_init(void) { int ret; ret = kerndat_try_load_cache(); if (ret <= 0) return ret; preload_socket_modules(); preload_netfilter_modules(); ret = check_pagemap(); if (!ret) ret = kerndat_get_shmemdev(); if (!ret) ret = kerndat_get_dirty_track(); if (!ret) ret = init_zero_page_pfn(); if (!ret) ret = get_last_cap(); if (!ret) ret = kerndat_fdinfo_has_lock(); if (!ret) ret = get_task_size(); if (!ret) ret = get_ipv6(); if (!ret) ret = kerndat_loginuid(); if (!ret) ret = kerndat_iptables_has_xtlocks(); if (!ret) ret = kerndat_tcp_repair(); if (!ret) ret = kerndat_compat_restore(); if (!ret) ret = kerndat_has_memfd_create(); if (!ret) ret = kerndat_detect_stack_guard_gap(); if (!ret) ret = kerndat_uffd(); if (!ret) ret = kerndat_has_thp_disable(); /* Needs kdat.compat_cr filled before */ if (!ret) ret = kerndat_vdso_fill_symtable(); /* Depends on kerndat_vdso_fill_symtable() */ if (!ret) ret = kerndat_vdso_preserves_hint(); kerndat_lsm(); kerndat_mmap_min_addr(); if (!ret) kerndat_save_cache(); return ret; } criu-3.6/criu/libnetlink.c000066400000000000000000000117531317335042600156000ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "libnetlink.h" #include "util.h" static int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *, void *), int (*err_cb)(int, void *), void *arg) { struct nlmsghdr *hdr; for (hdr = (struct nlmsghdr *)buf; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) { if (hdr->nlmsg_seq != CR_NLMSG_SEQ) continue; if (hdr->nlmsg_type == NLMSG_DONE) { int *len = (int *)NLMSG_DATA(hdr); if (*len < 0) { pr_err("ERROR %d reported by netlink (%s)\n", *len, strerror(-*len)); return *len; } return 0; } if (hdr->nlmsg_type == NLMSG_ERROR) { struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(hdr); if (hdr->nlmsg_len - 
sizeof(*hdr) < sizeof(struct nlmsgerr)) { pr_err("ERROR truncated\n"); return -1; } if (err->error == 0) return 0; return err_cb(err->error, arg); } if (cb(hdr, arg)) return -1; } return 1; } static int rtnl_return_err(int err, void *arg) { pr_warn("ERROR %d reported by netlink\n", err); return err; } int do_rtnl_req(int nl, void *req, int size, int (*receive_callback)(struct nlmsghdr *h, void *), int (*error_callback)(int err, void *), void *arg) { struct msghdr msg; struct sockaddr_nl nladdr; struct iovec iov; static char buf[16384]; int err; if (!error_callback) error_callback = rtnl_return_err; memset(&msg, 0, sizeof(msg)); msg.msg_name = &nladdr; msg.msg_namelen = sizeof(nladdr); msg.msg_iov = &iov; msg.msg_iovlen = 1; memset(&nladdr, 0, sizeof(nladdr)); nladdr.nl_family = AF_NETLINK; iov.iov_base = req; iov.iov_len = size; if (sendmsg(nl, &msg, 0) < 0) { err = -errno; pr_perror("Can't send request message"); goto err; } iov.iov_base = buf; iov.iov_len = sizeof(buf); while (1) { memset(&msg, 0, sizeof(msg)); msg.msg_name = &nladdr; msg.msg_namelen = sizeof(nladdr); msg.msg_iov = &iov; msg.msg_iovlen = 1; err = recvmsg(nl, &msg, 0); if (err < 0) { if (errno == EINTR) continue; else { err = -errno; pr_perror("Error receiving nl report"); goto err; } } if (err == 0) break; if (msg.msg_flags & MSG_TRUNC) { pr_err("Message truncated\n"); err = -EMSGSIZE; goto err; } err = nlmsg_receive(buf, err, receive_callback, error_callback, arg); if (err < 0) goto err; if (err == 0) break; } return 0; err: return err; } int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, int alen) { int len = nla_attr_size(alen); struct rtattr *rta; if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) { pr_err("addattr_l ERROR: message exceeded bound of %d\n", maxlen); return -1; } rta = NLMSG_TAIL(n); rta->rta_type = type; rta->rta_len = len; memcpy(RTA_DATA(rta), data, alen); n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); return 0; } /* * Here is a 
workaround for a bug in libnl-3: * 6a8d90f5fec4 "attr: Allow attribute type 0 */ /** * Create attribute index based on a stream of attributes. * @arg tb Index array to be filled (maxtype+1 elements). * @arg maxtype Maximum attribute type expected and accepted. * @arg head Head of attribute stream. * @arg len Length of attribute stream. * @arg policy Attribute validation policy. * * Iterates over the stream of attributes and stores a pointer to each * attribute in the index array using the attribute type as index to * the array. Attribute with a type greater than the maximum type * specified will be silently ignored in order to maintain backwards * compatibility. If \a policy is not NULL, the attribute will be * validated using the specified policy. * * @see nla_validate * @return 0 on success or a negative error code. */ int __wrap_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, struct nla_policy *policy) { struct nlattr *nla; int rem; memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { int type = nla_type(nla); if (type > maxtype) continue; if (tb[type]) pr_warn("Attribute of type %#x found multiple times in message, " "previous attribute is being ignored.\n", type); tb[type] = nla; } if (rem > 0) pr_warn("netlink: %d bytes leftover after parsing " "attributes.\n", rem); return 0; } /** * parse attributes of a netlink message * @arg nlh netlink message header * @arg hdrlen length of family specific header * @arg tb destination array with maxtype+1 elements * @arg maxtype maximum attribute type to be expected * @arg policy validation policy * * See nla_parse() */ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, struct nla_policy *policy) { if (!nlmsg_valid_hdr(nlh, hdrlen)) return -NLE_MSG_TOOSHORT; return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } 
/*
 * Replace *to with the interval (*to - *from), keeping the microsecond
 * field non-negative by borrowing one second when needed.
 */
static void timediff(struct timeval *from, struct timeval *to)
{
	int borrow = to->tv_usec < from->tv_usec;

	to->tv_sec = to->tv_sec - from->tv_sec - borrow;

	if (borrow)
		to->tv_usec += 1000000 - from->tv_usec;
	else
		to->tv_usec -= from->tv_usec;
}
*/ struct str_and_lock { mutex_t l; char s[1024]; }; static struct str_and_lock *first_err; int log_keep_err(void) { first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; mutex_init(&first_err->l); first_err->s[0] = '\0'; return 0; } static void log_note_err(char *msg) { if (first_err && first_err->s[0] == '\0') { /* * In any action other than restore this locking is * actually not required, but ... it's error path * anyway, so it doesn't make much sence to try hard * and optimize this out. */ mutex_lock(&first_err->l); if (first_err->s[0] == '\0') strlcpy(first_err->s, msg, sizeof(first_err->s)); mutex_unlock(&first_err->l); } } char *log_first_err(void) { if (!first_err) return NULL; if (first_err->s[0] == '\0') return NULL; return first_err->s; } int log_init(const char *output) { int new_logfd, fd; gettimeofday(&start, NULL); reset_buf_off(); if (output && !strncmp(output, "-", 2)) { new_logfd = dup(STDOUT_FILENO); if (new_logfd < 0) { pr_perror("Cant't dup stdout stream"); return -1; } } else if (output) { new_logfd = open(output, O_CREAT|O_TRUNC|O_WRONLY|O_APPEND, 0600); if (new_logfd < 0) { pr_perror("Can't create log file %s", output); return -1; } } else { new_logfd = dup(DEFAULT_LOGFD); if (new_logfd < 0) { pr_perror("Can't dup log file"); return -1; } } fd = install_service_fd(LOG_FD_OFF, new_logfd); close(new_logfd); if (fd < 0) goto err; return 0; err: pr_perror("Log engine failure, can't duplicate descriptor"); return -1; } int log_init_by_pid(void) { char path[PATH_MAX]; /* * reset buf_off as this fn is called on each fork while * restoring process tree */ reset_buf_off(); if (!opts.log_file_per_pid) { buf_off += snprintf(buffer + buf_off, sizeof buffer - buf_off, "%6d: ", getpid()); return 0; } if (!opts.output) return 0; snprintf(path, PATH_MAX, "%s.%d", opts.output, getpid()); return log_init(path); } void log_fini(void) { close_service_fd(LOG_FD_OFF); } void log_set_loglevel(unsigned int level) { current_loglevel = 
level; } unsigned int log_get_loglevel(void) { return current_loglevel; } void vprint_on_level(unsigned int loglevel, const char *format, va_list params) { int fd, size, ret, off = 0; int __errno = errno; if (unlikely(loglevel == LOG_MSG)) { fd = STDOUT_FILENO; off = buf_off; /* skip dangling timestamp */ } else { if (loglevel > current_loglevel) return; fd = log_get_fd(); if (current_loglevel >= LOG_TIMESTAMP) print_ts(); } size = vsnprintf(buffer + buf_off, sizeof buffer - buf_off, format, params); size += buf_off; while (off < size) { ret = write(fd, buffer + off, size - off); if (ret <= 0) break; off += ret; } if (loglevel == LOG_ERROR) log_note_err(buffer + buf_off); errno = __errno; } void print_on_level(unsigned int loglevel, const char *format, ...) { va_list params; va_start(params, format); vprint_on_level(loglevel, format, params); va_end(params); } int write_pidfile(int pid) { int fd; fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600); if (fd == -1) { pr_perror("Can't open %s", opts.pidfile); return -1; } dprintf(fd, "%d", pid); close(fd); return 0; } criu-3.6/criu/lsm.c000066400000000000000000000113571317335042600142400ustar00rootroot00000000000000#include #include #include #include #include #include #include "kerndat.h" #include "config.h" #include "pstree.h" #include "util.h" #include "cr_options.h" #include "lsm.h" #include "protobuf.h" #include "images/inventory.pb-c.h" #include "images/creds.pb-c.h" #ifdef CONFIG_HAS_SELINUX #include #endif static int apparmor_get_label(pid_t pid, char **profile_name) { FILE *f; char *space; f = fopen_proc(pid, "attr/current"); if (!f) return -1; if (fscanf(f, "%ms", profile_name) != 1) { fclose(f); pr_perror("err scanfing"); return -1; } fclose(f); /* * A profile name can be followed by an enforcement mode, e.g. * lxc-default-with-nesting (enforced) * but the profile name is just the part before the space. 
*/ space = strstr(*profile_name, " "); if (space) *space = 0; /* * An "unconfined" value means there is no profile, so we don't need to * worry about trying to restore one. */ if (strcmp(*profile_name, "unconfined") == 0) { free(*profile_name); *profile_name = NULL; } return 0; } #ifdef CONFIG_HAS_SELINUX static int selinux_get_label(pid_t pid, char **output) { security_context_t ctx; char *pos, *last; int i; if (getpidcon_raw(pid, &ctx) < 0) { pr_perror("getting selinux profile failed"); return -1; } *output = NULL; /* * Since SELinux attributes can be finer grained than at the task * level, and we currently don't try to dump any of these other bits, * let's only allow unconfined profiles, which look something like: * * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 */ pos = (char*)ctx; for (i = 0; i < 3; i++) { last = pos; pos = strstr(pos, ":"); if (!pos) { pr_err("Invalid selinux context %s\n", (char *)ctx); freecon(ctx); return -1; } *pos = 0; if (!strstartswith(last, "unconfined_")) { pr_err("Non unconfined selinux contexts not supported %s\n", last); freecon(ctx); return -1; } pos++; } freecon(ctx); return 0; } #endif void kerndat_lsm(void) { if (access(AA_SECURITYFS_PATH, F_OK) == 0) { kdat.lsm = LSMTYPE__APPARMOR; return; } #ifdef CONFIG_HAS_SELINUX /* * This seems to be the canonical place to mount this fs if it is * enabled, although we may (?) want to check /selinux for posterity as * well. 
*/ if (access("/sys/fs/selinux", F_OK) == 0) { kdat.lsm = LSMTYPE__SELINUX; return; } #endif kdat.lsm = LSMTYPE__NO_LSM; } Lsmtype host_lsm_type(void) { return kdat.lsm; } int collect_lsm_profile(pid_t pid, CredsEntry *ce) { int ret; ce->lsm_profile = NULL; switch (kdat.lsm) { case LSMTYPE__NO_LSM: ret = 0; break; case LSMTYPE__APPARMOR: ret = apparmor_get_label(pid, &ce->lsm_profile); break; #ifdef CONFIG_HAS_SELINUX case LSMTYPE__SELINUX: ret = selinux_get_label(pid, &ce->lsm_profile); break; #endif default: BUG(); ret = -1; break; } if (ce->lsm_profile) pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile); return ret; } // in inventory.c extern Lsmtype image_lsm; int validate_lsm(char *lsm_profile) { if (image_lsm == LSMTYPE__NO_LSM || image_lsm == kdat.lsm) return 0; /* * This is really only a problem if the processes have actually * specified an LSM profile. If not, we won't restore anything anyway, * so it's fine. */ if (lsm_profile) { pr_err("mismatched lsm types and lsm profile specified\n"); return -1; } return 0; } int render_lsm_profile(char *profile, char **val) { *val = NULL; switch (kdat.lsm) { case LSMTYPE__APPARMOR: if (strcmp(profile, "unconfined") != 0 && asprintf(val, "changeprofile %s", profile) < 0) { pr_err("allocating lsm profile failed\n"); *val = NULL; return -1; } break; case LSMTYPE__SELINUX: if (asprintf(val, "%s", profile) < 0) { *val = NULL; return -1; } break; default: pr_err("can't render profile %s for lsmtype %d\n", profile, LSMTYPE__NO_LSM); return -1; } return 0; } int lsm_check_opts(void) { char *aux; if (!opts.lsm_supplied) return 0; aux = strchr(opts.lsm_profile, ':'); if (aux == NULL) { pr_err("invalid argument %s for --lsm-profile\n", opts.lsm_profile); return -1; } *aux = '\0'; aux++; if (strcmp(opts.lsm_profile, "apparmor") == 0) { if (kdat.lsm != LSMTYPE__APPARMOR) { pr_err("apparmor LSM specified but apparmor not supported by kernel\n"); return -1; } opts.lsm_profile = aux; } else if (strcmp(opts.lsm_profile, 
"selinux") == 0) { if (kdat.lsm != LSMTYPE__SELINUX) { pr_err("selinux LSM specified but selinux not supported by kernel\n"); return -1; } opts.lsm_profile = aux; } else if (strcmp(opts.lsm_profile, "none") == 0) { opts.lsm_profile = NULL; } else { pr_err("unknown lsm %s\n", opts.lsm_profile); return -1; } return 0; } criu-3.6/criu/mem.c000066400000000000000000000721441317335042600142240ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "types.h" #include "cr_options.h" #include "servicefd.h" #include "mem.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" #include "page-xfer.h" #include "log.h" #include "kerndat.h" #include "stats.h" #include "vma.h" #include "shmem.h" #include "uffd.h" #include "pstree.h" #include "restorer.h" #include "rst-malloc.h" #include "bitmap.h" #include "sk-packet.h" #include "files-reg.h" #include "pagemap-cache.h" #include "fault-injection.h" #include "prctl.h" #include #include "protobuf.h" #include "images/pagemap.pb-c.h" static int task_reset_dirty_track(int pid) { int ret; if (!opts.track_mem) return 0; BUG_ON(!kdat.has_dirty_track); ret = do_task_reset_dirty_track(pid); BUG_ON(ret == 1); return ret; } int do_task_reset_dirty_track(int pid) { int fd, ret; char cmd[] = "4"; pr_info("Reset %d's dirty tracking\n", pid); fd = __open_proc(pid, EACCES, O_RDWR, "clear_refs"); if (fd < 0) return errno == EACCES ? 1 : -1; ret = write(fd, cmd, sizeof(cmd)); if (ret < 0) { if (errno == EINVAL) /* No clear-soft-dirty in kernel */ ret = 1; else { pr_perror("Can't reset %d's dirty memory tracker (%d)", pid, errno); ret = -1; } } else { pr_info(" ... 
done\n"); ret = 0; } close(fd); return ret; } unsigned long dump_pages_args_size(struct vm_area_list *vmas) { /* In the worst case I need one iovec for each page */ return sizeof(struct parasite_dump_pages_args) + vmas->nr * sizeof(struct parasite_vma_entry) + (vmas->priv_size + 1) * sizeof(struct iovec); } static inline bool __page_is_zero(u64 pme) { return (pme & PME_PFRAME_MASK) == kdat.zero_page_pfn; } static inline bool __page_in_parent(bool dirty) { /* * If we do memory tracking, but w/o parent images, * then we have to dump all memory */ return opts.track_mem && opts.img_parent && !dirty; } bool should_dump_page(VmaEntry *vmae, u64 pme) { #ifdef CONFIG_VDSO /* * vDSO area must be always dumped because on restore * we might need to generate a proxy. */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; /* * In turn VVAR area is special and referenced from * vDSO area by IP addressing (at least on x86) thus * never ever dump its content but always use one provided * by the kernel on restore, ie runtime VVAR area must * be remapped into proper place.. */ if (vma_entry_is(vmae, VMA_AREA_VVAR)) return false; #endif /* * Optimisation for private mapping pages, that haven't * yet being COW-ed */ if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) return true; return false; } bool page_is_zero(u64 pme) { return __page_is_zero(pme); } bool page_in_parent(bool dirty) { return __page_in_parent(dirty); } /* * This routine finds out what memory regions to grab from the * dumpee. The iovs generated are then fed into vmsplice to * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but * the memory contents is present in the pagent image set. 
*/ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent) { u64 *at = &map[PAGE_PFN(*off)]; unsigned long pfn, nr_to_scan; unsigned long pages[3] = {}; nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; for (pfn = 0; pfn < nr_to_scan; pfn++) { unsigned long vaddr; unsigned int ppb_flags = 0; int ret; if (!should_dump_page(vma->e, at[pfn])) continue; vaddr = vma->e->start + *off + pfn * PAGE_SIZE; if (vma_entry_can_be_lazy(vma->e)) ppb_flags |= PPB_LAZY; /* * If we're doing incremental dump (parent images * specified) and page is not soft-dirty -- we dump * hole and expect the parent images to contain this * page. The latter would be checked in page-xfer. */ if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); pages[0]++; } else { ret = page_pipe_add_page(pp, vaddr, ppb_flags); if (ppb_flags & PPB_LAZY && opts.lazy_pages) pages[1]++; else pages[2]++; } if (ret) { *off += pfn * PAGE_SIZE; return ret; } } *off += pfn * PAGE_SIZE; cnt_add(CNT_PAGES_SCANNED, nr_to_scan); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes\n", pages[2] + pages[1], pages[1], pages[0]); return 0; } static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_list, bool skip_non_trackable) { struct parasite_dump_pages_args *args; struct parasite_vma_entry *p_vma; struct vma_area *vma; args = compel_parasite_args_s(ctl, dump_pages_args_size(vma_area_list)); p_vma = pargs_vmas(args); args->nr_vmas = 0; list_for_each_entry(vma, &vma_area_list->h, list) { if (!vma_area_is_private(vma, kdat.task_size)) continue; /* * Kernel write to aio ring is not soft-dirty tracked, * so we ignore them at pre-dump. 
*/ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; if (vma->e->prot & PROT_READ) continue; p_vma->start = vma->e->start; p_vma->len = vma_area_len(vma); p_vma->prot = vma->e->prot; args->nr_vmas++; p_vma++; } return args; } static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct parasite_dump_pages_args *args) { struct page_pipe_buf *ppb; int ret = 0; debug_show_page_pipe(pp); /* Step 2 -- grab pages into page-pipe */ list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); if (ret < 0) return -1; ret = compel_util_send_fd(ctl, ppb->p[1]); if (ret) return -1; ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl); if (ret < 0) return -1; args->off += args->nr_segs; } return 0; } static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer) { int ret; /* * Step 3 -- write pages into image (or delay writing for * pre-dump action (see pre_dump_one_task) */ timing_start(TIME_MEMWRITE); ret = page_xfer_dump_pages(xfer, pp); timing_stop(TIME_MEMWRITE); return ret; } static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasite_dump_pages_args *args, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl) { pmc_t pmc = PMC_INIT; struct page_pipe *pp; struct vma_area *vma_area; struct page_xfer xfer = { .parent = NULL }; int ret = -1; unsigned cpp_flags = 0; unsigned long pmc_size; pr_info("\n"); pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real); pr_info("----------------------------------------\n"); timing_start(TIME_MEMDUMP); pr_debug(" Private vmas %lu/%lu pages\n", vma_area_list->priv_longest, vma_area_list->priv_size); /* * Step 0 -- prepare */ pmc_size = max(vma_area_list->priv_longest, vma_area_list->shared_longest); if 
(pmc_init(&pmc, item->pid->real, &vma_area_list->h, pmc_size * PAGE_SIZE)) return -1; ret = -1; if (!(mdc->pre_dump || mdc->lazy)) /* * Chunk mode pushes pages portion by portion. This mode * only works when we don't need to keep pp for later * use, i.e. on non-lazy non-predump. */ cpp_flags |= PP_CHUNK_MODE; pp = create_page_pipe(vma_area_list->priv_size, mdc->lazy ? NULL : pargs_iovs(args), cpp_flags); if (!pp) goto out; if (!mdc->pre_dump) { /* * Regular dump -- create xfer object and send pages to it * right here. For pre-dumps the pp will be taken by the * caller and handled later. */ ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item)); if (ret < 0) goto out_pp; xfer.transfer_lazy = !mdc->lazy; } else { ret = check_parent_page_xfer(CR_FD_PAGEMAP, vpid(item)); if (ret < 0) goto out_pp; if (ret) xfer.parent = NULL + 1; } /* * Step 1 -- generate the pagemap */ args->off = 0; list_for_each_entry(vma_area, &vma_area_list->h, list) { bool has_parent = !!xfer.parent; u64 off = 0; u64 *map; if (!vma_area_is_private(vma_area, kdat.task_size) && !vma_area_is(vma_area, VMA_ANON_SHARED)) continue; if (vma_entry_is(vma_area->e, VMA_AREA_AIORING)) { if (mdc->pre_dump) continue; has_parent = false; } map = pmc_get_map(&pmc, vma_area); if (!map) goto out_xfer; if (vma_area_is(vma_area, VMA_ANON_SHARED)) ret = add_shmem_area(item->pid->real, vma_area->e, map); else { again: ret = generate_iovs(vma_area, pp, map, &off, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); ret = drain_pages(pp, ctl, args); if (!ret) ret = xfer_pages(pp, &xfer); if (!ret) { page_pipe_reinit(pp); goto again; } } } if (ret < 0) goto out_xfer; } if (mdc->lazy) memcpy(pargs_iovs(args), pp->iovs, sizeof(struct iovec) * pp->nr_iovs); ret = drain_pages(pp, ctl, args); if (!ret && !mdc->pre_dump) ret = xfer_pages(pp, &xfer); if (ret) goto out_xfer; timing_stop(TIME_MEMDUMP); /* * Step 4 -- clean up */ ret = task_reset_dirty_track(item->pid->real); out_xfer: if (!mdc->pre_dump) 
xfer.close(&xfer); out_pp: if (ret || !(mdc->pre_dump || mdc->lazy)) destroy_page_pipe(pp); else dmpi(item)->mem_pp = pp; out: pmc_fini(&pmc); pr_info("----------------------------------------\n"); return ret; } int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl) { int ret; struct parasite_dump_pages_args *pargs; pargs = prep_dump_pages_args(ctl, vma_area_list, mdc->pre_dump); /* * Add PROT_READ protection for all VMAs we're about to * dump if they don't have one. Otherwise we'll not be * able to read the memory contents. * * Afterwards -- reprotect memory back. */ pargs->add_prot = PROT_READ; ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); if (ret) { pr_err("Can't dump unprotect vmas with parasite\n"); return ret; } if (fault_injected(FI_DUMP_PAGES)) { pr_err("fault: Dump VMA pages failure!\n"); return -1; } ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl); if (ret) { pr_err("Can't dump page with parasite\n"); /* Parasite will unprotect VMAs after fail in fini() */ return ret; } pargs->add_prot = 0; if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { pr_err("Can't rollback unprotected vmas with parasite\n"); ret = -1; } return ret; } int prepare_mm_pid(struct pstree_item *i) { pid_t pid = vpid(i); int ret = -1, vn = 0; struct cr_img *img; struct rst_info *ri = rsti(i); img = open_image(CR_FD_MM, O_RSTR, pid); if (!img) return -1; ret = pb_read_one_eof(img, &ri->mm, PB_MM); close_image(img); if (ret <= 0) return ret; if (collect_special_file(ri->mm->exe_file_id) == NULL) return -1; pr_debug("Found %zd VMAs in image\n", ri->mm->n_vmas); img = NULL; if (ri->mm->n_vmas == 0) { /* * Old image. 
Read VMAs from vma-.img */ img = open_image(CR_FD_VMAS, O_RSTR, pid); if (!img) return -1; } while (vn < ri->mm->n_vmas || img != NULL) { struct vma_area *vma; ret = -1; vma = alloc_vma_area(); if (!vma) break; ret = 0; ri->vmas.nr++; if (!img) vma->e = ri->mm->vmas[vn++]; else { ret = pb_read_one_eof(img, &vma->e, PB_VMA); if (ret <= 0) { xfree(vma); close_image(img); break; } } list_add_tail(&vma->list, &ri->vmas.h); if (vma_area_is_private(vma, kdat.task_size)) { ri->vmas.priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.priv_size += PAGE_SIZE; } pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); if (vma_area_is(vma, VMA_ANON_SHARED)) ret = collect_shmem(pid, vma); else if (vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED)) ret = collect_filemap(vma); else if (vma_area_is(vma, VMA_AREA_SOCKET)) ret = collect_socket_map(vma); else ret = 0; if (ret) break; } return ret; } static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma) { /* * VMAs that _may_[1] have COW-ed pages should ... * * [1] I say "may" because whether or not particular pages are * COW-ed is determined later in restore_priv_vma_content() by * memcmp'aring the contents. */ /* ... coinside by start/stop pair (start is checked by caller) */ if (vma->e->end != pvma->e->end) return false; /* ... both be private (and thus have space in premmaped area) */ if (!vma_area_is_private(vma, kdat.task_size)) return false; if (!vma_area_is_private(pvma, kdat.task_size)) return false; /* ... have growsdown and anon flags coinside */ if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS)) return false; /* ... 
belong to the same file if being filemap */ if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid) return false; pr_debug("Found two COW VMAs @0x%"PRIx64"-0x%"PRIx64"\n", vma->e->start, pvma->e->end); return true; } static inline bool vma_inherited(struct vma_area *vma) { return (vma->pvma != NULL && vma->pvma != VMA_COW_ROOT); } static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list *pvmas) { struct vma_area *vma, *pvma; vma = list_first_entry(&vmas->h, struct vma_area, list); pvma = list_first_entry(&pvmas->h, struct vma_area, list); while (1) { if ((vma->e->start == pvma->e->start) && check_cow_vmas(vma, pvma)) { vma->pvma = pvma; if (pvma->pvma == NULL) pvma->pvma = VMA_COW_ROOT; } /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); if (&vma->list == &vmas->h) return; } /* ... no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); if (&pvma->list == &pvmas->h) return; } } } void prepare_cow_vmas(void) { struct pstree_item *pi; for_each_pstree_item(pi) { struct pstree_item *ppi; struct vm_area_list *vmas, *pvmas; ppi = pi->parent; if (!ppi) continue; vmas = &rsti(pi)->vmas; if (vmas->nr == 0) /* Zombie */ continue; pvmas = &rsti(ppi)->vmas; if (pvmas->nr == 0) /* zombies cannot have kids, * but helpers can (and do) */ continue; if (rsti(pi)->mm->exe_file_id != rsti(ppi)->mm->exe_file_id) /* * Tasks running different executables have * close to zero chance of having cow-ed areas * and actually kernel never creates such. 
*/ continue; prepare_cow_vmas_for(vmas, pvmas); } } /* Map a private vma, if it is not mapped by a parent yet */ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void **tgt_addr) { int ret; void *addr; unsigned long nr_pages, size; nr_pages = vma_entry_len(vma->e) / PAGE_SIZE; vma->page_bitmap = xzalloc(BITS_TO_LONGS(nr_pages) * sizeof(long)); if (vma->page_bitmap == NULL) return -1; /* * A grow-down VMA has a guard page, which protect a VMA below it. * So one more page is mapped here to restore content of the first page */ if (vma_has_guard_gap_hidden(vma)) vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); if (!vma_inherited(vma)) { int flag = 0; /* * The respective memory area was NOT found in the parent. * Map a new one. */ /* * Restore AIO ring buffer content to temporary anonymous area. * This will be placed in io_setup'ed AIO in restore_aio_ring(). */ if (vma_entry_is(vma->e, VMA_AREA_AIORING)) flag |= MAP_ANONYMOUS; else if (vma_area_is(vma, VMA_FILE_PRIVATE)) { ret = vma->vm_open(vpid(t), vma); if (ret < 0) { pr_err("Can't fixup VMA's fd\n"); return -1; } } /* * All mappings here get PROT_WRITE regardless of whether we * put any data into it or not, because this area will get * mremap()-ed (branch below) so we MIGHT need to have WRITE * bits there. Ideally we'd check for the whole COW-chain * having any data in. */ addr = mmap(*tgt_addr, size, vma->e->prot | PROT_WRITE, vma->e->flags | MAP_FIXED | flag, vma->e->fd, vma->e->pgoff); if (addr == MAP_FAILED) { pr_perror("Unable to map ANON_VMA"); return -1; } } else { void *paddr; /* * The area in question can be COWed with the parent. Remap the * parent area. Note, that it has already being passed through * the restore_priv_vma_content() call and thus may have some * pages in it. 
*/ paddr = decode_pointer(vma->pvma->premmaped_addr); if (vma_has_guard_gap_hidden(vma)) paddr -= PAGE_SIZE; addr = mremap(paddr, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr); if (addr != *tgt_addr) { pr_perror("Unable to remap a private vma"); return -1; } } vma->e->status |= VMA_PREMMAPED; vma->premmaped_addr = (unsigned long) addr; pr_debug("\tpremap %#016"PRIx64"-%#016"PRIx64" -> %016lx\n", vma->e->start, vma->e->end, (unsigned long)addr); if (vma_has_guard_gap_hidden(vma)) { /* Skip gurad page */ vma->e->start += PAGE_SIZE; vma->premmaped_addr += PAGE_SIZE; } if (vma_area_is(vma, VMA_FILE_PRIVATE)) vma->vm_open = NULL; /* prevent from 2nd open in prepare_vmas */ *tgt_addr += size; return 0; } static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the * beginning and sometimes this page contains data. * In case the VMA is premmaped, we premmap one page * larger VMA. In case of in place restore we can only * do this if the VMA in question is not "guarded" by * some other VMA. 
*/ if (vma->e->flags & MAP_GROWSDOWN) { if (vma->list.prev != head) { struct vma_area *prev; prev = list_entry(vma->list.prev, struct vma_area, list); if (prev->e->end == vma->e->start) { pr_debug("Force premmap for 0x%"PRIx64":0x%"PRIx64"\n", vma->e->start, vma->e->end); return true; } } } return false; } /* * Ensure for s390x that vma is below task size on restore system */ static int task_size_check(pid_t pid, VmaEntry *entry) { #ifdef __s390x__ if (entry->end <= kdat.task_size) return 0; pr_err("Can't restore high memory region %lx-%lx because kernel does only support vmas up to %lx\n", entry->start, entry->end, kdat.task_size); return -1; #else return 0; #endif } static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, void **at, struct page_read *pr) { struct vma_area *vma; unsigned long pstart = 0; int ret = 0; LIST_HEAD(empty); filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { if (task_size_check(vpid(t), vma->e)) { ret = -1; break; } if (pstart > vma->e->start) { ret = -1; pr_err("VMA-s are not sorted in the image file\n"); break; } pstart = vma->e->start; if (!vma_area_is_private(vma, kdat.task_size)) continue; if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h)) { /* * VMA in question is not shared with anyone. We'll * restore it with its contents in restorer. * Now let's check whether we need to map it with * PROT_WRITE or not. 
*/ do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; if (pr->pe->vaddr > vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); continue; } ret = premap_private_vma(t, vma, at); if (ret < 0) break; } filemap_ctx_fini(); return ret; } static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) { struct vma_area *vma; int ret = 0; struct list_head *vmas = &rsti(t)->vmas.h; struct list_head *vma_io = &rsti(t)->vma_io; unsigned int nr_restored = 0; unsigned int nr_shared = 0; unsigned int nr_droped = 0; unsigned int nr_compared = 0; unsigned int nr_lazy = 0; unsigned long va; vma = list_first_entry(vmas, struct vma_area, list); rsti(t)->pages_img_id = pr->pages_img_id; /* * Read page contents. */ while (1) { unsigned long off, i, nr_pages; ret = pr->advance(pr); if (ret <= 0) break; va = (unsigned long)decode_pointer(pr->pe->vaddr); nr_pages = pr->pe->nr_pages; /* * This means that userfaultfd is used to load the pages * on demand. */ if (opts.lazy_pages && pagemap_lazy(pr->pe)) { pr_debug("Lazy restore skips %ld pages at %lx\n", nr_pages, va); pr->skip_pages(pr, nr_pages * PAGE_SIZE); nr_lazy += nr_pages; continue; } for (i = 0; i < nr_pages; i++) { unsigned char buf[PAGE_SIZE]; void *p; /* * The lookup is over *all* possible VMAs * read from image file. */ while (va >= vma->e->end) { if (vma->list.next == vmas) goto err_addr; vma = vma_next(vma); } /* * Make sure the page address is inside existing VMA * and the VMA it refers to still private one, since * there is no guarantee that the data from pagemap is * valid. 
*/ if (va < vma->e->start) goto err_addr; else if (unlikely(!vma_area_is_private(vma, kdat.task_size))) { pr_err("Trying to restore page for non-private VMA\n"); goto err_addr; } if (!vma_area_is(vma, VMA_PREMMAPED)) { unsigned long len = min_t(unsigned long, (nr_pages - i) * PAGE_SIZE, vma->e->end - va); if (vma->e->status & VMA_NO_PROT_WRITE) { pr_debug("VMA 0x%"PRIx64":0x%"PRIx64" RO %#lx:%lu IO\n", vma->e->start, vma->e->end, va, nr_pages); BUG(); } if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io)) return -1; pr->skip_pages(pr, len); va += len; len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; pr_debug("Enqueue page-read\n"); continue; } /* * Otherwise to the COW restore */ off = (va - vma->e->start) / PAGE_SIZE; p = decode_pointer((off) * PAGE_SIZE + vma->premmaped_addr); set_bit(off, vma->page_bitmap); if (vma_inherited(vma)) { clear_bit(off, vma->pvma->page_bitmap); ret = pr->read_pages(pr, va, 1, buf, 0); if (ret < 0) goto err_read; va += PAGE_SIZE; nr_compared++; if (memcmp(p, buf, PAGE_SIZE) == 0) { nr_shared++; /* the page is cowed */ continue; } nr_restored++; memcpy(p, buf, PAGE_SIZE); } else { int nr; /* * Try to read as many pages as possible at once. * * Within the t pagemap we still have * nr_pages - i pages (not all, as we might have * switched VMA above), within the t VMA * we have at most (vma->end - t_addr) bytes. 
*/ nr = min_t(int, nr_pages - i, (vma->e->end - va) / PAGE_SIZE); ret = pr->read_pages(pr, va, nr, p, PR_ASYNC); if (ret < 0) goto err_read; va += nr * PAGE_SIZE; nr_restored += nr; i += nr - 1; bitmap_set(vma->page_bitmap, off + 1, nr - 1); } } } err_read: if (pr->sync(pr)) return -1; pr->close(pr); if (ret < 0) return ret; /* Remove pages, which were not shared with a child */ list_for_each_entry(vma, vmas, list) { unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); if (!vma_inherited(vma)) continue; size = vma_entry_len(vma->e) / PAGE_SIZE; while (1) { /* Find all pages, which are not shared with this child */ i = find_next_bit(vma->pvma->page_bitmap, size, i); if ( i >= size) break; ret = madvise(addr + PAGE_SIZE * i, PAGE_SIZE, MADV_DONTNEED); if (ret < 0) { pr_perror("madvise failed"); return -1; } i++; nr_droped++; } } cnt_add(CNT_PAGES_COMPARED, nr_compared); cnt_add(CNT_PAGES_SKIPPED_COW, nr_shared); cnt_add(CNT_PAGES_RESTORED, nr_restored); pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); pr_info("nr_droped_pages: %d\n", nr_droped); pr_info("nr_lazy: %d\n", nr_lazy); return 0; err_addr: pr_err("Page entry address %lx outside of VMA %lx-%lx\n", va, (long)vma->e->start, (long)vma->e->end); return -1; } static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { struct _MmEntry *mm = rsti(t)->mm; /* * There is no need to disable it if the page read doesn't * have parent. In this case VMA will be empty until * userfaultfd_register, so there would be no pages to * collapse. And, once we register the VMA with uffd, * khugepaged will skip it. */ if (!(opts.lazy_pages && page_read_has_parent(pr))) return 0; if (!kdat.has_thp_disable) pr_warn("Disabling transparent huge pages. 
" "It may affect performance!\n"); /* * temporarily disable THP to avoid collapse of pages * in the areas that will be monitored by uffd */ if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { pr_perror("Cannot disable THP"); return -1; } if (!(mm->has_thp_disabled && mm->thp_disabled)) rsti(t)->has_thp_enabled = true; return 0; } int prepare_mappings(struct pstree_item *t) { int ret = 0; void *addr; struct vm_area_list *vmas; struct page_read pr; void *old_premmapped_addr = NULL; unsigned long old_premmapped_len; vmas = &rsti(t)->vmas; if (vmas->nr == 0) /* Zombie */ goto out; /* Reserve a place for mapping private vma-s one by one */ addr = mmap(NULL, vmas->priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (addr == MAP_FAILED) { ret = -1; pr_perror("Unable to reserve memory (%lu bytes)", vmas->priv_size); goto out; } old_premmapped_addr = rsti(t)->premmapped_addr; old_premmapped_len = rsti(t)->premmapped_len; rsti(t)->premmapped_addr = addr; rsti(t)->premmapped_len = vmas->priv_size; ret = open_page_read(vpid(t), &pr, PR_TASK); if (ret <= 0) return -1; if (maybe_disable_thp(t, &pr)) return -1; pr.advance(&pr); /* shift to the 1st iovec */ ret = premap_priv_vmas(t, vmas, &addr, &pr); if (ret < 0) goto out; pr.reset(&pr); ret = restore_priv_vma_content(t, &pr); if (ret < 0) goto out; if (old_premmapped_addr) { ret = munmap(old_premmapped_addr, old_premmapped_len); if (ret < 0) pr_perror("Unable to unmap %p(%lx)", old_premmapped_addr, old_premmapped_len); } /* * Not all VMAs were premmaped. Find out the unused tail of the * premapped area and unmap it. 
*/ old_premmapped_len = addr - rsti(t)->premmapped_addr; if (old_premmapped_len < rsti(t)->premmapped_len) { unsigned long tail; tail = rsti(t)->premmapped_len - old_premmapped_len; ret = munmap(addr, tail); if (ret < 0) pr_perror("Unable to unmap %p(%lx)", addr, tail); rsti(t)->premmapped_len = old_premmapped_len; pr_info("Shrunk premap area to %p(%lx)\n", rsti(t)->premmapped_addr, rsti(t)->premmapped_len); } out: return ret; } bool vma_has_guard_gap_hidden(struct vma_area *vma) { return kdat.stack_guard_gap_hidden && (vma->e->flags & MAP_GROWSDOWN); } /* * A gard page must be unmapped after restoring content and * forking children to restore COW memory. */ int unmap_guard_pages(struct pstree_item *t) { struct vma_area *vma; struct list_head *vmas = &rsti(t)->vmas.h; if (!kdat.stack_guard_gap_hidden) return 0; list_for_each_entry(vma, vmas, list) { if (!vma_area_is(vma, VMA_PREMMAPED)) continue; if (vma->e->flags & MAP_GROWSDOWN) { void *addr = decode_pointer(vma->premmaped_addr); if (munmap(addr - PAGE_SIZE, PAGE_SIZE)) { pr_perror("Can't unmap guard page"); return -1; } } } return 0; } int open_vmas(struct pstree_item *t) { int pid = vpid(t); struct vma_area *vma; struct vm_area_list *vmas = &rsti(t)->vmas; filemap_ctx_init(false); list_for_each_entry(vma, &vmas->h, list) { if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) continue; pr_info("Opening %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" (%x) vma\n", vma->e->start, vma->e->end, vma->e->pgoff, vma->e->status); if (vma->vm_open(pid, vma)) { pr_err("`- Can't open vma\n"); return -1; } /* * File mappings have vm_open set to open_filemap which, in * turn, puts the VMA_CLOSE bit itself. 
For all the rest we * need to put it by hads, so that the restorer closes the fd */ if (!(vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED))) vma->e->status |= VMA_CLOSE; } filemap_ctx_fini(); return 0; } static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta) { struct cr_img *pages; pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id); if (!pages) return -1; ta->vma_ios_fd = img_raw_fd(pages); return pagemap_render_iovec(&rsti(t)->vma_io, ta); } int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) { struct vma_area *vma; struct vm_area_list *vmas = &rsti(t)->vmas; ta->vmas = (VmaEntry *)rst_mem_align_cpos(RM_PRIVATE); ta->vmas_n = vmas->nr; list_for_each_entry(vma, &vmas->h, list) { VmaEntry *vme; vme = rst_mem_alloc(sizeof(*vme), RM_PRIVATE); if (!vme) return -1; /* * Copy VMAs to private rst memory so that it's able to * walk them and m(un|re)map. */ *vme = *vma->e; if (vma_area_is(vma, VMA_PREMMAPED)) vma_premmaped_start(vme) = vma->premmaped_addr; } return prepare_vma_ios(t, ta); } criu-3.6/criu/mount.c000066400000000000000000002306051317335042600146060ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "cr_options.h" #include "util.h" #include "util-pie.h" #include "log.h" #include "plugin.h" #include "filesystems.h" #include "mount.h" #include "pstree.h" #include "image.h" #include "namespaces.h" #include "protobuf.h" #include "fs-magic.h" #include "path.h" #include "files-reg.h" #include "external.h" #include "images/mnt.pb-c.h" /* * Put a : in here since those are invalid on * the cli, so we know it's autogenerated in * debugging. 
*/ #define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED" #define NO_ROOT_MOUNT "CRIU:NO_ROOT" #define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE) #undef LOG_PREFIX #define LOG_PREFIX "mnt: " #define BINFMT_MISC_HOME "proc/sys/fs/binfmt_misc" #define CRTIME_MNT_ID 0 /* A helper mount_info entry for the roots yard */ static struct mount_info *root_yard_mp = NULL; int ext_mount_add(char *key, char *val) { char *e_str; e_str = xmalloc(strlen(key) + strlen(val) + 8); if (!e_str) return -1; /* * On dump the key is the mountpoint as seen from the mount * namespace, the val is some name that will be put into image * instead of the mount point's root path. * * On restore the key is the name from the image (the one * mentioned above) and the val is the path in criu's mount * namespace that will become the mount point's root, i.e. -- * be bind mounted to the respective mountpoint. */ sprintf(e_str, "mnt[%s]:%s", key, val); return add_external(e_str); } int ext_mount_parse_auto(char *key) { opts.autodetect_ext_mounts = true; if (*key == ':') { while (1) { key++; if (*key == '\0') break; else if (*key == 'm') opts.enable_external_masters = true; else if (*key == 's') opts.enable_external_sharing = true; else return -1; } } return 0; } /* Lookup ext_mount by key field */ static char *ext_mount_lookup(char *key) { char *v; int len = strlen(key); char mkey[len + 8]; sprintf(mkey, "mnt[%s]", key); v = external_lookup_by_key(mkey); if (IS_ERR(v)) v = NULL; return v; } /* * Single linked list of mount points get from proc/images */ struct mount_info *mntinfo; static void mntinfo_add_list(struct mount_info *new) { if (!mntinfo) mntinfo = new; else { struct mount_info *pm; /* Add to the tail. 
(FIXME -- make O(1) ) */ for (pm = mntinfo; pm->next != NULL; pm = pm->next) ; pm->next = new; } } static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath, unsigned int st_dev, unsigned int st_ino, unsigned int mnt_id) { /* * Goes through all entries in the mountinfo table * looking for a mount point that contains the file specified * in rpath. Uses the device number st_dev and the inode number st_ino * to make sure the file is correct. */ struct mount_info *mi_ret = NULL; struct mount_info *m; int mntns_root = -1; for (m = list; m != NULL; m = m->next) { struct stat f_stat; int ret_stat; if (m->fstype->code != FSTYPE__OVERLAYFS) continue; /* * We need the mntns root fd of the process to be dumped, * to make sure we stat the correct file */ if (mntns_root == -1) { mntns_root = __mntns_get_root_fd(root_item->pid->real); if (mntns_root < 0) { pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid->real); return ERR_PTR(-ENOENT); } } /* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */ if (is_root_mount(m)) { ret_stat = fstatat(mntns_root, rpath, &f_stat, 0); } else { char _full_path[PATH_MAX]; int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->mountpoint, rpath); if (n >= PATH_MAX) { pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath); return ERR_PTR(-ENOSPC); } ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0); } if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino) mi_ret = m; } return mi_ret; } /* * Looks up the mnt_id and path of a file in an overlayFS directory. * * This is useful in order to fix the OverlayFS bug present in the * Linux Kernel before version 4.2. See fixup_overlayfs for details. * * We first check to see if the mnt_id and st_dev numbers currently match * some entry in the mountinfo table. If so, we already have the correct mnt_id * and no fixup is needed. 
* * Then we proceed to see if there are any overlayFS mounted directories * in the mountinfo table. If so, we concatenate the mountpoint with the * name of the file, and stat the resulting path to check if we found the * correct device id and node number. If that is the case, we update the * mount id and link variables with the correct values. */ struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev, unsigned int st_ino, unsigned int mnt_id) { struct mount_info *m; /* If the mnt_id and device number match for some entry, no fixup is needed */ for (m = mntinfo; m != NULL; m = m->next) if (st_dev == kdev_to_odev(m->s_dev) && mnt_id == m->mnt_id) return NULL; return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id); } static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id) { struct mount_info *m; for (m = list; m != NULL; m = m->next) if (m->mnt_id == id) return m; return NULL; } struct mount_info *lookup_mnt_id(unsigned int id) { return __lookup_mnt_id(mntinfo, id); } struct mount_info *lookup_mnt_sdev(unsigned int s_dev) { struct mount_info *m; for (m = mntinfo; m != NULL; m = m->next) /* * We should not provide notdir bindmounts to open_mount as * opening them can fail/hang for binds of unix sockets/fifos */ if (m->s_dev == s_dev && mnt_is_dir(m)) return m; return NULL; } static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path) { size_t pathlen = strlen(path); struct mount_info *m = mntinfo_tree, *c; while (1) { list_for_each_entry(c, &m->children, siblings) { size_t n; n = strlen(c->mountpoint + 1); if (n > pathlen) continue; if (strncmp(c->mountpoint + 1, path, min(n, pathlen))) continue; if (n < pathlen && path[n] != '/') continue; m = c; break; } if (&c->siblings == &m->children) break; } pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint); return m; } dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path) { struct mount_info *m; m = 
mount_resolve_path(ns->mnt.mntinfo_tree, path); /* * BTRFS returns subvolume dev-id instead of * superblock dev-id, in such case return device * obtained from mountinfo (ie subvolume0). */ return strcmp(m->fstype->name, "btrfs") ? MKKDEV(major(st_dev), minor(st_dev)) : m->s_dev; } bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, struct ns_id *ns, const char *path) { if (st_dev == kdev_to_odev(phys_dev)) return true; return phys_dev == phys_stat_resolve_dev(ns, st_dev, path); } /* * Compare super-blocks mounted at two places */ static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b) { if (a->fstype != b->fstype) return false; if (a->s_dev != b->s_dev) return false; if (strcmp(a->source, b->source) != 0) return false; if (a->fstype->sb_equal) /* :) */ return b->fstype->sb_equal(a, b); if (strcmp(a->options, b->options)) return false; return true; } /* * Compare superblocks AND the way they are mounted */ static bool mounts_equal(struct mount_info *a, struct mount_info *b) { if (!mounts_sb_equal(a, b)) return false; if (strcmp(a->root, b->root)) return false; return true; } /* * mnt_roots is a temporary directory for restoring sub-trees of * non-root namespaces. */ static char *mnt_roots; static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mount_info *yard_mount) { struct mount_info *m, *root = NULL; /* * Just resolve the mnt_id:parent_mnt_id relations */ pr_debug("\tBuilding plain mount tree\n"); for (m = list; m != NULL; m = m->next) { struct mount_info *parent; pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id); if (m->mnt_id != m->parent_mnt_id) parent = __lookup_mnt_id(list, m->parent_mnt_id); else /* a circular mount reference. It's rootfs or smth like it. 
*/
			parent = NULL;

		if (!parent) {
			/* Only a root mount can be without parent */
			if (root == NULL && m->is_ns_root) {
				root = m;
				/*
				 * Without a yard there is nothing to attach the
				 * root to, so the linking below is skipped.
				 */
				if (!yard_mount)
					continue;
			}

			if (!root) {
				pr_err("No parent found for mountpoint %d (@%s)\n",
					m->mnt_id, m->mountpoint);
				return NULL;
			}

			pr_debug("Mountpoint %d (@%s) w/o parent %d\n",
				 m->mnt_id, m->mountpoint, m->parent_mnt_id);

			/*
			 * Any other parentless mount must look like the root
			 * found first: same superblock and same root path.
			 */
			if (!mounts_sb_equal(root, m) ||
			    strcmp(root->root, m->root)) {
				pr_err("Nested mount namespaces with different "
				       "roots %d (@%s %s) %d (@%s %s) are not supported yet\n",
				       root->mnt_id, root->mountpoint, root->root,
				       m->mnt_id, m->mountpoint, m->root);
				return NULL;
			}

			/* Mount all namespace roots into the roots yard. */
			parent = yard_mount;
			if (unlikely(!yard_mount)) {
				pr_err("Nested mount %d (@%s %s) w/o root insertion detected\n",
				       m->mnt_id, m->mountpoint, m->root);
				return NULL;
			}

			pr_debug("Mountpoint %d (@%s) get parent %d (@%s)\n",
				 m->mnt_id, m->mountpoint,
				 parent->mnt_id, parent->mountpoint);
		}

		/* Link the mount into its parent's list of children */
		m->parent = parent;
		list_add_tail(&m->siblings, &parent->children);
	}

	if (!root) {
		pr_err("No root found for tree\n");
		return NULL;
	}

	/* With a yard the yard itself is the top of the resulting tree */
	if (yard_mount)
		return yard_mount;

	return root;
}

/*
 * Returns the number of '/' characters in the mountpoint of @m.
 * mnt_resort_siblings() uses this value as its sort key.
 */
static unsigned int mnt_depth(struct mount_info *m)
{
	unsigned int depth = 0;
	char *c;

	for (c = m->mountpoint; *c != '\0'; c++)
		if (*c == '/')
			depth++;

	return depth;
}

static void mnt_resort_siblings(struct mount_info *tree)
{
	struct mount_info *m, *p;
	LIST_HEAD(list);

	/*
	 * Put siblings of each node in an order they can be (u)mounted
	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
	 * Otherwise we will not be able to (u)mount them in a sequence.
	 *
	 * Funny, but all we need for this is to sort them in the descending
	 * order of the amount of /-s in a path =)
	 *
	 * Use stupid insertion sort here, we're not expecting mount trees
	 * to contain hundreds (or more) elements.
*/ pr_info("\tResorting siblings on %d\n", tree->mnt_id); while (!list_empty(&tree->children)) { unsigned int depth; m = list_first_entry(&tree->children, struct mount_info, siblings); list_del(&m->siblings); depth = mnt_depth(m); list_for_each_entry(p, &list, siblings) if (mnt_depth(p) <= depth) break; list_add(&m->siblings, &p->siblings); mnt_resort_siblings(m); } list_splice(&list, &tree->children); } static void mnt_tree_show(struct mount_info *tree, int off) { struct mount_info *m; pr_info("%*s[%s](%d->%d)\n", off, "", tree->mountpoint, tree->mnt_id, tree->parent_mnt_id); list_for_each_entry(m, &tree->children, siblings) mnt_tree_show(m, off + 1); pr_info("%*s<--\n", off, ""); } /* Returns -1 on error, 1 if external mount resolved, 0 otherwise */ static int try_resolve_ext_mount(struct mount_info *info) { char *ext; char devstr[64]; ext = ext_mount_lookup(info->mountpoint + 1 /* trim the . */); if (ext) { pr_info("Found %s mapping for %s mountpoint\n", ext, info->mountpoint); info->external = ext; return 1; } snprintf(devstr, sizeof(devstr), "dev[%d/%d]", kdev_major(info->s_dev), kdev_minor(info->s_dev)); if (info->fstype->code == FSTYPE__UNSUPPORTED) { char *val; val = external_lookup_by_key(devstr); if (!IS_ERR_OR_NULL(val)) { char *source; int len; len = strlen(val) + sizeof("dev[]"); source = xmalloc(len); if (source == NULL) return -1; snprintf(source, len, "dev[%s]", val); info->fstype = fstype_auto(); BUG_ON(info->fstype->code != FSTYPE__AUTO); xfree(info->source); info->source = source; return 1; } } return 0; } static struct mount_info *find_wider_shared(struct mount_info *m) { struct mount_info *p; /* * Try to find a mount, which is wider or equal. * A is wider than B, if A->root is a subpath of B->root. 
*/
	list_for_each_entry(p, &m->mnt_share, mnt_share)
		if (issubpath(m->root, p->root))
			return p;

	return NULL;
}

/*
 * Looks among the children of @m for the one mounted at @ct_mountpoint.
 * The first child with that mountpoint decides the result: it is returned
 * when mounts_equal() says it matches @ct, otherwise the search stops
 * and NULL is returned. NULL is also returned when no child has that
 * mountpoint.
 */
static struct mount_info *find_shared_peer(struct mount_info *m,
		struct mount_info *ct, char *ct_mountpoint)
{
	struct mount_info *cm;

	list_for_each_entry(cm, &m->children, siblings) {
		if (strcmp(ct_mountpoint, cm->mountpoint))
			continue;

		if (!mounts_equal(cm, ct))
			break;

		return cm;
	}

	return NULL;
}

/*
 * Compares the children of @m with the children of a wider mount from
 * the same shared group (see find_wider_shared()). Matched children of
 * @m are parked on a local list while the check runs.
 */
static int validate_shared(struct mount_info *m)
{
	struct mount_info *t, *ct;
	char buf[PATH_MAX], *sibling_path;
	LIST_HEAD(children);

	/*
	 * Check that all mounts in one shared group have the same set of
	 * children. Only visible children are accounted. A non-root bind-mount
	 * doesn't see children out of its root and it's an expected case.
	 *
	 * Here are a few conditions:
	 * 1. t is wider than m
	 * 2. We search a wider mount in the same direction, so when we
	 *    enumerate all mounts, we can't be sure that all of them
	 *    have the same set of children.
	 */

	t = find_wider_shared(m);
	if (!t)
		/*
		 * The current mount is the widest one in its shared group,
		 * all others will be compared to it or with some other,
		 * which will be compared to it.
		 */
		return 0;

	/* Search a child, which is visible in both mounts. */
	list_for_each_entry(ct, &t->children, siblings) {
		struct mount_info *cm;

		if (ct->is_ns_root || ct->mnt_id == CRTIME_MNT_ID)
			continue;

		/* No sibling path: this child is skipped */
		sibling_path = mnt_get_sibling_path(ct, m, buf, sizeof(buf));
		if (sibling_path == NULL)
			continue;

		cm = find_shared_peer(m, ct, sibling_path);
		if (!cm)
			goto err;

		/*
		 * Keep this one aside. At the end of t's children scan we should
		 * move _all_ m's children here (the list_empty check below).
*/ list_move(&cm->siblings, &children); } /* Now all real mounts should be moved */ list_for_each_entry(ct, &m->children, siblings) { if (ct->mnt_id != CRTIME_MNT_ID) goto err; } list_splice(&children, &m->children); return 0; err: list_splice(&children, &m->children); pr_err("%d:%s and %d:%s have different set of mounts\n", m->mnt_id, m->mountpoint, t->mnt_id, t->mountpoint); return -1; } /* * Find the mount_info from which the respective bind-mount * can be created. It can be either an FS-root mount, or the * root of the tree (the latter only if its root path is the * sub-path of the bind mount's root). */ static struct mount_info *find_fsroot_mount_for(struct mount_info *bm) { struct mount_info *sm; list_for_each_entry(sm, &bm->mnt_bind, mnt_bind) if (fsroot_mounted(sm) || (sm->parent == root_yard_mp && strstartswith(bm->root, sm->root))) return sm; return NULL; } static bool does_mnt_overmount(struct mount_info *m) { struct mount_info *t; if (!m->parent) return false; list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; if (issubpath(t->mountpoint, m->mountpoint)) return true; } return false; } /* * Say mount is external if it was explicitly specified as an * external or it will be bind from such an explicit external * mount, we set bind in propagate_mount and propagate_siblings */ static bool mnt_is_external(struct mount_info *m) { struct mount_info *t; while (m) { if (m->external) return 1; if (!list_empty(&m->mnt_share)) list_for_each_entry(t, &m->mnt_share, mnt_share) if (t->external) return 1; if (m->master_id <= 0 && !list_empty(&m->mnt_bind)) list_for_each_entry(t, &m->mnt_bind, mnt_bind) if (issubpath(m->root, t->root) && t->external) return 1; m = m->mnt_master; } return 0; } static int validate_mounts(struct mount_info *info, bool for_dump) { struct mount_info *m, *t; for (m = info; m; m = m->next) { if (m->parent == NULL || m->is_ns_root) /* root mount can be any */ continue; if (m->shared_id && validate_shared(m)) return 
-1;

		/* External mounts need neither a known FS type nor a root mount */
		if (mnt_is_external(m))
			goto skip_fstype;

		/*
		 * Mountpoint can point to / of an FS. In that case this FS
		 * should be of some known type so that we can just mount one.
		 *
		 * Otherwise it's a bindmount mountpoint and we try to find
		 * what fsroot mountpoint it's bound to. If this point is the
		 * root mount, the path to bindmount root should be accessible
		 * from the rootmount path (the strstartswith check in the
		 * else branch below).
		 */

		if (fsroot_mounted(m)) {
			if (m->fstype->code == FSTYPE__UNSUPPORTED) {
				pr_err("FS mnt %s dev %#x root %s unsupported id %d\n",
						m->mountpoint, m->s_dev, m->root, m->mnt_id);
				return -1;
			}
		} else {
			t = find_fsroot_mount_for(m);
			if (!t) {
				int ret;

				/*
				 * No root-mount found for this bind and it's neither
				 * marked nor auto-resolved as external one. So last
				 * chance not to fail is to talk to plugins.
				 */

				if (for_dump) {
					ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id);
					if (ret == 0)
						m->need_plugin = true;
				} else
					/*
					 * Plugin should take care of this one
					 * in restore_ext_mount, or do_bind_mount
					 * will mount it as external
					 */
					ret = m->need_plugin ? 0 : -ENOTSUP;

				/* Only -ENOTSUP is reported here; other errors fail silently */
				if (ret < 0) {
					if (ret == -ENOTSUP)
						pr_err("%d:%s doesn't have a proper root mount\n",
								m->mnt_id, m->mountpoint);
					return -1;
				}
			}
		}
skip_fstype:
		/* Overmounting inside a shared parent is rejected for every mount */
		if (does_mnt_overmount(m) &&
		    !list_empty(&m->parent->mnt_share)) {
			pr_err("Unable to handle mounts under %d:%s\n",
					m->mnt_id, m->mountpoint);
			return -1;
		}
	}

	return 0;
}

/*
 * Scans @list for the mount that @info can be bind-mounted from:
 * it must sit on the same superblock (mounts_sb_equal) and its root must
 * pass the issubpath() test against @info's root. A candidate with
 * matching sharing is returned at once, otherwise the last candidate
 * seen is returned (NULL if there was none).
 */
static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
{
	struct mount_info *it, *candidate = NULL;

	for (it = list; it; it = it->next) {
		if (!mounts_sb_equal(info, it))
			continue;

		/*
		 * This means we have a situation like:
		 *
		 * root@criu:~# mount --bind bind1/subdir/ bind2
		 * root@criu:~# mount --bind bind1/ bind3
		 *
		 * outside the container, and bind1 is directly bind mounted
		 * inside the container.
mounts_equal() considers these mounts * equal for bind purposes, but their roots are different, and * we want to match the one with the right root. */ if (!issubpath(info->root, it->root)) continue; candidate = it; /* * Consider the case of: * * mount /xxx * mount --bind /xxx /yyy * mount --make-shared /yyy * mount --bind /xxx /zzz * mount --make-shared /zzz * bind mount a shared mount into the namespace * * Here, we want to return the /right/ mount, not just a mount * that's equal. However, in the case: * * bind mount a shared mount into the namespace * inside the namespace, remount MS_PRIVATE * inside the namespace, remount MS_SHARED * * there will be no external mount with matching sharing * because the sharing is only internal; we still want to bind * mount from this mountinfo so we should return it, but we * should make the sharing namespace private after that bind * mount. * * Below are the cases where we found an exact match. */ if (info->flags & MS_SHARED && info->shared_id == it->shared_id) return candidate; if (info->flags & MS_SLAVE && info->master_id == it->shared_id) return candidate; } return candidate; } static struct ns_id *find_ext_ns_id(void) { struct ns_id *ns; for (ns = ns_ids; ns->next; ns = ns->next) if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) { if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, true)) break; return ns; } pr_err("Failed to find criu pid's mount ns\n"); return NULL; } static int resolve_external_mounts(struct mount_info *info) { struct ns_id *ext_ns = NULL; struct mount_info *m; if (opts.autodetect_ext_mounts) { ext_ns = find_ext_ns_id(); if (!ext_ns) return -1; } for (m = info; m; m = m->next) { int ret; char *p, *cut_root; struct mount_info *match; if (m->parent == NULL || m->is_ns_root) continue; ret = try_resolve_ext_mount(m); if (ret < 0) return ret; if (ret == 1 || !ext_ns) continue; match = find_best_external_match(ext_ns->mnt.mntinfo_list, m); if (!match) continue; if (m->flags & MS_SHARED) { if 
(!opts.enable_external_sharing)
				continue;

			/* The shared group exists only inside the dumped namespace */
			if (m->shared_id != match->shared_id)
				m->internal_sharing = true;
		}

		if (m->flags & MS_SLAVE) {
			if (!opts.enable_external_masters)
				continue;

			/*
			 * In order to support something like internal slavery,
			 * we need to teach can_mount_now and do_mount_one
			 * about slavery relationships in external mounts. This
			 * seems like an uncommon case, so we punt for now.
			 */
			if (m->master_id != match->shared_id && m->master_id != match->master_id)
				continue;
		}

		cut_root = cut_root_for_bind(m->root, match->root);

		/* match->mountpoint + 1 skips the first character of the mountpoint */
		p = xsprintf("%s/%s", match->mountpoint + 1, cut_root);
		if (!p)
			return -1;

		m->external = AUTODETECTED_MOUNT;

		/*
		 * Put the guessed name in source. It will be picked up
		 * as auto-root in get_mp_root() on restore.
		 */
		xfree(m->source);
		m->source = p;

		pr_info("autodetected external mount %s for %s\n", p, m->mountpoint);
	}

	return 0;
}

static int resolve_shared_mounts(struct mount_info *info, int root_master_id)
{
	struct mount_info *m, *t;

	/*
	 * If we have a shared mounts, both master
	 * slave targets are to be present in mount
	 * list, otherwise we can't be sure if we can
	 * recreate the scheme later on restore.
*/
	for (m = info; m; m = m->next) {
		bool need_share, need_master;

		/* the root master_id can be ignored, because it's already created */
		if (root_master_id && root_master_id == m->master_id)
			m->master_id = -1;

		/*
		 * need_share: the mount is shared, but its group list has not
		 * been filled yet. need_master: the master is still to be found.
		 */
		need_share = m->shared_id && list_empty(&m->mnt_share);
		need_master = m->master_id > 0;

		pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n",
				m->mnt_id, m->shared_id, m->master_id, m->mountpoint);

		/* The scan stops early once there is nothing left to look for */
		for (t = info; t && (need_share || need_master); t = t->next) {
			if (t == m)
				continue;

			/* Only the first mount of the master group becomes the master */
			if (need_master && t->shared_id == m->master_id) {
				pr_debug("\tThe mount %3d is slave for %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id,
					 m->mountpoint, t->mountpoint);
				list_add(&m->mnt_slave, &t->mnt_slave_list);
				m->mnt_master = t;
				need_master = false;
			}

			/* Collect all mounts from this group */
			if (need_share && t->shared_id == m->shared_id) {
				pr_debug("\tMount %3d is shared with %3d group %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id, m->shared_id,
					 t->mountpoint, m->mountpoint);
				list_add(&t->mnt_share, &m->mnt_share);
			}
		}

		/*
		 * If we haven't already determined this mount is external,
		 * or bind of external, then we don't know where it came from.
		 */
		if (need_master && m->parent && !mnt_is_external(m)) {
			pr_err("Mount %d %s (master_id: %d shared_id: %d) "
			       "has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id,
				m->mountpoint, m->master_id, m->shared_id);
			return -1;
		}

		/* Search bind-mounts */
		if (list_empty(&m->mnt_bind)) {
			/*
			 * A first mounted point will be set up as a source point
			 * for others. Look at propagate_mount()
			 */
			for (t = m->next; t; t = t->next) {
				if (mounts_sb_equal(m, t)) {
					list_add(&t->mnt_bind, &m->mnt_bind);
					pr_debug("\tThe mount %3d is bind for %3d (@%s -> @%s)\n",
						 t->mnt_id, m->mnt_id,
						 t->mountpoint, m->mountpoint);
				}
			}
		}
	}

	return 0;
}

/*
 * Builds the mount tree out of the plain @list (mnt_build_ids_tree),
 * sorts the siblings on every level (mnt_resort_siblings) and prints
 * the result. @root_mp is passed down as the yard mount.
 * Returns the top of the tree or NULL on error.
 */
static struct mount_info *mnt_build_tree(struct mount_info *list,
		struct mount_info *root_mp)
{
	struct mount_info *tree;

	/*
	 * Organize them in a sequence in which they can be mounted/umounted.
*/ pr_info("Building mountpoints tree\n"); tree = mnt_build_ids_tree(list, root_mp); if (!tree) return NULL; mnt_resort_siblings(tree); pr_info("Done:\n"); mnt_tree_show(tree, 0); return tree; } int mnt_is_dir(struct mount_info *pm) { int mntns_root; struct stat st; mntns_root = mntns_get_root_fd(pm->nsid); if (mntns_root < 0) { pr_perror("Can't get root fd of mntns for %d", pm->mnt_id); return 0; } if (fstatat(mntns_root, pm->ns_mountpoint, &st, 0)) { pr_perror("Can't fstatat on %s", pm->ns_mountpoint); return 0; } if (S_ISDIR(st.st_mode)) return 1; return 0; } /* * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. * If mnt_fd is -1, the mountpoint will be opened by this function. */ int __open_mountpoint(struct mount_info *pm, int mnt_fd) { struct stat st; int dev; int ret; if (mnt_fd == -1) { int mntns_root; mntns_root = mntns_get_root_fd(pm->nsid); if (mntns_root < 0) return -1; mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); if (mnt_fd < 0) { pr_perror("Can't open %s", pm->ns_mountpoint); return -1; } } ret = fstat(mnt_fd, &st); if (ret < 0) { pr_perror("fstat(%s) failed", pm->ns_mountpoint); goto err; } if (pm->s_dev_rt == MOUNT_INVALID_DEV) { pr_err("Resolving over unvalid device for %#x %s %s\n", pm->s_dev, pm->fstype->name, pm->ns_mountpoint); goto err; } dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); /* * Always check for @s_dev_rt here, because the @s_dev * from the image (in case of restore) has all rights * to not match the device (say it's migrated and kernel * allocates new device ID). 
*/ if (dev != pm->s_dev_rt) { pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, pm->fstype->name, pm->ns_mountpoint); goto err; } return mnt_fd; err: close(mnt_fd); return -1; } int open_mount(unsigned int s_dev) { struct mount_info *m; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; return __open_mountpoint(m, -1); } /* Bind-mount a mount point in a temporary place without children */ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_path_root) { char *mnt_path; mnt_path = mkdtemp(mnt_path_tmp); if (mnt_path == NULL && errno == ENOENT) mnt_path = mkdtemp(mnt_path_root); if (mnt_path == NULL) { pr_perror("Can't create a temporary directory"); return NULL; } if (mount(mi->mountpoint, mnt_path, NULL, MS_BIND, NULL)) { pr_perror("Can't bind-mount %d:%s to %s", mi->mnt_id, mi->mountpoint, mnt_path); rmdir(mnt_path); return NULL; } return mnt_path; } #define MNT_UNREACHABLE INT_MIN int open_mountpoint(struct mount_info *pm) { struct mount_info *c; int fd = -1, ns_old = -1; char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX"; char mnt_path_root[] = "/cr-tmpfs.XXXXXX"; char *mnt_path = mnt_path_tmp; int cwd_fd; /* * If a mount doesn't have children, we can open a mount point, * otherwise we need to create a "private" copy. */ if (list_empty(&pm->children)) return __open_mountpoint(pm, -1); pr_info("Something is mounted on top of %s\n", pm->mountpoint); list_for_each_entry(c, &pm->children, siblings) { if (!strcmp(c->mountpoint, pm->mountpoint)) { pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint); return MNT_UNREACHABLE; } } /* * To create a "private" copy, the target mount is bind-mounted * in a temporary place w/o MS_REC (non-recursively). * A mount point can't be bind-mounted in criu's namespace, it will be * mounted in a target namespace. The sequence of actions is * mkdtemp, setns(tgt), mount, open, detach, setns(old). 
*/

	/* Remember the cwd to get back to it after the namespace switches */
	cwd_fd = open(".", O_DIRECTORY);
	if (cwd_fd < 0) {
		pr_perror("Unable to open cwd");
		return -1;
	}

	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
		goto out;

	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
	if (mnt_path == NULL) {
		/*
		 * We probably can't create a temporary directory,
		 * so we can try to clone the mount namespace, open
		 * the required mount and destroy this mount namespace
		 * by calling restore_ns() below in this function.
		 */
		if (unshare(CLONE_NEWNS)) {
			pr_perror("Unable to clone a mount namespace");
			goto out;
		}

		fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
		if (fd < 0)
			pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
	} else
		fd = open_detach_mount(mnt_path);
	if (fd < 0)
		goto out;

	if (restore_ns(ns_old, &mnt_ns_desc)) {
		/* Reset ns_old so that the out path doesn't call restore_ns() again */
		ns_old = -1;
		goto out;
	}
	if (fchdir(cwd_fd)) {
		pr_perror("Unable to restore cwd");
		close(cwd_fd);
		close(fd);
		return -1;
	}
	close(cwd_fd);

	/* __open_mountpoint() closes fd itself if its checks fail */
	return __open_mountpoint(pm, fd);
out:
	if (ns_old >= 0)
		restore_ns(ns_old, &mnt_ns_desc);
	close_safe(&fd);
	if (fchdir(cwd_fd))
		pr_perror("Unable to restore cwd");
	close(cwd_fd);
	return -1;
}

/*
 * Creates a mount_info with the CRTIME_MNT_ID id for the file system
 * @fsname at @path (relative to the mountpoint of @root) on the device
 * @s_dev, and links it into the tree under the deepest mount whose
 * mountpoint is a prefix of the new one.
 * Returns 0 on success and -1 on error.
 */
static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev)
{
	struct mount_info *mi, *t, *parent;
	bool add_slash = false;
	int len;

	if (!root->nsid) {
		/* On restore we have fake top mount_info.
Find real NS_ROOT */ list_for_each_entry(t, &root->children, siblings) if (t->nsid->type == NS_ROOT) { root = t; break; } if (!root->nsid) { pr_err("Can't find NS_ROOT\n"); return -1; } } mi = mnt_entry_alloc(); if (!mi) return -1; len = strlen(root->mountpoint); /* It may be "./" or "./path/to/dir" */ if (root->mountpoint[len - 1] != '/') { add_slash = true; len++; } mi->mountpoint = xmalloc(len + strlen(path) + 1); if (!mi->mountpoint) return -1; mi->ns_mountpoint = mi->mountpoint; if (!add_slash) sprintf(mi->mountpoint, "%s%s", root->mountpoint, path); else sprintf(mi->mountpoint, "%s/%s", root->mountpoint, path); mi->mnt_id = CRTIME_MNT_ID; mi->flags = mi->sb_flags = 0; mi->root = xstrdup("/"); mi->fsname = xstrdup(fsname); mi->source = xstrdup(fsname); mi->options = xstrdup(""); if (!mi->root || !mi->fsname || !mi->source || !mi->options) return -1; mi->fstype = find_fstype_by_name(fsname); mi->s_dev = mi->s_dev_rt = s_dev; parent = root; while (1) { list_for_each_entry(t, &parent->children, siblings) { if (strstartswith(mi->mountpoint, t->mountpoint)) { parent = t; break; } } if (&t->siblings == &parent->children) break; } mi->nsid = parent->nsid; mi->parent = parent; mi->parent_mnt_id = parent->mnt_id; mi->next = parent->next; parent->next = mi; list_add(&mi->siblings, &parent->children); pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", mi->mountpoint, parent->mountpoint, parent->mnt_id); return 0; } /* Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors */ static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source, const char *target, const char *type) { int mnt_fd, ret, exit_code = 0; struct stat st; ret = switch_ns(ns->ns_pid, &mnt_ns_desc, &mnt_fd); if (ret < 0) { pr_err("Can't switch mnt_ns\n"); goto out; } ret = mount(source, target, type, 0, NULL); if (ret < 0) { exit_code = -errno; goto restore_ns; } else { if (stat(target, &st) < 0) { pr_perror("Can't stat %s", 
target);
			exit_code = 0;
		} else {
			*s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev));
			exit_code = 1;
		}
	}

restore_ns:
	ret = restore_ns(mnt_fd, &mnt_ns_desc);
out:
	/* A failure to switch or restore the namespace overrides exit_code */
	return ret < 0 ? 0 : exit_code;
}

/*
 * Dumps the file system @mi sits on, using the fstype's dump callback.
 * The callback is tried on @mi and then on its bind peers, skipping
 * those which are not mounts of the FS root and those whose mountpoint
 * is unreachable. After the first successful dump the bind peers of the
 * dumped mount are marked as dumped.
 *
 * Returns 0 on success or when there is nothing to dump, a negative
 * value on error.
 */
static int dump_one_fs(struct mount_info *mi)
{
	struct mount_info *pm = mi;
	struct mount_info *t;
	bool first = true;

	if (mi->is_ns_root || mi->need_plugin || mnt_is_external(mi) || !mi->fstype->dump)
		return 0;

	/* mnt_bind is a cycled list, so list_for_each can't be used here. */
	for (; &pm->mnt_bind != &mi->mnt_bind || first;
	     pm = list_entry(pm->mnt_bind.next, typeof(*pm), mnt_bind)) {
		int ret;

		/* "first" lets the loop body run for mi itself */
		first = false;

		if (!fsroot_mounted(pm))
			continue;

		ret = pm->fstype->dump(pm);
		if (ret == MNT_UNREACHABLE)
			continue;
		if (ret < 0)
			return ret;

		list_for_each_entry(t, &pm->mnt_bind, mnt_bind)
			t->dumped = true;
		return 0;
	}

	pr_err("Unable to dump a file system for %d:%s\n",
			mi->mnt_id, mi->mountpoint);
	return -1;
}

/*
 * Writes one mount into @img as a MntEntry. For mounts, which are not
 * marked external themselves, the file system content is dumped first
 * (unless it is already dumped) and the fstype's check_bindmount
 * callback is consulted for non-fsroot mounts. Cr-time mounts are
 * not written to the image.
 * Returns 0 on success and -1 on error.
 */
static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
{
	MntEntry me = MNT_ENTRY__INIT;

	pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev,
			pm->root, pm->mountpoint);

	me.fstype		= pm->fstype->code;

	if (me.fstype == FSTYPE__AUTO)
		me.fsname = pm->fsname;

	if (!pm->external) {
		if (!pm->dumped && dump_one_fs(pm))
			return -1;

		if (!fsroot_mounted(pm) &&
		    pm->fstype->check_bindmount && pm->fstype->check_bindmount(pm))
			return -1;
	}

	if (pm->mnt_id == CRTIME_MNT_ID) {
		pr_info("Skip dumping cr-time mountpoint: %s\n", pm->mountpoint);
		return 0;
	}

	me.mnt_id		= pm->mnt_id;
	me.root_dev		= pm->s_dev;
	me.parent_mnt_id	= pm->parent_mnt_id;
	me.flags		= pm->flags;
	me.sb_flags		= pm->sb_flags;
	me.has_sb_flags		= true;
	/* The first character of the mountpoint is not stored in the image */
	me.mountpoint		= pm->mountpoint + 1;
	me.source		= pm->source;
	me.options		= pm->options;
	me.shared_id		= pm->shared_id;
	me.has_shared_id	= true;
	me.master_id		= pm->master_id;
	me.has_master_id	= true;
	if (pm->need_plugin) {
		me.has_with_plugin = true;
		me.with_plugin = true;
	}
	if (pm->deleted) {
		me.has_deleted	= true;
		me.deleted	= true;
	}

	if (pm->internal_sharing) {
me.has_internal_sharing = true; me.internal_sharing = true; } if (pm->external) /* * For external mount points dump the mapping's * value, see collect_mnt_from_image -> get_mp_root * for reverse mapping details. */ me.ext_key = pm->external; me.root = pm->root; if (pb_write_one(img, &me, PB_MNT)) return -1; return 0; } static void free_mntinfo(struct mount_info *pms) { while (pms) { struct mount_info *pm; pm = pms->next; mnt_entry_free(pms); pms = pm; } } struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump) { struct mount_info *pm; pm = parse_mountinfo(ns->ns_pid, ns, for_dump); if (!pm) { pr_err("Can't parse %d's mountinfo\n", ns->ns_pid); return NULL; } ns->mnt.mntinfo_tree = mnt_build_tree(pm, NULL); if (ns->mnt.mntinfo_tree == NULL) goto err; ns->mnt.mntinfo_list = pm; return pm; err: free_mntinfo(pm); return NULL; } static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms) { struct mount_info *pm; int ret = -1; struct cr_img *img; int ns_id = ns->id; pr_info("Dumping mountpoints\n"); img = open_image(CR_FD_MNTS, O_DUMP, ns_id); if (!img) goto err; for (pm = pms; pm && pm->nsid == ns; pm = pm->next) if (dump_one_mountpoint(pm, img)) goto err_i; ret = 0; err_i: close_image(img); err: return ret; } /* * _fn_f - pre-order traversal function * _fn_f - post-order traversal function * _plist - a postpone list. _el is added to this list, if _fn_f returns * a positive value, and all lower elements are not enumirated. 
*/
/*
 * NOTE: the macro body contains "return -1", so it can be used only
 * inside functions returning int: a negative result of _fn_f or a
 * non-zero result of _fn_r terminates the calling function.
 */
#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do { \
		struct mount_info *_mi = _r; \
		\
		while (1) { \
			int ret; \
			\
			list_del_init(&_mi->postpone); \
			\
			ret = _fn_f(_mi); \
			if (ret < 0) \
				return -1; \
			else if (ret > 0) { \
				list_add_tail(&_mi->postpone, _plist); \
				goto up; \
			} \
			\
			_prgs++; \
			\
			if (!list_empty(&_mi->children)) { \
				_mi = list_entry(_mi->children._el, \
						struct mount_info, siblings); \
				continue; \
			} \
	up: \
			if (_fn_r(_mi)) \
				return -1; \
			if (_mi == _r) \
				break; \
			if (_mi->siblings._el == &_mi->parent->children) { \
				_mi = _mi->parent; \
				goto up; \
			} \
			_mi = list_entry(_mi->siblings._el, \
					struct mount_info, siblings); \
		} \
	} while (0)

/*
 * Placeholder for an unused traversal function: "MNT_WALK_NONE(_mi)"
 * expands to "0 && (_mi)", i.e. to zero.
 */
#define MNT_WALK_NONE	0 &&

/*
 * Calls @fn for the mounts of the tree rooted at @start in the
 * pre-order. A mount, for which @fn returns a positive value, is
 * postponed together with its subtree and retried on the next pass.
 * The passes are repeated until nothing is postponed; a pass without
 * any progress is an error.
 * Returns 0 on success and -1 on error.
 */
static int mnt_tree_for_each(struct mount_info *start,
		int (*fn)(struct mount_info *))
{
	struct mount_info *tmp;
	LIST_HEAD(postpone);
	LIST_HEAD(postpone2);
	int progress;

	pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint);
	list_add(&start->postpone, &postpone);

again:
	progress = 0;

	/* Mounts postponed during this pass are collected in postpone2 */
	list_for_each_entry_safe(start, tmp, &postpone, postpone)
		MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress);

	if (!progress) {
		struct mount_info *m;

		pr_err("A few mount points can't be mounted\n");
		list_for_each_entry(m, &postpone2, postpone) {
			pr_err("%d:%d %s %s %s\n", m->mnt_id,
				m->parent_mnt_id, m->root,
				m->mountpoint, m->source);
		}
		return -1;
	}

	list_splice_init(&postpone2, &postpone);

	if (!list_empty(&postpone))
		goto again;

	return 0;

}

/*
 * Calls @fn for the mounts of the tree rooted at @m in the post-order,
 * following the prev links. Nothing is postponed here.
 * Returns 0 on success and -1 if @fn fails.
 */
static int mnt_tree_for_each_reverse(struct mount_info *m,
		int (*fn)(struct mount_info *))
{
	int progress = 0;

	MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, progress);

	return 0;
}

/*
 * Returns the string to be used as the source of the mount @mi,
 * or NULL (with an error logged) when no device can be found for it.
 */
static char *resolve_source(struct mount_info *mi)
{
	if (kdev_major(mi->s_dev) == 0)
		/*
		 * Anonymous block device. Kernel creates them for
		 * diskless mounts.
*/ return mi->source; if (mi->fstype->code == FSTYPE__AUTO) { struct stat st; char *val; val = external_lookup_by_key(mi->source); if (!IS_ERR_OR_NULL(val)) return val; if (!stat(mi->source, &st) && S_ISBLK(st.st_mode) && major(st.st_rdev) == kdev_major(mi->s_dev) && minor(st.st_rdev) == kdev_minor(mi->s_dev)) return mi->source; } pr_err("No device for %s mount\n", mi->mountpoint); return NULL; } static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave) { pr_debug("%d:%s private %d shared %d slave %d\n", mi->mnt_id, mi->mountpoint, private, shared, slave); if (mi->flags & MS_UNBINDABLE) { if (shared || slave) pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint); else return mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL); } if (private && mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) { pr_perror("Unable to make %s private", mi->mountpoint); return -1; } if (slave && mount(NULL, mi->mountpoint, NULL, MS_SLAVE, NULL)) { pr_perror("Unable to make %s slave", mi->mountpoint); return -1; } if (shared && mount(NULL, mi->mountpoint, NULL, MS_SHARED, NULL)) { pr_perror("Unable to make %s shared", mi->mountpoint); return -1; } return 0; } /* * Umount points, which are propagated in slave parents, because * we can't be sure, that they were inherited in a real life. */ static int umount_from_slaves(struct mount_info *mi) { struct mount_info *t; char *mpath, buf[PATH_MAX]; list_for_each_entry(t, &mi->parent->mnt_slave_list, mnt_slave) { if (!t->mounted) continue; mpath = mnt_get_sibling_path(mi, t, buf, sizeof(buf)); if (mpath == NULL) continue; pr_debug("\t\tUmount slave %s\n", mpath); if (umount(mpath) == -1) { pr_perror("Can't umount slave %s", mpath); return -1; } } return 0; } /* * If something is mounted in one shared point, it will be spread in * all other points from this shared group. 
*
 * Look at Documentation/filesystems/sharedsubtree.txt for more details
 *
 * Here the not yet mounted shared peers and slaves of @mi are marked to
 * be bind-mounted from @mi. The function always returns 0.
 */
static int propagate_siblings(struct mount_info *mi)
{
	struct mount_info *t;

	/*
	 * Find all mounts, which must be bind-mounted from this one
	 * to inherit shared group or master id
	 */
	list_for_each_entry(t, &mi->mnt_share, mnt_share) {
		if (t->mounted)
			continue;
		/* Keep a bind source, which is already from the same shared group */
		if (t->bind && t->bind->shared_id == t->shared_id)
			continue;
		pr_debug("\t\tBind share %s\n", t->mountpoint);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	list_for_each_entry(t, &mi->mnt_slave_list, mnt_slave) {
		if (t->mounted || t->bind)
			continue;
		pr_debug("\t\tBind slave %s\n", t->mountpoint);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	return 0;
}

/*
 * Accounts for the propagation of the just mounted @mi: marks the mounts
 * to be bind-mounted from it, umounts its copies from the slaves of the
 * parent and marks its copies in the parent's shared group as mounted.
 * Returns 0 on success and -1 if an expected copy isn't found.
 */
static int propagate_mount(struct mount_info *mi)
{
	struct mount_info *t;

	propagate_siblings(mi);

	if (!mi->parent)
		goto skip_parent;

	/* NOTE(review): the result is ignored -- confirm that it's deliberate */
	umount_from_slaves(mi);

	/* Propagate this mount to everyone from a parent group */

	list_for_each_entry(t, &mi->parent->mnt_share, mnt_share) {
		struct mount_info *c;
		char path[PATH_MAX], *mp;
		bool found = false;

		mp = mnt_get_sibling_path(mi, t, path, sizeof(path));
		if (mp == NULL)
			continue;

		list_for_each_entry(c, &t->children, siblings) {
			if (mounts_equal(mi, c) && !strcmp(mp, c->mountpoint)) {
				pr_debug("\t\tPropagate %s\n", c->mountpoint);

				/*
				 * When a mount is propagated, the result mount
				 * is always shared. If we want to get a private
				 * mount, we need to convert it.
*/
				/* NOTE(review): the results of the calls below are ignored */
				restore_shared_options(c, !c->shared_id, 0, 0);

				c->mounted = true;
				propagate_siblings(c);
				umount_from_slaves(c);
				found = true;
			}
		}

		/* Every peer of the parent, which sees this place, must have a copy */
		if (!found) {
			pr_err("Unable to find %s\n", mp);
			return -1;
		}
	}

skip_parent:
	/*
	 * FIXME Currently non-root mounts can be restored
	 * only if a proper root mount exists
	 */
	if (fsroot_mounted(mi) || mi->parent == root_yard_mp || mi->external) {
		/*
		 * Mark the not yet handled bind peers without a master,
		 * whose root passes the issubpath() test against mi's root,
		 * to be bind-mounted from mi.
		 */
		list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
			if (t->mounted)
				continue;
			if (t->bind)
				continue;
			if (t->master_id > 0)
				continue;
			if (!issubpath(t->root, mi->root))
				continue;
			pr_debug("\t\tBind private %s\n", t->mountpoint);
			t->bind = mi;
			t->s_dev_rt = mi->s_dev_rt;
		}
	}

	return 0;
}

/*
 * Stats @where and saves its device as the run-time device (s_dev_rt)
 * of the mount @m. Returns 0 on success and -1 if stat() fails.
 */
static int fetch_rt_stat(struct mount_info *m, const char *where)
{
	struct stat st;

	if (stat(where, &st)) {
		pr_perror("Can't stat on %s", where);
		return -1;
	}

	m->s_dev_rt = MKKDEV(major(st.st_dev), minor(st.st_dev));
	return 0;
}

/*
 * Here are a set of flags which we know how to handle for the one mount call.
 * All of them except MS_RDONLY are set only as mnt flags.
 * MS_RDONLY is set for both mnt and sb flags, so we can restore it for one
 * mount call only if it set for both masks.
*/ #define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | \ MS_NODIRATIME | MS_RELATIME | MS_RDONLY) static int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags) { return mount(src, mi->mountpoint, fstype, mountflags, mi->options); } static char *mnt_fsname(struct mount_info *mi) { if (mi->fstype->code == FSTYPE__AUTO) return mi->fsname; return mi->fstype->name; } static int apply_sb_flags(void *args, int fd, pid_t pid) { unsigned long flags = *(unsigned long *) args; int rst = -1, err = -1; char path[PSFDS]; snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); if (pid != getpid() && switch_ns(pid, &mnt_ns_desc, &rst)) return -1; err = mount(NULL, path, NULL, MS_REMOUNT | flags, NULL); if (err) pr_perror("Unable to remount %s", path); if (rst >= 0 && restore_ns(rst, &mnt_ns_desc)) return -1; return err; } static int do_new_mount(struct mount_info *mi) { unsigned long sflags = mi->sb_flags; unsigned long mflags = mi->flags & (~MS_PROPAGATE); char *src; struct fstype *tp = mi->fstype; bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY); mount_fn_t do_mount = (tp->mount) ? 
tp->mount : do_simple_mount; src = resolve_source(mi); if (!src) return -1; /* Merge superblock and mount flags if it's possible */ if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) { sflags |= mflags; mflags = 0; } if (remount_ro) sflags &= ~MS_RDONLY; if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) { pr_perror("Can't mount at %s", mi->mountpoint); return -1; } if (tp->restore && tp->restore(mi)) return -1; if (mi->mnt_id == CRTIME_MNT_ID) { /* C-r time mountpoint, umount it */ if (umount(mi->mountpoint) < 0) { pr_perror("Can't umount %s", mi->mountpoint); return -1; } goto out; } if (!mi->is_ns_root && remount_ro) { int fd; fd = open(mi->mountpoint, O_PATH); if (fd < 0) { pr_perror("Unable to open %s", mi->mountpoint); return -1; } sflags |= MS_RDONLY; if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { pr_perror("Unable to apply mount falgs %d for %s", mi->sb_flags, mi->mountpoint); close(fd); return -1; } close(fd); } if (mflags && mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { pr_perror("Unable to apply bind-mount options"); return -1; } /* * A slave should be mounted from do_bind_mount(). * Look at can_mount_now() for details. */ BUG_ON(mi->master_id); if (restore_shared_options(mi, !mi->shared_id, mi->shared_id, 0)) return -1; out: mi->mounted = true; return 0; } static int restore_ext_mount(struct mount_info *mi) { int ret; pr_debug("Restoring external bind mount %s\n", mi->mountpoint); ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL); if (ret) pr_err("Can't restore ext mount (%d)\n", ret); return ret; } static char mnt_clean_path[] = "/tmp/cr-tmpfs.XXXXXX"; static int mount_clean_path() { /* * To make a bind mount, we need to have access to a source directory, * which can be over-mounted. The idea is to mount a source mount in * an intermediate place without MS_REC and then create a target mounts. 
* This intermediate place should be a private mount to not affect * properties of the source mount. */ if (mkdtemp(mnt_clean_path) == NULL) { pr_perror("Unable to create a temporary directory"); return -1; } if (mount(mnt_clean_path, mnt_clean_path, NULL, MS_BIND, NULL)) { pr_perror("Unable to mount tmpfs into %s", mnt_clean_path); return -1; } if (mount(NULL, mnt_clean_path, NULL, MS_PRIVATE, NULL)) { pr_perror("Unable to mark %s as private", mnt_clean_path); return -1; } return 0; } static int umount_clean_path() { if (umount2(mnt_clean_path, MNT_DETACH)) { pr_perror("Unable to umount %s", mnt_clean_path); return -1; } if (rmdir(mnt_clean_path)) { pr_perror("Unable to remove %s", mnt_clean_path); } return 0; } static int do_bind_mount(struct mount_info *mi) { char mnt_fd_path[PSFDS]; char *root, *cut_root, rpath[PATH_MAX]; unsigned long mflags; int exit_code = -1, mp_len; bool shared = false; bool master = false; bool private = false; char *mnt_path = NULL; struct stat st; bool umount_mnt_path = false; struct mount_info *c; if (mi->need_plugin) { if (restore_ext_mount(mi)) return -1; goto out; } if (mi->external) { /* * We have / pointing to criu's ns root still, * so just use the mapping's path. The mountpoint * is tuned in collect_mnt_from_image to refer * to proper location in the namespace we restore. */ root = mi->external; private = !mi->master_id && (mi->internal_sharing || !mi->shared_id); goto do_bind; } shared = mi->shared_id && mi->shared_id == mi->bind->shared_id; master = mi->master_id && mi->master_id == mi->bind->master_id; private = !mi->master_id && !shared; cut_root = cut_root_for_bind(mi->root, mi->bind->root); /* Mount private can be initialized on mount() callback, which is * called only once. * It have to be copied to all it's sibling structures to provide users * of it with actual data. 
*/ mi->private = mi->bind->private; mnt_path = mi->bind->mountpoint; /* Access a mount by fd if mi->bind->mountpoint is overmounted */ if (mi->bind->fd >= 0) { snprintf(mnt_fd_path, sizeof(mnt_fd_path), "/proc/self/fd/%d", mi->bind->fd); mnt_path = mnt_fd_path; } if (cut_root[0] == 0) /* This case is handled by mi->bind->fd */ goto skip_overmount_check; /* * The target path may be over-mounted by one of child mounts * and we need to create a new bind-mount to get access to the path. */ mp_len = strlen(mi->bind->mountpoint); if (mp_len > 1) /* skip a joining / if mi->bind->mountpoint isn't "/" */ mp_len++; list_for_each_entry(c, &mi->bind->children, siblings) { if (!c->mounted) continue; if (issubpath(cut_root, c->mountpoint + mp_len)) break; /* a source path is overmounted */ } if (&c->siblings != &mi->bind->children) { /* Get a copy of mi->bind without child mounts */ if (mount(mnt_path, mnt_clean_path, NULL, MS_BIND, NULL)) { pr_perror("Unable to bind-mount %s to %s", mnt_path, mnt_clean_path); return -1; } mnt_path = mnt_clean_path; umount_mnt_path = true; } if (mnt_path == NULL) return -1; skip_overmount_check: snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root); root = rpath; do_bind: pr_info("\tBind %s to %s\n", root, mi->mountpoint); if (unlikely(mi->deleted)) { if (stat(mi->mountpoint, &st)) { pr_perror("Can't fetch stat on %s", mi->mountpoint); goto err; } if (S_ISDIR(st.st_mode)) { if (mkdir(root, (st.st_mode & ~S_IFMT))) { pr_perror("Can't re-create deleted directory %s", root); goto err; } } else if (S_ISREG(st.st_mode)) { int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, st.st_mode & ~S_IFMT); if (fd < 0) { pr_perror("Can't re-create deleted file %s", root); goto err; } close(fd); } else { pr_err("Unsupported st_mode 0%o deleted root %s\n", (int)st.st_mode, root); goto err; } } if (mount(root, mi->mountpoint, NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) { pr_perror("Can't mount at %s", mi->mountpoint); goto err; } mflags = mi->flags & 
(~MS_PROPAGATE); if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { pr_perror("Can't mount at %s", mi->mountpoint); goto err; } if (unlikely(mi->deleted)) { if (S_ISDIR(st.st_mode)) { if (rmdir(root)) { pr_perror("Can't remove deleted directory %s", root); goto err; } } else if (S_ISREG(st.st_mode)) { if (unlink(root)) { pr_perror("Can't unlink deleted file %s", root); goto err; } } } out: /* * shared - the mount is in the same shared group with mi->bind * mi->shared_id && !shared - create a new shared group */ if (restore_shared_options(mi, private, mi->shared_id && !shared, mi->master_id && !master)) return -1; mi->mounted = true; exit_code = 0; err: if (umount_mnt_path) { /* * If mnt_path was shared, a new mount may be propagated * into it. */ if (mount(NULL, mnt_path, NULL, MS_PRIVATE, NULL)) { pr_perror("Unable to make %s private", mnt_path); return -1; } if (umount2(mnt_path, MNT_DETACH)) { pr_perror("Unable to umount %s", mnt_path); return -1; } } return exit_code; } static bool rst_mnt_is_root(struct mount_info *m) { return (m->is_ns_root && m->nsid->id == root_item->ids->mnt_ns_id); } static bool can_mount_now(struct mount_info *mi) { if (rst_mnt_is_root(mi)) return true; if (mi->external) goto shared; /* * We're the slave peer: * - Make sure the master peer is already mounted * - Make sure all children is mounted as well to * eliminame mounts duplications */ if (mi->master_id > 0) { struct mount_info *c; if (mi->bind == NULL) return false; list_for_each_entry(c, &mi->bind->children, siblings) { if (!c->mounted) return false; } } if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) return false; shared: if (mi->parent->shared_id) { struct mount_info *p = mi->parent, *n; if (mi->parent->shared_id == mi->shared_id) { int rlen = strlen(mi->root); list_for_each_entry(n, &p->mnt_share, mnt_share) if (strlen(n->root) < rlen && !n->mounted) return false; } 
else { list_for_each_entry(n, &p->mnt_share, mnt_share) if (!n->mounted) return false; } } return true; } static int do_mount_root(struct mount_info *mi) { if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; return fetch_rt_stat(mi, mi->mountpoint); } static int do_close_one(struct mount_info *mi) { close_safe(&mi->fd); return 0; } static int do_mount_one(struct mount_info *mi) { int ret; if (mi->mounted) return 0; if (!can_mount_now(mi)) { pr_debug("Postpone slave %s\n", mi->mountpoint); return 1; } if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) { mi->parent->fd = open(mi->parent->mountpoint, O_PATH); if (mi->parent->fd < 0) { pr_perror("Unable to open %s", mi->mountpoint); return -1; } } pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin); if (rst_mnt_is_root(mi)) { /* do_mount_root() is called from populate_mnt_ns() */ if (mount(opts.root, mi->mountpoint, NULL, MS_BIND | MS_REC, NULL)) return -1; if (do_mount_root(mi)) return -1; mi->mounted = true; ret = 0; } else if (!mi->bind && !mi->need_plugin && !mi->external) ret = do_new_mount(mi); else ret = do_bind_mount(mi); if (ret == 0 && fetch_rt_stat(mi, mi->mountpoint)) return -1; if (ret == 0 && propagate_mount(mi)) return -1; if (mi->fstype->code == FSTYPE__UNSUPPORTED) { struct statfs st; if (statfs(mi->mountpoint, &st)) { pr_perror("Unable to statfs %s", mi->mountpoint); return -1; } if (st.f_type == BTRFS_SUPER_MAGIC) mi->fstype = find_fstype_by_name("btrfs"); } return ret; } static int do_umount_one(struct mount_info *mi) { if (!mi->parent) return 0; if (mount("none", mi->parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL)) { pr_perror("Can't mark %s as private", mi->parent->mountpoint); return -1; } if (umount(mi->mountpoint)) { pr_perror("Can't umount at %s", mi->mountpoint); return -1; } pr_info("Umounted at %s\n", mi->mountpoint); return 0; } /* * If a mount overmounts other mounts, it is restored separetly 
in the roots * yard and then moved to the right place. * * mnt_remap_entry is created for each such mount and it's added into * mnt_remap_list. The origin mount point is replaced on a new one in * roots_yard where it will be restored. The remapped mount will be * moved to the right places after restoring all mounts. */ static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs); static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len); static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { struct mount_info *mi; /* child is remaped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; static int do_remap_mount(struct mount_info *m) { int len; /* A path in root_yard has a fixed size, so it can be replaced. */ len = print_ns_root(m->nsid, remap_id, m->mountpoint, PATH_MAX); m->mountpoint[len] = '/'; return 0; } static int try_remap_mount(struct mount_info *m) { struct mnt_remap_entry *r; if (!does_mnt_overmount(m)) return 0; BUG_ON(!m->parent || !list_empty(&m->parent->mnt_share)); r = xmalloc(sizeof(struct mnt_remap_entry)); if (!r) return -1; r->mi = m; list_add(&r->node, &mnt_remap_list); return 0; } static int find_remap_mounts(struct mount_info *root) { struct mnt_remap_entry *r; struct mount_info *m; /* * It's impossible to change a tree without interrupting * enumeration, so on the first step mounts are added * into mnt_remap_list and then they are connected to root_yard_mp. 
*/ if (mnt_tree_for_each(root, try_remap_mount)) return -1; /* Move remapped mounts to root_yard */ list_for_each_entry(r, &mnt_remap_list, node) { m = r->mi; r->parent = m->parent; m->parent = root_yard_mp; list_del(&m->siblings); list_add(&m->siblings, &root_yard_mp->children); remap_id++; mnt_tree_for_each(m, do_remap_mount); pr_debug("Restore the %d mount in %s\n", m->mnt_id, m->mountpoint); } return 0; } /* Move remapped mounts to places where they have to be */ static int fixup_remap_mounts() { struct mnt_remap_entry *r; list_for_each_entry(r, &mnt_remap_list, node) { struct mount_info *m = r->mi; char path[PATH_MAX]; int len; strncpy(path, m->mountpoint, PATH_MAX); len = print_ns_root(m->nsid, 0, path, PATH_MAX); path[len] = '/'; pr_debug("Move mount %s -> %s\n", m->mountpoint, path); if (mount(m->mountpoint, path, NULL, MS_MOVE, NULL)) { pr_perror("Unable to move mount %s -> %s", m->mountpoint, path); return -1; } /* Insert child back to its place in the tree */ list_del(&r->mi->siblings); list_add(&r->mi->siblings, &r->parent->children); r->mi->parent = r->parent; } return 0; } static int cr_pivot_root(char *root) { char tmp_dir_tmpl[] = "crtools-put-root.XXXXXX"; bool tmp_dir = false; char *put_root = "tmp"; int exit_code = -1; struct stat st; pr_info("Move the root to %s\n", root ? 
: "."); if (root) { if (chdir(root)) { pr_perror("chdir(%s) failed", root); return -1; } } if (stat(put_root, &st) || !S_ISDIR(st.st_mode)) { put_root = mkdtemp(tmp_dir_tmpl); if (put_root == NULL) { pr_perror("Can't create a temporary directory"); return -1; } tmp_dir = true; } if (mount(put_root, put_root, NULL, MS_BIND, NULL)) { pr_perror("Unable to mount tmpfs in %s", put_root); goto err_root; } if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) { pr_perror("Can't remount %s with MS_PRIVATE", put_root); goto err_tmpfs; } if (pivot_root(".", put_root)) { pr_perror("pivot_root(., %s) failed", put_root); goto err_tmpfs; } if (mount("none", put_root, "none", MS_REC|MS_SLAVE, NULL)) { pr_perror("Can't remount root with MS_PRIVATE"); return -1; } exit_code = 0; if (umount2(put_root, MNT_DETACH)) { pr_perror("Can't umount %s", put_root); return -1; } err_tmpfs: if (umount2(put_root, MNT_DETACH)) { pr_perror("Can't umount %s", put_root); return -1; } err_root: if (tmp_dir && rmdir(put_root)) { pr_perror("Can't remove the directory %s", put_root); return -1; } return exit_code; } struct mount_info *mnt_entry_alloc() { struct mount_info *new; /* * We rely on xzalloc here for MOUNT_INVALID_DEV. */ BUILD_BUG_ON(MOUNT_INVALID_DEV); new = xzalloc(sizeof(struct mount_info)); if (new) { new->fd = -1; INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->siblings); INIT_LIST_HEAD(&new->mnt_slave_list); INIT_LIST_HEAD(&new->mnt_share); INIT_LIST_HEAD(&new->mnt_bind); INIT_LIST_HEAD(&new->postpone); } return new; } void mnt_entry_free(struct mount_info *mi) { if (mi) { xfree(mi->root); xfree(mi->mountpoint); xfree(mi->source); xfree(mi->options); xfree(mi->fsname); xfree(mi); } } /* * Helper for getting a path to where the namespace's root * is re-constructed. 
*/ static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs) { return snprintf(buf, bs, "%s/%d-%010d", mnt_roots, ns->id, remap_id); } static int create_mnt_roots(void) { int exit_code = -1; if (mnt_roots) return 0; mnt_roots = xstrdup("/tmp/.criu.mntns.XXXXXX"); if (mnt_roots == NULL) goto out; if (mkdtemp(mnt_roots) == NULL) { pr_perror("Unable to create a temporary directory"); mnt_roots = NULL; goto out; } chmod(mnt_roots, 0777); exit_code = 0; out: return exit_code; } static int get_mp_root(MntEntry *me, struct mount_info *mi) { char *ext = NULL; BUG_ON(me->ext_mount && me->ext_key); /* Forward compatibility fixup */ if (me->ext_mount) { me->ext_key = me->root; /* * Puting the id of external mount which is provided by user, * to ->root can confuse mnt_is_external and other functions * which expect to see the path in the file system to the root * of these mount (mounts_equal, mnt_build_ids_tree, * find_wider_shared, find_fsroot_mount_for, * find_best_external_match, etc.) */ me->root = NO_ROOT_MOUNT; } mi->root = xstrdup(me->root); if (!mi->root) return -1; if (!me->ext_key) goto out; /* * External mount point -- get the reverse mapping * from the command line and put into root's place */ ext = ext_mount_lookup(me->ext_key); if (!ext) { if (!opts.autodetect_ext_mounts) { pr_err("No mapping for %s mountpoint\n", me->mountpoint); return -1; } /* * Make up an external mount entry for this * mount point, since we couldn't find a user * supplied one. * * The 'val' was put into mi->source during * dump by resolve_external_mounts(). */ ext = mi->source; } mi->external = ext; out: pr_debug("\t\tWill mount %d from %s%s\n", mi->mnt_id, ext ? : mi->root, ext ? 
" (E)" : ""); return 0; } static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len) { int len; len = strlen(mountpoint) + root_len + 1; mi->mountpoint = xmalloc(len); if (!mi->mountpoint) return -1; /* * For bind-mounts we would also fix the root here * too, but bind-mounts restore merges mountpoint * and root paths together, so there's no need in * that. */ strcpy(mi->mountpoint, root); strcpy(mi->mountpoint + root_len, mountpoint); mi->ns_mountpoint = mi->mountpoint + root_len; pr_debug("\t\tWill mount %d @ %s\n", mi->mnt_id, mi->mountpoint); return 0; } static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) { MntEntry *me = NULL; int ret, root_len = 1; struct cr_img *img; char root[PATH_MAX] = "."; img = open_image(CR_FD_MNTS, O_RSTR, nsid->id); if (!img) return -1; root_len = print_ns_root(nsid, 0, root, sizeof(root)); pr_debug("Reading mountpoint images (id %d pid %d)\n", nsid->id, (int)nsid->ns_pid); while (1) { struct mount_info *pm; ret = pb_read_one_eof(img, &me, PB_MNT); if (ret <= 0) break; pm = mnt_entry_alloc(); if (!pm) goto err; pm->nsid = nsid; pm->next = *pms; *pms = pm; pm->mnt_id = me->mnt_id; pm->parent_mnt_id = me->parent_mnt_id; pm->s_dev = me->root_dev; pm->flags = me->flags; pm->sb_flags = me->sb_flags; if (!me->has_sb_flags) { const unsigned int mflags = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE | MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME | MS_RELATIME; /* * In old images mnt and sb flags are saved together. * Here we separate them and save the old logic about MS_RDONLY. 
*/ pm->sb_flags = pm->flags & ~mflags; pm->flags = pm->flags & mflags; } pm->shared_id = me->shared_id; pm->master_id = me->master_id; pm->need_plugin = me->with_plugin; pm->deleted = me->deleted; pm->is_ns_root = is_root(me->mountpoint); if (me->has_internal_sharing) pm->internal_sharing = me->internal_sharing; pm->source = xstrdup(me->source); if (!pm->source) goto err; pm->options = xstrdup(me->options); if (!pm->options) goto err; if (me->fstype != FSTYPE__AUTO && me->fsname) { pr_err("fsname can be set only for FSTYPE__AUTO mounts\n"); goto err; } /* FIXME: abort unsupported early */ pm->fstype = decode_fstype(me->fstype); if (pm->fstype->collect && (pm->fstype->collect(pm) < 0)) goto err; if (me->fsname) { pm->fsname = xstrdup(me->fsname); if (!pm->fsname) goto err; } if (get_mp_root(me, pm)) goto err; if (get_mp_mountpoint(me->mountpoint, pm, root, root_len)) goto err; pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint); } if (me) mnt_entry__free_unpacked(me, NULL); close_image(img); return 0; err: close_image(img); return -1; } int read_mnt_ns_img(void) { struct mount_info *pms = NULL; struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; if (collect_mnt_from_image(&pms, nsid)) return -1; } mntinfo = pms; return 0; } int rst_get_mnt_root(int mnt_id, char *path, int plen) { struct mount_info *m; if (!(root_ns_mask & CLONE_NEWNS) || mnt_id == -1) goto rroot; m = lookup_mnt_id(mnt_id); if (m == NULL) return -1; return print_ns_root(m->nsid, 0, path, plen); rroot: path[0] = '/'; path[1] = '\0'; return 1; } int mntns_maybe_create_roots(void) { if (!(root_ns_mask & CLONE_NEWNS)) return 0; return create_mnt_roots(); } static int do_restore_task_mnt_ns(struct ns_id *nsid, struct pstree_item *current) { int fd; fd = open_proc(vpid(root_item), "fd/%d", nsid->mnt.ns_fd); if (fd < 0) return -1; if (setns(fd, CLONE_NEWNS)) { pr_perror("Can't restore mntns"); close(fd); return -1; } close(fd); return 0; 
} int restore_task_mnt_ns(struct pstree_item *current) { if ((root_ns_mask & CLONE_NEWNS) == 0) return 0; if (current->ids && current->ids->has_mnt_ns_id) { unsigned int id = current->ids->mnt_ns_id; struct ns_id *nsid; /* * Regardless of the namespace a task wants to * live in, by that point they all will live in * root's one (see prepare_pstree_kobj_ids() + * get_clone_mask()). So if the current task's * target namespace is the root's one -- it's * already there, otherwise it will have to do * setns(). */ if (current->parent && id == current->parent->ids->mnt_ns_id) return 0; nsid = lookup_ns_by_id(id, &mnt_ns_desc); if (nsid == NULL) { pr_err("Can't find mount namespace %d\n", id); return -1; } BUG_ON(nsid->type == NS_CRIU); if (do_restore_task_mnt_ns(nsid, current)) return -1; } return 0; } void fini_restore_mntns(void) { struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; close_safe(&nsid->mnt.ns_fd); close_safe(&nsid->mnt.root_fd); nsid->ns_populated = true; } } /* * All nested mount namespaces are restore as sub-trees of the root namespace. 
*/ static int populate_roots_yard(void) { struct mnt_remap_entry *r; char path[PATH_MAX]; struct ns_id *nsid; if (make_yard(mnt_roots)) return -1; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; print_ns_root(nsid, 0, path, sizeof(path)); if (mkdir(path, 0600)) { pr_perror("Unable to create %s", path); return -1; } } /* * mnt_remap_list is filled in find_remap_mounts() and * contains mounts which has to be restored separatly */ list_for_each_entry(r, &mnt_remap_list, node) { if (mkdirpat(AT_FDCWD, r->mi->mountpoint, 0755)) { pr_perror("Unable to create %s", r->mi->mountpoint); return -1; } } return 0; } static int populate_mnt_ns(void) { struct mount_info *pms; struct ns_id *nsid; int ret; if (mnt_roots) { /* mnt_roots is a tmpfs mount and it's private */ root_yard_mp = mnt_entry_alloc(); if (!root_yard_mp) return -1; root_yard_mp->mountpoint = mnt_roots; root_yard_mp->mounted = true; } pms = mnt_build_tree(mntinfo, root_yard_mp); if (!pms) return -1; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* Add to mount tree. Generic code will mount it later */ ret = add_cr_time_mount(pms, "binfmt_misc", BINFMT_MISC_HOME, 0); if (ret) return -1; } #endif if (resolve_shared_mounts(mntinfo, pms->master_id)) return -1; for (nsid = ns_ids; nsid; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; /* * Make trees of all namespaces look the * same, so that manual paths resolution * works on them. 
*/ nsid->mnt.mntinfo_tree = pms; } if (validate_mounts(mntinfo, false)) return -1; if (find_remap_mounts(pms)) return -1; if (populate_roots_yard()) return -1; if (mount_clean_path()) return -1; ret = mnt_tree_for_each(pms, do_mount_one); mnt_tree_for_each(pms, do_close_one); if (ret == 0 && fixup_remap_mounts()) return -1; if (umount_clean_path()) return -1; return ret; } int __depopulate_roots_yard(void) { int ret = 0; if (mnt_roots == NULL) return 0; if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL)) { pr_perror("Can't remount root with MS_PRIVATE"); ret = 1; } /* * Don't exit after a first error, because this function * can be used to rollback in a error case. * Don't worry about MNT_DETACH, because files are restored after this * and nobody will not be restored from a wrong mount namespace. */ if (umount2(mnt_roots, MNT_DETACH)) { pr_perror("Can't unmount %s", mnt_roots); ret = -1; } if (rmdir(mnt_roots)) { pr_perror("Can't remove the directory %s", mnt_roots); ret = -1; } return ret; } int depopulate_roots_yard(int mntns_fd, bool only_ghosts) { int ret = 0, old_cwd = -1, old_ns = -1; if (mntns_fd < 0) { ret |= try_clean_remaps(only_ghosts); cleanup_mnt_ns(); return ret; } pr_info("Switching to new ns to clean ghosts\n"); old_cwd = open(".", O_PATH); if (old_cwd < 0) { pr_perror("Unable to open cwd"); return -1; } old_ns = open_proc(PROC_SELF, "ns/mnt"); if (old_ns < 0) { pr_perror("`- Can't keep old ns"); close(old_cwd); return -1; } if (setns(mntns_fd, CLONE_NEWNS) < 0) { pr_perror("`- Can't switch"); close(old_ns); close(old_cwd); return -1; } if (try_clean_remaps(only_ghosts)) ret = -1; if (__depopulate_roots_yard()) ret = -1; if (setns(old_ns, CLONE_NEWNS) < 0) { pr_perror("Fail to switch back!"); ret = -1; } close(old_ns); if (fchdir(old_cwd)) { pr_perror("Unable to restore cwd"); ret = -1; } close(old_cwd); return ret; } void cleanup_mnt_ns(void) { char path[PATH_MAX], *root = opts.root ? 
: "/"; if (mnt_roots == NULL) return; snprintf(path, sizeof(path), "%s/%s", root, mnt_roots); if (rmdir(path)) pr_perror("Can't remove the directory %s", mnt_roots); } int prepare_mnt_ns(void) { int ret = -1, rst = -1; struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc }; struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return 0; pr_info("Restoring mount namespace\n"); if (!opts.root) { struct mount_info *old; if (chdir("/")) { pr_perror("chdir(\"/\") failed"); return -1; } old = collect_mntinfo(&ns, false); if (old == NULL) return -1; /* * The new mount namespace is filled with the mountpoint * clones from the original one. We have to umount them * prior to recreating new ones. */ pr_info("Cleaning mount namespace\n"); if (mnt_tree_for_each_reverse(ns.mnt.mntinfo_tree, do_umount_one)) { free_mntinfo(old); return -1; } free_mntinfo(old); } ret = populate_mnt_ns(); if (ret) return -1; rst = open_proc(PROC_SELF, "ns/mnt"); if (rst < 0) return -1; /* resotre non-root namespaces */ for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { char path[PATH_MAX]; if (nsid->nd != &mnt_ns_desc) continue; /* Create the new mount namespace */ if (unshare(CLONE_NEWNS)) { pr_perror("Unable to create a new mntns"); goto err; } if (nsid->type == NS_ROOT) { int fd; /* * We need to create a mount namespace which will be * used to clean up remap files * (depopulate_roots_yard). The namespace where mounts * was restored has to be restored as a root mount * namespace, because there are file descriptors * linked with it (e.g. to bind-mount slave pty-s). 
*/ fd = open_proc(PROC_SELF, "ns/mnt"); if (fd < 0) goto err; if (setns(rst, CLONE_NEWNS)) { pr_perror("Can't restore mntns back"); goto err; } nsid->mnt.ns_fd = rst; rst = fd; } else { /* Pin one with a file descriptor */ nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt"); if (nsid->mnt.ns_fd < 0) goto err; } /* Set its root */ path[0] = '/'; print_ns_root(nsid, 0, path + 1, sizeof(path) - 1); if (cr_pivot_root(path)) goto err; /* root_fd is used to restore file mappings */ nsid->mnt.root_fd = open_proc(PROC_SELF, "root"); if (nsid->mnt.root_fd < 0) goto err; /* And return back to regain the access to the roots yard */ if (setns(rst, CLONE_NEWNS)) { pr_perror("Can't restore mntns back"); goto err; } } close(rst); return ret; err: if (rst >= 0) restore_ns(rst, &mnt_ns_desc); return -1; } static int mntns_root_pid = -1; static int mntns_set_root_fd(pid_t pid, int fd) { int ret; ret = install_service_fd(ROOT_FD_OFF, fd); if (ret >= 0) mntns_root_pid = pid; close(fd); return ret; } int __mntns_get_root_fd(pid_t pid) { int fd, pfd; int ret; char path[PATH_MAX + 1]; if (mntns_root_pid == pid) /* The required root is already opened */ return get_service_fd(ROOT_FD_OFF); close_service_fd(ROOT_FD_OFF); if (!(root_ns_mask & CLONE_NEWNS)) { /* * If criu and tasks we dump live in the same mount * namespace, we can just open the root directory. * All paths resolution would occur relative to criu's * root. Even if it is not namespace's root, provided * file paths are resolved, we'd get consistent dump. */ fd = open("/", O_RDONLY | O_DIRECTORY); if (fd < 0) { pr_perror("Can't open root"); return -1; } goto set_root; } /* * If /proc/pid/root links on '/', it signs that a root of the task * and a root of mntns is the same. 
*/ pfd = open_pid_proc(pid); ret = readlinkat(pfd, "root", path, sizeof(path) - 1); if (ret < 0) { close_pid_proc(); return ret; } path[ret] = '\0'; if (ret != 1 || path[0] != '/') { pr_err("The root task has another root than mntns: %s\n", path); close_pid_proc(); return -1; } fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0); close_pid_proc(); if (fd < 0) { pr_perror("Can't open the task root"); return -1; } set_root: return mntns_set_root_fd(pid, fd); } int mntns_get_root_fd(struct ns_id *mntns) { if (!(root_ns_mask & CLONE_NEWNS)) return __mntns_get_root_fd(0); /* * All namespaces are restored from the root task and during the * CR_STATE_FORKING stage the root task has two file descriptors for * each mntns. One is associated with a namespace and another one is a * root of this mntns. * * When a non-root task is forked, it enters into a proper mount * namespace, restores private mappings and forks children. Some of * these mappings can be associated with files from other namespaces. * * After the CR_STATE_FORKING stage the root task has to close all * mntns file descriptors to restore its descriptors and at this moment * we know that all tasks live in their mount namespaces. * * If we find that a mount namespace isn't populated, we can get its * root from the root task. */ if (!mntns->ns_populated) { int fd; fd = open_proc(vpid(root_item), "fd/%d", mntns->mnt.root_fd); if (fd < 0) return -1; return mntns_set_root_fd(mntns->ns_pid, fd); } return __mntns_get_root_fd(mntns->ns_pid); } struct ns_id *lookup_nsid_by_mnt_id(int mnt_id) { struct mount_info *mi; /* * Kernel before 3.15 doesn't show mnt_id for file descriptors. * mnt_id isn't saved for files, if mntns isn't dumped. * In both these cases we have only one root, so here * is not matter which mount will be restured. */ if (mnt_id == -1) mi = mntinfo; else mi = lookup_mnt_id(mnt_id); return mi ? 
mi->nsid : NULL; } int mntns_get_root_by_mnt_id(int mnt_id) { struct ns_id *mntns = NULL; if (root_ns_mask & CLONE_NEWNS) { mntns = lookup_nsid_by_mnt_id(mnt_id); BUG_ON(mntns == NULL); } return mntns_get_root_fd(mntns); } struct collect_mntns_arg { bool need_to_validate; bool for_dump; int root_master_id; }; static int collect_mntns(struct ns_id *ns, void *__arg) { struct collect_mntns_arg *arg = __arg; struct mount_info *pms; pms = collect_mntinfo(ns, arg->for_dump); if (!pms) return -1; if (arg->for_dump && ns->type != NS_CRIU) arg->need_to_validate = true; mntinfo_add_list(pms); if (arg->need_to_validate && ns->id == root_item->ids->mnt_ns_id) arg->root_master_id = ns->mnt.mntinfo_tree->master_id; return 0; } int collect_mnt_namespaces(bool for_dump) { struct collect_mntns_arg arg; int ret; arg.for_dump = for_dump; arg.need_to_validate = false; ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg); if (ret) goto err; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (for_dump && !opts.has_binfmt_misc) { unsigned int s_dev = 0; struct ns_id *ns; for (ns = ns_ids; ns != NULL; ns = ns->next) { if (ns->type == NS_ROOT && ns->nd == &mnt_ns_desc) break; } if (ns) { ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, "binfmt_misc"); if (ret == -EPERM) pr_info("Can't mount binfmt_misc: EPERM. 
Running in user_ns?\n"); else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) { pr_err("Can't mount binfmt_misc: %d %s\n", ret, strerror(-ret)); goto err; } else if (ret == 0) { ret = -1; goto err; } else if (ret > 0 && add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, s_dev) < 0) { ret = -1; goto err; } } } #endif ret = resolve_external_mounts(mntinfo); if (ret) goto err; if (arg.need_to_validate) { ret = -1; if (resolve_shared_mounts(mntinfo, arg.root_master_id)) goto err; if (validate_mounts(mntinfo, true)) goto err; } ret = 0; err: return ret; } int dump_mnt_namespaces(void) { struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return 0; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc || nsid->type == NS_CRIU) continue; if ((nsid->type == NS_OTHER) && check_mnt_id()) { pr_err("Nested mount namespaces are not supported " "without mnt_id in fdinfo\n"); return -1; } if (dump_mnt_ns(nsid, nsid->mnt.mntinfo_list)) return -1; } return 0; } void clean_cr_time_mounts(void) { struct mount_info *mi; int mnt_fd, ret; for (mi = mntinfo; mi; mi = mi->next) { if (mi->mnt_id != CRTIME_MNT_ID) continue; ret = switch_ns(mi->nsid->ns_pid, &mnt_ns_desc, &mnt_fd); if (ret) { pr_err("Can't switch to pid's %u mnt_ns\n", mi->nsid->ns_pid); continue; } if (umount(mi->mountpoint) < 0) pr_perror("Can't umount forced mount %s", mi->mountpoint); if (restore_ns(mnt_fd, &mnt_ns_desc)) { pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n"); return; } } } struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt"); criu-3.6/criu/namespaces.c000066400000000000000000001044771317335042600155720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "page.h" #include "rst-malloc.h" #include "cr_options.h" #include "imgset.h" #include "uts_ns.h" #include "ipc_ns.h" #include "mount.h" #include 
/*
 * Check that @str, when present, is a decimal integer in [0, 65535].
 *
 * A NULL pointer or an empty string means "not specified" and is
 * accepted.
 *
 * Returns 0 if @str is absent or valid, -1 otherwise. On a successful
 * parse errno is reset to 0; on failure errno is left as EINVAL (or
 * ERANGE when strtol() overflowed).
 */
static int check_int_str(char *str)
{
	char *endptr;
	long val;

	if (str == NULL || *str == '\0')
		return 0;

	/* Pre-set so that every rejection below reports EINVAL by default. */
	errno = EINVAL;
	val = strtol(str, &endptr, 10);
	if (errno == ERANGE ||		/* value does not fit in long */
	    endptr == str ||		/* no digits at all */
	    *endptr != '\0' ||		/* trailing garbage */
	    val < 0 || val > 65535)	/* outside the accepted range */
		return -1;

	errno = 0;
	return 0;
}
return -1; jn = xmalloc(sizeof(*jn)); if (!jn) return -1; jn->ns_file = ns_file; if (!strncmp(type, "net", 4)) { jn->nd = &net_ns_desc; join_ns_flags |= CLONE_NEWNET; } else if (!strncmp(type, "uts", 4)) { jn->nd = &uts_ns_desc; join_ns_flags |= CLONE_NEWUTS; } else if (!strncmp(type, "ipc", 4)) { jn->nd = &ipc_ns_desc; join_ns_flags |= CLONE_NEWIPC; } else if (!strncmp(type, "pid", 4)) { pr_err("join-ns pid namespace not supported\n"); goto err; } else if (!strncmp(type, "user", 5)) { jn->nd = &user_ns_desc; if (set_user_extra_opts(jn, extra_opts)) { pr_err("invalid user namespace extra_opts %s\n", extra_opts); goto err; } join_ns_flags |= CLONE_NEWUSER; } else if (!strncmp(type, "mnt", 4)) { jn->nd = &mnt_ns_desc; join_ns_flags |= CLONE_NEWNS; } else { pr_err("invalid namespace type %s\n", type); goto err; } list_add_tail(&jn->list, &opts.join_ns); pr_info("Added %s:%s join namespace\n", type, ns_file); return 0; err: xfree(jn); return -1; } static unsigned int parse_ns_link(char *link, size_t len, struct ns_desc *d) { unsigned long kid = 0; char *end; if (len >= d->len + 2) { if (link[d->len] == ':' && !memcmp(link, d->str, d->len)) { kid = strtoul(&link[d->len + 2], &end, 10); if (end && *end == ']') BUG_ON(kid > UINT_MAX); else kid = 0; } } return (unsigned int)kid; } bool check_ns_proc(struct fd_link *link) { unsigned int i, kid; for (i = 0; i < ARRAY_SIZE(ns_desc_array); i++) { kid = parse_ns_link(link->name + 1, link->len - 1, ns_desc_array[i]); if (!kid) continue; link->ns_d = ns_desc_array[i]; link->ns_kid = kid; return true; } return false; } int switch_ns(int pid, struct ns_desc *nd, int *rst) { int nsfd; int ret; nsfd = open_proc(pid, "ns/%s", nd->str); if (nsfd < 0) return -1; ret = switch_ns_by_fd(nsfd, nd, rst); close(nsfd); return ret; } int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst) { int ret = -1; if (rst) { *rst = open_proc(PROC_SELF, "ns/%s", nd->str); if (*rst < 0) goto err_ns; } ret = setns(nsfd, nd->cflag); if (ret < 0) { 
pr_perror("Can't setns %d/%s", nsfd, nd->str); goto err_set; } return 0; err_set: if (rst) close(*rst); err_ns: return -1; } int restore_ns(int rst, struct ns_desc *nd) { int ret; ret = setns(rst, nd->cflag); if (ret < 0) pr_perror("Can't restore ns back"); close(rst); return ret; } struct ns_id *ns_ids = NULL; static unsigned int ns_next_id = 1; unsigned long root_ns_mask = 0; static void nsid_add(struct ns_id *ns, struct ns_desc *nd, unsigned int id, pid_t pid) { ns->nd = nd; ns->id = id; ns->ns_pid = pid; ns->next = ns_ids; ns_ids = ns; pr_info("Add %s ns %d pid %d\n", nd->str, ns->id, ns->ns_pid); } struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct ns_desc *nd, enum ns_type type) { struct ns_id *nsid; nsid = shmalloc(sizeof(*nsid)); if (nsid) { nsid->type = type; nsid_add(nsid, nd, id, pid); nsid->ns_populated = false; } return nsid; } int rst_add_ns_id(unsigned int id, struct pstree_item *i, struct ns_desc *nd) { pid_t pid = vpid(i); struct ns_id *nsid; nsid = lookup_ns_by_id(id, nd); if (nsid) { if (pid_rst_prio(pid, nsid->ns_pid)) nsid->ns_pid = pid; return 0; } nsid = rst_new_ns_id(id, pid, nd, i == root_item ? NS_ROOT : NS_OTHER); if (nsid == NULL) return -1; return 0; } static struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd) { struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) if (nsid->kid == kid && nsid->nd == nd) return nsid; return NULL; } struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd) { struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) if (nsid->id == id && nsid->nd == nd) return nsid; return NULL; } /* * For all namespaces we support, there are two supported * tasks-to-namespaces layout. * * If root task lives in the same namespace as criu does * all other tasks should live in it too and we do NOT dump * this namespace. On restore tasks inherit the respective * namespace from criu. 
* * If root task lives in its own namespace, then all other * tasks may live in it. Sometimes (CLONE_SUBNS) there can * be more than one namespace of that type. For this case * we dump all namespace's info and recreate them on restore. */ int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg) { int ret = 0; struct ns_id *ns; for (ns = ns_ids; ns != NULL; ns = ns->next) { if (ns->nd != nd) continue; if (ns->type == NS_CRIU) { if (root_ns_mask & nd->cflag) continue; ret = cb(ns, oarg); break; } ret = cb(ns, oarg); if (ret) break; } return ret; } static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd, struct ns_id **ns_ret) { struct ns_id *nsid; enum ns_type type; nsid = lookup_ns_by_kid(kid, nd); if (nsid) goto found; if (pid != getpid()) { type = NS_OTHER; if (pid == root_item->pid->real) { BUG_ON(root_ns_mask & nd->cflag); pr_info("Will take %s namespace in the image\n", nd->str); root_ns_mask |= nd->cflag; type = NS_ROOT; } else if (nd->cflag & ~CLONE_SUBNS) { pr_err("Can't dump nested %s namespace for %d\n", nd->str, pid); return 0; } } else type = NS_CRIU; nsid = xzalloc(sizeof(*nsid)); if (!nsid) return 0; nsid->type = type; nsid->kid = kid; nsid->ns_populated = true; nsid_add(nsid, nd, ns_next_id++, pid); found: if (ns_ret) *ns_ret = nsid; return nsid->id; } static unsigned int __get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported, struct ns_id **ns) { int proc_dir; unsigned int kid; char ns_path[10]; struct stat st; proc_dir = open_pid_proc(pid); if (proc_dir < 0) return 0; sprintf(ns_path, "ns/%s", nd->str); if (fstatat(proc_dir, ns_path, &st, 0)) { if (errno == ENOENT) { /* The namespace is unsupported */ kid = 0; goto out; } pr_perror("Unable to stat %s", ns_path); return 0; } kid = st.st_ino; BUG_ON(!kid); out: if (supported) *supported = kid != 0; return generate_ns_id(pid, kid, nd, ns); } static unsigned int get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported) { 
/*
 * Write the image entry describing an open namespace file.
 *
 * @lfd: not used by this routine
 * @id:  file id to record in the entry
 * @p:   descriptor parameters; p->link must carry the namespace
 *       descriptor and kernel id, p->flags the open flags
 *
 * Returns the result of pb_write_one(), or -1 when no collected
 * namespace matches the link's kernel id.
 */
int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p)
{
	struct fd_link *lnk = p->link;
	struct ns_id *ns = lookup_ns_by_kid(lnk->ns_kid, lnk->ns_d);
	NsFileEntry ns_entry = NS_FILE_ENTRY__INIT;
	FileEntry file_entry = FILE_ENTRY__INIT;

	if (ns == NULL) {
		pr_err("No NS ID with kid %u\n", lnk->ns_kid);
		return -1;
	}

	/* Namespace-specific part of the record. */
	ns_entry.id = id;
	ns_entry.ns_id = ns->id;
	ns_entry.ns_cflag = lnk->ns_d->cflag;
	ns_entry.flags = p->flags;

	/* Generic file record wrapping the namespace part. */
	file_entry.type = FD_TYPES__NS;
	file_entry.id = ns_entry.id;
	file_entry.nsf = &ns_entry;

	return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &file_entry, PB_FILE);
}
*/ for_each_pstree_item(t) { TaskKobjIdsEntry *ids = t->ids; if (ids->pid_ns_id == nfi->nfe->ns_id) { item = t; nd = &pid_ns_desc; break; } else if (ids->net_ns_id == nfi->nfe->ns_id) { item = t; nd = &net_ns_desc; break; } else if (ids->ipc_ns_id == nfi->nfe->ns_id) { item = t; nd = &ipc_ns_desc; break; } else if (ids->uts_ns_id == nfi->nfe->ns_id) { item = t; nd = &uts_ns_desc; break; } else if (ids->mnt_ns_id == nfi->nfe->ns_id) { item = t; nd = &mnt_ns_desc; break; } else if (ids->cgroup_ns_id == nfi->nfe->ns_id) { item = t; nd = &cgroup_ns_desc; break; } } if (!nd || !item) { pr_err("Can't find suitable NS ID for %#x\n", nfi->nfe->ns_id); return -1; } if (nd->cflag != nfi->nfe->ns_cflag) { pr_err("Clone flag mismatch for %#x\n", nfi->nfe->ns_id); return -1; } snprintf(path, sizeof(path) - 1, "/proc/%d/ns/%s", vpid(item), nd->str); path[sizeof(path) - 1] = '\0'; fd = open(path, nfi->nfe->flags); if (fd < 0) { pr_perror("Can't open file %s on restore", path); return fd; } *new_fd = fd; return 0; } static struct file_desc_ops ns_desc_ops = { .type = FD_TYPES__NS, .open = open_ns_fd, }; static int collect_one_nsfile(void *o, ProtobufCMessage *base, struct cr_img *img) { struct ns_file_info *nfi = o; nfi->nfe = pb_msg(base, NsFileEntry); pr_info("Collected ns file ID %#x NS-ID %#x\n", nfi->nfe->id, nfi->nfe->ns_id); return file_desc_add(&nfi->d, nfi->nfe->id, &ns_desc_ops); } struct collect_image_info nsfile_cinfo = { .fd_type = CR_FD_NS_FILES, .pb_type = PB_NS_FILE, .priv_size = sizeof(struct ns_file_info), .collect = collect_one_nsfile, }; /* * Same as dump_task_ns_ids(), but * a) doesn't keep IDs (don't need them) * b) generates them for mount and netns only * mnt ones are needed for open_mount() in * inotify pred-dump * net ones are needed for parasite socket */ int predump_task_ns_ids(struct pstree_item *item) { int pid = item->pid->real; if (!__get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns)) return -1; if (!get_ns_id(pid, &mnt_ns_desc, NULL)) return 
-1; return 0; } int dump_task_ns_ids(struct pstree_item *item) { int pid = item->pid->real; TaskKobjIdsEntry *ids = item->ids; ids->has_pid_ns_id = true; ids->pid_ns_id = get_ns_id(pid, &pid_ns_desc, NULL); if (!ids->pid_ns_id) { pr_err("Can't make pidns id\n"); return -1; } ids->has_net_ns_id = true; ids->net_ns_id = __get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns); if (!ids->net_ns_id) { pr_err("Can't make netns id\n"); return -1; } ids->has_ipc_ns_id = true; ids->ipc_ns_id = get_ns_id(pid, &ipc_ns_desc, NULL); if (!ids->ipc_ns_id) { pr_err("Can't make ipcns id\n"); return -1; } ids->has_uts_ns_id = true; ids->uts_ns_id = get_ns_id(pid, &uts_ns_desc, NULL); if (!ids->uts_ns_id) { pr_err("Can't make utsns id\n"); return -1; } ids->has_mnt_ns_id = true; ids->mnt_ns_id = get_ns_id(pid, &mnt_ns_desc, NULL); if (!ids->mnt_ns_id) { pr_err("Can't make mntns id\n"); return -1; } ids->has_user_ns_id = true; ids->user_ns_id = get_ns_id(pid, &user_ns_desc, NULL); if (!ids->user_ns_id) { pr_err("Can't make userns id\n"); return -1; } ids->cgroup_ns_id = get_ns_id(pid, &cgroup_ns_desc, &ids->has_cgroup_ns_id); if (!ids->cgroup_ns_id) { pr_err("Can't make cgroup id\n"); return -1; } return 0; } static UsernsEntry userns_entry = USERNS_ENTRY__INIT; #define INVALID_ID (~0U) static unsigned int userns_id(unsigned int id, UidGidExtent **map, int n) { int i; if (!(root_ns_mask & CLONE_NEWUSER)) return id; for (i = 0; i < n; i++) { if (map[i]->lower_first <= id && map[i]->lower_first + map[i]->count > id) return map[i]->first + (id - map[i]->lower_first); } return INVALID_ID; } static unsigned int host_id(unsigned int id, UidGidExtent **map, int n) { int i; if (!(root_ns_mask & CLONE_NEWUSER)) return id; for (i = 0; i < n; i++) { if (map[i]->first <= id && map[i]->first + map[i]->count > id) return map[i]->lower_first + (id - map[i]->first); } return INVALID_ID; } static uid_t host_uid(uid_t uid) { UsernsEntry *e = &userns_entry; return host_id(uid, e->uid_map, 
e->n_uid_map); } static gid_t host_gid(gid_t gid) { UsernsEntry *e = &userns_entry; return host_id(gid, e->gid_map, e->n_gid_map); } uid_t userns_uid(uid_t uid) { UsernsEntry *e = &userns_entry; return userns_id(uid, e->uid_map, e->n_uid_map); } gid_t userns_gid(gid_t gid) { UsernsEntry *e = &userns_entry; return userns_id(gid, e->gid_map, e->n_gid_map); } static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts) { UidGidExtent *extents = NULL; int len = 0, size = 0, ret, i; FILE *f; f = fopen_proc(pid, "%s", name); if (f == NULL) return -1; ret = -1; while (1) { UidGidExtent *ext; if (len == size) { UidGidExtent *t; size = size * 2 + 1; t = xrealloc(extents, size * sizeof(UidGidExtent)); if (t == NULL) break; extents = t; } ext = &extents[len]; uid_gid_extent__init(ext); ret = fscanf(f, "%d %d %d", &ext->first, &ext->lower_first, &ext->count); if (ret != 3) { if (ferror(f)) { pr_perror("Unable to parse extents: %d", ret); ret = -1; } else ret = 0; break; } pr_info("id_map: %d %d %d\n", ext->first, ext->lower_first, ext->count); len++; } fclose(f); if (ret) goto err; if (len) { *pb_exts = xmalloc(sizeof(UidGidExtent *) * len); if (*pb_exts == NULL) goto err; for (i = 0; i < len; i++) (*pb_exts)[i] = &extents[i]; } else { xfree(extents); *pb_exts = NULL; } return len; err: xfree(extents); return -1; } int collect_user_ns(struct ns_id *ns, void *oarg) { /* * User namespace is dumped before files to get uid and gid * mappings, which are used for convirting local id-s to * userns id-s (userns_uid(), userns_gid()) */ if (dump_user_ns(root_item->pid->real, root_item->ids->user_ns_id)) return -1; return 0; } int collect_user_namespaces(bool for_dump) { if (!for_dump) return 0; if (!(root_ns_mask & CLONE_NEWUSER)) return 0; return walk_namespaces(&user_ns_desc, collect_user_ns, NULL); } static int check_user_ns(int pid) { int status; pid_t chld; chld = fork(); if (chld == -1) { pr_perror("Unable to fork a process"); return -1; } if (chld == 0) { struct 
__user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; struct __user_cap_header_struct hdr; uid_t uid; gid_t gid; int i; uid = host_uid(0); gid = host_gid(0); if (uid == INVALID_ID || gid == INVALID_ID) { pr_err("Unable to convert uid or gid\n"); return -1; } if (prctl(PR_SET_KEEPCAPS, 1)) { pr_perror("Unable to set PR_SET_KEEPCAPS"); return -1; } if (setresgid(gid, gid, gid)) { pr_perror("Unable to set group ID"); return -1; } if (setgroups(0, NULL) < 0) { pr_perror("Unable to drop supplementary groups"); return -1; } if (setresuid(uid, uid, uid)) { pr_perror("Unable to set user ID"); return -1; } hdr.version = _LINUX_CAPABILITY_VERSION_3; hdr.pid = 0; if (capget(&hdr, data) < 0) { pr_perror("capget"); return -1; } data[0].effective = data[0].permitted; data[1].effective = data[1].permitted; if (capset(&hdr, data) < 0) { pr_perror("capset"); return -1; } close_old_fds(); for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) close_service_fd(i); /* * Check that we are able to enter into other namespaces * from the target userns namespace. This signs that these * namespaces were created from the target userns. 
*/ if (switch_ns(pid, &user_ns_desc, NULL)) exit(-1); if ((root_ns_mask & CLONE_NEWNET) && switch_ns(pid, &net_ns_desc, NULL)) exit(-1); if ((root_ns_mask & CLONE_NEWUTS) && switch_ns(pid, &uts_ns_desc, NULL)) exit(-1); if ((root_ns_mask & CLONE_NEWIPC) && switch_ns(pid, &ipc_ns_desc, NULL)) exit(-1); if ((root_ns_mask & CLONE_NEWNS) && switch_ns(pid, &mnt_ns_desc, NULL)) exit(-1); exit(0); } if (waitpid(chld, &status, 0) != chld) { pr_perror("Unable to wait for PID %d", chld); return -1; } if (status) { pr_err("One or more namespaces doesn't belong to the target user namespace\n"); return -1; } return 0; } int dump_user_ns(pid_t pid, int ns_id) { int ret, exit_code = -1; UsernsEntry *e = &userns_entry; struct cr_img *img; ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) goto err; e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) goto err; e->n_gid_map = ret; if (check_user_ns(pid)) return -1; img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) goto err; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) goto err; return 0; err: if (e->uid_map) { xfree(e->uid_map[0]); xfree(e->uid_map); } if (e->gid_map) { xfree(e->gid_map[0]); xfree(e->gid_map); } return exit_code; } void free_userns_maps() { if (userns_entry.n_uid_map > 0) { xfree(userns_entry.uid_map[0]); xfree(userns_entry.uid_map); } if (userns_entry.n_gid_map > 0) { xfree(userns_entry.gid_map[0]); xfree(userns_entry.gid_map); } } static int do_dump_namespaces(struct ns_id *ns) { int ret; ret = switch_ns(ns->ns_pid, ns->nd, NULL); if (ret) return ret; switch (ns->nd->cflag) { case CLONE_NEWUTS: pr_info("Dump UTS namespace %d via %d\n", ns->id, ns->ns_pid); ret = dump_uts_ns(ns->id); break; case CLONE_NEWIPC: pr_info("Dump IPC namespace %d via %d\n", ns->id, ns->ns_pid); ret = dump_ipc_ns(ns->id); break; case CLONE_NEWNET: pr_info("Dump NET namespace info %d via %d\n", ns->id, ns->ns_pid); ret = dump_net_ns(ns->id); break; default: 
pr_err("Unknown namespace flag %x\n", ns->nd->cflag); break; } return ret; } int dump_namespaces(struct pstree_item *item, unsigned int ns_flags) { struct pid *ns_pid = item->pid; struct ns_id *ns; int pid, nr = 0; int ret = 0; /* * The setns syscall is cool, we can switch to the other * namespace and then return back to our initial one, but * for me it's much easier just to fork another task and * let it do the job, all the more so it can be done in * parallel with task dumping routine. * * However, the question how to dump sockets from the target * net namespace with this is still open */ pr_info("Dumping %d(%d)'s namespaces\n", ns_pid->ns[0].virt, ns_pid->real); if ((ns_flags & CLONE_NEWPID) && ns_pid->ns[0].virt != 1) { pr_err("Can't dump a pid namespace without the process init\n"); return -1; } for (ns = ns_ids; ns; ns = ns->next) { /* Skip current namespaces, which are in the list too */ if (ns->type == NS_CRIU) continue; switch (ns->nd->cflag) { /* No data for pid namespaces to dump */ case CLONE_NEWPID: /* Dumped explicitly with dump_mnt_namespaces() */ case CLONE_NEWNS: /* Userns is dumped before dumping tasks */ case CLONE_NEWUSER: /* handled separately in cgroup dumping code */ case CLONE_NEWCGROUP: continue; } pid = fork(); if (pid < 0) { pr_perror("Can't fork ns dumper"); return -1; } if (pid == 0) { ret = do_dump_namespaces(ns); exit(ret); } nr++; } while (nr > 0) { int status; ret = waitpid(-1, &status, 0); if (ret < 0) { pr_perror("Can't wait ns dumper"); return -1; } if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { pr_err("Namespaces dumping finished with error %d\n", status); return -1; } nr--; } pr_info("Namespaces dump complete\n"); return 0; } static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) { char buf[PAGE_SIZE]; int off = 0, i; int fd; /* * We can perform only a single write (that may contain multiple * newline-delimited records) to a uid_map and a gid_map files. 
*/ for (i = 0; i < n; i++) off += snprintf(buf + off, sizeof(buf) - off, "%u %u %u\n", extents[i]->first, extents[i]->lower_first, extents[i]->count); fd = open_proc_rw(pid, "%s", id_map); if (fd < 0) return -1; if (write(fd, buf, off) != off) { pr_perror("Unable to write into %s", id_map); close(fd); return -1; } close(fd); return 0; } struct unsc_msg { struct msghdr h; /* * 0th is the call address * 1st is the flags * 2nd is the optional (NULL in response) arguments */ struct iovec iov[3]; char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; }; static int usernsd_pid; static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd) { struct cmsghdr *ch; struct ucred *ucred; m->h.msg_iov = m->iov; m->h.msg_iovlen = 2; m->iov[0].iov_base = c; m->iov[0].iov_len = sizeof(*c); m->iov[1].iov_base = x; m->iov[1].iov_len = sizeof(*x); if (arg) { m->iov[2].iov_base = arg; m->iov[2].iov_len = asize; m->h.msg_iovlen++; } m->h.msg_name = NULL; m->h.msg_namelen = 0; m->h.msg_flags = 0; m->h.msg_control = &m->c; /* Need to memzero because of: * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514917 */ memzero(&m->c, sizeof(m->c)); m->h.msg_controllen = CMSG_SPACE(sizeof(struct ucred)); ch = CMSG_FIRSTHDR(&m->h); ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); ch->cmsg_level = SOL_SOCKET; ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *) CMSG_DATA(ch); ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); if (fd >= 0) { m->h.msg_controllen += CMSG_SPACE(sizeof(int)); ch = CMSG_NXTHDR(&m->h, ch); BUG_ON(!ch); ch->cmsg_len = CMSG_LEN(sizeof(int)); ch->cmsg_level = SOL_SOCKET; ch->cmsg_type = SCM_RIGHTS; *((int *)CMSG_DATA(ch)) = fd; } } static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; ch = CMSG_FIRSTHDR(&um->h); BUG_ON(!ch); BUG_ON(ch->cmsg_len != CMSG_LEN(sizeof(struct ucred))); BUG_ON(ch->cmsg_level != SOL_SOCKET); 
/*
 * Request loop of the user-namespace helper daemon.
 *
 * @sk: socket on which requests arrive and responses are sent
 *
 * Each iteration receives one request (routine address, flags, argument
 * blob, plus the sender's credentials and optionally a descriptor as
 * ancillary data) and invokes the routine. Unless the request was
 * UNS_ASYNC, it then sends the routine's return value back. With
 * UNS_FDOUT the return value is treated as a descriptor and is also
 * passed back as ancillary data.
 *
 * The loop has no normal exit. It returns -1 on a receive/send failure
 * or when an asynchronous call fails.
 */
static int usernsd(int sk)
{
	pr_info("uns: Daemon started\n");

	while (1) {
		struct unsc_msg um;
		static char msg[MAX_UNSFD_MSG_SIZE];
		uns_call_t call;
		int flags, fd, ret;
		pid_t pid;

		/*
		 * fd == 0 is not a descriptor to send here: any fd >= 0
		 * makes unsc_msg_init() size the control buffer with room
		 * for an SCM_RIGHTS record, which the request may carry.
		 */
		unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);
		if (recvmsg(sk, &um.h, 0) <= 0) {
			pr_perror("uns: recv req error");
			return -1;
		}

		/* fd becomes -1 when the request carried no descriptor. */
		unsc_msg_pid_fd(&um, &pid, &fd);
		pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags);

		BUG_ON(fd < 0 && flags & UNS_FDOUT);

		/*
		 * Caller has sent us bare address of the routine it
		 * wants to call. Since the caller is fork()-ed from the
		 * same process as the daemon is, the latter has exactly
		 * the same code at exactly the same address as the
		 * former guy has. So go ahead and just call one!
		 */

		ret = call(msg, fd, pid);

		/* The received descriptor is only needed for the call itself. */
		if (fd >= 0)
			close(fd);

		if (flags & UNS_ASYNC) {
			/*
			 * Async call failed and the caller doesn't know
			 * about it. Exit now and let the stop_usernsd()
			 * check the exit code and abort the restoration.
			 *
			 * We'd get there either by the end of restore or
			 * from the next userns_call() due to failed
			 * sendmsg() in there.
			 */
			if (ret < 0) {
				pr_err("uns: Async call failed. Exiting\n");
				return -1;
			}

			/* Nobody waits for a response to an async request. */
			continue;
		}

		/* With UNS_FDOUT the routine's return value is the descriptor to pass back. */
		if (flags & UNS_FDOUT)
			fd = ret;
		else
			fd = -1;

		unsc_msg_init(&um, &call, &ret, NULL, 0, fd);
		if (sendmsg(sk, &um.h, 0) <= 0) {
			pr_perror("uns: send resp error");
			return -1;
		}

		/* The daemon's copy of the returned descriptor is closed once sent. */
		if (fd >= 0)
			close(fd);
	}
}
/*
 * Ask the user-namespace helper daemon to exit and reap it.
 *
 * Does nothing and returns 0 when no daemon is recorded in usernsd_pid.
 * Otherwise returns the daemon's exit status when it exited normally
 * (0 meaning a clean stop) and -1 when it did not. usernsd_pid is reset
 * to 0 in either case.
 */
int stop_usernsd(void)
{
	int ret = 0;

	if (usernsd_pid) {
		/*
		 * NOTE(review): the waitpid() result below is not checked;
		 * status starts as -1, presumably so that a failed wait
		 * does not look like a normal exit -- confirm.
		 */
		int status = -1;
		sigset_t blockmask, oldmask;

		/*
		 * Don't let the sigchld_handler() mess with us
		 * calling waitpid() on the exited daemon. The
		 * same is done in cr_system().
		 */
		sigemptyset(&blockmask);
		sigaddset(&blockmask, SIGCHLD);
		sigprocmask(SIG_BLOCK, &blockmask, &oldmask);

		/*
		 * Send a message to make sure the daemon _has_
		 * processed its whole queue of asynchronous requests.
		 *
		 * All the restoring processes might have already
		 * closed their USERNSD_SK descriptors, but daemon
		 * still has its in connected state -- this is us
		 * who hold the last reference on the peer.
		 *
		 * If daemon has exited "in advance" due to async
		 * call or socket error, the userns_call() and the
		 * waitpid() below would both fail and we'll see
		 * bad exit status.
		 */

		/* ret is 0 here, so the daemon is asked to exit with code 0. */
		userns_call(exit_usernsd, UNS_ASYNC, &ret, sizeof(ret), -1);
		waitpid(usernsd_pid, &status, 0);

		if (WIFEXITED(status))
			ret = WEXITSTATUS(status);
		else
			ret = -1;

		usernsd_pid = 0;
		/* Restore the signal mask saved before SIGCHLD was blocked. */
		sigprocmask(SIG_SETMASK, &oldmask, NULL);

		if (ret != 0)
			pr_err("uns: daemon exited abnormally\n");
		else
			pr_info("uns: daemon stopped\n");
	}

	return ret;
}
*/ if (prctl(PR_SET_DUMPABLE, 1, 0)) { pr_perror("Unable to set PR_SET_DUMPABLE"); return -1; } return 0; } static int get_join_ns_fd(struct join_ns *jn) { int pid, fd; char nsf[32]; char *pnsf; pid = atoi(jn->ns_file); if (pid > 0) { snprintf(nsf, sizeof(nsf), "/proc/%d/ns/%s", pid, jn->nd->str); pnsf = nsf; } else { pnsf = jn->ns_file; } fd = open(pnsf, O_RDONLY); if (fd < 0) { pr_perror("Can't open ns file %s", pnsf); return -1; } jn->ns_fd = fd; return 0; } static int switch_join_ns(struct join_ns *jn) { struct stat st, self_st; char buf[32]; if (jn->nd == &user_ns_desc) { /* It is not permitted to use setns() to reenter the caller's current * user namespace. This prevents a caller that has dropped capabilities * from regaining those capabilities via a call to setns() */ if (fstat(jn->ns_fd, &st) == -1) { pr_perror("Can't get ns file %s stat", jn->ns_file); return -1; } snprintf(buf, sizeof(buf), "/proc/self/ns/%s", jn->nd->str); if (stat(buf, &self_st) == -1) { pr_perror("Can't get ns file %s stat", buf); return -1; } if (st.st_ino == self_st.st_ino) return 0; } if (setns(jn->ns_fd, jn->nd->cflag)) { pr_perror("Failed to setns when join-ns %s:%s", jn->nd->str, jn->ns_file); return -1; } return 0; } static int switch_user_join_ns(struct join_ns *jn) { uid_t uid; gid_t gid; if (jn == NULL) return 0; if (switch_join_ns(jn)) return -1; if (jn->extra_opts.user_extra.uid == NULL) uid = getuid(); else uid = atoi(jn->extra_opts.user_extra.uid); if (jn->extra_opts.user_extra.gid == NULL) gid = getgid(); else gid = atoi(jn->extra_opts.user_extra.gid); /* FIXME: * if err occurs in setuid/setgid, should we just alert or * return an error */ if (setuid(uid)) { pr_perror("setuid failed while joining userns"); return -1; } if (setgid(gid)) { pr_perror("setgid failed while joining userns"); return -1; } return 0; } int join_namespaces(void) { struct join_ns *jn, *user_jn = NULL; int ret = -1; list_for_each_entry(jn, &opts.join_ns, list) if (get_join_ns_fd(jn)) goto err_out; 
list_for_each_entry(jn, &opts.join_ns, list) if (jn->nd == &user_ns_desc) { user_jn = jn; } else { if (switch_join_ns(jn)) goto err_out; } if (switch_user_join_ns(user_jn)) goto err_out; ret = 0; err_out: list_for_each_entry(jn, &opts.join_ns, list) close_safe(&jn->ns_fd); return ret; } int prepare_namespace(struct pstree_item *item, unsigned long clone_flags) { pid_t pid = vpid(item); int id; pr_info("Restoring namespaces %d flags 0x%lx\n", vpid(item), clone_flags); if ((clone_flags & CLONE_NEWUSER) && prepare_userns_creds()) return -1; /* * On netns restore we launch an IP tool, thus we * have to restore it _before_ altering the mount * tree (i.e. -- mnt_ns restoring) */ id = ns_per_id ? item->ids->net_ns_id : pid; if ((clone_flags & CLONE_NEWNET) && prepare_net_ns(id)) return -1; id = ns_per_id ? item->ids->uts_ns_id : pid; if ((clone_flags & CLONE_NEWUTS) && prepare_utsns(id)) return -1; id = ns_per_id ? item->ids->ipc_ns_id : pid; if ((clone_flags & CLONE_NEWIPC) && prepare_ipc_ns(id)) return -1; /* * This one is special -- there can be several mount * namespaces and prepare_mnt_ns handles them itself. 
*/ if (prepare_mnt_ns()) return -1; return 0; } int prepare_namespace_before_tasks(void) { if (start_usernsd()) goto err_unds; if (netns_keep_nsfd()) goto err_netns; if (mntns_maybe_create_roots()) goto err_mnt; if (read_mnt_ns_img()) goto err_img; return 0; err_img: cleanup_mnt_ns(); err_mnt: /* * Nothing, netns' descriptor will be closed * on criu exit */ err_netns: stop_usernsd(); err_unds: return -1; } struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid"); struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user"); criu-3.6/criu/net.c000066400000000000000000001512311317335042600142270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../soccr/soccr.h" #include "imgset.h" #include "namespaces.h" #include "net.h" #include "libnetlink.h" #include "cr_options.h" #include "sk-inet.h" #include "tun.h" #include "util-pie.h" #include "plugin.h" #include "action-scripts.h" #include "sockets.h" #include "pstree.h" #include "string.h" #include "sysctl.h" #include "kerndat.h" #include "util.h" #include "external.h" #include "protobuf.h" #include "images/netdev.pb-c.h" #ifndef IFLA_LINK_NETNSID #define IFLA_LINK_NETNSID 37 #endif #ifndef RTM_NEWNSID #define RTM_NEWNSID 88 #endif #ifndef IFLA_MACVLAN_FLAGS #define IFLA_MACVLAN_FLAGS 2 #endif enum { IFLA_IPTUN_UNSPEC, IFLA_IPTUN_LINK, IFLA_IPTUN_LOCAL, IFLA_IPTUN_REMOTE, IFLA_IPTUN_TTL, IFLA_IPTUN_TOS, IFLA_IPTUN_ENCAP_LIMIT, IFLA_IPTUN_FLOWINFO, IFLA_IPTUN_FLAGS, IFLA_IPTUN_PROTO, IFLA_IPTUN_PMTUDISC, IFLA_IPTUN_6RD_PREFIX, IFLA_IPTUN_6RD_RELAY_PREFIX, IFLA_IPTUN_6RD_PREFIXLEN, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, IFLA_IPTUN_ENCAP_TYPE, IFLA_IPTUN_ENCAP_FLAGS, IFLA_IPTUN_ENCAP_SPORT, IFLA_IPTUN_ENCAP_DPORT, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) static int ns_sysfs_fd = -1; int read_ns_sys_file(char *path, char *buf, int len) { int fd, rlen; 
BUG_ON(ns_sysfs_fd == -1); fd = openat(ns_sysfs_fd, path, O_RDONLY, 0); if (fd < 0) { pr_perror("Can't open ns' %s", path); return -1; } rlen = read(fd, buf, len); close(fd); if (rlen == len) { pr_err("Too small buffer to read ns sys file %s\n", path); return -1; } if (rlen > 0) buf[rlen - 1] = '\0'; return rlen; } static bool sysctl_entries_equal(SysctlEntry *a, SysctlEntry *b) { if (a->type != b->type) return false; switch (a->type) { case SYSCTL_TYPE__CTL_32: return a->has_iarg && b->has_iarg && a->iarg == b->iarg; case SYSCTL_TYPE__CTL_STR: return a->sarg && b->sarg && !strcmp(a->sarg, b->sarg); default:; } return false; } static char *devconfs4[] = { "accept_local", "accept_redirects", "accept_source_route", "arp_accept", "arp_announce", "arp_filter", "arp_ignore", "arp_notify", "bootp_relay", "disable_policy", "disable_xfrm", "force_igmp_version", "forwarding", "igmpv2_unsolicited_report_interval", "igmpv3_unsolicited_report_interval", "log_martians", "medium_id", "promote_secondaries", "proxy_arp", "proxy_arp_pvlan", "route_localnet", "rp_filter", "secure_redirects", "send_redirects", "shared_media", "src_valid_mark", "tag", "ignore_routes_with_linkdown", "drop_gratuitous_arp", "drop_unicast_in_l2_multicast", }; char *devconfs6[] = { "accept_dad", "accept_ra", "accept_ra_defrtr", "accept_ra_from_local", "accept_ra_min_hop_limit", "accept_ra_mtu", "accept_ra_pinfo", "accept_ra_rt_info_max_plen", "accept_ra_rtr_pref", "accept_redirects", "accept_source_route", "autoconf", "dad_transmits", "disable_ipv6", "drop_unicast_in_l2_multicast", "drop_unsolicited_na", "force_mld_version", "force_tllao", "forwarding", "hop_limit", "ignore_routes_with_linkdown", "keep_addr_on_down", "max_addresses", "max_desync_factor", "mldv1_unsolicited_report_interval", "mldv2_unsolicited_report_interval", "mtu", "ndisc_notify", "optimistic_dad", "proxy_ndp", "regen_max_retry", "router_probe_interval", "router_solicitation_delay", "router_solicitation_interval", "router_solicitations", 
"stable_secret", "suppress_frag_ndisc", "temp_prefered_lft", "temp_valid_lft", "use_oif_addrs_only", "use_optimistic", "use_tempaddr", }; #define CONF_OPT_PATH "net/%s/conf/%s/%s" #define MAX_CONF_OPT_PATH IFNAMSIZ+60 #define MAX_STR_CONF_LEN 200 static int net_conf_op(char *tgt, SysctlEntry **conf, int n, int op, char *proto, struct sysctl_req *req, char (*path)[MAX_CONF_OPT_PATH], int size, char **devconfs, SysctlEntry **def_conf) { int i, ri, ar = -1; int ret, flags = op == CTL_READ ? CTL_FLAGS_OPTIONAL : 0; SysctlEntry **rconf; if (n > size) pr_warn("The image contains unknown sysctl-s\n"); if (opts.weak_sysctls) flags = CTL_FLAGS_OPTIONAL; rconf = xmalloc(sizeof(SysctlEntry *) * size); if (!rconf) return -1; for (i = 0, ri = 0; i < size; i++) { if (i >= n) { pr_warn("Skip %s/%s\n", tgt, devconfs[i]); continue; } /* * If dev conf value is the same as default skip restoring it, * mtu may be changed by disable_ipv6 so we can not skip * it's restore */ if (def_conf && sysctl_entries_equal(conf[i], def_conf[i]) && strcmp(devconfs[i], "mtu")) { pr_debug("Skip %s/%s, coincides with default\n", tgt, devconfs[i]); continue; } /* * Make "accept_redirects" go last on write(it should * restore after forwarding to be correct) */ if (op == CTL_WRITE && !strcmp(devconfs[i], "accept_redirects")) { ar = i; continue; } snprintf(path[i], MAX_CONF_OPT_PATH, CONF_OPT_PATH, proto, tgt, devconfs[i]); req[ri].name = path[i]; req[ri].flags = flags; switch (conf[i]->type) { case SYSCTL_TYPE__CTL_32: req[ri].type = CTL_32; /* skip non-existing sysctl */ if (op == CTL_WRITE && !conf[i]->has_iarg) continue; req[ri].arg = &conf[i]->iarg; break; case SYSCTL_TYPE__CTL_STR: req[ri].type = CTL_STR(MAX_STR_CONF_LEN); req[ri].flags |= op == CTL_READ && !strcmp(devconfs[i], "stable_secret") ? 
CTL_FLAGS_READ_EIO_SKIP : 0; /* skip non-existing sysctl */ if (op == CTL_WRITE && !conf[i]->sarg) continue; req[ri].arg = conf[i]->sarg; break; default: continue; } rconf[ri] = conf[i]; ri++; } if (ar != -1 && conf[ar]->type == SYSCTL_TYPE__CTL_32 && conf[ar]->has_iarg) { snprintf(path[ar], MAX_CONF_OPT_PATH, CONF_OPT_PATH, proto, tgt, devconfs[ar]); req[ri].name = path[ar]; req[ri].type = CTL_32; req[ri].arg = &conf[ar]->iarg; req[ri].flags = flags; rconf[ri] = conf[ar]; ri++; } ret = sysctl_op(req, ri, op, CLONE_NEWNET); if (ret < 0) { pr_err("Failed to %s %s/\n", (op == CTL_READ)?"read":"write", tgt); goto err_free; } if (op == CTL_READ) { /* (un)mark (non-)existing sysctls in image */ for (i = 0; i < ri; i++) if (req[i].flags & CTL_FLAGS_HAS) { if (rconf[i]->type == SYSCTL_TYPE__CTL_32) rconf[i]->has_iarg = true; } else { if (rconf[i]->type == SYSCTL_TYPE__CTL_STR) rconf[i]->sarg = NULL; } } err_free: xfree(rconf); return ret; } static int ipv4_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntry **def_conf) { struct sysctl_req req[ARRAY_SIZE(devconfs4)]; char path[ARRAY_SIZE(devconfs4)][MAX_CONF_OPT_PATH]; return net_conf_op(tgt, conf, n, op, "ipv4", req, path, ARRAY_SIZE(devconfs4), devconfs4, def_conf); } static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntry **def_conf) { struct sysctl_req req[ARRAY_SIZE(devconfs6)]; char path[ARRAY_SIZE(devconfs6)][MAX_CONF_OPT_PATH]; return net_conf_op(tgt, conf, n, op, "ipv6", req, path, ARRAY_SIZE(devconfs6), devconfs6, def_conf); } /* * I case if some entry is missing in * the kernel, simply write DEVCONFS_UNUSED * into the image so we would skip it. */ #define DEVCONFS_UNUSED (-1u) static int ipv4_conf_op_old(char *tgt, int *conf, int n, int op, int *def_conf) { int i, ri; int ret, flags = op == CTL_READ ? 
CTL_FLAGS_OPTIONAL : 0; struct sysctl_req req[ARRAY_SIZE(devconfs4)]; char path[ARRAY_SIZE(devconfs4)][MAX_CONF_OPT_PATH]; if (n > ARRAY_SIZE(devconfs4)) pr_warn("The image contains unknown sysctl-s\n"); for (i = 0, ri = 0; i < ARRAY_SIZE(devconfs4); i++) { if (i >= n) { pr_warn("Skip %s/%s\n", tgt, devconfs4[i]); continue; } /* * If dev conf value is the same as default skip restoring it */ if (def_conf && conf[i] == def_conf[i]) { pr_debug("DEBUG Skip %s/%s, val =%d\n", tgt, devconfs4[i], conf[i]); continue; } if (op == CTL_WRITE && conf[i] == DEVCONFS_UNUSED) continue; else if (op == CTL_READ) conf[i] = DEVCONFS_UNUSED; snprintf(path[i], MAX_CONF_OPT_PATH, CONF_OPT_PATH, "ipv4", tgt, devconfs4[i]); req[ri].name = path[i]; req[ri].arg = &conf[i]; req[ri].type = CTL_32; req[ri].flags = flags; ri++; } ret = sysctl_op(req, ri, op, CLONE_NEWNET); if (ret < 0) { pr_err("Failed to %s %s/\n", (op == CTL_READ)?"read":"write", tgt); return -1; } return 0; } int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info) { return pb_write_one(img_from_set(fds, CR_FD_NETDEV), nde, PB_NETDEV); } static int dump_one_netdev(int type, struct ifinfomsg *ifi, struct nlattr **tb, struct cr_imgset *fds, int (*dump)(NetDeviceEntry *, struct cr_imgset *, struct nlattr **info)) { int ret = -1; int i; NetDeviceEntry netdev = NET_DEVICE_ENTRY__INIT; SysctlEntry *confs4 = NULL; int size4 = ARRAY_SIZE(devconfs4); SysctlEntry *confs6 = NULL; int size6 = ARRAY_SIZE(devconfs6); char stable_secret[MAX_STR_CONF_LEN + 1] = {}; struct nlattr *info[IFLA_INFO_MAX + 1], **arg = NULL; if (!tb[IFLA_IFNAME]) { pr_err("No name for link %d\n", ifi->ifi_index); return -1; } netdev.type = type; netdev.ifindex = ifi->ifi_index; netdev.mtu = *(int *)RTA_DATA(tb[IFLA_MTU]); netdev.flags = ifi->ifi_flags; netdev.name = RTA_DATA(tb[IFLA_IFNAME]); if (tb[IFLA_ADDRESS] && (type != ND_TYPE__LOOPBACK)) { netdev.has_address = true; netdev.address.data = nla_data(tb[IFLA_ADDRESS]); 
netdev.address.len = nla_len(tb[IFLA_ADDRESS]); pr_info("Found ll addr (%02x:../%d) for %s\n", (int)netdev.address.data[0], (int)netdev.address.len, netdev.name); } netdev.n_conf4 = size4; netdev.conf4 = xmalloc(sizeof(SysctlEntry *) * size4); if (!netdev.conf4) goto err_free; confs4 = xmalloc(sizeof(SysctlEntry) * size4); if (!confs4) goto err_free; for (i = 0; i < size4; i++) { sysctl_entry__init(&confs4[i]); netdev.conf4[i] = &confs4[i]; netdev.conf4[i]->type = CTL_32; } netdev.n_conf6 = size6; netdev.conf6 = xmalloc(sizeof(SysctlEntry *) * size6); if (!netdev.conf6) goto err_free; confs6 = xmalloc(sizeof(SysctlEntry) * size6); if (!confs6) goto err_free; for (i = 0; i < size6; i++) { sysctl_entry__init(&confs6[i]); netdev.conf6[i] = &confs6[i]; if (strcmp(devconfs6[i], "stable_secret")) { netdev.conf6[i]->type = SYSCTL_TYPE__CTL_32; } else { netdev.conf6[i]->type = SYSCTL_TYPE__CTL_STR; netdev.conf6[i]->sarg = stable_secret; } } ret = ipv4_conf_op(netdev.name, netdev.conf4, size4, CTL_READ, NULL); if (ret < 0) goto err_free; ret = ipv6_conf_op(netdev.name, netdev.conf6, size6, CTL_READ, NULL); if (ret < 0) goto err_free; if (!dump) dump = write_netdev_img; if (tb[IFLA_LINKINFO]) { ret = nla_parse_nested(info, IFLA_INFO_MAX, tb[IFLA_LINKINFO], NULL); if (ret < 0) { pr_err("failed to parse nested linkinfo\n"); return -1; } arg = info; } ret = dump(&netdev, fds, arg); err_free: xfree(netdev.conf4); xfree(confs4); xfree(netdev.conf6); xfree(confs6); return ret; } static char *link_kind(struct ifinfomsg *ifi, struct nlattr **tb) { struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; if (!tb[IFLA_LINKINFO]) { pr_err("No linkinfo for eth link %d\n", ifi->ifi_index); return NULL; } nla_parse_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], NULL); if (!linkinfo[IFLA_INFO_KIND]) { pr_err("No kind for eth link %d\n", ifi->ifi_index); return NULL; } return nla_data(linkinfo[IFLA_INFO_KIND]); } static int dump_unknown_device(struct ifinfomsg *ifi, char *kind, struct nlattr **tb, 
struct cr_imgset *fds) { int ret; ret = run_plugins(DUMP_EXT_LINK, ifi->ifi_index, ifi->ifi_type, kind); if (ret == 0) return dump_one_netdev(ND_TYPE__EXTLINK, ifi, tb, fds, NULL); if (ret == -ENOTSUP) pr_err("Unsupported link %d (type %d kind %s)\n", ifi->ifi_index, ifi->ifi_type, kind); return -1; } static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info) { char spath[IFNAMSIZ + 16]; /* len("class/net//brif") + 1 for null */ int ret, fd; ret = snprintf(spath, sizeof(spath), "class/net/%s/brif", nde->name); if (ret < 0 || ret >= sizeof(spath)) return -1; /* Let's only allow dumping empty bridges for now. To do a full bridge * restore, we need to make sure the bridge and slaves are restored in * the right order and attached correctly. It looks like the veth code * supports this, but we need some way to do ordering. */ fd = openat(ns_sysfs_fd, spath, O_DIRECTORY, 0); if (fd < 0) { pr_perror("opening %s failed", spath); return -1; } ret = is_empty_dir(fd); close(fd); if (ret < 0) { pr_perror("problem testing %s for emptiness", spath); return -1; } if (!ret) { pr_err("dumping bridges with attached slaves not supported currently\n"); return -1; } return write_netdev_img(nde, imgset, info); } static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info) { MacvlanLinkEntry macvlan = MACVLAN_LINK_ENTRY__INIT; int ret; struct nlattr *data[IFLA_MACVLAN_FLAGS+1]; if (!info || !info[IFLA_INFO_DATA]) { pr_err("no data for macvlan\n"); return -1; } ret = nla_parse_nested(data, IFLA_MACVLAN_FLAGS, info[IFLA_INFO_DATA], NULL); if (ret < 0) { pr_err("failed ot parse macvlan data\n"); return -1; } if (!data[IFLA_MACVLAN_MODE]) { pr_err("macvlan mode required for %s\n", nde->name); return -1; } macvlan.mode = *((u32 *)RTA_DATA(data[IFLA_MACVLAN_MODE])); if (data[IFLA_MACVLAN_FLAGS]) macvlan.flags = *((u16 *) RTA_DATA(data[IFLA_MACVLAN_FLAGS])); nde->macvlan = &macvlan; return write_netdev_img(nde, imgset, info); } 
static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind, struct nlattr **tb, struct cr_imgset *fds) { if (!strcmp(kind, "veth")) /* * This is not correct. The peer of the veth device may * be either outside or inside the netns we're working * on, but there's currently no way of finding this out. * * Sigh... we have to assume, that the veth device is a * connection to the outer world and just dump this end :( */ return dump_one_netdev(ND_TYPE__VETH, ifi, tb, fds, NULL); if (!strcmp(kind, "tun")) return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link); if (!strcmp(kind, "bridge")) return dump_one_netdev(ND_TYPE__BRIDGE, ifi, tb, fds, dump_bridge); if (!strcmp(kind, "gretap")) { char *name = (char *)RTA_DATA(tb[IFLA_IFNAME]); if (!name) { pr_err("gretap %d has no name\n", ifi->ifi_index); return -1; } if (!strcmp(name, "gretap0")) { pr_info("found %s, ignoring\n", name); return 0; } pr_warn("GRE tap device %s not supported natively\n", name); } if (!strcmp(kind, "macvlan")) return dump_one_netdev(ND_TYPE__MACVLAN, ifi, tb, fds, dump_macvlan); return dump_unknown_device(ifi, kind, tb, fds); } static int dump_one_gendev(struct ifinfomsg *ifi, char *kind, struct nlattr **tb, struct cr_imgset *fds) { if (!strcmp(kind, "tun")) return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link); return dump_unknown_device(ifi, kind, tb, fds); } static int dump_one_voiddev(struct ifinfomsg *ifi, char *kind, struct nlattr **tb, struct cr_imgset *fds) { if (!strcmp(kind, "venet")) return dump_one_netdev(ND_TYPE__VENET, ifi, tb, fds, NULL); return dump_unknown_device(ifi, kind, tb, fds); } static int dump_one_gre(struct ifinfomsg *ifi, char *kind, struct nlattr **tb, struct cr_imgset *fds) { if (!strcmp(kind, "gre")) { char *name = (char *)RTA_DATA(tb[IFLA_IFNAME]); if (!name) { pr_err("gre device %d has no name\n", ifi->ifi_index); return -1; } if (!strcmp(name, "gre0")) { pr_info("found %s, ignoring\n", name); return 0; } pr_warn("GRE tunnel device %s not 
supported natively\n", name); } return dump_unknown_device(ifi, kind, tb, fds); } static int dump_sit(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info) { int ret; struct nlattr *data[__IFLA_IPTUN_MAX]; SitEntry se = SIT_ENTRY__INIT; /* There are for IP(v6) addresses kernel feeds to us */ uint32_t a_local, a_remote, rd_prefix[4], rl_prefix; if (!info || !info[IFLA_INFO_DATA]) { pr_err("no data for sit\n"); return -1; } pr_info("Some data for SIT provided\n"); ret = nla_parse_nested(data, IFLA_IPTUN_MAX, info[IFLA_INFO_DATA], NULL); if (ret < 0) { pr_err("failed ot parse sit data\n"); return -1; } #define ENCODE_ENTRY(__type, __ifla, __proto) do { \ if (data[__ifla]) { \ se.__proto = *(__type *)nla_data(data[__ifla]); \ if (se.__proto) \ se.has_##__proto = true; \ } \ } while (0) if (data[IFLA_IPTUN_LOCAL]) { a_local = *(u32 *)nla_data(data[IFLA_IPTUN_LOCAL]); if (a_local != 0) { se.n_local = 1; se.local = &a_local; } } if (data[IFLA_IPTUN_REMOTE]) { a_remote = *(u32 *)nla_data(data[IFLA_IPTUN_REMOTE]); if (a_remote != 0) { se.n_remote = 1; se.remote = &a_remote; } } ENCODE_ENTRY(u32, IFLA_IPTUN_LINK, link); ENCODE_ENTRY(u8, IFLA_IPTUN_TTL, ttl); ENCODE_ENTRY(u8, IFLA_IPTUN_TOS, tos); ENCODE_ENTRY(u16, IFLA_IPTUN_FLAGS, flags); ENCODE_ENTRY(u8, IFLA_IPTUN_PROTO, proto); if (data[IFLA_IPTUN_PMTUDISC]) { u8 v; v = *(u8 *)nla_data(data[IFLA_IPTUN_PMTUDISC]); if (v) se.pmtudisc = se.has_pmtudisc = true; } ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_TYPE, encap_type); ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_FLAGS, encap_flags); ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_SPORT, encap_sport); ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_DPORT, encap_dport); if (data[IFLA_IPTUN_6RD_PREFIXLEN]) { se.rd_prefixlen = *(u16 *)nla_data(data[IFLA_IPTUN_6RD_PREFIXLEN]); if (!se.rd_prefixlen) goto skip; if (!data[IFLA_IPTUN_6RD_PREFIX]) { pr_err("No 6rd prefix for sit device\n"); return -1; } se.has_rd_prefixlen = true; memcpy(&rd_prefix, nla_data(data[IFLA_IPTUN_6RD_PREFIX]), sizeof(rd_prefix)); 
se.n_rd_prefix = 4; se.rd_prefix = rd_prefix; se.relay_prefixlen = *(u16 *)nla_data(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]); if (!se.relay_prefixlen) goto skip; if (!data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { pr_err("No 6rd relay prefix for sit device\n"); return -1; } se.has_relay_prefixlen = true; memcpy(&rl_prefix, nla_data(data[IFLA_IPTUN_6RD_RELAY_PREFIX]), sizeof(rl_prefix)); se.n_relay_prefix = 1; se.relay_prefix = &rl_prefix; skip:; } #undef ENCODE_ENTRY nde->sit = &se; return write_netdev_img(nde, imgset, info); } static int dump_one_sit(struct ifinfomsg *ifi, char *kind, struct nlattr **tb, struct cr_imgset *fds) { char *name; if (strcmp(kind, "sit")) { pr_err("SIT device with %s kind\n", kind); return -1; } name = (char *)RTA_DATA(tb[IFLA_IFNAME]); if (!name) { pr_err("sit device %d has no name\n", ifi->ifi_index); return -1; } if (!strcmp(name, "sit0")) { pr_info("found %s, ignoring\n", name); return 0; } return dump_one_netdev(ND_TYPE__SIT, ifi, tb, fds, dump_sit); } static int dump_one_link(struct nlmsghdr *hdr, void *arg) { struct cr_imgset *fds = arg; struct ifinfomsg *ifi; int ret = 0, len = hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); struct nlattr *tb[IFLA_MAX + 1]; char *kind; ifi = NLMSG_DATA(hdr); if (len < 0) { pr_err("No iflas for link %d\n", ifi->ifi_index); return -1; } nlmsg_parse(hdr, sizeof(struct ifinfomsg), tb, IFLA_MAX, NULL); pr_info("\tLD: Got link %d, type %d\n", ifi->ifi_index, ifi->ifi_type); if (ifi->ifi_type == ARPHRD_LOOPBACK) return dump_one_netdev(ND_TYPE__LOOPBACK, ifi, tb, fds, NULL); kind = link_kind(ifi, tb); if (!kind) goto unk; switch (ifi->ifi_type) { case ARPHRD_ETHER: ret = dump_one_ethernet(ifi, kind, tb, fds); break; case ARPHRD_NONE: ret = dump_one_gendev(ifi, kind, tb, fds); break; case ARPHRD_VOID: ret = dump_one_voiddev(ifi, kind, tb, fds); break; case ARPHRD_IPGRE: ret = dump_one_gre(ifi, kind, tb, fds); break; case ARPHRD_SIT: ret = dump_one_sit(ifi, kind, tb, fds); break; default: unk: ret = 
dump_unknown_device(ifi, kind, tb, fds); break; } return ret; } static int dump_one_nf(struct nlmsghdr *hdr, void *arg) { struct cr_img *img = arg; if (lazy_image(img) && open_image_lazy(img)) return -1; if (write_img_buf(img, hdr, hdr->nlmsg_len)) return -1; return 0; } static int ct_restore_callback(struct nlmsghdr *nlh) { struct nfgenmsg *msg; struct nlattr *tb[CTA_MAX+1], *tbp[CTA_PROTOINFO_MAX + 1], *tb_tcp[CTA_PROTOINFO_TCP_MAX+1]; int err; msg = NLMSG_DATA(nlh); if (msg->nfgen_family != AF_INET && msg->nfgen_family != AF_INET6) return 0; err = nlmsg_parse(nlh, sizeof(struct nfgenmsg), tb, CTA_MAX, NULL); if (err < 0) return -1; if (!tb[CTA_PROTOINFO]) return 0; err = nla_parse_nested(tbp, CTA_PROTOINFO_MAX, tb[CTA_PROTOINFO], NULL); if (err < 0) return -1; if (!tbp[CTA_PROTOINFO_TCP]) return 0; err = nla_parse_nested(tb_tcp, CTA_PROTOINFO_TCP_MAX, tbp[CTA_PROTOINFO_TCP], NULL); if (err < 0) return -1; if (tb_tcp[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *flags; flags = nla_data(tb_tcp[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); flags->flags |= IP_CT_TCP_FLAG_BE_LIBERAL; flags->mask |= IP_CT_TCP_FLAG_BE_LIBERAL; } if (tb_tcp[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { struct nf_ct_tcp_flags *flags; flags = nla_data(tb_tcp[CTA_PROTOINFO_TCP_FLAGS_REPLY]); flags->flags |= IP_CT_TCP_FLAG_BE_LIBERAL; flags->mask |= IP_CT_TCP_FLAG_BE_LIBERAL; } return 0; } static int restore_nf_ct(int pid, int type) { struct nlmsghdr *nlh = NULL; int exit_code = -1, sk; struct cr_img *img; img = open_image(type, O_RSTR, pid); if (img == NULL) return -1; if (empty_image(img)) { close_image(img); return 0; } sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); if (sk < 0) { pr_perror("Can't open rtnl sock for net dump"); goto out_img; } nlh = xmalloc(sizeof(struct nlmsghdr)); if (nlh == NULL) goto out; while (1) { struct nlmsghdr *p; int ret; ret = read_img_buf_eof(img, nlh, sizeof(struct nlmsghdr)); if (ret < 0) goto out; if (ret == 0) break; p = xrealloc(nlh, nlh->nlmsg_len); if 
(p == NULL) goto out; nlh = p; ret = read_img_buf_eof(img, nlh + 1, nlh->nlmsg_len - sizeof(struct nlmsghdr)); if (ret < 0) goto out; if (ret == 0) { pr_err("The image file was truncated\n"); goto out; } if (type == CR_FD_NETNF_CT) if (ct_restore_callback(nlh)) goto out; nlh->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE; ret = do_rtnl_req(sk, nlh, nlh->nlmsg_len, NULL, NULL, NULL); if (ret) goto out; } exit_code = 0; out: xfree(nlh); close(sk); out_img: close_image(img); return exit_code; } static int dump_nf_ct(struct cr_imgset *fds, int type) { struct cr_img *img; struct { struct nlmsghdr nlh; struct nfgenmsg g; } req; int sk, ret; pr_info("Dumping netns links\n"); ret = sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); if (sk < 0) { pr_perror("Can't open rtnl sock for net dump"); goto out; } memset(&req, 0, sizeof(req)); req.nlh.nlmsg_len = sizeof(req); req.nlh.nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8); if (type == CR_FD_NETNF_CT) req.nlh.nlmsg_type |= IPCTNL_MSG_CT_GET; else if (type == CR_FD_NETNF_EXP) req.nlh.nlmsg_type |= IPCTNL_MSG_EXP_GET; else BUG(); req.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; req.nlh.nlmsg_pid = 0; req.nlh.nlmsg_seq = CR_NLMSG_SEQ; req.g.nfgen_family = AF_UNSPEC; img = img_from_set(fds, type); ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_nf, NULL, img); close(sk); out: return ret; } static int dump_links(struct cr_imgset *fds) { int sk, ret; struct { struct nlmsghdr nlh; struct rtgenmsg g; } req; pr_info("Dumping netns links\n"); ret = sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { pr_perror("Can't open rtnl sock for net dump"); goto out; } memset(&req, 0, sizeof(req)); req.nlh.nlmsg_len = sizeof(req); req.nlh.nlmsg_type = RTM_GETLINK; req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; req.nlh.nlmsg_pid = 0; req.nlh.nlmsg_seq = CR_NLMSG_SEQ; req.g.rtgen_family = AF_PACKET; ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_link, NULL, fds); close(sk); out: return ret; } static int 
restore_link_cb(struct nlmsghdr *hdr, void *arg) { pr_info("Got response on SETLINK =)\n"); return 0; } struct newlink_req { struct nlmsghdr h; struct ifinfomsg i; char buf[1024]; }; /* Optional extra things to be provided at the top level of the NEWLINK * request. */ struct newlink_extras { int link; /* IFLA_LINK */ int target_netns; /* IFLA_NET_NS_FD */ }; static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDeviceEntry *nde, int (*link_info)(NetDeviceEntry *, struct newlink_req *), struct newlink_extras *extras) { memset(req, 0, sizeof(*req)); req->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); req->h.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE; req->h.nlmsg_type = msg_type; req->h.nlmsg_seq = CR_NLMSG_SEQ; req->i.ifi_family = AF_PACKET; /* * SETLINK is called for external devices which may * have ifindex changed. Thus configure them by their * name only. */ if (msg_type == RTM_NEWLINK) req->i.ifi_index = nde->ifindex; req->i.ifi_flags = nde->flags; if (extras) { if (extras->link >= 0) addattr_l(&req->h, sizeof(*req), IFLA_LINK, &extras->link, sizeof(extras->link)); if (extras->target_netns >= 0) addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &extras->target_netns, sizeof(extras->target_netns)); } addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, nde->name, strlen(nde->name)); addattr_l(&req->h, sizeof(*req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu)); if (nde->has_address) { pr_debug("Restore ll addr (%02x:../%d) for device\n", (int)nde->address.data[0], (int)nde->address.len); addattr_l(&req->h, sizeof(*req), IFLA_ADDRESS, nde->address.data, nde->address.len); } if (link_info) { struct rtattr *linkinfo; int ret; linkinfo = NLMSG_TAIL(&req->h); addattr_l(&req->h, sizeof(*req), IFLA_LINKINFO, NULL, 0); ret = link_info(nde, req); if (ret < 0) return ret; linkinfo->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)linkinfo; } return 0; } static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk, int 
(*link_info)(NetDeviceEntry *, struct newlink_req *), struct newlink_extras *extras) { struct newlink_req req; if (populate_newlink_req(&req, msg_type, nde, link_info, extras) < 0) return -1; return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL); } int restore_link_parms(NetDeviceEntry *nde, int nlsk) { return do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL, NULL); } static int restore_one_link(NetDeviceEntry *nde, int nlsk, int (*link_info)(NetDeviceEntry *, struct newlink_req *), struct newlink_extras *extras) { pr_info("Restoring netdev %s idx %d\n", nde->name, nde->ifindex); return do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info, extras); } #ifndef VETH_INFO_MAX enum { VETH_INFO_UNSPEC, VETH_INFO_PEER, __VETH_INFO_MAX #define VETH_INFO_MAX (__VETH_INFO_MAX - 1) }; #endif #if IFLA_MAX <= 28 #define IFLA_NET_NS_FD 28 #endif static void veth_peer_info(NetDeviceEntry *nde, struct newlink_req *req) { char key[100], *val; snprintf(key, sizeof(key), "veth[%s]", nde->name); val = external_lookup_by_key(key); if (!IS_ERR_OR_NULL(val)) { char *aux; aux = strchrnul(val, '@'); addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, val, aux - val); } } static int veth_link_info(NetDeviceEntry *nde, struct newlink_req *req) { int ns_fd = get_service_fd(NS_FD_OFF); struct rtattr *veth_data, *peer_data; struct ifinfomsg ifm; BUG_ON(ns_fd < 0); addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "veth", 4); veth_data = NLMSG_TAIL(&req->h); addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); peer_data = NLMSG_TAIL(&req->h); memset(&ifm, 0, sizeof(ifm)); addattr_l(&req->h, sizeof(*req), VETH_INFO_PEER, &ifm, sizeof(ifm)); veth_peer_info(nde, req); addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd)); peer_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)peer_data; veth_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)veth_data; return 0; } static int venet_link_info(NetDeviceEntry *nde, struct newlink_req *req) { int ns_fd = 
get_service_fd(NS_FD_OFF); struct rtattr *venet_data; BUG_ON(ns_fd < 0); venet_data = NLMSG_TAIL(&req->h); addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "venet", 5); addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd)); venet_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)venet_data; return 0; } static int bridge_link_info(NetDeviceEntry *nde, struct newlink_req *req) { struct rtattr *bridge_data; bridge_data = NLMSG_TAIL(&req->h); addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "bridge", sizeof("bridge")); bridge_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)bridge_data; return 0; } static int changeflags(int s, char *name, short flags) { struct ifreq ifr; strlcpy(ifr.ifr_name, name, IFNAMSIZ); ifr.ifr_flags = flags; if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { pr_perror("couldn't set flags on %s", name); return -1; } return 0; } static int macvlan_link_info(NetDeviceEntry *nde, struct newlink_req *req) { struct rtattr *macvlan_data; MacvlanLinkEntry *macvlan = nde->macvlan; if (!macvlan) { pr_err("Missing macvlan link entry %d\n", nde->ifindex); return -1; } addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "macvlan", 7); macvlan_data = NLMSG_TAIL(&req->h); addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_MODE, &macvlan->mode, sizeof(macvlan->mode)); if (macvlan->has_flags) addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_FLAGS, &macvlan->flags, sizeof(macvlan->flags)); macvlan_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)macvlan_data; return 0; } static int userns_restore_one_link(void *arg, int fd, pid_t pid) { int nlsk, ret; struct newlink_req *req = arg; int ns_fd = get_service_fd(NS_FD_OFF), rst = -1; if (!(root_ns_mask & CLONE_NEWUSER)) { if (switch_ns_by_fd(ns_fd, &net_ns_desc, &rst)) return -1; } nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (nlsk < 0) { pr_perror("Can't create nlk socket"); ret 
= -1; goto out; } addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &fd, sizeof(fd)); ret = do_rtnl_req(nlsk, req, req->h.nlmsg_len, restore_link_cb, NULL, NULL); close(nlsk); out: if (rst >= 0 && restore_ns(rst, &net_ns_desc) < 0) ret = -1; return ret; } static int restore_one_macvlan(NetDeviceEntry *nde, int nlsk, int criu_nlsk) { struct newlink_extras extras = { .link = -1, .target_netns = -1, }; char key[100], *val; int my_netns = -1, ret = -1; snprintf(key, sizeof(key), "macvlan[%s]", nde->name); val = external_lookup_data(key); if (IS_ERR_OR_NULL(val)) { pr_err("a macvlan parent for %s is required\n", nde->name); return -1; } /* link and netns_id are used to identify the master device to plug our * macvlan slave into. We identify the destination via setting * IFLA_NET_NS_FD to my_netns, but we have to do that in two different * ways: in the userns case, we send the fd across to usernsd and set * it there, whereas in the non-userns case we can just set it here, * since we can just use a socket from criu's net ns given to us by * restore_links(). We need to do this two different ways because * CAP_NET_ADMIN is required in both namespaces, which we don't have in * the userns case, and usernsd doesn't exist in the non-userns case. 
*/ extras.link = (int) (unsigned long) val; my_netns = open_proc(PROC_SELF, "ns/net"); if (my_netns < 0) return -1; { struct newlink_req req; if (populate_newlink_req(&req, RTM_NEWLINK, nde, macvlan_link_info, &extras) < 0) goto out; if (userns_call(userns_restore_one_link, 0, &req, sizeof(req), my_netns) < 0) { pr_err("couldn't restore macvlan interface %s via usernsd\n", nde->name); goto out; } } ret = 0; out: if (my_netns >= 0) close(my_netns); return ret; } static int sit_link_info(NetDeviceEntry *nde, struct newlink_req *req) { struct rtattr *sit_data; SitEntry *se = nde->sit; if (!se) { pr_err("Missing sit entry %d\n", nde->ifindex); return -1; } addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "sit", 3); sit_data = NLMSG_TAIL(&req->h); addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); #define DECODE_ENTRY(__type, __ifla, __proto) do { \ __type aux; \ if (se->has_##__proto) { \ aux = se->__proto; \ addattr_l(&req->h, sizeof(*req), __ifla, \ &aux, sizeof(__type)); \ } \ } while (0) if (se->n_local) { if (se->n_local != 1) { pr_err("Too long local addr for sit\n"); return -1; } addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_LOCAL, se->local, sizeof(u32)); } if (se->n_remote) { if (se->n_remote != 1) { pr_err("Too long remote addr for sit\n"); return -1; } addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_REMOTE, se->remote, sizeof(u32)); } DECODE_ENTRY(u32, IFLA_IPTUN_LINK, link); DECODE_ENTRY(u8, IFLA_IPTUN_TTL, ttl); DECODE_ENTRY(u8, IFLA_IPTUN_TOS, tos); DECODE_ENTRY(u16, IFLA_IPTUN_FLAGS, flags); DECODE_ENTRY(u8, IFLA_IPTUN_PROTO, proto); if (se->has_pmtudisc && se->pmtudisc) { u8 aux = 1; addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_PMTUDISC, &aux, sizeof(u8)); } DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_TYPE, encap_type); DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_FLAGS, encap_flags); DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_SPORT, encap_sport); DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_DPORT, encap_dport); if (se->has_rd_prefixlen) { u16 aux; if (se->n_rd_prefix != 4) { pr_err("Bad 
6rd prefixlen for sit\n");
			return -1;
		}

		aux = se->rd_prefixlen;
		addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_PREFIXLEN, &aux, sizeof(u16));
		addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_PREFIX, se->rd_prefix, 4 * sizeof(u32));

		/* The relay prefix is optional even when the 6rd prefix is present */
		if (!se->has_relay_prefixlen)
			goto skip;

		if (se->n_relay_prefix != 1) {
			pr_err("Bad 6rd relay prefixlen for sit\n");
			return -1;
		}

		aux = se->relay_prefixlen;
		addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_RELAY_PREFIXLEN, &aux, sizeof(u16));
		addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_RELAY_PREFIX, se->relay_prefix, sizeof(u32));
skip:;
	}

#undef DECODE_ENTRY

	/* Close the IFLA_INFO_DATA attribute: it spans everything added since sit_data */
	sit_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)sit_data;

	return 0;
}

/*
 * Restore one network device by dispatching on its type.
 *
 * @nlsk is handed to every type-specific restorer; @criu_nlsk is only
 * passed on to the macvlan one. Returns what the type-specific
 * restorer returns, or -1 for an unsupported type.
 */
static int restore_link(NetDeviceEntry *nde, int nlsk, int criu_nlsk)
{
	pr_info("Restoring link %s type %d\n", nde->name, nde->type);

	switch (nde->type) {
	case ND_TYPE__LOOPBACK: /* fallthrough */
	case ND_TYPE__EXTLINK:  /* see comment in images/netdev.proto */
		return restore_link_parms(nde, nlsk);
	case ND_TYPE__VENET:
		return restore_one_link(nde, nlsk, venet_link_info, NULL);
	case ND_TYPE__VETH:
		return restore_one_link(nde, nlsk, veth_link_info, NULL);
	case ND_TYPE__TUN:
		return restore_one_tun(nde, nlsk);
	case ND_TYPE__BRIDGE:
		return restore_one_link(nde, nlsk, bridge_link_info, NULL);
	case ND_TYPE__MACVLAN:
		return restore_one_macvlan(nde, nlsk, criu_nlsk);
	case ND_TYPE__SIT:
		return restore_one_link(nde, nlsk, sit_link_info, NULL);

	default:
		pr_err("Unsupported link type %d\n", nde->type);
		break;
	}

	return -1;
}

/*
 * Read every device entry from the CR_FD_NETDEV image of @pid, restore
 * the link and then write its per-device ipv4/ipv6 sysctl configuration.
 *
 * @netns supplies the namespace defaults that are passed to the
 * conf-op helpers. Returns 0 on success, non-zero on the first failure.
 */
static int restore_links(int pid, NetnsEntry **netns)
{
	/* NOTE(review): criu_nlsk is never opened here, it is always passed as -1 */
	int nlsk, criu_nlsk = -1, ret = -1;
	struct cr_img *img;
	NetDeviceEntry *nde;

	img = open_image(CR_FD_NETDEV, O_RSTR, pid);
	if (!img)
		return -1;

	nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (nlsk < 0) {
		pr_perror("Can't create nlk socket");
		close_image(img);
		return -1;
	}

	while (1) {
		NetnsEntry **def_netns = netns;

		/* <= 0 means either end of image (0) or a read error (< 0) */
		ret = pb_read_one_eof(img, &nde, PB_NETDEV);
		if (ret <= 0)
			break;

		ret = restore_link(nde, nlsk, criu_nlsk);
		if (ret) {
pr_err("Can't restore link\n");
			goto exit;
		}

		/*
		 * optimize restore of devices configuration except lo
		 * lo is created with namespace and before default is set
		 * so we can't optimize its restore
		 */
		if (nde->type == ND_TYPE__LOOPBACK)
			def_netns = NULL;

		/* New-style conf4 takes precedence over the old conf array */
		if (nde->conf4)
			ret = ipv4_conf_op(nde->name, nde->conf4, nde->n_conf4, CTL_WRITE, def_netns ? (*def_netns)->def_conf4 : NULL);
		else if (nde->conf)
			ret = ipv4_conf_op_old(nde->name, nde->conf, nde->n_conf, CTL_WRITE, def_netns ? (*def_netns)->def_conf : NULL);
		if (ret)
			goto exit;

		if (nde->conf6)
			ret = ipv6_conf_op(nde->name, nde->conf6, nde->n_conf6, CTL_WRITE, def_netns ? (*def_netns)->def_conf6 : NULL);
exit:
		/* The entry is released on every iteration, success or not */
		net_device_entry__free_unpacked(nde, NULL);
		if (ret)
			break;
	}

	close(nlsk);
	close_image(img);
	return ret;
}

/*
 * Run the "ip" tool with up to four arguments.
 *
 * The binary comes from the CR_IP_TOOL environment variable and
 * defaults to "ip". @fdin and @fdout are handed to cr_system() as the
 * child's input and output descriptors. Returns 0 on success and -1
 * when cr_system() reports a failure; the error is logged unless
 * CRS_CAN_FAIL is set in @flags.
 */
static int run_ip_tool(char *arg1, char *arg2, char *arg3, char *arg4, int fdin, int fdout, unsigned flags)
{
	char *ip_tool_cmd;
	int ret;

	/* arg3/arg4 may be NULL, print an empty string for them */
	pr_debug("\tRunning ip %s %s %s %s\n", arg1, arg2, arg3 ? : "\0", arg4 ? : "\0");

	ip_tool_cmd = getenv("CR_IP_TOOL");
	if (!ip_tool_cmd)
		ip_tool_cmd = "ip";

	/* A NULL arg3 terminates the argv early, so arg4 is then ignored */
	ret = cr_system(fdin, fdout, -1, ip_tool_cmd,
				(char *[]) { "ip", arg1, arg2, arg3, arg4, NULL }, flags);
	if (ret) {
		if (!(flags & CRS_CAN_FAIL))
			pr_err("IP tool failed on %s %s %s %s\n", arg1, arg2, arg3 ? : "\0", arg4 ?
: "\0");
		return -1;
	}

	return 0;
}

/*
 * Run an iptables-family command line through "sh -c".
 *
 * The command line is taken from the CR_IPTABLES environment variable
 * when set, otherwise @def_cmd is used. Note that the same override is
 * used whatever @def_cmd is. Returns the cr_system() result.
 */
static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
{
	int ret;
	char *cmd;

	cmd = getenv("CR_IPTABLES");
	if (!cmd)
		cmd = def_cmd;
	pr_debug("\tRunning %s for %s\n", cmd, def_cmd);
	ret = cr_system(fdin, fdout, -1, "sh", (char *[]) { "sh", "-c", cmd, NULL }, 0);
	if (ret)
		pr_err("%s failed\n", def_cmd);

	return ret;
}

/* Dump interface addresses: "ip addr save" written into the CR_FD_IFADDR image */
static inline int dump_ifaddr(struct cr_imgset *fds)
{
	struct cr_img *img = img_from_set(fds, CR_FD_IFADDR);
	return run_ip_tool("addr", "save", NULL, NULL, -1, img_raw_fd(img), 0);
}

/*
 * Dump routes: "ip route save" into CR_FD_ROUTE and, when the kernel
 * has ipv6 (kdat.ipv6), "ip -6 route save" into CR_FD_ROUTE6.
 */
static inline int dump_route(struct cr_imgset *fds)
{
	struct cr_img *img;

	img = img_from_set(fds, CR_FD_ROUTE);
	if (run_ip_tool("route", "save", NULL, NULL, -1, img_raw_fd(img), 0))
		return -1;

	/* If ipv6 is disabled, "ip -6 route dump" dumps all routes */
	if (!kdat.ipv6)
		return 0;

	img = img_from_set(fds, CR_FD_ROUTE6);
	if (run_ip_tool("-6", "route", "save", NULL, -1, img_raw_fd(img), 0))
		return -1;

	return 0;
}

/*
 * Dump routing rules with "ip rule save".
 *
 * A failure of the tool is not fatal: a warning is printed and the
 * image file is unlinked instead. Only failing to duplicate the image
 * path makes this function return -1.
 */
static inline int dump_rule(struct cr_imgset *fds)
{
	struct cr_img *img;
	char *path;

	img = img_from_set(fds, CR_FD_RULE);
	/* Keep a private copy of the path for the unlinkat() below */
	path = xstrdup(img->path);

	if (!path)
		return -1;

	if (run_ip_tool("rule", "save", NULL, NULL, -1, img_raw_fd(img), CRS_CAN_FAIL)) {
		pr_warn("Check if \"ip rule save\" is supported!\n");
		unlinkat(get_service_fd(IMG_FD_OFF), path, 0);
	}

	free(path);

	return 0;
}

/*
 * Dump firewall rules: "iptables-save" into CR_FD_IPTABLES and, when
 * the kernel has ipv6, "ip6tables-save" into CR_FD_IP6TABLES.
 */
static inline int dump_iptables(struct cr_imgset *fds)
{
	struct cr_img *img;

	img = img_from_set(fds, CR_FD_IPTABLES);
	if (run_iptables_tool("iptables-save", -1, img_raw_fd(img)))
		return -1;

	if (kdat.ipv6) {
		img = img_from_set(fds, CR_FD_IP6TABLES);
		if (run_iptables_tool("ip6tables-save", -1, img_raw_fd(img)))
			return -1;
	}

	return 0;
}

/*
 * Read the "default" and "all" ipv4/ipv6 device sysctls and write them
 * as one NetnsEntry into the CR_FD_NETNS image.
 *
 * All pointer tables and SysctlEntry objects are carved out of a
 * single allocation that is released before returning.
 */
static int dump_netns_conf(struct cr_imgset *fds)
{
	void *buf, *o_buf;
	int ret = -1;
	int i;
	NetnsEntry netns = NETNS_ENTRY__INIT;
	SysctlEntry *def_confs4 = NULL, *all_confs4 = NULL;
	int size4 = ARRAY_SIZE(devconfs4);
	SysctlEntry *def_confs6 = NULL, *all_confs6 = NULL;
	int size6 = ARRAY_SIZE(devconfs6);
	char
def_stable_secret[MAX_STR_CONF_LEN + 1] = {};
	char all_stable_secret[MAX_STR_CONF_LEN + 1] = {};

	/*
	 * One buffer for everything: for each family there are two
	 * pointer tables (default/all) and two arrays of entries.
	 */
	o_buf = buf = xmalloc(
			size4 * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) * 2 +
			size6 * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) * 2
		     );
	if (!buf)
		goto out;

	/* buf is advanced by each xptr_pull_s(), o_buf keeps the start for xfree() */
	netns.n_def_conf4 = size4;
	netns.n_all_conf4 = size4;
	netns.def_conf4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry*));
	netns.all_conf4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry*));
	def_confs4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry));
	all_confs4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry));

	/* Every ipv4 devconf is a 32-bit value */
	for (i = 0; i < size4; i++) {
		sysctl_entry__init(&def_confs4[i]);
		sysctl_entry__init(&all_confs4[i]);
		netns.def_conf4[i] = &def_confs4[i];
		netns.all_conf4[i] = &all_confs4[i];
		netns.def_conf4[i]->type = CTL_32;
		netns.all_conf4[i]->type = CTL_32;
	}

	netns.n_def_conf6 = size6;
	netns.n_all_conf6 = size6;
	netns.def_conf6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry*));
	netns.all_conf6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry*));
	def_confs6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry));
	all_confs6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry));

	/*
	 * ipv6 devconfs are 32-bit values too, except "stable_secret",
	 * which is a string stored in the on-stack buffers above.
	 */
	for (i = 0; i < size6; i++) {
		sysctl_entry__init(&def_confs6[i]);
		sysctl_entry__init(&all_confs6[i]);
		netns.def_conf6[i] = &def_confs6[i];
		netns.all_conf6[i] = &all_confs6[i];
		if (strcmp(devconfs6[i], "stable_secret")) {
			netns.def_conf6[i]->type = SYSCTL_TYPE__CTL_32;
			netns.all_conf6[i]->type = SYSCTL_TYPE__CTL_32;
		} else {
			netns.def_conf6[i]->type = SYSCTL_TYPE__CTL_STR;
			netns.all_conf6[i]->type = SYSCTL_TYPE__CTL_STR;
			netns.def_conf6[i]->sarg = def_stable_secret;
			netns.all_conf6[i]->sarg = all_stable_secret;
		}
	}

	/* Fill the entries from the kernel, any negative result aborts the dump */
	ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL);
	if (ret < 0)
		goto err_free;
	ret = ipv4_conf_op("all", netns.all_conf4, size4, CTL_READ, NULL);
	if (ret < 0)
		goto err_free;

	ret = ipv6_conf_op("default", netns.def_conf6, size6, CTL_READ, NULL);
	if (ret < 0)
		goto err_free;
	ret = ipv6_conf_op("all", netns.all_conf6, size6, CTL_READ,
NULL);
	if (ret < 0)
		goto err_free;

	ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS);
err_free:
	/* Also reached on success: the single backing buffer is always released */
	xfree(o_buf);
out:
	return ret;
}

/*
 * Feed the image of @type for @pid to "ip <cmd> restore".
 *
 * An empty image is treated as success (nothing to restore). Returns
 * -1 when the image cannot be opened, otherwise the run_ip_tool()
 * result.
 */
static int restore_ip_dump(int type, int pid, char *cmd)
{
	int ret = -1;
	struct cr_img *img;

	img = open_image(type, O_RSTR, pid);
	/* NOTE(review): empty_image() is called before the NULL check, presumably it tolerates NULL */
	if (empty_image(img)) {
		close_image(img);
		return 0;
	}
	if (img) {
		ret = run_ip_tool(cmd, "restore", NULL, NULL, img_raw_fd(img), -1, 0);
		close_image(img);
	}

	return ret;
}

/* Restore interface addresses from the CR_FD_IFADDR image */
static inline int restore_ifaddr(int pid)
{
	return restore_ip_dump(CR_FD_IFADDR, pid, "addr");
}

/* Restore routes from the CR_FD_ROUTE image and then from CR_FD_ROUTE6 */
static inline int restore_route(int pid)
{
	if (restore_ip_dump(CR_FD_ROUTE, pid, "route"))
		return -1;

	/* The ipv6 image is fed to the same "ip route restore" command */
	if (restore_ip_dump(CR_FD_ROUTE6, pid, "route"))
		return -1;

	return 0;
}

/*
 * Restore routing rules from the CR_FD_RULE image.
 *
 * The image is opened here only to tell "missing" (error) from
 * "empty" (nothing to do); the restore itself goes through
 * restore_ip_dump(), which opens the image again.
 */
static inline int restore_rule(int pid)
{
	struct cr_img *img;
	int ret = 0;

	img = open_image(CR_FD_RULE, O_RSTR, pid);
	if (!img) {
		ret = -1;
		goto out;
	}

	if (empty_image(img))
		goto close;

	/*
	 * Delete 3 default rules to prevent duplicates. See kernel's
	 * function fib_default_rules_init() for the details.
*/ run_ip_tool("rule", "flush", NULL, NULL, -1, -1, 0); run_ip_tool("rule", "delete", "table", "local", -1, -1, 0); if (restore_ip_dump(CR_FD_RULE, pid, "rule")) ret = -1; close: close_image(img); out: return ret; } static inline int restore_iptables(int pid) { int ret = -1; struct cr_img *img; img = open_image(CR_FD_IPTABLES, O_RSTR, pid); if (img == NULL) return -1; if (empty_image(img)) { ret = 0; goto ipt6; } ret = run_iptables_tool("iptables-restore -w", img_raw_fd(img), -1); close_image(img); if (ret) return ret; ipt6: img = open_image(CR_FD_IP6TABLES, O_RSTR, pid); if (img == NULL) return -1; if (empty_image(img)) goto out; ret = run_iptables_tool("ip6tables-restore -w", img_raw_fd(img), -1); out: close_image(img); return ret; } static int restore_netns_conf(int pid, NetnsEntry **netns) { int ret = 0; struct cr_img *img; img = open_image(CR_FD_NETNS, O_RSTR, pid); if (!img) return -1; if (empty_image(img)) /* Backward compatibility */ goto out; ret = pb_read_one(img, netns, PB_NETNS); if (ret < 0) { pr_err("Can not read netns object\n"); return -1; } if ((*netns)->def_conf4) { ret = ipv4_conf_op("all", (*netns)->all_conf4, (*netns)->n_all_conf4, CTL_WRITE, NULL); if (ret) goto out; ret = ipv4_conf_op("default", (*netns)->def_conf4, (*netns)->n_def_conf4, CTL_WRITE, NULL); if (ret) goto out; } else if ((*netns)->def_conf) { /* Backward compatibility */ ret = ipv4_conf_op_old("all", (*netns)->all_conf, (*netns)->n_all_conf, CTL_WRITE, NULL); if (ret) goto out; ret = ipv4_conf_op_old("default", (*netns)->def_conf, (*netns)->n_def_conf, CTL_WRITE, NULL); if (ret) goto out; } if ((*netns)->def_conf6) { ret = ipv6_conf_op("all", (*netns)->all_conf6, (*netns)->n_all_conf6, CTL_WRITE, NULL); if (ret) goto out; ret = ipv6_conf_op("default", (*netns)->def_conf6, (*netns)->n_def_conf6, CTL_WRITE, NULL); } out: close_image(img); return ret; } static int mount_ns_sysfs(void) { char sys_mount[] = "crtools-sys.XXXXXX"; BUG_ON(ns_sysfs_fd != -1); /* * A new mntns is 
required to avoid the race between
	 * open_detach_mount and creating mntns.
	 */
	if (unshare(CLONE_NEWNS)) {
		pr_perror("Can't create new mount namespace");
		return -1;
	}

	/* NOTE(review): the flag used is MS_SLAVE although the message says "private" */
	if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) {
		pr_perror("Can't mark the root mount as private");
		return -1;
	}

	/* The template is relative, so the directory is made in the current working directory */
	if (mkdtemp(sys_mount) == NULL) {
		pr_perror("mkdtemp failed %s", sys_mount);
		return -1;
	}

	/*
	 * The setns() is called, so we're in proper context,
	 * no need in pulling the mountpoint from parasite.
	 */
	pr_info("Mount ns' sysfs in %s\n", sys_mount);
	if (mount("sysfs", sys_mount, "sysfs", MS_MGC_VAL, NULL)) {
		pr_perror("mount failed");
		rmdir(sys_mount);
		return -1;
	}

	ns_sysfs_fd = open_detach_mount(sys_mount);
	return ns_sysfs_fd >= 0 ? 0 : -1;
}

/*
 * Dump the network namespace @ns_id.
 *
 * The sysfs mount is prepared first. Configuration, links, addresses,
 * routes, rules and iptables are dumped unless the namespace is marked
 * as empty in opts.empty_ns; conntrack and expectation tables are
 * dumped in either case. Each step runs only if all previous ones
 * succeeded. Returns 0 on success.
 */
int dump_net_ns(int ns_id)
{
	struct cr_imgset *fds;
	int ret;

	fds = cr_imgset_open(ns_id, NETNS, O_DUMP);
	if (fds == NULL)
		return -1;

	ret = mount_ns_sysfs();
	if (!(opts.empty_ns & CLONE_NEWNET)) {
		if (!ret)
			ret = dump_netns_conf(fds);
		if (!ret)
			ret = dump_links(fds);
		if (!ret)
			ret = dump_ifaddr(fds);
		if (!ret)
			ret = dump_route(fds);
		if (!ret)
			ret = dump_rule(fds);
		if (!ret)
			ret = dump_iptables(fds);
	}
	if (!ret)
		ret = dump_nf_ct(fds, CR_FD_NETNF_CT);
	if (!ret)
		ret = dump_nf_ct(fds, CR_FD_NETNF_EXP);

	/* ns_sysfs_fd is still -1 here if mount_ns_sysfs() failed */
	close(ns_sysfs_fd);
	ns_sysfs_fd = -1;

	close_cr_imgset(&fds);
	return ret;
}

/*
 * Restore the network namespace of @pid: the mirror of dump_net_ns().
 *
 * The NetnsEntry read by restore_netns_conf() is only needed while the
 * links are restored and is freed right after. The NS_FD_OFF service
 * descriptor is closed at the end whatever the result is.
 */
int prepare_net_ns(int pid)
{
	int ret = 0;
	NetnsEntry *netns = NULL;

	if (!(opts.empty_ns & CLONE_NEWNET)) {
		ret = restore_netns_conf(pid, &netns);
		if (!ret)
			ret = restore_links(pid, &netns);
		if (netns)
			netns_entry__free_unpacked(netns, NULL);

		if (!ret)
			ret = restore_ifaddr(pid);
		if (!ret)
			ret = restore_route(pid);
		if (!ret)
			ret = restore_rule(pid);
		if (!ret)
			ret = restore_iptables(pid);
	}

	if (!ret)
		ret = restore_nf_ct(pid, CR_FD_NETNF_CT);
	if (!ret)
		ret = restore_nf_ct(pid, CR_FD_NETNF_EXP);

	close_service_fd(NS_FD_OFF);

	return ret;
}

/*
 * Save a descriptor of the current net namespace as the NS_FD_OFF
 * service fd. Does nothing when the root task has no net namespace of
 * its own (CLONE_NEWNET is not in root_ns_mask).
 */
int netns_keep_nsfd(void)
{
	int ns_fd, ret;

	if (!(root_ns_mask & CLONE_NEWNET))
		return 0;

	/*
	 * When restoring a net namespace we need to communicate
	 *
with the original (i.e. -- init) one. Thus, prepare for
	 * that before we leave the existing namespaces.
	 */

	ns_fd = __open_proc(PROC_SELF, 0, O_RDONLY | O_CLOEXEC, "ns/net");
	if (ns_fd < 0)
		return -1;

	ret = install_service_fd(NS_FD_OFF, ns_fd);
	if (ret < 0)
		pr_err("Can't install ns net reference\n");
	else
		pr_info("Saved netns fd for links restore\n");
	/* The original descriptor is closed whether or not the install succeeded */
	close(ns_fd);

	return ret >= 0 ? 0 : -1;
}

/*
 * If we want to modify iptables, we need to receive the current
 * configuration, change it and load a new one into the kernel.
 * iptables can change or add only one rule.
 * iptables-restore allows to make a few changes for one iteration,
 * so it works faster.
 *
 * This helper writes the @size bytes of @buf into a pipe and runs
 * "iptables-restore -w --noflush" (or the ip6tables variant when
 * @ipv6 is set) with the read end as its input. Returns the
 * cr_system() result, or -1 when the pipe cannot be set up or filled.
 */
static int iptables_restore(bool ipv6, char *buf, int size)
{
	int pfd[2], ret = -1;
	char *cmd4[] = {"iptables-restore",  "-w", "--noflush", NULL};
	char *cmd6[] = {"ip6tables-restore", "-w", "--noflush", NULL};
	char **cmd = ipv6 ? cmd6 : cmd4;;

	if (pipe(pfd) < 0) {
		pr_perror("Unable to create pipe");
		return -1;
	}

	/*
	 * The whole ruleset is written before the reader is started.
	 * NOTE(review): presumably relies on the data fitting into the
	 * pipe buffer.
	 */
	if (write(pfd[1], buf, size) < size) {
		pr_perror("Unable to write iptables configugration");
		goto err;
	}
	/* Close the write end so that the tool sees EOF */
	close_safe(&pfd[1]);

	ret = cr_system(pfd[0], -1, -1, cmd[0], cmd, 0);
err:
	close_safe(&pfd[1]);
	close_safe(&pfd[0]);
	return ret;
}

/*
 * Block all traffic in the root task's net namespace by inserting a
 * CRIU chain into INPUT and OUTPUT: packets carrying SOCCR_MARK are
 * accepted and everything else is dropped. The ruleset is loaded for
 * ipv4 and, when the kernel has ipv6, for ipv6 as well.
 */
int network_lock_internal()
{
	char conf[] =	"*filter\n"
				":CRIU - [0:0]\n"
				"-I INPUT -j CRIU\n"
				"-I OUTPUT -j CRIU\n"
				"-A CRIU -m mark --mark " __stringify(SOCCR_MARK) " -j ACCEPT\n"
				"-A CRIU -j DROP\n"
				"COMMIT\n";
	int ret = 0, nsret;

	if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret))
		return -1;

	/* sizeof(conf) - 1: the terminating NUL is not sent */
	ret |= iptables_restore(false, conf, sizeof(conf) - 1);
	if (kdat.ipv6)
		ret |= iptables_restore(true, conf, sizeof(conf) - 1);

	if (ret)
		pr_err("Locking network failed: iptables-restore returned %d.
" "This may be connected to disabled "
			"CONFIG_NETFILTER_XT_MARK kernel build config "
			"option.\n", ret);

	/* Failing to get back to the original namespace is always an error */
	if (restore_ns(nsret, &net_ns_desc))
		ret = -1;

	return ret;
}

/*
 * Undo network_lock_internal(): remove the jumps to the CRIU chain
 * from INPUT and OUTPUT and delete the chain, for ipv4 and, when the
 * kernel has ipv6, for ipv6.
 */
static int network_unlock_internal()
{
	char conf[] =	"*filter\n"
			":CRIU - [0:0]\n"
			"-D INPUT -j CRIU\n"
			"-D OUTPUT -j CRIU\n"
			"-X CRIU\n"
			"COMMIT\n";
	int ret = 0, nsret;

	if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret))
		return -1;

	ret |= iptables_restore(false, conf, sizeof(conf) - 1);
	if (kdat.ipv6)
		ret |= iptables_restore(true, conf, sizeof(conf) - 1);

	if (restore_ns(nsret, &net_ns_desc))
		ret = -1;

	return ret;
}

/*
 * Lock the network of the dumped tree. Only done when the tree has
 * its own net namespace: the ACT_NET_LOCK scripts run first, then the
 * iptables lock is installed.
 */
int network_lock(void)
{
	pr_info("Lock network\n");

	/* Each connection will be locked on dump */
	if (!(root_ns_mask & CLONE_NEWNET))
		return 0;

	if (run_scripts(ACT_NET_LOCK))
		return -1;

	return network_lock_internal();
}

/*
 * Unlock the network. Per-connection TCP locks are always dropped;
 * the scripts and the iptables unlock run only when the tree has its
 * own net namespace. Errors are not reported to the caller.
 */
void network_unlock(void)
{
	pr_info("Unlock network\n");

	cpt_unlock_tcp_connections();
	rst_unlock_tcp_connections();

	if (root_ns_mask & CLONE_NEWNET) {
		run_scripts(ACT_NET_UNLOCK);
		network_unlock_internal();
	}
}

/*
 * Register an external veth pair as the string "veth[<in>]:<out>".
 * The allocated string is handed over to add_external().
 */
int veth_pair_add(char *in, char *out)
{
	char *e_str;

	e_str = xmalloc(200); /* For 3 IFNAMSIZ + 8 service characters */
	if (!e_str)
		return -1;
	snprintf(e_str, 200, "veth[%s]:%s", in, out);
	return add_external(e_str);
}

/*
 * Resolve the interface named by the value of @ext and store its
 * index in ext->data. Returns -1 when the interface does not exist.
 */
int macvlan_ext_add(struct external *ext)
{
	ext->data = (void *) (unsigned long) if_nametoindex(external_val(ext));
	if (ext->data == 0) {
		pr_perror("can't get ifindex of %s", ext->id);
		return -1;
	}

	return 0;
}

/*
 * The setns() syscall (called by switch_ns()) can be extremely
 * slow. If we call it two or more times from the same task the
 * kernel will synchronously go on a very slow routine called
 * synchronize_rcu() trying to put a reference on old namespaces.
 *
 * To avoid doing this more than once we pre-create all the
 * needed other-ns sockets in advance.
*/ static int prep_ns_sockets(struct ns_id *ns, bool for_dump) { int nsret = -1, ret; if (ns->type != NS_CRIU) { pr_info("Switching to %d's net for collecting sockets\n", ns->ns_pid); if (switch_ns(ns->ns_pid, &net_ns_desc, &nsret)) return -1; } if (for_dump) { ret = ns->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); if (ret < 0) { pr_perror("Can't create sock diag socket"); goto err_nl; } } else ns->net.nlsk = -1; ret = ns->net.seqsk = socket(PF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK, 0); if (ret < 0) { pr_perror("Can't create seqsk for parasite"); goto err_sq; } ret = 0; out: if (nsret >= 0 && restore_ns(nsret, &net_ns_desc) < 0) { nsret = -1; if (ret == 0) goto err_ret; } return ret; err_ret: close(ns->net.seqsk); err_sq: if (ns->net.nlsk >= 0) close(ns->net.nlsk); err_nl: goto out; } static int collect_net_ns(struct ns_id *ns, void *oarg) { bool for_dump = (oarg == (void *)1); int ret; pr_info("Collecting netns %d/%d\n", ns->id, ns->ns_pid); ret = prep_ns_sockets(ns, for_dump); if (ret) return ret; if (!for_dump) return 0; return collect_sockets(ns); } int collect_net_namespaces(bool for_dump) { return walk_namespaces(&net_ns_desc, collect_net_ns, (void *)(for_dump ? 1UL : 0)); } struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net"); static int move_to_bridge(struct external *ext, void *arg) { int s = *(int *)arg; int ret; char *out, *br; struct ifreq ifr; out = external_val(ext); if (!out) return -1; br = strchr(out, '@'); if (!br) return 0; *br = '\0'; br++; { pr_debug("\tMoving dev %s to bridge %s\n", out, br); if (s == -1) { s = socket(AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0); if (s < 0) { pr_perror("Can't create control socket"); return -1; } } /* * Add the device to the bridge. 
This is equivalent to: * $ brctl addif */ ifr.ifr_ifindex = if_nametoindex(out); if (ifr.ifr_ifindex == 0) { pr_perror("Can't get index of %s", out); ret = -1; goto out; } strlcpy(ifr.ifr_name, br, IFNAMSIZ); ret = ioctl(s, SIOCBRADDIF, &ifr); if (ret < 0) { pr_perror("Can't add interface %s to bridge %s", out, br); goto out; } /* * Make sure the device is up. This is equivalent to: * $ ip link set dev up */ ifr.ifr_ifindex = 0; strlcpy(ifr.ifr_name, out, IFNAMSIZ); ret = ioctl(s, SIOCGIFFLAGS, &ifr); if (ret < 0) { pr_perror("Can't get flags of interface %s", out); goto out; } ret = 0; if (ifr.ifr_flags & IFF_UP) goto out; ifr.ifr_flags |= IFF_UP; if (changeflags(s, out, ifr.ifr_flags) < 0) goto out; ret = 0; } out: br--; *br = '@'; *(int *)arg = s; return ret; } int move_veth_to_bridge(void) { int sk = -1, ret; ret = external_for_each_type("veth", move_to_bridge, &sk); if (sk >= 0) close(sk); return ret; } criu-3.6/criu/netfilter.c000066400000000000000000000072261317335042600154410ustar00rootroot00000000000000#include #include #include #include #include #include #include "../soccr/soccr.h" #include "util.h" #include "common/list.h" #include "files.h" #include "netfilter.h" #include "sockets.h" #include "sk-inet.h" #include "kerndat.h" static char buf[512]; /* * Need to configure simple netfilter rules for blocking connections * ANy brave soul to write it using xtables-devel? */ #define NF_CONN_CMD "%s %s -t filter %s %s --protocol tcp " \ "-m mark ! --mark " __stringify(SOCCR_MARK) " --source %s --sport %d --destination %s --dport %d -j DROP" static char iptable_cmd_ipv4[] = "iptables"; static char iptable_cmd_ipv6[] = "ip6tables"; void preload_netfilter_modules(void) { int fd = -1; /* same as socket modules, ip_tables and ip6_tables will be loaded by * CRIU, so we should try and preload these as well. 
*/
	fd = open("/dev/null", O_RDWR);
	if (fd < 0) {
		/* cr_system() is then called with -1 for all three descriptors */
		fd = -1;
		pr_perror("failed to open /dev/null, using log fd for net module preload");
	}
	/* The results are ignored on purpose, this is best effort */
	cr_system(fd, fd, fd, iptable_cmd_ipv4,
		(char *[]) { iptable_cmd_ipv4, "-L", "-n", NULL}, 0);
	cr_system(fd, fd, fd, iptable_cmd_ipv6,
		(char *[]) { iptable_cmd_ipv6, "-L", "-n", NULL}, 0);
	close_safe(&fd);
}

/*
 * Insert (@lock) or delete (!@lock) one DROP rule for a TCP connection
 * in the INPUT (@input) or OUTPUT chain.
 *
 * The command line is formatted into the file-scope buffer "buf" and
 * run through "sh -c". Note that the rule's --source/--sport are
 * filled from @dst_addr/@dst_port and --destination/--dport from
 * @src_addr/@src_port.
 *
 * Returns 0 on success, -1 on an unknown family, an address that
 * cannot be printed or a failed command.
 */
static int nf_connection_switch_raw(int family, u32 *src_addr, u16 src_port,
						u32 *dst_addr, u16 dst_port,
						bool input, bool lock)
{
	char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN];
	char *cmd;
	/* argv[2] points at the shared buffer that is filled in below */
	char *argv[4] = { "sh", "-c", buf, NULL };
	int ret;

	switch (family) {
	case AF_INET:
		cmd = iptable_cmd_ipv4;
		break;
	case AF_INET6:
		cmd = iptable_cmd_ipv6;
		break;
	default:
		pr_err("Unknown socket family %d\n", family);
		return -1;
	};

	if (!inet_ntop(family, (void *)src_addr, sip, INET_ADDR_LEN) ||
			!inet_ntop(family, (void *)dst_addr, dip, INET_ADDR_LEN)) {
		pr_perror("nf: Can't translate ip addr");
		return -1;
	}

	/* "-w" is added only when kerndat reports has_xtlocks */
	snprintf(buf, sizeof(buf), NF_CONN_CMD, cmd,
			kdat.has_xtlocks ? "-w" : "",
			lock ? "-I" : "-D",
			input ? "INPUT" : "OUTPUT",
			dip, (int)dst_port, sip, (int)src_port);

	pr_debug("\tRunning iptables [%s]\n", buf);

	/*
	 * cr_system is used here, because it blocks SIGCHLD before waiting
	 * a child and the child can't be waited from SIGCHLD handler.
	 */
	ret = cr_system(-1, -1, -1, "sh", argv, 0);
	if (ret < 0 || !WIFEXITED(ret) || WEXITSTATUS(ret)) {
		pr_err("Iptables configuration failed\n");
		return -1;
	}

	pr_info("%s %s:%d - %s:%d connection\n", lock ?
"Locked" : "Unlocked",
			sip, (int)src_port, dip, (int)dst_port);
	return 0;
}

/*
 * Lock or unlock both directions of the connection described by @sk:
 * first the INPUT rule, then the OUTPUT rule with the endpoints
 * swapped. When the second step fails, the first one is reverted.
 */
static int nf_connection_switch(struct inet_sk_desc *sk, bool lock)
{
	int ret = 0;

	ret = nf_connection_switch_raw(sk->sd.family,
			sk->src_addr, sk->src_port,
			sk->dst_addr, sk->dst_port, true, lock);
	if (ret)
		return -1;

	ret = nf_connection_switch_raw(sk->sd.family,
			sk->dst_addr, sk->dst_port,
			sk->src_addr, sk->src_port, false, lock);
	if (ret) /* rollback */
		nf_connection_switch_raw(sk->sd.family,
			sk->src_addr, sk->src_port,
			sk->dst_addr, sk->dst_port, true, !lock);
	return ret;
}

/* Install the DROP rules for both directions of @sk */
int nf_lock_connection(struct inet_sk_desc *sk)
{
	return nf_connection_switch(sk, true);
}

/* Remove the DROP rules for both directions of @sk */
int nf_unlock_connection(struct inet_sk_desc *sk)
{
	return nf_connection_switch(sk, false);
}

/*
 * Same as nf_unlock_connection(), but the endpoints come from the
 * image entry si->ie. Both rules are always attempted and the results
 * are OR-ed together.
 */
int nf_unlock_connection_info(struct inet_sk_info *si)
{
	int ret = 0;

	ret |= nf_connection_switch_raw(si->ie->family,
			si->ie->src_addr, si->ie->src_port,
			si->ie->dst_addr, si->ie->dst_port, true, false);
	ret |= nf_connection_switch_raw(si->ie->family,
			si->ie->dst_addr, si->ie->dst_port,
			si->ie->src_addr, si->ie->src_port, false, false);
	/*
	 * rollback nothing in case of any error,
	 * because nobody checks errors of this function
	 */

	return ret;
}
criu-3.6/criu/page-pipe.c000066400000000000000000000213371317335042600153130ustar00rootroot00000000000000#include
#undef LOG_PREFIX
#define LOG_PREFIX "page-pipe: "

#include "page.h"
#include "config.h"
#include "util.h"
#include "criu-log.h"
#include "page-pipe.h"
#include "fcntl.h"

/* can existing iov accumulate the page?
*/
/* Extend @iov by one page if @addr is exactly at its end; returns whether it did */
static inline bool iov_grow_page(struct iovec *iov, unsigned long addr)
{
	if ((unsigned long)iov->iov_base + iov->iov_len == addr) {
		iov->iov_len += PAGE_SIZE;
		return true;
	}

	return false;
}

/* Make @iov describe the single page at @addr */
static inline void iov_init(struct iovec *iov, unsigned long addr)
{
	iov->iov_base = (void *)addr;
	iov->iov_len = PAGE_SIZE;
}

/*
 * Allocate a new buffer with its own pipe, record the pipe capacity
 * in pages and append the buffer to pp->bufs. Returns NULL on failure.
 */
static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp)
{
	struct page_pipe_buf *ppb;

	ppb = xmalloc(sizeof(*ppb));
	if (!ppb)
		return NULL;

	if (pipe(ppb->p)) {
		xfree(ppb);
		pr_perror("Can't make pipe for page-pipe");
		return NULL;
	}

	/* NOTE(review): the F_GETPIPE_SZ result is not checked for an error */
	ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE;
	pp->nr_pipes++;

	list_add_tail(&ppb->l, &pp->bufs);

	return ppb;
}

/* Close both pipe ends and free @ppb; the caller takes care of list membership */
static void ppb_destroy(struct page_pipe_buf *ppb)
{
	close(ppb->p[0]);
	close(ppb->p[1]);
	xfree(ppb);
}

/* Set the accounting fields of @ppb; the pipe itself is left untouched */
static void ppb_init(struct page_pipe_buf *ppb, unsigned int pages_in,
		unsigned int nr_segs, unsigned int flags, struct iovec *iov)
{
	ppb->pages_in = pages_in;
	ppb->nr_segs = nr_segs;
	ppb->flags = flags;
	ppb->iov = iov;
}

/*
 * Ask the kernel to resize the pipe of @ppb to @new_size pages and
 * record the size actually granted. Returns -1 if the kernel refuses.
 */
static int ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
{
	int ret;

	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
	if (ret < 0)
		return -1;

	/* fcntl() returned the new size in bytes */
	ret /= PAGE_SIZE;
	BUG_ON(ret < ppb->pipe_size);

	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
	ppb->pipe_size = ret;

	return 0;
}

/*
 * Make a fresh, empty buffer the last one in pp->bufs.
 *
 * A buffer from pp->free_bufs is reused when there is one, otherwise
 * a new one is allocated. In chunk mode no more than
 * NR_PIPES_PER_CHUNK pipes are created and -EAGAIN is returned once
 * the limit is hit. The buffer's iovs start at the first unused slot
 * of pp->iovs.
 */
static int page_pipe_grow(struct page_pipe *pp, unsigned int flags)
{
	struct page_pipe_buf *ppb;
	struct iovec *free_iov;

	pr_debug("Will grow page pipe (iov off is %u)\n", pp->free_iov);

	if (!list_empty(&pp->free_bufs)) {
		ppb = list_first_entry(&pp->free_bufs, struct page_pipe_buf, l);
		list_move_tail(&ppb->l, &pp->bufs);
		goto out;
	}

	if ((pp->flags & PP_CHUNK_MODE) && (pp->nr_pipes == NR_PIPES_PER_CHUNK))
		return -EAGAIN;

	ppb = ppb_alloc(pp);
	if (!ppb)
		return -1;

out:
	free_iov = &pp->iovs[pp->free_iov];
	ppb_init(ppb, 0, 0, flags, free_iov);

	return 0;
}

/*
 * Create a page pipe with room for @nr_segs iovecs. When @iovs is
 * NULL the array is allocated here and PP_OWN_IOVS is set so that it
 * is freed together with the pipe. Returns NULL on failure.
 */
struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags)
{
	struct page_pipe *pp;

pr_debug("Create page pipe for %u segs\n", nr_segs); pp = xzalloc(sizeof(*pp)); if (!pp) return NULL; pp->flags = flags; if (!iovs) { iovs = xmalloc(sizeof(*iovs) * nr_segs); if (!iovs) goto err_free_pp; pp->flags |= PP_OWN_IOVS; } pp->nr_pipes = 0; INIT_LIST_HEAD(&pp->bufs); INIT_LIST_HEAD(&pp->free_bufs); pp->nr_iovs = nr_segs; pp->iovs = iovs; pp->free_iov = 0; pp->nr_holes = 0; pp->free_hole = 0; pp->holes = NULL; if (page_pipe_grow(pp, 0)) goto err_free_iovs; return pp; err_free_iovs: if (pp->flags & PP_OWN_IOVS) xfree(iovs); err_free_pp: xfree(pp); return NULL; } void destroy_page_pipe(struct page_pipe *pp) { struct page_pipe_buf *ppb, *n; pr_debug("Killing page pipe\n"); list_splice(&pp->free_bufs, &pp->bufs); list_for_each_entry_safe(ppb, n, &pp->bufs, l) ppb_destroy(ppb); if (pp->flags & PP_OWN_IOVS) xfree(pp->iovs); xfree(pp); } void page_pipe_reinit(struct page_pipe *pp) { struct page_pipe_buf *ppb, *n; BUG_ON(!(pp->flags & PP_CHUNK_MODE)); pr_debug("Clean up page pipe\n"); list_for_each_entry_safe(ppb, n, &pp->bufs, l) list_move(&ppb->l, &pp->free_bufs); pp->free_hole = 0; if (page_pipe_grow(pp, 0)) BUG(); /* It can't fail, because ppb is in free_bufs */ } static inline int try_add_page_to(struct page_pipe *pp, struct page_pipe_buf *ppb, unsigned long addr, unsigned int flags) { if (ppb->flags != flags) return 1; if (ppb->pages_in == ppb->pipe_size) { unsigned long new_size = ppb->pipe_size << 1; int ret; if (new_size > PIPE_MAX_SIZE) return 1; ret = ppb_resize_pipe(ppb, new_size); if (ret < 0) return 1; /* need to add another buf */ } if (ppb->nr_segs) { if (iov_grow_page(&ppb->iov[ppb->nr_segs - 1], addr)) goto out; if (ppb->nr_segs == UIO_MAXIOV) /* XXX -- shrink pipe back? 
*/ return 1; } pr_debug("Add iov to page pipe (%u iovs, %u/%u total)\n", ppb->nr_segs, pp->free_iov, pp->nr_iovs); iov_init(&ppb->iov[ppb->nr_segs++], addr); pp->free_iov++; BUG_ON(pp->free_iov > pp->nr_iovs); out: ppb->pages_in++; return 0; } static inline int try_add_page(struct page_pipe *pp, unsigned long addr, unsigned int flags) { BUG_ON(list_empty(&pp->bufs)); return try_add_page_to(pp, list_entry(pp->bufs.prev, struct page_pipe_buf, l), addr, flags); } int page_pipe_add_page(struct page_pipe *pp, unsigned long addr, unsigned int flags) { int ret; ret = try_add_page(pp, addr, flags); if (ret <= 0) return ret; ret = page_pipe_grow(pp, flags); if (ret < 0) return ret; ret = try_add_page(pp, addr, flags); BUG_ON(ret > 0); return ret; } #define PP_HOLES_BATCH 32 int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr, unsigned int flags) { if (pp->free_hole >= pp->nr_holes) { pp->holes = xrealloc(pp->holes, (pp->nr_holes + PP_HOLES_BATCH) * sizeof(struct iovec)); if (!pp->holes) return -1; pp->hole_flags = xrealloc(pp->hole_flags, (pp->nr_holes + PP_HOLES_BATCH) * sizeof(unsigned int)); if(!pp->hole_flags) return -1; pp->nr_holes += PP_HOLES_BATCH; } if (pp->free_hole && pp->hole_flags[pp->free_hole - 1] == flags && iov_grow_page(&pp->holes[pp->free_hole - 1], addr)) goto out; iov_init(&pp->holes[pp->free_hole++], addr); pp->hole_flags[pp->free_hole - 1] = flags; out: return 0; } /* * Get ppb and iov that contain addr and count amount of data between * beginning of the pipe belonging to the ppb and addr */ static struct page_pipe_buf *get_ppb(struct page_pipe *pp, unsigned long addr, struct iovec **iov_ret, unsigned long *len) { struct page_pipe_buf *ppb; int i; list_for_each_entry(ppb, &pp->bufs, l) { for (i = 0, *len = 0; i < ppb->nr_segs; i++) { struct iovec *iov = &ppb->iov[i]; unsigned long base = (unsigned long)iov->iov_base; if (addr < base || addr >= base + iov->iov_len) { *len += iov->iov_len; continue; } /* got iov that contains the addr */ 
*len += (addr - base); *iov_ret = iov; list_move(&ppb->l, &pp->bufs); return ppb; } } return NULL; } int pipe_read_dest_init(struct pipe_read_dest *prd) { int ret; if (pipe(prd->p)) { pr_perror("Cannot create pipe for reading from page-pipe"); return -1; } ret = fcntl(prd->p[0], F_SETPIPE_SZ, PIPE_MAX_SIZE * PAGE_SIZE); if (ret < 0) return -1; prd->sink_fd = open("/dev/null", O_WRONLY); if (prd->sink_fd < 0) { pr_perror("Cannot open sink for reading from page-pipe"); return -1; } ret = fcntl(prd->p[0], F_GETPIPE_SZ, 0); pr_debug("Created tee pipe size %d\n", ret); return 0; } int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; struct iovec *iov = NULL; unsigned long skip = 0, len; int ret; /* * Get ppb that contains addr and count length of data between * the beginning of the pipe and addr. If no ppb is found, the * requested page is mapped to zero pfn */ ppb = get_ppb(pp, addr, &iov, &skip); if (!ppb) { *nr_pages = 0; return 0; } if (!(ppb->flags & ppb_flags)) { pr_err("PPB flags mismatch: %x %x\n", ppb_flags, ppb->flags); return false; } /* clamp the request if it passes the end of iovec */ len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; /* we should tee() the requested lenth + the beginning of the pipe */ len += skip; ret = tee(ppb->p[0], prd->p[1], len, 0); if (ret != len) { pr_perror("tee: %d", ret); return -1; } ret = splice(prd->p[0], NULL, prd->sink_fd, NULL, skip, 0); if (ret != skip) { pr_perror("splice: %d", ret); return -1; } return 0; } void page_pipe_destroy_ppb(struct page_pipe_buf *ppb) { list_del(&ppb->l); ppb_destroy(ppb); } void debug_show_page_pipe(struct page_pipe *pp) { struct page_pipe_buf *ppb; int i; struct iovec *iov; if (pr_quelled(LOG_DEBUG)) return; pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, 
pp->nr_iovs);
	list_for_each_entry(ppb, &pp->bufs, l) {
		pr_debug("\tbuf %u pages, %u iovs, flags: %x :\n",
			 ppb->pages_in, ppb->nr_segs, ppb->flags);
		/* Lengths are printed in pages */
		for (i = 0; i < ppb->nr_segs; i++) {
			iov = &ppb->iov[i];
			pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
		}
	}

	pr_debug("* %u holes:\n", pp->free_hole);
	for (i = 0; i < pp->free_hole; i++) {
		iov = &pp->holes[i];
		pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
	}
}
criu-3.6/criu/page-xfer.c000066400000000000000000000617701317335042600153270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include
#include "types.h"
#include "cr_options.h"
#include "servicefd.h"
#include "image.h"
#include "page-xfer.h"
#include "page-pipe.h"
#include "util.h"
#include "protobuf.h"
#include "images/pagemap.pb-c.h"
#include "fcntl.h"
#include "pstree.h"
#include "parasite-syscall.h"
#include "rst_info.h"

/* Socket to the page server, used as the sk of every page-server xfer */
static int page_server_sk = -1;

/*
 * The fixed-size message exchanged with the page server: cmd carries
 * a PS_IOV_* command (plus flags, see encode_ps_cmd()), vaddr and
 * nr_pages describe a range, dst_id is built by encode_pm().
 */
struct page_server_iov {
	u32	cmd;
	u32	nr_pages;
	u64	vaddr;
	u64	dst_id;
};

/* Convert the range described by @ps into an iovec */
static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
{
	iov->iov_base = decode_pointer(ps->vaddr);
	iov->iov_len = ps->nr_pages * PAGE_SIZE;
}

#define PS_IOV_ADD	1
#define PS_IOV_HOLE	2
#define PS_IOV_OPEN	3
#define PS_IOV_OPEN2	4
#define PS_IOV_PARENT	5
#define PS_IOV_ADD_F	6
#define PS_IOV_GET	7

#define PS_IOV_FLUSH		0x1023
#define PS_IOV_FLUSH_N_CLOSE	0x1024

/* The low PS_CMD_BITS of page_server_iov.cmd hold the command */
#define PS_CMD_BITS	16
#define PS_CMD_MASK	((1 << PS_CMD_BITS) - 1)

/* The low PS_TYPE_BITS of page_server_iov.dst_id hold the type */
#define PS_TYPE_BITS	8
#define PS_TYPE_MASK	((1 << PS_TYPE_BITS) - 1)

#define PS_TYPE_PID	(1)
#define PS_TYPE_SHMEM	(2)
/*
 * XXX: When adding new types here check decode_pm for legacy
 * numbers that can be met from older CRIUs
 */

/*
 * Pack an image type and an id into a dst_id: the id goes above
 * PS_TYPE_BITS, the PS_TYPE_* value below. Only CR_FD_PAGEMAP and
 * CR_FD_SHMEM_PAGEMAP are valid types, anything else is a BUG().
 */
static inline u64 encode_pm(int type, long id)
{
	if (type == CR_FD_PAGEMAP)
		type = PS_TYPE_PID;
	else if (type == CR_FD_SHMEM_PAGEMAP)
		type = PS_TYPE_SHMEM;
	else {
		BUG();
		return 0;
	}

	return ((u64)id) << PS_TYPE_BITS | type;
}

/*
 * Unpack a dst_id: stores the id in *@id and returns the CR_FD_*
 * image type, or -1 for an unknown type (*@id is not written then).
 */
static int decode_pm(u64 dst_id, long *id)
{
	int type;

	/*
	 * Magic numbers
below came from the older CRIU versions that * errorneously used the changing CR_FD_* constants. The * changes were made when we merged images together and moved * the CR_FD_-s at the tail of the enum */ type = dst_id & PS_TYPE_MASK; switch (type) { case 10: /* 3.1 3.2 */ case 11: /* 1.3 1.4 1.5 1.6 1.7 1.8 2.* 3.0 */ case 16: /* 1.2 */ case 17: /* 1.0 1.1 */ case PS_TYPE_PID: *id = dst_id >> PS_TYPE_BITS; type = CR_FD_PAGEMAP; break; case 27: /* 1.3 */ case 28: /* 1.4 1.5 */ case 29: /* 1.6 1.7 */ case 32: /* 1.2 1.8 */ case 33: /* 1.0 1.1 3.1 3.2 */ case 34: /* 2.* 3.0 */ case PS_TYPE_SHMEM: *id = dst_id >> PS_TYPE_BITS; type = CR_FD_SHMEM_PAGEMAP; break; default: type = -1; break; } return type; }
/* Pack a command and its flags into one word: flags go above the low PS_CMD_BITS bits. */
static inline u32 encode_ps_cmd(u32 cmd, u32 flags) { return flags << PS_CMD_BITS | cmd; }
/* Extract the command (low PS_CMD_BITS bits) from a packed command word. */
static inline u32 decode_ps_cmd(u32 cmd) { return cmd & PS_CMD_MASK; }
/* Extract the flags (bits above PS_CMD_BITS) from a packed command word. */
static inline u32 decode_ps_flags(u32 cmd) { return cmd >> PS_CMD_BITS; }
/* Send one page_server_iov header over @sk with the given send() flags; a short send is an error. Returns 0 or -1. */
static inline int send_psi_flags(int sk, struct page_server_iov *pi, int flags) { if (send(sk, pi, sizeof(*pi), flags) != sizeof(*pi)) { pr_perror("Can't send PSI %d to server", pi->cmd); return -1; } return 0; }
/* send_psi_flags() with no send() flags. */
static inline int send_psi(int sk, struct page_server_iov *pi) { return send_psi_flags(sk, pi, 0); }
/* page-server xfer */
/* write_pages callback: splice @len bytes from pipe @p into the xfer socket; anything but a full transfer returns -1. */
static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long len) { pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); if (splice(p, NULL, xfer->sk, NULL, len, SPLICE_F_MOVE) != len) { pr_perror("Can't write pages to socket"); return -1; } return 0; }
/* write_pagemap callback: describe @iov to the server as a PS_IOV_ADD_F header carrying @flags. */
static int write_pagemap_to_server(struct page_xfer *xfer, struct iovec *iov, u32 flags) { struct page_server_iov pi = { .cmd = encode_ps_cmd(PS_IOV_ADD_F, flags), .nr_pages = iov->iov_len / PAGE_SIZE, .vaddr = encode_pointer(iov->iov_base), .dst_id = xfer->dst_id, }; return send_psi(xfer->sk, &pi); }
/* close callback: only forgets the socket number, the descriptor itself is not closed here. */
static void close_server_xfer(struct page_xfer *xfer) { xfer->sk = -1; }
static int open_page_server_xfer(struct
page_xfer *xfer, int fd_type, long id) { char has_parent; struct page_server_iov pi = { .cmd = PS_IOV_OPEN2, }; xfer->sk = page_server_sk; xfer->write_pagemap = write_pagemap_to_server; xfer->write_pages = write_pages_to_server; xfer->close = close_server_xfer; xfer->dst_id = encode_pm(fd_type, id); xfer->parent = NULL; pi.dst_id = xfer->dst_id; if (send_psi(xfer->sk, &pi)) { pr_perror("Can't write to page server"); return -1; } /* Push the command NOW */ tcp_nodelay(xfer->sk, true); if (read(xfer->sk, &has_parent, 1) != 1) { pr_perror("The page server doesn't answer"); return -1; } if (has_parent) xfer->parent = (void *) 1; /* This is required for generate_iovs() */ return 0; } /* local xfer */ static int write_pages_loc(struct page_xfer *xfer, int p, unsigned long len) { ssize_t ret; ssize_t curr = 0; while (1) { ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE); if (ret == -1) { pr_perror("Unable to spice data"); return -1; } if (ret == 0) { pr_err("A pipe was closed unexpectedly"); return -1; } curr += ret; if (curr == len) break; } return 0; } static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) { int ret; unsigned long off, end; /* * Try to find pagemap entry in parent, from which * the data will be read on restore. * * This is the optimized version of the page-by-page * read_pagemap_page routine. */ pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { unsigned long pend; ret = p->seek_pagemap(p, off); if (ret <= 0 || !p->pe) { pr_err("Missing %lx in parent pagemap\n", off); return -1; } pr_debug("\tFound %"PRIx64"/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be * shorter, than the hole we write. In this case * we should go ahead and check the remainder. 
*/ pend = p->pe->vaddr + pagemap_len(p->pe); if (end <= pend) return 0; pr_debug("\t\tcontinue on %lx\n", pend); off = pend; } } static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flags) { int ret; PagemapEntry pe = PAGEMAP_ENTRY__INIT; pe.vaddr = encode_pointer(iov->iov_base); pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { ret = dedup_one_iovec(xfer->parent, pe.vaddr, pagemap_len(&pe)); if (ret == -1) { pr_perror("Auto-deduplication failed"); return ret; } } } else if (flags & PE_PARENT) { if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); return -1; } } } if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0) return -1; return 0; } static void close_page_xfer(struct page_xfer *xfer) { if (xfer->parent != NULL) { xfer->parent->close(xfer->parent); xfree(xfer->parent); xfer->parent = NULL; } close_image(xfer->pi); close_image(xfer->pmi); } static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id) { u32 pages_id; xfer->pmi = open_image(fd_type, O_DUMP, id); if (!xfer->pmi) return -1; xfer->pi = open_pages_image(O_DUMP, xfer->pmi, &pages_id); if (!xfer->pi) { close_image(xfer->pmi); return -1; } /* * Open page-read for parent images (if it exists). It will * be used for two things: * 1) when writing a page, those from parent will be dedup-ed * 2) when writing a hole, the respective place would be checked * to exist in parent (either pagemap or hole) */ xfer->parent = NULL; if (fd_type == CR_FD_PAGEMAP || fd_type == CR_FD_SHMEM_PAGEMAP) { int ret; int pfd; int pr_flags = (fd_type == CR_FD_PAGEMAP) ? 
PR_TASK : PR_SHMEM; pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); if (pfd < 0 && errno == ENOENT) goto out; xfer->parent = xmalloc(sizeof(*xfer->parent)); if (!xfer->parent) { close(pfd); return -1; } ret = open_page_read_at(pfd, id, xfer->parent, pr_flags); if (ret <= 0) { pr_perror("No parent image found, though parent directory is set"); xfree(xfer->parent); xfer->parent = NULL; close(pfd); goto out; } close(pfd); } out: xfer->write_pagemap = write_pagemap_loc; xfer->write_pages = write_pages_loc; xfer->close = close_page_xfer; return 0; } int open_page_xfer(struct page_xfer *xfer, int fd_type, long id) { xfer->offset = 0; xfer->transfer_lazy = true; if (opts.use_page_server) return open_page_server_xfer(xfer, fd_type, id); else return open_page_local_xfer(xfer, fd_type, id); } static int page_xfer_dump_hole(struct page_xfer *xfer, struct iovec *hole, u32 flags) { BUG_ON(hole->iov_base < (void *)xfer->offset); hole->iov_base -= xfer->offset; pr_debug("\th %p [%u]\n", hole->iov_base, (unsigned int)(hole->iov_len / PAGE_SIZE)); if (xfer->write_pagemap(xfer, hole, flags)) return -1; return 0; } static int get_hole_flags(struct page_pipe *pp, int n) { unsigned int hole_flags = pp->hole_flags[n]; if (hole_flags == PP_HOLE_PARENT) return PE_PARENT; else BUG(); return -1; } static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp, unsigned int *cur_hole, void *limit) { int ret; for (; *cur_hole < pp->free_hole ; (*cur_hole)++) { struct iovec hole = pp->holes[*cur_hole]; u32 hole_flags; if (limit && hole.iov_base >= limit) break; hole_flags = get_hole_flags(pp, *cur_hole); ret = page_xfer_dump_hole(xfer, &hole, hole_flags); if (ret) return ret; } return 0; } static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *ppb) { if (ppb->flags & PPB_LAZY) /* * Pages that can be lazily restored are always marked as such. * In the case we actually transfer them into image mark them * as present as well. 
*/ return (xfer->transfer_lazy ? PE_PRESENT : 0) | PE_LAZY; else return PE_PRESENT; } int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) { struct page_pipe_buf *ppb; unsigned int cur_hole = 0; int ret; pr_debug("Transferring pages:\n"); list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; u32 flags; ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base); if (ret) return ret; BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); flags = ppb_xfer_flags(xfer, ppb); if (xfer->write_pagemap(xfer, &iov, flags)) return -1; if ((flags & PE_PRESENT) && xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) return -1; } } return dump_holes(xfer, pp, &cur_hole, NULL); } /* * Return: * 1 - if a parent image exists * 0 - if a parent image doesn't exist * -1 - in error cases */ int check_parent_local_xfer(int fd_type, int id) { char path[PATH_MAX]; struct stat st; int ret, pfd; pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); if (pfd < 0 && errno == ENOENT) return 0; snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id); ret = fstatat(pfd, path, &st, 0); if (ret == -1 && errno != ENOENT) { pr_perror("Unable to stat %s", path); close(pfd); return -1; } close(pfd); return (ret == 0); } /* page server */ static int page_server_check_parent(int sk, struct page_server_iov *pi) { int type, ret; long id; type = decode_pm(pi->dst_id, &id); if (type == -1) { pr_err("Unknown pagemap type received\n"); return -1; } ret = check_parent_local_xfer(type, id); if (ret < 0) return -1; if (write(sk, &ret, sizeof(ret)) != sizeof(ret)) { pr_perror("Unable to send response"); return -1; } return 0; } static int check_parent_server_xfer(int fd_type, long id) { struct page_server_iov pi = {}; int has_parent; pi.cmd = PS_IOV_PARENT; 
pi.dst_id = encode_pm(fd_type, id); if (send_psi(page_server_sk, &pi)) return -1; tcp_nodelay(page_server_sk, true); if (read(page_server_sk, &has_parent, sizeof(int)) != sizeof(int)) { pr_perror("The page server doesn't answer"); return -1; } return has_parent; } int check_parent_page_xfer(int fd_type, long id) { if (opts.use_page_server) return check_parent_server_xfer(fd_type, id); else return check_parent_local_xfer(fd_type, id); } struct page_xfer_job { u64 dst_id; int p[2]; unsigned pipe_size; struct page_xfer loc_xfer; }; static struct page_xfer_job cxfer = { .dst_id = ~0, }; static struct pipe_read_dest pipe_read_dest = { .sink_fd = -1, }; static void page_server_close(void) { if (cxfer.dst_id != ~0) cxfer.loc_xfer.close(&cxfer.loc_xfer); if (pipe_read_dest.sink_fd != -1) { close(pipe_read_dest.sink_fd); close(pipe_read_dest.p[0]); close(pipe_read_dest.p[1]); } } static int page_server_open(int sk, struct page_server_iov *pi) { int type; long id; type = decode_pm(pi->dst_id, &id); if (type == -1) { pr_err("Unknown pagemap type received\n"); return -1; } pr_info("Opening %d/%ld\n", type, id); page_server_close(); if (open_page_local_xfer(&cxfer.loc_xfer, type, id)) return -1; cxfer.dst_id = pi->dst_id; if (sk >= 0) { char has_parent = !!cxfer.loc_xfer.parent; if (write(sk, &has_parent, 1) != 1) { pr_perror("Unable to send response"); close_page_xfer(&cxfer.loc_xfer); return -1; } } return 0; } static int prep_loc_xfer(struct page_server_iov *pi) { if (cxfer.dst_id != pi->dst_id) { pr_warn("Deprecated IO w/o open\n"); return page_server_open(-1, pi); } else return 0; } static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) { size_t len; struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; pr_debug("Adding %"PRIx64"/%u\n", pi->vaddr, pi->nr_pages); if (prep_loc_xfer(pi)) return -1; psi2iovec(pi, &iov); if (lxfer->write_pagemap(lxfer, &iov, flags)) return -1; if (!(flags & PE_PRESENT)) return 0; len = iov.iov_len; while (len > 0) { 
ssize_t chunk; chunk = len; if (chunk > cxfer.pipe_size) chunk = cxfer.pipe_size; /* * Splicing into a pipe may end up blocking if pipe is "full", * and we need the SPLICE_F_NONBLOCK flag here. At the same time * splcing from UNIX socket with this flag aborts splice with * the EAGAIN if there's no data in it (TCP looks at the socket * O_NONBLOCK flag _only_ and waits for data), so before doing * the non-blocking splice we need to explicitly wait. */ if (sk_wait_data(sk) < 0) { pr_perror("Can't poll socket"); return -1; } chunk = splice(sk, NULL, cxfer.p[1], NULL, chunk, SPLICE_F_MOVE | SPLICE_F_NONBLOCK); if (chunk < 0) { pr_perror("Can't read from socket"); return -1; } if (chunk == 0) { pr_err("A socket was closed unexpectedly"); return -1; } if (lxfer->write_pages(lxfer, cxfer.p[0], chunk)) return -1; len -= chunk; } return 0; }

/*
 * Serve a PS_IOV_GET request: look up the page-pipe of the task named by
 * @pi->dst_id, pull the requested pages out of it and send them back on
 * @sk as a PS_IOV_ADD_F header followed by the page data.
 *
 * Returns 0 on success and non-zero on error. A request for which no
 * pages are found is an error as well.
 */
static int page_server_get_pages(int sk, struct page_server_iov *pi)
{
	struct pstree_item *item;
	struct page_pipe *pp;
	unsigned long len;
	int ret;

	/*
	 * dst_id arrives from the socket, so don't trust it to name a
	 * task we know about.
	 * NOTE(review): assumes pstree_item_by_virt() returns NULL for an
	 * unknown pid -- confirm against its definition.
	 */
	item = pstree_item_by_virt(pi->dst_id);
	if (!item || !dmpi(item)->mem_pp) {
		pr_err("No page pipe for task %d\n", (int)pi->dst_id);
		return -1;
	}
	pp = dmpi(item)->mem_pp;

	ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr,
			     &pi->nr_pages, PPB_LAZY);
	if (ret)
		return ret;

	/*
	 * The pi is reused for send_psi here, so .nr_pages, .vaddr and
	 * .dst_id all remain intact.
	 */

	if (pi->nr_pages == 0) {
		pr_debug("no iovs found, zero pages\n");
		return -1;
	}

	pi->cmd = encode_ps_cmd(PS_IOV_ADD_F, PE_PRESENT);
	if (send_psi(sk, pi))
		return -1;

	/* widen before multiplying so a large nr_pages cannot wrap */
	len = (unsigned long)pi->nr_pages * PAGE_SIZE;

	ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, SPLICE_F_MOVE);
	if (ret < 0) {
		pr_perror("Can't send pages to socket");
		return -1;
	}
	if (ret != len) {
		pr_err("Sent only %d of %lu bytes to socket\n", ret, len);
		return -1;
	}

	tcp_nodelay(sk, true);

	return 0;
}

static int page_server_serve(int sk) { int ret = -1; bool flushed = false; bool receiving_pages = !opts.lazy_pages; if (receiving_pages) { /* * This socket only accepts data except one thing -- it * writes back the has_parent bit from time to time, so * make it NODELAY all the time.
*/ tcp_nodelay(sk, true); if (pipe(cxfer.p)) { pr_perror("Can't make pipe for xfer"); close(sk); return -1; } cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0); pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size); } else { pipe_read_dest_init(&pipe_read_dest); tcp_cork(sk, true); } while (1) { struct page_server_iov pi; u32 cmd; ret = recv(sk, &pi, sizeof(pi), MSG_WAITALL); if (!ret) break; if (ret != sizeof(pi)) { pr_perror("Can't read pagemap from socket"); ret = -1; break; } flushed = false; cmd = decode_ps_cmd(pi.cmd); switch (cmd) { case PS_IOV_OPEN: ret = page_server_open(-1, &pi); break; case PS_IOV_OPEN2: ret = page_server_open(sk, &pi); break; case PS_IOV_PARENT: ret = page_server_check_parent(sk, &pi); break; case PS_IOV_ADD_F: case PS_IOV_ADD: case PS_IOV_HOLE: { u32 flags; if (likely(cmd == PS_IOV_ADD_F)) flags = decode_ps_flags(pi.cmd); else if (cmd == PS_IOV_ADD) flags = PE_PRESENT; else /* PS_IOV_HOLE */ flags = PE_PARENT; ret = page_server_add(sk, &pi, flags); break; } case PS_IOV_FLUSH: case PS_IOV_FLUSH_N_CLOSE: { int32_t status = 0; ret = 0; /* * An answer must be sent back to inform another side, * that all data were received */ if (write(sk, &status, sizeof(status)) != sizeof(status)) { pr_perror("Can't send the final package"); ret = -1; } flushed = true; break; } case PS_IOV_GET: ret = page_server_get_pages(sk, &pi); break; default: pr_err("Unknown command %u\n", pi.cmd); ret = -1; break; } if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE)) break; } if (receiving_pages && !ret && !flushed) { pr_err("The data were not flushed\n"); ret = -1; } if (ret == 0 && opts.ps_socket == -1) { char c; /* * Wait when a remote side closes the connection * to avoid TIME_WAIT bucket */ if (read(sk, &c, sizeof(c)) != 0) { pr_perror("Unexpected data"); ret = -1; } } page_server_close(); pr_info("Session over\n"); close(sk); return ret; } static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) { struct page_pipe_buf *ppb; int i, ret; 
pr->reset(pr); while (pr->advance(pr)) { unsigned long vaddr = pr->pe->vaddr; for (i = 0; i < pr->pe->nr_pages; i++, vaddr += PAGE_SIZE) { if (pagemap_in_parent(pr->pe)) ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); else ret = page_pipe_add_page(pp, vaddr, pagemap_lazy(pr->pe) ? PPB_LAZY : 0); if (ret) { pr_err("Failed adding page at %lx\n", vaddr); return -1; } } } list_for_each_entry(ppb, &pp->bufs, l) { for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; if (splice(img_raw_fd(pr->pi), NULL, ppb->p[1], NULL, iov.iov_len, SPLICE_F_MOVE) != iov.iov_len) { pr_perror("Splice failed"); return -1; } } } debug_show_page_pipe(pp); return 0; } static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; int nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); return -1; } while (pr.advance(&pr)) if (pagemap_present(pr.pe)) nr_pages += pr.pe->nr_pages; *pp = create_page_pipe(nr_pages, NULL, 0); if (!*pp) { pr_err("Cannot create page pipe for %d\n", pid); return -1; } if (fill_page_pipe(&pr, *pp)) return -1; return 0; } static int page_server_init_send(void) { struct pstree_item *pi; struct page_pipe *pp; BUILD_BUG_ON(sizeof(struct dmp_info) > sizeof(struct rst_info)); if (prepare_dummy_pstree()) return -1; for_each_pstree_item(pi) { if (prepare_dummy_task_state(pi)) return -1; if (!task_alive(pi)) continue; if (page_pipe_from_pagemap(&pp, vpid(pi))) { pr_err("%d: failed to open page-read\n", vpid(pi)); return -1; } /* * prepare_dummy_pstree presumes 'restore' behaviour, * but page_server_get_pages uses dmpi() to get access * to the page-pipe, so we are faking it here. 
*/ memset(rsti(pi), 0, sizeof(struct rst_info)); dmpi(pi)->mem_pp = pp; } return 0; }
/* Page server entry point: unless lazy_pages is set bump the page ids base, otherwise (when not a lazy dump) prepare the page pipes to send from; then reuse opts.ps_socket or set up a TCP server, run it and serve the accepted connection. In daemon mode the process exits with the session result. */
int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) { int ask = -1; int sk = -1; int ret; if (!opts.lazy_pages) up_page_ids_base(); else if (!lazy_dump) if (page_server_init_send()) return -1; if (opts.ps_socket != -1) { ret = 0; ask = opts.ps_socket; pr_info("Re-using ps socket %d\n", ask); goto no_server; } sk = setup_tcp_server("page"); if (sk == -1) return -1; no_server: ret = run_tcp_server(daemon_mode, &ask, cfd, sk); if (ret != 0) return ret > 0 ? 0 : -1; if (ask >= 0) ret = page_server_serve(ask); if (daemon_mode) exit(ret); return ret; }
/* Client side: make page_server_sk usable (reuse opts.ps_socket or connect to opts.addr) and cork it. No-op returning 0 when the page server is not in use. */
static int connect_to_page_server(void) { if (!opts.use_page_server) return 0; if (opts.ps_socket != -1) { page_server_sk = opts.ps_socket; pr_info("Re-using ps socket %d\n", page_server_sk); goto out; } page_server_sk = setup_tcp_client(opts.addr); if (page_server_sk == -1) return -1; out: /* * CORK the socket at the very beginning. As per ANK * the corked by default socket with sporadic NODELAY-s * on urgent data is the smartest mode ever. */ tcp_cork(page_server_sk, true); return 0; }
/* Connect to the page server for sending pages; thin wrapper around connect_to_page_server(). */
int connect_to_page_server_to_send(void) { return connect_to_page_server(); }
/* Send the final FLUSH (or FLUSH_N_CLOSE when an inherited socket is used), read the server's int32 status and close the socket. */
int disconnect_from_page_server(void) { struct page_server_iov pi = { }; int32_t status = -1; int ret = -1; if (!opts.use_page_server) return 0; if (page_server_sk == -1) return 0; pr_info("Disconnect from the page server %s:%u\n", opts.addr, (int)ntohs(opts.port)); if (opts.ps_socket != -1) /* * The socket might not get closed (held by * the parent process) so we must order the * page-server to terminate itself. */ pi.cmd = PS_IOV_FLUSH_N_CLOSE; else pi.cmd = PS_IOV_FLUSH; if (send_psi(page_server_sk, &pi)) goto out; if (read(page_server_sk, &status, sizeof(status)) != sizeof(status)) { pr_perror("The page server doesn't answer"); goto out; } ret = 0; out: close_safe(&page_server_sk); return ret ?
: status; } struct ps_async_read { unsigned long rb; /* read bytes */ unsigned long goal; struct page_server_iov pi; void *pages; ps_async_read_complete complete; void *priv; struct list_head l; }; static LIST_HEAD(async_reads); static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; ar->rb = 0; ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->complete = complete; ar->priv = priv; } static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; ar = xmalloc(sizeof(*ar)); if (ar == NULL) return -1; init_ps_async_read(ar, buf, nr_pages, complete, priv); list_add_tail(&ar->l, &async_reads); return 0; } /* * There are two possible event types we need to handle: * - page info is available as a reply to request_remote_page * - page data is available, and it follows page info we've just received * Since the on dump side communications are completely synchronous, * we can return to epoll right after the reception of page info and * for sure the next time socket event will occur we'll get page data * related to info we've just received */ static int page_server_read(struct ps_async_read *ar, int flags) { int ret, need; void *buf; if (ar->rb < sizeof(ar->pi)) { /* Header */ buf = ((void *)&ar->pi) + ar->rb; need = sizeof(ar->pi) - ar->rb; } else { /* Page(s) data itself */ buf = ar->pages + (ar->rb - sizeof(ar->pi)); need = ar->goal - ar->rb; } ret = recv(page_server_sk, buf, need, flags); if (ret < 0) { pr_perror("Error reading async data from page server"); return -1; } ar->rb += ret; if (ar->rb < ar->goal) return 1; /* * IO complete -- notify the caller and drop the request */ BUG_ON(ar->rb > ar->goal); return ar->complete((int)ar->pi.dst_id, (unsigned long)ar->pi.vaddr, (int)ar->pi.nr_pages, ar->priv); } static int page_server_async_read(struct epoll_rfd *f) { struct ps_async_read *ar; int ret; 
BUG_ON(list_empty(&async_reads)); ar = list_first_entry(&async_reads, struct ps_async_read, l); ret = page_server_read(ar, MSG_DONTWAIT); if (ret > 0) return 0; if (!ret) { list_del(&ar->l); xfree(ar); } return ret; } static struct epoll_rfd ps_rfd; int connect_to_page_server_to_recv(int epfd) { if (connect_to_page_server()) return -1; ps_rfd.fd = page_server_sk; ps_rfd.revent = page_server_async_read; return epoll_add_rfd(epfd, &ps_rfd); } int request_remote_pages(int pid, unsigned long addr, int nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, .nr_pages = nr_pages, .vaddr = addr, .dst_id = pid, }; /* XXX: why MSG_DONTWAIT here? */ if (send_psi_flags(page_server_sk, &pi, MSG_DONTWAIT)) return -1; tcp_nodelay(page_server_sk, true); return 0; } static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete complete, void *priv) { struct ps_async_read ar; int ret = 1; init_ps_async_read(&ar, buf, nr, complete, priv); while (ret == 1) ret = page_server_read(&ar, MSG_WAITALL); return ret; } int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); else return page_server_start_sync_read(buf, nr, complete, priv); } criu-3.6/criu/pagemap-cache.c000066400000000000000000000113431317335042600161130ustar00rootroot00000000000000#include #include #include "page.h" #include "pagemap-cache.h" #include "common/compiler.h" #include "xmalloc.h" #include "util.h" #include "log.h" #include "vma.h" #include "mem.h" #include "kerndat.h" #undef LOG_PREFIX #define LOG_PREFIX "pagemap-cache: " /* To carry up to 2M of physical memory */ #define PMC_SHIFT (21) #define PMC_SIZE (1ul << PMC_SHIFT) #define PMC_MASK (~(PMC_SIZE - 1)) #define PMC_SIZE_GAP (PMC_SIZE / 4) #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) /* * It's a workaround for a kernel bug. 
In the 3.19 kernel when pagemap are read * for a few vma-s for one read call, it returns incorrect data. * https://github.com/xemul/criu/issues/207 */ static bool pagemap_cache_disabled; static inline void pmc_reset(pmc_t *pmc) { memzero(pmc, sizeof(*pmc)); pmc->fd = -1; } static inline void pmc_zap(pmc_t *pmc) { pmc->start = pmc->end = 0; } int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size) { size_t map_size = max(size, (size_t)PMC_SIZE); pmc_reset(pmc); BUG_ON(!vma_head); pmc->pid = pid; pmc->map_len = PAGEMAP_LEN(map_size); pmc->vma_head = vma_head; pmc->map = xmalloc(pmc->map_len); if (!pmc->map) goto err; if (pagemap_cache_disabled) pr_debug("The pagemap cache is disabled\n"); if (kdat.pmap == PM_DISABLED) { /* * FIXME We might need to implement greedy * mode via reading all pages available inside * parasite. * * Actually since linux-4.4 the pagemap file * is available for usernamespace with hiding * PFNs but providing page attributes, so other * option simply require kernel 4.4 and above * for usernamespace support. 
*/ pr_err("No pagemap for %d available\n", pid); goto err; } else { pmc->fd = open_proc(pid, "pagemap"); if (pmc->fd < 0) goto err; } pr_debug("created for pid %d (takes %zu bytes)\n", pid, pmc->map_len); return 0; err: pr_err("Failed to init pagemap for %d\n", pid); pmc_fini(pmc); return -1; } static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) { return &pmc->map[PAGE_PFN(addr - pmc->start)]; } static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) { unsigned long low = vma->e->start & PMC_MASK; unsigned long high = low + PMC_SIZE; size_t len = vma_area_len(vma); size_t size_map; if (high > kdat.task_size) high = kdat.task_size; pmc->start = vma->e->start; pmc->end = vma->e->end; pr_debug("filling VMA %lx-%lx (%zuK) [l:%lx h:%lx]\n", (long)vma->e->start, (long)vma->e->end, len >> 10, low, high); /* * If we meet a small VMA, lets try to fit 2M cache * window at least 75% full, otherwise left as a plain * "one vma at a time" read. Note the VMAs in cache must * fit in solid manner, iow -- either the whole vma fits * the cache window, either plain read is used. * * The benefit (apart redusing the number of read() calls) * is to walk page tables less. */ if (!pagemap_cache_disabled && len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { size_t size_cov = len; size_t nr_vmas = 1; pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n", (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov); list_for_each_entry_continue(vma, pmc->vma_head, list) { if (vma->e->start > high || vma->e->end > high) break; BUG_ON(vma->e->start < low); size_cov += vma_area_len(vma); nr_vmas++; pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n", (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov); } if (nr_vmas > 1) { /* * Note we don't touch low bound since it's set * to first VMA start already and not updating it * allows us to save a couple of code bytes. 
*/ pmc->end = high; pr_debug("\tcache mode [l:%lx h:%lx]\n", pmc->start, pmc->end); } else pr_debug("\tsimple mode [l:%lx h:%lx]\n", pmc->start, pmc->end); } size_map = PAGEMAP_LEN(pmc->end - pmc->start); BUG_ON(pmc->map_len < size_map); BUG_ON(pmc->fd < 0); if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { pmc_zap(pmc); pr_perror("Can't read %d's pagemap file", pmc->pid); return -1; } return 0; } u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) return __pmc_get_map(pmc, vma->e->start); /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); return NULL; } /* Hit for sure */ return __pmc_get_map(pmc, vma->e->start); } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); pmc_reset(pmc); } static void __attribute__((constructor)) pagemap_cache_init(void) { pagemap_cache_disabled = (getenv("CRIU_PMC_OFF") != NULL); } criu-3.6/criu/pagemap.c000066400000000000000000000416411317335042600150560ustar00rootroot00000000000000#include #include #include #include #include #include #include "types.h" #include "image.h" #include "cr_options.h" #include "servicefd.h" #include "pagemap.h" #include "restorer.h" #include "rst-malloc.h" #include "page-xfer.h" #include "fault-injection.h" #include "xmalloc.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" #ifndef SEEK_DATA #define SEEK_DATA 3 #define SEEK_HOLE 4 #endif #define MAX_BUNCH_SIZE 256 /* * One "job" for the preadv() syscall in pagemap.c */ struct page_read_iov { off_t from; /* offset in pi file where to start reading from */ off_t end; /* the end of the read == sum to.iov_len -s */ struct iovec *to; /* destination iovs */ unsigned int nr; /* their number */ struct list_head l; }; static inline bool can_extend_bunch(struct iovec *bunch, unsigned long off, unsigned long len) { 
return /* The next region is the continuation of the existing */ ((unsigned long)bunch->iov_base + bunch->iov_len == off) && /* The resulting region is non empty and is small enough */ (bunch->iov_len == 0 || bunch->iov_len + len < MAX_BUNCH_SIZE * PAGE_SIZE); } static int punch_hole(struct page_read *pr, unsigned long off, unsigned long len, bool cleanup) { int ret; struct iovec * bunch = &pr->bunch; if (!cleanup && can_extend_bunch(bunch, off, len)) { pr_debug("pr%d-%d:Extend bunch len from %zu to %lu\n", pr->pid, pr->id, bunch->iov_len, bunch->iov_len + len); bunch->iov_len += len; } else { if (bunch->iov_len > 0) { pr_debug("Punch!/%p/%zu/\n", bunch->iov_base, bunch->iov_len); ret = fallocate(img_raw_fd(pr->pi), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (unsigned long)bunch->iov_base, bunch->iov_len); if (ret != 0) { pr_perror("Error punching hole"); return -1; } } bunch->iov_base = (void *)off; bunch->iov_len = len; pr_debug("pr%d-%d:New bunch/%p/%zu/\n", pr->pid, pr->id, bunch->iov_base, bunch->iov_len); } return 0; } int dedup_one_iovec(struct page_read *pr, unsigned long off, unsigned long len) { unsigned long iov_end; iov_end = off + len; while (1) { int ret; unsigned long piov_end; struct page_read * prp; ret = pr->seek_pagemap(pr, off); if (ret == 0) { pr_warn("Missing %lx in parent pagemap\n", off); if (off < pr->cvaddr && pr->cvaddr < iov_end) off = pr->cvaddr; else return 0; } if (!pr->pe) return -1; piov_end = pr->pe->vaddr + pagemap_len(pr->pe); if (!pagemap_in_parent(pr->pe)) { ret = punch_hole(pr, pr->pi_off, min(piov_end, iov_end) - off, false); if (ret == -1) return ret; } prp = pr->parent; if (prp) { /* recursively */ pr_debug("Go to next parent level\n"); len = min(piov_end, iov_end) - off; ret = dedup_one_iovec(prp, off, len); if (ret != 0) return -1; } if (piov_end < iov_end) { off = piov_end; continue; } else return 0; } return 0; } static int advance(struct page_read *pr) { pr->curr_pme++; if (pr->curr_pme >= pr->nr_pmes) return 0; 
pr->pe = pr->pmes[pr->curr_pme]; pr->cvaddr = pr->pe->vaddr; return 1; } static void skip_pagemap_pages(struct page_read *pr, unsigned long len) { if (!len) return; if (pagemap_present(pr->pe)) pr->pi_off += len; pr->cvaddr += len; } static int seek_pagemap(struct page_read *pr, unsigned long vaddr) { if (!pr->pe) goto adv; do { unsigned long start = pr->pe->vaddr; unsigned long len = pr->pe->nr_pages * PAGE_SIZE; unsigned long end = start + len; if (vaddr < pr->cvaddr) break; if (vaddr >= start && vaddr < end) { skip_pagemap_pages(pr, vaddr - pr->cvaddr); return 1; } if (end <= vaddr) skip_pagemap_pages(pr, end - pr->cvaddr); adv: ; /* otherwise "label at end of compound stmt" gcc error */ } while (advance(pr)); return 0; } static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { pr_err("Page read err %"PRIx64":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; if (!ppr) { pr_err("No parent for snapshot pagemap\n"); return -1; } /* * Parent pagemap at this point entry may be shorter * than the current vaddr:nr needs, so we have to * carefully 'split' the vaddr:nr into pieces and go * to parent page-read with the longest requests it * can handle. */ do { int p_nr; pr_debug("\tpr%d-%u Read from parent\n", pr->pid, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); if (ret <= 0) { pr_err("Missing %lx in parent pagemap\n", vaddr); return -1; } /* * This is how many pages we have in the parent * page_read starting from vaddr. Go ahead and * read as much as we can. 
*/ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; pr_info("\tparent has %u pages in\n", p_nr); if (p_nr > nr) p_nr = nr; ret = ppr->read_pages(ppr, vaddr, p_nr, buf, flags); if (ret == -1) return ret; /* * OK, let's see how much data we have left and go * to parent page-read again for the next pagemap * entry. */ nr -= p_nr; vaddr += p_nr * PAGE_SIZE; buf += p_nr * PAGE_SIZE; } while (nr); return 0; } static int read_local_page(struct page_read *pr, unsigned long vaddr, unsigned long len, void *buf) { int fd = img_raw_fd(pr->pi); int ret; size_t curr = 0; /* * Flush any pending async requests if any not to break the * linear reading from the pages.img file. */ if (pr->sync(pr)) return -1; pr_debug("\tpr%d-%u Read page from self %lx/%"PRIx64"\n", pr->pid, pr->id, pr->cvaddr, pr->pi_off); while (1) { ret = pread(fd, buf + curr, len - curr, pr->pi_off + curr); if (ret < 1) { pr_perror("Can't read mapping page %d", ret); return -1; } curr += ret; if (curr == len) break; } if (opts.auto_dedup) { ret = punch_hole(pr, pr->pi_off, len, false); if (ret == -1) return -1; } return 0; } static int enqueue_async_iov(struct page_read *pr, void *buf, unsigned long len, struct list_head *to) { struct page_read_iov *pr_iov; struct iovec *iov; pr_iov = xzalloc(sizeof(*pr_iov)); if (!pr_iov) return -1; pr_iov->from = pr->pi_off; pr_iov->end = pr->pi_off + len; iov = xzalloc(sizeof(*iov)); if (!iov) { xfree(pr_iov); return -1; } iov->iov_base = buf; iov->iov_len = len; pr_iov->to = iov; pr_iov->nr = 1; list_add_tail(&pr_iov->l, to); return 0; } int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta) { struct page_read_iov *piov; ta->vma_ios = (struct restore_vma_io *)rst_mem_align_cpos(RM_PRIVATE); ta->vma_ios_n = 0; list_for_each_entry(piov, from, l) { struct restore_vma_io *rio; pr_info("`- render %d iovs (%p:%zd...)\n", piov->nr, piov->to[0].iov_base, piov->to[0].iov_len); rio = rst_mem_alloc(RIO_SIZE(piov->nr), RM_PRIVATE); if (!rio) return 
-1; rio->nr_iovs = piov->nr; rio->off = piov->from; memcpy(rio->iovs, piov->to, piov->nr * sizeof(struct iovec)); ta->vma_ios_n++; } return 0; } int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, struct list_head *to) { struct page_read_iov *cur_async = NULL; struct iovec *iov; if (!list_empty(to)) cur_async = list_entry(to->prev, struct page_read_iov, l); /* * We don't have any async requests or we have new read * request that should happen at pos _after_ some hole from * the previous one. * Start the new preadv request here. */ if (!cur_async || pr->pi_off != cur_async->end) return enqueue_async_iov(pr, buf, len, to); /* * This read is pure continuation of the previous one. Let's * just add another IOV (or extend one of the existing). */ iov = &cur_async->to[cur_async->nr - 1]; if (iov->iov_base + iov->iov_len == buf) { /* Extendable */ iov->iov_len += len; } else { /* Need one more target iovec */ unsigned int n_iovs = cur_async->nr + 1; if (n_iovs >= IOV_MAX) return enqueue_async_iov(pr, buf, len, to); iov = xrealloc(cur_async->to, n_iovs * sizeof(*iov)); if (!iov) return -1; cur_async->to = iov; iov += cur_async->nr; iov->iov_base = buf; iov->iov_len = len; cur_async->nr = n_iovs; } cur_async->end += len; return 0; } static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; /* * There's no API in the kernel to start asynchronous * cached read (or write), so in case someone is asking * for us for urgent async read, just do the regular * cached read. 
*/ if ((flags & (PR_ASYNC|PR_ASAP)) == PR_ASYNC) ret = pagemap_enqueue_iovec(pr, buf, len, &pr->async); else { ret = read_local_page(pr, vaddr, len, buf); if (ret == 0 && pr->io_complete) ret = pr->io_complete(pr, vaddr, nr); } pr->pi_off += len; return ret; } static int read_page_complete(int pid, unsigned long vaddr, int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; if (pr->pid != pid) { pr_err("Out of order read completed (want %d have %d)\n", pr->pid, pid); return -1; } if (pr->io_complete) ret = pr->io_complete(pr, vaddr, nr_pages); return ret; } static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { int ret; /* We always do PR_ASAP mode here (FIXME?) */ ret = request_remote_pages(pr->pid, vaddr, nr); if (!ret) ret = page_server_start_read(buf, nr, read_page_complete, pr, flags); return ret; } static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { pr_info("pr%d-%u Read %lx %u pages\n", pr->pid, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { if (read_parent_page(pr, vaddr, nr, buf, flags) < 0) return -1; } else { if (pr->maybe_read_page(pr, vaddr, nr, buf, flags) < 0) return -1; } pr->cvaddr += nr * PAGE_SIZE; return 1; } static void free_pagemaps(struct page_read *pr) { int i; for (i = 0; i < pr->nr_pmes; i++) pagemap_entry__free_unpacked(pr->pmes[i], NULL); xfree(pr->pmes); } static void advance_piov(struct page_read_iov *piov, ssize_t len) { ssize_t olen = len; int onr = piov->nr; piov->from += len; while (len) { struct iovec *cur = piov->to; if (cur->iov_len <= len) { piov->to++; piov->nr--; len -= cur->iov_len; continue; } cur->iov_base += len; cur->iov_len -= len; break; } pr_debug("Advanced iov %zu bytes, %d->%d iovs, %zu tail\n", olen, onr, piov->nr, len); } static int process_async_reads(struct page_read *pr) { int fd, ret = 0; struct page_read_iov *piov, *n; fd = 
img_raw_fd(pr->pi); list_for_each_entry_safe(piov, n, &pr->async, l) { ssize_t ret; off_t start = piov->from; struct iovec *iovs = piov->to; pr_debug("Read piov iovs %d, from %ju, len %ju, first %p:%zu\n", piov->nr, piov->from, piov->end - piov->from, piov->to->iov_base, piov->to->iov_len); more: ret = preadv(fd, piov->to, piov->nr, piov->from); if (fault_injected(FI_PARTIAL_PAGES)) { /* * We might have read everything, but for debug * purposes let's try to force the advance_piov() * and re-read tail. */ if (ret > 0 && piov->nr >= 2) { pr_debug("`- trim preadv %zu\n", ret); ret /= 2; } } if (ret != piov->end - piov->from) { if (ret < 0) { pr_err("Can't read async pr bytes (%zd / %ju read, %ju off, %d iovs)\n", ret, piov->end - piov->from, piov->from, piov->nr); return -1; } /* * The preadv() can return less than requested. It's * valid and doesn't mean error or EOF. We should advance * the iovecs and continue * * Modify the piov in-place, we're going to drop this one * anyway. */ advance_piov(piov, ret); goto more; } if (opts.auto_dedup && punch_hole(pr, start, ret, false)) return -1; BUG_ON(pr->io_complete); /* FIXME -- implement once needed */ list_del(&piov->l); xfree(iovs); xfree(piov); } if (pr->parent) ret = process_async_reads(pr->parent); return ret; } static void close_page_read(struct page_read *pr) { int ret; BUG_ON(!list_empty(&pr->async)); if (pr->bunch.iov_len > 0) { ret = punch_hole(pr, 0, 0, true); if (ret == -1) return; pr->bunch.iov_len = 0; } if (pr->parent) { close_page_read(pr->parent); xfree(pr->parent); } if (pr->pmi) close_image(pr->pmi); if (pr->pi) close_image(pr->pi); if (pr->pmes) free_pagemaps(pr); } static void reset_pagemap(struct page_read *pr) { pr->cvaddr = 0; pr->pi_off = 0; pr->curr_pme = -1; pr->pe = NULL; /* FIXME: take care of bunch */ if (pr->parent) reset_pagemap(pr->parent); } static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags) { int pfd, ret; struct page_read *parent = NULL; pfd = openat(dfd, 
CR_PARENT_LINK, O_RDONLY); if (pfd < 0 && errno == ENOENT) goto out; parent = xmalloc(sizeof(*parent)); if (!parent) goto err_cl; ret = open_page_read_at(pfd, pid, parent, pr_flags); if (ret < 0) goto err_free; if (!ret) { xfree(parent); parent = NULL; } close(pfd); out: pr->parent = parent; return 0; err_free: xfree(parent); err_cl: close(pfd); return -1; } static void init_compat_pagemap_entry(PagemapEntry *pe) { /* * pagemap image generated with older version will either * contain a hole because the pages are in the parent * shanpshot or a pagemap that should be marked with * PE_PRESENT */ if (pe->has_in_parent && pe->in_parent) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; } /* * The pagemap entry size is at least 8 bytes for small mappings with * low address and may get to 18 bytes or even more for large mappings * with high address and in_parent flag set. 16 seems to be nice round * number to minimize {over,under}-allocations */ #define PAGEMAP_ENTRY_SIZE_ESTIMATE 16 static int init_pagemaps(struct page_read *pr) { off_t fsize; int nr_pmes, nr_realloc; fsize = img_raw_size(pr->pmi); if (fsize < 0) return -1; nr_pmes = fsize / PAGEMAP_ENTRY_SIZE_ESTIMATE + 1; nr_realloc = nr_pmes / 2; pr->pmes = xzalloc(nr_pmes * sizeof(*pr->pmes)); if (!pr->pmes) return -1; pr->nr_pmes = 0; pr->curr_pme = -1; while (1) { int ret = pb_read_one_eof(pr->pmi, &pr->pmes[pr->nr_pmes], PB_PAGEMAP); if (ret < 0) goto free_pagemaps; if (ret == 0) break; init_compat_pagemap_entry(pr->pmes[pr->nr_pmes]); pr->nr_pmes++; if (pr->nr_pmes >= nr_pmes) { nr_pmes += nr_realloc; pr->pmes = xrealloc(pr->pmes, nr_pmes * sizeof(*pr->pmes)); if (!pr->pmes) goto free_pagemaps; } } close_image(pr->pmi); pr->pmi = NULL; return 0; free_pagemaps: free_pagemaps(pr); return -1; } int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags) { int flags, i_typ; static unsigned ids = 1; bool remote = pr_flags & PR_REMOTE; /* * Only the top-most page-read can be 
remote, all the * others are always local. */ pr_flags &= ~PR_REMOTE; if (opts.auto_dedup) pr_flags |= PR_MOD; if (pr_flags & PR_MOD) flags = O_RDWR; else flags = O_RSTR; switch (pr_flags & PR_TYPE_MASK) { case PR_TASK: i_typ = CR_FD_PAGEMAP; break; case PR_SHMEM: i_typ = CR_FD_SHMEM_PAGEMAP; break; default: BUG(); return -1; } INIT_LIST_HEAD(&pr->async); pr->pe = NULL; pr->parent = NULL; pr->cvaddr = 0; pr->pi_off = 0; pr->bunch.iov_len = 0; pr->bunch.iov_base = NULL; pr->pmes = NULL; pr->pieok = false; pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid); if (!pr->pmi) return -1; if (empty_image(pr->pmi)) { close_image(pr->pmi); return 0; } if (try_open_parent(dfd, pid, pr, pr_flags)) { close_image(pr->pmi); return -1; } pr->pi = open_pages_image_at(dfd, flags, pr->pmi, &pr->pages_img_id); if (!pr->pi) { close_page_read(pr); return -1; } if (init_pagemaps(pr)) { close_page_read(pr); return -1; } pr->read_pages = read_pagemap_page; pr->advance = advance; pr->close = close_page_read; pr->skip_pages = skip_pagemap_pages; pr->sync = process_async_reads; pr->seek_pagemap = seek_pagemap; pr->reset = reset_pagemap; pr->io_complete = NULL; /* set up by the client if needed */ pr->id = ids++; pr->pid = pid; if (remote) pr->maybe_read_page = maybe_read_page_remote; else { pr->maybe_read_page = maybe_read_page_local; if (!pr->parent && !opts.lazy_pages) pr->pieok = true; } pr_debug("Opened %s page read %u (parent %u)\n", remote ? "remote" : "local", pr->id, pr->parent ? 
pr->parent->id : 0); return 1; } int open_page_read(int pid, struct page_read *pr, int pr_flags) { return open_page_read_at(get_service_fd(IMG_FD_OFF), pid, pr, pr_flags); } #define DUP_IDS_BASE 1000 void dup_page_read(struct page_read *src, struct page_read *dst) { static int dup_ids = 1; memcpy(dst, src, sizeof(*dst)); INIT_LIST_HEAD(&dst->async); dst->id = src->id + DUP_IDS_BASE * dup_ids++; dst->reset(dst); } criu-3.6/criu/parasite-syscall.c000066400000000000000000000323711317335042600167240ustar00rootroot00000000000000#include #include #include #include #include #include "common/compiler.h" #include "types.h" #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" #include "images/creds.pb-c.h" #include "images/core.pb-c.h" #include "images/pagemap.pb-c.h" #include "imgset.h" #include "parasite-syscall.h" #include "parasite.h" #include "crtools.h" #include "namespaces.h" #include "kerndat.h" #include "config.h" #include "pstree.h" #include "posix-timer.h" #include "mem.h" #include "criu-log.h" #include "vma.h" #include "proc_parse.h" #include "aio.h" #include "fault-injection.h" #include #include "signal.h" #include "sigframe.h" #include #include #include #include "dump.h" #include "restorer.h" #include "pie/pie-relocs.h" #include "infect.h" #include "infect-rpc.h" #include "pie/parasite-blob.h" #include unsigned long get_exec_start(struct vm_area_list *vmas) { struct vma_area *vma_area; list_for_each_entry(vma_area, &vmas->h, list) { unsigned long len; if (vma_area->e->start >= kdat.task_size) continue; if (!(vma_area->e->prot & PROT_EXEC)) continue; len = vma_area_len(vma_area); if (len < PARASITE_START_AREA_MIN) { pr_warn("Suspiciously short VMA @%#lx\n", (unsigned long)vma_area->e->start); continue; } return vma_area->e->start; } return 0; } /* * We need to detect parasite crashes not to hang on socket operations. * Since CRIU holds parasite with ptrace, it will receive SIGCHLD if the * latter would crash. 
* * This puts a restriction on how to execute a sub-process on dump stage. * One should use the cr_system helper, that blocks sigcild and waits * for the spawned program to finish. */ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) { int pid, status; pid = waitpid(-1, &status, WNOHANG); if (pid <= 0) return; pr_err("si_code=%d si_pid=%d si_status=%d\n", siginfo->si_code, siginfo->si_pid, siginfo->si_status); if (WIFEXITED(status)) pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status)); else if (WIFSIGNALED(status)) pr_err("%d was killed by %d unexpectedly: %s\n", pid, WTERMSIG(status), strsignal(WTERMSIG(status))); else if (WIFSTOPPED(status)) pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status)); exit(1); } static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c) { BUILD_BUG_ON(sizeof(ce->groups[0]) != sizeof(c->groups[0])); BUILD_BUG_ON(sizeof(ce->cap_inh[0]) != sizeof(c->cap_inh[0])); BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0])); BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0])); BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0])); BUG_ON(ce->n_cap_inh != CR_CAP_SIZE); BUG_ON(ce->n_cap_prm != CR_CAP_SIZE); BUG_ON(ce->n_cap_eff != CR_CAP_SIZE); BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE); memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE); memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE); memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); ce->secbits = c->secbits; ce->n_groups = c->ngroups; ce->groups = xmemdup(c->groups, sizeof(c->groups[0]) * c->ngroups); ce->uid = c->uids[0]; ce->gid = c->gids[0]; ce->euid = c->uids[1]; ce->egid = c->gids[1]; ce->suid = c->uids[2]; ce->sgid = c->gids[2]; ce->fsuid = c->uids[3]; ce->fsgid = c->gids[3]; return ce->groups ? 
0 : -ENOMEM; } int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core) { ThreadCoreEntry *tc = core->thread_core; struct parasite_dump_thread *args; struct parasite_dump_creds *pc; int ret; args = compel_parasite_args(ctl, struct parasite_dump_thread); pc = args->creds; pc->cap_last_cap = kdat.last_cap; ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl); if (ret < 0) return ret; ret = alloc_groups_copy_creds(tc->creds, pc); if (ret) { pr_err("Can't copy creds for thread leader %d\n", pid); return -1; } return dump_thread_core(pid, core, args); } int parasite_dump_thread_seized(struct parasite_ctl *ctl, int id, struct pid *tid, CoreEntry *core) { struct parasite_dump_thread *args; pid_t pid = tid->real; ThreadCoreEntry *tc = core->thread_core; CredsEntry *creds = tc->creds; struct parasite_dump_creds *pc; int ret; struct parasite_thread_ctl *tctl; BUG_ON(id == 0); /* Leader is dumped in dump_task_core_all */ args = compel_parasite_args(ctl, struct parasite_dump_thread); pc = args->creds; pc->cap_last_cap = kdat.last_cap; tctl = compel_prepare_thread(ctl, pid); if (!tctl) return -1; tc->has_blk_sigset = true; memcpy(&tc->blk_sigset, compel_thread_sigmask(tctl), sizeof(k_rtsigset_t)); ret = compel_get_thread_regs(tctl, save_task_regs, core); if (ret) { pr_err("Can't obtain regs for thread %d\n", pid); goto err_rth; } ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD); if (ret) { pr_err("Can't init thread in parasite %d\n", pid); goto err_rth; } ret = alloc_groups_copy_creds(creds, pc); if (ret) { pr_err("Can't copy creds for thread %d\n", pid); goto err_rth; } compel_release_thread(tctl); tid->ns[0].virt = args->tid; return dump_thread_core(pid, core, args); err_rth: compel_release_thread(tctl); return -1; } int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) { TaskCoreEntry *tc = item->core[0]->tc; struct parasite_dump_sa_args *args; int ret, sig; SaEntry *sa, **psa; args = 
compel_parasite_args(ctl, struct parasite_dump_sa_args); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); if (ret < 0) return ret; psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); if (!psa) return -1; sa = (SaEntry *)(psa + SIGMAX - 2); tc->n_sigactions = SIGMAX - 2; tc->sigactions = psa; for (sig = 1; sig <= SIGMAX; sig++) { int i = sig - 1; if (sig == SIGSTOP || sig == SIGKILL) continue; sa_entry__init(sa); ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); sa->has_compat_sigaction = true; sa->compat_sigaction = !compel_mode_native(ctl); *(psa++) = sa++; } return 0; } static void encode_itimer(struct itimerval *v, ItimerEntry *ie) { ie->isec = v->it_interval.tv_sec; ie->iusec = v->it_interval.tv_usec; ie->vsec = v->it_value.tv_sec; ie->vusec = v->it_value.tv_usec; } int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) { CoreEntry *core = item->core[0]; struct parasite_dump_itimers_args *args; int ret; args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); if (ret < 0) return ret; encode_itimer((&args->real), (core->tc->timers->real)); \ encode_itimer((&args->virt), (core->tc->timers->virt)); \ encode_itimer((&args->prof), (core->tc->timers->prof)); \ return 0; } static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) { int sz; /* * Will be free()-ed in core_entry_free() */ sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); tte->posix = xmalloc(sz); if (!tte->posix) return -1; tte->n_posix = n; *pte = (PosixTimerEntry *)(tte->posix + n); return 0; } static void encode_posix_timer(struct posix_timer *v, struct 
proc_posix_timer *vp, PosixTimerEntry *pte) { pte->it_id = vp->spt.it_id; pte->clock_id = vp->spt.clock_id; pte->si_signo = vp->spt.si_signo; pte->it_sigev_notify = vp->spt.it_sigev_notify; pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); pte->overrun = v->overrun; pte->isec = v->val.it_interval.tv_sec; pte->insec = v->val.it_interval.tv_nsec; pte->vsec = v->val.it_value.tv_sec; pte->vnsec = v->val.it_value.tv_nsec; } int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, struct pstree_item *item) { CoreEntry *core = item->core[0]; TaskTimersEntry *tte = core->tc->timers; PosixTimerEntry *pte; struct proc_posix_timer *temp; struct parasite_dump_posix_timers_args *args; int args_size; int ret = 0; int i; if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) return -1; args_size = posix_timers_dump_size(proc_args->timer_n); args = compel_parasite_args_s(ctl, args_size); args->timer_n = proc_args->timer_n; i = 0; list_for_each_entry(temp, &proc_args->timers, list) { args->timer[i].it_id = temp->spt.it_id; i++; } ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); if (ret < 0) goto end_posix; i = 0; list_for_each_entry(temp, &proc_args->timers, list) { posix_timer_entry__init(&pte[i]); encode_posix_timer(&args->timer[i], temp, &pte[i]); tte->posix[i] = &pte[i]; i++; } end_posix: free_posix_timers(proc_args); return ret; } int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) { struct parasite_dump_misc *ma; ma = compel_parasite_args(ctl, struct parasite_dump_misc); if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) return -1; *misc = *ma; return 0; } struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type) { struct parasite_tty_args *p; p = compel_parasite_args(ctl, struct parasite_tty_args); p->fd = fd; p->type = type; if (compel_rpc_call_sync(PARASITE_CMD_DUMP_TTY, ctl) < 0) return NULL; return p; } int 
parasite_drain_fds_seized(struct parasite_ctl *ctl, struct parasite_drain_fd *dfds, int nr_fds, int off, int *lfds, struct fd_opts *opts) { int ret = -1, size, sk; struct parasite_drain_fd *args; size = drain_fds_size(dfds); args = compel_parasite_args_s(ctl, size); args->nr_fds = nr_fds; memcpy(&args->fds, dfds->fds + off, sizeof(int) * nr_fds); ret = compel_rpc_call(PARASITE_CMD_DRAIN_FDS, ctl); if (ret) { pr_err("Parasite failed to drain descriptors\n"); goto err; } sk = compel_rpc_sock(ctl); ret = recv_fds(sk, lfds, nr_fds, opts, sizeof(struct fd_opts)); if (ret) pr_err("Can't retrieve FDs from socket\n"); ret |= compel_rpc_sync(PARASITE_CMD_DRAIN_FDS, ctl); err: return ret; } int parasite_get_proc_fd_seized(struct parasite_ctl *ctl) { int ret = -1, fd, sk; ret = compel_rpc_call(PARASITE_CMD_GET_PROC_FD, ctl); if (ret) { pr_err("Parasite failed to get proc fd\n"); return ret; } sk = compel_rpc_sock(ctl); fd = recv_fd(sk); if (fd < 0) pr_err("Can't retrieve FD from socket\n"); if (compel_rpc_sync(PARASITE_CMD_GET_PROC_FD, ctl)) { close_safe(&fd); return -1; } return fd; } /* This is officially the 50000'th line in the CRIU source code */ int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_args *cgroup) { int ret; struct parasite_dump_cgroup_args *ca; ca = compel_parasite_args(ctl, struct parasite_dump_cgroup_args); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_CGROUP, ctl); if (ret) { pr_err("Parasite failed to dump /proc/self/cgroup\n"); return ret; } *cgroup = *ca; return 0; } static unsigned long parasite_args_size = PARASITE_ARG_SIZE_MIN; void parasite_ensure_args_size(unsigned long sz) { if (parasite_args_size < sz) parasite_args_size = sz; } static int make_sigframe(void *arg, struct rt_sigframe *sf, struct rt_sigframe *rtsf, k_rtsigset_t *bs) { return construct_sigframe(sf, rtsf, bs, (CoreEntry *)arg); } struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, struct vm_area_list *vma_area_list) { struct 
parasite_ctl *ctl; struct infect_ctx *ictx; unsigned long p; BUG_ON(item->threads[0].real != pid); p = get_exec_start(vma_area_list); if (!p) { pr_err("No suitable VM found\n"); return NULL; } ctl = compel_prepare_noctx(pid); if (!ctl) return NULL; ictx = compel_infect_ctx(ctl); ictx->open_proc = do_open_proc; ictx->child_handler = sigchld_handler; ictx->orig_handler.sa_handler = SIG_DFL; ictx->orig_handler.sa_flags = SA_SIGINFO | SA_RESTART; sigemptyset(&ictx->orig_handler.sa_mask); sigaddset(&ictx->orig_handler.sa_mask, SIGCHLD); ictx->sock = dmpi(item)->netns->net.seqsk; ictx->save_regs = save_task_regs; ictx->make_sigframe = make_sigframe; ictx->regs_arg = item->core[0]; ictx->task_size = kdat.task_size; ictx->syscall_ip = p; pr_debug("Parasite syscall_ip at %#lx\n", p); if (fault_injected(FI_NO_MEMFD)) ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; if (fault_injected(FI_NO_BREAKPOINTS)) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; ictx->log_fd = log_get_fd(); parasite_setup_c_header(ctl); parasite_ensure_args_size(dump_pages_args_size(vma_area_list)); parasite_ensure_args_size(aio_rings_args_size(vma_area_list)); if (compel_infect(ctl, item->nr_threads, parasite_args_size) < 0) { compel_cure(ctl); return NULL; } parasite_args_size = PARASITE_ARG_SIZE_MIN; /* reset for next task */ memcpy(&item->core[0]->tc->blk_sigset, compel_task_sigmask(ctl), sizeof(k_rtsigset_t)); dmpi(item)->parasite_ctl = ctl; return ctl; } criu-3.6/criu/path.c000066400000000000000000000044571317335042600144040ustar00rootroot00000000000000#include #include #include #include "int.h" #include "mount.h" #include "path.h" #include "log.h" #include "common/bug.h" char *cut_root_for_bind(char *target_root, char *source_root) { int tok = 0; char *path = NULL; /* * Cut common part of root. 
* For non-root binds the source is always "/" (checked) * so this will result in this slash removal only. */ while (target_root[tok] == source_root[tok]) { tok++; if (source_root[tok] == '\0') { path = target_root + tok; goto out; } if (target_root[tok] == '\0') { path = source_root + tok; goto out; } } return NULL; out: BUG_ON(path == NULL); if (path[0] == '/') path++; return path; } char *mnt_get_sibling_path(struct mount_info *m, struct mount_info *p, char *buf, int len) { struct mount_info *pa = m->parent; char *rpath, *cut_root, *path = buf; int off = 0; if (pa == NULL) return NULL; rpath = m->mountpoint + strlen(pa->mountpoint); if (rpath[0] == '/') rpath++; /* * Get a path to a sibling of "m" with parent "p", * return NULL is p can't have a sibling of m. * * Here are two cases: * When a parent of "m" has longer root than "p": * / pm->root / rpath * | cut_root | * / p->root / * In this case, a sibling path is a sum of p->mountpoint, * cut_root and rpath. * * When a parent of m has shorter root than "p": * / pm->root / rpath * | cut_root | * / p->root / rpath +strlen(cut_root) * In this case, a sibling path is a sum of p->mountpoint and * rpath - strlen(cut_root). 
*/ cut_root = cut_root_for_bind(pa->root, p->root); if (cut_root == NULL) return NULL; if (p->mountpoint[1] != 0) /* not "/" */ { off = snprintf(path, len, "%s", p->mountpoint); if (path[off - 1] == '/') /* p->mountpoint = "./" */ off--; } len -= off; path += off; if (strlen(pa->root) > strlen(p->root)) { off = snprintf(path, len, "/%s", cut_root); len -= off; path += off; } else { int len = strlen(cut_root); if (strncmp(rpath, cut_root, len)) return NULL; rpath += strlen(cut_root); if (len > 0 && (rpath[0] && rpath[0] != '/')) return NULL; } if (rpath[0] == '/') rpath++; if (rpath[0] != '\0') off = snprintf(path, len, "/%s", rpath); return buf; } criu-3.6/criu/pie-util-vdso-elf32.c000077700000000000000000000000001317335042600226232pie/util-vdso-elf32.custar00rootroot00000000000000criu-3.6/criu/pie-util-vdso.c000077700000000000000000000000001317335042600210012pie/util-vdso.custar00rootroot00000000000000criu-3.6/criu/pie-util.c000077700000000000000000000000001317335042600170572pie/util.custar00rootroot00000000000000criu-3.6/criu/pie/000077500000000000000000000000001317335042600140475ustar00rootroot00000000000000criu-3.6/criu/pie/Makefile000066400000000000000000000025461317335042600155160ustar00rootroot00000000000000target := parasite restorer CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) ccflags-y += $(COMPEL_UAPI_INCLUDES) ccflags-y += $(CFLAGS_PIE) ccflags-y += -DCR_NOGLIBC ccflags-y += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 ccflags-y += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 ifneq ($(filter-out clean mrproper,$(MAKECMDGOALS)),) CFLAGS += $(shell $(COMPEL_BIN) cflags) LDFLAGS += $(shell $(COMPEL_BIN) ldflags) compel_plugins := $(shell $(COMPEL_BIN) plugins) endif ifeq ($(SRCARCH),arm) ccflags-y += -marm endif asflags-y += -D__ASSEMBLY__ LDS := compel/arch/$(SRCARCH)/scripts/compel-pack.lds.S restorer-obj-y += ./$(ARCH_DIR)/restorer.o ifeq ($(ARCH),x86) ifeq ($(CONFIG_COMPAT),y) restorer-obj-y += ./$(ARCH_DIR)/call32.o restorer-obj-y += 
./$(ARCH_DIR)/restorer_unmap.o restorer-obj-y += ./$(ARCH_DIR)/sigaction_compat_pie.o endif endif define gen-pie-rules $(1)-obj-y += $(1).o $(1)-obj-e += pie.lib.a $(1)-obj-e += $$(compel_plugins) # Dependency on compel linker script, to relink if it has changed $$(obj)/$(1).built-in.o: $$(LDS) $$(obj)/$(1)-blob.h: $$(obj)/$(1).built-in.o $$(call msg-gen, $$@) $$(Q) $$(COMPEL_BIN) hgen -f $$< -o $$@ all-y += $$(obj)/$(1)-blob.h cleanup-y += $$(obj)/$(1)-blob.h endef $(foreach t,$(target),$(eval $(call gen-pie-rules,$(t)))) criu-3.6/criu/pie/Makefile.library000066400000000000000000000017761317335042600171650ustar00rootroot00000000000000lib-name := pie.lib.a CFLAGS += -fno-stack-protector -DCR_NOGLIBC -fpie lib-y += util.o ifeq ($(VDSO),y) lib-y += util-vdso.o parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o ifeq ($(SRCARCH),aarch64) lib-y += ./$(ARCH_DIR)/intraprocedure.o endif ifeq ($(SRCARCH),ppc64) lib-y += ./$(ARCH_DIR)/vdso-trampoline.o endif endif ifeq ($(SRCARCH),ppc64) lib-y += ./$(ARCH_DIR)/misc.o endif ifeq ($(SRCARCH),x86) ifeq ($(CONFIG_COMPAT),y) lib-y += util-vdso-elf32.o endif CFLAGS_util-vdso-elf32.o += -DCONFIG_VDSO_32 endif # # We can't provide proper mount implementation # in parasite code -- it requires run-time rellocation # applications, which is not the target of the # project. 
# CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) asflags-y := -D__ASSEMBLY__ ccflags-y += $(COMPEL_UAPI_INCLUDES) ccflags-y += $(CFLAGS_PIE) ifeq ($(SRCARCH),arm) ccflags-y += -marm endif criu-3.6/criu/pie/parasite-vdso.c000066400000000000000000000166301317335042600170020ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "int.h" #include "types.h" #include "page.h" #include #include "image.h" #include "parasite-vdso.h" #include "vma.h" #include "log.h" #include "common/bug.h" #ifdef LOG_PREFIX # undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size) { unsigned long addr; pr_debug("Remap %s %lx -> %lx\n", who, from, to); addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to); if (addr != to) { pr_err("Unable to remap %lx -> %lx %lx\n", from, to, addr); return -1; } return 0; } /* * Park runtime vDSO in some safe place where it can be accessible * from the restorer */ int vdso_do_park(struct vdso_maps *rt, unsigned long park_at, unsigned long park_size) { unsigned long vvar_size = rt->sym.vvar_size; unsigned long vdso_size = rt->sym.vdso_size; unsigned long rt_vvar_park = park_at; unsigned long rt_vdso_park = park_at; int ret; if (rt->vvar_start == VVAR_BAD_ADDR) { BUG_ON(vdso_size < park_size); return vdso_remap("rt-vdso", rt->vdso_start, rt_vdso_park, vdso_size); } BUG_ON((vdso_size + vvar_size) < park_size); if (rt->sym.vdso_before_vvar) rt_vvar_park = park_at + vdso_size; else rt_vdso_park = park_at + vvar_size; ret = vdso_remap("rt-vdso", rt->vdso_start, rt_vdso_park, vdso_size); ret |= vdso_remap("rt-vvar", rt->vvar_start, rt_vvar_park, vvar_size); return ret; } /* XXX: move in arch/ */ #if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) int __vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t, bool compat_vdso) { if (compat_vdso) return 
vdso_fill_symtable_compat(mem, size, t); else return vdso_fill_symtable(mem, size, t); } #else int __vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t, bool __always_unused compat_vdso) { return vdso_fill_symtable(mem, size, t); } #endif /* * Proxification strategy * * - There might be two vDSO zones: vdso code and optionally vvar data * - To be able to use in-place remapping we need * * a) Size and order of vDSO zones are to match * b) Symbols offsets must match * c) Have same number of vDSO zones */ static bool blobs_matches(VmaEntry *vdso_img, VmaEntry *vvar_img, struct vdso_symtable *sym_img, struct vdso_symtable *sym_rt) { size_t i; if (vma_entry_len(vdso_img) != sym_rt->vdso_size) return false; for (i = 0; i < ARRAY_SIZE(sym_img->symbols); i++) { if (sym_img->symbols[i].offset != sym_rt->symbols[i].offset) return false; } if (vvar_img && sym_rt->vvar_size != VVAR_BAD_SIZE) { bool vdso_firstly = (vvar_img->start > vdso_img->start); if (sym_rt->vvar_size != vma_entry_len(vvar_img)) return false; return (vdso_firstly == sym_rt->vdso_before_vvar); } return true; } /* * The easy case -- the vdso from an image has the same offsets, * order and size as runtime vdso, so we simply remap runtime vdso * to dumpee position without generating any proxy. 
*/ static int remap_rt_vdso(VmaEntry *vma_vdso, VmaEntry *vma_vvar, struct vdso_symtable *sym_rt, unsigned long vdso_rt_parked_at) { unsigned long rt_vvar_addr = vdso_rt_parked_at; unsigned long rt_vdso_addr = vdso_rt_parked_at; int ret; pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n"); if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) { pr_err("Failed to unmap dumpee vdso\n"); return -1; } if (!vma_vvar) { return vdso_remap("rt-vdso", rt_vdso_addr, vma_vdso->start, sym_rt->vdso_size); } if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) { pr_err("Failed to unmap dumpee vvar\n"); return -1; } if (vma_vdso->start < vma_vvar->start) rt_vvar_addr = vdso_rt_parked_at + sym_rt->vdso_size; else rt_vdso_addr = vdso_rt_parked_at + sym_rt->vvar_size; ret = vdso_remap("rt-vdso", rt_vdso_addr, vma_vdso->start, sym_rt->vdso_size); ret |= vdso_remap("rt-vvar", rt_vvar_addr, vma_vvar->start, sym_rt->vvar_size); return ret; } /* * The complex case -- we need to proxify calls. We redirect * calls from dumpee vdso to runtime vdso, making dumpee * to operate as proxy vdso. */ static int add_vdso_proxy(VmaEntry *vma_vdso, VmaEntry *vma_vvar, struct vdso_symtable *sym_img, struct vdso_symtable *sym_rt, unsigned long vdso_rt_parked_at, bool compat_vdso) { unsigned long rt_vvar_addr = vdso_rt_parked_at; unsigned long rt_vdso_addr = vdso_rt_parked_at; unsigned long orig_vvar_addr = vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR; pr_info("Runtime vdso mismatches dumpee, generate proxy\n"); /* * Don't forget to shift if vvar is before vdso. */ if (sym_rt->vvar_size == VVAR_BAD_SIZE) { rt_vvar_addr = VVAR_BAD_ADDR; } else { if (sym_rt->vdso_before_vvar) rt_vvar_addr += sym_rt->vdso_size; else rt_vdso_addr += sym_rt->vvar_size; } /* * Note: we assume that after first migration with inserted * rt-vdso and trampoilines on the following migrations * number of vdso symbols will not decrease. 
* We don't save the content of original vdso under inserted * jumps, so we can't remove them if on the following migration * found that number of symbols in vdso has decreased. */ if (vdso_redirect_calls(rt_vdso_addr, vma_vdso->start, sym_rt, sym_img, compat_vdso)) { pr_err("Failed to proxify dumpee contents\n"); return -1; } /* * Put a special mark into runtime vdso, thus at next checkpoint * routine we could detect this vdso and do not dump it, since * it's auto-generated every new session if proxy required. */ sys_mprotect((void *)rt_vdso_addr, sym_rt->vdso_size, PROT_WRITE); vdso_put_mark((void *)rt_vdso_addr, rt_vvar_addr, vma_vdso->start, orig_vvar_addr); sys_mprotect((void *)rt_vdso_addr, sym_rt->vdso_size, VDSO_PROT); return 0; } int vdso_proxify(struct vdso_symtable *sym_rt, unsigned long vdso_rt_parked_at, VmaEntry *vmas, size_t nr_vmas, bool compat_vdso, bool force_trampolines) { VmaEntry *vma_vdso = NULL, *vma_vvar = NULL; struct vdso_symtable s = VDSO_SYMTABLE_INIT; unsigned int i; for (i = 0; i < nr_vmas; i++) { if (vma_entry_is(&vmas[i], VMA_AREA_VDSO)) vma_vdso = &vmas[i]; else if (vma_entry_is(&vmas[i], VMA_AREA_VVAR)) vma_vvar = &vmas[i]; } if (!vma_vdso && !vma_vvar) { pr_info("No VVAR, no vDSO in image\n"); /* * We don't have to unmap rt-vdso, rt-vvar as we didn't * park them previously. */ return 0; } if (!vma_vdso) { pr_err("Can't find vDSO area in image\n"); return -1; } /* * vDSO mark overwrites Elf program header of proxy vDSO thus * it must never ever be greater in size. */ BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr)); /* * Find symbols in vDSO zone read from image. */ if (__vdso_fill_symtable((uintptr_t)vma_vdso->start, vma_entry_len(vma_vdso), &s, compat_vdso)) return -1; pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n", (unsigned long)vma_vdso->start, (unsigned long)vma_vdso->end, vma_vvar ? (unsigned long)vma_vvar->start : VVAR_BAD_ADDR, vma_vvar ? 
(unsigned long)vma_vvar->end : VVAR_BAD_ADDR); if (blobs_matches(vma_vdso, vma_vvar, &s, sym_rt) && !force_trampolines) { return remap_rt_vdso(vma_vdso, vma_vvar, sym_rt, vdso_rt_parked_at); } return add_vdso_proxy(vma_vdso, vma_vvar, &s, sym_rt, vdso_rt_parked_at, compat_vdso); } criu-3.6/criu/pie/parasite.c000066400000000000000000000346671317335042600160430ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "int.h" #include "types.h" #include #include "parasite.h" #include "config.h" #include "fcntl.h" #include "prctl.h" #include "common/lock.h" #include "parasite-vdso.h" #include "criu-log.h" #include "tty.h" #include "aio.h" #include "asm/parasite.h" #include "restorer.h" #include "infect-pie.h" /* * PARASITE_CMD_DUMPPAGES is called many times and the parasite args contains * an array of VMAs at this time, so VMAs can be unprotected in any moment */ static struct parasite_dump_pages_args *mprotect_args = NULL; #ifndef SPLICE_F_GIFT #define SPLICE_F_GIFT 0x08 #endif #ifndef PR_GET_PDEATHSIG #define PR_GET_PDEATHSIG 2 #endif static int mprotect_vmas(struct parasite_dump_pages_args *args) { struct parasite_vma_entry *vmas, *vma; int ret = 0, i; vmas = pargs_vmas(args); for (i = 0; i < args->nr_vmas; i++) { vma = vmas + i; ret = sys_mprotect((void *)vma->start, vma->len, vma->prot | args->add_prot); if (ret) { pr_err("mprotect(%08lx, %lu) failed with code %d\n", vma->start, vma->len, ret); break; } } if (args->add_prot) mprotect_args = args; else mprotect_args = NULL; return ret; } static int dump_pages(struct parasite_dump_pages_args *args) { int p, ret, tsock; struct iovec *iovs; tsock = parasite_get_rpc_sock(); p = recv_fd(tsock); if (p < 0) return -1; iovs = pargs_iovs(args); ret = sys_vmsplice(p, &iovs[args->off], args->nr_segs, SPLICE_F_GIFT | SPLICE_F_NONBLOCK); if (ret != PAGE_SIZE * args->nr_pages) { sys_close(p); pr_err("Can't splice pages to pipe (%d/%d)\n", ret, args->nr_pages); return -1; } 
sys_close(p); return 0; } static int dump_sigact(struct parasite_dump_sa_args *da) { int sig, ret = 0; for (sig = 1; sig <= SIGMAX; sig++) { int i = sig - 1; if (sig == SIGKILL || sig == SIGSTOP) continue; ret = sys_sigaction(sig, NULL, &da->sas[i], sizeof(k_rtsigset_t)); if (ret < 0) { pr_err("sys_sigaction failed (%d)\n", ret); break; } } return ret; } static int dump_itimers(struct parasite_dump_itimers_args *args) { int ret; ret = sys_getitimer(ITIMER_REAL, &args->real); if (!ret) ret = sys_getitimer(ITIMER_VIRTUAL, &args->virt); if (!ret) ret = sys_getitimer(ITIMER_PROF, &args->prof); if (ret) pr_err("getitimer failed (%d)\n", ret); return ret; } static int dump_posix_timers(struct parasite_dump_posix_timers_args *args) { int i; int ret = 0; for(i = 0; i < args->timer_n; i++) { ret = sys_timer_gettime(args->timer[i].it_id, &args->timer[i].val); if (ret < 0) { pr_err("sys_timer_gettime failed (%d)\n", ret); return ret; } args->timer[i].overrun = sys_timer_getoverrun(args->timer[i].it_id); ret = args->timer[i].overrun; if (ret < 0) { pr_err("sys_timer_getoverrun failed (%d)\n", ret); return ret; } } return ret; } static int dump_creds(struct parasite_dump_creds *args); static int dump_thread_common(struct parasite_dump_thread *ti) { int ret; arch_get_tls(&ti->tls); ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long) &ti->tid_addr, 0, 0, 0); if (ret) goto out; ret = sys_sigaltstack(NULL, &ti->sas); if (ret) goto out; ret = sys_prctl(PR_GET_PDEATHSIG, (unsigned long)&ti->pdeath_sig, 0, 0, 0); if (ret) goto out; ret = dump_creds(ti->creds); out: return ret; } static int dump_misc(struct parasite_dump_misc *args) { args->brk = sys_brk(0); args->pid = sys_getpid(); args->sid = sys_getsid(); args->pgid = sys_getpgid(0); args->umask = sys_umask(0); sys_umask(args->umask); /* never fails */ args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); return 0; } static int dump_creds(struct parasite_dump_creds 
*args) { int ret, i, j; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; struct cap_header hdr = {_LINUX_CAPABILITY_VERSION_3, 0}; ret = sys_capget(&hdr, data); if (ret < 0) { pr_err("Unable to get capabilities: %d\n", ret); return -1; } /* * Loop through the capability constants until we reach cap_last_cap. * The cap_bnd set is stored as a bitmask comprised of CR_CAP_SIZE number of * 32-bit uints, hence the inner loop from 0 to 32. */ for (i = 0; i < CR_CAP_SIZE; i++) { args->cap_eff[i] = data[i].eff; args->cap_prm[i] = data[i].prm; args->cap_inh[i] = data[i].inh; args->cap_bnd[i] = 0; for (j = 0; j < 32; j++) { if (j + i * 32 > args->cap_last_cap) break; ret = sys_prctl(PR_CAPBSET_READ, j + i * 32, 0, 0, 0); if (ret < 0) { pr_err("Unable to read capability %d: %d\n", j + i * 32, ret); return -1; } if (ret) args->cap_bnd[i] |= (1 << j); } } args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); ret = sys_getgroups(0, NULL); if (ret < 0) goto grps_err; args->ngroups = ret; if (args->ngroups >= PARASITE_MAX_GROUPS) { pr_err("Too many groups in task %d\n", (int)args->ngroups); return -1; } ret = sys_getgroups(args->ngroups, args->groups); if (ret < 0) goto grps_err; if (ret != args->ngroups) { pr_err("Groups changed on the fly %d -> %d\n", args->ngroups, ret); return -1; } ret = sys_getresuid(&args->uids[0], &args->uids[1], &args->uids[2]); if (ret) { pr_err("Unable to get uids: %d\n", ret); return -1; } args->uids[3] = sys_setfsuid(-1L); /* * FIXME In https://github.com/xemul/criu/issues/95 it is * been reported that only low 16 bits are set upon syscall * on ARMv7. * * We may rather need implement builtin-memset and clear the * whole memory needed here. 
*/ args->gids[0] = args->gids[1] = args->gids[2] = args->gids[3] = 0; ret = sys_getresgid(&args->gids[0], &args->gids[1], &args->gids[2]); if (ret) { pr_err("Unable to get uids: %d\n", ret); return -1; } args->gids[3] = sys_setfsgid(-1L); return 0; grps_err: pr_err("Error calling getgroups (%d)\n", ret); return -1; } static int fill_fds_opts(struct parasite_drain_fd *fds, struct fd_opts *opts) { int i; for (i = 0; i < fds->nr_fds; i++) { int flags, fd = fds->fds[i], ret; struct fd_opts *p = opts + i; struct f_owner_ex owner_ex; uint32_t v[2]; flags = sys_fcntl(fd, F_GETFD, 0); if (flags < 0) { pr_err("fcntl(%d, F_GETFD) -> %d\n", fd, flags); return -1; } p->flags = (char)flags; ret = sys_fcntl(fd, F_GETOWN_EX, (long)&owner_ex); if (ret) { pr_err("fcntl(%d, F_GETOWN_EX) -> %d\n", fd, ret); return -1; } /* * Simple case -- nothing is changed. */ if (owner_ex.pid == 0) { p->fown.pid = 0; continue; } ret = sys_fcntl(fd, F_GETOWNER_UIDS, (long)&v); if (ret) { pr_err("fcntl(%d, F_GETOWNER_UIDS) -> %d\n", fd, ret); return -1; } p->fown.uid = v[0]; p->fown.euid = v[1]; p->fown.pid_type = owner_ex.type; p->fown.pid = owner_ex.pid; } return 0; } static int drain_fds(struct parasite_drain_fd *args) { int ret, tsock; struct fd_opts *opts; /* * See the drain_fds_size() in criu code, the memory * for this args is ensured to be large enough to keep * an array of fd_opts at the tail. 
*/ opts = ((void *)args) + sizeof(*args) + args->nr_fds * sizeof(args->fds[0]); ret = fill_fds_opts(args, opts); if (ret) return ret; tsock = parasite_get_rpc_sock(); ret = send_fds(tsock, NULL, 0, args->fds, args->nr_fds, opts, sizeof(struct fd_opts)); if (ret) pr_err("send_fds failed (%d)\n", ret); return ret; } static int dump_thread(struct parasite_dump_thread *args) { args->tid = sys_gettid(); return dump_thread_common(args); } static char proc_mountpoint[] = "proc.crtools"; static int pie_atoi(char *str) { int ret = 0; while (*str) { ret *= 10; ret += *str - '0'; str++; } return ret; } static int get_proc_fd(void) { int ret; char buf[11]; ret = sys_readlinkat(AT_FDCWD, "/proc/self", buf, sizeof(buf) - 1); if (ret < 0 && ret != -ENOENT) { pr_err("Can't readlink /proc/self (%d)\n", ret); return ret; } if (ret > 0) { buf[ret] = 0; /* Fast path -- if /proc belongs to this pidns */ if (pie_atoi(buf) == sys_getpid()) return sys_open("/proc", O_RDONLY, 0); } ret = sys_mkdir(proc_mountpoint, 0700); if (ret) { pr_err("Can't create a directory (%d)\n", ret); return -1; } ret = sys_mount("proc", proc_mountpoint, "proc", MS_MGC_VAL, NULL); if (ret) { if (ret == -EPERM) pr_err("can't dump unpriviliged task whose /proc doesn't belong to it\n"); else pr_err("mount failed (%d)\n", ret); sys_rmdir(proc_mountpoint); return -1; } return open_detach_mount(proc_mountpoint); } static int parasite_get_proc_fd(void) { int fd, ret, tsock; fd = get_proc_fd(); if (fd < 0) { pr_err("Can't get /proc fd\n"); return -1; } tsock = parasite_get_rpc_sock(); ret = send_fd(tsock, NULL, 0, fd); sys_close(fd); return ret; } static inline int tty_ioctl(int fd, int cmd, int *arg) { int ret; ret = sys_ioctl(fd, cmd, (unsigned long)arg); if (ret < 0) { if (ret != -ENOTTY) return ret; *arg = 0; } return 0; } /* * Stolen from kernel/fs/aio.c * * Is it valid to go to memory and check it? Should be, * as libaio does the same. 
*/ #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 static int sane_ring(struct parasite_aio *aio) { struct aio_ring *ring = (struct aio_ring *)aio->ctx; unsigned nr; nr = (aio->size - sizeof(struct aio_ring)) / sizeof(struct io_event); return ring->magic == AIO_RING_MAGIC && ring->compat_features == AIO_RING_COMPAT_FEATURES && ring->incompat_features == AIO_RING_INCOMPAT_FEATURES && ring->header_length == sizeof(struct aio_ring) && ring->nr == nr; } static int parasite_check_aios(struct parasite_check_aios_args *args) { int i; for (i = 0; i < args->nr_rings; i++) { struct aio_ring *ring; ring = (struct aio_ring *)args->ring[i].ctx; if (!sane_ring(&args->ring[i])) { pr_err("Not valid ring #%d\n", i); pr_info(" `- magic %x\n", ring->magic); pr_info(" `- cf %d\n", ring->compat_features); pr_info(" `- if %d\n", ring->incompat_features); pr_info(" `- header size %d (%zd)\n", ring->header_length, sizeof(struct aio_ring)); pr_info(" `- nr %d\n", ring->nr); return -1; } /* XXX: wait aio completion */ } return 0; } static int parasite_dump_tty(struct parasite_tty_args *args) { int ret; #ifndef TIOCGPKT # define TIOCGPKT _IOR('T', 0x38, int) #endif #ifndef TIOCGPTLCK # define TIOCGPTLCK _IOR('T', 0x39, int) #endif #ifndef TIOCGEXCL # define TIOCGEXCL _IOR('T', 0x40, int) #endif args->sid = 0; args->pgrp = 0; args->st_pckt = 0; args->st_lock = 0; args->st_excl = 0; #define __tty_ioctl(cmd, arg) \ do { \ ret = tty_ioctl(args->fd, cmd, &arg); \ if (ret < 0) { \ if (ret == -ENOTTY) \ arg = 0; \ else if (ret == -EIO) \ goto err_io; \ else \ goto err; \ } \ } while (0) __tty_ioctl(TIOCGSID, args->sid); __tty_ioctl(TIOCGPGRP, args->pgrp); __tty_ioctl(TIOCGEXCL, args->st_excl); if (args->type == TTY_TYPE__PTY) { __tty_ioctl(TIOCGPKT, args->st_pckt); __tty_ioctl(TIOCGPTLCK, args->st_lock); } args->hangup = false; return 0; err: pr_err("tty: Can't fetch params: err = %d\n", ret); return -1; err_io: /* kernel reports EIO for get 
ioctls on pair-less ptys */ pr_debug("tty: EIO on tty\n"); args->hangup = true; return 0; #undef __tty_ioctl } #ifdef CONFIG_VDSO static int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args) { struct vdso_mark *m = (void *)args->start; if (is_vdso_mark(m)) { /* * Make sure we don't meet some corrupted entry * where signature matches but verions is not! */ if (m->version != VDSO_MARK_CUR_VERSION) { pr_err("vdso: Mark version mismatch!\n"); return -EINVAL; } args->is_marked = 1; args->orig_vdso_addr = m->orig_vdso_addr; args->orig_vvar_addr = m->orig_vvar_addr; args->rt_vvar_addr = m->rt_vvar_addr; } else { args->is_marked = 0; args->orig_vdso_addr = VDSO_BAD_ADDR; args->orig_vvar_addr = VVAR_BAD_ADDR; args->rt_vvar_addr = VVAR_BAD_ADDR; if (args->try_fill_symtable) { struct vdso_symtable t; if (vdso_fill_symtable(args->start, args->len, &t)) args->is_vdso = false; else args->is_vdso = true; } } return 0; } #else static inline int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args) { pr_err("Unexpected VDSO check command\n"); return -1; } #endif static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) { int proc, cgroup, len; proc = get_proc_fd(); if (proc < 0) { pr_err("can't get /proc fd\n"); return -1; } cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0); sys_close(proc); if (cgroup < 0) { pr_err("can't get /proc/self/cgroup fd\n"); sys_close(cgroup); return -1; } len = sys_read(cgroup, args->contents, sizeof(args->contents)); sys_close(cgroup); if (len < 0) { pr_err("can't read /proc/self/cgroup %d\n", len); return -1; } if (len == sizeof(*args)) { pr_warn("/proc/self/cgroup was bigger than the page size\n"); return -1; } /* null terminate */ args->contents[len] = 0; return 0; } void parasite_cleanup(void) { if (mprotect_args) { mprotect_args->add_prot = 0; mprotect_vmas(mprotect_args); } } int parasite_daemon_cmd(int cmd, void *args) { int ret; switch (cmd) { case PARASITE_CMD_DUMPPAGES: ret = dump_pages(args); break; 
case PARASITE_CMD_MPROTECT_VMAS: ret = mprotect_vmas(args); break; case PARASITE_CMD_DUMP_SIGACTS: ret = dump_sigact(args); break; case PARASITE_CMD_DUMP_ITIMERS: ret = dump_itimers(args); break; case PARASITE_CMD_DUMP_POSIX_TIMERS: ret = dump_posix_timers(args); break; case PARASITE_CMD_DUMP_THREAD: ret = dump_thread(args); break; case PARASITE_CMD_DUMP_MISC: ret = dump_misc(args); break; case PARASITE_CMD_DRAIN_FDS: ret = drain_fds(args); break; case PARASITE_CMD_GET_PROC_FD: ret = parasite_get_proc_fd(); break; case PARASITE_CMD_DUMP_TTY: ret = parasite_dump_tty(args); break; case PARASITE_CMD_CHECK_AIOS: ret = parasite_check_aios(args); break; case PARASITE_CMD_CHECK_VDSO_MARK: ret = parasite_check_vdso_mark(args); break; case PARASITE_CMD_DUMP_CGROUP: ret = parasite_dump_cgroup(args); break; default: pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd); ret = -1; break; } return ret; } int parasite_trap_cmd(int cmd, void *args) { switch (cmd) { case PARASITE_CMD_DUMP_THREAD: return dump_thread(args); } pr_err("Unknown command to parasite: %d\n", cmd); return -EINVAL; } criu-3.6/criu/pie/pie-relocs.h000066400000000000000000000004411317335042600162610ustar00rootroot00000000000000#ifndef __PIE_RELOCS_H__ #define __PIE_RELOCS_H__ #include #include "common/compiler.h" #include "config.h" #define pie_size(__pie_name) (round_up(sizeof(__pie_name##_blob) + \ __pie_name ## _nr_gotpcrel * sizeof(long), page_size())) #endif /* __PIE_RELOCS_H__ */ criu-3.6/criu/pie/restorer.c000066400000000000000000001246341317335042600160720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "linux/userfaultfd.h" #include "int.h" #include "types.h" #include "common/compiler.h" #include #include #include #include "signal.h" #include "config.h" #include "prctl.h" #include "criu-log.h" #include "util.h" #include "image.h" #include 
"sk-inet.h" #include "vma.h" #include "uffd.h" #include "common/lock.h" #include "restorer.h" #include "aio.h" #include "seccomp.h" #include "images/creds.pb-c.h" #include "images/mm.pb-c.h" #include "shmem.h" #include "restorer.h" #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 #endif #define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ if (__ret) \ pr_err("prctl failed @%d with %ld\n", __LINE__, __ret);\ __ret; \ }) static struct task_entries *task_entries_local; static futex_t thread_inprogress; static pid_t *helpers; static int n_helpers; static pid_t *zombies; static int n_zombies; static enum faults fi_strategy; bool fault_injected(enum faults f) { return __fault_injected(f, fi_strategy); } /* * These are stubs for std compel plugin. */ int parasite_daemon_cmd(int cmd, void *args) { return 0; } int parasite_trap_cmd(int cmd, void *args) { return 0; } void parasite_cleanup(void) { } extern void cr_restore_rt (void) asm ("__cr_restore_rt") __attribute__ ((visibility ("hidden"))); static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) { char *r; int i; /* We can ignore helpers that die, we expect them to after * CR_STATE_RESTORE is finished. 
*/ for (i = 0; i < n_helpers; i++) if (siginfo->si_pid == helpers[i]) return; for (i = 0; i < n_zombies; i++) if (siginfo->si_pid == zombies[i]) return; if (siginfo->si_code == CLD_EXITED) r = "exited, status="; else if (siginfo->si_code == CLD_KILLED) r = "killed by signal"; else if (siginfo->si_code == CLD_DUMPED) r = "terminated abnormally with"; else if (siginfo->si_code == CLD_TRAPPED) r = "trapped with"; else if (siginfo->si_code == CLD_STOPPED) r = "stopped with"; else r = "disappeared with"; pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status); futex_abort_and_wake(&task_entries_local->nr_in_progress); /* sa_restorer may be unmaped, so we can't go back to userspace*/ sys_kill(sys_getpid(), SIGSTOP); sys_exit_group(1); } static int lsm_set_label(char *label, int procfd) { int ret = -1, len, lsmfd; char path[STD_LOG_SIMPLE_CHUNK]; if (!label) return 0; pr_info("restoring lsm profile %s\n", label); std_sprintf(path, "self/task/%ld/attr/current", sys_gettid()); lsmfd = sys_openat(procfd, path, O_WRONLY, 0); if (lsmfd < 0) { pr_err("failed openat %d\n", lsmfd); return -1; } for (len = 0; label[len]; len++) ; ret = sys_write(lsmfd, label, len); sys_close(lsmfd); if (ret < 0) { pr_err("can't write lsm profile %d\n", ret); return -1; } return 0; } static int restore_creds(struct thread_creds_args *args, int procfd) { CredsEntry *ce = &args->creds; int b, i, ret; struct cap_header hdr; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; /* * We're still root here and thus can do it without failures. */ /* * Setup supplementary group IDs early. */ if (args->groups) { ret = sys_setgroups(ce->n_groups, args->groups); if (ret) { pr_err("Can't setup supplementary group IDs: %d\n", ret); return -1; } } /* * First -- set the SECURE_NO_SETUID_FIXUP bit not to * lose caps bits when changing xids. 
*/ ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); if (ret) { pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); return -1; } /* * Second -- restore xids. Since we still have the CAP_SETUID * capability nothing should fail. But call the setfsXid last * to override the setresXid settings. */ ret = sys_setresuid(ce->uid, ce->euid, ce->suid); if (ret) { pr_err("Unable to set real, effective and saved user ID: %d\n", ret); return -1; } sys_setfsuid(ce->fsuid); if (sys_setfsuid(-1) != ce->fsuid) { pr_err("Unable to set fsuid\n"); return -1; } ret = sys_setresgid(ce->gid, ce->egid, ce->sgid); if (ret) { pr_err("Unable to set real, effective and saved group ID: %d\n", ret); return -1; } sys_setfsgid(ce->fsgid); if (sys_setfsgid(-1) != ce->fsgid) { pr_err("Unable to set fsgid\n"); return -1; } /* * Third -- restore securebits. We don't need them in any * special state any longer. */ ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); if (ret) { pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); return -1; } /* * Fourth -- trim bset. This can only be done while * having the CAP_SETPCAP capablity. */ for (b = 0; b < CR_CAP_SIZE; b++) { for (i = 0; i < 32; i++) { if (b * 32 + i > args->cap_last_cap) break; if (args->cap_bnd[b] & (1 << i)) /* already set */ continue; ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); if (ret) { pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret); return -1; } } } /* * Fifth -- restore caps. Nothing but cap bits are changed * at this stage, so just do it. 
*/ hdr.version = _LINUX_CAPABILITY_VERSION_3; hdr.pid = 0; BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE); for (i = 0; i < CR_CAP_SIZE; i++) { data[i].eff = args->cap_eff[i]; data[i].prm = args->cap_prm[i]; data[i].inh = args->cap_inh[i]; } ret = sys_capset(&hdr, data); if (ret) { pr_err("Unable to restore capabilities: %d\n", ret); return -1; } if (lsm_set_label(args->lsm_profile, procfd) < 0) return -1; return 0; } /* * This should be done after creds restore, as * some creds changes might drop the value back * to zero. */ static inline int restore_pdeath_sig(struct thread_restore_args *ta) { if (ta->pdeath_sig) return sys_prctl(PR_SET_PDEATHSIG, ta->pdeath_sig, 0, 0, 0); else return 0; } static int restore_dumpable_flag(MmEntry *mme) { int current_dumpable; int ret; if (!mme->has_dumpable) { pr_warn("Dumpable flag not present in criu dump.\n"); return 0; } if (mme->dumpable == 0 || mme->dumpable == 1) { ret = sys_prctl(PR_SET_DUMPABLE, mme->dumpable, 0, 0, 0); if (ret) { pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret); return -1; } return 0; } /* * If dumpable flag is present but it is not 0 or 1, then we can not * use prctl to set it back. Try to see if it is already correct * (which is likely if sysctl fs.suid_dumpable is the same when dump * and restore are run), in which case there is nothing to do. * Otherwise, set dumpable to 0 which should be a secure fallback. */ current_dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); if (mme->dumpable != current_dumpable) { pr_warn("Dumpable flag [%d] does not match current [%d]. 
" "Will fallback to setting it to 0 to disable it.\n", mme->dumpable, current_dumpable); ret = sys_prctl(PR_SET_DUMPABLE, 0, 0, 0, 0); if (ret) { pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret); return -1; } } return 0; } static void restore_sched_info(struct rst_sched_param *p) { struct sched_param parm; pr_info("Restoring scheduler params %d.%d.%d\n", p->policy, p->nice, p->prio); sys_setpriority(PRIO_PROCESS, 0, p->nice); parm.sched_priority = p->prio; sys_sched_setscheduler(0, p->policy, &parm); } static void restore_rlims(struct task_restore_args *ta) { int r; for (r = 0; r < ta->rlims_n; r++) { struct krlimit krlim; krlim.rlim_cur = ta->rlims[r].rlim_cur; krlim.rlim_max = ta->rlims[r].rlim_max; sys_setrlimit(r, &krlim); } } static int restore_signals(siginfo_t *ptr, int nr, bool group) { int ret, i; for (i = 0; i < nr; i++) { siginfo_t *info = ptr + i; pr_info("Restore signal %d group %d\n", info->si_signo, group); if (group) ret = sys_rt_sigqueueinfo(sys_getpid(), info->si_signo, info); else ret = sys_rt_tgsigqueueinfo(sys_getpid(), sys_gettid(), info->si_signo, info); if (ret) { pr_err("Unable to send siginfo %d %x with code %d\n", info->si_signo, info->si_code, ret); return -1;; } } return 0; } static int restore_seccomp(struct task_restore_args *args) { int ret; switch (args->seccomp_mode) { case SECCOMP_MODE_DISABLED: return 0; case SECCOMP_MODE_STRICT: ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0); if (ret < 0) { pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret); goto die; } return 0; case SECCOMP_MODE_FILTER: { int i; void *filter_data; filter_data = &args->seccomp_filters[args->seccomp_filters_n]; for (i = 0; i < args->seccomp_filters_n; i++) { struct sock_fprog *fprog = &args->seccomp_filters[i]; fprog->filter = filter_data; /* We always TSYNC here, since we require that the * creds for all threads be the same; this means we * don't have to restore_seccomp() in threads, and that * future TSYNC behavior will be 
correct. */ ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog); if (ret < 0) { pr_err("sys_seccomp() returned %d\n", ret); goto die; } filter_data += fprog->len * sizeof(struct sock_filter); } return 0; } default: goto die; } return 0; die: return -1; } static int restore_robust_futex(struct thread_restore_args *args) { uint32_t futex_len = args->futex_rla_len; int ret; if (!args->futex_rla_len) return 0; /* * XXX: We check here *task's* mode, not *thread's*. * But it's possible to write an application with mixed * threads (on x86): some in 32-bit mode, some in 64-bit. * Quite unlikely that such application exists at all. */ if (args->ta->compatible_mode) { uint32_t futex = (uint32_t)args->futex_rla; ret = set_compat_robust_list(futex, futex_len); } else { void *futex = decode_pointer(args->futex_rla); ret = sys_set_robust_list(futex, futex_len); } if (ret) pr_err("Failed to recover futex robust list: %d\n", ret); return ret; } static int restore_thread_common(struct thread_restore_args *args) { sys_set_tid_address((int *)decode_pointer(args->clear_tid_addr)); if (restore_robust_futex(args)) return -1; restore_sched_info(&args->sp); if (restore_nonsigframe_gpregs(&args->gpregs)) return -1; restore_tls(&args->tls); return 0; } static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sigframe) { ARCH_RT_SIGRETURN(new_sp, sigframe); } /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. 
*/
long __export_restore_thread(struct thread_restore_args *args)
{
	struct rt_sigframe *rt_sigframe;
	k_rtsigset_t to_block;
	unsigned long new_sp;
	int my_pid = sys_gettid();
	int ret;

	/* The thread must have been created with the tid recorded in the image. */
	if (my_pid != args->pid) {
		pr_err("Thread pid mismatch %d/%d\n", my_pid, args->pid);
		goto core_restore_end;
	}

	/* All signals must be handled by thread leader */
	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		pr_err("Unable to block signals %d\n", ret);
		goto core_restore_end;
	}

	rt_sigframe = (void *)&args->mz->rt_sigframe;

	if (restore_thread_common(args))
		goto core_restore_end;

	ret = restore_creds(args->creds_args, args->ta->proc_fd);
	if (ret)
		goto core_restore_end;

	ret = restore_dumpable_flag(&args->ta->mm);
	if (ret)
		goto core_restore_end;

	pr_info("%ld: Restored\n", sys_gettid());

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE);

	/* Pending signals are re-queued to this thread only (group == false). */
	if (restore_signals(args->siginfo, args->siginfo_n, false))
		goto core_restore_end;

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
	/* NOTE(review): the return value is ignored here -- confirm this is intentional. */
	restore_pdeath_sig(args);

	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);

	futex_dec_and_wake(&thread_inprogress);

	/* Hand control to the saved context; reaching the label below means failure. */
	new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe);
	rst_sigreturn(new_sp, rt_sigframe);

core_restore_end:
	pr_err("Restorer abnormal termination for %ld\n", sys_getpid());
	futex_abort_and_wake(&task_entries_local->nr_in_progress);
	sys_exit_group(1);
	return -1;
}

/*
 * Point the task's exe link at args->fd_exe_link via PR_SET_MM_EXE_FILE.
 * The descriptor is closed whether or not the prctl succeeds.
 */
static long restore_self_exe_late(struct task_restore_args *args)
{
	int fd = args->fd_exe_link, ret;

	pr_info("Restoring EXE link\n");
	ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0);
	if (ret)
		pr_err("Can't restore EXE link (%d)\n", ret);
	sys_close(fd);

	return ret;
}

#ifndef ARCH_HAS_SHMAT_HOOK
/* Default shmat wrapper used when the arch provides no hook; @size is unused here. */
unsigned long arch_shmat(int shmid, void *shmaddr,
			int shmflg, unsigned long size)
{
	return sys_shmat(shmid, shmaddr, shmflg);
}
#endif

/*
 * Map one VMA at its original address: SysV IPC areas are attached with
 * arch_shmat(), everything else goes through sys_mmap() with MAP_FIXED.
 * Returns the raw result of the attach/mmap call.
 */
static unsigned long restore_mapping(VmaEntry *vma_entry)
{
	int prot = vma_entry->prot;
	int flags = vma_entry->flags | MAP_FIXED;
	unsigned long addr;

	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) {
		int att_flags;
		void *shmaddr = decode_pointer(vma_entry->start);
		unsigned long shmsize = (vma_entry->end - vma_entry->start);
		/*
		 * See comment in open_shmem_sysv() for what SYSV_SHMEM_SKIP_FD
		 * means and why we check for PROT_EXEC few lines below.
		 */
		if (vma_entry->fd == SYSV_SHMEM_SKIP_FD)
			return vma_entry->start;

		if (vma_entry->prot & PROT_EXEC) {
			att_flags = 0;
			vma_entry->prot &= ~PROT_EXEC;
		} else
			att_flags = SHM_RDONLY;

		pr_info("Attach SYSV shmem %d at %"PRIx64"\n",
				(int)vma_entry->fd, vma_entry->start);
		return arch_shmat(vma_entry->fd, shmaddr, att_flags, shmsize);
	}

	/*
	 * Restore or shared mappings are tricky, since
	 * we open anonymous mapping via map_files/
	 * MAP_ANONYMOUS should be eliminated so fd would
	 * be taken into account by a kernel.
	 */
	if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
		flags &= ~MAP_ANONYMOUS;

	/* See comment in premap_private_vma() for this flag change */
	if (vma_entry_is(vma_entry, VMA_AREA_AIORING))
		flags |= MAP_ANONYMOUS;

	/* A mapping of file with MAP_SHARED is up to date */
	if ((vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED)) &&
			!(vma_entry->status & VMA_NO_PROT_WRITE))
		prot |= PROT_WRITE;

	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
			vma_entry->start, vma_entry->end,
			prot, flags, (int)vma_entry->fd);
	/*
	 * Should map memory here. Note we map them as
	 * writable since we're going to restore page
	 * contents.
	 */
	addr = sys_mmap(decode_pointer(vma_entry->start),
			vma_entry_len(vma_entry),
			prot, flags,
			vma_entry->fd,
			vma_entry->pgoff);

	/* The descriptor is no longer needed once mapped when VMA_CLOSE is set. */
	if ((vma_entry->fd != -1) &&
			(vma_entry->status & VMA_CLOSE))
		sys_close(vma_entry->fd);

	return addr;
}

/*
 * This restores aio ring header, content, head and in-kernel position
 * of tail.
To set tail, we write to /dev/null and use the fact this * operation is synchronious for the device. Also, we unmap temporary * anonymous area, used to store content of ring buffer during restore * and mapped in premap_private_vma(). */ static int restore_aio_ring(struct rst_aio_ring *raio) { struct aio_ring *ring = (void *)raio->addr, *new; int i, maxr, count, fd, ret; unsigned head = ring->head; unsigned tail = ring->tail; struct iocb *iocb, **iocbp; unsigned long ctx = 0; unsigned size; char buf[1]; ret = sys_io_setup(raio->nr_req, &ctx); if (ret < 0) { pr_err("Ring setup failed with %d\n", ret); return -1; } new = (struct aio_ring *)ctx; i = (raio->len - sizeof(struct aio_ring)) / sizeof(struct io_event); if (tail >= ring->nr || head >= ring->nr || ring->nr != i || new->nr != ring->nr) { pr_err("wrong aio: tail=%x head=%x req=%x old_nr=%x new_nr=%x expect=%x\n", tail, head, raio->nr_req, ring->nr, new->nr, i); return -1; } if (tail == 0 && head == 0) goto populate; fd = sys_open("/dev/null", O_WRONLY, 0); if (fd < 0) { pr_err("Can't open /dev/null for aio\n"); return -1; } /* * If tail < head, we have to do full turn and then submit * tail more request, i.e. ring->nr + tail. * If we do not do full turn, in-kernel completed_events * will initialize wrong. * * Maximum number reqs to submit at once are ring->nr-1, * so we won't allocate more. */ if (tail < head) count = ring->nr + tail; else count = tail; maxr = min_t(unsigned, count, ring->nr-1); /* * Since we only interested in moving the tail, the requests * may be any. We submit count identical requests. 
*/ size = sizeof(struct iocb) + maxr * sizeof(struct iocb *); iocb = (void *)sys_mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); iocbp = (void *)iocb + sizeof(struct iocb); if (IS_ERR(iocb)) { pr_err("Can't mmap aio tmp buffer: %ld\n", PTR_ERR(iocb)); return -1; } iocb->aio_fildes = fd; iocb->aio_buf = (unsigned long)buf; iocb->aio_nbytes = 1; iocb->aio_lio_opcode = IOCB_CMD_PWRITE; /* Write is nop, read populates buf */ for (i = 0; i < maxr; i++) iocbp[i] = iocb; i = 0; do { ret = sys_io_submit(ctx, count - i, iocbp); if (ret < 0) { pr_err("Can't submit aio iocbs: ret=%d\n", ret); return -1; } i += ret; /* * We may submit less than requested, because of too big * count OR behaviour of get_reqs_available(), which * takes available requests only if their number is * aliquot to kioctx::req_batch. Free part of buffer * for next iteration. * * Direct set of head is equal to sys_io_getevents() call, * and faster. See kernel for the details. */ ((struct aio_ring *)ctx)->head = i < head ? i : head; } while (i < count); sys_munmap(iocb, size); sys_close(fd); populate: i = offsetof(struct aio_ring, io_events); memcpy((void *)ctx + i, (void *)ring + i, raio->len - i); /* * If we failed to get the proper nr_req right and * created smaller or larger ring, then this remap * will (should) fail, since AIO rings has immutable * size. * * This is not great, but anyway better than putting * a ring of wrong size into correct place. * * Also, this unmaps temporary anonymous area on raio->addr. 
*/ ctx = sys_mremap(ctx, raio->len, raio->len, MREMAP_FIXED | MREMAP_MAYMOVE, raio->addr); if (ctx != raio->addr) { pr_err("Ring remap failed with %ld\n", ctx); return -1; } return 0; } static void rst_tcp_repair_off(struct rst_tcp_sock *rts) { int aux, ret; aux = rts->reuseaddr; pr_debug("pie: Turning repair off for %d (reuse %d)\n", rts->sk, aux); tcp_repair_off(rts->sk); ret = sys_setsockopt(rts->sk, SOL_SOCKET, SO_REUSEADDR, &aux, sizeof(aux)); if (ret < 0) pr_err("Failed to restore of SO_REUSEADDR on socket (%d)\n", ret); } static void rst_tcp_socks_all(struct task_restore_args *ta) { int i; for (i = 0; i < ta->tcp_socks_n; i++) rst_tcp_repair_off(&ta->tcp_socks[i]); } static int enable_uffd(int uffd, unsigned long addr, unsigned long len) { int rc; struct uffdio_register uffdio_register; unsigned long expected_ioctls; /* * If uffd == -1, this means that userfaultfd is not enabled * or it is not available. */ if (uffd == -1) return 0; uffdio_register.range.start = addr; uffdio_register.range.len = len; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; pr_info("lazy-pages: register: %lx, len %lx\n", addr, len); rc = sys_ioctl(uffd, UFFDIO_REGISTER, (unsigned long) &uffdio_register); if (rc != 0) { pr_err("lazy-pages: register %lx failed: rc:%d, \n", addr, rc); return -1; } expected_ioctls = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY) | (1 << _UFFDIO_ZEROPAGE); if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { pr_err("lazy-pages: unexpected missing uffd ioctl for anon memory\n"); } return 0; } static int vma_remap(VmaEntry *vma_entry, int uffd) { unsigned long src = vma_premmaped_start(vma_entry); unsigned long dst = vma_entry->start; unsigned long len = vma_entry_len(vma_entry); unsigned long guard = 0, tmp; pr_info("Remap %lx->%lx len %lx\n", src, dst, len); if (src - dst < len) guard = dst; else if (dst - src < len) guard = dst + len - PAGE_SIZE; if (src == dst) return 0; if (guard != 0) { /* * mremap() returns an error if a target and 
source vma-s are * overlapped. In this case the source vma are remapped in * a temporary place and then remapped to the target address. * Here is one hack to find non-ovelapped temporary place. * * 1. initial placement. We need to move src -> tgt. * | |+++++src+++++| * |-----tgt-----| | * * 2. map a guard page at the non-ovelapped border of a target vma. * | |+++++src+++++| * |G|----tgt----| | * * 3. remap src to any other place. * G prevents src from being remaped on tgt again * | |-------------| -> |+++++src+++++| * |G|---tgt-----| | * * 4. remap src to tgt, no overlapping any longer * |+++++src+++++| <---- |-------------| * |G|---tgt-----| | */ unsigned long addr; /* Map guard page (step 2) */ tmp = sys_mmap((void *) guard, PAGE_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (tmp != guard) { pr_err("Unable to map a guard page %lx (%lx)\n", guard, tmp); return -1; } /* Move src to non-overlapping place (step 3) */ addr = sys_mmap(NULL, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (addr == (unsigned long) MAP_FAILED) { pr_err("Unable to reserve memory (%lx)\n", addr); return -1; } tmp = sys_mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, addr); if (tmp != addr) { pr_err("Unable to remap %lx -> %lx (%lx)\n", src, addr, tmp); return -1; } src = addr; } tmp = sys_mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst); if (tmp != dst) { pr_err("Unable to remap %lx -> %lx\n", src, dst); return -1; } /* * If running in userfaultfd/lazy-pages mode pages with * MAP_ANONYMOUS and MAP_PRIVATE are remapped but without the * real content. * The function enable_uffd() marks the page(s) as userfaultfd * pages, so that the processes will hang until the memory is * injected via userfaultfd. 
*/ if (vma_entry_can_be_lazy(vma_entry)) if (enable_uffd(uffd, dst, len) != 0) return -1; return 0; } static int timerfd_arm(struct task_restore_args *args) { int i; for (i = 0; i < args->timerfd_n; i++) { struct restore_timerfd *t = &args->timerfd[i]; int ret; pr_debug("timerfd: arm for fd %d (%d)\n", t->fd, i); if (t->settime_flags & TFD_TIMER_ABSTIME) { struct timespec ts; /* * We might need to adjust value because the checkpoint * and restore procedure takes some time itself. Note * we don't adjust nanoseconds, since the result may * overflow the limit NSEC_PER_SEC FIXME */ if (sys_clock_gettime(t->clockid, &ts)) { pr_err("Can't get current time\n"); return -1; } t->val.it_value.tv_sec += (time_t)ts.tv_sec; pr_debug("Ajust id %#x it_value(%llu, %llu) -> it_value(%llu, %llu)\n", t->id, (unsigned long long)ts.tv_sec, (unsigned long long)ts.tv_nsec, (unsigned long long)t->val.it_value.tv_sec, (unsigned long long)t->val.it_value.tv_nsec); } ret = sys_timerfd_settime(t->fd, t->settime_flags, &t->val, NULL); if (t->ticks) ret |= sys_ioctl(t->fd, TFD_IOC_SET_TICKS, (unsigned long)&t->ticks); if (ret) { pr_err("Can't restore ticks/time for timerfd - %d\n", i); return ret; } } return 0; } static int create_posix_timers(struct task_restore_args *args) { int ret, i; kernel_timer_t next_id; struct sigevent sev; for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; sev.sigev_signo = args->posix_timers[i].spt.si_signo; sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; while (1) { ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); return ret; } if (next_id == args->posix_timers[i].spt.it_id) break; ret = sys_timer_delete(next_id); if (ret < 0) { pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); return ret; } if ((long)next_id > args->posix_timers[i].spt.it_id) { pr_err("Can't create timers, kernel don't give them 
/*
 * Unmap the half-open address range [start, end).
 *
 * An empty range is skipped: munmap() rejects a zero length with
 * -EINVAL, which would otherwise turn two adjacent areas (or an area
 * starting at address 0) into a restore failure.
 *
 * Returns 0 on success, -1 if the kernel refused to unmap the range.
 */
static int unmap_old_range(unsigned long start, unsigned long end)
{
	int ret;

	if (start == end)
		return 0;

	ret = sys_munmap((void *)start, end - start);
	if (ret) {
		pr_err("Unable to unmap (%p-%p): %d\n",
		       (void *)start, (void *)end, ret);
		return -1;
	}

	return 0;
}

/*
 * This function unmaps all VMAs, which don't belong to
 * the restored process or the restorer.
 *
 * The restorer memory is two regions -- area with restorer, its stack
 * and arguments and the one with private vmas of the tasks we restore
 * (a.k.a. premmaped area):
 *
 * 0                       task_size
 * +----+====+----+====+---+
 *
 * Thus to unmap old memory we have to do 3 unmaps:
 * [ 0 -- 1st area start ]
 * [ 1st end -- 2nd start ]
 * [ 2nd end -- task_size ]
 *
 * The two areas may be passed in any order; they are sorted by start
 * address here. Returns 0 on success and -1 if any unmap fails.
 */
static int unmap_old_vmas(void *premmapped_addr, unsigned long premmapped_len,
		      void *bootstrap_start, unsigned long bootstrap_len,
		      unsigned long task_size)
{
	unsigned long lo_start, lo_end, hi_start, hi_end;

	/* Plain integers avoid arithmetic on void * and on NULL */
	if (premmapped_addr < bootstrap_start) {
		lo_start = (unsigned long)premmapped_addr;
		lo_end = lo_start + premmapped_len;
		hi_start = (unsigned long)bootstrap_start;
		hi_end = hi_start + bootstrap_len;
	} else {
		lo_start = (unsigned long)bootstrap_start;
		lo_end = lo_start + bootstrap_len;
		hi_start = (unsigned long)premmapped_addr;
		hi_end = hi_start + premmapped_len;
	}

	if (unmap_old_range(0, lo_start))
		return -1;

	if (unmap_old_range(lo_end, hi_start))
		return -1;

	if (unmap_old_range(hi_end, task_size))
		return -1;

	return 0;
}
int i; for (i = 0; i < task_args->helpers_n; i++) { int status; pid_t pid = task_args->helpers[i]; /* Check that a helper completed. */ if (sys_wait4(pid, &status, 0, NULL) == -ECHILD) { /* It has been waited in sigchld_handler */ continue; } if (!WIFEXITED(status) || WEXITSTATUS(status)) { pr_err("%d exited with non-zero code (%d,%d)\n", pid, WEXITSTATUS(status), WTERMSIG(status)); return -1; } } return 0; } static int wait_zombies(struct task_restore_args *task_args) { int i; for (i = 0; i < task_args->zombies_n; i++) { int ret, nr_in_progress; nr_in_progress = futex_get(&task_entries_local->nr_in_progress); ret = sys_waitid(P_PID, task_args->zombies[i], NULL, WNOWAIT | WEXITED, NULL); if (ret == -ECHILD) { /* A process isn't reparented to this task yet. * Let's wait when someone complete this stage * and try again. */ futex_wait_while_eq(&task_entries_local->nr_in_progress, nr_in_progress); i--; continue; } if (ret < 0) { pr_err("Wait on %d zombie failed: %d\n", task_args->zombies[i], ret); return -1; } pr_debug("%ld: Collect a zombie with pid %d\n", sys_getpid(), task_args->zombies[i]); } return 0; } static bool vdso_unmapped(struct task_restore_args *args) { unsigned int i; /* Don't park rt-vdso or rt-vvar if dumpee doesn't have them */ for (i = 0; i < args->vmas_n; i++) { VmaEntry *vma = &args->vmas[i]; if (vma_entry_is(vma, VMA_AREA_VDSO) || vma_entry_is(vma, VMA_AREA_VVAR)) return false; } return true; } static bool vdso_needs_parking(struct task_restore_args *args) { /* Compatible vDSO will be mapped, not moved */ if (args->compatible_mode) return false; if (args->can_map_vdso) return false; return !vdso_unmapped(args); } /* * The main routine to restore task via sigreturn. * This one is very special, we never return there * but use sigreturn facility to restore core registers * and jump execution to some predefined ip read from * core file. 
/*
 * The main routine to restore task via sigreturn.
 * This one is very special, we never return there
 * but use sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * core file.
 *
 * @args describes everything to restore: VMAs, page data, mm fields,
 * threads, timers, signals, credentials and so on.
 *
 * The visible order of the steps is: install a temporary SIGCHLD
 * handler, drop all mappings that belong neither to the restorer nor
 * to the premapped area, move premapped VMAs to their final places,
 * map the remaining VMAs and fill them with data, restore AIO rings,
 * madvise bits and mm fields, start the other threads, set up timers,
 * wait for helpers and zombies, restore signals, seccomp and creds,
 * and finally call rst_sigreturn().
 *
 * On any failure control goes to core_restore_end, which aborts the
 * shared nr_in_progress futex and ends with sys_exit_group(1).
 */
long __export_restore_task(struct task_restore_args *args)
{
	long ret = -1;
	int i;
	VmaEntry *vma_entry;
	unsigned long va;
	struct restore_vma_io *rio;

	struct rt_sigframe *rt_sigframe;
	struct prctl_mm_map prctl_map;
	unsigned long new_sp;
	k_rtsigset_t to_block;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	/* Publish frequently used arguments via file-scope variables */
	bootstrap_start = args->bootstrap_start;
	bootstrap_len	= args->bootstrap_len;

#ifdef CONFIG_VDSO
	vdso_rt_size	= args->vdso_rt_size;
#endif

	fi_strategy = args->fault_strategy;

	task_entries_local = args->task_entries;
	helpers = args->helpers;
	n_helpers = args->helpers_n;
	zombies = args->zombies;
	n_zombies = args->zombies_n;
	*args->breakpoint = rst_sigreturn;

	/* Temporary SIGCHLD handler, all signals masked while it runs */
	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	ksigemptyset(&to_block);
	ksigaddset(&to_block, SIGCHLD);
	/* NOTE(review): this result is overwritten later without being checked */
	ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t));

	std_log_set_fd(args->logfd);
	std_log_set_loglevel(args->loglevel);
	std_log_set_start(&args->logstart);

	pr_info("Switched to the restorer %d\n", my_pid);

	if (args->uffd > -1) {
		pr_debug("lazy-pages: uffd %d\n", args->uffd);
	}

	if (vdso_needs_parking(args)) {
		if (vdso_do_park(&args->vdso_maps_rt,
				args->vdso_rt_parked_at, vdso_rt_size))
			goto core_restore_end;
	}

	/* Drop everything but the restorer itself and the premapped area */
	if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
				bootstrap_start, bootstrap_len, args->task_size))
		goto core_restore_end;

	/* Map vdso that wasn't parked */
	if (!vdso_unmapped(args) && args->can_map_vdso) {
		if (arch_map_vdso(args->vdso_rt_parked_at,
				args->compatible_mode) < 0) {
			goto core_restore_end;
		}
	}

	/*
	 * Shift private vma-s to the left.
	 * The walk stops at the first premapped entry whose start lies
	 * above the address held in its shmid field.
	 * NOTE(review): shmid presumably carries the premapped address
	 * for these entries - confirm against vma_premmaped_start().
	 */
	for (i = 0; i < args->vmas_n; i++) {
		vma_entry = args->vmas + i;

		if (!vma_entry_is(vma_entry, VMA_PREMMAPED))
			continue;

		if (vma_entry->end >= args->task_size)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_entry, args->uffd))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right, walking from the top down */
	for (i = args->vmas_n - 1; i >= 0; i--) {
		vma_entry = args->vmas + i;

		if (!vma_entry_is(vma_entry, VMA_PREMMAPED))
			continue;

		if (vma_entry->start > args->task_size)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_entry, args->uffd))
			goto core_restore_end;
	}

	if (args->uffd > -1) {
		/* re-enable THP if we disabled it previously */
		if (args->has_thp_enabled) {
			if (sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) {
				pr_err("Cannot re-enable THP\n");
				goto core_restore_end;
			}
		}

		pr_debug("lazy-pages: closing uffd %d\n", args->uffd);
		/*
		 * All userfaultfd configuration has finished at this point.
		 * Let's close the UFFD file descriptor, so that the restored
		 * process does not have an opened UFFD FD for ever.
		 */
		sys_close(args->uffd);
	}

	/*
	 * OK, lets try to map new one.
	 * Only regular and AIO ring areas that were not premapped are
	 * handled here.
	 */
	for (i = 0; i < args->vmas_n; i++) {
		vma_entry = args->vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) &&
				!vma_entry_is(vma_entry, VMA_AREA_AIORING))
			continue;

		if (vma_entry_is(vma_entry, VMA_PREMMAPED))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va);
			goto core_restore_end;
		}
	}

	/*
	 * Now read the contents (if any)
	 *
	 * Each restore_vma_io record holds an iovec array and a file
	 * offset; records are laid out back to back, RIO_SIZE() bytes
	 * apart.
	 */

	rio = args->vma_ios;
	for (i = 0; i < args->vma_ios_n; i++) {
		struct iovec *iovs = rio->iovs;
		int nr = rio->nr_iovs;
		ssize_t r;

		while (nr) {
			pr_debug("Preadv %lx:%d... (%d iovs)\n",
					(unsigned long)iovs->iov_base,
					(int)iovs->iov_len, nr);
			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
			if (r < 0) {
				pr_err("Can't read pages data (%d)\n", (int)r);
				goto core_restore_end;
			}

			/*
			 * NOTE(review): a return of 0 advances neither the
			 * offset nor the iovecs, so the same read is retried.
			 */
			pr_debug("`- returned %ld\n", (long)r);
			rio->off += r;
			/* Advance the iovecs: drop filled ones, trim a partial one */
			do {
				if (iovs->iov_len <= r) {
					pr_debug(" `- skip pagemap\n");
					r -= iovs->iov_len;
					iovs++;
					nr--;
					continue;
				}

				iovs->iov_base += r;
				iovs->iov_len -= r;
				break;
			} while (nr > 0);
		}

		rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs);
	}

	sys_close(args->vma_ios_fd);

#ifdef CONFIG_VDSO
	/*
	 * Proxify vDSO.
	 */
	if (vdso_proxify(&args->vdso_maps_rt.sym, args->vdso_rt_parked_at,
		     args->vmas, args->vmas_n, args->compatible_mode,
		     fault_injected(FI_VDSO_TRAMPOLINES)))
		goto core_restore_end;
#endif

	/*
	 * Walk though all VMAs again to drop PROT_WRITE
	 * if it was not there.
	 * The result of sys_mprotect() is not checked.
	 */
	for (i = 0; i < args->vmas_n; i++) {
		vma_entry = args->vmas + i;

		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if ((vma_entry->prot & PROT_WRITE) ||
				(vma_entry->status & VMA_NO_PROT_WRITE))
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
			     vma_entry_len(vma_entry),
			     vma_entry->prot);
	}

	/*
	 * Now when all VMAs are in their places time to set
	 * up AIO rings.
	 */

	for (i = 0; i < args->rings_n; i++)
		if (restore_aio_ring(&args->rings[i]) < 0)
			goto core_restore_end;

	/*
	 * Finally restore madvise() bits.
	 * Bit number m set in ->madv means "call madvise() with advice m".
	 */
	for (i = 0; i < args->vmas_n; i++) {
		unsigned long m;

		vma_entry = args->vmas + i;
		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;

		for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
			if (vma_entry->madv & (1ul << m)) {
				ret = sys_madvise(vma_entry->start,
						  vma_entry_len(vma_entry),
						  m);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
					       "failed with %ld\n",
						vma_entry->start,
						vma_entry_len(vma_entry),
						m, ret);
					goto core_restore_end;
				}
			}
		}
	}

	/* This store is immediately overwritten by the call below */
	ret = 0;

	/*
	 * Tune up the task fields.
	 */
	ret = sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);
	if (ret)
		goto core_restore_end;

	/*
	 * New kernel interface with @PR_SET_MM_MAP will become
	 * more widespread once kernel get deployed over the world.
	 * Thus lets be opportunistic and use new interface as a try.
	 */
	prctl_map = (struct prctl_mm_map) {
		.start_code	= args->mm.mm_start_code,
		.end_code	= args->mm.mm_end_code,
		.start_data	= args->mm.mm_start_data,
		.end_data	= args->mm.mm_end_data,
		.start_stack	= args->mm.mm_start_stack,
		.start_brk	= args->mm.mm_start_brk,
		.brk		= args->mm.mm_brk,
		.arg_start	= args->mm.mm_arg_start,
		.arg_end	= args->mm.mm_arg_end,
		.env_start	= args->mm.mm_env_start,
		.env_end	= args->mm.mm_env_end,
		.auxv		= (void *)args->mm_saved_auxv,
		.auxv_size	= args->mm_saved_auxv_size,
		.exe_fd		= args->fd_exe_link,
	};
	ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0);
	if (ret == -EINVAL) {
		/*
		 * Fall back to setting the fields one by one. The results
		 * are OR-ed together, so only zero/non-zero is meaningful.
		 */
		ret  = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE,	(long)args->mm.mm_start_code, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE,	(long)args->mm.mm_end_code, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA,	(long)args->mm.mm_start_data, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA,	(long)args->mm.mm_end_data, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK,	(long)args->mm.mm_start_stack, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK,	(long)args->mm.mm_start_brk, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK,		(long)args->mm.mm_brk, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START,	(long)args->mm.mm_arg_start, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END,	(long)args->mm.mm_arg_end, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START,	(long)args->mm.mm_env_start, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END,	(long)args->mm.mm_env_end, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV,	(long)args->mm_saved_auxv, args->mm_saved_auxv_size);

		/*
		 * Because of requirements applied from kernel side
		 * we need to restore /proc/pid/exe symlink late,
		 * after old existing VMAs are superseded with
		 * new ones from image file.
		 */
		ret |= restore_self_exe_late(args);
	} else {
		if (ret)
			pr_err("sys_prctl(PR_SET_MM, PR_SET_MM_MAP) failed with %d\n", (int)ret);
		sys_close(args->fd_exe_link);
	}

	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 */
	rt_sigframe = (void *)&args->t->mz->rt_sigframe;

	if (restore_thread_common(args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and thread restorer routine has the following
	 * memory map, prepared by a caller code.
	 *
	 * | <-- low addresses                            high addresses --> |
	 * +----------------+-----------+-------------------+---------------------+
	 * | this proc body | own stack | rt_sigframe space | thread restore zone |
	 * +----------------+-----------+-------------------+---------------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                            high addresses --> |
	 * +---------------------+---------------+---------------------+
	 * | thread restore proc | thread1 stack | thread1 rt_sigframe |
	 * +---------------------+---------------+---------------------+
	 */

	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND	|
				   CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS;
		long last_pid_len;
		long parent_tid;
		/* NOTE(review): this 'i' shadows the function-level one */
		int i, fd;

		fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
		if (fd < 0) {
			pr_err("can't open last pid fd %d\n", fd);
			goto core_restore_end;
		}

		/* Hold the lock across all "write last pid, then clone" pairs */
		ret = sys_flock(fd, LOCK_EX);
		if (ret) {
			pr_err("Can't lock last_pid %d\n", fd);
			sys_close(fd);
			goto core_restore_end;
		}

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;

			/* skip self */
			if (thread_args[i].pid == args->t->pid)
				continue;

			new_sp = restorer_stack(thread_args[i].mz);
			/* Write "wanted pid - 1" so the next clone is expected to get the wanted pid */
			last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
			sys_lseek(fd, 0, SEEK_SET);
			ret = sys_write(fd, s, last_pid_len);
			if (ret < 0) {
				pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
				sys_close(fd);
				goto core_restore_end;
			}

			/*
			 * To achieve functionality like libc's clone()
			 * we need a pure assembly here, because clone()'ed
			 * thread will run with own stack and we must not
			 * have any additional instructions... oh, dear...
			 */

			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
		}

		ret = sys_flock(fd, LOCK_UN);
		if (ret) {
			pr_err("Can't unlock last_pid %ld\n", ret);
			sys_close(fd);
			goto core_restore_end;
		}

		sys_close(fd);
	}

	restore_rlims(args);

	ret = create_posix_timers(args);
	if (ret < 0) {
		pr_err("Can't restore posix timers %ld\n", ret);
		goto core_restore_end;
	}

	ret = timerfd_arm(args);
	if (ret < 0) {
		pr_err("Can't restore timerfd %ld\n", ret);
		goto core_restore_end;
	}

	pr_info("%ld: Restored\n", sys_getpid());

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE);

	if (wait_helpers(args) < 0)
		goto core_restore_end;
	if (wait_zombies(args) < 0)
		goto core_restore_end;

	/* Block everything before the original SIGCHLD action is put back */
	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		pr_err("Unable to block signals %ld\n", ret);
		goto core_restore_end;
	}

	if (!args->compatible_mode) {
		sys_sigaction(SIGCHLD, &args->sigchld_act,
			      NULL, sizeof(k_rtsigset_t));
	} else {
		void *stack = alloc_compat_syscall_stack();

		if (!stack) {
			pr_err("Failed to allocate 32-bit stack for sigaction\n");
			goto core_restore_end;
		}
		arch_compat_rt_sigaction(stack, SIGCHLD,
				(void*)&args->sigchld_act);
		free_compat_syscall_stack(stack);
	}

	ret = restore_signals(args->siginfo, args->siginfo_n, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_n, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);

	rst_tcp_socks_all(args);

	/*
	 * The kernel restricts setting seccomp to uid 0 in the current user
	 * ns, so we must do this before restore_creds.
	 */
	pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
	if (restore_seccomp(args))
		goto core_restore_end;

	/*
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_NET_ADMIN protected,
	 * thus restore* creds _after_ all of the above.
	 *
	 * The || chain stops at the first failing step and leaves
	 * ret as 0 or 1.
	 */
	ret = restore_creds(args->t->creds_args, args->proc_fd);
	ret = ret || restore_dumpable_flag(&args->mm);
	ret = ret || restore_pdeath_sig(args->t);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);

	/* A failure of the creds steps above is only acted upon here */
	if (ret)
		BUG();

	/* Wait until children stop to use args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	sys_close(args->proc_fd);
	std_log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes sure the
	 * code below doesn't fail due to bad timing values.
	 */

/* A timer counts as armed when its interval is non-zero */
#define itimer_armed(args, i)				\
	(args->itimers[i].it_interval.tv_sec ||	\
	 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	restore_posix_timers(args);

	sys_munmap(args->rst_mem, args->rst_mem_size);

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe);

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	rst_sigreturn(new_sp, rt_sigframe);

core_restore_end:
	/* Wake everybody waiting on the shared counter, then terminate */
	futex_abort_and_wake(&task_entries_local->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;
}
/*
 * Classic ELF symbol hash (see the ELF format specification) of the
 * NUL-terminated string @name.
 */
static unsigned long elf_hash(const unsigned char *name)
{
	unsigned long hash = 0;
	const unsigned char *p;

	for (p = name; *p != '\0'; p++) {
		unsigned long top;

		hash = (hash << 4) + *p;

		/* Fold the top nibble back into the low bits, then drop it */
		top = hash & 0xf0000000ul;
		if (top != 0)
			hash ^= top >> 24;
		hash &= ~top;
	}

	return hash;
}
*/ #if defined(CONFIG_VDSO_32) static const char elf_ident[] = { 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; #else static const char elf_ident[] = { 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; #endif BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { pr_err("ELF header magic mismatch\n"); return false; } return true; } static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t **load) { Ehdr_t *ehdr = (void *)mem; uintptr_t addr; Phdr_t *phdr; int i; if (__ptr_struct_end_oob(mem, sizeof(Ehdr_t), mem, size)) goto err_oob; /* * Make sure it's a file we support. */ if (!has_elf_identity(ehdr)) return -EINVAL; addr = mem + ehdr->e_phoff; if (__ptr_oob(addr, mem, size)) goto err_oob; for (i = 0; i < ehdr->e_phnum; i++, addr += sizeof(Phdr_t)) { if (__ptr_struct_end_oob(addr, sizeof(Phdr_t), mem, size)) goto err_oob; phdr = (void *)addr; switch (phdr->p_type) { case PT_DYNAMIC: if (*dynamic) { pr_err("Second PT_DYNAMIC header\n"); return -EINVAL; } *dynamic = phdr; break; case PT_LOAD: if (*load) { pr_err("Second PT_LOAD header\n"); return -EINVAL; } *load = phdr; break; } } return 0; err_oob: pr_err("Corrupted Elf phdr\n"); return -EFAULT; } /* * Parse dynamic program header. 
* Output parameters are: * @dyn_strtab - address of the symbol table * @dyn_symtab - address of the string table section * @dyn_hash - address of the symbol hash table */ static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, Dyn_t **dyn_hash) { Dyn_t *dyn_syment = NULL; Dyn_t *dyn_strsz = NULL; uintptr_t addr; Dyn_t *d; int i; addr = mem + dynamic->p_offset; if (__ptr_oob(addr, mem, size)) goto err_oob; for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, addr += sizeof(Dyn_t)) { if (__ptr_struct_end_oob(addr, sizeof(Dyn_t), mem, size)) goto err_oob; d = (void *)addr; if (d->d_tag == DT_NULL) { break; } else if (d->d_tag == DT_STRTAB) { *dyn_strtab = d; pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr); } else if (d->d_tag == DT_SYMTAB) { *dyn_symtab = d; pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr); } else if (d->d_tag == DT_STRSZ) { dyn_strsz = d; pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val); } else if (d->d_tag == DT_SYMENT) { dyn_syment = d; pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val); } else if (d->d_tag == DT_HASH) { *dyn_hash = d; pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); } } if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) { pr_err("Not all dynamic entries are present\n"); return -EINVAL; } return 0; err_oob: pr_err("Corrupted Elf dynamic section\n"); return -EFAULT; } /* On s390x Hash_t is 64 bit */ #ifdef __s390x__ typedef unsigned long Hash_t; #else typedef Word_t Hash_t; #endif static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab) { const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name); Hash_t nbucket, nchain; Hash_t *bucket, *chain; unsigned int i, j, k; uintptr_t addr; nbucket = hash[0]; nchain = hash[1]; bucket = &hash[2]; 
chain = &hash[nbucket + 2]; pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", (long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain); for (i = 0; i < VDSO_SYMBOL_MAX; i++) { const char * symbol = vdso_symbols[i]; k = elf_hash((const unsigned char *)symbol); for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) { addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; Sym_t *sym; char *name; addr += sizeof(Sym_t)*j; if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) continue; sym = (void *)addr; if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) continue; addr = dynsymbol_names + sym->st_name; if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) continue; name = (void *)addr; if (std_strncmp(name, symbol, vdso_symbol_length)) continue; memcpy(t->symbols[i].name, name, vdso_symbol_length); t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; break; } } } int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) { Phdr_t *dynamic = NULL, *load = NULL; Dyn_t *dyn_strtab = NULL; Dyn_t *dyn_symtab = NULL; Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; uintptr_t dynsymbol_names; uintptr_t addr; int ret; pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size); /* * We need PT_LOAD and PT_DYNAMIC here. Each once. */ ret = parse_elf_phdr(mem, size, &dynamic, &load); if (ret < 0) return ret; if (!load || !dynamic) { pr_err("One of obligated program headers is missed\n"); return -EINVAL; } pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr); /* * Dynamic section tags should provide us the rest of information * needed. Note that we're interested in a small set of tags. 
*/ ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, &dyn_hash); if (ret < 0) return ret; addr = mem + dyn_strtab->d_un.d_val - load->p_vaddr; if (__ptr_oob(addr, mem, size)) goto err_oob; dynsymbol_names = addr; addr = mem + dyn_hash->d_un.d_ptr - load->p_vaddr; if (__ptr_struct_oob(addr, sizeof(Word_t), mem, size)) goto err_oob; hash = (void *)addr; parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab); return 0; err_oob: pr_err("Corrupted Elf symbols/hash\n"); return -EFAULT; } criu-3.6/criu/pie/util.c000066400000000000000000000020031317335042600151630ustar00rootroot00000000000000#include #include #include #include #include #include #include "int.h" #include "types.h" #include "common/compiler.h" #include "fcntl.h" #include "log.h" #include "util-pie.h" #ifdef CR_NOGLIBC # include # define __sys(foo) sys_##foo #else # define __sys(foo) foo #endif #ifdef CR_NOGLIBC #define __pr_perror(fmt, ...) pr_err(fmt "\n", ##__VA_ARGS__) #else #define __pr_perror(fmt, ...) 
pr_perror(fmt, ##__VA_ARGS__) #endif int open_detach_mount(char *dir) { int fd, ret; fd = __sys(open)(dir, O_RDONLY | O_DIRECTORY, 0); if (fd < 0) __pr_perror("Can't open directory %s: %d", dir, fd); ret = __sys(umount2)(dir, MNT_DETACH); if (ret) { __pr_perror("Can't detach mount %s: %d", dir, ret); goto err_close; } ret = __sys(rmdir)(dir); if (ret) { __pr_perror("Can't remove tmp dir %s: %d", dir, ret); goto err_close; } return fd; err_close: if (fd >= 0) __sys(close)(fd); return -1; } criu-3.6/criu/pipes.c000066400000000000000000000256271317335042600145720ustar00rootroot00000000000000#include #include #include #include #include #include "crtools.h" #include "imgset.h" #include "image.h" #include "files.h" #include "pipes.h" #include "util-pie.h" #include "autofs.h" #include "protobuf.h" #include "util.h" #include "images/pipe.pb-c.h" #include "images/pipe-data.pb-c.h" #include "fcntl.h" #include "namespaces.h" static LIST_HEAD(pipes); static void show_saved_pipe_fds(struct pipe_info *pi) { struct fdinfo_list_entry *fle; pr_info(" `- ID %p %#x\n", pi, pi->pe->id); list_for_each_entry(fle, &pi->d.fd_info_head, desc_list) pr_info(" `- FD %d pid %d\n", fle->fe->fd, fle->pid); } static int pipe_data_read(struct cr_img *img, struct pipe_data_rst *r) { unsigned long bytes = r->pde->bytes; if (!bytes) return 0; /* * We potentially allocate more memory than required for data, * but this is OK. Look at restore_pipe_data -- it vmsplice-s * this into the kernel with F_GIFT flag (since some time it * works on non-aligned data), thus just giving this page to * pipe buffer. 
And since kernel allocates pipe buffers in pages * anyway we don't increase memory consumption :) */ r->data = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0); if (r->data == MAP_FAILED) { pr_perror("Can't map mem for pipe buffers"); return -1; } return read_img_buf(img, r->data, bytes); } int do_collect_pipe_data(struct pipe_data_rst *r, ProtobufCMessage *msg, struct cr_img *img, struct pipe_data_rst **hash) { int aux; r->pde = pb_msg(msg, PipeDataEntry); aux = pipe_data_read(img, r); if (aux < 0) return aux; aux = r->pde->pipe_id & PIPE_DATA_HASH_MASK; r->next = hash[aux]; hash[aux] = r; pr_info("Collected pipe data for %#x (chain %u)\n", r->pde->pipe_id, aux); return 0; } /* Choose who will restore a pipe. */ static int mark_pipe_master_cb(struct pprep_head *ph) { LIST_HEAD(head); pr_info("Pipes:\n"); while (1) { struct fdinfo_list_entry *fle; struct pipe_info *pi, *pic, *p; struct pipe_info *pr = NULL, *pw = NULL; if (list_empty(&pipes)) break; pi = list_first_entry(&pipes, struct pipe_info, list); list_move(&pi->list, &head); pr_info(" `- PIPE ID %#x\n", pi->pe->pipe_id); show_saved_pipe_fds(pi); fle = file_master(&pi->d); p = pi; if (!(pi->pe->flags & O_LARGEFILE)) { if (pi->pe->flags & O_WRONLY) { if (pw == NULL) pw = pi; } else { if (pr == NULL) pr = pi; } } list_for_each_entry(pic, &pi->pipe_list, pipe_list) { struct fdinfo_list_entry *f; list_move(&pic->list, &head); f = file_master(&pic->d); if (fdinfo_rst_prio(f, fle)) { p = pic; fle = f; } if (!(pic->pe->flags & O_LARGEFILE)) { if (pic->pe->flags & O_WRONLY) { if (pw == NULL) pw = pic; } else { if (pr == NULL) pr = pic; } } show_saved_pipe_fds(pic); } p->create = 1; if (pr) pr->reopen = 0; if (pw) pw->reopen = 0; pr_info(" by %#x\n", p->pe->id); } list_splice(&head, &pipes); return 0; } static MAKE_PPREP_HEAD(mark_pipe_master); static struct pipe_data_rst *pd_hash_pipes[PIPE_DATA_HASH_SIZE]; int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash) { int 
ret; struct pipe_data_rst *pd; struct iovec iov; for (pd = hash[id & PIPE_DATA_HASH_MASK]; pd != NULL; pd = pd->next) if (pd->pde->pipe_id == id) break; if (!pd) { /* no data for this pipe */ pr_info("No data for pipe %#x\n", id); return 0; } if (!pd->pde->bytes) goto out; if (!pd->data) { pr_err("Double data restore occurred on %#x\n", id); return -1; } iov.iov_base = pd->data; iov.iov_len = pd->pde->bytes; while (iov.iov_len > 0) { ret = vmsplice(pfd, &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK); if (ret < 0) { pr_perror("%#x: Error splicing data", id); goto err; } if (ret == 0 || ret > iov.iov_len /* sanity */) { pr_err("%#x: Wanted to restore %zu bytes, but got %d\n", id, iov.iov_len, ret); ret = -1; goto err; } iov.iov_base += ret; iov.iov_len -= ret; } /* * 3 reasons for killing the buffer from our address space: * * 1. We gifted the pages to the kernel to optimize memory usage, thus * accidental memory corruption can change the pipe buffer. * 2. This will make the vmas restoration a bit faster due to less self * mappings to be unmapped. * 3. We can catch bugs with double pipe data restore. 
*/ munmap(pd->data, pd->pde->bytes); pd->data = NULL; out: ret = 0; if (pd->pde->has_size) { pr_info("Restoring size %#x for %#x\n", pd->pde->size, pd->pde->pipe_id); ret = fcntl(pfd, F_SETPIPE_SZ, pd->pde->size); if (ret < 0) pr_perror("Can't restore pipe size"); else ret = 0; } err: return ret; } static int userns_reopen(void *_arg, int fd, pid_t pid) { char path[PSFDS]; int ret, flags = *(int*)_arg; sprintf(path, "/proc/self/fd/%d", fd); ret = open(path, flags); if (ret < 0) pr_perror("Unable to reopen the pipe %s", path); close(fd); return ret; } static int reopen_pipe(int fd, int flags) { int ret; char path[PSFDS]; sprintf(path, "/proc/self/fd/%d", fd); ret = open(path, flags); if (ret < 0) { if (errno == EACCES) { /* It may be an external pipe from an another userns */ ret = userns_call(userns_reopen, UNS_FDOUT, &flags, sizeof(flags), fd); } else pr_perror("Unable to reopen the pipe %s", path); } close(fd); return ret; } static int recv_pipe_fd(struct pipe_info *pi, int *new_fd) { int tmp, fd, ret; ret = recv_desc_from_peer(&pi->d, &tmp); if (ret != 0) { if (ret != 1) pr_err("Can't get fd %d\n", tmp); return ret; } if (pi->reopen) fd = reopen_pipe(tmp, pi->pe->flags); else fd = tmp; if (fd >= 0) { if (rst_file_params(fd, pi->pe->fown, pi->pe->flags)) { close(fd); return -1; } *new_fd = fd; } return fd < 0 ? 
-1 : 0; } static char *pipe_d_name(struct file_desc *d, char *buf, size_t s) { struct pipe_info *pi; pi = container_of(d, struct pipe_info, d); if (snprintf(buf, s, "pipe:[%d]", pi->pe->pipe_id) >= s) { pr_err("Not enough room for pipe %d identifier string\n", pi->pe->pipe_id); return NULL; } return buf; } int open_pipe(struct file_desc *d, int *new_fd) { struct pipe_info *pi, *p; int ret, tmp; int pfd[2]; pi = container_of(d, struct pipe_info, d); pr_info("\t\tCreating pipe pipe_id=%#x id=%#x\n", pi->pe->pipe_id, pi->pe->id); if (inherited_fd(d, &tmp)) { if (tmp < 0) return tmp; pi->reopen = 1; goto reopen; } if (!pi->create) return recv_pipe_fd(pi, new_fd); if (pipe(pfd) < 0) { pr_perror("Can't create pipe"); return -1; } ret = restore_pipe_data(CR_FD_PIPES_DATA, pfd[1], pi->pe->pipe_id, pd_hash_pipes); if (ret) return -1; list_for_each_entry(p, &pi->pipe_list, pipe_list) { int fd = pfd[p->pe->flags & O_WRONLY]; if (send_desc_to_peer(fd, &p->d)) { pr_perror("Can't send file descriptor"); return -1; } } close(pfd[!(pi->pe->flags & O_WRONLY)]); tmp = pfd[pi->pe->flags & O_WRONLY]; reopen: if (pi->reopen) tmp = reopen_pipe(tmp, pi->pe->flags); if (tmp >= 0) if (rst_file_params(tmp, pi->pe->fown, pi->pe->flags)) return -1; if (tmp < 0) return -1; *new_fd = tmp; return 0; } static struct file_desc_ops pipe_desc_ops = { .type = FD_TYPES__PIPE, .open = open_pipe, .name = pipe_d_name, }; int collect_one_pipe_ops(void *o, ProtobufCMessage *base, struct file_desc_ops *ops) { struct pipe_info *pi = o, *tmp; pi->pe = pb_msg(base, PipeEntry); pi->create = 0; pi->reopen = 1; pr_info("Collected pipe entry ID %#x PIPE ID %#x\n", pi->pe->id, pi->pe->pipe_id); if (file_desc_add(&pi->d, pi->pe->id, ops)) return -1; INIT_LIST_HEAD(&pi->pipe_list); if (!inherited_fd(&pi->d, NULL)) { list_for_each_entry(tmp, &pipes, list) if (pi->pe->pipe_id == tmp->pe->pipe_id) break; if (&tmp->list != &pipes) list_add(&pi->pipe_list, &tmp->pipe_list); } add_post_prepare_cb_once(&mark_pipe_master); 
/*
 * Collect callback for pipe image entries (installed as .collect of
 * pipe_cinfo): forwards to collect_one_pipe_ops() with the regular
 * pipe_desc_ops. The image handle @i is not used.
 */
static int collect_one_pipe(void *o, ProtobufCMessage *base, struct cr_img *i)
{
	return collect_one_pipe_ops(o, base, &pipe_desc_ops);
}
%d\n", pipe_id(p), bytes, wrote); goto err_close; } } ret = 0; err_close: close(steal_pipe[0]); close(steal_pipe[1]); err: return ret; } static struct pipe_data_dump pd_pipes = { .img_type = CR_FD_PIPES_DATA, }; static int dump_one_pipe(int lfd, u32 id, const struct fd_parms *p) { FileEntry fe = FILE_ENTRY__INIT; PipeEntry pe = PIPE_ENTRY__INIT; pr_info("Dumping pipe %d with id %#x pipe_id %#x\n", lfd, id, pipe_id(p)); if ((p->flags & O_DIRECT) && !is_autofs_pipe(pipe_id(p))) { pr_err("The packetized mode for pipes is not supported yet\n"); return -1; } pe.id = id; pe.pipe_id = pipe_id(p); pe.flags = p->flags & ~O_DIRECT; pe.fown = (FownEntry *)&p->fown; fe.type = FD_TYPES__PIPE; fe.id = pe.id; fe.pipe = &pe; if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) return -1; return dump_one_pipe_data(&pd_pipes, lfd, p); } const struct fdtype_ops pipe_dump_ops = { .type = FD_TYPES__PIPE, .dump = dump_one_pipe, }; criu-3.6/criu/plugin.c000066400000000000000000000123141317335042600147350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "cr_options.h" #include "common/compiler.h" #include "xmalloc.h" #include "plugin.h" #include "common/list.h" #include "log.h" cr_plugin_ctl_t cr_plugin_ctl = { .head.next = &cr_plugin_ctl.head, .head.prev = &cr_plugin_ctl.head, }; /* * If we met old version of a plugin, selfgenerate a plugin descriptor for it. */ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) { cr_plugin_desc_t *d; d = xzalloc(sizeof(*d)); if (!d) return NULL; d->name = xstrdup(path); d->max_hooks = CR_PLUGIN_HOOK__MAX; d->version = CRIU_PLUGIN_VERSION_OLD; pr_warn("Generating dynamic descriptor for plugin `%s'." "Won't work in next version of the program." 
"Please update your plugin.\n", path); #define __assign_hook(__hook, __name) \ do { \ void *name; \ name = dlsym(h, __name); \ if (name) \ d->hooks[CR_PLUGIN_HOOK__ ##__hook] = name; \ } while (0) __assign_hook(DUMP_UNIX_SK, "cr_plugin_dump_unix_sk"); __assign_hook(RESTORE_UNIX_SK, "cr_plugin_restore_unix_sk"); __assign_hook(DUMP_EXT_FILE, "cr_plugin_dump_file"); __assign_hook(RESTORE_EXT_FILE, "cr_plugin_restore_file"); __assign_hook(DUMP_EXT_MOUNT, "cr_plugin_dump_ext_mount"); __assign_hook(RESTORE_EXT_MOUNT, "cr_plugin_restore_ext_mount"); __assign_hook(DUMP_EXT_LINK, "cr_plugin_dump_ext_link"); #undef __assign_hook d->init = dlsym(h, "cr_plugin_init"); d->exit = dlsym(h, "cr_plugin_fini"); return d; } static void show_plugin_desc(cr_plugin_desc_t *d) { size_t i; pr_debug("Plugin \"%s\" (version %u hooks %u)\n", d->name, d->version, d->max_hooks); for (i = 0; i < d->max_hooks; i++) { if (d->hooks[i]) pr_debug("\t%4zu -> %p\n", i, d->hooks[i]); } } static int verify_plugin(cr_plugin_desc_t *d) { if (d->version > CRIU_PLUGIN_VERSION) { pr_debug("Plugin %s has version %x while max %x supported\n", d->name, d->version, CRIU_PLUGIN_VERSION); return -1; } if (d->max_hooks > CR_PLUGIN_HOOK__MAX) { pr_debug("Plugin %s has %u assigned while max %u supported\n", d->name, d->max_hooks, CR_PLUGIN_HOOK__MAX); return -1; } return 0; } static int cr_lib_load(int stage, char *path) { cr_plugin_desc_t *d; plugin_desc_t *this; size_t i; void *h; bool allocated = false; h = dlopen(path, RTLD_LAZY); if (h == NULL) { pr_err("Unable to load %s: %s\n", path, dlerror()); return -1; } /* * Load plugin descriptor. If plugin is too old -- create * dynamic plugin descriptor. In most cases this won't * be a common operation and plugins are not supposed to * be changing own format frequently. 
*/ d = dlsym(h, "CR_PLUGIN_DESC"); if (!d) { d = cr_gen_plugin_desc(h, path); if (!d) { pr_err("Can't load plugin %s\n", path); goto error_close; } allocated = true; } this = xzalloc(sizeof(*this)); if (!this) goto error_close; if (verify_plugin(d)) { pr_err("Corrupted plugin %s\n", path); goto error_free; } this->d = d; this->dlhandle = h; INIT_LIST_HEAD(&this->list); for (i = 0; i < d->max_hooks; i++) INIT_LIST_HEAD(&this->link[i]); list_add_tail(&this->list, &cr_plugin_ctl.head); show_plugin_desc(d); if (d->init && d->init(stage)) { pr_err("Failed in init(%d) of \"%s\"\n", stage, d->name); list_del(&this->list); goto error_free; } /* * Chain hooks into appropriate places for * fast handler access. */ for (i = 0; i < d->max_hooks; i++) { if (!d->hooks[i]) continue; list_add_tail(&this->link[i], &cr_plugin_ctl.hook_chain[i]); } return 0; error_free: xfree(this); error_close: dlclose(h); if (allocated) xfree(d); return -1; } void cr_plugin_fini(int stage, int ret) { plugin_desc_t *this, *tmp; list_for_each_entry_safe(this, tmp, &cr_plugin_ctl.head, list) { void *h = this->dlhandle; size_t i; list_del(&this->list); if (this->d->exit) this->d->exit(stage, ret); for (i = 0; i < this->d->max_hooks; i++) { if (!list_empty(&this->link[i])) list_del(&this->link[i]); } if (this->d->version == CRIU_PLUGIN_VERSION_OLD) xfree(this->d); dlclose(h); } } int cr_plugin_init(int stage) { int exit_code = -1; char *path; size_t i; DIR *d; INIT_LIST_HEAD(&cr_plugin_ctl.head); for (i = 0; i < ARRAY_SIZE(cr_plugin_ctl.hook_chain); i++) INIT_LIST_HEAD(&cr_plugin_ctl.hook_chain[i]); if (opts.libdir == NULL) { path = getenv("CRIU_LIBS_DIR"); if (path) opts.libdir = path; else { if (access(CR_PLUGIN_DEFAULT, F_OK)) return 0; opts.libdir = CR_PLUGIN_DEFAULT; } } d = opendir(opts.libdir); if (d == NULL) { pr_perror("Unable to open directory %s", opts.libdir); return -1; } while (1) { char path[PATH_MAX]; struct dirent *de; int len; errno = 0; de = readdir(d); if (de == NULL) { if (errno == 
/*
 * Check that @line starts with "%lx-%lx " as printed in the maps and
 * smaps files: a run of lowercase hex digits, a dash, another run of
 * lowercase hex digits and a space. Either run may be empty.
 */
static bool __is_vma_range_fmt(char *line)
{
	static const char delims[] = { '-', ' ' };
	size_t k;

	for (k = 0; k < sizeof(delims); k++) {
		/* Skip the address; the terminating NUL is not a digit */
		while ((*line >= '0' && *line <= '9') ||
		       (*line >= 'a' && *line <= 'f'))
			line++;

		if (*line++ != delims[k])
			return false;
	}

	return true;
}
/* Identity of the file behind a mapping, as parsed from a smaps line */
struct vma_file_info {
	int dev_maj;
	int dev_min;
	unsigned long ino;
	struct vma_area *vma;
};

/*
 * Two infos are equal when device major, device minor and inode number
 * all match; the ->vma back pointer does not take part in the check.
 * Returns 1 when equal and 0 otherwise.
 */
static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b)
{
	if (a->ino != b->ino)
		return 0;
	if (a->dev_maj != b->dev_maj)
		return 0;
	return a->dev_min == b->dev_min;
}
* Zero return means that the symbolic link does not point to * a branch and we can do fstat() below. */ if (opts.aufs) { int ret; ret = fixup_aufs_vma_fd(vma, fd); if (ret < 0) return -1; if (ret > 0) return 0; } if (fstat(fd, vma->vmst) < 0) { pr_perror("Failed fstat on map %"PRIx64"", vma->e->start); return -1; } return 0; } static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct vma_file_info *vfi, int *vm_file_fd, const char *path) { int fd; dev_t vfi_dev; /* * Kernel prohibits reading map_files for users. The * best we can do here is fill stat using the information * from smaps file and ... hope for the better :\ * * Here we'll miss AIO-s and sockets :( */ if (fname[0] == '\0') { /* * Another bad thing is that kernel first checks * for permission access to ANY map_files link, * then checks for its existence. So we have to * check for file path being empty to "emulate" * the ENOENT case. */ if (vfi->dev_maj != 0 || vfi->dev_min != 0 || vfi->ino != 0) { pr_err("Strange file mapped at %lx [%s]:%d.%d.%ld\n", (unsigned long)vma->e->start, fname, vfi->dev_maj, vfi->dev_min, vfi->ino); return -1; } return 0; } else if (fname[0] != '/') { /* * This should be some kind of * special mapping like [heap], [vdso] * and such, the caller should take care * of the @fname and vma status. 
*/ return 0; } vfi_dev = makedev(vfi->dev_maj, vfi->dev_min); if (is_anon_shmem_map(vfi_dev)) { if (!(vma->e->flags & MAP_SHARED)) return -1; vma->e->flags |= MAP_ANONYMOUS; vma->e->status |= VMA_ANON_SHARED; vma->e->shmid = vfi->ino; if (!strncmp(fname, "/SYSV", 5)) vma->e->status |= VMA_AREA_SYSVIPC; return 0; } pr_info("Failed to open map_files/%s, try to go via [%s] path\n", path, fname); fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("Can't open mapped [%s]", fname); return -1; } if (vma_stat(vma, fd)) { close(fd); return -1; } if (vma->vmst->st_dev != vfi_dev || vma->vmst->st_ino != vfi->ino) { pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); close(fd); return -1; } *vm_file_fd = fd; return 0; } static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, struct vma_file_info *vfi, struct vma_file_info *prev_vfi, int *vm_file_fd) { char path[32]; int flags; /* Figure out if it's file mapping */ snprintf(path, sizeof(path), "%"PRIx64"-%"PRIx64, vma->e->start, vma->e->end); if (vma_get_mapfile_flags(vma, mfd, path)) return -1; if (prev_vfi->vma && vfi_equal(vfi, prev_vfi)) { struct vma_area *prev = prev_vfi->vma; /* * If vfi is equal (!) and negative @vm_file_fd -- * we have nothing to borrow for sure. */ if (*vm_file_fd < 0) return 0; pr_debug("vma %"PRIx64" borrows vfi from previous %"PRIx64"\n", vma->e->start, prev->e->start); if (prev->e->status & VMA_AREA_SOCKET) vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; /* * FIXME -- in theory there can be vmas that have * dev:ino match, but live in different mount * namespaces. However, we only borrow files for * subsequent vmas. These are _very_ likely to * have files from the same namespaces. */ vma->file_borrowed = true; return 0; } close_safe(vm_file_fd); /* * Note that we "open" it in dumper process space * so later we might refer to it via /proc/self/fd/vm_file_fd * if needed. 
*/ flags = O_PATH; if (vfi->dev_maj == 0) /* * Opening with O_PATH omits calling kernel ->open * method, thus for some special files their type * detection might be broken. Thus we open those with * the O_RDONLY to potentially get ENXIO and check * it below. */ flags = O_RDONLY; *vm_file_fd = openat(dirfd(mfd), path, flags); if (*vm_file_fd < 0) { if (errno == ENOENT) /* Just mapping w/o map_files link */ return 0; if (errno == ENXIO) { struct stat buf; if (fstatat(dirfd(mfd), path, &buf, 0)) return -1; if (S_ISSOCK(buf.st_mode)) { pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start); vma->vm_socket_id = buf.st_ino; vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; return 0; } if ((buf.st_mode & S_IFMT) == 0 && !strncmp(fname, AIO_FNAME, sizeof(AIO_FNAME) - 1)) { /* AIO ring, let's try */ close_safe(vm_file_fd); vma->e->status = VMA_AREA_AIORING; return 0; } pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); return -1; } if (errno == EPERM && !opts.aufs) return vma_get_mapfile_user(fname, vma, vfi, vm_file_fd, path); pr_perror("Can't open map_files"); return -1; } return vma_stat(vma, *vm_file_fd); } int parse_self_maps_lite(struct vm_area_list *vms) { struct vma_area *prev = NULL; struct bfd maps; char *buf; vm_area_list_init(vms); maps.fd = open_proc(PROC_SELF, "maps"); if (maps.fd < 0) return -1; if (bfdopenr(&maps)) return -1; while (1) { struct vma_area *vma; char *end; unsigned long s, e; buf = breadline(&maps); if (!buf) break; if (IS_ERR(buf)) goto err; s = strtoul(buf, &end, 16); e = strtoul(end + 1, NULL, 16); if (prev && prev->e->end == s) /* * This list is needed for one thing only -- to * get the idea of what parts of current address * space are busy. So merge them altogether. 
*/ prev->e->end = e; else { vma = alloc_vma_area(); if (!vma) goto err; vma->e->start = s; vma->e->end = e; list_add_tail(&vma->list, &vms->h); vms->nr++; prev = vma; } pr_debug("Parsed %"PRIx64"-%"PRIx64" vma\n", prev->e->start, prev->e->end); } bclose(&maps); return 0; err: bclose(&maps); return -1; } #ifdef CONFIG_VDSO static inline int handle_vdso_vma(struct vma_area *vma) { vma->e->status |= VMA_AREA_REGULAR; if ((vma->e->prot & VDSO_PROT) == VDSO_PROT) vma->e->status |= VMA_AREA_VDSO; return 0; } static inline int handle_vvar_vma(struct vma_area *vma) { vma->e->status |= VMA_AREA_REGULAR; if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) vma->e->status |= VMA_AREA_VVAR; return 0; } #else static inline int handle_vdso_vma(struct vma_area *vma) { pr_warn_once("Found vDSO area without support\n"); return -1; } static inline int handle_vvar_vma(struct vma_area *vma) { pr_warn_once("Found VVAR area without support\n"); return -1; } #endif static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_path, DIR *map_files_dir, struct vma_file_info *vfi, struct vma_file_info *prev_vfi, int *vm_file_fd) { if (vma_get_mapfile(file_path, vma_area, map_files_dir, vfi, prev_vfi, vm_file_fd)) goto err_bogus_mapfile; if (vma_area->e->status != 0) return 0; if (!strcmp(file_path, "[vsyscall]") || !strcmp(file_path, "[vectors]")) { vma_area->e->status |= VMA_AREA_VSYSCALL; } else if (!strcmp(file_path, "[vdso]")) { if (handle_vdso_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[vvar]")) { if (handle_vvar_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; } else { vma_area->e->status = VMA_AREA_REGULAR; } /* * Some mapping hints for restore, we save this on * disk and restore might need to analyze it. */ if (vma_area->file_borrowed) { struct vma_area *prev = prev_vfi->vma; /* * Pick-up flags that might be set in the branch below. 
* Status is copied as-is as it should be zero here, * and have full match with the previous. */ vma_area->e->flags |= (prev->e->flags & MAP_ANONYMOUS); vma_area->e->status = prev->e->status; vma_area->e->shmid = prev->e->shmid; vma_area->vmst = prev->vmst; vma_area->mnt_id = prev->mnt_id; } else if (*vm_file_fd >= 0) { struct stat *st_buf = vma_area->vmst; if (S_ISREG(st_buf->st_mode)) /* regular file mapping -- supported */; else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) /* devzero mapping -- also makes sense */; else { pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); goto err; } /* * /dev/zero stands for anon-shared mapping * otherwise it's some file mapping. */ if (is_anon_shmem_map(st_buf->st_dev)) { if (!(vma_area->e->flags & MAP_SHARED)) goto err_bogus_mapping; vma_area->e->flags |= MAP_ANONYMOUS; vma_area->e->status |= VMA_ANON_SHARED; vma_area->e->shmid = st_buf->st_ino; if (!strncmp(file_path, "/SYSV", 5)) { pr_info("path: %s\n", file_path); vma_area->e->status |= VMA_AREA_SYSVIPC; } } else { if (vma_area->e->flags & MAP_PRIVATE) vma_area->e->status |= VMA_FILE_PRIVATE; else vma_area->e->status |= VMA_FILE_SHARED; } /* * We cannot use the mnt_id value provided by the kernel * for vm_file_fd if it is an AUFS file (the value is * wrong). In such a case, fixup_aufs_vma_fd() has set * mnt_id to -1 to mimic pre-3.15 kernels that didn't * have mnt_id. */ if (vma_area->mnt_id != -1 && get_fd_mntid(*vm_file_fd, &vma_area->mnt_id)) return -1; } else { /* * No file but mapping -- anonymous one. 
*/ if (vma_area->e->flags & MAP_SHARED) { vma_area->e->status |= VMA_ANON_SHARED; vma_area->e->shmid = vfi->ino; } else { vma_area->e->status |= VMA_ANON_PRIVATE; } vma_area->e->flags |= MAP_ANONYMOUS; } return 0; err: return -1; err_bogus_mapping: pr_err("Bogus mapping 0x%"PRIx64"-0x%"PRIx64" (flags: %#x vm_file_fd: %d)\n", vma_area->e->start, vma_area->e->end, vma_area->e->flags, *vm_file_fd); goto err; err_bogus_mapfile: pr_perror("Can't open %d's mapfile link %"PRIx64, pid, vma_area->e->start); goto err; } static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area_list, unsigned long *prev_end, struct vma_file_info *vfi, struct vma_file_info *prev_vfi) { if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016"PRIx64"-%016"PRIx64"\n", vma_area->e->start, vma_area->e->end); return -1; } /* Add a guard page only if here is enough space for it */ if (vma_has_guard_gap_hidden(vma_area) && *prev_end < vma_area->e->start) vma_area->e->start -= PAGE_SIZE; /* Guard page */ *prev_end = vma_area->e->end; list_add_tail(&vma_area->list, &vma_area_list->h); vma_area_list->nr++; if (vma_area_is_private(vma_area, kdat.task_size)) { unsigned long pages; pages = vma_area_len(vma_area) / PAGE_SIZE; vma_area_list->priv_size += pages; vma_area_list->priv_longest = max(vma_area_list->priv_longest, pages); } else if (vma_area_is(vma_area, VMA_ANON_SHARED)) { unsigned long pages; pages = vma_area_len(vma_area) / PAGE_SIZE; vma_area_list->shared_longest = max(vma_area_list->shared_longest, pages); } *prev_vfi = *vfi; prev_vfi->vma = vma_area; return 0; } /* * On s390 we have old kernels where the global task size assumption of * criu does not work. See also compel_task_size() for s390. 
*/ static int task_size_check(pid_t pid, VmaEntry *entry) { #ifdef __s390x__ if (entry->end <= kdat.task_size) return 0; pr_err("Can't dump high memory region %lx-%lx of task %d because kernel commit ee71d16d22bb is missing\n", entry->start, entry->end, pid); return -1; #else return 0; #endif } int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap) { struct vma_area *vma_area = NULL; unsigned long start, end, pgoff, prev_end = 0; char r, w, x, s; int ret = -1, vm_file_fd = -1; struct vma_file_info vfi; struct vma_file_info prev_vfi = {}; DIR *map_files_dir = NULL; struct bfd f; vma_area_list->nr = 0; vma_area_list->nr_aios = 0; vma_area_list->priv_longest = 0; vma_area_list->priv_size = 0; vma_area_list->shared_longest = 0; INIT_LIST_HEAD(&vma_area_list->h); f.fd = open_proc(pid, "smaps"); if (f.fd < 0) goto err_n; if (bfdopenr(&f)) goto err_n; map_files_dir = opendir_proc(pid, "map_files"); if (!map_files_dir) /* old kernel? */ goto err; while (1) { int num, path_off; bool eof; char *str; str = breadline(&f); if (IS_ERR(str)) goto err; eof = (str == NULL); if (!eof && !__is_vma_range_fmt(str)) { if (!strncmp(str, "Nonlinear", 9)) { BUG_ON(!vma_area); pr_err("Nonlinear mapping found %016"PRIx64"-%016"PRIx64"\n", vma_area->e->start, vma_area->e->end); /* * VMA is already on list and will be * freed later as list get destroyed. 
*/ vma_area = NULL; goto err; } else if (!strncmp(str, "VmFlags: ", 9)) { BUG_ON(!vma_area); parse_vma_vmflags(&str[9], vma_area); continue; } else continue; } if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) goto err; if (eof) break; vma_area = alloc_vma_area(); if (!vma_area) goto err; num = sscanf(str, "%lx-%lx %c%c%c%c %lx %x:%x %lu %n", &start, &end, &r, &w, &x, &s, &pgoff, &vfi.dev_maj, &vfi.dev_min, &vfi.ino, &path_off); if (num < 10) { pr_err("Can't parse: %s\n", str); goto err; } vma_area->e->start = start; vma_area->e->end = end; vma_area->e->pgoff = pgoff; vma_area->e->prot = PROT_NONE; if (task_size_check(pid, vma_area->e)) goto err; if (r == 'r') vma_area->e->prot |= PROT_READ; if (w == 'w') vma_area->e->prot |= PROT_WRITE; if (x == 'x') vma_area->e->prot |= PROT_EXEC; if (s == 's') vma_area->e->flags = MAP_SHARED; else if (s == 'p') vma_area->e->flags = MAP_PRIVATE; else { pr_err("Unexpected VMA met (%c)\n", s); goto err; } if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) || vma_entry_is(vma_area->e, VMA_FILE_SHARED)) { if (dump_filemap && dump_filemap(vma_area, vm_file_fd)) goto err; } else if (vma_entry_is(vma_area->e, VMA_AREA_AIORING)) vma_area_list->nr_aios++; } vma_area = NULL; ret = 0; err: bclose(&f); err_n: close_safe(&vm_file_fd); if (map_files_dir) closedir(map_files_dir); xfree(vma_area); return ret; } int parse_pid_stat(pid_t pid, struct proc_pid_stat *s) { char *tok, *p; int fd; int n; fd = open_proc(pid, "stat"); if (fd < 0) return -1; n = read(fd, buf, BUF_SIZE); close(fd); if (n < 1) { pr_err("stat for %d is corrupted\n", pid); return -1; } memset(s, 0, sizeof(*s)); tok = strchr(buf, ' '); if (!tok) goto err; *tok++ = '\0'; if (*tok != '(') goto err; s->pid = atoi(buf); p = strrchr(tok + 1, ')'); if (!p) goto err; *tok = '\0'; *p = '\0'; strlcpy(s->comm, tok + 1, sizeof(s->comm)); n = sscanf(p + 1, " 
%c %d %d %d %d %d %u %lu %lu %lu %lu " "%lu %lu %ld %ld %ld %ld %d %d %llu %lu %ld %lu %lu %lu %lu " "%lu %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld " "%lu %lu %lu %lu %lu %lu %lu %d", &s->state, &s->ppid, &s->pgid, &s->sid, &s->tty_nr, &s->tty_pgrp, &s->flags, &s->min_flt, &s->cmin_flt, &s->maj_flt, &s->cmaj_flt, &s->utime, &s->stime, &s->cutime, &s->cstime, &s->priority, &s->nice, &s->num_threads, &s->zero0, &s->start_time, &s->vsize, &s->mm_rss, &s->rsslim, &s->start_code, &s->end_code, &s->start_stack, &s->esp, &s->eip, &s->sig_pending, &s->sig_blocked, &s->sig_ignored, &s->sig_handled, &s->wchan, &s->zero1, &s->zero2, &s->exit_signal, &s->task_cpu, &s->rt_priority, &s->policy, &s->delayacct_blkio_ticks, &s->gtime, &s->cgtime, &s->start_data, &s->end_data, &s->start_brk, &s->arg_start, &s->arg_end, &s->env_start, &s->env_end, &s->exit_code); if (n < 50) goto err; return 0; err: pr_err("Parsing %d's stat failed (#fields do not match)\n", pid); return -1; } int prepare_loginuid(unsigned int value, unsigned int loglevel) { int fd, ret = 0; char buf[11]; /* 4294967295 is maximum for u32 */ fd = open_proc_rw(PROC_SELF, "loginuid"); if (fd < 0) return -1; snprintf(buf, 11, "%u", value); if (write(fd, buf, 11) < 0) { print_on_level(loglevel, "Write %s to /proc/self/loginuid failed: %s", buf, strerror(errno)); ret = -1; } close(fd); return ret; } unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent) { int fd; ssize_t num; *err = 0; fd = __open_proc(pid, (ignore_noent) ? 
ENOENT : 0, O_RDONLY, "loginuid"); if (fd < 0) goto out; num = read(fd, buf, 10); close(fd); if (num < 0) { pr_perror("Unable to read /proc/%d/loginuid", pid); goto out; } buf[num] = '\0'; return strtol(buf, NULL, 10); out: *err = -1; return INVALID_UID; /* unset value */ } int parse_pid_oom_score_adj(pid_t pid, int *err) { int fd; ssize_t num; *err = 0; fd = open_proc(pid, "oom_score_adj"); if (fd < 0) goto out; num = read(fd, buf, 10); close(fd); if (num < 0) { pr_perror("Unable to read /proc/%d/oom_score_adj", pid); goto out; } buf[num] = '\0'; return strtol(buf, NULL, 10); out: *err = -1; return 0; } static int ids_parse(char *str, unsigned int *arr) { char *end; arr[0] = strtol(str, &end, 10); arr[1] = strtol(end + 1, &end, 10); arr[2] = strtol(end + 1, &end, 10); arr[3] = strtol(end + 1, &end, 10); if (*end) return -1; else return 0; } static int cap_parse(char *str, unsigned int *res) { int i, ret; for (i = 0; i < PROC_CAP_SIZE; i++) { ret = sscanf(str, "%08x", &res[PROC_CAP_SIZE - 1 - i]); if (ret != 1) return -1; str += 8; } return 0; } int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) { struct proc_status_creds *cr = container_of(ss, struct proc_status_creds, s); struct bfd f; int done = 0; int ret = -1; char *str; bool parsed_seccomp = false; f.fd = open_proc(pid, "status"); if (f.fd < 0) return -1; cr->s.sigpnd = 0; cr->s.shdpnd = 0; if (bfdopenr(&f)) return -1; while (done < 12) { str = breadline(&f); if (str == NULL) break; if (IS_ERR(str)) goto err_parse; if (!strncmp(str, "State:", 6)) { cr->s.state = str[7]; done++; continue; } if (!strncmp(str, "PPid:", 5)) { if (sscanf(str, "PPid:\t%d", &cr->s.ppid) != 1) { pr_err("Unable to parse: %s\n", str); goto err_parse; } done++; continue; } if (!strncmp(str, "Uid:", 4)) { if (ids_parse(str + 5, cr->uids)) goto err_parse; done++; continue; } if (!strncmp(str, "Gid:", 4)) { if (ids_parse(str + 5, cr->gids)) goto err_parse; done++; continue; } if (!strncmp(str, "CapInh:", 7)) { if 
(cap_parse(str + 8, cr->cap_inh)) goto err_parse; done++; continue; } if (!strncmp(str, "CapEff:", 7)) { if (cap_parse(str + 8, cr->cap_eff)) goto err_parse; done++; continue; } if (!strncmp(str, "CapPrm:", 7)) { if (cap_parse(str + 8, cr->cap_prm)) goto err_parse; done++; continue; } if (!strncmp(str, "CapBnd:", 7)) { if (cap_parse(str + 8, cr->cap_bnd)) goto err_parse; done++; continue; } if (!strncmp(str, "Seccomp:", 8)) { if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { goto err_parse; } parsed_seccomp = true; done++; continue; } if (!strncmp(str, "ShdPnd:", 7)) { unsigned long long sigpnd; if (sscanf(str + 7, "%llx", &sigpnd) != 1) goto err_parse; cr->s.shdpnd |= sigpnd; done++; continue; } if (!strncmp(str, "SigPnd:", 7)) { unsigned long long sigpnd; if (sscanf(str + 7, "%llx", &sigpnd) != 1) goto err_parse; cr->s.sigpnd |= sigpnd; done++; continue; } } /* seccomp is optional */ if (done >= 11 || (done == 10 && !parsed_seccomp)) ret = 0; err_parse: if (ret) pr_err("Error parsing proc status file\n"); bclose(&f); return ret; } struct opt2flag { char *opt; unsigned flag; }; static bool sb_opt_cb(char *opt, char *unknown, size_t *uoff) { unsigned int id; if (sscanf(opt, "gid=%d", &id) == 1) { *uoff += sprintf(unknown + *uoff, "gid=%d", userns_gid(id)); unknown[*uoff] = ','; (*uoff)++; return true; } else if (sscanf(opt, "uid=%d", &id) == 1) { *uoff += sprintf(unknown + *uoff, "uid=%d", userns_uid(id)); unknown[*uoff] = ','; (*uoff)++; return true; } return false; } static int do_opt2flag(char *opt, unsigned *flags, const struct opt2flag *opts, char *unknown, bool (*cb)(char *opt, char *unknown, size_t *uoff)) { int i; char *end; size_t uoff = 0; while (1) { end = strchr(opt, ','); if (end) *end = '\0'; for (i = 0; opts[i].opt != NULL; i++) if (!strcmp(opts[i].opt, opt)) { (*flags) |= opts[i].flag; break; } if (opts[i].opt == NULL && cb && !cb(opt, unknown, &uoff)) { if (!unknown) { pr_err("Unknown option [%s]\n", opt); return -1; } strcpy(unknown + uoff, 
opt); uoff += strlen(opt); unknown[uoff] = ','; uoff++; } if (!end) { if (uoff) uoff--; if (unknown) unknown[uoff] = '\0'; break; } else opt = end + 1; } return 0; } static int parse_mnt_flags(char *opt, unsigned *flags) { static const struct opt2flag mnt_opt2flag[] = { { "rw", 0, }, { "ro", MS_RDONLY, }, { "nosuid", MS_NOSUID, }, { "nodev", MS_NODEV, }, { "noexec", MS_NOEXEC, }, { "noatime", MS_NOATIME, }, { "nodiratime", MS_NODIRATIME, }, { "relatime", MS_RELATIME, }, { }, }; if (do_opt2flag(opt, flags, mnt_opt2flag, NULL, NULL)) return -1; /* Otherwise the kernel assumes RELATIME by default */ if ((*flags & (MS_RELATIME | MS_NOATIME)) == 0) *flags |= MS_STRICTATIME; return 0; } static int parse_sb_opt(char *opt, unsigned *flags, char *uopt) { static const struct opt2flag sb_opt2flag[] = { { "rw", 0, }, { "ro", MS_RDONLY, }, { "sync", MS_SYNC, }, { "dirsync", MS_DIRSYNC, }, { "mad", MS_MANDLOCK, }, { }, }; return do_opt2flag(opt, flags, sb_opt2flag, uopt, sb_opt_cb); } static int parse_mnt_opt(char *str, struct mount_info *mi, int *off) { char *istr = str, *end; while (1) { end = strchr(str, ' '); if (!end) { pr_err("Error parsing mount options\n"); return -1; } *end = '\0'; if (!strncmp(str, "-", 1)) break; else if (!strncmp(str, "shared:", 7)) { mi->flags |= MS_SHARED; mi->shared_id = atoi(str + 7); } else if (!strncmp(str, "master:", 7)) { mi->flags |= MS_SLAVE; mi->master_id = atoi(str + 7); } else if (!strncmp(str, "propagate_from:", 15)) { /* skip */; } else if (!strncmp(str, "unbindable", 11)) mi->flags |= MS_UNBINDABLE; else { pr_err("Unknown option [%s]\n", str); return -1; } str = end + 1; } *off = end - istr + 1; return 0; } /* * mountinfo contains mangled paths. space, tab and back slash were replaced * with usual octal escape. This function replaces these symbols back. 
*/ static void cure_path(char *path) { int i, len, off = 0; if (strchr(path, '\\') == NULL) /* fast path */ return; len = strlen(path); for (i = 0; i < len; i++) { if (!strncmp(path + i, "\\040", 4)) { path[i - off] = ' '; goto replace; } else if (!strncmp(path + i, "\\011", 4)) { path[i - off] = '\t'; goto replace; } else if (!strncmp(path + i, "\\134", 4)) { path[i - off] = '\\'; goto replace; } if (off) path[i - off] = path[i]; continue; replace: off += 3; i += 3; } path[len - off] = 0; } static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) { struct fd_link root_link; unsigned int kmaj, kmin; int ret, n; char *sub, *opt = NULL; new->mountpoint = xmalloc(PATH_MAX); if (new->mountpoint == NULL) goto err; new->mountpoint[0] = '.'; ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; cure_path(new->mountpoint); cure_path(new->root); root_link.len = strlen(new->root); strcpy(root_link.name, new->root); if (strip_deleted(&root_link)) { strcpy(new->root, root_link.name); new->deleted = true; } new->mountpoint = xrealloc(new->mountpoint, strlen(new->mountpoint) + 1); if (!new->mountpoint) goto err; new->ns_mountpoint = new->mountpoint; new->is_ns_root = is_root(new->ns_mountpoint + 1); new->s_dev = new->s_dev_rt = MKKDEV(kmaj, kmin); new->flags = 0; if (parse_mnt_flags(opt, &new->flags)) goto err; free(opt); /* we are going to reallocate/reuse this buffer */ opt = NULL; str += n; if (parse_mnt_opt(str, new, &n)) goto err; str += n; ret = sscanf(str, "%ms %ms %ms", fsname, &new->source, &opt); if (ret == 2) { /* src may be empty */ opt = new->source; new->source = xstrdup(""); if (new->source == NULL) goto err; } else if (ret != 3) goto err; cure_path(new->source); new->fsname = xstrdup(*fsname); if (!new->fsname) goto err; /* * The kernel reports "subtypes" sometimes and the valid * type-vs-subtype delimiter is the dot symbol. 
We disregard * any subtypes for the purpose of finding the fstype. */ sub = strchr(*fsname, '.'); if (sub) *sub = 0; new->fstype = find_fstype_by_name(*fsname); new->options = xmalloc(strlen(opt) + 1); if (!new->options) goto err; if (parse_sb_opt(opt, &new->sb_flags, new->options)) goto err; ret = 0; ret: xfree(opt); return ret; err: ret = -1; goto ret; } static LIST_HEAD(skip_mount_list); struct str_node { struct list_head node; char string[]; }; bool add_skip_mount(const char *mountpoint) { struct str_node *skip = xmalloc(sizeof(struct str_node) + strlen(mountpoint) + 1); if (!skip) return false; strcpy(skip->string, mountpoint); list_add(&skip->node, &skip_mount_list); return true; } static bool should_skip_mount(const char *mountpoint) { struct str_node *pos; list_for_each_entry(pos, &skip_mount_list, node) { if (strcmp(mountpoint, pos->string) == 0) return true; } return false; } struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) { struct mount_info *list = NULL; FILE *f; f = fopen_proc(pid, "mountinfo"); if (!f) return NULL; while (fgets(buf, BUF_SIZE, f)) { struct mount_info *new; int ret = -1; char *fsname = NULL; new = mnt_entry_alloc(); if (!new) goto end; new->nsid = nsid; ret = parse_mountinfo_ent(buf, new, &fsname); if (ret < 0) { pr_err("Bad format in %d mountinfo: '%s'\n", pid, buf); goto end; } /* * Drop this mountpoint early, so that lookup_mnt_id/etc will * fail loudly at "dump" stage if an opened file or another mnt * depends on this one. 
*/ if (for_dump && should_skip_mount(new->mountpoint + 1)) { pr_info("\tskip %s @ %s\n", fsname, new->mountpoint); mnt_entry_free(new); new = NULL; goto end; } pr_info("\ttype %s source %s mnt_id %d s_dev %#x %s @ %s flags %#x options %s\n", fsname, new->source, new->mnt_id, new->s_dev, new->root, new->mountpoint, new->flags, new->options); if (new->fstype->parse) { ret = new->fstype->parse(new); if (ret < 0) { pr_err("Failed to parse FS specific data on %s\n", new->mountpoint); mnt_entry_free(new); new = NULL; goto end; } if (ret > 0) { pr_info("\tskipping fs mounted at %s\n", new->mountpoint + 1); mnt_entry_free(new); new = NULL; ret = 0; goto end; } } end: if (fsname) free(fsname); if (new) { new->next = list; list = new; } if (ret) goto err; } out: fclose(f); return list; err: while (list) { struct mount_info *next = list->next; mnt_entry_free(list); list = next; } goto out; } static char nybble(const char n) { if (n >= '0' && n <= '9') return n - '0'; else if (n >= 'A' && n <= 'F') return n - ('A' - 10); else if (n >= 'a' && n <= 'f') return n - ('a' - 10); return 0; } static void parse_fhandle_encoded(char *tok, FhEntry *fh) { char *d = (char *)fh->handle; int i = 0; memzero(d, pb_repeated_size(fh, handle)); while (*tok == ' ') tok++; while (*tok) { if (i >= pb_repeated_size(fh, handle)) break; d[i++] = (nybble(tok[0]) << 4) | nybble(tok[1]); if (tok[1]) tok += 2; else break; } } static int parse_timerfd(struct bfd *f, char *str, TimerfdEntry *tfy) { /* * Format is * clockid: 0 * ticks: 0 * settime flags: 01 * it_value: (0, 49406829) * it_interval: (1, 0) */ if (sscanf(str, "clockid: %d", &tfy->clockid) != 1) goto parse_err; if (verify_timerfd(tfy) < 0) goto parse_err; str = breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "ticks: %llu", (unsigned long long *)&tfy->ticks) != 1) goto parse_err; str = breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "settime flags: 0%o", &tfy->settime_flags) != 1) goto parse_err; str = 
breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "it_value: (%llu, %llu)", (unsigned long long *)&tfy->vsec, (unsigned long long *)&tfy->vnsec) != 2) goto parse_err; str = breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "it_interval: (%llu, %llu)", (unsigned long long *)&tfy->isec, (unsigned long long *)&tfy->insec) != 2) goto parse_err; return 0; parse_err: return -1; nodata: pr_err("No data left in proc file while parsing timerfd\n"); goto parse_err; } #define fdinfo_field(str, field) !strncmp(str, field":", sizeof(field)) static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked); static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) { struct bfd f; char *str; bool entry_met = false; int ret, exit_code = -1;; f.fd = open_proc(pid, "fdinfo/%d", fd); if (f.fd < 0) return -1; if (bfdopenr(&f)) return -1; while (1) { str = breadline(&f); if (!str) break; if (IS_ERR(str)) goto out; if (fdinfo_field(str, "pos") || fdinfo_field(str, "flags") || fdinfo_field(str, "mnt_id")) { unsigned long long val; struct fdinfo_common *fdinfo = arg; if (type != FD_TYPES__UND) continue; ret = sscanf(str, "%*s %lli", &val); if (ret != 1) goto parse_err; if (fdinfo_field(str, "pos")) fdinfo->pos = val; else if (fdinfo_field(str, "flags")) fdinfo->flags = val; else if (fdinfo_field(str, "mnt_id")) fdinfo->mnt_id = val; entry_met = true; continue; } if (fdinfo_field(str, "lock")) { struct file_lock *fl; struct fdinfo_common *fdinfo = arg; if (type != FD_TYPES__UND) continue; fl = alloc_file_lock(); if (!fl) { pr_perror("Alloc file lock failed!"); goto out; } if (parse_file_lock_buf(str + 6, fl, 0)) { xfree(fl); goto parse_err; } pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", fl->fl_id, fl->fl_kind, fl->fl_ltype, fl->fl_owner, fl->maj, fl->min, fl->i_no, fl->start, fl->end); if (fl->fl_kind == FL_UNKNOWN) { pr_err("Unknown file lock!\n"); xfree(fl); goto out; } fl->real_owner = fdinfo->owner; 
fl->owners_fd = fd; list_add_tail(&fl->list, &file_lock_list); } if (type == FD_TYPES__UND) continue; if (fdinfo_field(str, "eventfd-count")) { EventfdFileEntry *efd = arg; if (type != FD_TYPES__EVENTFD) goto parse_err; ret = sscanf(str, "eventfd-count: %"PRIx64, &efd->counter); if (ret != 1) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "clockid")) { TimerfdEntry *tfe = arg; if (type != FD_TYPES__TIMERFD) goto parse_err; ret = parse_timerfd(&f, str, tfe); if (ret) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "tfd")) { EventpollFileEntry *epfe = arg; EventpollTfdEntry *e; int i; if (type != FD_TYPES__EVENTPOLL) goto parse_err; e = xmalloc(sizeof(EventpollTfdEntry)); if (!e) goto out; eventpoll_tfd_entry__init(e); ret = sscanf(str, "tfd: %d events: %x data: %"PRIx64, &e->tfd, &e->events, &e->data); if (ret != 3) { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } i = epfe->n_tfd++; if (xrealloc_safe(&epfe->tfd, epfe->n_tfd * sizeof(EventpollTfdEntry *))) goto out; epfe->tfd[i] = e; entry_met = true; continue; } if (fdinfo_field(str, "sigmask")) { SignalfdEntry *sfd = arg; if (type != FD_TYPES__SIGNALFD) goto parse_err; ret = sscanf(str, "sigmask: %Lx", (unsigned long long *)&sfd->sigmask); if (ret != 1) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "fanotify flags")) { FanotifyFileEntry *fe = arg; if (type != FD_TYPES__FANOTIFY) goto parse_err; ret = sscanf(str, "fanotify flags:%x event-flags:%x", &fe->faflags, &fe->evflags); if (ret != 2) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "fanotify ino")) { void *buf, *ob; FanotifyFileEntry *fe = arg; FanotifyMarkEntry *me; int hoff = 0, i; if (type != FD_TYPES__FANOTIFY) goto parse_err; ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + sizeof(FanotifyInodeMarkEntry) + sizeof(FhEntry) + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); if (!buf) goto out; me = xptr_pull(&buf, FanotifyMarkEntry); 
fanotify_mark_entry__init(me); me->ie = xptr_pull(&buf, FanotifyInodeMarkEntry); fanotify_inode_mark_entry__init(me->ie); me->ie->f_handle = xptr_pull(&buf, FhEntry); fh_entry__init(me->ie->f_handle); me->ie->f_handle->n_handle = FH_ENTRY_SIZES__min_entries; me->ie->f_handle->handle = xptr_pull_s(&buf, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); ret = sscanf(str, "fanotify ino:%"PRIx64" sdev:%x mflags:%x mask:%x ignored_mask:%x " "fhandle-bytes:%x fhandle-type:%x f_handle: %n", &me->ie->i_ino, &me->s_dev, &me->mflags, &me->mask, &me->ignored_mask, &me->ie->f_handle->bytes, &me->ie->f_handle->type, &hoff); if (ret != 7 || hoff == 0) { xfree(ob); goto parse_err; } parse_fhandle_encoded(str + hoff, me->ie->f_handle); me->type = MARK_TYPE__INODE; i = fe->n_mark++; if (xrealloc_safe(&fe->mark, fe->n_mark * sizeof(FanotifyMarkEntry *))) { xfree(ob); goto out; } fe->mark[i] = me; entry_met = true; continue; } if (fdinfo_field(str, "fanotify mnt_id")) { void *buf, *ob; FanotifyFileEntry *fe = arg; FanotifyMarkEntry *me; int i; if (type != FD_TYPES__FANOTIFY) goto parse_err; ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + sizeof(FanotifyMountMarkEntry)); if (!buf) goto out; me = xptr_pull(&buf, FanotifyMarkEntry); fanotify_mark_entry__init(me); me->me = xptr_pull(&buf, FanotifyMountMarkEntry); fanotify_mount_mark_entry__init(me->me); ret = sscanf(str, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x", &me->me->mnt_id, &me->mflags, &me->mask, &me->ignored_mask); if (ret != 4) { xfree(ob); goto parse_err; } me->type = MARK_TYPE__MOUNT; i = fe->n_mark++; if (xrealloc_safe(&fe->mark, fe->n_mark * sizeof(FanotifyMarkEntry *))) { xfree(ob); goto out; } fe->mark[i] = me; entry_met = true; continue; } if (fdinfo_field(str, "inotify wd")) { void *buf, *ob; InotifyFileEntry *ie = arg; InotifyWdEntry *ify; int hoff, i; if (type != FD_TYPES__INOTIFY) goto parse_err; ob = buf = xmalloc(sizeof(InotifyWdEntry) + sizeof(FhEntry) + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); 
if (!buf) goto out; ify = xptr_pull(&buf, InotifyWdEntry); inotify_wd_entry__init(ify); ify->f_handle = xptr_pull(&buf, FhEntry); fh_entry__init(ify->f_handle); ify->f_handle->n_handle = FH_ENTRY_SIZES__min_entries; ify->f_handle->handle = xptr_pull_s(&buf, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); ret = sscanf(str, "inotify wd:%x ino:%"PRIx64" sdev:%x " "mask:%x ignored_mask:%x " "fhandle-bytes:%x fhandle-type:%x " "f_handle: %n", &ify->wd, &ify->i_ino, &ify->s_dev, &ify->mask, &ify->ignored_mask, &ify->f_handle->bytes, &ify->f_handle->type, &hoff); if (ret != 7) { xfree(ob); goto parse_err; } parse_fhandle_encoded(str + hoff, ify->f_handle); i = ie->n_wd++; if (xrealloc_safe(&ie->wd, ie->n_wd * sizeof(InotifyWdEntry *))) { xfree(ob); goto out; } ie->wd[i] = ify; entry_met = true; continue; } } exit_code = 0; if (entry_met) goto out; /* * An eventpoll/inotify file may have no target fds set thus * resulting in no tfd: lines in proc. This is normal. */ if (type == FD_TYPES__EVENTPOLL || type == FD_TYPES__INOTIFY) goto out; pr_err("No records of type %d found in fdinfo file\n", type); parse_err: exit_code = -1; pr_perror("%s: error parsing [%s] for %d", __func__, str, type); out: bclose(&f); return exit_code; } int parse_fdinfo_pid(int pid, int fd, int type, void *arg) { return parse_fdinfo_pid_s(pid, fd, type, arg); } int parse_fdinfo(int fd, int type, void *arg) { return parse_fdinfo_pid_s(PROC_SELF, fd, type, arg); } int get_fd_mntid(int fd, int *mnt_id) { struct fdinfo_common fdinfo = { .mnt_id = -1}; if (parse_fdinfo(fd, FD_TYPES__UND, &fdinfo)) return -1; *mnt_id = fdinfo.mnt_id; return 0; } static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) { int num; char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { num = sscanf(buf, "%lld:%s %s %s 
%d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } if (num < 10) { pr_err("Invalid file lock info (%d): %s\n", num, buf); return -1; } if (!strcmp(fl_flag, "POSIX")) fl->fl_kind = FL_POSIX; else if (!strcmp(fl_flag, "FLOCK")) fl->fl_kind = FL_FLOCK; else if (!strcmp(fl_flag, "OFDLCK")) fl->fl_kind = FL_OFD; else fl->fl_kind = FL_UNKNOWN; if (!strcmp(fl_type, "MSNFS")) { fl->fl_ltype |= LOCK_MAND; if (!strcmp(fl_option, "READ")) { fl->fl_ltype |= LOCK_READ; } else if (!strcmp(fl_option, "RW")) { fl->fl_ltype |= LOCK_RW; } else if (!strcmp(fl_option, "WRITE")) { fl->fl_ltype |= LOCK_WRITE; } else { pr_err("Unknown lock option!\n"); return -1; } } else { if (!strcmp(fl_option, "UNLCK")) { fl->fl_ltype |= F_UNLCK; } else if (!strcmp(fl_option, "WRITE")) { fl->fl_ltype |= F_WRLCK; } else if (!strcmp(fl_option, "READ")) { fl->fl_ltype |= F_RDLCK; } else { pr_err("Unknown lock option!\n"); return -1; } } return 0; } static bool pid_in_pstree(pid_t pid) { return pstree_item_by_real(pid) != NULL; } int parse_file_locks(void) { struct file_lock *fl; FILE *fl_locks; int exit_code = -1; bool is_blocked; if (kdat.has_fdinfo_lock) return 0; fl_locks = fopen_proc(PROC_GEN, "locks"); if (!fl_locks) return -1; while (fgets(buf, BUF_SIZE, fl_locks)) { is_blocked = strstr(buf, "->") != NULL; fl = alloc_file_lock(); if (!fl) { pr_perror("Alloc file lock failed!"); goto err; } if (parse_file_lock_buf(buf, fl, is_blocked)) { xfree(fl); goto err; } pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", fl->fl_id, fl->fl_kind, fl->fl_ltype, fl->fl_owner, fl->maj, fl->min, fl->i_no, fl->start, fl->end); if (fl->fl_kind == FL_UNKNOWN) { pr_err("Unknown file lock: %s!\n", buf); xfree(fl); goto err; } if (is_blocked) { /* * All target processes are stopped in this moment and * can't wait any locks. 
*/ pr_debug("Skip blocked processes\n"); xfree(fl); continue; } if ((fl->fl_kind == FL_POSIX) && !pid_in_pstree(fl->fl_owner)) { /* * We only care about tasks which are taken * into dump, so we only collect file locks * belong to these tasks. */ xfree(fl); continue; } list_add_tail(&fl->list, &file_lock_list); } exit_code = 0; err: fclose(fl_locks); return exit_code; } void free_posix_timers(struct proc_posix_timers_stat *st) { while (!list_empty(&st->timers)) { struct proc_posix_timer *timer; timer = list_first_entry(&st->timers, struct proc_posix_timer, list); list_del(&timer->list); xfree(timer); } } int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args) { int exit_code = -1; int pid_t; int i = 0; struct bfd f; char *s; char sigpid[7]; char tidpid[4]; struct proc_posix_timer *timer = NULL; INIT_LIST_HEAD(&args->timers); args->timer_n = 0; f.fd = open_proc(pid, "timers"); if (f.fd < 0) return -1; if (bfdopenr(&f)) return -1; while (1) { char pbuf[17]; /* 16 + eol */ s = breadline(&f); if (!s) break; if (IS_ERR(s)) goto err; switch (i % 4) { case 0: timer = xzalloc(sizeof(struct proc_posix_timer)); if (timer == NULL) goto err; if (sscanf(s, "ID: %ld", &timer->spt.it_id) != 1) goto err; break; case 1: if (sscanf(s, "signal: %d/%16s", &timer->spt.si_signo, pbuf) != 2) goto err; break; case 2: if (sscanf(s, "notify: %6[a-z]/%3[a-z].%d\n", sigpid, tidpid, &pid_t) != 3) goto err; break; case 3: if (sscanf(s, "ClockID: %d\n", &timer->spt.clock_id) != 1) goto err; timer->spt.sival_ptr = NULL; if (sscanf(pbuf, "%p", &timer->spt.sival_ptr) != 1 && strcmp(pbuf, "(null)")) { pr_err("Unable to parse '%s'\n", pbuf); goto err; } if ( tidpid[0] == 't') { timer->spt.it_sigev_notify = SIGEV_THREAD_ID; } else { switch (sigpid[0]) { case 's' : timer->spt.it_sigev_notify = SIGEV_SIGNAL; break; case 't' : timer->spt.it_sigev_notify = SIGEV_THREAD; break; default : timer->spt.it_sigev_notify = SIGEV_NONE; break; } } list_add(&timer->list, &args->timers); timer = NULL; 
args->timer_n++; break; } i++; } exit_code = 0; out: bclose(&f); return exit_code; err: xfree(timer); free_posix_timers(args); pr_perror("Parse error in posix timers proc file!"); goto out; } int parse_threads(int pid, struct pid **_t, int *_n) { struct dirent *de; DIR *dir; struct pid *t = NULL; int nr = 1; if (*_t) t = *_t; dir = opendir_proc(pid, "task"); if (!dir) return -1; while ((de = readdir(dir))) { struct pid *tmp; /* We expect numbers only here */ if (de->d_name[0] == '.') continue; if (*_t == NULL) { tmp = xrealloc(t, nr * sizeof(struct pid)); if (!tmp) { xfree(t); return -1; } t = tmp; t[nr - 1].ns[0].virt = -1; } t[nr - 1].real = atoi(de->d_name); t[nr - 1].state = TASK_THREAD; nr++; } closedir(dir); if (*_t == NULL) { *_t = t; *_n = nr - 1; } else BUG_ON(nr - 1 != *_n); return 0; } int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n) { while (fgets(buf, BUF_SIZE, f)) { struct cg_ctl *ncc, *cc; char *name, *path = NULL, *e; ncc = xmalloc(sizeof(*cc)); if (!ncc) goto err; /* * Typical output (':' is a separator here) * * 4:cpu,cpuacct:/ * 3:cpuset:/ * 2:name=systemd:/user.slice/user-1000.slice/session-1.scope */ name = strchr(buf, ':'); if (name) { path = strchr(++name, ':'); if (*name == ':') { /* * It's unified hierarchy. On kernels with legacy * tree this item is added automatically, so we * can just skip one. For those with full unified * support is on ... we need to write new code. */ xfree(ncc); continue; } } if (!name || !path) { pr_err("Failed parsing cgroup %s\n", buf); xfree(ncc); goto err; } e = strchr(name, '\n'); *path++ = '\0'; if (e) *e = '\0'; /* * Controllers and their props might be * configured the way some of them are * not taken into the image for migration * sake or container specifics. 
*/ if (cgp_should_skip_controller(name)) { pr_debug("cg-prop: Skipping controller %s\n", name); xfree(ncc); continue; } ncc->name = xstrdup(name); ncc->path = xstrdup(path); ncc->cgns_prefix = 0; if (!ncc->name || !ncc->path) { xfree(ncc->name); xfree(ncc->path); xfree(ncc); goto err; } list_for_each_entry(cc, retl, l) if (strcmp(cc->name, name) >= 0) break; list_add_tail(&ncc->l, &cc->l); (*n)++; } return 0; err: put_ctls(retl); return -1; } int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) { FILE *f; int ret; LIST_HEAD(internal); unsigned int n_internal = 0; struct cg_ctl *intern, *ext; f = fopen_proc(pid, "cgroup"); if (!f) return -1; ret = parse_cgroup_file(f, retl, n); fclose(f); if (ret < 0) return -1; /* No parasite args, we're dumping criu's cg set, so we don't need to * try and parse the "internal" cgroup set to find namespace * boundaries. */ if (!args) return 0; f = fmemopen(args->contents, strlen(args->contents), "r"); if (!f) { pr_perror("couldn't fmemopen cgroup buffer %s", args->contents); return -1; } ret = parse_cgroup_file(f, &internal, &n_internal); fclose(f); if (ret < 0) { pr_err("couldn't parse internal cgroup file\n"); return -1; } /* Here's where we actually compute the cgns prefix. Consider a task * in /foo/bar which has unshared its namespace at /foo. The internal * path is /bar, but the external path is /foo/bar, and the cgns * prefix is /foo. The algorithm is: * * // no cg ns unshare in this case * if (internal == external) * continue; * idx = find_suffix_pos(external, internal) * cgns_prefix = external[:idx] */ list_for_each_entry(intern, &internal, l) { list_for_each_entry(ext, retl, l) { char *pos; if (strcmp(ext->name, intern->name)) continue; /* If the cgroup namespace was unshared at / (or there * is no cgroup namespace relative to criu), the paths * are equal and we don't need to set a prefix. 
*/ if (!strcmp(ext->path, intern->path)) continue; /* +1 here to chop off the leading / */ pos = ext->path + strlen(ext->path) - strlen(intern->path+1); if (strcmp(pos, intern->path+1)) { pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path); ret = -1; goto out; } ext->cgns_prefix = pos - ext->path; if (ext->path[ext->cgns_prefix-1] == '/') ext->cgns_prefix--; } } out: put_ctls(&internal); return ret; } void put_ctls(struct list_head *l) { struct cg_ctl *c, *n; list_for_each_entry_safe(c, n, l, l) { xfree(c->name); xfree(c->path); xfree(c); } INIT_LIST_HEAD(l); } /* Parse and create all the real controllers. This does not include things with * the "name=" prefix, e.g. systemd. */ int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups) { int exit_code = -1; FILE *f; f = fopen_proc(PROC_SELF, "cgroup"); if (f == NULL) return -1; while (fgets(buf, BUF_SIZE, f)) { struct cg_controller *nc = NULL; char *controllers, *off; controllers = strchr(buf, ':'); if (!controllers) { pr_err("Unable to parse \"%s\"\n", buf); goto err; } controllers++; if (*controllers == ':') /* * Unified hier. See comment in parse_cgroup_file * for more details. 
*/ continue; off = strchr(controllers, ':'); if (!off) { pr_err("Unable to parse \"%s\"\n", buf); goto err; } *off = '\0'; while (1) { off = strchr(controllers, ','); if (off) *off = '\0'; if (!strncmp("name=", controllers, 5)) goto skip; if (!nc) { nc = new_controller(controllers); if (!nc) goto err; list_add_tail(&nc->l, cgroups); (*n_cgroups)++; } else { void *m; char *n; nc->n_controllers++; m = xrealloc(nc->controllers, sizeof(char *) * nc->n_controllers); if (!m) goto err; nc->controllers = m; n = xstrdup(controllers); if (!n) goto err; nc->controllers[nc->n_controllers-1] = n; } skip: if (!off) break; controllers = off + 1; } } exit_code = 0; err: fclose(f); return exit_code; } /* * If an OverlayFS mountpoint is found in the mountinfo table, * we enable opts.overlayfs, which is a workaround for the * OverlayFS Kernel bug. * * See fixup_overlayfs for details. */ int overlayfs_parse(struct mount_info *new) { opts.overlayfs = true; return 0; } /* * AUFS callback function to "fix up" the root pathname. * See sysfs_parse.c for details. 
*/ int aufs_parse(struct mount_info *new) { int ret = 0; if (!strcmp(new->mountpoint, "./")) { opts.aufs = true; ret = parse_aufs_branches(new); } return ret; } int parse_children(pid_t pid, pid_t **_c, int *_n) { pid_t *ch = NULL; int nr = 0; DIR *dir; struct dirent *de; struct bfd f; dir = opendir_proc(pid, "task"); if (dir == NULL) return -1; while ((de = readdir(dir))) { char *pos, *end; if (dir_dots(de)) continue; f.fd = open_proc(pid, "task/%s/children", de->d_name); if (f.fd < 0) goto err; if (bfdopenr(&f)) goto err; while (1) { pid_t val, *tmp; pos = breadchr(&f, ' '); if (IS_ERR(pos)) goto err_close; if (pos == NULL) break; val = strtol(pos, &end, 0); if (*end != 0 && *end != ' ') { pr_err("Unable to parse %s\n", end); goto err_close; } tmp = xrealloc(ch, (nr + 1) * sizeof(pid_t)); if (!tmp) goto err_close; ch = tmp; ch[nr] = val; nr++; } bclose(&f); } *_c = ch; *_n = nr; closedir(dir); return 0; err_close: bclose(&f); err: closedir(dir); xfree(ch); return -1; } criu-3.6/criu/protobuf-desc.c000066400000000000000000000064731317335042600162240ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "common/compiler.h" #include "log.h" #include "protobuf-desc.h" #include "images/inventory.pb-c.h" #include "images/stats.pb-c.h" #include "images/regfile.pb-c.h" #include "images/ext-file.pb-c.h" #include "images/ns.pb-c.h" #include "images/eventfd.pb-c.h" #include "images/eventpoll.pb-c.h" #include "images/signalfd.pb-c.h" #include "images/fsnotify.pb-c.h" #include "images/core.pb-c.h" #include "images/mm.pb-c.h" #include "images/pipe.pb-c.h" #include "images/fifo.pb-c.h" #include "images/fdinfo.pb-c.h" #include "images/pipe-data.pb-c.h" #include "images/pstree.pb-c.h" #include "images/sa.pb-c.h" #include "images/sk-unix.pb-c.h" #include "images/sk-inet.pb-c.h" #include "images/packet-sock.pb-c.h" #include "images/sk-packet.pb-c.h" #include "images/creds.pb-c.h" #include "images/timer.pb-c.h" #include 
"images/utsns.pb-c.h" #include "images/ipc-var.pb-c.h" #include "images/ipc-shm.pb-c.h" #include "images/ipc-msg.pb-c.h" #include "images/ipc-sem.pb-c.h" #include "images/fs.pb-c.h" #include "images/remap-file-path.pb-c.h" #include "images/ghost-file.pb-c.h" #include "images/mnt.pb-c.h" #include "images/netdev.pb-c.h" #include "images/tcp-stream.pb-c.h" #include "images/tty.pb-c.h" #include "images/file-lock.pb-c.h" #include "images/rlimit.pb-c.h" #include "images/pagemap.pb-c.h" #include "images/siginfo.pb-c.h" #include "images/sk-netlink.pb-c.h" #include "images/vma.pb-c.h" #include "images/tun.pb-c.h" #include "images/cgroup.pb-c.h" #include "images/timerfd.pb-c.h" #include "images/cpuinfo.pb-c.h" #include "images/userns.pb-c.h" #include "images/seccomp.pb-c.h" #include "images/binfmt-misc.pb-c.h" #include "images/autofs.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; #define CR_PB_DESC(__type, __vtype, __ftype) \ CR_PB_MDESC_INIT(cr_pb_descs[PB_##__type], \ __vtype##Entry, \ __ftype##_entry) #define PB_PACK_TYPECHECK(__o, __fn) ({ if (0) __fn##__pack(__o, NULL); (pb_pack_t)&__fn##__pack; }) #define PB_GPS_TYPECHECK(__o, __fn) ({ if (0) __fn##__get_packed_size(__o); (pb_getpksize_t)&__fn##__get_packed_size; }) #define PB_UNPACK_TYPECHECK(__op, __fn) ({ if (0) *__op = __fn##__unpack(NULL, 0, NULL); (pb_unpack_t)&__fn##__unpack; }) #define PB_FREE_TYPECHECK(__o, __fn) ({ if (0) __fn##__free_unpacked(__o, NULL); (pb_free_t)&__fn##__free_unpacked; }) /* * This should be explicitly "called" to do type-checking */ #define CR_PB_MDESC_INIT(__var, __type, __name) \ do { \ __var.getpksize = PB_GPS_TYPECHECK((__type *)NULL, __name); \ __var.pack = PB_PACK_TYPECHECK((__type *)NULL, __name); \ __var.unpack = PB_UNPACK_TYPECHECK((__type **)NULL, __name); \ __var.free = PB_FREE_TYPECHECK((__type *)NULL, __name); \ __var.pb_desc = &__name##__descriptor; \ } while (0) void cr_pb_init(void) { CR_PB_DESC(IDS, TaskKobjIds, task_kobj_ids); CR_PB_DESC(SIGACT, Sa, sa); 
CR_PB_DESC(SK_QUEUES, SkPacket, sk_packet); CR_PB_MDESC_INIT(cr_pb_descs[PB_IPCNS_MSG], IpcMsg, ipc_msg); CR_PB_DESC(IPCNS_MSG_ENT, IpcMsg, ipc_msg); CR_PB_DESC(REMAP_FPATH, RemapFilePath, remap_file_path); CR_PB_DESC(NETDEV, NetDevice, net_device); CR_PB_MDESC_INIT(cr_pb_descs[PB_PAGEMAP_HEAD], PagemapHead, pagemap_head); #include "protobuf-desc-gen.h" } criu-3.6/criu/protobuf.c000066400000000000000000000121411317335042600152750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "image.h" #include "servicefd.h" #include "common/compiler.h" #include "log.h" #include "rst-malloc.h" #include "string.h" #include "sockets.h" #include "cr_options.h" #include "bfd.h" #include "protobuf.h" #include "util.h" /* * To speed up reading of packed objects * by providing space on stack, this should * be more than enough for most objects. */ #define PB_PKOBJ_LOCAL_SIZE 1024 static char *image_name(struct cr_img *img) { int fd = img->_x.fd; static char image_path[PATH_MAX]; if (read_fd_link(fd, image_path, sizeof(image_path)) > 0) return image_path; return NULL; } /* * Reads PB record (header + packed object) from file @fd and unpack * it with @unpack procedure to the pointer @pobj * * 1 on success * -1 on error (or EOF met and @eof set to false) * 0 on EOF and @eof set to true * * Don't forget to free memory granted to unpacked object in calling code if needed */ int do_pb_read_one(struct cr_img *img, void **pobj, int type, bool eof) { u8 local[PB_PKOBJ_LOCAL_SIZE]; void *buf = (void *)&local; u32 size; int ret; if (!cr_pb_descs[type].pb_desc) { pr_err("Wrong object requested %d on %s\n", type, image_name(img)); return -1; } *pobj = NULL; if (unlikely(empty_image(img))) ret = 0; else ret = bread(&img->_x, &size, sizeof(size)); if (ret == 0) { if (eof) { return 0; } else { pr_err("Unexpected EOF on %s\n", image_name(img)); return -1; } } else if (ret < sizeof(size)) { pr_perror("Read %d bytes while %d expected on %s", ret, 
(int)sizeof(size), image_name(img)); return -1; } if (size > sizeof(local)) { ret = -1; buf = xmalloc(size); if (!buf) goto err; } ret = bread(&img->_x, buf, size); if (ret < 0) { pr_perror("Can't read %d bytes from file %s", size, image_name(img)); goto err; } else if (ret != size) { pr_perror("Read %d bytes while %d expected from %s", ret, size, image_name(img)); ret = -1; goto err; } *pobj = cr_pb_descs[type].unpack(NULL, size, buf); if (!*pobj) { ret = -1; pr_err("Failed unpacking object %p from %s\n", pobj, image_name(img)); goto err; } ret = 1; err: if (buf != (void *)&local) xfree(buf); return ret; } /* * Writes PB record (header + packed object pointed by @obj) * to file @fd, using @getpksize to get packed size and @pack * to implement packing * * 0 on success * -1 on error */ int pb_write_one(struct cr_img *img, void *obj, int type) { u8 local[PB_PKOBJ_LOCAL_SIZE]; void *buf = (void *)&local; u32 size, packed; int ret = -1; struct iovec iov[2]; if (!cr_pb_descs[type].pb_desc) { pr_err("Wrong object requested %d\n", type); return -1; } if (lazy_image(img) && open_image_lazy(img)) return -1; size = cr_pb_descs[type].getpksize(obj); if (size > (u32)sizeof(local)) { buf = xmalloc(size); if (!buf) goto err; } packed = cr_pb_descs[type].pack(obj, buf); if (packed != size) { pr_err("Failed packing PB object %p\n", obj); goto err; } iov[0].iov_base = &size; iov[0].iov_len = sizeof(size); iov[1].iov_base = buf; iov[1].iov_len = size; ret = bwritev(&img->_x, iov, 2); if (ret != size + sizeof(size)) { pr_perror("Can't write %d bytes", (int)(size + sizeof(size))); goto err; } ret = 0; err: if (buf != (void *)&local) xfree(buf); return ret; } int collect_entry(ProtobufCMessage *msg, struct collect_image_info *cinfo) { void *obj; void *(*o_alloc)(size_t size) = malloc; void (*o_free)(void *ptr) = free; if (cinfo->flags & COLLECT_SHARED) { o_alloc = shmalloc; o_free = shfree_last; } if (cinfo->priv_size) { obj = o_alloc(cinfo->priv_size); if (!obj) return -1; } else obj 
= NULL; cinfo->flags |= COLLECT_HAPPENED; if (cinfo->collect(obj, msg, NULL) < 0) { o_free(obj); cr_pb_descs[cinfo->pb_type].free(msg, NULL); return -1; } if (!cinfo->priv_size && !(cinfo->flags & COLLECT_NOFREE)) cr_pb_descs[cinfo->pb_type].free(msg, NULL); return 0; } int collect_image(struct collect_image_info *cinfo) { int ret; struct cr_img *img; void *(*o_alloc)(size_t size) = malloc; void (*o_free)(void *ptr) = free; pr_info("Collecting %d/%d (flags %x)\n", cinfo->fd_type, cinfo->pb_type, cinfo->flags); img = open_image(cinfo->fd_type, O_RSTR); if (!img) return -1; if (cinfo->flags & COLLECT_SHARED) { o_alloc = shmalloc; o_free = shfree_last; } while (1) { void *obj; ProtobufCMessage *msg; if (cinfo->priv_size) { ret = -1; obj = o_alloc(cinfo->priv_size); if (!obj) break; } else obj = NULL; ret = pb_read_one_eof(img, &msg, cinfo->pb_type); if (ret <= 0) { o_free(obj); break; } cinfo->flags |= COLLECT_HAPPENED; ret = cinfo->collect(obj, msg, img); if (ret < 0) { o_free(obj); cr_pb_descs[cinfo->pb_type].free(msg, NULL); break; } if (!cinfo->priv_size && !(cinfo->flags & COLLECT_NOFREE)) cr_pb_descs[cinfo->pb_type].free(msg, NULL); } close_image(img); pr_debug(" `- ... 
done\n"); return ret; } criu-3.6/criu/pstree.c000066400000000000000000000550061317335042600147460ustar00rootroot00000000000000#include #include #include #include #include "types.h" #include "cr_options.h" #include "pstree.h" #include "rst-malloc.h" #include "common/lock.h" #include "namespaces.h" #include "files.h" #include "tty.h" #include "mount.h" #include "dump.h" #include "util.h" #include "protobuf.h" #include "images/pstree.pb-c.h" #include "crtools.h" struct pstree_item *root_item; static struct rb_root pid_root_rb; void core_entry_free(CoreEntry *core) { if (core->tc && core->tc->timers) xfree(core->tc->timers->posix); if (core->thread_core) xfree(core->thread_core->creds->groups); arch_free_thread_info(core); xfree(core); } #ifndef RLIM_NLIMITS # define RLIM_NLIMITS 16 #endif CoreEntry *core_entry_alloc(int th, int tsk) { size_t sz; CoreEntry *core = NULL; void *m; sz = sizeof(CoreEntry); if (tsk) { sz += sizeof(TaskCoreEntry) + TASK_COMM_LEN; if (th) { sz += sizeof(TaskRlimitsEntry); sz += RLIM_NLIMITS * sizeof(RlimitEntry *); sz += RLIM_NLIMITS * sizeof(RlimitEntry); sz += sizeof(TaskTimersEntry); sz += 3 * sizeof(ItimerEntry); /* 3 for real, virt and prof */ } } if (th) { CredsEntry *ce = NULL; sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry); sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); /* * @groups are dynamic and allocated * on demand. 
*/ } m = xmalloc(sz); if (m) { core = xptr_pull(&m, CoreEntry); core_entry__init(core); core->mtype = CORE_ENTRY__MARCH; if (tsk) { core->tc = xptr_pull(&m, TaskCoreEntry); task_core_entry__init(core->tc); core->tc->comm = xptr_pull_s(&m, TASK_COMM_LEN); memzero(core->tc->comm, TASK_COMM_LEN); if (th) { TaskRlimitsEntry *rls; TaskTimersEntry *tte; int i; rls = core->tc->rlimits = xptr_pull(&m, TaskRlimitsEntry); task_rlimits_entry__init(rls); rls->n_rlimits = RLIM_NLIMITS; rls->rlimits = xptr_pull_s(&m, sizeof(RlimitEntry *) * RLIM_NLIMITS); for (i = 0; i < RLIM_NLIMITS; i++) { rls->rlimits[i] = xptr_pull(&m, RlimitEntry); rlimit_entry__init(rls->rlimits[i]); } tte = core->tc->timers = xptr_pull(&m, TaskTimersEntry); task_timers_entry__init(tte); tte->real = xptr_pull(&m, ItimerEntry); itimer_entry__init(tte->real); tte->virt = xptr_pull(&m, ItimerEntry); itimer_entry__init(tte->virt); tte->prof = xptr_pull(&m, ItimerEntry); itimer_entry__init(tte->prof); } } if (th) { CredsEntry *ce; core->thread_core = xptr_pull(&m, ThreadCoreEntry); thread_core_entry__init(core->thread_core); core->thread_core->sas = xptr_pull(&m, ThreadSasEntry); thread_sas_entry__init(core->thread_core->sas); ce = core->thread_core->creds = xptr_pull(&m, CredsEntry); creds_entry__init(ce); ce->n_cap_inh = CR_CAP_SIZE; ce->n_cap_prm = CR_CAP_SIZE; ce->n_cap_eff = CR_CAP_SIZE; ce->n_cap_bnd = CR_CAP_SIZE; ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0])); ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0])); ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); if (arch_alloc_thread_info(core)) { xfree(core); core = NULL; } } } return core; } int pstree_alloc_cores(struct pstree_item *item) { unsigned int i; item->core = xzalloc(sizeof(*item->core) * item->nr_threads); if (!item->core) return -1; for (i = 0; i < item->nr_threads; i++) { if (item->threads[i].real == 
item->pid->real) item->core[i] = core_entry_alloc(1, 1); else item->core[i] = core_entry_alloc(1, 0); if (!item->core[i]) goto err; } return 0; err: pstree_free_cores(item); return -1; } void pstree_free_cores(struct pstree_item *item) { unsigned int i; if (item->core) { for (i = 1; i < item->nr_threads; i++) if (item->core[i]) core_entry_free(item->core[i]); xfree(item->core); item->core = NULL; } } void free_pstree(struct pstree_item *root_item) { struct pstree_item *item = root_item, *parent; while (item) { if (!list_empty(&item->children)) { item = list_first_entry(&item->children, struct pstree_item, sibling); continue; } parent = item->parent; list_del(&item->sibling); pstree_free_cores(item); xfree(item->threads); xfree(item); item = parent; } } struct pstree_item *__alloc_pstree_item(bool rst) { struct pstree_item *item; int sz; if (!rst) { sz = sizeof(*item) + sizeof(struct dmp_info) + sizeof(struct pid); item = xzalloc(sz); if (!item) return NULL; item->pid = (void *)item + sizeof(*item) + sizeof(struct dmp_info); } else { sz = sizeof(*item) + sizeof(struct rst_info) + sizeof(struct pid); item = shmalloc(sz); if (!item) return NULL; memset(item, 0, sz); vm_area_list_init(&rsti(item)->vmas); INIT_LIST_HEAD(&rsti(item)->vma_io); item->pid = (void *)item + sizeof(*item) + sizeof(struct rst_info); } INIT_LIST_HEAD(&item->children); INIT_LIST_HEAD(&item->sibling); item->pid->ns[0].virt = -1; item->pid->real = -1; item->pid->state = TASK_UNDEF; item->born_sid = -1; item->pid->item = item; futex_init(&item->task_st); return item; } void init_pstree_helper(struct pstree_item *ret) { ret->pid->state = TASK_HELPER; rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; task_entries->nr_helpers++; } /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { if (!list_empty(&item->children)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { if (item->sibling.next != 
&item->parent->children) return list_entry(item->sibling.next, struct pstree_item, sibling); item = item->parent; } return NULL; } /* Preorder traversal of pstree item */ int preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *)) { struct pstree_item *cursor; if (f(item) < 0) return -1; list_for_each_entry(cursor, &item->children, sibling) { if (preorder_pstree_traversal(cursor, f) < 0) return -1; } return 0; } int dump_pstree(struct pstree_item *root_item) { struct pstree_item *item = root_item; PstreeEntry e = PSTREE_ENTRY__INIT; int ret = -1, i; struct cr_img *img; pr_info("\n"); pr_info("Dumping pstree (pid: %d)\n", root_item->pid->real); pr_info("----------------------------------------\n"); /* * Make sure we're dumping session leader, if not an * appropriate option must be passed. * * Also note that if we're not a session leader we * can't get the situation where the leader sits somewhere * deeper in process tree, thus top-level checking for * leader is enough. */ if (vpid(root_item) != root_item->sid) { if (!opts.shell_job) { pr_err("The root process %d is not a session leader. " "Consider using --" OPT_SHELL_JOB " option\n", vpid(item)); return -1; } } img = open_image(CR_FD_PSTREE, O_DUMP); if (!img) return -1; for_each_pstree_item(item) { pr_info("Process: %d(%d)\n", vpid(item), item->pid->real); e.pid = vpid(item); e.ppid = item->parent ? 
vpid(item->parent) : 0; e.pgid = item->pgid; e.sid = item->sid; e.n_threads = item->nr_threads; e.threads = xmalloc(sizeof(e.threads[0]) * e.n_threads); if (!e.threads) goto err; for (i = 0; i < item->nr_threads; i++) e.threads[i] = item->threads[i].ns[0].virt; ret = pb_write_one(img, &e, PB_PSTREE); xfree(e.threads); if (ret) goto err; } ret = 0; err: pr_info("----------------------------------------\n"); close_image(img); return ret; } static int prepare_pstree_for_shell_job(void) { pid_t current_sid = getsid(getpid()); pid_t current_gid = getpgid(getpid()); struct pstree_item *pi; pid_t old_sid; pid_t old_gid; if (!opts.shell_job) return 0; if (root_item->sid == vpid(root_item)) return 0; /* * Migration of a root task group leader is a bit tricky. * When a task yields SIGSTOP, the kernel notifies the parent * with SIGCHLD. This means when task is running in a * shell, the shell obtains SIGCHLD and sends a task to * the background. * * The situation gets changed once we restore the * program -- our tool become an additional stub between * the restored program and the shell. So to be able to * notify the shell with SIGCHLD from our restored * program -- we make the root task to inherit the * process group from us. * * Not that clever solution but at least it works. */ old_sid = root_item->sid; old_gid = root_item->pgid; pr_info("Migrating process tree (GID %d->%d SID %d->%d)\n", old_gid, current_gid, old_sid, current_sid); for_each_pstree_item(pi) { if (pi->pgid == old_gid) pi->pgid = current_gid; if (pi->sid == old_sid) pi->sid = current_sid; } if (lookup_create_item(current_sid) == NULL) return -1; if (lookup_create_item(current_gid) == NULL) return -1; return 0; } /* * Try to find a pid node in the tree and insert a new one, * it is not there yet. If pid_node isn't set, pstree_item * is inserted. 
*/ static struct pid *lookup_create_pid(pid_t pid, struct pid *pid_node) { struct rb_node *node = pid_root_rb.rb_node; struct rb_node **new = &pid_root_rb.rb_node; struct rb_node *parent = NULL; while (node) { struct pid *this = rb_entry(node, struct pid, ns[0].node); parent = *new; if (pid < this->ns[0].virt) node = node->rb_left, new = &((*new)->rb_left); else if (pid > this->ns[0].virt) node = node->rb_right, new = &((*new)->rb_right); else return this; } if (!pid_node) { struct pstree_item *item; item = __alloc_pstree_item(true); if (item == NULL) return NULL; item->pid->ns[0].virt = pid; pid_node = item->pid; } rb_link_and_balance(&pid_root_rb, &pid_node->ns[0].node, parent, new); return pid_node; } void pstree_insert_pid(struct pid *pid_node) { struct pid* n; n = lookup_create_pid(pid_node->ns[0].virt, pid_node); BUG_ON(n != pid_node); } struct pstree_item *lookup_create_item(pid_t pid) { struct pid *node;; node = lookup_create_pid(pid, NULL); if (!node) return NULL; BUG_ON(node->state == TASK_THREAD); return node->item; } struct pid *pstree_pid_by_virt(pid_t pid) { struct rb_node *node = pid_root_rb.rb_node; while (node) { struct pid *this = rb_entry(node, struct pid, ns[0].node); if (pid < this->ns[0].virt) node = node->rb_left; else if (pid > this->ns[0].virt) node = node->rb_right; else return this; } return NULL; } static int read_pstree_ids(struct pstree_item *pi) { int ret; struct cr_img *img; img = open_image(CR_FD_IDS, O_RSTR, vpid(pi)); if (!img) return -1; ret = pb_read_one_eof(img, &pi->ids, PB_IDS); close_image(img); if (ret <= 0) return ret; if (pi->ids->has_mnt_ns_id) { if (rst_add_ns_id(pi->ids->mnt_ns_id, pi, &mnt_ns_desc)) return -1; } return 0; } static int read_pstree_image(pid_t *pid_max) { int ret = 0, i; struct cr_img *img; struct pstree_item *pi; pr_info("Reading image tree\n"); img = open_image(CR_FD_PSTREE, O_RSTR); if (!img) return -1; while (1) { PstreeEntry *e; ret = pb_read_one_eof(img, &e, PB_PSTREE); if (ret <= 0) break; ret = 
-1; pi = lookup_create_item(e->pid); if (pi == NULL) break; BUG_ON(pi->pid->state != TASK_UNDEF); /* * All pids should be added in the tree to be able to find * free pid-s for helpers. pstree_item for these pid-s will * be initialized when we meet PstreeEntry with this pid or * we will create helpers for them. */ if (lookup_create_item(e->pgid) == NULL) break; if (lookup_create_item(e->sid) == NULL) break; pi->pid->ns[0].virt = e->pid; if (e->pid > *pid_max) *pid_max = e->pid; pi->pgid = e->pgid; if (e->pgid > *pid_max) *pid_max = e->pgid; pi->sid = e->sid; if (e->sid > *pid_max) *pid_max = e->sid; pi->pid->state = TASK_ALIVE; if (e->ppid == 0) { if (root_item) { pr_err("Parent missed on non-root task " "with pid %d, image corruption!\n", e->pid); goto err; } root_item = pi; pi->parent = NULL; } else { struct pid *pid; struct pstree_item *parent; pid = pstree_pid_by_virt(e->ppid); if (!pid || pid->state == TASK_UNDEF || pid->state == TASK_THREAD) { pr_err("Can't find a parent for %d\n", vpid(pi)); pstree_entry__free_unpacked(e, NULL); xfree(pi); goto err; } parent = pid->item; pi->parent = parent; list_add(&pi->sibling, &parent->children); } pi->nr_threads = e->n_threads; pi->threads = xmalloc(e->n_threads * sizeof(struct pid)); if (!pi->threads) break; for (i = 0; i < e->n_threads; i++) { struct pid *node; pi->threads[i].real = -1; pi->threads[i].ns[0].virt = e->threads[i]; pi->threads[i].state = TASK_THREAD; pi->threads[i].item = NULL; if (i == 0) continue; /* A thread leader is in a tree already */ node = lookup_create_pid(pi->threads[i].ns[0].virt, &pi->threads[i]); BUG_ON(node == NULL); if (node != &pi->threads[i]) { pr_err("Unexpected task %d in a tree %d\n", e->threads[i], i); return -1; } } task_entries->nr_threads += e->n_threads; task_entries->nr_tasks++; pstree_entry__free_unpacked(e, NULL); ret = read_pstree_ids(pi); if (ret < 0) goto err; } err: close_image(img); return ret; } #define RESERVED_PIDS 300 static int get_free_pid() { static struct pid 
*prev, *next; if (prev == NULL) prev = rb_entry(rb_first(&pid_root_rb), struct pid, ns[0].node); while (1) { struct rb_node *node; pid_t pid; pid = prev->ns[0].virt + 1; pid = pid < RESERVED_PIDS ? RESERVED_PIDS + 1 : pid; node = rb_next(&prev->ns[0].node); if (node == NULL) return pid; next = rb_entry(node, struct pid, ns[0].node); if (next->ns[0].virt > pid) return pid; prev = next; } return -1; } static int prepare_pstree_ids(void) { struct pstree_item *item, *child, *helper, *tmp; LIST_HEAD(helpers); pid_t current_pgid = getpgid(getpid()); /* * Some task can be reparented to init. A helper task should be added * for restoring sid of such tasks. The helper tasks will be exited * immediately after forking children and all children will be * reparented to init. */ list_for_each_entry(item, &root_item->children, sibling) { struct pstree_item *leader; /* * If a child belongs to the root task's session or it's * a session leader himself -- this is a simple case, we * just proceed in a normal way. */ if (item->sid == root_item->sid || item->sid == vpid(item)) continue; leader = pstree_item_by_virt(item->sid); BUG_ON(leader == NULL); if (leader->pid->state != TASK_UNDEF) { pid_t pid; pid = get_free_pid(); if (pid < 0) break; helper = lookup_create_item(pid); if (helper == NULL) return -1; pr_info("Session leader %d\n", item->sid); helper->sid = item->sid; helper->pgid = leader->pgid; helper->ids = leader->ids; helper->parent = leader; list_add(&helper->sibling, &leader->children); pr_info("Attach %d to the task %d\n", vpid(helper), vpid(leader)); } else { helper = leader; helper->sid = item->sid; helper->pgid = item->sid; helper->parent = root_item; helper->ids = root_item->ids; list_add_tail(&helper->sibling, &helpers); } init_pstree_helper(helper); pr_info("Add a helper %d for restoring SID %d\n", vpid(helper), helper->sid); child = list_entry(item->sibling.prev, struct pstree_item, sibling); item = child; /* * Stack on helper task all children with target sid. 
*/ list_for_each_entry_safe_continue(child, tmp, &root_item->children, sibling) { if (child->sid != helper->sid) continue; if (child->sid == vpid(child)) continue; pr_info("Attach %d to the temporary task %d\n", vpid(child), vpid(helper)); child->parent = helper; list_move(&child->sibling, &helper->children); } } /* Try to connect helpers to session leaders */ for_each_pstree_item(item) { if (!item->parent) /* skip the root task */ continue; if (item->pid->state == TASK_HELPER) continue; if (item->sid != vpid(item)) { struct pstree_item *parent; if (item->parent->sid == item->sid) continue; /* the task could fork a child before and after setsid() */ parent = item->parent; while (parent && vpid(parent) != item->sid) { if (parent->born_sid != -1 && parent->born_sid != item->sid) { pr_err("Can't figure out which sid (%d or %d)" "the process %d was born with\n", parent->born_sid, item->sid, vpid(parent)); return -1; } parent->born_sid = item->sid; pr_info("%d was born with sid %d\n", vpid(parent), item->sid); parent = parent->parent; } if (parent == NULL) { pr_err("Can't find a session leader for %d\n", item->sid); return -1; } continue; } } /* All other helpers are session leaders for own sessions */ list_splice(&helpers, &root_item->children); /* Add a process group leader if it is absent */ for_each_pstree_item(item) { struct pid *pid; if (!item->pgid || vpid(item) == item->pgid) continue; pid = pstree_pid_by_virt(item->pgid); if (pid->state != TASK_UNDEF) { BUG_ON(pid->state == TASK_THREAD); rsti(item)->pgrp_leader = pid->item; continue; } /* * If the PGID is eq to current one -- this * means we're inheriting group from the current * task so we need to escape creating a helper here. 
*/ if (current_pgid == item->pgid) continue; helper = pid->item; init_pstree_helper(helper); helper->sid = item->sid; helper->pgid = item->pgid; helper->pid->ns[0].virt = item->pgid; helper->parent = item; helper->ids = item->ids; list_add(&helper->sibling, &item->children); rsti(item)->pgrp_leader = helper; pr_info("Add a helper %d for restoring PGID %d\n", vpid(helper), helper->pgid); } return 0; } static unsigned long get_clone_mask(TaskKobjIdsEntry *i, TaskKobjIdsEntry *p) { unsigned long mask = 0; if (i->files_id == p->files_id) mask |= CLONE_FILES; if (i->pid_ns_id != p->pid_ns_id) mask |= CLONE_NEWPID; if (i->net_ns_id != p->net_ns_id) mask |= CLONE_NEWNET; if (i->ipc_ns_id != p->ipc_ns_id) mask |= CLONE_NEWIPC; if (i->uts_ns_id != p->uts_ns_id) mask |= CLONE_NEWUTS; if (i->mnt_ns_id != p->mnt_ns_id) mask |= CLONE_NEWNS; if (i->user_ns_id != p->user_ns_id) mask |= CLONE_NEWUSER; return mask; } static int prepare_pstree_kobj_ids(void) { struct pstree_item *item; /* Find a process with minimal pid for shared fd tables */ for_each_pstree_item(item) { struct pstree_item *parent = item->parent; TaskKobjIdsEntry *ids; unsigned long cflags; if (!item->ids) { if (item == root_item) { pr_err("No IDS for root task.\n"); pr_err("Images currupted or too old criu was used for dump.\n"); return -1; } continue; } if (parent) ids = parent->ids; else ids = root_ids; /* * Add some sanity check on image data. */ if (unlikely(!ids)) { pr_err("No kIDs provided, image corruption\n"); return -1; } cflags = get_clone_mask(item->ids, ids); if (cflags & CLONE_FILES) { int ret; /* * There might be a case when kIDs for * root task are the same as in root_ids, * thus it's image corruption and we should * exit out. 
*/ if (unlikely(!item->parent)) { pr_err("Image corruption on kIDs data\n"); return -1; } ret = shared_fdt_prepare(item); if (ret) return ret; } rsti(item)->clone_flags = cflags; if (parent) /* * Mount namespaces are setns()-ed at * restore_task_mnt_ns() explicitly, * no need in creating it with its own * temporary namespace. * * Root task is exceptional -- it will * be born in a fresh new mount namespace * which will be populated with all other * namespaces' entries. */ rsti(item)->clone_flags &= ~CLONE_NEWNS; cflags &= CLONE_ALLNS; if (item == root_item) { pr_info("Will restore in %lx namespaces\n", cflags); root_ns_mask = cflags; } else if (cflags & ~(root_ns_mask & CLONE_SUBNS)) { /* * Namespaces from CLONE_SUBNS can be nested, but in * this case nobody can't share external namespaces of * these types. * * Workaround for all other namespaces -- * all tasks should be in one namespace. And * this namespace is either inherited from the * criu or is created for the init task (only) */ pr_err("Can't restore sub-task in NS\n"); return -1; } } pr_debug("NS mask to use %lx\n", root_ns_mask); return 0; } int prepare_pstree(void) { int ret; pid_t pid_max = 0, kpid_max = 0; int fd; char buf[21]; fd = open_proc(PROC_GEN, PID_MAX_PATH); if (fd >= 0) { ret = read(fd, buf, sizeof(buf) - 1); close(fd); if (ret > 0) { buf[ret] = 0; kpid_max = strtoul(buf, NULL, 10); pr_debug("kernel pid_max=%d\n", kpid_max); } } ret = read_pstree_image(&pid_max); pr_debug("pstree pid_max=%d\n", pid_max); if (!ret && kpid_max && pid_max > kpid_max) { /* Try to set kernel pid_max */ fd = open_proc_rw(PROC_GEN, PID_MAX_PATH); if (fd == -1) ret = -1; else { snprintf(buf, sizeof(buf), "%u", pid_max+1); if (write(fd, buf, strlen(buf)) < 0) { pr_perror("Can't set kernel pid_max=%s", buf); ret = -1; } else pr_info("kernel pid_max pushed to %s\n", buf); close(fd); } } if (!ret) /* * Shell job may inherit sid/pgid from the current * shell, not from image. Set things up for this. 
*/ ret = prepare_pstree_for_shell_job(); if (!ret) /* * Walk the collected tree and prepare for restoring * of shared objects at clone time */ ret = prepare_pstree_kobj_ids(); if (!ret) /* * Session/Group leaders might be dead. Need to fix * pstree with properly injected helper tasks. */ ret = prepare_pstree_ids(); return ret; } int prepare_dummy_pstree(void) { pid_t dummy = 0; if (check_img_inventory() == -1) return -1; if (prepare_task_entries() == -1) return -1; if (read_pstree_image(&dummy) == -1) return -1; return 0; } bool restore_before_setsid(struct pstree_item *child) { int csid = child->born_sid == -1 ? child->sid : child->born_sid; if (child->parent->born_sid == csid) return true; return false; } struct pstree_item *pstree_item_by_virt(pid_t virt) { struct pid *pid; pid = pstree_pid_by_virt(virt); if (pid == NULL) return NULL; BUG_ON(pid->state == TASK_THREAD); return pid->item; } struct pstree_item *pstree_item_by_real(pid_t real) { struct pstree_item *item; for_each_pstree_item(item) { if (item->pid->real == real) return item; } return NULL; } int pid_to_virt(pid_t real) { struct pstree_item *item; item = pstree_item_by_real(real); if (item) return vpid(item); return 0; } criu-3.6/criu/rbtree.c000066400000000000000000000170111317335042600147210ustar00rootroot00000000000000/* * RBtree implementation adopted from the Linux kernel sources. 
*/
#include
#include "rbtree.h"

/*
 * Rotate left around @node: its right child takes @node's place under
 * @node's former parent (or becomes the root of @root when there is no
 * parent), @node becomes that child's left child, and the child's former
 * left subtree is re-attached as @node's right subtree.
 */
static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *right = node->rb_right;
	struct rb_node *parent = rb_parent(node);

	/* Hand the pivot's left subtree over to node */
	node->rb_right = right->rb_left;
	if (node->rb_right)
		rb_set_parent(right->rb_left, node);
	right->rb_left = node;

	rb_set_parent(right, parent);

	if (parent) {
		if (node == parent->rb_left)
			parent->rb_left = right;
		else
			parent->rb_right = right;
	} else
		root->rb_node = right;
	rb_set_parent(node, right);
}

/*
 * Mirror image of __rb_rotate_left(): the left child of @node is lifted
 * into @node's position, @node becomes its right child, and the child's
 * former right subtree becomes @node's left subtree.
 */
static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *left = node->rb_left;
	struct rb_node *parent = rb_parent(node);

	/* Hand the pivot's right subtree over to node */
	node->rb_left = left->rb_right;
	if (node->rb_left)
		rb_set_parent(left->rb_right, node);
	left->rb_right = node;

	rb_set_parent(left, parent);

	if (parent) {
		if (node == parent->rb_right)
			parent->rb_right = left;
		else
			parent->rb_left = left;
	} else
		root->rb_node = left;
	rb_set_parent(node, left);
}

/*
 * Recolour and rotate the tree starting at @node, walking towards the root
 * for as long as the parent of the current node is red. The root of @root
 * is painted black unconditionally at the end.
 */
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *parent, *gparent;

	while ((parent = rb_parent(node)) && rb_is_red(parent)) {
		gparent = rb_parent(parent);

		if (parent == gparent->rb_left) {
			{
				register struct rb_node *uncle = gparent->rb_right;
				/*
				 * Red uncle: recolour parent, uncle and
				 * grandparent, then continue the walk from
				 * the grandparent.
				 */
				if (uncle && rb_is_red(uncle)) {
					rb_set_black(uncle);
					rb_set_black(parent);
					rb_set_red(gparent);
					node = gparent;
					continue;
				}
			}

			/*
			 * node is a right child of a left parent: rotate at
			 * the parent first and swap the two pointers so that
			 * "parent" again names the upper node.
			 */
			if (parent->rb_right == node) {
				register struct rb_node *tmp;
				__rb_rotate_left(parent, root);
				tmp = parent;
				parent = node;
				node = tmp;
			}

			rb_set_black(parent);
			rb_set_red(gparent);
			__rb_rotate_right(gparent, root);
		} else {
			/* Same steps with left and right exchanged */
			{
				register struct rb_node *uncle = gparent->rb_left;
				if (uncle && rb_is_red(uncle)) {
					rb_set_black(uncle);
					rb_set_black(parent);
					rb_set_red(gparent);
					node = gparent;
					continue;
				}
			}

			if (parent->rb_left == node) {
				register struct rb_node *tmp;
				__rb_rotate_right(parent, root);
				tmp = parent;
				parent = node;
				node = tmp;
			}

			rb_set_black(parent);
			rb_set_red(gparent);
			__rb_rotate_left(gparent, root);
		}
	}

	rb_set_black(root->rb_node);
}
/*
 * Re-balance the tree after a black node has been unlinked.
 *
 * @node is the subtree that now sits where the removed node used to be
 * (it may be NULL), @parent is the parent of that position and @root is
 * the tree being fixed. The loop walks towards the root for as long as
 * @node is absent or black and is not the root itself; the two branches
 * are mirror images of each other for the left and the right child case.
 */
static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, struct rb_root *root)
{
	struct rb_node *other;

	while ((!node || rb_is_black(node)) && node != root->rb_node) {
		if (parent->rb_left == node) {
			/* @node is the left child, so its sibling is on the right */
			other = parent->rb_right;
			if (rb_is_red(other)) {
				/* Red sibling: recolour and rotate to get a black one */
				rb_set_black(other);
				rb_set_red(parent);
				__rb_rotate_left(parent, root);
				other = parent->rb_right;
			}
			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
			    (!other->rb_right || rb_is_black(other->rb_right))) {
				/* Both nephews are black (or absent): push the problem up */
				rb_set_red(other);
				node = parent;
				parent = rb_parent(node);
			} else {
				if (!other->rb_right || rb_is_black(other->rb_right)) {
					/* Only the near nephew is red: rotate it outwards first */
					rb_set_black(other->rb_left);
					rb_set_red(other);
					__rb_rotate_right(other, root);
					other = parent->rb_right;
				}
				rb_set_color(other, rb_color(parent));
				rb_set_black(parent);
				rb_set_black(other->rb_right);
				__rb_rotate_left(parent, root);
				/* Done: make the final rb_set_black() below hit the root */
				node = root->rb_node;
				break;
			}
		} else {
			/* Mirror image of the branch above */
			other = parent->rb_left;
			if (rb_is_red(other)) {
				rb_set_black(other);
				rb_set_red(parent);
				__rb_rotate_right(parent, root);
				other = parent->rb_left;
			}
			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
			    (!other->rb_right || rb_is_black(other->rb_right))) {
				rb_set_red(other);
				node = parent;
				parent = rb_parent(node);
			} else {
				if (!other->rb_left || rb_is_black(other->rb_left)) {
					rb_set_black(other->rb_right);
					rb_set_red(other);
					__rb_rotate_left(other, root);
					other = parent->rb_left;
				}
				rb_set_color(other, rb_color(parent));
				rb_set_black(parent);
				rb_set_black(other->rb_left);
				__rb_rotate_right(parent, root);
				node = root->rb_node;
				break;
			}
		}
	}

	if (node)
		rb_set_black(node);
}

/*
 * Unlink @node from the tree rooted at @root and, when the unlinked
 * position was black, re-balance via __rb_erase_color().
 */
void rb_erase(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *child, *parent;
	int color;

	if (!node->rb_left)
		child = node->rb_right;
	else if (!node->rb_right)
		child = node->rb_left;
	else {
		/*
		 * Two children: the leftmost node of the right subtree
		 * takes the place of the node being erased.
		 */
		struct rb_node *old = node, *left;

		node = node->rb_right;
		while ((left = node->rb_left))
			node = left;

		if (rb_parent(old)) {
			if (rb_parent(old)->rb_left == old)
				rb_parent(old)->rb_left = node;
			else
				rb_parent(old)->rb_right = node;
		} else
root->rb_node = node;

		/* Remember what is being detached at the successor's old place */
		child = node->rb_right;
		parent = rb_parent(node);
		color = rb_color(node);

		if (parent == old) {
			/* The successor is the direct right child of the erased node */
			parent = node;
		} else {
			if (child)
				rb_set_parent(child, parent);
			parent->rb_left = child;

			node->rb_right = old->rb_right;
			rb_set_parent(old->rb_right, node);
		}

		/* The successor inherits parent, colour and left subtree of @old */
		node->rb_parent_color = old->rb_parent_color;
		node->rb_left = old->rb_left;
		rb_set_parent(old->rb_left, node);

		goto color;
	}

	/* At most one child: splice it into the erased node's place */
	parent = rb_parent(node);
	color = rb_color(node);

	if (child)
		rb_set_parent(child, parent);

	if (parent) {
		if (parent->rb_left == node)
			parent->rb_left = child;
		else
			parent->rb_right = child;
	} else
		root->rb_node = child;

color:
	/* Only removing a black position requires re-balancing */
	if (color == RB_BLACK)
		__rb_erase_color(child, parent, root);
}

/*
 * This function returns the first node (in sort order) of the tree,
 * i.e. the leftmost one, or NULL when the tree is empty.
 */
struct rb_node *rb_first(const struct rb_root *root)
{
	struct rb_node *n;

	n = root->rb_node;
	if (!n)
		return NULL;

	while (n->rb_left)
		n = n->rb_left;

	return n;
}

/*
 * Returns the last node (in sort order) of the tree, i.e. the
 * rightmost one, or NULL when the tree is empty.
 */
struct rb_node *rb_last(const struct rb_root *root)
{
	struct rb_node *n;

	n = root->rb_node;
	if (!n)
		return NULL;

	while (n->rb_right)
		n = n->rb_right;

	return n;
}

/*
 * Returns the in-order successor of @node, or NULL when there is none.
 */
struct rb_node *rb_next(const struct rb_node *node)
{
	struct rb_node *parent;

	/* A node that is its own parent is treated as not being in a tree */
	if (rb_parent(node) == node)
		return NULL;

	/*
	 * If we have a right-hand child, go down and
	 * then left as far as we can.
	 */
	if (node->rb_right) {
		node = node->rb_right;
		while (node->rb_left)
			node=node->rb_left;
		return (struct rb_node *)node;
	}

	/*
	 * No right-hand children. Everything down and left is
	 * smaller than us, so any 'next' node must be in the general
	 * direction of our parent. Go up the tree; any time the
	 * ancestor is a right-hand child of its parent, keep going
	 * up. First time it's a left-hand child of its parent, said
	 * parent is our 'next' node.
*/ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; return parent; } struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; if (rb_parent(node) == node) return NULL; /* * If we have a left-hand child, go down and * then right as far as we can. */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) node = node->rb_right; return (struct rb_node *)node; } /* * No left-hand children. Go up till we find * an ancestor which is a right-hand child of its parent. */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; return parent; } void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ if (parent) { if (victim == parent->rb_left) parent->rb_left = new; else parent->rb_right = new; } else root->rb_node = new; if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; } criu-3.6/criu/rst-malloc.c000066400000000000000000000124561317335042600155230ustar00rootroot00000000000000#include #include #include #include "page.h" #include "rst-malloc.h" #include "log.h" #include "common/bug.h" struct rst_mem_type_s { bool remapable; bool enabled; unsigned long free_bytes; void *free_mem; int (*grow)(struct rst_mem_type_s *, unsigned long size); unsigned long last; void *buf; unsigned long size; }; static inline unsigned long rst_mem_grow(unsigned long need_size) { int rst_mem_batch = 2 * page_size(); need_size = round_up(need_size, page_size()); if (likely(need_size < rst_mem_batch)) need_size = rst_mem_batch; else pr_debug("Growing rst memory %lu pages\n", need_size / page_size()); return need_size; } static int grow_shared(struct rst_mem_type_s *t, unsigned long size) { void *aux; size = rst_mem_grow(size); /* * This 
buffer will not get remapped into
	 * restorer, thus we can just forget the
	 * previous chunk location and allocate a
	 * new one
	 */
	aux = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS, 0, 0);
	if (aux == MAP_FAILED)
		return -1;

	/* Start allocating from the fresh chunk; nothing to "free last" yet */
	t->free_mem = aux;
	t->free_bytes = size;
	t->last = 0;

	return 0;
}

/*
 * Common grow routine for the remappable memory types.
 *
 * The very first call maps a new anonymous area with the given mapping
 * @flag (MAP_SHARED or MAP_PRIVATE). Later calls extend the existing
 * buffer, which is only possible for private mappings.
 *
 * Returns 0 on success and -1 on failure.
 */
static int grow_remap(struct rst_mem_type_s *t, int flag, unsigned long size)
{
	void *aux;

	size = rst_mem_grow(size);

	if (!t->buf)
		/*
		 * Can't call mremap with NULL address :(
		 */
		aux = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   flag | MAP_ANONYMOUS, 0, 0);
	else {
		if (flag & MAP_SHARED) {
			/*
			 * Anon shared memory cannot grow with
			 * mremap, anon-shmem file size doesn't
			 * change and memory access generates
			 * SIGBUS. We should truncate the guy,
			 * but for now we don't need it.
			 */
			pr_err("Can't grow RM_SHREMAP memory\n");
			return -1;
		}
		/*
		 * We'll have to remap all objects into restorer
		 * address space and get their new addresses. Since
		 * we allocate many objects as one linear array, it's
		 * simpler just to grow the buffer and let callers
		 * find out new array addresses, rather than allocate
		 * a completely new one and force callers use objects'
		 * cpos-s.
*/
		aux = mremap(t->buf, t->size, t->size + size, MREMAP_MAYMOVE);
	}
	if (aux == MAP_FAILED)
		return -1;

	/* The buffer may have moved: shift the free pointer by the same delta */
	t->free_mem += (aux - t->buf);
	t->free_bytes += size;
	t->size += size;
	t->buf = aux;

	return 0;
}

/* Grow callback for RM_SHREMAP: grow_remap() with a shared mapping */
static int grow_shremap(struct rst_mem_type_s *t, unsigned long size)
{
	return grow_remap(t, MAP_SHARED, size);
}

/* Grow callback for RM_PRIVATE: grow_remap() with a private mapping */
static int grow_private(struct rst_mem_type_s *t, unsigned long size)
{
	return grow_remap(t, MAP_PRIVATE, size);
}

/*
 * Per-type allocator state. The shared types start enabled, the
 * private one is enabled by rst_mem_switch_to_private().
 */
static struct rst_mem_type_s rst_mems[RST_MEM_TYPES] = {
	[RM_SHARED] = {
		.grow = grow_shared,
		.remapable = false,
		.enabled = true,
	},
	[RM_SHREMAP] = {
		.grow = grow_shremap,
		.remapable = true,
		.enabled = true,
	},
	[RM_PRIVATE] = {
		.grow = grow_private,
		.remapable = true,
		.enabled = false,
	},
};

/* Disable both shared types and enable the private one */
void rst_mem_switch_to_private(void)
{
	rst_mems[RM_SHARED].enabled = false;
	rst_mems[RM_SHREMAP].enabled = false;
	rst_mems[RM_PRIVATE].enabled = true;
}

/*
 * Round the free pointer of @type up to sizeof(void *) and charge
 * the padding to the free byte counter.
 */
void rst_mem_align(int type)
{
	struct rst_mem_type_s *t = &rst_mems[type];
	void *ptr;

	ptr = (void *) round_up((unsigned long)t->free_mem, sizeof(void *));
	t->free_bytes -= (ptr - t->free_mem);
	t->free_mem = ptr;
}

/*
 * Align the free pointer of a remappable, enabled @type and return
 * its offset from the start of the buffer.
 */
unsigned long rst_mem_align_cpos(int type)
{
	struct rst_mem_type_s *t = &rst_mems[type];
	BUG_ON(!t->remapable || !t->enabled);

	rst_mem_align(type);

	return t->free_mem - t->buf;
}

/* Turn a buffer offset @pos of a remappable @type back into a pointer */
void *rst_mem_remap_ptr(unsigned long pos, int type)
{
	struct rst_mem_type_s *t = &rst_mems[type];
	BUG_ON(!t->remapable);
	return t->buf + pos;
}

/*
 * Carve @size bytes out of the free area of @type, growing the area
 * through the type's ->grow callback when it is too small.
 *
 * Returns the allocated pointer or NULL when growing failed. The size
 * is remembered in ->last so that rst_mem_free_last() can undo it.
 */
void *rst_mem_alloc(unsigned long size, int type)
{
	struct rst_mem_type_s *t = &rst_mems[type];
	void *ret;

	BUG_ON(!t->enabled);

	if ((t->free_bytes < size) && t->grow(t, size)) {
		pr_perror("Can't grow rst mem");
		return NULL;
	}

	ret = t->free_mem;
	t->free_mem += size;
	t->free_bytes -= size;
	t->last = size;

	return ret;
}

/* Give back the most recent allocation made from @type */
void rst_mem_free_last(int type)
{
	struct rst_mem_type_s *t = &rst_mems[type];

	BUG_ON(!t->enabled);

	t->free_mem -= t->last;
	t->free_bytes += t->last;
	t->last = 0; /* next free_last would be no-op */
}

unsigned long rst_mem_lock(void)
{
	/*
	 * Don't allow further
allocations from rst_mem since we're * going to get the bootstrap area and remap all the stuff * into it. The SHREMAP and SHARED should be already locked * in the rst_mem_switch_to_private(). */ rst_mems[RM_PRIVATE].enabled = false; return rst_mems[RM_PRIVATE].size + rst_mems[RM_SHREMAP].size; } static int rst_mem_remap_one(struct rst_mem_type_s *t, void *to) { void *aux; BUG_ON(!t->remapable || t->enabled); if (!t->buf) /* * No allocations happened from this buffer. * It's safe just to do nothing. */ return 0; pr_debug("\tcall mremap(%p, %lu, %lu, MAYMOVE | FIXED, %p)\n", t->buf, t->size, t->size, to); aux = mremap(t->buf, t->size, t->size, MREMAP_MAYMOVE | MREMAP_FIXED, to); if (aux == MAP_FAILED) { pr_perror("Can't mremap rst mem"); return -1; } t->buf = aux; return 0; } int rst_mem_remap(void *to) { int ret; ret = rst_mem_remap_one(&rst_mems[RM_PRIVATE], to); if (!ret) { to += rst_mems[RM_PRIVATE].size; ret = rst_mem_remap_one(&rst_mems[RM_SHREMAP], to); } return ret; } void *shmalloc(size_t bytes) { rst_mem_align(RM_SHARED); return rst_mem_alloc(bytes, RM_SHARED); } /* Only last chunk can be released */ void shfree_last(void *ptr) { rst_mem_free_last(RM_SHARED); } criu-3.6/criu/seccomp.c000066400000000000000000000134161317335042600150740ustar00rootroot00000000000000#include #include #include #include #include "config.h" #include "imgset.h" #include "kcmp.h" #include "pstree.h" #include #include "proc_parse.h" #include "restorer.h" #include "seccomp.h" #include "servicefd.h" #include "util.h" #include "rst-malloc.h" #include "protobuf.h" #include "images/seccomp.pb-c.h" /* populated on dump during collect_seccomp_filters() */ static int next_filter_id = 0; static struct seccomp_info **filters = NULL; static struct seccomp_info *find_inherited(struct pstree_item *parent, struct sock_filter *filter, int len) { struct seccomp_info *info; /* if we have no filters yet, this one has no parent */ if (!filters) return NULL; for (info = 
filters[dmpi(parent)->pi_creds->last_filter]; info; info = info->prev) {

		/*
		 * NOTE(review): the only visible caller passes the return
		 * value of ptrace(PTRACE_SECCOMP_GET_FILTER) as @len, while
		 * ->filter.filter.len is stored as
		 * len * sizeof(struct sock_filter). The two look like they
		 * are in different units (instructions vs bytes), which
		 * would make this lookup miss -- confirm before relying on
		 * it.
		 */
		if (len != info->filter.filter.len)
			continue;
		if (!memcmp(filter, info->filter.filter.data, len))
			return info;
	}

	return NULL;
}

/*
 * Pstree traversal callback: fetch every seccomp filter attached to
 * @item with PTRACE_SECCOMP_GET_FILTER and record them in the global
 * filters array. Dead tasks and tasks not in SECCOMP_MODE_FILTER are
 * skipped.
 *
 * Returns 0 on success and -1 on failure.
 */
static int collect_filter_for_pstree(struct pstree_item *item)
{
	struct seccomp_info *infos = NULL, *cursor;
	int info_count, i, ret = -1;
	struct sock_filter buf[BPF_MAXINSNS];
	void *m;

	if (item->pid->state == TASK_DEAD ||
	    dmpi(item)->pi_creds->s.seccomp_mode != SECCOMP_MODE_FILTER)
		return 0;

	/* Ask for filter #0, #1, ... until the kernel reports ENOENT */
	for (i = 0; true; i++) {
		int len;
		struct seccomp_info *info, *inherited = NULL;

		len = ptrace(PTRACE_SECCOMP_GET_FILTER, item->pid->real, i, buf);
		if (len < 0) {
			if (errno == ENOENT) {
				/* end of the search */
				BUG_ON(i == 0);
				goto save_infos;
			} else if (errno == EINVAL) {
				pr_err("dumping seccomp infos not supported\n");
				goto out;
			} else {
				pr_perror("couldn't dump seccomp filter");
				goto out;
			}
		}

		/* Reuse the parent's record when it holds the same program */
		inherited = find_inherited(item->parent, buf, len);
		if (inherited) {
			bool found = false;

			/* Small sanity check: if infos is already populated,
			 * we should have inherited that filter too.
*/
			for (cursor = infos; cursor; cursor = cursor->prev) {
				if (inherited->prev== cursor) {
					found = true;
					break;
				}
			}

			BUG_ON(!found);

			infos = inherited;
			continue;
		}

		/* A filter not seen before: copy the program out of @buf */
		info = xmalloc(sizeof(*info));
		if (!info)
			goto out;
		seccomp_filter__init(&info->filter);

		/* The image stores the program length in bytes */
		info->filter.filter.len = len * sizeof(struct sock_filter);
		info->filter.filter.data = xmalloc(info->filter.filter.len);
		if (!info->filter.filter.data) {
			xfree(info);
			goto out;
		}

		memcpy(info->filter.filter.data, buf, info->filter.filter.len);

		/* Chain the new record in front of the ones fetched so far */
		info->prev = infos;
		infos = info;
	}

save_infos:
	info_count = i;

	m = xrealloc(filters, sizeof(*filters) * (next_filter_id + info_count));
	if (!m)
		goto out;
	filters = m;

	/* Hand out ids from the top of the new range downwards along ->prev */
	for (cursor = infos, i = info_count + next_filter_id - 1;
	     i >= next_filter_id; i--) {
		BUG_ON(!cursor);
		cursor->id = i;
		filters[i] = cursor;
		cursor = cursor->prev;
	}

	next_filter_id += info_count;

	dmpi(item)->pi_creds->last_filter = infos->id;

	/* Don't free the part of the tree we just successfully acquired */
	infos = NULL;
	ret = 0;
out:
	/* On failure release every record still reachable from @infos */
	while (infos) {
		struct seccomp_info *freeme = infos;
		infos = infos->prev;
		xfree(freeme->filter.filter.data);
		xfree(freeme);
	}

	return ret;
}

/*
 * Write all collected filters into the seccomp image and release
 * the collected records.
 */
static int dump_seccomp_filters(void)
{
	SeccompEntry se = SECCOMP_ENTRY__INIT;
	int ret = -1, i;

	/* If we didn't collect any filters, don't create a seccomp image at all.
*/ if (next_filter_id == 0) return 0; se.seccomp_filters = xzalloc(sizeof(*se.seccomp_filters) * next_filter_id); if (!se.seccomp_filters) return -1; se.n_seccomp_filters = next_filter_id; for (i = 0; i < next_filter_id; i++) { SeccompFilter *sf; struct seccomp_info *cur = filters[i]; sf = se.seccomp_filters[cur->id] = &cur->filter; if (cur->prev) { sf->has_prev = true; sf->prev = cur->prev->id; } } ret = pb_write_one(img_from_set(glob_imgset, CR_FD_SECCOMP), &se, PB_SECCOMP); xfree(se.seccomp_filters); for (i = 0; i < next_filter_id; i++) { struct seccomp_info *freeme = filters[i]; xfree(freeme->filter.filter.data); xfree(freeme); } xfree(filters); return ret; } int collect_seccomp_filters(void) { if (preorder_pstree_traversal(root_item, collect_filter_for_pstree) < 0) return -1; if (dump_seccomp_filters()) return -1; return 0; } /* Populated on restore by prepare_seccomp_filters */ static SeccompEntry *se; int prepare_seccomp_filters(void) { struct cr_img *img; int ret; img = open_image(CR_FD_SECCOMP, O_RSTR); if (!img) return -1; ret = pb_read_one_eof(img, &se, PB_SECCOMP); close_image(img); if (ret <= 0) return 0; /* there were no filters */ BUG_ON(!se); return 0; } int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta) { SeccompFilter *sf = NULL; struct sock_fprog *arr = NULL; void *filter_data = NULL; int ret = -1, i, n_filters; size_t filter_size = 0; ta->seccomp_filters_n = 0; if (!core->tc->has_seccomp_filter) return 0; ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE); BUG_ON(core->tc->seccomp_filter > se->n_seccomp_filters); sf = se->seccomp_filters[core->tc->seccomp_filter]; while (1) { ta->seccomp_filters_n++; filter_size += sf->filter.len; if (!sf->has_prev) break; sf = se->seccomp_filters[sf->prev]; } n_filters = ta->seccomp_filters_n; arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE); if (!arr) goto out; filter_data = &arr[n_filters]; sf = 
se->seccomp_filters[core->tc->seccomp_filter]; for (i = 0; i < n_filters; i++) { struct sock_fprog *fprog = &arr[i]; BUG_ON(sf->filter.len % sizeof(struct sock_filter)); fprog->len = sf->filter.len / sizeof(struct sock_filter); memcpy(filter_data, sf->filter.data, sf->filter.len); filter_data += sf->filter.len; sf = se->seccomp_filters[sf->prev]; } ret = 0; out: seccomp_entry__free_unpacked(se, NULL); return ret; } criu-3.6/criu/seize.c000066400000000000000000000455261317335042600145710ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "int.h" #include "common/compiler.h" #include "cr_options.h" #include "cr-errno.h" #include "pstree.h" #include "criu-log.h" #include #include "proc_parse.h" #include "seize.h" #include "stats.h" #include "xmalloc.h" #include "util.h" #include #define NR_ATTEMPTS 5 static const char frozen[] = "FROZEN"; static const char freezing[] = "FREEZING"; static const char thawed[] = "THAWED"; static const char *get_freezer_state(int fd) { char state[32]; int ret; BUILD_BUG_ON((sizeof(state) < sizeof(frozen)) || (sizeof(state) < sizeof(freezing)) || (sizeof(state) < sizeof(thawed))); lseek(fd, 0, SEEK_SET); ret = read(fd, state, sizeof(state) - 1); if (ret <= 0) { pr_perror("Unable to get a current state"); goto err; } if (state[ret - 1] == '\n') state[ret - 1] = 0; else state[ret] = 0; pr_debug("freezer.state=%s\n", state); if (strcmp(state, frozen) == 0) return frozen; else if (strcmp(state, freezing) == 0) return freezing; else if (strcmp(state, thawed) == 0) return thawed; pr_err("Unknown freezer state: %s\n", state); err: return NULL; } static bool freezer_thawed; const char *get_real_freezer_state(void) { return freezer_thawed ? 
thawed : frozen;
}

/*
 * Write "FROZEN" back into the freezer.state file of the freeze
 * cgroup. Nothing is done when no freeze cgroup is configured or when
 * freezer_thawed is set.
 *
 * Returns 0 on success and -1 on failure.
 */
static int freezer_restore_state(void)
{
	int fd;
	char path[PATH_MAX];

	if (!opts.freeze_cgroup || freezer_thawed)
		return 0;

	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
	fd = open(path, O_RDWR);
	if (fd < 0) {
		pr_perror("Unable to open %s", path);
		return -1;
	}

	/* The terminating NUL is written too, hence sizeof() and not strlen() */
	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
		pr_perror("Unable to freeze tasks");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

/* A number of tasks in a freezer cgroup which are not going to be dumped */
static int processes_to_wait;
static pid_t *processes_to_wait_pids;

/*
 * Interrupt every task listed in the "tasks" file of the cgroup
 * directory @root_path and then recurse into its sub-directories.
 * @state is the freezer state the caller observed.
 *
 * Returns 0 on success, -EAGAIN when the caller should wait and
 * retry, and -1 on other failures.
 */
static int seize_cgroup_tree(char *root_path, const char *state)
{
	DIR *dir;
	struct dirent *de;
	char path[PATH_MAX];
	FILE *f;

	/*
	 * New tasks can appear while a freezer state isn't
	 * frozen, so we need to catch all new tasks.
	 */
	snprintf(path, sizeof(path), "%s/tasks", root_path);
	f = fopen(path, "r");
	if (f == NULL) {
		pr_perror("Unable to open %s", path);
		return -1;
	}
	/* @path is reused as the line buffer from here on */
	while (fgets(path, sizeof(path), f)) {
		pid_t pid;
		int ret;

		pid = atoi(path);

		/* Here we are going to skip tasks which are already traced. */
		ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
		if (ret == 0)
			continue;
		if (errno != ESRCH) {
			pr_perror("Unexpected error");
			fclose(f);
			return -1;
		}

		if (!compel_interrupt_task(pid)) {
			pr_debug("SEIZE %d: success\n", pid);
			processes_to_wait++;
		} else if (state == frozen) {
			char buf[] = "/proc/XXXXXXXXXX/exe";
			struct stat st;

			/* skip kernel threads */
			snprintf(buf, sizeof(buf), "/proc/%d/exe", pid);
			if (stat(buf, &st) == -1 && errno == ENOENT)
				continue;
			/*
			 * fails when meets a zombie, or exiting process:
			 * there is a small race in a kernel -- the process
			 * may start exiting and we are trying to freeze it
			 * before it completes the exit procedure. The caller
			 * simply should wait a bit and try freezing again.
*/ pr_err("zombie found while seizing\n"); fclose(f); return -EAGAIN; } } fclose(f); dir = opendir(root_path); if (!dir) { pr_perror("Unable to open %s", root_path); return -1; } while ((de = readdir(dir))) { struct stat st; int ret; if (dir_dots(de)) continue; sprintf(path, "%s/%s", root_path, de->d_name); if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) { pr_perror("stat of %s failed", path); closedir(dir); return -1; } if (!S_ISDIR(st.st_mode)) continue; ret = seize_cgroup_tree(path, state); if (ret < 0) { closedir(dir); return ret; } } closedir(dir); return 0; } /* * A freezer cgroup can contain tasks which will not be dumped * and we need to wait them, because the are interupted them by ptrace. */ static int freezer_wait_processes() { int i; processes_to_wait_pids = xmalloc(sizeof(pid_t) * processes_to_wait); if (processes_to_wait_pids == NULL) return -1; for (i = 0; i < processes_to_wait; i++) { int status; pid_t pid; /* * Here we are going to skip tasks which are already traced. * Ptraced tasks looks like children for us, so if * a task isn't ptraced yet, waitpid() will return a error. 
*/ pid = waitpid(-1, &status, 0); if (pid < 0) { pr_perror("Unable to wait processes"); xfree(processes_to_wait_pids); processes_to_wait_pids = NULL; return -1; } pr_warn("Unexpected process %d in the freezer cgroup (status 0x%x)\n", pid, status); processes_to_wait_pids[i] = pid; } return 0; } static int freezer_detach(void) { int i; if (!opts.freeze_cgroup) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { pid_t pid = processes_to_wait_pids[i]; int status, save_errno; if (ptrace(PTRACE_DETACH, pid, NULL, NULL) == 0) continue; save_errno = errno; /* A process may be killed by SIGKILL */ if (wait4(pid, &status, __WALL, NULL) == pid) { pr_warn("The %d process returned 0x %x\n", pid, status); continue; } errno = save_errno; pr_perror("Unable to detach from %d", pid); } return 0; } static int log_unfrozen_stacks(char *root) { DIR *dir; struct dirent *de; char path[PATH_MAX]; FILE *f; snprintf(path, sizeof(path), "%s/tasks", root); f = fopen(path, "r"); if (f == NULL) { pr_perror("Unable to open %s", path); return -1; } while (fgets(path, sizeof(path), f)) { pid_t pid; int ret, stack; char stackbuf[2048]; pid = atoi(path); stack = open_proc(pid, "stack"); if (stack < 0) { pr_err("`- couldn't log %d's stack\n", pid); fclose(f); return -1; } ret = read(stack, stackbuf, sizeof(stackbuf) - 1); close(stack); if (ret < 0) { pr_perror("couldn't read %d's stack", pid); fclose(f); return -1; } stackbuf[ret] = '\0'; pr_debug("Task %d has stack:\n%s", pid, stackbuf); } fclose(f); dir = opendir(root); if (!dir) { pr_perror("Unable to open %s", root); return -1; } while ((de = readdir(dir))) { struct stat st; if (dir_dots(de)) continue; sprintf(path, "%s/%s", root, de->d_name); if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) { pr_perror("stat of %s failed", path); closedir(dir); return -1; } if (!S_ISDIR(st.st_mode)) continue; if (log_unfrozen_stacks(path) < 0) { closedir(dir); return -1; } } closedir(dir); return 0; } static int freeze_processes(void) { 
int fd, exit_code = -1;
	char path[PATH_MAX];
	const char *state = thawed;

	/*
	 * NOTE(review): step_ms is in milliseconds while the timeout is
	 * scaled by 1000000 below; if opts.timeout is in seconds this
	 * yields far more attempts than timeout / step -- confirm the
	 * intended units.
	 */
	static const unsigned long step_ms = 100;
	unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms;
	unsigned long i = 0;

	/* Sleep interval between two polls: step_ms expressed in nanoseconds */
	const struct timespec req = {
		.tv_nsec	= step_ms * 1000000,
		.tv_sec		= 0,
	};

	if (unlikely(!nr_attempts)) {
		/*
		 * If timeout is turned off, lets
		 * wait for at least 10 seconds.
		 */
		nr_attempts = (10 * 1000000) / step_ms;
	}

	pr_debug("freezing processes: %lu attempst with %lu ms steps\n",
		 nr_attempts, step_ms);

	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
	fd = open(path, O_RDWR);
	if (fd < 0) {
		pr_perror("Unable to open %s", path);
		return -1;
	}
	state = get_freezer_state(fd);
	if (!state) {
		close(fd);
		return -1;
	}
	if (state == thawed) {
		/* Remember that the cgroup was thawed when we found it */
		freezer_thawed = true;

		lseek(fd, 0, SEEK_SET);
		if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
			pr_perror("Unable to freeze tasks");
			close(fd);
			return -1;
		}

		/*
		 * Wait the freezer to complete before
		 * processing tasks. They might be exiting
		 * before freezing complete so we should
		 * not read @tasks pids while freezer in
		 * transition stage.
		 */
		for (; i <= nr_attempts; i++) {
			state = get_freezer_state(fd);
			if (!state) {
				close(fd);
				return -1;
			}

			if (state == frozen)
				break;
			if (alarm_timeouted())
				goto err;
			nanosleep(&req, NULL);
		}

		/* The loop ran out of attempts without seeing FROZEN */
		if (i > nr_attempts) {
			pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
			if (!pr_quelled(LOG_DEBUG))
				log_unfrozen_stacks(opts.freeze_cgroup);

			goto err;
		}

		pr_debug("freezing processes: %lu attempts done\n", i);
	}

	/*
	 * Pay attention on @i variable -- it's continuation.
*/ for (; i <= nr_attempts; i++) { exit_code = seize_cgroup_tree(opts.freeze_cgroup, state); if (exit_code == -EAGAIN) { if (alarm_timeouted()) goto err; nanosleep(&req, NULL); } else break; } err: if (exit_code == 0 || freezer_thawed) { lseek(fd, 0, SEEK_SET); if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) { pr_perror("Unable to thaw tasks"); exit_code = -1; } } if (close(fd)) { pr_perror("Unable to thaw tasks"); return -1; } return exit_code; } static inline bool child_collected(struct pstree_item *i, pid_t pid) { struct pstree_item *c; list_for_each_entry(c, &i->children, sibling) if (c->pid->real == pid) return true; return false; } static int collect_task(struct pstree_item *item); static int collect_children(struct pstree_item *item) { pid_t *ch; int ret, i, nr_children, nr_inprogress; ret = parse_children(item->pid->real, &ch, &nr_children); if (ret < 0) return ret; nr_inprogress = 0; for (i = 0; i < nr_children; i++) { struct pstree_item *c; struct proc_status_creds *creds; pid_t pid = ch[i]; /* Is it already frozen? */ if (child_collected(item, pid)) continue; nr_inprogress++; if (alarm_timeouted()) { ret = -1; goto free; } pr_info("Seized task %d, state %d\n", pid, ret); c = alloc_pstree_item(); if (c == NULL) { ret = -1; goto free; } if (!opts.freeze_cgroup) /* fails when meets a zombie */ compel_interrupt_task(pid); creds = xzalloc(sizeof(*creds)); if (!creds) { ret = -1; goto free; } ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds->s, NULL); if (ret < 0) { /* * Here is a race window between parse_children() and seize(), * so the task could die for these time. * Don't worry, will try again on the next attempt. The number * of attempts is restricted, so it will exit if something * really wrong. 
*/ ret = 0; xfree(c); xfree(creds); continue; } if (ret == TASK_ZOMBIE) ret = TASK_DEAD; else processes_to_wait--; dmpi(c)->pi_creds = creds; c->pid->real = pid; c->parent = item; c->pid->state = ret; list_add_tail(&c->sibling, &item->children); /* Here is a recursive call (Depth-first search) */ ret = collect_task(c); if (ret < 0) goto free; } free: xfree(ch); return ret < 0 ? ret : nr_inprogress; } static void unseize_task_and_threads(const struct pstree_item *item, int st) { int i; if (item->pid->state == TASK_DEAD) return; /* * The st is the state we want to switch tasks into, * the item->state is the state task was in when we seized one. */ compel_resume_task(item->pid->real, item->pid->state, st); if (st == TASK_DEAD) return; for (i = 1; i < item->nr_threads; i++) if (ptrace(PTRACE_DETACH, item->threads[i].real, NULL, NULL)) pr_perror("Unable to detach from %d", item->threads[i].real); } static void pstree_wait(struct pstree_item *root_item) { struct pstree_item *item = root_item; int pid, status, i; for_each_pstree_item(item) { if (item->pid->state == TASK_DEAD) continue; for (i = 0; i < item->nr_threads; i++) { pid = wait4(-1, &status, __WALL, NULL); if (pid < 0) { pr_perror("wait4 failed"); break; } else { if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { pr_err("Unexpected exit code %d of %d: %s\n", status, pid, strsignal(status)); BUG(); } } } } pid = wait4(-1, &status, __WALL, NULL); if (pid > 0) { pr_err("Unexpected child %d\n", pid); BUG(); } } void pstree_switch_state(struct pstree_item *root_item, int st) { struct pstree_item *item = root_item; if (!root_item) return; if (st != TASK_DEAD) freezer_restore_state(); /* * We need to detach from all processes before waiting the init * process, because one of these processes may collect processes from a * target pid namespace. The pid namespace is destroyed only when all * processes have been killed and collected. 
*/
	freezer_detach();

	pr_info("Unfreezing tasks into %d\n", st);
	for_each_pstree_item(item)
		unseize_task_and_threads(item, st);

	/* Killed tasks have to be reaped as well */
	if (st == TASK_DEAD)
		pstree_wait(root_item);
}

/* Real pid of @item's parent, or -1 when @item has no parent */
static pid_t item_ppid(const struct pstree_item *item)
{
	item = item->parent;
	return item ? item->pid->real : -1;
}

/* Tell whether thread @tid is already recorded for task @i */
static inline bool thread_collected(struct pstree_item *i, pid_t tid)
{
	int t;

	if (i->pid->real == tid) /* thread leader is collected as task */
		return true;

	for (t = 0; t < i->nr_threads; t++)
		if (tid == i->threads[t].real)
			return true;

	return false;
}

/*
 * Compare the seccomp state of two credential sets. Returns true when
 * both the seccomp mode and the last filter id match; otherwise the
 * two sets are logged at debug level and false is returned.
 */
static bool creds_dumpable(struct proc_status_creds *parent,
				struct proc_status_creds *child)
{
	/*
	 * - seccomp filters should be passed via
	 *   semantic comparison (FIXME) but for
	 *   now we require them to be exactly
	 *   identical
	 */
	if (parent->s.seccomp_mode != child->s.seccomp_mode ||
	    parent->last_filter != child->last_filter) {
		if (!pr_quelled(LOG_DEBUG)) {
			pr_debug("Creds undumpable (parent:child)\n"
				 " uids: %d:%d %d:%d %d:%d %d:%d\n"
				 " gids: %d:%d %d:%d %d:%d %d:%d\n"
				 " state: %d:%d"
				 " ppid: %d:%d\n"
				 " shdpnd: %llu:%llu\n"
				 " seccomp_mode: %d:%d\n"
				 " last_filter: %u:%u\n",
				 parent->uids[0], child->uids[0],
				 parent->uids[1], child->uids[1],
				 parent->uids[2], child->uids[2],
				 parent->uids[3], child->uids[3],
				 parent->gids[0], child->gids[0],
				 parent->gids[1], child->gids[1],
				 parent->gids[2], child->gids[2],
				 parent->gids[3], child->gids[3],
				 parent->s.state, child->s.state,
				 parent->s.ppid, child->s.ppid,
				 parent->s.shdpnd, child->s.shdpnd,
				 parent->s.seccomp_mode, child->s.seccomp_mode,
				 parent->last_filter, child->last_filter);
		}
		return false;
	}

	return true;
}

/*
 * One pass over the threads of @item: every thread that is not yet
 * recorded in @item->threads is seized and appended to it.
 */
static int collect_threads(struct pstree_item *item)
{
	struct pid *threads = NULL;
	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;

	ret = parse_threads(item->pid->real, &threads, &nr_threads);
	if (ret < 0)
		goto err;

	if ((item->pid->state == TASK_DEAD) && (nr_threads > 1)) {
		pr_err("Zombies with threads are not supported\n");
		goto err;
	}

	/* The number of threads can't be
less than already frozen */ item->threads = xrealloc(item->threads, nr_threads * sizeof(struct pid)); if (item->threads == NULL) return -1; if (item->nr_threads == 0) { item->threads[0].real = item->pid->real; item->nr_threads = 1; item->threads[0].item = NULL; } nr_inprogress = 0; for (i = 0; i < nr_threads; i++) { pid_t pid = threads[i].real; struct proc_status_creds t_creds = {}; if (thread_collected(item, pid)) continue; nr_inprogress++; pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); if (!opts.freeze_cgroup && compel_interrupt_task(pid)) continue; ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL); if (ret < 0) { /* * Here is a race window between parse_threads() and seize(), * so the task could die for these time. * Don't worry, will try again on the next attempt. The number * of attempts is restricted, so it will exit if something * really wrong. */ continue; } if (ret == TASK_ZOMBIE) ret = TASK_DEAD; else processes_to_wait--; BUG_ON(item->nr_threads + 1 > nr_threads); item->threads[item->nr_threads].real = pid; item->threads[item->nr_threads].item = NULL; item->nr_threads++; if (ret == TASK_DEAD) { pr_err("Zombie thread not supported\n"); goto err; } if (!creds_dumpable(dmpi(item)->pi_creds, &t_creds)) goto err; if (ret == TASK_STOPPED) { nr_stopped++; } } if (nr_stopped && nr_stopped != nr_inprogress) { pr_err("Individually stopped threads not supported\n"); goto err; } xfree(threads); return nr_inprogress; err: xfree(threads); return -1; } static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_item *)) { int attempts = NR_ATTEMPTS, nr_inprogress = 1; if (opts.freeze_cgroup) attempts = 1; /* * While we scan the proc and seize the children/threads * new ones can appear (with clone(CLONE_PARENT) or with * pthread_create). Thus, after one go, we need to repeat * the scan-and-freeze again collecting new arrivals. 
As
	 * new guys may appear again we do NR_ATTEMPTS passes and
	 * fail to seize the item if new tasks/threads still
	 * appear.
	 */

	while (nr_inprogress > 0 && attempts >= 0) {
		attempts--;
		nr_inprogress = collect(item);
	}

	pr_info("Collected (%d attempts, %d in_progress)\n", attempts, nr_inprogress);

	/*
	 * We may fail to collect items or run out of attempts.
	 * In the former case nr_inprogress will be negative, in
	 * the latter -- positive. Thus it's enough just to check
	 * for "no more new stuff" and say "we're OK" if so.
	 */

	return (nr_inprogress == 0) ? 0 : -1;
}

/*
 * Collect the threads of @item, then its children (which recurses
 * back into this function through collect_children()), and finally
 * allocate the cores for the item.
 *
 * Returns 0 on success and -1 on failure.
 */
static int collect_task(struct pstree_item *item)
{
	int ret;

	ret = collect_loop(item, collect_threads);
	if (ret < 0)
		goto err_close;

	/* Depth-first search (DFS) is used for traversing a process tree. */
	ret = collect_loop(item, collect_children);
	if (ret < 0)
		goto err_close;

	if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) {
		pr_err("Zombie with children?! O_o Run, run, run!\n");
		goto err_close;
	}

	if (pstree_alloc_cores(item))
		goto err_close;

	pr_info("Collected %d in %d state\n", item->pid->real, item->pid->state);
	return 0;

err_close:
	close_pid_proc();
	return -1;
}

/*
 * Seize the root task and collect the whole process tree below it.
 */
int collect_pstree(void)
{
	pid_t pid = root_item->pid->real;
	int ret = -1;
	struct proc_status_creds *creds;

	timing_start(TIME_FREEZING);

	/*
	 * wait4() may hang for some reason. Enable timer and fire SIGALRM
	 * if timeout reached. SIGALRM handler will do the necessary
	 * cleanups and terminate current process.
/*
 * Iterate over every shmem_info stored in shmems_hash.
 *
 * @_i:  integer lvalue used as the bucket index
 * @_si: struct shmem_info * cursor
 *
 * The bucket loop is driven by the @_i argument itself, so the caller
 * may name its index variable freely.
 */
#define for_each_shmem(_i, _si)						\
	for ((_i) = 0; (_i) < SHMEM_HASH_SIZE; (_i)++)			\
		hlist_for_each_entry(_si, &shmems_hash[(_i)], h)
This guy creates anon shmem on restore and * from this the shmem is read on dump */ int pid; unsigned long size; union { struct { /* For restore */ /* * Descriptor by which this shmem is opened * by the creator */ int fd; /* * 0. lock is initilized to zero * 1. the master opens a descriptor and set lock to 1 * 2. slaves open their descriptors and increment lock * 3. the master waits all slaves on lock. After that * it can close the descriptor. */ futex_t lock; /* * Here is a problem, that we don't know, which process will restore * an region. Each time when we found a process with a smaller pid, * we reset self_count, so we can't have only one counter. */ int count; /* the number of regions */ int self_count; /* the number of regions, which belongs to "pid" */ }; struct { /* For sysvipc restore */ struct list_head att; /* list of shmem_sysv_att-s */ int want_write; }; struct { /* For dump */ unsigned long start; unsigned long end; unsigned long *pstate_map; }; }; }; struct shmem_sysv_att { struct list_head l; VmaEntry *first; unsigned long prev_end; }; /* This is the "pid that will restore shmem" value for sysv */ #define SYSVIPC_SHMEM_PID (-1) static inline struct hlist_head *shmem_chain(unsigned long shmid) { return &shmems_hash[shmid % SHMEM_HASH_SIZE]; } static void shmem_hash_add(struct shmem_info *si) { struct hlist_head *chain; chain = shmem_chain(si->shmid); hlist_add_head(&si->h, chain); } static struct shmem_info *shmem_find(unsigned long shmid) { struct hlist_head *chain; struct shmem_info *si; chain = shmem_chain(shmid); hlist_for_each_entry(si, chain, h) if (si->shmid == shmid) return si; return NULL; } #define PST_DONT_DUMP 0 #define PST_DUMP 1 #define PST_ZERO 2 #define PST_DIRTY 3 #define PST_BITS 2 #define PST_BIT0_IX(pfn) ((pfn) * PST_BITS) #define PST_BIT1_IX(pfn) (PST_BIT0_IX(pfn) + 1) /* * Disable pagemap based shmem changes tracking by default * because it has bugs in implementation - * process can map shmem page, change it and unmap it. 
* We won't observe any changes in such pagemaps during dump. */ static bool is_shmem_tracking_en(void) { static bool is_inited = false; static bool is_enabled = false; if (!is_inited) { is_enabled = (bool)getenv("CRIU_TRACK_SHMEM"); is_inited = true; if (is_enabled) pr_msg("Turn anon shmem tracking on via env\n"); } return is_enabled; } static unsigned int get_pstate(unsigned long *pstate_map, unsigned long pfn) { unsigned int bit0 = test_bit(PST_BIT0_IX(pfn), pstate_map) ? 1 : 0; unsigned int bit1 = test_bit(PST_BIT1_IX(pfn), pstate_map) ? 1 : 0; return (bit1 << 1) | bit0; } static void set_pstate(unsigned long *pstate_map, unsigned long pfn, unsigned int pstate) { if (pstate & 1) set_bit(PST_BIT0_IX(pfn), pstate_map); if (pstate & 2) set_bit(PST_BIT1_IX(pfn), pstate_map); } static int expand_shmem(struct shmem_info *si, unsigned long new_size) { unsigned long nr_pages, nr_map_items, map_size, nr_new_map_items, new_map_size, old_size; old_size = si->size; si->size = new_size; if (!is_shmem_tracking_en()) return 0; nr_pages = DIV_ROUND_UP(old_size, PAGE_SIZE); nr_map_items = BITS_TO_LONGS(nr_pages * PST_BITS); map_size = nr_map_items * sizeof(*si->pstate_map); nr_pages = DIV_ROUND_UP(new_size, PAGE_SIZE); nr_new_map_items = BITS_TO_LONGS(nr_pages * PST_BITS); new_map_size = nr_new_map_items * sizeof(*si->pstate_map); BUG_ON(new_map_size < map_size); si->pstate_map = xrealloc(si->pstate_map, new_map_size); if (!si->pstate_map) return -1; memzero(si->pstate_map + nr_map_items, new_map_size - map_size); return 0; } static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; if (!is_shmem_tracking_en()) return; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { if (!should_dump_page(vma, map[vma_pfn])) continue; shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); if (map[vma_pfn] & PME_SOFT_DIRTY) set_pstate(si->pstate_map, shmem_pfn, 
PST_DIRTY); else if (page_is_zero(map[vma_pfn])) set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } } int collect_sysv_shmem(unsigned long shmid, unsigned long size) { struct shmem_info *si; /* * Tasks will not modify this object, so don't * shmalloc() as we do it for anon shared mem */ si = malloc(sizeof(*si)); if (!si) return -1; si->shmid = shmid; si->pid = SYSVIPC_SHMEM_PID; si->size = size; si->want_write = 0; INIT_LIST_HEAD(&si->att); shmem_hash_add(si); pr_info("Collected SysV shmem %lx, size %ld\n", si->shmid, si->size); return 0; } int fixup_sysv_shmems(void) { int i; struct shmem_info *si; struct shmem_sysv_att *att; for_each_shmem(i, si) { /* It can be anon shmem */ if (si->pid != SYSVIPC_SHMEM_PID) continue; list_for_each_entry(att, &si->att, l) { /* * Same thing is checked in open_shmem_sysv() for * intermediate holes. */ if (att->first->start + round_up(si->size, page_size()) != att->prev_end) { pr_err("Sysv shmem %lx with tail hole not supported\n", si->shmid); return -1; } /* * See comment in open_shmem_sysv() about this PROT_EXEC */ if (si->want_write) att->first->prot |= PROT_EXEC; } } return 0; } static int open_shmem_sysv(int pid, struct vma_area *vma) { VmaEntry *vme = vma->e; struct shmem_info *si; struct shmem_sysv_att *att; uint64_t ret_fd; si = shmem_find(vme->shmid); if (!si) { pr_err("Can't find sysv shmem for %"PRIx64"\n", vme->shmid); return -1; } if (si->pid != SYSVIPC_SHMEM_PID) { pr_err("SysV shmem vma 0x%"PRIx64" points to anon vma %lx\n", vme->start, si->shmid); return -1; } /* * We can have a chain of VMAs belonging to the same * sysv shmem segment all with different access rights * (ro and rw). But single shmat() system call attaches * the whole segment regardless of the actual mapping * size. This can be achieved by attaching a segment * and then write-protecting its parts. 
* * So, to restore this thing we note the very first * area of the segment and make it restore the whole * thing. All the subsequent ones will carry the sign * telling the restorer to omit shmat and only do the * ro protection. Yes, it may happen that some sysv * shmem vma-s sit in the list (and restorer's array) * for no use. * * Holes in between are not handled now, as well as * the hole at the end (see fixup_sysv_shmems). * * One corner case. At shmat() time we need to know * whether to create the segment rw or ro, but the * first vma can have different protection. So the * segment ro-ness is marked with PROT_EXEC bit in * the first vma. Unfortunatelly, we only know this * after we scan all the vmas, so this bit is set * at the end in fixup_sysv_shmems(). */ if (vme->pgoff == 0) { att = xmalloc(sizeof(*att)); if (!att) return -1; att->first = vme; list_add(&att->l, &si->att); ret_fd = si->shmid; } else { att = list_first_entry(&si->att, struct shmem_sysv_att, l); if (att->prev_end != vme->start) { pr_err("Sysv shmem %lx with a hole not supported\n", si->shmid); return -1; } if (vme->pgoff != att->prev_end - att->first->start) { pr_err("Sysv shmem %lx with misordered attach chunks\n", si->shmid); return -1; } /* * Value that doesn't (shouldn't) match with any real * sysv shmem ID (thus it cannot be 0, as shmem id can) * and still is not negative to prevent prepare_vmas() from * treating it as error. */ ret_fd = SYSV_SHMEM_SKIP_FD; } pr_info("Note 0x%"PRIx64"-0x%"PRIx64" as %lx sysvshmem\n", vme->start, vme->end, si->shmid); att->prev_end = vme->end; if (!vme->has_fdflags || vme->fdflags == O_RDWR) /* * We can't look at vma->prot & PROT_WRITE as all this stuff * can be read-protected. If !has_fdflags these are old images * and ... 
we have no other choice other than make it with * maximum access :( */ si->want_write = 1; vme->fd = ret_fd; return 0; } static int open_shmem(int pid, struct vma_area *vma); int collect_shmem(int pid, struct vma_area *vma) { VmaEntry *vi = vma->e; unsigned long size = vi->pgoff + vi->end - vi->start; struct shmem_info *si; if (vma_entry_is(vi, VMA_AREA_SYSVIPC)) { vma->vm_open = open_shmem_sysv; return 0; } vma->vm_open = open_shmem; si = shmem_find(vi->shmid); if (si) { if (si->pid == SYSVIPC_SHMEM_PID) { pr_err("Shmem %"PRIx64" already collected as SYSVIPC\n", vi->shmid); return -1; } if (si->size < size) si->size = size; si->count++; /* * Only the shared mapping with a lowest * pid will be created in real, other processes * will wait until the kernel propagate this mapping * into /proc */ if (!pid_rst_prio(pid, si->pid)) { if (si->pid == pid) si->self_count++; return 0; } si->pid = pid; si->self_count = 1; return 0; } si = shmalloc(sizeof(struct shmem_info)); if (!si) return -1; pr_info("Add new shmem 0x%"PRIx64" (%#016"PRIx64"-%#016"PRIx64")\n", vi->shmid, vi->start, vi->end); si->shmid = vi->shmid; si->pid = pid; si->size = size; si->fd = -1; si->count = 1; si->self_count = 1; futex_init(&si->lock); shmem_hash_add(si); return 0; } static int shmem_wait_and_open(struct shmem_info *si, VmaEntry *vi) { char path[128]; int ret; pr_info("Waiting for the %lx shmem to appear\n", si->shmid); futex_wait_while(&si->lock, 0); snprintf(path, sizeof(path), "/proc/%d/fd/%d", si->pid, si->fd); pr_info("Opening shmem [%s] \n", path); ret = open_proc_rw(si->pid, "fd/%d", si->fd); futex_inc_and_wake(&si->lock); if (ret < 0) return -1; vi->fd = ret; return 0; } static int do_restore_shmem_content(void *addr, unsigned long size, unsigned long shmid) { int ret = 0; struct page_read pr; ret = open_page_read(shmid, &pr, PR_SHMEM); if (ret <= 0) return -1; while (1) { unsigned long vaddr; unsigned nr_pages; ret = pr.advance(&pr); if (ret <= 0) break; vaddr = (unsigned 
long)decode_pointer(pr.pe->vaddr); nr_pages = pr.pe->nr_pages; if (vaddr + nr_pages * PAGE_SIZE > size) break; pr.read_pages(&pr, vaddr, nr_pages, addr + vaddr, 0); } pr.close(&pr); return ret; } static int restore_shmem_content(void *addr, struct shmem_info *si) { return do_restore_shmem_content(addr, si->size, si->shmid); } int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shmid) { return do_restore_shmem_content(addr, round_up(size, PAGE_SIZE), shmid); } static int open_shmem(int pid, struct vma_area *vma) { VmaEntry *vi = vma->e; struct shmem_info *si; void *addr = MAP_FAILED; int f = -1; int flags; si = shmem_find(vi->shmid); pr_info("Search for %#016"PRIx64" shmem 0x%"PRIx64" %p/%d\n", vi->start, vi->shmid, si, si ? si->pid : -1); if (!si) { pr_err("Can't find my shmem %#016"PRIx64"\n", vi->start); return -1; } BUG_ON(si->pid == SYSVIPC_SHMEM_PID); if (si->pid != pid) return shmem_wait_and_open(si, vi); if (si->fd != -1) { f = dup(si->fd); if (f < 0) { pr_perror("Can't dup shmem fd"); return -1; } goto out; } flags = MAP_SHARED; if (kdat.has_memfd) { f = syscall(SYS_memfd_create, "", 0); if (f < 0) { pr_perror("Unable to create memfd"); goto err; } if (ftruncate(f, si->size)) { pr_perror("Unable to truncate memfd"); goto err; } flags |= MAP_FILE; } else flags |= MAP_ANONYMOUS; /* * The following hack solves problems: * vi->pgoff may be not zero in a target process. * This mapping may be mapped more then once. * The restorer doesn't have snprintf. 
* Here is a good place to restore content */ addr = mmap(NULL, si->size, PROT_WRITE | PROT_READ, flags, f, 0); if (addr == MAP_FAILED) { pr_err("Can't mmap shmid=0x%"PRIx64" size=%ld\n", vi->shmid, si->size); goto err; } if (restore_shmem_content(addr, si) < 0) { pr_err("Can't restore shmem content\n"); goto err; } if (f == -1) { f = open_proc_rw(getpid(), "map_files/%lx-%lx", (unsigned long) addr, (unsigned long) addr + si->size); if (f < 0) goto err; } munmap(addr, si->size); si->fd = f; /* Send signal to slaves, that they can open fd for this shmem */ futex_inc_and_wake(&si->lock); /* * All other regions in this process will duplicate * the file descriptor, so we don't wait them. */ futex_wait_until(&si->lock, si->count - si->self_count + 1); out: vi->fd = f; return 0; err: if (addr != MAP_FAILED) munmap(addr, si->size); close_safe(&f); return -1; } int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); if (vma_entry_is(vma, VMA_AREA_SYSVIPC)) pid = SYSVIPC_SHMEM_PID; si = shmem_find(vma->shmid); if (si) { if (si->size < size) { if (expand_shmem(si, size)) return -1; } update_shmem_pmaps(si, map, vma); return 0; } si = xzalloc(sizeof(*si)); if (!si) return -1; si->pid = pid; si->start = vma->start; si->end = vma->end; si->shmid = vma->shmid; shmem_hash_add(si); if (expand_shmem(si, size)) return -1; update_shmem_pmaps(si, map, vma); return 0; } static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer) { struct page_pipe_buf *ppb; list_for_each_entry(ppb, &pp->bufs, l) if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs, SPLICE_F_GIFT | SPLICE_F_NONBLOCK) != ppb->pages_in * PAGE_SIZE) { pr_perror("Can't get shmem into page-pipe"); return -1; } return page_xfer_dump_pages(xfer, pp); } static int next_data_segment(int fd, unsigned long pfn, unsigned long *next_data_pfn, unsigned long *next_hole_pfn) { off_t off; off = lseek(fd, pfn * PAGE_SIZE, SEEK_DATA); if (off == (off_t) 
-1) { if (errno == ENXIO) { *next_data_pfn = ~0UL; *next_hole_pfn = ~0UL; return 0; } pr_perror("Unable to lseek(SEEK_DATA)"); return -1; } *next_data_pfn = off / PAGE_SIZE; off = lseek(fd, off, SEEK_HOLE); if (off == (off_t) -1) { pr_perror("Unable to lseek(SEEK_HOLE)"); return -1; } *next_hole_pfn = off / PAGE_SIZE; return 0; } static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si) { struct page_pipe *pp; struct page_xfer xfer; int err, ret = -1; unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0; nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE; pp = create_page_pipe((nrpages + 1) / 2, NULL, PP_CHUNK_MODE); if (!pp) goto err; err = open_page_xfer(&xfer, CR_FD_SHMEM_PAGEMAP, si->shmid); if (err) goto err_pp; xfer.offset = (unsigned long)addr; for (pfn = 0; pfn < nrpages; pfn++) { unsigned int pgstate = PST_DIRTY; bool use_mc = true; unsigned long pgaddr; if (pfn >= next_hole_pfn && next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) goto err_xfer; if (si->pstate_map && is_shmem_tracking_en()) { pgstate = get_pstate(si->pstate_map, pfn); use_mc = pgstate == PST_DONT_DUMP; } if (use_mc) { if (pfn < next_data_pnf) pgstate = PST_ZERO; else pgstate = PST_DIRTY; } pgaddr = (unsigned long)addr + pfn * PAGE_SIZE; again: if (pgstate == PST_ZERO) ret = 0; else if (xfer.parent && page_in_parent(pgstate == PST_DIRTY)) ret = page_pipe_add_hole(pp, pgaddr, PP_HOLE_PARENT); else ret = page_pipe_add_page(pp, pgaddr, 0); if (ret == -EAGAIN) { ret = dump_pages(pp, &xfer); if (ret) goto err_xfer; page_pipe_reinit(pp); goto again; } else if (ret) goto err_xfer; } ret = dump_pages(pp, &xfer); err_xfer: xfer.close(&xfer); err_pp: destroy_page_pipe(pp); err: return ret; } static int dump_one_shmem(struct shmem_info *si) { int fd, ret = -1; void *addr; pr_info("Dumping shared memory %ld\n", si->shmid); fd = open_proc(si->pid, "map_files/%lx-%lx", si->start, si->end); if (fd < 0) goto err; addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); 
if (addr == MAP_FAILED) { pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); goto errc; } ret = do_dump_one_shmem(fd, addr, si); munmap(addr, si->size); errc: close(fd); err: return ret; } int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid) { int fd, ret; struct shmem_info *si, det; si = shmem_find(shmid); if (!si) { pr_info("Detached shmem...\n"); det.pid = SYSVIPC_SHMEM_PID; det.shmid = shmid; det.size = round_up(size, PAGE_SIZE); det.pstate_map = NULL; si = &det; } fd = open_proc(PROC_SELF, "map_files/%lx-%lx", (unsigned long)addr, (unsigned long)addr + si->size); if (fd < 0) return -1; ret = do_dump_one_shmem(fd, addr, si); close(fd); return ret; } int cr_dump_shmem(void) { int ret = 0, i; struct shmem_info *si; for_each_shmem(i, si) { if (si->pid == SYSVIPC_SHMEM_PID) continue; ret = dump_one_shmem(si); if (ret) goto out; } out: return ret; } criu-3.6/criu/sigframe.c000066400000000000000000000024571317335042600152430ustar00rootroot00000000000000#include #include #include "log.h" #include "restore.h" #include "images/core.pb-c.h" #ifndef setup_sas static inline void setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas) { if (sas) { #define UC RT_SIGFRAME_UC(sigframe) UC->uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); UC->uc_stack.ss_flags = (int)(sas)->ss_flags; UC->uc_stack.ss_size = (size_t)(sas)->ss_size; #undef UC } } #endif #ifndef RT_SIGFRAME_UC_SIGMASK #define RT_SIGFRAME_UC_SIGMASK(sigframe) \ (k_rtsigset_t*)&RT_SIGFRAME_UC(sigframe)->uc_sigmask #endif int construct_sigframe(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe, k_rtsigset_t *blkset, CoreEntry *core) { k_rtsigset_t *blk_sigset; /* * Copy basic register set in the first place: this will set * rt_sigframe type: native/compat. 
*/ if (restore_gpregs(sigframe, CORE_THREAD_ARCH_INFO(core)->gpregs)) return -1; blk_sigset = RT_SIGFRAME_UC_SIGMASK(sigframe); if (blkset) memcpy(blk_sigset, blkset, sizeof(k_rtsigset_t)); else memset(blk_sigset, 0, sizeof(k_rtsigset_t)); if (restore_fpu(sigframe, core)) return -1; if (RT_SIGFRAME_HAS_FPU(sigframe)) if (sigreturn_prep_fpu_frame(sigframe, rsigframe)) return -1; setup_sas(sigframe, core->thread_core->sas); return 0; } criu-3.6/criu/signalfd.c000066400000000000000000000045441317335042600152340ustar00rootroot00000000000000#include #include #include #include "common/compiler.h" #include "signalfd.h" #include "fdinfo.h" #include "imgset.h" #include "image.h" #include "util.h" #include "log.h" #include "files.h" #include "protobuf.h" #include "images/signalfd.pb-c.h" struct signalfd_info { SignalfdEntry *sfe; struct file_desc d; }; int is_signalfd_link(char *link) { return is_anon_link_type(link, "[signalfd]"); } static int dump_one_signalfd(int lfd, u32 id, const struct fd_parms *p) { SignalfdEntry sfd = SIGNALFD_ENTRY__INIT; FileEntry fe = FILE_ENTRY__INIT; if (parse_fdinfo(lfd, FD_TYPES__SIGNALFD, &sfd)) return -1; sfd.id = id; sfd.flags = p->flags; sfd.fown = (FownEntry *)&p->fown; fe.type = FD_TYPES__SIGNALFD; fe.id = sfd.id; fe.sgfd = &sfd; return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } const struct fdtype_ops signalfd_dump_ops = { .type = FD_TYPES__SIGNALFD, .dump = dump_one_signalfd, }; static void sigset_fill(sigset_t *to, unsigned long long from) { int sig; pr_info("\tCalculating sigmask for %Lx\n", from); sigemptyset(to); for (sig = 1; sig < NSIG; sig++) if (from & (1ULL << (sig - 1))) { pr_debug("\t\tAdd %d signal to mask\n", sig); sigaddset(to, sig); } } static int signalfd_open(struct file_desc *d, int *new_fd) { struct signalfd_info *info; int tmp; sigset_t mask; info = container_of(d, struct signalfd_info, d); pr_info("Restoring signalfd %#x\n", info->sfe->id); sigset_fill(&mask, info->sfe->sigmask); tmp = 
signalfd(-1, &mask, 0); if (tmp < 0) { pr_perror("Can't create signalfd %#08x", info->sfe->id); return -1; } if (rst_file_params(tmp, info->sfe->fown, info->sfe->flags)) { pr_perror("Can't restore params on signalfd %#08x", info->sfe->id); goto err_close; } *new_fd = tmp; return 0; err_close: close(tmp); return -1; } static struct file_desc_ops signalfd_desc_ops = { .type = FD_TYPES__SIGNALFD, .open = signalfd_open, }; static int collect_one_sigfd(void *o, ProtobufCMessage *msg, struct cr_img *i) { struct signalfd_info *info = o; info->sfe = pb_msg(msg, SignalfdEntry); return file_desc_add(&info->d, info->sfe->id, &signalfd_desc_ops); } struct collect_image_info signalfd_cinfo = { .fd_type = CR_FD_SIGNALFD, .pb_type = PB_SIGNALFD, .priv_size = sizeof(struct signalfd_info), .collect = collect_one_sigfd, }; criu-3.6/criu/sk-inet.c000066400000000000000000000447101317335042600150160ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "../soccr/soccr.h" #include "libnetlink.h" #include "cr_options.h" #include "imgset.h" #include "inet_diag.h" #include "files.h" #include "image.h" #include "log.h" #include "rst-malloc.h" #include "sockets.h" #include "sk-inet.h" #include "protobuf.h" #include "util.h" #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 static LIST_HEAD(inet_ports); struct inet_port { int port; int type; struct list_head type_list; atomic_t users; mutex_t reuseaddr_lock; struct list_head list; }; static struct inet_port *port_add(struct inet_sk_info *ii, int port) { int type = ii->ie->type; struct inet_port *e; list_for_each_entry(e, &inet_ports, list) if (e->type == type && e->port == port) { atomic_inc(&e->users); goto out_link; } e = shmalloc(sizeof(*e)); if (e == NULL) { pr_err("Not enough memory\n"); return NULL; } e->port = port; e->type = type; atomic_set(&e->users, 1); mutex_init(&e->reuseaddr_lock); INIT_LIST_HEAD(&e->type_list); list_add(&e->list, 
/*
 * Check that @proto is an IP protocol we know how to dump.
 *
 * @ino is only used to identify the socket in the error message.
 *
 * Returns 1 for IPPROTO_IP, IPPROTO_TCP, IPPROTO_UDP and
 * IPPROTO_UDPLITE; logs an error and returns 0 for anything else.
 */
static int can_dump_ipproto(int ino, int proto)
{
	if (proto == IPPROTO_IP || proto == IPPROTO_TCP)
		return 1;

	if (proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE)
		return 1;

	pr_err("Unsupported proto %d for socket %x\n", proto, ino);
	return 0;
}
Need to pick * those up and fix the connect job respectively */ pr_err("In-flight connection (l) for %x\n", sk->sd.ino); pr_err("In-flight connections can be ignored with the " "--%s option.\n", SK_INFLIGHT_PARAM); return 0; } break; case TCP_ESTABLISHED: case TCP_FIN_WAIT2: case TCP_FIN_WAIT1: case TCP_CLOSE_WAIT: case TCP_LAST_ACK: case TCP_CLOSING: case TCP_SYN_SENT: if (!opts.tcp_established_ok) { pr_err("Connected TCP socket, consider using --%s option.\n", SK_EST_PARAM); return 0; } break; case TCP_CLOSE: /* Trivial case, we just need to create a socket on restore */ break; default: pr_err("Unknown inet socket %x state %d\n", sk->sd.ino, sk->state); return 0; } return 1; } static int dump_sockaddr(union libsoccr_addr *sa, u32 *pb_port, u32 *pb_addr) { if (sa->sa.sa_family == AF_INET) { memcpy(pb_addr, &sa->v4.sin_addr, sizeof(sa->v4.sin_addr)); *pb_port = ntohs(sa->v4.sin_port); return 0; } if (sa->sa.sa_family == AF_INET6) { *pb_port = ntohs(sa->v6.sin6_port); memcpy(pb_addr, &sa->v6.sin6_addr, sizeof(sa->v6.sin6_addr)); return 0; } return -1; } static struct inet_sk_desc *gen_uncon_sk(int lfd, const struct fd_parms *p, int proto) { struct inet_sk_desc *sk; union libsoccr_addr address; socklen_t aux; int ret; sk = xzalloc(sizeof(*sk)); if (!sk) goto err; ret = do_dump_opt(lfd, SOL_SOCKET, SO_DOMAIN, &sk->sd.family, sizeof(sk->sd.family)); ret |= do_dump_opt(lfd, SOL_SOCKET, SO_TYPE, &sk->type, sizeof(sk->type)); if (ret) goto err; if (sk->sd.family == AF_INET) aux = sizeof(struct sockaddr_in); else if (sk->sd.family == AF_INET6) aux = sizeof(struct sockaddr_in6); else { pr_err("Unsupported socket family: %d\n", sk->sd.family); goto err; } ret = getsockopt(lfd, SOL_SOCKET, SO_PEERNAME, &address, &aux); if (ret < 0) { if (errno != ENOTCONN) { pr_perror("Unexpected error returned from unconnected socket"); goto err; } } else if (dump_sockaddr(&address, &sk->dst_port, sk->dst_addr)) goto err; ret = getsockname(lfd, &address.sa, &aux); if (ret < 0) { if (errno != 
ENOTCONN) { pr_perror("Unexpected error returned from unconnected socket"); goto err; } } else if (dump_sockaddr(&address, &sk->src_port, sk->src_addr)) goto err; sk->sd.ino = p->stat.st_ino; if (proto == IPPROTO_TCP) { struct { __u8 tcpi_state; __u8 tcpi_ca_state; __u8 tcpi_retransmits; __u8 tcpi_probes; __u8 tcpi_backoff; __u8 tcpi_options; } info; aux = sizeof(info); ret = getsockopt(lfd, SOL_TCP, TCP_INFO, &info, &aux); if (ret) { pr_perror("Failed to obtain TCP_INFO"); goto err; } if (info.tcpi_state != TCP_CLOSE) { pr_err("Socket state %d obtained but expected %d\n", info.tcpi_state, TCP_CLOSE); goto err; } sk->wqlen = info.tcpi_backoff; } sk->state = TCP_CLOSE; sk_collect_one(sk->sd.ino, sk->sd.family, &sk->sd); return sk; err: xfree(sk); return NULL; } static int dump_ip_opts(int sk, IpOptsEntry *ioe) { int ret = 0; ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); ioe->has_freebind = ioe->freebind; return ret; } /* Stolen from the kernel's __ipv6_addr_type/__ipv6_addr_needs_scopeid; * link local and (multicast + loopback + linklocal) addrs require a * scope id. 
/*
 * Multicast scope values, as carried in the low nibble of the second
 * byte of an IPv6 multicast address (ffXS::/8, S being the scope).
 */
#define IPV6_ADDR_SCOPE_NODELOCAL	0x01
#define IPV6_ADDR_SCOPE_LINKLOCAL	0x02

/*
 * Tell whether binding to the IPv6 address @src_addr requires a scope
 * id, mirroring the kernel's __ipv6_addr_type() /
 * __ipv6_addr_needs_scope_id() pair.
 *
 * @src_addr points to the 16 address bytes in network order, viewed
 * as four 32-bit words; only the first word is inspected.
 *
 * Returns true for
 *  - multicast addresses (ff00::/8) whose scope is interface-local or
 *    link-local, and
 *  - link-local unicast addresses (fe80::/10);
 * false for everything else.
 */
static bool needs_scope_id(uint32_t *src_addr)
{
	/* Multicast: the whole first byte is 0xff */
	if ((src_addr[0] & htonl(0xFF000000)) == htonl(0xFF000000)) {
		/* The scope lives in the low nibble of the second byte */
		uint32_t scope = src_addr[0] & htonl(0x000F0000);

		if (scope == htonl(IPV6_ADDR_SCOPE_NODELOCAL << 16) ||
		    scope == htonl(IPV6_ADDR_SCOPE_LINKLOCAL << 16))
			return true;
	}

	/* Link-local unicast: top ten bits are 1111 1110 10 */
	if ((src_addr[0] & htonl(0xFFC00000)) == htonl(0xFE800000))
		return true;

	return false;
}
*/ if (sk->src_port && needs_scope_id(sk->src_addr)) { if (getsockopt(lfd, SOL_SOCKET, SO_BINDTODEVICE, device, &len) < 0) { pr_perror("can't get ifname"); goto err; } if (len > 0) { ie.ifname = xstrdup(device); if (!ie.ifname) goto err; } else { pr_err("couldn't find ifname for %d, can't bind\n", id); goto err; } } } ie.src_addr = xmalloc(pb_repeated_size(&ie, src_addr)); ie.dst_addr = xmalloc(pb_repeated_size(&ie, dst_addr)); if (!ie.src_addr || !ie.dst_addr) goto err; memcpy(ie.src_addr, sk->src_addr, pb_repeated_size(&ie, src_addr)); memcpy(ie.dst_addr, sk->dst_addr, pb_repeated_size(&ie, dst_addr)); if (dump_ip_opts(lfd, &ipopts)) goto err; if (dump_socket_opts(lfd, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); show_one_inet("Dumping", sk); show_one_inet_img("Dumped", &ie); sk->sd.already_dumped = 1; sk->cpt_reuseaddr = skopts.reuseaddr; switch (proto) { case IPPROTO_TCP: err = dump_one_tcp(lfd, sk); break; case IPPROTO_UDP: case IPPROTO_UDPLITE: sk_encode_shutdown(&ie, sk->shutdown); /* Fallthrough! 
*/ default: err = 0; break; } ie.state = sk->state; fe.type = FD_TYPES__INETSK; fe.id = ie.id; fe.isk = &ie; if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) goto err; err: release_skopts(&skopts); xfree(ie.src_addr); xfree(ie.dst_addr); return err; } static int dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p) { return do_dump_one_inet_fd(lfd, id, p, PF_INET); } const struct fdtype_ops inet_dump_ops = { .type = FD_TYPES__INETSK, .dump = dump_one_inet_fd, }; static int dump_one_inet6_fd(int lfd, u32 id, const struct fd_parms *p) { return do_dump_one_inet_fd(lfd, id, p, PF_INET6); } const struct fdtype_ops inet6_dump_ops = { .type = FD_TYPES__INETSK, .dump = dump_one_inet6_fd, }; int inet_collect_one(struct nlmsghdr *h, int family, int type) { struct inet_sk_desc *d; struct inet_diag_msg *m = NLMSG_DATA(h); struct nlattr *tb[INET_DIAG_MAX+1]; int ret; nlmsg_parse(h, sizeof(struct inet_diag_msg), tb, INET_DIAG_MAX, NULL); d = xzalloc(sizeof(*d)); if (!d) return -1; d->type = type; d->src_port = ntohs(m->id.idiag_sport); d->dst_port = ntohs(m->id.idiag_dport); d->state = m->idiag_state; d->rqlen = m->idiag_rqueue; d->wqlen = m->idiag_wqueue; memcpy(d->src_addr, m->id.idiag_src, sizeof(u32) * 4); memcpy(d->dst_addr, m->id.idiag_dst, sizeof(u32) * 4); if (tb[INET_DIAG_SHUTDOWN]) d->shutdown = nla_get_u8(tb[INET_DIAG_SHUTDOWN]); else pr_err_once("Can't check shutdown state of inet socket\n"); ret = sk_collect_one(m->idiag_inode, family, &d->sd); show_one_inet("Collected", d); return ret; } static int open_inet_sk(struct file_desc *d, int *new_fd); static int post_open_inet_sk(struct file_desc *d, int sk); static struct file_desc_ops inet_desc_ops = { .type = FD_TYPES__INETSK, .open = open_inet_sk, }; static inline int tcp_connection(InetSkEntry *ie) { return (ie->proto == IPPROTO_TCP && ie->dst_port); } static int collect_one_inetsk(void *o, ProtobufCMessage *base, struct cr_img *i) { struct inet_sk_info *ii = o; ii->ie = pb_msg(base, 
InetSkEntry); if (tcp_connection(ii->ie)) tcp_locked_conn_add(ii); /* * A socket can reuse addr only if all previous sockets allow that, * so a value of SO_REUSEADDR can be restored after restoring all * sockets. */ ii->port = port_add(ii, ii->ie->src_port); if (ii->port == NULL) return -1; return file_desc_add(&ii->d, ii->ie->id, &inet_desc_ops); } struct collect_image_info inet_sk_cinfo = { .fd_type = CR_FD_INETSK, .pb_type = PB_INET_SK, .priv_size = sizeof(struct inet_sk_info), .collect = collect_one_inetsk, }; static int inet_validate_address(InetSkEntry *ie) { if ((ie->family == AF_INET) && /* v0.1 had 4 in ipv4 addr len */ (ie->n_src_addr >= PB_ALEN_INET) && (ie->n_dst_addr >= PB_ALEN_INET)) return 0; if ((ie->family == AF_INET6) && (ie->n_src_addr == PB_ALEN_INET6) && (ie->n_dst_addr == PB_ALEN_INET6)) return 0; pr_err("Addr len mismatch f %d ss %zu ds %zu\n", ie->family, pb_repeated_size(ie, src_addr), pb_repeated_size(ie, dst_addr)); return -1; } static void dec_users_and_wake(struct inet_port *port) { struct fdinfo_list_entry *fle; struct inet_sk_info *ii; if (atomic_dec_return(&port->users)) return; list_for_each_entry(ii, &port->type_list, port_list) { fle = file_master(&ii->d); set_fds_event(fle->pid); } } static int post_open_inet_sk(struct file_desc *d, int sk) { struct inet_sk_info *ii; int val; ii = container_of(d, struct inet_sk_info, d); /* * TCP sockets are handled at the last moment * after unlocking connections. 
*/ if (tcp_connection(ii->ie)) { pr_debug("Schedule %d socket for repair off\n", sk); BUG_ON(ii->sk_fd != -1); ii->sk_fd = sk; return 0; } /* SO_REUSEADDR is set for all sockets */ if (ii->ie->opts->reuseaddr) return 0; if (atomic_read(&ii->port->users)) return 1; val = ii->ie->opts->reuseaddr; if (restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val)) return -1; return 0; } int restore_ip_opts(int sk, IpOptsEntry *ioe) { int ret = 0; if (ioe->has_freebind) ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); return ret; } static int open_inet_sk(struct file_desc *d, int *new_fd) { struct fdinfo_list_entry *fle = file_master(d); struct inet_sk_info *ii; InetSkEntry *ie; int sk, yes = 1; if (fle->stage >= FLE_OPEN) return post_open_inet_sk(d, fle->fe->fd); ii = container_of(d, struct inet_sk_info, d); ie = ii->ie; show_one_inet_img("Restore", ie); if (ie->family != AF_INET && ie->family != AF_INET6) { pr_err("Unsupported socket family: %d\n", ie->family); return -1; } if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM)) { pr_err("Unsupported socket type: %d\n", ie->type); return -1; } if (inet_validate_address(ie)) return -1; sk = socket(ie->family, ie->type, ie->proto); if (sk < 0) { pr_perror("Can't create inet socket"); return -1; } if (ie->v6only) { if (restore_opt(sk, SOL_IPV6, IPV6_V6ONLY, &yes) == -1) goto err; } /* * Set SO_REUSEADDR, because some sockets can be bound to one addr. * The origin value of SO_REUSEADDR will be restored in post_open. 
*/ if (restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &yes)) goto err; if (tcp_connection(ie)) { if (!opts.tcp_established_ok && !opts.tcp_close) { pr_err("Connected TCP socket in image\n"); goto err; } mutex_lock(&ii->port->reuseaddr_lock); if (restore_one_tcp(sk, ii)) { mutex_unlock(&ii->port->reuseaddr_lock); goto err; } mutex_unlock(&ii->port->reuseaddr_lock); goto done; } if (ie->src_port) { if (inet_bind(sk, ii)) goto err; } /* * Listen sockets are easiest ones -- simply * bind() and listen(), and that's all. */ if (ie->state == TCP_LISTEN) { if (ie->proto != IPPROTO_TCP) { pr_err("Wrong socket in listen state %d\n", ie->proto); goto err; } mutex_lock(&ii->port->reuseaddr_lock); if (listen(sk, ie->backlog) == -1) { pr_perror("Can't listen on a socket"); mutex_unlock(&ii->port->reuseaddr_lock); goto err; } mutex_unlock(&ii->port->reuseaddr_lock); } if (ie->dst_port && inet_connect(sk, ii)) goto err; done: dec_users_and_wake(ii->port); if (rst_file_params(sk, ie->fown, ie->flags)) goto err; if (ie->ip_opts && restore_ip_opts(sk, ie->ip_opts)) goto err; if (restore_socket_opts(sk, ie->opts)) goto err; if (ie->has_shutdown && (ie->proto == IPPROTO_UDP || ie->proto == IPPROTO_UDPLITE)) { if (shutdown(sk, sk_decode_shutdown(ie->shutdown))) { pr_perror("Can't shutdown socket into %d", sk_decode_shutdown(ie->shutdown)); goto err; } } *new_fd = sk; return 1; err: close(sk); return -1; } int restore_sockaddr(union libsoccr_addr *sa, int family, u32 pb_port, u32 *pb_addr, u32 ifindex) { BUILD_BUG_ON(sizeof(sa->v4.sin_addr.s_addr) > PB_ALEN_INET * sizeof(u32)); BUILD_BUG_ON(sizeof(sa->v6.sin6_addr.s6_addr) > PB_ALEN_INET6 * sizeof(u32)); memzero(sa, sizeof(*sa)); if (family == AF_INET) { sa->v4.sin_family = AF_INET; sa->v4.sin_port = htons(pb_port); memcpy(&sa->v4.sin_addr.s_addr, pb_addr, sizeof(sa->v4.sin_addr.s_addr)); return sizeof(sa->v4); } if (family == AF_INET6) { sa->v6.sin6_family = AF_INET6; sa->v6.sin6_port = htons(pb_port); memcpy(sa->v6.sin6_addr.s6_addr, 
pb_addr, sizeof(sa->v6.sin6_addr.s6_addr)); /* Here although the struct member is called scope_id, the * kernel really wants ifindex. See * /net/ipv6/af_inet6.c:inet6_bind for details. */ sa->v6.sin6_scope_id = ifindex; return sizeof(sa->v6); } BUG(); return -1; } int inet_bind(int sk, struct inet_sk_info *ii) { bool rst_freebind = false; union libsoccr_addr addr; int addr_size, ifindex = 0; if (ii->ie->ifname) { ifindex = if_nametoindex(ii->ie->ifname); if (!ifindex) { pr_err("couldn't find ifindex for %s\n", ii->ie->ifname); return -1; } } addr_size = restore_sockaddr(&addr, ii->ie->family, ii->ie->src_port, ii->ie->src_addr, ifindex); /* * ipv6 addresses go through a “tentative” phase and * sockets could not be bound to them in this moment * without setting IP_FREEBIND. */ if (ii->ie->family == AF_INET6) { int yes = 1; if (restore_opt(sk, SOL_IP, IP_FREEBIND, &yes)) return -1; if (ii->ie->ip_opts && ii->ie->ip_opts->freebind) /* * The right value is already set, so * don't need to restore it in restore_ip_opts() */ ii->ie->ip_opts->has_freebind = false; else rst_freebind = true; } if (bind(sk, (struct sockaddr *)&addr, addr_size) == -1) { pr_perror("Can't bind inet socket (id %d)", ii->ie->id); return -1; } if (rst_freebind) { int no = 0; /* * The "no" value is default, so it will not be * restore in restore_ip_opts() */ if (restore_opt(sk, SOL_IP, IP_FREEBIND, &no)) return -1; } return 0; } int inet_connect(int sk, struct inet_sk_info *ii) { union libsoccr_addr addr; int addr_size; addr_size = restore_sockaddr(&addr, ii->ie->family, ii->ie->dst_port, ii->ie->dst_addr, 0); if (connect(sk, (struct sockaddr *)&addr, addr_size) == -1) { pr_perror("Can't connect inet socket back"); return -1; } return 0; } criu-3.6/criu/sk-netlink.c000066400000000000000000000130001317335042600155070ustar00rootroot00000000000000#include #include #include #include #include "imgset.h" #include "files.h" #include "sockets.h" #include "util.h" #include "protobuf.h" #include 
"images/sk-netlink.pb-c.h" #include "netlink_diag.h" #include "libnetlink.h" struct netlink_sk_desc { struct socket_desc sd; u32 portid; u32 *groups; u32 gsize; u32 dst_portid; u32 dst_group; u8 state; u8 protocol; }; int netlink_receive_one(struct nlmsghdr *hdr, void *arg) { struct nlattr *tb[NETLINK_DIAG_MAX+1]; struct netlink_diag_msg *m; struct netlink_sk_desc *sd; unsigned long *groups; m = NLMSG_DATA(hdr); pr_debug("Collect netlink sock 0x%x\n", m->ndiag_ino); sd = xmalloc(sizeof(*sd)); if (!sd) return -1; sd->protocol = m->ndiag_protocol; sd->portid = m->ndiag_portid; sd->dst_portid = m->ndiag_dst_portid; sd->dst_group = m->ndiag_dst_group; sd->state = m->ndiag_state; nlmsg_parse(hdr, sizeof(struct netlink_diag_msg), tb, NETLINK_DIAG_MAX, NULL); if (tb[NETLINK_DIAG_GROUPS]) { sd->gsize = nla_len(tb[NETLINK_DIAG_GROUPS]); groups = nla_data(tb[NETLINK_DIAG_GROUPS]); sd->groups = xmalloc(sd->gsize); if (!sd->groups) { xfree(sd); return -1; } memcpy(sd->groups, groups, sd->gsize); } else { sd->groups = NULL; sd->gsize = 0; } return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd); } static bool can_dump_netlink_sk(int lfd) { int ret; ret = fd_has_data(lfd); if (ret == 1) pr_err("The socket has data to read\n"); return ret == 0; } static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) { struct netlink_sk_desc *sk; FileEntry fe = FILE_ENTRY__INIT; NetlinkSkEntry ne = NETLINK_SK_ENTRY__INIT; SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; sk = (struct netlink_sk_desc *)lookup_socket(p->stat.st_ino, PF_NETLINK, 0); if (IS_ERR(sk)) goto err; ne.id = id; ne.ino = p->stat.st_ino; if (!can_dump_netlink_sk(lfd)) goto err; if (sk) { BUG_ON(sk->sd.already_dumped); ne.protocol = sk->protocol; ne.portid = sk->portid; ne.groups = sk->groups; ne.n_groups = sk->gsize / sizeof(ne.groups[0]); /* * On 64-bit sk->gsize is multiple to 8 bytes (sizeof(long)), * so remove the last 4 bytes if they are empty. 
*/ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ /* * Big endian swap: Ugly hack for zdtm/static/sk-netlink * * For big endian systems: * * - sk->groups[0] are bits 32-64 * - sk->groups[1] are bits 0-32 */ if (ne.n_groups == 2) { uint32_t tmp = sk->groups[1]; sk->groups[1] = sk->groups[0]; sk->groups[0] = tmp; } #endif if (ne.n_groups && sk->groups[ne.n_groups - 1] == 0) ne.n_groups -= 1; if (ne.n_groups > 1) { pr_err("%d %x\n", sk->gsize, sk->groups[1]); pr_err("The netlink socket 0x%x has more than 32 groups\n", ne.ino); return -1; } if (sk->groups && !sk->portid) { pr_err("The netlink socket 0x%x is bound to groups but not to portid\n", ne.ino); return -1; } ne.state = sk->state; ne.dst_portid = sk->dst_portid; ne.dst_group = sk->dst_group; } else { /* unconnected and unbound socket */ int val; socklen_t aux = sizeof(val); if (getsockopt(lfd, SOL_SOCKET, SO_PROTOCOL, &val, &aux) < 0) { pr_perror("Unable to get protocol for netlink socket"); goto err; } ne.protocol = val; } ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; if (dump_socket_opts(lfd, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; fe.id = ne.id; fe.nlsk = ≠ if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) goto err; return 0; err: return -1; } const struct fdtype_ops netlink_dump_ops = { .type = FD_TYPES__NETLINKSK, .dump = dump_one_netlink_fd, }; struct netlink_sock_info { NetlinkSkEntry *nse; struct file_desc d; }; static int open_netlink_sk(struct file_desc *d, int *new_fd) { struct netlink_sock_info *nsi; NetlinkSkEntry *nse; struct sockaddr_nl addr; int sk = -1; nsi = container_of(d, struct netlink_sock_info, d); nse = nsi->nse; pr_info("Opening netlink socket id %#x\n", nse->id); sk = socket(PF_NETLINK, SOCK_RAW, nse->protocol); if (sk < 0) { pr_perror("Can't create netlink sock"); goto err; } if (nse->portid) { memset(&addr, 0, sizeof(addr)); addr.nl_family = AF_NETLINK; if (nse->n_groups > 1) { pr_err("Groups above 32 are not supported yet\n"); goto err; } if 
(nse->n_groups) addr.nl_groups = nse->groups[0]; addr.nl_pid = nse->portid; if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) { pr_perror("Can't bind netlink socket"); goto err; } } if (nse->state == NETLINK_CONNECTED) { addr.nl_family = AF_NETLINK; addr.nl_groups = 1 << (nse->dst_group - 1); addr.nl_pid = nse->dst_portid; if (connect(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) { pr_perror("Can't connect netlink socket"); goto err; } } if (rst_file_params(sk, nse->fown, nse->flags)) goto err; if (restore_socket_opts(sk, nse->opts)) goto err; *new_fd = sk; return 0; err: close(sk); return -1; } static struct file_desc_ops netlink_sock_desc_ops = { .type = FD_TYPES__NETLINKSK, .open = open_netlink_sk, }; static int collect_one_netlink_sk(void *o, ProtobufCMessage *base, struct cr_img *i) { struct netlink_sock_info *si = o; si->nse = pb_msg(base, NetlinkSkEntry); return file_desc_add(&si->d, si->nse->id, &netlink_sock_desc_ops); } struct collect_image_info netlink_sk_cinfo = { .fd_type = CR_FD_NETLINK_SK, .pb_type = PB_NETLINK_SK, .priv_size = sizeof(struct netlink_sock_info), .collect = collect_one_netlink_sk, }; criu-3.6/criu/sk-packet.c000066400000000000000000000304241317335042600153230ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "imgset.h" #include "files.h" #include "sockets.h" #include "libnetlink.h" #include "sk-packet.h" #include "packet_diag.h" #include "vma.h" #include #include "protobuf.h" #include "xmalloc.h" #include "images/packet-sock.pb-c.h" #include "images/fdinfo.pb-c.h" struct packet_sock_info { PacketSockEntry *pse; struct file_desc d; }; struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; struct packet_sock_desc { struct socket_desc sd; unsigned int file_id; unsigned int type; unsigned short proto; struct packet_diag_info nli; int mreq_n; struct packet_diag_mclist *mreqs; unsigned 
int fanout; struct packet_diag_ring *rx, *tx; }; #define NO_FANOUT ((unsigned int)-1) static int dump_mreqs(PacketSockEntry *psk, struct packet_sock_desc *sd) { int i; if (!sd->mreq_n) return 0; pr_debug("\tdumping %d mreqs\n", sd->mreq_n); psk->mclist = xmalloc(sd->mreq_n * sizeof(psk->mclist[0])); if (!psk->mclist) return -1; for (i = 0; i < sd->mreq_n; i++) { struct packet_diag_mclist *m = &sd->mreqs[i]; PacketMclist *im; if (m->pdmc_count != 1) { pr_err("Multiple MC membership not supported (but can be)\n"); goto err; } pr_debug("\tmr%d: idx %d type %d\n", i, m->pdmc_index, m->pdmc_type); im = xmalloc(sizeof(*im)); if (!im) goto err; packet_mclist__init(im); psk->mclist[i] = im; psk->n_mclist++; im->index = m->pdmc_index; im->type = m->pdmc_type; switch (m->pdmc_type) { case PACKET_MR_MULTICAST: case PACKET_MR_UNICAST: im->addr.len = m->pdmc_alen; im->addr.data = xmalloc(m->pdmc_alen); if (!im->addr.data) goto err; memcpy(im->addr.data, m->pdmc_addr, m->pdmc_alen); break; case PACKET_MR_PROMISC: case PACKET_MR_ALLMULTI: break; default: pr_err("Unknown mc membership type %d\n", m->pdmc_type); goto err; } } return 0; err: return -1; } static PacketRing *dump_ring(struct packet_diag_ring *dr) { PacketRing *ring; ring = xmalloc(sizeof(*ring)); if (!ring) return NULL; packet_ring__init(ring); ring->block_size = dr->pdr_block_size; ring->block_nr = dr->pdr_block_nr; ring->frame_size = dr->pdr_frame_size; ring->frame_nr = dr->pdr_frame_nr; ring->retire_tmo = dr->pdr_retire_tmo; ring->sizeof_priv = dr->pdr_sizeof_priv; ring->features = dr->pdr_features; return ring; } static int dump_rings(PacketSockEntry *psk, struct packet_sock_desc *sd) { if (sd->rx) { psk->rx_ring = dump_ring(sd->rx); if (!psk->rx_ring) return -1; } if (sd->tx) { psk->tx_ring = dump_ring(sd->tx); if (!psk->tx_ring) return -1; } return 0; } static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) { FileEntry fe = FILE_ENTRY__INIT; PacketSockEntry psk = PACKET_SOCK_ENTRY__INIT; 
SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; struct packet_sock_desc *sd; int i, ret; sd = (struct packet_sock_desc *)lookup_socket(p->stat.st_ino, PF_PACKET, 0); if (IS_ERR_OR_NULL(sd)) { pr_err("Can't find packet socket %"PRIu64"\n", p->stat.st_ino); return -1; } pr_info("Dumping packet socket fd %d id %#x\n", lfd, id); BUG_ON(sd->sd.already_dumped); sd->sd.already_dumped = 1; psk.id = sd->file_id = id; psk.type = sd->type; psk.flags = p->flags; psk.fown = (FownEntry *)&p->fown; psk.opts = &skopts; if (dump_socket_opts(lfd, &skopts)) return -1; psk.protocol = sd->proto; psk.ifindex = sd->nli.pdi_index; psk.version = sd->nli.pdi_version; psk.reserve = sd->nli.pdi_reserve; psk.timestamp = sd->nli.pdi_tstamp; psk.copy_thresh = sd->nli.pdi_copy_thresh; psk.aux_data = (sd->nli.pdi_flags & PDI_AUXDATA ? true : false); psk.orig_dev = (sd->nli.pdi_flags & PDI_ORIGDEV ? true : false); psk.vnet_hdr = (sd->nli.pdi_flags & PDI_VNETHDR ? true : false); psk.loss = (sd->nli.pdi_flags & PDI_LOSS ? true : false); ret = dump_mreqs(&psk, sd); if (ret) goto out; if (sd->fanout != NO_FANOUT) { psk.has_fanout = true; psk.fanout = sd->fanout; } ret = dump_rings(&psk, sd); if (ret) goto out; fe.type = FD_TYPES__PACKETSK; fe.id = psk.id; fe.psk = &psk; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); out: release_skopts(&skopts); xfree(psk.rx_ring); xfree(psk.tx_ring); for (i = 0; i < psk.n_mclist; i++) xfree(psk.mclist[i]->addr.data); xfree(psk.mclist); return ret; } const struct fdtype_ops packet_dump_ops = { .type = FD_TYPES__PACKETSK, .dump = dump_one_packet_fd, }; int dump_socket_map(struct vma_area *vma) { struct packet_sock_desc *sd; sd = (struct packet_sock_desc *)lookup_socket(vma->vm_socket_id, PF_PACKET, 0); if (IS_ERR_OR_NULL(sd)) { pr_err("Can't find packet socket %u to mmap\n", vma->vm_socket_id); return -1; } if (!sd->file_id) { pr_err("Mmap-ed socket %u not open\n", vma->vm_socket_id); return -1; } pr_info("Dumping socket map %x -> %"PRIx64"\n", 
sd->file_id, vma->e->start); vma->e->shmid = sd->file_id; return 0; } static int packet_save_mreqs(struct packet_sock_desc *sd, struct nlattr *mc) { sd->mreq_n = nla_len(mc) / sizeof(struct packet_diag_mclist); pr_debug("\tGot %d mreqs\n", sd->mreq_n); sd->mreqs = xmalloc(nla_len(mc)); if (!sd->mreqs) return -1; memcpy(sd->mreqs, nla_data(mc), nla_len(mc)); return 0; } int packet_receive_one(struct nlmsghdr *hdr, void *arg) { struct packet_diag_msg *m; struct nlattr *tb[PACKET_DIAG_MAX + 1]; struct packet_sock_desc *sd; m = NLMSG_DATA(hdr); nlmsg_parse(hdr, sizeof(struct packet_diag_msg), tb, PACKET_DIAG_MAX, NULL); pr_info("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num); if (!tb[PACKET_DIAG_INFO]) { pr_err("No packet sock info in nlm\n"); return -1; } if (!tb[PACKET_DIAG_MCLIST]) { pr_err("No packet sock mclist in nlm\n"); return -1; } sd = xmalloc(sizeof(*sd)); if (!sd) return -1; sd->file_id = 0; sd->type = m->pdiag_type; sd->proto = htons(m->pdiag_num); sd->rx = NULL; sd->tx = NULL; memcpy(&sd->nli, nla_data(tb[PACKET_DIAG_INFO]), sizeof(sd->nli)); if (packet_save_mreqs(sd, tb[PACKET_DIAG_MCLIST])) goto err; if (tb[PACKET_DIAG_FANOUT]) sd->fanout = *(__u32 *)RTA_DATA(tb[PACKET_DIAG_FANOUT]); else sd->fanout = NO_FANOUT; if (tb[PACKET_DIAG_RX_RING]) { sd->rx = xmalloc(sizeof(*sd->rx)); if (sd->rx == NULL) goto err; memcpy(sd->rx, RTA_DATA(tb[PACKET_DIAG_RX_RING]), sizeof(*sd->rx)); } if (tb[PACKET_DIAG_TX_RING]) { sd->tx = xmalloc(sizeof(*sd->tx)); if (sd->tx == NULL) goto err; memcpy(sd->tx, RTA_DATA(tb[PACKET_DIAG_TX_RING]), sizeof(*sd->tx)); } return sk_collect_one(m->pdiag_ino, PF_PACKET, &sd->sd); err: xfree(sd->tx); xfree(sd->rx); xfree(sd); return -1; } static int open_socket_map(int pid, struct vma_area *vm) { VmaEntry *vma = vm->e; struct file_desc *fd; struct fdinfo_list_entry *le; pr_info("Getting packet socket fd for %d:%x\n", pid, (int)vma->shmid); fd = find_file_desc_raw(FD_TYPES__PACKETSK, vma->shmid); if (!fd) { 
pr_err("No packet socket %x\n", (int)vma->shmid); return -1; } list_for_each_entry(le, &fd->fd_info_head, desc_list) if (le->pid == pid) { int fd; /* * Restorer will close the mmap-ed fd */ fd = dup(le->fe->fd); if (fd < 0) { pr_perror("Can't dup packet sk"); return -1; } vma->fd = fd; return 0; } pr_err("No open packet socket %x by %d\n", (int)vma->shmid, pid); return -1; } int collect_socket_map(struct vma_area *vma) { vma->vm_open = open_socket_map; return 0; } static int restore_mreqs(int sk, PacketSockEntry *pse) { int i; for (i = 0; i < pse->n_mclist; i++) { PacketMclist *ml; struct packet_mreq_max mreq; ml = pse->mclist[i]; pr_info("Restoring mreq type %d\n", ml->type); if (ml->addr.len > sizeof(mreq.mr_address)) { pr_err("To big mcaddr %zu\n", ml->addr.len); return -1; } mreq.mr_ifindex = ml->index; mreq.mr_type = ml->type; mreq.mr_alen = ml->addr.len; memcpy(mreq.mr_address, ml->addr.data, ml->addr.len); if (restore_opt(sk, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq)) return -1; } return 0; } static int restore_ring(int sk, int type, PacketRing *ring) { struct tpacket_req3 req; if (!ring) return 0; pr_debug("\tRestoring %d ring\n", type); req.tp_block_size = ring->block_size; req.tp_block_nr = ring->block_nr; req.tp_frame_size = ring->frame_size; req.tp_frame_nr = ring->frame_nr; req.tp_retire_blk_tov = ring->retire_tmo; req.tp_sizeof_priv = ring->sizeof_priv; req.tp_feature_req_word = ring->features; return restore_opt(sk, SOL_PACKET, type, &req); } static int restore_rings(int sk, PacketSockEntry *psk) { if (restore_ring(sk, PACKET_RX_RING, psk->rx_ring)) return -1; if (restore_ring(sk, PACKET_TX_RING, psk->tx_ring)) return -1; return 0; } static int open_packet_sk_spkt(PacketSockEntry *pse, int *new_fd) { struct sockaddr addr_spkt; int sk; sk = socket(PF_PACKET, pse->type, pse->protocol); if (sk < 0) { pr_perror("Can't create packet socket"); return -1; } memset(&addr_spkt, 0, sizeof(addr_spkt)); addr_spkt.sa_family = AF_PACKET; // if the socket was bound 
to any device if (pse->ifindex > 0) { const size_t sa_data_size = sizeof(addr_spkt.sa_data); struct ifreq req; memset(&req, 0, sizeof(req)); req.ifr_ifindex = pse->ifindex; if (ioctl(sk, SIOCGIFNAME, &req) < 0) { pr_perror("Can't get interface name (ifindex %d)", pse->ifindex); goto err; } strncpy(addr_spkt.sa_data, req.ifr_name, sa_data_size); addr_spkt.sa_data[sa_data_size - 1] = 0; if (bind(sk, &addr_spkt, sizeof(addr_spkt)) < 0) { pr_perror("Can't bind packet socket to %s", req.ifr_name); goto err; } } if (rst_file_params(sk, pse->fown, pse->flags)) goto err; if (restore_socket_opts(sk, pse->opts)) goto err; *new_fd = sk; return 0; err: close(sk); return -1; } static int open_packet_sk(struct file_desc *d, int *new_fd) { struct packet_sock_info *psi; PacketSockEntry *pse; struct sockaddr_ll addr; int sk, yes; psi = container_of(d, struct packet_sock_info, d); pse = psi->pse; pr_info("Opening packet socket id %#x\n", pse->id); if (pse->type == SOCK_PACKET) return open_packet_sk_spkt(pse, new_fd); sk = socket(PF_PACKET, pse->type, pse->protocol); if (sk < 0) { pr_perror("Can't create packet sock"); goto err; } memset(&addr, 0, sizeof(addr)); addr.sll_family = AF_PACKET; addr.sll_ifindex = pse->ifindex; if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) { pr_perror("Can't bind packet socket"); goto err_cl; } if (restore_opt(sk, SOL_PACKET, PACKET_VERSION, &pse->version)) goto err_cl; if (restore_opt(sk, SOL_PACKET, PACKET_RESERVE, &pse->reserve)) goto err_cl; if (restore_opt(sk, SOL_PACKET, PACKET_TIMESTAMP, &pse->timestamp)) goto err_cl; if (restore_opt(sk, SOL_PACKET, PACKET_COPY_THRESH, &pse->copy_thresh)) goto err_cl; if (pse->aux_data) { yes = 1; if (restore_opt(sk, SOL_PACKET, PACKET_AUXDATA, &yes)) goto err_cl; } if (pse->orig_dev) { yes = 1; if (restore_opt(sk, SOL_PACKET, PACKET_ORIGDEV, &yes)) goto err_cl; } if (pse->vnet_hdr) { yes = 1; if (restore_opt(sk, SOL_PACKET, PACKET_VNET_HDR, &yes)) goto err_cl; } if (pse->loss) { yes = 1; if 
(restore_opt(sk, SOL_PACKET, PACKET_LOSS, &yes)) goto err_cl; } if (restore_mreqs(sk, pse)) goto err_cl; if (restore_rings(sk, pse)) goto err_cl; if (pse->has_fanout) { pr_info("Restoring fanout %x\n", pse->fanout); if (restore_opt(sk, SOL_PACKET, PACKET_FANOUT, &pse->fanout)) goto err_cl; } if (rst_file_params(sk, pse->fown, pse->flags)) goto err_cl; if (restore_socket_opts(sk, pse->opts)) goto err_cl; *new_fd = sk; return 0; err_cl: close(sk); err: return -1; } static struct file_desc_ops packet_sock_desc_ops = { .type = FD_TYPES__PACKETSK, .open = open_packet_sk, }; static int collect_one_packet_sk(void *o, ProtobufCMessage *base, struct cr_img *i) { struct packet_sock_info *si = o; si->pse = pb_msg(base, PacketSockEntry); return file_desc_add(&si->d, si->pse->id, &packet_sock_desc_ops); } struct collect_image_info packet_sk_cinfo = { .fd_type = CR_FD_PACKETSK, .pb_type = PB_PACKET_SOCK, .priv_size = sizeof(struct packet_sock_info), .collect = collect_one_packet_sk, }; criu-3.6/criu/sk-queue.c000066400000000000000000000201451317335042600151770ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "common/list.h" #include "imgset.h" #include "image.h" #include "servicefd.h" #include "cr_options.h" #include "util.h" #include "util-pie.h" #include "sockets.h" #include "xmalloc.h" #include "sk-queue.h" #include "files.h" #include "protobuf.h" #include "images/sk-packet.pb-c.h" struct sk_packet { struct list_head list; SkPacketEntry *entry; char *data; unsigned scm_len; int *scm; }; static LIST_HEAD(packets_list); static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *img) { struct sk_packet *pkt = obj; pkt->entry = pb_msg(msg, SkPacketEntry); pkt->scm = NULL; pkt->data = xmalloc(pkt->entry->length); if (pkt->data ==NULL) return -1; /* * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and * only 1 of that kind is possible, thus not more than 1 SCMs * on a packet. 
*/ if (pkt->entry->n_scm > 1) { pr_err("More than 1 SCM is not possible\n"); return -1; } /* * NOTE: packet must be added to the tail. Otherwise sequence * will be broken. */ list_add_tail(&pkt->list, &packets_list); if (read_img_buf(img, pkt->data, pkt->entry->length) != 1) { xfree(pkt->data); pr_perror("Unable to read packet data"); return -1; } return 0; } struct collect_image_info sk_queues_cinfo = { .fd_type = CR_FD_SK_QUEUES, .pb_type = PB_SK_QUEUES, .priv_size = sizeof(struct sk_packet), .collect = collect_one_packet, }; static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe) { int nr_fds, *fds, i; void *buf; ScmEntry *scme; nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int); fds = (int *)CMSG_DATA(ch); buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t)); if (!buf) return -1; scme = xptr_pull(&buf, ScmEntry); scm_entry__init(scme); scme->type = SCM_RIGHTS; scme->n_rights = nr_fds; scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t)); for (i = 0; i < nr_fds; i++) { int ftyp; if (dump_my_file(fds[i], &scme->rights[i], &ftyp)) return -1; /* * Unix sent over Unix or Epoll with some other sh*t * sent over unix (maybe with this very unix polled) * are tricky and not supported for now. (XXX -- todo) */ if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) { pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp); return -1; } } i = pe->n_scm++; if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*))) return -1; pe->scm[i] = scme; return 0; } /* * Maximum size of the control messages. XXX -- is there any * way to get this value out of the kernel? * */ #define CMSG_MAX_SIZE 1024 static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe) { struct cmsghdr *ch; int n_rights = 0; for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) { if (ch->cmsg_type == SCM_RIGHTS) { if (n_rights) { /* * Even if user is sending more than one cmsg with * rights, kernel merges them alltogether on recv. 
*/ pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n"); return -1; } if (dump_scm_rights(ch, pe)) return -1; n_rights++; continue; } pr_err("Control messages in queue, not supported\n"); return -1; } return 0; } static void release_cmsg(SkPacketEntry *pe) { int i; for (i = 0; i < pe->n_scm; i++) xfree(pe->scm[i]); xfree(pe->scm); pe->n_scm = 0; pe->scm = NULL; } int dump_sk_queue(int sock_fd, int sock_id) { SkPacketEntry pe = SK_PACKET_ENTRY__INIT; int ret, size, orig_peek_off; void *data; socklen_t tmp; /* * Save original peek offset. */ tmp = sizeof(orig_peek_off); orig_peek_off = 0; ret = getsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, &tmp); if (ret < 0) { pr_perror("getsockopt failed"); return ret; } /* * Discover max DGRAM size */ tmp = sizeof(size); size = 0; ret = getsockopt(sock_fd, SOL_SOCKET, SO_SNDBUF, &size, &tmp); if (ret < 0) { pr_perror("getsockopt failed"); return ret; } /* Note: 32 bytes will be used by kernel for protocol header. */ size -= 32; /* * Allocate data for a stream. */ data = xmalloc(size); if (!data) return -1; /* * Enable peek offset incrementation. */ ret = setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &ret, sizeof(int)); if (ret < 0) { pr_perror("setsockopt fail"); goto err_brk; } pe.id_for = sock_id; while (1) { char cmsg[CMSG_MAX_SIZE]; struct iovec iov = { .iov_base = data, .iov_len = size, }; struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1, .msg_control = &cmsg, .msg_controllen = sizeof(cmsg), }; ret = pe.length = recvmsg(sock_fd, &msg, MSG_DONTWAIT | MSG_PEEK); if (!ret) /* * It means, that peer has performed an * orderly shutdown, so we're done. */ break; else if (ret < 0) { if (errno == EAGAIN) break; /* we're done */ pr_perror("recvmsg fail: error"); goto err_set_sock; } if (msg.msg_flags & MSG_TRUNC) { /* * DGRAM truncated. This should not happen. But we have * to check... 
*/ pr_err("sys_recvmsg failed: truncated\n"); ret = -E2BIG; goto err_set_sock; } if (dump_packet_cmsg(&msg, &pe)) goto err_set_sock; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_SK_QUEUES), &pe, PB_SK_QUEUES); if (ret < 0) { ret = -EIO; goto err_set_sock; } ret = write_img_buf(img_from_set(glob_imgset, CR_FD_SK_QUEUES), data, pe.length); if (ret < 0) { ret = -EIO; goto err_set_sock; } if (pe.scm) release_cmsg(&pe); } ret = 0; err_set_sock: /* * Restore original peek offset. */ if (setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, sizeof(int))) { pr_perror("setsockopt failed on restore"); ret = -1; } err_brk: xfree(data); return ret; } static int send_one_pkt(int fd, struct sk_packet *pkt) { int ret; SkPacketEntry *entry = pkt->entry; struct msghdr mh = {}; struct iovec iov; mh.msg_iov = &iov; mh.msg_iovlen = 1; iov.iov_base = pkt->data; iov.iov_len = entry->length; if (pkt->scm != NULL) { mh.msg_controllen = pkt->scm_len; mh.msg_control = pkt->scm; } /* * Don't try to use sendfile here, because it use sendpage() and * all data are split on pages and a new skb is allocated for * each page. It creates a big overhead on SNDBUF. * sendfile() isn't suitable for DGRAM sockets, because message * boundaries messages should be saved. 
*/ ret = sendmsg(fd, &mh, 0); xfree(pkt->data); if (ret < 0) { pr_perror("Failed to send packet"); return -1; } if (ret != entry->length) { pr_err("Restored skb trimmed to %d/%d\n", ret, (unsigned int)entry->length); return -1; } return 0; } int restore_sk_queue(int fd, unsigned int peer_id) { struct sk_packet *pkt, *tmp; int ret = -1; pr_info("Trying to restore recv queue for %u\n", peer_id); if (restore_prepare_socket(fd)) goto out; list_for_each_entry_safe(pkt, tmp, &packets_list, list) { SkPacketEntry *entry = pkt->entry; if (entry->id_for != peer_id) continue; pr_info("\tRestoring %d-bytes skb for %u\n", (unsigned int)entry->length, peer_id); ret = send_one_pkt(fd, pkt); if (ret) goto out; list_del(&pkt->list); sk_packet_entry__free_unpacked(entry, NULL); xfree(pkt); } ret = 0; out: return ret; } int prepare_scms(void) { struct sk_packet *pkt; pr_info("Preparing SCMs\n"); list_for_each_entry(pkt, &packets_list, list) { SkPacketEntry *pe = pkt->entry; ScmEntry *se; struct cmsghdr *ch; if (!pe->n_scm) continue; se = pe->scm[0]; /* Only 1 SCM is possible */ if (se->type == SCM_RIGHTS) { pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int)); pkt->scm = xmalloc(pkt->scm_len); if (!pkt->scm) return -1; ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */ ch->cmsg_level = SOL_SOCKET; ch->cmsg_type = SCM_RIGHTS; ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int)); if (unix_note_scm_rights(pe->id_for, se->rights, (int *)CMSG_DATA(ch), se->n_rights)) return -1; continue; } pr_err("Unsupported scm %d in image\n", se->type); return -1; } return 0; } criu-3.6/criu/sk-tcp.c000066400000000000000000000216201317335042600146400ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "../soccr/soccr.h" #include "cr_options.h" #include "util.h" #include "common/list.h" #include "log.h" #include "files.h" #include "sockets.h" #include "sk-inet.h" #include "netfilter.h" #include "image.h" #include "namespaces.h" #include "xmalloc.h" 
#include "config.h" #include "kerndat.h" #include "restorer.h" #include "rst-malloc.h" #include "protobuf.h" #include "images/tcp-stream.pb-c.h" static LIST_HEAD(cpt_tcp_repair_sockets); static LIST_HEAD(rst_tcp_repair_sockets); static int tcp_repair_establised(int fd, struct inet_sk_desc *sk) { int ret; struct libsoccr_sk *socr; pr_info("\tTurning repair on for socket %x\n", sk->sd.ino); /* * Keep the socket open in criu till the very end. In * case we close this fd after one task fd dumping and * fail we'll have to turn repair mode off */ sk->rfd = dup(fd); if (sk->rfd < 0) { pr_perror("Can't save socket fd for repair"); goto err1; } if (!(root_ns_mask & CLONE_NEWNET)) { ret = nf_lock_connection(sk); if (ret < 0) goto err2; } socr = libsoccr_pause(sk->rfd); if (!socr) goto err3; sk->priv = socr; list_add_tail(&sk->rlist, &cpt_tcp_repair_sockets); return 0; err3: if (!(root_ns_mask & CLONE_NEWNET)) nf_unlock_connection(sk); err2: close(sk->rfd); err1: return -1; } static void tcp_unlock_one(struct inet_sk_desc *sk) { int ret; list_del(&sk->rlist); if (!(root_ns_mask & CLONE_NEWNET)) { ret = nf_unlock_connection(sk); if (ret < 0) pr_perror("Failed to unlock TCP connection"); } libsoccr_resume(sk->priv); sk->priv = NULL; /* * tcp_repair_off modifies SO_REUSEADDR so * don't forget to restore original value. 
*/ restore_opt(sk->rfd, SOL_SOCKET, SO_REUSEADDR, &sk->cpt_reuseaddr); close(sk->rfd); } void cpt_unlock_tcp_connections(void) { struct inet_sk_desc *sk, *n; list_for_each_entry_safe(sk, n, &cpt_tcp_repair_sockets, rlist) tcp_unlock_one(sk); } static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; int ret, aux; struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; char *buf; struct libsoccr_sk_data data; ret = libsoccr_save(socr, &data, sizeof(data)); if (ret < 0) { pr_err("libsoccr_save() failed with %d\n", ret); goto err_r; } if (ret != sizeof(data)) { pr_err("This libsocr is not supported (%d vs %d)\n", ret, (int)sizeof(data)); goto err_r; } sk->state = data.state; tse.inq_len = data.inq_len; tse.inq_seq = data.inq_seq; tse.outq_len = data.outq_len; tse.outq_seq = data.outq_seq; tse.unsq_len = data.unsq_len; tse.has_unsq_len = true; tse.mss_clamp = data.mss_clamp; tse.opt_mask = data.opt_mask; if (tse.opt_mask & TCPI_OPT_WSCALE) { tse.snd_wscale = data.snd_wscale; tse.rcv_wscale = data.rcv_wscale; tse.has_rcv_wscale = true; } if (tse.opt_mask & TCPI_OPT_TIMESTAMPS) { tse.timestamp = data.timestamp; tse.has_timestamp = true; } if (data.flags & SOCCR_FLAGS_WINDOW) { tse.has_snd_wl1 = true; tse.has_snd_wnd = true; tse.has_max_window = true; tse.has_rcv_wnd = true; tse.has_rcv_wup = true; tse.snd_wl1 = data.snd_wl1; tse.snd_wnd = data.snd_wnd; tse.max_window = data.max_window; tse.rcv_wnd = data.rcv_wnd; tse.rcv_wup = data.rcv_wup; } /* * TCP socket options */ if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) goto err_opt; if (aux) { tse.has_nodelay = true; tse.nodelay = true; } if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) goto err_opt; if (aux) { tse.has_cork = true; tse.cork = true; } /* * Push the stuff to image */ img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino); if (!img) goto err_img; ret = pb_write_one(img, &tse, PB_TCP_STREAM); if (ret < 0) goto err_iw; buf = libsoccr_get_queue_bytes(socr, 
TCP_RECV_QUEUE, SOCCR_MEM_EXCL); if (buf) { ret = write_img_buf(img, buf, tse.inq_len); if (ret < 0) goto err_iw; xfree(buf); } buf = libsoccr_get_queue_bytes(socr, TCP_SEND_QUEUE, SOCCR_MEM_EXCL); if (buf) { ret = write_img_buf(img, buf, tse.outq_len); if (ret < 0) goto err_iw; xfree(buf); } pr_info("Done\n"); err_iw: close_image(img); err_img: err_opt: err_r: return ret; } int dump_one_tcp(int fd, struct inet_sk_desc *sk) { if (sk->dst_port == 0) return 0; pr_info("Dumping TCP connection\n"); if (tcp_repair_establised(fd, sk)) return -1; if (dump_tcp_conn_state(sk)) return -1; /* * Socket is left in repair mode, so that at the end it's just * closed and the connection is silently terminated */ return 0; } static int read_tcp_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, int queue, u32 len, struct cr_img *img) { char *buf; buf = xmalloc(len); if (!buf) return -1; if (read_img_buf(img, buf, len) < 0) goto err; return libsoccr_set_queue_bytes(sk, queue, buf, SOCCR_MEM_EXCL); err: xfree(buf); return -1; } static int read_tcp_queues(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, struct cr_img *img) { u32 len; len = data->inq_len; if (len && read_tcp_queue(sk, data, TCP_RECV_QUEUE, len, img)) return -1; len = data->outq_len; if (len && read_tcp_queue(sk, data, TCP_SEND_QUEUE, len, img)) return -1; return 0; } static int restore_tcp_conn_state(int sk, struct libsoccr_sk *socr, struct inet_sk_info *ii) { int aux; struct cr_img *img; TcpStreamEntry *tse; struct libsoccr_sk_data data = {}; union libsoccr_addr sa_src, sa_dst; pr_info("Restoring TCP connection id %x ino %x\n", ii->ie->id, ii->ie->ino); img = open_image(CR_FD_TCP_STREAM, O_RSTR, ii->ie->ino); if (!img) goto err; if (pb_read_one(img, &tse, PB_TCP_STREAM) < 0) goto err_c; if (!tse->has_unsq_len) { pr_err("No unsq len in the image\n"); goto err_c; } data.state = ii->ie->state;; data.inq_len = tse->inq_len; data.inq_seq = tse->inq_seq; data.outq_len = tse->outq_len; data.outq_seq = 
tse->outq_seq; data.unsq_len = tse->unsq_len; data.mss_clamp = tse->mss_clamp; data.opt_mask = tse->opt_mask; if (tse->opt_mask & TCPI_OPT_WSCALE) { if (!tse->has_rcv_wscale) { pr_err("No rcv wscale in the image\n"); goto err_c; } data.snd_wscale = tse->snd_wscale; data.rcv_wscale = tse->rcv_wscale; } if (tse->opt_mask & TCPI_OPT_TIMESTAMPS) { if (!tse->has_timestamp) { pr_err("No timestamp in the image\n"); goto err_c; } data.timestamp = tse->timestamp; } if (tse->has_snd_wnd) { data.flags |= SOCCR_FLAGS_WINDOW; data.snd_wl1 = tse->snd_wl1; data.snd_wnd = tse->snd_wnd; data.max_window = tse->max_window; data.rcv_wnd = tse->rcv_wnd; data.rcv_wup = tse->rcv_wup; } if (restore_sockaddr(&sa_src, ii->ie->family, ii->ie->src_port, ii->ie->src_addr, 0) < 0) goto err_c; if (restore_sockaddr(&sa_dst, ii->ie->family, ii->ie->dst_port, ii->ie->dst_addr, 0) < 0) goto err_c; libsoccr_set_addr(socr, 1, &sa_src, 0); libsoccr_set_addr(socr, 0, &sa_dst, 0); /* * O_NONBLOCK has to be set before libsoccr_restore(), * it is required to restore syn-sent sockets. */ if (restore_prepare_socket(sk)) goto err_c; if (read_tcp_queues(socr, &data, img)) goto err_c; if (libsoccr_restore(socr, &data, sizeof(data))) goto err_c; if (tse->has_nodelay && tse->nodelay) { aux = 1; if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux)) goto err_c; } if (tse->has_cork && tse->cork) { aux = 1; if (restore_opt(sk, SOL_TCP, TCP_CORK, &aux)) goto err_c; } tcp_stream_entry__free_unpacked(tse, NULL); close_image(img); return 0; err_c: tcp_stream_entry__free_unpacked(tse, NULL); close_image(img); err: return -1; } int prepare_tcp_socks(struct task_restore_args *ta) { struct inet_sk_info *ii; ta->tcp_socks = (struct rst_tcp_sock *) rst_mem_align_cpos(RM_PRIVATE); ta->tcp_socks_n = 0; list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) { struct rst_tcp_sock *rs; /* * rst_tcp_repair_sockets contains all sockets, so we need to * select sockets which restored in a current porcess. 
*/ if (ii->sk_fd == -1) continue; rs = rst_mem_alloc(sizeof(*rs), RM_PRIVATE); if (!rs) return -1; rs->sk = ii->sk_fd; rs->reuseaddr = ii->ie->opts->reuseaddr; ta->tcp_socks_n++; } return 0; } int restore_one_tcp(int fd, struct inet_sk_info *ii) { struct libsoccr_sk *sk; pr_info("Restoring TCP connection\n"); if (opts.tcp_close && ii->ie->state != TCP_LISTEN && ii->ie->state != TCP_CLOSE) { return 0; } sk = libsoccr_pause(fd); if (!sk) return -1; if (restore_tcp_conn_state(fd, sk, ii)) return -1; return 0; } void tcp_locked_conn_add(struct inet_sk_info *ii) { list_add_tail(&ii->rlist, &rst_tcp_repair_sockets); ii->sk_fd = -1; } void rst_unlock_tcp_connections(void) { struct inet_sk_info *ii; /* Network will be unlocked by network-unlock scripts */ if (root_ns_mask & CLONE_NEWNET) return; list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) nf_unlock_connection_info(ii); } criu-3.6/criu/sk-unix.c000066400000000000000000001134671317335042600150500ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "libnetlink.h" #include "cr_options.h" #include "imgset.h" #include "unix_diag.h" #include "files.h" #include "file-ids.h" #include "log.h" #include "util.h" #include "util-pie.h" #include "sockets.h" #include "sk-queue.h" #include "mount.h" #include "cr-service.h" #include "plugin.h" #include "namespaces.h" #include "pstree.h" #include "external.h" #include "crtools.h" #include "protobuf.h" #include "images/sk-unix.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "sk unix: " /* * By-default, when dumping a unix socket, we should dump its peer * as well. Which in turn means, we should dump the task(s) that have * this peer opened. * * Sometimes, we can break this rule and dump only one end of the * unix sockets pair, and on restore time connect() this end back to * its peer. 
* * So, to resolve this situation we mark the peers we don't dump * as "external" and require the --ext-unix-sk option. */ #define USK_EXTERN (1 << 0) #define USK_SERVICE (1 << 1) #define USK_CALLBACK (1 << 2) #define USK_INHERIT (1 << 3) typedef struct { char *dir; unsigned int udiag_vfs_dev; unsigned int udiag_vfs_ino; } rel_name_desc_t; struct unix_sk_desc { struct socket_desc sd; unsigned int type; unsigned int state; unsigned int peer_ino; unsigned int rqlen; unsigned int wqlen; unsigned int namelen; char *name; rel_name_desc_t *rel_name; unsigned int nr_icons; unsigned int *icons; unsigned char shutdown; bool deleted; mode_t mode; uid_t uid; gid_t gid; struct list_head list; int fd; struct list_head peer_list; struct list_head peer_node; UnixSkEntry *ue; }; static LIST_HEAD(unix_sockets); struct unix_sk_listen_icon { unsigned int peer_ino; struct unix_sk_desc *sk_desc; struct unix_sk_listen_icon *next; }; #define SK_HASH_SIZE 32 static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE]; static struct unix_sk_listen_icon *lookup_unix_listen_icons(int peer_ino) { struct unix_sk_listen_icon *ic; for (ic = unix_listen_icons[peer_ino % SK_HASH_SIZE]; ic; ic = ic->next) if (ic->peer_ino == peer_ino) return ic; return NULL; } static void show_one_unix(char *act, const struct unix_sk_desc *sk) { pr_debug("\t%s: ino %#x peer_ino %#x family %4d type %4d state %2d name %s\n", act, sk->sd.ino, sk->peer_ino, sk->sd.family, sk->type, sk->state, sk->name); if (sk->nr_icons) { int i; for (i = 0; i < sk->nr_icons; i++) pr_debug("\t\ticon: %4d\n", sk->icons[i]); } } static void show_one_unix_img(const char *act, const UnixSkEntry *e) { pr_info("\t%s: id %#x ino %#x peer %#x type %d state %d name %d bytes\n", act, e->id, e->ino, e->peer, e->type, e->state, (int)e->name.len); } static int can_dump_unix_sk(const struct unix_sk_desc *sk) { /* * The last case in this "if" is seqpacket socket, * that is connected to cr_service. We will dump * it properly below. 
*/ if (sk->type != SOCK_STREAM && sk->type != SOCK_DGRAM && sk->type != SOCK_SEQPACKET) { pr_err("Unsupported type (%d) on socket %x.\n" "Only stream/dgram/seqpacket are supported.\n", sk->type, sk->sd.ino); return 0; } switch (sk->state) { case TCP_LISTEN: case TCP_ESTABLISHED: case TCP_CLOSE: break; default: pr_err("Unknown state %d for unix socket %x\n", sk->state, sk->sd.ino); return 0; } return 1; } static bool unix_sk_exception_lookup_id(unsigned int ino) { char id[20]; snprintf(id, sizeof(id), "unix[%u]", ino); if (external_lookup_id(id)) { pr_debug("Found ino %u in exception unix sk list\n", (unsigned int)ino); return true; } return false; } static int write_unix_entry(struct unix_sk_desc *sk) { int ret; FileEntry fe = FILE_ENTRY__INIT; fe.type = FD_TYPES__UNIXSK; fe.id = sk->ue->id; fe.usk = sk->ue; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); show_one_unix_img("Dumped", sk->ue); release_skopts(sk->ue->opts); xfree(sk->ue); sk->ue = NULL; return ret; } static int resolve_rel_name(struct unix_sk_desc *sk, const struct fd_parms *p) { rel_name_desc_t *rel_name = sk->rel_name; const char *dirs[] = { "cwd", "root" }; struct pstree_item *task; int mntns_root, i; struct ns_id *ns; task = pstree_item_by_real(p->pid); if (!task) { pr_err("Can't find task with pid %d\n", p->pid); return -ENOENT; } ns = lookup_ns_by_id(task->ids->mnt_ns_id, &mnt_ns_desc); if (!ns) { pr_err("Can't resolve mount namespace for pid %d\n", p->pid); return -ENOENT; } mntns_root = mntns_get_root_fd(ns); if (mntns_root < 0) { pr_err("Can't resolve fs root for pid %d\n", p->pid); return -ENOENT; } pr_debug("Resolving relative name %s for socket %x\n", sk->name, sk->sd.ino); for (i = 0; i < ARRAY_SIZE(dirs); i++) { char dir[PATH_MAX], path[PATH_MAX]; struct stat st; int ret; snprintf(path, sizeof(path), "/proc/%d/%s", p->pid, dirs[i]); ret = readlink(path, dir, sizeof(dir)); if (ret < 0 || (size_t)ret == sizeof(dir)) { pr_err("Can't readlink for %s\n", dirs[i]); 
return -1; } dir[ret] = 0; snprintf(path, sizeof(path), ".%s/%s", dir, sk->name); if (fstatat(mntns_root, path, &st, 0)) { if (errno == ENOENT) continue; goto err; } if ((st.st_ino == rel_name->udiag_vfs_ino) && phys_stat_dev_match(st.st_dev, rel_name->udiag_vfs_dev, ns, &path[1])) { rel_name->dir = xstrdup(dir); if (!rel_name->dir) return -ENOMEM; pr_debug("Resolved relative socket name to dir %s\n", rel_name->dir); sk->mode = st.st_mode; sk->uid = st.st_uid; sk->gid = st.st_gid; return 0; } } err: pr_err("Can't resolve name for socket %#x\n", rel_name->udiag_vfs_ino); return -ENOENT; } static int dump_one_unix_fd(int lfd, u32 id, const struct fd_parms *p) { struct unix_sk_desc *sk, *peer; UnixSkEntry *ue; SkOptsEntry *skopts; FilePermsEntry *perms; FownEntry *fown; ue = xmalloc(sizeof(UnixSkEntry) + sizeof(SkOptsEntry) + sizeof(FilePermsEntry) + sizeof(FownEntry)); if (ue == NULL) return -1; skopts = (void *) ue + sizeof(UnixSkEntry); perms = (void *) skopts + sizeof(SkOptsEntry); fown = (void *) perms + sizeof(FilePermsEntry); unix_sk_entry__init(ue); sk_opts_entry__init(skopts); file_perms_entry__init(perms); *fown = p->fown; sk = (struct unix_sk_desc *)lookup_socket(p->stat.st_ino, PF_UNIX, 0); if (IS_ERR_OR_NULL(sk)) { pr_err("Unix socket %#x not found\n", (int)p->stat.st_ino); goto err; } if (!can_dump_unix_sk(sk)) goto err; BUG_ON(sk->sd.already_dumped); ue->name.len = (size_t)sk->namelen; ue->name.data = (void *)sk->name; ue->id = id; ue->ino = sk->sd.ino; ue->type = sk->type; ue->state = sk->state; ue->flags = p->flags; ue->backlog = sk->wqlen; ue->peer = sk->peer_ino; ue->fown = fown; ue->opts = skopts; ue->uflags = 0; if (sk->rel_name) { if (resolve_rel_name(sk, p)) goto err; ue->name_dir = sk->rel_name->dir; } /* * Check if this socket is connected to criu service. * Dump it like closed one and mark it for restore. 
*/ if (unlikely(ue->peer == service_sk_ino)) { ue->state = TCP_CLOSE; ue->peer = 0; ue->uflags |= USK_SERVICE; } if (sk->namelen && *sk->name) { ue->file_perms = perms; perms->mode = sk->mode; perms->uid = userns_uid(sk->uid); perms->gid = userns_gid(sk->gid); } if (sk->deleted) { ue->has_deleted = true; ue->deleted = sk->deleted; } sk_encode_shutdown(ue, sk->shutdown); if (ue->peer) { peer = (struct unix_sk_desc *)lookup_socket(ue->peer, PF_UNIX, 0); if (IS_ERR_OR_NULL(peer)) { pr_err("Unix socket %#x without peer %#x\n", ue->ino, ue->peer); goto err; } /* * Peer should have us as peer or have a name by which * we can access one. */ if (peer->peer_ino != ue->ino) { if (!peer->name) { pr_err("Unix socket %#x with unreachable peer %#x (%#x/%s)\n", ue->ino, ue->peer, peer->peer_ino, peer->name); goto err; } } /* * It can be external socket, so we defer dumping * until all sockets the program owns are processed. */ if (!peer->sd.already_dumped) { show_one_unix("Add a peer", peer); list_add(&sk->peer_node, &peer->peer_list); sk->fd = dup(lfd); if (sk->fd < 0) { pr_perror("Unable to dup(%d)", lfd); goto err; } } if ((ue->type != SOCK_DGRAM) && ( ((ue->shutdown == SK_SHUTDOWN__READ) && (peer->shutdown != SK_SHUTDOWN__WRITE)) || ((ue->shutdown == SK_SHUTDOWN__WRITE) && (peer->shutdown != SK_SHUTDOWN__READ)) || ((ue->shutdown == SK_SHUTDOWN__BOTH) && (peer->shutdown != SK_SHUTDOWN__BOTH)) )) { /* * Usually this doesn't happen, however it's possible if * socket was shut down before connect() (see sockets03.c test). * On restore we will shutdown both end (iow socktes will be in * matched state). This shoudn't be a problem, since kernel seems * to check both ends on read()/write(). Thus mismatched sockets behave * the same way as matched. 
*/ pr_warn("Shutdown mismatch %u:%d -> %u:%d\n", ue->ino, ue->shutdown, peer->sd.ino, peer->shutdown); } } else if (ue->state == TCP_ESTABLISHED) { const struct unix_sk_listen_icon *e; e = lookup_unix_listen_icons(ue->ino); if (!e) { /* * ESTABLISHED socket without peer and without * anyone waiting for it should be semi-closed * connection. */ if (ue->shutdown == SK_SHUTDOWN__BOTH) { pr_info("Dumping semi-closed connection\n"); goto dump; } pr_err("Dangling connection %#x\n", ue->ino); goto err; } /* * If this is in-flight connection we need to figure * out where to connect it on restore. Thus, tune up peer * id by searching an existing listening socket. * * Note the socket name will be found at restore stage, * not now, just to reduce size of dump files. */ /* e->sk_desc is _never_ NULL */ if (e->sk_desc->state != TCP_LISTEN) { pr_err("In-flight connection on " "non-listening socket %d\n", ue->ino); goto err; } ue->peer = e->sk_desc->sd.ino; pr_debug("\t\tFixed inflight socket %#x peer %#x)\n", ue->ino, ue->peer); } dump: if (dump_socket_opts(lfd, skopts)) goto err; /* * If a stream listening socket has non-zero rqueue, this * means there are in-flight connections waiting to get * accept()-ed. We handle them separately with the "icons" * (i stands for in-flight, cons -- for connections) things. */ if (sk->rqlen != 0 && !(sk->type == SOCK_STREAM && sk->state == TCP_LISTEN)) if (dump_sk_queue(lfd, id)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); show_one_unix("Dumping", sk); sk->ue = ue; /* * Postpone writing the entry if a peer isn't found yet. * It's required, because we may need to modify the entry. * For example, if a socket is external and is dumped by * a callback, the USK_CALLBACK flag must be set. 
*/ if (list_empty(&sk->peer_node) && write_unix_entry(sk)) return -1; sk->sd.already_dumped = 1; while (!list_empty(&sk->peer_list)) { struct unix_sk_desc *psk; psk = list_first_entry(&sk->peer_list, struct unix_sk_desc, peer_node); close_safe(&psk->fd); list_del_init(&psk->peer_node); if (write_unix_entry(psk)) return -1; } return 0; err: release_skopts(skopts); xfree(ue); return -1; } const struct fdtype_ops unix_dump_ops = { .type = FD_TYPES__UNIXSK, .dump = dump_one_unix_fd, }; /* * Returns: < 0 on error, 0 if OK, 1 to skip the socket */ static int unix_process_name(struct unix_sk_desc *d, const struct unix_diag_msg *m, struct nlattr **tb) { int len, ret; char *name; len = nla_len(tb[UNIX_DIAG_NAME]); name = xmalloc(len + 1); if (!name) return -ENOMEM; memcpy(name, nla_data(tb[UNIX_DIAG_NAME]), len); name[len] = '\0'; if (name[0] != '\0') { struct unix_diag_vfs *uv; bool deleted = false; char rpath[PATH_MAX]; struct ns_id *ns; struct stat st; int mntns_root; if (!tb[UNIX_DIAG_VFS]) { pr_err("Bound socket w/o inode %#x\n", m->udiag_ino); goto skip; } ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); if (!ns) { ret = -ENOENT; goto out; } mntns_root = mntns_get_root_fd(ns); if (mntns_root < 0) { ret = -ENOENT; goto out; } uv = RTA_DATA(tb[UNIX_DIAG_VFS]); if (name[0] != '/') { /* * Relative names are be resolved later at first * dump attempt. 
*/ rel_name_desc_t *rel_name = xzalloc(sizeof(*rel_name)); if (!rel_name) { ret = -ENOMEM; goto out; } rel_name->udiag_vfs_dev = uv->udiag_vfs_dev; rel_name->udiag_vfs_ino = uv->udiag_vfs_ino; d->rel_name = rel_name; goto postprone; } snprintf(rpath, sizeof(rpath), ".%s", name); if (fstatat(mntns_root, rpath, &st, 0)) { if (errno != ENOENT) { pr_warn("Can't stat socket %#x(%s), skipping: %m (err %d)\n", m->udiag_ino, rpath, errno); goto skip; } pr_info("unix: Dropping path %s for unlinked sk %#x\n", name, m->udiag_ino); deleted = true; } else if ((st.st_ino != uv->udiag_vfs_ino) || !phys_stat_dev_match(st.st_dev, uv->udiag_vfs_dev, ns, name)) { pr_info("unix: Dropping path %s for unlinked bound " "sk %#x.%#x real %#x.%#x\n", name, (int)st.st_dev, (int)st.st_ino, (int)uv->udiag_vfs_dev, (int)uv->udiag_vfs_ino); deleted = true; } d->mode = st.st_mode; d->uid = st.st_uid; d->gid = st.st_gid; d->deleted = deleted; } postprone: d->namelen = len; d->name = name; return 0; out: xfree(name); return ret; skip: ret = 1; goto out; } static int unix_collect_one(const struct unix_diag_msg *m, struct nlattr **tb) { struct unix_sk_desc *d; int ret = 0; d = xzalloc(sizeof(*d)); if (!d) return -1; d->type = m->udiag_type; d->state = m->udiag_state; INIT_LIST_HEAD(&d->list); INIT_LIST_HEAD(&d->peer_list); INIT_LIST_HEAD(&d->peer_node); d->fd = -1; if (tb[UNIX_DIAG_SHUTDOWN]) d->shutdown = nla_get_u8(tb[UNIX_DIAG_SHUTDOWN]); else pr_err_once("No socket shutdown info\n"); if (tb[UNIX_DIAG_PEER]) d->peer_ino = nla_get_u32(tb[UNIX_DIAG_PEER]); if (tb[UNIX_DIAG_NAME]) { ret = unix_process_name(d, m, tb); if (ret < 0) goto err; else if (ret == 1) goto skip; BUG_ON(ret != 0); } if (tb[UNIX_DIAG_ICONS]) { int len = nla_len(tb[UNIX_DIAG_ICONS]); int i; d->icons = xmalloc(len); if (!d->icons) goto err; memcpy(d->icons, nla_data(tb[UNIX_DIAG_ICONS]), len); d->nr_icons = len / sizeof(u32); /* * Remember these sockets, we will need them * to fix up in-flight sockets peers. 
*/ for (i = 0; i < d->nr_icons; i++) { struct unix_sk_listen_icon *e, **chain; int n; e = xzalloc(sizeof(*e)); if (!e) goto err; n = d->icons[i]; chain = &unix_listen_icons[n % SK_HASH_SIZE]; e->next = *chain; *chain = e; pr_debug("\t\tCollected icon %d\n", d->icons[i]); e->peer_ino = n; e->sk_desc = d; } } if (tb[UNIX_DIAG_RQLEN]) { struct unix_diag_rqlen *rq; rq = (struct unix_diag_rqlen *)RTA_DATA(tb[UNIX_DIAG_RQLEN]); d->rqlen = rq->udiag_rqueue; d->wqlen = rq->udiag_wqueue; } sk_collect_one(m->udiag_ino, AF_UNIX, &d->sd); list_add_tail(&d->list, &unix_sockets); show_one_unix("Collected", d); return 0; err: ret = -1; skip: xfree(d->icons); xfree(d->name); xfree(d); return ret; } int unix_receive_one(struct nlmsghdr *h, void *arg) { struct unix_diag_msg *m = NLMSG_DATA(h); struct nlattr *tb[UNIX_DIAG_MAX+1]; nlmsg_parse(h, sizeof(struct unix_diag_msg), tb, UNIX_DIAG_MAX, NULL); return unix_collect_one(m, tb); } static int dump_external_sockets(struct unix_sk_desc *peer) { struct unix_sk_desc *sk; int ret; while (!list_empty(&peer->peer_list)) { sk = list_first_entry(&peer->peer_list, struct unix_sk_desc, peer_node); ret = run_plugins(DUMP_UNIX_SK, sk->fd, sk->sd.ino); if (ret == -ENOTSUP) { if (unix_sk_exception_lookup_id(sk->sd.ino)) { pr_debug("found exception for unix name-less external socket.\n"); } else { /* Legacy -x|--ext-unix-sk option handling */ if (!opts.ext_unix_sk) { show_one_unix("Runaway socket", peer); pr_err("External socket is used. 
" "Consider using --" USK_EXT_PARAM " option.\n"); return -1; } if (peer->type != SOCK_DGRAM) { show_one_unix("Ext stream not supported", peer); pr_err("Can't dump half of stream unix connection.\n"); return -1; } if (!peer->name) { show_one_unix("Ext dgram w/o name", peer); pr_err("Can't dump name-less external socket.\n"); pr_err("%d\n", sk->fd); return -1; } } } else if (ret < 0) return -1; else sk->ue->uflags |= USK_CALLBACK; if (write_unix_entry(sk)) return -1; close_safe(&sk->fd); list_del_init(&sk->peer_node); } return 0; } int fix_external_unix_sockets(void) { struct unix_sk_desc *sk; pr_debug("Dumping external sockets\n"); list_for_each_entry(sk, &unix_sockets, list) { FileEntry fe = FILE_ENTRY__INIT; UnixSkEntry e = UNIX_SK_ENTRY__INIT; FownEntry fown = FOWN_ENTRY__INIT; SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; if (sk->sd.already_dumped || list_empty(&sk->peer_list)) continue; show_one_unix("Dumping extern", sk); fd_id_generate_special(NULL, &e.id); e.ino = sk->sd.ino; e.type = SOCK_DGRAM; e.state = TCP_LISTEN; e.name.data = (void *)sk->name; e.name.len = (size_t)sk->namelen; e.uflags = USK_EXTERN; e.peer = 0; e.fown = &fown; e.opts = &skopts; fe.type = FD_TYPES__UNIXSK; fe.id = e.id; fe.usk = &e; if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) goto err; show_one_unix_img("Dumped extern", &e); if (dump_external_sockets(sk)) goto err; } return 0; err: return -1; } struct unix_sk_info { UnixSkEntry *ue; struct list_head list; char *name; char *name_dir; unsigned flags; struct unix_sk_info *peer; struct pprep_head peer_resolve; /* XXX : union with the above? */ struct file_desc d; struct list_head connected; /* List of sockets, connected to me */ struct list_head node; /* To link in peer's connected list */ struct list_head scm_fles; /* * For DGRAM sockets with queues, we should only restore the queue * once although it may be open by more than one tid. This is the peer * that should do the queueing. 
*/ u32 queuer; u8 bound:1; u8 listen:1; }; struct scm_fle { struct list_head l; struct fdinfo_list_entry *fle; }; #define USK_PAIR_MASTER 0x1 #define USK_PAIR_SLAVE 0x2 static struct unix_sk_info *find_unix_sk_by_ino(int ino) { struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { if (ui->ue->ino == ino) return ui; } return NULL; } static struct unix_sk_info *find_queuer_for(int id) { struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { if (ui->queuer == id) return ui; } return NULL; } static struct fdinfo_list_entry *get_fle_for_scm(struct file_desc *tgt, struct pstree_item *owner) { struct fdinfo_list_entry *fle; FdinfoEntry *e = NULL; int fd; list_for_each_entry(fle, &tgt->fd_info_head, desc_list) { if (fle->task == owner) /* * Owner already has this file in its fdtable. * Just use one. */ return fle; e = fle->fe; /* keep any for further reference */ } /* * Some other task restores this file. Pretend that * we're another user of it. */ fd = find_unused_fd(owner, -1); pr_info("`- will add SCM-only %d fd\n", fd); if (e != NULL) { e = dup_fdinfo(e, fd, 0); if (!e) { pr_err("Can't duplicate fdinfo for scm\n"); return NULL; } } else { /* * This can happen if the file in question is * sent over the socket and closed. In this case * we need to ... invent a new one! */ e = xmalloc(sizeof(*e)); if (!e) return NULL; fdinfo_entry__init(e); e->id = tgt->id; e->type = tgt->ops->type; e->fd = fd; e->flags = 0; } /* * Make this fle fake, so that files collecting engine * closes them at the end. 
*/ return collect_fd_to(vpid(owner), e, rsti(owner), tgt, true); } int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids) { struct unix_sk_info *ui; struct pstree_item *owner; int i; ui = find_queuer_for(id_for); if (!ui) { pr_err("Can't find sender for %d\n", id_for); return -1; } pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id); /* * This is the task that will restore this socket */ owner = file_master(&ui->d)->task; pr_info("-> will set up deps\n"); /* * The ui will send data to the rights receiver. Add a fake fle * for the file and a dependency. */ for (i = 0; i < n_ids; i++) { struct file_desc *tgt; struct scm_fle *sfle; tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]); if (!tgt) { pr_err("Can't find fdesc to send\n"); return -1; } pr_info("scm: add file %d -> %d\n", tgt->id, vpid(owner)); sfle = xmalloc(sizeof(*sfle)); if (!sfle) return -1; sfle->fle = get_fle_for_scm(tgt, owner); if (!sfle->fle) { pr_err("Can't request new fle for scm\n"); return -1; } list_add_tail(&sfle->l, &ui->scm_fles); fds[i] = sfle->fle->fe->fd; } return 0; } static int chk_restored_scms(struct unix_sk_info *ui) { struct scm_fle *sf, *n; list_for_each_entry_safe(sf, n, &ui->scm_fles, l) { if (sf->fle->stage < FLE_OPEN) return 1; /* Optimization for the next pass */ list_del(&sf->l); xfree(sf); } return 0; } static int wake_connected_sockets(struct unix_sk_info *ui) { struct fdinfo_list_entry *fle; struct unix_sk_info *tmp; list_for_each_entry(tmp, &ui->connected, node) { fle = file_master(&tmp->d); set_fds_event(fle->pid); } return 0; } static bool peer_is_not_prepared(struct unix_sk_info *peer) { if (peer->ue->state != TCP_LISTEN) return (!peer->bound); else return (!peer->listen); } static int shutdown_unix_sk(int sk, struct unix_sk_info *ui) { int how; UnixSkEntry *ue = ui->ue; if (!ue->has_shutdown || ue->shutdown == SK_SHUTDOWN__NONE) return 0; how = sk_decode_shutdown(ue->shutdown); if (shutdown(sk, how)) { pr_perror("Can't shutdown unix 
socket"); return -1; } pr_debug("Socket %#x is shut down %d\n", ue->ino, how); return 0; } static int restore_sk_common(int fd, struct unix_sk_info *ui) { if (rst_file_params(fd, ui->ue->fown, ui->ue->flags)) return -1; if (restore_socket_opts(fd, ui->ue->opts)) return -1; if (shutdown_unix_sk(fd, ui)) return -1; return 0; } static void revert_unix_sk_cwd(int *prev_cwd_fd, int *root_fd) { if (*root_fd >= 0) { if (fchdir(*root_fd) || chroot(".")) pr_perror("Can't revert root directory"); close_safe(root_fd); } if (prev_cwd_fd && *prev_cwd_fd >= 0) { if (fchdir(*prev_cwd_fd)) pr_perror("Can't revert working dir"); else pr_debug("Reverted working dir\n"); close(*prev_cwd_fd); *prev_cwd_fd = -1; } } static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd, int *prev_root_fd) { static struct ns_id *root = NULL; *prev_cwd_fd = open(".", O_RDONLY); if (*prev_cwd_fd < 0) { pr_perror("Can't open current dir"); return -1; } if (prev_root_fd && (root_ns_mask & CLONE_NEWNS)) { if (root == NULL) root = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); *prev_root_fd = open("/", O_RDONLY); if (*prev_root_fd < 0) { pr_perror("Can't open current root"); goto err; } if (fchdir(root->mnt.root_fd)) { pr_perror("Unable to change current working dir"); goto err; } if (chroot(".")) { pr_perror("Unable to change root directory"); goto err; } } if (ui->name_dir) { if (chdir(ui->name_dir)) { pr_perror("Can't change working dir %s", ui->name_dir); goto err; } pr_debug("Change working dir to %s\n", ui->name_dir); } return 0; err: close_safe(prev_cwd_fd); if (prev_root_fd) close_safe(prev_root_fd); return -1; } static int post_open_unix_sk(struct file_desc *d, int fd) { struct unix_sk_info *ui; struct unix_sk_info *peer; struct sockaddr_un addr; int cwd_fd = -1, root_fd = -1; ui = container_of(d, struct unix_sk_info, d); BUG_ON((ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE)) || (ui->ue->uflags & (USK_CALLBACK | USK_INHERIT))); peer = ui->peer; BUG_ON(peer == NULL); /* 
Skip external sockets */ if (!list_empty(&peer->d.fd_info_head)) if (peer_is_not_prepared(peer)) return 1; memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_UNIX; memcpy(&addr.sun_path, peer->name, peer->ue->name.len); pr_info("\tConnect %#x to %#x\n", ui->ue->ino, peer->ue->ino); if (prep_unix_sk_cwd(peer, &cwd_fd, NULL)) return -1; if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + peer->ue->name.len) < 0) { pr_perror("Can't connect %#x socket", ui->ue->ino); revert_unix_sk_cwd(&cwd_fd, &root_fd); return -1; } revert_unix_sk_cwd(&cwd_fd, &root_fd); if (peer->queuer == ui->ue->id && restore_sk_queue(fd, peer->ue->id)) return -1; return restore_sk_common(fd, ui); } static int bind_unix_sk(int sk, struct unix_sk_info *ui) { struct sockaddr_un addr; int cwd_fd = -1, root_fd = -1; int ret = -1; if (ui->ue->name.len == 0) return 0; if ((ui->ue->type == SOCK_STREAM) && (ui->ue->state == TCP_ESTABLISHED)) { /* * FIXME this can be done, but for doing this properly we * need to bind socket to its name, then rename one to * some temporary unique one and after all the sockets are * restored we should walk those temp names and rename * some of them back to real ones. 
*/ ret = 0; goto done; } memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_UNIX; memcpy(&addr.sun_path, ui->name, ui->ue->name.len); if (prep_unix_sk_cwd(ui, &cwd_fd, NULL)) return -1; if (ui->ue->name.len) { ret = bind(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family) + ui->ue->name.len); if (ret < 0) { if (ui->ue->has_deleted && ui->ue->deleted && errno == EADDRINUSE) { char temp[PATH_MAX]; pr_info("found duplicate unix socket bound at %s\n", addr.sun_path); ret = snprintf(temp, sizeof(temp), "%s-%s-%d", addr.sun_path, "criu-temp", getpid()); /* this shouldn't happen, since sun_addr is only 108 chars long */ if (ret < 0 || ret >= sizeof(temp)) { pr_err("snprintf of %s failed?\n", addr.sun_path); goto done; } ret = rename(addr.sun_path, temp); if (ret < 0) { pr_perror("couldn't move socket for binding"); goto done; } ret = bind(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family) + ui->ue->name.len); if (ret < 0) { pr_perror("Can't bind socket after move"); goto done; } ret = rename(temp, addr.sun_path); if (ret < 0) { pr_perror("couldn't move socket back"); goto done; } /* we've handled the deleted-ness of this * socket and we don't want to delete it later * since it's not /this/ socket. 
*/ ui->ue->deleted = false; } else { pr_perror("Can't bind socket"); goto done; } } if (*ui->name && ui->ue->file_perms) { FilePermsEntry *perms = ui->ue->file_perms; char fname[PATH_MAX]; if (ui->ue->name.len >= sizeof(fname)) { pr_err("The file name is too long\n"); goto done; } memcpy(fname, ui->name, ui->ue->name.len); fname[ui->ue->name.len] = '\0'; if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) == -1) { pr_perror("Unable to change file owner and group"); goto done; } if (fchmodat(AT_FDCWD, fname, perms->mode, 0) == -1) { pr_perror("Unable to change file mode bits"); goto done; } } if (ui->ue->deleted && unlink((char *)ui->ue->name.data) < 0) { pr_perror("failed to unlink %s", ui->ue->name.data); goto done; } } if (ui->ue->state != TCP_LISTEN) { ui->bound = 1; wake_connected_sockets(ui); } ret = 0; done: revert_unix_sk_cwd(&cwd_fd, &root_fd); return ret; } static int open_unixsk_pair_master(struct unix_sk_info *ui, int *new_fd) { int sk[2]; struct unix_sk_info *peer = ui->peer; pr_info("Opening pair master (id %#x ino %#x peer %#x)\n", ui->ue->id, ui->ue->ino, ui->ue->peer); if (socketpair(PF_UNIX, ui->ue->type, 0, sk) < 0) { pr_perror("Can't make socketpair"); return -1; } if (restore_sk_queue(sk[0], peer->ue->id)) return -1; if (restore_sk_queue(sk[1], ui->ue->id)) return -1; if (bind_unix_sk(sk[0], ui)) return -1; if (restore_sk_common(sk[0], ui)) return -1; if (send_desc_to_peer(sk[1], &peer->d)) { pr_err("Can't send pair slave\n"); return -1; } close(sk[1]); *new_fd = sk[0]; return 0; } static int open_unixsk_pair_slave(struct unix_sk_info *ui, int *new_fd) { int sk, ret; ret = recv_desc_from_peer(&ui->d, &sk); if (ret != 0) { if (ret != 1) pr_err("Can't recv pair slave\n"); return ret; } if (bind_unix_sk(sk, ui)) return -1; if (restore_sk_common(sk, ui)) return -1; *new_fd = sk; return 0; } static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) { int sk; pr_info("Opening standalone socket (id %#x ino %#x peer %#x)\n", 
ui->ue->id, ui->ue->ino, ui->ue->peer); /* * Check if this socket was connected to criu service. * If so, put response, that dumping and restoring * was successful. */ if (ui->ue->uflags & USK_SERVICE) { int sks[2]; if (socketpair(PF_UNIX, ui->ue->type, 0, sks)) { pr_perror("Can't create socketpair"); return -1; } if (send_criu_dump_resp(sks[1], true, true) == -1) return -1; close(sks[1]); sk = sks[0]; } else if ((ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer) { int ret, sks[2]; if (ui->ue->type != SOCK_STREAM) { pr_err("Non-stream socket %x in established state\n", ui->ue->ino); return -1; } if (ui->ue->shutdown != SK_SHUTDOWN__BOTH) { pr_err("Wrong shutdown/peer state for %x\n", ui->ue->ino); return -1; } ret = socketpair(PF_UNIX, ui->ue->type, 0, sks); if (ret < 0) { pr_perror("Can't create socketpair"); return -1; } /* * Restore queue at the one end, * before closing the second one. */ if (restore_sk_queue(sks[1], ui->ue->id)) { pr_perror("Can't restore socket queue"); return -1; } close(sks[1]); sk = sks[0]; } else if (ui->ue->type == SOCK_DGRAM && !ui->queuer) { struct sockaddr_un addr; int sks[2]; if (socketpair(PF_UNIX, ui->ue->type, 0, sks) < 0) { pr_perror("Can't create socketpair"); return -1; } sk = sks[0]; addr.sun_family = AF_UNSPEC; /* * socketpair() assigns sks[1] as a peer of sks[0] * (and vice versa). But in this case (not zero peer) * it's impossible for other sockets to connect * to sks[0] (see unix_dgram_connect()->unix_may_send()). * The below is hack: we use that connect with AF_UNSPEC * clears socket's peer. */ if (connect(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family))) { pr_perror("Can't clear socket's peer"); return -1; } /* * This must be after the connect() hack, because * connect() flushes receive queue. 
*/ if (restore_sk_queue(sks[1], ui->ue->id)) { pr_perror("Can't restore socket queue"); return -1; } close(sks[1]); } else { if (ui->ue->uflags & USK_CALLBACK) { sk = run_plugins(RESTORE_UNIX_SK, ui->ue->ino); if (sk >= 0) goto out; } /* * Connect to external sockets requires * special option to be passed. */ if (ui->peer && (ui->peer->ue->uflags & USK_EXTERN) && !(opts.ext_unix_sk)) { pr_err("External socket found in image. " "Consider using the --" USK_EXT_PARAM "option to allow restoring it.\n"); return -1; } sk = socket(PF_UNIX, ui->ue->type, 0); if (sk < 0) { pr_perror("Can't make unix socket"); return -1; } } if (bind_unix_sk(sk, ui)) return -1; if (ui->ue->state == TCP_LISTEN) { pr_info("\tPutting %#x into listen state\n", ui->ue->ino); if (listen(sk, ui->ue->backlog) < 0) { pr_perror("Can't make usk listen"); return -1; } ui->listen = 1; wake_connected_sockets(ui); } if (ui->peer) { /* * We need to connect() to the peer, but the * guy might have not bind()-ed himself, so * let's postpone this. */ *new_fd = sk; return 1; } out: if (restore_sk_common(sk, ui)) return -1; *new_fd = sk; return 0; } static int open_unix_sk(struct file_desc *d, int *new_fd) { struct fdinfo_list_entry *fle; struct unix_sk_info *ui; int ret; ui = container_of(d, struct unix_sk_info, d); /* FIXME -- only queue restore may be postponed */ if (chk_restored_scms(ui)) { pr_info("scm: Wait for tgt to restore\n"); return 1; } fle = file_master(d); if (fle->stage >= FLE_OPEN) return post_open_unix_sk(d, fle->fe->fd); if (inherited_fd(d, new_fd)) { ui->ue->uflags |= USK_INHERIT; ret = *new_fd >= 0 ? 
0 : -1; } else if (ui->flags & USK_PAIR_MASTER) ret = open_unixsk_pair_master(ui, new_fd); else if (ui->flags & USK_PAIR_SLAVE) ret = open_unixsk_pair_slave(ui, new_fd); else ret = open_unixsk_standalone(ui, new_fd); return ret; } static char *socket_d_name(struct file_desc *d, char *buf, size_t s) { struct unix_sk_info *ui; ui = container_of(d, struct unix_sk_info, d); if (snprintf(buf, s, "socket:[%d]", ui->ue->ino) >= s) { pr_err("Not enough room for unixsk %d identifier string\n", ui->ue->ino); return NULL; } return buf; } static struct file_desc_ops unix_desc_ops = { .type = FD_TYPES__UNIXSK, .open = open_unix_sk, .name = socket_d_name, }; /* * Make FS clean from sockets we're about to * restore. See for how we bind them for details */ static void unlink_stale(struct unix_sk_info *ui) { int ret, cwd_fd = -1, root_fd = -1; if (ui->name[0] == '\0' || (ui->ue->uflags & USK_EXTERN)) return; if (prep_unix_sk_cwd(ui, &cwd_fd, &root_fd)) return; ret = unlinkat(AT_FDCWD, ui->name, 0) ? -1 : 0; if (ret < 0) { pr_warn("Can't unlink stale socket %#x peer %#x (name %s dir %s)\n", ui->ue->ino, ui->ue->peer, ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-", ui->name_dir ? 
ui->name_dir : "-"); } revert_unix_sk_cwd(&cwd_fd, &root_fd); } static void try_resolve_unix_peer(struct unix_sk_info *ui); static int fixup_unix_peer(struct unix_sk_info *ui); static int post_prepare_unix_sk(struct pprep_head *ph) { struct unix_sk_info *ui; ui = container_of(ph, struct unix_sk_info, peer_resolve); if (ui->ue->peer && fixup_unix_peer(ui)) return -1; if (ui->name) unlink_stale(ui); return 0; } static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i) { struct unix_sk_info *ui = o; char *uname, *prefix = ""; int ulen; ui->ue = pb_msg(base, UnixSkEntry); ui->name_dir = (void *)ui->ue->name_dir; if (ui->ue->name.len) { if (ui->ue->name.len > UNIX_PATH_MAX) { pr_err("Bad unix name len %d\n", (int)ui->ue->name.len); return -1; } ui->name = (void *)ui->ue->name.data; } else ui->name = NULL; ui->queuer = 0; ui->peer = NULL; ui->bound = 0; ui->listen = 0; INIT_LIST_HEAD(&ui->connected); INIT_LIST_HEAD(&ui->node); INIT_LIST_HEAD(&ui->scm_fles); ui->flags = 0; uname = ui->name; ulen = ui->ue->name.len; if (ulen > 0 && uname[0] == 0) { prefix = "@"; uname++; ulen--; if (memrchr(uname, 0, ulen)) { /* replace zero characters */ char *s = alloca(ulen + 1); int i; for (i = 0; i < ulen; i++) s[i] = uname[i] ? : '@'; uname = s; } } else if (ulen == 0) { ulen = 1; uname = "-"; } pr_info(" `- Got %#x peer %#x (name %s%.*s dir %s)\n", ui->ue->ino, ui->ue->peer, prefix, ulen, uname, ui->name_dir ? 
ui->name_dir : "-"); if (ui->ue->peer || ui->name) { if (ui->ue->peer) try_resolve_unix_peer(ui); ui->peer_resolve.actor = post_prepare_unix_sk; add_post_prepare_cb(&ui->peer_resolve); } list_add_tail(&ui->list, &unix_sockets); return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops); } struct collect_image_info unix_sk_cinfo = { .fd_type = CR_FD_UNIXSK, .pb_type = PB_UNIX_SK, .priv_size = sizeof(struct unix_sk_info), .collect = collect_one_unixsk, .flags = COLLECT_SHARED, }; static void set_peer(struct unix_sk_info *ui, struct unix_sk_info *peer) { ui->peer = peer; list_add(&ui->node, &peer->connected); if (!peer->queuer) peer->queuer = ui->ue->id; } static void interconnected_pair(struct unix_sk_info *ui, struct unix_sk_info *peer) { struct fdinfo_list_entry *fle, *fle_peer; /* * Select who will restore the pair. Check is identical to * the one in pipes.c and makes sure tasks wait for each other * in pids sorting order (ascending). */ fle = file_master(&ui->d); fle_peer = file_master(&peer->d); if (fdinfo_rst_prio(fle, fle_peer)) { ui->flags |= USK_PAIR_MASTER; peer->flags |= USK_PAIR_SLAVE; } else { peer->flags |= USK_PAIR_MASTER; ui->flags |= USK_PAIR_SLAVE; } } static int fixup_unix_peer(struct unix_sk_info *ui) { struct unix_sk_info *peer = ui->peer; if (!peer) { pr_err("FATAL: Peer %#x unresolved for %#x\n", ui->ue->peer, ui->ue->ino); return -1; } if (peer != ui && peer->peer == ui && !(ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE))) { pr_info("Connected %#x -> %#x (%#x) flags %#x\n", ui->ue->ino, ui->ue->peer, peer->ue->ino, ui->flags); /* socketpair or interconnected sockets */ interconnected_pair(ui, peer); } return 0; } static void try_resolve_unix_peer(struct unix_sk_info *ui) { struct unix_sk_info *peer; if (ui->peer) return; BUG_ON(!ui->ue->peer); if (ui->ue->peer == ui->ue->ino) { /* socket connected to self %) */ set_peer(ui, ui); return; } peer = find_unix_sk_by_ino(ui->ue->peer); if (peer) { set_peer(ui, peer); if (peer->ue->peer == ui->ue->ino) 
set_peer(peer, ui); } /* else -- maybe later */ } int unix_sk_id_add(unsigned int ino) { char *e_str; e_str = xmalloc(20); if (!e_str) return -1; snprintf(e_str, 20, "unix[%u]", ino); return add_external(e_str); } int unix_sk_ids_parse(char *optarg) { /* * parsing option of the following form: --ext-unix-sk=,... or short form -x,... */ char *iter = optarg; while (*iter != '\0') { if (*iter == ',') iter++; else { unsigned int ino = strtoul(iter, &iter, 10); if (0 == ino) { pr_err("Can't parse unix socket inode from optarg: %s\n", optarg); return -1; } if (unix_sk_id_add(ino) < 0) { pr_err("Can't add unix socket inode in list: %s\n", optarg); return -1; } } } return 0; } criu-3.6/criu/sockets.c000066400000000000000000000431751317335042600151230ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "int.h" #include "bitops.h" #include "libnetlink.h" #include "sockets.h" #include "unix_diag.h" #include "inet_diag.h" #include "packet_diag.h" #include "netlink_diag.h" #include "files.h" #include "util-pie.h" #include "sk-packet.h" #include "namespaces.h" #include "net.h" #include "xmalloc.h" #include "fs-magic.h" #ifndef SOCK_DIAG_BY_FAMILY #define SOCK_DIAG_BY_FAMILY 20 #endif #define SK_HASH_SIZE 32 #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER #endif struct sock_diag_greq { u8 family; u8 protocol; }; struct sock_diag_req { struct nlmsghdr hdr; union { struct unix_diag_req u; struct inet_diag_req_v2 i; struct packet_diag_req p; struct netlink_diag_req n; struct sock_diag_greq g; } r; }; enum socket_cl_bits { NETLINK_CL_BIT, INET_TCP_CL_BIT, INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, }; #define MAX_CL_BIT (_MAX_CL_BIT - 1) static DECLARE_BITMAP(socket_cl_bits, MAX_CL_BIT); static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto) { if (family == 
AF_NETLINK) return NETLINK_CL_BIT; if (family == AF_UNIX) return UNIX_CL_BIT; if (family == AF_PACKET) return PACKET_CL_BIT; if (family == AF_INET) { if (proto == IPPROTO_TCP) return INET_TCP_CL_BIT; if (proto == IPPROTO_UDP) return INET_UDP_CL_BIT; if (proto == IPPROTO_UDPLITE) return INET_UDPLITE_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) return INET6_TCP_CL_BIT; if (proto == IPPROTO_UDP) return INET6_UDP_CL_BIT; if (proto == IPPROTO_UDPLITE) return INET6_UDPLITE_CL_BIT; } pr_err("Unknown pair family %d proto %d\n", family, proto); BUG(); return -1; } static void set_collect_bit(unsigned int family, unsigned int proto) { enum socket_cl_bits nr; nr = get_collect_bit_nr(family, proto); set_bit(nr, socket_cl_bits); } bool socket_test_collect_bit(unsigned int family, unsigned int proto) { enum socket_cl_bits nr; nr = get_collect_bit_nr(family, proto); return test_bit(nr, socket_cl_bits) != 0; } static int probe_recv_one(struct nlmsghdr *h, void *arg) { pr_err("PROBE RECEIVED\n"); return -1; } static int probe_err(int err, void *arg) { int expected_err = *(int *)arg; if (err == expected_err) return 0; pr_err("Diag module missing (%d)\n", err); return err; } static inline void probe_diag(int nl, struct sock_diag_req *req, int expected_err) { do_rtnl_req(nl, req, req->hdr.nlmsg_len, probe_recv_one, probe_err, &expected_err); } void preload_socket_modules(void) { int nl; struct sock_diag_req req; /* * If the task to dump (e.g. an LXC container) has any netlink * KOBJECT_UEVENT socket open and the _diag modules aren't * loaded is dumped, criu will freeze the task and then the * kernel will send it messages on the socket, and then we will * fail to dump because the socket has pending data. The Real * Solution is to dump this pending data, but we just make sure * modules are there beforehand for now so that the first dump * doesn't fail. 
*/ nl = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); if (nl < 0) return; pr_info("Probing sock diag modules\n"); memset(&req, 0, sizeof(req)); req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY; req.hdr.nlmsg_seq = CR_NLMSG_SEQ; /* * Probe UNIX, netlink and packet diag-s by feeding * to the kernel request that is shorter than they * expect, byt still containing the family to make * sure the family handler is there. The family-level * diag module would report EINVAL in this case. */ req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.g); req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; req.r.g.family = AF_UNIX; probe_diag(nl, &req, -EINVAL); req.r.g.family = AF_PACKET; probe_diag(nl, &req, -EINVAL); req.r.g.family = AF_NETLINK; probe_diag(nl, &req, -EINVAL); /* * TCP and UDP(LITE) diags do not support such trick, only * inet_diag module can be probed like that. For the protocol * level ones it's OK to request for exact non-existing socket * and check for ENOENT being reported back as error. */ req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.i); req.hdr.nlmsg_flags = NLM_F_REQUEST; req.r.i.sdiag_family = AF_INET; req.r.i.sdiag_protocol = IPPROTO_TCP; probe_diag(nl, &req, -ENOENT); req.r.i.sdiag_protocol = IPPROTO_UDP; /* UDLITE is merged with UDP */ probe_diag(nl, &req, -ENOENT); close(nl); pr_info("Done probing\n"); } static int dump_bound_dev(int sk, SkOptsEntry *soe) { int ret; char dev[IFNAMSIZ]; socklen_t len = sizeof(dev); ret = getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, &dev, &len); if (ret) { pr_perror("Can't get bound dev"); return ret; } if (len == 0) return 0; pr_debug("\tDumping %s bound dev for sk\n", dev); soe->so_bound_dev = xmalloc(len); if (soe->so_bound_dev == NULL) return -1; strcpy(soe->so_bound_dev, dev); return 0; } static int restore_bound_dev(int sk, SkOptsEntry *soe) { char *n = soe->so_bound_dev; if (!n) return 0; pr_debug("\tBinding socket to %s dev\n", n); return do_restore_opt(sk, SOL_SOCKET, SO_BINDTODEVICE, n, strlen(n)); } /* * Protobuf 
handles le/be himself, but the sock_filter is not just u64, * it's a structure and we have to preserve the fields order to be able * to move socket image across architectures. */ static void encode_filter(struct sock_filter *f, u64 *img, int n) { int i; BUILD_BUG_ON(sizeof(*f) != sizeof(*img)); for (i = 0; i < n; i++) img[i] = ((u64)f[i].code << 48) | ((u64)f[i].jt << 40) | ((u64)f[i].jf << 32) | ((u64)f[i].k << 0); } static void decode_filter(u64 *img, struct sock_filter *f, int n) { int i; for (i = 0; i < n; i++) { f[i].code = img[i] >> 48; f[i].jt = img[i] >> 40; f[i].jf = img[i] >> 32; f[i].k = img[i] >> 0; } } static int dump_socket_filter(int sk, SkOptsEntry *soe) { socklen_t len = 0; int ret; struct sock_filter *flt; ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len); if (ret) { pr_perror("Can't get socket filter len"); return ret; } if (!len) { pr_info("No filter for socket\n"); return 0; } flt = xmalloc(len * sizeof(*flt)); if (!flt) return -1; ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, flt, &len); if (ret) { pr_perror("Can't get socket filter"); xfree(flt); return ret; } soe->so_filter = xmalloc(len * sizeof(*soe->so_filter)); if (!soe->so_filter) { xfree(flt); return -1; } encode_filter(flt, soe->so_filter, len); soe->n_so_filter = len; xfree(flt); return 0; } static int restore_socket_filter(int sk, SkOptsEntry *soe) { int ret; struct sock_fprog sfp; if (!soe->n_so_filter) return 0; pr_info("Restoring socket filter\n"); sfp.len = soe->n_so_filter; sfp.filter = xmalloc(soe->n_so_filter * sfp.len); if (!sfp.filter) return -1; decode_filter(soe->so_filter, sfp.filter, sfp.len); ret = restore_opt(sk, SOL_SOCKET, SO_ATTACH_FILTER, &sfp); xfree(sfp.filter); return ret; } static struct socket_desc *sockets[SK_HASH_SIZE]; struct socket_desc *lookup_socket(unsigned ino, int family, int proto) { struct socket_desc *sd; if (!socket_test_collect_bit(family, proto)) { pr_err("Sockets (family %d, proto %d) are not collected\n", family, proto); return 
ERR_PTR(-EINVAL); } pr_debug("\tSearching for socket %x (family %d.%d)\n", ino, family, proto); for (sd = sockets[ino % SK_HASH_SIZE]; sd; sd = sd->next) if (sd->ino == ino) { BUG_ON(sd->family != family); return sd; } return NULL; } int sk_collect_one(unsigned ino, int family, struct socket_desc *d) { struct socket_desc **chain; d->ino = ino; d->family = family; d->already_dumped = 0; chain = &sockets[ino % SK_HASH_SIZE]; d->next = *chain; *chain = d; return 0; } int do_restore_opt(int sk, int level, int name, void *val, int len) { if (setsockopt(sk, level, name, val, len) < 0) { pr_perror("Can't set %d:%d (len %d)", level, name, len); return -1; } return 0; } static int sk_setbufs(void *arg, int fd, pid_t pid) { u32 *buf = (u32 *)arg; if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0])) return -1; if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1])) return -1; return 0; } /* * Set sizes of buffers to maximum and prevent blocking * Caller of this fn should call other socket restoring * routines to drop the non-blocking and set proper send * and receive buffers. */ int restore_prepare_socket(int sk) { int flags; /* In kernel a bufsize has type int and a value is doubled. */ u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 }; if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk)) return -1; /* Prevent blocking on restore */ flags = fcntl(sk, F_GETFL, 0); if (flags == -1) { pr_perror("Unable to get flags for %d", sk); return -1; } if (fcntl(sk, F_SETFL, flags | O_NONBLOCK) ) { pr_perror("Unable to set O_NONBLOCK for %d", sk); return -1; } return 0; } int restore_socket_opts(int sk, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; /* In kernel a bufsize value is doubled. 
*/ u32 bufs[2] = { soe->so_sndbuf / 2, soe->so_rcvbuf / 2}; pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ ret |= userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk); if (soe->has_so_priority) { pr_debug("\trestore priority %d for socket\n", soe->so_priority); ret |= restore_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); } if (soe->has_so_rcvlowat) { pr_debug("\trestore rcvlowat %d for socket\n", soe->so_rcvlowat); ret |= restore_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); } if (soe->has_so_mark) { pr_debug("\trestore mark %d for socket\n", soe->so_mark); ret |= restore_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); } if (soe->has_so_passcred && soe->so_passcred) { val = 1; pr_debug("\tset passcred for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); } if (soe->has_so_passsec && soe->so_passsec) { val = 1; pr_debug("\tset passsec for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); } if (soe->has_so_dontroute && soe->so_dontroute) { val = 1; pr_debug("\tset dontroute for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); } if (soe->has_so_no_check && soe->so_no_check) { val = 1; pr_debug("\tset no_check for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val); } tv.tv_sec = soe->so_snd_tmo_sec; tv.tv_usec = soe->so_snd_tmo_usec; ret |= restore_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); tv.tv_sec = soe->so_rcv_tmo_sec; tv.tv_usec = soe->so_rcv_tmo_usec; ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv); ret |= restore_bound_dev(sk, soe); ret |= restore_socket_filter(sk, soe); /* The restore of SO_REUSEADDR depends on type of socket */ return ret; } int do_dump_opt(int sk, int level, int name, void *val, int len) { socklen_t aux = len; if (getsockopt(sk, level, name, val, &aux) < 0) { pr_perror("Can't get %d:%d opt", level, name); return -1; } if (aux != len) { pr_err("Len mismatch on %d:%d : %d, 
want %d\n", level, name, aux, len); return -1; } return 0; } int dump_socket_opts(int sk, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDBUF, &soe->so_sndbuf); ret |= dump_opt(sk, SOL_SOCKET, SO_RCVBUF, &soe->so_rcvbuf); soe->has_so_priority = true; ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); soe->has_so_rcvlowat = true; ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); soe->has_so_mark = true; ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; soe->so_snd_tmo_usec = tv.tv_usec; ret |= dump_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv); soe->so_rcv_tmo_sec = tv.tv_sec; soe->so_rcv_tmo_usec = tv.tv_usec; ret |= dump_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val); soe->reuseaddr = val ? true : false; soe->has_reuseaddr = true; ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); soe->has_so_passcred = true; soe->so_passcred = val ? true : false; ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); soe->has_so_passsec = true; soe->so_passsec = val ? true : false; ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; soe->so_dontroute = val ? true : false; ret |= dump_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val); soe->has_so_no_check = true; soe->so_no_check = val ? true : false; ret |= dump_bound_dev(sk, soe); ret |= dump_socket_filter(sk, soe); return ret; } void release_skopts(SkOptsEntry *soe) { xfree(soe->so_filter); xfree(soe->so_bound_dev); } int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *e) { int family; const struct fdtype_ops *ops; if (dump_opt(lfd, SOL_SOCKET, SO_DOMAIN, &family)) return -1; switch (family) { case AF_UNIX: ops = &unix_dump_ops; break; case AF_INET: ops = &inet_dump_ops; break; case AF_INET6: ops = &inet6_dump_ops; break; case AF_PACKET: ops = &packet_dump_ops; break; case AF_NETLINK: ops = &netlink_dump_ops; break; default: pr_err("BUG! 
Unknown socket collected (family %d)\n", family); return -1; } return do_dump_gen_file(p, lfd, ops, e); } static int inet_receive_one(struct nlmsghdr *h, void *arg) { struct inet_diag_req_v2 *i = arg; int type; switch (i->sdiag_protocol) { case IPPROTO_TCP: type = SOCK_STREAM; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: type = SOCK_DGRAM; break; default: BUG_ON(1); return -1; } return inet_collect_one(h, i->sdiag_family, type); } static int do_collect_req(int nl, struct sock_diag_req *req, int size, int (*receive_callback)(struct nlmsghdr *h, void *), void *arg) { int tmp; tmp = do_rtnl_req(nl, req, size, receive_callback, NULL, arg); if (tmp == 0) set_collect_bit(req->r.n.sdiag_family, req->r.n.sdiag_protocol); return tmp; } int collect_sockets(struct ns_id *ns) { int err = 0, tmp; int nl = ns->net.nlsk; struct sock_diag_req req; memset(&req, 0, sizeof(req)); req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY; req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; req.hdr.nlmsg_seq = CR_NLMSG_SEQ; /* Collect UNIX sockets */ req.r.u.sdiag_family = AF_UNIX; req.r.u.udiag_states = -1; /* All */ req.r.u.udiag_show = UDIAG_SHOW_NAME | UDIAG_SHOW_VFS | UDIAG_SHOW_PEER | UDIAG_SHOW_ICONS | UDIAG_SHOW_RQLEN; tmp = do_collect_req(nl, &req, sizeof(req), unix_receive_one, NULL); if (tmp) err = tmp; /* Collect IPv4 TCP sockets */ req.r.i.sdiag_family = AF_INET; req.r.i.sdiag_protocol = IPPROTO_TCP; req.r.i.idiag_ext = 0; /* Only listening and established sockets supported yet */ req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED) | (1 << TCP_FIN_WAIT1) | (1 << TCP_FIN_WAIT2) | (1 << TCP_CLOSE_WAIT) | (1 << TCP_LAST_ACK) | (1 << TCP_CLOSING) | (1 << TCP_SYN_SENT); tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i); if (tmp) err = tmp; /* Collect IPv4 UDP sockets */ req.r.i.sdiag_family = AF_INET; req.r.i.sdiag_protocol = IPPROTO_UDP; req.r.i.idiag_ext = 0; req.r.i.idiag_states = -1; /* All */ tmp = do_collect_req(nl, 
&req, sizeof(req), inet_receive_one, &req.r.i); if (tmp) err = tmp; /* Collect IPv4 UDP-lite sockets */ req.r.i.sdiag_family = AF_INET; req.r.i.sdiag_protocol = IPPROTO_UDPLITE; req.r.i.idiag_ext = 0; req.r.i.idiag_states = -1; /* All */ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i); if (tmp) err = tmp; /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; req.r.i.idiag_ext = 0; /* Only listening sockets supported yet */ req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED) | (1 << TCP_FIN_WAIT1) | (1 << TCP_FIN_WAIT2) | (1 << TCP_CLOSE_WAIT) | (1 << TCP_LAST_ACK) | (1 << TCP_CLOSING) | (1 << TCP_SYN_SENT); tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i); if (tmp) err = tmp; /* Collect IPv6 UDP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_UDP; req.r.i.idiag_ext = 0; req.r.i.idiag_states = -1; /* All */ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i); if (tmp) err = tmp; /* Collect IPv6 UDP-lite sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_UDPLITE; req.r.i.idiag_ext = 0; req.r.i.idiag_states = -1; /* All */ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i); if (tmp) err = tmp; req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; tmp = do_collect_req(nl, &req, sizeof(req), packet_receive_one, NULL); if (tmp) { pr_warn("The current kernel doesn't support packet_diag\n"); if (ns->ns_pid == 0 || tmp != -ENOENT) /* Fedora 19 */ err = tmp; } req.r.n.sdiag_family = AF_NETLINK; req.r.n.sdiag_protocol = NDIAG_PROTO_ALL; req.r.n.ndiag_show = NDIAG_SHOW_GROUPS; tmp = do_collect_req(nl, &req, sizeof(req), netlink_receive_one, NULL); if (tmp) { pr_warn("The current kernel doesn't support netlink_diag\n"); if (ns->ns_pid == 0 || tmp != -ENOENT) /* 
Fedora 19 */ err = tmp; } /* don't need anymore */ close(nl); ns->net.nlsk = -1; if (err && (ns->type == NS_CRIU)) { /* * If netns isn't dumped, criu will fail only * if an unsupported socket will be really dumped. */ pr_info("Uncollected sockets! Will probably fail later.\n"); err = 0; } return err; } criu-3.6/criu/stats.c000066400000000000000000000125251317335042600146010ustar00rootroot00000000000000#include #include #include #include "int.h" #include "atomic.h" #include "cr_options.h" #include "rst-malloc.h" #include "protobuf.h" #include "stats.h" #include "util.h" #include "image.h" #include "images/stats.pb-c.h" struct timing { struct timeval start; struct timeval total; }; struct dump_stats { struct timing timings[DUMP_TIME_NR_STATS]; unsigned long counts[DUMP_CNT_NR_STATS]; }; struct restore_stats { struct timing timings[RESTORE_TIME_NS_STATS]; atomic_t counts[RESTORE_CNT_NR_STATS]; }; struct dump_stats *dstats; struct restore_stats *rstats; void cnt_add(int c, unsigned long val) { if (dstats != NULL) { BUG_ON(c >= DUMP_CNT_NR_STATS); dstats->counts[c] += val; } else if (rstats != NULL) { BUG_ON(c >= RESTORE_CNT_NR_STATS); atomic_add(val, &rstats->counts[c]); } else BUG(); } static void timeval_accumulate(const struct timeval *from, const struct timeval *to, struct timeval *res) { suseconds_t usec; res->tv_sec += to->tv_sec - from->tv_sec; usec = to->tv_usec; if (usec < from->tv_usec) { usec += USEC_PER_SEC; res->tv_sec -= 1; } res->tv_usec += usec - from->tv_usec; if (res->tv_usec > USEC_PER_SEC) { res->tv_usec -= USEC_PER_SEC; res->tv_sec += 1; } } static struct timing *get_timing(int t) { if (dstats != NULL) { BUG_ON(t >= DUMP_TIME_NR_STATS); return &dstats->timings[t]; } else if (rstats != NULL) { /* * FIXME -- this does _NOT_ work when called * from different tasks. 
*/ BUG_ON(t >= RESTORE_TIME_NS_STATS); return &rstats->timings[t]; } BUG(); return NULL; } void timing_start(int t) { struct timing *tm; tm = get_timing(t); gettimeofday(&tm->start, NULL); } void timing_stop(int t) { struct timing *tm; struct timeval now; tm = get_timing(t); gettimeofday(&now, NULL); timeval_accumulate(&tm->start, &now, &tm->total); } static void encode_time(int t, u_int32_t *to) { struct timing *tm; tm = get_timing(t); *to = tm->total.tv_sec * USEC_PER_SEC + tm->total.tv_usec; } static void display_stats(int what, StatsEntry *stats) { if (what == DUMP_STATS) { pr_msg("Displaying dump stats:\n"); pr_msg("Freezing time: %d us\n", stats->dump->freezing_time); pr_msg("Frozen time: %d us\n", stats->dump->frozen_time); pr_msg("Memory dump time: %d us\n", stats->dump->memdump_time); pr_msg("Memory write time: %d us\n", stats->dump->memwrite_time); if (stats->dump->has_irmap_resolve) pr_msg("IRMAP resolve time: %d us\n", stats->dump->irmap_resolve); pr_msg("Memory pages scanned: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_scanned, stats->dump->pages_scanned); pr_msg("Memory pages skipped from parent: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_skipped_parent, stats->dump->pages_skipped_parent); pr_msg("Memory pages written: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_written, stats->dump->pages_written); pr_msg("Lazy memory pages: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_lazy, stats->dump->pages_lazy); } else if (what == RESTORE_STATS) { pr_msg("Displaying restore stats:\n"); pr_msg("Pages compared: %" PRIu64 " (0x%" PRIx64 ")\n", stats->restore->pages_compared, stats->restore->pages_compared); pr_msg("Pages skipped COW: %" PRIu64 " (0x%" PRIx64 ")\n", stats->restore->pages_skipped_cow, stats->restore->pages_skipped_cow); if (stats->restore->has_pages_restored) pr_msg("Pages restored: %" PRIu64 " (0x%" PRIx64 ")\n", stats->restore->pages_restored, stats->restore->pages_restored); pr_msg("Restore time: %d us\n", 
stats->restore->restore_time); pr_msg("Forking time: %d us\n", stats->restore->forking_time); } else return; } void write_stats(int what) { StatsEntry stats = STATS_ENTRY__INIT; DumpStatsEntry ds_entry = DUMP_STATS_ENTRY__INIT; RestoreStatsEntry rs_entry = RESTORE_STATS_ENTRY__INIT; char *name; struct cr_img *img; pr_info("Writing stats\n"); if (what == DUMP_STATS) { stats.dump = &ds_entry; encode_time(TIME_FREEZING, &ds_entry.freezing_time); encode_time(TIME_FROZEN, &ds_entry.frozen_time); encode_time(TIME_MEMDUMP, &ds_entry.memdump_time); encode_time(TIME_MEMWRITE, &ds_entry.memwrite_time); ds_entry.has_irmap_resolve = true; encode_time(TIME_IRMAP_RESOLVE, &ds_entry.irmap_resolve); ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED]; ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT]; ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN]; ds_entry.pages_lazy = dstats->counts[CNT_PAGES_LAZY]; name = "dump"; } else if (what == RESTORE_STATS) { stats.restore = &rs_entry; rs_entry.pages_compared = atomic_read(&rstats->counts[CNT_PAGES_COMPARED]); rs_entry.pages_skipped_cow = atomic_read(&rstats->counts[CNT_PAGES_SKIPPED_COW]); rs_entry.has_pages_restored = true; rs_entry.pages_restored = atomic_read(&rstats->counts[CNT_PAGES_RESTORED]); encode_time(TIME_FORK, &rs_entry.forking_time); encode_time(TIME_RESTORE, &rs_entry.restore_time); name = "restore"; } else return; img = open_image_at(AT_FDCWD, CR_FD_STATS, O_DUMP, name); if (img) { pb_write_one(img, &stats, PB_STATS); close_image(img); } if (opts.display_stats) display_stats(what, &stats); } int init_stats(int what) { if (what == DUMP_STATS) { dstats = xzalloc(sizeof(*dstats)); return dstats ? 0 : -1; } rstats = shmalloc(sizeof(struct restore_stats)); return rstats ? 
#ifndef CONFIG_HAS_STRLCAT
/**
 * strlcat - Append a length-limited, %NUL-terminated string to another
 * @dest: The string to be appended to
 * @src: The string to append to it
 * @count: The size of the destination buffer.
 *
 * Returns the length of the string it tried to create: the initial
 * length of @dest plus the length of @src.  A return value >= @count
 * means the result was truncated.
 *
 * If @dest holds no %NUL within its first @count bytes (which includes
 * @count == 0) nothing is written and @count + strlen(@src) is returned,
 * as the BSD implementation does.  The search for the terminator never
 * looks past @count bytes.
 */
size_t strlcat(char *dest, const char *src, size_t count)
{
	const char *nul = memchr(dest, '\0', count);
	size_t dsize = nul ? (size_t)(nul - dest) : count;
	size_t len = strlen(src);
	size_t res = dsize + len;

	/* No room at all: do not touch the buffer, just report the need. */
	if (dsize == count)
		return res;

	dest += dsize;
	count -= dsize;
	if (len >= count)
		len = count - 1;	/* keep one byte for the terminator */
	memcpy(dest, src, len);
	dest[len] = '\0';
	return res;
}
#endif
/*
 * GEN_SYSCTL_READ_FUNC - generate sysctl_read_<type>()
 * @__type: element type stored into the caller's array (u32, u64, s32)
 * @__conv: strtol()-style converter called as __conv(str, &end, 10)
 *
 * The generated function reads the sysctl file behind @fd and parses
 * @nr numbers from it into @arg.  It returns 0 on success and -1 when
 * the read fails or fewer than @nr numbers are present.
 *
 * At most sizeof(buf) - 1 bytes are read, so the zero-initialised buffer
 * is always NUL-terminated when handed to the converter.
 */
#define GEN_SYSCTL_READ_FUNC(__type, __conv)				\
static int sysctl_read_##__type(int fd,					\
				struct sysctl_req *req,			\
				__type *arg,				\
				int nr)					\
{									\
	char buf[1024] = {0};						\
	char *p = buf;							\
	int i, ret;							\
									\
	/* leave the last byte alone: it is the terminator */		\
	ret = read(fd, buf, sizeof(buf) - 1);				\
	if (ret < 0) {							\
		pr_perror("Can't read %s", req->name);			\
		return -1;						\
	}								\
									\
	for (i = 0; i < nr; i++) {					\
		char *end;						\
									\
		arg[i] = __conv(p, &end, 10);				\
		if (end == p)						\
			break;	/* no digits left to convert */		\
		/* step over one separator, never over the NUL */	\
		p = *end ? end + 1 : end;				\
	}								\
									\
	if (i != nr) {							\
		pr_err("Not enough params for %s (%d != %d)\n",		\
			req->name, i, nr);				\
		return -1;						\
	}								\
									\
	return 0;							\
}
sysctl_write_char(int fd, struct sysctl_req *req, char *arg, int nr) { pr_debug("%s nr %d\n", req->name, nr); if (dprintf(fd, "%s\n", arg) < 0) return -1; return 0; } static int sysctl_read_char(int fd, struct sysctl_req *req, char *arg, int nr) { int ret = -1; pr_debug("%s nr %d\n", req->name, nr); ret = read(fd, arg, nr - 1); if (ret < 0) { if (errno != EIO || !(req->flags & CTL_FLAGS_READ_EIO_SKIP)) pr_perror("Can't read %s", req->name); goto err; } arg[ret]='\0'; ret = 0; err: return ret; } static int sysctl_userns_arg_size(int type) { switch(CTL_TYPE(type)) { case __CTL_U32A: return sizeof(u32) * CTL_LEN(type); case CTL_U32: return sizeof(u32); case CTL_32: return sizeof(s32); case __CTL_U64A: return sizeof(u64) * CTL_LEN(type); case CTL_U64: return sizeof(u64); case __CTL_STR: return sizeof(char) * CTL_LEN(type) + 1; default: pr_err("unknown arg type %d\n", type); /* Ensure overflow to cause an error */ return MAX_UNSFD_MSG_SIZE; } } static int do_sysctl_op(int fd, struct sysctl_req *req, int op) { int ret = -1, nr = 1; switch (CTL_TYPE(req->type)) { case __CTL_U32A: nr = CTL_LEN(req->type); /* fallthrough */ case CTL_U32: __SYSCTL_OP(ret, fd, req, u32, nr, op); break; case CTL_32: __SYSCTL_OP(ret, fd, req, s32, nr, op); break; case __CTL_U64A: nr = CTL_LEN(req->type); /* fallthrough */ case CTL_U64: __SYSCTL_OP(ret, fd, req, u64, nr, op); break; case __CTL_STR: nr = CTL_LEN(req->type); __SYSCTL_OP(ret, fd, req, char, nr, op); break; } return ret; } static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) { int fd, ret = -1, dir, i, status, *fds = NULL; struct sysctl_userns_req *userns_req = arg; int op = userns_req->op; struct sysctl_req *req, **reqs = NULL; sigset_t blockmask, oldmask; pid_t worker; // fix up the pointer req = userns_req->reqs = (struct sysctl_req *) &userns_req[1]; /* For files in the IPC/UTS namespaces, restoring is more complicated * than for net. 
Unprivileged users cannot even open these files, so * they must be opened by usernsd. However, the value in the kernel is * changed for the IPC/UTS namespace that write()s to the open sysctl * file (not who opened it). So, we must set the value from inside the * usernsd caller's namespace. We: * * 1. unsd opens the sysctl files * 2. forks a task * 3. setns()es to the UTS/IPC namespace of the caller * 4. write()s to the files and exits */ dir = open("/proc/sys", O_RDONLY, O_DIRECTORY); if (dir < 0) { pr_perror("Can't open sysctl dir"); return -1; } fds = xmalloc(sizeof(int) * userns_req->nr_req); if (!fds) goto out; reqs = xmalloc(sizeof(struct sysctl_req *) * userns_req->nr_req); if (!reqs) goto out; memset(fds, -1, sizeof(int) * userns_req->nr_req); for (i = 0; i < userns_req->nr_req; i++) { int arg_len = sysctl_userns_arg_size(req->type); int name_len = strlen((char *) &req[1]) + 1; int total_len = sizeof(*req) + arg_len + name_len; int flags; /* fix up the pointers */ req->name = (char *) &req[1]; req->arg = req->name + name_len; if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) { pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len); goto out; } if (op == CTL_READ) flags = O_RDONLY; else flags = O_WRONLY; fd = openat(dir, req->name, flags); if (fd < 0) { if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL)) continue; pr_perror("Can't open sysctl %s", req->name); goto out; } /* save a pointer to the req, so we don't need to recompute its * location */ reqs[i] = req; fds[i] = fd; req = (struct sysctl_req *) (((char *) req) + total_len); } /* * Don't let the sigchld_handler() mess with us * calling waitpid() on the exited worker. The * same is done in cr_system(). 
*/ sigemptyset(&blockmask); sigaddset(&blockmask, SIGCHLD); sigprocmask(SIG_BLOCK, &blockmask, &oldmask); worker = fork(); if (worker < 0) goto out; if (!worker) { int nsfd; const char *nsname = ns_to_string(userns_req->ns); BUG_ON(!nsname); nsfd = openat(proc_fd, nsname, O_RDONLY); if (nsfd < 0) { pr_perror("failed to open pid %d's ns %s", pid, nsname); exit(1); } if (setns(nsfd, 0) < 0) { pr_perror("failed to setns to %d's ns %s", pid, nsname); exit(1); } close(nsfd); for (i = 0; i < userns_req->nr_req; i++) { if (do_sysctl_op(fds[i], reqs[i], op) < 0) { if (op != CTL_READ || errno != EIO || !(req->flags & CTL_FLAGS_READ_EIO_SKIP)) exit(1); } else { /* mark sysctl in question exists */ req->flags |= CTL_FLAGS_HAS; } } exit(0); } if (waitpid(worker, &status, 0) != worker) { pr_perror("worker didn't die?"); kill(worker, SIGKILL); goto out; } sigprocmask(SIG_SETMASK, &oldmask, NULL); if (!WIFEXITED(status) || WEXITSTATUS(status)) { pr_err("worker failed: %d\n", status); goto out; } ret = 0; out: if (fds) { for (i = 0; i < userns_req->nr_req; i++) { if (fds[i] < 0) break; close_safe(&fds[i]); } xfree(fds); } if (reqs) xfree(reqs); close_safe(&dir); return ret; } static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) { int ret, exit_code = -1;; while (nr_req--) { int fd; if (op == CTL_READ) fd = do_open_proc(PROC_GEN, O_RDONLY, "sys/%s", req->name); else fd = do_open_proc(PROC_GEN, O_RDWR, "sys/%s", req->name); if (fd < 0) { if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL)) { req++; continue; } pr_perror("Can't open sysctl %s", req->name); goto out; } ret = do_sysctl_op(fd, req, op); if (ret) { if (op != CTL_READ || errno != EIO || !(req->flags & CTL_FLAGS_READ_EIO_SKIP)) { close(fd); goto out; } } else { /* mark sysctl in question exists */ req->flags |= CTL_FLAGS_HAS; } close(fd); req++; } exit_code = 0; out: return exit_code; } int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns) { int i, fd, ret; struct 
sysctl_userns_req *userns_req; struct sysctl_req *cur; if (nr_req == 0) return 0; if (ns & ~KNOWN_NS_MASK) { pr_err("don't know how to restore some namespaces in %u\n", ns); return -1; } /* The way sysctl files behave on open/write depends on the namespace * they correspond to. If we don't want to interact with something in a * namespace (e.g. kernel/cap_last_cap is global), we can do this from * the current process. Similarly, if we're accessing net namespaces, * we can just do the operation from our current process, since * anything with CAP_NET_ADMIN can write to the net/ sysctls, and we * still have that even when restoring in a user ns. * * For IPC/UTS, we restore them as described above. * * For read operations, we need to copy the values back to return. * Fortunately, we only do read on dump (or global reads on restore), * so we can do those in process as well. */ if (!ns || ns & CLONE_NEWNET || op == CTL_READ) return __nonuserns_sysctl_op(req, nr_req, op); /* * In order to avoid lots of opening of /proc/sys for each struct sysctl_req, * we encode each array of sysctl_reqs into one contiguous region of memory so * it can be passed via userns_call if necessary. It looks like this: * * struct sysctl_userns_req struct sysctl_req name arg * --------------------------------------------------------------------------- * | op | nr_req | reqs | | name | arg | "the name" | "the arg" ... 
* --------------------------------------------------------------------------- * |____^ |______|__^ ^ * |_______________| */ userns_req = alloca(MAX_UNSFD_MSG_SIZE); userns_req->op = op; userns_req->nr_req = nr_req; userns_req->ns = ns; userns_req->reqs = (struct sysctl_req *) (&userns_req[1]); cur = userns_req->reqs; for (i = 0; i < nr_req; i++) { int arg_len = sysctl_userns_arg_size(req[i].type); int name_len = strlen(req[i].name) + 1; int total_len = sizeof(*cur) + arg_len + name_len; if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) { pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len); return -1; } /* copy over the non-pointer fields */ cur->type = req[i].type; cur->flags = req[i].flags; cur->name = (char *) &cur[1]; strcpy(cur->name, req[i].name); cur->arg = cur->name + name_len; memcpy(cur->arg, req[i].arg, arg_len); cur = (struct sysctl_req *) (((char *) cur) + total_len); } fd = open_proc(PROC_SELF, "ns"); if (fd < 0) return -1; ret = userns_call(__userns_sysctl_op, 0, userns_req, MAX_UNSFD_MSG_SIZE, fd); close(fd); return ret; } criu-3.6/criu/sysfs_parse.c000066400000000000000000000171041317335042600160020ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "cr_options.h" #include "log.h" #include "xmalloc.h" #include "files.h" #include "proc_parse.h" #include "util.h" #include "sysfs_parse.h" #include "namespaces.h" #include "mount.h" /* * Currently, there are two kernel problems dealing with AUFS * filesystems. Until these problems are fixed in the kernel, * we have AUFS support in CRIU to handle the following issues: * * 1) /proc//mountinfo: The problem is that for AUFS the root field * of the root entry is missing the pathname (it's only /). For example: * * 90 61 0:33 / / rw,relatime - aufs none rw,si=4476a910a24617e6 * * To handle this issue, the user has to specify the root of the AUFS * filesystem with the --root command line option. 
* * 2) /proc//map_files: The symlinks are absolute pathnames of the * corresponding *physical* files in the branch they exist. For example, * for a Docker container using AUFS, a symlink would look like: * 400000-489000 -> /var/lib/docker/aufs/diff//bin/ * * Therefore, when we use the link file descriptor vm_file_fd in * dump_one_reg_file() to read the link, we get the file's physical * absolute pathname which does not exist relative to the root of the * mount namespace and even if we used its relative pathname, the dev:ino * values would be different from the physical file's dev:ino causing the * dump to fail. * * To handle this issue, we figure out the "correct" paths when parsing * map_files and save it for later use. See fixup_aufs_vma_fd() for * details. */ struct ns_id *aufs_nsid; static char **aufs_branches; /* * Parse out and save the AUFS superblock info in the * given buffer. */ static int parse_aufs_sbinfo(struct mount_info *mi, char *sbinfo, int len) { char *cp; int n; cp = strstr(mi->options, "si="); if (!cp) { pr_err("Cannot find sbinfo in option string %s\n", mi->options); return -1; } /* all ok, copy */ if (len < 4) { /* 4 for "si_" */ pr_err("Buffer of %d bytes too small for sbinfo\n", len); return -1; } strcpy(sbinfo, "si_"); n = 3; sbinfo += n; cp += n; while (isxdigit(*cp) && n < len) { *sbinfo++ = *cp++; n++; } if (n >= len) { pr_err("Sbinfo in options string %s too long\n", mi->options); return -1; } *sbinfo = '\0'; return 0; } /* * If the specified path is in a branch, replace it * with pathname from root. 
*/ static int fixup_aufs_path(char *path, int size) { char rpath[PATH_MAX]; int n; int blen; if (aufs_branches == NULL) { pr_err("No aufs branches to search for %s\n", path); return -1; } for (n = 0; aufs_branches[n] != NULL; n++) { blen = strlen(aufs_branches[n]); if (!strncmp(path, aufs_branches[n], blen)) break; } if (aufs_branches[n] == NULL) return 0; /* not in a branch */ n = snprintf(rpath, PATH_MAX, "%s", &path[blen]); if (n >= min(PATH_MAX, size)) { pr_err("Not enough space to replace %s\n", path); return -1; } pr_debug("Replacing %s with %s\n", path, rpath); strcpy(path, rpath); return n; } /* * Kernel stores patchnames to AUFS branches in the br files in * the /sys/fs/aufs/si_ directory where denotes a branch * number and is a hexadecimal number in %lx format. For * example: * * $ cat /sys/fs/aufs/si_f598876b087ed883/br0 * /path/to/branch0/directory=rw * * This function sets up an array of pointers to branch pathnames. */ int parse_aufs_branches(struct mount_info *mi) { char path[AUFSBR_PATH_LEN]; char *cp; int n; int ret; unsigned int br_num; unsigned int br_max; DIR *dp; FILE *fp; struct dirent *de; pr_info("Collecting AUFS branch pathnames ...\n"); if (mi->nsid == 0) { pr_err("No nsid to parse its aufs branches\n"); return -1; } if (mi->nsid == aufs_nsid) { pr_debug("Using cached aufs branch paths for nsid %p\n", aufs_nsid); return 0; } if (aufs_nsid) free_aufs_branches(); strcpy(path, SYSFS_AUFS); /* /sys/fs/aufs/ */ if (parse_aufs_sbinfo(mi, &path[sizeof SYSFS_AUFS - 1], SBINFO_LEN) < 0) return -1; if ((dp = opendir(path)) == NULL) { pr_perror("Cannot opendir %s", path); return -1; } /* * Find out how many branches we have. 
*/ br_max = 0; ret = 0; while (1) { errno = 0; if ((de = readdir(dp)) == NULL) { if (errno) { pr_perror("Cannot readdir %s", path); ret = -1; } break; } ret = sscanf(de->d_name, "br%d", &br_num); if (ret == 1 && br_num > br_max) br_max = br_num; } closedir(dp); if (ret == -1) return -1; /* * Default AUFS maximum is 127, so 1000 should be plenty. * If you increase the maximum to more than 3 digits, * make sure to change AUFSBR_PATH_LEN accordingly. */ if (br_max > 999) { pr_err("Too many branches %d\n", br_max); return -1; } /* * Allocate an array of pointers to branch pathnames to be read. * Branches are indexed from 0 and we need a NULL pointer at the end. */ aufs_branches = xzalloc((br_max + 2) * sizeof (char *)); if (!aufs_branches) return -1; /* * Now read branch pathnames from the branch files. */ n = strlen(path); for (br_num = 0; br_num <= br_max; br_num++) { fp = NULL; ret = snprintf(&path[n], sizeof path - n, "/br%d", br_num); if (ret >= sizeof path - n) { pr_err("Buffer overrun creating path for branch %d\n", br_num); goto err; } if ((fp = fopen(path, "r")) == NULL) { pr_perror("Cannot fopen %s", path); goto err; } if (fscanf(fp, "%ms=", &aufs_branches[br_num]) != 1 || aufs_branches[br_num] == NULL) { pr_perror("Parse error reading %s", path); goto err; } /* chop off the trailing "=..." stuff */ if ((cp = strchr(aufs_branches[br_num], '=')) == NULL) { pr_err("Bad format in branch pathname %s\n", aufs_branches[br_num]); goto err; } *cp = '\0'; fclose(fp); /* * Log branch information for extenal utitilies that * want to recreate the process's AUFS filesystem * before calling criu restore. * * DO NOT CHANGE this format! */ pr_info("%s : %s\n", path, aufs_branches[br_num]); } aufs_nsid = mi->nsid; return 0; err: if (fp) fclose(fp); free_aufs_branches(); return -1; } /* * AUFS support to compensate for the kernel bug * exposing branch pathnames in map_files and providing * a wrong mnt_id value in /proc//fdinfo/. 
* * If the link points inside a branch, save the * relative pathname from the root of the mount * namespace as well as the full pathname from * globl root (/) for later use in dump_filemap() * and parse_smaps(). */ int fixup_aufs_vma_fd(struct vma_area *vma, int vm_file_fd) { char path[PATH_MAX]; int len; path[0] = '.'; len = read_fd_link(vm_file_fd, &path[0], sizeof path - 1); if (len < 0) return -1; len = fixup_aufs_path(&path[1], sizeof path - 1); if (len <= 0) return len; vma->aufs_rpath = xmalloc(len + 2); if (!vma->aufs_rpath) return -1; strcpy(vma->aufs_rpath, path); if (opts.root) { /* skip ./ in path */ vma->aufs_fpath = xsprintf("%s/%s", opts.root, &path[2]); if (!vma->aufs_fpath) return -1; } pr_debug("Saved AUFS paths %s and %s\n", vma->aufs_rpath, vma->aufs_fpath); if (stat(vma->aufs_fpath, vma->vmst) < 0) { pr_perror("Failed stat on map %"PRIx64" (%s)", vma->e->start, vma->aufs_fpath); return -1; } /* tell parse_smap() not to call get_fd_mntid() */ vma->mnt_id = -1; return len; } void free_aufs_branches(void) { int n; if (aufs_branches) { for (n = 0; aufs_branches[n] != NULL; n++) xfree(aufs_branches[n]); xfree(aufs_branches); aufs_branches = NULL; } aufs_nsid = NULL; } criu-3.6/criu/timerfd.c000066400000000000000000000101011317335042600150610ustar00rootroot00000000000000#include #include #include #include #include #include "protobuf.h" #include "images/timerfd.pb-c.h" #include "fdinfo.h" #include "rst-malloc.h" #include "cr_options.h" #include "restorer.h" #include "timerfd.h" #include "pstree.h" #include "files.h" #include "imgset.h" #include "util.h" #include "log.h" #include "common/bug.h" #undef LOG_PREFIX #define LOG_PREFIX "timerfd: " struct timerfd_dump_arg { u32 id; const struct fd_parms *p; }; struct timerfd_info { TimerfdEntry *tfe; struct file_desc d; int t_fd; struct list_head rlist; }; static LIST_HEAD(rst_timerfds); int check_timerfd(void) { int fd, ret = -1; fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd < 0) { 
pr_perror("timerfd_create failed"); return -1; } else { ret = ioctl(fd, TFD_IOC_SET_TICKS, NULL); if (ret < 0) { if (errno != EFAULT) pr_perror("No timerfd support for c/r"); else ret = 0; } } close(fd); return ret; } int is_timerfd_link(char *link) { return is_anon_link_type(link, "[timerfd]"); } static int dump_one_timerfd(int lfd, u32 id, const struct fd_parms *p) { TimerfdEntry tfe = TIMERFD_ENTRY__INIT; FileEntry fe = FILE_ENTRY__INIT; if (parse_fdinfo(lfd, FD_TYPES__TIMERFD, &tfe)) return -1; tfe.id = id; tfe.flags = p->flags; tfe.fown = (FownEntry *)&p->fown; pr_info("Dumping id %#x clockid %d it_value(%llu, %llu) it_interval(%llu, %llu)\n", tfe.id, tfe.clockid, (unsigned long long)tfe.vsec, (unsigned long long)tfe.vnsec, (unsigned long long)tfe.isec, (unsigned long long)tfe.insec); fe.type = FD_TYPES__TIMERFD; fe.id = tfe.id; fe.tfd = &tfe; return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } const struct fdtype_ops timerfd_dump_ops = { .type = FD_TYPES__TIMERFD, .dump = dump_one_timerfd, }; int prepare_timerfds(struct task_restore_args *ta) { struct timerfd_info *ti; struct restore_timerfd *t; ta->timerfd = (struct restore_timerfd *)rst_mem_align_cpos(RM_PRIVATE); ta->timerfd_n = 0; list_for_each_entry(ti, &rst_timerfds, rlist) { TimerfdEntry *tfe = ti->tfe; t = rst_mem_alloc(sizeof(*t), RM_PRIVATE); if (!t) return -1; t->id = tfe->id; t->fd = ti->t_fd; t->clockid = tfe->clockid; t->ticks = (unsigned long)tfe->ticks; t->settime_flags = tfe->settime_flags; t->val.it_interval.tv_sec = (time_t)tfe->isec; t->val.it_interval.tv_nsec = (long)tfe->insec; t->val.it_value.tv_sec = (time_t)tfe->vsec; t->val.it_value.tv_nsec = (long)tfe->vnsec; ta->timerfd_n++; } return 0; } static int timerfd_open(struct file_desc *d, int *new_fd) { struct timerfd_info *info; TimerfdEntry *tfe; int tmp = -1; info = container_of(d, struct timerfd_info, d); tfe = info->tfe; pr_info("Creating timerfd id %#x clockid %d settime_flags %x ticks %llu " 
"it_value(%llu, %llu) it_interval(%llu, %llu)\n", tfe->id, tfe->clockid, tfe->settime_flags, (unsigned long long)tfe->ticks, (unsigned long long)tfe->vsec, (unsigned long long)tfe->vnsec, (unsigned long long)tfe->isec, (unsigned long long)tfe->insec); tmp = timerfd_create(tfe->clockid, 0); if (tmp < 0) { pr_perror("Can't create for %#x", tfe->id); return -1; } if (rst_file_params(tmp, tfe->fown, tfe->flags)) { pr_perror("Can't restore params for %#x", tfe->id); goto err_close; } info->t_fd = file_master(d)->fe->fd; list_add_tail(&info->rlist, &rst_timerfds); *new_fd = tmp; return 0; err_close: close_safe(&tmp); return -1; } static struct file_desc_ops timerfd_desc_ops = { .type = FD_TYPES__TIMERFD, .open = timerfd_open, }; static int collect_one_timerfd(void *o, ProtobufCMessage *msg, struct cr_img *i) { struct timerfd_info *info = o; info->tfe = pb_msg(msg, TimerfdEntry); if (verify_timerfd(info->tfe)) { pr_err("Verification failed for %#x\n", info->tfe->id); return -1; } info->t_fd = -1; return file_desc_add(&info->d, info->tfe->id, &timerfd_desc_ops); } struct collect_image_info timerfd_cinfo = { .fd_type = CR_FD_TIMERFD, .pb_type = PB_TIMERFD, .priv_size = sizeof(struct timerfd_info), .collect = collect_one_timerfd, }; criu-3.6/criu/tty.c000066400000000000000000001516461317335042600142730ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "common/compiler.h" #include "crtools.h" #include "files.h" #include "cr_options.h" #include "imgset.h" #include "servicefd.h" #include "rst-malloc.h" #include "log.h" #include "common/list.h" #include "util-pie.h" #include "proc_parse.h" #include "file-ids.h" #include "files-reg.h" #include "namespaces.h" #include "external.h" #include "action-scripts.h" #include "mount.h" #include "protobuf.h" #include "util.h" #include "images/tty.pb-c.h" #include "parasite-syscall.h" #include "parasite.h" #include 
"pstree.h" #include "fdstore.h" #include "tty.h" /* * Here are some notes about overall TTY c/r design. At moment * we support unix98 ptys only. Supporting legacy BSD terminals * is impossible without help from the kernel side -- the indices * of such terminals are not reported anywhere in the kernel so that * we can't figure out active pairs. * * Usually the PTYs represent a pair of links -- master peer and slave * peer. Master peer must be opened before slave. Internally, when kernel * creates master peer it also generates a slave interface in a form of * /dev/pts/N, where N is that named pty "index". Master/slave connection * unambiguously identified by this index. * * Still, one master can carry multiple slaves -- for example a user opens * one master via /dev/ptmx and appropriate /dev/pts/N in sequence. * The result will be the following * * master * `- slave 1 * `- slave 2 * * both slave will have same master index but different file descriptors. * Still inside the kernel pty parameters are same for both slaves. Thus * only one slave parameters should be restored, there is no need to carry * all parameters for every slave peer we've found. * * Note the /dev/pts/ is rather convenient agreement and internally the * kernel doesn't care where exactly the inodes of ptys are laying -- * it depends on "devpts" mount point path. 
*/ #undef LOG_PREFIX #define LOG_PREFIX "tty: " struct tty_data_entry { struct list_head list; TtyDataEntry *tde; }; struct tty_info { struct list_head list; struct file_desc d; struct file_desc *reg_d; TtyFileEntry *tfe; TtyInfoEntry *tie; struct list_head sibling; struct tty_driver *driver; bool create; bool inherit; struct tty_info *ctl_tty; struct tty_info *link; struct tty_data_entry *tty_data; int fdstore_id; }; struct tty_dump_info { struct list_head list; u32 id; pid_t sid; pid_t pgrp; pid_t pid_real; int fd; int mnt_id; struct tty_driver *driver; int index; int lfd; int flags; struct tty_dump_info *link; void *tty_data; size_t tty_data_size; }; static bool stdin_isatty = false; static LIST_HEAD(collected_ttys); static LIST_HEAD(all_ttys); /* * Usually an application has not that many ttys opened. * If this won't be enough in future we simply need to * change tracking mechanism to some more extendable. * * This particular bitmap requires 256 bytes of memory. * Pretty acceptable trade off in a sake of simplicity. */ #define MAX_TTYS 1024 /* * Custom indices should be even numbers just in case if we * need odds for pair numbering someday. 
*/ #define MAX_PTY_INDEX 1000 #define CONSOLE_INDEX 1002 #define VT_INDEX 1004 #define CTTY_INDEX 1006 #define ETTY_INDEX 1008 #define STTY_INDEX 1010 #define INDEX_ERR (MAX_TTYS + 1) static DECLARE_BITMAP(tty_bitmap, (MAX_TTYS << 1)); static DECLARE_BITMAP(tty_active_pairs, (MAX_TTYS << 1)); struct tty_driver { short type; short subtype; char *name; int index; int (*fd_get_index)(int fd, const struct fd_parms *p); int (*img_get_index)(struct tty_info *ti); int (*open)(struct tty_info *ti); }; #define TTY_SUBTYPE_MASTER 0x0001 #define TTY_SUBTYPE_SLAVE 0x0002 static int ptm_fd_get_index(int fd, const struct fd_parms *p) { int index; if (ioctl(fd, TIOCGPTN, &index)) { pr_perror("Can't obtain ptmx index"); return INDEX_ERR; } if (index > MAX_PTY_INDEX) { pr_err("Index %d on ptmx is too big\n", index); return INDEX_ERR; } return index; } static int pty_get_index(struct tty_info *ti) { return ti->tie->pty->index; } static int pty_open_ptmx(struct tty_info *info); static struct tty_driver ptm_driver = { .type = TTY_TYPE__PTY, .subtype = TTY_SUBTYPE_MASTER, .name = "ptmx", .fd_get_index = ptm_fd_get_index, .img_get_index = pty_get_index, .open = pty_open_ptmx, }; static int open_simple_tty(struct tty_info *info); static struct tty_driver console_driver = { .type = TTY_TYPE__CONSOLE, .name = "console", .index = CONSOLE_INDEX, .open = open_simple_tty, }; static struct tty_driver ctty_driver = { .type = TTY_TYPE__CTTY, .name = "ctty", .index = CTTY_INDEX, .open = open_simple_tty, }; static struct tty_driver vt_driver = { .type = TTY_TYPE__VT, .name = "vt", .index = VT_INDEX, .open = open_simple_tty, }; static int open_ext_tty(struct tty_info *info); static struct tty_driver ext_driver = { .type = TTY_TYPE__EXT_TTY, .name = "ext", .index = ETTY_INDEX, .open = open_ext_tty, }; static struct tty_driver serial_driver = { .type = TTY_TYPE__SERIAL, .name = "serial", .index = STTY_INDEX, .open = open_simple_tty, }; static int pts_fd_get_index(int fd, const struct fd_parms *p) { 
int index; const struct fd_link *link = p->link; char *pos = strrchr(link->name, '/'); if (!pos || pos == (link->name + link->len - 1)) { pr_err("Unexpected format on path %s\n", link->name + 1); return INDEX_ERR; } index = atoi(pos + 1); if (index > MAX_PTY_INDEX) { pr_err("Index %d on pts is too big\n", index); return INDEX_ERR; } return index; } static struct tty_driver pts_driver = { .type = TTY_TYPE__PTY, .subtype = TTY_SUBTYPE_SLAVE, .name = "pts", .fd_get_index = pts_fd_get_index, .img_get_index = pty_get_index, .open = pty_open_ptmx, }; struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev) { int major, minor; char id[42]; snprintf(id, sizeof(id), "tty[%"PRIx64":%"PRIx64"]", rdev, dev); if (external_lookup_id(id) || inherit_fd_lookup_id(id) >= 0) return &ext_driver; major = major(rdev); minor = minor(rdev); switch (major) { case TTYAUX_MAJOR: if (minor == 2) return &ptm_driver; else if (minor == 1) return &console_driver; else if (minor == 0) return &ctty_driver; break; case TTY_MAJOR: if (minor >= MIN_NR_CONSOLES && minor <= MAX_NR_CONSOLES) /* * Minors [MIN_NR_CONSOLES; MAX_NR_CONSOLES] stand * for consoles (virtual terminals, VT in terms * of kernel). */ return &vt_driver; #ifdef __s390x__ /* * On s390 we have the following consoles: * - tty3215 : ttyS0 , minor = 64, linemode console * - sclp_line : ttyS0 , minor = 64, linemode console * - sclp_vt220 : ttysclp0, minor = 65, vt220 console * See also "drivers/s390/char" */ else if (minor == 64 || minor == 65) return &vt_driver; #endif /* Other minors points to UART serial ports */ break; case USB_SERIAL_MAJOR: case LOW_DENSE_SERIAL_MAJOR: return &serial_driver; case UNIX98_PTY_MASTER_MAJOR ... 
(UNIX98_PTY_MASTER_MAJOR + UNIX98_PTY_MAJOR_COUNT - 1): return &ptm_driver; case UNIX98_PTY_SLAVE_MAJOR: return &pts_driver; } return NULL; } static inline int is_pty(struct tty_driver *driver) { return driver->type == TTY_TYPE__PTY; } /* * /dev/ptmx is a shared resource between all tasks * so we need to serialize access to it. */ static mutex_t *tty_mutex; static bool tty_is_master(struct tty_info *info); static int init_tty_mutex(void) { if (tty_mutex) return 0; tty_mutex = shmalloc(sizeof(*tty_mutex)); if (!tty_mutex) { pr_err("Can't create ptmx index mutex\n"); return -1; } mutex_init(tty_mutex); return 0; } #define winsize_copy(d, s) \ do { \ ASSIGN_MEMBER((d), (s), ws_row); \ ASSIGN_MEMBER((d), (s), ws_col); \ ASSIGN_MEMBER((d), (s), ws_xpixel); \ ASSIGN_MEMBER((d), (s), ws_ypixel); \ } while (0) #define termios_copy(d, s) \ do { \ struct termios __t; \ \ memcpy((d)->c_cc, (s)->c_cc, \ sizeof(__t.c_cc)); \ \ ASSIGN_MEMBER((d),(s), c_iflag); \ ASSIGN_MEMBER((d),(s), c_oflag); \ ASSIGN_MEMBER((d),(s), c_cflag); \ ASSIGN_MEMBER((d),(s), c_lflag); \ ASSIGN_MEMBER((d),(s), c_line); \ } while (0) static int tty_gen_id(struct tty_driver *driver, int index) { return (index << 1) + (driver->subtype == TTY_SUBTYPE_MASTER); } static int tty_get_index(u32 id) { return id >> 1; } /* Make sure the active pairs do exist */ static int tty_verify_active_pairs(void) { unsigned long i, unpaired_slaves = 0; for_each_bit(i, tty_active_pairs) { if ((i % 2) == 0) { if (test_bit(i + 1, tty_active_pairs)) { i++; continue; } if (!opts.shell_job && !opts.orphan_pts_master) { pr_err("Found slave peer index %d without " "correspond master peer\n", tty_get_index(i)); return -1; } pr_debug("Unpaired slave %d\n", tty_get_index(i)); if (++unpaired_slaves > 1) { pr_err("Only one slave external peer " "is allowed (index %d)\n", tty_get_index(i)); return -1; } } } return 0; } static int tty_test_and_set(int bit, unsigned long *bitmap) { int ret; ret = test_bit(bit, bitmap); if (!ret) 
set_bit(bit, bitmap); return ret; } /* * Generate a regular file object in case if such is missed * in the image file, ie obsolete interface has been used on * checkpoint. */ static struct file_desc *pty_alloc_reg(struct tty_info *info, bool add) { TtyFileEntry *tfe = info->tfe; const size_t namelen = 64; struct reg_file_info *r; static struct file_desc_ops noops = {}; r = xzalloc(sizeof(*r) + sizeof(*r->rfe) + namelen); if (!r) return NULL; r->rfe = (void *)r + sizeof(*r); reg_file_entry__init(r->rfe); r->rfe->name = (void *)r + sizeof(*r) + sizeof(*r->rfe); if (tty_is_master(info)) strcpy(r->rfe->name, "/dev/ptmx"); else snprintf(r->rfe->name, namelen, "/dev/pts/%u", info->tie->pty->index); if (add) file_desc_add(&r->d, tfe->id, &noops); else file_desc_init(&r->d, tfe->id, &noops); r->rfe->id = tfe->id; r->rfe->flags = tfe->flags; r->rfe->fown = tfe->fown; r->path = &r->rfe->name[1]; return &r->d; } /* * In case if we need to open a fake pty (for example * a master peer which were deleted at checkpoint moment, * or open a slave peer when restoring control terminal) * we need to create a new reg-file object taking @info * as a template. Here is a trick though: the @info might * represent master peer while we need to allocate a slave * one and the reverse. For such case taking path from the * @info as a template we generate that named 'inverted-path'. * * For example if the master peer was /dev/pts/ptmx with index 1, * the inverted path is /dev/pts/1, for inverted slaves it's simpler * we just add 'ptmx' postfix. 
*/ static struct reg_file_info *pty_alloc_fake_reg(struct tty_info *info, int subtype) { struct reg_file_info *new, *orig; struct file_desc *fake_desc; pr_debug("Allocating fake descriptor for %#x (reg_d %p)\n", info->tfe->id, info->reg_d); BUG_ON(!info->reg_d); BUG_ON(!is_pty(info->driver)); fake_desc = pty_alloc_reg(info, false); if (!fake_desc) return NULL; orig = container_of(info->reg_d, struct reg_file_info, d); new = container_of(fake_desc, struct reg_file_info, d); if ((subtype == TTY_SUBTYPE_MASTER && tty_is_master(info)) || (subtype == TTY_SUBTYPE_SLAVE && !tty_is_master(info))) { new->path = xstrdup(orig->path); new->rfe->name = &new->path[1]; } else { char *pos = strrchr(orig->rfe->name, '/'); size_t len = strlen(orig->rfe->name) + 1; size_t slash_at = pos - orig->rfe->name; char *inverted_path = xmalloc(len + 32); BUG_ON(!pos || !inverted_path); memcpy(inverted_path, orig->rfe->name, slash_at + 1); if (subtype == TTY_SUBTYPE_MASTER) { inverted_path[slash_at + 1] = '\0'; strcat(inverted_path, "ptmx"); } else { if (slash_at >= 3 && strncmp(&inverted_path[slash_at - 3], "pts", 3)) snprintf(&inverted_path[slash_at + 1], 10, "pts/%u", info->tie->pty->index); else snprintf(&inverted_path[slash_at + 1], 10, "%u", info->tie->pty->index); } new->rfe->name = inverted_path; new->path = &inverted_path[1]; } return new; } #define pty_alloc_fake_master(info) pty_alloc_fake_reg(info, TTY_SUBTYPE_MASTER) #define pty_alloc_fake_slave(info) pty_alloc_fake_reg(info, TTY_SUBTYPE_SLAVE) static void pty_free_fake_reg(struct reg_file_info **r) { if (*r) { xfree((*r)->rfe->name); xfree((*r)); *r = NULL; } } static int do_open_tty_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) { int fd; fd = do_open_reg_noseek_flags(ns_root_fd, rfi, arg); if (fd >= 0) { /* * Peers might have differend modes set * after creation before we've dumped * them. So simply setup mode from image * the regular file engine will check * for this, so if we fail here it * gonna be catched anyway. 
*/ if (rfi->rfe->has_mode) fchmod(fd, rfi->rfe->mode); } return fd; } static int open_tty_reg(void *arg, int flags) { struct file_desc *reg_d = arg; /* * Never set as a control terminal automatically, all * ctty magic happens only in tty_set_sid(). */ flags |= O_NOCTTY; return open_path(reg_d, do_open_tty_reg, &flags); } static char *path_from_reg(struct file_desc *d) { struct reg_file_info *rfi = container_of(d, struct reg_file_info, d); return rfi->path; } static int __pty_open_ptmx_index(int index, int flags, int (*cb)(void *arg, int flags), void *arg, char *path) { int fds[32], i, ret = -1, cur_idx; memset(fds, 0xff, sizeof(fds)); mutex_lock(tty_mutex); for (i = 0; i < ARRAY_SIZE(fds); i++) { fds[i] = cb(arg, flags); if (fds[i] < 0) { pr_err("Can't open %s\n", path); break; } if (ioctl(fds[i], TIOCGPTN, &cur_idx)) { pr_perror("Can't obtain current index on %s", path); break; } pr_debug("\t\tptmx opened with index %d\n", cur_idx); if (cur_idx == index) { pr_info("ptmx opened with index %d\n", cur_idx); ret = fds[i]; fds[i] = -1; break; } /* * Maybe indices are already borrowed by * someone else, so no need to continue. */ if (cur_idx < index && (index - cur_idx) < ARRAY_SIZE(fds)) continue; pr_err("Unable to open %s with specified index %d\n", path, index); break; } for (i = 0; i < ARRAY_SIZE(fds); i++) { if (fds[i] >= 0) close(fds[i]); } mutex_unlock(tty_mutex); return ret; } static int pty_open_ptmx_index(struct file_desc *d, struct tty_info *info, int flags) { if (info->fdstore_id >= 0) return fdstore_get(info->fdstore_id); return __pty_open_ptmx_index(info->tie->pty->index, flags, open_tty_reg, d, path_from_reg(d)); } static int unlock_pty(int fd) { const int lock = 0; /* * Usually when ptmx opened it gets locked * by kernel and we need to unlock it to be * able to connect slave peer. 
*/ if (ioctl(fd, TIOCSPTLCK, &lock)) { pr_err("Unable to unlock pty device via y%d\n", fd); return -1; } return 0; } static int lock_pty(int fd) { const int lock = 1; if (ioctl(fd, TIOCSPTLCK, &lock)) { pr_err("Unable to lock pty device via %d\n", fd); return -1; } return 0; } static int tty_set_sid(int fd) { if (ioctl(fd, TIOCSCTTY, 1)) { pr_perror("Can't set sid on terminal fd %d", fd); return -1; } return 0; } static int tty_set_prgp(int fd, int group) { if (ioctl(fd, TIOCSPGRP, &group)) { pr_perror("Failed to set group %d on %d", group, fd); return -1; } return 0; } int tty_restore_ctl_terminal(struct file_desc *d, int fd) { struct tty_info *info = container_of(d, struct tty_info, d); struct tty_driver *driver = info->driver; struct reg_file_info *fake = NULL; struct file_desc *slave_d; int slave = -1, ret = -1, index = -1; BUG_ON(!is_service_fd(fd, CTL_TTY_OFF)); if (driver->type == TTY_TYPE__EXT_TTY) { slave = -1; if (!inherited_fd(&info->d, &slave) && slave < 0) return -1; goto out; } if (driver->img_get_index) index = driver->img_get_index(info); else index = driver->index; if (is_pty(info->driver) && tty_is_master(info)) { fake = pty_alloc_fake_slave(info); if (!fake) goto err; slave_d = &fake->d; } else slave_d = info->reg_d; slave = open_tty_reg(slave_d, O_RDONLY); if (slave < 0) { pr_err("Can't open slave tty %s\n", path_from_reg(slave_d)); goto err; } out: pr_info("Restore session %d by %d tty (index %d)\n", info->tie->sid, (int)getpid(), index); ret = tty_set_sid(slave); if (!ret) ret = tty_set_prgp(slave, info->tie->pgrp); close(slave); err: pty_free_fake_reg(&fake); close(fd); return ret ? 
-1 : 0; } static bool __tty_is_master(struct tty_driver *driver) { if (driver->subtype == TTY_SUBTYPE_MASTER) return true; switch (driver->type) { case TTY_TYPE__CONSOLE: case TTY_TYPE__CTTY: return true; case TTY_TYPE__SERIAL: case TTY_TYPE__VT: if (!opts.shell_job) return true; break; case TTY_TYPE__EXT_TTY: return true; } return false; } static bool tty_is_master(struct tty_info *info) { return __tty_is_master(info->driver); } static bool tty_is_hung(struct tty_info *info) { return info->tie->termios == NULL; } static bool tty_has_active_pair(struct tty_info *info) { int d = tty_is_master(info) ? -1 : + 1; return test_bit(info->tfe->tty_info_id + d, tty_active_pairs); } static void tty_show_pty_info(char *prefix, struct tty_info *info) { int index = -1; struct tty_driver *driver = info->driver; if (driver->img_get_index) index = driver->img_get_index(info); else index = driver->index; pr_info("%s driver %s id %#x index %d (master %d sid %d pgrp %d inherit %d)\n", prefix, info->driver->name, info->tfe->id, index, tty_is_master(info), info->tie->sid, info->tie->pgrp, info->inherit); } struct tty_parms { int tty_id; unsigned has; #define HAS_TERMIOS_L 0x1 #define HAS_TERMIOS 0x2 #define HAS_WINS 0x4 struct termios tl; struct termios t; struct winsize w; }; static int do_restore_tty_parms(void *arg, int fd, pid_t pid) { struct tty_parms *p = arg; /* * Only locked termios need CAP_SYS_ADMIN, but we * restore them all here, since the regular tremios * restore is affected by locked and thus we would * have to do synchronous usernsd call which is not * nice. * * Window size is restored here as it might depend * on termios too. Just to be on the safe side. 
*/ if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) goto err; if ((p->has & HAS_TERMIOS) && ioctl(fd, TCSETS, &p->t) < 0) goto err; if ((p->has & HAS_WINS) && ioctl(fd, TIOCSWINSZ, &p->w) < 0) goto err; return 0; err: pr_perror("Can't set tty params on %#x", p->tty_id); return -1; } static int restore_tty_params(int fd, struct tty_info *info) { struct tty_parms p; /* * It's important to zeroify termios * because it contain @c_cc array which * is bigger than TERMIOS_NCC. Same applies * to winsize usage, we can't guarantee the * structure taken from the system headers will * never be extended. */ p.has = 0; p.tty_id = info->tfe->id; if (info->tie->termios_locked) { memzero(&p.tl, sizeof(p.tl)); p.has |= HAS_TERMIOS_L; termios_copy(&p.tl, info->tie->termios_locked); } if (info->tie->termios) { memzero(&p.t, sizeof(p.t)); p.has |= HAS_TERMIOS; termios_copy(&p.t, info->tie->termios); } if (info->tie->winsize) { memzero(&p.w, sizeof(p.w)); p.has |= HAS_WINS; winsize_copy(&p.w, info->tie->winsize); } if (info->tie->has_uid && info->tie->has_gid) { if (fchown(fd, info->tie->uid, info->tie->gid)) { pr_perror("Can't setup uid %d gid %d on %#x", (int)info->tie->uid, (int)info->tie->gid, info->tfe->id); return -1; } } return userns_call(do_restore_tty_parms, UNS_ASYNC, &p, sizeof(p), fd); } /* * When we restore queued data we don't exit if error happened: * the terminals never was a transport with guaranted delivery, * it's up to application which uses it to guaratee the data * integrity. 
*/ static void pty_restore_queued_data(struct tty_info *info, int fd) { if (info && info->tty_data) { ProtobufCBinaryData bd = info->tty_data->tde->data; int retval; pr_debug("restore queued data on %#x (%zu bytes)\n", info->tfe->id, (size_t)bd.len); retval = write(fd, bd.data, bd.len); if (retval != bd.len) pr_err("Restored %d bytes while %zu expected\n", retval, (size_t)bd.len); } } static int pty_open_slaves(struct tty_info *info) { int fd = -1, ret = -1; struct tty_info *slave; list_for_each_entry(slave, &info->sibling, sibling) { BUG_ON(tty_is_master(slave)); fd = open_tty_reg(slave->reg_d, slave->tfe->flags); if (fd < 0) { pr_err("Can't open slave tty %s\n", path_from_reg(slave->reg_d)); goto err; } if (restore_tty_params(fd, slave)) goto err; pr_debug("send slave %#x fd %d connected on %s\n", slave->tfe->id, fd, path_from_reg(slave->reg_d)); if (send_desc_to_peer(fd, &slave->d)) { pr_err("Can't send file descriptor\n"); goto err; } pty_restore_queued_data(slave->link, fd); close(fd); fd = -1; } ret = 0; err: close_safe(&fd); return ret; } static int receive_tty(struct tty_info *info, int *new_fd) { int fd, ret; ret = recv_desc_from_peer(&info->d, &fd); if (ret != 0) { if (ret != 1) pr_err("Can't get fd %d\n", fd); return ret; } if (rst_file_params(fd, info->tfe->fown, info->tfe->flags) < 0) { close_safe(&fd); return -1; } *new_fd = fd; return 0; } static int pty_open_unpaired_slave(struct file_desc *d, struct tty_info *slave) { struct reg_file_info *fake = NULL; int master = -1, ret = -1, fd = -1; /* * We may have 2 cases here: the slave either need to * be inherited, either it requires a fake master. 
*/ if (likely(slave->inherit)) { if (opts.orphan_pts_master) { fake = pty_alloc_fake_master(slave); if (!fake) goto err; master = pty_open_ptmx_index(&fake->d, slave, O_RDWR); if (master < 0) { pr_err("Can't open master pty %x (index %d)\n", slave->tfe->id, slave->tie->pty->index); goto err; } unlock_pty(master); if (opts.orphan_pts_master && rpc_send_fd(ACT_ORPHAN_PTS_MASTER, master) == 0) { fd = open_tty_reg(slave->reg_d, slave->tfe->flags); if (fd < 0) { pr_err("Can't open slave pty %s\n", path_from_reg(slave->reg_d)); goto err; } goto out; } } if (!stdin_isatty) { pr_err("Don't have tty to inherit session from, aborting\n"); return -1; } fd = dup(get_service_fd(SELF_STDIN_OFF)); if (fd < 0) { pr_perror("Can't dup SELF_STDIN_OFF"); return -1; } pr_info("Migrated slave peer %#x -> to fd %d\n", slave->tfe->id, fd); } else { fake = pty_alloc_fake_master(slave); if (!fake) goto err; master = pty_open_ptmx_index(&fake->d, slave, O_RDONLY); if (master < 0) { pr_err("Can't open master pty %#x (index %d)\n", slave->tfe->id, slave->tie->pty->index); goto err; } unlock_pty(master); fd = open_tty_reg(slave->reg_d, slave->tfe->flags); if (fd < 0) { pr_err("Can't open slave pty %s\n", path_from_reg(slave->reg_d)); goto err; } } out: if (restore_tty_params(fd, slave)) goto err; /* * If tty is migrated we need to set its group * to the parent group, because signals on key * presses are delivered to a group of terminal. * * Note, at this point the group/session should * be already restored properly thus we can simply * use syscalls instead of lookup via process tree. */ if (slave->inherit && opts.shell_job) { /* * The restoration procedure only works if we're * migrating not a session leader, otherwise it's * not allowed to restore a group and one better to * checkpoint complete process tree together with * the process which keeps the master peer. 
*/ if (root_item->sid != vpid(root_item)) { pr_debug("Restore inherited group %d\n", getpgid(getppid())); if (tty_set_prgp(fd, getpgid(getppid()))) goto err; } } if (pty_open_slaves(slave)) goto err; ret = fd; fd = -1; err: close_safe(&master); close_safe(&fd); pty_free_fake_reg(&fake); return ret; } static int pty_open_ptmx(struct tty_info *info) { int master = -1; master = pty_open_ptmx_index(info->reg_d, info, info->tfe->flags); if (master < 0) { pr_err("Can't open master pty %#x (index %d)\n", info->tfe->id, info->tie->pty->index); return -1; } unlock_pty(master); if (restore_tty_params(master, info)) goto err; if (info->tie->packet_mode) { int packet_mode = 1; if (ioctl(master, TIOCPKT, &packet_mode) < 0) { pr_perror("Can't set packed mode on %#x", info->tfe->id); goto err; } } if (pty_open_slaves(info)) goto err; pty_restore_queued_data(info->link, master); if (info->tie->locked) lock_pty(master); return master; err: close_safe(&master); return -1; } static int open_simple_tty(struct tty_info *info) { int fd = -1; fd = open_tty_reg(info->reg_d, info->tfe->flags); if (fd < 0) { pr_err("Can't open tty %s %#x\n", info->driver->name, info->tfe->id); return -1; } if (restore_tty_params(fd, info)) goto err; return fd; err: close_safe(&fd); return -1; } static int open_ext_tty(struct tty_info *info) { int fd = -1; if (!inherited_fd(&info->d, &fd) && fd < 0) return -1; if (restore_tty_params(fd, info)) { close(fd); return -1; } return fd; } static bool tty_deps_restored(struct tty_info *info) { struct list_head *list = &rsti(current)->fds; struct fdinfo_list_entry *fle; struct tty_info *tmp; if (info->driver->type == TTY_TYPE__CTTY) { list_for_each_entry(fle, list, ps_list) { if (fle->desc->ops->type != FD_TYPES__TTY || fle->desc == &info->d) continue; /* ctty needs all others are restored */ if (fle->stage != FLE_RESTORED) return false; } } else if (!tty_is_master(info)) { list_for_each_entry(fle, list, ps_list) { if (fle->desc->ops->type != FD_TYPES__TTY || 
fle->desc == &info->d) continue; tmp = container_of(fle->desc, struct tty_info, d); /* slaves wait for masters except ctty */ if (tmp->driver->type == TTY_TYPE__CTTY || !tty_is_master(tmp)) continue; if (fle->stage != FLE_RESTORED) return false; } } return true; } static int tty_open(struct file_desc *d, int *new_fd) { struct tty_info *info = container_of(d, struct tty_info, d); int ret; tty_show_pty_info("open", info); if (!info->create) return receive_tty(info, new_fd); if (!tty_deps_restored(info)) return 1; if (is_pty(info->driver) && !tty_is_master(info)) ret = pty_open_unpaired_slave(d, info); else ret = info->driver->open(info); if (ret < 0) return -1; *new_fd = ret; return 0; } static char *tty_d_name(struct file_desc *d, char *buf, size_t s) { struct tty_info *info = container_of(d, struct tty_info, d); snprintf(buf, s, "tty[%x:%x]", info->tie->rdev, info->tie->dev); return buf; } static struct file_desc_ops tty_desc_ops = { .type = FD_TYPES__TTY, .open = tty_open, .name = tty_d_name, }; static struct pstree_item *find_first_sid(int sid) { struct pstree_item *item; for_each_pstree_item(item) { if (item->sid == sid) return item; } return NULL; } static int tty_find_restoring_task(struct tty_info *info) { struct pstree_item *item; /* * The overall scenario is the following (note * we might have corrupted image so don't believe * anything). 
* * SID is present on a peer * ------------------------ * * - if it's master peer and we have as well a slave * peer then prefer restore controlling terminal * via slave peer * * - if it's master peer without slave, there must be * a SID leader who will be restoring the peer * * - if it's a slave peer and no session leader found * than we need an option to inherit terminal * * No SID present on a peer * ------------------------ * * - if it's a master peer than we are in good shape * and continue in a normal way, we're the peer keepers * * - if it's a slave peer and no appropriate master peer * found we need an option to inherit terminal * * In any case if it's hungup peer, then we jump out * early since it will require fake master peer and * rather non-usable anyway. */ if (tty_is_hung(info)) { pr_debug("Hungup terminal found id %#x\n", info->tfe->id); return 0; } /* * Current tty should be skipped here: the * underlied _real_ pty (or anything else * driver in future) should restore the * session. */ if (info->driver->type == TTY_TYPE__CTTY) return 0; if (info->tie->sid) { if (!tty_is_master(info)) { if (tty_has_active_pair(info)) return 0; else if (!opts.orphan_pts_master) goto shell_job; else info->inherit = true; } /* * Restoring via leader only. All files * opened over same real tty get propagated * automatically by kernel itself. */ if (info->ctl_tty != info) return 0; /* * Find out the task which is session leader * and it can restore the controlling terminal * for us. 
*/ item = find_first_sid(info->tie->sid); if (item && vpid(item) == item->sid) { pr_info("Set a control terminal %#x to %d\n", info->tfe->id, info->tie->sid); return prepare_ctl_tty(vpid(item), rsti(item), info->tfe->id); } goto notask; } else { if (tty_is_master(info)) return 0; if (tty_has_active_pair(info)) return 0; } shell_job: if (opts.shell_job) { pr_info("Inherit terminal for id %#x\n", info->tfe->id); info->inherit = true; return 0; } notask: pr_err("No task found with sid %d\n", info->tie->sid); return -1; } static int tty_setup_orphan_slavery(void) { struct tty_info *info, *peer, *m; list_for_each_entry(info, &all_ttys, list) { struct fdinfo_list_entry *a, *b; bool has_leader = false; if (tty_is_master(info)) continue; a = file_master(&info->d); m = info; list_for_each_entry(peer, &info->sibling, sibling) { if (tty_is_master(peer)) { has_leader = true; break; } /* * Same check as in pipes and files -- need to * order slave ends so that they do not dead lock * waiting for each other. */ b = file_master(&peer->d); if (fdinfo_rst_prio(b, a)) { a = b; m = peer; } } if (!has_leader) { m->create = true; pr_debug("Found orphan slave fake leader (%#x)\n", m->tfe->id); } } return 0; } static int tty_setup_slavery(void) { struct tty_info *info, *peer, *m; /* * Setup links for PTY terminal pairs by * their indices, queued data already bound * to them by data ids. */ list_for_each_entry(info, &all_ttys, list) { if (!is_pty(info->driver) || info->link) continue; peer = info; list_for_each_entry_continue(peer, &all_ttys, list) { if (!is_pty(peer->driver) || peer->link) continue; if (peer->tie->pty->index == info->tie->pty->index) { info->link = peer; peer->link = info; pr_debug("Link PTYs (%#x)\n", info->tfe->id); break; } } } /* * The image may carry several terminals opened * belonging to the same session, so choose the * leader which gonna be setting up the controlling * terminal. 
*/ list_for_each_entry(info, &all_ttys, list) { if (!info->tie->sid || info->ctl_tty || info->driver->type == TTY_TYPE__CTTY) continue; if (!tty_is_master(info) && info->link) continue; info->ctl_tty = info; pr_debug("ctl tty leader %#x\n", info->tfe->id); peer = info; list_for_each_entry_safe_continue(peer, m, &all_ttys, list) { if (!peer->tie->sid || peer->ctl_tty || peer->driver->type == TTY_TYPE__CTTY) continue; if (peer->tie->sid == info->tie->sid) { pr_debug(" `- slave %#x\n", peer->tfe->id); peer->ctl_tty = info; } } } list_for_each_entry(info, &all_ttys, list) { if (tty_find_restoring_task(info)) return -1; if (!is_pty(info->driver)) continue; peer = info; list_for_each_entry_safe_continue(peer, m, &all_ttys, list) { if (!is_pty(peer->driver)) continue; if (peer->tie->pty->index != info->tie->pty->index) continue; if (tty_find_restoring_task(peer)) return -1; list_add(&peer->sibling, &info->sibling); list_del(&peer->list); } } /* * Print out information about peers. */ list_for_each_entry(info, &all_ttys, list) { tty_show_pty_info("head", info); list_for_each_entry(peer, &info->sibling, sibling) tty_show_pty_info(" `- sibling", peer); } return tty_setup_orphan_slavery(); } static int verify_termios(u32 id, TermiosEntry *e) { if (e && e->n_c_cc < TERMIOS_NCC) { pr_err("pty ID %#x n_c_cc (%d) has wrong value\n", id, (int)e->n_c_cc); return -1; } return 0; } #define term_opts_missing_cmp(tie, op) \ (!(tie)->termios op \ !(tie)->termios_locked op \ !(tie)->winsize) #define term_opts_missing_any(p) \ term_opts_missing_cmp(p, ||) #define term_opts_missing_all(p) \ term_opts_missing_cmp(p, &&) static int verify_info(TtyInfoEntry *tie, struct tty_driver *driver) { /* * Master peer must have all parameters present, * while slave peer must have either all parameters present * or don't have them at all. 
*/ if (term_opts_missing_any(tie)) { if (__tty_is_master(driver)) { pr_err("Corrupted master peer %#x\n", tie->id); return -1; } else if (!term_opts_missing_all(tie)) { pr_err("Corrupted slave peer %#x\n", tie->id); return -1; } } if (verify_termios(tie->id, tie->termios_locked) || verify_termios(tie->id, tie->termios)) return -1; if (tie->termios && tie->id > (MAX_TTYS << 1)) return -1; return 0; } static int tty_info_setup(struct tty_info *info); static int collect_one_tty_info_entry(void *obj, ProtobufCMessage *msg, struct cr_img *i) { struct tty_info *info, *n; TtyInfoEntry *tie; struct tty_driver *driver; tie = pb_msg(msg, TtyInfoEntry); switch (tie->type) { case TTY_TYPE__PTY: if (!tie->pty) { pr_err("No PTY data found (id %#x), corrupted image?\n", tie->id); return -1; } break; case TTY_TYPE__CTTY: case TTY_TYPE__CONSOLE: case TTY_TYPE__SERIAL: case TTY_TYPE__VT: case TTY_TYPE__EXT_TTY: if (tie->pty) { pr_err("PTY data found (id %#x), corrupted image?\n", tie->id); return -1; } break; default: pr_err("Unexpected TTY type %d (id %#x)\n", tie->type, tie->id); return -1; } driver = get_tty_driver(tie->rdev, tie->dev); if (driver == NULL) { pr_err("Unable to find a tty driver (rdev %#x dev %#x)\n", tie->rdev, tie->dev); return -1; } if (verify_info(tie, driver)) return -1; list_for_each_entry_safe(info, n, &collected_ttys, list) { if (info->tfe->tty_info_id != tie->id) continue; info->tie = tie; info->driver = driver; list_move_tail(&info->list, &all_ttys); if (tty_info_setup(info)) return -1; } /* * The tty peers which have no @termios are hung up, * so don't mark them as active, we create them with * faked master and they are rather a rudiment which * can't be used. Most likely they appear if a user has * dumped program when it was closing a peer. 
*/ if (is_pty(driver) && tie->termios) tty_test_and_set(tie->id, tty_active_pairs); return 0; } struct collect_image_info tty_info_cinfo = { .fd_type = CR_FD_TTY_INFO, .pb_type = PB_TTY_INFO, .collect = collect_one_tty_info_entry, .flags = COLLECT_NOFREE, }; static int prep_tty_restore_cb(struct pprep_head *ph) { if (!list_empty(&collected_ttys)) { pr_err("Not all TTYs got its infos\n"); return -1; } if (tty_verify_active_pairs()) return -1; if (tty_setup_slavery()) return -1; return 0; } static MAKE_PPREP_HEAD(prep_tty_restore); static int collect_one_tty(void *obj, ProtobufCMessage *msg, struct cr_img *i) { struct tty_info *info = obj; info->tfe = pb_msg(msg, TtyFileEntry); list_add_tail(&info->list, &collected_ttys); return 0; } static int tty_info_setup(struct tty_info *info) { INIT_LIST_HEAD(&info->sibling); info->create = tty_is_master(info); info->inherit = false; info->ctl_tty = NULL; info->tty_data = NULL; info->link = NULL; /* * The image might have no reg file record in old CRIU, so * lets don't fail for a while. After a couple of releases * simply require the record to present. * * Note for external ttys it's fine to not have any * reg file rectord because they are inherited from * command line on restore. */ info->reg_d = try_collect_special_file( info->tfe->has_regf_id ? info->tfe->regf_id : info->tfe->id, 1); if (!info->reg_d) { if (info->driver->type != TTY_TYPE__EXT_TTY) { if (!deprecated_ok("TTY w/o regfile")) return -1; if (is_pty(info->driver)) { info->reg_d = pty_alloc_reg(info, true); if (!info->reg_d) { pr_err("Can't generate new reg descriptor for id %#x\n", info->tfe->id); return -1; } } else { pr_err("No reg_d descriptor for id %#x\n", info->tfe->id); return -1; } } } pr_info("Collected tty ID %#x (%s)\n", info->tfe->id, info->driver->name); add_post_prepare_cb_once(&prep_tty_restore); /* * Call it explicitly. Post-callbacks will be called after * namespaces preparation, while the latter needs this mutex. 
*/ if (init_tty_mutex()) return -1; info->fdstore_id = -1; return file_desc_add(&info->d, info->tfe->id, &tty_desc_ops); } struct collect_image_info tty_cinfo = { .fd_type = CR_FD_TTY_FILES, .pb_type = PB_TTY_FILE, .priv_size = sizeof(struct tty_info), .collect = collect_one_tty, }; static int collect_one_tty_data(void *obj, ProtobufCMessage *msg, struct cr_img *i) { struct tty_data_entry *tdo = obj; struct tty_info *info; tdo->tde = pb_msg(msg, TtyDataEntry); pr_debug("Collected data for id %#x (size %zu bytes)\n", tdo->tde->tty_id, (size_t)tdo->tde->data.len); list_for_each_entry(info, &all_ttys, list) { if (tdo->tde->tty_id == info->tie->id) { info->tty_data = tdo; return 0; } } pr_err("No tty found to queued data on id %#x\n", tdo->tde->tty_id); return -ENOENT; } struct collect_image_info tty_cdata = { .fd_type = CR_FD_TTY_DATA, .pb_type = PB_TTY_DATA, .priv_size = sizeof(struct tty_data_entry), .collect = collect_one_tty_data, }; /* Make sure the ttys we're dumping do belong our process tree */ int dump_verify_tty_sids(void) { struct tty_dump_info *dinfo, *n; int ret = 0; /* * There might be a cases where we get sid/pgid on * slave peer. For example the application is running * with redirection and we're migrating shell job. * * # ./app < /dev/zero > /dev/zero &2>1 * * Which produce a tree like * PID PPID PGID SID * root 23786 23784 23786 23786 pts/0 \_ -bash * root 24246 23786 24246 23786 pts/0 \_ ./app * * And the application goes background, then we dump * it from the same shell. * * In this case we simply zap sid/pgid and inherit * the peer from the current terminal on restore. 
*/
	list_for_each_entry_safe(dinfo, n, &all_ttys, list) {
		/* Only the first problem is reported; ttys without a sid are skipped */
		if (!ret && dinfo->sid) {
			struct pstree_item *item = find_first_sid(dinfo->sid);

			if (!item || vpid(item) != dinfo->sid) {
				if (!opts.shell_job) {
					pr_err("Found dangling tty with sid %d pgid %d (%s) on peer fd %d.\n",
					       dinfo->sid, dinfo->pgrp,
					       dinfo->driver->name,
					       dinfo->fd);
					/*
					 * First thing people do with criu is dump smth
					 * run from shell. This is typical pitfall, warn
					 * user about it explicitly.
					 */
					pr_msg("Task attached to shell terminal. "
						"Consider using --" OPT_SHELL_JOB " option. "
						"More details on http://criu.org/Simple_loop\n");
					ret = -1;
				}
			}
		}
	}

	return ret;
}

/*
 * Write the TtyInfoEntry for one terminal and record the terminal on
 * @all_ttys for the later verification and queued-data passes.
 *
 * @lfd:    local descriptor of the tty, target of the termios/winsize ioctls
 * @id:     id stored both in the image entry and in the tty_dump_info
 * @p:      fd parameters (stat data, flags, pid, mnt_id, fd_ctl)
 * @driver: driver the terminal belongs to
 * @index:  tty index; recorded only when the driver is a pty
 *
 * Returns the pb_write_one() result when an entry is written, -1 on
 * any earlier failure.
 */
static int dump_tty_info(int lfd, u32 id, const struct fd_parms *p, struct tty_driver *driver, int index)
{
	TtyInfoEntry info = TTY_INFO_ENTRY__INIT;
	TermiosEntry termios = TERMIOS_ENTRY__INIT;
	TermiosEntry termios_locked = TERMIOS_ENTRY__INIT;
	WinsizeEntry winsize = WINSIZE_ENTRY__INIT;
	TtyPtyEntry pty = TTY_PTY_ENTRY__INIT;
	struct parasite_tty_args *pti;
	struct tty_dump_info *dinfo;

	struct termios t;
	struct winsize w;

	int ret = -1;

	if (!p->fd_ctl) {
		pr_err("No CTL for TTY dump, likely SCM case\n");
		return -1;
	}

	/*
	 * Make sure the structures the system provides us
	 * correlates well with protobuf templates.
	 */
	BUILD_BUG_ON(ARRAY_SIZE(t.c_cc) < TERMIOS_NCC);
	BUILD_BUG_ON(sizeof(termios.c_cc) != sizeof(void *));
	BUILD_BUG_ON((sizeof(termios.c_cc) * TERMIOS_NCC) < sizeof(t.c_cc));

	pti = parasite_dump_tty(p->fd_ctl, p->fd, driver->type);
	if (!pti)
		return -1;

	dinfo = xzalloc(sizeof(*dinfo));
	if (!dinfo)
		return -1;

	dinfo->id = id;
	dinfo->sid = pti->sid;
	dinfo->pgrp = pti->pgrp;
	dinfo->pid_real = p->pid;
	dinfo->fd = p->fd;
	dinfo->mnt_id = p->mnt_id;
	dinfo->driver = driver;
	dinfo->flags = p->flags;

	if (is_pty(driver)) {
		/* Private copy of the descriptor; tty_do_dump_queued_data() reads from it later */
		dinfo->lfd = dup(lfd);
		if (dinfo->lfd < 0) {
			pr_perror("Can't dup local fd on %#x", id);
			xfree(dinfo);
			return -1;
		}
		dinfo->index = index;
	} else {
		dinfo->index = -1;
		dinfo->lfd = -1;
	}

	/* From here on the record is owned by @all_ttys, including on error paths */
	list_add_tail(&dinfo->list, &all_ttys);

	info.id = id;
	info.sid = pti->sid;
	info.pgrp = pti->pgrp;
	info.rdev = p->stat.st_rdev;
	info.dev = p->stat.st_dev;
	info.has_dev = true;
	info.locked = pti->st_lock;
	info.exclusive = pti->st_excl;
	info.packet_mode = pti->st_pckt;

	info.has_uid = true;
	info.uid = userns_uid(p->stat.st_uid);
	info.has_gid = true;
	info.gid = userns_gid(p->stat.st_gid);

	info.type = driver->type;
	if (info.type == TTY_TYPE__PTY) {
		info.pty = &pty;
		pty.index = index;
	}

	/*
	 * Nothing we can do on hanging up terminal,
	 * just write out minimum information we can
	 * gather.
	 */
	if (pti->hangup)
		return pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO);

	/*
	 * Now trace the paired/unpaired ttys. For example
	 * the task might have slave peer assigned but no
	 * master peer. Such "detached" master peers are
	 * not yet supported by our tool and better to
	 * inform a user about such situation.
	 */
	if (is_pty(driver))
		tty_test_and_set(id, tty_active_pairs);

	info.termios = &termios;
	info.termios_locked = &termios_locked;
	info.winsize = &winsize;

	termios.n_c_cc = TERMIOS_NCC;
	termios.c_cc = xmalloc(pb_repeated_size(&termios, c_cc));

	termios_locked.n_c_cc = TERMIOS_NCC;
	termios_locked.c_cc = xmalloc(pb_repeated_size(&termios_locked, c_cc));

	/* Both allocations are checked together; the out label releases whichever succeeded */
	if (!termios.c_cc || !termios_locked.c_cc)
		goto out;

	memzero(&t, sizeof(t));
	if (ioctl(lfd, TCGETS, &t) < 0) {
		pr_perror("Can't get tty params on %#x", id);
		goto out;
	}
	termios_copy(&termios, &t);

	/* The same scratch termios is reused for the locked settings */
	memzero(&t, sizeof(t));
	if (ioctl(lfd, TIOCGLCKTRMIOS, &t) < 0) {
		pr_perror("Can't get tty locked params on %#x", id);
		goto out;
	}
	termios_copy(&termios_locked, &t);

	memzero(&w, sizeof(w));
	if (ioctl(lfd, TIOCGWINSZ, &w) < 0) {
		pr_perror("Can't get tty window params on %#x", id);
		goto out;
	}
	winsize_copy(&winsize, &w);

	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO);
out:
	xfree(termios.c_cc);
	xfree(termios_locked.c_cc);
	return ret;
}

/*
 * Dump one tty file descriptor: build its TtyFileEntry and, when
 * tty_test_and_set() on tty_bitmap returns zero for the generated
 * tty_info_id, dump the terminal parameters through dump_tty_info().
 */
static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p)
{
	TtyFileEntry e = TTY_FILE_ENTRY__INIT;
	int ret = 0, index = -1;
	struct tty_driver *driver;

	pr_info("Dumping tty %d with id %#x\n", lfd, id);

	driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev);
	/* Drivers either compute the index from the descriptor or carry a fixed one */
	if (driver->fd_get_index)
		index = driver->fd_get_index(lfd, p);
	else
		index = driver->index;

	if (index == INDEX_ERR) {
		pr_info("Can't obtain index on tty %d id %#x\n", lfd, id);
		return -1;
	}

	e.id = id;
	e.tty_info_id = tty_gen_id(driver, index);
	e.flags = p->flags;
	e.fown = (FownEntry *)&p->fown;

	/* Every type except TTY_TYPE__EXT_TTY also gets a regular-file entry under a separate id */
	if (driver->type != TTY_TYPE__EXT_TTY) {
		u32 rf_id;

		fd_id_generate_special(NULL, &rf_id);
		if (dump_one_reg_file(lfd, rf_id, p))
			return -1;
		e.has_regf_id = true;
		e.regf_id = rf_id;
	}

	/*
	 * FIXME
	 *
	 * Figure out how to fetch data buffered in terminal.
	 * For a while simply flush before dumping. Note
	 * we don't check for errors here since it makes
	 * no sense anyway, the buffered data is not handled
	 * properly yet.
* * Note as well that if we have only one peer here * the external end might be sending the data to us * again and again while kernel buffer is not full, * this might lead to endless SIGTTOU signal delivery * to the dumpee, ruining checkpoint procedure. * * So simply do not flush the line while we dump * parameters tty never was being a guaranteed delivery * transport anyway. */ if (!tty_test_and_set(e.tty_info_id, tty_bitmap)) ret = dump_tty_info(lfd, e.tty_info_id, p, driver, index); if (!ret) { FileEntry fe = FILE_ENTRY__INIT; fe.type = FD_TYPES__TTY; fe.id = e.id; fe.tty = &e; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } return ret; } const struct fdtype_ops tty_dump_ops = { .type = FD_TYPES__TTY, .dump = dump_one_tty, }; static int tty_reblock(int id, int lfd, int flags) { static const int fmask = O_RDWR | O_NONBLOCK; int ret; if ((flags & fmask) != fmask) { if (fcntl(lfd, F_SETFL, flags)) { ret = -errno; pr_perror("Can't revert mode back to %o on (%#x)", fmask, id); return ret; } } return 0; } static int tty_unblock(int id, int lfd, int flags) { static const int fmask = O_RDWR | O_NONBLOCK; int ret; if ((flags & fmask) != fmask) { if (fcntl(lfd, F_SETFL, fmask)) { ret = -errno; pr_perror("Can't change mode to %o on (%#x)", fmask, id); return ret; } } return 0; } static int tty_do_dump_queued_data(struct tty_dump_info *dinfo) { TtyDataEntry e = TTY_DATA_ENTRY__INIT; size_t off = 0, size = 16384; char *buf; int ret; buf = xmalloc(size); if (!buf) return -ENOMEM; ret = tty_unblock(dinfo->id, dinfo->lfd, dinfo->flags); if (ret) { xfree(buf); return ret; } while (1) { ret = read(dinfo->lfd, &buf[off], size - off); if (ret == 0) { pr_debug("No more data on tty (%s %#x)\n", dinfo->driver->name, dinfo->id); break; } else if (ret < 0) { if (errno == EAGAIN) { pr_debug("Not waiting data tty (%s %#x)\n", dinfo->driver->name, dinfo->id); break; } else { ret = -errno; pr_perror("Can't read data from tty (%s %#x)", dinfo->driver->name, 
dinfo->id); xfree(buf); return ret; } } off += ret; pr_debug("Read %d bytes (%d) from tty (%s %#x)\n", ret, (int)off, dinfo->driver->name, dinfo->id); if (off >= size) { pr_err("The tty (%s %#x) queued data overrflow %zu bytes limit\n", dinfo->driver->name, dinfo->id, size); off = size; break; } } if (off) { dinfo->tty_data = buf; dinfo->tty_data_size = off; e.tty_id = dinfo->id; e.data.data = (void *)buf; e.data.len = off; ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_DATA), &e, PB_TTY_DATA); } else { xfree(buf); ret = 0; } return ret; } /* * If error happens here, so be it, ttys are not delivering * data with guaranteed results. */ static void __tty_do_writeback_queued_data(struct tty_dump_info *dinfo) { if (dinfo->tty_data) { if (write(dinfo->link->lfd, dinfo->tty_data, dinfo->tty_data_size) != dinfo->tty_data_size) pr_perror("Can't writeback to tty (%#x)", dinfo->id); } tty_reblock(dinfo->link->id, dinfo->link->lfd, dinfo->link->flags); } static void tty_do_writeback_queued_data(struct tty_dump_info *dinfo) { __tty_do_writeback_queued_data(dinfo); __tty_do_writeback_queued_data(dinfo->link); } static void tty_dinfo_free(struct tty_dump_info *dinfo) { list_del(&dinfo->list); close_safe(&dinfo->lfd); xfree(dinfo->tty_data); xfree(dinfo); } /* * Dumping queued data must be done at the very end of the * checkpoint procedure -- it's tail optimization, we trying * to defer this procedure until everything else passed * successfully because in real it is time consuming on * its own which might require writting data back to the * former peers if case something go wrong. * * Moreover when we gather PTYs peers into own list we * do it in destructive way -- the former @all_ttys * list get modified (one of the peer get moved from * @all_ttys to @all_ptys list) because otherwise we * will have to add one more entry into tty_dump_info, * thus we simply reuse the @list entry for own needs. 
*/ static int tty_dump_queued_data(void) { struct tty_dump_info *dinfo, *peer, *n; LIST_HEAD(all_ptys); int ret = 0; /* * Link PTY peers, and move one of linked * into separate list. */ list_for_each_entry_safe(dinfo, n, &all_ttys, list) { if (!is_pty(dinfo->driver) || dinfo->link) continue; peer = dinfo; list_for_each_entry_continue(peer, &all_ttys, list) { if (!is_pty(peer->driver) || peer->link) continue; if (peer->index == dinfo->index) { dinfo->link = peer; peer->link = dinfo; pr_debug("Link PTYs (%#x)\n", dinfo->id); list_move(&dinfo->list, &all_ptys); } } } /* * Once linked fetch the queued data if present. */ list_for_each_entry(dinfo, &all_ptys, list) { ret = tty_do_dump_queued_data(dinfo); if (ret) break; ret = tty_do_dump_queued_data(dinfo->link); if (ret) break; } if (ret || opts.final_state != TASK_DEAD) { list_for_each_entry(dinfo, &all_ptys, list) tty_do_writeback_queued_data(dinfo); } list_for_each_entry_safe(dinfo, n, &all_ptys, list) { tty_dinfo_free(dinfo->link); tty_dinfo_free(dinfo); } list_for_each_entry_safe(dinfo, n, &all_ttys, list) tty_dinfo_free(dinfo); return ret; } static int tty_verify_ctty(void) { struct tty_dump_info *d, *p; list_for_each_entry(d, &all_ttys, list) { struct tty_dump_info *n = NULL; if (d->driver->type != TTY_TYPE__CTTY) continue; list_for_each_entry(p, &all_ttys, list) { if (!is_pty(p->driver) || p->sid != d->sid || p->pgrp != d->sid) continue; n = p; break; } if (!n) { pr_err("ctty inheritance detected sid/pgrp %d, " "no PTY peer with sid/pgrp needed\n", d->sid); return -ENOENT; } else if (n->pid_real != d->pid_real) { pr_err("ctty inheritance detected sid/pgrp %d " "(ctty pid_real %d pty pid_real %d)\n", d->sid, d->pid_real, n->pid_real); return -ENOENT; } } return 0; } int tty_post_actions(void) { if (tty_verify_ctty()) return -1; if (tty_verify_active_pairs()) return -1; else if (tty_dump_queued_data()) return -1; return 0; } int tty_prep_fds(void) { if (!opts.shell_job) return 0; if (!isatty(STDIN_FILENO)) 
pr_info("Standard stream is not a terminal, may fail later\n"); else stdin_isatty = true; if (install_service_fd(SELF_STDIN_OFF, STDIN_FILENO) < 0) { pr_err("Can't dup stdin to SELF_STDIN_OFF\n"); return -1; } return 0; } void tty_fini_fds(void) { close_service_fd(SELF_STDIN_OFF); } static int open_pty(void *arg, int flags) { int dfd = (unsigned long) arg; /* * Never set as a control terminal automatically, all * ctty magic happens only in tty_set_sid(). */ flags |= O_NOCTTY; return openat(dfd, "ptmx", flags); } /* Create a pty pair and save a master descriptor in fdstore */ static int pty_create_ptmx_index(int dfd, int index, int flags) { struct tty_info *info; int fd, id; fd = __pty_open_ptmx_index(index, flags, open_pty, (void *)(unsigned long) dfd, "ptmx"); if (fd < 0) return -1; id = fdstore_add(fd); if (id < 0) return -1; close(fd); list_for_each_entry(info, &all_ttys, list) { if (!is_pty(info->driver)) continue; if (info->tie->pty->index == index) { info->fdstore_id = id; } } return 0; } /* * Here we check that a master of a bind-mounted slave was opened in the root * mount namespace. The problem is that we restore all mounts in the root mount * namespace. Only when all mounts are restored, we create other mount * namespaces. So when we are restoring mounts, we can open files only in the * root mount namespace. 
*/ int devpts_check_bindmount(struct mount_info *m) { struct tty_dump_info *dinfo = NULL; struct mount_info *master_mp; int index; if (strcmp(m->root, "/") == 0 || strcmp(m->root, "/ptmx") == 0) return 0; if (sscanf(m->root, "/%d", &index) != 1) { pr_err("Unable to parse %s\n", m->root); return -1; } list_for_each_entry(dinfo, &all_ttys, list) { if (!is_pty(dinfo->driver)) continue; if (dinfo->driver->subtype != TTY_SUBTYPE_MASTER) continue; if (dinfo->index == index) goto found; } if (opts.orphan_pts_master) /* external master */ return 0; pr_err("Unable to find a master for %s\n", m->root); return -1; found: /* mnt_id isn't reported in fdinfo, so here is only one mntns */ if (dinfo->mnt_id == -1) return 0; master_mp = lookup_mnt_id(dinfo->mnt_id); if (!master_mp) { pr_err("Unable to find a mount %d\n", dinfo->mnt_id); return -1; } if (master_mp->nsid->type != NS_ROOT) { pr_err("The master for %s isn't from the root mntns\n", m->root); return -1; } return 0; } /* Restore slave pty-s which have to be bind-mounted to somewhere */ int devpts_restore(struct mount_info *pm) { struct mount_info *bm; int dfd, exit_code = -1; dfd = open(pm->mountpoint, O_RDONLY); if (dfd < 0) { pr_perror("Unable to open %s", pm->mountpoint); return -1; } list_for_each_entry(bm, &pm->mnt_bind, mnt_bind) { int idx; struct stat st; if (sscanf(bm->root, "/%d", &idx) < 1) continue; if (fstatat(dfd, bm->root + 1, &st, 0) == 0) continue; pr_debug("Create a slave tty %d\n", idx); if (pty_create_ptmx_index(dfd, idx, O_RDWR)) goto err; } exit_code = 0; err: close(dfd); return exit_code; } criu-3.6/criu/tun.c000066400000000000000000000243261317335042600142530ustar00rootroot00000000000000#include #include #include #include #include #include #include // MAO required on Centos 6 (linux-3.18.1 kernel) #include #include "cr_options.h" #include "imgset.h" #include "protobuf.h" #include "string.h" #include "files.h" #include "files-reg.h" #include "tun.h" #include "net.h" #include "namespaces.h" #include 
"xmalloc.h" #include "images/tun.pb-c.h" #ifndef IFF_PERSIST #define IFF_PERSIST 0x0800 #endif #ifndef IFF_NOFILTER #define IFF_NOFILTER 0x1000 #endif #ifndef TUNSETQUEUE #define TUNSETQUEUE _IOW('T', 217, int) #define IFF_ATTACH_QUEUE 0x0200 #define IFF_DETACH_QUEUE 0x0400 #endif /* * Absence of the 1st ioctl means we cannot restore tun link. But * since the 2nd one appeared at the same time, we'll "check" this * by trying to dump filter and abort dump if it's not there. */ #ifndef TUNSETIFINDEX #define TUNSETIFINDEX _IOW('T', 218, unsigned int) #endif #ifndef TUNGETFILTER #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) #endif #define TUN_DEV_GEN_PATH "/dev/net/tun" int check_tun_cr(int no_tun_err) { int fd, idx = 13, ret; fd = open(TUN_DEV_GEN_PATH, O_RDWR); if (fd < 0) { pr_perror("Can't check tun support"); return no_tun_err; } ret = ioctl(fd, TUNSETIFINDEX, &idx); if (ret < 0) pr_perror("No proper support for tun dump/restore"); close(fd); return ret; } static LIST_HEAD(tun_links); struct tun_link { char name[IFNAMSIZ]; struct list_head l; union { struct { unsigned flags; } rst; struct { unsigned sndbuf; unsigned vnethdr; } dmp; }; }; static int list_tun_link(NetDeviceEntry *nde) { struct tun_link *tl; tl = xmalloc(sizeof(*tl)); if (!tl) return -1; strlcpy(tl->name, nde->name, sizeof(tl->name)); /* * Keep tun-flags not only for persistency fixup (see * commend below), but also for TUNSETIFF -- we must * open the device with the same flags it should live * with (i.e. -- with which it was created. 
*/
	tl->rst.flags = nde->tun->flags;
	list_add_tail(&tl->l, &tun_links);
	return 0;
}

/* Return the entry on tun_links whose name equals @name, or NULL. */
static struct tun_link *find_tun_link(char *name)
{
	struct tun_link *tl;

	list_for_each_entry(tl, &tun_links, l)
		if (!strcmp(tl->name, name))
			return tl;

	return NULL;
}

/*
 * Allocate a tun_link for @name and fill its dump-time values
 * (vnet header size, send buffer) from the attached descriptor @fd.
 *
 * The new entry is NOT put on tun_links; that is left to the caller.
 *
 * Returns the new entry, or NULL when allocation or an ioctl fails,
 * when a tap device has a non-empty filter, or when @flags has neither
 * IFF_TAP nor IFF_NOFILTER set.
 */
static struct tun_link *__dump_tun_link_fd(int fd, char *name, unsigned flags)
{
	struct tun_link *tl;
	struct sock_fprog flt;

	tl = xmalloc(sizeof(*tl));
	if (!tl)
		goto err;
	strlcpy(tl->name, name, sizeof(tl->name));

	if (ioctl(fd, TUNGETVNETHDRSZ, &tl->dmp.vnethdr) < 0) {
		pr_perror("Can't dump vnethdr size for %s", name);
		goto err;
	}

	if (ioctl(fd, TUNGETSNDBUF, &tl->dmp.sndbuf) < 0) {
		pr_perror("Can't dump sndbuf for %s", name);
		goto err;
	}

	if (flags & IFF_TAP) {
		pr_debug("Checking filter for tap %s\n", name);
		if (ioctl(fd, TUNGETFILTER, &flt) < 0) {
			pr_perror("Can't get tun filter for %s", name);
			goto err;
		}

		/*
		 * TUN filters are tricky -- the program itself is 'somewhere'
		 * in the task's memory, so we can't get one for unattached
		 * persistent device. The only way for doing it is opening the
		 * device with IFF_NOFILTER and attaching some fake one :(
		 */

		if (flt.len != 0) {
			pr_err("Can't dump %s with filter on-board\n", name);
			goto err;
		}
	} else if (!(flags & IFF_NOFILTER)) {
		pr_err("No info about %s filter, kernel is too old\n", name);
		goto err;
	}

	return tl;

err:
	/* Also reached with tl == NULL when the allocation itself failed */
	xfree(tl);
	return NULL;
}

/*
 * Return the tun_link for @name, creating it from the attached
 * descriptor @fd and adding it to tun_links when not known yet.
 */
static struct tun_link *dump_tun_link_fd(int fd, char *name, unsigned flags)
{
	struct tun_link *tl;

	tl = find_tun_link(name);
	if (tl)
		return tl;

	tl = __dump_tun_link_fd(fd, name, flags);
	if (tl)
		/*
		 * Keep this in list till links dumping code starts.
		 * We can't let it dump all this stuff itself, since
		 * multiple attaches to one tun device is limited and
		 * we may not be able to it that late.
		 *
		 * For persistent detached devices the get_tun_link_fd
		 * will attach to the device and get the needed stuff.
*/ list_add(&tl->l, &tun_links); return tl; } static int open_tun_dev(char *name, unsigned int idx, unsigned flags) { int fd; struct ifreq ifr; fd = open(TUN_DEV_GEN_PATH, O_RDWR); if (fd < 0) { pr_perror("Can't open tun device"); return -1; } if (idx) { pr_debug(" restoring %u for %s tun\n", idx, name); if (ioctl(fd, TUNSETIFINDEX, &idx) < 0) { pr_perror("Can't restore tun's index"); goto err; } } memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_flags = flags; if (ioctl(fd, TUNSETIFF, &ifr)) { pr_perror("Can't create tun device"); goto err; } return fd; err: close(fd); return -1; } static struct tun_link *get_tun_link_fd(char *name, unsigned flags) { struct tun_link *tl; int fd; tl = find_tun_link(name); if (tl) return tl; /* * If we haven't found this thing, then the * device we see via netlink exists w/o any fds * attached, i.e. -- it's persistent */ if (!(flags & IFF_PERSIST)) { pr_err("No fd infor for non persistent tun device %s\n", name); return NULL; } /* * Kernel will try to attach filter (if it exists) to our memory, * avoid this. */ flags |= IFF_NOFILTER; fd = open_tun_dev(name, 0, flags); if (fd < 0) return NULL; tl = __dump_tun_link_fd(fd, name, flags); close(fd); return tl; } static int dump_tunfile(int lfd, u32 id, const struct fd_parms *p) { int ret; struct cr_img *img; FileEntry fe = FILE_ENTRY__INIT; TunfileEntry tfe = TUNFILE_ENTRY__INIT; struct ifreq ifr; if (!(root_ns_mask & CLONE_NEWNET)) { pr_err("Net namespace is required to dump tun link\n"); return -1; } if (dump_one_reg_file(lfd, id, p)) return -1; pr_info("Dumping tun-file %d with id %#x\n", lfd, id); tfe.id = id; ret = ioctl(lfd, TUNGETIFF, &ifr); if (ret < 0) { if (errno != EBADFD) { pr_perror("Can't dump tun-file device"); return -1; } /* * Otherwise this is just opened file with not yet attached * tun device. Go agead an write the respective entry. 
*/ } else { tfe.netdev = ifr.ifr_name; pr_info("`- attached to device %s (flags %x)\n", tfe.netdev, ifr.ifr_flags); if (ifr.ifr_flags & IFF_DETACH_QUEUE) { tfe.has_detached = true; tfe.detached = true; } if (dump_tun_link_fd(lfd, tfe.netdev, ifr.ifr_flags) == NULL) return -1; } fe.type = FD_TYPES__TUNF; fe.id = tfe.id; fe.tunf = &tfe; img = img_from_set(glob_imgset, CR_FD_FILES); return pb_write_one(img, &fe, PB_FILE); } const struct fdtype_ops tunfile_dump_ops = { .type = FD_TYPES__TUNF, .dump = dump_tunfile, }; struct tunfile_info { struct file_desc d; TunfileEntry *tfe; }; static int tunfile_open(struct file_desc *d, int *new_fd) { int fd; struct tunfile_info *ti; struct ifreq ifr; struct tun_link *tl; ti = container_of(d, struct tunfile_info, d); fd = open_reg_by_id(ti->tfe->id); if (fd < 0) return -1; if (!ti->tfe->netdev) /* just-opened tun file */ goto ok;; tl = find_tun_link(ti->tfe->netdev); if (!tl) { pr_err("No tun device for file %s\n", ti->tfe->netdev); goto err; } memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); ifr.ifr_flags = tl->rst.flags; if (ioctl(fd, TUNSETIFF, &ifr) < 0) { pr_perror("Can't attach tunfile to device"); goto err; } if (ti->tfe->has_detached && ti->tfe->detached) { pr_info("Detaching from %s queue\n", ti->tfe->netdev); ifr.ifr_flags = IFF_DETACH_QUEUE; if (ioctl(fd, TUNSETQUEUE, &ifr) < 0) { pr_perror("Can't detach queue"); goto err; } } if (!(tl->rst.flags & IFF_PERSIST)) { pr_info("Dropping persistency for %s\n", tl->name); if (ioctl(fd, TUNSETPERSIST, 0) < 0) { pr_perror("Error dropping persistency"); goto err; } } ok: *new_fd = fd; return 0; err: close(fd); return -1; } static struct file_desc_ops tunfile_desc_ops = { .type = FD_TYPES__TUNF, .open = tunfile_open, }; static int collect_one_tunfile(void *o, ProtobufCMessage *base, struct cr_img *i) { struct tunfile_info *ti = o; ti->tfe = pb_msg(base, TunfileEntry); file_desc_add(&ti->d, ti->tfe->id, &tunfile_desc_ops); pr_info("Collected %s 
tunfile\n", ti->tfe->netdev); return 0; } struct collect_image_info tunfile_cinfo = { .fd_type = CR_FD_TUNFILE, .pb_type = PB_TUNFILE, .priv_size = sizeof(struct tunfile_info), .collect = collect_one_tunfile, }; int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info) { TunLinkEntry tle = TUN_LINK_ENTRY__INIT; char spath[64]; char buf[64]; int ret = 0; struct tun_link *tl; sprintf(spath, "class/net/%s/tun_flags", nde->name); ret |= read_ns_sys_file(spath, buf, sizeof(buf)); tle.flags = strtol(buf, NULL, 0); sprintf(spath, "class/net/%s/owner", nde->name); ret |= read_ns_sys_file(spath, buf, sizeof(buf)); tle.owner = strtol(buf, NULL, 10); sprintf(spath, "class/net/%s/group", nde->name); ret |= read_ns_sys_file(spath, buf, sizeof(buf)); tle.group = strtol(buf, NULL, 10); if (ret < 0) return ret; tl = get_tun_link_fd(nde->name, tle.flags); if (!tl) return ret; tle.vnethdr = tl->dmp.vnethdr; tle.sndbuf = tl->dmp.sndbuf; nde->tun = &tle; return write_netdev_img(nde, fds, info); } int restore_one_tun(NetDeviceEntry *nde, int nlsk) { int fd, ret = -1, aux; if (!nde->tun) { pr_err("Corrupted TUN link entry %x\n", nde->ifindex); return -1; } pr_info("Restoring tun device %s\n", nde->name); fd = open_tun_dev(nde->name, nde->ifindex, nde->tun->flags); if (fd < 0) return -1; aux = nde->tun->owner; if ((aux != -1) && ioctl(fd, TUNSETOWNER, aux) < 0) { pr_perror("Can't set owner"); goto out; } aux = nde->tun->group; if ((aux != -1) && ioctl(fd, TUNSETGROUP, aux) < 0) { pr_perror("Can't set group"); goto out; } aux = nde->tun->sndbuf; if (ioctl(fd, TUNSETSNDBUF, &aux) < 0) { pr_perror("Can't set sndbuf"); goto out; } aux = nde->tun->vnethdr; if (ioctl(fd, TUNSETVNETHDRSZ, &aux) < 0) { pr_perror("Can't set vnethdr"); goto out; } /* * Set this device persistent anyway and schedule * the persistence drop if it should not be such. * The first _real_ opener will do it. 
*/
	if (ioctl(fd, TUNSETPERSIST, 1)) {
		pr_perror("Can't make tun device persistent");
		goto out;
	}

	if (restore_link_parms(nde, nlsk)) {
		pr_err("Error restoring %s link params\n", nde->name);
		goto out;
	}

	ret = list_tun_link(nde);
out:
	close(fd);
	return ret;
}
criu-3.6/criu/uffd.c000066400000000000000000000631671317335042600143770ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#include "linux/userfaultfd.h"

#include "int.h"
#include "page.h"
#include "criu-log.h"
#include "criu-plugin.h"
#include "pagemap.h"
#include "files-reg.h"
#include "kerndat.h"
#include "mem.h"
#include "uffd.h"
#include "util-pie.h"
#include "protobuf.h"
#include "pstree.h"
#include "crtools.h"
#include "cr_options.h"
#include "xmalloc.h"
#include
#include "restorer.h"
#include "page-xfer.h"
#include "common/lock.h"
#include "rst-malloc.h"
#include "util.h"

#undef LOG_PREFIX
#define LOG_PREFIX "uffd: "

/* Logging helpers that prefix every message with "<pid>-<uffd>: " of the given lazy_pages_info */
#define lp_debug(lpi, fmt, arg...) pr_debug("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
#define lp_info(lpi, fmt, arg...) pr_info("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
#define lp_warn(lpi, fmt, arg...) pr_warn("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
#define lp_err(lpi, fmt, arg...) pr_err("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
#define lp_perror(lpi, fmt, arg...) pr_perror("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)

/* userfaultfd features checked as a set by uffd_noncooperative() */
#define NEED_UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \
				UFFD_FEATURE_EVENT_REMAP | \
				UFFD_FEATURE_EVENT_UNMAP | \
				UFFD_FEATURE_EVENT_REMOVE)

#define LAZY_PAGES_SOCK_NAME "lazy-pages.socket"

/* Serializes the pid + descriptor exchange done in send_uffd() */
static mutex_t *lazy_sock_mutex;

struct lazy_iov {
	struct list_head l;
	unsigned long base; /* run-time start address, tracks remaps */
	unsigned long img_base; /* start address at the dump time */
	unsigned long len;
	bool queued;
};

struct lp_req {
	unsigned long addr; /* actual #PF (or background) destination */
	unsigned long img_addr; /* the corresponding address at the dump time */
	unsigned long len;
	struct list_head l;
};

struct lazy_pages_info {
	int pid;

	struct list_head iovs;
	struct list_head reqs;

	struct lazy_pages_info *parent;
	unsigned num_children;

	struct page_read pr;

	unsigned long total_pages;
	unsigned long copied_pages;

	struct epoll_rfd lpfd;

	struct list_head l;

	void *buf;
};

/* global lazy-pages daemon state */
static LIST_HEAD(lpis);
static LIST_HEAD(exiting_lpis);
static LIST_HEAD(pending_lpis);
static int epollfd;

static int handle_uffd_event(struct epoll_rfd *lpfd);

/*
 * Allocate a zeroed lazy_pages_info with empty lists and
 * handle_uffd_event() as its epoll read handler.
 *
 * Returns the new object, or NULL when allocation fails.
 */
static struct lazy_pages_info *lpi_init(void)
{
	struct lazy_pages_info *lpi = NULL;

	lpi = xmalloc(sizeof(*lpi));
	if (!lpi)
		return NULL;

	memset(lpi, 0, sizeof(*lpi));
	INIT_LIST_HEAD(&lpi->iovs);
	INIT_LIST_HEAD(&lpi->reqs);
	INIT_LIST_HEAD(&lpi->l);
	lpi->lpfd.revent = handle_uffd_event;

	return lpi;
}

/* Unlink and free every lazy_iov on @lpi's iovs list. */
static void free_iovs(struct lazy_pages_info *lpi)
{
	struct lazy_iov *p, *n;

	list_for_each_entry_safe(p, n, &lpi->iovs, l) {
		list_del(&p->l);
		xfree(p);
	}
}

/*
 * Release a lazy_pages_info: its buffer, its iovs, its uffd and
 * the object itself. NULL is accepted and ignored.
 */
static void lpi_fini(struct lazy_pages_info *lpi)
{
	if (!lpi)
		return;
	free(lpi->buf);
	free_iovs(lpi);
	/* fd is 0 after lpi_init()'s memset, so 0 is treated as "not set" here */
	if (lpi->lpfd.fd > 0)
		close(lpi->lpfd.fd);
	if (lpi->parent)
		lpi->parent->num_children--;
	/* The page reader is closed only for an lpi with no parent and no children */
	if (!lpi->parent && !lpi->num_children && lpi->pr.close)
		lpi->pr.close(&lpi->pr);
	free(lpi);
}

static int prepare_sock_addr(struct sockaddr_un *saddr)
{
	int len;

	memset(saddr, 0, sizeof(struct sockaddr_un));
saddr->sun_family = AF_UNIX; len = snprintf(saddr->sun_path, sizeof(saddr->sun_path), "%s", LAZY_PAGES_SOCK_NAME); if (len >= sizeof(saddr->sun_path)) { pr_err("Wrong UNIX socket name: %s\n", LAZY_PAGES_SOCK_NAME); return -1; } return 0; } static int send_uffd(int sendfd, int pid) { int fd; int ret = -1; if (sendfd < 0) return -1; fd = get_service_fd(LAZY_PAGES_SK_OFF); if (fd < 0) { pr_err("%s: get_service_fd\n", __func__); return -1; } mutex_lock(lazy_sock_mutex); /* The "transfer protocol" is first the pid as int and then * the FD for UFFD */ pr_debug("Sending PID %d\n", pid); if (send(fd, &pid, sizeof(pid), 0) < 0) { pr_perror("PID sending error"); goto out; } /* for a zombie process pid will be negative */ if (pid < 0) { ret = 0; goto out; } if (send_fd(fd, NULL, 0, sendfd) < 0) { pr_err("send_fd error\n"); goto out; } ret = 0; out: mutex_unlock(lazy_sock_mutex); close(fd); return ret; } int lazy_pages_setup_zombie(int pid) { if (!opts.lazy_pages) return 0; if (send_uffd(0, -pid)) return -1; return 0; } bool uffd_noncooperative(void) { unsigned long features = NEED_UFFD_API_FEATURES; return (kdat.uffd_features & features) == features; } int uffd_open(int flags, unsigned long *features) { struct uffdio_api uffdio_api = { 0 }; int uffd; uffd = syscall(SYS_userfaultfd, flags); if (uffd == -1) { pr_perror("Lazy pages are not available"); return -errno; } uffdio_api.api = UFFD_API; if (features) uffdio_api.features = *features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { pr_perror("Failed to get uffd API"); goto err; } if (uffdio_api.api != UFFD_API) { pr_err("Incompatible uffd API: expected %Lu, got %Lu\n", UFFD_API, uffdio_api.api); goto err; } if (features) *features = uffdio_api.features; return uffd; err: close(uffd); return -1; } /* This function is used by 'criu restore --lazy-pages' */ int setup_uffd(int pid, struct task_restore_args *task_args) { unsigned long features = kdat.uffd_features & NEED_UFFD_API_FEATURES; if (!opts.lazy_pages) { task_args->uffd = 
-1; return 0; } /* * Open userfaulfd FD which is passed to the restorer blob and * to a second process handling the userfaultfd page faults. */ task_args->uffd = uffd_open(O_CLOEXEC | O_NONBLOCK, &features); if (task_args->uffd < 0) { pr_perror("Unable to open an userfaultfd descriptor"); return -1; } if (send_uffd(task_args->uffd, pid) < 0) goto err; return 0; err: close(task_args->uffd); return -1; } int prepare_lazy_pages_socket(void) { int fd, new_fd; int len; struct sockaddr_un sun; if (!opts.lazy_pages) return 0; if (prepare_sock_addr(&sun)) return -1; lazy_sock_mutex = shmalloc(sizeof(*lazy_sock_mutex)); if (!lazy_sock_mutex) return -1; mutex_init(lazy_sock_mutex); if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) return -1; new_fd = install_service_fd(LAZY_PAGES_SK_OFF, fd); close(fd); if (new_fd < 0) return -1; len = offsetof(struct sockaddr_un, sun_path) + strlen(sun.sun_path); if (connect(new_fd, (struct sockaddr *) &sun, len) < 0) { pr_perror("connect to %s failed", sun.sun_path); close(new_fd); return -1; } return 0; } static int server_listen(struct sockaddr_un *saddr) { int fd; int len; if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) return -1; unlink(saddr->sun_path); len = offsetof(struct sockaddr_un, sun_path) + strlen(saddr->sun_path); if (bind(fd, (struct sockaddr *) saddr, len) < 0) { goto out; } if (listen(fd, 10) < 0) { goto out; } return fd; out: close(fd); return -1; } static MmEntry *init_mm_entry(struct lazy_pages_info *lpi) { struct cr_img *img; MmEntry *mm; int ret; img = open_image(CR_FD_MM, O_RSTR, lpi->pid); if (!img) return NULL; ret = pb_read_one_eof(img, &mm, PB_MM); close_image(img); if (ret == -1) return NULL; lp_debug(lpi, "Found %zd VMAs in image\n", mm->n_vmas); return mm; } static struct lazy_iov *find_iov(struct lazy_pages_info *lpi, unsigned long addr) { struct lazy_iov *iov; list_for_each_entry(iov, &lpi->iovs, l) if (addr >= iov->base && addr < iov->base + iov->len) return iov; return NULL; } static int split_iov(struct 
lazy_iov *iov, unsigned long addr, bool new_below) { struct lazy_iov *new; new = xzalloc(sizeof(*new)); if (!new) return -1; if (new_below) { new->base = iov->base; new->img_base = iov->img_base; new->len = addr - iov->base; iov->base = addr; iov->img_base += new->len; iov->len -= new->len; list_add_tail(&new->l, &iov->l); } else { new->base = addr; new->img_base = iov->img_base + addr - iov->base; new->len = iov->len - (addr - iov->base); iov->len -= new->len; list_add(&new->l, &iov->l); } return 0; } static int copy_iovs(struct lazy_pages_info *src, struct lazy_pages_info *dst) { struct lazy_iov *iov, *new; int max_iov_len = 0; list_for_each_entry(iov, &src->iovs, l) { new = xzalloc(sizeof(*new)); if (!new) return -1; new->base = iov->base; new->img_base = iov->img_base; new->len = iov->len; list_add_tail(&new->l, &dst->iovs); if (new->len > max_iov_len) max_iov_len = new->len; } if (posix_memalign(&dst->buf, PAGE_SIZE, max_iov_len)) goto free_iovs; return 0; free_iovs: free_iovs(dst); return -1; } /* * Purge range (addr, addr + len) from lazy_iovs. The range may * cover several continuous IOVs. */ static int drop_iovs(struct lazy_pages_info *lpi, unsigned long addr, int len) { struct lazy_iov *iov, *n; list_for_each_entry_safe(iov, n, &lpi->iovs, l) { unsigned long start = iov->base; unsigned long end = start + iov->len; if (len <= 0 || addr + len < start) break; if (addr >= end) continue; if (addr < start) { len -= (start - addr); addr = start; } /* * The range completely fits into the current IOV. * If addr equals iov_base we just "drop" the * beginning of the IOV. Otherwise, we make the IOV to * end at addr, and add a new IOV start starts at * addr + len. */ if (addr + len < end) { if (addr == start) { iov->base += len; iov->img_base += len; iov->len -= len; } else { if (split_iov(iov, addr + len, false)) return -1; iov->len -= len; } break; } /* * The range spawns beyond the end of the current IOV. * If addr equals iov_base we just "drop" the entire * IOV. 
Otherwise, we cut the beginning of the IOV
		 * and continue to the next one with the updated range
		 */
		if (addr == start) {
			list_del(&iov->l);
			xfree(iov);
		} else {
			iov->len -= (end - addr);
		}

		/* Advance the range past this IOV before looking at the next one */
		len -= (end - addr);
		addr = end;
	}

	return 0;
}

/*
 * Move the IOVs covering run-time range [@from, @from + @len) so that
 * they start at @to instead.
 *
 * IOVs that straddle a range boundary are split first, so only whole
 * entries are shifted. The shifted entries are collected on a
 * temporary list and then re-inserted into @lpi->iovs by base address.
 *
 * Returns 0 on success, -1 when a split fails.
 *
 * NOTE(review): @len is unsigned, so the "len <= 0" test only catches
 * zero; confirm that "len -= ..." cannot wrap for the ranges callers
 * pass.
 */
static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned long to, unsigned long len)
{
	unsigned long off = to - from;
	struct lazy_iov *iov, *n, *p;
	LIST_HEAD(remaps);

	list_for_each_entry_safe(iov, n, &lpi->iovs, l) {
		unsigned long iov_end = iov->base + iov->len;

		if (from >= iov_end)
			continue;

		if (len <= 0 || from + len < iov->base)
			break;

		if (from < iov->base) {
			len -= (iov->base - from);
			from = iov->base;
		}

		if (from > iov->base)
			if (split_iov(iov, from, true))
				return -1;
		if (from + len < iov_end)
			if (split_iov(iov, from + len, false))
				return -1;

		/* A split may have linked a new entry right after iov; refresh the cursor */
		list_safe_reset_next(iov, n, l);

		/* here we have iov->base = from, iov_end <= from + len */
		from = iov_end;
		len -= iov->len;
		iov->base += off;
		list_move_tail(&iov->l, &remaps);
	}

	/* Put every shifted entry back at the position matching its new base */
	list_for_each_entry_safe(iov, n, &remaps, l) {
		list_for_each_entry(p, &lpi->iovs, l) {
			if (iov->base < p->base) {
				list_move_tail(&iov->l, &p->l);
				break;
			}
			if (list_is_last(&p->l, &lpi->iovs) && iov->base > p->base) {
				list_move(&iov->l, &p->l);
				break;
			}
		}
	}

	return 0;
}

/*
 * Create a list of IOVs that can be handled using userfaultfd. The
 * IOVs generally correspond to lazy pagemap entries, except the cases
 * when a single pagemap entry covers several VMAs. In those cases
 * IOVs are split at VMA boundaries because UFFDIO_COPY may be done
 * only inside a single VMA.
 * We assume here that pagemaps and VMAs are sorted.
*/ static int collect_iovs(struct lazy_pages_info *lpi) { struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; int nr_pages = 0, n_vma = 0, max_iov_len = 0; int ret = -1; unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) return -1; while (pr->advance(pr)) { if (!pagemap_lazy(pr->pe)) continue; start = pr->pe->vaddr; end = start + pr->pe->nr_pages * page_size(); nr_pages += pr->pe->nr_pages; for (; n_vma < mm->n_vmas; n_vma++) { VmaEntry *vma = mm->vmas[n_vma]; if (start >= vma->end) continue; iov = xzalloc(sizeof(*iov)); if (!iov) goto free_iovs; len = min_t(uint64_t, end, vma->end) - start; iov->base = start; iov->img_base = start; iov->len = len; list_add_tail(&iov->l, &lpi->iovs); if (len > max_iov_len) max_iov_len = len; if (end <= vma->end) break; start = vma->end; } } if (posix_memalign(&lpi->buf, PAGE_SIZE, max_iov_len)) goto free_iovs; ret = nr_pages; goto free_mm; free_iovs: free_iovs(lpi); free_mm: mm_entry__free_unpacked(mm, NULL); return ret; } static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { struct lazy_pages_info *lpi; int ret = -1; int pr_flags = PR_TASK; lpi = lpi_init(); if (!lpi) goto out; /* The "transfer protocol" is first the pid as int and then * the FD for UFFD */ ret = recv(client, &lpi->pid, sizeof(lpi->pid), 0); if (ret != sizeof(lpi->pid)) { if (ret < 0) pr_perror("PID recv error"); else pr_err("PID recv: short read\n"); goto out; } if (lpi->pid < 0) { pr_debug("Zombie PID: %d\n", lpi->pid); lpi_fini(lpi); return 0; } lpi->lpfd.fd = recv_fd(client); if (lpi->lpfd.fd < 0) { pr_err("recv_fd error\n"); goto out; } pr_debug("Received PID: %d, uffd: %d\n", lpi->pid, lpi->lpfd.fd); if (opts.use_page_server) pr_flags |= PR_REMOTE; ret = open_page_read(lpi->pid, &lpi->pr, pr_flags); if (ret <= 0) { ret = -1; goto out; } lpi->pr.io_complete = uffd_io_complete; /* * Find the memory pages belonging to the restored process * so 
that it is trackable when all pages have been transferred. */ ret = collect_iovs(lpi); if (ret < 0) goto out; lpi->total_pages = ret; lp_debug(lpi, "Found %ld pages to be handled by UFFD\n", lpi->total_pages); list_add_tail(&lpi->l, &lpis); *_lpi = lpi; return 0; out: lpi_fini(lpi); return -1; } static int handle_exit(struct lazy_pages_info *lpi) { lp_debug(lpi, "EXIT\n"); if (epoll_del_rfd(epollfd, &lpi->lpfd)) return -1; free_iovs(lpi); close(lpi->lpfd.fd); lpi->lpfd.fd = 0; /* keep it for tracking in-flight requests and for the summary */ list_move_tail(&lpi->l, &lpis); return 0; } static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsigned long len, unsigned long rc_len, int rc) { if (rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); return 0; } if (rc_len != -EEXIST) { lp_perror(lpi, "%s: rc:%d copy:%ld, errno:%d", op, rc, rc_len, errno); return -1; } } else if (rc_len != len) { lp_err(lpi, "%s unexpected size %ld\n", op, rc_len); return -1; } return 0; } static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = nr_pages * page_size(); int rc; uffdio_copy.dst = address; uffdio_copy.src = (unsigned long)lpi->buf; uffdio_copy.len = len; uffdio_copy.mode = 0; uffdio_copy.copy = 0; lp_debug(lpi, "uffd_copy: 0x%llx/%ld\n", uffdio_copy.dst, len); rc = ioctl(lpi->lpfd.fd, UFFDIO_COPY, &uffdio_copy); if (uffd_check_op_error(lpi, "copy", len, uffdio_copy.copy, rc)) return -1; lpi->copied_pages += nr_pages; return 0; } static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) { struct lazy_pages_info *lpi; unsigned long addr = 0; struct lp_req *req; lpi = container_of(pr, struct lazy_pages_info, pr); list_for_each_entry(req, &lpi->reqs, l) { if (req->img_addr == img_addr) { addr = req->addr; list_del(&req->l); xfree(req); break; } } /* * The process may exit while we still have requests in * flight. 
We just drop the request and the received data in * this case to avoid making uffd unhappy */ if (list_empty(&lpi->iovs)) return 0; BUG_ON(!addr); if (uffd_copy(lpi, addr, nr)) return -1; return drop_iovs(lpi, addr, nr * PAGE_SIZE); } static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; int rc; uffdio_zeropage.range.start = address; uffdio_zeropage.range.len = len; uffdio_zeropage.mode = 0; lp_debug(lpi, "zero page at 0x%llx\n", address); rc = ioctl(lpi->lpfd.fd, UFFDIO_ZEROPAGE, &uffdio_zeropage); if (uffd_check_op_error(lpi, "zero", len, uffdio_zeropage.zeropage, rc)) return -1; return 0; } /* * Seek for the requested address in the pagemap. If it is found, the * subsequent call to pr->page_read will bring us the data. If the * address is not found in the pagemap, but no error occured, the * address should be mapped to zero pfn. * * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) { int ret; lpi->pr.reset(&lpi->pr); ret = lpi->pr.seek_pagemap(&lpi->pr, address); if (!ret) { lp_err(lpi, "no pagemap covers %llx\n", address); return -1; } return 0; } static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) { int ret; ret = uffd_seek_pages(lpi, address, nr); if (ret) return ret; ret = lpi->pr.read_pages(&lpi->pr, address, nr, lpi->buf, flags); if (ret <= 0) { lp_err(lpi, "failed reading pages at %llx\n", address); return ret; } return 0; } static struct lazy_iov *first_pending_iov(struct lazy_pages_info *lpi) { struct lazy_iov *iov; list_for_each_entry(iov, &lpi->iovs, l) if (!iov->queued) return iov; return NULL; } static bool is_iov_queued(struct lazy_pages_info *lpi, struct lazy_iov *iov) { struct lp_req *req; list_for_each_entry(req, &lpi->reqs, l) if (req->addr >= iov->base && req->addr < iov->base + 
iov->len) return true; return false; } static int handle_remaining_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; struct lp_req *req; int nr_pages, err; iov = first_pending_iov(lpi); if (!iov) return 0; if (is_iov_queued(lpi, iov)) return 0; nr_pages = iov->len / PAGE_SIZE; req = xzalloc(sizeof(*req)); if (!req) return -1; req->addr = iov->base; req->img_addr = iov->img_base; req->len = iov->len; list_add(&req->l, &lpi->reqs); iov->queued = true; err = uffd_handle_pages(lpi, req->img_addr, nr_pages, PR_ASYNC | PR_ASAP); if (err < 0) { lp_err(lpi, "Error during UFFD copy\n"); return -1; } return 0; } static int handle_remove(struct lazy_pages_info *lpi, struct uffd_msg *msg) { struct uffdio_range unreg; unreg.start = msg->arg.remove.start; unreg.len = msg->arg.remove.end - msg->arg.remove.start; lp_debug(lpi, "%s: %Lx(%Lx)\n", msg->event == UFFD_EVENT_REMOVE ? "REMOVE" : "UNMAP", unreg.start, unreg.len); /* * The REMOVE event does not change the VMA, so we need to * make sure that we won't handle #PFs in the removed * range. 
With UNMAP, there's no VMA to worry about */ if (msg->event == UFFD_EVENT_REMOVE && ioctl(lpi->lpfd.fd, UFFDIO_UNREGISTER, &unreg)) { /* * The kernel returns -ENOMEM when unregister is * called after the process has gone */ if (errno == ENOMEM) { handle_exit(lpi); return 0; } pr_perror("Failed to unregister (%llx - %llx)", unreg.start, unreg.start + unreg.len); return -1; } return drop_iovs(lpi, unreg.start, unreg.len); } static int handle_remap(struct lazy_pages_info *lpi, struct uffd_msg *msg) { unsigned long from = msg->arg.remap.from; unsigned long to = msg->arg.remap.to; unsigned long len = msg->arg.remap.len; lp_debug(lpi, "REMAP: %lx -> %lx (%ld)\n", from , to, len); return remap_iovs(lpi, from, to, len); } static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg) { struct lazy_pages_info *lpi; int uffd = msg->arg.fork.ufd; lp_debug(parent_lpi, "FORK: child with ufd=%d\n", uffd); lpi = lpi_init(); if (!lpi) return -1; if (copy_iovs(parent_lpi, lpi)) goto out; lpi->pid = parent_lpi->pid; lpi->lpfd.fd = uffd; lpi->parent = parent_lpi->parent ? 
parent_lpi->parent : parent_lpi; lpi->copied_pages = lpi->parent->copied_pages; lpi->total_pages = lpi->parent->total_pages; list_add_tail(&lpi->l, &pending_lpis); dup_page_read(&lpi->parent->pr, &lpi->pr); lpi->parent->num_children++; return 1; out: lpi_fini(lpi); return -1; } static int complete_forks(int epollfd, struct epoll_event **events, int *nr_fds) { struct lazy_pages_info *lpi, *n; list_for_each_entry(lpi, &pending_lpis, l) (*nr_fds)++; *events = xrealloc(*events, sizeof(struct epoll_event) * (*nr_fds)); if (!*events) return -1; list_for_each_entry_safe(lpi, n, &pending_lpis, l) { if (epoll_add_rfd(epollfd, &lpi->lpfd)) return -1; list_del_init(&lpi->l); list_add_tail(&lpi->l, &lpis); } return 0; } static bool is_page_queued(struct lazy_pages_info *lpi, unsigned long addr) { struct lp_req *req; list_for_each_entry(req, &lpi->reqs, l) if (addr >= req->addr && addr < req->addr + req->len) return true; return false; } static int handle_page_fault(struct lazy_pages_info *lpi, struct uffd_msg *msg) { struct lp_req *req; struct lazy_iov *iov; __u64 address; int ret; /* Align requested address to the next page boundary */ address = msg->arg.pagefault.address & ~(page_size() - 1); lp_debug(lpi, "#PF at 0x%llx\n", address); if (is_page_queued(lpi, address)) return 0; iov = find_iov(lpi, address); if (!iov) return uffd_zero(lpi, address, 1); req = xzalloc(sizeof(*req)); if (!req) return -1; req->addr = address; req->img_addr = iov->img_base + (address - iov->base); req->len = PAGE_SIZE; list_add(&req->l, &lpi->reqs); ret = uffd_handle_pages(lpi, req->img_addr, 1, PR_ASYNC | PR_ASAP); if (ret < 0) { lp_err(lpi, "Error during regular page copy\n"); return -1; } return 0; } static int handle_uffd_event(struct epoll_rfd *lpfd) { struct lazy_pages_info *lpi; struct uffd_msg msg; int ret; lpi = container_of(lpfd, struct lazy_pages_info, lpfd); ret = read(lpfd->fd, &msg, sizeof(msg)); if (!ret) return 1; if (ret != sizeof(msg)) { /* we've already handled the page fault 
for another thread */ if (errno == EAGAIN) return 0; if (ret < 0) lp_perror(lpi, "Can't read uffd message"); else lp_err(lpi, "Can't read uffd message: short read"); return -1; } switch (msg.event) { case UFFD_EVENT_PAGEFAULT: return handle_page_fault(lpi, &msg); case UFFD_EVENT_REMOVE: case UFFD_EVENT_UNMAP: return handle_remove(lpi, &msg); case UFFD_EVENT_REMAP: return handle_remap(lpi, &msg); case UFFD_EVENT_FORK: return handle_fork(lpi, &msg); default: lp_err(lpi, "unexpected uffd event %u\n", msg.event); return -1; } return 0; } static void lazy_pages_summary(struct lazy_pages_info *lpi) { lp_debug(lpi, "UFFD transferred pages: (%ld/%ld)\n", lpi->copied_pages, lpi->total_pages); #if 0 if ((lpi->copied_pages != lpi->total_pages) && (lpi->total_pages > 0)) { lp_warn(lpi, "Only %ld of %ld pages transferred via UFFD\n" "Something probably went wrong.\n", lpi->copied_pages, lpi->total_pages); return 1; } #endif } #define POLL_TIMEOUT 1000 static int handle_requests(int epollfd, struct epoll_event *events, int nr_fds) { struct lazy_pages_info *lpi, *n; int poll_timeout = POLL_TIMEOUT; int ret; for (;;) { bool remaining = false; ret = epoll_run_rfds(epollfd, events, nr_fds, poll_timeout); if (ret < 0) goto out; if (ret > 0) { if (complete_forks(epollfd, &events, &nr_fds)) return -1; continue; } if (poll_timeout) pr_debug("Start handling remaining pages\n"); poll_timeout = 0; list_for_each_entry_safe(lpi, n, &lpis, l) { if (list_empty(&lpi->iovs) && list_empty(&lpi->reqs)) { lazy_pages_summary(lpi); list_del(&lpi->l); lpi_fini(lpi); continue; } remaining = true; if (!list_empty(&lpi->iovs)) { ret = handle_remaining_pages(lpi); if (ret < 0) goto out; break; } } if (!remaining) break; } out: return ret; } static int prepare_lazy_socket(void) { int listen; struct sockaddr_un saddr; if (prepare_sock_addr(&saddr)) return -1; pr_debug("Waiting for incoming connections on %s\n", saddr.sun_path); if ((listen = server_listen(&saddr)) < 0) { pr_perror("server_listen error"); 
return -1; } return listen; } static int prepare_uffds(int listen, int epollfd) { int i; int client; socklen_t len; struct sockaddr_un saddr; /* accept new client request */ len = sizeof(struct sockaddr_un); if ((client = accept(listen, (struct sockaddr *) &saddr, &len)) < 0) { pr_perror("server_accept error"); close(listen); return -1; } for (i = 0; i < task_entries->nr_tasks; i++) { struct lazy_pages_info *lpi = NULL; if (ud_open(client, &lpi)) goto close_uffd; if (lpi == NULL) continue; if (epoll_add_rfd(epollfd, &lpi->lpfd)) goto close_uffd; } close_safe(&client); close(listen); return 0; close_uffd: close_safe(&client); close(listen); return -1; } int cr_lazy_pages(bool daemon) { struct epoll_event *events; int nr_fds; int lazy_sk; int ret; if (kerndat_uffd() || !kdat.has_uffd) return -1; if (prepare_dummy_pstree()) return -1; lazy_sk = prepare_lazy_socket(); if (lazy_sk < 0) return -1; if (daemon) { ret = cr_daemon(1, 0, &lazy_sk, -1); if (ret == -1) { pr_err("Can't run in the background\n"); return -1; } if (ret > 0) { /* parent task, daemon started */ if (opts.pidfile) { if (write_pidfile(ret) == -1) { pr_perror("Can't write pidfile"); kill(ret, SIGKILL); waitpid(ret, NULL, 0); return -1; } } return 0; } } if (close_status_fd()) return -1; nr_fds = task_entries->nr_tasks + (opts.use_page_server ? 
1 : 0); epollfd = epoll_prepare(nr_fds, &events); if (epollfd < 0) return -1; if (prepare_uffds(lazy_sk, epollfd)) return -1; if (opts.use_page_server) { if (connect_to_page_server_to_recv(epollfd)) return -1; } ret = handle_requests(epollfd, events, nr_fds); return ret; } criu-3.6/criu/util.c000066400000000000000000000575611317335042600144310ustar00rootroot00000000000000#define _XOPEN_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bitops.h" #include "page.h" #include "common/compiler.h" #include "common/list.h" #include "util.h" #include "rst-malloc.h" #include "image.h" #include "vma.h" #include "mem.h" #include "namespaces.h" #include "criu-log.h" #include "cr_options.h" #include "servicefd.h" #include "cr-service.h" #include "files.h" #include "cr-errno.h" #define VMA_OPT_LEN 128 static int xatol_base(const char *string, long *number, int base) { char *endptr; long nr; errno = 0; nr = strtol(string, &endptr, base); if ((errno == ERANGE && (nr == LONG_MAX || nr == LONG_MIN)) || (errno != 0 && nr == 0)) { pr_perror("failed to convert string '%s'", string); return -EINVAL; } if ((endptr == string) || (*endptr != '\0')) { pr_err("String is not a number: '%s'\n", string); return -EINVAL; } *number = nr; return 0; } int xatol(const char *string, long *number) { return xatol_base(string, number, 10); } int xatoi(const char *string, int *number) { long tmp; int err; err = xatol(string, &tmp); if (err) return err; if (tmp > INT_MAX || tmp < INT_MIN) { pr_err("value %#lx (%ld) is out of int range\n", tmp, tmp); return -ERANGE; } *number = (int)tmp; return 0; } /* * This function reallocates passed str pointer. * It means: * 1) passed pointer can be either NULL, or previously allocated by malloc. 
* 2) Passed pointer can' be reused. It's either freed in case of error or can * be changed. */ static char *xvstrcat(char *str, const char *fmt, va_list args) { size_t offset = 0, delta; int ret; char *new; va_list tmp; if (str) offset = strlen(str); delta = strlen(fmt) * 2; do { new = xrealloc(str, offset + delta); if (!new) { /* realloc failed. We must release former string */ xfree(str); pr_err("Failed to allocate string\n"); return new; } va_copy(tmp, args); ret = vsnprintf(new + offset, delta, fmt, tmp); va_end(tmp); if (ret < delta) /* an error, or all was written */ break; /* NOTE: vsnprintf returns the amount of bytes * to allocate. */ delta = ret + 1; str = new; } while (1); if (ret < 0) { /* vsnprintf failed */ pr_err("Failed to print string\n"); xfree(new); new = NULL; } return new; } char *xstrcat(char *str, const char *fmt, ...) { va_list args; va_start(args, fmt); str = xvstrcat(str, fmt, args); va_end(args); return str; } char *xsprintf(const char *fmt, ...) { va_list args; char *str; va_start(args, fmt); str = xvstrcat(NULL, fmt, args); va_end(args); return str; } static void vma_opt_str(const struct vma_area *v, char *opt) { int p = 0; #define opt2s(_o, _s) do { \ if (v->e->status & _o) \ p += sprintf(opt + p, _s " "); \ } while (0) opt[p] = '\0'; opt2s(VMA_AREA_REGULAR, "reg"); opt2s(VMA_AREA_STACK, "stk"); opt2s(VMA_AREA_VSYSCALL, "vsys"); opt2s(VMA_AREA_VDSO, "vdso"); opt2s(VMA_AREA_VVAR, "vvar"); opt2s(VMA_AREA_HEAP, "heap"); opt2s(VMA_FILE_PRIVATE, "fp"); opt2s(VMA_FILE_SHARED, "fs"); opt2s(VMA_ANON_SHARED, "as"); opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); #undef opt2s } void pr_vma(unsigned int loglevel, const struct vma_area *vma_area) { char opt[VMA_OPT_LEN]; memset(opt, 0, VMA_OPT_LEN); if (!vma_area) return; vma_opt_str(vma_area, opt); print_on_level(loglevel, "%#"PRIx64"-%#"PRIx64" (%"PRIi64"K) prot %#x flags %#x fdflags %#o st %#x off %#"PRIx64" " "%s shmid: %#"PRIx64"\n", 
vma_area->e->start, vma_area->e->end, KBYTES(vma_area_len(vma_area)), vma_area->e->prot, vma_area->e->flags, vma_area->e->fdflags, vma_area->e->status, vma_area->e->pgoff, opt, vma_area->e->shmid); } int close_safe(int *fd) { int ret = 0; if (*fd > -1) { ret = close(*fd); if (!ret) *fd = -1; else pr_perror("Unable to close fd %d", *fd); } return ret; } int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd) { int tmp; if (old_fd != new_fd) { /* make sure we won't clash with an inherit fd */ if (inherit_fd_resolve_clash(new_fd) < 0) return -1; if (!allow_reuse_fd) { if (fcntl(new_fd, F_GETFD) != -1 || errno != EBADF) { pr_err("fd %d already in use (called at %s:%d)\n", new_fd, file, line); return -1; } } tmp = dup2(old_fd, new_fd); if (tmp < 0) { pr_perror("Dup %d -> %d failed (called at %s:%d)", old_fd, new_fd, file, line); return tmp; } /* Just to have error message if failed */ close_safe(&old_fd); } return 0; } int move_fd_from(int *img_fd, int want_fd) { if (*img_fd == want_fd) { int tmp; tmp = dup(*img_fd); if (tmp < 0) { pr_perror("Can't dup file"); return -1; } close(*img_fd); *img_fd = tmp; } return 0; } /* * Cached opened /proc/$pid and /proc/self files. 
* Used for faster access to /proc/.../foo files * by using openat()-s */ static pid_t open_proc_pid = PROC_NONE; static int open_proc_fd = -1; static pid_t open_proc_self_pid; static int open_proc_self_fd = -1; static inline void set_proc_self_fd(int fd) { if (open_proc_self_fd >= 0) close(open_proc_self_fd); open_proc_self_fd = fd; open_proc_self_pid = getpid(); } static inline void set_proc_pid_fd(int pid, int fd) { if (open_proc_fd >= 0) close(open_proc_fd); open_proc_pid = pid; open_proc_fd = fd; } static inline int get_proc_fd(int pid) { if (pid == PROC_SELF) { if (open_proc_self_fd != -1 && open_proc_self_pid != getpid()) { close(open_proc_self_fd); open_proc_self_fd = -1; } return open_proc_self_fd; } else if (pid == open_proc_pid) return open_proc_fd; else return -1; } int close_pid_proc(void) { set_proc_self_fd(-1); set_proc_pid_fd(PROC_NONE, -1); return 0; } void close_proc() { close_pid_proc(); close_service_fd(PROC_FD_OFF); } int set_proc_fd(int fd) { if (install_service_fd(PROC_FD_OFF, fd) < 0) return -1; return 0; } static int open_proc_sfd(char *path) { int fd, ret; close_proc(); fd = open(path, O_DIRECTORY | O_PATH); if (fd == -1) { pr_perror("Can't open %s", path); return -1; } ret = install_service_fd(PROC_FD_OFF, fd); close(fd); if (ret < 0) return -1; return 0; } inline int open_pid_proc(pid_t pid) { char path[18]; int fd; int dfd; fd = get_proc_fd(pid); if (fd >= 0) return fd; dfd = get_service_fd(PROC_FD_OFF); if (dfd < 0) { if (open_proc_sfd("/proc") < 0) return -1; dfd = get_service_fd(PROC_FD_OFF); } if (pid == PROC_GEN) /* * Don't cache it, close_pid_proc() would * close service descriptor otherwise. 
*/ return dfd; if (pid == PROC_SELF) snprintf(path, sizeof(path), "self"); else snprintf(path, sizeof(path), "%d", pid); fd = openat(dfd, path, O_PATH); if (fd < 0) { pr_perror("Can't open %s", path); set_cr_errno(ESRCH); return -1; } if (pid == PROC_SELF) set_proc_self_fd(fd); else set_proc_pid_fd(pid, fd); return fd; } int do_open_proc(pid_t pid, int flags, const char *fmt, ...) { char path[128]; va_list args; int dirfd; dirfd = open_pid_proc(pid); if (dirfd < 0) return -1; va_start(args, fmt); vsnprintf(path, sizeof(path), fmt, args); va_end(args); return openat(dirfd, path, flags); } static int service_fd_rlim_cur; static int service_fd_id = 0; int init_service_fd(void) { struct rlimit64 rlimit; /* * Service FDs are those that most likely won't * conflict with any 'real-life' ones */ if (syscall(__NR_prlimit64, getpid(), RLIMIT_NOFILE, NULL, &rlimit)) { pr_perror("Can't get rlimit"); return -1; } service_fd_rlim_cur = (int)rlimit.rlim_cur; BUG_ON(service_fd_rlim_cur < SERVICE_FD_MAX); return 0; } static int __get_service_fd(enum sfd_type type, int service_fd_id) { return service_fd_rlim_cur - type - SERVICE_FD_MAX * service_fd_id; } int service_fd_min_fd(void) { return service_fd_rlim_cur - (SERVICE_FD_MAX - 1) - SERVICE_FD_MAX * service_fd_id; } static DECLARE_BITMAP(sfd_map, SERVICE_FD_MAX); int reserve_service_fd(enum sfd_type type) { int sfd = __get_service_fd(type, service_fd_id); BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX); set_bit(type, sfd_map); return sfd; } int install_service_fd(enum sfd_type type, int fd) { int sfd = __get_service_fd(type, service_fd_id); BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX); if (dup3(fd, sfd, O_CLOEXEC) != sfd) { pr_perror("Dup %d -> %d failed", fd, sfd); return -1; } set_bit(type, sfd_map); return sfd; } int get_service_fd(enum sfd_type type) { BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX); if (!test_bit(type, sfd_map)) return -1; return 
__get_service_fd(type, service_fd_id); } int criu_get_image_dir(void) { return get_service_fd(IMG_FD_OFF); } int close_service_fd(enum sfd_type type) { int fd; fd = get_service_fd(type); if (fd < 0) return 0; if (close_safe(&fd)) return -1; clear_bit(type, sfd_map); return 0; } int clone_service_fd(int id) { int ret = -1, i; if (service_fd_id == id) return 0; for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) { int old = get_service_fd(i); int new = __get_service_fd(i, id); if (old < 0) continue; ret = dup2(old, new); if (ret == -1) { if (errno == EBADF) continue; pr_perror("Unable to clone %d->%d", old, new); } } service_fd_id = id; ret = 0; return ret; } bool is_any_service_fd(int fd) { return fd > __get_service_fd(SERVICE_FD_MAX, service_fd_id) && fd < __get_service_fd(SERVICE_FD_MIN, service_fd_id); } bool is_service_fd(int fd, enum sfd_type type) { return fd == get_service_fd(type); } int copy_file(int fd_in, int fd_out, size_t bytes) { ssize_t written = 0; size_t chunk = bytes ? 
bytes : 4096; while (1) { ssize_t ret; ret = sendfile(fd_out, fd_in, NULL, chunk); if (ret < 0) { pr_perror("Can't send data to ghost file"); return -1; } if (ret == 0) { if (bytes && (written != bytes)) { pr_err("Ghost file size mismatch %zu/%zu\n", written, bytes); return -1; } break; } written += ret; } return 0; } int read_fd_link(int lfd, char *buf, size_t size) { char t[32]; ssize_t ret; snprintf(t, sizeof(t), "/proc/self/fd/%d", lfd); ret = readlink(t, buf, size); if (ret < 0) { pr_perror("Can't read link of fd %d", lfd); return -1; } else if ((size_t)ret >= size) { pr_err("Buffer for read link of fd %d is too small\n", lfd); return -1; } buf[ret] = 0; return ret; } int is_anon_link_type(char *link, char *type) { char aux[32]; snprintf(aux, sizeof(aux), "anon_inode:%s", type); return !strcmp(link, aux); } #define DUP_SAFE(fd, out) \ ({ \ int ret__; \ ret__ = dup(fd); \ if (ret__ == -1) { \ pr_perror("dup(%d) failed", fd); \ goto out; \ } \ ret__; \ }) /* * If "in" is negative, stdin will be closed. * If "out" or "err" are negative, a log file descriptor will be used. 
*/ int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags) { return cr_system_userns(in, out, err, cmd, argv, flags, -1); } int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid) { sigset_t blockmask, oldmask; int ret = -1, status; pid_t pid; sigemptyset(&blockmask); sigaddset(&blockmask, SIGCHLD); if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { pr_perror("Can not set mask of blocked signals"); return -1; } pid = fork(); if (pid == -1) { pr_perror("fork() failed"); goto out; } else if (pid == 0) { if (userns_pid > 0) { if (switch_ns(userns_pid, &user_ns_desc, NULL)) goto out_chld; if (setuid(0) || setgid(0)) { pr_perror("Unable to set uid or gid"); goto out_chld; } } if (out < 0) out = DUP_SAFE(log_get_fd(), out_chld); if (err < 0) err = DUP_SAFE(log_get_fd(), out_chld); /* * out, err, in should be a separate fds, * because reopen_fd_as() closes an old fd */ if (err == out || err == in) err = DUP_SAFE(err, out_chld); if (out == in) out = DUP_SAFE(out, out_chld); if (move_fd_from(&out, STDIN_FILENO) || move_fd_from(&err, STDIN_FILENO)) goto out_chld; if (in < 0) { close(STDIN_FILENO); } else { if (reopen_fd_as_nocheck(STDIN_FILENO, in)) goto out_chld; } if (move_fd_from(&err, STDOUT_FILENO)) goto out_chld; if (reopen_fd_as_nocheck(STDOUT_FILENO, out)) goto out_chld; if (reopen_fd_as_nocheck(STDERR_FILENO, err)) goto out_chld; execvp(cmd, argv); pr_perror("exec(%s, ...) 
failed", cmd); out_chld: _exit(1); } while (1) { ret = waitpid(pid, &status, 0); if (ret == -1) { pr_perror("waitpid() failed"); goto out; } if (WIFEXITED(status)) { if (!(flags & CRS_CAN_FAIL) && WEXITSTATUS(status)) pr_err("exited, status=%d\n", WEXITSTATUS(status)); break; } else if (WIFSIGNALED(status)) { pr_err("killed by signal %d: %s\n", WTERMSIG(status), strsignal(WTERMSIG(status))); break; } else if (WIFSTOPPED(status)) { pr_err("stopped by signal %d\n", WSTOPSIG(status)); } else if (WIFCONTINUED(status)) { pr_err("continued\n"); } } ret = status ? -1 : 0; out: if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { pr_perror("Can not unset mask of blocked signals"); BUG(); } return ret; } int close_status_fd(void) { char c = 0; if (opts.status_fd < 0) return 0; if (write(opts.status_fd, &c, 1) != 1) { pr_perror("Unable to write into the status fd"); return -1; } return close_safe(&opts.status_fd); } int cr_daemon(int nochdir, int noclose, int *keep_fd, int close_fd) { int pid; pid = fork(); if (pid < 0) { pr_perror("Can't fork"); return -1; } if (pid > 0) return pid; setsid(); if (!nochdir) if (chdir("/") == -1) pr_perror("Can't change directory"); if (!noclose) { int fd; if (close_fd != -1) close(close_fd); if ((*keep_fd != -1) && (*keep_fd != 3)) { fd = dup2(*keep_fd, 3); if (fd < 0) { pr_perror("Dup2 failed"); return -1; } close(*keep_fd); *keep_fd = fd; } fd = open("/dev/null", O_RDWR); if (fd < 0) { pr_perror("Can't open /dev/null"); return -1; } dup2(fd, 0); dup2(fd, 1); dup2(fd, 2); close(fd); } return 0; } int is_root_user() { if (geteuid() != 0) { pr_err("You need to be root to run this command\n"); return 0; } return 1; } int is_empty_dir(int dirfd) { int ret = 0; DIR *fdir = NULL; struct dirent *de; fdir = fdopendir(dirfd); if (!fdir) return -1; while ((de = readdir(fdir))) { if (dir_dots(de)) continue; goto out; } ret = 1; out: closedir(fdir); return ret; } /* * Get PFN from pagemap file for virtual address vaddr. 
* Optionally if fd >= 0, it's used as pagemap file descriptor * (may be other task's pagemap) */ int vaddr_to_pfn(int fd, unsigned long vaddr, u64 *pfn) { int ret = -1; off_t off; bool close_fd = false; if (fd < 0) { fd = open_proc(PROC_SELF, "pagemap"); if (fd < 0) return -1; close_fd = true; } off = (vaddr / page_size()) * sizeof(u64); ret = pread(fd, pfn, sizeof(*pfn), off); if (ret != sizeof(*pfn)) { pr_perror("Can't read pme for pid %d", getpid()); ret = -1; } else { *pfn &= PME_PFRAME_MASK; ret = 0; } if (close_fd) close(fd); return ret; } /* * Note since VMA_AREA_NONE = 0 we can skip assignment * here and simply rely on xzalloc */ struct vma_area *alloc_vma_area(void) { struct vma_area *p; p = xzalloc(sizeof(*p) + sizeof(VmaEntry)); if (p) { p->e = (VmaEntry *)(p + 1); vma_entry__init(p->e); p->e->fd = -1; } return p; } int mkdirpat(int fd, const char *path, int mode) { size_t i; char made_path[PATH_MAX], *pos; if (strlen(path) >= PATH_MAX) { pr_err("path %s is longer than PATH_MAX\n", path); return -ENOSPC; } strcpy(made_path, path); i = 0; if (made_path[0] == '/') i++; for (; i < strlen(made_path); i++) { pos = strchr(made_path + i, '/'); if (pos) *pos = '\0'; if (mkdirat(fd, made_path, mode) < 0 && errno != EEXIST) { int ret = -errno; pr_perror("couldn't mkdirpat directory %s", made_path); return ret; } if (pos) { *pos = '/'; i = pos - made_path; } else break; } return 0; } bool is_path_prefix(const char *path, const char *prefix) { if (strstartswith(path, prefix)) { size_t len = strlen(prefix); switch (path[len]) { case '\0': case '/': return true; } } return false; } FILE *fopenat(int dirfd, char *path, char *cflags) { int tmp, flags = 0; char *iter; for (iter = cflags; *iter; iter++) { switch (*iter) { case 'r': flags |= O_RDONLY; break; case 'a': flags |= O_APPEND; break; case 'w': flags |= O_WRONLY | O_CREAT; break; case '+': flags = O_RDWR | O_CREAT; break; } } tmp = openat(dirfd, path, flags, S_IRUSR | S_IWUSR); if (tmp < 0) return NULL; return 
fdopen(tmp, cflags); } void split(char *str, char token, char ***out, int *n) { int i; char *cur; *n = 0; for (cur = str; cur != NULL; cur = strchr(cur, token)) { (*n)++; cur++; } *out = xmalloc((*n) * sizeof(char *)); if (!*out) { *n = -1; return; } cur = str; i = 0; do { char *prev = cur; cur = strchr(cur, token); if (cur) *cur = '\0'; (*out)[i] = xstrdup(prev); if (cur) { *cur = token; cur++; } if (!(*out)[i]) { int j; for (j = 0; j < i; j++) xfree((*out)[j]); xfree(*out); *out = NULL; *n = -1; return; } i++; } while(cur); } int fd_has_data(int lfd) { struct pollfd pfd = {lfd, POLLIN, 0}; int ret; ret = poll(&pfd, 1, 0); if (ret < 0) { pr_perror("poll() failed"); } return ret; } int make_yard(char *path) { if (mount("none", path, "tmpfs", 0, NULL)) { pr_perror("Unable to mount tmpfs in %s", path); return -1; } if (mount("none", path, NULL, MS_PRIVATE, NULL)) { pr_perror("Unable to mark yard as private"); return -1; } return 0; } const char *ns_to_string(unsigned int ns) { switch (ns) { case CLONE_NEWIPC: return "ipc"; case CLONE_NEWNS: return "mnt"; case CLONE_NEWNET: return "net"; case CLONE_NEWPID: return "pid"; case CLONE_NEWUSER: return "user"; case CLONE_NEWUTS: return "uts"; default: return NULL; } } void tcp_cork(int sk, bool on) { int val = on ? 1 : 0; setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val)); } void tcp_nodelay(int sk, bool on) { int val = on ? 1 : 0; setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val)); } static inline void pr_xsym(unsigned char *data, size_t len, int pos) { char sym; if (pos < len) sym = data[pos]; else sym = ' '; pr_msg("%c", isprint(sym) ? 
sym : '.'); } static inline void pr_xdigi(unsigned char *data, size_t len, int pos) { if (pos < len) pr_msg("%02x ", data[pos]); else pr_msg(" "); } static int nice_width_for(unsigned long addr) { int ret = 3; while (addr) { addr >>= 4; ret++; } return ret; } void print_data(unsigned long addr, unsigned char *data, size_t size) { int i, j, addr_len; unsigned zero_line = 0; addr_len = nice_width_for(addr + size); for (i = 0; i < size; i += 16) { if (*(u64 *)(data + i) == 0 && *(u64 *)(data + i + 8) == 0) { if (zero_line == 0) zero_line = 1; else { if (zero_line == 1) { pr_msg("*\n"); zero_line = 2; } continue; } } else zero_line = 0; pr_msg("%#0*lx: ", addr_len, addr + i); for (j = 0; j < 8; j++) pr_xdigi(data, size, i + j); pr_msg(" "); for (j = 8; j < 16; j++) pr_xdigi(data, size, i + j); pr_msg(" |"); for (j = 0; j < 8; j++) pr_xsym(data, size, i + j); pr_msg(" "); for (j = 8; j < 16; j++) pr_xsym(data, size, i + j); pr_msg("|\n"); } } static int get_sockaddr_in(struct sockaddr_in *addr, char *host) { memset(addr, 0, sizeof(*addr)); addr->sin_family = AF_INET; if (!host) addr->sin_addr.s_addr = INADDR_ANY; else if (!inet_aton(host, &addr->sin_addr)) { pr_perror("Bad server address"); return -1; } addr->sin_port = opts.port; return 0; } int setup_tcp_server(char *type) { int sk = -1; struct sockaddr_in saddr; socklen_t slen = sizeof(saddr); pr_info("Starting %s server on port %u\n", type, (int)ntohs(opts.port)); sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if (sk < 0) { pr_perror("Can't init %s server", type); return -1; } if (get_sockaddr_in(&saddr, opts.addr)) goto out; if (bind(sk, (struct sockaddr *)&saddr, slen)) { pr_perror("Can't bind %s server", type); goto out; } if (listen(sk, 1)) { pr_perror("Can't listen on %s server socket", type); goto out; } /* Get socket port in case of autobind */ if (opts.port == 0) { if (getsockname(sk, (struct sockaddr *)&saddr, &slen)) { pr_perror("Can't get %s server name", type); goto out; } opts.port = 
ntohs(saddr.sin_port); pr_info("Using %u port\n", opts.port); } return sk; out: close(sk); return -1; } int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) { int ret; struct sockaddr_in caddr; socklen_t clen = sizeof(caddr); if (daemon_mode) { ret = cr_daemon(1, 0, ask, cfd); if (ret == -1) { pr_err("Can't run in the background\n"); goto out; } if (ret > 0) { /* parent task, daemon started */ close_safe(&sk); if (opts.pidfile) { if (write_pidfile(ret) == -1) { pr_perror("Can't write pidfile"); kill(ret, SIGKILL); waitpid(ret, NULL, 0); return -1; } } return ret; } } if (close_status_fd()) return -1; if (sk >= 0) { ret = *ask = accept(sk, (struct sockaddr *)&caddr, &clen); if (*ask < 0) pr_perror("Can't accept connection to server"); else pr_info("Accepted connection from %s:%u\n", inet_ntoa(caddr.sin_addr), (int)ntohs(caddr.sin_port)); close(sk); } return 0; out: close(sk); return -1; } int setup_tcp_client(char *addr) { struct sockaddr_in saddr; int sk; pr_info("Connecting to server %s:%u\n", addr, (int)ntohs(opts.port)); if (get_sockaddr_in(&saddr, addr)) return -1; sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if (sk < 0) { pr_perror("Can't create socket"); return -1; } if (connect(sk, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) { pr_perror("Can't connect to server"); close(sk); return -1; } return sk; } int epoll_add_rfd(int epfd, struct epoll_rfd *rfd) { struct epoll_event ev; ev.events = EPOLLIN; ev.data.ptr = rfd; if (epoll_ctl(epfd, EPOLL_CTL_ADD, rfd->fd, &ev) == -1) { pr_perror("epoll_ctl failed"); return -1; } return 0; } int epoll_del_rfd(int epfd, struct epoll_rfd *rfd) { if (epoll_ctl(epfd, EPOLL_CTL_DEL, rfd->fd, NULL) == -1) { pr_perror("epoll_ctl failed"); return -1; } return 0; } int epoll_run_rfds(int epollfd, struct epoll_event *evs, int nr_fds, int timeout) { int ret, i, nr_events; bool have_a_break = false; while (1) { /* FIXME -- timeout should decrease over time... 
*/ ret = epoll_wait(epollfd, evs, nr_fds, timeout); if (ret <= 0) { if (ret < 0) pr_perror("polling failed"); break; } nr_events = ret; for (i = 0; i < nr_events; i++) { struct epoll_rfd *rfd; rfd = (struct epoll_rfd *)evs[i].data.ptr; ret = rfd->revent(rfd); if (ret < 0) goto out; if (ret > 0) have_a_break = true; } if (have_a_break) return 1; } out: return ret; } int epoll_prepare(int nr_fds, struct epoll_event **events) { int epollfd; *events = xmalloc(sizeof(struct epoll_event) * nr_fds); if (!*events) return -1; epollfd = epoll_create(nr_fds); if (epollfd == -1) { pr_perror("epoll_create failed"); goto free_events; } return epollfd; free_events: xfree(*events); return -1; } criu-3.6/criu/uts_ns.c000066400000000000000000000024761317335042600147620ustar00rootroot00000000000000#include #include #include #include #include #include "util.h" #include "namespaces.h" #include "sysctl.h" #include "uts_ns.h" #include "protobuf.h" #include "images/utsns.pb-c.h" int dump_uts_ns(int ns_id) { int ret; struct cr_img *img; struct utsname ubuf; UtsnsEntry ue = UTSNS_ENTRY__INIT; img = open_image(CR_FD_UTSNS, O_DUMP, ns_id); if (!img) return -1; ret = uname(&ubuf); if (ret < 0) { pr_perror("Error calling uname"); goto err; } ue.nodename = ubuf.nodename; ue.domainname = ubuf.domainname; ret = pb_write_one(img, &ue, PB_UTSNS); err: close_image(img); return ret < 0 ? 
-1 : 0; } int prepare_utsns(int pid) { int ret; struct cr_img *img; UtsnsEntry *ue; struct sysctl_req req[] = { { "kernel/hostname" }, { "kernel/domainname" }, }; img = open_image(CR_FD_UTSNS, O_RSTR, pid); if (!img) return -1; ret = pb_read_one(img, &ue, PB_UTSNS); if (ret < 0) goto out; req[0].arg = ue->nodename; req[0].type = CTL_STR(strlen(ue->nodename)); req[1].arg = ue->domainname; req[1].type = CTL_STR(strlen(ue->domainname)); ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWUTS); utsns_entry__free_unpacked(ue, NULL); out: close_image(img); return ret; } struct ns_desc uts_ns_desc = NS_DESC_ENTRY(CLONE_NEWUTS, "uts"); criu-3.6/criu/vdso-compat.c000066400000000000000000000044611317335042600156770ustar00rootroot00000000000000#include #include #include #include #include "types.h" #include "parasite-syscall.h" #include "parasite.h" #include "vdso.h" static void exit_on(int ret, int err_fd, char *reason) { if (ret) { syscall(__NR_write, err_fd, reason, strlen(reason)); syscall(__NR_exit, ret); } } /* * Because of restrictions of ARCH_MAP_VDSO_* API, new vDSO blob * can be mapped only if there is no vDSO blob present for a process. * This is a helper process, it unmaps 64-bit vDSO and maps 32-bit vDSO. * Then it copies vDSO blob to shared with CRIU mapping. * * The purpose is to fill compat vdso's symtable (vdso_compat_rt). * It's an optimization to fill symtable only once at CRIU restore * for all restored tasks. * * @native - 64-bit vDSO blob (for easy unmap) * @pipe_fd - to get size of compat blob from /proc/.../maps * @err_fd - to print error messages * @vdso_buf, buf_size - shared with CRIU buffer * * WARN: This helper shouldn't call pr_err() or any syscall with * Glibc's wrapper function - it may very likely blow up. 
*/ void compat_vdso_helper(struct vdso_maps *native, int pipe_fd, int err_fd, void *vdso_buf, size_t buf_size) { void *vdso_addr; long vdso_size; long ret; if (native->vdso_start != VDSO_BAD_ADDR) { ret = syscall(__NR_munmap, native->vdso_start, native->sym.vdso_size); exit_on(ret, err_fd, "Error: Failed to unmap native vdso\n"); } if (native->vvar_start != VVAR_BAD_ADDR) { ret = syscall(__NR_munmap, native->vvar_start, native->sym.vvar_size); exit_on(ret, err_fd, "Error: Failed to unmap native vvar\n"); } ret = syscall(__NR_arch_prctl, ARCH_MAP_VDSO_32, native->vdso_start); if (ret < 0) exit_on(ret, err_fd, "Error: ARCH_MAP_VDSO failed\n"); vdso_size = ret; if (vdso_size > buf_size) exit_on(-1, err_fd, "Error: Compatible vdso's size is bigger than reserved buf\n"); /* Stop so CRIU could parse smaps to find 32-bit vdso's size */ ret = syscall(__NR_kill, syscall(__NR_getpid), SIGSTOP); exit_on(ret, err_fd, "Error: Can't stop myself with SIGSTOP (having a good time)\n"); ret = syscall(__NR_read, pipe_fd, &vdso_addr, sizeof(void *)); if (ret != sizeof(void *)) exit_on(-1, err_fd, "Error: Can't read size of mmaped vdso from pipe\n"); memcpy(vdso_buf, vdso_addr, vdso_size); syscall(__NR_exit, 0); } criu-3.6/criu/vdso.c000066400000000000000000000404131317335042600144130ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "types.h" #include "parasite-syscall.h" #include "parasite.h" #include "common/compiler.h" #include "kerndat.h" #include "vdso.h" #include "util.h" #include "criu-log.h" #include "mem.h" #include "vma.h" #include #include #ifdef LOG_PREFIX # undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " u64 vdso_pfn = VDSO_BAD_PFN; struct vdso_maps vdso_maps = VDSO_MAPS_INIT; struct vdso_maps vdso_maps_compat = VDSO_MAPS_INIT; /* * Starting with 3.16 the [vdso]/[vvar] marks are reported correctly * even when they are remapped into a new place, but only since that * particular version of the 
kernel! * On previous kernels we need to check if vma is vdso by some means: * - if pagemap is present, by pfn * - by parsing ELF and filling vdso symtable otherwise */ enum vdso_check_t { /* from slowest to fastest */ VDSO_CHECK_SYMS = 0, VDSO_CHECK_PFN, VDSO_NO_CHECK, }; static enum vdso_check_t get_vdso_check_type(struct parasite_ctl *ctl) { /* * ia32 C/R depends on mremap() for vdso patches (v4.8), * so we can omit any check and be sure that "[vdso]" * hint stays in /proc/../maps file and is correct. */ if (!compel_mode_native(ctl)) { pr_info("Don't check vdso for compat task\n"); return VDSO_NO_CHECK; } if (kdat.vdso_hint_reliable) { pr_info("vDSO hint is reliable - omit checking\n"); return VDSO_NO_CHECK; } if (kdat.pmap == PM_FULL) { pr_info("Check vdso by pfn from pagemap\n"); return VDSO_CHECK_PFN; } pr_info("Pagemap is unavailable, check vdso by filling symtable\n"); return VDSO_CHECK_SYMS; } static int check_vdso_by_pfn(int pagemap_fd, struct vma_area *vma, bool *has_vdso_pfn) { u64 pfn = VDSO_BAD_PFN; if (vaddr_to_pfn(pagemap_fd, vma->e->start, &pfn)) return -1; if (!pfn) { pr_err("Unexpected page frame number 0\n"); return -1; } if ((pfn == vdso_pfn && pfn != VDSO_BAD_PFN)) *has_vdso_pfn = true; else *has_vdso_pfn = false; return 0; } static bool not_vvar_or_vdso(struct vma_area *vma) { if (!vma_area_is(vma, VMA_AREA_REGULAR)) return true; if (vma_area_is(vma, VMA_FILE_SHARED)) return true; if (vma_area_is(vma, VMA_FILE_PRIVATE)) return true; if (vma->e->start > kdat.task_size) return true; if (vma->e->flags & MAP_GROWSDOWN) return true; BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT)); if ((vma->e->prot & VVAR_PROT) != VVAR_PROT) return true; return false; } /* Contains addresses from vdso mark */ struct vdso_quarter { unsigned long orig_vdso; unsigned long orig_vvar; unsigned long rt_vdso; unsigned long rt_vvar; }; static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter *addr, struct vma_area *rt_vdso_marked) { struct vma_area 
*rt_vvar_marked = NULL; struct vma_area *vma; if (!rt_vdso_marked) return; /* * There is marked vdso, it means such vdso is autogenerated * and must be dropped from vma list. */ pr_debug("vdso: Found marked at %lx (orig vDSO at %lx VVAR at %lx)\n", (long)rt_vdso_marked->e->start, addr->orig_vdso, addr->orig_vvar); /* * Don't forget to restore the proxy vdso/vvar status, since * they're unknown to the kernel. * Also BTW search for rt-vvar to remove it later. */ list_for_each_entry(vma, &vma_area_list->h, list) { if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); } else if (vma->e->start == addr->orig_vvar) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR; pr_debug("vdso: Restore orig VVAR status at %lx\n", (long)vma->e->start); } else if (addr->rt_vvar != VVAR_BAD_ADDR && addr->rt_vvar == vma->e->start) { BUG_ON(rt_vvar_marked); if (not_vvar_or_vdso(vma)) { pr_warn("Mark in rt-vdso points to vma, that doesn't look like vvar - skipping unmap\n"); continue; } rt_vvar_marked = vma; } } pr_debug("vdso: Droppping marked vdso at %lx\n", (long)rt_vdso_marked->e->start); list_del(&rt_vdso_marked->list); xfree(rt_vdso_marked); vma_area_list->nr--; if (rt_vvar_marked) { pr_debug("vdso: Droppping marked vvar at %lx\n", (long)rt_vvar_marked->e->start); list_del(&rt_vvar_marked->list); xfree(rt_vvar_marked); vma_area_list->nr--; } } /* * I need to poke every potentially marked vma, * otherwise if task never called for vdso functions * page frame number won't be reported. * * Moreover, if page frame numbers are not accessible * we have to scan the vma zone for vDSO elf structure * which gonna be a slow way. 
*/ static int check_if_vma_is_vdso(enum vdso_check_t vcheck, int pagemap_fd, struct parasite_ctl *ctl, struct vma_area *vma, struct vma_area **rt_vdso_marked, struct vdso_quarter *addr) { struct parasite_vdso_vma_entry *args; bool has_vdso_pfn = false; args = compel_parasite_args(ctl, struct parasite_vdso_vma_entry); if (not_vvar_or_vdso(vma)) return 0; if ((vma->e->prot & VDSO_PROT) != VDSO_PROT) return 0; args->start = vma->e->start; args->len = vma_area_len(vma); args->try_fill_symtable = (vcheck == VDSO_CHECK_SYMS); args->is_vdso = false; if (compel_rpc_call_sync(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) { pr_err("Parasite failed to poke for mark\n"); return -1; } if (unlikely(args->is_marked)) { if (*rt_vdso_marked) { pr_err("Ow! Second vdso mark detected!\n"); return -1; } *rt_vdso_marked = vma; addr->orig_vdso = args->orig_vdso_addr; addr->orig_vvar = args->orig_vvar_addr; addr->rt_vvar = args->rt_vvar_addr; return 0; } if (vcheck == VDSO_NO_CHECK) return 0; if (vcheck == VDSO_CHECK_PFN) { if (check_vdso_by_pfn(pagemap_fd, vma, &has_vdso_pfn) < 0) { pr_err("Failed checking vdso by pfn\n"); return -1; } } if (has_vdso_pfn || args->is_vdso) { if (!vma_area_is(vma, VMA_AREA_VDSO)) { pr_debug("Restore vDSO status by pfn/symtable at %lx\n", (long)vma->e->start); vma->e->status |= VMA_AREA_VDSO; } } else { if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) { pr_debug("Drop mishinted vDSO status at %lx\n", (long)vma->e->start); vma->e->status &= ~VMA_AREA_VDSO; } } return 0; } /* * The VMAs list might have proxy vdso/vvar areas left * from previous dump/restore cycle so we need to detect * them and eliminated from the VMAs list, they will be * generated again on restore if needed. 
*/ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list *vma_area_list) { struct vma_area *rt_vdso_marked = NULL; struct vdso_quarter addr = { .orig_vdso = VDSO_BAD_ADDR, .orig_vvar = VVAR_BAD_ADDR, .rt_vdso = VDSO_BAD_ADDR, .rt_vvar = VVAR_BAD_ADDR, }; enum vdso_check_t vcheck; struct vma_area *vma; int fd = -1; vcheck = get_vdso_check_type(ctl); if (vcheck == VDSO_CHECK_PFN) { BUG_ON(vdso_pfn == VDSO_BAD_PFN); fd = open_proc(pid, "pagemap"); if (fd < 0) return -1; } list_for_each_entry(vma, &vma_area_list->h, list) { /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO * area status. */ if (check_if_vma_is_vdso(vcheck, fd, ctl, vma, &rt_vdso_marked, &addr)) { close_safe(&fd); return -1; } } drop_rt_vdso(vma_area_list, &addr, rt_vdso_marked); close_safe(&fd); return 0; } static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) { int exit_code = -1; char *buf; struct bfd f; *s = (struct vdso_maps)VDSO_MAPS_INIT; f.fd = open_proc(pid, "maps"); if (f.fd < 0) return -1; if (bfdopenr(&f)) goto err; while (1) { unsigned long start, end; char *has_vdso, *has_vvar; buf = breadline(&f); if (buf == NULL) break; if (IS_ERR(buf)) goto err; has_vdso = strstr(buf, "[vdso]"); if (!has_vdso) has_vvar = strstr(buf, "[vvar]"); else has_vvar = NULL; if (!has_vdso && !has_vvar) continue; if (sscanf(buf, "%lx-%lx", &start, &end) != 2) { pr_err("Can't find vDSO/VVAR bounds\n"); goto err; } if (has_vdso) { if (s->vdso_start != VDSO_BAD_ADDR) { pr_err("Got second vDSO entry\n"); goto err; } s->vdso_start = start; s->sym.vdso_size = end - start; } else { if (s->vvar_start != VVAR_BAD_ADDR) { pr_err("Got second VVAR entry\n"); goto err; } s->vvar_start = start; s->sym.vvar_size = end - start; } } if (s->vdso_start != VDSO_BAD_ADDR && s->vvar_start != VVAR_BAD_ADDR) s->sym.vdso_before_vvar = (s->vdso_start < s->vvar_start); exit_code = 0; err: bclose(&f); return exit_code; } static int validate_vdso_addr(struct 
vdso_maps *s) { unsigned long vdso_end = s->vdso_start + s->sym.vdso_size; unsigned long vvar_end = s->vvar_start + s->sym.vvar_size; /* * Validate its structure -- for new vDSO format the * structure must be like * * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso] * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar] * * The areas may be in reverse order. * * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar] * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso] * */ if (s->vdso_start != VDSO_BAD_ADDR) { if (s->vvar_start != VVAR_BAD_ADDR) { if (vdso_end != s->vvar_start && vvar_end != s->vdso_start) { pr_err("Unexpected rt vDSO area bounds\n"); return -1; } } } else { pr_err("Can't find rt vDSO\n"); return -1; } return 0; } static int vdso_fill_self_symtable(struct vdso_maps *s) { if (s->vdso_start == VDSO_BAD_ADDR || s->sym.vdso_size == VDSO_BAD_SIZE) return -1; if (vdso_fill_symtable(s->vdso_start, s->sym.vdso_size, &s->sym)) return -1; if (validate_vdso_addr(s)) return -1; pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n", s->vdso_start, s->vdso_start + s->sym.vdso_size, s->vvar_start, s->vvar_start + s->sym.vvar_size); return 0; } #ifdef CONFIG_COMPAT static int vdso_mmap_compat(struct vdso_maps *native, struct vdso_maps *compat, void *vdso_buf, size_t buf_size) { pid_t pid; int status, ret = -1; int fds[2]; if (pipe(fds)) { pr_perror("Failed to open pipe"); return -1; } pid = fork(); if (pid == 0) { if (close(fds[1])) { pr_perror("Failed to close pipe"); syscall(__NR_exit, 1); } compat_vdso_helper(native, fds[0], log_get_fd(), vdso_buf, buf_size); BUG(); } if (close(fds[0])) { pr_perror("Failed to close pipe"); goto out_kill; } waitpid(pid, &status, WUNTRACED); if (WIFEXITED(status)) { pr_err("Compat vdso helper exited with %d\n", WEXITSTATUS(status)); goto out_kill; } if (!WIFSTOPPED(status)) { pr_err("Compat vdso helper isn't stopped\n"); goto out_kill; } if (vdso_parse_maps(pid, compat)) goto out_kill; if (validate_vdso_addr(compat)) goto 
out_kill; if (kill(pid, SIGCONT)) { pr_perror("Failed to kill(SIGCONT) for compat vdso helper\n"); goto out_kill; } if (write(fds[1], &compat->vdso_start, sizeof(void *)) != sizeof(compat->vdso_start)) { pr_perror("Failed write to pipe\n"); goto out_kill; } waitpid(pid, &status, WUNTRACED); if (WIFEXITED(status)) { ret = WEXITSTATUS(status); if (ret) pr_err("Helper for mmaping compat vdso failed with %d\n", ret); goto out_close; } pr_err("Compat vDSO helper didn't exit, status: %d\n", status); out_kill: kill(pid, SIGKILL); out_close: if (close(fds[1])) pr_perror("Failed to close pipe"); return ret; } #define COMPAT_VDSO_BUF_SZ (PAGE_SIZE*2) static int vdso_fill_compat_symtable(struct vdso_maps *native, struct vdso_maps *compat) { void *vdso_mmap; int ret = -1; if (!kdat.compat_cr) return 0; vdso_mmap = mmap(NULL, COMPAT_VDSO_BUF_SZ, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); if (vdso_mmap == MAP_FAILED) { pr_perror("Failed to mmap buf for compat vdso"); return -1; } if (vdso_mmap_compat(native, compat, vdso_mmap, COMPAT_VDSO_BUF_SZ)) { pr_err("Failed to mmap compatible vdso with helper process\n"); goto out_unmap; } if (vdso_fill_symtable_compat((uintptr_t)vdso_mmap, compat->sym.vdso_size, &compat->sym)) { pr_err("Failed to parse mmaped compatible vdso blob\n"); goto out_unmap; } pr_debug("compat [vdso] %lx-%lx [vvar] %lx-%lx\n", compat->vdso_start, compat->vdso_start + compat->sym.vdso_size, compat->vvar_start, compat->vvar_start + compat->sym.vvar_size); ret = 0; out_unmap: if (munmap(vdso_mmap, COMPAT_VDSO_BUF_SZ)) pr_perror("Failed to unmap buf for compat vdso"); return ret; } #endif /* CONFIG_COMPAT */ int vdso_init_dump(void) { if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); return -1; } if (kdat.pmap != PM_FULL) pr_info("VDSO detection turned off\n"); else if (vaddr_to_pfn(-1, vdso_maps.vdso_start, &vdso_pfn)) return -1; return 0; } /* * Check vdso/vvar sized read from maps to kdat 
values. * We do not read /proc/self/maps for compatible vdso as it's * not parked as run-time vdso in restorer, but mapped with * arch_prlctl(MAP_VDSO_32) API. * By that reason we verify only native sizes. */ static int is_kdat_vdso_sym_valid(void) { if (vdso_maps.sym.vdso_size != kdat.vdso_sym.vdso_size) return false; if (vdso_maps.sym.vvar_size != kdat.vdso_sym.vvar_size) return false; return true; } int vdso_init_restore(void) { if (kdat.vdso_sym.vdso_size == VDSO_BAD_SIZE) { pr_err("Kdat has empty vdso symtable\n"); return -1; } /* Already filled vdso_maps during kdat test */ if (vdso_maps.vdso_start != VDSO_BAD_ADDR) return 0; /* * Parsing self-maps here only to find vvar/vdso vmas in * criu's address space, for further remapping to restorer's * parking zone. Don't need to do this if map-vdso API * is present. */ if (!kdat.can_map_vdso) { if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); return -1; } if (!is_kdat_vdso_sym_valid()) { pr_err("Kdat sizes of vdso/vvar differ to maps file \n"); return -1; } } vdso_maps.sym = kdat.vdso_sym; #ifdef CONFIG_COMPAT vdso_maps_compat.sym = kdat.vdso_sym_compat; #endif return 0; } int kerndat_vdso_fill_symtable(void) { if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); return -1; } if (vdso_fill_self_symtable(&vdso_maps)) { pr_err("Failed to fill self vdso symtable\n"); return -1; } kdat.vdso_sym = vdso_maps.sym; #ifdef CONFIG_COMPAT if (vdso_fill_compat_symtable(&vdso_maps, &vdso_maps_compat)) { pr_err("Failed to fill compat vdso symtable\n"); return -1; } kdat.vdso_sym_compat = vdso_maps_compat.sym; #endif return 0; } /* * On x86 pre-v3.16 kernels can lose "[vdso]" hint * in /proc/.../maps file after mremap()'ing vdso vma. * Depends on kerndat_vdso_fill_symtable() - assuming that * vdso_maps and vdso_maps_compat are filled. 
*/ int kerndat_vdso_preserves_hint(void) { struct vdso_maps vdso_maps_after; int status, ret = -1; pid_t child; kdat.vdso_hint_reliable = 0; if (vdso_maps.vdso_start == VDSO_BAD_ADDR) return 0; child = fork(); if (child < 0) { pr_perror("fork() failed"); return -1; } if (child == 0) { unsigned long vdso_addr = vdso_maps.vdso_start; unsigned long vdso_size = vdso_maps.sym.vdso_size; void *new_addr; new_addr = mmap(0, vdso_size, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); if (new_addr == MAP_FAILED) exit(1); child = getpid(); new_addr = (void *)syscall(SYS_mremap, vdso_addr, vdso_size, vdso_size, MREMAP_MAYMOVE | MREMAP_FIXED, new_addr); if (new_addr == MAP_FAILED) syscall(SYS_exit, 2); syscall(SYS_kill, child, SIGSTOP); syscall(SYS_exit, 3); } waitpid(child, &status, WUNTRACED); if (WIFEXITED(status)) { int ret = WEXITSTATUS(status); pr_err("Child unexpectedly exited with %d\n", ret); goto out; } else if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); pr_err("Child unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); goto out; } else if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { pr_err("Child is unstoppable or was stopped by other means\n"); goto out_kill; } if (vdso_parse_maps(child, &vdso_maps_after)) { pr_err("Failed parsing maps for child helper\n"); goto out_kill; } if (vdso_maps_after.vdso_start != VDSO_BAD_ADDR) kdat.vdso_hint_reliable = 1; ret = 0; out_kill: kill(child, SIGKILL); waitpid(child, &status, 0); out: return ret; } criu-3.6/images/000077500000000000000000000000001317335042600135755ustar00rootroot00000000000000criu-3.6/images/Makefile000066400000000000000000000073571317335042600152510ustar00rootroot00000000000000proto-obj-y += stats.o proto-obj-y += core.o proto-obj-y += core-x86.o proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o proto-obj-y += core-s390.o proto-obj-y += cpuinfo.o proto-obj-y += inventory.o proto-obj-y += fdinfo.o proto-obj-y += fown.o proto-obj-y += ns.o proto-obj-y += 
regfile.o proto-obj-y += ghost-file.o proto-obj-y += fifo.o proto-obj-y += remap-file-path.o proto-obj-y += eventfd.o proto-obj-y += eventpoll.o proto-obj-y += fh.o proto-obj-y += fsnotify.o proto-obj-y += signalfd.o proto-obj-y += fs.o proto-obj-y += pstree.o proto-obj-y += pipe.o proto-obj-y += tcp-stream.o proto-obj-y += sk-packet.o proto-obj-y += mnt.o proto-obj-y += pipe-data.o proto-obj-y += sa.o proto-obj-y += timer.o proto-obj-y += timerfd.o proto-obj-y += mm.o proto-obj-y += sk-opts.o proto-obj-y += sk-unix.o proto-obj-y += sk-inet.o proto-obj-y += tun.o proto-obj-y += sk-netlink.o proto-obj-y += packet-sock.o proto-obj-y += ipc-var.o proto-obj-y += ipc-desc.o proto-obj-y += ipc-shm.o proto-obj-y += ipc-msg.o proto-obj-y += ipc-sem.o proto-obj-y += utsns.o proto-obj-y += creds.o proto-obj-y += vma.o proto-obj-y += netdev.o proto-obj-y += tty.o proto-obj-y += file-lock.o proto-obj-y += rlimit.o proto-obj-y += pagemap.o proto-obj-y += siginfo.o proto-obj-y += rpc.o proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o proto-obj-y += time.o proto-obj-y += sysctl.o proto-obj-y += autofs.o proto-obj-y += macvlan.o proto-obj-y += sit.o CFLAGS += -iquote $(obj)/ # # Generates a set of names from protobuf "import" directive. # The names are bare, ie no suffixes. define gen-proto-dep-names $(shell grep "^[[:blank:]]*import[[:blank:]]" $(1) | \ sed -e 's/[[:blank:]]*import[[:blank:]]*//' \ -e 's/[\";]//g' \ -e 's/\.proto//g' | \ sort | uniq) endef makefile-deps := Makefile $(obj)/Makefile # # Generates rules needed to compile protobuf files. 
define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.h,%.c,$$@) endif $(obj)/$(1).pb-c.d: $(obj)/$(1).pb-c.c $(addsuffix .pb-c.d,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " DEP " $$@ $$(Q) $$(CC) -M -MT $$@ -MT $(patsubst %.d,%.o,$$@) $$(CFLAGS) $$< -o $$@ endef $(foreach file, $(proto-obj-y), \ $(eval $(call gen-proto-rules,$(file:.o=), \ $(call gen-proto-dep-names, \ $(addprefix $(obj)/,$(file:.o=.proto)))))) $(obj)/%.o: $(obj)/%.pb-c.c $(obj)/%.pb-c.h $(E) " CC " $@ $(Q) $(CC) -c $(CFLAGS) $< -o $@ $(obj)/built-in.o: $(addprefix $(obj)/,$(proto-obj-y)) $(E) " LINK " $@ $(Q) $(LD) $(ldflags-y) -r -o $@ $^ cleanup-y += $(obj)/built-in.o ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),mrproper) -include $(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.d)) endif endif cleanup-y += $(call cleanify,$(addprefix $(obj)/,$(proto-obj-y))) cleanup-y += $(call cleanify,$(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.o))) mrproper-y += $(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.c)) mrproper-y += $(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.h)) criu-3.6/images/autofs.proto000066400000000000000000000004741317335042600161700ustar00rootroot00000000000000syntax = "proto2"; message autofs_entry { required int32 fd = 1; required int32 pgrp = 2; required int32 timeout = 3; required int32 minproto = 4; required int32 maxproto = 5; required int32 mode = 6; optional int32 uid = 7; optional int32 gid = 8; optional int32 read_fd = 9; } criu-3.6/images/binfmt-misc.proto000066400000000000000000000004601317335042600170720ustar00rootroot00000000000000syntax = "proto2"; message binfmt_misc_entry { required string name = 1; required bool 
enabled = 2; required string interpreter = 3; optional string flags = 4; optional string extension = 5; optional string magic = 6; optional string mask = 7; optional int32 offset = 8; } criu-3.6/images/cgroup.proto000066400000000000000000000015441317335042600161650ustar00rootroot00000000000000syntax = "proto2"; message cgroup_perms { required uint32 mode = 1; required uint32 uid = 2; required uint32 gid = 3; } message cgroup_prop_entry { required string name = 1; required string value = 2; optional cgroup_perms perms = 3; } message cgroup_dir_entry { required string dir_name = 1; repeated cgroup_dir_entry children = 2; repeated cgroup_prop_entry properties = 3; optional cgroup_perms dir_perms = 4; } message cg_controller_entry { repeated string cnames = 1; repeated cgroup_dir_entry dirs = 2; } message cg_member_entry { required string name = 1; required string path = 2; optional uint32 cgns_prefix = 3; } message cg_set_entry { required uint32 id = 1; repeated cg_member_entry ctls = 2; } message cgroup_entry { repeated cg_set_entry sets = 1; repeated cg_controller_entry controllers = 2; } criu-3.6/images/core-aarch64.proto000066400000000000000000000010621317335042600170370ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message user_aarch64_regs_entry { repeated uint64 regs = 1; required uint64 sp = 2; required uint64 pc = 3; required uint64 pstate = 4; } message user_aarch64_fpsimd_context_entry { repeated uint64 vregs = 1; required uint32 fpsr = 2; required uint32 fpcr = 3; } message thread_info_aarch64 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required uint64 tls = 2; required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; } criu-3.6/images/core-arm.proto000066400000000000000000000020531317335042600163670ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message user_arm_regs_entry { required uint32 r0 = 1; required uint32 r1 = 2; required uint32 r2 
= 3; required uint32 r3 = 4; required uint32 r4 = 5; required uint32 r5 = 6; required uint32 r6 = 7; required uint32 r7 = 8; required uint32 r8 = 9; required uint32 r9 = 10; required uint32 r10 = 11; required uint32 fp = 12; required uint32 ip = 13; required uint32 sp = 14; required uint32 lr = 15; required uint32 pc = 16; required uint32 cpsr = 17; required uint32 orig_r0 = 18; } message user_arm_vfpstate_entry { repeated uint64 vfp_regs = 1; required uint32 fpscr = 2; required uint32 fpexc = 3; required uint32 fpinst = 4; required uint32 fpinst2 = 5; } message thread_info_arm { required uint64 clear_tid_addr = 1[(criu).hex = true]; required uint32 tls = 2; required user_arm_regs_entry gpregs = 3[(criu).hex = true]; required user_arm_vfpstate_entry fpstate = 4; } criu-3.6/images/core-ppc64.proto000066400000000000000000000040651317335042600165510ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message user_ppc64_regs_entry { /* Following is the list of regiters starting at r0. */ repeated uint64 gpr = 1; required uint64 nip = 2; required uint64 msr = 3; required uint64 orig_gpr3 = 4; required uint64 ctr = 5; required uint64 link = 6; required uint64 xer = 7; required uint64 ccr = 8; required uint64 trap = 9; /* For Transactional memory support since P8 */ optional uint64 texasr = 10; optional uint64 tfhar = 11; optional uint64 tfiar = 12; } message user_ppc64_fpstate_entry { /* Following is the list of regiters starting at fpr0 */ repeated uint64 fpregs = 1; } message user_ppc64_vrstate_entry { /* * Altivec registers * The vector registers are 128bit registers (VSR[32..63]). * The following vregs entry will store first the high part then the * low one: * VR0 = vrregs[0] << 64 | vrregs[1]; * VR1 = vrregs[2] << 64 | vrregs[3]; * .. * The last entry stores in a 128bit field the VSCR which is a 32bit * value returned by the kernel in a 128 field. 
*/ repeated uint64 vrregs = 1; required uint32 vrsave = 2; } message user_ppc64_vsxstate_entry { /* * VSX registers * The vector-scale registers are 128bit registers (VSR[0..64]). * Since there is an overlapping over the VSX registers by the FPR and * the Altivec registers, only the lower part of the first 32 VSX * registers have to be saved. */ repeated uint64 vsxregs = 1; } /* * Transactional memory operation's state */ message user_ppc64_tm_regs_entry { required user_ppc64_regs_entry gpregs = 1; optional user_ppc64_fpstate_entry fpstate = 2; optional user_ppc64_vrstate_entry vrstate = 3; optional user_ppc64_vsxstate_entry vsxstate = 4; } message thread_info_ppc64 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required user_ppc64_regs_entry gpregs = 2[(criu).hex = true]; optional user_ppc64_fpstate_entry fpstate = 3; optional user_ppc64_vrstate_entry vrstate = 4; optional user_ppc64_vsxstate_entry vsxstate = 5; optional user_ppc64_tm_regs_entry tmstate = 6; } criu-3.6/images/core-s390.proto000066400000000000000000000025021317335042600163050ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message user_s390_regs_entry { required uint64 psw_mask = 1; required uint64 psw_addr = 2; repeated uint64 gprs = 3; repeated uint32 acrs = 4; required uint64 orig_gpr2 = 5; required uint32 system_call = 6; } message user_s390_vxrs_low_entry { repeated uint64 regs = 1; } /* * The vxrs_high registers have 128 bit: * * vxrs_high_0 = regs[0] << 64 | regs[1]; * vxrs_high_1 = regs[2] << 64 | regs[3]; */ message user_s390_vxrs_high_entry { repeated uint64 regs = 1; } message user_s390_fpregs_entry { required uint32 fpc = 1; repeated uint64 fprs = 2; } message user_s390_gs_cb_entry { repeated uint64 regs = 1; } message user_s390_ri_entry { required uint32 ri_on = 1; repeated uint64 regs = 2; } message thread_info_s390 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required user_s390_regs_entry gpregs = 2[(criu).hex = true]; required 
user_s390_fpregs_entry fpregs = 3[(criu).hex = true]; optional user_s390_vxrs_low_entry vxrs_low = 4[(criu).hex = true]; optional user_s390_vxrs_high_entry vxrs_high = 5[(criu).hex = true]; optional user_s390_gs_cb_entry gs_cb = 6[(criu).hex = true]; optional user_s390_gs_cb_entry gs_bc = 7[(criu).hex = true]; optional user_s390_ri_entry ri_cb = 8[(criu).hex = true]; } criu-3.6/images/core-x86.proto000066400000000000000000000045121317335042600162370ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; enum user_x86_regs_mode { NATIVE = 1; COMPAT = 2; } /* Reusing entry for both 64 and 32 bits register sets */ message user_x86_regs_entry { required uint64 r15 = 1; required uint64 r14 = 2; required uint64 r13 = 3; required uint64 r12 = 4; required uint64 bp = 5; required uint64 bx = 6; required uint64 r11 = 7; required uint64 r10 = 8; required uint64 r9 = 9; required uint64 r8 = 10; required uint64 ax = 11; required uint64 cx = 12; required uint64 dx = 13; required uint64 si = 14; required uint64 di = 15; required uint64 orig_ax = 16; required uint64 ip = 17; required uint64 cs = 18; required uint64 flags = 19; required uint64 sp = 20; required uint64 ss = 21; required uint64 fs_base = 22; required uint64 gs_base = 23; required uint64 ds = 24; required uint64 es = 25; required uint64 fs = 26; required uint64 gs = 27; optional user_x86_regs_mode mode = 28 [default = NATIVE]; } message user_x86_xsave_entry { required uint64 xstate_bv = 1; repeated uint32 ymmh_space = 2; } message user_x86_fpregs_entry { /* fxsave data */ required uint32 cwd = 1; required uint32 swd = 2; required uint32 twd = 3; required uint32 fop = 4; required uint64 rip = 5; required uint64 rdp = 6; required uint32 mxcsr = 7; required uint32 mxcsr_mask = 8; repeated uint32 st_space = 9; repeated uint32 xmm_space = 10; /* Unused, but present for backward compatibility */ repeated uint32 padding = 11; /* xsave extension */ optional user_x86_xsave_entry xsave = 13; } message user_desc_t { 
required uint32 entry_number = 1; /* this is for GDT, not for MSRs - 32-bit base */ required uint32 base_addr = 2; required uint32 limit = 3; required bool seg_32bit = 4; required bool contents_h = 5; required bool contents_l = 6; required bool read_exec_only = 7 [default = true]; required bool limit_in_pages = 8; required bool seg_not_present = 9 [default = true]; required bool useable = 10; } message thread_info_x86 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required user_x86_regs_entry gpregs = 2[(criu).hex = true]; required user_x86_fpregs_entry fpregs = 3; repeated user_desc_t tls = 4; } criu-3.6/images/core.proto000066400000000000000000000050011317335042600156060ustar00rootroot00000000000000syntax = "proto2"; import "core-x86.proto"; import "core-arm.proto"; import "core-aarch64.proto"; import "core-ppc64.proto"; import "core-s390.proto"; import "rlimit.proto"; import "timer.proto"; import "creds.proto"; import "sa.proto"; import "siginfo.proto"; import "opts.proto"; /* * These match the SECCOMP_MODE_* flags from . 
*/ enum seccomp_mode { disabled = 0; strict = 1; filter = 2; }; message task_core_entry { required uint32 task_state = 1 [(criu).dict = "gen"]; required uint32 exit_code = 2; required uint32 personality = 3; required uint32 flags = 4; required uint64 blk_sigset = 5[(criu).hex = true]; required string comm = 6; optional task_timers_entry timers = 7; optional task_rlimits_entry rlimits = 8; optional uint32 cg_set = 9; optional signal_queue_entry signals_s = 10; optional seccomp_mode seccomp_mode = 11; optional uint32 seccomp_filter = 12; optional uint32 loginuid = 13; optional int32 oom_score_adj = 14; repeated sa_entry sigactions = 15; } message task_kobj_ids_entry { required uint32 vm_id = 1; required uint32 files_id = 2; required uint32 fs_id = 3; required uint32 sighand_id = 4; optional uint32 pid_ns_id = 5; optional uint32 net_ns_id = 6; optional uint32 ipc_ns_id = 7; optional uint32 uts_ns_id = 8; optional uint32 mnt_ns_id = 9; optional uint32 user_ns_id = 10; optional uint32 cgroup_ns_id = 11; } message thread_sas_entry { required uint64 ss_sp = 1; required uint64 ss_size = 2; required uint32 ss_flags = 3; } message thread_core_entry { required uint64 futex_rla = 1; required uint32 futex_rla_len = 2; optional sint32 sched_nice = 3; optional uint32 sched_policy = 4; optional uint32 sched_prio = 5; optional uint64 blk_sigset = 6; optional thread_sas_entry sas = 7; optional uint32 pdeath_sig = 8; optional signal_queue_entry signals_p = 9; optional creds_entry creds = 10; } message task_rlimits_entry { repeated rlimit_entry rlimits = 1; }; message core_entry { enum march { UNKNOWN = 0; X86_64 = 1; ARM = 2; AARCH64 = 3; PPC64 = 4; S390 = 5; } required march mtype = 1; optional thread_info_x86 thread_info = 2; optional thread_info_arm ti_arm = 6; optional thread_info_aarch64 ti_aarch64 = 8; optional thread_info_ppc64 ti_ppc64 = 9; optional thread_info_s390 ti_s390 = 10; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; optional thread_core_entry 
thread_core = 5; } criu-3.6/images/cpuinfo.proto000066400000000000000000000015651317335042600163340ustar00rootroot00000000000000syntax = "proto2"; message cpuinfo_x86_entry { enum vendor { UNKNOWN = 0; INTEL = 1; AMD = 2; } required vendor vendor_id = 1; required uint32 cpu_family = 2; required uint32 model = 3; required uint32 stepping = 4; required uint32 capability_ver = 5; repeated uint32 capability = 6; optional string model_id = 7; } message cpuinfo_ppc64_entry { enum endianness { BIGENDIAN = 0; LITTLEENDIAN = 1; } required endianness endian = 1; repeated uint64 hwcap = 2; } message cpuinfo_s390_entry { repeated uint64 hwcap = 2; } message cpuinfo_entry { /* * Usually on SMP system there should be same CPUs * installed, but it might happen that system carries * various CPUs so @repeated used. */ repeated cpuinfo_x86_entry x86_entry = 1; repeated cpuinfo_ppc64_entry ppc64_entry = 2; repeated cpuinfo_s390_entry s390_entry = 3; } criu-3.6/images/creds.proto000066400000000000000000000007431317335042600157660ustar00rootroot00000000000000syntax = "proto2"; message creds_entry { required uint32 uid = 1; required uint32 gid = 2; required uint32 euid = 3; required uint32 egid = 4; required uint32 suid = 5; required uint32 sgid = 6; required uint32 fsuid = 7; required uint32 fsgid = 8; repeated uint32 cap_inh = 9; repeated uint32 cap_prm = 10; repeated uint32 cap_eff = 11; repeated uint32 cap_bnd = 12; required uint32 secbits = 13; repeated uint32 groups = 14; optional string lsm_profile = 15; } criu-3.6/images/eventfd.proto000066400000000000000000000003021317335042600163100ustar00rootroot00000000000000syntax = "proto2"; import "fown.proto"; message eventfd_file_entry { required uint32 id = 1; required uint32 flags = 2; required fown_entry fown = 3; required uint64 counter = 4; } criu-3.6/images/eventpoll.proto000066400000000000000000000005301317335042600166700ustar00rootroot00000000000000syntax = "proto2"; import "fown.proto"; message eventpoll_tfd_entry { required 
uint32 id = 1; required uint32 tfd = 2; required uint32 events = 3; required uint64 data = 4; } message eventpoll_file_entry { required uint32 id = 1; required uint32 flags = 2; required fown_entry fown = 3; repeated eventpoll_tfd_entry tfd = 4; } criu-3.6/images/ext-file.proto000066400000000000000000000001761317335042600164030ustar00rootroot00000000000000syntax = "proto2"; import "fown.proto"; message ext_file_entry { required uint32 id = 1; required fown_entry fown = 5; } criu-3.6/images/fdinfo.proto000066400000000000000000000027311317335042600161320ustar00rootroot00000000000000syntax = "proto2"; import "regfile.proto"; import "sk-inet.proto"; import "ns.proto"; import "packet-sock.proto"; import "sk-netlink.proto"; import "eventfd.proto"; import "eventpoll.proto"; import "signalfd.proto"; import "tun.proto"; import "timerfd.proto"; import "fsnotify.proto"; import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; import "tty.proto"; enum fd_types { UND = 0; REG = 1; PIPE = 2; FIFO = 3; INETSK = 4; UNIXSK = 5; EVENTFD = 6; EVENTPOLL = 7; INOTIFY = 8; SIGNALFD = 9; PACKETSK = 10; TTY = 11; FANOTIFY = 12; NETLINKSK = 13; NS = 14; TUNF = 15; EXT = 16; TIMERFD = 17; } message fdinfo_entry { required uint32 id = 1; required uint32 flags = 2; required fd_types type = 3; required uint32 fd = 4; } message file_entry { required fd_types type = 1; required uint32 id = 2; optional reg_file_entry reg = 3; optional inet_sk_entry isk = 4; optional ns_file_entry nsf = 5; optional packet_sock_entry psk = 6; optional netlink_sk_entry nlsk = 7; optional eventfd_file_entry efd = 8; optional eventpoll_file_entry epfd = 9; optional signalfd_entry sgfd = 10; optional tunfile_entry tunf = 11; optional timerfd_entry tfd = 12; optional inotify_file_entry ify = 13; optional fanotify_file_entry ffy = 14; optional ext_file_entry ext = 15; optional unix_sk_entry usk = 16; optional fifo_entry fifo = 17; optional pipe_entry pipe = 18; optional tty_file_entry 
tty = 19; } criu-3.6/images/fh.proto000066400000000000000000000007101317335042600152550ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; enum fh_entry_sizes { min_entries = 16; } message fh_entry { required uint32 bytes = 1; required uint32 type = 2; /* The minimum is fh_n_handle repetitions */ repeated uint64 handle = 3; optional string path = 4; optional uint32 mnt_id = 5; } message irmap_cache_entry { required uint32 dev = 1 [(criu).dev = true, (criu).odev = true]; required uint64 inode = 2; required string path = 3; } criu-3.6/images/fifo.proto000066400000000000000000000002061317335042600156030ustar00rootroot00000000000000syntax = "proto2"; message fifo_entry { required uint32 id = 1; required uint32 pipe_id = 2; optional uint32 regf_id = 3; } criu-3.6/images/file-lock.proto000066400000000000000000000003461317335042600165320ustar00rootroot00000000000000syntax = "proto2"; message file_lock_entry { required uint32 flag = 1; required uint32 type = 2; required int32 pid = 3; required int32 fd = 4; required int64 start = 5; required int64 len = 6; } criu-3.6/images/fown.proto000066400000000000000000000002771317335042600156410ustar00rootroot00000000000000syntax = "proto2"; message fown_entry { required uint32 uid = 1; required uint32 euid = 2; required uint32 signum = 3; required uint32 pid_type = 4; required uint32 pid = 5; } criu-3.6/images/fs.proto000066400000000000000000000002061317335042600152700ustar00rootroot00000000000000syntax = "proto2"; message fs_entry { required uint32 cwd_id = 1; required uint32 root_id = 2; optional uint32 umask = 3; } criu-3.6/images/fsnotify.proto000066400000000000000000000027071317335042600165310ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fh.proto"; import "fown.proto"; message inotify_wd_entry { required uint32 id = 1; required uint64 i_ino = 2; required uint32 mask = 3 [(criu).hex = true]; required uint32 ignored_mask = 4 [(criu).hex = true]; required uint32 s_dev = 5 [(criu).dev 
= true]; required uint32 wd = 6; required fh_entry f_handle = 7; } message inotify_file_entry { required uint32 id = 1; required uint32 flags = 2 [(criu).hex = true]; required fown_entry fown = 4; repeated inotify_wd_entry wd = 5; } enum mark_type { INODE = 1; MOUNT = 2; } message fanotify_inode_mark_entry { required uint64 i_ino = 1; required fh_entry f_handle = 2; } message fanotify_mount_mark_entry { required uint32 mnt_id = 1; optional string path = 2; } message fanotify_mark_entry { required uint32 id = 1; required mark_type type = 2; required uint32 mflags = 3 [(criu).hex = true]; required uint32 mask = 4 [(criu).hex = true]; required uint32 ignored_mask = 5 [(criu).hex = true]; required uint32 s_dev = 6 [(criu).dev = true]; optional fanotify_inode_mark_entry ie = 7; optional fanotify_mount_mark_entry me = 8; } message fanotify_file_entry { required uint32 id = 1; required uint32 flags = 2 [(criu).hex = true]; required fown_entry fown = 3; required uint32 faflags = 4 [(criu).hex = true]; required uint32 evflags = 5 [(criu).hex = true]; repeated fanotify_mark_entry mark = 6; } criu-3.6/images/ghost-file.proto000066400000000000000000000010171317335042600167220ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "time.proto"; message ghost_file_entry { required uint32 uid = 1; required uint32 gid = 2; required uint32 mode = 3; optional uint32 dev = 4 [(criu).dev = true]; optional uint64 ino = 5; optional uint32 rdev = 6 [(criu).dev = true, (criu).odev = true]; optional timeval atim = 7; optional timeval mtim = 8; optional bool chunks = 9; optional uint64 size = 10; } message ghost_chunk_entry { required uint64 len = 1; required uint64 off = 2; } 
criu-3.6/images/google/000077500000000000000000000000001317335042600150515ustar00rootroot00000000000000criu-3.6/images/google/protobuf/000077500000000000000000000000001317335042600167115ustar00rootroot00000000000000criu-3.6/images/google/protobuf/descriptor.proto000077700000000000000000000000001317335042600330412/usr/include/google/protobuf/descriptor.protoustar00rootroot00000000000000criu-3.6/images/inventory.proto000066400000000000000000000005351317335042600167220ustar00rootroot00000000000000syntax = "proto2"; import "core.proto"; enum lsmtype { NO_LSM = 0; SELINUX = 1; APPARMOR = 2; } message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; optional task_kobj_ids_entry root_ids = 3; optional bool ns_per_id = 4; optional uint32 root_cg_set = 5; optional lsmtype lsmtype = 6; } criu-3.6/images/ipc-desc.proto000066400000000000000000000003561317335042600163550ustar00rootroot00000000000000syntax = "proto2"; message ipc_desc_entry { required uint32 key = 1; required uint32 uid = 2; required uint32 gid = 3; required uint32 cuid = 4; required uint32 cgid = 5; required uint32 mode = 6; required uint32 id = 7; } criu-3.6/images/ipc-msg.proto000066400000000000000000000003641317335042600162240ustar00rootroot00000000000000syntax = "proto2"; import "ipc-desc.proto"; message ipc_msg { required uint64 mtype = 1; required uint32 msize = 2; } message ipc_msg_entry { required ipc_desc_entry desc = 1; required uint32 qbytes = 2; required uint32 qnum = 3; } criu-3.6/images/ipc-sem.proto000066400000000000000000000002101317335042600162100ustar00rootroot00000000000000syntax = "proto2"; import "ipc-desc.proto"; message ipc_sem_entry { required ipc_desc_entry desc = 1; required uint32 nsems = 2; } criu-3.6/images/ipc-shm.proto000066400000000000000000000002531317335042600162220ustar00rootroot00000000000000syntax = "proto2"; import "ipc-desc.proto"; message ipc_shm_entry { required ipc_desc_entry desc = 1; required uint64 size = 2; optional bool 
in_pagemaps = 3; } criu-3.6/images/ipc-var.proto000066400000000000000000000012741317335042600162270ustar00rootroot00000000000000syntax = "proto2"; message ipc_var_entry { repeated uint32 sem_ctls = 1; required uint32 msg_ctlmax = 2; required uint32 msg_ctlmnb = 3; required uint32 msg_ctlmni = 4; required uint32 auto_msgmni = 5; required uint64 shm_ctlmax = 6; required uint64 shm_ctlall = 7; required uint32 shm_ctlmni = 8; required uint32 shm_rmid_forced = 9; required uint32 mq_queues_max = 10; required uint32 mq_msg_max = 11; required uint32 mq_msgsize_max = 12; optional uint32 mq_msg_default = 13; optional uint32 mq_msgsize_default = 14; optional uint32 msg_next_id = 15; optional uint32 sem_next_id = 16; optional uint32 shm_next_id = 17; } criu-3.6/images/macvlan.proto000066400000000000000000000001521317335042600163010ustar00rootroot00000000000000syntax = "proto2"; message macvlan_link_entry { required uint32 mode = 1; optional uint32 flags = 2; } criu-3.6/images/mm.proto000066400000000000000000000017671317335042600153060ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "vma.proto"; message aio_ring_entry { required uint64 id = 1; required uint32 nr_req = 2; required uint32 ring_len = 3; } message mm_entry { required uint64 mm_start_code = 1 [(criu).hex = true]; required uint64 mm_end_code = 2 [(criu).hex = true]; required uint64 mm_start_data = 3 [(criu).hex = true]; required uint64 mm_end_data = 4 [(criu).hex = true]; required uint64 mm_start_stack = 5 [(criu).hex = true]; required uint64 mm_start_brk = 6 [(criu).hex = true]; required uint64 mm_brk = 7 [(criu).hex = true]; required uint64 mm_arg_start = 8 [(criu).hex = true]; required uint64 mm_arg_end = 9 [(criu).hex = true]; required uint64 mm_env_start = 10 [(criu).hex = true]; required uint64 mm_env_end = 11 [(criu).hex = true]; required uint32 exe_file_id = 12; repeated uint64 mm_saved_auxv = 13; repeated vma_entry vmas = 14; optional int32 dumpable = 15; repeated aio_ring_entry 
aios = 16; optional bool thp_disabled = 17; } criu-3.6/images/mnt.proto000066400000000000000000000023041317335042600154570ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; enum fstype { UNSUPPORTED = 0; PROC = 1; SYSFS = 2; DEVTMPFS = 3; BINFMT_MISC = 4; TMPFS = 5; DEVPTS = 6; SIMFS = 7; PSTORE = 8; SECURITYFS = 9; FUSECTL = 10; DEBUGFS = 11; CGROUP = 12; AUFS = 13; MQUEUE = 14; FUSE = 15; AUTO = 16; OVERLAYFS = 17; AUTOFS = 18; TRACEFS = 19; /* These three are reserved for NFS support */ // RPC_PIPEFS = 20; // NFS = 21; // NFS4 = 22; }; message mnt_entry { required uint32 fstype = 1; required uint32 mnt_id = 2; required uint32 root_dev = 3 [(criu).dev = true]; required uint32 parent_mnt_id = 4; required uint32 flags = 5 [(criu).hex = true]; required string root = 6; required string mountpoint = 7; required string source = 8; required string options = 9; optional uint32 shared_id = 10; optional uint32 master_id = 11; optional bool with_plugin = 12; optional bool ext_mount = 13; optional string fsname = 14; optional bool internal_sharing = 15; optional bool deleted = 16; optional uint32 sb_flags = 17 [(criu).hex = true]; /* user defined mapping for external mount */ optional string ext_key = 18; } criu-3.6/images/netdev.proto000066400000000000000000000022231317335042600161460ustar00rootroot00000000000000syntax = "proto2"; import "macvlan.proto"; import "opts.proto"; import "tun.proto"; import "sysctl.proto"; import "sit.proto"; enum nd_type { LOOPBACK = 1; VETH = 2; TUN = 3; /* * External link -- for those CRIU only dumps and restores * link parameters such as flags, address, MTU, etc. The * existence of the link on restore should be provided * by the setup-namespaces script. 
*/ EXTLINK = 4; VENET = 5; /* OpenVZ device */ BRIDGE = 6; MACVLAN = 7; SIT = 8; } message net_device_entry { required nd_type type = 1; required uint32 ifindex = 2; required uint32 mtu = 3; required uint32 flags = 4 [(criu).hex = true]; required string name = 5; optional tun_link_entry tun = 6; optional bytes address = 7; repeated int32 conf = 8; repeated sysctl_entry conf4 = 9; repeated sysctl_entry conf6 = 10; optional macvlan_link_entry macvlan = 11; optional sit_entry sit = 15; } message netns_entry { repeated int32 def_conf = 1; repeated int32 all_conf = 2; repeated sysctl_entry def_conf4 = 3; repeated sysctl_entry all_conf4 = 4; repeated sysctl_entry def_conf6 = 5; repeated sysctl_entry all_conf6 = 6; } criu-3.6/images/ns.proto000066400000000000000000000002451317335042600153030ustar00rootroot00000000000000syntax = "proto2"; message ns_file_entry { required uint32 id = 1; required uint32 ns_id = 2; required uint32 ns_cflag = 3; required uint32 flags = 4; } criu-3.6/images/opts.proto000066400000000000000000000010411317335042600156430ustar00rootroot00000000000000syntax = "proto2"; import "google/protobuf/descriptor.proto"; message CRIU_Opts { optional bool hex = 1; // Idicate that CRIT should treat this field as hex. optional bool ipadd = 2; // The field is IPv4/v6 address optional string flags = 3; optional bool dev = 4; // Device major:minor packed optional bool odev = 5; // ... in old format optional string dict = 6; optional string conv = 7; } extend google.protobuf.FieldOptions { // Registered unique number to use for all kinds of custom options. 
optional CRIU_Opts criu = 1018; } criu-3.6/images/packet-sock.proto000066400000000000000000000021641317335042600170710ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; message packet_mclist { required uint32 index = 1; required uint32 type = 2; required bytes addr = 3; } message packet_ring { required uint32 block_size = 1; required uint32 block_nr = 2; required uint32 frame_size = 3; required uint32 frame_nr = 4; required uint32 retire_tmo = 5; required uint32 sizeof_priv = 6; required uint32 features = 7; } message packet_sock_entry { required uint32 id = 1; required uint32 type = 2; required uint32 protocol = 3; required uint32 flags = 4 [(criu).hex = true]; required uint32 ifindex = 5; required fown_entry fown = 6; required sk_opts_entry opts = 7; required uint32 version = 8; required uint32 reserve = 9; required bool aux_data = 10; required bool orig_dev = 11; required bool vnet_hdr = 12; required bool loss = 13; required uint32 timestamp = 14; required uint32 copy_thresh = 15; repeated packet_mclist mclist = 16; optional uint32 fanout = 17 [ default = 0xffffffff ]; optional packet_ring rx_ring = 18; optional packet_ring tx_ring = 19; } criu-3.6/images/pagemap.proto000066400000000000000000000004471317335042600163010ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message pagemap_head { required uint32 pages_id = 1; } message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; required uint32 nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; } criu-3.6/images/pipe-data.proto000066400000000000000000000002101317335042600165170ustar00rootroot00000000000000syntax = "proto2"; message pipe_data_entry { required uint32 pipe_id = 1; required uint32 bytes = 2; optional uint32 size = 3; } criu-3.6/images/pipe.proto000066400000000000000000000003431317335042600156170ustar00rootroot00000000000000syntax = "proto2"; import 
"opts.proto"; import "fown.proto"; message pipe_entry { required uint32 id = 1; required uint32 pipe_id = 2; required uint32 flags = 3 [(criu).hex = true]; required fown_entry fown = 4; } criu-3.6/images/pstree.proto000066400000000000000000000003041317335042600161610ustar00rootroot00000000000000syntax = "proto2"; message pstree_entry { required uint32 pid = 1; required uint32 ppid = 2; required uint32 pgid = 3; required uint32 sid = 4; repeated uint32 threads = 5; } criu-3.6/images/regfile.proto000066400000000000000000000006171317335042600163030ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; message reg_file_entry { required uint32 id = 1; required uint32 flags = 2 [(criu).flags = "rfile.flags"]; required uint64 pos = 3; required fown_entry fown = 5; required string name = 6; optional sint32 mnt_id = 7 [default = -1]; optional uint64 size = 8; optional bool ext = 9; optional uint32 mode = 10; } criu-3.6/images/remap-file-path.proto000066400000000000000000000003361317335042600176370ustar00rootroot00000000000000syntax = "proto2"; enum remap_type { LINKED = 0; GHOST = 1; PROCFS = 2; }; message remap_file_path_entry { required uint32 orig_id = 1; required uint32 remap_id = 2; optional remap_type remap_type = 3; } criu-3.6/images/rlimit.proto000066400000000000000000000001431317335042600161600ustar00rootroot00000000000000syntax = "proto2"; message rlimit_entry { required uint64 cur = 1; required uint64 max = 2; } criu-3.6/images/rpc.proto000066400000000000000000000114101317335042600154430ustar00rootroot00000000000000syntax = "proto2"; message criu_page_server_info { optional string address = 1; optional int32 port = 2; optional int32 pid = 3; optional int32 fd = 4; } message criu_veth_pair { required string if_in = 1; required string if_out = 2; }; message ext_mount_map { required string key = 1; required string val = 2; }; message join_namespace { required string ns = 1; required string ns_file = 2; optional string extra_opt = 
3; } message inherit_fd { required string key = 1; required int32 fd = 2; }; message cgroup_root { optional string ctrl = 1; required string path = 2; }; message unix_sk { required uint32 inode = 1; }; enum criu_cg_mode { IGNORE = 0; CG_NONE = 1; PROPS = 2; SOFT = 3; FULL = 4; STRICT = 5; DEFAULT = 6; }; message criu_opts { required int32 images_dir_fd = 1; optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional bool leave_running = 3; optional bool ext_unix_sk = 4; optional bool tcp_established = 5; optional bool evasive_devices = 6; optional bool shell_job = 7; optional bool file_locks = 8; optional int32 log_level = 9 [default = 2]; optional string log_file = 10; /* No subdirs are allowed. Consider using work-dir */ optional criu_page_server_info ps = 11; optional bool notify_scripts = 12; optional string root = 13; optional string parent_img = 14; optional bool track_mem = 15; optional bool auto_dedup = 16; optional int32 work_dir_fd = 17; optional bool link_remap = 18; repeated criu_veth_pair veths = 19; /* DEPRECATED, use external instead */ optional uint32 cpu_cap = 20 [default = 0xffffffff]; optional bool force_irmap = 21; repeated string exec_cmd = 22; repeated ext_mount_map ext_mnt = 23; /* DEPRECATED, use external instead */ optional bool manage_cgroups = 24; /* backward compatibility */ repeated cgroup_root cg_root = 25; optional bool rst_sibling = 26; /* swrk only */ repeated inherit_fd inherit_fd = 27; /* swrk only */ optional bool auto_ext_mnt = 28; optional bool ext_sharing = 29; optional bool ext_masters = 30; repeated string skip_mnt = 31; repeated string enable_fs = 32; repeated unix_sk unix_sk_ino = 33; /* DEPRECATED, use external instead */ optional criu_cg_mode manage_cgroups_mode = 34; optional uint32 ghost_limit = 35 [default = 0x100000]; repeated string irmap_scan_paths = 36; repeated string external = 37; optional uint32 empty_ns = 38; repeated join_namespace join_ns = 39; optional string cgroup_props = 41; 
optional string cgroup_props_file = 42; repeated string cgroup_dump_controller = 43; optional string freeze_cgroup = 44; optional uint32 timeout = 45; optional bool tcp_skip_in_flight = 46; optional bool weak_sysctls = 47; optional bool lazy_pages = 48; optional int32 status_fd = 49; optional bool orphan_pts_master = 50; } message criu_dump_resp { optional bool restored = 1; } message criu_restore_resp { required int32 pid = 1; } message criu_notify { optional string script = 1; optional int32 pid = 2; } enum criu_req_type { EMPTY = 0; DUMP = 1; RESTORE = 2; CHECK = 3; PRE_DUMP = 4; PAGE_SERVER = 5; NOTIFY = 6; CPUINFO_DUMP = 7; CPUINFO_CHECK = 8; FEATURE_CHECK = 9; VERSION = 10; } /* * List of features which can queried via * CRIU_REQ_TYPE__FEATURE_CHECK */ message criu_features { optional bool mem_track = 1; optional bool lazy_pages = 2; } /* * Request -- each type corresponds to must-be-there * request arguments of respective type */ message criu_req { required criu_req_type type = 1; optional criu_opts opts = 2; optional bool notify_success = 3; /* * When set service won't close the connection but * will wait for more req-s to appear. Works not * for all request types. */ optional bool keep_open = 4; /* * 'features' can be used to query which features * are supported by the installed criu/kernel * via RPC. 
*/ optional criu_features features = 5; } /* * Response -- it states whether the request was served * and additional request-specific information */ message criu_resp { required criu_req_type type = 1; required bool success = 2; optional criu_dump_resp dump = 3; optional criu_restore_resp restore = 4; optional criu_notify notify = 5; optional criu_page_server_info ps = 6; optional int32 cr_errno = 7; optional criu_features features = 8; optional string cr_errmsg = 9; optional criu_version version = 10; } /* Answer for criu_req_type.VERSION requests */ message criu_version { required int32 major = 1; required int32 minor = 2; optional string gitid = 3; optional int32 sublevel = 4; optional int32 extra = 5; optional string name = 6; } criu-3.6/images/sa.proto000066400000000000000000000004541317335042600152700ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message sa_entry { required uint64 sigaction = 1 [(criu).hex = true]; required uint64 flags = 2 [(criu).hex = true]; required uint64 restorer = 3 [(criu).hex = true]; required uint64 mask = 4 [(criu).hex = true]; optional bool compat_sigaction = 5; } criu-3.6/images/seccomp.proto000066400000000000000000000002661317335042600163170ustar00rootroot00000000000000syntax = "proto2"; message seccomp_filter { required bytes filter = 1; optional uint32 prev = 2; } message seccomp_entry { repeated seccomp_filter seccomp_filters = 1; } criu-3.6/images/siginfo.proto000066400000000000000000000002201317335042600163120ustar00rootroot00000000000000syntax = "proto2"; message siginfo_entry { required bytes siginfo = 1; } message signal_queue_entry { repeated siginfo_entry signals = 1; } criu-3.6/images/signalfd.proto000066400000000000000000000003741317335042600164550ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; message signalfd_entry { required uint32 id = 1; required uint32 flags = 2 [(criu).hex = true]; required fown_entry fown = 3; required uint64 sigmask = 4 
[(criu).hex = true]; }; criu-3.6/images/sit.proto000066400000000000000000000012311317335042600154560ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message sit_entry { optional uint32 link = 1; repeated uint32 local = 2 [(criu).ipadd = true]; repeated uint32 remote = 3 [(criu).ipadd = true]; optional uint32 ttl = 4; optional uint32 tos = 5; optional bool pmtudisc = 6; optional uint32 proto = 7; optional uint32 flags = 8; optional uint32 encap_type = 9; optional uint32 encap_flags = 10; optional uint32 encap_sport = 11; optional uint32 encap_dport = 12; optional uint32 rd_prefixlen = 13; repeated uint32 rd_prefix = 14 [(criu).ipadd = true]; optional uint32 relay_prefixlen = 15; repeated uint32 relay_prefix = 16 [(criu).ipadd = true]; }; criu-3.6/images/sk-inet.proto000066400000000000000000000024711317335042600162400ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; message ip_opts_entry { optional bool freebind = 1; } message inet_sk_entry { /* * We have two IDs here -- id and ino. The first one * is used when restoring socket behind a file descriprot. * The fdinfo image's id is it. The second one is used * in sk-inet.c internally, in particular we identify * a TCP stream to restore into this socket using the * ino value. 
*/ required uint32 id = 1; required uint32 ino = 2; required uint32 family = 3 [(criu).dict = "sk"]; required uint32 type = 4 [(criu).dict = "sk"]; required uint32 proto = 5 [(criu).dict = "sk"]; required uint32 state = 6 [(criu).dict = "sk"]; required uint32 src_port = 7; required uint32 dst_port = 8; required uint32 flags = 9 [(criu).hex = true]; required uint32 backlog = 10; repeated uint32 src_addr = 11 [(criu).ipadd = true]; repeated uint32 dst_addr = 12 [(criu).ipadd = true]; required fown_entry fown = 13; required sk_opts_entry opts = 14; optional bool v6only = 15; optional ip_opts_entry ip_opts = 16; /* for ipv6, we need to send the ifindex to bind(); we keep the ifname * here and convert it on restore */ optional string ifname = 17; optional sk_shutdown shutdown = 19; } criu-3.6/images/sk-netlink.proto000066400000000000000000000007701317335042600167450ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; message netlink_sk_entry { required uint32 id = 1; required uint32 ino = 2; required uint32 protocol = 3; required uint32 state = 4; required uint32 flags = 6 [(criu).hex = true]; required uint32 portid = 7; repeated uint32 groups = 8; required uint32 dst_portid = 9; required uint32 dst_group = 10; required fown_entry fown = 11; required sk_opts_entry opts = 12; } criu-3.6/images/sk-opts.proto000066400000000000000000000012461317335042600162650ustar00rootroot00000000000000syntax = "proto2"; message sk_opts_entry { required uint32 so_sndbuf = 1; required uint32 so_rcvbuf = 2; required uint64 so_snd_tmo_sec = 3; required uint64 so_snd_tmo_usec = 4; required uint64 so_rcv_tmo_sec = 5; required uint64 so_rcv_tmo_usec = 6; optional bool reuseaddr = 7; optional uint32 so_priority = 8; optional uint32 so_rcvlowat = 9; optional uint32 so_mark = 10; optional bool so_passcred = 11; optional bool so_passsec = 12; optional bool so_dontroute = 13; optional bool so_no_check = 14; optional string so_bound_dev = 
15; repeated fixed64 so_filter = 16; } enum sk_shutdown { NONE = 0; READ = 1; WRITE = 2; BOTH = 3; } criu-3.6/images/sk-packet.proto000066400000000000000000000003411317335042600165420ustar00rootroot00000000000000syntax = "proto2"; message scm_entry { required uint32 type = 1; repeated uint32 rights = 2; } message sk_packet_entry { required uint32 id_for = 1; required uint32 length = 2; repeated scm_entry scm = 4; } criu-3.6/images/sk-unix.proto000066400000000000000000000024771317335042600162720ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; message file_perms_entry { required uint32 mode = 1; required uint32 uid = 2; required uint32 gid = 3; } message unix_sk_entry { /* * Few words about why we need both -- id and ino. * * The former one is used to link file descriptor from * fdinfo image with the unix_sk_entry that should be * opened under it. * * The latter one ties together unix peers -- the peer * member on this structure is the ino one of its peer * and simetimes vise-versa. */ required uint32 id = 1; required uint32 ino = 2; required uint32 type = 3 [(criu).dict = "sk"]; required uint32 state = 4 [(criu).dict = "sk"]; required uint32 flags = 5 [(criu).hex = true]; required uint32 uflags = 6 [(criu).hex = true]; required uint32 backlog = 7; required uint32 peer = 8; required fown_entry fown = 9; required sk_opts_entry opts = 10; /* * Abstract name may contain \0 at any point, * so we need to carry it as byte sequence... */ required bytes name = 11 [(criu).conv = "unix_name"]; optional sk_shutdown shutdown = 12; optional file_perms_entry file_perms = 13; /* * Relative socket name may have prefix. 
*/ optional string name_dir = 14; optional bool deleted = 15; } criu-3.6/images/stats.proto000066400000000000000000000014441317335042600160230ustar00rootroot00000000000000syntax = "proto2"; // This one contains statistics about dump/restore process message dump_stats_entry { required uint32 freezing_time = 1; required uint32 frozen_time = 2; required uint32 memdump_time = 3; required uint32 memwrite_time = 4; required uint64 pages_scanned = 5; required uint64 pages_skipped_parent = 6; required uint64 pages_written = 7; optional uint32 irmap_resolve = 8; required uint64 pages_lazy = 9; } message restore_stats_entry { required uint64 pages_compared = 1; required uint64 pages_skipped_cow = 2; required uint32 forking_time = 3; required uint32 restore_time = 4; optional uint64 pages_restored = 5; } message stats_entry { optional dump_stats_entry dump = 1; optional restore_stats_entry restore = 2; } criu-3.6/images/sysctl.proto000066400000000000000000000002641317335042600162050ustar00rootroot00000000000000syntax = "proto2"; enum SysctlType { CTL_STR = 5; CTL_32 = 6; } message sysctl_entry { required SysctlType type = 1; optional int32 iarg = 2; optional string sarg = 3; } criu-3.6/images/tcp-stream.proto000066400000000000000000000013551317335042600167450ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message tcp_stream_entry { required uint32 inq_len = 1; required uint32 inq_seq = 2; required uint32 outq_len = 3; /* unsent and sent data in the send queue*/ required uint32 outq_seq = 4; required uint32 opt_mask = 5 [(criu).hex = true]; /* TCPI_OPT_ bits */ required uint32 snd_wscale = 6; required uint32 mss_clamp = 7; optional uint32 rcv_wscale = 8; optional uint32 timestamp = 9; optional bool cork = 10; optional bool nodelay = 11; optional uint32 unsq_len = 12; /* unsent data in the send queue */ optional uint32 snd_wl1 = 13; optional uint32 snd_wnd = 14; optional uint32 max_window = 15; optional uint32 rcv_wnd = 16; optional uint32 rcv_wup = 17; } 
criu-3.6/images/time.proto000066400000000000000000000001451317335042600156200ustar00rootroot00000000000000syntax = "proto2"; message timeval { required uint64 tv_sec = 1; required uint64 tv_usec = 2; } criu-3.6/images/timer.proto000066400000000000000000000012541317335042600160040ustar00rootroot00000000000000syntax = "proto2"; message itimer_entry { required uint64 isec = 1; required uint64 iusec = 2; required uint64 vsec = 3; required uint64 vusec = 4; } message posix_timer_entry { required uint32 it_id = 1; required uint32 clock_id = 2; required uint32 si_signo = 3; required uint32 it_sigev_notify = 4; required uint64 sival_ptr = 5; required uint32 overrun = 6; required uint64 isec = 7; required uint64 insec = 8; required uint64 vsec = 9; required uint64 vnsec = 10; } message task_timers_entry { required itimer_entry real = 1; required itimer_entry virt = 2; required itimer_entry prof = 3; repeated posix_timer_entry posix = 4; } criu-3.6/images/timerfd.proto000066400000000000000000000006661317335042600163240ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; message timerfd_entry { required uint32 id = 1; required uint32 flags = 2 [(criu).hex = true]; required fown_entry fown = 3; required uint32 clockid = 4; required uint64 ticks = 5; required uint32 settime_flags = 6 [(criu).hex = true]; required uint64 vsec = 7; required uint64 vnsec = 8; required uint64 isec = 9; required uint64 insec = 10; } criu-3.6/images/tty.proto000066400000000000000000000033511317335042600155040ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; import "fown.proto"; message winsize_entry { required uint32 ws_row = 1; required uint32 ws_col = 2; required uint32 ws_xpixel = 3; required uint32 ws_ypixel = 4; }; message termios_entry { required uint32 c_iflag = 1; required uint32 c_oflag = 2; required uint32 c_cflag = 3; required uint32 c_lflag = 4; required uint32 c_line = 5; required uint32 c_ispeed = 6; required uint32 c_ospeed = 7; 
repeated uint32 c_cc = 8; } message tty_pty_entry { required uint32 index = 1; } enum TtyType { UNKNOWN = 0; PTY = 1; CONSOLE = 2; VT = 3; CTTY = 4; EXT_TTY = 5; SERIAL = 6; } message tty_data_entry { required uint32 tty_id = 1; required bytes data = 2; } message tty_info_entry { required uint32 id = 1; required TtyType type = 2; required bool locked = 3; /* Unix98 PTY only */ required bool exclusive = 4; required bool packet_mode = 5; /* Unix98 PTY only */ required uint32 sid = 6; required uint32 pgrp = 7; /* * Convenient for printing errors and such, with this * device encoded we can figure out major and minor * numbers. */ required uint32 rdev = 8; optional termios_entry termios = 9; optional termios_entry termios_locked = 10; optional winsize_entry winsize = 11; /* * These are optional fields which presence depends on * TTY type. */ optional tty_pty_entry pty = 12; optional uint32 dev = 13; optional uint32 uid = 14; optional uint32 gid = 15; }; message tty_file_entry { required uint32 id = 1; required uint32 tty_info_id = 2; required uint32 flags = 3 [(criu).hex = true]; required fown_entry fown = 4; optional uint32 regf_id = 6; } criu-3.6/images/tun.proto000066400000000000000000000005451317335042600154740ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message tunfile_entry { required uint32 id = 1; optional string netdev = 2; optional bool detached = 3; }; message tun_link_entry { required uint32 flags = 1 [(criu).hex = true]; required int32 owner = 2; required int32 group = 3; required uint32 vnethdr = 4; required uint32 sndbuf = 5; }; criu-3.6/images/userns.proto000066400000000000000000000003611317335042600162010ustar00rootroot00000000000000syntax = "proto2"; message uid_gid_extent { required uint32 first = 1; required uint32 lower_first = 2; required uint32 count = 3; } message userns_entry { repeated uid_gid_extent uid_map = 1; repeated uid_gid_extent gid_map = 2; } 
criu-3.6/images/utsns.proto000066400000000000000000000001541317335042600160360ustar00rootroot00000000000000syntax = "proto2"; message utsns_entry { required string nodename = 1; required string domainname = 2; } criu-3.6/images/vma.proto000066400000000000000000000013121317335042600154420ustar00rootroot00000000000000syntax = "proto2"; import "opts.proto"; message vma_entry { required uint64 start = 1 [(criu).hex = true]; required uint64 end = 2 [(criu).hex = true]; required uint64 pgoff = 3; required uint64 shmid = 4; required uint32 prot = 5 [(criu).flags = "mmap.prot" ]; required uint32 flags = 6 [(criu).flags = "mmap.flags" ]; required uint32 status = 7 [(criu).flags = "mmap.status" ]; /* * This fd thing is unused in the image, it was lost * while switching from execve restore model. It is * -1 by default. */ required sint64 fd = 8; /* madvise flags bitmap */ optional uint64 madv = 9 [(criu).hex = true]; /* file status flags */ optional uint32 fdflags = 10 [(criu).hex = true]; } criu-3.6/include/000077500000000000000000000000001317335042600137535ustar00rootroot00000000000000criu-3.6/include/common/000077500000000000000000000000001317335042600152435ustar00rootroot00000000000000criu-3.6/include/common/arch/000077500000000000000000000000001317335042600161605ustar00rootroot00000000000000criu-3.6/include/common/arch/aarch64/000077500000000000000000000000001317335042600174105ustar00rootroot00000000000000criu-3.6/include/common/arch/aarch64/asm/000077500000000000000000000000001317335042600201705ustar00rootroot00000000000000criu-3.6/include/common/arch/aarch64/asm/atomic.h000066400000000000000000000037111317335042600216170ustar00rootroot00000000000000#ifndef __CR_ATOMIC_H__ #define __CR_ATOMIC_H__ typedef struct { int counter; } atomic_t; /* Copied from the Linux header arch/arm/include/asm/barrier.h */ #define smp_mb() asm volatile("dmb ish" : : : "memory") /* Copied from the Linux kernel header arch/arm64/include/asm/atomic.h */ static inline int atomic_read(const 
atomic_t *v) { return (*(volatile int *)&(v)->counter); } static inline void atomic_set(atomic_t *v, int i) { v->counter = i; } #define atomic_get atomic_read static inline int atomic_add_return(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile( "1: ldxr %w0, %2\n" " add %w0, %w0, %w3\n" " stlxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) : "Ir" (i) : "cc", "memory"); smp_mb(); return result; } static inline int atomic_sub_return(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile( "1: ldxr %w0, %2\n" " sub %w0, %w0, %w3\n" " stlxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) : "Ir" (i) : "cc", "memory"); smp_mb(); return result; } static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } /* true if the result is 0, or false for all other cases. 
*/ #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_dec_return(v) (atomic_sub_return(1, v)) #define atomic_inc_return(v) (atomic_add_return(1, v)) static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) { unsigned long tmp; int oldval; smp_mb(); asm volatile("// atomic_cmpxchg\n" "1: ldxr %w1, %2\n" " cmp %w1, %w3\n" " b.ne 2f\n" " stxr %w0, %w4, %2\n" " cbnz %w0, 1b\n" "2:" : "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter) : "Ir" (old), "r" (new) : "cc"); smp_mb(); return oldval; } #endif /* __CR_ATOMIC_H__ */ criu-3.6/include/common/arch/aarch64/asm/bitops.h000066400000000000000000000003401317335042600216360ustar00rootroot00000000000000#ifndef __CR_ASM_BITOPS_H__ #define __CR_ASM_BITOPS_H__ #include "common/compiler.h" #include "common/asm-generic/bitops.h" extern int test_and_set_bit(int nr, volatile unsigned long *p); #endif /* __CR_ASM_BITOPS_H__ */ criu-3.6/include/common/arch/aarch64/asm/bitsperlong.h000066400000000000000000000001671317335042600226750ustar00rootroot00000000000000#ifndef __CR_BITSPERLONG_H__ #define __CR_BITSPERLONG_H__ #define BITS_PER_LONG 64 #endif /* __CR_BITSPERLONG_H__ */ criu-3.6/include/common/arch/aarch64/asm/linkage.h000066400000000000000000000005711317335042600217560ustar00rootroot00000000000000#ifndef __CR_LINKAGE_H__ #define __CR_LINKAGE_H__ #ifdef __ASSEMBLY__ #define __ALIGN .align 4, 0x00 #define __ALIGN_STR ".align 4, 0x00" #define GLOBAL(name) \ .globl name; \ name: #define ENTRY(name) \ .globl name; \ .type name, #function; \ __ALIGN; \ name: #define END(sym) \ .size sym, . 
- sym #endif /* __ASSEMBLY__ */ #endif /* __CR_LINKAGE_H__ */ criu-3.6/include/common/arch/aarch64/asm/page.h000066400000000000000000000005631317335042600212610ustar00rootroot00000000000000#ifndef __CR_ASM_PAGE_H__ #define __CR_ASM_PAGE_H__ #include #ifndef PAGE_SHIFT # define PAGE_SHIFT 12 #endif #ifndef PAGE_SIZE # define PAGE_SIZE (1UL << PAGE_SHIFT) #endif #ifndef PAGE_MASK # define PAGE_MASK (~(PAGE_SIZE - 1)) #endif #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #define page_size() sysconf(_SC_PAGESIZE) #endif /* __CR_ASM_PAGE_H__ */ criu-3.6/include/common/arch/arm/000077500000000000000000000000001317335042600167375ustar00rootroot00000000000000criu-3.6/include/common/arch/arm/asm/000077500000000000000000000000001317335042600175175ustar00rootroot00000000000000criu-3.6/include/common/arch/arm/asm/atomic.h000066400000000000000000000051511317335042600211460ustar00rootroot00000000000000#ifndef __CR_ATOMIC_H__ #define __CR_ATOMIC_H__ #include "common/arch/arm/asm/processor.h" typedef struct { int counter; } atomic_t; /* Copied from the Linux kernel header arch/arm/include/asm/atomic.h */ #if defined(CONFIG_ARMV7) #define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory") static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) { int oldval; unsigned long res; smp_mb(); prefetchw(&ptr->counter); do { __asm__ __volatile__("@ atomic_cmpxchg\n" "ldrex %1, [%3]\n" "mov %0, #0\n" "teq %1, %4\n" "it eq\n" "strexeq %0, %5, [%3]\n" : "=&r" (res), "=&r" (oldval), "+Qo" (ptr->counter) : "r" (&ptr->counter), "Ir" (old), "r" (new) : "cc"); } while (res); smp_mb(); return oldval; } #elif defined(CONFIG_ARMV6) /* SMP isn't supported for ARMv6 */ #define smp_mb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory") static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { int ret; ret = v->counter; if (ret == old) v->counter = new; return ret; } #else #error ARM architecture version (CONFIG_ARMV*) not set or unsupported. 
#endif static inline int atomic_read(const atomic_t *v) { return (*(volatile int *)&(v)->counter); } static inline void atomic_set(atomic_t *v, int i) { v->counter = i; } #define atomic_get atomic_read static inline int atomic_add_return(int i, atomic_t *v) { unsigned long tmp; int result; smp_mb(); __asm__ __volatile__("@ atomic_add_return\n" "1: ldrex %0, [%3]\n" " add %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b\n" : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) : "r" (&v->counter), "Ir" (i) : "cc"); smp_mb(); return result; } static inline int atomic_sub_return(int i, atomic_t *v) { unsigned long tmp; int result; smp_mb(); __asm__ __volatile__("@ atomic_sub_return\n" "1: ldrex %0, [%3]\n" " sub %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b\n" : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) : "r" (&v->counter), "Ir" (i) : "cc"); smp_mb(); return result; } static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } /* true if the result is 0, or false for all other cases. 
*/ #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_dec_return(v) (atomic_sub_return(1, v)) #define atomic_inc_return(v) (atomic_add_return(1, v)) #endif /* __CR_ATOMIC_H__ */ criu-3.6/include/common/arch/arm/asm/bitops.h000066400000000000000000000003401317335042600211650ustar00rootroot00000000000000#ifndef __CR_ASM_BITOPS_H__ #define __CR_ASM_BITOPS_H__ #include "common/compiler.h" #include "common/asm-generic/bitops.h" extern int test_and_set_bit(int nr, volatile unsigned long *p); #endif /* __CR_ASM_BITOPS_H__ */ criu-3.6/include/common/arch/arm/asm/bitsperlong.h000066400000000000000000000001671317335042600222240ustar00rootroot00000000000000#ifndef __CR_BITSPERLONG_H__ #define __CR_BITSPERLONG_H__ #define BITS_PER_LONG 32 #endif /* __CR_BITSPERLONG_H__ */ criu-3.6/include/common/arch/arm/asm/linkage.h000066400000000000000000000005711317335042600213050ustar00rootroot00000000000000#ifndef __CR_LINKAGE_H__ #define __CR_LINKAGE_H__ #ifdef __ASSEMBLY__ #define __ALIGN .align 4, 0x00 #define __ALIGN_STR ".align 4, 0x00" #define GLOBAL(name) \ .globl name; \ name: #define ENTRY(name) \ .globl name; \ .type name, #function; \ __ALIGN; \ name: #define END(sym) \ .size sym, . 
- sym #endif /* __ASSEMBLY__ */ #endif /* __CR_LINKAGE_H__ */ criu-3.6/include/common/arch/arm/asm/page.h000066400000000000000000000005221317335042600206030ustar00rootroot00000000000000#ifndef __CR_ASM_PAGE_H__ #define __CR_ASM_PAGE_H__ #ifndef PAGE_SHIFT # define PAGE_SHIFT 12 #endif #ifndef PAGE_SIZE # define PAGE_SIZE (1UL << PAGE_SHIFT) #endif #ifndef PAGE_MASK # define PAGE_MASK (~(PAGE_SIZE - 1)) #endif #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #define page_size() PAGE_SIZE #endif /* __CR_ASM_PAGE_H__ */ criu-3.6/include/common/arch/arm/asm/processor.h000066400000000000000000000011341317335042600217060ustar00rootroot00000000000000#ifndef __CR_PROCESSOR_H__ #define __CR_PROCESSOR_H__ /* Copied from linux kernel arch/arm/include/asm/unified.h */ #define WASM(instr) #instr /* Copied from linux kernel arch/arm/include/asm/processor.h */ #define __ALT_SMP_ASM(smp, up) \ "9998: " smp "\n" \ " .pushsection \".alt.smp.init\", \"a\"\n" \ " .long 9998b\n" \ " " up "\n" \ " .popsection\n" static inline void prefetchw(const void *ptr) { __asm__ __volatile__( ".arch_extension mp\n" __ALT_SMP_ASM( WASM(pldw) "\t%a0", WASM(pld) "\t%a0" ) :: "p" (ptr)); } #endif /* __CR_PROCESSOR_H__ */ criu-3.6/include/common/arch/ppc64/000077500000000000000000000000001317335042600171145ustar00rootroot00000000000000criu-3.6/include/common/arch/ppc64/asm/000077500000000000000000000000001317335042600176745ustar00rootroot00000000000000criu-3.6/include/common/arch/ppc64/asm/atomic.h000066400000000000000000000047501317335042600213270ustar00rootroot00000000000000#ifndef __CR_ATOMIC_H__ #define __CR_ATOMIC_H__ /* * PowerPC atomic operations * * Copied from kernel header file arch/powerpc/include/asm/atomic.h */ typedef struct { int counter; } atomic_t; #include "common/arch/ppc64/asm/cmpxchg.h" #define PPC_ATOMIC_ENTRY_BARRIER "lwsync \n" #define PPC_ATOMIC_EXIT_BARRIER "sync \n" #define ATOMIC_INIT(i) { (i) } static __inline__ int atomic_read(const atomic_t *v) { int t; __asm__ 
__volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter)); return t; } static __inline__ void atomic_set(atomic_t *v, int i) { __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i)); } #define ATOMIC_OP(op, asm_op) \ static __inline__ void atomic_##op(int a, atomic_t *v) \ { \ int t; \ \ __asm__ __volatile__( \ "1: lwarx %0,0,%3 # atomic_" #op "\n" \ #asm_op " %0,%2,%0\n" \ " stwcx. %0,0,%3 \n" \ " bne- 1b\n" \ : "=&r" (t), "+m" (v->counter) \ : "r" (a), "r" (&v->counter) \ : "cc"); \ } \ ATOMIC_OP(add, add) ATOMIC_OP(sub, subf) #undef ATOMIC_OP static __inline__ void atomic_inc(atomic_t *v) { int t; __asm__ __volatile__( "1: lwarx %0,0,%2 # atomic_inc\n\ addic %0,%0,1\n" " stwcx. %0,0,%2 \n\ bne- 1b" : "=&r" (t), "+m" (v->counter) : "r" (&v->counter) : "cc", "xer"); } static __inline__ int atomic_inc_return(atomic_t *v) { int t; __asm__ __volatile__( PPC_ATOMIC_ENTRY_BARRIER \ "1: lwarx %0,0,%1 # atomic_inc_return\n\ addic %0,%0,1\n" " stwcx. %0,0,%1 \n\ bne- 1b \n" \ PPC_ATOMIC_EXIT_BARRIER : "=&r" (t) : "r" (&v->counter) : "cc", "xer", "memory"); return t; } /* * atomic_inc_and_test - increment and test * @v: pointer of type atomic_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ static __inline__ void atomic_dec(atomic_t *v) { int t; __asm__ __volatile__( "1: lwarx %0,0,%2 # atomic_dec\n\ addic %0,%0,-1\n" " stwcx. %0,0,%2\n\ bne- 1b" : "=&r" (t), "+m" (v->counter) : "r" (&v->counter) : "cc", "xer"); } static __inline__ int atomic_sub_return(int a, atomic_t *v) { int t; __asm__ __volatile__( " \nLWSYNC\n" "1: lwarx %0,0,%2 # atomic_sub_return\n\ subf %0,%1,%0\n" " stwcx. 
%0,0,%2 \n\ bne- 1b" " \nsync\n" : "=&r" (t) : "r" (a), "r" (&v->counter) : "cc", "memory"); return t; } #define atomic_dec_return(v) (atomic_sub_return(1, v)) #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) #endif /* __CR_ATOMIC_H__ */ criu-3.6/include/common/arch/ppc64/asm/bitops.h000066400000000000000000000156371317335042600213610ustar00rootroot00000000000000#ifndef __CR_BITOPS_H__ #define __CR_BITOPS_H__ /* * PowerPC atomic bit operations. * * Merged version by David Gibson . * Based on ppc64 versions by: Dave Engebretsen, Todd Inglett, Don * Reed, Pat McCarthy, Peter Bergner, Anton Blanchard. They * originally took it from the ppc32 code. * * Within a word, bits are numbered LSB first. Lot's of places make * this assumption by directly testing bits with (val & (1< 1 word) bitmaps on a * big-endian system because, unlike little endian, the number of each * bit depends on the word size. * * The bitop functions are defined to work on unsigned longs, so for a * ppc64 system the bits end up numbered: * |63..............0|127............64|191...........128|255...........192| * and on ppc32: * |31.....0|63....32|95....64|127...96|159..128|191..160|223..192|255..224| * * There are a few little-endian macros used mostly for filesystem * bitmaps, these work on similar bit arrays layouts, but * byte-oriented: * |7...0|15...8|23...16|31...24|39...32|47...40|55...48|63...56| * * The main difference is that bit 3-5 (64b) or 3-4 (32b) in the bit * number field needs to be reversed compared to the big-endian bit * fields. This can be achieved by XOR with 0x38 (64b) or 0x18 (32b). * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* * -- * Copied from the kernel file arch/powerpc/include/asm/bitops.h */ #include "common/compiler.h" #include "common/asm/bitsperlong.h" #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] #define __stringify_in_c(...) #__VA_ARGS__ #define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) /* PPC bit number conversion */ #define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) #define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) #define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) #define PPC_INST_LDARX 0x7c0000a8 #define ___PPC_RA(a) (((a) & 0x1f) << 16) #define ___PPC_RB(b) (((b) & 0x1f) << 11) #define ___PPC_RS(s) (((s) & 0x1f) << 21) #define __PPC_EH(eh) (((eh) & 0x1) << 0) #define ___PPC_RT(t) ___PPC_RS(t) #define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \ ___PPC_RT(t) | ___PPC_RA(a) | \ ___PPC_RB(b) | __PPC_EH(eh)) #define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) /* Macro for generating the ***_bits() functions */ #define DEFINE_BITOP(fn, op) \ static __inline__ void fn(unsigned long mask, \ volatile unsigned long *_p) \ { \ unsigned long old; \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ "1: ldarx %0,0,%3\n" \ stringify_in_c(op) "%0,%0,%2\n" \ "stdcx. 
%0,0,%3\n" \ "bne- 1b\n" \ : "=&r" (old), "+m" (*p) \ : "r" (mask), "r" (p) \ : "cc", "memory"); \ } DEFINE_BITOP(set_bits, or) DEFINE_BITOP(clear_bits, andc) DEFINE_BITOP(change_bits, xor) static __inline__ void set_bit(int nr, volatile unsigned long *addr) { set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); } static __inline__ void clear_bit(int nr, volatile unsigned long *addr) { clear_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); } static __inline__ void change_bit(int nr, volatile unsigned long *addr) { change_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); } static inline int test_bit(int nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } /* Like DEFINE_BITOP(), with changes to the arguments to 'op' and the output * operands. */ #define DEFINE_TESTOP(fn, op, prefix, postfix, eh) \ static __inline__ unsigned long fn( \ unsigned long mask, \ volatile unsigned long *_p) \ { \ unsigned long old, t; \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ "1:" PPC_LLARX(%0,0,%3,eh) "\n" \ stringify_in_c(op) "%1,%0,%2\n" \ "stdcx. %1,0,%3\n" \ "bne- 1b\n" \ postfix \ : "=&r" (old), "=&r" (t) \ : "r" (mask), "r" (p) \ : "cc", "memory"); \ return (old & mask); \ } DEFINE_TESTOP(test_and_set_bits, or, "\nLWSYNC\n", "\nsync\n", 0) static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { return test_and_set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)) != 0; } /* * Return the zero-based bit position (LE, not IBM bit numbering) of * the most significant 1-bit in a double word. */ static __inline__ __attribute__((const)) int __ilog2(unsigned long x) { int lz; asm ("cntlzd %0,%1" : "=r" (lz) : "r" (x)); return BITS_PER_LONG - 1 - lz; } static __inline__ unsigned long __ffs(unsigned long x) { return __ilog2(x & -x); } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /* * Find the next set bit in a memory region. 
*/ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) goto found_first; if (tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { if ((tmp = *(p++))) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: return result + __ffs(tmp); } #define for_each_bit(i, bitmask) \ for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ i < sizeof(bitmask); \ i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ criu-3.6/include/common/arch/ppc64/asm/bitsperlong.h000066400000000000000000000001671317335042600224010ustar00rootroot00000000000000#ifndef __CR_BITSPERLONG_H__ #define __CR_BITSPERLONG_H__ #define BITS_PER_LONG 64 #endif /* __CR_BITSPERLONG_H__ */ criu-3.6/include/common/arch/ppc64/asm/cmpxchg.h000066400000000000000000000040241317335042600214760ustar00rootroot00000000000000#ifndef __CR_CMPXCHG_H__ #define __CR_CMPXCHG_H__ /* * Copied from kernel header file arch/powerpc/include/asm/cmpxchg.h */ #define PPC_ACQUIRE_BARRIER "isync \n" #define PPC_RELEASE_BARRIER "lwsync \n" /* * Compare and exchange - if *p == old, set it to new, * and return the old value of *p. */ static __always_inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) { unsigned int prev; __asm__ __volatile__ ( PPC_RELEASE_BARRIER \ "1: lwarx %0,0,%2 # __cmpxchg_u32\n\ cmpw 0,%0,%3\n\ bne- 2f\n" " stwcx. 
%4,0,%2\n\ bne- 1b \n" \ PPC_ACQUIRE_BARRIER "\n\ 2:" : "=&r" (prev), "+m" (*p) : "r" (p), "r" (old), "r" (new) : "cc", "memory"); return prev; } static __always_inline unsigned long __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new) { unsigned long prev; __asm__ __volatile__ ( PPC_RELEASE_BARRIER \ "1: ldarx %0,0,%2 # __cmpxchg_u64\n\ cmpd 0,%0,%3\n\ bne- 2f\n\ stdcx. %4,0,%2\n\ bne- 1b \n" \ PPC_ACQUIRE_BARRIER "\n\ 2:" : "=&r" (prev), "+m" (*p) : "r" (p), "r" (old), "r" (new) : "cc", "memory"); return prev; } /* This function doesn't exist, so you'll get a linker error if something tries to do an invalid cmpxchg(). */ #ifdef CR_DEBUG static inline void __cmpxchg_called_with_bad_pointer(void) { __asm__ __volatile__ ( "1: twi 31,0,0 # trap\n" " b 1b" : : : "memory"); } #else extern void __cmpxchg_called_with_bad_pointer(void); #endif static __always_inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, unsigned int size) { switch (size) { case 4: return __cmpxchg_u32(ptr, old, new); case 8: return __cmpxchg_u64(ptr, old, new); } __cmpxchg_called_with_bad_pointer(); return old; } #define cmpxchg(ptr, o, n) \ ({ \ __typeof__(*(ptr)) _o_ = (o); \ __typeof__(*(ptr)) _n_ = (n); \ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ (unsigned long)_n_, sizeof(*(ptr))); \ }) #endif /* __CR_CMPXCHG_H__ */ criu-3.6/include/common/arch/ppc64/asm/linkage.h000066400000000000000000000120771317335042600214660ustar00rootroot00000000000000/* * Various PowerPc assembly definitions * * Copied from the kernel file arch/powerpc/include/asm/ppc_asm.h * * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan. */ #ifndef __CR_LINKAGE_H__ #define __CR_LINKAGE_H__ #ifdef __ASSEMBLY__ #define GLOBAL(name) \ .globl name; \ name: #define ENTRY(name) \ .globl name; \ .type name, @function; \ name: #define END(sym) \ .size sym, . 
- sym #define STACKFRAMESIZE 256 #define __STK_REG(i) (112 + ((i)-14)*8) #define STK_REG(i) __STK_REG(__REG_##i) /* The boring bits... */ /* Condition Register Bit Fields */ #define cr0 0 #define cr1 1 #define cr2 2 #define cr3 3 #define cr4 4 #define cr5 5 #define cr6 6 #define cr7 7 /* * General Purpose Registers (GPRs) * * The lower case r0-r31 should be used in preference to the upper * case R0-R31 as they provide more error checking in the assembler. * Use R0-31 only when really nessesary. */ #define r0 %r0 #define r1 %r1 #define r2 %r2 #define r3 %r3 #define r4 %r4 #define r5 %r5 #define r6 %r6 #define r7 %r7 #define r8 %r8 #define r9 %r9 #define r10 %r10 #define r11 %r11 #define r12 %r12 #define r13 %r13 #define r14 %r14 #define r15 %r15 #define r16 %r16 #define r17 %r17 #define r18 %r18 #define r19 %r19 #define r20 %r20 #define r21 %r21 #define r22 %r22 #define r23 %r23 #define r24 %r24 #define r25 %r25 #define r26 %r26 #define r27 %r27 #define r28 %r28 #define r29 %r29 #define r30 %r30 #define r31 %r31 /* Floating Point Registers (FPRs) */ #define fr0 0 #define fr1 1 #define fr2 2 #define fr3 3 #define fr4 4 #define fr5 5 #define fr6 6 #define fr7 7 #define fr8 8 #define fr9 9 #define fr10 10 #define fr11 11 #define fr12 12 #define fr13 13 #define fr14 14 #define fr15 15 #define fr16 16 #define fr17 17 #define fr18 18 #define fr19 19 #define fr20 20 #define fr21 21 #define fr22 22 #define fr23 23 #define fr24 24 #define fr25 25 #define fr26 26 #define fr27 27 #define fr28 28 #define fr29 29 #define fr30 30 #define fr31 31 /* AltiVec Registers (VPRs) */ #define vr0 0 #define vr1 1 #define vr2 2 #define vr3 3 #define vr4 4 #define vr5 5 #define vr6 6 #define vr7 7 #define vr8 8 #define vr9 9 #define vr10 10 #define vr11 11 #define vr12 12 #define vr13 13 #define vr14 14 #define vr15 15 #define vr16 16 #define vr17 17 #define vr18 18 #define vr19 19 #define vr20 20 #define vr21 21 #define vr22 22 #define vr23 23 #define vr24 24 #define vr25 25 #define vr26 26 
#define vr27 27 #define vr28 28 #define vr29 29 #define vr30 30 #define vr31 31 /* VSX Registers (VSRs) */ #define vsr0 0 #define vsr1 1 #define vsr2 2 #define vsr3 3 #define vsr4 4 #define vsr5 5 #define vsr6 6 #define vsr7 7 #define vsr8 8 #define vsr9 9 #define vsr10 10 #define vsr11 11 #define vsr12 12 #define vsr13 13 #define vsr14 14 #define vsr15 15 #define vsr16 16 #define vsr17 17 #define vsr18 18 #define vsr19 19 #define vsr20 20 #define vsr21 21 #define vsr22 22 #define vsr23 23 #define vsr24 24 #define vsr25 25 #define vsr26 26 #define vsr27 27 #define vsr28 28 #define vsr29 29 #define vsr30 30 #define vsr31 31 #define vsr32 32 #define vsr33 33 #define vsr34 34 #define vsr35 35 #define vsr36 36 #define vsr37 37 #define vsr38 38 #define vsr39 39 #define vsr40 40 #define vsr41 41 #define vsr42 42 #define vsr43 43 #define vsr44 44 #define vsr45 45 #define vsr46 46 #define vsr47 47 #define vsr48 48 #define vsr49 49 #define vsr50 50 #define vsr51 51 #define vsr52 52 #define vsr53 53 #define vsr54 54 #define vsr55 55 #define vsr56 56 #define vsr57 57 #define vsr58 58 #define vsr59 59 #define vsr60 60 #define vsr61 61 #define vsr62 62 #define vsr63 63 /* SPE Registers (EVPRs) */ #define evr0 0 #define evr1 1 #define evr2 2 #define evr3 3 #define evr4 4 #define evr5 5 #define evr6 6 #define evr7 7 #define evr8 8 #define evr9 9 #define evr10 10 #define evr11 11 #define evr12 12 #define evr13 13 #define evr14 14 #define evr15 15 #define evr16 16 #define evr17 17 #define evr18 18 #define evr19 19 #define evr20 20 #define evr21 21 #define evr22 22 #define evr23 23 #define evr24 24 #define evr25 25 #define evr26 26 #define evr27 27 #define evr28 28 #define evr29 29 #define evr30 30 #define evr31 31 /* some stab codes */ #define N_FUN 36 #define N_RSYM 64 #define N_SLINE 68 #define N_SO 100 #define __REG_R0 0 #define __REG_R1 1 #define __REG_R2 2 #define __REG_R3 3 #define __REG_R4 4 #define __REG_R5 5 #define __REG_R6 6 #define __REG_R7 7 #define __REG_R8 8 #define 
__REG_R9 9 #define __REG_R10 10 #define __REG_R11 11 #define __REG_R12 12 #define __REG_R13 13 #define __REG_R14 14 #define __REG_R15 15 #define __REG_R16 16 #define __REG_R17 17 #define __REG_R18 18 #define __REG_R19 19 #define __REG_R20 20 #define __REG_R21 21 #define __REG_R22 22 #define __REG_R23 23 #define __REG_R24 24 #define __REG_R25 25 #define __REG_R26 26 #define __REG_R27 27 #define __REG_R28 28 #define __REG_R29 29 #define __REG_R30 30 #define __REG_R31 31 #endif /* __ASSEMBLY__ */ #endif /* __CR_LINKAGE_H__ */ criu-3.6/include/common/arch/ppc64/asm/page.h000066400000000000000000000007501317335042600207630ustar00rootroot00000000000000#ifndef __CR_ASM_PAGE_H__ #define __CR_ASM_PAGE_H__ #include /* * Default config for Pseries is to use 64K pages. * See kernel file arch/powerpc/configs/pseries_*defconfig */ #ifndef PAGE_SHIFT # define PAGE_SHIFT 16 #endif #ifndef PAGE_SIZE # define PAGE_SIZE (1UL << PAGE_SHIFT) #endif #ifndef PAGE_MASK # define PAGE_MASK (~(PAGE_SIZE - 1)) #endif #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #define page_size() sysconf(_SC_PAGESIZE) #endif /* __CR_ASM_PAGE_H__ */ criu-3.6/include/common/arch/s390/000077500000000000000000000000001317335042600166565ustar00rootroot00000000000000criu-3.6/include/common/arch/s390/asm/000077500000000000000000000000001317335042600174365ustar00rootroot00000000000000criu-3.6/include/common/arch/s390/asm/atomic.h000066400000000000000000000026201317335042600210630ustar00rootroot00000000000000#ifndef __ARCH_S390_ATOMIC__ #define __ARCH_S390_ATOMIC__ #include "common/arch/s390/asm/atomic_ops.h" #include "common/compiler.h" #define ATOMIC_INIT(i) { (i) } typedef struct { int counter; } atomic_t; static inline int atomic_read(const atomic_t *v) { int c; asm volatile( " l %0,%1\n" : "=d" (c) : "Q" (v->counter)); return c; } static inline void atomic_set(atomic_t *v, int i) { asm volatile( " st %1,%0\n" : "=Q" (v->counter) : "d" (i)); } static inline int atomic_add_return(int i, atomic_t *v) { return 
__atomic_add_barrier(i, &v->counter) + i; } static inline void atomic_add(int i, atomic_t *v) { __atomic_add(i, &v->counter); } #define atomic_inc(_v) atomic_add(1, _v) #define atomic_inc_return(_v) atomic_add_return(1, _v) #define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v) #define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v) #define atomic_dec(_v) atomic_sub(1, _v) #define atomic_dec_return(_v) atomic_sub_return(1, _v) #define atomic_dec_and_test(_v) (atomic_sub_return(1, _v) == 0) #define ATOMIC_OPS(op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ __atomic_##op(i, &v->counter); \ } \ ATOMIC_OPS(and) ATOMIC_OPS(or) ATOMIC_OPS(xor) #undef ATOMIC_OPS static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return __atomic_cmpxchg(&v->counter, old, new); } #endif /* __ARCH_S390_ATOMIC__ */ criu-3.6/include/common/arch/s390/asm/atomic_ops.h000066400000000000000000000036471317335042600217560ustar00rootroot00000000000000#ifndef __ARCH_S390_ATOMIC_OPS__ #define __ARCH_S390_ATOMIC_OPS__ #define __ATOMIC_OP(op_name, op_string) \ static inline int op_name(int val, int *ptr) \ { \ int old, new; \ \ asm volatile( \ "0: lr %[new],%[old]\n" \ op_string " %[new],%[val]\n" \ " cs %[old],%[new],%[ptr]\n" \ " jl 0b" \ : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ return old; \ } #define __ATOMIC_OPS(op_name, op_string) \ __ATOMIC_OP(op_name, op_string) \ __ATOMIC_OP(op_name##_barrier, op_string) __ATOMIC_OPS(__atomic_add, "ar") __ATOMIC_OPS(__atomic_and, "nr") __ATOMIC_OPS(__atomic_or, "or") __ATOMIC_OPS(__atomic_xor, "xr") #undef __ATOMIC_OPS #define __ATOMIC64_OP(op_name, op_string) \ static inline long op_name(long val, long *ptr) \ { \ long old, new; \ \ asm volatile( \ "0: lgr %[new],%[old]\n" \ op_string " %[new],%[val]\n" \ " csg %[old],%[new],%[ptr]\n" \ " jl 0b" \ : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ 
return old; \ } #define __ATOMIC64_OPS(op_name, op_string) \ __ATOMIC64_OP(op_name, op_string) \ __ATOMIC64_OP(op_name##_barrier, op_string) __ATOMIC64_OPS(__atomic64_add, "agr") __ATOMIC64_OPS(__atomic64_and, "ngr") __ATOMIC64_OPS(__atomic64_or, "ogr") __ATOMIC64_OPS(__atomic64_xor, "xgr") #undef __ATOMIC64_OPS static inline int __atomic_cmpxchg(int *ptr, int old, int new) { asm volatile( " cs %[old],%[new],%[ptr]" : [old] "+d" (old), [ptr] "+Q" (*ptr) : [new] "d" (new) : "cc", "memory"); return old; } static inline long __atomic64_cmpxchg(long *ptr, long old, long new) { asm volatile( " csg %[old],%[new],%[ptr]" : [old] "+d" (old), [ptr] "+Q" (*ptr) : [new] "d" (new) : "cc", "memory"); return old; } #endif /* __ARCH_S390_ATOMIC_OPS__ */ criu-3.6/include/common/arch/s390/asm/bitops.h000066400000000000000000000073401317335042600211130ustar00rootroot00000000000000#ifndef _S390_BITOPS_H #define _S390_BITOPS_H #include "common/asm/bitsperlong.h" #include "common/compiler.h" #include "common/arch/s390/asm/atomic_ops.h" #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] static inline unsigned long * __bitops_word(unsigned long nr, volatile unsigned long *ptr) { unsigned long addr; addr = (unsigned long)ptr + ((nr ^ (nr & (BITS_PER_LONG - 1))) >> 3); return (unsigned long *)addr; } static inline unsigned char * __bitops_byte(unsigned long nr, volatile unsigned long *ptr) { return ((unsigned char *)ptr) + ((nr ^ (BITS_PER_LONG - 8)) >> 3); } static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); unsigned long mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); __atomic64_or((long) mask, (long *) addr); } static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = 
__bitops_word(nr, ptr); unsigned long mask; mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); __atomic64_and((long) mask, (long *) addr); } static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); unsigned long mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); __atomic64_xor((long) mask, (long *) addr); } static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); old = __atomic64_or_barrier((long) mask, (long *) addr); return (old & mask) != 0; } static inline int test_bit(unsigned long nr, const volatile unsigned long *ptr) { const volatile unsigned char *addr; addr = ((const volatile unsigned char *)ptr); addr += (nr ^ (BITS_PER_LONG - 8)) >> 3; return (*addr >> (nr & 7)) & 1; } static inline unsigned char __flogr(unsigned long word) { if (__builtin_constant_p(word)) { unsigned long bit = 0; if (!word) return 64; if (!(word & 0xffffffff00000000UL)) { word <<= 32; bit += 32; } if (!(word & 0xffff000000000000UL)) { word <<= 16; bit += 16; } if (!(word & 0xff00000000000000UL)) { word <<= 8; bit += 8; } if (!(word & 0xf000000000000000UL)) { word <<= 4; bit += 4; } if (!(word & 0xc000000000000000UL)) { word <<= 2; bit += 2; } if (!(word & 0x8000000000000000UL)) { word <<= 1; bit += 1; } return bit; } else { return __builtin_clzl(word); } } static inline unsigned long __ffs(unsigned long word) { return __flogr(-word & word) ^ (BITS_PER_LONG - 1); } #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) static inline unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start, unsigned long invert) { unsigned long tmp; if (!nbits || start >= nbits) return nbits; tmp = addr[start / BITS_PER_LONG] ^ invert; tmp &= BITMAP_FIRST_WORD_MASK(start); start = round_down(start, BITS_PER_LONG); while (!tmp) { start += 
BITS_PER_LONG; if (start >= nbits) return nbits; tmp = addr[start / BITS_PER_LONG] ^ invert; } return min(start + __ffs(tmp), nbits); } static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { return _find_next_bit(addr, size, offset, 0UL); } #define for_each_bit(i, bitmask) \ for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ i < sizeof(bitmask); \ i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* _S390_BITOPS_H */ criu-3.6/include/common/arch/s390/asm/bitsperlong.h000066400000000000000000000001671317335042600221430ustar00rootroot00000000000000#ifndef __CR_BITSPERLONG_H__ #define __CR_BITSPERLONG_H__ #define BITS_PER_LONG 64 #endif /* __CR_BITSPERLONG_H__ */ criu-3.6/include/common/arch/s390/asm/linkage.h000066400000000000000000000004731317335042600212250ustar00rootroot00000000000000#ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H #ifdef __ASSEMBLY__ #define __ALIGN .align 4, 0x07 #define GLOBAL(name) \ .globl name; \ name: #define ENTRY(name) \ .globl name; \ .type name, @function; \ __ALIGN; \ name: #define END(name) \ .size name, . 
- name

#endif /* __ASSEMBLY__ */
#endif
criu-3.6/include/common/arch/s390/asm/page.h000066400000000000000000000005171317335042600205260ustar00rootroot00000000000000
#ifndef __CR_ASM_PAGE_H__
#define __CR_ASM_PAGE_H__

/*
 * Page geometry for s390. Every value can be overridden by defining the
 * macro before this header is included; the defaults describe 4 KiB pages.
 */
#ifndef PAGE_SHIFT
#define PAGE_SHIFT	12
#endif

#ifndef PAGE_SIZE
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#endif

#ifndef PAGE_MASK
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#endif

/* Page frame number of @addr: the address divided by the page size. */
#define PAGE_PFN(addr)	((addr) / PAGE_SIZE)
/* Compile-time constant here, kept call-shaped to match other arches. */
#define page_size()	PAGE_SIZE

#endif /* __CR_ASM_PAGE_H__ */
criu-3.6/include/common/arch/x86/000077500000000000000000000000001317335042600166055ustar00rootroot00000000000000
criu-3.6/include/common/arch/x86/asm/000077500000000000000000000000001317335042600173655ustar00rootroot00000000000000
criu-3.6/include/common/arch/x86/asm/atomic.h000066400000000000000000000027101317335042600210120ustar00rootroot00000000000000
#ifndef __CR_ATOMIC_H__
#define __CR_ATOMIC_H__

/* Provides LOCK_PREFIX, xadd() and cmpxchg() used below. */
#include "common/arch/x86/asm/cmpxchg.h"

/* Integer counter that is only meant to be accessed via atomic_*(). */
typedef struct {
	int counter;
} atomic_t;

/* Static initializer: atomic_t v = ATOMIC_INIT(0); */
#define ATOMIC_INIT(i)	{ (i) }

/*
 * atomic_read - return the current value of @v.
 *
 * The load goes through a volatile-qualified pointer so the compiler
 * has to perform it on every call.
 */
static inline int atomic_read(const atomic_t *v)
{
	return (*(volatile int *)&(v)->counter);
}

/* atomic_set - store @i into @v with a plain assignment. */
static inline void atomic_set(atomic_t *v, int i)
{
	v->counter = i;
}

/* atomic_add - add @i to @v using a lock-prefixed "addl". */
static inline void atomic_add(int i, atomic_t *v)
{
	asm volatile(LOCK_PREFIX "addl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i));
}

/* atomic_sub - subtract @i from @v using a lock-prefixed "subl". */
static inline void atomic_sub(int i, atomic_t *v)
{
	asm volatile(LOCK_PREFIX "subl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i));
}

/* atomic_inc - increment @v by one using a lock-prefixed "incl". */
static inline void atomic_inc(atomic_t *v)
{
	asm volatile(LOCK_PREFIX "incl %0"
		     : "+m" (v->counter));
}

/* atomic_dec - decrement @v by one using a lock-prefixed "decl". */
static inline void atomic_dec(atomic_t *v)
{
	asm volatile(LOCK_PREFIX "decl %0"
		     : "+m" (v->counter));
}

/*
 * atomic_dec_and_test - decrement @v and report whether it hit zero.
 *
 * "sete" captures the zero flag left by the decrement, so the return
 * value is 1 when the new counter value is 0 and 0 otherwise.
 */
static inline int atomic_dec_and_test(atomic_t *v)
{
	unsigned char c;

	asm volatile(LOCK_PREFIX "decl %0; sete %1"
		     : "+m" (v->counter), "=qm" (c)
		     : : "memory");
	return c != 0;
}

/*
 * atomic_add_return - add @i to @v and return the new value.
 *
 * xadd() yields the value the counter held before the addition (see
 * the comment on __xchg_op in cmpxchg.h), hence the "i +" to obtain
 * the updated value.
 */
static inline int atomic_add_return(int i, atomic_t *v)
{
	return i + xadd(&v->counter, i);
}

/* atomic_sub_return - subtract @i from @v and return the new value. */
static inline int atomic_sub_return(int i, atomic_t *v)
{
	return atomic_add_return(-i, v);
}
#define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); } #endif /* __CR_ATOMIC_H__ */ criu-3.6/include/common/arch/x86/asm/bitops.h000066400000000000000000000060321317335042600210370ustar00rootroot00000000000000#ifndef __CR_BITOPS_H__ #define __CR_BITOPS_H__ #include "common/arch/x86/asm/cmpxchg.h" #include "common/asm/bitsperlong.h" #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) \ unsigned long name[BITS_TO_LONGS(bits)] #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc versions. */ #define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) #else #define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) #endif #define ADDR BITOP_ADDR(addr) static inline void set_bit(int nr, volatile unsigned long *addr) { asm volatile("btsl %1,%0" : ADDR : "Ir" (nr) : "memory"); } static inline void change_bit(int nr, volatile unsigned long *addr) { asm volatile("btcl %1,%0" : ADDR : "Ir" (nr)); } static inline int test_bit(int nr, volatile const unsigned long *addr) { int oldbit; asm volatile("bt %2,%1\n\t" "sbb %0,%0" : "=r" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; } static inline void clear_bit(int nr, volatile unsigned long *addr) { asm volatile("btrl %1,%0" : ADDR : "Ir" (nr)); } /** * test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ static inline int test_and_set_bit(int nr, volatile unsigned long *addr) { int oldbit; asm volatile(LOCK_PREFIX "bts %2,%1\n\t" "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); return oldbit; } /** * __ffs - find first set bit in word * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static inline unsigned long __ffs(unsigned long word) { asm("bsf %1,%0" : "=r" (word) : "rm" (word)); return word; } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /* * Find the next set bit in a memory region. */ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) goto found_first; if (tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { if ((tmp = *(p++))) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. 
*/ found_middle: return result + __ffs(tmp); } #define for_each_bit(i, bitmask) \ for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ i < sizeof(bitmask); \ i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ criu-3.6/include/common/arch/x86/asm/bitsperlong.h000066400000000000000000000002641317335042600220700ustar00rootroot00000000000000#ifndef __CR_BITSPERLONG_H__ #define __CR_BITSPERLONG_H__ #ifdef CONFIG_X86_64 # define BITS_PER_LONG 64 #else # define BITS_PER_LONG 32 #endif #endif /* __CR_BITSPERLONG_H__ */ criu-3.6/include/common/arch/x86/asm/cmpxchg.h000066400000000000000000000060731317335042600211750ustar00rootroot00000000000000#ifndef __CR_CMPXCHG_H__ #define __CR_CMPXCHG_H__ #include #define LOCK_PREFIX "\n\tlock; " #define __X86_CASE_B 1 #define __X86_CASE_W 2 #define __X86_CASE_L 4 #define __X86_CASE_Q 8 /* * An exchange-type operation, which takes a value and a pointer, and * returns the old value. Make sure you never reach non-case statement * here, otherwise behaviour is undefined. */ #define __xchg_op(ptr, arg, op, lock) \ ({ \ __typeof__ (*(ptr)) __ret = (arg); \ switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ asm volatile (lock #op "b %b0, %1\n" \ : "+q" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_W: \ asm volatile (lock #op "w %w0, %1\n" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_L: \ asm volatile (lock #op "l %0, %1\n" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_Q: \ asm volatile (lock #op "q %q0, %1\n" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ } \ __ret; \ }) #define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) #define xadd(ptr, inc) __xadd((ptr), (inc), "lock ;") /* Borrowed from linux kernel arch/x86/include/asm/cmpxchg.h */ /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. Return the initial value in MEM. 
Success is * indicated by comparing RETURN with OLD. */ #define __raw_cmpxchg(ptr, old, new, size, lock) \ ({ \ __typeof__(*(ptr)) __ret; \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case __X86_CASE_B: \ { \ volatile uint8_t *__ptr = (volatile uint8_t *)(ptr); \ asm volatile(lock "cmpxchgb %2,%1" \ : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ break; \ } \ case __X86_CASE_W: \ { \ volatile uint16_t *__ptr = (volatile uint16_t *)(ptr); \ asm volatile(lock "cmpxchgw %2,%1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ } \ case __X86_CASE_L: \ { \ volatile uint32_t *__ptr = (volatile uint32_t *)(ptr); \ asm volatile(lock "cmpxchgl %2,%1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ } \ case __X86_CASE_Q: \ { \ volatile uint64_t *__ptr = (volatile uint64_t *)(ptr); \ asm volatile(lock "cmpxchgq %2,%1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ } \ } \ __ret; \ }) #define __cmpxchg(ptr, old, new, size) \ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) #define cmpxchg(ptr, old, new) \ __cmpxchg(ptr, old, new, sizeof(*(ptr))) #endif /* __CR_CMPXCHG_H__ */ criu-3.6/include/common/arch/x86/asm/linkage.h000066400000000000000000000006521317335042600211530ustar00rootroot00000000000000#ifndef __CR_LINKAGE_H__ #define __CR_LINKAGE_H__ #ifdef __ASSEMBLY__ #define __ALIGN .align 4, 0x90 #define __ALIGN_STR ".align 4, 0x90" #define GLOBAL(name) \ .globl name; \ name: #define ENTRY(name) \ .globl name; \ .type name, @function; \ __ALIGN; \ name: #define END(sym) \ .size sym, . 
- sym #endif /* __ASSEMBLY__ */ #define __USER32_CS 0x23 #define __USER_CS 0x33 #endif /* __CR_LINKAGE_H__ */ criu-3.6/include/common/arch/x86/asm/page.h000066400000000000000000000005221317335042600204510ustar00rootroot00000000000000#ifndef __CR_ASM_PAGE_H__ #define __CR_ASM_PAGE_H__ #ifndef PAGE_SHIFT # define PAGE_SHIFT 12 #endif #ifndef PAGE_SIZE # define PAGE_SIZE (1UL << PAGE_SHIFT) #endif #ifndef PAGE_MASK # define PAGE_MASK (~(PAGE_SIZE - 1)) #endif #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #define page_size() PAGE_SIZE #endif /* __CR_ASM_PAGE_H__ */ criu-3.6/include/common/asm-generic/000077500000000000000000000000001317335042600174355ustar00rootroot00000000000000criu-3.6/include/common/asm-generic/bitops.h000066400000000000000000000054521317335042600211140ustar00rootroot00000000000000/* * Generic bits operations. * * Architectures that don't want their own implementation of those, * should include this file into the arch/$ARCH/include/asm/bitops.h */ #ifndef __CR_GENERIC_BITOPS_H__ #define __CR_GENERIC_BITOPS_H__ #include "common/asm/bitsperlong.h" #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) \ unsigned long name[BITS_TO_LONGS(bits)] #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc versions. */ #define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) #else #define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) #endif #define ADDR BITOP_ADDR(addr) static inline void set_bit(int nr, volatile unsigned long *addr) { addr += nr / BITS_PER_LONG; *addr |= (1 << (nr % BITS_PER_LONG)); } static inline void change_bit(int nr, volatile unsigned long *addr) { addr += nr / BITS_PER_LONG; *addr ^= (1 << (nr % BITS_PER_LONG)); } static inline int test_bit(int nr, volatile const unsigned long *addr) { addr += nr / BITS_PER_LONG; return (*addr & (1 << (nr % BITS_PER_LONG))) ? 
-1 : 0; } static inline void clear_bit(int nr, volatile unsigned long *addr) { addr += nr / BITS_PER_LONG; *addr &= ~(1 << (nr % BITS_PER_LONG)); } /** * __ffs - find first set bit in word * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static inline unsigned long __ffs(unsigned long word) { int p = 0; for (; p < 8*sizeof(word); ++p) { if (word & 1) { break; } word >>= 1; } return p; } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /* * Find the next set bit in a memory region. */ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) goto found_first; if (tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { if ((tmp = *(p++))) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. 
*/ found_middle: return result + __ffs(tmp); } #define for_each_bit(i, bitmask) \ for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ i < sizeof(bitmask); \ i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* __CR_GENERIC_BITOPS_H__ */ criu-3.6/include/common/bitops.h000066400000000000000000000007571317335042600167250ustar00rootroot00000000000000#ifndef __CR_COMMON_BITOPS_H__ #define __CR_COMMON_BITOPS_H__ #include "common/asm/bitops.h" #include "common/bitsperlong.h" #include #if __BYTE_ORDER == __BIG_ENDIAN #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #else #define BITOP_LE_SWIZZLE 0 #endif static inline int test_and_set_bit_le(int nr, void *addr) { return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void clear_bit_le(int nr, void *addr) { clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } #endif criu-3.6/include/common/bitsperlong.h000066400000000000000000000001631317335042600177440ustar00rootroot00000000000000#ifndef __CR_COMMON_BITSPERLONG_H__ #define __CR_COMMON_BITSPERLONG_H__ #include "common/asm/bitsperlong.h" #endif criu-3.6/include/common/bug.h000066400000000000000000000014441317335042600161740ustar00rootroot00000000000000#ifndef __CR_BUG_H__ #define __CR_BUG_H__ #include #include #include "common/compiler.h" #ifndef BUG_ON_HANDLER #ifdef CR_NOGLIBC # define __raise() #else # define __raise() raise(SIGABRT) #endif #ifndef __clang_analyzer__ # ifndef pr_err # error pr_err macro must be defined # endif # define BUG_ON_HANDLER(condition) \ do { \ if ((condition)) { \ pr_err("BUG at %s:%d\n", __FILE__, __LINE__); \ __raise(); \ *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__; \ } \ } while (0) #else # define BUG_ON_HANDLER(condition) \ do { \ assert(!condition); \ } while (0) #endif #endif /* BUG_ON_HANDLER */ #define BUG_ON(condition) BUG_ON_HANDLER((condition)) #define BUG() BUG_ON(true) #endif /* __CR_BUG_H__ */ 
criu-3.6/include/common/compiler.h000066400000000000000000000047361317335042600172400ustar00rootroot00000000000000
#ifndef __CR_COMPILER_H__
#define __CR_COMPILER_H__

/*
 * Various definitions needed for a successful build,
 * picked from various places, mostly from
 * the linux kernel.
 */

/* Element count of a true array (not of a pointer). */
#define ARRAY_SIZE(x)		(sizeof(x) / sizeof((x)[0]))
/*
 * Compile-time assertion: a non-zero @condition yields a char array of
 * negative size, which the compiler rejects.
 */
#define BUILD_BUG_ON(condition)	((void)sizeof(char[1 - 2*!!(condition)]))

/* Assign @b to @a, casting it to the type of @a first. */
#define ASSIGN_TYPED(a, b)	do { (a) = (typeof(a))(b); } while (0)
/* Copy member @m from *@b to *@a with the cast done by ASSIGN_TYPED. */
#define ASSIGN_MEMBER(a, b, m)	do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)

/*
 * Two levels so that a macro passed as the argument is expanded
 * before it is turned into a string.
 */
#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

/* Short spellings for GCC attributes. */
#define NORETURN		__attribute__((__noreturn__))
#define __packed		__attribute__((__packed__))
#define __used			__attribute__((__used__))
#define __maybe_unused		__attribute__((unused))
#define __always_unused		__attribute__((unused))

/* Place the symbol into section @S (the name is stringified here). */
#define __section(S)		__attribute__ ((__section__(#S)))

#ifndef __always_inline
# define __always_inline	inline __attribute__((always_inline))
#endif

/* Branch-probability hints; both evaluate to !!(x). */
#define likely(x)		__builtin_expect(!!(x), 1)
#define unlikely(x)		__builtin_expect(!!(x), 0)

#ifndef always_inline
# define always_inline		__always_inline
#endif

#ifndef noinline
# define noinline		__attribute__((noinline))
#endif

#define __aligned(x)		__attribute__((aligned(x)))

/*
 * Macro to define stack alignment.
 * aarch64 requires stack to be aligned to 16 bytes.
 */
#define __stack_aligned__	__attribute__((aligned(16)))

#ifndef offsetof
# define offsetof(TYPE, MEMBER)	((size_t) &((TYPE *)0)->MEMBER)
#endif

/* Compiler-only barrier: emits no instruction, clobbers "memory". */
#define barrier()		asm volatile("" ::: "memory")

/*
 * Given @ptr, a pointer to the field @member inside a @type object,
 * return a pointer to the containing object. The typed temporary makes
 * the compiler check that @ptr matches the member's type.
 */
#define container_of(ptr, type, member) ({			\
	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
	(type *)( (char *)__mptr - offsetof(type,member) );})

/*
 * round_up()/round_down() work by masking with (y - 1), so @y has to
 * be a power of two.
 */
#define __round_mask(x, y)	((__typeof__(x))((y) - 1))
#define round_up(x, y)		((((x) - 1) | __round_mask(x, y)) + 1)
#define round_down(x, y)	((x) & ~__round_mask(x, y))
/* Integer division of @n by @d, rounding the quotient up. */
#define DIV_ROUND_UP(n,d)	(((n) + (d) - 1) / (d))
/* Round @x up to a multiple of @a; mask-based, so @a is a power of two. */
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

/*
 * min()/max() evaluate each argument exactly once. Comparing the
 * addresses of the two temporaries has no runtime effect; it exists so
 * the compiler diagnoses arguments of different types.
 */
#define min(x, y) ({				\
	typeof(x) _min1 = (x);			\
	typeof(y) _min2 = (y);			\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })

#define max(x, y) ({				\
	typeof(x) _max1 = (x);			\
	typeof(y) _max2 = (y);			\
	(void) (&_max1 == &_max2);		\
	_max1 > _max2 ? _max1 : _max2; })

/* Like min()/max(), but both arguments are first converted to @type. */
#define min_t(type, x, y) ({			\
	type __min1 = (x);			\
	type __min2 = (y);			\
	__min1 < __min2 ? __min1: __min2; })

#define max_t(type, x, y) ({			\
	type __max1 = (x);			\
	type __max2 = (y);			\
	__max1 > __max2 ? __max1: __max2; })

/* True when @v has at most one bit set; note this includes v == 0. */
#define is_log2(v)		(((v) & ((v) - 1)) == 0)

#endif /* __CR_COMPILER_H__ */
criu-3.6/include/common/err.h000066400000000000000000000017151317335042600162100ustar00rootroot00000000000000
/*
 * Adopted from linux kernel
 */
#ifndef __CR_COMMON_ERR_H__
#define __CR_COMMON_ERR_H__

#include "common/compiler.h"

/*
 * The address of a block returned by malloc or realloc in GNU
 * systems is always a multiple of eight (or sixteen on 64-bit systems).
 *
 * Thus we may encode error number in low bits.
*/ #define MAX_ERRNO 4095 #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) static inline void *ERR_PTR(long error) { return (void *)error; } static inline long PTR_ERR(const void *ptr) { return (long)ptr; } static inline long IS_ERR(const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } static inline long IS_ERR_OR_NULL(const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); } static inline void *ERR_CAST(const void *ptr) { /* cast away the const */ return (void *)ptr; } static inline int PTR_RET(const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; } #endif /* __CR_ERR_H__ */ criu-3.6/include/common/list.h000066400000000000000000000254531317335042600164000ustar00rootroot00000000000000#ifndef __CR_LIST_H__ #define __CR_LIST_H__ /* * Double linked lists. */ #include #include "common/compiler.h" #define POISON_POINTER_DELTA 0 #define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) #define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) struct list_head { struct list_head *prev, *next; }; #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name) static inline void INIT_LIST_HEAD(struct list_head *list) { list->next = list; list->prev = list; } static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; prev->next = next; } static inline void __list_del_entry(struct list_head *entry) { __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { 
__list_del(entry->prev, entry->next); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } static inline int list_empty(const struct list_head *head) { return head->next == head; } static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = head->next; return (next == head) && (next == head->prev); } static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if 
(list_is_singular(head) && (head->next != entry && head != entry)) return; if (entry == head) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } #define list_entry(ptr, type, member) \ container_of(ptr, type, member) #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) #define __list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; pos != (head); pos = pos->prev) #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ pos != (head); \ pos = n, n = pos->prev) #define list_for_each_entry(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) #define 
list_for_each_entry_reverse(pos, head, member)			\
	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.prev, typeof(*pos), member))

/*
 * Yield @pos when it is non-NULL, otherwise the pseudo-entry that
 * contains @head (uses the GNU "?:" shorthand).
 */
#define list_prepare_entry(pos, head, member) \
	((pos) ? : list_entry(head, typeof(*pos), member))

/* Resume a forward walk at the entry after the current @pos. */
#define list_for_each_entry_continue(pos, head, member)			\
	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/* Resume a backward walk at the entry before the current @pos. */
#define list_for_each_entry_continue_reverse(pos, head, member)		\
	for (pos = list_entry(pos->member.prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.prev, typeof(*pos), member))

/* Forward walk that starts at the current @pos itself. */
#define list_for_each_entry_from(pos, head, member)			\
	for (; &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/* Forward entry walk that keeps the successor in @n before the body runs. */
#define list_for_each_entry_safe(pos, n, head, member)			\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
		n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

/* Like list_for_each_entry_safe(), starting after the current @pos. */
#define list_for_each_entry_safe_continue(pos, n, head, member)		\
	for (pos = list_entry(pos->member.next, typeof(*pos), member),	\
		n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

/* Like list_for_each_entry_safe(), starting at the current @pos itself. */
#define list_for_each_entry_safe_from(pos, n, head, member)		\
	for (n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

/* Backward entry walk that keeps the predecessor in @n before the body runs. */
#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
		n = list_entry(pos->member.prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))

/* Recompute @n as the entry that currently follows @pos. */
#define list_safe_reset_next(pos, n, member)				\
	n = list_entry(pos->member.next, typeof(*pos), member)

/*
 * Double
linked lists with a single pointer list head. */

/* List head: only a pointer to the first node. */
struct hlist_head {
	struct hlist_node *first;
};

/*
 * List node. @pprev holds the address of the pointer that refers to this
 * node: either the previous node's @next or the head's @first.
 */
struct hlist_node {
	struct hlist_node *next, **pprev;
};

#define HLIST_HEAD_INIT { .first = NULL }
#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)

/* Reset a node to the unlinked state (both pointers NULL). */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
	h->next = NULL;
	h->pprev = NULL;
}

/* Non-zero when the node's pprev is NULL. */
static inline int hlist_unhashed(const struct hlist_node *h)
{
	return !h->pprev;
}

/* Non-zero when the head has no first node. */
static inline int hlist_empty(const struct hlist_head *h)
{
	return !h->first;
}

/* Unlink @n; @n's own pointers are left as they were. */
static inline void __hlist_del(struct hlist_node *n)
{
	struct hlist_node *next = n->next;
	struct hlist_node **pprev = n->pprev;
	/* Redirect whoever pointed at @n to @n's successor. */
	*pprev = next;
	if (next)
		next->pprev = pprev;
}

/* Unlink @n and overwrite its pointers with the LIST_POISON markers. */
static inline void hlist_del(struct hlist_node *n)
{
	__hlist_del(n);
	n->next = LIST_POISON1;
	n->pprev = LIST_POISON2;
}

/* Unlink @n and re-initialise it; does nothing when hlist_unhashed(@n). */
static inline void hlist_del_init(struct hlist_node *n)
{
	if (!hlist_unhashed(n)) {
		__hlist_del(n);
		INIT_HLIST_NODE(n);
	}
}

/* Make @n the first node of @h. */
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
	struct hlist_node *first = h->first;
	n->next = first;
	if (first)
		first->pprev = &n->next;
	h->first = n;
	n->pprev = &h->first;
}

/* Insert @n directly in front of @next. */
/* next must be != NULL */
static inline void hlist_add_before(struct hlist_node *n,
					struct hlist_node *next)
{
	n->pprev = next->pprev;
	n->next = next;
	next->pprev = &n->next;
	*(n->pprev) = n;
}

/*
 * Insert @next directly behind @n. Note the argument order: the node
 * already on the list is the first parameter, the new one the second.
 */
static inline void hlist_add_after(struct hlist_node *n,
					struct hlist_node *next)
{
	next->next = n->next;
	n->next = next;
	next->pprev = &n->next;

	if (next->next)
		next->next->pprev = &next->next;
}

/* Point @n's pprev at its own next field. */
/* after that we'll appear to be on some hlist and hlist_del will work */
static inline void hlist_add_fake(struct hlist_node *n)
{
	n->pprev = &n->next;
}

/*
 * Move a list from one list head to another. Fixup the pprev
 * reference of the first entry if it exists.
*/ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ (ptr) ? hlist_entry(ptr, type, member) : NULL #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member); \ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) #endif /* __CR_LIST_H__ */ criu-3.6/include/common/lock.h000066400000000000000000000105171317335042600163500ustar00rootroot00000000000000#ifndef __CR_COMMON_LOCK_H__ #define __CR_COMMON_LOCK_H__ #include #include #include #include #include #include "common/asm/atomic.h" #include "common/compiler.h" #define LOCK_BUG_ON(condition) \ if ((condition)) \ *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__ #define LOCK_BUG() LOCK_BUG_ON(1) #ifdef CR_NOGLIBC # include #else # include # include static inline long sys_futex (uint32_t *addr1, int op, uint32_t val1, struct timespec *timeout, uint32_t *addr2, uint32_t val3) { int rc = syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3); if (rc == -1) rc = 
-errno;
	return rc;
}
#endif

/* Futex word: an atomic_t aligned to sizeof(int). */
typedef struct {
	atomic_t raw;
} __aligned(sizeof(int)) futex_t;

/* Waiters stop waiting as soon as the value has FUTEX_ABORT_FLAG set. */
#define FUTEX_ABORT_FLAG	(0x80000000)
/* Value stored by futex_abort_and_wake(); it contains FUTEX_ABORT_FLAG. */
#define FUTEX_ABORT_RAW		(-1U)

/* Get current futex @f value */
static inline uint32_t futex_get(futex_t *f)
{
	return atomic_read(&f->raw);
}

/* Set futex @f value to @v; waiters are not woken */
static inline void futex_set(futex_t *f, uint32_t v)
{
	atomic_set(&f->raw, (int)v);
}

#define futex_init(f)	futex_set(f, 0)

/*
 * Block until the value of futex @__f satisfies "value __cond __v" or has
 * FUTEX_ABORT_FLAG set. Each FUTEX_WAIT uses a 120 second timeout and is
 * simply retried on -ETIMEDOUT, -EINTR and -EWOULDBLOCK; any other
 * negative result triggers LOCK_BUG().
 *
 * The expansion declares locals named "ret", "tmp" and "to", so the
 * arguments must not refer to identifiers with those names.
 */
#define futex_wait_if_cond(__f, __v, __cond)			\
	do {							\
		int ret;					\
		uint32_t tmp;					\
								\
		while (1) {					\
			struct timespec to = {.tv_sec = 120};	\
			tmp = futex_get(__f);			\
			if ((tmp & FUTEX_ABORT_FLAG) ||		\
			    (tmp __cond (__v)))			\
				break;				\
			ret = sys_futex((uint32_t *)&(__f)->raw.counter, FUTEX_WAIT,\
					tmp, &to, NULL, 0);	\
			if (ret == -ETIMEDOUT)			\
				continue;			\
			if (ret == -EINTR || ret == -EWOULDBLOCK)	\
				continue;			\
			if (ret < 0)				\
				LOCK_BUG();			\
		}						\
	} while (0)

/* Set futex @f to @v and wake up all waiters */
static inline void futex_set_and_wake(futex_t *f, uint32_t v)
{
	atomic_set(&f->raw, (int)v);
	LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
}

/* Wake up all futex @f waiters without changing the value */
static inline void futex_wake(futex_t *f)
{
	LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
}

/* Mark futex @f as wait abort needed and wake up all waiters */
static inline void futex_abort_and_wake(futex_t *f)
{
	/* Compile-time check that the abort value really carries the abort flag. */
	BUILD_BUG_ON(!(FUTEX_ABORT_RAW & FUTEX_ABORT_FLAG));
	futex_set_and_wake(f, FUTEX_ABORT_RAW);
}

/* Decrement futex @f value and wake up all waiters */
static inline void futex_dec_and_wake(futex_t *f)
{
	atomic_dec(&f->raw);
	LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
}

/* Increment futex @f value and wake up all waiters */
static inline void futex_inc_and_wake(futex_t *f)
{
	atomic_inc(&f->raw);
	LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0)
< 0); } /* Plain increment futex @f value */ static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); } /* Plain decrement futex @f value */ static inline void futex_dec(futex_t *f) { atomic_dec(&f->raw); } /* Wait until futex @f value become @v */ #define futex_wait_until(f, v) futex_wait_if_cond(f, v, ==) /* Wait while futex @f value is greater than @v */ #define futex_wait_while_gt(f, v) futex_wait_if_cond(f, v, <=) /* Wait while futex @f value is less than @v */ #define futex_wait_while_lt(f, v) futex_wait_if_cond(f, v, >=) /* Wait while futex @f value is equal to @v */ #define futex_wait_while_eq(f, v) futex_wait_if_cond(f, v, !=) /* Wait while futex @f value is @v */ static inline void futex_wait_while(futex_t *f, uint32_t v) { while ((uint32_t)atomic_read(&f->raw) == v) { int ret = sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAIT, v, NULL, NULL, 0); LOCK_BUG_ON(ret < 0 && ret != -EWOULDBLOCK); } } typedef struct { atomic_t raw; } mutex_t; static inline void mutex_init(mutex_t *m) { uint32_t c = 0; atomic_set(&m->raw, (int)c); } static inline void mutex_lock(mutex_t *m) { uint32_t c; int ret; while ((c = (uint32_t)atomic_inc_return(&m->raw)) != 1) { ret = sys_futex((uint32_t *)&m->raw.counter, FUTEX_WAIT, c, NULL, NULL, 0); LOCK_BUG_ON(ret < 0 && ret != -EWOULDBLOCK); } } static inline void mutex_unlock(mutex_t *m) { uint32_t c = 0; atomic_set(&m->raw, (int)c); LOCK_BUG_ON(sys_futex((uint32_t *)&m->raw.counter, FUTEX_WAKE, 1, NULL, NULL, 0) < 0); } #endif /* __CR_COMMON_LOCK_H__ */ criu-3.6/include/common/page.h000066400000000000000000000001361317335042600163300ustar00rootroot00000000000000#ifndef __CR_COMMON_PAGE_H__ #define __CR_COMMON_PAGE_H__ #include "common/asm/page.h" #endif criu-3.6/include/common/scm-code.c000066400000000000000000000061251317335042600171050ustar00rootroot00000000000000#ifndef __sys #error "The __sys macro is required" #endif static void scm_fdset_init_chunk(struct scm_fdset *fdset, int nr_fds, void *data, unsigned ch_size) { 
struct cmsghdr *cmsg;
	/* One-byte payload used when the caller supplies no data buffer. */
	static char dummy;

	/* Size the control message for exactly nr_fds descriptors. */
	fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * nr_fds);

	cmsg		= CMSG_FIRSTHDR(&fdset->hdr);
	cmsg->cmsg_len	= fdset->hdr.msg_controllen;

	if (data) {
		/* ch_size bytes of caller data accompany every descriptor. */
		fdset->iov.iov_base	= data;
		fdset->iov.iov_len	= nr_fds * ch_size;
	} else {
		fdset->iov.iov_base	= &dummy;
		fdset->iov.iov_len	= 1;
	}
}

/*
 * One-time setup of @fdset: wire the iovec, the optional peer address
 * and the control buffer into the msghdr and prepare a single
 * SOL_SOCKET / SCM_RIGHTS control message sized for CR_SCM_MAX_FD
 * descriptors.
 *
 * Returns a pointer to the descriptor array inside the control message.
 */
static int *scm_fdset_init(struct scm_fdset *fdset, struct sockaddr_un *saddr,
		int saddr_len)
{
	struct cmsghdr *cmsg;

	/* The control buffer must have room for a full batch of descriptors. */
	BUILD_BUG_ON(sizeof(fdset->msg_buf) < (CMSG_SPACE(sizeof(int) * CR_SCM_MAX_FD)));

	/* Placeholder; scm_fdset_init_chunk() sets the real base before each transfer. */
	fdset->iov.iov_base		= (void *)0xdeadbeef;

	fdset->hdr.msg_iov		= &fdset->iov;
	fdset->hdr.msg_iovlen		= 1;
	fdset->hdr.msg_name		= (struct sockaddr *)saddr;
	fdset->hdr.msg_namelen		= saddr_len;

	fdset->hdr.msg_control		= &fdset->msg_buf;
	fdset->hdr.msg_controllen	= CMSG_LEN(sizeof(int) * CR_SCM_MAX_FD);

	cmsg				= CMSG_FIRSTHDR(&fdset->hdr);
	cmsg->cmsg_len			= fdset->hdr.msg_controllen;
	cmsg->cmsg_level		= SOL_SOCKET;
	cmsg->cmsg_type			= SCM_RIGHTS;

	return (int *)CMSG_DATA(cmsg);
}

/*
 * Send @nr_fds descriptors from @fds over @sock, at most CR_SCM_MAX_FD
 * per sendmsg call. When @data is non-NULL, @ch_size bytes of it are
 * sent along with every descriptor. Returns 0 on success.
 */
int send_fds(int sock, struct sockaddr_un *saddr, int len,
		int *fds, int nr_fds, void *data, unsigned ch_size)
{
	/* In musl_libc the msghdr structure has pads which has to be zeroed */
	struct scm_fdset fdset = {};
	int *cmsg_data;
	int i, min_fd, ret;

	cmsg_data = scm_fdset_init(&fdset, saddr, len);
	for (i = 0; i < nr_fds; i += min_fd) {
		/* Size of this batch. */
		min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
		scm_fdset_init_chunk(&fdset, min_fd, data, ch_size);
		memcpy(cmsg_data, &fds[i], sizeof(int) * min_fd);

		ret = __sys(sendmsg)(sock, &fdset.hdr, 0);
		if (ret <= 0)
			return ret ?
: -1; if (data) data += min_fd * ch_size; } return 0; } int __recv_fds(int sock, int *fds, int nr_fds, void *data, unsigned ch_size, int flags) { /* In musl_libc the msghdr structure has pads which has to be zeroed */ struct scm_fdset fdset = {}; struct cmsghdr *cmsg; int *cmsg_data; int ret; int i, min_fd; cmsg_data = scm_fdset_init(&fdset, NULL, 0); for (i = 0; i < nr_fds; i += min_fd) { min_fd = min(CR_SCM_MAX_FD, nr_fds - i); scm_fdset_init_chunk(&fdset, min_fd, data, ch_size); ret = __sys(recvmsg)(sock, &fdset.hdr, flags); if (ret <= 0) return ret ? __sys_err(ret) : -ENOMSG; cmsg = CMSG_FIRSTHDR(&fdset.hdr); if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS) return -EINVAL; if (fdset.hdr.msg_flags & MSG_CTRUNC) return -ENFILE; min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int); /* * In case if kernel screwed the recipient, most probably * the caller stack frame will be overwriten, just scream * and exit. * * FIXME Need to sanitize util.h to be able to include it * into files which do not have glibc and a couple of * sys_write_ helpers. Meawhile opencoded BUG_ON here. */ BUG_ON(min_fd > CR_SCM_MAX_FD); if (unlikely(min_fd <= 0)) return -EBADFD; memcpy(&fds[i], cmsg_data, sizeof(int) * min_fd); if (data) data += ch_size * min_fd; } return 0; } criu-3.6/include/common/scm.h000066400000000000000000000024031317335042600161750ustar00rootroot00000000000000#ifndef __COMMON_SCM_H__ #define __COMMON_SCM_H__ #include #include #include /* * Because of kernel doing kmalloc for user data passed * in SCM messages, and there is kernel's SCM_MAX_FD as a limit * for descriptors passed at once we're trying to reduce * the pressue on kernel memory manager and use predefined * known to work well size of the message buffer. 
*/ #define CR_SCM_MSG_SIZE (1024) #define CR_SCM_MAX_FD (252) struct scm_fdset { struct msghdr hdr; struct iovec iov; char msg_buf[CR_SCM_MSG_SIZE]; }; #ifndef F_GETOWNER_UIDS #define F_GETOWNER_UIDS 17 #endif extern int send_fds(int sock, struct sockaddr_un *saddr, int len, int *fds, int nr_fds, void *data, unsigned ch_size); extern int __recv_fds(int sock, int *fds, int nr_fds, void *data, unsigned ch_size, int flags); static inline int recv_fds(int sock, int *fds, int nr_fds, void *data, unsigned ch_size) { return __recv_fds(sock, fds, nr_fds, data, ch_size, 0); } static inline int send_fd(int sock, struct sockaddr_un *saddr, int saddr_len, int fd) { return send_fds(sock, saddr, saddr_len, &fd, 1, NULL, 0); } static inline int recv_fd(int sock) { int fd, ret; ret = recv_fds(sock, &fd, 1, NULL, 0); if (ret) return -1; return fd; } #endif criu-3.6/include/common/xmalloc.h000066400000000000000000000030511317335042600170520ustar00rootroot00000000000000#ifndef __COMMON_XMALLOC_H__ #define __COMMON_XMALLOC_H__ #include #include #ifndef pr_err #error "Macro pr_err is needed." #endif #define __xalloc(op, size, ...) \ ({ \ void *___p = op( __VA_ARGS__ ); \ if (!___p) \ pr_err("%s: Can't allocate %li bytes\n", \ __func__, (long)(size)); \ ___p; \ }) #define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) #define xmalloc(size) __xalloc(malloc, size, size) #define xzalloc(size) __xalloc(calloc, size, 1, size) #define xrealloc(p, size) __xalloc(realloc, size, p, size) #define xfree(p) free(p) #define xrealloc_safe(pptr, size) \ ({ \ int __ret = -1; \ void *new = xrealloc(*pptr, size); \ if (new) { \ *pptr = new; \ __ret = 0; \ } \ __ret; \ }) #define xmemdup(ptr, size) \ ({ \ void *new = xmalloc(size); \ if (new) \ memcpy(new, ptr, size); \ new; \ }) #define memzero_p(p) memset(p, 0, sizeof(*p)) #define memzero(p, size) memset(p, 0, size) /* * Helper for allocating trees with single xmalloc. 
* This one advances the void *pointer on s bytes and * returns the previous value. Use like this * * m = xmalloc(total_size); * a = xptr_pull(&m, tree_root_t); * a->b = xptr_pull(&m, leaf_a_t); * a->c = xptr_pull(&m, leaf_c_t); * ... */ static inline void *xptr_pull_s(void **m, size_t s) { void *ret = (*m); (*m) += s; return ret; } #define xptr_pull(m, type) xptr_pull_s(m, sizeof(type)) #endif /* __CR_XMALLOC_H__ */ criu-3.6/lib/000077500000000000000000000000001317335042600130765ustar00rootroot00000000000000criu-3.6/lib/Makefile000066400000000000000000000047651317335042600145520ustar00rootroot00000000000000CRIU_SO := libcriu.so UAPI_HEADERS := lib/c/criu.h images/rpc.proto PYTHON_BIN ?= python2 .PHONY: .FORCE # # File to keep track of files installed by setup.py CRIT_SETUP_FILES := lib/.crit-setup.files all-y += lib-c lib-py # # C language bindings. lib/c/Makefile: ; lib/c/%: .FORCE $(Q) $(MAKE) $(build)=lib/c $@ cflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) ldflags-so += -lprotobuf-c lib/c/$(CRIU_SO): lib/c/built-in.o $(call msg-link, $@) $(Q) $(CC) -shared $(cflags-so) -o $@ $^ $(ldflags-so) $(LDFLAGS) lib-c: lib/c/$(CRIU_SO) .PHONY: lib-c # # Python bindings. 
lib/py/Makefile: ; lib/py/%: .FORCE $(call msg-gen, $@) $(Q) $(MAKE) $(build)=lib/py $@ lib-py: $(Q) $(MAKE) $(build)=lib/py all .PHONY: lib-py clean-lib: $(Q) $(MAKE) $(build)=lib/c clean $(Q) $(MAKE) $(build)=lib/py clean .PHONY: clean-lib clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/criu.pc mrproper: clean install: lib-c lib-py crit/crit lib/c/criu.pc.in $(E) " INSTALL " lib $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) $(Q) ln -fns $(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) $(Q) ln -fns $(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(CRIU_SO) $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(E) " INSTALL " pkgconfig/criu.pc $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig $(E) " INSTALL " crit $(Q) $(PYTHON_BIN) scripts/crit-setup.py install --prefix=$(DESTDIR)$(PREFIX) --record $(CRIT_SETUP_FILES) .PHONY: install uninstall: $(E) " UNINSTALL" $(CRIU_SO) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR)) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(CRIU_SO)) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR)) $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) $(E) " UNINSTALL" crit $(Q) while read -r file; do $(RM) "$$file"; done < $(CRIT_SETUP_FILES) .PHONY: uninstall 
criu-3.6/lib/c/000077500000000000000000000000001317335042600133205ustar00rootroot00000000000000criu-3.6/lib/c/Makefile000066400000000000000000000003521317335042600147600ustar00rootroot00000000000000obj-y += criu.o obj-y += ./images/rpc.pb-c.o ccflags-y += -iquote criu/$(ARCH_DIR)/include ccflags-y += -iquote criu/include ccflags-y += -iquote images ccflags-y += -fPIC -fno-stack-protector ldflags-y += -r -z noexecstack criu-3.6/lib/c/criu.c000066400000000000000000000623671317335042600144440ustar00rootroot00000000000000#include "version.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "criu.h" #include "rpc.pb-c.h" #include "cr-service-const.h" #define CR_DEFAULT_SERVICE_BIN "criu" const char *criu_lib_version = CRIU_VERSION; struct criu_opts { CriuOpts *rpc; int (*notify)(char *action, criu_notify_arg_t na); enum criu_service_comm service_comm; union { char *service_address; int service_fd; char *service_binary; }; int swrk_pid; }; static criu_opts *global_opts; static int saved_errno; void criu_local_set_service_comm(criu_opts *opts, enum criu_service_comm comm) { opts->service_comm = comm; } void criu_set_service_comm(enum criu_service_comm comm) { criu_local_set_service_comm(global_opts, comm); } void criu_local_set_service_address(criu_opts *opts, char *path) { if (path) opts->service_address = path; else opts->service_address = CR_DEFAULT_SERVICE_ADDRESS; } void criu_set_service_address(char *path) { criu_local_set_service_address(global_opts, path); } void criu_local_set_service_fd(criu_opts *opts, int fd) { opts->service_fd = fd; } void criu_set_service_fd(int fd) { criu_local_set_service_fd(global_opts, fd); } void criu_local_set_service_binary(criu_opts *opts, char *path) { if (path) opts->service_binary = path; else opts->service_binary = CR_DEFAULT_SERVICE_BIN; } void criu_set_service_binary(char *path) { criu_local_set_service_binary(global_opts, path); } int 
criu_local_init_opts(criu_opts **o)
{
	/*
	 * (Re)create the option set referenced by @o.
	 *
	 * Any option set already stored in *o is released first, then a
	 * fresh one is allocated with default values: no notify callback
	 * and CRIU_COMM_BIN communication using CR_DEFAULT_SERVICE_BIN.
	 *
	 * Returns 0 on success and -1 when memory cannot be allocated; in
	 * the latter case *o is NULL rather than a pointer to freed memory.
	 */
	criu_opts *opts = *o;
	CriuOpts *rpc;

	if (opts) {
		if (opts->rpc)
			criu_opts__free_unpacked(opts->rpc, NULL);
		free(opts);
		/*
		 * Drop the caller's reference right away so that a failed
		 * allocation below cannot leave it dangling.
		 */
		*o = NULL;
	}

	rpc = malloc(sizeof(CriuOpts));
	if (rpc == NULL) {
		perror("Can't allocate memory for criu RPC opts");
		return -1;
	}

	criu_opts__init(rpc);

	opts = malloc(sizeof(criu_opts));
	if (opts == NULL) {
		perror("Can't allocate memory for criu opts");
		criu_opts__free_unpacked(rpc, NULL);
		return -1;
	}

	opts->rpc		= rpc;
	opts->notify		= NULL;

	opts->service_comm	= CRIU_COMM_BIN;
	/* With CRIU_COMM_BIN the union holds the path of the binary. */
	opts->service_binary	= CR_DEFAULT_SERVICE_BIN;

	*o = opts;

	return 0;
}

/* (Re)initialise the library-wide option set, see criu_local_init_opts(). */
int criu_init_opts(void)
{
	return criu_local_init_opts(&global_opts);
}

/*
 * Register @cb as the notification callback of @opts and switch on
 * notify_scripts in the RPC options.
 */
void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na))
{
	opts->notify = cb;
	opts->rpc->has_notify_scripts = true;
	opts->rpc->notify_scripts = true;
}

/* Global-options variant of criu_local_set_notify_cb(). */
void criu_set_notify_cb(int (*cb)(char *action, criu_notify_arg_t na))
{
	criu_local_set_notify_cb(global_opts, cb);
}

/* Pid carried by notification argument @na, or 0 when it carries none. */
int criu_notify_pid(criu_notify_arg_t na)
{
	return na->has_pid ?
na->pid : 0; } void criu_local_set_pid(criu_opts *opts, int pid) { opts->rpc->has_pid = true; opts->rpc->pid = pid; } void criu_set_pid(int pid) { criu_local_set_pid(global_opts, pid); } void criu_local_set_images_dir_fd(criu_opts *opts, int fd) { opts->rpc->images_dir_fd = fd; } void criu_set_images_dir_fd(int fd) { criu_local_set_images_dir_fd(global_opts, fd); } void criu_local_set_parent_images(criu_opts *opts, char *path) { opts->rpc->parent_img = strdup(path); } void criu_set_parent_images(char *path) { criu_local_set_parent_images(global_opts, path); } void criu_local_set_track_mem(criu_opts *opts, bool track_mem) { opts->rpc->has_track_mem = true; opts->rpc->track_mem = track_mem; } void criu_set_track_mem(bool track_mem) { criu_local_set_track_mem(global_opts, track_mem); } void criu_local_set_auto_dedup(criu_opts *opts, bool auto_dedup) { opts->rpc->has_auto_dedup = true; opts->rpc->auto_dedup = auto_dedup; } void criu_set_auto_dedup(bool auto_dedup) { criu_local_set_auto_dedup(global_opts, auto_dedup); } void criu_local_set_force_irmap(criu_opts *opts, bool force_irmap) { opts->rpc->has_force_irmap = true; opts->rpc->force_irmap = force_irmap; } void criu_set_force_irmap(bool force_irmap) { criu_local_set_force_irmap(global_opts, force_irmap); } void criu_local_set_link_remap(criu_opts *opts, bool link_remap) { opts->rpc->has_link_remap = true; opts->rpc->link_remap = link_remap; } void criu_set_link_remap(bool link_remap) { criu_local_set_link_remap(global_opts, link_remap); } void criu_local_set_work_dir_fd(criu_opts *opts, int fd) { opts->rpc->has_work_dir_fd = true; opts->rpc->work_dir_fd = fd; } void criu_set_work_dir_fd(int fd) { criu_local_set_work_dir_fd(global_opts, fd); } void criu_local_set_leave_running(criu_opts *opts, bool leave_running) { opts->rpc->has_leave_running = true; opts->rpc->leave_running = leave_running; } void criu_set_leave_running(bool leave_running) { criu_local_set_leave_running(global_opts, leave_running); } void 
criu_local_set_ext_unix_sk(criu_opts *opts, bool ext_unix_sk)
{
	/* Store the ext_unix_sk flag and mark it as present. */
	opts->rpc->has_ext_unix_sk	= true;
	opts->rpc->ext_unix_sk	= ext_unix_sk;
}

/* Global-options variant of criu_local_set_ext_unix_sk(). */
void criu_set_ext_unix_sk(bool ext_unix_sk)
{
	criu_local_set_ext_unix_sk(global_opts, ext_unix_sk);
}

/*
 * Append the unix socket @inode to the unix_sk_ino list of @opts.
 *
 * If ext_unix_sk was never set it is switched on implicitly. If it was
 * explicitly switched off, the list collected so far is discarded and -1
 * is returned. Returns 0 on success and -ENOMEM on allocation failure,
 * in which case the list is left unchanged.
 */
int criu_local_add_unix_sk(criu_opts *opts, unsigned int inode)
{
	int nr;
	UnixSk **a, *u;

	/*if caller forgot enable ext_unix_sk option we do it*/
	if (!opts->rpc->has_ext_unix_sk) {
		criu_local_set_ext_unix_sk(opts, true);
	}

	/*if user disabled ext_unix_sk and try to add unixsk inode after that*/
	if (opts->rpc->has_ext_unix_sk && !opts->rpc->ext_unix_sk) {
		if (opts->rpc->n_unix_sk_ino > 0) {
			size_t i;

			/* Release the entries as well as the array holding them. */
			for (i = 0; i < opts->rpc->n_unix_sk_ino; i++)
				free(opts->rpc->unix_sk_ino[i]);
			free(opts->rpc->unix_sk_ino);
			/*
			 * Clear the pointer too: a later call would otherwise
			 * hand freed memory to realloc().
			 */
			opts->rpc->unix_sk_ino = NULL;
			opts->rpc->n_unix_sk_ino = 0;
		}
		return -1;
	}

	u = malloc(sizeof(*u));
	if (!u)
		goto er;
	unix_sk__init(u);

	u->inode = inode;

	nr = opts->rpc->n_unix_sk_ino + 1;
	a = realloc(opts->rpc->unix_sk_ino, nr * sizeof(u));
	if (!a)
		goto er_u;

	a[nr - 1] = u;

	opts->rpc->unix_sk_ino = a;
	opts->rpc->n_unix_sk_ino = nr;
	return 0;

er_u:
	free(u);
er:
	return -ENOMEM;
}

/* Global-options variant of criu_local_add_unix_sk(). */
int criu_add_unix_sk(unsigned int inode)
{
	return criu_local_add_unix_sk(global_opts, inode);
}

/* Store the tcp_established flag and mark it as present. */
void criu_local_set_tcp_established(criu_opts *opts, bool tcp_established)
{
	opts->rpc->has_tcp_established	= true;
	opts->rpc->tcp_established	= tcp_established;
}

/* Global-options variant of criu_local_set_tcp_established(). */
void criu_set_tcp_established(bool tcp_established)
{
	criu_local_set_tcp_established(global_opts, tcp_established);
}

/* Store the tcp_skip_in_flight flag and mark it as present. */
void criu_local_set_tcp_skip_in_flight(criu_opts *opts, bool tcp_skip_in_flight)
{
	opts->rpc->has_tcp_skip_in_flight	= true;
	opts->rpc->tcp_skip_in_flight		= tcp_skip_in_flight;
}

/* Global-options variant of criu_local_set_tcp_skip_in_flight(). */
void criu_set_tcp_skip_in_flight(bool tcp_skip_in_flight)
{
	criu_local_set_tcp_skip_in_flight(global_opts, tcp_skip_in_flight);
}

/* Store the weak_sysctls flag and mark it as present. */
void criu_local_set_weak_sysctls(criu_opts *opts, bool val)
{
	opts->rpc->has_weak_sysctls = true;
	opts->rpc->weak_sysctls = val;
}

/* Global-options variant of criu_local_set_weak_sysctls(). */
void criu_set_weak_sysctls(bool val)
{
	criu_local_set_weak_sysctls(global_opts, val);
}

void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices)
{
opts->rpc->has_evasive_devices = true; opts->rpc->evasive_devices = evasive_devices; } void criu_set_evasive_devices(bool evasive_devices) { criu_local_set_evasive_devices(global_opts, evasive_devices); } void criu_local_set_shell_job(criu_opts *opts, bool shell_job) { opts->rpc->has_shell_job = true; opts->rpc->shell_job = shell_job; } void criu_set_shell_job(bool shell_job) { criu_local_set_shell_job(global_opts, shell_job); } void criu_local_set_file_locks(criu_opts *opts, bool file_locks) { opts->rpc->has_file_locks = true; opts->rpc->file_locks = file_locks; } void criu_set_file_locks(bool file_locks) { criu_local_set_file_locks(global_opts, file_locks); } void criu_local_set_log_level(criu_opts *opts, int log_level) { opts->rpc->has_log_level = true; opts->rpc->log_level = log_level; } void criu_set_log_level(int log_level) { criu_local_set_log_level(global_opts, log_level); } void criu_local_set_root(criu_opts *opts, char *root) { opts->rpc->root = strdup(root); } void criu_set_root(char *root) { criu_local_set_root(global_opts, root); } void criu_local_set_manage_cgroups(criu_opts *opts, bool manage) { opts->rpc->has_manage_cgroups = true; opts->rpc->manage_cgroups = manage; } void criu_set_manage_cgroups(bool manage) { criu_local_set_manage_cgroups(global_opts, manage); } void criu_local_set_manage_cgroups_mode(criu_opts *opts, enum criu_cg_mode mode) { opts->rpc->has_manage_cgroups_mode = true; opts->rpc->manage_cgroups_mode = (CriuCgMode)mode; } void criu_set_manage_cgroups_mode(enum criu_cg_mode mode) { criu_local_set_manage_cgroups_mode(global_opts, mode); } void criu_local_set_freeze_cgroup(criu_opts *opts, char *name) { opts->rpc->freeze_cgroup = name; } void criu_set_freeze_cgroup(char *name) { criu_local_set_freeze_cgroup(global_opts, name); } void criu_local_set_timeout(criu_opts *opts, unsigned int timeout) { opts->rpc->timeout = timeout; } void criu_set_timeout(unsigned int timeout) { criu_local_set_timeout(global_opts, timeout); } void 
criu_local_set_auto_ext_mnt(criu_opts *opts, bool val)
{
	/* Store the auto_ext_mnt flag and mark it as present. */
	opts->rpc->has_auto_ext_mnt = true;
	opts->rpc->auto_ext_mnt = val;
}

/* Global-options variant of criu_local_set_auto_ext_mnt(). */
void criu_set_auto_ext_mnt(bool val)
{
	criu_local_set_auto_ext_mnt(global_opts, val);
}

/* Store the ext_sharing flag and mark it as present. */
void criu_local_set_ext_sharing(criu_opts *opts, bool val)
{
	opts->rpc->has_ext_sharing = true;
	opts->rpc->ext_sharing = val;
}

/* Global-options variant of criu_local_set_ext_sharing(). */
void criu_set_ext_sharing(bool val)
{
	criu_local_set_ext_sharing(global_opts, val);
}

/* Store the ext_masters flag and mark it as present. */
void criu_local_set_ext_masters(criu_opts *opts, bool val)
{
	opts->rpc->has_ext_masters = true;
	opts->rpc->ext_masters = val;
}

/* Global-options variant of criu_local_set_ext_masters(). */
void criu_set_ext_masters(bool val)
{
	criu_local_set_ext_masters(global_opts, val);
}

/*
 * Store a private copy of @log_file. A previously stored copy is
 * released so repeated calls do not leak; if the copy cannot be
 * allocated the field ends up NULL (the function cannot report it).
 */
void criu_local_set_log_file(criu_opts *opts, char *log_file)
{
	/* Duplicate first: @log_file may be the currently stored string. */
	char *copy = strdup(log_file);

	free(opts->rpc->log_file);
	opts->rpc->log_file = copy;
}

/* Global-options variant of criu_local_set_log_file(). */
void criu_set_log_file(char *log_file)
{
	criu_local_set_log_file(global_opts, log_file);
}

/* Store the cpu capability mask @cap and mark it as present. */
void criu_local_set_cpu_cap(criu_opts *opts, unsigned int cap)
{
	opts->rpc->has_cpu_cap	= true;
	opts->rpc->cpu_cap	= cap;
}

/* Global-options variant of criu_local_set_cpu_cap(). */
void criu_set_cpu_cap(unsigned int cap)
{
	criu_local_set_cpu_cap(global_opts, cap);
}

/*
 * Store a deep copy of the @argc strings in @argv as exec_cmd.
 *
 * The copy is built completely before @opts is touched: on failure
 * -ENOMEM is returned, every partial allocation is released and the
 * previously stored command (if any) stays in place. On success the
 * previous command is freed, the new one installed and 0 returned.
 */
int criu_local_set_exec_cmd(criu_opts *opts, int argc, char *argv[])
{
	char **cmd;
	size_t n;
	int i;

	cmd = malloc((argc) * sizeof(char *));
	if (!cmd)
		return -ENOMEM;

	for (i = 0; i < argc; i++) {
		cmd[i] = strdup(argv[i]);
		if (!cmd[i]) {
			/* Entries 0 .. i-1 were duplicated; release all of them. */
			while (i > 0)
				free(cmd[--i]);
			free(cmd);
			return -ENOMEM;
		}
	}

	/* Release the command stored by an earlier call. */
	for (n = 0; n < opts->rpc->n_exec_cmd; n++)
		free(opts->rpc->exec_cmd[n]);
	free(opts->rpc->exec_cmd);

	opts->rpc->n_exec_cmd = argc;
	opts->rpc->exec_cmd = cmd;
	return 0;
}

/* Global-options variant of criu_local_set_exec_cmd(). */
int criu_set_exec_cmd(int argc, char *argv[])
{
	return criu_local_set_exec_cmd(global_opts, argc, argv);
}

/*
 * Append a (@key, @val) pair, both duplicated, to the ext_mnt list of
 * @opts. (The success tail and the error unwinding of this function
 * continue on the following source line.)
 */
int criu_local_add_ext_mount(criu_opts *opts, char *key, char *val)
{
	int nr;
	ExtMountMap **a, *m;

	m = malloc(sizeof(*m));
	if (!m)
		goto er;
	ext_mount_map__init(m);

	m->key = strdup(key);
	if (!m->key)
		goto er_n;
	m->val = strdup(val);
	if (!m->val)
		goto er_k;

	nr = opts->rpc->n_ext_mnt + 1;
	a = realloc(opts->rpc->ext_mnt, nr * sizeof(m));
	if (!a)
		goto er_v;

	a[nr - 1] = m;
opts->rpc->ext_mnt = a; opts->rpc->n_ext_mnt = nr; return 0; er_v: free(m->val); er_k: free(m->key); er_n: free(m); er: return -ENOMEM; } int criu_add_ext_mount(char *key, char *val) { return criu_local_add_ext_mount(global_opts, key, val); } int criu_local_add_cg_root(criu_opts *opts, char *ctrl, char *path) { int nr; CgroupRoot **a, *root; root = malloc(sizeof(*root)); if (!root) goto er; cgroup_root__init(root); if (ctrl) { root->ctrl = strdup(ctrl); if (!root->ctrl) goto er_r; } root->path = strdup(path); if (!root->path) goto er_c; nr = opts->rpc->n_cg_root + 1; a = realloc(opts->rpc->cg_root, nr * sizeof(root)); if (!a) goto er_p; a[nr - 1] = root; opts->rpc->cg_root = a; opts->rpc->n_cg_root = nr; return 0; er_p: free(root->path); er_c: if (root->ctrl) free(root->ctrl); er_r: free(root); er: return -ENOMEM; } int criu_add_cg_root(char *ctrl, char *path) { return criu_local_add_cg_root(global_opts, ctrl, path); } int criu_local_add_veth_pair(criu_opts *opts, char *in, char *out) { int nr; CriuVethPair **a, *p; p = malloc(sizeof(*p)); if (!p) goto er; criu_veth_pair__init(p); p->if_in = strdup(in); if (!p->if_in) goto er_p; p->if_out = strdup(out); if (!p->if_out) goto er_i; nr = opts->rpc->n_veths + 1; a = realloc(opts->rpc->veths, nr * sizeof(p)); if (!a) goto er_o; a[nr - 1] = p; opts->rpc->veths = a; opts->rpc->n_veths = nr; return 0; er_o: free(p->if_out); er_i: free(p->if_in); er_p: free(p); er: return -ENOMEM; } int criu_add_veth_pair(char *in, char *out) { return criu_local_add_veth_pair(global_opts, in, out); } int criu_local_add_enable_fs(criu_opts *opts, char *fs) { int nr; char *str = NULL; char **ptr = NULL; str = strdup(fs); if (!str) goto err; nr = opts->rpc->n_enable_fs + 1; ptr = realloc(opts->rpc->enable_fs, nr * sizeof(*ptr)); if (!ptr) goto err; ptr[nr - 1] = str; opts->rpc->n_enable_fs = nr; opts->rpc->enable_fs = ptr; return 0; err: if (str) free(str); if (ptr) free(ptr); return -ENOMEM; } int criu_add_enable_fs(char *fs) { return 
criu_local_add_enable_fs(global_opts, fs); } int criu_local_add_skip_mnt(criu_opts *opts, char *mnt) { int nr; char *str = NULL; char **ptr = NULL; str = strdup(mnt); if (!str) goto err; nr = opts->rpc->n_skip_mnt + 1; ptr = realloc(opts->rpc->skip_mnt, nr * sizeof(*ptr)); if (!ptr) goto err; ptr[nr - 1] = str; opts->rpc->n_skip_mnt = nr; opts->rpc->skip_mnt = ptr; return 0; err: if (str) free(str); if (ptr) free(ptr); return -ENOMEM; } int criu_local_add_irmap_path(criu_opts *opts, char *path) { int nr; char *my_path; char **m; if (!opts) return -1; my_path = strdup(path); if (!my_path) goto err; nr = opts->rpc->n_irmap_scan_paths + 1; m = realloc(opts->rpc->irmap_scan_paths, nr * sizeof(*m)); if (!m) goto err; m[nr - 1] = my_path; opts->rpc->n_irmap_scan_paths = nr; opts->rpc->irmap_scan_paths = m; return 0; err: if (my_path) free(my_path); return -ENOMEM; } int criu_local_add_cg_props(criu_opts *opts, char *stream) { char *new; new = strdup(stream); if (!new) return -ENOMEM; free(opts->rpc->cgroup_props); opts->rpc->cgroup_props = new; return 0; } int criu_local_add_cg_props_file(criu_opts *opts, char *path) { char *new; new = strdup(path); if (!new) return -ENOMEM; free(opts->rpc->cgroup_props_file); opts->rpc->cgroup_props_file = new; return 0; } int criu_local_add_cg_dump_controller(criu_opts *opts, char *name) { char **new; size_t nr; nr = opts->rpc->n_cgroup_dump_controller + 1; new = realloc(opts->rpc->cgroup_dump_controller, nr * sizeof(char *)); if (!new) return -ENOMEM; new[opts->rpc->n_cgroup_dump_controller] = strdup(name); if (!new[opts->rpc->n_cgroup_dump_controller]) return -ENOMEM; opts->rpc->n_cgroup_dump_controller = nr; opts->rpc->cgroup_dump_controller = new; return 0; } int criu_add_skip_mnt(char *mnt) { return criu_local_add_skip_mnt(global_opts, mnt); } void criu_local_set_ghost_limit(criu_opts *opts, unsigned int limit) { opts->rpc->has_ghost_limit = true; opts->rpc->ghost_limit = limit; } void criu_set_ghost_limit(unsigned int limit) { 
criu_local_set_ghost_limit(global_opts, limit); } int criu_add_irmap_path(char *path) { return criu_local_add_irmap_path(global_opts, path); } int criu_local_add_inherit_fd(criu_opts *opts, int fd, char *key) { int nr; InheritFd **a, *f; /* Inheriting is only supported with swrk mode */ if (opts->service_comm != CRIU_COMM_BIN) return -1; f = malloc(sizeof(*f)); if (!f) goto er; inherit_fd__init(f); f->fd = fd; f->key = strdup(key); if (!f->key) goto er_f; nr = opts->rpc->n_inherit_fd + 1; a = realloc(opts->rpc->inherit_fd, nr * sizeof(f)); if (!a) goto err_k; a[nr - 1] = f; opts->rpc->inherit_fd = a; opts->rpc->n_inherit_fd = nr; return 0; err_k: free(f->key); er_f: free(f); er: return -ENOMEM; } int criu_add_inherit_fd(int fd, char *key) { return criu_local_add_inherit_fd(global_opts, fd, key); } int criu_local_add_external(criu_opts *opts, char *key) { int nr; char **a, *e = NULL; e = strdup(key); if (!e) goto err; nr = opts->rpc->n_external + 1; a = realloc(opts->rpc->external, nr * sizeof(*a)); if (!a) goto err; a[nr - 1] = e; opts->rpc->external = a; opts->rpc->n_external = nr; return 0; err: if (e) free(e); return -ENOMEM; } int criu_add_external(char *key) { return criu_local_add_external(global_opts, key); } static CriuResp *recv_resp(int socket_fd) { unsigned char *buf = NULL; int len; CriuResp *msg = 0; len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK); if (len == -1) { perror("Can't read request"); goto err; } buf = malloc(len); if (!buf) { errno = ENOMEM; perror("Can't receive response"); goto err; } len = recv(socket_fd, buf, len, MSG_TRUNC); if (len == -1) { perror("Can't read request"); goto err; } msg = criu_resp__unpack(NULL, len, buf); if (!msg) { perror("Failed unpacking response"); goto err; } free(buf); return msg; err: free(buf); saved_errno = errno; return NULL; } static int send_req(int socket_fd, CriuReq *req) { unsigned char *buf; int len; len = criu_req__get_packed_size(req); buf = malloc(len); if (!buf) { errno = ENOMEM; perror("Can't 
send request"); goto err; } if (criu_req__pack(req, buf) != len) { perror("Failed packing request"); goto err; } if (write(socket_fd, buf, len) == -1) { perror("Can't send request"); goto err; } free(buf); return 0; err: free(buf); saved_errno = errno; return -1; } static int send_notify_ack(int socket_fd, int ret) { int send_ret; CriuReq req = CRIU_REQ__INIT; req.type = CRIU_REQ_TYPE__NOTIFY; req.has_notify_success = true; req.notify_success = (ret == 0); send_ret = send_req(socket_fd, &req); /* * If we're failing the notification then report * back the original error code (and it will be * propagated back to user). * * If the notification was OK, then report the * result of acking it. */ return ret ? : send_ret; } static void swrk_wait(criu_opts *opts) { if (opts->service_comm == CRIU_COMM_BIN) waitpid(opts->swrk_pid, NULL, 0); } static int swrk_connect(criu_opts *opts, bool d) { int sks[2], pid, ret = -1; if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sks)) goto out; pid = fork(); if (pid < 0) goto err; if (pid == 0) { sigset_t mask; char fds[11]; /* * Unblock SIGCHLD. * * The caller of this function is supposed to have * this signal blocked. Otherwise it risks to get * into situation, when this routine is not yet * returned, but the restore subtree exits and * emits the SIGCHLD. * * In turn, unblocked SIGCHLD is required to make * criu restoration process work -- it catches * subtasks restore errors in this handler. 
*/ sigemptyset(&mask); sigaddset(&mask, SIGCHLD); sigprocmask(SIG_UNBLOCK, &mask, NULL); close(sks[0]); sprintf(fds, "%d", sks[1]); if (d) if (daemon(0, 1)) { perror("Can't detach for a self-dump"); goto child_err; } pid = getpid(); if (write(sks[1], &pid, sizeof(pid)) != sizeof(pid)) { perror("Can't write swrk pid"); goto child_err; } execlp(opts->service_binary, opts->service_binary, "swrk", fds, NULL); perror("Can't exec criu swrk"); child_err: close(sks[1]); exit(1); } close(sks[1]); if (read(sks[0], &pid, sizeof(pid)) != sizeof(pid)) { perror("Can't read swrk pid"); goto err; } opts->swrk_pid = pid; ret = sks[0]; out: return ret; err: close(sks[0]); close(sks[1]); goto out; } static int criu_connect(criu_opts *opts, bool d) { int fd, ret; struct sockaddr_un addr; socklen_t addr_len; if (opts->service_comm == CRIU_COMM_FD) return opts->service_fd; else if (opts->service_comm == CRIU_COMM_BIN) return swrk_connect(opts, d); fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0); if (fd < 0) { saved_errno = errno; perror("Can't create socket"); return -1; } memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_LOCAL; strncpy(addr.sun_path, opts->service_address, sizeof(addr.sun_path)); addr_len = strlen(opts->service_address) + sizeof(addr.sun_family); ret = connect(fd, (struct sockaddr *) &addr, addr_len); if (ret < 0) { saved_errno = errno; perror("Can't connect to socket"); close(fd); return -1; } return fd; } static int send_req_and_recv_resp_sk(int fd, criu_opts *opts, CriuReq *req, CriuResp **resp) { int ret = 0; if (send_req(fd, req) < 0) { ret = -ECOMM; goto exit; } again: *resp = recv_resp(fd); if (!*resp) { perror("Can't receive response"); ret = -ECOMM; goto exit; } if ((*resp)->type == CRIU_REQ_TYPE__NOTIFY) { if (opts->notify) ret = opts->notify((*resp)->notify->script, (*resp)->notify); ret = send_notify_ack(fd, ret); if (!ret) goto again; else goto exit; } if ((*resp)->type != req->type) { if ((*resp)->type == CRIU_REQ_TYPE__EMPTY && (*resp)->success == false) ret 
= -EINVAL; else { perror("Unexpected response type"); ret = -EBADMSG; } } if ((*resp)->has_cr_errno) saved_errno = (*resp)->cr_errno; exit: return ret; } static int send_req_and_recv_resp(criu_opts *opts, CriuReq *req, CriuResp **resp) { int fd; int ret = 0; bool d = false; if (req->type == CRIU_REQ_TYPE__DUMP && req->opts->has_pid == false) d = true; fd = criu_connect(opts, d); if (fd < 0) { perror("Can't connect to criu"); ret = -ECONNREFUSED; } else { ret = send_req_and_recv_resp_sk(fd, opts, req, resp); close(fd); } return ret; } int criu_local_check(criu_opts *opts) { int ret = -1; CriuReq req = CRIU_REQ__INIT; CriuResp *resp = NULL; saved_errno = 0; req.type = CRIU_REQ_TYPE__CHECK; ret = send_req_and_recv_resp(opts, &req, &resp); if (ret) goto exit; ret = resp->success ? 0 : -EBADE; exit: if (resp) criu_resp__free_unpacked(resp, NULL); swrk_wait(opts); errno = saved_errno; return ret; } int criu_check(void) { return criu_local_check(global_opts); } int criu_local_dump(criu_opts *opts) { int ret = -1; CriuReq req = CRIU_REQ__INIT; CriuResp *resp = NULL; saved_errno = 0; req.type = CRIU_REQ_TYPE__DUMP; req.opts = opts->rpc; ret = send_req_and_recv_resp(opts, &req, &resp); if (ret) goto exit; if (resp->success) { if (resp->dump->has_restored && resp->dump->restored) ret = 1; else ret = 0; } else ret = -EBADE; exit: if (resp) criu_resp__free_unpacked(resp, NULL); swrk_wait(opts); errno = saved_errno; return ret; } int criu_dump(void) { return criu_local_dump(global_opts); } int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)) { int ret = -1, fd = -1, uret; CriuReq req = CRIU_REQ__INIT; CriuResp *resp = NULL; saved_errno = 0; req.type = CRIU_REQ_TYPE__PRE_DUMP; req.opts = opts->rpc; ret = -EINVAL; /* * Self-dump in iterable manner is tricky and * not supported for the moment. * * Calls w/o iteration callback is, well, not * allowed either. 
*/ if (!opts->rpc->has_pid || !more) goto exit; ret = -ECONNREFUSED; fd = criu_connect(opts, false); if (fd < 0) goto exit; while (1) { ret = send_req_and_recv_resp_sk(fd, opts, &req, &resp); if (ret) goto exit; if (!resp->success) { ret = -EBADE; goto exit; } uret = more(NULL); if (uret < 0) { ret = uret; goto exit; } criu_resp__free_unpacked(resp, NULL); if (uret == 0) break; } req.type = CRIU_REQ_TYPE__DUMP; ret = send_req_and_recv_resp_sk(fd, opts, &req, &resp); if (!ret) ret = (resp->success ? 0 : -EBADE); exit: if (fd >= 0) close(fd); if (resp) criu_resp__free_unpacked(resp, NULL); swrk_wait(opts); errno = saved_errno; return ret; } int criu_dump_iters(int (*more)(criu_predump_info pi)) { return criu_local_dump_iters((void *)global_opts, more); } int criu_local_restore(criu_opts *opts) { int ret = -1; CriuReq req = CRIU_REQ__INIT; CriuResp *resp = NULL; saved_errno = 0; req.type = CRIU_REQ_TYPE__RESTORE; req.opts = opts->rpc; ret = send_req_and_recv_resp(opts, &req, &resp); if (ret) goto exit; if (resp->success) ret = resp->restore->pid; else ret = -EBADE; exit: if (resp) criu_resp__free_unpacked(resp, NULL); swrk_wait(opts); errno = saved_errno; return ret; } int criu_restore(void) { return criu_local_restore(global_opts); } int criu_local_restore_child(criu_opts *opts) { int sk, ret = -1; enum criu_service_comm saved_comm; char *saved_comm_data; bool save_comm; CriuReq req = CRIU_REQ__INIT; CriuResp *resp = NULL; /* * restore_child is not possible with criu running as a system * service, so we need to switch comm method to CRIU_COMM_BIN. * We're doing so because of the backward compatibility, and we * should probably consider requiring CRIU_COMM_BIN to be set by * user at some point. 
*/ save_comm = (opts->service_comm != CRIU_COMM_BIN); if (save_comm) { /* Save comm */ saved_comm = opts->service_comm; saved_comm_data = opts->service_address; opts->service_comm = CRIU_COMM_BIN; opts->service_binary = CR_DEFAULT_SERVICE_BIN; } sk = swrk_connect(opts, false); if (save_comm) { /* Restore comm */ opts->service_comm = saved_comm; opts->service_address = saved_comm_data; } if (sk < 0) return -1; saved_errno = 0; req.type = CRIU_REQ_TYPE__RESTORE; req.opts = opts->rpc; req.opts->has_rst_sibling = true; req.opts->rst_sibling = true; ret = send_req_and_recv_resp_sk(sk, opts, &req, &resp); swrk_wait(opts); if (!ret) { ret = resp->success ? resp->restore->pid : -EBADE; criu_resp__free_unpacked(resp, NULL); } close(sk); errno = saved_errno; return ret; } int criu_restore_child(void) { return criu_local_restore_child(global_opts); } criu-3.6/lib/c/criu.h000066400000000000000000000211151317335042600144330ustar00rootroot00000000000000/* * (C) Copyright 2013 Parallels, Inc. (www.parallels.com). * * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Lesser General Public License * (LGPL) version 2.1 which accompanies this distribution, and is available at * http://www.gnu.org/licenses/lgpl-2.1.html * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, you can find it here: * www.gnu.org/licenses/lgpl.html */

#ifndef __CRIU_LIB_H__
#define __CRIU_LIB_H__

/* The prototypes below use bool. */
#include <stdbool.h>

#ifdef __GNUG__
extern "C" {
#endif

/* How libcriu reaches criu: service socket, caller's fd, or spawned swrk. */
enum criu_service_comm {
	CRIU_COMM_SK,
	CRIU_COMM_FD,
	CRIU_COMM_BIN
};

enum criu_cg_mode {
	CRIU_CG_MODE_IGNORE,
	CRIU_CG_MODE_NONE,
	CRIU_CG_MODE_PROPS,
	CRIU_CG_MODE_SOFT,
	CRIU_CG_MODE_FULL,
	CRIU_CG_MODE_STRICT,
	CRIU_CG_MODE_DEFAULT,
};

void criu_set_service_address(char *path);
void criu_set_service_fd(int fd);
void criu_set_service_binary(char *path);

/*
 * You can choose if you want libcriu to connect to service socket
 * by itself, use provided file descriptor or spawn swrk by itself
 */
void criu_set_service_comm(enum criu_service_comm);

/*
 * Set opts to defaults. _Must_ be called first before using any functions from
 * the list down below. 0 on success, -1 on fail.
 */
int criu_init_opts(void);

void criu_set_pid(int pid);
void criu_set_images_dir_fd(int fd); /* must be set for dump/restore */
void criu_set_parent_images(char *path);
void criu_set_work_dir_fd(int fd);
void criu_set_leave_running(bool leave_running);
void criu_set_ext_unix_sk(bool ext_unix_sk);
int criu_add_unix_sk(unsigned int inode);
void criu_set_tcp_established(bool tcp_established);
void criu_set_tcp_skip_in_flight(bool tcp_skip_in_flight);
void criu_set_weak_sysctls(bool val);
void criu_set_evasive_devices(bool evasive_devices);
void criu_set_shell_job(bool shell_job);
void criu_set_file_locks(bool file_locks);
void criu_set_track_mem(bool track_mem);
void criu_set_auto_dedup(bool auto_dedup);
void criu_set_force_irmap(bool force_irmap);
void criu_set_link_remap(bool link_remap);
void criu_set_log_level(int log_level);
void criu_set_log_file(char *log_file);
void criu_set_cpu_cap(unsigned int cap);
void criu_set_root(char *root);
void criu_set_manage_cgroups(bool manage);
void criu_set_manage_cgroups_mode(enum criu_cg_mode mode);
void criu_set_freeze_cgroup(char *name);
void criu_set_timeout(unsigned int timeout);
void criu_set_auto_ext_mnt(bool val);
void criu_set_ext_sharing(bool val);
void criu_set_ext_masters(bool val);
int criu_set_exec_cmd(int argc, char *argv[]);
int criu_add_ext_mount(char *key, char *val);
int criu_add_veth_pair(char *in, char *out);
int criu_add_cg_root(char *ctrl, char *path);
int criu_add_enable_fs(char *fs);
int criu_add_skip_mnt(char *mnt);
void criu_set_ghost_limit(unsigned int limit);
int criu_add_irmap_path(char *path);
int criu_add_inherit_fd(int fd, char *key);
int criu_add_external(char *key);

/*
 * The criu_notify_arg_t na argument is an opaque
 * value that callbacks (cb-s) should pass into
 * criu_notify_xxx() calls to fetch arbitrary values
 * from notification. If the value is not available
 * some non-existing one is reported.
 */
typedef struct _CriuNotify *criu_notify_arg_t;
void criu_set_notify_cb(int (*cb)(char *action, criu_notify_arg_t na));

/* Get pid of root task. 0 if not available */
int criu_notify_pid(criu_notify_arg_t na);

/*
 * Here is a table of return values and errno's of functions
 * from the list down below.
 *
 * Return value     errno                Description
 * ----------------------------------------------------------------------------
 * 0                undefined            Success.
 *
 * >0               undefined            Success. criu_restore() and
 *                                       criu_restore_child() return the pid
 *                                       from the response; criu_dump()
 *                                       returns 1 when the response is
 *                                       flagged as "restored".
 *
 * -EBADE           rpc err (0 for now)  RPC has returned fail.
 *
 * -ECONNREFUSED    errno                Unable to connect to CRIU.
 *
 * -ECOMM           errno                Unable to send/recv msg to/from CRIU.
 *
 * -EINVAL          undefined            CRIU doesn't support this type of request.
 *                                       You should probably update CRIU.
 *
 * -EBADMSG         undefined            Unexpected response from CRIU.
 *                                       You should probably update CRIU.
 */
int criu_check(void);
int criu_dump(void);
int criu_restore(void);
int criu_restore_child(void);

/*
 * Perform dumping but with preliminary iterations. Each
 * time an iteration ends the ->more callback is called.
 * The callback's return value is
 *   - positive -- one more iteration starts
 *   - zero     -- final dump is performed and call exits
 *   - negative -- dump is aborted, the value is returned
 *     back from criu_dump_iters
 *
 * The @pi argument is an opaque value that caller may
 * use to request pre-dump statistics (not yet implemented).
 */
typedef void *criu_predump_info;
int criu_dump_iters(int (*more)(criu_predump_info pi));

/*
 * Same as the list above, but lets you have your very own options
 * structure and lets you set individual options in it.
 */
typedef struct criu_opts criu_opts;

int criu_local_init_opts(criu_opts **opts);

void criu_local_set_service_address(criu_opts *opts, char *path);
void criu_local_set_service_fd(criu_opts *opts, int fd);
void criu_local_set_service_comm(criu_opts *opts, enum criu_service_comm);

void criu_local_set_pid(criu_opts *opts, int pid);
void criu_local_set_images_dir_fd(criu_opts *opts, int fd); /* must be set for dump/restore */
void criu_local_set_parent_images(criu_opts *opts, char *path);
void criu_local_set_work_dir_fd(criu_opts *opts, int fd);
void criu_local_set_leave_running(criu_opts *opts, bool leave_running);
void criu_local_set_ext_unix_sk(criu_opts *opts, bool ext_unix_sk);
int criu_local_add_unix_sk(criu_opts *opts, unsigned int inode);
void criu_local_set_tcp_established(criu_opts *opts, bool tcp_established);
void criu_local_set_tcp_skip_in_flight(criu_opts *opts, bool tcp_skip_in_flight);
void criu_local_set_weak_sysctls(criu_opts *opts, bool val);
void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices);
void criu_local_set_shell_job(criu_opts *opts, bool shell_job);
void criu_local_set_file_locks(criu_opts *opts, bool file_locks);
void criu_local_set_track_mem(criu_opts *opts, bool track_mem);
void criu_local_set_auto_dedup(criu_opts *opts, bool auto_dedup);
void criu_local_set_force_irmap(criu_opts *opts, bool force_irmap);
void criu_local_set_link_remap(criu_opts *opts, bool link_remap);
void criu_local_set_log_level(criu_opts *opts, int log_level);
void criu_local_set_log_file(criu_opts *opts, char *log_file);
void criu_local_set_cpu_cap(criu_opts *opts, unsigned int cap);
void criu_local_set_root(criu_opts *opts, char *root);
void criu_local_set_manage_cgroups(criu_opts *opts, bool manage);
void criu_local_set_manage_cgroups_mode(criu_opts *opts, enum criu_cg_mode mode);
void criu_local_set_freeze_cgroup(criu_opts *opts, char *name);
void criu_local_set_timeout(criu_opts *opts, unsigned int timeout);
void criu_local_set_auto_ext_mnt(criu_opts *opts, bool val);
void criu_local_set_ext_sharing(criu_opts *opts, bool val);
void criu_local_set_ext_masters(criu_opts *opts, bool val);
int criu_local_set_exec_cmd(criu_opts *opts, int argc, char *argv[]);
int criu_local_add_ext_mount(criu_opts *opts, char *key, char *val);
int criu_local_add_veth_pair(criu_opts *opts, char *in, char *out);
int criu_local_add_cg_root(criu_opts *opts, char *ctrl, char *path);
int criu_local_add_enable_fs(criu_opts *opts, char *fs);
int criu_local_add_skip_mnt(criu_opts *opts, char *mnt);
void criu_local_set_ghost_limit(criu_opts *opts, unsigned int limit);
int criu_local_add_irmap_path(criu_opts *opts, char *path);
int criu_local_add_cg_props(criu_opts *opts, char *stream);
int criu_local_add_cg_props_file(criu_opts *opts, char *path);
int criu_local_add_cg_dump_controller(criu_opts *opts, char *name);
int criu_local_add_inherit_fd(criu_opts *opts, int fd, char *key);
int criu_local_add_external(criu_opts *opts, char *key);

void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na));

int criu_local_check(criu_opts *opts);
int criu_local_dump(criu_opts *opts);
int criu_local_restore(criu_opts *opts);
int criu_local_restore_child(criu_opts *opts);
int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi));

#ifdef __GNUG__
}
#endif

#endif /* __CRIU_LIB_H__ */
criu-3.6/lib/c/criu.pc.in000066400000000000000000000002661317335042600152170ustar00rootroot00000000000000libdir=@libdir@ includedir=@includedir@ Name: CRIU Description: RPC library for userspace checkpoint and restore Version: @version@ Libs: -L${libdir} -lcriu Cflags: -I${includedir} criu-3.6/lib/py/000077500000000000000000000000001317335042600135265ustar00rootroot00000000000000criu-3.6/lib/py/.gitignore000066400000000000000000000000171317335042600155140ustar00rootroot00000000000000*_pb2.py *.pyc criu-3.6/lib/py/Makefile000066400000000000000000000007041317335042600151670ustar00rootroot00000000000000all-y += libpy-images rpc_pb2.py .PHONY: .FORCE $(obj)/images/Makefile: ; $(obj)/images/%: .FORCE $(Q) $(MAKE) $(build)=$(obj)/images $@ libpy-images: $(Q) $(MAKE) $(build)=$(obj)/images all .PHONY: libpy-images rpc_pb2.py: $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) clean-lib-py: $(Q) $(MAKE) $(build)=$(obj)/images clean .PHONY: clean-lib-py clean: clean-lib-py criu-3.6/lib/py/__init__.py000066400000000000000000000000671317335042600156420ustar00rootroot00000000000000import rpc_pb2 as rpc import images from criu import * criu-3.6/lib/py/criu.py000066400000000000000000000133321317335042600150440ustar00rootroot00000000000000# Same as libcriu for C. import socket import errno import subprocess import fcntl import os import signal import sys import struct import rpc_pb2 as rpc class _criu_comm: """ Base class for communication classes. """ COMM_SK = 0 COMM_FD = 1 COMM_BIN = 2 comm_type = None comm = None sk = None def connect(self, daemon): """ Connect to criu and return socket object. daemon -- is for whether or not criu should daemonize if executing criu from binary(comm_bin). """ pass def disconnect(self): """ Disconnect from criu. """ pass class _criu_comm_sk(_criu_comm): """ Communication class for unix socket. 
""" def __init__(self, sk_path): self.comm_type = self.COMM_SK self.comm = sk_path def connect(self, daemon): self.sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) self.sk.connect(self.comm) return self.sk def disconnect(self): self.sk.close() class _criu_comm_fd(_criu_comm): """ Commnunication class for file descriptor. """ def __init__(self, fd): self.comm_type = self.COMM_FD self.comm = fd def connect(self, daemon): self.sk = socket.fromfd(self.comm, socket.AF_UNIX, socket.SOCK_SEQPACKET) return self.sk def disconnect(self): self.sk.close() class _criu_comm_bin(_criu_comm): """ Communication class for binary. """ def __init__(self, bin_path): self.comm_type = self.COMM_BIN self.comm = bin_path self.swrk = None self.daemon = None def connect(self, daemon): # Kind of the same thing we do in libcriu css = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) flags = fcntl.fcntl(css[1], fcntl.F_GETFD) fcntl.fcntl(css[1], fcntl.F_SETFD, flags | fcntl.FD_CLOEXEC) self.daemon = daemon p = os.fork() if p == 0: def exec_criu(): os.close(0) os.close(1) os.close(2) css[0].send(struct.pack('i', os.getpid())) os.execv(self.comm, [self.comm, 'swrk', "%d" % css[0].fileno()]) os._exit(1) if daemon: # Python has no daemon(3) alternative, # so we need to mimic it ourself. p = os.fork() if p == 0: os.setsid() exec_criu() else: os._exit(0) else: exec_criu() else: if daemon: os.waitpid(p, 0) css[0].close() self.swrk = struct.unpack('i', css[1].recv(4))[0] self.sk = css[1] return self.sk def disconnect(self): self.sk.close() if not self.daemon: os.waitpid(self.swrk, 0) class CRIUException(Exception): """ Exception class for handling and storing criu errors. """ typ = None _str = None def __str__(self): return self._str class CRIUExceptionInternal(CRIUException): """ Exception class for handling and storing internal errors. 
""" def __init__(self, typ, s): self.typ = typ self._str = "%s failed with internal error: %s" % (rpc.criu_req_type.Name(self.typ), s) class CRIUExceptionExternal(CRIUException): """ Exception class for handling and storing criu RPC errors. """ def __init__(self, req_typ, resp_typ, errno): self.typ = req_typ self.resp_typ = resp_typ self.errno = errno self._str = self._gen_error_str() def _gen_error_str(self): s = "%s failed: " % (rpc.criu_req_type.Name(self.typ), ) if self.typ != self.resp_typ: s += "Unxecpected response type %d: " % (self.resp_typ, ) s += "Error(%d): " % (self.errno, ) if self.errno == errno.EBADRQC: s += "Bad options" if self.typ == rpc.DUMP: if self.errno == errno.ESRCH: s += "No process with such pid" if self.typ == rpc.RESTORE: if self.errno == errno.EEXIST: s += "Process with requested pid already exists" s += "Unknown" return s class criu: """ Call criu through RPC. """ opts = None #CRIU options in pb format _comm = None #Communication method def __init__(self): self.use_binary('criu') self.opts = rpc.criu_opts() def use_sk(self, sk_name): """ Access criu using unix socket which that belongs to criu service daemon. """ self._comm = _criu_comm_sk(sk_name) def use_fd(self, fd): """ Access criu using provided fd. """ self._comm = _criu_comm_fd(fd) def use_binary(self, bin_name): """ Access criu by execing it using provided path to criu binary. """ self._comm = _criu_comm_bin(bin_name) def _send_req_and_recv_resp(self, req): """ As simple as send request and receive response. """ # In case of self-dump we need to spawn criu swrk detached # from our current process, as criu has a hard time separating # process resources from its own if criu is located in a same # process tree it is trying to dump. 
daemon = False if req.type == rpc.DUMP and not req.opts.HasField('pid'): daemon = True try: s = self._comm.connect(daemon) s.send(req.SerializeToString()) buf = s.recv(len(s.recv(1, socket.MSG_TRUNC | socket.MSG_PEEK))) self._comm.disconnect() resp = rpc.criu_resp() resp.ParseFromString(buf) except Exception as e: raise CRIUExceptionInternal(req.type, str(e)) return resp def check(self): """ Checks whether the kernel support is up-to-date. """ req = rpc.criu_req() req.type = rpc.CHECK resp = self._send_req_and_recv_resp(req) if not resp.success: raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) def dump(self): """ Checkpoint a process/tree identified by opts.pid. """ req = rpc.criu_req() req.type = rpc.DUMP req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) if not resp.success: raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) return resp.dump def restore(self): """ Restore a process/tree. """ req = rpc.criu_req() req.type = rpc.RESTORE req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) if not resp.success: raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) return resp.restore criu-3.6/lib/py/images/000077500000000000000000000000001317335042600147735ustar00rootroot00000000000000criu-3.6/lib/py/images/.gitignore000066400000000000000000000000361317335042600167620ustar00rootroot00000000000000*.pyc *_pb2.py magic.py pb.py criu-3.6/lib/py/images/Makefile000066400000000000000000000015621317335042600164370ustar00rootroot00000000000000all-y += images magic.py pb.py proto := $(filter-out images/rpc.proto, $(sort $(wildcard images/*.proto))) proto-py-modules := $(foreach m,$(proto),$(subst -,_,$(notdir $(m:.proto=_pb2)))) # We don't need rpc_pb2.py here, as it is not related to the images. # Unfortunately, we can't drop ugly _pb2 suffixes here, because # some _pb2 files depend on others _pb2 files. 
images: $(Q) protoc -I=images/ -I=/usr/include/ --python_out=$(obj) $(proto) .PHONY: images magic.py: scripts/magic-gen.py criu/include/magic.h $(call msg-gen, $@) $(Q) python $^ $(obj)/$@ pb.py: images $(Q) echo "# Autogenerated. Do not edit!" > $(obj)/$@ $(Q) for m in $(proto-py-modules); do \ echo "from $$m import *" >> $(obj)/$@ ;\ done .PHONY: pb.py cleanup-y += $(addprefix $(obj)/,magic.py pb.py *.pyc) cleanup-y += $(call cleanify,$(addprefix $(obj)/,$(addsuffix .py,$(proto-py-modules)))) criu-3.6/lib/py/images/__init__.py000066400000000000000000000000721317335042600171030ustar00rootroot00000000000000from magic import * from images import * from pb import * criu-3.6/lib/py/images/images.py000066400000000000000000000360221317335042600166150ustar00rootroot00000000000000#!/bin/env python2 # This file contains methods to deal with criu images. # # According to http://criu.org/Images, criu images can be described # with such IOW: # # IMAGE_FILE ::= MAGIC { ENTRY } # ENTRY ::= SIZE PAYLOAD [ EXTRA ] # PAYLOAD ::= "message encoded in ProtocolBuffer format" # EXTRA ::= "arbitrary blob, depends on the PAYLOAD contents" # # MAGIC ::= "32 bit integer" # SIZE ::= "32 bit integer, equals the PAYLOAD length" # # Images v1.1 NOTE: MAGIC now consist of 2 32 bit integers, first one is # MAGIC_COMMON or MAGIC_SERVICE and the second one is same as MAGIC # in images V1.0. We don't keep "first" magic in json images. # # In order to convert images to human-readable format, we use dict(json). # Using json not only allows us to easily read\write images, but also # to use a great variety of tools out there to manipulate them. # It also allows us to clearly describe criu images structure. # # Using dict(json) format, criu images can be described like: # # { # 'magic' : 'FOO', # 'entries' : [ # entry, # ... 
# ] # } # # Entry, in its turn, could be described as: # # { # pb_msg, # 'extra' : extra_msg # } # import io import google import struct import os import sys import json import pb2dict import array import magic from pb import * # # Predefined hardcoded constants sizeof_u16 = 2 sizeof_u32 = 4 sizeof_u64 = 8 # A helper for rounding def round_up(x,y): return (((x - 1) | (y - 1)) + 1) class MagicException(Exception): def __init__(self, magic): self.magic = magic # Generic class to handle loading/dumping criu images entries from/to bin # format to/from dict(json). class entry_handler: """ Generic class to handle loading/dumping criu images entries from/to bin format to/from dict(json). """ def __init__(self, payload, extra_handler=None): """ Sets payload class and extra handler class. """ self.payload = payload self.extra_handler = extra_handler def load(self, f, pretty = False, no_payload = False): """ Convert criu image entries from binary format to dict(json). Takes a file-like object and returnes a list with entries in dict(json) format. """ entries = [] while True: entry = {} # Read payload pb = self.payload() buf = f.read(4) if buf == '': break size, = struct.unpack('i', buf) pb.ParseFromString(f.read(size)) entry = pb2dict.pb2dict(pb, pretty) # Read extra if self.extra_handler: if no_payload: def human_readable(num): for unit in ['','K','M','G','T','P','E','Z']: if num < 1024.0: if int(num) == num: return "%d%sB" % (num, unit) else: return "%.1f%sB" % (num, unit) num /= 1024.0 return "%.1fYB" % num pl_size = self.extra_handler.skip(f, pb) entry['extra'] = '... <%s>' % human_readable(pl_size) else: entry['extra'] = self.extra_handler.load(f, pb) entries.append(entry) return entries def loads(self, s, pretty = False): """ Same as load(), but takes a string as an argument. """ f = io.BytesIO(s) return self.load(f, pretty) def dump(self, entries, f): """ Convert criu image entries from dict(json) format to binary. 
Takes a list of entries and a file-like object to write entries in binary format to. """ for entry in entries: extra = entry.pop('extra', None) # Write payload pb = self.payload() pb2dict.dict2pb(entry, pb) pb_str = pb.SerializeToString() size = len(pb_str) f.write(struct.pack('i', size)) f.write(pb_str) # Write extra if self.extra_handler and extra: self.extra_handler.dump(extra, f, pb) def dumps(self, entries): """ Same as dump(), but doesn't take file-like object and just returns a string. """ f = io.BytesIO('') self.dump(entries, f) return f.read() def count(self, f): """ Counts the number of top-level object in the image file """ entries = 0 while True: buf = f.read(4) if buf == '': break size, = struct.unpack('i', buf) f.seek(size, 1) entries += 1 return entries # Special handler for pagemap.img class pagemap_handler: """ Special entry handler for pagemap.img, which is unique in a way that it has a header of pagemap_head type followed by entries of pagemap_entry type. """ def load(self, f, pretty = False, no_payload = False): entries = [] pb = pagemap_head() while True: buf = f.read(4) if buf == '': break size, = struct.unpack('i', buf) pb.ParseFromString(f.read(size)) entries.append(pb2dict.pb2dict(pb, pretty)) pb = pagemap_entry() return entries def loads(self, s, pretty = False): f = io.BytesIO(s) return self.load(f, pretty) def dump(self, entries, f): pb = pagemap_head() for item in entries: pb2dict.dict2pb(item, pb) pb_str = pb.SerializeToString() size = len(pb_str) f.write(struct.pack('i', size)) f.write(pb_str) pb = pagemap_entry() def dumps(self, entries): f = io.BytesIO('') self.dump(entries, f) return f.read() def count(self, f): return entry_handler(None).count(f) - 1 # Special handler for ghost-file.img class ghost_file_handler: def load(self, f, pretty = False, no_payload = False): entries = [] gf = ghost_file_entry() buf = f.read(4) size, = struct.unpack('i', buf) gf.ParseFromString(f.read(size)) g_entry = pb2dict.pb2dict(gf, pretty) if 
gf.chunks: entries.append(g_entry) while True: gc = ghost_chunk_entry() buf = f.read(4) if buf == '': break size, = struct.unpack('i', buf) gc.ParseFromString(f.read(size)) entry = pb2dict.pb2dict(gc, pretty) if no_payload: f.seek(gc.len, os.SEEK_CUR) else: entry['extra'] = f.read(gc.len).encode('base64') entries.append(entry) else: if no_payload: f.seek(0, os.SEEK_END) else: g_entry['extra'] = f.read().encode('base64') entries.append(g_entry) return entries def loads(self, s, pretty = False): f = io.BytesIO(s) return self.load(f, pretty) def dump(self, entries, f): pb = ghost_file_entry() item = entries.pop(0) pb2dict.dict2pb(item, pb) pb_str = pb.SerializeToString() size = len(pb_str) f.write(struct.pack('i', size)) f.write(pb_str) if pb.chunks: for item in entries: pb = ghost_chunk_entry() pb2dict.dict2pb(item, pb) pb_str = pb.SerializeToString() size = len(pb_str) f.write(struct.pack('i', size)) f.write(pb_str) f.write(item['extra'].decode('base64')) else: f.write(item['extra'].decode('base64')) def dumps(self, entries): f = io.BytesIO('') self.dump(entries, f) return f.read() # In following extra handlers we use base64 encoding # to store binary data. Even though, the nature # of base64 is that it increases the total size, # it doesn't really matter, because our images # do not store big amounts of binary data. They # are negligible comparing to pages size. 
class pipes_data_extra_handler: def load(self, f, pload): size = pload.bytes data = f.read(size) return data.encode('base64') def dump(self, extra, f, pload): data = extra.decode('base64') f.write(data) def skip(self, f, pload): f.seek(pload.bytes, os.SEEK_CUR) return pload.bytes class sk_queues_extra_handler: def load(self, f, pload): size = pload.length data = f.read(size) return data.encode('base64') def dump(self, extra, f, pb): data = extra.decode('base64') f.write(data) def skip(self, f, pload): f.seek(pload.length, os.SEEK_CUR) return pload.length class tcp_stream_extra_handler: def load(self, f, pb): d = {} inq = f.read(pb.inq_len) outq = f.read(pb.outq_len) d['inq'] = inq.encode('base64') d['outq'] = outq.encode('base64') return d def dump(self, extra, f, pb): inq = extra['inq'].decode('base64') outq = extra['outq'].decode('base64') f.write(inq) f.write(outq) def skip(self, f, pb): f.seek(0, os.SEEK_END) return pb.inq_len + pb.outq_len class ipc_sem_set_handler: def load(self, f, pb): entry = pb2dict.pb2dict(pb) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) s = array.array('H') if s.itemsize != sizeof_u16: raise Exception("Array size mismatch") s.fromstring(f.read(size)) f.seek(rounded - size, 1) return s.tolist() def dump(self, extra, f, pb): entry = pb2dict.pb2dict(pb) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) s = array.array('H') if s.itemsize != sizeof_u16: raise Exception("Array size mismatch") s.fromlist(extra) if len(s) != entry['nsems']: raise Exception("Number of semaphores mismatch") f.write(s.tostring()) f.write('\0' * (rounded - size)) def skip(self, f, pb): entry = pb2dict.pb2dict(pb) size = sizeof_u16 * entry['nsems'] f.seek(round_up(size, sizeof_u64), os.SEEK_CUR) return size class ipc_msg_queue_handler: def load(self, f, pb): entry = pb2dict.pb2dict(pb) messages = [] for x in range (0, entry['qnum']): buf = f.read(4) if buf == '': break size, = struct.unpack('i', buf) msg = ipc_msg() 
msg.ParseFromString(f.read(size)) rounded = round_up(msg.msize, sizeof_u64) data = f.read(msg.msize) f.seek(rounded - msg.msize, 1) messages.append(pb2dict.pb2dict(msg)) messages.append(data.encode('base64')) return messages def dump(self, extra, f, pb): entry = pb2dict.pb2dict(pb) for i in range (0, len(extra), 2): msg = ipc_msg() pb2dict.dict2pb(extra[i], msg) msg_str = msg.SerializeToString() size = len(msg_str) f.write(struct.pack('i', size)) f.write(msg_str) rounded = round_up(msg.msize, sizeof_u64) data = extra[i + 1].decode('base64') f.write(data[:msg.msize]) f.write('\0' * (rounded - msg.msize)) def skip(self, f, pb): entry = pb2dict.pb2dict(pb) pl_len = 0 for x in range (0, entry['qnum']): buf = f.read(4) if buf == '': break size, = struct.unpack('i', buf) msg = ipc_msg() msg.ParseFromString(f.read(size)) rounded = round_up(msg.msize, sizeof_u64) f.seek(rounded, os.SEEK_CUR) pl_len += size + msg.msize return pl_len class ipc_shm_handler: def load(self, f, pb): entry = pb2dict.pb2dict(pb) size = entry['size'] data = f.read(size) rounded = round_up(size, sizeof_u32) f.seek(rounded - size, 1) return data.encode('base64') def dump(self, extra, f, pb): entry = pb2dict.pb2dict(pb) size = entry['size'] data = extra.decode('base64') rounded = round_up(size, sizeof_u32) f.write(data[:size]) f.write('\0' * (rounded - size)) def skip(self, f, pb): entry = pb2dict.pb2dict(pb) size = entry['size'] rounded = round_up(size, sizeof_u32) f.seek(rounded, os.SEEK_CUR) return size handlers = { 'INVENTORY' : entry_handler(inventory_entry), 'CORE' : entry_handler(core_entry), 'IDS' : entry_handler(task_kobj_ids_entry), 'CREDS' : entry_handler(creds_entry), 'UTSNS' : entry_handler(utsns_entry), 'IPC_VAR' : entry_handler(ipc_var_entry), 'FS' : entry_handler(fs_entry), 'GHOST_FILE' : ghost_file_handler(), 'MM' : entry_handler(mm_entry), 'CGROUP' : entry_handler(cgroup_entry), 'TCP_STREAM' : entry_handler(tcp_stream_entry, tcp_stream_extra_handler()), 'STATS' : 
entry_handler(stats_entry), 'PAGEMAP' : pagemap_handler(), # Special one 'PSTREE' : entry_handler(pstree_entry), 'REG_FILES' : entry_handler(reg_file_entry), 'NS_FILES' : entry_handler(ns_file_entry), 'EVENTFD_FILE' : entry_handler(eventfd_file_entry), 'EVENTPOLL_FILE' : entry_handler(eventpoll_file_entry), 'EVENTPOLL_TFD' : entry_handler(eventpoll_tfd_entry), 'SIGNALFD' : entry_handler(signalfd_entry), 'TIMERFD' : entry_handler(timerfd_entry), 'INOTIFY_FILE' : entry_handler(inotify_file_entry), 'INOTIFY_WD' : entry_handler(inotify_wd_entry), 'FANOTIFY_FILE' : entry_handler(fanotify_file_entry), 'FANOTIFY_MARK' : entry_handler(fanotify_mark_entry), 'VMAS' : entry_handler(vma_entry), 'PIPES' : entry_handler(pipe_entry), 'FIFO' : entry_handler(fifo_entry), 'SIGACT' : entry_handler(sa_entry), 'NETLINK_SK' : entry_handler(netlink_sk_entry), 'REMAP_FPATH' : entry_handler(remap_file_path_entry), 'MNTS' : entry_handler(mnt_entry), 'TTY_FILES' : entry_handler(tty_file_entry), 'TTY_INFO' : entry_handler(tty_info_entry), 'TTY_DATA' : entry_handler(tty_data_entry), 'RLIMIT' : entry_handler(rlimit_entry), 'TUNFILE' : entry_handler(tunfile_entry), 'EXT_FILES' : entry_handler(ext_file_entry), 'IRMAP_CACHE' : entry_handler(irmap_cache_entry), 'FILE_LOCKS' : entry_handler(file_lock_entry), 'FDINFO' : entry_handler(fdinfo_entry), 'UNIXSK' : entry_handler(unix_sk_entry), 'INETSK' : entry_handler(inet_sk_entry), 'PACKETSK' : entry_handler(packet_sock_entry), 'ITIMERS' : entry_handler(itimer_entry), 'POSIX_TIMERS' : entry_handler(posix_timer_entry), 'NETDEV' : entry_handler(net_device_entry), 'PIPES_DATA' : entry_handler(pipe_data_entry, pipes_data_extra_handler()), 'FIFO_DATA' : entry_handler(pipe_data_entry, pipes_data_extra_handler()), 'SK_QUEUES' : entry_handler(sk_packet_entry, sk_queues_extra_handler()), 'IPCNS_SHM' : entry_handler(ipc_shm_entry, ipc_shm_handler()), 'IPCNS_SEM' : entry_handler(ipc_sem_entry, ipc_sem_set_handler()), 'IPCNS_MSG' : entry_handler(ipc_msg_entry, 
ipc_msg_queue_handler()), 'NETNS' : entry_handler(netns_entry), 'USERNS' : entry_handler(userns_entry), 'SECCOMP' : entry_handler(seccomp_entry), 'AUTOFS' : entry_handler(autofs_entry), 'FILES' : entry_handler(file_entry), } def __rhandler(f): # Images v1.1 NOTE: First read "first" magic. img_magic, = struct.unpack('i', f.read(4)) if img_magic in (magic.by_name['IMG_COMMON'], magic.by_name['IMG_SERVICE']): img_magic, = struct.unpack('i', f.read(4)) try: m = magic.by_val[img_magic] except: raise MagicException(img_magic) try: handler = handlers[m] except: raise Exception("No handler found for image with magic " + m) return m, handler def load(f, pretty = False, no_payload = False): """ Convert criu image from binary format to dict(json). Takes a file-like object to read criu image from. Returns criu image in dict(json) format. """ image = {} m, handler = __rhandler(f) image['magic'] = m image['entries'] = handler.load(f, pretty, no_payload) return image def info(f): res = {} m, handler = __rhandler(f) res['magic'] = m res['count'] = handler.count(f) return res def loads(s, pretty = False): """ Same as load(), but takes a string. """ f = io.BytesIO(s) return load(f, pretty) def dump(img, f): """ Convert criu image from dict(json) format to binary. Takes an image in dict(json) format and file-like object to write to. """ m = img['magic'] magic_val = magic.by_name[img['magic']] # Images v1.1 NOTE: use "second" magic to identify what "first" # should be written. if m != 'INVENTORY': if m in ('STATS', 'IRMAP_CACHE'): f.write(struct.pack('i', magic.by_name['IMG_SERVICE'])) else: f.write(struct.pack('i', magic.by_name['IMG_COMMON'])) f.write(struct.pack('i', magic_val)) try: handler = handlers[m] except: raise Exception("No handler found for image with such magic") handler.dump(img['entries'], f) def dumps(img): """ Same as dump(), but takes only an image and returns a string. 
""" f = io.BytesIO('') dump(img, f) return f.getvalue() criu-3.6/lib/py/images/pb2dict.py000066400000000000000000000223731317335042600167030ustar00rootroot00000000000000from google.protobuf.descriptor import FieldDescriptor as FD import opts_pb2 import ipaddr import socket import collections import os # pb2dict and dict2pb are methods to convert pb to/from dict. # Inspired by: # protobuf-to-dict - https://github.com/benhodgson/protobuf-to-dict # protobuf-json - https://code.google.com/p/protobuf-json/ # protobuf source - https://code.google.com/p/protobuf/ # Both protobuf-to-dict/json do not fit here because of several reasons, # here are some of them: # - both have a common bug in treating optional field with empty # repeated inside. # - protobuf-to-json is not avalible in pip or in any other python # repo, so it is hard to distribute and we can't rely on it. # - both do not treat enums in a way we would like to. They convert # protobuf enum to int, but we need a string here, because it is # much more informative. BTW, protobuf text_format converts pb # enums to string value too. (i.e. "march : x86_64" is better then # "march : 1"). 
_basic_cast = { FD.TYPE_FIXED64 : long, FD.TYPE_FIXED32 : int, FD.TYPE_SFIXED64 : long, FD.TYPE_SFIXED32 : int, FD.TYPE_INT64 : long, FD.TYPE_UINT64 : long, FD.TYPE_SINT64 : long, FD.TYPE_INT32 : int, FD.TYPE_UINT32 : int, FD.TYPE_SINT32 : int, FD.TYPE_BOOL : bool, FD.TYPE_STRING : unicode } def _marked_as_hex(field): return field.GetOptions().Extensions[opts_pb2.criu].hex def _marked_as_ip(field): return field.GetOptions().Extensions[opts_pb2.criu].ipadd def _marked_as_flags(field): return field.GetOptions().Extensions[opts_pb2.criu].flags def _marked_as_dev(field): return field.GetOptions().Extensions[opts_pb2.criu].dev def _marked_as_odev(field): return field.GetOptions().Extensions[opts_pb2.criu].odev def _marked_as_dict(field): return field.GetOptions().Extensions[opts_pb2.criu].dict def _custom_conv(field): return field.GetOptions().Extensions[opts_pb2.criu].conv mmap_prot_map = [ ('PROT_READ', 0x1), ('PROT_WRITE', 0x2), ('PROT_EXEC', 0x4), ]; mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ]; mmap_status_map = [ ('VMA_AREA_NONE', 0 << 0), ('VMA_AREA_REGULAR', 1 << 0), ('VMA_AREA_STACK', 1 << 1), ('VMA_AREA_VSYSCALL', 1 << 2), ('VMA_AREA_VDSO', 1 << 3), ('VMA_AREA_HEAP', 1 << 5), ('VMA_FILE_PRIVATE', 1 << 6), ('VMA_FILE_SHARED', 1 << 7), ('VMA_ANON_SHARED', 1 << 8), ('VMA_ANON_PRIVATE', 1 << 9), ('VMA_AREA_SYSVIPC', 1 << 10), ('VMA_AREA_SOCKET', 1 << 11), ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), ('VMA_UNSUPP', 1 << 31), ]; rfile_flags_map = [ ('O_WRONLY', 01), ('O_RDWR', 02), ('O_APPEND', 02000), ('O_DIRECT', 040000), ('O_LARGEFILE', 0100000), ]; pmap_flags_map = [ ('PE_PARENT', 1 << 0), ('PE_LAZY', 1 << 1), ('PE_PRESENT', 1 << 2), ]; flags_maps = { 'mmap.prot' : mmap_prot_map, 'mmap.flags' : mmap_flags_map, 'mmap.status' : mmap_status_map, 'rfile.flags' : rfile_flags_map, 'pmap.flags' : pmap_flags_map, } gen_maps = { 'task_state' : { 1: 'Alive', 3: 'Zombie', 6: 'Stopped' }, } 
sk_maps = { 'family' : { 2: 'INET' }, 'type' : { 1: 'STREAM', 2: 'DGRAM' }, 'state' : { 1: 'ESTABLISHED', 7: 'CLOSE', 10: 'LISTEN' }, 'proto' : { 6: 'TCP' }, } gen_rmaps = { k: {v2:k2 for k2,v2 in v.items()} for k,v in gen_maps.items() } sk_rmaps = { k: {v2:k2 for k2,v2 in v.items()} for k,v in sk_maps.items() } dict_maps = { 'gen' : ( gen_maps, gen_rmaps ), 'sk' : ( sk_maps, sk_rmaps ), } def map_flags(value, flags_map): bs = map(lambda x: x[0], filter(lambda x: value & x[1], flags_map)) value &= ~sum(map(lambda x: x[1], flags_map)) if value: bs.append("0x%x" % value) return " | ".join(bs) def unmap_flags(value, flags_map): if value == '': return 0 bd = dict(flags_map) return sum(map(lambda x: int(str(bd.get(x, x)), 0), map(lambda x: x.strip(), value.split('|')))) kern_minorbits = 20 # This is how kernel encodes dev_t in new format def decode_dev(field, value): if _marked_as_odev(field): return "%d:%d" % (os.major(value), os.minor(value)) else: return "%d:%d" % (value >> kern_minorbits, value & ((1 << kern_minorbits) - 1)) def encode_dev(field, value): dev = map(lambda x: int(x), value.split(':')) if _marked_as_odev(field): return os.makedev(dev[0], dev[1]) else: return dev[0] << kern_minorbits | dev[1] def encode_base64(value): return value.encode('base64') def decode_base64(value): return value.decode('base64') def encode_unix(value): return value.encode('quopri') def decode_unix(value): return value.decode('quopri') encode = { 'unix_name': encode_unix } decode = { 'unix_name': decode_unix } def get_bytes_enc(field): c = _custom_conv(field) if c: return encode[c] else: return encode_base64 def get_bytes_dec(field): c = _custom_conv(field) if c: return decode[c] else: return decode_base64 def is_string(value): return isinstance(value, unicode) or isinstance(value, str) def _pb2dict_cast(field, value, pretty = False, is_hex = False): if not is_hex: is_hex = _marked_as_hex(field) if field.type == FD.TYPE_MESSAGE: return pb2dict(value, pretty, is_hex) elif 
field.type == FD.TYPE_BYTES: return get_bytes_enc(field)(value) elif field.type == FD.TYPE_ENUM: return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] if pretty and (cast == int or cast == long): if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. return "0x%x" % value if _marked_as_dev(field): return decode_dev(field, value) flags = _marked_as_flags(field) if flags: try: flags_map = flags_maps[flags] except: return "0x%x" % value # flags are better seen as hex anyway else: return map_flags(value, flags_map) dct = _marked_as_dict(field) if dct: return dict_maps[dct][0][field.name].get(value, cast(value)) return cast(value) else: raise Exception("Field(%s) has unsupported type %d" % (field.name, field.type)) def pb2dict(pb, pretty = False, is_hex = False): """ Convert protobuf msg to dictionary. Takes a protobuf message and returns a dict. """ d = collections.OrderedDict() if pretty else {} for field, value in pb.ListFields(): if field.label == FD.LABEL_REPEATED: d_val = [] if pretty and _marked_as_ip(field): if len(value) == 1: v = socket.ntohl(value[0]) addr = ipaddr.IPv4Address(v) else: v = 0 + (socket.ntohl(value[0]) << (32 * 3)) + \ (socket.ntohl(value[1]) << (32 * 2)) + \ (socket.ntohl(value[2]) << (32 * 1)) + \ (socket.ntohl(value[3])) addr = ipaddr.IPv6Address(v) d_val.append(addr.compressed) else: for v in value: d_val.append(_pb2dict_cast(field, v, pretty, is_hex)) else: d_val = _pb2dict_cast(field, value, pretty, is_hex) d[field.name] = d_val return d def _dict2pb_cast(field, value): # Not considering TYPE_MESSAGE here, as repeated # and non-repeated messages need special treatment # in this case, and are hadled separately. 
if field.type == FD.TYPE_BYTES: return get_bytes_dec(field)(value) elif field.type == FD.TYPE_ENUM: return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] if (cast == int or cast == long) and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) flags = _marked_as_flags(field) if flags: try: flags_map = flags_maps[flags] except: pass # Try to use plain string cast else: return unmap_flags(value, flags_map) dct = _marked_as_dict(field) if dct: ret = dict_maps[dct][1][field.name].get(value, None) if ret == None: ret = cast(value, 0) return ret # Some int or long fields might be stored as hex # strings. See _pb2dict_cast. return cast(value, 0) else: return cast(value) else: raise Exception("Field(%s) has unsupported type %d" % (field.name, field.type)) def dict2pb(d, pb): """ Convert dictionary to protobuf msg. Takes dict and protobuf message to be merged into. """ for field in pb.DESCRIPTOR.fields: if field.name not in d: continue value = d[field.name] if field.label == FD.LABEL_REPEATED: pb_val = getattr(pb, field.name, None) if is_string(value[0]) and _marked_as_ip(field): val = ipaddr.IPAddress(value[0]) if val.version == 4: pb_val.append(socket.htonl(int(val))) elif val.version == 6: ival = int(val) pb_val.append(socket.htonl((ival >> (32 * 3)) & 0xFFFFFFFF)) pb_val.append(socket.htonl((ival >> (32 * 2)) & 0xFFFFFFFF)) pb_val.append(socket.htonl((ival >> (32 * 1)) & 0xFFFFFFFF)) pb_val.append(socket.htonl((ival >> (32 * 0)) & 0xFFFFFFFF)) else: raise Exception("Unknown IP address version %d" % val.version) continue for v in value: if field.type == FD.TYPE_MESSAGE: dict2pb(v, pb_val.add()) else: pb_val.append(_dict2pb_cast(field, v)) else: if field.type == FD.TYPE_MESSAGE: # SetInParent method acts just like has_* = true in C, # and helps to properly treat cases when we have optional # field with empty repeated inside. 
getattr(pb, field.name).SetInParent() dict2pb(value, getattr(pb, field.name, None)) else: setattr(pb, field.name, _dict2pb_cast(field, value)) return pb criu-3.6/scripts/000077500000000000000000000000001317335042600140175ustar00rootroot00000000000000criu-3.6/scripts/build/000077500000000000000000000000001317335042600151165ustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.aarch64.hdr000066400000000000000000000001711317335042600211120ustar00rootroot00000000000000FROM arm64v8/ubuntu:xenial COPY scripts/build/qemu-user-static/usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static criu-3.6/scripts/build/Dockerfile.aarch64.tmpl000077700000000000000000000000001317335042600242522Dockerfile.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.alpine000066400000000000000000000014471317335042600203650ustar00rootroot00000000000000FROM alpine ARG CC=gcc ARG ENV1=FOOBAR RUN apk update && apk add \ build-base \ coreutils \ git \ protobuf-c-dev \ protobuf-dev \ python \ libaio-dev \ libcap-dev \ libnl3-dev \ pkgconfig \ libnet-dev \ ccache \ $CC COPY . 
/criu WORKDIR /criu ENV CC="ccache $CC" CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes RUN mv .ccache /tmp && make mrproper && ccache -s && ccache -z &&\ date && make -j $(nproc) CC="$CC" && date && ccache -s # Run a test RUN apk add py-yaml \ py-pip \ ip6tables \ iptables \ iproute2 \ tar \ bash RUN pip install protobuf ipaddr RUN make -C test/zdtm criu-3.6/scripts/build/Dockerfile.armv7hf.hdr000066400000000000000000000001611317335042600212330ustar00rootroot00000000000000FROM arm32v7/ubuntu:xenial COPY scripts/build/qemu-user-static/usr/bin/qemu-arm-static /usr/bin/qemu-arm-static criu-3.6/scripts/build/Dockerfile.armv7hf.tmpl000077700000000000000000000000001317335042600243742Dockerfile.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.fedora-asan.hdr000066400000000000000000000000361317335042600220420ustar00rootroot00000000000000FROM fedora:latest ENV ASAN=1 criu-3.6/scripts/build/Dockerfile.fedora-asan.tmpl000077700000000000000000000000001317335042600264412Dockerfile.fedora.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr000066400000000000000000000001721317335042600237720ustar00rootroot00000000000000FROM arm64v8/fedora:rawhide COPY scripts/build/qemu-user-static/usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static criu-3.6/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl000077700000000000000000000000001317335042600303702Dockerfile.fedora.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.fedora-rawhide.hdr000066400000000000000000000000241317335042600225400ustar00rootroot00000000000000FROM fedora:rawhide criu-3.6/scripts/build/Dockerfile.fedora-rawhide.tmpl000077700000000000000000000000001317335042600271422Dockerfile.fedora.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.fedora.tmpl000066400000000000000000000010251317335042600213200ustar00rootroot00000000000000ARG CC=gcc ARG ENV1=FOOBAR RUN dnf install -y git gcc make RUN dnf install -y protobuf-devel 
protobuf-c-devel libaio-devel libcap-devel libnl3-devel libnet-devel RUN dnf install -y python ccache libasan findutils tar python-yaml protobuf-python iptables iproute python-ipaddr procps-ng COPY . /criu WORKDIR /criu ENV CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes RUN mv .ccache /tmp && make mrproper && ccache -s && ccache -z && \ date && make -j $(nproc) CC="$CC" && date && ccache -s RUN make -C test/zdtm -j $(nproc) criu-3.6/scripts/build/Dockerfile.ppc64le.hdr000066400000000000000000000002751317335042600211440ustar00rootroot00000000000000FROM ppc64le/ubuntu:xenial ENV QEMU_CPU POWER8 COPY scripts/build/qemu-user-static/usr/bin/qemu-ppc64le-static /usr/bin/qemu-ppc64le-static RUN sed -i '/security/ d' /etc/apt/sources.list criu-3.6/scripts/build/Dockerfile.ppc64le.tmpl000077700000000000000000000000001317335042600242772Dockerfile.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.rawhide000066400000000000000000000006431317335042600205350ustar00rootroot00000000000000FROM fedora:rawhide ARG CC=gcc RUN dnf install -y git gcc make RUN dnf install -y protobuf-devel protobuf-c-devel libaio-devel libcap-devel libnl3-devel libnet-devel RUN dnf install -y python ccache libasan findutils tar python-yaml protobuf-python iptables iproute python-ipaddr procps-ng COPY . 
/criu WORKDIR /criu RUN make mrproper && make -j $(nproc) RUN pip install protobuf RUN make -C test/zdtm/static env00 criu-3.6/scripts/build/Dockerfile.s390x.hdr000066400000000000000000000002051317335042600205460ustar00rootroot00000000000000FROM s390x/debian:jessie ENV QEMU_CPU z900 COPY scripts/build/qemu-user-static/usr/bin/qemu-s390x-static /usr/bin/qemu-s390x-static criu-3.6/scripts/build/Dockerfile.s390x.tmpl000077700000000000000000000000001317335042600237102Dockerfile.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Dockerfile.tmpl000066400000000000000000000022061317335042600200630ustar00rootroot00000000000000ARG CC=gcc ARG ENV1=FOOBAR RUN apt-get update && apt-get install -y \ build-essential \ protobuf-c-compiler \ libprotobuf-c0-dev \ libprotobuf-dev \ bsdmainutils \ protobuf-compiler \ python-minimal \ libaio-dev \ libcap-dev \ iptables \ libnl-3-dev \ libselinux-dev \ pkg-config \ git-core \ libnet-dev \ ccache \ $CC COPY . /criu WORKDIR /criu ENV CC="ccache $CC" CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes RUN mv .ccache /tmp && make mrproper && ccache -s && \ date && \ # Check single object build make -j $(nproc) CC="$CC" criu/parasite-syscall.o && \ # Compile criu make -j $(nproc) CC="$CC" && \ date && \ # Check that "make mrproper" works make mrproper && ! git clean -ndx --exclude=scripts/build \ --exclude=.config --exclude=test | grep . 
# Compile tests RUN date && make -j $(nproc) CC="$CC" -C test/zdtm && date #RUN make test/compel/handle_binary && ./test/compel/handle_binary criu-3.6/scripts/build/Dockerfile.x86_64.hdr000066400000000000000000000001411317335042600206150ustar00rootroot00000000000000FROM ubuntu:xenial RUN apt-get update -qq && apt-get install -qq \ gcc-multilib criu-3.6/scripts/build/Dockerfile.x86_64.tmpl000077700000000000000000000000001317335042600237602Dockerfile.tmplustar00rootroot00000000000000criu-3.6/scripts/build/Makefile000066400000000000000000000024741317335042600165650ustar00rootroot00000000000000QEMU_ARCHES := armv7hf aarch64 ppc64le s390x fedora-rawhide-aarch64 # require qemu ARCHES := $(QEMU_ARCHES) x86_64 fedora-asan fedora-rawhide TARGETS := $(ARCHES) alpine TARGETS_CLANG := $(addsuffix $(TARGETS),-clang) all: $(TARGETS) $(TARGETS_CLANG) .PHONY: all # A build for each architecture requires appropriate Dockerfile define ARCH_DEP $(1): Dockerfile.$(1) endef $(foreach arch,$(ARCHES),$(eval $(call ARCH_DEP,$(arch)))) Dockerfile.%: Dockerfile.%.hdr Dockerfile.%.tmpl cat $^ > $@ qemu-user-static: ./extract-deb-pkg qemu-user-static binfmt_misc: ./binfmt_misc .PHONY: binfmt_misc $(QEMU_ARCHES): qemu-user-static binfmt_misc $(TARGETS): mkdir -p $(HOME)/.ccache mv $(HOME)/.ccache ../../ docker build -t criu-$@ -f Dockerfile.$@ $(DB_CC) $(DB_ENV) ../.. 
docker run criu-$@ tar c -C /tmp .ccache | tar x -C $(HOME) .PHONY: $(TARGETS) # Clang builds add some Docker build env define CLANG_DEP $(1)-clang: $(1) endef $(foreach t,$(TARGETS),$(eval $(call CLANG_DEP,$(t)))) %-clang: DB_CC=--build-arg CC=clang %-clang: DB_ENV=--build-arg ENV1=CCACHE_CPP2 s390x-clang: DB_CC=--build-arg CC=clang-3.8 .PHONY: $(TARGETS_CLANG) clean: rm -rf qemu-user-static for ARCH in $(ARCHES); do \ FILE=/proc/sys/fs/binfmt_misc/$$ARCH; \ test -f $$FILE && echo -1 > $$FILE; \ rm -f Dockerfile.$$ARCH; \ done .PHONY: clean criu-3.6/scripts/build/binfmt_misc000077500000000000000000000021571317335042600173430ustar00rootroot00000000000000set -e -x test -f /proc/sys/fs/binfmt_misc/armv7hf || echo ':armv7hf:M::\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x28\x00:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff:/usr/bin/qemu-arm-static:' > /proc/sys/fs/binfmt_misc/register; test -f /proc/sys/fs/binfmt_misc/aarch64 || echo ':aarch64:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb7:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/bin/qemu-aarch64-static:' > /proc/sys/fs/binfmt_misc/register test -f /proc/sys/fs/binfmt_misc/ppc64le || echo ':ppc64le:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x15\x00:\xff\xff\xff\xff\xff\xff\xff\xfc\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\x00:/usr/bin/qemu-ppc64le-static:' > /proc/sys/fs/binfmt_misc/register; done test -f /proc/sys/fs/binfmt_misc/s390x || echo ':s390x:M::\x7fELF\x02\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x16:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/bin/qemu-s390x-static:' > /proc/sys/fs/binfmt_misc/register criu-3.6/scripts/build/extract-deb-pkg000077500000000000000000000012121317335042600200210ustar00rootroot00000000000000#!/bin/bash set -e set -u set -o pipefail MIRROR="https://mirrors.kernel.org/ubuntu" 
PKGS="$MIRROR/dists/xenial/universe/binary-amd64/Packages.gz" if [ $# -ne 1 ]; then echo "Usage: $0 package-name" 1>&2 exit 1 fi if [ -d "$1" ]; then echo "Directory $1 already exists -- exiting" exit 0 fi if ! pkg=$(curl -sSL "$PKGS" | zgrep "Filename.*$1" | awk '{ print $2 }'); then echo "ERROR: no packages matching $1" 1>&2 exit 1 fi if [ "$(wc -w <<< "$pkg")" -gt 1 ]; then echo "$pkg" 1>&2 echo "ERROR: more than one match for $1" 1>&2 exit 1 fi mkdir "$1" cd "$1" wget "$MIRROR/$pkg" pkg=$(basename "$pkg") ar vx "$pkg" tar xJvf data.tar.xz criu-3.6/scripts/crit-setup.py000066400000000000000000000005461317335042600164750ustar00rootroot00000000000000from distutils.core import setup setup(name = "crit", version = "0.0.1", description = "CRiu Image Tool", author = "CRIU team", author_email = "criu@openvz.org", url = "https://github.com/xemul/criu", package_dir = {'pycriu': 'lib/py'}, packages = ["pycriu", "pycriu.images"], scripts = ["crit/crit"] ) criu-3.6/scripts/fake-restore.sh000077500000000000000000000005451317335042600167510ustar00rootroot00000000000000#!/bin/bash # # A stupid script to abort restore at the very end. Useful to test # restore w/o letting the restored processes continue running. E.g. # can be used to measure the restore time. 
# # Usage: # criu restore --action-script $(pwd)/scripts/fake-restore.sh # if [ "$CRTOOLS_SCRIPT_ACTION" == "post-restore" ]; then exit 1 else exit 0 fi criu-3.6/scripts/feature-tests.mak000066400000000000000000000033601317335042600173060ustar00rootroot00000000000000define FEATURE_TEST_TCP_REPAIR #include int main(void) { struct tcp_repair_opt opts; opts.opt_code = TCP_NO_QUEUE; opts.opt_val = 0; return opts.opt_val; } endef define FEATURE_TEST_TCP_REPAIR_WINDOW #include int main(void) { struct tcp_repair_window opts; opts.snd_wl1 = 0; return opts.snd_wl1; } endef define FEATURE_TEST_LIBBSD_DEV #include int main(void) { return 0; } endef define FEATURE_TEST_STRLCPY #include #ifdef CONFIG_HAS_LIBBSD # include #endif int main(void) { return strlcpy(NULL, NULL, 0); } endef define FEATURE_TEST_STRLCAT #include #ifdef CONFIG_HAS_LIBBSD # include #endif int main(void) { return strlcat(NULL, NULL, 0); } endef define FEATURE_TEST_PTRACE_PEEKSIGINFO #include int main(void) { struct ptrace_peeksiginfo_args args = {}; return 0; } endef define FEATURE_TEST_SETPROCTITLE_INIT #include int main(int argc, char *argv[], char *envp[]) { setproctitle_init(argc, argv, envp); return 0; } endef define FEATURE_TEST_X86_COMPAT #define __ALIGN .align 4, 0x90 #define ENTRY(name) \ .globl name; \ .type name, @function; \ __ALIGN; \ name: #define END(sym) \ .size sym, . 
- sym #define __USER32_CS 0x23 #define __USER_CS 0x33 .text ENTRY(call32_from_64) /* Switch into compatibility mode */ pushq \$$__USER32_CS pushq \$$1f lretq 1: .code32 /* Run function and switch back */ call *%esi jmp \$$__USER_CS,\$$1f .code64 1: END(call32_from_64) ENTRY(main) nop END(main) endef criu-3.6/scripts/flake8.cfg000066400000000000000000000005101317335042600156460ustar00rootroot00000000000000[flake8] # W191 indentation contains tabs # E128 continuation line under-indented for visual indent # E501 line too long # E251 unexpected spaces around keyword / parameter equals # E101 indentation contains mixed spaces and tabs # E126 continuation line over-indented for hanging indent ignore = W191,E128,E501,E251,E101,E126 criu-3.6/scripts/install-debian-pkgs.sh000077500000000000000000000007611317335042600202120ustar00rootroot00000000000000#!/bin/bash # Install required packages for development environment in Debian Distro REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} help_msg="Install required packages for development environment in Debian Distro Usage: scripts/install-debian-pkgs.sh" function print_help() { exec echo -e "$help_msg" } function process() { sudo apt-get update sudo apt-get install -yq $( sed 's/\#.*$//' ${REQ_PKGS} ) } if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then print_help else process fi criu-3.6/scripts/magic-gen.py000077500000000000000000000025621317335042600162300ustar00rootroot00000000000000#!/bin/env python2 import os, sys import struct # This program parses criu magic.h file and produces # magic.py with all *_MAGIC constants except RAW and V1. def main(argv): if len(argv) != 3: print("Usage: magic-gen.py path/to/image.h path/to/magic.py") exit(1) magic_c_header = argv[1] magic_py = argv[2] out = open(magic_py, 'w+') # all_magic is used to parse constructions like: # #define PAGEMAP_MAGIC 0x56084025 # #define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC all_magic = {} # and magic is used to store only unique magic. 
magic = {} f = open(magic_c_header, 'r') for line in f: split = line.split() if len(split) < 3: continue if not '#define' in split[0]: continue key = split[1] value = split[2] if value in all_magic: value = all_magic[value] else: magic[key] = value all_magic[key] = value out.write('#Autogenerated. Do not edit!\n') out.write('by_name = {}\n') out.write('by_val = {}\n') for k,v in magic.items(): # We don't need RAW or V1 magic, because # they can't be used to identify images. if v == '0x0' or v == '1' or k == '0x0' or v == '1': continue if k.endswith("_MAGIC"): # Just cutting _MAGIC suffix k = k[:-6] v = int(v, 16) out.write("by_name['"+ k +"'] = "+ str(v) +"\n") out.write("by_val["+ str(v) +"] = '"+ k +"'\n") f.close() out.close() if __name__ == "__main__": main(sys.argv) criu-3.6/scripts/nmk/000077500000000000000000000000001317335042600146045ustar00rootroot00000000000000criu-3.6/scripts/nmk/.gitignore000066400000000000000000000000301317335042600165650ustar00rootroot00000000000000*.swp *.swo .git-ignore criu-3.6/scripts/nmk/Documentation/000077500000000000000000000000001317335042600174155ustar00rootroot00000000000000criu-3.6/scripts/nmk/Documentation/Makefile000066400000000000000000000016411317335042600210570ustar00rootroot00000000000000ASCIIDOC := asciidoc A2X := a2x XMLTO := xmlto PS2PDF := ps2pdf SRC += nmk.txt XMLS := $(patsubst %.txt,%.xml,$(SRC)) MANS := $(patsubst %.txt,%.8,$(SRC)) GROFF := groff PAPER := $(shell paperconf 2>/dev/null || echo letter) GROFF_OPTS := -Tps -t -dpaper=$(PAPER) -P-p$(PAPER) -man -msafer -rC1 -rD1 -rS11 PSS := $(MANS:%.8=%.ps) PDFS := $(MANS:%.8=%.pdf) ps: $(PSS) pdf: $(PDFS) all: check $(MANS) .PHONY: all ps pdf check clean check: $(Q) for B in $(ASCIIDOC) $(A2X) $(XMLTO); do \ $$B --version > /dev/null || exit 1; \ done %.8: %.txt $(call msg-gen, $@) $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< $(Q) $(XMLTO) man --skip-validation $(patsubst %.8,%.xml,$@) 2>/dev/null %.ps: %.8 $(call msg-gen, $@) $(Q) 
$(GROFF) $(GROFF_OPTS) $^ > $@ %.pdf: %.ps $(call msg-gen, $@) $(Q) $(PS2PDF) $< $@ clean: $(call msg-clean, "docs") $(Q) $(RM) $(XMLS) $(MANS) $(PSS) $(PDFS) criu-3.6/scripts/nmk/Documentation/nmk.txt000066400000000000000000000034231317335042600207450ustar00rootroot00000000000000nmk(8) ====== NAME ---- nmk - a framework to minimize Makefile code needed for simple projects SYNOPSIS -------- *make* -f main.mk makefile=Makefile obj= OVERVIEW -------- Most of projects have similar source code structure: * Toplevel 'Makefile' * Source code itself in directory '' * Headers are gathered into directory '' so that building procedure is invoking *make* to read toplevel 'Makefile', compile sources and link a final executable program. Taking this into account *nmk* is trying to minimize efforts needed to write 'Makefile'. USAGE ----- First of all the *nmk* scripts are to be placed into some known place so the *make* would be able to read them from a command line. Internally *nmk* uses *__nmk_dir* variable to find own sources. Thus one can export ---------- export __nmk_dir=/ ---------- in a makefile or do it via environment variables. Note the ending slash is mandatory. As been mentioned earlier source code tree should include toplevel 'Makefile' and source code in '' directory. Source code '' should provide own 'Makefile' (secondlevel) where files to be compiled are enumerated. A typical source code tree will look like ---------- Makefile # toplevel Makefile # directory with nmk scripts # source code directory Makefile # secondlevel Makefile src1.c # source code src2.c ... ---------- In toplevel 'Makefile' we should plug in *nmk* itself ---------- export __nmk_dir=scripts/ include $(__nmk_dir)include.mk ---------- In secondlevel 'Makefile' we should enumerate files to be compiled. ---------- obj-y += src1.o obj-y += src2.o ... ---------- That is basically all one need to build a program. 
criu-3.6/scripts/nmk/Makefile000066400000000000000000000014631317335042600162500ustar00rootroot00000000000000__nmk_dir=scripts/ export __nmk_dir include $(__nmk_dir)include.mk MAKEFLAGS := -r -R --no-print-directory .PHONY: all help test docs clean install help: @echo ' Targets:' @echo ' install dir= - Install scripts into directory ' @echo ' docs - Build documentation' @echo ' clean - Clean everything' test: $(Q) $(MAKE) -C tests all docs: $(Q) $(MAKE) -C Documentation all install: @echo 'Copying scripts into $(dir)' @cp scripts/build.mk $(dir) @cp scripts/include.mk $(dir) @cp scripts/macro.mk $(dir) @cp scripts/main.mk $(dir) @cp scripts/rules.mk $(dir) @cp scripts/tools.mk $(dir) @cp scripts/utils.mk $(dir) all: ; clean: $(call msg-clean, "nmk") $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) -C tests clean .DEFAULT_GOAL ?= all criu-3.6/scripts/nmk/README.md000066400000000000000000000002201317335042600160550ustar00rootroot00000000000000NMK === NMK stands for NetMaKe -- is a very simple framework for make build system. Most ideas are taken from the Linux kernel kbuild system. criu-3.6/scripts/nmk/scripts/000077500000000000000000000000001317335042600162735ustar00rootroot00000000000000criu-3.6/scripts/nmk/scripts/build.mk000066400000000000000000000230001317335042600177160ustar00rootroot00000000000000ifndef ____nmk_defined__build # # General helpers for simplified Makefiles. 
# src := $(obj) src-makefile := $(call objectify,$(makefile)) obj-y := lib-y := target := deps-y := all-y := builtin-name := lib-name := ld_flags := cleanup-y := mrproper-y := objdirs := libso-y := MAKECMDGOALS := $(call uniq,$(MAKECMDGOALS)) ifndef obj $(error obj is undefined) endif ifndef __nmk-makefile-deps # Add top-make - it isn't included into this build.mk __nmk-makefile-deps := Makefile endif __nmk-makefile-deps += $(src-makefile) export __nmk-makefile-deps # # Filter out any -Wl,XXX option: some of build farms # assumes that we're using $(CC) for building built-in # targets (and they have all rights to). But we're # using $(LD) directly instead so filter out -Wl # flags to make maintainer's life easier. LDFLAGS-MASK := -Wl,% LDFLAGS := $(filter-out $(LDFLAGS-MASK),$(LDFLAGS)) # # Accumulate common flags. define nmk-ccflags $(filter-out $(CFLAGS_REMOVE_$(@F)), $(CFLAGS) $(ccflags-y) $(CFLAGS_$(@F))) endef define nmk-asflags $(CFLAGS) $(AFLAGS) $(asflags-y) $(AFLAGS_$(@F)) endef define nmk-host-ccflags $(HOSTCFLAGS) $(host-ccflags-y) $(HOSTCFLAGS_$(@F)) endef # # General rules. 
define gen-cc-rules $(1).o: $(2).c $(__nmk-makefile-deps) $$(call msg-cc, $$@) $$(Q) $$(CC) -c $$(strip $$(nmk-ccflags)) $$< -o $$@ $(1).i: $(2).c $(__nmk-makefile-deps) $$(call msg-cc, $$@) $$(Q) $$(CC) -E $$(strip $$(nmk-ccflags)) $$< -o $$@ $(1).s: $(2).c $(__nmk-makefile-deps) $$(call msg-cc, $$@) $$(Q) $$(CC) -S -fverbose-asm $$(strip $$(nmk-ccflags)) $$< -o $$@ $(1).d: $(2).c $(__nmk-makefile-deps) $$(call msg-dep, $$@) $$(Q) $$(CC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(strip $$(nmk-ccflags)) $$< -o $$@ $(1).o: $(2).S $(__nmk-makefile-deps) $$(call msg-cc, $$@) $$(Q) $$(CC) -c $$(strip $$(nmk-asflags)) $$< -o $$@ $(1).i: $(2).S $(__nmk-makefile-deps) $$(call msg-cc, $$@) $$(Q) $$(CC) -E $$(strip $$(nmk-asflags)) $$< -o $$@ $(1).d: $(2).S $(__nmk-makefile-deps) $$(call msg-dep, $$@) $$(Q) $$(CC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(strip $$(nmk-asflags)) $$< -o $$@ endef include $(src-makefile) ifneq ($(strip $(target)),) target := $(sort $(call uniq,$(target))) endif # # Prepare the unique entries. obj-y := $(sort $(call uniq,$(obj-y))) lib-y := $(filter-out $(obj-y),$(lib-y)) # # Add subdir path obj-y := $(call objectify,$(obj-y)) lib-y := $(call objectify,$(lib-y)) # # Strip custom names. lib-name := $(strip $(lib-name)) builtin-name := $(strip $(builtin-name)) # # Link flags. ld_flags := $(strip $(LDFLAGS) $(ldflags-y)) # # $(obj) related rules. $(eval $(call gen-cc-rules,$(obj)/%,$(obj)/%)) # # Prepare targets. 
ifneq ($(lib-y),) lib-target := ifneq ($(lib-name),) lib-target := $(obj)/$(lib-name) else lib-target := $(obj)/lib.a endif cleanup-y += $(call cleanify,$(lib-y)) cleanup-y += $(lib-target) all-y += $(lib-target) objdirs += $(dir $(lib-y)) endif ifneq ($(obj-y),) builtin-target := ifneq ($(builtin-name),) builtin-target := $(obj)/$(builtin-name) else builtin-target := $(obj)/built-in.o endif cleanup-y += $(call cleanify,$(obj-y)) cleanup-y += $(builtin-target) all-y += $(builtin-target) objdirs += $(dir $(obj-y)) endif # # Helpers for targets. define gen-ld-target-rule $(1): $(3) $$(call msg-link, $$@) $$(Q) $$(LD) $(2) -o $$@ $(4) endef define gen-ar-target-rule $(1): $(3) $$(call msg-ar, $$@) $$(Q) $$(AR) -rcs$(2) $$@ $(4) endef # # Predefined (builtins) targets rules. ifdef builtin-target $(eval $(call gen-ld-target-rule, \ $(builtin-target), \ $(ld_flags), \ $(obj-y) $(__nmk-makefile-deps), \ $(obj-y) $(call objectify,$(obj-e)))) endif ifdef lib-target $(eval $(call gen-ar-target-rule, \ $(lib-target), \ $(ARFLAGS) $(arflags-y), \ $(lib-y) $(__nmk-makefile-deps), \ $(lib-y) $(call objectify,$(lib-e)))) endif # # Custom targets rules. 
#
# gen-custom-target-rule: rules for one custom target listed in 'target'.
#
# Argument 1 is the target name. If <name>-obj-y is not empty, a rule
# linking $(obj)/<name>.built-in.o is generated; if <name>-lib-y is not
# empty, a rule archiving $(obj)/<name>.lib.a is generated. In both cases
# the result is added to all-y and cleanup-y, and the directories of the
# objects are collected in objdirs. The <name>-obj-e / <name>-lib-e lists
# are passed as extra inputs of the link/archive command only (they are
# not prerequisites).
#
# The extra <name>-lib-e objects are passed inside the fourth argument of
# gen-ar-target-rule, the same way the obj-y branch passes <name>-obj-e
# to gen-ld-target-rule.
define gen-custom-target-rule
        ifneq ($($(1)-obj-y),)
                $(eval $(call gen-ld-target-rule,                       \
                                $(obj)/$(1).built-in.o,                 \
                                $(ld_flags) $(LDFLAGS_$(1)),            \
                                $(call objectify,$($(1)-obj-y))         \
                                $(__nmk-makefile-deps),                 \
                                $(call objectify,$($(1)-obj-y))         \
                                $(call objectify,$($(1)-obj-e))))
                all-y += $(obj)/$(1).built-in.o
                cleanup-y += $(call cleanify,$(call objectify,$($(1)-obj-y)))
                cleanup-y += $(obj)/$(1).built-in.o
                objdirs += $(dir $(call objectify,$($(1)-obj-y)))
        endif
        ifneq ($($(1)-lib-y),)
                $(eval $(call gen-ar-target-rule,                       \
                                $(obj)/$(1).lib.a,                      \
                                $(ARFLAGS) $($(1)-arflags-y),           \
                                $(call objectify,$($(1)-lib-y))         \
                                $(__nmk-makefile-deps),                 \
                                $(call objectify,$($(1)-lib-y))         \
                                $(call objectify,$($(1)-lib-e))))
                all-y += $(obj)/$(1).lib.a
                cleanup-y += $(call cleanify,$(call objectify,$($(1)-lib-y)))
                cleanup-y += $(obj)/$(1).lib.a
                objdirs += $(dir $(call objectify,$($(1)-lib-y)))
        endif
endef

$(foreach t,$(target),$(eval $(call gen-custom-target-rule,$(t))))

#
# Prepare rules for dirs other than (obj)/.
objdirs := $(patsubst %/,%,$(filter-out $(obj)/,$(call uniq,$(objdirs))))
$(foreach t,$(objdirs),$(eval $(call gen-cc-rules,$(t)/%,$(t)/%)))

#
# Host programs.
define gen-host-cc-rules $(addprefix $(obj)/,$(1)): $(obj)/%.o: $(obj)/%.c $(__nmk-makefile-deps) $$(call msg-host-cc, $$@) $$(Q) $$(HOSTCC) -c $$(strip $$(nmk-host-ccflags)) $$< -o $$@ $(patsubst %.o,%.i,$(addprefix $(obj)/,$(1))): $(obj)/%.i: $(obj)/%.c $(__nmk-makefile-deps) $$(call msg-host-cc, $$@) $$(Q) $$(HOSTCC) -E $$(strip $$(nmk-host-ccflags)) $$< -o $$@ $(patsubst %.o,%.s,$(addprefix $(obj)/,$(1))): $(obj)/%.s: $(obj)/%.c $(__nmk-makefile-deps) $$(call msg-host-cc, $$@) $$(Q) $$(HOSTCC) -S -fverbose-asm $$(strip $$(nmk-host-ccflags)) $$< -o $$@ $(patsubst %.o,%.d,$(addprefix $(obj)/,$(1))): $(obj)/%.d: $(obj)/%.c $(__nmk-makefile-deps) $$(call msg-host-dep, $$@) $$(Q) $$(HOSTCC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(strip $$(nmk-host-ccflags)) $$< -o $$@ endef define gen-host-rules $(eval $(call gen-host-cc-rules,$($(1)-objs))) all-y += $(addprefix $(obj)/,$($(1)-objs)) cleanup-y += $(call cleanify,$(addprefix $(obj)/,$($(1)-objs))) $(obj)/$(1): $(addprefix $(obj)/,$($(1)-objs)) $(__nmk-makefile-deps) $$(call msg-host-link, $$@) $$(Q) $$(HOSTCC) $$(HOSTCFLAGS) $(addprefix $(obj)/,$($(1)-objs)) $$(HOSTLDFLAGS) $$(HOSTLDFLAGS_$$(@F)) -o $$@ all-y += $(obj)/$(1) cleanup-y += $(obj)/$(1) endef $(foreach t,$(hostprogs-y),$(eval $(call gen-host-rules,$(t)))) # # Dynamic library linking. define gen-so-link-rules $(call objectify,$(1)).so: $(call objectify,$($(1)-objs)) $(__nmk-makefile-deps) $$(call msg-link, $$@) $$(Q) $$(CC) -shared $$(ldflags-so) $$(LDFLAGS) $$(LDFLAGS_$$(@F)) -o $$@ $(call objectify,$($(1)-objs)) all-y += $(call objectify,$(1)).so cleanup-y += $(call objectify,$(1)).so endef $(foreach t,$(libso-y),$(eval $(call gen-so-link-rules,$(t)))) # # Figure out if the target we're building needs deps to include. 
define collect-deps ifneq ($(filter-out %.d,$(1)),) ifneq ($(filter %.o %.i %.s,$(1)),) deps-y += $(addsuffix .d,$(basename $(1))) endif endif ifeq ($(builtin-target),$(1)) deps-y += $(obj-y:.o=.d) endif ifeq ($(lib-target),$(1)) deps-y += $(lib-y:.o=.d) endif ifneq ($(filter all $(all-y) $(hostprogs-y),$(1)),) deps-y += $(obj-y:.o=.d) deps-y += $(lib-y:.o=.d) deps-y += $(foreach t,$(target),$(call objectify,$($(t)-lib-y:.o=.d)) $(call objectify,$($(t)-obj-y:.o=.d))) deps-y += $(foreach t,$(hostprogs-y),$(addprefix $(obj)/,$($(t)-objs:.o=.d))) endif endef ifneq ($(MAKECMDGOALS),) ifneq ($(MAKECMDGOALS),clean) $(foreach goal,$(MAKECMDGOALS),$(eval $(call collect-deps,$(goal)))) deps-y := $(call uniq,$(deps-y)) ifneq ($(deps-y),) $(eval -include $(deps-y)) endif endif endif # # Main phony rule. all: $(all-y) ; .PHONY: all # # Clean most files, but leave enough to navigate with tags (generated files) clean: $(call msg-clean, $(obj)) $(Q) $(RM) $(cleanup-y) .PHONY: clean # # Delete all generated files mrproper: clean $(Q) $(RM) $(mrproper-y) .PHONY: mrproper # # Footer. ____nmk_defined__build = y endif criu-3.6/scripts/nmk/scripts/include.mk000066400000000000000000000025351317335042600202540ustar00rootroot00000000000000ifndef ____nmk_defined__include ifndef ____nmk_defined__msg include $(__nmk_dir)msg.mk endif # # Common vars. SUBARCH := $(shell uname -m | sed \ -e s/i.86/x86/ \ -e s/x86_64/x86/ \ -e s/sun4u/sparc64/ \ -e s/arm.*/arm/ \ -e s/sa110/arm/ \ -e s/s390x/s390/ \ -e s/parisc64/parisc/ \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ -e s/aarch64.*/aarch64/) ARCH ?= $(SUBARCH) SRCARCH := $(ARCH) export SUBARCH ARCH SRCARCH ifndef ____nmk_defined__tools include $(__nmk_dir)tools.mk endif # Do not use make's built-in rules and variables # (this increases performance and avoids hard-to-debug behaviour). MAKEFLAGS += -rR --no-print-directory export MAKEFLAGS # Avoid funny character set dependencies. 
unexport LC_ALL LC_COLLATE=C LC_NUMERIC=C export LC_COLLATE LC_NUMERIC # Avoid interference with shell env settings. unexport GREP_OPTIONS # Shorthand for build. build := -r -R -f $(__nmk_dir)main.mk makefile=Makefile obj export build # With specified Makefile build-as = -r -R -f $(__nmk_dir)main.mk makefile=$(1) obj=$(2) export build-as # # Footer. ____nmk_defined__include = y endif criu-3.6/scripts/nmk/scripts/macro.mk000066400000000000000000000011561317335042600177300ustar00rootroot00000000000000ifndef ____nmk_defined__macro # # Helper to include makefile only once. # define include-once ifndef $(join ____nmk_defined__,$(1:.mk=)) include $(__nmk_dir)$(1) endif endef # Helper to build built-in target in directory. # $(eval $(call gen-built-in,,,)) define gen-built-in $(1)/%: $(2) $$(Q) $$(MAKE) $$(build)=$(1) $$@ ifneq ($(3),) $(3): $(2) $$(Q) $$(MAKE) $$(build)=$(1) all .PHONY: $(3) $(1)/built-in.o: $(3) else $(1): $(2) $$(Q) $$(MAKE) $$(build)=$(1) all .PHONY: $(1) $(1)/built-in.o: $(1) endif endef # # Footer. ____nmk_defined__macro = y endif criu-3.6/scripts/nmk/scripts/main.mk000066400000000000000000000007601317335042600175530ustar00rootroot00000000000000ifndef ____nmk_defined__main # # Genaral inclusion statement ifndef ____nmk_defined__include include $(__nmk_dir)include.mk endif ifndef ____nmk_defined__macro include $(__nmk_dir)macro.mk endif # # Anything else might be included with # # $(eval $(call include-once,)) # # Note the order does matter! $(eval $(call include-once,tools.mk)) $(eval $(call include-once,utils.mk)) $(eval $(call include-once,build.mk)) # # Footer ____nmk_defined__main = y endif criu-3.6/scripts/nmk/scripts/msg.mk000066400000000000000000000017201317335042600174120ustar00rootroot00000000000000ifndef ____nmk_defined__msg # # Silent make rules. ifeq ($(strip $(V)),) E := @echo Q := @ else E := @\# Q := endif export E Q # # Message helpers. 
define msg-gen $(E) " GEN " $(1) endef define msg-clean $(E) " CLEAN " $(1) endef define msg-cc $(E) " CC " $(1) endef define msg-dep $(E) " DEP " $(1) endef define msg-link $(E) " LINK " $(1) endef define msg-ar $(E) " AR " $(1) endef define msg-build $(E) " BUILD " $(1) endef define msg-host-cc $(E) " HOSTCC " $(1) endef define msg-host-dep $(E) " HOSTDEP " $(1) endef define msg-host-link $(E) " HOSTLINK" $(1) endef define newline endef # map funciton: # $1 - func to call # $2 - list over which map the $1 func # result is divided with newlines map = $(foreach x,$2,$(call $1,$x)$(newline)) # # Footer. ____nmk_defined__msg = y endif #____nmk_defined__msg criu-3.6/scripts/nmk/scripts/tools.mk000066400000000000000000000015211317335042600177630ustar00rootroot00000000000000ifndef ____nmk_defined__tools # # System tools shorthands RM := rm -f HOSTLD ?= ld LD := $(CROSS_COMPILE)$(HOSTLD) HOSTCC ?= gcc CC := $(CROSS_COMPILE)$(HOSTCC) CPP := $(CC) -E AS := $(CROSS_COMPILE)as AR := $(CROSS_COMPILE)ar STRIP := $(CROSS_COMPILE)strip OBJCOPY := $(CROSS_COMPILE)objcopy OBJDUMP := $(CROSS_COMPILE)objdump NM := $(CROSS_COMPILE)nm MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl PYTHON := python FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ else if [ -x /bin/bash ]; then echo /bin/bash; \ else echo sh; fi ; fi) CSCOPE := cscope ETAGS := etags CTAGS := ctags export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE # # Footer. 
____nmk_defined__tools = y endif criu-3.6/scripts/nmk/scripts/utils.mk000066400000000000000000000021031317335042600177600ustar00rootroot00000000000000ifndef ____nmk_defined__utils # # Usage: option := $(call try-compile,language,source-to-build,cc-options,cc-defines) try-compile = $(shell sh -c 'echo "$(2)" | \ $(CC) $(4) -x $(1) - $(3) -o /dev/null > /dev/null 2>&1 && \ echo true || echo false') # # Usage: option := $(call try-cc,source-to-build,cc-options,cc-defines) try-cc = $(call try-compile,c,$(1),$(2),$(3)) # # Usage: option := $(call try-cc,source-to-build,cc-options,cc-defines) try-asm = $(call try-compile,assembler-with-cpp,$(1),$(2),$(3)) # pkg-config-check # Usage: ifeq ($(call pkg-config-check, library),y) pkg-config-check = $(shell sh -c 'pkg-config $(1) && echo y') # # Remove duplicates. uniq = $(strip $(if $1,$(firstword $1) $(call uniq,$(filter-out $(firstword $1),$1)))) # # Add $(obj)/ for paths that are not relative objectify = $(foreach o,$(1),$(if $(filter /% ./% ../%,$(o)),$(o),$(obj)/$(o))) # To cleanup entries. cleanify = $(foreach o,$(sort $(call uniq,$(1))),$(o) $(o:.o=.d) $(o:.o=.i) $(o:.o=.s) $(o:.o=.gcda) $(o:.o=.gcno)) # # Footer. ____nmk_defined__utils = y endif criu-3.6/scripts/protobuf-gen.sh000066400000000000000000000006661317335042600167720ustar00rootroot00000000000000TR="y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/" for x in $(sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { /PB_AUTOGEN_ST/d; s/,.*$//; s/\tPB_//; p; }' criu/include/protobuf-desc.h); do x_la=$(echo $x | sed $TR) x_uf=$(echo $x | sed -nr 's/^./&#\\\ /; s/_(.)/\\\ \1#\\\ /g; p;' | \ sed -r "/^[A-Z]#\\\\\$/!{ $TR; }" | \ sed -r ':loop; N; s/#?\\\n//; t loop') echo "CR_PB_DESC($x, $x_uf, $x_la);" done criu-3.6/scripts/systemd-autofs-restart.sh000077500000000000000000000106101317335042600210250ustar00rootroot00000000000000#!/bin/bash # # This script can be used as a workaround for systemd autofs mount migration. 
# The problem is that systemd is a clever guy: before mounting of actual file
# system on top of autofs mount, it first checks that device number of autofs
# mount is equal to the one, stored in systemd internals. If they do not match,
# systemd ignores kernel request.
# The problem happens each time autofs is restored (new device number for
# autofs superblock) and can't be properly solved without some kind of "device
# namespaces", where device number can be preserved.
# But some of systemd services can be painlessly restarted. Like
# proc-sys-fs-binfmt_misc.
#
# Usage:
# criu restore --action-script $(pwd)/scripts/systemd-autofs-restart.sh
#
[ "$CRTOOLS_SCRIPT_ACTION" == "post-resume" ] || exit 0

if [ ! -n "$CRTOOLS_INIT_PID" ]; then
	echo "CRTOOLS_INIT_PID environment variable is not set"
	exit 1
fi

if [ ! -d "/proc/$CRTOOLS_INIT_PID" ]; then
	echo "Process with CRTOOLS_INIT_PID=$CRTOOLS_INIT_PID doesn't exist"
	exit 1
fi

# Prefer /bin/nsenter; fall back to /usr/bin/nsenter only if it is missing.
NS_ENTER=/bin/nsenter
[ -x "$NS_ENTER" ] || NS_ENTER=/usr/bin/nsenter

if [ ! -x "$NS_ENTER" ]; then
	echo "$NS_ENTER binary not found"
	exit 2
fi

# Command prefix to run a program in the mount, UTS and PID namespaces of
# the restored init. Deliberately expanded unquoted below (word splitting).
JOIN_CT="$NS_ENTER -t $CRTOOLS_INIT_PID -m -u -p"

# Skip container, if it's not systemd based
[ "$($JOIN_CT basename -- $($JOIN_CT readlink /proc/1/exe))" == "systemd" ] || exit 0

AUTOFS_SERVICES="proc-sys-fs-binfmt_misc.automount"

# Temporary directory (inside the container) holding the saved bind mount.
# Empty when nothing is saved.
bindmount=""

# Unmount and remove the temporary bind mount directory, if any.
function remove_bindmount {
	if [ -n "$bindmount" ]; then
		$JOIN_CT umount "$bindmount"
		$JOIN_CT rm -rf "$bindmount"
		bindmount=""
	fi
}
trap remove_bindmount EXIT

# Print the file system type of the top-most mount on mountpoint $1, as
# listed in /proc/$CRTOOLS_INIT_PID/mountinfo. Prints an error message and
# returns 1 if the mountpoint is not listed.
function get_fs_type {
	local mountpoint=$1

	local top_mount_id=""
	local top_mount_fs_type=""

	while IFS='' read -r line; do
		# Skip those entries which do not match the mountpoint
		[ "$(echo $line | awk '{print $5;}')" = "$mountpoint" ] || continue

		local mnt_id=$(echo $line | awk '{print $1;}')
		local mnt_parent_id=$(echo $line | awk '{print $2;}')
		local mnt_fs_type=$(echo $line | sed 's/.* - //g' | awk '{print $1;}')

		# Skip mount entry, if not the first one and not a child
		[ -n "$top_mount_id" ] && [ "$mnt_parent_id" != "$top_mount_id" ] && continue

		top_mount_id=$mnt_id
		top_mount_fs_type=$mnt_fs_type
	done < "/proc/$CRTOOLS_INIT_PID/mountinfo"

	if [ -z "$top_mount_fs_type" ]; then
		echo "Failed to find $mountpoint mountpoint"
		return 1
	fi

	echo $top_mount_fs_type
	return 0
}

# Bind mount $1 to $2 inside the container. Returns 1 on failure.
function bind_mount {
	local from=$1
	local to=$2

	$JOIN_CT mount --bind "$from" "$to" && return 0

	echo "Failed to bind mount $from to $to"
	return 1
}

# Move the file system mounted on top of autofs mountpoint $1 aside, to a
# temporary directory stored in $bindmount.
# Returns 0 if it was saved or if there is nothing to save, non-zero on
# failure, so that the caller can skip the service restart.
function save_mountpoint {
	local mountpoint=$1
	local top_mount_fs_type=""

	top_mount_fs_type=$(get_fs_type "$mountpoint")
	if [ $? -ne 0 ]; then
		echo "$top_mount_fs_type"
		return 1
	fi

	# Nothing to do, if no file system is on top of autofs
	[ "$top_mount_fs_type" = "autofs" ] && return 0

	bindmount=$($JOIN_CT mktemp -d)
	if [ -z "$bindmount" ]; then
		echo "Failed to create temporary directory"
		return 1
	fi

	# No need to unmount fs on top of autofs:
	# systemd will do it for us on service restart
	if ! bind_mount "$mountpoint" "$bindmount"; then
		# Forget the directory, so that nobody tries to restore
		# from it (or unmount it) later.
		$JOIN_CT rm -rf "$bindmount"
		bindmount=""
		return 1
	fi

	return 0
}

# Put the file system saved by save_mountpoint back on top of mountpoint $1.
function restore_mountpoint {
	local mountpoint=$1
	local top_mount_fs_type=""

	[ -n "$bindmount" ] || return

	# Umount file system, remounted by systemd, if any
	top_mount_fs_type=$(get_fs_type "$mountpoint")
	if [ $? -ne 0 ]; then
		echo "$top_mount_fs_type"
		return
	fi

	# Nothing to do, if no file system is on top of autofs
	if [ "$top_mount_fs_type" != "autofs" ]; then
		$JOIN_CT umount "$mountpoint" || echo "Failed to umount $mountpoint"
	fi

	# Restore origin file system even if we failed to unmount the new one
	bind_mount "$bindmount" "$mountpoint"
	remove_bindmount
}

# Restart automount unit $1, preserving the file system mounted on top of it.
function restart_service {
	local service=$1
	local mountpoint=$($JOIN_CT systemctl show $service -p Where | sed 's/.*=//g')

	if [ -z "$mountpoint" ]; then
		echo "Failed to discover $service mountpoint"
		return
	fi

	# Try to move restored bind-mount aside and exit if Failed
	# Nothing to do, if we Failed
	save_mountpoint "$mountpoint" || return

	$JOIN_CT systemctl restart $service
	if [ $? -ne 0 ]; then
		echo "Failed to restart $service service"
		return
	fi
	echo "$service restarted"

	# Try to move saved mountpoint back on top of autofs
	restore_mountpoint "$mountpoint"
}

for service in $AUTOFS_SERVICES; do
	status=$($JOIN_CT systemctl is-active $service)
	# Quoted: the status may be empty if systemctl printed nothing.
	if [ "$status" == "active" ]; then
		restart_service $service
	else
		echo "$service skipped ($status)"
	fi
done

exit 0
# # Normally you need to call this script for files that can be lost between # CRIU checkpoint and restore cmds. For example for files stored on non-tmpfs # mount points. That's why this script is called tmp-files. # # You should call this script on both CRIU dump and restore cmds. # # Usage: # criu dump --action-script \ # '$CRIU_SCRIPTS_PATH/tmp-files.sh /tmp/ycm_temp /home/user/.tmpfile.txt.swp' # criu restore --action-script $CRIU_SCRIPTS_PATH/tmp-files.sh # # Note: absolute path to tmp-files.sh should be supplied in --action-script with '' # POSTDUMP="post-dump" PRERESTORE="pre-restore" DUMPARGS="--create --absolute-names --gzip --no-unquote --no-wildcards --file" RESTOREARGS="--extract --gzip --no-unquote --no-wildcards --absolute-names --directory / --file" IMGFILE=$CRTOOLS_IMAGE_DIR"/tmpfiles.tar.gz" MY_NAME=`basename "$0"` case "$CRTOOLS_SCRIPT_ACTION" in $POSTDUMP ) if [ "$#" -lt 1 ]; then echo "$MY_NAME: ERROR! No files are given." exit 1 fi tar $DUMPARGS $IMGFILE -- "$@" exit $? ;; $PRERESTORE ) if [ "$#" -ne 0 ]; then echo "$MY_NAME: ERROR! Not expected script args." exit 1 fi tar $RESTOREARGS $IMGFILE exit $? 
;; esac exit 0 criu-3.6/scripts/travis/000077500000000000000000000000001317335042600153275ustar00rootroot00000000000000criu-3.6/scripts/travis/Makefile000066400000000000000000000014221317335042600167660ustar00rootroot00000000000000local: ./travis-tests .PHONY: local after_success: ./travis-after_success .PHONY: after_success target-suffix = ifdef CLANG target-suffix = -clang endif TARGETS := fedora-asan alpine fedora-rawhide ZDTM_OPTIONS := alpine: ZDTM_OPTIONS=-x zdtm/static/binfmt_misc -x zdtm/static/netns-nf -x zdtm/static/sched_policy00 -x zdtm/static/seccomp_strict -x zdtm/static/sigaltstack -x zdtm/static/signalfd00 -x zdtm/static/config_inotify_irmap $(TARGETS): echo 'DOCKER_OPTS="--storage-driver=devicemapper"' > /etc/default/docker restart docker $(MAKE) -C ../build $@$(target-suffix) docker run --rm -it --privileged -v /lib/modules:/lib/modules --tmpfs /run criu-$@ ./scripts/travis/asan.sh $(ZDTM_OPTIONS) docker-test: ./docker-test.sh %: $(MAKE) -C ../build $@$(target-suffix) criu-3.6/scripts/travis/asan.sh000077500000000000000000000006061317335042600166120ustar00rootroot00000000000000#!/bin/sh set -x cat /proc/self/mountinfo chmod 0777 test chmod 0777 test/zdtm/transition/ chmod 0777 test/zdtm/static ./test/zdtm.py run -a --keep-going -k always --parallel 4 -x zdtm/static/rtc "$@" ret=$? 
for i in `find / -name 'asan.log*'`; do echo $i; echo ======================================== cat $i; echo ======================================== ret=1; done; exit $ret criu-3.6/scripts/travis/docker-test.sh000077500000000000000000000025501317335042600201140ustar00rootroot00000000000000#!/bin/bash set -x -e -o pipefail apt-get install -qq \ apt-transport-https \ ca-certificates \ curl \ software-properties-common curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - add-apt-repository \ "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ $(lsb_release -cs) \ stable" apt-get update -qq apt-get install -qq docker-ce cat > /etc/docker/daemon.json <&1 | tee log || { cat "`cat log | grep 'log file:' | sed 's/log file:\s*//'`" || true docker logs cr || true cat /tmp/zdtm-core-* || true dmesg docker ps exit 1 } docker ps sleep 1 done criu-3.6/scripts/travis/travis-after_success000077500000000000000000000004311317335042600214120ustar00rootroot00000000000000#!/bin/sh set -x -e # We only need to run the below for gcov-enabled builds test -z "$GCOV" && exit 0 sudo apt-get install -qq -y lcov gem install coveralls-lcov sudo lcov --directory ../.. 
--capture --output-file coverage.info --ignore-errors graph coveralls-lcov coverage.info criu-3.6/scripts/travis/travis-tests000077500000000000000000000072461317335042600177360ustar00rootroot00000000000000#!/bin/sh set -x -e TRAVIS_PKGS="protobuf-c-compiler libprotobuf-c0-dev libaio-dev libprotobuf-dev protobuf-compiler python-ipaddr libcap-dev libnl-3-dev gcc-multilib gdb bash python-protobuf libnet-dev util-linux asciidoc xmlto" travis_prep () { [ -n "$SKIP_TRAVIS_PREP" ] && return cd ../../ service apport stop CC=gcc # clang support if [ "$CLANG" = "1" ]; then TRAVIS_PKGS="$TRAVIS_PKGS clang" CC=clang fi # ccache support, only enable for non-GCOV case if [ "$CCACHE" = "1" -a -z "$GCOV" ]; then # ccache is installed by default, need to set it up export CCACHE_DIR=$HOME/.ccache [ "$CC" = "clang" ] && export CCACHE_CPP2=yes # uncomment the following to get detailed ccache logs #export CCACHE_LOGFILE=$HOME/ccache.log CC="ccache $CC" fi # The /etc/apt/sources.list in the current trusty image for ppc64le is # broken and needs to be fixed if [ "$TR_ARCH" = "ppc64le" ] ; then sed -i '/security/ d' /etc/apt/sources.list fi apt-get update -qq apt-get install -qq --no-install-recommends $TRAVIS_PKGS pip install junit-xml chmod a+x $HOME } travis_prep ulimit -c unlimited echo "|`pwd`/test/abrt.sh %P %p %s %e" > /proc/sys/kernel/core_pattern export GCOV time make CC="$CC" -j4 [ -n "$SKIP_TRAVIS_TEST" ] && return time make CC="$CC" -j4 -C test/zdtm [ -f "$CCACHE_LOGFILE" ] && cat $CCACHE_LOGFILE # umask has to be called before a first criu run, so that .gcda (coverage data) # files are created with read-write permissions for all. umask 0000 ./criu/criu check ./criu/criu check --all || echo $? ./criu/criu cpuinfo dump ./criu/criu cpuinfo check export SKIP_PREP=1 # The 3.19 kernel (from Ubuntu 14.04) has a bug. When /proc/PID/pagemap # is read for a few VMAs in one read call, incorrect data is returned. 
# See https://github.com/xemul/criu/issues/207 # Kernel 4.4 (from Ubuntu 14.04.5 update) fixes this. uname -r | grep -q ^3\.19 && export CRIU_PMC_OFF=1 chmod 0777 test/ chmod 0777 test/zdtm/static chmod 0777 test/zdtm/transition ./test/zdtm.py run -a -p 2 KERN_MAJ=`uname -r | cut -d. -f1` KERN_MIN=`uname -r | cut -d. -f2` if [ $KERN_MAJ -ge "4" ] && [ $KERN_MIN -ge "11" ]; then LAZY_EXCLUDE="-x cmdlinenv00" else LAZY_EXCLUDE="-x maps007 -x fork -x fork2 -x uffd-events -x cgroupns -x socket_listen -x socket_listen6 -x cmdlinenv00 -x socket_close_data01 -x file_read -x lazy-thp" fi LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps04" LAZY_TESTS=.*\(maps0\|uffd-events\|lazy-thp\|futex\|fork\).* ./test/zdtm.py run -p 2 -T $LAZY_TESTS --lazy-pages $LAZY_EXCLUDE ./test/zdtm.py run -p 2 -T $LAZY_TESTS --remote-lazy-pages $LAZY_EXCLUDE bash ./test/jenkins/criu-fault.sh bash ./test/jenkins/criu-fcg.sh bash ./test/jenkins/criu-inhfd.sh make -C test/others/mnt-ext-dev/ run #make -C test/others/exec/ run make -C test/others/make/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server --dedup ./test/zdtm.py run -t zdtm/static/socket-tcp-local --norst ip net add test ./test/zdtm.py run -t zdtm/static/env00 -f h --join-ns # RPC testing ./test/zdtm.py run -t zdtm/static/env00 --rpc # Basic ./test/zdtm.py run -t zdtm/static/ptrace_sig -f h --rpc # Error handling (crfail test) ./test/zdtm.py run --empty-ns -T zdtm/static/socket-tcp*-local --iter 2 pip install flake8 make lint # Check that help output fits into 80 columns WIDTH=$(./criu/criu --help | wc --max-line-length) if [ "$WIDTH" -gt 80 ]; then echo "criu --help output does not obey 80 characters line width!" 
exit 1 fi criu-3.6/soccr/000077500000000000000000000000001317335042600134415ustar00rootroot00000000000000criu-3.6/soccr/Makefile000066400000000000000000000000501317335042600150740ustar00rootroot00000000000000lib-name := libsoccr.a lib-y += soccr.o criu-3.6/soccr/soccr.c000066400000000000000000000521431317335042600147230ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "soccr.h" #ifndef SIOCOUTQNSD /* MAO - Define SIOCOUTQNSD ioctl if we don't have it */ #define SIOCOUTQNSD 0x894B #endif enum { TCPF_ESTABLISHED = (1 << 1), TCPF_SYN_SENT = (1 << 2), TCPF_SYN_RECV = (1 << 3), TCPF_FIN_WAIT1 = (1 << 4), TCPF_FIN_WAIT2 = (1 << 5), TCPF_TIME_WAIT = (1 << 6), TCPF_CLOSE = (1 << 7), TCPF_CLOSE_WAIT = (1 << 8), TCPF_LAST_ACK = (1 << 9), TCPF_LISTEN = (1 << 10), TCPF_CLOSING = (1 << 11), }; /* * The TCP transition diagram for half closed connections * * ------------ * FIN_WAIT1 \ FIN * --------- * / ACK CLOSE_WAIT * ----------- * FIN_WAIT2 * ---------- * / FIN LAST_ACK * ----------- * TIME_WAIT \ ACK * ---------- * CLOSED * * How to get the TCP_CLOSING state * * ----------- ---------- * FIN_WAIT1 \/ FIN FIN_WAIT1 * ----------- ---------- * CLOSING CLOSING * \/ ACK * ----------- ---------- * TIME_WAIT TIME_WAIT */ /* Restore a fin packet in a send queue first */ #define SNDQ_FIRST_FIN (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) /* Restore fin in a send queue after restoring fi in the receive queue. */ #define SNDQ_SECOND_FIN (TCPF_LAST_ACK | TCPF_CLOSE) #define SNDQ_FIN_ACKED (TCPF_FIN_WAIT2 | TCPF_CLOSE) #define RCVQ_FIRST_FIN (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSE) #define RCVQ_SECOND_FIN (TCPF_CLOSING) #define RCVQ_FIN_ACKED (TCPF_CLOSE) static void (*log)(unsigned int loglevel, const char *format, ...) 
__attribute__ ((__format__ (__printf__, 2, 3)));
/* Messages are emitted only if their level is <= log_level. */
static unsigned int log_level = 0;

/*
 * Install the logging callback and the maximum level to report.
 * Until a callback is set, the log macros below print nothing.
 */
void libsoccr_set_log(unsigned int level, void (*fn)(unsigned int level, const char *fmt, ...))
{
	log_level = level;
	log = fn;
}

/* Report an error, prefixed with the source file and line. */
#define loge(msg, ...) do { if (log && (log_level >= SOCCR_LOG_ERR)) log(SOCCR_LOG_ERR, "Error (%s:%d): " msg, __FILE__, __LINE__, ##__VA_ARGS__); } while (0)
/* Same as loge(), with strerror(errno) and a newline appended. */
#define logerr(msg, ...) loge(msg ": %s\n", ##__VA_ARGS__, strerror(errno))
/* Report a debug message. */
#define logd(msg, ...) do { if (log && (log_level >= SOCCR_LOG_DBG)) log(SOCCR_LOG_DBG, "Debug: " msg, ##__VA_ARGS__); } while (0)

/*
 * Set the TCP_REPAIR option of @fd to 1.
 * Returns the setsockopt() result; a failure is logged.
 */
static int tcp_repair_on(int fd)
{
	int ret, aux = 1;

	ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux));
	if (ret < 0)
		logerr("Can't turn TCP repair mode ON");

	return ret;
}

/*
 * Set the TCP_REPAIR option of @fd to 0.
 * Returns the setsockopt() result; a failure is logged.
 */
static int tcp_repair_off(int fd)
{
	int aux = 0, ret;

	ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux));
	if (ret < 0)
		logerr("Failed to turn off repair mode on socket");

	return ret;
}

/*
 * Handle of a socket put into repair mode by libsoccr_pause().
 * The queue and address pointers are freed by libsoccr_release() only if
 * the matching SK_FLAG_FREE_* bit is set in @flags.
 */
struct libsoccr_sk {
	int fd;
	unsigned flags;
	char *recv_queue;
	char *send_queue;
	union libsoccr_addr *src_addr;
	union libsoccr_addr *dst_addr;
};

#define SK_FLAG_FREE_RQ	0x1	/* free recv_queue on release */
#define SK_FLAG_FREE_SQ	0x2	/* free send_queue on release */
#define SK_FLAG_FREE_SA	0x4	/* free src_addr on release */
#define SK_FLAG_FREE_DA	0x8	/* free dst_addr on release */

/*
 * Switch @fd into TCP repair mode and allocate a handle for it.
 *
 * Returns the new handle with no flags set and no queues or addresses
 * attached, or NULL if the allocation or the switch to repair mode failed
 * (nothing is leaked in that case).
 */
struct libsoccr_sk *libsoccr_pause(int fd)
{
	struct libsoccr_sk *ret;

	ret = malloc(sizeof(*ret));
	if (!ret) {
		loge("Unable to allocate memory\n");
		return NULL;
	}

	if (tcp_repair_on(fd) < 0) {
		free(ret);
		return NULL;
	}

	ret->flags = 0;
	ret->recv_queue = NULL;
	ret->send_queue = NULL;
	ret->src_addr = NULL;
	ret->dst_addr = NULL;
	ret->fd = fd;
	return ret;
}

/*
 * Turn repair mode off and release the handle.
 * The result of tcp_repair_off() is not checked here: the handle is
 * released even if the socket could not leave repair mode.
 */
void libsoccr_resume(struct libsoccr_sk *sk)
{
	tcp_repair_off(sk->fd);
	libsoccr_release(sk);
}

/*
 * Free the handle together with the buffers it owns (see SK_FLAG_FREE_*).
 * The socket itself is not touched: repair mode is left as is and the
 * descriptor is not closed.
 */
void libsoccr_release(struct libsoccr_sk *sk)
{
	if (sk->flags & SK_FLAG_FREE_RQ)
		free(sk->recv_queue);
	if (sk->flags & SK_FLAG_FREE_SQ)
		free(sk->send_queue);
	if (sk->flags & SK_FLAG_FREE_SA)
		free(sk->src_addr);
	if (sk->flags & SK_FLAG_FREE_DA)
		free(sk->dst_addr);
	free(sk);
}

/*
 * Buffer for getsockopt(TCP_INFO).
 * NOTE(review): presumably mirrors the leading fields of the kernel's
 * struct tcp_info -- confirm the layout against linux/tcp.h.
 */
struct soccr_tcp_info {
	__u8	tcpi_state;
	__u8	tcpi_ca_state;
__u8 tcpi_retransmits; __u8 tcpi_probes; __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; }; static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, struct soccr_tcp_info *ti) { int size; socklen_t olen = sizeof(*ti); if (getsockopt(sk->fd, SOL_TCP, TCP_INFO, ti, &olen) || olen != sizeof(*ti)) { logerr("Failed to obtain TCP_INFO"); return -1; } switch (ti->tcpi_state) { case TCP_ESTABLISHED: case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: case TCP_LAST_ACK: case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_CLOSE: case TCP_SYN_SENT: break; default: loge("Unknown state %d\n", ti->tcpi_state); return -1; } data->state = ti->tcpi_state; if (ioctl(sk->fd, SIOCOUTQ, &size) == -1) { logerr("Unable to get size of snd queue"); return -1; } data->outq_len = size; if (ioctl(sk->fd, SIOCOUTQNSD, &size) == -1) { logerr("Unable to get size of unsent data"); return -1; } data->unsq_len = size; if (data->state == TCP_CLOSE) { /* A connection could be reseted. In thise case a sent queue * may contain some data. A user can't read this data, so let's * ignore them. Otherwise we will need to add a logic whether * the send queue contains a fin packet or not and decide whether * a fin or reset packet has to be sent to restore a state */ data->unsq_len = 0; data->outq_len = 0; } /* Don't account the fin packet. It doesn't countain real data. */ if ((1 << data->state) & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN)) { if (data->outq_len) data->outq_len--; data->unsq_len = data->unsq_len ? 
data->unsq_len - 1 : 0; } if (ioctl(sk->fd, SIOCINQ, &size) == -1) { logerr("Unable to get size of recv queue"); return -1; } data->inq_len = size; return 0; } static int get_stream_options(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, struct soccr_tcp_info *ti) { int ret; socklen_t auxl; int val; auxl = sizeof(data->mss_clamp); ret = getsockopt(sk->fd, SOL_TCP, TCP_MAXSEG, &data->mss_clamp, &auxl); if (ret < 0) goto err_sopt; data->opt_mask = ti->tcpi_options; if (ti->tcpi_options & TCPI_OPT_WSCALE) { data->snd_wscale = ti->tcpi_snd_wscale; data->rcv_wscale = ti->tcpi_rcv_wscale; } if (ti->tcpi_options & TCPI_OPT_TIMESTAMPS) { auxl = sizeof(val); ret = getsockopt(sk->fd, SOL_TCP, TCP_TIMESTAMP, &val, &auxl); if (ret < 0) goto err_sopt; data->timestamp = val; } return 0; err_sopt: logerr("\tsockopt failed"); return -1; } static int get_window(struct libsoccr_sk *sk, struct libsoccr_sk_data *data) { struct tcp_repair_window opt; socklen_t optlen = sizeof(opt); if (getsockopt(sk->fd, SOL_TCP, TCP_REPAIR_WINDOW, &opt, &optlen)) { /* Appeared since 4.8, but TCP_repair itself is since 3.11 */ if (errno == ENOPROTOOPT) return 0; logerr("Unable to get window properties"); return -1; } data->flags |= SOCCR_FLAGS_WINDOW; data->snd_wl1 = opt.snd_wl1; data->snd_wnd = opt.snd_wnd; data->max_window = opt.max_window; data->rcv_wnd = opt.rcv_wnd; data->rcv_wup = opt.rcv_wup; return 0; } /* * TCP queues sequences and their relations to the code below * * output queue * net <----------------------------- sk * ^ ^ ^ seq >> * snd_una snd_nxt write_seq * * input queue * net -----------------------------> sk * << seq ^ ^ * rcv_nxt copied_seq * * * inq_len = rcv_nxt - copied_seq = SIOCINQ * outq_len = write_seq - snd_una = SIOCOUTQ * inq_seq = rcv_nxt * outq_seq = write_seq * * On restore kernel moves the option we configure with setsockopt, * thus we should advance them on the _len value in restore_tcp_seqs. 
* */ static int get_queue(int sk, int queue_id, __u32 *seq, __u32 len, char **bufp) { int ret, aux; socklen_t auxl; char *buf; aux = queue_id; auxl = sizeof(aux); ret = setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &aux, auxl); if (ret < 0) goto err_sopt; auxl = sizeof(*seq); ret = getsockopt(sk, SOL_TCP, TCP_QUEUE_SEQ, seq, &auxl); if (ret < 0) goto err_sopt; if (len) { /* * Try to grab one byte more from the queue to * make sure there are len bytes for real */ buf = malloc(len + 1); if (!buf) { loge("Unable to allocate memory\n"); goto err_buf; } ret = recv(sk, buf, len + 1, MSG_PEEK | MSG_DONTWAIT); if (ret != len) goto err_recv; } else buf = NULL; *bufp = buf; return 0; err_sopt: logerr("\tsockopt failed"); err_buf: return -1; err_recv: logerr("\trecv failed (%d, want %d)", ret, len); free(buf); goto err_buf; } /* * This is how much data we've had in the initial libsoccr */ #define SOCR_DATA_MIN_SIZE (17 * sizeof(__u32)) int libsoccr_save(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size) { struct soccr_tcp_info ti; if (!data || data_size < SOCR_DATA_MIN_SIZE) { loge("Invalid input parameters\n"); return -1; } memset(data, 0, data_size); if (refresh_sk(sk, data, &ti)) return -2; if (get_stream_options(sk, data, &ti)) return -3; if (get_window(sk, data)) return -4; sk->flags |= SK_FLAG_FREE_SQ | SK_FLAG_FREE_RQ; if (get_queue(sk->fd, TCP_RECV_QUEUE, &data->inq_seq, data->inq_len, &sk->recv_queue)) return -5; if (get_queue(sk->fd, TCP_SEND_QUEUE, &data->outq_seq, data->outq_len, &sk->send_queue)) return -6; return sizeof(struct libsoccr_sk_data); } #define GET_Q_FLAGS (SOCCR_MEM_EXCL) char *libsoccr_get_queue_bytes(struct libsoccr_sk *sk, int queue_id, unsigned flags) { char **p, *ret; if (flags & ~GET_Q_FLAGS) return NULL; switch (queue_id) { case TCP_RECV_QUEUE: p = &sk->recv_queue; break; case TCP_SEND_QUEUE: p = &sk->send_queue; break; default: return NULL; } ret = *p; if (flags & SOCCR_MEM_EXCL) *p = NULL; return ret; } #define 
GET_SA_FLAGS (SOCCR_MEM_EXCL)
/*
 * Address getter. Nothing is implemented here yet: whatever the arguments
 * are, the result is NULL.
 */
union libsoccr_addr *libsoccr_get_addr(struct libsoccr_sk *sk, int self, unsigned flags)
{
	/* Flags outside of GET_SA_FLAGS are rejected */
	if ((flags & GET_SA_FLAGS) != flags)
		return NULL;

	/* FIXME -- implemented in CRIU, makes sense to have it here too */
	return NULL;
}

/*
 * Selects @queue as the repair queue and sets its sequence number to @seq.
 * Returns 0 on success, -1 on failure.
 */
static int set_queue_seq(struct libsoccr_sk *sk, int queue, __u32 seq)
{
	int fd = sk->fd;

	logd("\tSetting %d queue seq to %u\n", queue, seq);

	if (setsockopt(fd, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) {
		logerr("Can't set repair queue");
		return -1;
	}

	if (setsockopt(fd, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)) < 0) {
		logerr("Can't set queue seq");
		return -1;
	}

	return 0;
}

#ifndef TCPOPT_SACK_PERM
#define TCPOPT_SACK_PERM TCPOPT_SACK_PERMITTED
#endif

static int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk,
		struct libsoccr_sk_data *data, unsigned data_size)
{
	struct tcp_repair_opt opts[4];
	int addr_size, mstate;
	int onr = 0;
	__u32 seq;

	if (!data || data_size < SOCR_DATA_MIN_SIZE) {
		loge("Invalid input parameters\n");
		return -1;
	}

	if (!sk->dst_addr || !sk->src_addr) {
		loge("Destination or/and source addresses aren't set\n");
		return -1;
	}

	mstate = 1 << data->state;

	if (data->state == TCP_LISTEN) {
		loge("Unable to handle listen sockets\n");
		return -1;
	}

	if (sk->src_addr->sa.sa_family == AF_INET)
		addr_size = sizeof(sk->src_addr->v4);
	else
		addr_size = sizeof(sk->src_addr->v6);

	if (bind(sk->fd, &sk->src_addr->sa, addr_size)) {
		logerr("Can't bind inet socket back");
		return -1;
	}

	if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN))
		data->inq_seq--;

	/* outq_seq is adjusted due to not accounting the fin packet */
	if (mstate & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN))
		data->outq_seq--;

	if (set_queue_seq(sk, TCP_RECV_QUEUE,
				data->inq_seq - data->inq_len))
		return -2;

	seq = data->outq_seq - data->outq_len;
	if (data->state == TCP_SYN_SENT)
		seq--;

	if (set_queue_seq(sk, TCP_SEND_QUEUE, seq))
		return -3;

	if (sk->dst_addr->sa.sa_family == AF_INET)
		addr_size = sizeof(sk->dst_addr->v4);
	else
		addr_size = sizeof(sk->dst_addr->v6);

	if (data->state ==
TCP_SYN_SENT && tcp_repair_off(sk->fd)) return -1; if (connect(sk->fd, &sk->dst_addr->sa, addr_size) == -1 && errno != EINPROGRESS) { loge("Can't connect inet socket back\n"); return -1; } if (data->state == TCP_SYN_SENT && tcp_repair_on(sk->fd)) return -1; logd("\tRestoring TCP options\n"); if (data->opt_mask & TCPI_OPT_SACK) { logd("\t\tWill turn SAK on\n"); opts[onr].opt_code = TCPOPT_SACK_PERM; opts[onr].opt_val = 0; onr++; } if (data->opt_mask & TCPI_OPT_WSCALE) { logd("\t\tWill set snd_wscale to %u\n", data->snd_wscale); logd("\t\tWill set rcv_wscale to %u\n", data->rcv_wscale); opts[onr].opt_code = TCPOPT_WINDOW; opts[onr].opt_val = data->snd_wscale + (data->rcv_wscale << 16); onr++; } if (data->opt_mask & TCPI_OPT_TIMESTAMPS) { logd("\t\tWill turn timestamps on\n"); opts[onr].opt_code = TCPOPT_TIMESTAMP; opts[onr].opt_val = 0; onr++; } logd("Will set mss clamp to %u\n", data->mss_clamp); opts[onr].opt_code = TCPOPT_MAXSEG; opts[onr].opt_val = data->mss_clamp; onr++; if (data->state != TCP_SYN_SENT && setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_OPTIONS, opts, onr * sizeof(struct tcp_repair_opt)) < 0) { logerr("Can't repair options"); return -2; } if (data->opt_mask & TCPI_OPT_TIMESTAMPS) { if (setsockopt(sk->fd, SOL_TCP, TCP_TIMESTAMP, &data->timestamp, sizeof(data->timestamp)) < 0) { logerr("Can't set timestamp"); return -3; } } return 0; } static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, uint8_t flags) { int ret, exit_code = -1; char errbuf[LIBNET_ERRBUF_SIZE]; int mark = SOCCR_MARK;; int libnet_type; libnet_t *l; if (sk->dst_addr->sa.sa_family == AF_INET6) libnet_type = LIBNET_RAW6; else libnet_type = LIBNET_RAW4; l = libnet_init( libnet_type, /* injection type */ NULL, /* network interface */ errbuf); /* errbuf */ if (l == NULL) { loge("libnet_init failed (%s)\n", errbuf); return -1; } if (setsockopt(l->fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark))) { logerr("Can't set SO_MARK (%d) for socket\n", mark); goto err; 
} ret = libnet_build_tcp( ntohs(sk->dst_addr->v4.sin_port), /* source port */ ntohs(sk->src_addr->v4.sin_port), /* destination port */ data->inq_seq, /* sequence number */ data->outq_seq - data->outq_len, /* acknowledgement num */ flags, /* control flags */ data->rcv_wnd, /* window size */ 0, /* checksum */ 10, /* urgent pointer */ LIBNET_TCP_H + 20, /* TCP packet size */ NULL, /* payload */ 0, /* payload size */ l, /* libnet handle */ 0); /* libnet id */ if (ret == -1) { loge("Can't build TCP header: %s\n", libnet_geterror(l)); goto err; } if (sk->dst_addr->sa.sa_family == AF_INET6) { struct libnet_in6_addr src, dst; memcpy(&dst, &sk->dst_addr->v6.sin6_addr, sizeof(dst)); memcpy(&src, &sk->src_addr->v6.sin6_addr, sizeof(src)); ret = libnet_build_ipv6( 0, 0, LIBNET_TCP_H, /* length */ IPPROTO_TCP, /* protocol */ 64, /* hop limit */ dst, /* source IP */ src, /* destination IP */ NULL, /* payload */ 0, /* payload size */ l, /* libnet handle */ 0); /* libnet id */ } else if (sk->dst_addr->sa.sa_family == AF_INET) ret = libnet_build_ipv4( LIBNET_IPV4_H + LIBNET_TCP_H + 20, /* length */ 0, /* TOS */ 242, /* IP ID */ 0, /* IP Frag */ 64, /* TTL */ IPPROTO_TCP, /* protocol */ 0, /* checksum */ sk->dst_addr->v4.sin_addr.s_addr, /* source IP */ sk->src_addr->v4.sin_addr.s_addr, /* destination IP */ NULL, /* payload */ 0, /* payload size */ l, /* libnet handle */ 0); /* libnet id */ else { loge("Unknown socket family\n"); goto err; } if (ret == -1) { loge("Can't build IP header: %s\n", libnet_geterror(l)); goto err; } ret = libnet_write(l); if (ret == -1) { loge("Unable to send a fin packet: %s\n", libnet_geterror(l)); goto err; } exit_code = 0; err: libnet_destroy(l); return exit_code; } static int restore_fin_in_snd_queue(int sk, int acked) { int queue = TCP_SEND_QUEUE; int ret; /* * If TCP_SEND_QUEUE is set, a fin packet will be * restored as a sent packet. 
*/ if (acked && setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) { logerr("Can't set repair queue"); return -1; } ret = shutdown(sk, SHUT_WR); if (ret < 0) logerr("Unable to shut down a socket"); queue = TCP_NO_QUEUE; if (acked && setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) { logerr("Can't set repair queue"); return -1; } return ret; } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, char *buf); int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size) { int mstate = 1 << data->state; if (libsoccr_set_sk_data_noq(sk, data, data_size)) return -1; if (libsoccr_restore_queue(sk, data, sizeof(*data), TCP_RECV_QUEUE, sk->recv_queue)) return -1; if (libsoccr_restore_queue(sk, data, sizeof(*data), TCP_SEND_QUEUE, sk->send_queue)) return -1; if (data->flags & SOCCR_FLAGS_WINDOW) { struct tcp_repair_window wopt = { .snd_wl1 = data->snd_wl1, .snd_wnd = data->snd_wnd, .max_window = data->max_window, .rcv_wnd = data->rcv_wnd, .rcv_wup = data->rcv_wup, }; if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) { wopt.rcv_wup--; wopt.rcv_wnd++; } if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_WINDOW, &wopt, sizeof(wopt))) { logerr("Unable to set window parameters"); return -1; } } /* * To restore a half closed sockets, fin packets has to be restored in * recv and send queues. Here shutdown() is used to restore a fin * packet in the send queue and a fake fin packet is send to restore it * in the recv queue. */ if (mstate & SNDQ_FIRST_FIN) restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED); /* Send a fin packet to the socket to restore it in a receive queue. 
*/ if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) if (send_fin(sk, data, data_size, TH_ACK | TH_FIN) < 0) return -1; if (mstate & SNDQ_SECOND_FIN) restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED); if (mstate & RCVQ_FIN_ACKED) data->inq_seq++; if (mstate & SNDQ_FIN_ACKED) { data->outq_seq++; if (send_fin(sk, data, data_size, TH_ACK) < 0) return -1; } return 0; } static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) { int ret, err = -1, max_chunk; int off; max_chunk = len; off = 0; do { int chunk = len; if (chunk > max_chunk) chunk = max_chunk; ret = send(sk->fd, buf + off, chunk, 0); if (ret <= 0) { if (max_chunk > 1024) { /* * Kernel not only refuses the whole chunk, * but refuses to split it into pieces too. * * When restoring recv queue in repair mode * kernel doesn't try hard and just allocates * a linear skb with the size we pass to the * system call. Thus, if the size is too big * for slab allocator, the send just fails * with ENOMEM. * * In any case -- try smaller chunk, hopefully * there's still enough memory in the system. 
*/ max_chunk >>= 1; continue; } logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", queue, ret, chunk, len, max_chunk); goto err; } off += ret; len -= ret; } while (len); err = 0; err: return err; } static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) { logd("\tRestoring TCP %d queue data %u bytes\n", queue, len); if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) { logerr("Can't set repair queue"); return -1; } return __send_queue(sk, queue, buf, len); } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, char *buf) { if (!buf) return 0; if (!data || data_size < SOCR_DATA_MIN_SIZE) return -1; if (queue == TCP_RECV_QUEUE) { if (!data->inq_len) return 0; return send_queue(sk, TCP_RECV_QUEUE, buf, data->inq_len); } if (queue == TCP_SEND_QUEUE) { __u32 len, ulen; /* * All data in a write buffer can be divided on two parts sent * but not yet acknowledged data and unsent data. * The TCP stack must know which data have been sent, because * acknowledgment can be received for them. These data must be * restored in repair mode. */ ulen = data->unsq_len; len = data->outq_len - ulen; if (len && send_queue(sk, TCP_SEND_QUEUE, buf, len)) return -2; if (ulen) { /* * The second part of data have never been sent to outside, so * they can be restored without any tricks. 
*/ tcp_repair_off(sk->fd); if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) return -3; if (tcp_repair_on(sk->fd)) return -4; } return 0; } return -5; } #define SET_Q_FLAGS (SOCCR_MEM_EXCL) int libsoccr_set_queue_bytes(struct libsoccr_sk *sk, int queue_id, char *bytes, unsigned flags) { if (flags & ~SET_Q_FLAGS) return -1; switch (queue_id) { case TCP_RECV_QUEUE: sk->recv_queue = bytes; if (flags & SOCCR_MEM_EXCL) sk->flags |= SK_FLAG_FREE_RQ; return 0; case TCP_SEND_QUEUE: sk->send_queue = bytes; if (flags & SOCCR_MEM_EXCL) sk->flags |= SK_FLAG_FREE_SQ; return 0; } return -1; } #define SET_SA_FLAGS (SOCCR_MEM_EXCL) int libsoccr_set_addr(struct libsoccr_sk *sk, int self, union libsoccr_addr *addr, unsigned flags) { if (flags & ~SET_SA_FLAGS) return -1; if (self) { sk->src_addr = addr; if (flags & SOCCR_MEM_EXCL) sk->flags |= SK_FLAG_FREE_SA; } else { sk->dst_addr = addr; if (flags & SOCCR_MEM_EXCL) sk->flags |= SK_FLAG_FREE_DA; } return 0; } criu-3.6/soccr/soccr.h000066400000000000000000000145031317335042600147260ustar00rootroot00000000000000#ifndef __LIBSOCCR_H__ #define __LIBSOCCR_H__ #include /* sockaddr_in, sockaddr_in6 */ #include /* TCP_REPAIR_WINDOW, TCP_TIMESTAMP */ #include /* uint32_t */ #include /* sockaddr */ #include "config.h" /* All packets with this mark have not to be blocked. */ #define SOCCR_MARK 0xC114 #ifndef CONFIG_HAS_TCP_REPAIR_WINDOW struct tcp_repair_window { uint32_t snd_wl1; uint32_t snd_wnd; uint32_t max_window; uint32_t rcv_wnd; uint32_t rcv_wup; }; #endif #ifndef CONFIG_HAS_TCP_REPAIR /* * It's been reported that both tcp_repair_opt * and TCP_ enum already shipped in netinet/tcp.h * system header by some distros thus we need a * test if we can use predefined ones or provide * our own. 
*/ struct tcp_repair_opt { uint32_t opt_code; uint32_t opt_val; }; enum { TCP_NO_QUEUE, TCP_RECV_QUEUE, TCP_SEND_QUEUE, TCP_QUEUES_NR, }; #endif #ifndef TCP_TIMESTAMP #define TCP_TIMESTAMP 24 #endif #ifndef TCP_REPAIR_WINDOW #define TCP_REPAIR_WINDOW 29 #endif void libsoccr_set_log(unsigned int level, void (*fn)(unsigned int level, const char *fmt, ...)); #define SOCCR_LOG_ERR 1 #define SOCCR_LOG_DBG 2 /* * An opaque handler for C/R-ing a TCP socket. */ struct libsoccr_sk; union libsoccr_addr { struct sockaddr sa; struct sockaddr_in v4; struct sockaddr_in6 v6; }; /* * Connection info that should be saved after fetching from the * socket and given back into the library in two steps (see below). */ struct libsoccr_sk_data { uint32_t state; uint32_t inq_len; uint32_t inq_seq; uint32_t outq_len; uint32_t outq_seq; uint32_t unsq_len; uint32_t opt_mask; uint32_t mss_clamp; uint32_t snd_wscale; uint32_t rcv_wscale; uint32_t timestamp; uint32_t flags; /* SOCCR_FLAGS_... below */ uint32_t snd_wl1; uint32_t snd_wnd; uint32_t max_window; uint32_t rcv_wnd; uint32_t rcv_wup; }; /* * The flags below denote which data on libsoccr_sk_data was get * from the kernel and is required for restore. Not present data * is zeroified by the library. * * Ideally the caller should carry the whole _data structure between * calls, but for optimization purposes it may analyze the flags * field and drop the unneeded bits. */ /* * Window parameters. Mark snd_wl1, snd_wnd, max_window, rcv_wnd * and rcv_wup fields. */ #define SOCCR_FLAGS_WINDOW 0x1 /* * These two calls pause and resume the socket for and after C/R * The first one returns an opaque handle that is to be used by all * the subsequent calls. * * For now the library only supports ESTABLISHED sockets. The caller * should check the socket is supported before calling the library. * * Before doing socket C/R make sure no packets can reach the socket * you're working with, nor any packet can leave the node from this * socket. 
This can be done by using netfilter DROP target (or by * DOWN-ing an interface in case of containers). */ struct libsoccr_sk *libsoccr_pause(int fd); void libsoccr_resume(struct libsoccr_sk *sk); /* This one is like _resume, but doesn't turn repair off on socket. */ void libsoccr_release(struct libsoccr_sk *sk); /* * Flags for calls below */ /* * Memory given to or taken from library is in exclusive ownership * of the resulting owner. I.e. -- when taken by caller from library, * the former will free() one, when given to the library, the latter * is to free() it. */ #define SOCCR_MEM_EXCL 0x1 /* * CHECKPOINTING calls * * Roughly the checkpoint steps for sockets in supported states are * * h = libsoccr_pause(sk); * libsoccr_save(h, &data, sizeof(data)) * inq = libsoccr_get_queue_bytes(h, TCP_RECV_QUEUE, 0) * outq = libsoccr_get_queue_bytes(h, TCP_SEND_QUEUE, 0) * getsockname(sk, &name, ...) * getpeername(sk, &peer, ...) * * save_all_bytes(h, inq, outq, name, peer) * * Resuming the socket afterwards effectively obsoletes the saved * info, as the connection resumes and old saved bytes become * outdated. * * Please note, that getsockname() and getpeername() are standard glibc * calls, not the libsoccr's ones. */ /* * Fills in the libsoccr_sk_data structure with connection info. The * data_size shows the size of a buffer. The returned value is the * amount of bytes put into data (the rest is zeroed with memset), * or a negative value on failure. */ int libsoccr_save(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size); /* * Get a pointer on the contents of queues. The amount of bytes is * determined from the filled libsoccr_sk_data by queue_id. * * For TCP_RECV_QUEUE the length is .inq_len * For TCP_SEND_QUEUE the length is .outq_len * * For any other queues returns NULL. * * The SOCCR_MEM_EXCL flag means that the caller grabs the buffer from * library and should free() it himself. Otherwise the buffer can * be claimed again and will be freed by library upon _resume call. 
*/ char *libsoccr_get_queue_bytes(struct libsoccr_sk *sk, int queue_id, unsigned flags); /* * Returns filled libsoccr_addr for a socket. This value is also required * on restore, but addresses may be obtained from somewhere else, these * are just common sockaddr-s. * * NOTE: not implemented in the library yet, the call always returns NULL. */ union libsoccr_addr *libsoccr_get_addr(struct libsoccr_sk *sk, int self, unsigned flags); /* * RESTORING calls * * The restoring of a socket is like below * * get_all_bytes(h, inq, outq, name, peer) * * sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); * * h = libsoccr_pause(sk) * libsoccr_set_queue_bytes(h, TCP_SEND_QUEUE, outq, 0); * libsoccr_set_queue_bytes(h, TCP_RECV_QUEUE, inq, 0); * libsoccr_set_addr(h, 1, src_addr, 0); * libsoccr_set_addr(h, 0, dst_addr, 0); * libsoccr_restore(h, &data, sizeof(data)) * * libsoccr_resume(h) * * Only after this the packets path from and to the socket can be * enabled back. */ /* * Set a pointer on the send/recv queue data. * If flags have SOCCR_MEM_EXCL, the buffer is stolen by the library and is * free()-ed after libsoccr_resume(). */ int libsoccr_set_queue_bytes(struct libsoccr_sk *sk, int queue_id, char *bytes, unsigned flags); /* * Set a pointer on the libsoccr_addr for src/dst. * If flags have SOCCR_MEM_EXCL, the buffer is stolen by the library and is * free()-ed after libsoccr_resume(). 
*/ int libsoccr_set_addr(struct libsoccr_sk *sk, int self, union libsoccr_addr *, unsigned flags); /* * Performs restore actions on a socket */ int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size); #endif criu-3.6/soccr/test/000077500000000000000000000000001317335042600144205ustar00rootroot00000000000000criu-3.6/soccr/test/Makefile000066400000000000000000000011611317335042600160570ustar00rootroot00000000000000CFLAGS += -Wall -g -I../../ LDFLAGS += -L../ -lsoccr ../libsoccr.a -lnet RUN ?= tcp-constructor run: ./local.sh tcp-constructor: tcp-constructor.c ../libsoccr.a $(CC) $(CFLAGS) tcp-constructor.c -o tcp-constructor $(LDFLAGS) clean: rm -f tcp-constructor tcp-conn: tcp-conn.c $(CC) $(CFLAGS) tcp-conn.c -o tcp-conn $(LDFLAGS) tcp-conn-v6: tcp-conn-v6.c $(CC) $(CFLAGS) -DTEST_IPV6 tcp-conn-v6.c -o tcp-conn-v6 $(LDFLAGS) test: tcp-constructor tcp-conn tcp-conn-v6 unshare -n sh -c "ip link set up dev lo; ./tcp-conn" unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" python run.py ./$(RUN) .PHONY: test criu-3.6/soccr/test/local.sh000077500000000000000000000000701317335042600160460ustar00rootroot00000000000000unshare -Urn sh -c 'ip link set up dev lo && make test' criu-3.6/soccr/test/run.py000066400000000000000000000022751317335042600156040ustar00rootroot00000000000000#!/usr/bin/env python2 import sys, os import hashlib from subprocess import Popen, PIPE str2 = "test_test" * (1 << 20) str1 = "Test_Test!" 
src = os.getenv("TCP_SRC", "127.0.0.1") dst = os.getenv("TCP_DST", "127.0.0.1") sport = os.getenv("TCP_SPORT", "12345") dport = os.getenv("TCP_DPORT", "54321") print sys.argv[1] args = [sys.argv[1], "--addr", src, "--port", sport, "--seq", "555", "--next", "--addr", dst, "--port", dport, "--seq", "666", "--reverse", "--", "./tcp-test.py"] p1 = Popen(args + ["dst"], stdout = PIPE, stdin = PIPE) args.remove("--reverse"); p2 = Popen(args + ["src"], stdout = PIPE, stdin = PIPE) p1.stdout.read(5) p2.stdout.read(5) p1.stdin.write("start") p2.stdin.write("start") p1.stdin.write(str1) p1.stdin.close() p2.stdin.write(str2) p2.stdin.close() s = p1.stdout.read() m = hashlib.md5() m.update(str2) str2 = m.hexdigest() if str2 != eval(s): print "FAIL", repr(str2), repr(s) sys.exit(5); s = p1.stdout.read() m = hashlib.md5() m.update(str1) str1 = m.hexdigest() s = p2.stdout.read() if str1 != eval(s): print "FAIL", repr(str1), s sys.exit(5); if p1.wait(): sys.exit(1) if p2.wait(): sys.exit(1) print "PASS" criu-3.6/soccr/test/tcp-conn-v6.c000077700000000000000000000000001317335042600205162tcp-conn.custar00rootroot00000000000000criu-3.6/soccr/test/tcp-conn.c000066400000000000000000000066531317335042600163170ustar00rootroot00000000000000#include #include /* for srvaddr_in and inet_ntoa() */ #include #include #include #include "../soccr.h" #include #define pr_perror(fmt, ...) printf(fmt ": %m\n", ##__VA_ARGS__) enum { TCP_NO_QUEUE, TCP_RECV_QUEUE, TCP_SEND_QUEUE, TCP_QUEUES_NR, }; static void pr_printf(unsigned int level, const char *fmt, ...) 
{ va_list args; va_start(args, fmt); vprintf(fmt, args); va_end(args); } int main() { union libsoccr_addr addr, dst; int srv, sock, clnt, rst; int ret, dsize; socklen_t dst_let; struct libsoccr_sk_data data = {}; struct libsoccr_sk *so, *so_rst; char buf[11] = "0123456789", *queue; libsoccr_set_log(10, pr_printf); memset(&addr,0,sizeof(addr)); #ifndef TEST_IPV6 addr.v4.sin_family = AF_INET; inet_pton(AF_INET, "0.0.0.0", &(addr.v4.sin_addr)); #else addr.v6.sin6_family = AF_INET6; inet_pton(AF_INET6, "::0", &(addr.v6.sin6_addr)); #endif #ifndef TEST_IPV6 srv = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); #else srv = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); #endif if (srv == -1) { pr_perror("socket() failed"); return -1; } #ifndef TEST_IPV6 addr.v4.sin_port = htons(8765); #else addr.v6.sin6_port = htons(8765); #endif ret = bind(srv, (struct sockaddr *) &addr, sizeof(addr)); if (ret == -1) { pr_perror("bind() failed"); return -1; } if (listen(srv, 1) == -1) { pr_perror("listen() failed"); return -1; } #ifndef TEST_IPV6 clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); #else clnt = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); #endif if (clnt == -1) { pr_perror("socket() failed"); return -1; } if (connect(clnt, (struct sockaddr *) &addr, sizeof(addr))) { pr_perror("connect"); return 1; } dst_let = sizeof(dst); sock = accept(srv, (struct sockaddr *) &dst, &dst_let); if (sock < 0) { pr_perror("accept"); return 1; } if (write(clnt, &buf, sizeof(buf)) != sizeof(buf)) { pr_perror("write"); return 1; } /* Start testing */ dst_let = sizeof(addr); if (getsockname(sock, (struct sockaddr *) &addr, &dst_let)) { pr_perror("connect"); return 1; } dst_let = sizeof(addr); if (getpeername(sock, (struct sockaddr *) &dst, &dst_let)) { pr_perror("connect"); return 1; } so = libsoccr_pause(sock); dsize = libsoccr_save(so, &data, sizeof(data)); if (dsize < 0) { pr_perror("libsoccr_save"); return 1; } #ifndef TEST_IPV6 rst = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); #else rst = 
socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); #endif if (rst == -1) { pr_perror("socket() failed"); return -1; } close(sock); so_rst = libsoccr_pause(rst); libsoccr_set_addr(so_rst, 1, &addr, 0); libsoccr_set_addr(so_rst, 0, &dst, 0); queue = libsoccr_get_queue_bytes(so, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); libsoccr_set_queue_bytes(so_rst, TCP_RECV_QUEUE, queue, SOCCR_MEM_EXCL); queue = libsoccr_get_queue_bytes(so, TCP_SEND_QUEUE, SOCCR_MEM_EXCL); libsoccr_set_queue_bytes(so_rst, TCP_SEND_QUEUE, queue, SOCCR_MEM_EXCL); ret = libsoccr_restore(so_rst, &data, dsize); if (ret) { pr_perror("libsoccr_restore: %d", ret); return 1; } libsoccr_resume(so_rst); libsoccr_resume(so); if (read(rst, &buf, sizeof(buf)) != sizeof(buf)) { pr_perror("read"); return 1; } if (write(rst, &buf, sizeof(buf)) != sizeof(buf)) { pr_perror("write"); return 1; } shutdown(rst, SHUT_WR); if (read(clnt, &buf, sizeof(buf)) != sizeof(buf)) { pr_perror("read"); return 1; } return 0; } criu-3.6/soccr/test/tcp-constructor.c000066400000000000000000000072061317335042600177420ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "soccr/soccr.h" #define pr_perror(fmt, ...) ({ fprintf(stderr, "%s:%d: " fmt " : %m\n", __func__, __LINE__, ##__VA_ARGS__); 1; }) struct tcp { char *addr; uint32_t port; uint32_t seq; uint16_t mss_clamp; uint16_t wscale; }; static void usage() { printf( "Usage: --addr ADDR -port PORT --seq SEQ --next --addr ADDR -port PORT --seq SEQ -- CMD ...\n" "\t Describe a source side of a connection, then set the --next option\n" "\t and describe a destination side.\n" "\t --reverse - swap source and destination sides\n" "\t The idea is that the same command line is execute on both sides,\n" "\t but the --reverse is added to one of them.\n" "\n" "\t CMD ... 
- a user command to handle a socket, which is the descriptor 3.\n" "\n" "\t It prints the \"start\" on stdout when a socket is created and\n" "\t resumes it when you write \"start\" to stdin.\n" ); } int main(int argc, char **argv) { static const char short_opts[] = ""; static struct option long_opts[] = { { "addr", required_argument, 0, 'a' }, { "port", required_argument, 0, 'p' }, { "seq", required_argument, 0, 's' }, { "next", no_argument, 0, 'n'}, { "reverse", no_argument, 0, 'r'}, {}, }; struct tcp tcp[2] = { {"127.0.0.1", 12345, 5000000, 1460, 7}, {"127.0.0.1", 54321, 6000000, 1460, 7} }; int sk, yes = 1, val, idx, opt, i, src = 0, dst = 1; union libsoccr_addr src_addr, dst_addr; struct libsoccr_sk_data data = {}; struct libsoccr_sk *so; char buf[1024]; i = 0; while (1) { idx = -1; opt = getopt_long(argc, argv, short_opts, long_opts, &idx); if (opt == -1) break; switch (opt) { case 'a': tcp[i].addr = optarg; break; case 'p': tcp[i].port = atol(optarg); break; case 's': tcp[i].seq = atol(optarg); break; case 'n': i++; if (i > 1) return pr_perror("--next is used twice or more"); break; case 'r': src = 1; dst = 0; break; default: usage(); return 3; } } if (i != 1) return pr_perror("--next is required"); if (optind == argc) { usage(); return 1; } for (i = 0; i < 2; i++) fprintf(stderr, "%s:%d:%d\n", tcp[i].addr, tcp[i].port, tcp[i].seq); data.state = TCP_ESTABLISHED; data.inq_seq = tcp[dst].seq; data.outq_seq = tcp[src].seq; sk = socket(AF_INET, SOCK_STREAM, 0); if (sk < 0) return pr_perror("socket"); so = libsoccr_pause(sk); if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) == -1) return pr_perror("setsockopt"); src_addr.v4.sin_family = AF_INET; src_addr.v4.sin_port = htons(tcp[src].port); if (inet_pton(AF_INET, tcp[src].addr, &src_addr.v4.sin_addr) != 1) return pr_perror("inet_pton"); dst_addr.v4.sin_family = AF_INET; dst_addr.v4.sin_port = htons(tcp[dst].port); if (inet_pton(AF_INET, tcp[dst].addr, &(dst_addr.v4.sin_addr)) != 1) return 
pr_perror("inet_pton"); libsoccr_set_addr(so, 1, &src_addr, 0); libsoccr_set_addr(so, 0, &dst_addr, 0); data.snd_wscale = tcp[src].wscale; data.rcv_wscale = tcp[dst].wscale; data.mss_clamp = tcp[src].mss_clamp; data.opt_mask = TCPI_OPT_WSCALE | TCPOPT_MAXSEG; if (libsoccr_restore(so, &data, sizeof(data))) return 1; /* Let's go */ if (write(STDOUT_FILENO, "start", 5) != 5) return pr_perror("write"); if (read(STDIN_FILENO, buf, 5) != 5) return pr_perror("read"); val = 0; if (setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val))) return pr_perror("TCP_REPAIR"); execv(argv[optind], argv + optind); return pr_perror("Unable to exec %s", argv[optind]); } criu-3.6/soccr/test/tcp-test.py000077500000000000000000000007011317335042600165360ustar00rootroot00000000000000#!/usr/bin/env python2 import os, sys, socket import hashlib sk = socket.fromfd(3, socket.AF_INET, socket.SOCK_STREAM) s = sys.stdin.read() ret = sk.send(s) print >> sys.stderr, "%s: send() -> %d" % (sys.argv[1], ret) sk.shutdown(socket.SHUT_WR) m = hashlib.md5() while True: s = sk.recv((1 << 20) * 10) if not s: break print >> sys.stderr, "%s: recv() -> %d" % (sys.argv[1], len(s)) m.update(s) print repr(m.hexdigest()) criu-3.6/test/000077500000000000000000000000001317335042600133075ustar00rootroot00000000000000criu-3.6/test/.gitignore000066400000000000000000000002401317335042600152730ustar00rootroot00000000000000/lib /lib64 /bin /sbin /dev /dump /tmp /usr /.constructed /*.log /zdtm_ct /zdtm-tst-list /stats-restore /zdtm_mount_cgroups.lock /compel/handle_binary /umount2 criu-3.6/test/Makefile000066400000000000000000000024561317335042600147560ustar00rootroot00000000000000RM := rm -f --one-file-system ZDTM_ARGS ?= -C export ZDTM_ARGS all: $(MAKE) zdtm $(MAKE) zdtm-pre-dump $(MAKE) zdtm-snapshot $(MAKE) zdtm-iter $(MAKE) zdtm-freezer .PHONY: all TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev other: for t in $(TESTS); do \ setsid $(MAKE) -C others/$$t run 
|| exit 1; \ done .PHONY: other zdtm: ./zdtm.py run -a --parallel 2 .PHONY: zdtm zdtm-pre-dump: ./zdtm.py run --pre 2:1 -t zdtm/transition/fork -f uns .PHONY: zdtm-pre-dump zdtm-snapshot: ./zdtm.py run --pre 2:1 --snap -t zdtm/transition/fork -f uns .PHONY: zdtm-snapshot zdtm-iter: ./zdtm.py run --iters 3:1 -t zdtm/transition/fork -f uns .PHONY: zdtm-iter zdtm-freezer: ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:t ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:f .PHONY: zdtm-freezer fault-injection: $(MAKE) -C fault-injection .PHONY: fault-injection override CFLAGS += -D_GNU_SOURCE clean_root: $(Q) ./zdtm.py clean nsroot .PHONY: clean_root clean: clean_root $(RM) zdtm_ct zdtm-tst-list umount2 $(Q) $(RM) *.log $(Q) $(RM) -r ./dump/ $(Q) $(MAKE) -C zdtm cleandep clean cleanout $(Q) $(MAKE) -C libcriu clean $(Q) $(MAKE) -C rpc clean $(Q) $(MAKE) -C crit clean .PHONY: clean criu-3.6/test/abrt.sh000077500000000000000000000006301317335042600145750ustar00rootroot00000000000000#!/bin/bash -x pid=$1 vpid=$2 sig=$3 comm=$4 exec &>> /tmp/zdtm-core.log expr match "$comm" zombie00 && { cat > /dev/null exit 0 } report="/tmp/zdtm-core-$pid-$comm" exec &> ${report}.txt ps axf ps -p $pid cat /proc/$pid/status ls -l /proc/$pid/fd cat /proc/$pid/maps exec 33< /proc/$pid/exe cat > $report.core echo 'bt i r disassemble $rip-0x10,$rip + 0x10 ' | gdb -c $report.core /proc/self/fd/33 criu-3.6/test/check_actions.py000077500000000000000000000020651317335042600164640ustar00rootroot00000000000000#!/usr/bin/env python2 import sys import os actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ 'post-setup-namespaces', 'post-restore', 'post-resume', \ 'network-lock', 'network-unlock' ]) errors = [] af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' for act in open(af): act = act.strip().split() act.append('EMPTY') act.append('EMPTY') if act[0] == 'EMPTY': raise Exception("Error in test, bogus 
actions line") if act[1] == 'EMPTY': errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ 'post-restore', 'post-resume', 'network-lock', 'network-unlock'): if act[2] == 'EMPTY': errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) elif not act[2].isdigit() or int(act[2]) == 0: errors.append('Action %s PID is not number (%s)' % (act[0], act[2])) actions -= set([act[0]]) if actions: errors.append('Not all actions called: %r' % actions) if errors: for x in errors: print x sys.exit(1) print 'PASS' criu-3.6/test/compel/000077500000000000000000000000001317335042600145665ustar00rootroot00000000000000criu-3.6/test/compel/Makefile000066400000000000000000000010441317335042600162250ustar00rootroot00000000000000# Relative path to original objects define compel_obj_path $(addprefix ../../compel/,$(1)) endef host-ccflags-y += -iquote test/compel/arch/$(ARCH)/include test_objs := $(filter-out main.o,$(compel-objs)) hostprogs-y += handle_binary handle_binary-objs += $(call compel_obj_path,$(test_objs)) handle_binary-objs += main.o handle_binary-objs += handle_binary.o ifeq ($(ARCH),x86) handle_binary-objs += handle_binary_32.o HOSTCFLAGS_handle_binary.o += -DCONFIG_X86_64 HOSTCFLAGS_handle_binary_32.o += -DCONFIG_X86_32 endif criu-3.6/test/compel/arch/000077500000000000000000000000001317335042600155035ustar00rootroot00000000000000criu-3.6/test/compel/arch/aarch64/000077500000000000000000000000001317335042600167335ustar00rootroot00000000000000criu-3.6/test/compel/arch/aarch64/include/000077500000000000000000000000001317335042600203565ustar00rootroot00000000000000criu-3.6/test/compel/arch/aarch64/include/arch_test_handle_binary.h000066400000000000000000000011461317335042600253640ustar00rootroot00000000000000#ifndef __ARCH_TEST_HANDLE_BINARY__ #define __ARCH_TEST_HANDLE_BINARY__ #include #include "uapi/elf64-types.h" #define arch_run_tests(mem) __run_tests(mem, "") extern int __run_tests(void 
*mem, const char *msg); static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ memcpy(mem, elf_ident_64_le, sizeof(elf_ident_64_le)); #else memcpy(mem, elf_ident_64_be, sizeof(elf_ident_64_be)); #endif } static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) { hdr->e_machine = EM_AARCH64; } #endif /* __ARCH_TEST_HANDLE_BINARY__ */ criu-3.6/test/compel/arch/arm/000077500000000000000000000000001317335042600162625ustar00rootroot00000000000000criu-3.6/test/compel/arch/arm/include/000077500000000000000000000000001317335042600177055ustar00rootroot00000000000000criu-3.6/test/compel/arch/arm/include/arch_test_handle_binary.h000066400000000000000000000007521317335042600247150ustar00rootroot00000000000000#ifndef __ARCH_TEST_HANDLE_BINARY__ #define __ARCH_TEST_HANDLE_BINARY__ #include #include "uapi/elf32-types.h" #define arch_run_tests(mem) __run_tests(mem, "") extern int __run_tests(void *mem, const char *msg); static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) { memcpy(mem, elf_ident_32, sizeof(elf_ident_32)); } static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) { hdr->e_machine = EM_ARM; } #endif /* __ARCH_TEST_HANDLE_BINARY__ */ criu-3.6/test/compel/arch/ppc64/000077500000000000000000000000001317335042600164375ustar00rootroot00000000000000criu-3.6/test/compel/arch/ppc64/include/000077500000000000000000000000001317335042600200625ustar00rootroot00000000000000criu-3.6/test/compel/arch/ppc64/include/arch_test_handle_binary.h000066400000000000000000000011441317335042600250660ustar00rootroot00000000000000#ifndef __ARCH_TEST_HANDLE_BINARY__ #define __ARCH_TEST_HANDLE_BINARY__ #include #include "uapi/elf64-types.h" #define arch_run_tests(mem) __run_tests(mem, "") extern int __run_tests(void *mem, const char *msg); static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ memcpy(mem, elf_ident_64_le, 
sizeof(elf_ident_64_le)); #else memcpy(mem, elf_ident_64_be, sizeof(elf_ident_64_be)); #endif } static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) { hdr->e_machine = EM_PPC64; } #endif /* __ARCH_TEST_HANDLE_BINARY__ */ criu-3.6/test/compel/arch/x86/000077500000000000000000000000001317335042600161305ustar00rootroot00000000000000criu-3.6/test/compel/arch/x86/include/000077500000000000000000000000001317335042600175535ustar00rootroot00000000000000criu-3.6/test/compel/arch/x86/include/arch_test_handle_binary.h000066400000000000000000000020521317335042600245560ustar00rootroot00000000000000#ifndef __ARCH_TEST_HANDLE_BINARY__ #define __ARCH_TEST_HANDLE_BINARY__ #include #ifdef CONFIG_X86_64 #include "uapi/elf64-types.h" #define __run_tests run_tests_64 static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) { memcpy(mem, elf_ident_64_le, sizeof(elf_ident_64_le)); } static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) { hdr->e_machine = EM_X86_64; } #else /* !CONFIG_X86_64 */ #include "uapi/elf32-types.h" #define __run_tests run_tests_32 static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) { memcpy(mem, elf_ident_32, sizeof(elf_ident_32)); } static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) { hdr->e_machine = EM_386; } #endif /* CONFIG_X86_32 */ extern int run_tests_64(void *mem, const char *msg); extern int run_tests_32(void *mem, const char *msg); static __maybe_unused int arch_run_tests(void *mem) { int ret; ret = run_tests_64(mem, "(64-bit ELF)"); ret += run_tests_32(mem, "(32-bit ELF)"); return ret; } #endif /* __ARCH_TEST_HANDLE_BINARY__ */ criu-3.6/test/compel/handle_binary.c000066400000000000000000000044331317335042600175350ustar00rootroot00000000000000#include #include "uapi/piegen-err.h" #include "piegen.h" #include "arch_test_handle_binary.h" extern int launch_test(void *mem, int expected_ret, const char *test_fmt, ...); extern const size_t test_elf_buf_size; static uintptr_t elf_addr; 
static const char *test_bitness; #define ASSERT(expected, fmt, ...) \ launch_test((void *)elf_addr, expected, \ fmt " %s", ##__VA_ARGS__, test_bitness) static const unsigned int sections_nr = 1; static void set_elf_hdr_relocatable(Ehdr_t *hdr) { hdr->e_type = ET_REL; hdr->e_version = EV_CURRENT; } static int test_add_strings_section(Ehdr_t *hdr) { Shdr_t *sec_strings_hdr; uintptr_t sections_table = elf_addr + hdr->e_shoff; size_t sections_table_size = sections_nr*sizeof(hdr->e_shentsize); hdr->e_shnum = sections_nr; hdr->e_shstrndx = sections_nr; /* off-by-one */ if (ASSERT(-E_NO_STR_SEC, "strings section's header oob of section table")) return -1; hdr->e_shstrndx = 0; sec_strings_hdr = (void *)sections_table; sec_strings_hdr->sh_offset = (Off_t)-1; if (ASSERT(-E_NO_STR_SEC, "strings section oob")) return -1; /* Put strings just right after sections table. */ sec_strings_hdr->sh_offset = sections_table - elf_addr + sections_table_size; return 0; } static int test_prepare_section_table(Ehdr_t *hdr) { hdr->e_shoff = (Off_t)test_elf_buf_size; if (ASSERT(-E_NO_STR_SEC, "section table start oob")) return -1; /* Lets put sections table right after ELF header. 
*/ hdr->e_shoff = (Off_t) sizeof(Ehdr_t); hdr->e_shentsize = (Half_t) sizeof(Shdr_t); hdr->e_shnum = (Half_t)-1; if (ASSERT(-E_NO_STR_SEC, "too many sections in table")) return -1; if (test_add_strings_section(hdr)) return -1; return 0; } static int test_prepare_elf_header(void *elf) { memset(elf, 0, sizeof(Ehdr_t)); if (ASSERT(-E_NOT_ELF, "zero ELF header")) return -1; arch_test_set_elf_hdr_ident(elf); if (ASSERT(-E_NOT_ELF, "unsupported ELF header")) return -1; arch_test_set_elf_hdr_machine(elf); if (ASSERT(-E_NOT_ELF, "non-relocatable ELF header")) return -1; set_elf_hdr_relocatable(elf); if (test_prepare_section_table(elf)) return -1; return 0; } int __run_tests(void *mem, const char *msg) { elf_addr = (uintptr_t)mem; test_bitness = msg; if (test_prepare_elf_header(mem)) return 1; return 0; } criu-3.6/test/compel/handle_binary_32.c000077700000000000000000000000001317335042600227732handle_binary.custar00rootroot00000000000000criu-3.6/test/compel/main.c000066400000000000000000000027101317335042600156560ustar00rootroot00000000000000/* * Test for handle_binary(). * In this test ELF binary file is constructed from * header up to sections and relocations. * On each stage it tests non-valid ELF binaries to be parsed. * For passing test, handle_binary should return errors for all * non-valid binaries and handle all relocations. * * Test author: Dmitry Safonov */ #include #include #include #include #include "piegen.h" #include "arch_test_handle_binary.h" /* size of buffer with formed ELF file */ const size_t test_elf_buf_size = 4096; extern int handle_binary(void *mem, size_t size); extern void run_tests(void *mem); /* To shut down error printing on tests for failures */ piegen_opt_t opts = { .fout = NULL, .ferr = NULL, .fdebug = NULL, }; int launch_test(void *mem, int expected_ret, const char *test_fmt, ...) 
{ static unsigned test_nr = 1; int ret = handle_binary(mem, test_elf_buf_size); va_list params; va_start(params, test_fmt); if (ret != expected_ret) { printf("not ok %u - ", test_nr); vprintf(test_fmt, params); printf(", expected %d but ret is %d\n", expected_ret, ret); } else { printf("ok %u - ", test_nr); vprintf(test_fmt, params); putchar('\n'); } va_end(params); test_nr++; fflush(stdout); return ret != expected_ret; } int main(int argc, char **argv) { void *elf_buf = malloc(test_elf_buf_size); int ret; ret = arch_run_tests(elf_buf); free(elf_buf); return ret; } criu-3.6/test/crit-recode.py000077500000000000000000000027631317335042600160740ustar00rootroot00000000000000#!/bin/env python2 import py as pycriu import sys import os import subprocess find = subprocess.Popen(['find', 'test/dump/', '-size', '+0', '-name', '*.img'], stdout = subprocess.PIPE) test_pass = True def recode_and_check(imgf, o_img, pretty): try: pb = pycriu.images.loads(o_img, pretty) except pycriu.images.MagicException as me: print "%s magic %x error" % (imgf, me.magic) return False except: print "%s %sdecode fails" % (imgf, pretty and 'pretty ' or '') return False try: r_img = pycriu.images.dumps(pb) except: print "%s %sencode fails" % (imgf, pretty and 'pretty ' or '') return False if o_img != r_img: print "%s %srecode mismatch" % (imgf, pretty and 'pretty ' or '') return False return True for imgf in find.stdout.readlines(): imgf = imgf.strip() imgf_b = os.path.basename(imgf) if imgf_b.startswith('pages-'): continue if imgf_b.startswith('iptables-'): continue if imgf_b.startswith('ip6tables-'): continue if imgf_b.startswith('route-'): continue if imgf_b.startswith('route6-'): continue if imgf_b.startswith('ifaddr-'): continue if imgf_b.startswith('tmpfs-'): continue if imgf_b.startswith('netns-ct-'): continue if imgf_b.startswith('netns-exp-'): continue if imgf_b.startswith('rule-'): continue o_img = open(imgf).read() if not recode_and_check(imgf, o_img, False): test_pass = False if not 
recode_and_check(imgf, o_img, True): test_pass = False find.wait() if not test_pass: print "FAIL" sys.exit(1) print "PASS" criu-3.6/test/criu.py000077700000000000000000000000001317335042600175432../lib/py/criu.pyustar00rootroot00000000000000criu-3.6/test/empty-netns-prep.sh000077500000000000000000000006261317335042600171010ustar00rootroot00000000000000#!/bin/bash set -ex if [ "$CRTOOLS_SCRIPT_ACTION" == "setup-namespaces" ]; then echo "Will up lo at $CRTOOLS_INIT_PID netns" mkdir -p /var/run/netns mount -t tmpfs xxx /var/run/netns touch /var/run/netns/emptyns mount --bind /proc/$CRTOOLS_INIT_PID/ns/net /var/run/netns/emptyns ip netns exec emptyns ip link set up dev lo || exit 1 ip netns exec emptyns ip a umount -l /var/run/netns fi exit 0 criu-3.6/test/groups.desc000066400000000000000000000000451317335042600154650ustar00rootroot00000000000000{ 'dir': 'groups/', 'exclude': [ ] } criu-3.6/test/inhfd.desc000066400000000000000000000000441317335042600152350ustar00rootroot00000000000000{ 'dir': 'inhfd/', 'exclude': [ ] } criu-3.6/test/inhfd/000077500000000000000000000000001317335042600143775ustar00rootroot00000000000000criu-3.6/test/inhfd/fifo.py000077500000000000000000000013571317335042600157050ustar00rootroot00000000000000import os, tempfile id_str = "" def create_fds(): tdir = tempfile.mkdtemp("zdtm.inhfd.XXXXXX") if os.system("mount -t tmpfs zdtm.inhfd %s" % tdir) != 0: raise Exception("Unable to mount tmpfs") tfifo = os.path.join(tdir, "test_fifo") os.mkfifo(tfifo) fd2 = open(tfifo, "w+") fd1 = open(tfifo, "r") os.system("umount -l %s" % tdir) os.rmdir(tdir) mnt_id = -1; f = open("/proc/self/fdinfo/%d" % fd1.fileno()) for l in f: l = l.split() if l[0] == "mnt_id:": mnt_id = int(l[1]) break else: raise Exception("Unable to find mnt_id") global id_str id_str = "file[%x:%x]" % (mnt_id, os.fstat(fd1.fileno()).st_ino) return (fd2, fd1) def filename(pipef): return id_str def dump_opts(sockf): return [ "--external", id_str ] 
criu-3.6/test/inhfd/fifo.py.desc000066400000000000000000000000221317335042600166030ustar00rootroot00000000000000{ 'flavor': 'h' } criu-3.6/test/inhfd/pipe.py000077500000000000000000000003271317335042600157130ustar00rootroot00000000000000import os def create_fds(): (fd1, fd2) = os.pipe() return (os.fdopen(fd2, "w"), os.fdopen(fd1, "r")) def filename(pipef): return 'pipe:[%d]' % os.fstat(pipef.fileno()).st_ino def dump_opts(sockf): return [ ] criu-3.6/test/inhfd/pipe.py.desc000066400000000000000000000000221317335042600166150ustar00rootroot00000000000000{ 'flavor': 'h' } criu-3.6/test/inhfd/socket.py000077500000000000000000000005541317335042600162500ustar00rootroot00000000000000import socket import os def create_fds(): (sk1, sk2) = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM) return (sk1.makefile("w"), sk2.makefile("r")) def __sock_ino(sockf): return os.fstat(sockf.fileno()).st_ino def filename(sockf): return 'socket:[%d]' % __sock_ino(sockf) def dump_opts(sockf): return ['--external', 'unix[%d]' % __sock_ino(sockf)] criu-3.6/test/inhfd/socket.py.desc000066400000000000000000000000221317335042600171500ustar00rootroot00000000000000{ 'flavor': 'h' } criu-3.6/test/inhfd/tty.py000077500000000000000000000006451317335042600156010ustar00rootroot00000000000000import os, pty import termios, fcntl def child_prep(fd): fcntl.ioctl(fd.fileno(), termios.TIOCSCTTY, 1) def create_fds(): (fd1, fd2) = pty.openpty() return (os.fdopen(fd2, "w"), os.fdopen(fd1, "r")) def filename(pipef): st = os.fstat(pipef.fileno()) return 'tty[%x:%x]' % (st.st_rdev, st.st_dev) def dump_opts(sockf): st = os.fstat(sockf.fileno()) return ["--external", 'tty[%x:%x]' % (st.st_rdev, st.st_dev)] criu-3.6/test/inhfd/tty.py.desc000066400000000000000000000000221317335042600165000ustar00rootroot00000000000000{ 'flavor': 'h' } 
criu-3.6/test/jenkins/000077500000000000000000000000001317335042600147505ustar00rootroot00000000000000criu-3.6/test/jenkins/_run_ct000077500000000000000000000002211317335042600163220ustar00rootroot00000000000000#!/bin/sh set -e mount --make-rslave / umount -l /proc mount -t proc proc /proc/ mount -t binfmt_misc none /proc/sys/fs/binfmt_misc/ exec "$@" criu-3.6/test/jenkins/actions.sh000077500000000000000000000003471317335042600167530ustar00rootroot00000000000000# Check how crit de/encodes images set -e source `dirname $0`/criu-lib.sh # prep rm -f actions_called.txt ./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail ./test/check_actions.py || fail exit 0 criu-3.6/test/jenkins/crit.sh000077500000000000000000000003461317335042600162530ustar00rootroot00000000000000# Check how crit de/encodes images set -e source `dirname $0`/criu-lib.sh prep ./test/zdtm.py run --all -f best -x maps04 -x cgroup02 --norst --keep-img always || fail PYTHONPATH="$(pwd)/lib/" ./test/crit-recode.py || fail exit 0 criu-3.6/test/jenkins/criu-btrfs.sh000066400000000000000000000002711317335042600173640ustar00rootroot00000000000000# This is a job which is executed on btrfs source `dirname $0`/criu-lib.sh && prep && make -C test -j 4 ZDTM_ARGS="-C -x '\(maps04\|mountpoints\|inotify_irmap\)'" zdtm && true || fail criu-3.6/test/jenkins/criu-by-id.sh000066400000000000000000000006371317335042600172560ustar00rootroot00000000000000echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/cpu.rt_runtime_us echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/jenkins.service/cpu.rt_runtime_us git checkout -f ${TEST_COMMIT} git clean -dfx && make -j 4 && make -j 4 -C test/zdtm && mkdir -p test/dump && mount -t tmpfs zdtm test/dump && make -C test -j 4 zdtm_ns && true || { tar -czf /home/criu-by-id-${TEST_COMMIT}-$(date +%m%d%H%M).tar.gz . 
exit 1 } criu-3.6/test/jenkins/criu-dedup.sh000077500000000000000000000015721317335042600173550ustar00rootroot00000000000000# Check auto-deduplication of pagemaps set -e source `dirname $0`/criu-lib.sh prep ./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 || fail # Additionally run these tests as they touch a lot of # memory and it makes sense to additionally check it # with delays between iterations ./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --dedup || fail ./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --dedup || fail ./test/zdtm.py run -t zdtm/transition/maps008 --keep-going --report report -f h --pre 8:.1 --dedup || fail ./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail ./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail criu-3.6/test/jenkins/criu-dump.sh000077500000000000000000000003221317335042600172110ustar00rootroot00000000000000# Check that dump is not destructive set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py run --all --keep-going --report report --parallel 4 --norst -x 'maps04' -x 'cgroup02' || fail criu-3.6/test/jenkins/criu-fault.sh000077500000000000000000000026751317335042600173740ustar00rootroot00000000000000#!/bin/bash # Check known fault injections set -e source `dirname $0`/criu-lib.sh prep ./test/zdtm.py run -t zdtm/static/env00 --fault 1 --keep-going --report report -f h || fail ./test/zdtm.py run -t zdtm/static/unlink_fstat00 --fault 2 --keep-going --report report -f h || fail ./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --keep-going --report report -f h || fail ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --keep-going --pre 2 -f uns || fail ./test/zdtm.py run -t zdtm/static/env00 --fault 129 -f uns || fail ./test/zdtm.py 
run -t zdtm/transition/fork --fault 130 -f h || fail ./test/zdtm.py run -t zdtm/static/vdso01 --fault 127 || fail ./test/zdtm.py run -t zdtm/static/vdso-proxy --fault 127 --iters 3 || fail ./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 2 --keep-going --report report || fail ./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 4 --keep-going --report report || fail ./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 6 --report report || fail ./test/zdtm.py run -t zdtm/static/mntns_link_remap --fault 6 --report report || fail ./test/zdtm.py run -t zdtm/static/unlink_fstat03 --fault 6 --report report || fail ./test/zdtm.py run -t zdtm/static/env00 --fault 5 --keep-going --report report || fail ./test/zdtm.py run -t zdtm/static/maps04 --fault 131 --keep-going --report report --pre 2:1 || fail ./test/zdtm.py run -t zdtm/transition/maps008 --fault 131 --keep-going --report report --pre 2:1 || fail criu-3.6/test/jenkins/criu-fcg.sh000077500000000000000000000014251317335042600170100ustar00rootroot00000000000000# Test how freeze cgroup works set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f || fail ./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --pre 3 || fail ./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --norst || fail ./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t || fail ./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --pre 3 || fail ./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --norst || fail criu-3.6/test/jenkins/criu-groups.sh000077500000000000000000000004111317335042600175620ustar00rootroot00000000000000# Make one regular C/R cycle over randomly-generated groups set -e 
source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py group --max 32 -x maps04 -x cgroup || fail ./test/zdtm.py --set groups run --all --keep-going --report report -f best || fail criu-3.6/test/jenkins/criu-inhfd.sh000077500000000000000000000002331317335042600173350ustar00rootroot00000000000000# Check known fault injections set -e source `dirname $0`/criu-lib.sh prep ./test//zdtm.py --set inhfd run --all --keep-going --report report -f h || fail criu-3.6/test/jenkins/criu-iter.sh000077500000000000000000000003211317335042600172060ustar00rootroot00000000000000# Make 3 iteration of dump/restore for each test set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py run --all --keep-going --report report --parallel 4 --iter 3 -x 'maps04' || fail criu-3.6/test/jenkins/criu-join-ns.sh000077500000000000000000000003201317335042600176170ustar00rootroot00000000000000# Make one regular C/R cycle set -e source `dirname $0`/criu-lib.sh prep mkdir -p /var/run/netns mount -t tmpfs zdtm_run /var/run/netns ./test/zdtm.py run --all --keep-going --report report --join-ns || fail criu-3.6/test/jenkins/criu-lazy-pages.sh000077500000000000000000000014421317335042600203240ustar00rootroot00000000000000# Check lazy-pages set -e source `dirname $0`/criu-lib.sh prep KERN_MAJ=`uname -r | cut -d. -f1` KERN_MIN=`uname -r | cut -d. 
-f2` if [ $KERN_MAJ -ge "4" ] && [ $KERN_MIN -ge "11" ]; then LAZY_EXCLUDE="-x cmdlinenv00" else LAZY_EXCLUDE="-x maps007 -x fork -x fork2 -x uffd-events -x cgroupns -x socket_listen -x socket_listen6 -x cmdlinenv00 -x socket_close_data01 -x file_read" fi # lazy restore from images ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ --lazy-pages $LAZY_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ --lazy-pages --pre 2 $LAZY_EXCLUDE || fail criu-3.6/test/jenkins/criu-lib.sh000066400000000000000000000016221317335042600170130ustar00rootroot00000000000000function exit_hook() { test -z "$GCOV" && return make gcov } function prep() { test -n "$SKIP_PREP" && return # systemd executes jenkins in a separate sched cgroup. echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/cpu.rt_runtime_us || true echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/jenkins.service/cpu.rt_runtime_us || true test -n "$GCOV" && umask 0000 ulimit -c unlimited && export CFLAGS=-g git clean -dfx && make -j 4 && make -j 4 -C test/zdtm/ && make -C test zdtm_ct && mkdir -p test/report && trap exit_hook EXIT } function mount_tmpfs_to_dump() { test -n "$SKIP_PREP" && return mkdir -p test/dump && mount -t tmpfs criu_dump test/dump && true } function fail() { uname -a ps axf > ps.log cat /sys/kernel/debug/tracing/trace > trace.log tar -czf /home/`basename $0`-${BUILD_NUMBER}-${GIT_COMMIT}-$(date +%m%d%H%M).tar.gz . 
tar -czf report.tar.gz -C test/ report exit 1 } criu-3.6/test/jenkins/criu-other.sh000077500000000000000000000001161317335042600173660ustar00rootroot00000000000000source `dirname $0`/criu-lib.sh && prep && make -C test other && true || fail criu-3.6/test/jenkins/criu-overlay.sh000077500000000000000000000005311317335042600177270ustar00rootroot00000000000000# Make one regular C/R cycle set -e source `dirname $0`/criu-lib.sh prep mkdir -p test.up test.work mount -t overlay overlay -olowerdir=test,upperdir=test.up,workdir=test.work test ./test/zdtm.py run --all --keep-going --report report --parallel 4 -x inotify -x mntns_open -x socket -x sk-unix -x unlink -x fsnotify -x fanotify -x ghost || fail criu-3.6/test/jenkins/criu-pre-dump.sh000077500000000000000000000005101317335042600177740ustar00rootroot00000000000000# Check 3 pre-dump-s before dump (with and w/o page server) set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' || fail ./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' || fail criu-3.6/test/jenkins/criu-remote-lazy-pages.sh000077500000000000000000000015171317335042600216200ustar00rootroot00000000000000# Check remote-lazy-pages set -e source `dirname $0`/criu-lib.sh prep KERN_MAJ=`uname -r | cut -d. -f1` KERN_MIN=`uname -r | cut -d. 
-f2` if [ $KERN_MAJ -ge "4" ] && [ $KERN_MIN -ge "11" ]; then LAZY_EXCLUDE="-x cmdlinenv00" else LAZY_EXCLUDE="-x maps007 -x fork -x fork2 -x uffd-events -x cgroupns -x socket_listen -x socket_listen6 -x cmdlinenv00 -x socket_close_data01 -x file_read" fi # lazy restore from "remote" dump ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ --remote-lazy-pages $LAZY_EXCLUDE -x maps04 || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from "remote" dump with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ --remote-lazy-pages --pre 2 $LAZY_EXCLUDE || fail criu-3.6/test/jenkins/criu-sibling.sh000077500000000000000000000003221317335042600176730ustar00rootroot00000000000000# Make 3 iteration of dump/restore for each test set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py run --all --keep-going --report report --sibling --parallel 4 -x 'maps04' || fail criu-3.6/test/jenkins/criu-snap.sh000077500000000000000000000004561317335042600172150ustar00rootroot00000000000000# Check snapshots set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' || fail ./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' || fail criu-3.6/test/jenkins/criu-stop.sh000066400000000000000000000002161317335042600172300ustar00rootroot00000000000000# Check --leave-stopped option set -e source `dirname $0`/criu-lib.sh prep ./test/zdtm.py run -t zdtm/transition/fork --stop --iter 3 || fail criu-3.6/test/jenkins/criu-user.sh000077500000000000000000000003171317335042600172260ustar00rootroot00000000000000# Make 3 iteration of dump/restore for each test set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump ./test/zdtm.py run --all --keep-going --report report --parallel 4 --user -x 'maps04' || fail 
criu-3.6/test/jenkins/criu.sh000077500000000000000000000002241317335042600162470ustar00rootroot00000000000000# Make one regular C/R cycle set -e source `dirname $0`/criu-lib.sh prep ./test/zdtm.py run --all --keep-going --report report --parallel 4 || fail criu-3.6/test/jenkins/run_ct000077500000000000000000000001241317335042600161650ustar00rootroot00000000000000#!/bin/sh unshare --mount --pid --fork -- $(readlink -f `dirname $0`/_run_ct) "$@" criu-3.6/test/others/000077500000000000000000000000001317335042600146135ustar00rootroot00000000000000criu-3.6/test/others/app-emu.sh000077500000000000000000000006141317335042600165170ustar00rootroot00000000000000#!/bin/sh TEST_LIST=" vnc java/HelloWorld screen tarbz make " [ -n "$1" ] && TEST_LIST="$1" BASE_DIR=`pwd`/`dirname $0` for t in $TEST_LIST; do dir=$BASE_DIR/app-emu/$t log=$dir/run.log ( cd $dir bash ./run.sh ) 2>&1 | tee $log grep PASS $log || { echo "Test: $t" echo "====================== ERROR ======================" echo "Run log : $log" echo "$t " exit 1 } done criu-3.6/test/others/app-emu/000077500000000000000000000000001317335042600161575ustar00rootroot00000000000000criu-3.6/test/others/app-emu/java/000077500000000000000000000000001317335042600171005ustar00rootroot00000000000000criu-3.6/test/others/app-emu/java/HelloWorld/000077500000000000000000000000001317335042600211535ustar00rootroot00000000000000criu-3.6/test/others/app-emu/java/HelloWorld/HelloWorld.java000066400000000000000000000006071317335042600240740ustar00rootroot00000000000000/* * Trivial program which requires no * additional imports */ public class HelloWorld { public static void main(String[] args) { int nr_sleeps = 5; for (;;) { System.out.println("Hello World"); if (nr_sleeps == 0) System.exit(0); try { Thread.sleep(1000); nr_sleeps--; } catch(InterruptedException ex) { Thread.currentThread().interrupt(); } } } } 
criu-3.6/test/others/app-emu/java/HelloWorld/run.sh000066400000000000000000000010521317335042600223110ustar00rootroot00000000000000#!/bin/bash source ../../../functions.sh || exit 1 source ../../../env.sh || exit 1 cleanup_class() { rm -f ./*.class } javac HelloWorld.java || exit 1 set -x rm -rf dump mkdir dump setsid java HelloWorld & pid=${!} echo Lanuched java application with pid $pid in background ${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { echo "Dump failed" exit 1 } wait_tasks dump echo "Dumped, restoring and waiting for completion" ${criu} restore -D dump -o restore.log -v4 --shell-job || { echo "Restore failed" exit 1 } echo PASS criu-3.6/test/others/app-emu/job/000077500000000000000000000000001317335042600167315ustar00rootroot00000000000000criu-3.6/test/others/app-emu/job/Makefile000066400000000000000000000001621317335042600203700ustar00rootroot00000000000000all: job .PHONY: all %.o: %.c gcc -c $< -o $@ job: job.o gcc -o $@ job.o clean: rm -f *.o job .PHONY: clean criu-3.6/test/others/app-emu/job/job.c000066400000000000000000000032271317335042600176530ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include static int stop = 0; void sighandler(int sig) { stop = 1; } int main(int argc, char *argv[]) { int pid, gid, sid; int tty_sid, tty_gid; int fd = fileno(stdout); char buf[32]; struct dirent *de; DIR *fd_dir; sigset_t bmask, cmask; if (signal(SIGTERM, sighandler)) { printf("Unable to set a signal handler: %m\n"); return 1; } if (!isatty(fd)) { printf("stdout is not tty\n"); return -1; } pid = getpid(); gid = getgid(); sid = getsid(pid); printf("pid %d gid %d sid %d\n", pid, gid, sid); snprintf(buf, sizeof(buf), "/proc/%d/fd", pid); fd_dir = opendir(buf); if (!fd_dir) { printf("cant open %s\n", buf); return -1; } while ((de = readdir(fd_dir))) { int _fd; if (!strcmp(de->d_name, ".")) continue; if (!strcmp(de->d_name, "..")) continue; _fd = atoi(de->d_name); if (_fd 
> 2 && _fd != fd && isatty(_fd)) { close(_fd); printf("Closed %d\n", _fd); } } closedir(fd_dir); if (ioctl(fd, TIOCGSID, &tty_sid) < 0) { printf("cant obtain sid on stdout\n"); return -1; } printf("stdout sid = %d\n", tty_sid); if (ioctl(fd, TIOCGPGRP, &tty_gid) < 0) { printf("cant obtain gid on stdout\n"); return -1; } printf("stdout gid = %d\n", tty_gid); sigemptyset(&cmask); sigemptyset(&bmask); sigaddset(&bmask, SIGTERM); sigprocmask(SIG_SETMASK, &bmask, NULL); printf("READY\n"); while (!stop) sigsuspend(&cmask); if (getsid(pid) == sid) printf("ALIVE\n"); return 0; } criu-3.6/test/others/app-emu/job/job.exp000077500000000000000000000017111317335042600202240ustar00rootroot00000000000000#!/usr/bin/expect source ../../env.sh || exit 1 exec rm -rf ./dump exec mkdir ./dump system echo "-1" > ./dump/pid.pid set current [fork] switch $current { -1 { puts "Fork failed." exit -1 } 0 { set timeout 5 spawn ./job set pid [exp_pid] expect "READY" { puts "READY" } timeout { puts "FAIL: Timed out on ready" exit -1 } system $criu dump -v4 -D ./dump -o dump.log -j -t $pid system echo "$pid" > ./dump/pid.pid exit 0 } default { sleep 2 set timeout 5 set ::pidfile [open ./dump/pid.pid r] set pid [gets $::pidfile] if {$pid == -1} { puts "FAIL: Invalid pid read" exit -1 } spawn $criu restore -v4 -D ./dump -o restore.log -j # # spawn doesn't wait for restore to complete, so # add some sleep here. Still better would be to # rewrite this test completely. 
sleep 2 system kill -15 $pid expect "ALIVE" { puts "PASS" } timeout { puts "FAIL: Timed out" exit -1 } exit 0 } } criu-3.6/test/others/app-emu/job/run.sh000066400000000000000000000000411317335042600200640ustar00rootroot00000000000000#!/bin/sh exec expect ./job.exp criu-3.6/test/others/app-emu/lxc/000077500000000000000000000000001317335042600167455ustar00rootroot00000000000000criu-3.6/test/others/app-emu/lxc/network-script.sh000077500000000000000000000020151317335042600222750ustar00rootroot00000000000000#!/bin/bash [ -z "$CR_IP_TOOL" ] && CR_IP_TOOL=ip action=$1 shift [[ "network-unlock" == "$CRTOOLS_SCRIPT_ACTION" || "network-lock" == "$CRTOOLS_SCRIPT_ACTION" ]] || exit 0 set -o pipefail [ "$action" == dump ] && { pid=$1 name=$2 # Find a pair of CT's eth0 ifindex=`$CR_IP_TOOL netns exec $name ethtool -S eth0 | awk '/index/ { print $2}'` [ $? -eq 0 ] || exit 1 for i in /sys/devices/virtual/net/*; do [ "`cat $i/ifindex`" == $ifindex ] && { dst=`basename $i` break; } done [ -z "$dst" ] && exit 1 echo "$dst<=>eth0" [ "network-unlock" == "$CRTOOLS_SCRIPT_ACTION" ] && { echo Attach $dst to the bridge br0 brctl addif br0 $dst exit $? } [ "network-lock" == "$CRTOOLS_SCRIPT_ACTION" ] && { echo Detach $dst to the bridge br0 brctl delif br0 $dst exit $? } exit 0 } [ "$action" == restore ] && { [ "network-unlock" == "$CRTOOLS_SCRIPT_ACTION" ] && { ethname=$1 echo Attach $ethname to the bridge br0 ip link set up dev $ethname brctl addif br0 $ethname exit $? 
} } exit 0 criu-3.6/test/others/app-emu/lxc/run.sh000077500000000000000000000031201317335042600201040ustar00rootroot00000000000000#!/bin/bash source ../../env.sh || exit 1 [ -z "$CR_IP_TOOL" ] && CR_IP_TOOL=ip cd `dirname $0` name=$1 [ -z "$name" ] && { cat <(b))?A0((a)-(b), (b)):A0((b)-(a), (a)) #define A2(a, b) ((a)>(b))?A1((a)-(b), (b)):A1((b)-(a), (a)) #define A3(a, b) ((a)>(b))?A2((a)-(b), (b)):A2((b)-(a), (a)) #define A4(a, b) ((a)>(b))?A3((a)-(b), (b)):A3((b)-(a), (a)) #define A5(a, b) ((a)>(b))?A4((a)-(b), (b)):A4((b)-(a), (a)) #define A6(a, b) ((a)>(b))?A5((a)-(b), (b)):A5((b)-(a), (a)) #define A7(a, b) ((a)>(b))?A6((a)-(b), (b)):A6((b)-(a), (a)) #define A8(a, b) ((a)>(b))?A7((a)-(b), (b)):A7((b)-(a), (a)) #define A9(a, b) ((a)>(b))?A8((a)-(b), (b)):A8((b)-(a), (a)) #define A10(a, b) ((a)>(b))?A9((a)-(b), (b)):A9((b)-(a), (a)) #define A11(a, b) ((a)>(b))?A10((a)-(b), (b)):A10((b)-(a), (a)) return A10(a, b); } criu-3.6/test/others/app-emu/screen/000077500000000000000000000000001317335042600174365ustar00rootroot00000000000000criu-3.6/test/others/app-emu/screen/run.sh000066400000000000000000000007751317335042600206070ustar00rootroot00000000000000#!/bin/bash source ../../functions.sh || exit 1 source ../../env.sh || exit 1 set -x echo "Creating reference objects" screen -d -m -S criu-zdtm pid=$(screen -list | grep '\.*Detached' | sed 's/\s*\([0-9]*\).*/\1/'); echo PID=$pid mkdir dump ${criu} dump -D dump -o dump.log -v4 -t ${pid} || { echo "Dump failed" exit 1 } wait_tasks dump echo "Dumped, restoring and waiting for completion" ${criu} restore -d -D dump -o restore.log -v4 || { echo "Restore failed" exit 1 } echo PASS criu-3.6/test/others/app-emu/tarbz/000077500000000000000000000000001317335042600173015ustar00rootroot00000000000000criu-3.6/test/others/app-emu/tarbz/run.sh000066400000000000000000000022621317335042600204430ustar00rootroot00000000000000#!/bin/bash source ../../functions.sh || exit 1 source ../../env.sh || exit 1 DEPTH=3 SPAN=5 
archref="arch-ref.tar.bz2" archcr="arch.tar.bz2" rm -f ${archref} rm -f ${archcr} rm -rf tree/ rm -rf dump/ mkdir dump mkdir tree echo "Generating tree, depth ${DEPTH} span ${SPAN}" function gen_sub { local dir="${1}" local dep="${2}" for i in $(seq 1 $SPAN); do subdir="$dir/dir_$((RANDOM % 32))_$i" subfl="$dir/file_$((RANDOM % 32))_$i" mkdir "$subdir" dd if=/dev/urandom of=$subfl bs=4096 count=$((RANDOM % 32 + 16)) > /dev/null 2>&1 if [ $dep -gt 0 ]; then gen_sub "$subdir" $((dep - 1)) fi done } gen_sub "./tree/" "$DEPTH" set -x time tar cjf ${archref} tree || exit 1 setsid tar cjf ${archcr} tree & pid=${!} echo "Started tar in $pid background" sleep 3 ${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { echo "Dump failed" exit 1 } wait_tasks dump echo "Dump OK, restoring" ${criu} restore --shell-job -D dump -o restore.log -v4 || { echo "Restore failed" exit 1 } echo "Finished, comparing tarballs" if ! cmp ${archref} ${archcr} ; then echo "Archives differ" echo "FAIL" else echo "PASS" rm -f ${archref} rm -f ${archcr} rm -rf tree/ fi criu-3.6/test/others/app-emu/vnc/000077500000000000000000000000001317335042600167455ustar00rootroot00000000000000criu-3.6/test/others/app-emu/vnc/run.sh000077500000000000000000000007401317335042600201110ustar00rootroot00000000000000set -m source ../../functions.sh || exit 1 source ../../env.sh || exit 1 mkdir data ./vnc-server.sh 25 &> data/vnc.log pid=`jobs -p %1` bg $criu dump -j --tcp-established -D data/ -o dump.log -v4 -t $pid || { echo "Dump failed" exit 1 } wait_tasks dump $criu restore -j --tcp-established -D data/ -d -o restore.log -v4 || { echo "Restore failed" exit 1 } nc -w 1 localhost 5925 | grep -am 1 RFB ret=$? kill $pid [ "$ret" -eq 0 ] && echo PASS || echo FAIL; exit $ret criu-3.6/test/others/app-emu/vnc/vnc-server.sh000077500000000000000000000004361317335042600214010ustar00rootroot00000000000000#!/bin/bash #set -x set -m Xvnc :25 -v -geometry 500x500 -i 0.0.0.0 -SecurityTypes none & pid=$! 
trap "kill $pid; wait" EXIT for i in `seq 10`; do nc -w 1 localhost 5925 | grep -am 1 RFB && break || echo Waiting kill -0 $pid || exit 1 sleep 1 done kill -STOP $$ DISPLAY=:25 glxgears criu-3.6/test/others/bers/000077500000000000000000000000001317335042600155465ustar00rootroot00000000000000criu-3.6/test/others/bers/Makefile000066400000000000000000000013421317335042600172060ustar00rootroot00000000000000ifeq ($(strip $(V)),) E = @echo Q = @ else E = @\# Q = endif export E Q ASCIIDOC := asciidoc A2X := a2x XMLTO := xmlto SRC += bers.txt XMLS := $(patsubst %.txt,%.xml,$(SRC)) MANS := $(patsubst %.txt,%.8,$(SRC)) %.8: %.txt $(E) " GEN " $@ $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< $(Q) $(XMLTO) man --skip-validation $(patsubst %.8,%.xml,$@) 2>/dev/null docs: $(MANS) @true CFLAGS := -O0 -ggdb3 LIBS := -lpthread %.o: %.c $(E) " CC " $@ $(Q) $(CC) -c -o $@ $(CFLAGS) $^ bers: bers.o $(E) " LINK " $@ $(Q) $(CC) -o $@ $(CFLAGS) $(LIBS) $^ all: bers @true clean: $(E) " CLEAN " $(Q) rm -f $(XMLS) $(MANS) $(Q) rm -f bers.o $(Q) rm -f bers .PHONY: all docs clean criu-3.6/test/others/bers/bers.c000066400000000000000000000242071317335042600166520ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define min(x, y) ({ \ typeof(x) _min1 = (x); \ typeof(y) _min2 = (y); \ (void) (&_min1 == &_min2); \ _min1 < _min2 ? _min1 : _min2; }) #define max(x, y) ({ \ typeof(x) _max1 = (x); \ typeof(y) _max2 = (y); \ (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; }) #define MAX_CHUNK 4096 #define PAGE_SIZE 4096 #define pr_info(fmt, ...) \ printf("%8d: " fmt, sys_gettid(), ##__VA_ARGS__) #define pr_err(fmt, ...) \ printf("%8d: Error (%s:%d): " fmt, sys_gettid(),\ __FILE__, __LINE__, ##__VA_ARGS__) #define pr_perror(fmt, ...) \ pr_err(fmt ": %m\n", ##__VA_ARGS__) #define pr_msg(fmt, ...) 
\ printf(fmt, ##__VA_ARGS__) #define pr_trace(fmt, ...) \ printf("%8d: %s: " fmt, sys_gettid(), __func__, \ ##__VA_ARGS__) enum { MEM_FILL_MODE_NONE = 0, MEM_FILL_MODE_ALL = 1, MEM_FILL_MODE_LIGHT = 2, MEM_FILL_MODE_DIRTIFY = 3, }; typedef struct { pthread_mutex_t mutex; pthread_mutexattr_t mutex_attr; size_t opt_tasks; size_t opt_files; size_t opt_file_size; int prev_fd[MAX_CHUNK]; size_t opt_mem; size_t opt_mem_chunks; size_t opt_mem_chunk_size; int opt_mem_fill_mode; int opt_mem_cycle_mode; unsigned int opt_refresh_time; char *opt_work_dir; int work_dir_fd; DIR *work_dir; pid_t err_pid; int err_no; unsigned long prev_map[MAX_CHUNK]; } shared_data_t; static shared_data_t *shared; static int sys_gettid(void) { return syscall(__NR_gettid); } static void dirtify_memory(unsigned long *chunks, size_t nr_chunks, size_t chunk_size, int mode, const size_t nr_pages) { void *page; size_t i; pr_trace("filling memory\n"); switch (mode) { case MEM_FILL_MODE_LIGHT: *((unsigned long *)chunks[0]) = -1ul; break; case MEM_FILL_MODE_ALL: for (i = 0; i < nr_chunks; i++) memset((void *)chunks[i], (char)i, chunk_size); break; case MEM_FILL_MODE_DIRTIFY: for (i = 0; i < nr_chunks; i++) *((unsigned long *)chunks[i]) = -1ul; break; } } static void dirtify_files(int *fd, size_t nr_files, size_t size) { size_t buf[8192]; size_t i, j, c; /* * Note we don't write any _sane_ data here, the only * important thing is I/O activity by self. 
*/ for (i = 0; i < nr_files; i++) { size_t c = min(size, sizeof(buf)); size_t left = size; while (left > 0) { write(fd[i], buf, c); left -= c; c = min(left, sizeof(buf)); } } } static int create_files(shared_data_t *shared, int *fd, size_t nr_files) { char path[PATH_MAX]; size_t i; memset(fd, 0xff, sizeof(fd)); pr_info("\tCreating %lu files\n", shared->opt_files); for (i = 0; i < shared->opt_files; i++) { if (shared->prev_fd[i] != -1) { close(shared->prev_fd[i]); shared->prev_fd[i] = -1; } snprintf(path, sizeof(path), "%08d-%04d-temp", sys_gettid(), i); fd[i] = openat(shared->work_dir_fd, path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd[i] < 0) { pr_perror("Can't open %s/%s", shared->opt_work_dir, path); shared->err_pid = sys_gettid(); shared->err_no = -errno; return -1; } shared->prev_fd[i] = fd[i]; } return 0; } static void work_on_fork(shared_data_t *shared) { const size_t nr_pages = shared->opt_mem_chunk_size / PAGE_SIZE; unsigned long chunks[MAX_CHUNK] = { }; int fd[MAX_CHUNK]; size_t i; void *mem; pr_trace("locking\n"); pthread_mutex_lock(&shared->mutex); pr_trace("init\n"); pr_info("\tCreating %lu mmaps each %lu K\n", shared->opt_mem_chunks, shared->opt_mem_chunk_size >> 10); for (i = 0; i < shared->opt_mem_chunks; i++) { if (shared->prev_map[i]) { munmap((void *)shared->prev_map[i], shared->opt_mem_chunk_size); shared->prev_map[i] = 0; } /* If we won't change proto here, the kernel might merge close areas */ mem = mmap(NULL, shared->opt_mem_chunk_size, PROT_READ | PROT_WRITE | ((i % 2) ? 
PROT_EXEC : 0), MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (mem != (void *)MAP_FAILED) { shared->prev_map[i] = (unsigned long)mem; chunks[i] = (unsigned long)mem; pr_info("\t\tMap at %lx\n",(unsigned long)mem); } else { pr_info("\t\tCan't map\n"); shared->err_pid = sys_gettid(); shared->err_no = -errno; exit(1); } } if (shared->opt_mem_fill_mode) dirtify_memory(chunks, shared->opt_mem_chunks, shared->opt_mem_chunk_size, shared->opt_mem_fill_mode, nr_pages); if (create_files(shared, fd, shared->opt_files)) exit(1); if (shared->opt_file_size) dirtify_files(fd, shared->opt_files, shared->opt_file_size); pr_trace("releasing\n"); pthread_mutex_unlock(&shared->mutex); while (1) { sleep(shared->opt_refresh_time); if (shared->opt_mem_cycle_mode) dirtify_memory(chunks, shared->opt_mem_chunks, shared->opt_mem_chunk_size, shared->opt_mem_cycle_mode, nr_pages); if (shared->opt_file_size) dirtify_files(fd, shared->opt_files, shared->opt_file_size); } } static int parse_mem_mode(int *mode, char *opt) { if (!strcmp(opt, "all")) { *mode = MEM_FILL_MODE_ALL; } else if (!strcmp(opt, "light")) { *mode = MEM_FILL_MODE_LIGHT; } else if (!strcmp(opt, "dirtify")) { *mode = MEM_FILL_MODE_DIRTIFY; } else { pr_err("Unrecognized option %s\n", opt); return -1; } return 0; } int main(int argc, char *argv[]) { /* a - 97, z - 122, A - 65, 90 */ static const char short_opts[] = "t:d:f:m:c:h"; static struct option long_opts[] = { {"tasks", required_argument, 0, 't'}, {"dir", required_argument, 0, 'd'}, {"files", required_argument, 0, 'f'}, {"memory", required_argument, 0, 'm'}, {"mem-chunks", required_argument, 0, 'c'}, {"help", no_argument, 0, 'h'}, {"mem-fill", required_argument, 0, 10}, {"mem-cycle", required_argument, 0, 11}, {"refresh", required_argument, 0, 12}, {"file-size", required_argument, 0, 13}, { }, }; char workdir[PATH_MAX]; int opt, idx, pidfd; char pidbuf[32]; int status; pid_t pid; size_t i; shared = (void *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | 
MAP_SHARED, -1, 0); if ((void *)shared == MAP_FAILED) { pr_err("Failed to setup shared data\n"); exit(1); } pthread_mutexattr_init(&shared->mutex_attr); pthread_mutexattr_setpshared(&shared->mutex_attr, PTHREAD_PROCESS_SHARED); pthread_mutex_init(&shared->mutex, &shared->mutex_attr); /* * Default options. */ shared->opt_mem_chunks = 1; shared->opt_refresh_time = 1; shared->opt_tasks = 1; shared->opt_mem = 1 << 20ul; memset(shared->prev_fd, 0xff, sizeof(shared->prev_fd)); while (1) { idx = -1; opt = getopt_long(argc, argv, short_opts, long_opts, &idx); if (opt == -1) break; switch(opt) { case 't': shared->opt_tasks = (size_t)atol(optarg); break; case 'f': shared->opt_files = (size_t)atol(optarg); break; case 'm': /* In megabytes */ shared->opt_mem = (size_t)atol(optarg) << 20ul; break; case 'c': shared->opt_mem_chunks = (size_t)atol(optarg); break; case 'd': shared->opt_work_dir = optarg; break; case 'h': goto usage; break; case 10: if (parse_mem_mode(&shared->opt_mem_fill_mode, optarg)) goto usage; case 11: if (parse_mem_mode(&shared->opt_mem_cycle_mode, optarg)) goto usage; break; case 12: shared->opt_refresh_time = (unsigned int)atoi(optarg); break; case 13: shared->opt_file_size = (size_t)atol(optarg); } } if (!shared->opt_work_dir) { shared->opt_work_dir = getcwd(workdir, sizeof(workdir)); if (!shared->opt_work_dir) { pr_perror("Can't fetch current working dir"); exit(1); } shared->opt_work_dir = workdir; } if (shared->opt_mem_chunks > MAX_CHUNK) shared->opt_mem_chunks = MAX_CHUNK; if (shared->opt_files > MAX_CHUNK) shared->opt_files = MAX_CHUNK; shared->work_dir = opendir(shared->opt_work_dir); if (!shared->work_dir) { pr_perror("Can't open working dir `%s'", shared->opt_work_dir); exit(1); } shared->work_dir_fd = dirfd(shared->work_dir); shared->opt_mem_chunk_size = shared->opt_mem / shared->opt_mem_chunks; if (shared->opt_mem_chunk_size && shared->opt_mem_chunk_size < PAGE_SIZE) { pr_err("Memory chunk size is too small, provide at least %lu M of memory\n", 
(shared->opt_mem_chunks * PAGE_SIZE) >> 20ul); exit(1); } for (i = 0; i < shared->opt_tasks; i++) { if (shared->err_no) goto err_child; pid = fork(); if (pid < 0) { printf("Can't create fork: %m\n"); exit(1); } else if (pid == 0) { work_on_fork(shared); } } /* * Once everything is done and we're in cycle, * create pidfile and go to sleep... */ pid = sys_gettid(); pidfd = openat(shared->work_dir_fd, "bers.pid", O_RDWR | O_CREAT | O_TRUNC, 0666); if (pidfd < 0) { pr_perror("Can't open pidfile"); exit(1); } snprintf(pidbuf, sizeof(pidbuf), "%d", sys_gettid()); write(pidfd, pidbuf, strlen(pidbuf)); close(pidfd); pidfd = -1; /* * Endless! */ while (!shared->err_no) sleep(1); err_child: pr_err("Child %d exited with %d\n", shared->err_pid, shared->err_no); return shared->err_no; usage: pr_msg("bers [options]\n"); pr_msg(" -t|--tasks create of tasks\n"); pr_msg(" -d|--dir use directory for temporary files\n"); pr_msg(" -f|--files create files for each task\n"); pr_msg(" -m|--memory allocate megabytes for each task\n"); pr_msg(" --memory-chunks split memory to equal parts\n"); pr_msg(" --mem-fill fill memory with data dependin on :\n"); pr_msg(" all fill every byte of memory\n"); pr_msg(" light fill first bytes of every page\n"); pr_msg(" dirtify fill every page\n"); pr_msg(" --mem-cycle same as --mem-fill but for cycling\n"); pr_msg(" --refresh refresh loading of every task each \n"); pr_msg(" --file-size write of data into each file on every refresh cycle\n"); return 1; } criu-3.6/test/others/bers/bers.txt000066400000000000000000000036001317335042600172410ustar00rootroot00000000000000bers(8) ======= :doctype: manpage :man source: bers :man version: 0.0.1 :man manual: bers manual NAME ---- bers - go berserk and eat computer resources SYNOPSIS -------- *bers* ['options'] DESCRIPTION ----------- *bers* is a command line utility aimed to eat resources of the computer it runs on. 
Idea behind is to create a number of tasks which would trash computer resources eating cpu and i/o time. OPTIONS ------- *-t*, *--tasks* 'num':: Create 'num' number of forks. *-d*, *--dir* 'dir':: Path to 'dir' directory where temporary files will be created to load I/O subsystem. *-f*, *--files* 'num':: Create 'num' files in each task. *-m*, *--memory* 'num':: Allocate 'num' megabytes of memory for every task. *--mem-chunks* 'num':: Allocate memory for each task not as one slab but split it into 'num' equal parts. *--mem-fill* 'mode':: Touch (write) into allocated memory once task is created. The 'mode' might be one of the following: 'all' -- write every single byte of the memory, 'light' -- write into first bytes of first page of the allocated memory chunk, 'dirtify' -- write into every page of every allocated chunk. *--mem-cycle* 'mode':: Same as *--mem-fill*, but 'mode' taken into account while task is cycling. By default each cycle initiated per one second. *--refresh* 'second':: Refresh load state of every task each 'second'. By refsresh here means to dirtify memory and file contents. *--file-size* 'bytes':: Write 'bytes' of data into each file on every refresh cycle. EXAMPLE ------- bers -d test/bers/dump -t 256 -m 54 -c 4 -f 200 --mem-fill dirtify --mem-cycle dirtify We generate 256 tasks wit each allocating 54 megabytes of memory splitted equally into 4 memory areas. Each task opens 200 files. On creation and cycling we touch every page of every memory area. AUTHOR ------ OpenVZ team. COPYRIGHT --------- Copyright \(C) 2014, Parallels Inc. 
criu-3.6/test/others/crit/000077500000000000000000000000001317335042600155545ustar00rootroot00000000000000criu-3.6/test/others/crit/.gitignore000066400000000000000000000000411317335042600175370ustar00rootroot00000000000000*.img *.log *.txt stats-* *.json criu-3.6/test/others/crit/Makefile000066400000000000000000000001061317335042600172110ustar00rootroot00000000000000run: clean ./test.sh clean: rm -f *.img *.log *.txt stats-* *.json criu-3.6/test/others/crit/loop.sh000077500000000000000000000000461317335042600170640ustar00rootroot00000000000000#!/bin/bash while :; do sleep 1 done criu-3.6/test/others/crit/test.sh000077500000000000000000000014021317335042600170670ustar00rootroot00000000000000source ../env.sh images_list="" function _exit { if [ $? -ne 0 ]; then echo "FAIL" exit 1 fi } function gen_imgs { setsid ./loop.sh < /dev/null &> /dev/null & PID=$! $CRIU dump -v4 -o dump.log -D ./ -t $PID if [ $? -ne 0 ]; then kill -9 $PID _exit 1 fi images_list=$(ls -1 *.img) if [ -z "$images_list" ]; then echo "Failed to generate images" _exit 1 fi } function run_test { for x in $images_list do echo "=== $x" if [[ $x == pages* ]]; then echo "skip" continue fi echo " -- to json" $CRIT decode -o "$x"".json" --pretty < $x || _exit $? echo " -- to img" $CRIT encode -i "$x"".json" > "$x"".json.img" || _exit $? echo " -- cmp" cmp $x "$x"".json.img" || _exit $? 
echo "=== done" done } gen_imgs run_test criu-3.6/test/others/criu-coredump/000077500000000000000000000000001317335042600173715ustar00rootroot00000000000000criu-3.6/test/others/criu-coredump/.gitignore000066400000000000000000000000501317335042600213540ustar00rootroot00000000000000*.img *.log *.txt stats-* *.json core.* criu-3.6/test/others/criu-coredump/Makefile000066400000000000000000000000721317335042600210300ustar00rootroot00000000000000run: clean ./test.sh clean: rm -f *.img stats-* core.* criu-3.6/test/others/criu-coredump/loop.sh000077500000000000000000000000461317335042600207010ustar00rootroot00000000000000#!/bin/bash while :; do sleep 1 done criu-3.6/test/others/criu-coredump/test.sh000077500000000000000000000013451317335042600207120ustar00rootroot00000000000000source ../env.sh function _exit { if [ $? -ne 0 ]; then echo "FAIL" exit 1 fi } function gen_imgs { setsid ./loop.sh < /dev/null &> /dev/null & PID=$! $CRIU dump -v4 -o dump.log -D ./ -t $PID if [ $? -ne 0 ]; then kill -9 $PID _exit 1 fi images_list=$(ls -1 *.img) if [ -z "$images_list" ]; then echo "Failed to generate images" _exit 1 fi } function run_test { echo "= Test core dump" echo "=== img to core dump" $CRIU_COREDUMP -i ./ -o ./ || _exit $? echo "=== done" cores=$(ls -1 core.*) if [ -z "$cores" ]; then echo "Failed to generate coredumps" _exit 1 fi for x in $cores do echo "=== try readelf $x" readelf -a $x || _exit $? 
echo "=== done" done echo "= done" } gen_imgs run_test criu-3.6/test/others/env.sh000077500000000000000000000004311317335042600157400ustar00rootroot00000000000000#!/bin/sh CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) criu=$CRIU CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit) crit=$CRIT CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu-coredump/criu-coredump) criu_coredump=$CRIU_COREDUMP criu-3.6/test/others/exec/000077500000000000000000000000001317335042600155375ustar00rootroot00000000000000criu-3.6/test/others/exec/Makefile000066400000000000000000000000171317335042600171750ustar00rootroot00000000000000run: ./run.sh criu-3.6/test/others/exec/run.sh000077500000000000000000000004301317335042600166770ustar00rootroot00000000000000#!/bin/bash CRIU=../../../criu/criu set -e -m -x cat < /dev/zero > /dev/null & pid=$! sleep 1 lsof -p $pid $CRIU exec -t $pid fake_syscall && exit 1 || true fd=`$CRIU exec -t $pid open '&/dev/null' 0 | sed 's/.*(\(.*\))/\1/'` $CRIU exec -t $pid dup2 $fd 0 wait $pid echo PASS criu-3.6/test/others/ext-links/000077500000000000000000000000001317335042600165315ustar00rootroot00000000000000criu-3.6/test/others/ext-links/Makefile000066400000000000000000000002121317335042600201640ustar00rootroot00000000000000all: mvlink.so mvlink.so: mvlink.c gcc -g -Werror -Wall -shared -nostartfiles mvlink.c -o mvlink.so -iquote ../../../criu/include -fPIC criu-3.6/test/others/ext-links/addmv.sh000077500000000000000000000002631317335042600201640ustar00rootroot00000000000000#!/bin/bash # $1 -- link name # $2 -- file with namespace pid if [ "$CRTOOLS_SCRIPT_ACTION" == "setup-namespaces" ]; then $(dirname $0)/addmv_raw.sh $1 $(cat $2) else exit 0 fi criu-3.6/test/others/ext-links/addmv_raw.sh000077500000000000000000000002251317335042600210330ustar00rootroot00000000000000#!/bin/bash # $1 -- link name # $2 -- pid of task in namespace set -x $ip link add link eth0 name $1 type macvlan || exit 1 $ip link set $1 netns $2 
criu-3.6/test/others/ext-links/mvlink.c000066400000000000000000000010141317335042600201710ustar00rootroot00000000000000#include #include #include #include #include #include #include "criu-plugin.h" #include "criu-log.h" extern cr_plugin_init_t cr_plugin_init; extern cr_plugin_dump_ext_link_t cr_plugin_dump_ext_link; int cr_plugin_init(void) { pr_info("Initialized macvlan dumper\n"); return 0; } int cr_plugin_dump_ext_link(int index, int type, char *kind) { if (strcmp(kind, "macvlan")) return -ENOTSUP; else { pr_info("Dump %d macvlan\n", index); return 0; } } criu-3.6/test/others/ext-links/run.sh000077500000000000000000000022111317335042600176700ustar00rootroot00000000000000#!/bin/bash ip=${CR_IP_TOOL:-ip} mvln="mv0" finf="finish" outf="ns_output" pidf="ns_pid" criu="../../../criu/criu" export ip export mvln export finf export outf export pidf function fail { $ip link del $mvln touch $finf echo $@ exit 1 } # Build the mvlink plugin make set -x rm -f "$finf" "$outf" "$pidf" rm -rf "dump" # Unshare netns. The run_ns will exit once ns is spawned. unshare --net ./run_ns.sh nspid=$(cat $pidf) ps $nspid # Create and push macvlan device into it. CRIU doesn't support # macvlans treating them as external devices. ./addmv_raw.sh $mvln $nspid || fail "Can't setup namespace" # Dump sleep 1 mkdir dump $criu dump -t $nspid -D dump/ -o dump.log -v4 --lib $(pwd) || fail "Can't dump namespace" # Restore # Ask for the pid (shouldn't change, so just as an example), ask to call # script that will put macvlan device back into namespace sleep 1 rm -f $pidf $criu restore -D dump/ -o restore.log -v4 --pidfile $(pwd)/$pidf --action-script "$(pwd)/addmv.sh $mvln $(pwd)/$pidf" -d || fail "Can't restore namespaces" # Finish and check results touch $finf set +x while ! 
egrep 'PASS|FAIL' $outf; do echo "Waiting" sleep 1 done criu-3.6/test/others/ext-links/run_ns.sh000077500000000000000000000003741317335042600204000ustar00rootroot00000000000000#!/bin/bash set -x echo "NS: $$" >> $outf echo "Links before:" >> $outf $ip link list >> $outf 2>&1 # Detach from session, terminal and parent setsid ./run_wait.sh < /dev/null >> $outf 2>&1 & # Keep pid for future reference :) echo "$!" > $pidf exit 0 criu-3.6/test/others/ext-links/run_wait.sh000077500000000000000000000004421317335042600207200ustar00rootroot00000000000000#!/bin/bash echo "Wait: $$" while [ ! -e "$finf" ]; do echo "WAIT ($$)" sleep 1; done echo "Links after:" $ip link list # The mvln device (exported from run.sh) should exits in # namespace after we get restored echo "Check for $mvln:" $ip link list $mvln && echo "PASS" || echo "FAIL" criu-3.6/test/others/ext-tty/000077500000000000000000000000001317335042600162315ustar00rootroot00000000000000criu-3.6/test/others/ext-tty/run.py000077500000000000000000000017661317335042600174240ustar00rootroot00000000000000#!/usr/bin/env python2 import subprocess import os, sys, time, signal, pty master, slave = pty.openpty() p = subprocess.Popen(["setsid", "--ctty", "sleep", "10000"], stdin = slave, stdout = slave, stderr = slave, close_fds = True) st = os.stat("/proc/self/fd/%d" % slave) ttyid = "tty[%x:%x]" % (st.st_rdev, st.st_dev) os.close(slave) time.sleep(1) ret = subprocess.Popen(["../../../criu/criu", "dump", "-t", str(p.pid), "-v4", "--external", ttyid]).wait() if ret: sys.exit(ret) p.wait() new_master, slave = pty.openpty() # get another pty pair os.close(master) ttyid = "fd[%d]:tty[%x:%x]" % (slave, st.st_rdev, st.st_dev) ret = subprocess.Popen(["../../../criu/criu", "restore", "-v4", "--inherit-fd", ttyid, "--restore-sibling", "--restore-detach"]).wait() if ret: sys.exit(ret) os.close(slave) os.waitpid(-1, os.WNOHANG) # is the process alive os.close(new_master) _, status = os.wait() if not os.WIFSIGNALED(status) or 
os.WTERMSIG(status) != signal.SIGHUP: print status sys.exit(1) print "PASS" criu-3.6/test/others/functions.sh000066400000000000000000000004621317335042600171610ustar00rootroot00000000000000# Wait while tasks are dying, otherwise PIDs would be busy. function wait_tasks() { local dump=$1 local pid for i in $dump/core-*.img; do pid=`expr "$i" : '.*/core-\([0-9]*\).img'` while :; do kill -0 $pid > /dev/null 2>&1 || break; echo Waiting the process $pid sleep 0.1 done done } criu-3.6/test/others/libcriu/000077500000000000000000000000001317335042600162445ustar00rootroot00000000000000criu-3.6/test/others/libcriu/.gitignore000066400000000000000000000000721317335042600202330ustar00rootroot00000000000000test_errno test_iters test_notify test_self test_sub wdir criu-3.6/test/others/libcriu/Makefile000066400000000000000000000007171317335042600177110ustar00rootroot00000000000000TESTS += test_sub TESTS += test_self TESTS += test_notify TESTS += test_iters TESTS += test_errno all: $(TESTS) run: all ./run.sh define genb $(1): $(1).o lib.o gcc $$^ -L ../../../../criu/lib/c/ -L ../../../../criu/images/ -lcriu -o $$@ endef $(foreach t, $(TESTS), $(eval $(call genb, $(t)))) %.o: %.c gcc -c $^ -I../../../../criu/lib/c/ -I../../../../criu/images/ -o $@ -Werror clean: rm -rf $(TESTS) $(TESTS:%=%.o) lib.o .PHONY: clean .PHONY: all criu-3.6/test/others/libcriu/lib.c000066400000000000000000000017451317335042600171650ustar00rootroot00000000000000#include #include #include void what_err_ret_mean(int ret) { /* NOTE: errno is set by libcriu */ switch (ret) { case -EBADE: perror("RPC has returned fail"); break; case -ECONNREFUSED: perror("Unable to connect to CRIU"); break; case -ECOMM: perror("Unable to send/recv msg to/from CRIU"); break; case -EINVAL: perror("CRIU doesn't support this type of request." "You should probably update CRIU"); break; case -EBADMSG: perror("Unexpected response from CRIU." "You should probably update CRIU"); break; default: perror("Unknown error type code." 
"You should probably update CRIU"); } } int chk_exit(int status, int want) { if (WIFEXITED(status)) { if (WEXITSTATUS(status) == want) return 0; printf(" `- FAIL (exit %d)\n", WEXITSTATUS(status)); } else if (WIFSIGNALED(status)) printf(" `- FAIL (die %d)\n", WTERMSIG(status)); else printf(" `- FAIL (%#x)\n", status); return 1; } criu-3.6/test/others/libcriu/lib.h000066400000000000000000000001051317335042600171570ustar00rootroot00000000000000void what_err_ret_mean(int ret); int chk_exit(int status, int want); criu-3.6/test/others/libcriu/run.sh000077500000000000000000000014071317335042600174110ustar00rootroot00000000000000#!/bin/bash set -x source ../env.sh || exit 1 echo "== Clean" make clean rm -rf wdir rm -f ./libcriu.so.1 echo "== Prepare" mkdir -p wdir/i/ echo "== Run tests" ln -s ../../../../criu/lib/c/libcriu.so libcriu.so.1 export LD_LIBRARY_PATH=. export PATH="`dirname ${BASH_SOURCE[0]}`/../../:$PATH" RESULT=0 function run_test { echo "== Build $1" if ! make $1; then echo "FAIL build $1" RESULT=1; else echo "== Test $1" mkdir wdir/i/$1/ if ! 
setsid ./$1 ${CRIU} wdir/i/$1/ < /dev/null &>> wdir/i/$1/test.log; then echo "$1: FAIL" RESULT=1 fi fi } run_test test_sub run_test test_self run_test test_notify run_test test_iters run_test test_errno echo "== Tests done" unlink libcriu.so.1 [ $RESULT -eq 0 ] && echo "Success" || echo "FAIL" exit $RESULT criu-3.6/test/others/libcriu/test_errno.c000066400000000000000000000046501317335042600206010ustar00rootroot00000000000000#include "criu.h" #include #include #include #include #include #include #include #include #define PID_MAX "/proc/sys/kernel/pid_max" static int dir_fd; static char *service; static int init(char *argv[]) { service = argv[1]; dir_fd = open(argv[2], O_DIRECTORY); if (dir_fd < 0) { perror("Can't open images dir"); return -1; } return 0; } static void get_base_req(void) { criu_init_opts(); criu_set_service_binary(service); criu_set_images_dir_fd(dir_fd); criu_set_log_level(4); } static int check_resp(int ret, int expected_ret, int err, int expected_err) { if (ret != expected_ret) { fprintf(stderr, "Unexpected ret %d (%d expected)\n", ret, expected_ret); return -1; } if (err != expected_err) { fprintf(stderr, "Unexpected errno %d (%d expected)\n", err, expected_err); return -1; } return 0; } static int no_process(void) { FILE *f = NULL; size_t len; ssize_t count; char *buf = NULL; int pid, fd, ret; printf("--- Try to dump unexisting process\n"); f = fopen(PID_MAX, "r"); if (!f) { perror("Can't open " PID_MAX); goto err; } count = getline(&buf, &len, f); if (count == -1) { perror("Can't read " PID_MAX); goto err; } pid = atoi(buf); if (!kill(pid, 0)) { fprintf(stderr, "max pid is taken\n"); goto err; } get_base_req(); criu_set_pid(pid); ret = criu_dump(); if (check_resp(ret, -EBADE, errno, ESRCH)) goto err; printf(" `- Success\n"); return 0; err: if (f) fclose(f); return -1; } static int process_exists(void) { int ret; printf("--- Try to restore process which pid is already taken by other process\n"); get_base_req(); criu_set_leave_running(true); if 
(criu_dump()) { fprintf(stderr, "Self-dump failed"); goto err; } get_base_req(); ret = criu_restore(); if (check_resp(ret, -EBADE, errno, EEXIST)) goto err; printf(" `- Success\n"); return 0; err: return -1; } static int bad_options(void) { int ret; printf("--- Try to send criu invalid opts\n"); get_base_req(); criu_set_log_file("../file.log"); ret = criu_dump(); if (check_resp(ret, -EBADE, errno, EBADRQC)) goto err; printf(" `- Success\n"); return 0; err: return -1; } int main(int argc, char *argv[]) { int ret = 1; if (init(argv)) goto out; if (no_process() || process_exists() || bad_options()) goto out; ret = 0; out: if (dir_fd) close(dir_fd); return ret; } criu-3.6/test/others/libcriu/test_iters.c000066400000000000000000000045711317335042600206040ustar00rootroot00000000000000#include "criu.h" #include #include #include #include #include #include #include #include #include #include "lib.h" static int wdir_fd, cur_iter = 1, cur_imgdir = -1; static int stop = 0; static void sh(int sig) { stop = 1; } static int open_imgdir(void) { char p[10]; sprintf(p, "%d", cur_iter); mkdirat(wdir_fd, p, 0700); cur_imgdir = openat(wdir_fd, p, O_DIRECTORY); criu_set_images_dir_fd(cur_imgdir); } #define MAX_ITERS 2 static int next_iter(criu_predump_info pi) { char p[10]; printf(" `- %d iter over\n", cur_iter); close(cur_imgdir); sprintf(p, "../%d", cur_iter); criu_set_parent_images(p); cur_iter++; open_imgdir(); return cur_iter < MAX_ITERS; } #define SUCC_ECODE 42 int main(int argc, char **argv) { int pid, ret, p[2]; wdir_fd = open(argv[2], O_DIRECTORY); if (wdir_fd < 0) { perror("Can't open wdir"); return 1; } printf("--- Start loop ---\n"); pipe(p); pid = fork(); if (pid < 0) { perror("Can't"); return -1; } if (!pid) { printf(" `- loop: initializing\n"); if (setsid() < 0) exit(1); if (signal(SIGUSR1, sh) == SIG_ERR) exit(1); close(0); close(1); close(2); close(p[0]); ret = SUCC_ECODE; write(p[1], &ret, sizeof(ret)); close(p[1]); while (!stop) sleep(1); exit(SUCC_ECODE); } 
close(p[1]); /* Wait for kid to start */ ret = -1; read(p[0], &ret, sizeof(ret)); if (ret != SUCC_ECODE) { printf("Error starting loop\n"); goto err; } /* Wait for pipe to get closed, then dump */ read(p[0], &ret, 1); close(p[0]); printf("--- Dump loop ---\n"); criu_init_opts(); criu_set_service_binary(argv[1]); criu_set_pid(pid); criu_set_log_file("dump.log"); criu_set_log_level(4); open_imgdir(); ret = criu_dump_iters(next_iter); if (ret < 0) { what_err_ret_mean(ret); kill(pid, SIGKILL); goto err; } printf(" `- Dump succeeded\n"); waitpid(pid, NULL, 0); printf("--- Restore loop ---\n"); criu_init_opts(); criu_set_log_level(4); criu_set_log_file("restore.log"); criu_set_images_dir_fd(cur_imgdir); pid = criu_restore_child(); if (pid <= 0) { what_err_ret_mean(pid); return -1; } printf(" `- Restore returned pid %d\n", pid); kill(pid, SIGUSR1); err: if (waitpid(pid, &ret, 0) < 0) { perror(" Can't wait kid"); return -1; } return chk_exit(ret, SUCC_ECODE); } criu-3.6/test/others/libcriu/test_notify.c000066400000000000000000000031551317335042600207630ustar00rootroot00000000000000#include "criu.h" #include #include #include #include #include #include #include #include #include "lib.h" #define SUCC_ECODE 42 static int actions_called = 0; static int notify(char *action, criu_notify_arg_t na) { printf("ACTION: %s\n", action); actions_called++; return 0; } int main(int argc, char **argv) { int pid, ret, fd, p[2]; printf("--- Start loop ---\n"); pipe(p); pid = fork(); if (pid < 0) { perror("Can't"); return -1; } if (!pid) { printf(" `- loop: initializing\n"); if (setsid() < 0) exit(1); close(0); close(1); close(2); close(p[0]); ret = SUCC_ECODE; write(p[1], &ret, sizeof(ret)); close(p[1]); while (1) sleep(1); exit(SUCC_ECODE); } close(p[1]); /* Wait for kid to start */ ret = -1; read(p[0], &ret, sizeof(ret)); if (ret != SUCC_ECODE) { printf("Error starting loop\n"); goto err; } /* Wait for pipe to get closed, then dump */ read(p[0], &ret, 1); close(p[0]); printf("--- Dump loop 
---\n"); criu_init_opts(); criu_set_service_binary(argv[1]); criu_set_pid(pid); criu_set_log_file("dump.log"); criu_set_log_level(4); criu_set_notify_cb(notify); fd = open(argv[2], O_DIRECTORY); criu_set_images_dir_fd(fd); ret = criu_dump(); if (ret < 0) { what_err_ret_mean(ret); kill(pid, SIGKILL); goto err; } printf(" `- Dump succeeded\n"); ret = 0; err: waitpid(pid, NULL, 0); if (ret || !actions_called) { printf("FAIL (%d/%d)\n", ret, actions_called); return 1; } printf(" `- Success (%d actions)\n", actions_called); return 0; } criu-3.6/test/others/libcriu/test_self.c000066400000000000000000000030301317335042600203740ustar00rootroot00000000000000#include "criu.h" #include #include #include #include #include #include #include #include "lib.h" #define SUCC_DUMP_ECODE 41 #define SUCC_RSTR_ECODE 43 int main(int argc, char *argv[]) { int ret, fd, pid; fd = open(argv[2], O_DIRECTORY); if (fd < 0) { perror("Can't open images dir"); return 1; } criu_init_opts(); criu_set_service_binary(argv[1]); criu_set_images_dir_fd(fd); criu_set_log_level(4); printf("--- Start child ---\n"); pid = fork(); if (pid < 0) { perror("Can't"); return 1; } if (!pid) { /* * Child process -- dump itself, then * parent would restore us. 
*/ close(0); close(1); close(2); if (setsid() < 0) exit(1); criu_set_log_file("dump.log"); criu_set_leave_running(true); ret = criu_dump(); if (ret < 0) { what_err_ret_mean(ret); exit(1); } if (ret == 0) ret = SUCC_DUMP_ECODE; /* dumped OK */ else if (ret == 1) ret = SUCC_RSTR_ECODE; /* restored OK */ else ret = 1; exit(ret); } printf("--- Wait for self-dump ---\n"); if (waitpid(pid, &ret, 0) < 0) { perror("Can't wait child"); goto errk; } if (chk_exit(ret, SUCC_DUMP_ECODE)) goto errk; printf("--- Restore ---\n"); criu_set_log_file("restore.log"); pid = criu_restore_child(); if (pid <= 0) { what_err_ret_mean(pid); goto err; } if (waitpid(pid, &ret, 0) < 0) { perror("Can't wait rchild"); goto errk; } return chk_exit(ret, SUCC_RSTR_ECODE); errk: kill(pid, SIGKILL); err: return 1; } criu-3.6/test/others/libcriu/test_sub.c000066400000000000000000000034361317335042600202460ustar00rootroot00000000000000#include "criu.h" #include #include #include #include #include #include #include #include #include "lib.h" static int stop = 0; static void sh(int sig) { stop = 1; } #define SUCC_ECODE 42 int main(int argc, char **argv) { int pid, ret, fd, p[2]; printf("--- Start loop ---\n"); pipe(p); pid = fork(); if (pid < 0) { perror("Can't"); return -1; } if (!pid) { printf(" `- loop: initializing\n"); if (setsid() < 0) exit(1); if (signal(SIGUSR1, sh) == SIG_ERR) exit(1); close(0); close(1); close(2); close(p[0]); ret = SUCC_ECODE; write(p[1], &ret, sizeof(ret)); close(p[1]); while (!stop) sleep(1); exit(SUCC_ECODE); } close(p[1]); /* Wait for kid to start */ ret = -1; read(p[0], &ret, sizeof(ret)); if (ret != SUCC_ECODE) { printf("Error starting loop\n"); goto err; } /* Wait for pipe to get closed, then dump */ read(p[0], &ret, 1); close(p[0]); printf("--- Dump loop ---\n"); criu_init_opts(); criu_set_service_binary(argv[1]); criu_set_pid(pid); criu_set_log_file("dump.log"); criu_set_log_level(4); fd = open(argv[2], O_DIRECTORY); criu_set_images_dir_fd(fd); ret = criu_dump(); if 
(ret < 0) { what_err_ret_mean(ret); kill(pid, SIGKILL); goto err; } printf(" `- Dump succeeded\n"); waitpid(pid, NULL, 0); printf("--- Restore loop ---\n"); criu_init_opts(); criu_set_log_level(4); criu_set_log_file("restore.log"); criu_set_images_dir_fd(fd); pid = criu_restore_child(); if (pid <= 0) { what_err_ret_mean(pid); return -1; } printf(" `- Restore returned pid %d\n", pid); kill(pid, SIGUSR1); err: if (waitpid(pid, &ret, 0) < 0) { perror(" Can't wait kid"); return -1; } return chk_exit(ret, SUCC_ECODE); } criu-3.6/test/others/make/000077500000000000000000000000001317335042600155305ustar00rootroot00000000000000criu-3.6/test/others/make/Makefile000066400000000000000000000000771317335042600171740ustar00rootroot00000000000000# Tests for the build system run: ./uninstall.sh .PHONY: run criu-3.6/test/others/make/uninstall.sh000077500000000000000000000007041317335042600201010ustar00rootroot00000000000000#!/bin/sh # A test to make sure "make uninstall" works as intended. set -e SELFDIR=$(dirname $(readlink -f $0)) DESTDIR=$SELFDIR/test.install-$$ cd $SELFDIR/../../.. 
set -x make install DESTDIR=$DESTDIR make uninstall DESTDIR=$DESTDIR set +x # There should be no files left (directories are OK for now) if [ $(find $DESTDIR -type f | wc -l) -gt 0 ]; then echo "Files left after uninstall:" find $DESTDIR -type f echo "FAIL" exit 1 fi echo PASS criu-3.6/test/others/mem-snap/000077500000000000000000000000001317335042600163305ustar00rootroot00000000000000criu-3.6/test/others/mem-snap/Makefile000066400000000000000000000000171317335042600177660ustar00rootroot00000000000000run: ./run.sh criu-3.6/test/others/mem-snap/run-predump-2.sh000077500000000000000000000024301317335042600213030ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 function fail { echo "$@" exit 1 } set -x IMGDIR="dump/" rm -rf "$IMGDIR" mkdir "$IMGDIR" function launch_test { echo "Launching test" cd ../../zdtm/static/ make cleanout make maps04 make maps04.pid || fail "Can't start test" PID=$(cat maps04.pid) kill -0 $PID || fail "Test didn't start" cd - } function stop_test { wtime=1 cd ../../zdtm/static/ make maps04.stop cat maps04.out | fgrep PASS || fail "Test failed" echo "OK" } launch_test echo "Taking plain dump" mkdir "$IMGDIR/dump-1/" ${CRIU} dump -D "$IMGDIR/dump-1/" -o dump.log -t ${PID} -v4 || fail "Fail to dump" sleep 1 echo "Restore to check it works" ${CRIU} restore -D "${IMGDIR}/dump-1/" -o restore.log -d -v4 || fail "Fail to restore server" stop_test launch_test echo "Taking pre and plain dumps" echo "Pre-dump" mkdir "$IMGDIR/dump-2/" mkdir "$IMGDIR/dump-2/pre/" ${CRIU} pre-dump -D "$IMGDIR/dump-2/pre/" -o dump.log -t ${PID} -v4 || fail "Fail to pre-dump" echo "Plain dump" mkdir "$IMGDIR/dump-2/plain/" ${CRIU} dump -D "$IMGDIR/dump-2/plain/" -o dump.log -t ${PID} -v4 --prev-images-dir=../pre/ --track-mem || fail "Fail to dump" sleep 1 echo "Restore" ${CRIU} restore -D "${IMGDIR}/dump-2/plain/" -o restore.log -d -v4 || fail "Fail to restore server" stop_test 
criu-3.6/test/others/mem-snap/run-predump.sh000077500000000000000000000026561317335042600211560ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 USEPS=0 if [ "$1" = "-s" ]; then echo "Will test via page-server" USEPS=1 shift fi NRSNAP=${1:-3} SPAUSE=${2:-4} PORT=12345 function fail { echo "$@" exit 1 } set -x IMGDIR="dump/" rm -rf "$IMGDIR" mkdir "$IMGDIR" echo "Launching test" cd ../../zdtm/static/ make cleanout make mem-touch make mem-touch.pid || fail "Can't start test" PID=$(cat mem-touch.pid) kill -0 $PID || fail "Test didn't start" cd - echo "Making $NRSNAP pre-dumps" for SNAP in $(seq 1 $NRSNAP); do sleep $SPAUSE mkdir "$IMGDIR/$SNAP/" if [ $SNAP -eq 1 ] ; then # First pre-dump cmd="pre-dump" args="--track-mem -R" elif [ $SNAP -eq $NRSNAP ]; then # Last dump cmd="dump" args="--prev-images-dir=../$((SNAP - 1))/ --track-mem" else # Other pre-dumps cmd="pre-dump" args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" fi if [ $USEPS -eq 1 ]; then ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --port ${PORT} -v4 & PS_PID=$! 
ps_args="--page-server --address 127.0.0.1 --port=${PORT}" else ps_args="" fi ${CRIU} $cmd -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" if [ $USEPS -eq 1 ]; then wait $PS_PID fi done echo "Restoring" ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" cd ../../zdtm/static/ make mem-touch.stop cat mem-touch.out | fgrep PASS || fail "Test failed" echo "Test PASSED" criu-3.6/test/others/mem-snap/run-snap-auto-dedup.sh000077500000000000000000000040231317335042600224760ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 USEPS=0 if [ "$1" = "-s" ]; then echo "Will test via page-server" USEPS=1 shift fi NRSNAP=${1:-3} SPAUSE=${2:-4} PORT=12345 function fail { echo "$@" exit 1 } set -x IMGDIR="dump/" rm -rf "$IMGDIR" mkdir "$IMGDIR" echo "Launching test" cd ../../zdtm/static/ make cleanout make mem-touch make mem-touch.pid || fail "Can't start test" PID=$(cat mem-touch.pid) kill -0 $PID || fail "Test didn't start" cd - echo "Making $NRSNAP snapshots" for SNAP in $(seq 1 $NRSNAP); do sleep $SPAUSE mkdir "$IMGDIR/$SNAP/" if [ $SNAP -eq 1 ] ; then # First snapshot -- no parent, keep running args="--track-mem -R" elif [ $SNAP -eq $NRSNAP ]; then # Last snapshot -- has parent, kill afterwards size_first_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) size_first_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) args="--prev-images-dir=../$((SNAP - 1))/ --track-mem --auto-dedup" else # Other snapshots -- have parent, keep running args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" fi if [ $USEPS -eq 1 ]; then ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --auto-dedup --port ${PORT} -v4 & PS_PID=$! 
ps_args="--page-server --address 127.0.0.1 --port=${PORT}" else ps_args="" fi ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" if [ $USEPS -eq 1 ]; then wait $PS_PID fi done size_last_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) size_last_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) dedup_ok_2=1 if [ $size_first_2 -gt $size_last_2 ]; then dedup_ok_2=0 fi dedup_ok_1=1 if [ $size_first_1 -gt $size_last_1 ]; then dedup_ok_1=0 fi echo "Restoring" ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" cd ../../zdtm/static/ make mem-touch.stop cat mem-touch.out | fgrep PASS || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" fi echo "Test PASSED" criu-3.6/test/others/mem-snap/run-snap-dedup-on-restore.sh000077500000000000000000000036461317335042600236350ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 USEPS=0 if [ "$1" = "-s" ]; then echo "Will test via page-server" USEPS=1 shift fi NRSNAP=${1:-3} SPAUSE=${2:-4} PORT=12345 function fail { echo "$@" exit 1 } set -x IMGDIR="dump/" rm -rf "$IMGDIR" mkdir "$IMGDIR" echo "Launching test" cd ../../zdtm/static/ make cleanout make mem-touch make mem-touch.pid || fail "Can't start test" PID=$(cat mem-touch.pid) kill -0 $PID || fail "Test didn't start" cd - echo "Making $NRSNAP snapshots" for SNAP in $(seq 1 $NRSNAP); do sleep $SPAUSE mkdir "$IMGDIR/$SNAP/" if [ $SNAP -eq 1 ] ; then # First snapshot -- no parent, keep running args="--track-mem -R" elif [ $SNAP -eq $NRSNAP ]; then # Last snapshot -- has parent, kill afterwards args="--prev-images-dir=../$((SNAP - 1))/ --track-mem --auto-dedup" else # Other snapshots -- have parent, keep running args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R --auto-dedup" fi if [ $USEPS -eq 1 ]; then ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --auto-dedup --port ${PORT} -v4 & PS_PID=$! 
ps_args="--page-server --address 127.0.0.1 --port=${PORT}" else ps_args="" fi ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" if [ $USEPS -eq 1 ]; then wait $PS_PID fi done echo "Restoring" ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log --auto-dedup -d -v4 || fail "Fail to restore server" size_last3=$(du -sh -BK dump/3/pages-*.img | grep -Eo '[0-9]+' | head -1) size_last2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) size_last1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) restore_dedup_ok=0 if [[ $size_last1 -ne 0 || $size_last2 -ne 0 || $size_last3 -ne 0 ]]; then restore_dedup_ok=1 fi cd ../../zdtm/static/ make mem-touch.stop cat mem-touch.out | fgrep PASS || fail "Test failed" if [ $restore_dedup_ok -ne 0 ]; then fail "Dedup test failed" fi echo "Test PASSED" criu-3.6/test/others/mem-snap/run-snap-dedup.sh000077500000000000000000000040611317335042600215320ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 USEPS=0 if [ "$1" = "-s" ]; then echo "Will test via page-server" USEPS=1 shift fi NRSNAP=${1:-3} SPAUSE=${2:-4} PORT=12345 function fail { echo "$@" exit 1 } set -x IMGDIR="dump/" rm -rf "$IMGDIR" mkdir "$IMGDIR" echo "Launching test" cd ../../zdtm/static/ make cleanout make mem-touch make mem-touch.pid || fail "Can't start test" PID=$(cat mem-touch.pid) kill -0 $PID || fail "Test didn't start" cd - echo "Making $NRSNAP snapshots" for SNAP in $(seq 1 $NRSNAP); do sleep $SPAUSE mkdir "$IMGDIR/$SNAP/" if [ $SNAP -eq 1 ] ; then # First snapshot -- no parent, keep running args="--track-mem -R" elif [ $SNAP -eq $NRSNAP ]; then # Last snapshot -- has parent, kill afterwards args="--prev-images-dir=../$((SNAP - 1))/ --track-mem" else # Other snapshots -- have parent, keep running args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" fi if [ $USEPS -eq 1 ]; then ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --port ${PORT} -v4 & PS_PID=$! 
ps_args="--page-server --address 127.0.0.1 --port=${PORT}" else ps_args="" fi ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" if [ $USEPS -eq 1 ]; then wait $PS_PID fi done echo "Dedup test" size_first_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) size_first_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) ${CRIU} dedup -D "${IMGDIR}/$NRSNAP/" size_last_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) size_last_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) dedup_ok_2=1 dedup_ok_1=1 if [ $size_first_2 -gt $size_last_2 ]; then dedup_ok_2=0 fi if [ $size_first_1 -gt $size_last_1 ]; then dedup_ok_1=0 fi echo "Restoring" ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" cd ../../zdtm/static/ make mem-touch.stop cat mem-touch.out | fgrep PASS || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" fi echo "Test PASSED" criu-3.6/test/others/mem-snap/run-snap-maps04.sh000077500000000000000000000025561317335042600215440ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 USEPS=0 if [ "$1" = "-s" ]; then echo "Will test via page-server" USEPS=1 shift fi NRSNAP=1 SPAUSE=${2:-4} PORT=12345 function fail { echo "$@" exit 1 } set -x IMGDIR="dump" CURDIR=${pwd} if ! mount | fgrep "$CURDIR/$IMGDIR" ; then rm -rf "$IMGDIR" mkdir "$IMGDIR" mount -t tmpfs -o size=1500M,nr_inodes=10k,mode=700 tmpfs $IMGDIR fi rm -rf "$IMGDIR/*" echo "Launching test" make -C ../../zdtm/static/ cleanout make -C ../../zdtm/static/ maps04 make -C ../../zdtm/static/ maps04.pid || fail "Can't start test" PID=$(cat ../../zdtm/static/maps04.pid) kill -0 $PID || fail "Test haven't started" mkdir "$IMGDIR/$NRSNAP/" if [ $USEPS -eq 1 ] ; then ${CRIU} page-server -D "${IMGDIR}/$NRSNAP/" -o ps.log --port ${PORT} -d -v4 #& PS_PID=$! 
ps_args="--page-server --address 127.0.0.1 --port=${PORT}" else ps_args="" fi ${CRIU} dump -D "${IMGDIR}/$NRSNAP/" -o dump.log -t ${PID} -v4 $ps_args || fail "Fail to dump" if [ $USEPS -eq 1 ] ; then wait $PS_PID fi echo "Restoring" ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log --auto-dedup -d -v4 || fail "Fail to restore" make -C ../../zdtm/static/ maps04.stop sleep 1 cat "../zdtm/static/maps04.out" | fgrep PASS || fail "Test failed" size=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) if [ $size -ne 0 ] ; then fail "Size not null" fi echo "Test PASSED" criu-3.6/test/others/mem-snap/run-snap.sh000077500000000000000000000027321317335042600204360ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 USEPS=0 if [ "$1" = "-s" ]; then echo "Will test via page-server" USEPS=1 shift fi NRSNAP=${1:-3} SPAUSE=${2:-4} PORT=12345 function fail { echo "$@" exit 1 } set -x IMGDIR="dump/" rm -rf "$IMGDIR" mkdir "$IMGDIR" echo "Launching test" cd ../../zdtm/static/ make cleanout make mem-touch make mem-touch.pid || fail "Can't start test" PID=$(cat mem-touch.pid) kill -0 $PID || fail "Test didn't start" cd - echo "Making $NRSNAP snapshots" for SNAP in $(seq 1 $NRSNAP); do sleep $SPAUSE mkdir "$IMGDIR/$SNAP/" if [ $SNAP -eq 1 ] ; then # First snapshot -- no parent, keep running args="--track-mem -R" elif [ $SNAP -eq $NRSNAP ]; then # Last snapshot -- has parent, kill afterwards args="--prev-images-dir=../$((SNAP - 1))/ --track-mem" else # Other snapshots -- have parent, keep running args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" fi if [ $USEPS -eq 1 ]; then ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --port ${PORT} -v4 & PS_PID=$! 
ps_args="--page-server --address 127.0.0.1 --port=${PORT}" else ps_args="" fi ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" if [ $USEPS -eq 1 ]; then wait $PS_PID fi done echo "Restoring" ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" cd ../../zdtm/static/ make mem-touch.stop cat mem-touch.out | fgrep PASS || fail "Test failed" echo "Test PASSED" criu-3.6/test/others/mem-snap/run.sh000077500000000000000000000003561317335042600174770ustar00rootroot00000000000000#!/bin/bash # Don't execute tests, which use maps04, they are executed by zdtm set -e #./run-predump-2.sh ./run-predump.sh ./run-snap-auto-dedup.sh ./run-snap-dedup-on-restore.sh ./run-snap-dedup.sh #./run-snap-maps04.sh ./run-snap.sh criu-3.6/test/others/mnt-ext-dev/000077500000000000000000000000001317335042600167635ustar00rootroot00000000000000criu-3.6/test/others/mnt-ext-dev/Makefile000066400000000000000000000000171317335042600204210ustar00rootroot00000000000000run: ./run.sh criu-3.6/test/others/mnt-ext-dev/run.sh000077500000000000000000000006021317335042600201240ustar00rootroot00000000000000#!/bin/sh set -e -x # construct root python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop mkfs.ext4 -F zdtm.loop dev=`losetup --find --show zdtm.loop` mkdir -p ../../dev cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev python ../../zdtm.py run -t zdtm/static/mnt_ext_dev || ret=$? 
losetup -d $dev unlink zdtm.loop exit $ret criu-3.6/test/others/mounts/000077500000000000000000000000001317335042600161405ustar00rootroot00000000000000criu-3.6/test/others/mounts/ext/000077500000000000000000000000001317335042600167405ustar00rootroot00000000000000criu-3.6/test/others/mounts/ext/Makefile000066400000000000000000000004011317335042600203730ustar00rootroot00000000000000all: ext-mount.so ns_init ext-mount.so: ext-mount.c gcc -g -Werror -Wall -shared -nostartfiles ext-mount.c -o ext-mount.so -iquote ../../../include -fPIC ns_init: ns_init.o gcc -static $< -o $@ ns_init.o: ns_init.c gcc -c $< -o $@ run: all ./run.sh criu-3.6/test/others/mounts/ext/ext-mount.c000066400000000000000000000037331317335042600210520ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "criu-plugin.h" #include "criu-log.h" #define IMG_NAME "ext-mount-test-%d.img" extern cr_plugin_init_t cr_plugin_init; extern cr_plugin_dump_ext_mount_t cr_plugin_dump_ext_mount; extern cr_plugin_restore_ext_mount_t cr_plugin_restore_ext_mount; int cr_plugin_init(void) { pr_info("Initialized ext mount c/r\n"); return 0; } int cr_plugin_dump_ext_mount(char *mountpoint, int id) { char *aux, *dst; int fd; char img[64]; pr_info("Check for ext mount %s being mine\n", mountpoint); aux = strrchr(mountpoint, '/'); if (!aux) { pr_err("Bad path provided\n"); return -ENOTSUP; } dst = getenv("EMP_MOUNTPOINT"); if (!dst) { pr_err("No EMP_MOUNTPOINT env\n"); return -1; } if (strcmp(aux + 1, dst)) { pr_info("Not mine\n"); return -ENOTSUP; } pr_info("Dumping my mount %d\n", id); sprintf(img, IMG_NAME, id); fd = openat(criu_get_image_dir(), img, O_RDWR | O_CREAT | O_TRUNC, 0600); if (fd < 0) { pr_perror("Can't open image"); return -1; } close(fd); return 0; } int cr_plugin_restore_ext_mount(int id, char *mountpoint, char *old_root, int *is_file) { int fd; char img[64], src[256], *src_file; pr_info("Restoring my mount %d?\n", id); sprintf(img, IMG_NAME, id); fd = 
openat(criu_get_image_dir(), img, O_RDONLY); if (fd < 0) { if (errno == ENOENT) return -ENOTSUP; pr_perror("Can't open my image"); return -1; } close(fd); src_file = getenv("EMP_ROOT_P"); if (!src_file) { pr_err("Can't get EMP_ROOT_P env\n"); return -1; } if (creat(mountpoint, 0600) < 0) { if (errno != EEXIST) { pr_perror("Can't make mountpoint"); return -1; } } if (is_file) *is_file = 1; sprintf(src, "/%s/%s", old_root, src_file); pr_info("Mount %s -> %s\n", src, mountpoint); if (mount(src, mountpoint, NULL, MS_BIND, NULL) < 0) { pr_perror("Can't bind mount"); return -1; } return 0; } criu-3.6/test/others/mounts/ext/ns_init.c000066400000000000000000000045511317335042600205540ustar00rootroot00000000000000#define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include static void sigh(int sig) { } int main(int argc, char **argv) { int start[2]; char res; pid_t pid; /* * Usage: * run */ if (getpid() == 1) { int fd; struct sigaction sa = {}; sigset_t mask; if (setsid() == -1) { fprintf(stderr, "setsid: %m\n"); return 1; } sa.sa_handler = sigh; sigaction(SIGTERM, &sa, NULL); if (chdir(argv[2])) return 1; fd = open(argv[3], O_WRONLY|O_CREAT|O_TRUNC|O_APPEND, 0600); if (fd < 0) return 1; dup2(fd, 1); dup2(fd, 2); close(fd); close(0); if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) { fprintf(stderr, "mount(/, S_REC | MS_PRIVATE)): %m"); return 1; } mkdir("oldm"); if (pivot_root(".", "./oldm") < 0) return 1; umount2("/oldm", MNT_DETACH); mkdir("/proc"); if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { fprintf(stderr, "mount(/proc): %m"); return 1; } sigemptyset(&mask); sigaddset(&mask, SIGTERM); sigprocmask(SIG_BLOCK, &mask, NULL); fd = atoi(argv[1]); write(fd, "!", 1); close(fd); sigemptyset(&mask); sigsuspend(&mask); printf("Woken UP\n"); printf("Reading %s for [%s]\n", argv[4], argv[5]); { FILE *f; char buf[128]; f = fopen(argv[4], "r"); if (!f) perror("No file with message"); else { memset(buf, 0, sizeof(buf)); 
fgets(buf, sizeof(buf), f); fclose(f); printf("Got [%s]\n", buf); if (!strcmp(buf, argv[5])) printf("PASS\n"); } } exit(0); } if (unshare(CLONE_NEWNS | CLONE_NEWPID)) return 1; pipe(start); pid = fork(); if (pid == 0) { char *nargv[7], aux[10]; close(start[0]); sprintf(aux, "%d", start[1]); nargv[0] = argv[0]; nargv[1] = aux; nargv[2] = argv[2]; nargv[3] = argv[3]; nargv[4] = argv[4]; nargv[5] = argv[5]; nargv[6] = NULL; execv(argv[0], nargv); exit(0); } close(start[1]); res = 'F'; read(start[0], &res, 1); if (res != '!') { printf("Failed to start\n"); return 1; } printf("Container w/ tests started\n"); { FILE *pidf; pidf = fopen(argv[1], "w"); fprintf(pidf, "%d", pid); fclose(pidf); } return 0; } criu-3.6/test/others/mounts/ext/run.sh000077500000000000000000000051631317335042600201100ustar00rootroot00000000000000#!/bin/bash set -x function fail { echo $@ exit 1 } make || fail "Can't compile library or ns init" criu="../../../criu/criu" # New root for namespace NSROOT="nsroot" # External file with contents (exported for plugin.restore) EMP_ROOT="external_file" export EMP_ROOT_P="$(pwd)/$EMP_ROOT" # Internal file as seen from namespace (exported for plugin.dump) export EMP_MOUNTPOINT="file" # Message in a file to check visibility FMESSAGE="tram-pam-pam" # Binary of namespace's init NS_INIT="ns_init" # File with namespace init pid PIDF="pidf" start_ns() { # # Prepare the namespace's FS layout # mkdir $NSROOT echo -n "$FMESSAGE" > "$EMP_ROOT" mount --bind "$NSROOT" "$NSROOT" mount --make-private "$NSROOT" touch "$NSROOT/$EMP_MOUNTPOINT" mount --bind "$EMP_ROOT" "$NSROOT/$EMP_MOUNTPOINT" || fail "Can't prepare fs for ns" # # Start the namespace's init # cp $NS_INIT "$NSROOT/" "./$NSROOT/$NS_INIT" "$PIDF" "$NSROOT" "log" "$EMP_MOUNTPOINT" "$FMESSAGE" || fail "Can't start namespace" umount "$NSROOT/$EMP_MOUNTPOINT" echo "Namespace started, pid $(cat $PIDF)" } stop_ns() { # # Kill the init # kill -TERM $(cat $PIDF) sleep 2 # Shitty, but... 
umount $NSROOT if [ -z "$1" ]; then rm -f "$NSROOT/log" else mv "$NSROOT/log" "$1" fi rm -f "$PIDF" "$EMP_ROOT" "$NSROOT/$NS_INIT" "$NSROOT/log" "$NSROOT/$EMP_MOUNTPOINT" rmdir "$NSROOT/oldm" rmdir "$NSROOT/proc" rmdir "$NSROOT" } DDIR="dump" rm -rf $DDIR mkdir $DDIR chk_pass() { tail -n1 $1 | fgrep -q "PASS" } # # Test 1: handle external mount with plugin # test_plugin() { echo "=== Testing how plugin works" mkdir "$DDIR/plugin/" start_ns $criu dump -D "$DDIR/plugin/" -v4 -o "dump.log" --lib=$(pwd) \ -t $(cat pidf) || { stop_ns; return 1; } $criu restore -D "$DDIR/plugin/" -v4 -o "rstr.log" --lib=$(pwd) \ -d --root="$(pwd)/$NSROOT" --pidfile=$PIDF || { stop_ns; return 1; } echo "Restored, checking results" mv "$DDIR/plugin/$PIDF" . stop_ns "$DDIR/plugin/ns.log" chk_pass "$DDIR/plugin/ns.log" } test_ext_mount_map() { echo "=== Testing how --ext-mount-map works" mkdir "$DDIR/ext_mount_map/" start_ns $criu dump -D "$DDIR/ext_mount_map/" -v4 -o "dump.log" \ -t $(cat pidf) --ext-mount-map "/$EMP_MOUNTPOINT:TM" || { stop_ns; return 1; } $criu restore -D "$DDIR/ext_mount_map/" -v4 -o "rstr.log" \ -d --root="$(pwd)/$NSROOT" --pidfile=$PIDF --ext-mount-map "TM:$EMP_ROOT_P" || { stop_ns; return 1; } echo "Restored, checking results" mv "$DDIR/ext_mount_map/$PIDF" . 
stop_ns "$DDIR/ext_mount_map/ns.log" chk_pass "$DDIR/ext_mount_map/ns.log" } test_plugin || exit 1 test_ext_mount_map || exit 1 echo "All tests passed" exit 0 criu-3.6/test/others/mounts/mounts.py000077500000000000000000000013441317335042600200440ustar00rootroot00000000000000import os import tempfile, random def mount(src, dst, shared, private, slave): cmd = "mount" if shared: cmd += " --make-shared" if private: cmd += " --make-private" if slave: cmd += " --make-slave" if src: cmd += " --bind '%s' '%s'" % (src, dst) else: cmd += " -t tmpfs none '%s'" % (dst) print cmd ret = os.system(cmd) if ret: print "failed" root = tempfile.mkdtemp(prefix = "root.mount", dir = "/tmp") mount(None, root, 1, 0, 0) mounts = [root] for i in xrange(10): dstdir = random.choice(mounts) dst = tempfile.mkdtemp(prefix = "mount", dir = dstdir) src = random.choice(mounts + [None]) mount(src, dst, random.randint(0,100) > 50, random.randint(0,100) > 90, random.randint(0,100) > 50) mounts.append(dst) criu-3.6/test/others/mounts/mounts.sh000077500000000000000000000007251317335042600200300ustar00rootroot00000000000000[ -z "$INMNTNS" ] && { export INMNTNS=`pwd` export INMNTNS_PID=$$ unshare -m -- setsid bash -x "$0" "$@" < /dev/null &> mounts.log & echo $! 
> mounts.pid while :; do sleep 1 done } cd $INMNTNS mount --make-rprivate / for i in `cat /proc/self/mounts | awk '{ print $2 }'`; do [ '/' = "$i" ] && continue [ '/proc' = "$i" ] && continue [ '/dev' = "$i" ] && continue echo $i umount -l $i done python2 mounts.py kill $INMNTNS_PID while :; do sleep 10 done criu-3.6/test/others/mounts/run.sh000077500000000000000000000006531317335042600173070ustar00rootroot00000000000000#!/bin/bash CRIU=../../../criu/criu set -x mkdir -p dump ./mounts.sh pid=`cat mounts.pid` kill -0 $pid || exit cat /proc/$pid/mountinfo | sort -k 4 echo "Suspend server" ${CRIU} dump -D dump -o dump.log -t $pid -v4 || { cat dump/dump.log | grep Error exit 1 } echo "Resume server" ${CRIU} restore -d -D dump -o restore.log -v4 || { cat dump/dump.log | grep Error exit 1 } cat /proc/$pid/mountinfo | sort -k 4 kill $pid criu-3.6/test/others/overlayfs/000077500000000000000000000000001317335042600166255ustar00rootroot00000000000000criu-3.6/test/others/overlayfs/Makefile000066400000000000000000000001301317335042600202570ustar00rootroot00000000000000run: ./run.sh clean: umount -f overlay_test/z rm -rf overlay_test output checkpoint criu-3.6/test/others/overlayfs/run.sh000077500000000000000000000014731317335042600177750ustar00rootroot00000000000000#!/bin/bash set -eu CRIU=../../../criu/criu setup() { setup_mount setsid sleep 10 3>z/file < /dev/null &> output & PROC_PID=$! echo "PROC_PID=$PROC_PID" sleep 1 } setup_mount() { mkdir -p overlay_test cd overlay_test mkdir -p a b c z checkpoint mount -t overlay -o lowerdir=a,upperdir=b,workdir=c overlayfs z } check_criu() { echo "Dumping $PROC_PID..." if ! $CRIU dump -D checkpoint -t "${PROC_PID}"; then echo "ERROR! dump failed" return 1 fi echo "Restoring..." if ! $CRIU restore -d -D checkpoint; then echo "ERROR! 
restore failed" return 1 fi return 0 } cleanup() { kill -INT "${PROC_PID}" > /dev/null 2>&1 umount z cd "${ORIG_WD}" rm -rf overlay_test } main() { ORIG_WD=$(pwd) setup check_criu || { cleanup exit 1 } cleanup echo "OverlayFS C/R successful." exit 0 } main criu-3.6/test/others/pipes/000077500000000000000000000000001317335042600157335ustar00rootroot00000000000000criu-3.6/test/others/pipes/Makefile000066400000000000000000000003131317335042600173700ustar00rootroot00000000000000CFLAGS += -Wall pipe: pipe.c clean: rm -f pipe run: pipe ./pipe - && \ ./pipe -c && \ ./pipe -cl && \ ./pipe -d && \ ./pipe -o && \ ./pipe -r && \ ./pipe -dc && \ ./pipe -dcl && \ true criu-3.6/test/others/pipes/pipe.c000066400000000000000000000373171317335042600170470ustar00rootroot00000000000000/* * A simple demo/test program using criu's --inherit-fd command line * option to restore a process with (1) an external pipe and (2) a * new log file. * * Note that it's possible to restore the process without --inherit-fd, * but when it reads from or writes to the pipe, it will get a broken * pipe signal. * * Also note that changing the log file during restore has nothing to do * with the pipe. It's just a nice feature for cases where it's desirable * to have a restored process use a different file then the original one. * * The parent process spawns a child that will write messages to its * parent through a pipe. After a couple of messages, parent invokes * criu to checkpoint the child. Since the child exits after checkpoint, * its pipe will be broken. Parent sets up a new pipe and invokes criu * to restore the child using the new pipe (instead of the old one). * The restored child exits after writing a couple more messages. * * To make sure that fd clashes are correctly handled during restore, * child can optionally open a regular file and move it to a clashing fd. * * Make sure CRIU_BINARY defined below points to the right criu. 
* * $ cc -Wall -o pipe pipe.c * $ sudo ./pipe -v * * The following should all succeed: * * $ sudo ./pipe -q && echo OK * $ sudo ./pipe -qc && echo OK * $ sudo ./pipe -qcl && echo OK * $ sudo ./pipe -qd && echo OK * $ sudo ./pipe -qdc && echo OK * $ sudo ./pipe -qdcl && echo OK * * The following should all fail: * * $ sudo ./pipe -qn || echo $? * $ sudo ./pipe -qo || echo $? * $ sudo ./pipe -qr || echo $? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*sighandler_t)(int); typedef unsigned long ulong; /* colors */ #define CS_PARENT "\033[00;32m" #define CS_CHILD "\033[00;33m" #define CS_DUMP "\033[00;34m" #define CS_RESTORE "\033[00;35m" #define CE "\033[0m" #define die(fmt, ...) do { \ if (!qflag) \ fprintf(stderr, fmt ": %m\n", __VA_ARGS__); \ if (getpid() == parent_pid) { \ (void)kill(0, 9); \ exit(1); \ } \ _exit(1); \ } while (0) #define READ_FD 0 /* pipe read fd */ #define WRITE_FD 1 /* pipe write fd */ #define CLASH_FD 3 /* force inherit fd clash */ #define MAX_FORKS 3 /* child, checkpoint, restore */ #define CRIU_BINARY "../../../criu/criu" #define IMG_DIR "images" #define DUMP_LOG_FILE "dump.log" #define RESTORE_LOG_FILE "restore.log" #define RESTORE_PID_FILE "restore.pid" #define INHERIT_FD_OPTION "--inherit-fd" #define OLD_LOG_FILE "/tmp/oldlog" #define NEW_LOG_FILE "/tmp/newlog" /* * Command line options (see usage()). 
*/ char *cli_flags = "cdhlnoqrv"; int cflag; int dflag; int lflag; int nflag; int oflag; int qflag; int rflag; int vflag; char pid_number[8]; char inh_pipe_opt[16]; char inh_pipe_arg[64]; char inh_file_opt[16]; char inh_file_arg[64]; char *dump_argv[] = { "criu", "dump", "-D", IMG_DIR, "-o", DUMP_LOG_FILE, "-v4", "-t", pid_number, NULL }; char *restore_argv[] = { "criu", "restore", "-d", "-D", IMG_DIR, "-o", RESTORE_LOG_FILE, "--pidfile", RESTORE_PID_FILE, "-v4", inh_pipe_opt, inh_pipe_arg, inh_file_opt, inh_file_arg, NULL }; int max_msgs; int max_forks; int parent_pid; int child_pid; int criu_dump_pid; int criu_restore_pid; /* prototypes */ void chld_handler(int signum); int parent(int *pipefd); int child(int *pipefd, int dupfd, int newfd); void checkpoint_child(int child_pid, int *pipefd); void restore_child(int *new_pipefd, char *old_pipe_name); void write_to_fd(int fd, char *name, int i, int newline); void ls_proc_fd(int fd); char *pipe_name(int fd); char *who(pid_t pid); void pipe_safe(int pipefd[2]); pid_t fork_safe(void); void signal_safe(int signum, sighandler_t handler); int open_safe(char *pathname, int flags); void close_safe(int fd); void write_safe(int fd, char *buf, int count); int read_safe(int fd, char *buf, int count); int dup_safe(int oldfd); void move_fd(int oldfd, int newfd); void mkdir_safe(char *dirname, int mode); void unlink_safe(char *pathname); void execv_safe(char *path, char *argv[], int ls); pid_t waitpid_safe(pid_t pid, int *status, int options, int id); void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5); int dup2_safe(int oldfd, int newfd); void usage(char *cmd) { printf("Usage: %s [%s]\n", cmd, cli_flags); printf("-c\tcause a clash during restore by opening %s as fd %d\n", OLD_LOG_FILE, CLASH_FD); printf("-d\tdup the pipe and write to it\n"); printf("-l\tchange log file from %s to %s during restore\n", OLD_LOG_FILE, NEW_LOG_FILE); printf("\n"); printf("The following flags should cause restore failure\n"); 
printf("-n\tdo not use the %s option\n", INHERIT_FD_OPTION); printf("-o\topen the pipe via /proc//fd and write to it\n"); printf("-r\tspecify read end of pipe during restore\n"); printf("\n"); printf("Miscellaneous flags\n"); printf("-h\tprint this help and exit\n"); printf("-q\tquiet mode, don't print anything\n"); printf("-v\tverbose mode (list contents of /proc//fd)\n"); } int main(int argc, char *argv[]) { int ret; int opt; int pipefd[2]; max_msgs = 4; while ((opt = getopt(argc, argv, cli_flags)) != -1) { switch (opt) { case 'c': cflag++; break; case 'd': dflag++; max_msgs += 4; break; case 'h': usage(argv[0]); return 0; case 'l': lflag++; break; case 'n': nflag++; break; case 'o': oflag++; max_msgs += 4; break; case 'q': qflag++; vflag = 0;break; case 'r': rflag++; break; case 'v': vflag++; qflag = 0; break; default: usage(argv[0]); return 1; } } setbuf(stdout, NULL); setbuf(stderr, NULL); mkdir_safe(IMG_DIR, 0700); pipe_safe(pipefd); child_pid = fork_safe(); if (child_pid > 0) { parent_pid = getpid(); signal_safe(SIGCHLD, chld_handler); prctl_safe(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); close_safe(pipefd[WRITE_FD]); ret = parent(pipefd); } else { /* child */ int dupfd = -1; int openfd = -1; int logfd, flags; child_pid = getpid(); close_safe(pipefd[READ_FD]); setsid(); logfd = open_safe(OLD_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT); dup2_safe(logfd, 1); dup2_safe(logfd, 2); close(logfd); close(0); /* open a regular file and move it to CLASH_FD */ if (cflag) move_fd(open_safe(OLD_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT), CLASH_FD); fcntl(pipefd[WRITE_FD], F_SETFL, O_NONBLOCK | O_WRONLY); /* open additional descriptors on the pipe and use them all */ if (dflag) dupfd = dup_safe(pipefd[WRITE_FD]); if (oflag) { char buf[128]; snprintf(buf, sizeof buf, "/proc/self/fd/%d", pipefd[WRITE_FD]); openfd = open_safe(buf, O_WRONLY); } ret = child(pipefd, dupfd, openfd); flags = fcntl(pipefd[WRITE_FD], F_GETFL, 0); if ((flags & O_NONBLOCK) == 0) { printf("Unexpected flags 
%x\n", flags); ret = -1; } } return ret; } /* * Parent reads message from its pipe with the child. * After a couple of messages, it checkpoints the child * which causes the child to exit. Parent then creates * a new pipe and restores the child. */ int parent(int *pipefd) { char buf[32]; char old_pipe[32]; int nread; nread = 0; while (max_forks <= MAX_FORKS) { if (read_safe(pipefd[READ_FD], buf, sizeof buf) == 0) continue; nread++; if (vflag && nread == 1) ls_proc_fd(-1); if (!qflag) { printf("%s read %s from %s\n", who(0), buf, pipe_name(pipefd[READ_FD])); } if (nread == (max_msgs / 2)) { checkpoint_child(child_pid, pipefd); if (!nflag) { /* save the old pipe's name before closing it */ snprintf(old_pipe, sizeof old_pipe, "%s", pipe_name(pipefd[READ_FD])); close_safe(pipefd[READ_FD]); /* create a new one */ if (!qflag) printf("%s creating a new pipe\n", who(0)); pipe_safe(pipefd); } restore_child(pipefd, old_pipe); } } return 0; } /* * Child sends a total of max_messages messages to its * parent, half before checkpoint and half after restore. */ int child(int *pipefd, int dupfd, int openfd) { int i; int fd; int num_wfds; struct timespec req = { 1, 0 }; /* * Count the number of pipe descriptors we'll be * writing to. At least 1 (for pipefd[WRITE_FD]) * and at most 3. */ num_wfds = 1; if (dupfd >= 0) num_wfds++; if (openfd >= 0) num_wfds++; for (i = 0; i < max_msgs; i++) { /* print first time and after checkpoint */ if (vflag && (i == 0 || i == (max_msgs / 2))) ls_proc_fd(-1); switch (i % num_wfds) { case 0: fd = pipefd[WRITE_FD]; break; case 1: fd = dflag ? dupfd : openfd; break; case 2: fd = openfd; break; } write_to_fd(fd, pipe_name(pipefd[WRITE_FD]), i+1, 0); if (cflag) write_to_fd(CLASH_FD, "log file", i+1, 1); /* * Since sleep will be interrupted by C/R, make sure * to sleep an entire second to minimize the chance of * writing before criu restore has exited. 
If criu is * still around and we write to a broken pipe, we'll be * killed but SIGCHLD will be delivered to criu instead * of parent. */ while (nanosleep(&req, NULL)) ; if (!qflag) printf("\n"); } return 0; } void chld_handler(int signum) { int status; pid_t pid; pid = waitpid_safe(-1, &status, WNOHANG, 1); if (WIFEXITED(status)) status = WEXITSTATUS(status); if (pid == child_pid) { if (!qflag) { printf("%s %s exited with status %d\n", who(0), who(pid), status); } /* if child exited successfully, we're done */ if (status == 0) exit(0); /* checkpoint kills the child */ if (status != 9) exit(status); } } void checkpoint_child(int child_pid, int *pipefd) { /* prepare -t */ snprintf(pid_number, sizeof pid_number, "%d", child_pid); criu_dump_pid = fork_safe(); if (criu_dump_pid > 0) { int status; pid_t pid; pid = waitpid_safe(criu_dump_pid, &status, 0, 2); if (WIFEXITED(status)) status = WEXITSTATUS(status); if (!qflag) { printf("%s %s exited with status %d\n", who(0), who(pid), status); } if (status) exit(status); } else { close(pipefd[READ_FD]); criu_dump_pid = getpid(); execv_safe(CRIU_BINARY, dump_argv, 0); } } void restore_child(int *new_pipefd, char *old_pipe_name) { char buf[64]; criu_restore_pid = fork_safe(); if (criu_restore_pid > 0) { int status; pid_t pid; if (!nflag) close_safe(new_pipefd[WRITE_FD]); pid = waitpid_safe(criu_restore_pid, &status, 0, 3); if (WIFEXITED(status)) status = WEXITSTATUS(status); if (!qflag) { printf("%s %s exited with status %d\n", who(0), who(pid), status); } if (status) exit(status); } else { criu_restore_pid = getpid(); if (!nflag) { /* * We should close the read descriptor of the new pipe * and use its write descriptor to call criu restore. * But if rflag was set (for testing purposes), use the * read descriptor which should cause the application to * fail. * * Regardless of read or write descriptor, move it to a * clashing fd to test inherit fd clash resolve code. 
*/ if (rflag) move_fd(new_pipefd[READ_FD], CLASH_FD); else { close_safe(new_pipefd[READ_FD]); move_fd(new_pipefd[WRITE_FD], CLASH_FD); } /* --inherit-fd fd[CLASH_FD]:pipe[xxxxxx] */ snprintf(inh_pipe_opt, sizeof inh_pipe_opt, "%s", INHERIT_FD_OPTION); snprintf(inh_pipe_arg, sizeof inh_pipe_arg, "fd[%d]:%s", CLASH_FD, old_pipe_name); if (lflag) { /* create a new log file to replace the old one */ int filefd = open_safe(NEW_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT); /* --inherit-fd fd[x]:tmp/oldlog */ snprintf(inh_file_opt, sizeof inh_file_opt, "%s", INHERIT_FD_OPTION); snprintf(inh_file_arg, sizeof inh_file_arg, "fd[%d]:%s", filefd, OLD_LOG_FILE + 1); restore_argv[12] = inh_file_opt; } else restore_argv[12] = NULL; restore_argv[10] = inh_pipe_opt; } else restore_argv[10] = NULL; snprintf(buf, sizeof buf, "%s/%s", IMG_DIR, RESTORE_PID_FILE); unlink_safe(buf); execv_safe(CRIU_BINARY, restore_argv, 1); } } void write_to_fd(int fd, char *name, int i, int newline) { int n; char buf[16]; /* fit "hello d\n" for small d */ n = snprintf(buf, sizeof buf, "hello %d", i); if (!qflag) printf("%s writing %s to %s via fd %d\n", who(0), buf, name, fd); if (newline) { buf[n++] = '\n'; buf[n] = '\0'; } write_safe(fd, buf, strlen(buf)); } void ls_proc_fd(int fd) { char cmd[128]; if (qflag) return; if (fd == -1) snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd", getpid()); else snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd/%d", getpid(), fd); printf("%s %s\n", who(0), cmd); system(cmd); } char *pipe_name(int fd) { static char pipe_name[64]; char path[64]; snprintf(path, sizeof path, "/proc/self/fd/%d", fd); if (readlink(path, pipe_name, sizeof pipe_name) == -1) die("readlink: path=%s", path); return pipe_name; } /* * Use two buffers to support two calls to * this function in a printf argument list. */ char *who(pid_t pid) { static char pidstr1[64]; static char pidstr2[64]; static char *cp; char *np; char *ep; int p; p = pid ? 
pid : getpid(); if (p == parent_pid) { np = "parent"; ep = CS_PARENT; } else if (p == child_pid) { np = "child"; ep = CS_CHILD; } else if (p == criu_dump_pid) { np = "dump"; ep = CS_DUMP; } else if (p == criu_restore_pid) { np = "restore"; ep = CS_RESTORE; } else np = "???"; cp = (cp == pidstr1) ? pidstr2 : pidstr1; snprintf(cp, sizeof pidstr1, "%s[%s %d]", pid ? "" : ep, np, p); return cp; } void pipe_safe(int pipefd[2]) { if (pipe(pipefd) == -1) die("pipe: %p", pipefd); } pid_t fork_safe(void) { pid_t pid; if ((pid = fork()) == -1) die("fork: pid=%d", pid); max_forks++; return pid; } void signal_safe(int signum, sighandler_t handler) { if (signal(signum, handler) == SIG_ERR) die("signal: signum=%d", signum); } int open_safe(char *pathname, int flags) { int fd; if ((fd = open(pathname, flags, 0777)) == -1) die("open: pathname=%s", pathname); return fd; } void close_safe(int fd) { if (close(fd) == -1) die("close: fd=%d", fd); } void write_safe(int fd, char *buf, int count) { if (write(fd, buf, count) != count) { die("write: fd=%d buf=\"%s\" count=%d errno=%d", fd, buf, count, errno); } } int read_safe(int fd, char *buf, int count) { int n; if ((n = read(fd, buf, count)) < 0) die("read: fd=%d count=%d", fd, count); buf[n] = '\0'; return n; } int dup_safe(int oldfd) { int newfd; if ((newfd = dup(oldfd)) == -1) die("dup: oldfd=%d", oldfd); return newfd; } int dup2_safe(int oldfd, int newfd) { if (dup2(oldfd, newfd) != newfd) die("dup2: oldfd=%d newfd=%d", oldfd, newfd); return newfd; } void move_fd(int oldfd, int newfd) { if (oldfd != newfd) { dup2_safe(oldfd, newfd); close_safe(oldfd); } } void mkdir_safe(char *dirname, int mode) { if (mkdir(dirname, mode) == -1 && errno != EEXIST) die("mkdir dirname=%s mode=0x%x\n", dirname, mode); } void unlink_safe(char *pathname) { if (unlink(pathname) == -1 && errno != ENOENT) { die("unlink: pathname=%s\n", pathname); } } void execv_safe(char *path, char *argv[], int ls) { int i; struct timespec req = { 0, 1000000 }; if (!qflag) 
{ printf("\n%s ", who(0)); for (i = 0; argv[i] != NULL; i++) printf("%s ", argv[i]); printf("\n"); } /* give parent a chance to wait for us */ while (nanosleep(&req, NULL)) ; if (vflag && ls) ls_proc_fd(-1); execv(path, argv); die("execv: path=%s", path); } pid_t waitpid_safe(pid_t pid, int *status, int options, int id) { pid_t p; p = waitpid(pid, status, options); if (p == -1) fprintf(stderr, "waitpid pid=%d id=%d %m\n", pid, id); return p; } void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5) { if (prctl(option, arg2, arg3, arg4, arg5) == -1) die("prctl: option=0x%x", option); } criu-3.6/test/others/rpc/000077500000000000000000000000001317335042600153775ustar00rootroot00000000000000criu-3.6/test/others/rpc/.gitignore000066400000000000000000000000331317335042600173630ustar00rootroot00000000000000rpc.pb-c.* *_pb2.py test-c criu-3.6/test/others/rpc/Makefile000066400000000000000000000010541317335042600170370ustar00rootroot00000000000000all: test-c rpc_pb2.py criu .PHONY: all CFLAGS += -g -Werror -Wall -I. LDLIBS += -lprotobuf-c run: all mkdir -p build chmod a+rwx build sudo -g '#1000' -u '#1000' ./run.sh criu: ../../../criu/criu cp ../../../criu/criu $@ chmod u+s $@ test-c: rpc.pb-c.o test-c.o test-c.o: test-c.c rpc.pb-c.c rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto protoc-c --proto_path=. --c_out=. 
rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu .PHONY: clean criu-3.6/test/others/rpc/errno.py000077500000000000000000000060011317335042600170760ustar00rootroot00000000000000#!/usr/bin/python2 # Test criu errno import socket, os, imp, sys, errno import rpc_pb2 as rpc import argparse parser = argparse.ArgumentParser(description="Test errno reported by CRIU RPC") parser.add_argument('socket', type = str, help = "CRIU service socket") parser.add_argument('dir', type = str, help = "Directory where CRIU images should be placed") args = vars(parser.parse_args()) # Prepare dir for images class test: def __init__(self): self.imgs_fd = os.open(args['dir'], os.O_DIRECTORY) self.s = -1 self._MAX_MSG_SIZE = 1024 def connect(self): self.s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) self.s.connect(args['socket']) def get_base_req(self): req = rpc.criu_req() req.opts.log_level = 4 req.opts.images_dir_fd = self.imgs_fd return req def send_req(self, req): self.connect() self.s.send(req.SerializeToString()) def recv_resp(self): resp = rpc.criu_resp() resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp def check_resp(self, resp, typ, err): if resp.type != typ: raise Exception('Unexpected responce type ' + str(resp.type)) if resp.success: raise Exception('Unexpected success = True') if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) def no_process(self): print 'Try to dump unexisting process' # Get pid of non-existing process. # Suppose max_pid is not taken by any process. with open("/proc/sys/kernel/pid_max", "r") as f: pid = int(f.readline()) try: os.kill(pid, 0) except OSError: pass else: raise Exception('max pid is taken') # Ask criu to dump non-existing process. 
req = self.get_base_req() req.type = rpc.DUMP req.opts.pid = pid self.send_req(req) resp = self.recv_resp() self.check_resp(resp, rpc.DUMP, errno.ESRCH) print 'Success' def process_exists(self): print 'Try to restore process which pid is already taken by other process' # Perform self-dump req = self.get_base_req() req.type = rpc.DUMP req.opts.leave_running = True self.send_req(req) resp = self.recv_resp() if resp.success != True: raise Exception('Self-dump failed') # Ask to restore process from images of ourselves req = self.get_base_req() req.type = rpc.RESTORE self.send_req(req) resp = self.recv_resp() self.check_resp(resp, rpc.RESTORE, errno.EEXIST) print 'Success' def bad_options(self): print 'Try to send criu invalid opts' # Subdirs are not allowed in log_file req = self.get_base_req() req.type = rpc.DUMP req.opts.log_file = "../file.log" self.send_req(req) resp = self.recv_resp() self.check_resp(resp, rpc.DUMP, errno.EBADRQC) print 'Success' def bad_request(self): print 'Try to send criu invalid request type' req = self.get_base_req() req.type = rpc.NOTIFY self.send_req(req) resp = self.recv_resp() self.check_resp(resp, rpc.EMPTY, None) print 'Success' def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() t = test() t.run() criu-3.6/test/others/rpc/loop.sh000077500000000000000000000000461317335042600167070ustar00rootroot00000000000000#!/bin/bash while :; do sleep 1 done criu-3.6/test/others/rpc/ps_test.py000077500000000000000000000034571317335042600174460ustar00rootroot00000000000000#!/usr/bin/python2 import socket, os, imp, sys, errno import rpc_pb2 as rpc import argparse parser = argparse.ArgumentParser(description="Test page-server using CRIU RPC") parser.add_argument('socket', type = str, help = "CRIU service socket") parser.add_argument('dir', type = str, help = "Directory where CRIU images should be placed") args = vars(parser.parse_args()) # Connect to service socket s = socket.socket(socket.AF_UNIX, 
socket.SOCK_SEQPACKET) s.connect(args['socket']) # Start page-server print 'Starting page-server' req = rpc.criu_req() req.type = rpc.PAGE_SERVER req.opts.log_file = 'page-server.log' req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) s.send(req.SerializeToString()) resp = rpc.criu_resp() MAX_MSG_SIZE = 1024 resp.ParseFromString(s.recv(MAX_MSG_SIZE)) if resp.type != rpc.PAGE_SERVER: print 'Unexpected msg type' sys.exit(1) else: if resp.success: # check if pid even exists try: os.kill(resp.ps.pid, 0) except OSError as err: if err.errno == errno.ESRCH: print 'No process with page-server pid %d' %(resp.ps.pid) else: print 'Can\'t check that process %d exists' %(resp.ps.pid) sys.exit(1) print 'Success, page-server pid %d started on port %u' %(resp.ps.pid, resp.ps.port) else: print 'Failed to start page-server' sys.exit(1) # Perform self-dump print 'Dumping myself using page-server' req.type = rpc.DUMP req.opts.ps.port = resp.ps.port req.opts.log_file = 'dump.log' req.opts.leave_running = True s.close() s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) s.connect(args['socket']) s.send(req.SerializeToString()) resp.ParseFromString(s.recv(MAX_MSG_SIZE)) if resp.type != rpc.DUMP: print 'Unexpected msg type' sys.exit(1) else: if resp.success: print 'Success' else: print 'Fail' sys.exit(1) criu-3.6/test/others/rpc/restore-loop.py000077500000000000000000000021361317335042600204100ustar00rootroot00000000000000#!/usr/bin/python2 import socket, os, imp, sys import rpc_pb2 as rpc import argparse parser = argparse.ArgumentParser(description="Test ability to restore a process from images using CRIU RPC") parser.add_argument('socket', type = str, help = "CRIU service socket") parser.add_argument('dir', type = str, help = "Directory where CRIU images could be found") args = vars(parser.parse_args()) # Connect to service socket s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) s.connect(args['socket']) # Create criu msg, set it's type to 
dump request # and set dump options. Checkout more options in protobuf/rpc.proto req = rpc.criu_req() req.type = rpc.RESTORE req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) # Send request s.send(req.SerializeToString()) # Recv response resp = rpc.criu_resp() MAX_MSG_SIZE = 1024 resp.ParseFromString(s.recv(MAX_MSG_SIZE)) if resp.type != rpc.RESTORE: print 'Unexpected msg type' sys.exit(-1) else: if resp.success: print 'Restore success' else: print 'Restore fail' sys.exit(-1) print "PID of the restored program is %d\n" %(resp.restore.pid) criu-3.6/test/others/rpc/rpc.proto000077700000000000000000000000001317335042600232222../../../images/rpc.protoustar00rootroot00000000000000criu-3.6/test/others/rpc/run.sh000077500000000000000000000035741317335042600165530ustar00rootroot00000000000000#!/bin/bash set -e CRIU=./criu export PROTODIR=`readlink -f "${PWD}/../../protobuf"` echo $PROTODIR function title_print { echo -e "\n**************************************************" echo -e "\t\t"$1 echo -e "**************************************************\n" } function start_server { title_print "Start service server" ${CRIU} service -v4 -W build -o service.log --address criu_service.socket -d --pidfile pidfile } function stop_server { title_print "Shutdown service server" kill -SIGTERM $(cat build/pidfile) unlink build/pidfile } function test_c { mkdir -p build/imgs_c title_print "Run test-c" setsid ./test-c build/criu_service.socket build/imgs_c < /dev/null &>> build/output_c title_print "Restore test-c" ${CRIU} restore -v4 -o restore-c.log -D build/imgs_c } function test_py { mkdir -p build/imgs_py title_print "Run test-py" setsid ./test.py build/criu_service.socket build/imgs_py < /dev/null &>> build/output_py title_print "Restore test-py" ${CRIU} restore -v4 -o restore-py.log -D build/imgs_py } function test_restore_loop { mkdir -p build/imgs_loop title_print "Run loop.sh" setsid ./loop.sh < /dev/null &> build/loop.log & P=${!} echo "pid ${P}" title_print "Dump 
loop.sh" ${CRIU} dump -v4 -o dump-loop.log -D build/imgs_loop -t ${P} title_print "Run restore-loop" ./restore-loop.py build/criu_service.socket build/imgs_loop kill -SIGTERM ${P} } function test_ps { mkdir -p build/imgs_ps title_print "Run ps_test" setsid ./ps_test.py build/criu_service.socket build/imgs_ps < /dev/null &>> build/output_ps } function test_errno { mkdir -p build/imgs_errno title_print "Run cr_errno test" setsid ./errno.py build/criu_service.socket build/imgs_errno < /dev/null &>> build/output_errno } trap 'echo "FAIL"; stop_server' EXIT start_server test_c test_py test_restore_loop test_ps test_errno stop_server trap 'echo "Success"' EXIT criu-3.6/test/others/rpc/test-c.c000066400000000000000000000056111317335042600167450ustar00rootroot00000000000000#include "rpc.pb-c.h" #include #include #include #include #include #include #include #include #define MAX_MSG_SIZE 1024 static CriuResp *recv_resp(int socket_fd) { unsigned char buf[MAX_MSG_SIZE]; int len; CriuResp *msg = 0; len = read(socket_fd, buf, MAX_MSG_SIZE); if (len == -1) { perror("Can't read response"); return NULL; } msg = criu_resp__unpack(NULL, len, buf); if (!msg) { perror("Failed unpacking response"); return NULL; } return msg; } static int send_req(int socket_fd, CriuReq *req) { unsigned char buf[MAX_MSG_SIZE]; int len; len = criu_req__get_packed_size(req); if (criu_req__pack(req, buf) != len) { perror("Failed packing request"); return -1; } if (write(socket_fd, buf, len) == -1) { perror("Can't send request"); return -1; } return 0; } int main(int argc, char *argv[]) { CriuReq req = CRIU_REQ__INIT; CriuResp *resp = NULL; int fd, dir_fd; int ret = 0; struct sockaddr_un addr; socklen_t addr_len; if (argc != 3) { fprintf(stderr, "Usage: test-c criu-service.socket imgs_dir"); return -1; } /* * Open a directory, in which criu will * put images */ puts(argv[2]); dir_fd = open(argv[2], O_DIRECTORY); if (dir_fd == -1) { perror("Can't open imgs dir"); return -1; } /* * Set "DUMP" type of request. 
* Allocate CriuDumpReq. */ req.type = CRIU_REQ_TYPE__DUMP; req.opts = malloc(sizeof(CriuOpts)); if (!req.opts) { perror("Can't allocate memory for dump request"); return -1; } criu_opts__init(req.opts); /* * Set dump options. * Checkout more in protobuf/rpc.proto. */ req.opts->has_leave_running = true; req.opts->leave_running = true; req.opts->images_dir_fd = dir_fd; req.opts->has_log_level = true; req.opts->log_level = 4; /* * Connect to service socket */ fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0); if (fd == -1) { perror("Can't create socket"); return -1; } memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_LOCAL; strcpy(addr.sun_path, argv[1]); addr_len = strlen(addr.sun_path) + sizeof(addr.sun_family); ret = connect(fd, (struct sockaddr *) &addr, addr_len); if (ret == -1) { perror("Cant connect to socket"); goto exit; } /* * Send request */ ret = send_req(fd, &req); if (ret == -1) { perror("Can't send request"); goto exit; } /* * Recv response */ resp = recv_resp(fd); if (!resp) { perror("Can't recv response"); ret = -1; goto exit; } if (resp->type != CRIU_REQ_TYPE__DUMP) { perror("Unexpected response type"); ret = -1; goto exit; } /* * Check response. 
*/ if (resp->success) puts("Success"); else { puts("Fail"); ret = -1; goto exit; } if (resp->dump->has_restored && resp->dump->restored) puts("Restored"); exit: close(fd); close(dir_fd); if (resp) criu_resp__free_unpacked(resp, NULL); return ret; } criu-3.6/test/others/rpc/test.py000077500000000000000000000021161317335042600167330ustar00rootroot00000000000000#!/usr/bin/python2 import socket, os, imp, sys import rpc_pb2 as rpc import argparse parser = argparse.ArgumentParser(description="Test dump/restore using CRIU RPC") parser.add_argument('socket', type = str, help = "CRIU service socket") parser.add_argument('dir', type = str, help = "Directory where CRIU images should be placed") args = vars(parser.parse_args()) # Connect to service socket s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) s.connect(args['socket']) # Create criu msg, set it's type to dump request # and set dump options. Checkout more options in protobuf/rpc.proto req = rpc.criu_req() req.type = rpc.DUMP req.opts.leave_running = True req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) # Send request s.send(req.SerializeToString()) # Recv response resp = rpc.criu_resp() MAX_MSG_SIZE = 1024 resp.ParseFromString(s.recv(MAX_MSG_SIZE)) if resp.type != rpc.DUMP: print 'Unexpected msg type' sys.exit(-1) else: if resp.success: print 'Success' else: print 'Fail' sys.exit(-1) if resp.dump.restored: print 'Restored' criu-3.6/test/others/rpc/version.py000077500000000000000000000023231317335042600174410ustar00rootroot00000000000000#!/usr/bin/python2 import socket import sys import rpc_pb2 as rpc import argparse import subprocess print('Connecting to CRIU in swrk mode to check the version:') css = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) swrk = subprocess.Popen(['./criu', "swrk", "%d" % css[0].fileno()]) css[0].close() s = css[1] # Create criu msg, set it's type to dump request # and set dump options. 
Checkout more options in protobuf/rpc.proto req = rpc.criu_req() req.type = rpc.VERSION # Send request s.send(req.SerializeToString()) # Recv response resp = rpc.criu_resp() MAX_MSG_SIZE = 1024 resp.ParseFromString(s.recv(MAX_MSG_SIZE)) if resp.type != rpc.VERSION: print('RPC: Unexpected msg type') sys.exit(-1) else: if resp.success: print('RPC: Success') print('CRIU major %d' % resp.version.major) print('CRIU minor %d' % resp.version.minor) if resp.version.HasField('gitid'): print('CRIU gitid %s' % resp.version.gitid) if resp.version.HasField('sublevel'): print('CRIU sublevel %s' % resp.version.sublevel) if resp.version.HasField('extra'): print('CRIU extra %s' % resp.version.extra) if resp.version.HasField('name'): print('CRIU name %s' % resp.version.name) else: print 'Fail' sys.exit(-1) criu-3.6/test/others/security/000077500000000000000000000000001317335042600164625ustar00rootroot00000000000000criu-3.6/test/others/security/Makefile000066400000000000000000000011261317335042600201220ustar00rootroot00000000000000DIR := $(shell pwd)/data LOOP := $(DIR)/loop.sh PIDFILE := $(DIR)/loop.pid IMGS := $(DIR)/imgs CRIU := $(DIR)/criu ROOT :=root USR1 :=criu-test-user1 USR2 :=criu-test-user2 export DIR LOOP PIDFILE IMGS CRIU ROOT USR1 USR2 run: testdir users ./run.sh testdir: ../../../criu/criu mkdir -p $(DIR) mkdir -p $(IMGS) cp ../../../criu/criu $(CRIU) chmod u+s $(CRIU) cp loop.sh $(LOOP) chmod 777 $(DIR) users: -userdel -f $(USR1) -userdel -f $(USR2) useradd -M -U $(USR1) useradd -M -U $(USR2) usermod -a -G $(USR2) $(USR1) clean: rm -rf $(DIR) -userdel -f $(USR1) -userdel -f $(USR2) criu-3.6/test/others/security/loop.sh000077500000000000000000000002201317335042600177640ustar00rootroot00000000000000#!/bin/bash echo $$ > $1.int mv $1.int $1 if [ "$2" == "--chgrp" ]; then grps=( $(groups) ) newgrp ${grps[2]} fi while :; do sleep 1 done criu-3.6/test/others/security/run.sh000077500000000000000000000031501317335042600176240ustar00rootroot00000000000000#!/bin/bash set -x PID= 
function run_as { echo "== Run ${LOOP} as $1" echo ${PIDFILE} rm -f ${PIDFILE} su $1 -c "setsid ${LOOP} ${PIDFILE} $2 < /dev/null &> /dev/null &" for i in `seq 100`; do test -f ${PIDFILE} && break sleep 1 done PID=`cat ${PIDFILE}` echo ${PID} } function dump_as { test -d ${IMGS} && rm -rf ${IMGS} mkdir -p ${IMGS} echo "== Dump ${PID} as $@" su $@ -c "${CRIU} dump --tree ${PID} --images-dir ${IMGS}" return $? } function rstr_as { echo "== Restore ${IMGS} as $@" su $@ -c "${CRIU} restore --images-dir ${IMGS} --restore-detached" return $? } function result { local BGRED='\033[41m' local BGGREEN='\033[42m' local NORMAL=$(tput sgr0) if [ $1 -ne 0 ]; then echo -e "${BGRED}FAIL${NORMAL}" exit 1 else echo -e "${BGGREEN}PASS${NORMAL}" fi } function test_root { echo "==== Check that non-root can't dump/restore process owned by root" run_as ${ROOT} dump_as ${USR1} ; result $((!$?)) dump_as ${ROOT} ; result $(($?)) rstr_as ${USR1} ; result $((!$?)) rstr_as ${ROOT} ; result $(($?)) kill -SIGKILL ${PID} } function test_other { echo "==== Check that user2 can't dump/restore process owned by user1" run_as ${USR1} dump_as ${USR2} ; result $((!$?)) dump_as ${USR1} ; result $(($?)) rstr_as ${USR2} ; result $((!$?)) rstr_as ${USR1} ; result $(($?)) kill -SIGKILL ${PID} } function test_own { echo "==== Check that user1 can dump/restore his own process that changes it's gid to one from groups" run_as ${USR1} "--chgrp" dump_as ${USR1} ; result $(($?)) rstr_as ${USR1} ; result $(($?)) kill -SIGKILL ${PID} } test_root test_other test_own criu-3.6/test/others/socketpairs/000077500000000000000000000000001317335042600171425ustar00rootroot00000000000000criu-3.6/test/others/socketpairs/Makefile000066400000000000000000000002341317335042600206010ustar00rootroot00000000000000CFLAGS += -Wall socketpair: socketpair.c clean: rm -f socketpair run: socketpair ./socketpair && \ ./socketpair -v && \ ./socketpair -m4 && \ true 
criu-3.6/test/others/socketpairs/socketpair.c000066400000000000000000000312231317335042600214530ustar00rootroot00000000000000/* * A simple demo/test program using criu's --inherit-fd command line * option to restore a process with an external unix socket. * Extending inherit's logic to unix sockets created by socketpair(..) syscall. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*sighandler_t)(int); typedef unsigned long ulong; /* colors */ #define CS_PARENT "\033[00;32m" #define CS_CHILD "\033[00;33m" #define CS_DUMP "\033[00;34m" #define CS_RESTORE "\033[00;35m" #define CE "\033[0m" #define die(fmt, ...) do { \ fprintf(stderr, fmt ": %m\n", __VA_ARGS__); \ if (getpid() == parent_pid) { \ (void)kill(0, 9); \ exit(1); \ } \ _exit(1); \ } while (0) #define READ_FD 0 /* pipe read fd */ #define WRITE_FD 1 /* pipe write fd */ #define CLASH_FD 3 /* force inherit fd clash */ #define MAX_FORKS 3 /* child, checkpoint, restore */ #define CRIU_BINARY "../../../criu/criu" #define IMG_DIR "images" #define DUMP_LOG_FILE "dump.log" #define RESTORE_LOG_FILE "restore.log" #define RESTORE_PID_FILE "restore.pid" #define INHERIT_FD_OPTION "--inherit-fd" #define OLD_LOG_FILE "/tmp/oldlog" #define NEW_LOG_FILE "/tmp/newlog" /* * Command line options (see usage()). 
*/ char *cli_flags = "hm:nv"; int max_msgs = 10; int vflag; int nflag; char pid_number[8]; char inh_unixsk_opt[16]; char inh_unixsk_arg[64]; char external_sk_ino[32]; char *dump_argv[] = { "criu", "dump", "-D", IMG_DIR, "-o", DUMP_LOG_FILE, "-v4", external_sk_ino, "-t", pid_number, NULL }; char *restore_argv[] = { "criu", "restore", "-d", "-D", IMG_DIR, "-o", RESTORE_LOG_FILE, "--pidfile", RESTORE_PID_FILE, "-v4", "-x", inh_unixsk_opt, inh_unixsk_arg, NULL }; int max_forks; int parent_pid; int child_pid; int criu_dump_pid; int criu_restore_pid; /* prototypes */ void chld_handler(int signum); int parent(int *socketfd, const char *ino_child_sk); int child(int *socketfd, int dupfd, int newfd); void checkpoint_child(int child_pid, int *old_socket_namefd); void restore_child(int *new_socketfd, const char *old_socket_name); void write_to_fd(int fd, char *name, int i, int newline); void ls_proc_fd(int fd); char *socket_name(int fd); ino_t socket_inode(int fd); char *who(pid_t pid); void socketpair_safe(int socketfd[2]); pid_t fork_safe(void); void signal_safe(int signum, sighandler_t handler); int open_safe(char *pathname, int flags); void close_safe(int fd); void write_safe(int fd, char *buf, int count); int read_safe(int fd, char *buf, int count); int dup_safe(int oldfd); void move_fd(int oldfd, int newfd); void mkdir_safe(char *dirname, int mode); void unlink_safe(char *pathname); void execv_safe(char *path, char *argv[], int ls); pid_t waitpid_safe(pid_t pid, int *status, int options, int id); void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5); int dup2_safe(int oldfd, int newfd); void usage(char *cmd) { printf("Usage: %s [%s]\n", cmd, cli_flags); printf("-h\tprint this help and exit\n"); printf("-m\tcount of send messages (by default 10 will send from child) \n"); printf("-n\tdo not use the %s option\n", INHERIT_FD_OPTION); printf("-v\tverbose mode (list contents of /proc//fd)\n"); } int main(int argc, char *argv[]) { int ret; int opt; int 
socketfd[2]; while ((opt = getopt(argc, argv, cli_flags)) != -1) { switch (opt) { case 'h': usage(argv[0]); return 0; case 'm': max_msgs = atoi(optarg); break; case 'n': nflag++; break; case 'v': vflag++; break; case '?': if ('m' == optopt) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else fprintf ( stderr, "Unknown option character `\\x%x'.\n", optopt); return 1; default: usage(argv[0]); return 1; } } setbuf(stdout, NULL); setbuf(stderr, NULL); mkdir_safe(IMG_DIR, 0700); socketpair_safe(socketfd); child_pid = fork_safe(); if (child_pid > 0) { parent_pid = getpid(); signal_safe(SIGCHLD, chld_handler); prctl_safe(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); snprintf(external_sk_ino, sizeof(external_sk_ino), "--ext-unix-sk=%u", (unsigned int)socket_inode(socketfd[WRITE_FD])); char unix_sk_ino[32] = {0}; strcpy(unix_sk_ino, socket_name(socketfd[WRITE_FD])); close_safe(socketfd[WRITE_FD]); ret = parent(socketfd, unix_sk_ino); } else { /* child */ int dupfd = -1; int openfd = -1; int logfd; child_pid = getpid(); close_safe(socketfd[READ_FD]); setsid(); logfd = open_safe(OLD_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT); dup2_safe(logfd, 1); dup2_safe(logfd, 2); close(logfd); close(0); ret = child(socketfd, dupfd, openfd); } return ret; } /* * Parent reads message from its pipe with the child. * After a couple of messages, it checkpoints the child * which causes the child to exit. Parent then creates * a new pipe and restores the child. 
*/ int parent(int *socketfd, const char *ino_child_sk) { char buf[32]; int nread; nread = 0; while (max_forks <= MAX_FORKS) { if (read_safe(socketfd[READ_FD], buf, sizeof buf) == 0) continue; nread++; if (vflag && nread == 1) ls_proc_fd(-1); printf( "%s read %s from %s\n", who(0), buf, socket_name(socketfd[READ_FD])); if (nread == (max_msgs / 2)) { checkpoint_child(child_pid, socketfd); if (!nflag) { close_safe(socketfd[READ_FD]); /* create a new one */ printf("%s creating a new socket\n", who(0)); socketpair_safe(socketfd); } restore_child(socketfd, ino_child_sk); } } return 0; } /* * Child sends a total of max_messages messages to its * parent, half before checkpoint and half after restore. */ int child(int *socketfd, int dupfd, int openfd) { int i; int fd; int num_wfds; struct timespec req = { 1, 0 }; /* * Count the number of pipe descriptors we'll be * writing to. At least 1 (for socketfd[WRITE_FD]) * and at most 3. */ num_wfds = 1; if (dupfd >= 0) num_wfds++; if (openfd >= 0) num_wfds++; for (i = 0; i < max_msgs; i++) { /* print first time and after checkpoint */ if (vflag && (i == 0 || i == (max_msgs / 2))) ls_proc_fd(-1); switch (i % num_wfds) { case 0: fd = socketfd[WRITE_FD]; break; case 1: fd = openfd; break; case 2: fd = openfd; break; } write_to_fd(fd, socket_name(socketfd[WRITE_FD]), i+1, 0); /* * Since sleep will be interrupted by C/R, make sure * to sleep an entire second to minimize the chance of * writing before criu restore has exited. If criu is * still around and we write to a broken pipe, we'll be * killed but SIGCHLD will be delivered to criu instead * of parent. 
*/ while (nanosleep(&req, NULL)) ; printf("\n"); } return 0; } void chld_handler(int signum) { int status; pid_t pid; pid = waitpid_safe(-1, &status, WNOHANG, 1); if (WIFEXITED(status)) status = WEXITSTATUS(status); if (pid == child_pid) { printf("%s %s exited with status %d\n", who(0), who(pid), status); /* if child exited successfully, we're done */ if (status == 0) exit(0); /* checkpoint kills the child */ if (status != 9) exit(status); } } void checkpoint_child(int child_pid, int *socketfd) { /* prepare -t */ snprintf(pid_number, sizeof pid_number, "%d", child_pid); criu_dump_pid = fork_safe(); if (criu_dump_pid > 0) { int status; pid_t pid; pid = waitpid_safe(criu_dump_pid, &status, 0, 2); if (WIFEXITED(status)) status = WEXITSTATUS(status); printf("%s %s exited with status %d\n", who(0), who(pid), status); if (status) exit(status); } else { close(socketfd[READ_FD]); criu_dump_pid = getpid(); execv_safe(CRIU_BINARY, dump_argv, 0); } } void restore_child(int *new_socketfd, const char *old_sock_name) { char buf[64]; criu_restore_pid = fork_safe(); if (criu_restore_pid > 0) { int status; pid_t pid; if (!nflag) close_safe(new_socketfd[WRITE_FD]); pid = waitpid_safe(criu_restore_pid, &status, 0, 3); if (WIFEXITED(status)) status = WEXITSTATUS(status); printf("%s %s exited with status %d\n", who(0), who(pid), status); if (status) exit(status); } else { criu_restore_pid = getpid(); if (!nflag) { close_safe(new_socketfd[READ_FD]); move_fd(new_socketfd[WRITE_FD], CLASH_FD); /* --inherit-fd fd[CLASH_FD]:socket[xxxxxx] */ snprintf(inh_unixsk_opt, sizeof inh_unixsk_opt, "%s", INHERIT_FD_OPTION); snprintf(inh_unixsk_arg, sizeof inh_unixsk_arg, "fd[%d]:%s", CLASH_FD, old_sock_name); restore_argv[11] = inh_unixsk_opt; restore_argv[13] = NULL; } else restore_argv[11] = NULL; snprintf(buf, sizeof buf, "%s/%s", IMG_DIR, RESTORE_PID_FILE); unlink_safe(buf); execv_safe(CRIU_BINARY, restore_argv, 1); } } void write_to_fd(int fd, char *name, int i, int newline) { int n; char 
buf[16]; /* fit "hello d\n" for small d */ n = snprintf(buf, sizeof buf, "hello %d", i); printf("%s writing %s to %s via fd %d\n", who(0), buf, name, fd); if (newline) { buf[n++] = '\n'; buf[n] = '\0'; } write_safe(fd, buf, strlen(buf)); } void ls_proc_fd(int fd) { char cmd[128]; if (fd == -1) snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd", getpid()); else snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd/%d", getpid(), fd); printf("%s %s\n", who(0), cmd); system(cmd); } char *socket_name(int fd) { static char sock_name[64]; char path[64]; snprintf(path, sizeof path, "/proc/self/fd/%d", fd); if (readlink(path, sock_name, sizeof sock_name) == -1) die("readlink: path=%s", path); return sock_name; } ino_t socket_inode(int fd) { struct stat sbuf; if (fstat(fd, &sbuf) == -1) die("fstat: fd=%i", fd); return sbuf.st_ino; } /* * Use two buffers to support two calls to * this function in a printf argument list. */ char *who(pid_t pid) { static char pidstr1[64]; static char pidstr2[64]; static char *cp; char *np; char *ep; int p; p = pid ? pid : getpid(); if (p == parent_pid) { np = "parent"; ep = CS_PARENT; } else if (p == child_pid) { np = "child"; ep = CS_CHILD; } else if (p == criu_dump_pid) { np = "dump"; ep = CS_DUMP; } else if (p == criu_restore_pid) { np = "restore"; ep = CS_RESTORE; } else np = "???"; cp = (cp == pidstr1) ? pidstr2 : pidstr1; snprintf(cp, sizeof pidstr1, "%s[%s %d]", pid ? 
"" : ep, np, p); return cp; } void socketpair_safe(int socketfd[2]) { if (socketpair(AF_UNIX, SOCK_STREAM, 0, socketfd) == -1) die("socketpair %p", socketfd); } pid_t fork_safe(void) { pid_t pid; if ((pid = fork()) == -1) die("fork: pid=%d", pid); max_forks++; return pid; } void signal_safe(int signum, sighandler_t handler) { if (signal(signum, handler) == SIG_ERR) die("signal: signum=%d", signum); } int open_safe(char *pathname, int flags) { int fd; if ((fd = open(pathname, flags, 0777)) == -1) die("open: pathname=%s", pathname); return fd; } void close_safe(int fd) { if (close(fd) == -1) die("close: fd=%d", fd); } void write_safe(int fd, char *buf, int count) { if (write(fd, buf, count) != count) { die("write: fd=%d buf=\"%s\" count=%d errno=%d", fd, buf, count, errno); } } int read_safe(int fd, char *buf, int count) { int n; if ((n = read(fd, buf, count)) < 0) die("read: fd=%d count=%d", fd, count); buf[n] = '\0'; return n; } int dup_safe(int oldfd) { int newfd; if ((newfd = dup(oldfd)) == -1) die("dup: oldfd=%d", oldfd); return newfd; } int dup2_safe(int oldfd, int newfd) { if (dup2(oldfd, newfd) != newfd) die("dup2: oldfd=%d newfd=%d", oldfd, newfd); return newfd; } void move_fd(int oldfd, int newfd) { if (oldfd != newfd) { dup2_safe(oldfd, newfd); close_safe(oldfd); } } void mkdir_safe(char *dirname, int mode) { if (mkdir(dirname, mode) == -1 && errno != EEXIST) die("mkdir dirname=%s mode=0x%x\n", dirname, mode); } void unlink_safe(char *pathname) { if (unlink(pathname) == -1 && errno != ENOENT) { die("unlink: pathname=%s\n", pathname); } } void execv_safe(char *path, char *argv[], int ls) { int i; struct timespec req = { 0, 1000000 }; printf("\n%s ", who(0)); for (i = 0; argv[i] != NULL; i++) printf("%s ", argv[i]); printf("\n"); /* give parent a chance to wait for us */ while (nanosleep(&req, NULL)) ; if (vflag && ls) ls_proc_fd(-1); execv(path, argv); die("execv: path=%s", path); } pid_t waitpid_safe(pid_t pid, int *status, int options, int id) { pid_t p; 
p = waitpid(pid, status, options); if (p == -1) fprintf(stderr, "waitpid pid=%d id=%d %m\n", pid, id); return p; } void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5) { if (prctl(option, arg2, arg3, arg4, arg5) == -1) die("prctl: option=0x%x", option); } criu-3.6/test/others/tcp/000077500000000000000000000000001317335042600154015ustar00rootroot00000000000000criu-3.6/test/others/tcp/Makefile000066400000000000000000000001401317335042600170340ustar00rootroot00000000000000OBJS=cln srv all: $(OBJS) .PHONY: all run: all ./run.sh clean: rm -f $(OBJS) .PHONY: clean criu-3.6/test/others/tcp/cln.c000066400000000000000000000037351317335042600163310ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #define BUF_SIZE (1024) static char rbuf[BUF_SIZE]; static char buf[BUF_SIZE]; static int check_buf(int sk, char *buf, int count) { int rd, i; printf("Checking for %d bytes\n", count); rd = 0; while (rd < count) { int r; r = read(sk, rbuf + rd, count - rd); if (r == 0) { printf("Unexpected EOF\n"); return 1; } if (r < 0) { perror("Can't read buf"); return 1; } rd += r; } for (i = 0; i < count; i++) if (buf[i] != rbuf[i]) { printf("Mismatch on %d byte %d != %d\n", i, (int)buf[i], (int)rbuf[i]); return 1; } return 0; } static int serve_new_conn(int in_fd, int sk) { printf("New connection\n"); while (1) { int rd, wr; rd = read(in_fd, buf, sizeof(buf)); if (rd == 0) break; if (rd < 0) { perror("Can't read from infd"); return 1; } printf("Read %d bytes, sending to sock\n", rd); wr = 0; while (wr < rd) { int w; w = write(sk, buf + wr, rd - wr); if (w <= 0) { perror("Can't write to socket"); return 1; } if (check_buf(sk, buf + wr, w)) return 1; wr += w; } } printf("Done\n"); return 0; } int main(int argc, char **argv) { int sk, port, ret; struct sockaddr_in addr; if (argc < 3) { printf("Need addr, port and iters\n"); return -1; } sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if (sk < 0) { perror("Can't 
create socket"); return -1; } port = atoi(argv[2]); printf("Connecting to %s:%d\n", argv[1], port); memset(&addr, 0, sizeof(addr)); addr.sin_family = AF_INET; ret = inet_aton(argv[1], &addr.sin_addr); if (ret < 0) { perror("Can't convert addr"); return -1; } addr.sin_port = htons(port); ret = connect(sk, (struct sockaddr *)&addr, sizeof(addr)); if (ret < 0) { perror("Can't connect"); return -1; } return serve_new_conn(0, sk); } criu-3.6/test/others/tcp/run.sh000077500000000000000000000022741317335042600165510ustar00rootroot00000000000000#!/bin/bash source ../env.sh || exit 1 set -x PORT=12345 CLN_PIPE="./clnt_pipe" SRV_LOG="./srv.log" CLN_LOG="./cln.log" DDIR="dump" TEXT=$(hexdump -C /dev/urandom | head -n 1) echo "Building services" make clean && make || { echo "Failed to build"; exit 1; } rm -rf ${DDIR} ${SRV_LOG} ${CLN_LOG} ${CLN_PIPE} mkdir ${DDIR} echo "Starting server" setsid ./srv ${PORT} > ${SRV_LOG} 2>&1 & SRV_PID=${!} echo "Starting pipe" mkfifo ${CLN_PIPE} echo "Starting client" ./cln "127.0.0.1" ${PORT} < ${CLN_PIPE} > ${CLN_LOG} & CLN_PID=${!} exec 3>${CLN_PIPE} echo "Make it run" echo "${TEXT}" >&3 function fail { echo FAIL ( exec >&2 echo "$@" kill -9 ${CLN_PID} kill -9 ${SRV_PID} echo ${CLN_LOG}: cat ${CLN_LOG} ) exit 1 } kill -s 0 ${CLN_PID} || fail "Client is dead" echo "Suspend server" ${CRIU} dump -D ${DDIR} -o dump.log -t ${SRV_PID} --tcp-established -vvvv || fail "Fail to dump server" sleep 1 echo "Resume server" ${CRIU} restore -D ${DDIR} -o restore.log -d --tcp-established -vvvv --close 3 || fail "Fail to restore server" echo "Make client run again" echo "${TEXT}" >&3 echo "Collect results" exec 3>&- wait ${CLN_PID} || fail "Client exits abruptly" kill -9 ${SRV_PID} echo PASS criu-3.6/test/others/tcp/srv.c000066400000000000000000000032661317335042600163660ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include static int serve_new_conn(int sk) { int rd, wr; char buf[1024]; printf("New 
connection\n"); while (1) { rd = read(sk, buf, sizeof(buf)); if (!rd) break; if (rd < 0) { perror("Can't read socket"); return 1; } wr = 0; while (wr < rd) { int w; w = write(sk, buf + wr, rd - wr); if (w <= 0) { perror("Can't write socket"); return 1; } wr += w; } } printf("Done\n"); return 0; } int main(int argc, char **argv) { int sk, port, ret; struct sockaddr_in addr; if (argc < 2) { printf("Need port\n"); return -1; } /* * Let kids die themselves */ signal(SIGCHLD, SIG_IGN); sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if (sk < 0) { perror("Can't create socket"); return -1; } port = atoi(argv[1]); memset(&addr, 0, sizeof(addr)); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(INADDR_ANY); addr.sin_port = htons(port); printf("Binding to port %d\n", port); ret = bind(sk, (struct sockaddr *)&addr, sizeof(addr)); if (ret < 0) { perror("Can't bind socket"); return -1; } ret = listen(sk, 16); if (ret < 0) { perror("Can't put sock to listen"); return -1; } printf("Waiting for connections\n"); while (1) { int ask, pid; ask = accept(sk, NULL, NULL); if (ask < 0) { perror("Can't accept new conn"); return -1; } pid = fork(); if (pid < 0) { perror("Can't fork"); return -1; } if (pid > 0) close(ask); else { close(sk); ret = serve_new_conn(ask); exit(ret); } } } criu-3.6/test/others/unix-callback/000077500000000000000000000000001317335042600173305ustar00rootroot00000000000000criu-3.6/test/others/unix-callback/Makefile000066400000000000000000000012561317335042600207740ustar00rootroot00000000000000all: unix-lib.so unix-server unix-client syslog-lib.so run: all ./run.sh unix.pb-c.c: unix.proto protoc-c --proto_path=. --c_out=. 
unix.proto unix-lib.so: unix-lib.c unix.pb-c.c gcc -g -Werror -Wall -shared -nostartfiles unix-lib.c unix.pb-c.c -o unix-lib.so -iquote ../../../criu/include -fPIC syslog-lib.so: syslog-lib.c gcc -g -Werror -Wall -shared -nostartfiles syslog-lib.c -o syslog-lib.so -iquote ../../../criu/include -fPIC unix-server: unix-server.c gcc -Werror -Wall -o unix-server unix-server.c unix-client: unix-client.c gcc -Werror -Wall -o unix-client unix-client.c clean: rm -rf data unix-lib.so unix-server unix-client syslog-lib.so output pid unix.pb-c.* criu-3.6/test/others/unix-callback/lib/000077500000000000000000000000001317335042600200765ustar00rootroot00000000000000criu-3.6/test/others/unix-callback/lib/syslog-lib.so000077700000000000000000000000001317335042600253622../syslog-lib.soustar00rootroot00000000000000criu-3.6/test/others/unix-callback/lib/unix-lib.so000077700000000000000000000000001317335042600244702../unix-lib.soustar00rootroot00000000000000criu-3.6/test/others/unix-callback/run.sh000077500000000000000000000014711317335042600204760ustar00rootroot00000000000000#!/bin/bash -x cd `dirname $0` source ../env.sh || exit 1 rm -rf /tmp/criu.unix.callback.test* test -f pid && unlink pid test -f output && unlink output rm -rf data mkdir -p data ./unix-server & srv_pid=$! for i in `seq 20`; do test -f /tmp/criu.unix.callback.test && break sleep 0.1 done ( setsid ./unix-client < /dev/null &> output ) & while :; do test -f pid && break sleep 1 done pid=`cat pid` ${CRIU} dump -D data -o dump.log -v4 --lib `pwd`/lib -t $pid || exit 1 kill $srv_pid wait $srv_pid unlink /tmp/criu.unix.callback.test ./unix-server & srv_pid=$! 
for i in `seq 20`; do test -f /tmp/criu.unix.callback.test && break sleep 0.1 done ${CRIU} restore -D data -o restore.log -v4 --lib `pwd`/lib -d || exit 1 kill $pid while :; do cat output | grep PASS && break sleep 1 done cat output kill $srv_pid criu-3.6/test/others/unix-callback/syslog-lib.c000066400000000000000000000025141317335042600215620ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "criu-plugin.h" #include "criu-log.h" extern cr_plugin_dump_unix_sk_t cr_plugin_dump_unix_sk; extern cr_plugin_restore_unix_sk_t cr_plugin_restore_unix_sk; int cr_plugin_dump_unix_sk(int sk, int id) { struct sockaddr_un addr; socklen_t addr_len = sizeof(addr); char buf[4096]; int fd; if (getsockname(sk, (struct sockaddr *) &addr, &addr_len) < 0) return -1; if (strncmp(addr.sun_path, "/dev/log", addr_len - sizeof(addr.sun_family))) return -ENOTSUP; snprintf(buf, sizeof(buf), "syslog-%x.img", id); fd = open(buf, O_WRONLY | O_CREAT); if (fd < 0) return -1; close(fd); return 0; } int cr_plugin_restore_unix_sk(int id) { struct sockaddr_un addr; socklen_t addr_len; char buf[4096]; int sk, fd; snprintf(buf, sizeof(buf), "syslog-%x.img", id); fd = open(buf, O_RDONLY); if (fd < 0) return -ENOTSUP; close(fd); sk = socket(AF_FILE, SOCK_DGRAM|SOCK_CLOEXEC, 0); if (sk == -1) return sk; addr.sun_family = AF_FILE; addr_len = strlen("/dev/log"); strncpy(addr.sun_path, "/dev/log", addr_len); addr_len += sizeof(addr.sun_family); if (connect(sk, (struct sockaddr *) &addr, addr_len) == -1) { close(sk); return -1; } return sk; } criu-3.6/test/others/unix-callback/unix-client.c000066400000000000000000000040051317335042600217320ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #define SK_NAME "/tmp/criu.unix.callback.test" #define SK_NR 2 struct { int id; int sk; int val; } sks[SK_NR]; static int create_sock(int i) { int ret, id, sk, val = time(NULL) + i * 314; 
char buf[4096]; struct sockaddr_un addr; socklen_t addr_len; id = getpid() * 10 + i; sk = socket(AF_UNIX, SOCK_DGRAM, 0); if (sk < 0) return -1; addr.sun_family = AF_UNIX; addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%s%d", SK_NAME, id); addr_len += sizeof(addr.sun_family); if (bind(sk, (struct sockaddr *) &addr, addr_len) < 0) { perror("bind"); return 1; } addr.sun_family = AF_UNIX; addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); addr_len += sizeof(addr.sun_family); if (connect(sk, (struct sockaddr *) &addr, addr_len) < 0) { perror("connect"); return 1; } printf("init %d\n", val); ret = sprintf(buf, "t%d", val); if (send(sk, buf, ret, 0) < 0) { perror("send"); return -1; } sks[i].sk = sk; sks[i].val = val; return 0; } static int check_sock(int i) { int sk = sks[i].sk, val = sks[i].val; char buf[4096]; if (send(sk, "r", 1, 0) < 0) { perror("send(\"r\")"); return -1; } if (recv(sk, buf, sizeof(buf), 0) <= 0) { perror("recv"); return -1; } printf("%s - %d\n", buf, val); if (atoi(buf) != val) return -1; return 0; } int main() { int i, fd; sigset_t set; int sig; for (i = 0; i < SK_NR; i++) if (create_sock(i)) return -1; fd = open("pid", O_WRONLY | O_CREAT, 0666); if (fd < 0) return 1; dprintf(fd, "%d\n", getpid()); close(fd); openlog("test", LOG_NDELAY, LOG_USER ); sigemptyset(&set); sigaddset(&set, SIGTERM); sigprocmask(SIG_BLOCK, &set, NULL); sigwait(&set, &sig); syslog(LOG_CRIT, "test message"); for (i = 0; i < SK_NR; i++) if (check_sock(i)) return -1; printf("PASS\n"); return 0; } criu-3.6/test/others/unix-callback/unix-lib.c000066400000000000000000000074751317335042600212400ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "criu-plugin.h" #include "criu-log.h" #include "unix.pb-c.h" extern cr_plugin_init_t cr_plugin_init; extern cr_plugin_dump_unix_sk_t cr_plugin_dump_unix_sk; extern cr_plugin_restore_unix_sk_t cr_plugin_restore_unix_sk; #define SK_NAME 
"/tmp/criu.unix.callback.test" static int get_srv_socket(void) { struct sockaddr_un addr; socklen_t addr_len; int skd; skd = socket(AF_UNIX, SOCK_DGRAM, 0); if (skd < 0) { pr_perror("socket"); return -1; } addr.sun_family = AF_UNIX; addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%s.dump.%d", SK_NAME, getpid()); addr_len += sizeof(addr.sun_family); unlink(addr.sun_path); if (bind(skd, (struct sockaddr *) &addr, addr_len) < 0) { pr_perror("bind"); return 1; } addr.sun_family = AF_UNIX; addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); addr_len += sizeof(addr.sun_family); if (connect(skd, (struct sockaddr *) &addr, addr_len) < 0) { pr_perror("connect"); return -1; } return skd; } int cr_plugin_init(void) { return 0; } int cr_plugin_dump_unix_sk(int sk, int sk_id) { struct sockaddr_un addr; socklen_t addr_len = sizeof(addr); char buf[4096]; int skd, id, ret, fd, len; UnixTest e = UNIX_TEST__INIT; if (getpeername(sk, (struct sockaddr *) &addr, &addr_len)) { pr_perror("getpeername"); return -1; } len = addr_len - sizeof(addr.sun_family); if (addr.sun_path[len - 1] == 0) len--; if (len != strlen(SK_NAME) || strncmp(addr.sun_path, SK_NAME, strlen(SK_NAME))) return -ENOTSUP; pr_info("Dump the socket %x\n", sk_id); skd = get_srv_socket(); if (skd < 0) return -1; addr_len = sizeof(struct sockaddr_un); if (getsockname(sk, (struct sockaddr *) &addr, &addr_len) < 0) return -1; id = atoi(addr.sun_path + strlen(SK_NAME)); ret = sprintf(buf, "d%d", id) + 1; if (send(skd, buf, ret, 0) < 0) { pr_perror("send"); return -1; } if (recv(skd, buf, sizeof(buf), 0) <= 0) return -1; close(skd); e.val = atoi(buf); pr_err("%x: val %d\n", sk_id, e.val); e.name.data = (void *)addr.sun_path; e.name.len = addr_len - sizeof(addr.sun_family); snprintf(buf, sizeof(buf), "unix-test-%x.img", sk_id); fd = openat(criu_get_image_dir(), buf, O_WRONLY | O_CREAT, 0600); if (fd < 0) return -1; if (unix_test__get_packed_size(&e) > sizeof(buf)) { pr_err("%ld\n", unix_test__get_packed_size(&e)); 
return -1; } ret = unix_test__pack(&e, (uint8_t *) buf); if (write(fd, buf, ret) != ret) return -1; close(fd); return 0; } int cr_plugin_restore_unix_sk(int sk_id) { struct sockaddr_un addr; socklen_t addr_len; int fd, sk, ret; char buf[4096]; UnixTest *e; snprintf(buf, sizeof(buf), "unix-test-%x.img", sk_id); fd = openat(criu_get_image_dir(), buf, O_RDONLY, 0600); if (fd < 0) return -ENOTSUP; ret = read(fd, buf, sizeof(buf)); if (ret < 0) { pr_perror("read"); return -1; } close(fd); e = unix_test__unpack(NULL, ret, (uint8_t *) buf); if (e == NULL) return -1; sk = socket(AF_UNIX, SOCK_DGRAM, 0); if (sk < 0) { pr_perror("socket"); return -1; } addr.sun_family = AF_UNIX; memcpy(addr.sun_path, e->name.data, e->name.len); addr_len = sizeof(addr.sun_family) + e->name.len; if (bind(sk, (struct sockaddr *) &addr, addr_len) < 0) { pr_perror("bind"); return -1; } addr.sun_family = AF_UNIX; addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); addr_len += sizeof(addr.sun_family); if (connect(sk, (struct sockaddr *) &addr, addr_len) < 0) { pr_perror("connect"); return -1; } pr_err("id %d val %d\n", sk_id, e->val); ret = sprintf(buf, "t%d", e->val); if (send(sk, buf, ret, 0) < 0) { pr_perror("send"); return -1; } return sk; } criu-3.6/test/others/unix-callback/unix-server.c000066400000000000000000000037431317335042600217720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include struct ticket { struct ticket *next; int val; int id; }; struct ticket *tickets; #define SK_NAME "/tmp/criu.unix.callback.test" int main() { int sk, ret, id; char buf[4096]; struct ticket *t; struct sockaddr_un addr; socklen_t addr_len; struct stat st; unlink(SK_NAME); sk = socket(AF_UNIX, SOCK_DGRAM, 0); if (sk < 0) { perror("socket"); return -1; } addr.sun_family = AF_UNIX; addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); addr_len += sizeof(addr.sun_family); if (bind(sk, (struct sockaddr *) &addr, addr_len) < 0) { perror("bind"); return 1; 
} fstat(sk, &st); while (1) { addr_len = sizeof(struct sockaddr_un); ret = recvfrom(sk, buf, sizeof(buf), 0, (struct sockaddr *) &addr, &addr_len); if (ret == 0) return 0; if (ret < 0) { perror("recvfrom"); return 1; } id = 0; switch (buf[0]) { case 'l': ret = sprintf(buf, "%ld", st.st_ino); if (sendto(sk, buf, ret + 1, 0, (struct sockaddr *) &addr, addr_len) < 0) { perror("sendto"); return -1; } break; case 't': /* ticket */ t = malloc(sizeof(struct ticket)); if (t == 0) { perror("Can't allocate memory"); return 1; } t->val = atoi(buf + 1); t->next = tickets; t->id = atoi(addr.sun_path +strlen(SK_NAME)); printf("t: id %d val %d\n", t->id, t->val); tickets = t; break; case 'd': /* dump */ id = atoi(buf + 1); case 'r': /* request */ if (!id) id = atoi(addr.sun_path + strlen(SK_NAME)); for (t = tickets; t; t = t->next) if (t->id == id) break; if (t == NULL) return 1; printf("r: id %d val %d\n", id, t->val); ret = sprintf(buf, "%d", t->val); if (sendto(sk, buf, ret + 1, 0, (struct sockaddr *) &addr, addr_len) < 0) { perror("sendto"); return 1; } break; default: return -1; } } return 0; } criu-3.6/test/others/unix-callback/unix.proto000066400000000000000000000001361317335042600214000ustar00rootroot00000000000000syntax = "proto2"; message unix_test { required uint32 val = 1; required bytes name = 2; } criu-3.6/test/rpc_pb2.py000077700000000000000000000000001317335042600205152../lib/py/rpc_pb2.pyustar00rootroot00000000000000criu-3.6/test/show_action.sh000077500000000000000000000002031317335042600161560ustar00rootroot00000000000000#!/bin/bash echo "${CRTOOLS_SCRIPT_ACTION} ${CRTOOLS_IMAGE_DIR} ${CRTOOLS_INIT_PID}" \ >> "$(dirname $0)/actions_called.txt" criu-3.6/test/umount2.c000066400000000000000000000004031317335042600150610ustar00rootroot00000000000000#include #include int main(int argc, char *argv[]) { if (argc < 2) { fprintf(stderr, "umount PATH\n"); return 1; } if (umount2(argv[1], MNT_DETACH)) { fprintf(stderr, "umount %s: %m\n", argv[1]); return 1; } return 0; } 
criu-3.6/test/zdtm.desc000066400000000000000000000001711317335042600151240ustar00rootroot00000000000000{ 'dir': 'zdtm/', 'exclude': [ 'static/route_rules', 'static/criu-rtc.so', 'lib/parseargs.sh', 'lib/stop_and_chk.sh' ] } criu-3.6/test/zdtm.py000077500000000000000000001521471317335042600146540ustar00rootroot00000000000000#!/usr/bin/env python2 # vim: noet ts=8 sw=8 sts=8 import argparse import glob import os import subprocess import time import tempfile import shutil import re import stat import signal import atexit import sys import linecache import random import string import imp import fcntl import errno import datetime import yaml import criu as crpc os.chdir(os.path.dirname(os.path.abspath(__file__))) prev_line = None def traceit(f, e, a): if e == "line": lineno = f.f_lineno fil = f.f_globals["__file__"] if fil.endswith("zdtm.py"): global prev_line line = linecache.getline(fil, lineno) if line == prev_line: print " ..." else: prev_line = line print "+%4d: %s" % (lineno, line.rstrip()) return traceit # Root dir for ns and uns flavors. 
All tests # sit in the same dir tests_root = None def clean_tests_root(): global tests_root if tests_root and tests_root[0] == os.getpid(): os.rmdir(tests_root[1]) def make_tests_root(): global tests_root if not tests_root: tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", "/tmp")) atexit.register(clean_tests_root) return tests_root[1] # Report generation report_dir = None def init_report(path): global report_dir report_dir = path if not os.access(report_dir, os.F_OK): os.makedirs(report_dir) def add_to_report(path, tgt_name): global report_dir if report_dir: tgt_path = os.path.join(report_dir, tgt_name) att = 0 while os.access(tgt_path, os.F_OK): tgt_path = os.path.join(report_dir, tgt_name + ".%d" % att) att += 1 ignore = shutil.ignore_patterns('*.socket') if os.path.isdir(path): shutil.copytree(path, tgt_path, ignore = ignore) else: if not os.path.exists(os.path.dirname(tgt_path)): os.mkdir(os.path.dirname(tgt_path)) shutil.copy2(path, tgt_path) def add_to_output(path): global report_dir if not report_dir: return fdi = open(path, "r") fdo = open(os.path.join(report_dir, "output"), "a") while True: buf = fdi.read(1 << 20) if not buf: break fdo.write(buf) prev_crash_reports = set(glob.glob("/tmp/zdtm-core-*.txt")) def check_core_files(): reports = set(glob.glob("/tmp/zdtm-core-*.txt")) - prev_crash_reports if not reports: return False while subprocess.Popen("ps axf | grep 'abrt\.sh'", shell = True).wait() == 0: time.sleep(1) for i in reports: add_to_report(i, os.path.basename(i)) print_sep(i) print open(i).read() print_sep(i) return True # Arch we run on arch = os.uname()[4] # # Flavors # h -- host, test is run in the same set of namespaces as criu # ns -- namespaces, test is run in itw own set of namespaces # uns -- user namespace, the same as above plus user namespace # class host_flavor: def __init__(self, opts): self.name = "host" self.ns = False self.root = None def init(self, l_bins, x_bins): pass def fini(self): pass @staticmethod def clean(): 
pass class ns_flavor: __root_dirs = ["/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", "/dev/pts", "/dev/net", "/tmp", "/usr", "/proc"] def __init__(self, opts): self.name = "ns" self.ns = True self.uns = False self.root = make_tests_root() self.root_mounted = False def __copy_one(self, fname): tfname = self.root + fname if not os.access(tfname, os.F_OK): # Copying should be atomic as tests can be # run in parallel try: os.makedirs(self.root + os.path.dirname(fname)) except: pass dst = tempfile.mktemp(".tso", "", self.root + os.path.dirname(fname)) shutil.copy2(fname, dst) os.rename(dst, tfname) def __copy_libs(self, binary): ldd = subprocess.Popen(["ldd", binary], stdout = subprocess.PIPE) xl = re.compile('^(linux-gate.so|linux-vdso(64)?.so|not a dynamic)') # This Mayakovsky-style code gets list of libraries a binary # needs minus vdso and gate .so-s libs = map(lambda x: x[1] == '=>' and x[2] or x[0], map(lambda x: x.split(), filter(lambda x: not xl.match(x), map(lambda x: x.strip(), filter(lambda x: x.startswith('\t'), ldd.stdout.readlines()))))) ldd.wait() for lib in libs: if not os.access(lib, os.F_OK): raise test_fail_exc("Can't find lib %s required by %s" % (lib, binary)) self.__copy_one(lib) def __mknod(self, name, rdev = None): name = "/dev/" + name if not rdev: if not os.access(name, os.F_OK): print "Skipping %s at root" % name return else: rdev = os.stat(name).st_rdev name = self.root + name os.mknod(name, stat.S_IFCHR, rdev) os.chmod(name, 0666) def __construct_root(self): for dir in self.__root_dirs: os.mkdir(self.root + dir) os.chmod(self.root + dir, 0777) for ldir in ["/bin", "/sbin", "/lib", "/lib64"]: os.symlink(".." 
+ ldir, self.root + "/usr" + ldir) self.__mknod("tty", os.makedev(5, 0)) self.__mknod("null", os.makedev(1, 3)) self.__mknod("net/tun") self.__mknod("rtc") self.__mknod("autofs", os.makedev(10, 235)) def __copy_deps(self, deps): for d in deps.split('|'): if os.access(d, os.F_OK): self.__copy_one(d) self.__copy_libs(d) return raise test_fail_exc("Deps check %s failed" % deps) def init(self, l_bins, x_bins): subprocess.check_call(["mount", "--make-slave", "--bind", ".", self.root]) self.root_mounted = True if not os.access(self.root + "/.constructed", os.F_OK): with open(os.path.abspath(__file__)) as o: fcntl.flock(o, fcntl.LOCK_EX) if not os.access(self.root + "/.constructed", os.F_OK): print "Construct root for %s" % l_bins[0] self.__construct_root() os.mknod(self.root + "/.constructed", stat.S_IFREG | 0600) for b in l_bins: self.__copy_libs(b) for b in x_bins: self.__copy_deps(b) def fini(self): if self.root_mounted: subprocess.check_call(["./umount2", self.root]) self.root_mounted = False @staticmethod def clean(): for d in ns_flavor.__root_dirs: p = './' + d print 'Remove %s' % p if os.access(p, os.F_OK): shutil.rmtree('./' + d) if os.access('./.constructed', os.F_OK): os.unlink('./.constructed') class userns_flavor(ns_flavor): def __init__(self, opts): ns_flavor.__init__(self, opts) self.name = "userns" self.uns = True def init(self, l_bins, x_bins): # To be able to create roots_yard in CRIU os.chmod(".", os.stat(".").st_mode | 0077) ns_flavor.init(self, l_bins, x_bins) @staticmethod def clean(): pass flavors = {'h': host_flavor, 'ns': ns_flavor, 'uns': userns_flavor} # # Helpers # def encode_flav(f): return (flavors.keys().index(f) + 128) def decode_flav(i): try: return flavors.keys()[i - 128] except: return "unknown" def tail(path): p = subprocess.Popen(['tail', '-n1', path], stdout = subprocess.PIPE) out = p.stdout.readline() p.wait() return out def rpidfile(path): return open(path).readline().strip() def wait_pid_die(pid, who, tmo = 30): stime = 0.1 while 
stime < tmo: try: os.kill(int(pid), 0) except: # Died break print "Wait for %s(%d) to die for %f" % (who, pid, stime) time.sleep(stime) stime *= 2 else: subprocess.Popen(["ps", "-p", str(pid)]).wait() subprocess.Popen(["ps", "axf", str(pid)]).wait() raise test_fail_exc("%s die" % who) def test_flag(tdesc, flag): return flag in tdesc.get('flags', '').split() # # Exception thrown when something inside the test goes wrong, # e.g. test doesn't start, criu returns with non zero code or # test checks fail # class test_fail_exc: def __init__(self, step): self.step = step class test_fail_expected_exc: def __init__(self, cr_action): self.cr_action = cr_action # # A test from zdtm/ directory. # class zdtm_test: def __init__(self, name, desc, flavor, freezer): self.__name = name self.__desc = desc self.__freezer = None self.__make_action('cleanout') self.__pid = 0 self.__flavor = flavor self.__freezer = freezer self._bins = [name] self._env = {} self._deps = desc.get('deps', []) self.auto_reap = True self.__timeout = int(self.__desc.get('timeout') or 30) def __make_action(self, act, env = None, root = None): sys.stdout.flush() # Not to let make's messages appear before ours tpath = self.__name + '.' 
+ act s_args = ['make', '--no-print-directory', '-C', os.path.dirname(tpath), os.path.basename(tpath)] if env: env = dict(os.environ, **env) s = subprocess.Popen(s_args, env = env, cwd = root, close_fds = True, preexec_fn = self.__freezer and self.__freezer.attach or None) if act == "pid": try_run_hook(self, ["--post-start"]) s.wait() if self.__freezer: self.__freezer.freeze() def __pidfile(self): return self.__name + '.pid' def __wait_task_die(self): wait_pid_die(int(self.__pid), self.__name, self.__timeout) def __add_wperms(self): # Add write perms for .out and .pid files for b in self._bins: p = os.path.dirname(b) os.chmod(p, os.stat(p).st_mode | 0222) def start(self): self.__flavor.init(self._bins, self._deps) print "Start test" env = self._env if not self.__freezer.kernel: env['ZDTM_THREAD_BOMB'] = "5" if not test_flag(self.__desc, 'suid'): # Numbers should match those in criu env['ZDTM_UID'] = "18943" env['ZDTM_GID'] = "58467" env['ZDTM_GROUPS'] = "27495 48244" self.__add_wperms() else: print "Test is SUID" if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" if self.__flavor.uns: env['ZDTM_USERNS'] = "1" self.__add_wperms() if os.getenv("GCOV"): criu_dir = os.path.dirname(os.getcwd()) criu_dir_r = "%s%s" % (self.__flavor.root, criu_dir) env['ZDTM_CRIU'] = os.path.dirname(os.getcwd()) subprocess.check_call(["mkdir", "-p", criu_dir_r]) self.__make_action('pid', env, self.__flavor.root) try: os.kill(int(self.getpid()), 0) except: raise test_fail_exc("start") if not self.static(): # Wait less than a second to give the test chance to # move into some semi-random state time.sleep(random.random()) def kill(self, sig = signal.SIGKILL): self.__freezer.thaw() if self.__pid: print "Send the %d signal to %s" % (sig, self.__pid) os.kill(int(self.__pid), sig) self.gone(sig == signal.SIGKILL) self.__flavor.fini() def stop(self): self.__freezer.thaw() self.getpid() # 
Read the pid from pidfile back self.kill(signal.SIGTERM) res = tail(self.__name + '.out') if 'PASS' not in res.split(): if os.access(self.__name + '.out.inprogress', os.F_OK): print_sep(self.__name + '.out.inprogress') print open(self.__name + '.out.inprogress').read() print_sep(self.__name + '.out.inprogress') raise test_fail_exc("result check") def getpid(self): if self.__pid == 0: self.__pid = rpidfile(self.__pidfile()) return self.__pid def getname(self): return self.__name def __getcropts(self): opts = self.__desc.get('opts', '').split() + ["--pidfile", os.path.realpath(self.__pidfile())] if self.__flavor.ns: opts += ["--root", self.__flavor.root] if test_flag(self.__desc, 'crlib'): opts += ["-L", os.path.dirname(os.path.realpath(self.__name)) + '/lib'] return opts def getdopts(self): return self.__getcropts() + self.__freezer.getdopts() + self.__desc.get('dopts', '').split() def getropts(self): return self.__getcropts() + self.__freezer.getropts() + self.__desc.get('ropts', '').split() def gone(self, force = True): if not self.auto_reap: pid, status = os.waitpid(int(self.__pid), 0) if pid != int(self.__pid): raise test_fail_exc("kill pid mess") self.__wait_task_die() self.__pid = 0 if force: os.unlink(self.__pidfile()) def print_output(self): if os.access(self.__name + '.out', os.R_OK): print "Test output: " + "=" * 32 print open(self.__name + '.out').read() print " <<< " + "=" * 32 def static(self): return self.__name.split('/')[1] == 'static' def ns(self): return self.__flavor.ns def blocking(self): return test_flag(self.__desc, 'crfail') @staticmethod def available(): if not os.access("umount2", os.X_OK): subprocess.check_call(["make", "umount2"]) if not os.access("zdtm_ct", os.X_OK): subprocess.check_call(["make", "zdtm_ct"]) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) subprocess.check_call(["flock", "zdtm_mount_cgroups.lock", "./zdtm_mount_cgroups"]) class inhfd_test: def __init__(self, name, desc, 
flavor, freezer): self.__name = os.path.basename(name) print "Load %s" % name self.__fdtyp = imp.load_source(self.__name, name) self.__my_file = None self.__peer_pid = 0 self.__peer_file = None self.__peer_file_name = None self.__dump_opts = None def start(self): self.__message = "".join([random.choice(string.ascii_letters) for _ in range(16)]) (self.__my_file, peer_file) = self.__fdtyp.create_fds() # Check FDs returned for inter-connection self.__my_file.write(self.__message) self.__my_file.flush() if peer_file.read(16) != self.__message: raise test_fail_exc("FDs screwup") start_pipe = os.pipe() self.__peer_pid = os.fork() if self.__peer_pid == 0: os.setsid() getattr(self.__fdtyp, "child_prep", lambda fd: None)(peer_file) os.close(0) os.close(1) os.close(2) self.__my_file.close() os.close(start_pipe[0]) os.close(start_pipe[1]) try: data = peer_file.read(16) except: sys.exit(1) sys.exit(data == self.__message and 42 or 2) os.close(start_pipe[1]) os.read(start_pipe[0], 12) os.close(start_pipe[0]) self.__peer_file_name = self.__fdtyp.filename(peer_file) self.__dump_opts = self.__fdtyp.dump_opts(peer_file) def stop(self): self.__my_file.write(self.__message) self.__my_file.flush() pid, status = os.waitpid(self.__peer_pid, 0) if not os.WIFEXITED(status) or os.WEXITSTATUS(status) != 42: raise test_fail_exc("test failed with %d" % status) def kill(self): if self.__peer_pid: os.kill(self.__peer_pid, signal.SIGKILL) def getname(self): return self.__name def getpid(self): return "%s" % self.__peer_pid def gone(self, force = True): os.waitpid(self.__peer_pid, 0) wait_pid_die(self.__peer_pid, self.__name) self.__my_file = None self.__peer_file = None def getdopts(self): return self.__dump_opts def getropts(self): (self.__my_file, self.__peer_file) = self.__fdtyp.create_fds() return ["--restore-sibling", "--inherit-fd", "fd[%d]:%s" % (self.__peer_file.fileno(), self.__peer_file_name)] def print_output(self): pass def static(self): return True def blocking(self): return False 
@staticmethod def available(): pass class groups_test(zdtm_test): def __init__(self, name, desc, flavor, freezer): zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer) if flavor.ns: self.__real_name = name self.__subs = map(lambda x: x.strip(), open(name).readlines()) print "Subs:\n%s" % '\n'.join(self.__subs) else: self.__real_name = '' self.__subs = [] self._bins += self.__subs self._deps += get_test_desc('zdtm/lib/groups')['deps'] self._env = {'ZDTM_TESTS': self.__real_name} def __get_start_cmd(self, name): tdir = os.path.dirname(name) tname = os.path.basename(name) s_args = ['make', '--no-print-directory', '-C', tdir] subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout = subprocess.PIPE) cmd = s.stdout.readlines().pop().strip() s.wait() return 'cd /' + tdir + ' && ' + cmd def start(self): if (self.__subs): with open(self.__real_name + '.start', 'w') as f: for test in self.__subs: cmd = self.__get_start_cmd(test) f.write(cmd + '\n') with open(self.__real_name + '.stop', 'w') as f: for test in self.__subs: f.write('kill -TERM `cat /%s.pid`\n' % test) zdtm_test.start(self) def stop(self): zdtm_test.stop(self) for test in self.__subs: res = tail(test + '.out') if 'PASS' not in res.split(): raise test_fail_exc("sub %s result check" % test) test_classes = {'zdtm': zdtm_test, 'inhfd': inhfd_test, 'groups': groups_test} # # CRIU when launched using CLI # criu_bin = "../criu/criu" join_ns_file = '/run/netns/zdtm_netns' class criu_cli: @staticmethod def run(action, args, fault = None, strace = [], preexec = None, nowait = False): env = dict(os.environ, ASAN_OPTIONS = "log_path=asan.log:disable_coredump=0:detect_leaks=0") if fault: print "Forcing %s fault" % fault env['CRIU_FAULT'] = fault cr = subprocess.Popen(strace + [criu_bin, action] + args, env = env, preexec_fn = preexec) if nowait: return cr return cr.wait() class criu_rpc: @staticmethod def __set_opts(criu, args, ctx): while 
len(args) != 0: arg = args.pop(0) if arg == '-v4': criu.opts.log_level = 4 continue if arg == '-o': criu.opts.log_file = args.pop(0) continue if arg == '-D': criu.opts.images_dir_fd = os.open(args.pop(0), os.O_DIRECTORY) ctx['imgd'] = criu.opts.images_dir_fd continue if arg == '-t': criu.opts.pid = int(args.pop(0)) continue if arg == '--pidfile': ctx['pidf'] = args.pop(0) continue if arg == '--timeout': criu.opts.timeout = int(args.pop(0)) continue if arg == '--restore-detached': # Set by service by default ctx['rd'] = True continue if arg == '--root': criu.opts.root = args.pop(0) continue if arg == '--external': criu.opts.external.append(args.pop(0)) continue raise test_fail_exc('RPC for %s required' % arg) @staticmethod def run(action, args, fault = None, strace = [], preexec = None, nowait = False): if fault: raise test_fail_exc('RPC and FAULT not supported') if strace: raise test_fail_exc('RPC and SAT not supported') if preexec: raise test_fail_exc('RPC and PREEXEC not supported') if nowait: raise test_fail_exc("RPC and status-fd not supported") ctx = {} # Object used to keep info untill action is done criu = crpc.criu() criu.use_binary(criu_bin) criu_rpc.__set_opts(criu, args, ctx) try: if action == 'dump': criu.dump() elif action == 'restore': if 'rd' not in ctx: raise test_fail_exc('RPC Non-detached restore is impossible') res = criu.restore() pidf = ctx.get('pidf') if pidf: open(pidf, 'w').write('%d\n' % res.pid) else: raise test_fail_exc('RPC for %s required' % action) except crpc.CRIUExceptionExternal: print "Fail" ret = -1 else: ret = 0 imgd = ctx.get('imgd') if imgd: os.close(imgd) return ret class criu: def __init__(self, opts): self.__test = None self.__dump_path = None self.__iter = 0 self.__prev_dump_iter = None self.__page_server = (opts['page_server'] and True or False) self.__remote_lazy_pages = (opts['remote_lazy_pages'] and True or False) self.__lazy_pages = (self.__remote_lazy_pages or opts['lazy_pages'] and True or False) 
self.__restore_sibling = (opts['sibling'] and True or False) self.__join_ns = (opts['join_ns'] and True or False) self.__empty_ns = (opts['empty_ns'] and True or False) self.__fault = (opts['fault']) self.__script = opts['script'] self.__sat = (opts['sat'] and True or False) self.__dedup = (opts['dedup'] and True or False) self.__mdedup = (opts['noauto_dedup'] and True or False) self.__user = (opts['user'] and True or False) self.__leave_stopped = (opts['stop'] and True or False) self.__criu = (opts['rpc'] and criu_rpc or criu_cli) self.__lazy_pages_p = None self.__page_server_p = None def fini(self): if self.__lazy_pages_p: ret = self.__lazy_pages_p.wait() self.__lazy_pages_p = None if ret: raise test_fail_exc("criu lazy-pages exited with %s" % ret) if self.__page_server_p: ret = self.__page_server_p.wait() self.__page_server_p = None if ret: raise test_fail_exc("criu page-server exited with %s" % ret) return def logs(self): return self.__dump_path def set_test(self, test): self.__test = test self.__dump_path = "dump/" + test.getname() + "/" + test.getpid() if os.path.exists(self.__dump_path): for i in xrange(100): newpath = self.__dump_path + "." 
+ str(i) if not os.path.exists(newpath): os.rename(self.__dump_path, newpath) break else: raise test_fail_exc("couldn't find dump dir %s" % self.__dump_path) os.makedirs(self.__dump_path) def cleanup(self): if self.__dump_path: print "Removing %s" % self.__dump_path shutil.rmtree(self.__dump_path) def __ddir(self): return os.path.join(self.__dump_path, "%d" % self.__iter) def set_user_id(self): # Numbers should match those in zdtm_test os.setresgid(58467, 58467, 58467) os.setresuid(18943, 18943, 18943) def __criu_act(self, action, opts = [], log = None, nowait = False): if not log: log = action + ".log" s_args = ["-o", log, "-D", self.__ddir(), "-v4"] + opts with open(os.path.join(self.__ddir(), action + '.cropt'), 'w') as f: f.write(' '.join(s_args) + '\n') print "Run criu " + action strace = [] if self.__sat: fname = os.path.join(self.__ddir(), action + '.strace') print_fname(fname, 'strace') strace = ["strace", "-o", fname, '-T'] if action == 'restore': strace += ['-f'] s_args += ['--action-script', os.getcwd() + '/../scripts/fake-restore.sh'] if self.__script: s_args += ['--action-script', self.__script] if action == "restore": preexec = None else: preexec = self.__user and self.set_user_id or None __ddir = self.__ddir() status_fds = None if nowait: status_fds = os.pipe() s_args += ["--status-fd", str(status_fds[1])] ns_last_pid = open("/proc/sys/kernel/ns_last_pid").read() ret = self.__criu.run(action, s_args, self.__fault, strace, preexec, nowait) if nowait: os.close(status_fds[1]) if os.read(status_fds[0], 1) != '\0': ret = ret.wait() raise test_fail_exc("criu %s exited with %s" % (action, ret)) os.close(status_fds[0]) return ret grep_errors(os.path.join(__ddir, log)) if ret != 0: if self.__fault and int(self.__fault) < 128: try_run_hook(self.__test, ["--fault", action]) if action == "dump": # create a clean directory for images os.rename(__ddir, __ddir + ".fail") os.mkdir(__ddir) os.chmod(__ddir, 0777) else: # on restore we move only a log file, because we 
need images os.rename(os.path.join(__ddir, log), os.path.join(__ddir, log + ".fail")) # restore ns_last_pid to avoid a case when criu gets # PID of one of restored processes. open("/proc/sys/kernel/ns_last_pid", "w+").write(ns_last_pid) # try again without faults print "Run criu " + action ret = self.__criu.run(action, s_args, False, strace, preexec) grep_errors(os.path.join(__ddir, log)) if ret == 0: return if self.__test.blocking() or (self.__sat and action == 'restore'): raise test_fail_expected_exc(action) else: raise test_fail_exc("CRIU %s" % action) def dump(self, action, opts = []): self.__iter += 1 os.mkdir(self.__ddir()) os.chmod(self.__ddir(), 0777) a_opts = ["-t", self.__test.getpid()] if self.__prev_dump_iter: a_opts += ["--prev-images-dir", "../%d" % self.__prev_dump_iter, "--track-mem"] self.__prev_dump_iter = self.__iter if self.__page_server: print "Adding page server" ps_opts = ["--port", "12345"] if self.__dedup: ps_opts += ["--auto-dedup"] self.__page_server_p = self.__criu_act("page-server", opts = ps_opts, nowait = True) a_opts += ["--page-server", "--address", "127.0.0.1", "--port", "12345"] a_opts += self.__test.getdopts() if self.__dedup: a_opts += ["--auto-dedup"] a_opts += ["--timeout", "10"] criu_dir = os.path.dirname(os.getcwd()) if os.getenv("GCOV"): a_opts.append('--external') a_opts.append('mnt[%s]:zdtm' % criu_dir) if self.__leave_stopped: a_opts += ['--leave-stopped'] if self.__empty_ns: a_opts += ['--empty-ns', 'net'] self.__criu_act(action, opts = a_opts + opts) if self.__mdedup and self.__iter > 1: self.__criu_act("dedup", opts = []) if self.__leave_stopped: pstree_check_stopped(self.__test.getpid()) pstree_signal(self.__test.getpid(), signal.SIGKILL) if self.__page_server_p: ret = self.__page_server_p.wait() self.__page_server_p = None if ret: raise test_fail_exc("criu page-server exited with %d" % ret) def restore(self): r_opts = [] if self.__restore_sibling: r_opts = ["--restore-sibling"] self.__test.auto_reap = False r_opts 
+= self.__test.getropts() if self.__join_ns: r_opts.append("--join-ns") r_opts.append("net:%s" % join_ns_file) if self.__empty_ns: r_opts += ['--empty-ns', 'net'] r_opts += ['--action-script', os.getcwd() + '/empty-netns-prep.sh'] self.__prev_dump_iter = None criu_dir = os.path.dirname(os.getcwd()) if os.getenv("GCOV"): r_opts.append('--external') r_opts.append('mnt[zdtm]:%s' % criu_dir) if self.__lazy_pages: lp_opts = [] if self.__remote_lazy_pages: lp_opts += ['--page-server', "--port", "12345"] ps_opts = ["--pidfile", "ps.pid", "--port", "12345", "--lazy-pages"] self.__page_server_p = self.__criu_act("page-server", opts = ps_opts, nowait = True) self.__lazy_pages_p = self.__criu_act("lazy-pages", opts = lp_opts, nowait = True) r_opts += ["--lazy-pages"] if self.__leave_stopped: r_opts += ['--leave-stopped'] self.__criu_act("restore", opts = r_opts + ["--restore-detached"]) if self.__leave_stopped: pstree_check_stopped(self.__test.getpid()) pstree_signal(self.__test.getpid(), signal.SIGCONT) @staticmethod def check(feature): return criu_cli.run("check", ["-v0", "--feature", feature]) == 0 @staticmethod def available(): if not os.access(criu_bin, os.X_OK): print "CRIU binary not built" sys.exit(1) def kill(self): if self.__lazy_pages_p: self.__lazy_pages_p.terminate() print "criu lazy-pages exited with %s" & self.wait() self.__lazy_pages_p = None if self.__page_server_p: self.__page_server_p.terminate() print "criu page-server exited with %s" & self.wait() self.__page_server_p = None def try_run_hook(test, args): hname = test.getname() + '.hook' if os.access(hname, os.X_OK): print "Running %s(%s)" % (hname, ', '.join(args)) hook = subprocess.Popen([hname] + args) if hook.wait() != 0: raise test_fail_exc("hook " + " ".join(args)) # # Step by step execution # do_sbs = False def init_sbs(): if sys.stdout.isatty(): global do_sbs do_sbs = True else: print "Can't do step-by-step in this runtime" def sbs(what): if do_sbs: raw_input("Pause at %s. 
Press any key to continue." % what) # # Main testing entity -- dump (probably with pre-dumps) and restore # def iter_parm(opt, dflt): x = ((opt or str(dflt)) + ":0").split(':') return (xrange(0, int(x[0])), float(x[1])) def cr(cr_api, test, opts): if opts['nocr']: return cr_api.set_test(test) iters = iter_parm(opts['iters'], 1) for i in iters[0]: pres = iter_parm(opts['pre'], 0) for p in pres[0]: if opts['snaps']: cr_api.dump("dump", opts = ["--leave-running"]) else: cr_api.dump("pre-dump") try_run_hook(test, ["--post-pre-dump"]) time.sleep(pres[1]) sbs('pre-dump') os.environ["ZDTM_TEST_PID"] = str(test.getpid()) if opts['norst']: try_run_hook(test, ["--pre-dump"]) cr_api.dump("dump", opts = ["--leave-running"]) else: try_run_hook(test, ["--pre-dump"]) cr_api.dump("dump") test.gone() sbs('pre-restore') try_run_hook(test, ["--pre-restore"]) cr_api.restore() os.environ["ZDTM_TEST_PID"] = str(test.getpid()) os.environ["ZDTM_IMG_DIR"] = cr_api.logs() try_run_hook(test, ["--post-restore"]) sbs('post-restore') time.sleep(iters[1]) # Additional checks that can be done outside of test process def get_visible_state(test): maps = {} files = {} mounts = {} if not getattr(test, "static", lambda: False)() or \ not getattr(test, "ns", lambda: False)(): return ({}, {}, {}) r = re.compile('^[0-9]+$') pids = filter(lambda p: r.match(p), os.listdir("/proc/%s/root/proc/" % test.getpid())) for pid in pids: files[pid] = set(os.listdir("/proc/%s/root/proc/%s/fd" % (test.getpid(), pid))) cmaps = [[0, 0, ""]] last = 0 for mp in open("/proc/%s/root/proc/%s/maps" % (test.getpid(), pid)): m = map(lambda x: int('0x' + x, 0), mp.split()[0].split('-')) m.append(mp.split()[1]) f = "/proc/%s/root/proc/%s/map_files/%s" % (test.getpid(), pid, mp.split()[0]) if os.access(f, os.F_OK): st = os.lstat(f) m.append(oct(st.st_mode)) if cmaps[last][1] == m[0] and cmaps[last][2] == m[2]: cmaps[last][1] = m[1] else: cmaps.append(m) last += 1 maps[pid] = set(map(lambda x: '%x-%x %s' % (x[0], x[1], x[2:]), 
cmaps)) cmounts = [] try: r = re.compile("^\S+\s\S+\s\S+\s(\S+)\s(\S+)") for m in open("/proc/%s/root/proc/%s/mountinfo" % (test.getpid(), pid)): cmounts.append(r.match(m).groups()) except IOError, e: if e.errno != errno.EINVAL: raise e mounts[pid] = cmounts return files, maps, mounts def check_visible_state(test, state, opts): new = get_visible_state(test) for pid in state[0].keys(): fnew = new[0][pid] fold = state[0][pid] if fnew != fold: print "%s: Old files lost: %s" % (pid, fold - fnew) print "%s: New files appeared: %s" % (pid, fnew - fold) raise test_fail_exc("fds compare") old_maps = state[1][pid] new_maps = new[1][pid] if os.getenv("COMPAT_TEST"): # the vsyscall vma isn't unmapped from x32 processes vsyscall = "ffffffffff600000-ffffffffff601000 ['r-xp']" if vsyscall in new_maps and vsyscall not in old_maps: new_maps.remove(vsyscall) if old_maps != new_maps: print "%s: Old maps lost: %s" % (pid, old_maps - new_maps) print "%s: New maps appeared: %s" % (pid, new_maps - old_maps) if not opts['fault']: # skip parasite blob raise test_fail_exc("maps compare") old_mounts = state[2][pid] new_mounts = new[2][pid] for i in xrange(len(old_mounts)): m = old_mounts.pop(0) if m in new_mounts: new_mounts.remove(m) else: old_mounts.append(m) if old_mounts or new_mounts: print "%s: Old mounts lost: %s" % (pid, old_mounts) print "%s: New mounts appeared: %s" % (pid, new_mounts) raise test_fail_exc("mounts compare") if '--link-remap' in test.getdopts(): import glob link_remap_list = glob.glob(os.path.dirname(test.getname()) + '/link_remap*') if link_remap_list: print "%s: link-remap files left: %s" % (test.getname(), link_remap_list) raise test_fail_exc("link remaps left") class noop_freezer: def __init__(self): self.kernel = False def attach(self): pass def freeze(self): pass def thaw(self): pass def getdopts(self): return [] def getropts(self): return [] class cg_freezer: def __init__(self, path, state): self.__path = '/sys/fs/cgroup/freezer/' + path self.__state = state 
self.kernel = True def attach(self): if not os.access(self.__path, os.F_OK): os.makedirs(self.__path) with open(self.__path + '/tasks', 'w') as f: f.write('0') def __set_state(self, state): with open(self.__path + '/freezer.state', 'w') as f: f.write(state) def freeze(self): if self.__state.startswith('f'): self.__set_state('FROZEN') def thaw(self): if self.__state.startswith('f'): self.__set_state('THAWED') def getdopts(self): return ['--freeze-cgroup', self.__path, '--manage-cgroups'] def getropts(self): return ['--manage-cgroups'] def get_freezer(desc): if not desc: return noop_freezer() fd = desc.split(':') fr = cg_freezer(path = fd[0], state = fd[1]) return fr def cmp_ns(ns1, match, ns2, msg): ns1_ino = os.stat(ns1).st_ino ns2_ino = os.stat(ns2).st_ino if eval("%r %s %r" % (ns1_ino, match, ns2_ino)): print "%s match (%r %s %r) fail" % (msg, ns1_ino, match, ns2_ino) raise test_fail_exc("%s compare" % msg) def check_joinns_state(t): cmp_ns("/proc/%s/ns/net" % t.getpid(), "!=", join_ns_file, "join-ns") def pstree_each_pid(root_pid): f_children_path = "/proc/{0}/task/{0}/children".format(root_pid) child_pids = [] try: with open(f_children_path, "r") as f_children: pid_line = f_children.readline().strip(" \n") if pid_line: child_pids += pid_line.split(" ") except: return # process is dead yield root_pid for child_pid in child_pids: for pid in pstree_each_pid(child_pid): yield pid def is_proc_stopped(pid): def get_thread_status(thread_dir): try: with open(os.path.join(thread_dir, "status")) as f_status: for line in f_status.readlines(): if line.startswith("State:"): return line.split(":", 1)[1].strip().split(" ")[0] except: pass # process is dead return None def is_thread_stopped(status): return (status is None) or (status == "T") or (status == "Z") tasks_dir = "/proc/%s/task" % pid thread_dirs = [] try: thread_dirs = os.listdir(tasks_dir) except: pass # process is dead for thread_dir in thread_dirs: thread_status = get_thread_status(os.path.join(tasks_dir, 
thread_dir)) if not is_thread_stopped(thread_status): return False if not is_thread_stopped(get_thread_status("/proc/%s" % pid)): return False return True def pstree_check_stopped(root_pid): for pid in pstree_each_pid(root_pid): if not is_proc_stopped(pid): raise test_fail_exc("CRIU --leave-stopped %s" % pid) def pstree_signal(root_pid, signal): for pid in pstree_each_pid(root_pid): try: os.kill(int(pid), signal) except: pass # process is dead def do_run_test(tname, tdesc, flavs, opts): tcname = tname.split('/')[0] tclass = test_classes.get(tcname, None) if not tclass: print "Unknown test class %s" % tcname return if opts['report']: init_report(opts['report']) if opts['sbs']: init_sbs() fcg = get_freezer(opts['freezecg']) for f in flavs: print print_sep("Run %s in %s" % (tname, f)) if opts['dry_run']: continue flav = flavors[f](opts) t = tclass(tname, tdesc, flav, fcg) cr_api = criu(opts) try: t.start() s = get_visible_state(t) try: cr(cr_api, t, opts) except test_fail_expected_exc as e: if e.cr_action == "dump": t.stop() else: check_visible_state(t, s, opts) if opts['join_ns']: check_joinns_state(t) t.stop() cr_api.fini() try_run_hook(t, ["--clean"]) except test_fail_exc as e: print_sep("Test %s FAIL at %s" % (tname, e.step), '#') t.print_output() t.kill() cr_api.kill() try_run_hook(t, ["--clean"]) if cr_api.logs(): add_to_report(cr_api.logs(), tname.replace('/', '_') + "_" + f + "/images") if opts['keep_img'] == 'never': cr_api.cleanup() # When option --keep-going not specified this exit # does two things: exits from subprocess and aborts the # main script execution on the 1st error met sys.exit(encode_flav(f)) else: if opts['keep_img'] != 'always': cr_api.cleanup() print_sep("Test %s PASS" % tname) class launcher: def __init__(self, opts, nr_tests): self.__opts = opts self.__total = nr_tests self.__runtest = 0 self.__nr = 0 self.__max = int(opts['parallel'] or 1) self.__subs = {} self.__fail = False self.__file_report = None self.__junit_file = None 
self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: self.__use_log = True elif opts['report']: self.__use_log = True else: self.__use_log = False if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase from junit_xml import TestSuite, TestCase now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") junitreport = os.path.join(report_dir, "criu-testreport.xml") while os.access(reportname, os.F_OK) or os.access(junitreport, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) junitreport = os.path.join(report_dir, "criu-testreport" + ".%d.xml" % att) att += 1 self.__junit_file = open(junitreport, 'a') self.__junit_test_cases = [] self.__file_report = open(reportname, 'a') print >> self.__file_report, "TAP version 13" print >> self.__file_report, "# Hardware architecture: " + arch print >> self.__file_report, "# Timestamp: " + now.strftime("%Y-%m-%d %H:%M") + " (GMT+1)" print >> self.__file_report, "# " print >> self.__file_report, "1.." 
+ str(nr_tests) self.__taint = open("/proc/sys/kernel/tainted").read() if int(self.__taint, 0) != 0: print "The kernel is tainted: %r" % self.__taint if not opts["ignore_taint"]: raise Exception("The kernel is tainted: %r" % self.__taint) def __show_progress(self, msg): perc = self.__nr * 16 / self.__total print "=== Run %d/%d %s %s" % (self.__nr, self.__total, '=' * perc + '-' * (16 - perc), msg) def skip(self, name, reason): print "Skipping %s (%s)" % (name, reason) self.__nr += 1 self.__runtest += 1 self.__nr_skip += 1 if self.__junit_test_cases is not None: tc = TestCase(name) tc.add_skipped_info(reason) self.__junit_test_cases.append(tc) if self.__file_report: testline = "ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print >> self.__file_report, testline def run_test(self, name, desc, flavor): if len(self.__subs) >= self.__max: self.wait() taint = open("/proc/sys/kernel/tainted").read() if self.__taint != taint: raise Exception("The kernel is tainted: %r (%r)" % (taint, self.__taint)) if test_flag(desc, 'excl'): self.wait_all() self.__nr += 1 self.__show_progress(name) nd = ('nocr', 'norst', 'pre', 'iters', 'page_server', 'sibling', 'stop', 'empty_ns', 'fault', 'keep_img', 'report', 'snaps', 'sat', 'script', 'rpc', 'lazy_pages', 'join_ns', 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: logf = name.replace('/', '_') + ".log" log = open(logf, "w") else: logf = None log = None sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], env = dict(os.environ, CR_CT_TEST_INFO = arg), stdout = log, stderr = subprocess.STDOUT, close_fds = True) self.__subs[sub.pid] = {'sub': sub, 'log': logf, 'name': name, "start": time.time()} if test_flag(desc, 'excl'): self.wait() def __wait_one(self, flags): pid, status = os.waitpid(0, flags) self.__runtest += 1 if pid != 0: sub = self.__subs.pop(pid) tc = None if self.__junit_test_cases is not None: tc = 
TestCase(sub['name'], elapsed_sec=time.time() - sub['start']) self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) self.__failed.append([sub['name'], failed_flavor]) if self.__file_report: testline = "not ok %d - %s # flavor %s" % (self.__runtest, sub['name'], failed_flavor) output = open(sub['log']).read() details = {'output': output} tc.add_error_info(output = output) print >> self.__file_report, testline print >> self.__file_report, yaml.dump(details, explicit_start=True, explicit_end=True, default_style='|') if sub['log']: add_to_output(sub['log']) else: if self.__file_report: testline = "ok %d - %s" % (self.__runtest, sub['name']) print >> self.__file_report, testline if sub['log']: print open(sub['log']).read() os.unlink(sub['log']) return True return False def __wait_all(self): while self.__subs: self.__wait_one(0) def wait(self): self.__wait_one(0) while self.__subs: if not self.__wait_one(os.WNOHANG): break if self.__fail and not opts['keep_going']: raise test_fail_exc('') def wait_all(self): self.__wait_all() if self.__fail and not opts['keep_going']: raise test_fail_exc('') def finish(self): self.__wait_all() if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: ts = TestSuite(opts['title'], self.__junit_test_cases, os.getenv("NODE_NAME")) self.__junit_file.write(TestSuite.to_xml_string([ts])) self.__file_report.close() if opts['keep_going']: if self.__fail: print_sep("%d TEST(S) FAILED (TOTAL %d/SKIPPED %d)" % (len(self.__failed), self.__total, self.__nr_skip), "#") for failed in self.__failed: print " * %s(%s)" % (failed[0], failed[1]) else: print_sep("ALL TEST(S) PASSED (TOTAL %d/SKIPPED %d)" % (self.__total, self.__nr_skip), "#") if self.__fail: print_sep("FAIL", "#") sys.exit(1) def all_tests(opts): desc = eval(open(opts['set'] + '.desc').read()) files = [] mask = stat.S_IFREG | stat.S_IXUSR for d in os.walk(desc['dir']): for f in d[2]: fp = 
os.path.join(d[0], f) st = os.lstat(fp) if (st.st_mode & mask) != mask: continue if stat.S_IFMT(st.st_mode) in [stat.S_IFLNK, stat.S_IFSOCK]: continue files.append(fp) excl = map(lambda x: os.path.join(desc['dir'], x), desc['exclude']) tlist = filter(lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and x not in excl, map(lambda x: x.strip(), files) ) return tlist # Descriptor for abstract test not in list default_test = {} def get_test_desc(tname): d_path = tname + '.desc' if os.access(d_path, os.F_OK) and os.path.getsize(d_path) > 0: return eval(open(d_path).read()) return default_test def self_checkskip(tname): chs = tname + '.checkskip' if os.access(chs, os.X_OK): ch = subprocess.Popen([chs]) return not ch.wait() == 0 return False def print_fname(fname, typ): print "=[%s]=> %s" % (typ, fname) def print_sep(title, sep = "=", width = 80): print (" " + title + " ").center(width, sep) def print_error(line): line = line.rstrip() print line if line.endswith('>'): # combine pie output return True return False def grep_errors(fname): first = True print_next = False before = [] for l in open(fname): before.append(l) if len(before) > 5: before.pop(0) if "Error" in l: if first: print_fname(fname, 'log') print_sep("grep Error", "-", 60) first = False for i in before: print_next = print_error(i) before = [] else: if print_next: print_next = print_error(l) if not first: print_sep("ERROR OVER", "-", 60) def run_tests(opts): excl = None features = {} if opts['pre'] or opts['snaps']: if not criu.check("mem_dirty_track"): print "Tracking memory is not available" return if opts['all']: torun = all_tests(opts) run_all = True elif opts['tests']: r = re.compile(opts['tests']) torun = filter(lambda x: r.match(x), all_tests(opts)) run_all = True elif opts['test']: torun = opts['test'] run_all = False elif opts['from']: if not os.access(opts['from'], os.R_OK): print "No such file" return torun = map(lambda x: x.strip(), open(opts['from'])) opts['keep_going'] = False 
run_all = True else: print "Specify test with -t or -a" return if opts['keep_going'] and len(torun) < 2: print "[WARNING] Option --keep-going is more useful when running multiple tests" opts['keep_going'] = False if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") print "Compiled exclusion list" if opts['report']: init_report(opts['report']) if opts['parallel'] and opts['freezecg']: print "Parallel launch with freezer not supported" opts['parallel'] = None if opts['join_ns']: if subprocess.Popen(["ip", "netns", "add", "zdtm_netns"]).wait(): raise Exception("Unable to create a network namespace") if subprocess.Popen(["ip", "netns", "exec", "zdtm_netns", "ip", "link", "set", "up", "dev", "lo"]).wait(): raise Exception("ip link set up dev lo") if opts['lazy_pages'] or opts['remote_lazy_pages']: uffd = criu.check("uffd") uffd_noncoop = criu.check("uffd-noncoop") if not uffd: raise Exception("UFFD is not supported, cannot run with --lazy-pages") if not uffd_noncoop: # Most tests will work with 4.3 - 4.11 print "[WARNING] Non-cooperative UFFD is missing, some tests might spuriously fail" l = launcher(opts, len(torun)) try: for t in torun: global arch if excl and excl.match(t): l.skip(t, "exclude") continue tdesc = get_test_desc(t) if tdesc.get('arch', arch) != arch: l.skip(t, "arch %s" % tdesc['arch']) continue if test_flag(tdesc, 'reqrst') and opts['norst']: l.skip(t, "restore stage is required") continue if run_all and test_flag(tdesc, 'noauto'): l.skip(t, "manual run only") continue feat = tdesc.get('feature', None) if feat: if feat not in features: print "Checking feature %s" % feat features[feat] = criu.check(feat) if not features[feat]: l.skip(t, "no %s feature" % feat) continue if self_checkskip(t): l.skip(t, "checkskip failed") continue if opts['user']: if test_flag(tdesc, 'suid'): l.skip(t, "suid test in user mode") continue if test_flag(tdesc, 'nouser'): l.skip(t, "criu root prio needed") continue if opts['join_ns']: if 
test_flag(tdesc, 'samens'): l.skip(t, "samens test in the same namespace") continue if opts['lazy_pages'] or opts['remote_lazy_pages']: if test_flag(tdesc, 'nolazy'): l.skip(t, "lazy pages are not supported") continue if opts['remote_lazy_pages']: if test_flag(tdesc, 'noremotelazy'): l.skip(t, "remote lazy pages are not supported") continue test_flavs = tdesc.get('flavor', 'h ns uns').split() opts_flavs = (opts['flavor'] or 'h,ns,uns').split(',') if opts_flavs != ['best']: run_flavs = set(test_flavs) & set(opts_flavs) else: run_flavs = set([test_flavs.pop()]) if not criu.check("userns"): run_flavs -= set(['uns']) if opts['user']: # FIXME -- probably uns will make sense run_flavs -= set(['ns', 'uns']) # remove ns and uns flavor in join_ns if opts['join_ns']: run_flavs -= set(['ns', 'uns']) if opts['empty_ns']: run_flavs -= set(['h']) if run_flavs: l.run_test(t, tdesc, run_flavs) else: l.skip(t, "no flavors") finally: l.finish() if opts['join_ns']: subprocess.Popen(["ip", "netns", "delete", "zdtm_netns"]).wait() sti_fmt = "%-40s%-10s%s" def show_test_info(t): tdesc = get_test_desc(t) flavs = tdesc.get('flavor', '') return sti_fmt % (t, flavs, tdesc.get('flags', '')) def list_tests(opts): tlist = all_tests(opts) if opts['info']: print sti_fmt % ('Name', 'Flavors', 'Flags') tlist = map(lambda x: show_test_info(x), tlist) print '\n'.join(tlist) class group: def __init__(self, tname, tdesc): self.__tests = [tname] self.__desc = tdesc self.__deps = set() def __is_mergeable_desc(self, desc): # For now make it full match if self.__desc.get('flags') != desc.get('flags'): return False if self.__desc.get('flavor') != desc.get('flavor'): return False if self.__desc.get('arch') != desc.get('arch'): return False if self.__desc.get('opts') != desc.get('opts'): return False if self.__desc.get('feature') != desc.get('feature'): return False return True def merge(self, tname, tdesc): if not self.__is_mergeable_desc(tdesc): return False self.__deps |= set(tdesc.get('deps', [])) 
self.__tests.append(tname) return True def size(self): return len(self.__tests) # common method to write a "meta" auxiliary script (hook/checkskip) # which will call all tests' scripts in turn def __dump_meta(self, fname, ext): scripts = filter(lambda names: os.access(names[1], os.X_OK), map(lambda test: (test, test + ext), self.__tests)) if scripts: f = open(fname + ext, "w") f.write("#!/bin/sh -e\n") for test, script in scripts: f.write("echo 'Running %s for %s'\n" % (ext, test)) f.write('%s "$@"\n' % script) f.write("echo 'All %s scripts OK'\n" % ext) f.close() os.chmod(fname + ext, 0700) def dump(self, fname): f = open(fname, "w") for t in self.__tests: f.write(t + '\n') f.close() os.chmod(fname, 0700) if len(self.__desc) or len(self.__deps): f = open(fname + '.desc', "w") if len(self.__deps): self.__desc['deps'] = list(self.__deps) f.write(repr(self.__desc)) f.close() # write "meta" .checkskip and .hook scripts self.__dump_meta(fname, '.checkskip') self.__dump_meta(fname, '.hook') def group_tests(opts): excl = None groups = [] pend_groups = [] maxs = int(opts['max_size']) if not os.access("groups", os.F_OK): os.mkdir("groups") tlist = all_tests(opts) random.shuffle(tlist) if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") print "Compiled exclusion list" for t in tlist: if excl and excl.match(t): continue td = get_test_desc(t) for g in pend_groups: if g.merge(t, td): if g.size() == maxs: pend_groups.remove(g) groups.append(g) break else: g = group(t, td) pend_groups.append(g) groups += pend_groups nr = 0 suf = opts['name'] or 'group' for g in groups: if maxs > 1 and g.size() == 1: # Not much point in group test for this continue fn = os.path.join("groups", "%s.%d" % (suf, nr)) g.dump(fn) nr += 1 print "Generated %d group(s)" % nr def clean_stuff(opts): print "Cleaning %s" % opts['what'] if opts['what'] == 'nsroot': for f in flavors: f = flavors[f] f.clean() # # main() starts here # if 'CR_CT_TEST_INFO' in os.environ: # Fork here, 
since we're new pidns init and are supposed to # collect this namespace's zombies status = 0 pid = os.fork() if pid == 0: tinfo = eval(os.environ['CR_CT_TEST_INFO']) do_run_test(tinfo[0], tinfo[1], tinfo[2], tinfo[3]) else: while True: wpid, status = os.wait() if wpid == pid: if os.WIFEXITED(status): status = os.WEXITSTATUS(status) else: status = 1 break sys.exit(status) p = argparse.ArgumentParser("CRIU test suite") p.add_argument("--debug", help = "Print what's being executed", action = 'store_true') p.add_argument("--set", help = "Which set of tests to use", default = 'zdtm') sp = p.add_subparsers(help = "Use --help for list of actions") rp = sp.add_parser("run", help = "Run test(s)") rp.set_defaults(action = run_tests) rp.add_argument("-a", "--all", action = 'store_true') rp.add_argument("-t", "--test", help = "Test name", action = 'append') rp.add_argument("-T", "--tests", help = "Regexp") rp.add_argument("-F", "--from", help = "From file") rp.add_argument("-f", "--flavor", help = "Flavor to run") rp.add_argument("-x", "--exclude", help = "Exclude tests from --all run", action = 'append') rp.add_argument("--sibling", help = "Restore tests as siblings", action = 'store_true') rp.add_argument("--join-ns", help = "Restore tests and join existing namespace", action = 'store_true') rp.add_argument("--empty-ns", help = "Restore tests in empty net namespace", action = 'store_true') rp.add_argument("--pre", help = "Do some pre-dumps before dump (n[:pause])") rp.add_argument("--snaps", help = "Instead of pre-dumps do full dumps", action = 'store_true') rp.add_argument("--dedup", help = "Auto-deduplicate images on iterations", action = 'store_true') rp.add_argument("--noauto-dedup", help = "Manual deduplicate images on iterations", action = 'store_true') rp.add_argument("--nocr", help = "Do not CR anything, just check test works", action = 'store_true') rp.add_argument("--norst", help = "Don't restore tasks, leave them running after dump", action = 'store_true') 
rp.add_argument("--stop", help = "Check that --leave-stopped option stops ps tree.", action = 'store_true') rp.add_argument("--iters", help = "Do CR cycle several times before check (n[:pause])") rp.add_argument("--fault", help = "Test fault injection") rp.add_argument("--sat", help = "Generate criu strace-s for sat tool (restore is fake, images are kept)", action = 'store_true') rp.add_argument("--sbs", help = "Do step-by-step execution, asking user for keypress to continue", action = 'store_true') rp.add_argument("--freezecg", help = "Use freeze cgroup (path:state)") rp.add_argument("--user", help = "Run CRIU as regular user", action = 'store_true') rp.add_argument("--rpc", help = "Run CRIU via RPC rather than CLI", action = 'store_true') rp.add_argument("--page-server", help = "Use page server dump", action = 'store_true') rp.add_argument("-p", "--parallel", help = "Run test in parallel") rp.add_argument("--dry-run", help="Don't run tests, just pretend to", action='store_true') rp.add_argument("--script", help="Add script to get notified by criu") rp.add_argument("-k", "--keep-img", help = "Whether or not to keep images after test", choices = ['always', 'never', 'failed'], default = 'failed') rp.add_argument("--report", help = "Generate summary report in directory") rp.add_argument("--keep-going", help = "Keep running tests in spite of failures", action = 'store_true') rp.add_argument("--ignore-taint", help = "Don't care about a non-zero kernel taint flag", action = 'store_true') rp.add_argument("--lazy-pages", help = "restore pages on demand", action = 'store_true') rp.add_argument("--remote-lazy-pages", help = "simulate lazy migration", action = 'store_true') rp.add_argument("--title", help = "A test suite title", default = "criu") lp = sp.add_parser("list", help = "List tests") lp.set_defaults(action = list_tests) lp.add_argument('-i', '--info', help = "Show more info about tests", action = 'store_true') gp = sp.add_parser("group", help = "Generate groups") 
gp.set_defaults(action = group_tests) gp.add_argument("-m", "--max-size", help = "Maximum number of tests in group") gp.add_argument("-n", "--name", help = "Common name for group tests") gp.add_argument("-x", "--exclude", help = "Exclude tests from --all run", action = 'append') cp = sp.add_parser("clean", help = "Clean something") cp.set_defaults(action = clean_stuff) cp.add_argument("what", choices = ['nsroot']) opts = vars(p.parse_args()) if opts.get('sat', False): opts['keep_img'] = 'always' if opts['debug']: sys.settrace(traceit) criu.available() for tst in test_classes.values(): tst.available() opts['action'](opts) criu-3.6/test/zdtm/000077500000000000000000000000001317335042600142655ustar00rootroot00000000000000criu-3.6/test/zdtm/.gitignore000066400000000000000000000002351317335042600162550ustar00rootroot00000000000000/lib/libzdtmtst.a /lib/.gitignore /static/.gitignore /transition/.gitignore *.pid *.pidns *.out *.outns *.out.external *.inprogress *.test *.test.* *.state criu-3.6/test/zdtm/Makefile000066400000000000000000000003031317335042600157210ustar00rootroot00000000000000SUBDIRS := lib static transition all: $(SUBDIRS) .PHONY: all $(SUBDIRS) $(SUBDIRS): $(MAKE) -C $@ all static: lib transition: lib %: set -e; for d in $(SUBDIRS); do $(MAKE) -C $$d $@; done criu-3.6/test/zdtm/Makefile.inc000066400000000000000000000036131317335042600165000ustar00rootroot00000000000000.SUFFIXES: MAKEFLAGS += -r ARCH ?= $(shell uname -m | sed \ -e s/i.86/x86/ \ -e s/x86_64/x86/ \ -e s/sun4u/sparc64/ \ -e s/arm.*/arm/ \ -e s/sa110/arm/ \ -e s/s390x/s390/ \ -e s/parisc64/parisc/ \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ -e s/aarch64.*/arm64/) ifeq ($(ARCH),arm64) ARCH ?= aarch64 SRCARCH ?= aarch64 endif SRCARCH ?= $(ARCH) CC := gcc CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += $(USERCFLAGS) CFLAGS += -D_GNU_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(SRCARCH)/include ifeq ($(strip $(V)),) E = @echo Q = @ else E = @\# Q = 
endif RM := rm -f --one-file-system ifeq ($(COMPAT_TEST),y) ifeq ($(ARCH),x86) export CFLAGS += -m32 export LDFLAGS += -m32 endif endif DEPEND.c = $(COMPILE.c) -MM -MP %.d: %.c $(E) " DEP " $*.d $(Q)$(DEPEND.c) $(OUTPUT_OPTION) $< %.o: %.c | %.d $(E) " CC " $@ $(Q)$(COMPILE.c) $(OUTPUT_OPTION) $< %: %.o $(LDLIBS) @echo $@ >> .gitignore $(E) " LINK " $@ $(Q)$(CC) $(LDFLAGS) $^ $(LDLIBS) -o $@ default: all @true .PHONY: default gitignore-clean: $(RM) .gitignore .PHONY: gitignore-clean clean: gitignore-clean $(RM) $(OBJ) $(TST) *~ .PHONY: clean cleandep: clean $(RM) $(DEP) .PHONY: cleandep cleanout: $(RM) -r *.pid *.out* *.test* *.state .PHONY: cleanout %.cleanout: % $(Q) $(RM) -r $<.pid* $<.out* *$<.test* $<.*.test $<.*.state $<.state chew_$<.test* realclean: cleandep cleanout .PHONY: realclean dep: $(DEP) .PHONY: dep no-deps-targets := clean cleandep cleanout realclean groups.cleanout ifeq ($(filter $(no-deps-targets), $(MAKECMDGOALS)),) -include $(wildcard $(DEP)) endif .SECONDARY: criu-3.6/test/zdtm/lib/000077500000000000000000000000001317335042600150335ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/Makefile000066400000000000000000000007351317335042600165000ustar00rootroot00000000000000LIBDIR := . 
CFLAGS += $(USERCFLAGS) LIB := libzdtmtst.a LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c fs.c LIBOBJ := $(LIBSRC:%.c=%.o) BIN := groups SRC := $(LIBSRC) groups.c DEP := $(SRC:%.c=%.d) OBJ := $(SRC:%.c=%.o) LDLIBS := $(LIB) TARGETS := $(LIB) $(BIN) include ../Makefile.inc all: $(TARGETS) .PHONY: all clean-more: $(RM) $(TARGETS) .PHONY: clean-more clean: clean-more $(LIB): $(LIBOBJ) $(E) " AR " $@ $(Q)ar rcs $@ $^ criu-3.6/test/zdtm/lib/arch/000077500000000000000000000000001317335042600157505ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/aarch64/000077500000000000000000000000001317335042600172005ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/aarch64/include/000077500000000000000000000000001317335042600206235ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/aarch64/include/asm/000077500000000000000000000000001317335042600214035ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/aarch64/include/asm/atomic.h000066400000000000000000000027551317335042600230410ustar00rootroot00000000000000#ifndef __CR_ATOMIC_H__ #define __CR_ATOMIC_H__ typedef uint32_t atomic_t; /* Copied from the Linux header arch/arm/include/asm/barrier.h */ #define smp_mb() asm volatile("dmb ish" : : : "memory") /* Copied from the Linux kernel header arch/arm64/include/asm/atomic.h */ static inline int atomic_read(const atomic_t *v) { return (*(volatile int *)v); } static inline void atomic_set(atomic_t *v, int i) { *v = i; } #define atomic_get atomic_read static inline int atomic_add_return(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile( "1: ldxr %w0, %2\n" " add %w0, %w0, %w3\n" " stlxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" (tmp), "+Q" (v) : "Ir" (i) : "cc", "memory"); smp_mb(); return result; } static inline int atomic_sub_return(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile( "1: ldxr %w0, %2\n" " sub %w0, %w0, %w3\n" " stlxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" 
(tmp), "+Q" (v) : "Ir" (i) : "cc", "memory"); smp_mb(); return result; } static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } /* true if the result is 0, or false for all other cases. */ #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_inc_return(v) (atomic_add_return(1, v)) #endif /* __CR_ATOMIC_H__ */ criu-3.6/test/zdtm/lib/arch/arm/000077500000000000000000000000001317335042600165275ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/arm/include/000077500000000000000000000000001317335042600201525ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/arm/include/asm/000077500000000000000000000000001317335042600207325ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/arm/include/asm/atomic.h000066400000000000000000000027101317335042600223570ustar00rootroot00000000000000#ifndef __CR_ATOMIC_H__ #define __CR_ATOMIC_H__ typedef uint32_t atomic_t; /* Copied from the Linux kernel header arch/arm/include/asm/atomic.h */ #define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory") #define atomic_set(mem,v) (*(mem) = (v)) #define atomic_get(v) (*(volatile uint32_t *)v) static inline unsigned int atomic_add_return(int i, atomic_t *v) { unsigned long tmp; unsigned int result; smp_mb(); __asm__ __volatile__("@ atomic_add_return\n" "1: ldrex %0, [%3]\n" " add %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b\n" : "=&r" (result), "=&r" (tmp), "+Qo" (v) : "r" (&v), "Ir" (i) : "cc"); smp_mb(); return result; } static inline unsigned int atomic_sub_return(int i, atomic_t *v) { unsigned long tmp; int result; smp_mb(); __asm__ __volatile__("@ atomic_sub_return\n" "1: ldrex %0, [%3]\n" " sub %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b\n" : "=&r" (result), "=&r" (tmp), "+Qo" (v) : "r" (&v), "Ir" (i) : "cc"); 
smp_mb(); return result; } static inline unsigned int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } static inline unsigned int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } static inline unsigned int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } /* true if the result is 0, or false for all other cases. */ #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #endif /* __CR_ATOMIC_H__ */ criu-3.6/test/zdtm/lib/arch/ppc64/000077500000000000000000000000001317335042600167045ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/ppc64/include/000077500000000000000000000000001317335042600203275ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/ppc64/include/asm/000077500000000000000000000000001317335042600211075ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/ppc64/include/asm/atomic.h000066400000000000000000000031501317335042600225330ustar00rootroot00000000000000#ifndef __CR_ATOMIC_H__ #define __CR_ATOMIC_H__ /* * PowerPC atomic operations * * Copied from kernel header file arch/powerpc/include/asm/atomic.h */ typedef uint32_t atomic_t; #define PPC_ATOMIC_ENTRY_BARRIER "lwsync \n" #define PPC_ATOMIC_EXIT_BARRIER "sync \n" #define ATOMIC_INIT(i) { (i) } static __inline__ int atomic_get(const atomic_t *v) { int t; __asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(*v)); return t; } static __inline__ void atomic_set(atomic_t *v, int i) { __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(*v) : "r"(i)); } #define ATOMIC_OP(op, asm_op) \ static __inline__ void atomic_##op(int a, atomic_t *v) \ { \ int t; \ \ __asm__ __volatile__( \ "1: lwarx %0,0,%3 # atomic_" #op "\n" \ #asm_op " %0,%2,%0\n" \ " stwcx. 
%0,0,%3 \n" \ " bne- 1b\n" \ : "=&r" (t), "+m" (*v) \ : "r" (a), "r" (v) \ : "cc"); \ } \ ATOMIC_OP(add, add) ATOMIC_OP(sub, subf) #undef ATOMIC_OP static __inline__ int atomic_inc_return(atomic_t *v) { int t; __asm__ __volatile__( PPC_ATOMIC_ENTRY_BARRIER \ "1: lwarx %0,0,%1 # atomic_inc_return\n\ addic %0,%0,1\n" " stwcx. %0,0,%1 \n\ bne- 1b \n" \ PPC_ATOMIC_EXIT_BARRIER : "=&r" (t) : "r" (v) : "cc", "xer", "memory"); return t; } static __inline__ int atomic_inc(atomic_t *v) { return atomic_inc_return(v) - 1; } static __inline__ void atomic_dec(atomic_t *v) { int t; __asm__ __volatile__( "1: lwarx %0,0,%2 # atomic_dec\n\ addic %0,%0,-1\n" " stwcx. %0,0,%2\n\ bne- 1b" : "=&r" (t), "+m" (*v) : "r" (v) : "cc", "xer"); } #endif /* __CR_ATOMIC_H__ */ criu-3.6/test/zdtm/lib/arch/s390/000077500000000000000000000000001317335042600164465ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/s390/include/000077500000000000000000000000001317335042600200715ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/s390/include/asm/000077500000000000000000000000001317335042600206515ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/s390/include/asm/atomic.h000066400000000000000000000026151317335042600223020ustar00rootroot00000000000000#ifndef __ARCH_S390_ATOMIC__ #define __ARCH_S390_ATOMIC__ #include typedef uint32_t atomic_t; #define __ATOMIC_OP(op_name, op_type, op_string) \ static inline op_type op_name(op_type val, op_type *ptr) \ { \ op_type old, new; \ \ asm volatile( \ "0: lr %[new],%[old]\n" \ op_string " %[new],%[val]\n" \ " cs %[old],%[new],%[ptr]\n" \ " jl 0b" \ : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ return old; \ } #define __ATOMIC_OPS(op_name, op_type, op_string) \ __ATOMIC_OP(op_name, op_type, op_string) \ __ATOMIC_OP(op_name##_barrier, op_type, op_string) __ATOMIC_OPS(__atomic_add, uint32_t, "ar") #undef __ATOMIC_OPS #undef __ATOMIC_OP static inline int atomic_get(const atomic_t *v) { 
int c; asm volatile( " l %0,%1\n" : "=d" (c) : "Q" (*v)); return c; } static inline void atomic_set(atomic_t *v, int i) { asm volatile( " st %1,%0\n" : "=Q" (*v) : "d" (i)); } static inline int atomic_add_return(int i, atomic_t *v) { return __atomic_add_barrier(i, v) + i; } static inline void atomic_add(int i, atomic_t *v) { __atomic_add(i, v); } #define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v) static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } #define atomic_dec(_v) atomic_sub(1, _v) #endif /* __ARCH_S390_ATOMIC__ */ criu-3.6/test/zdtm/lib/arch/x86/000077500000000000000000000000001317335042600163755ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/x86/include/000077500000000000000000000000001317335042600200205ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/x86/include/asm/000077500000000000000000000000001317335042600206005ustar00rootroot00000000000000criu-3.6/test/zdtm/lib/arch/x86/include/asm/atomic.h000066400000000000000000000017101317335042600222240ustar00rootroot00000000000000#ifndef ATOMIC_H__ #define ATOMIC_H__ #define atomic_set(mem, v) \ ({ \ asm volatile ("lock xchg %0, %1\n" \ : "+r" (v), "+m" (*mem) \ : \ : "cc", "memory"); \ }) #define atomic_get(mem) \ ({ \ uint32_t ret__ = 0; \ asm volatile ("lock xadd %0, %1\n" \ : "+r" (ret__), "+m" (*mem) \ : \ : "cc", "memory"); \ ret__; \ }) #define atomic_inc(mem) \ ({ \ uint32_t ret__ = 1; \ asm volatile ("lock xadd %0, %1\n" \ : "+r" (ret__), "+m" (*mem) \ : \ : "cc", "memory"); \ ret__; \ }) #define atomic_dec(mem) \ ({ \ uint32_t ret__ = -1; \ asm volatile ("lock xadd %0, %1\n" \ : "+r" (ret__), "+m" (*mem) \ : \ : "cc", "memory"); \ ret__; \ }) #define atomic_add(i, mem) \ ({ \ asm volatile("lock addl %1,%0" \ : "+m" (*mem) \ : "ir" (i)); \ }) #endif /* ATOMIC_H__ */ criu-3.6/test/zdtm/lib/cpuid.h000066400000000000000000000015301317335042600163070ustar00rootroot00000000000000#ifndef ZDTM_CPUID_H__ #define ZDTM_CPUID_H__ /* * Adopted from linux kernel 
code. */ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx) : "memory"); } static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = 0; native_cpuid(eax, ebx, ecx, edx); } static inline void cpuid_count(unsigned int op, unsigned int count, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = count; native_cpuid(eax, ebx, ecx, edx); } #endif /* ZDTM_CPUID_H__ */ criu-3.6/test/zdtm/lib/datagen.c000066400000000000000000000051631317335042600166070ustar00rootroot00000000000000#include #include #include "zdtmtst.h" /* * Generate random data only for buffers with sizes less than FAST_SIZE * If a size of buffer is more than FAST_SIZE, the first FAST_SIZE bytes * are filled by random generator and then this chunk is used as pattern * for all other chunks. 
*/ #define FAST_SIZE 99971 /* Prime number */ static void datagen_fast(uint8_t *buffer, unsigned length, uint32_t *crc) { size_t off; datagen(buffer, FAST_SIZE, crc); off = FAST_SIZE; while (off < length) { unsigned long size = FAST_SIZE; if (off + FAST_SIZE > length) size = length - off; memcpy(buffer + off, buffer, size); off += size; } } static int datachk_fast(const uint8_t *buffer, unsigned length, uint32_t *crc) { size_t off; if (datachk(buffer, FAST_SIZE, crc)) return 1; off = FAST_SIZE; while (off < length) { unsigned long size = FAST_SIZE; if (off + FAST_SIZE > length) size = length - off; if (memcmp(buffer + off, buffer, size)) { test_msg("Memory corruption [%p, %p]\n", buffer, buffer + size); return 1; } off += size; } return 0; } /* update CRC-32 */ #define CRCPOLY 0xedb88320 static inline uint32_t crc32_le8(uint32_t crc, uint8_t datum) { int i; crc ^= datum; for (i = 0; i < 8; i++) crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY : 0); return crc; } void datagen(uint8_t *buffer, unsigned length, uint32_t *crc) { uint32_t rnd = 0; unsigned shift; if (length > FAST_SIZE) return datagen_fast(buffer, length, crc); for (shift = 0; length-- > 4; buffer++, shift--, rnd >>= 8) { if (!shift) { shift = 4; rnd = mrand48(); } *buffer = rnd; if (crc) *crc = crc32_le8(*crc, *buffer); } if (crc) { *buffer++ = *crc; *buffer++ = *crc >> 8; *buffer++ = *crc >> 16; *buffer++ = *crc >> 24; } } void datagen2(uint8_t *buffer, unsigned length, uint32_t *crc) { uint32_t rnd = 0; unsigned shift; for (shift = 0; length-- > 0; buffer++, shift--, rnd >>= 8) { if (!shift) { shift = 4; rnd = mrand48(); } *buffer = rnd; if (crc) *crc = crc32_le8(*crc, *buffer); } } int datachk(const uint8_t *buffer, unsigned length, uint32_t *crc) { uint32_t read_crc; if (length > FAST_SIZE) return datachk_fast(buffer, length, crc); for (; length-- > 4; buffer++) *crc = crc32_le8(*crc, *buffer); read_crc = buffer[0] | buffer[1] << 8 | buffer[2] << 16 | buffer[3] << 24; if (read_crc != *crc) { test_msg("Read: 
%x, Expected: %x\n", read_crc, *crc); return 1; } return 0; } int datasum(const uint8_t *buffer, unsigned length, uint32_t *crc) { for (; length-- > 0; buffer++) *crc = crc32_le8(*crc, *buffer); return 0; } criu-3.6/test/zdtm/lib/fs.c000066400000000000000000000031021317335042600156030ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" #include "fs.h" mnt_info_t *mnt_info_alloc(void) { mnt_info_t *m = malloc(sizeof(*m)); if (m) memset(m, 0, sizeof(*m)); return m; } void mnt_info_free(mnt_info_t **m) { if (m && *m) { free(*m); *m = NULL; } } mnt_info_t *get_cwd_mnt_info(void) { int mnt_id, parent_mnt_id; unsigned int kmaj, kmin; char str[1024], *cwd; int ret; FILE *f; mnt_info_t *m = NULL; char mountpoint[PATH_MAX]; char root[PATH_MAX]; char *fsname = NULL; size_t len = 0, best_len = 0; f = fopen("/proc/self/mountinfo", "r"); if (!f) return NULL; cwd = get_current_dir_name(); if (!cwd) goto err; m = mnt_info_alloc(); if (!m) goto err; while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); if (ret != 1) goto err; len = strlen(mountpoint); if (!strncmp(mountpoint, cwd, len)) { if (len > best_len) { best_len = len; m->mnt_id = mnt_id; m->parent_mnt_id = parent_mnt_id; m->s_dev = MKKDEV(kmaj, kmin); strncpy(m->root, root, sizeof(m->root)); strncpy(m->mountpoint, mountpoint, sizeof(m->mountpoint)); strncpy(m->fsname, fsname, sizeof(m->fsname)); } } free(fsname); fsname = NULL; } out: free(cwd); fclose(f); return m; err: mnt_info_free(&m); goto out; } criu-3.6/test/zdtm/lib/fs.h000066400000000000000000000021231317335042600156120ustar00rootroot00000000000000#ifndef ZDTM_FS_H_ #define ZDTM_FS_H_ #ifndef _BSD_SOURCE # define _BSD_SOURCE #endif #include #include #include #define KDEV_MINORBITS 20 #define KDEV_MINORMASK ((1UL << 
/*
 * Group starter.
 *
 * If ZDTM_TESTS is set and non-empty, runs "sh /<ZDTM_TESTS>.start" before
 * daemonizing and "sh /<ZDTM_TESTS>.stop" after the wake-up signal. The test
 * passes when the stop script's status is 0 (or when there is nothing to
 * run).
 */
int main(int argc, char **argv)
{
	int sret = 0;
	int have_subs, n;
	char *env;
	char sh[1024];

	test_init(argc, argv);

	/* An unset ZDTM_TESTS is handled like an empty one. */
	env = getenv("ZDTM_TESTS");
	have_subs = env != NULL && env[0] != '\0';

	if (have_subs) {
		unsetenv("ZDTM_NEWNS");
		unsetenv("ZDTM_GROUPS");
		unsetenv("ZDTM_UID");
		unsetenv("ZDTM_GID");
		unsetenv("ZDTM_ROOT");

		test_msg("List: [%s]\n", env);

		/* The name comes from the environment: bound the copy. */
		n = snprintf(sh, sizeof(sh), "sh /%s.start", env);
		if (n < 0 || (size_t)n >= sizeof(sh)) {
			fprintf(stderr, "ZDTM_TESTS value is too long\n");
			return 1;
		}
		system(sh);
	}

	test_daemon();
	test_waitsig();

	if (have_subs) {
		n = snprintf(sh, sizeof(sh), "sh /%s.stop", env);
		if (n < 0 || (size_t)n >= sizeof(sh))
			sret = -1;
		else
			sret = system(sh);
	}

	if (sret == 0)
		pass();
	else
		fail("Some subs failed");

	return 0;
}
/*
 * Prepare a task waiter: pick a pseudo-random back-off seed below
 * TASK_WAITER_INITIAL and create the pipe used to pass lock ids.
 * Exits the process if the pipe cannot be created.
 */
void task_waiter_init(task_waiter_t *t)
{
	/*
	 * datagen() keeps the last four bytes of its buffer for a CRC and
	 * writes nothing there when no CRC accumulator is given. Handing it
	 * a 4-byte object would therefore leave that object uninitialised,
	 * so use an 8-byte scratch buffer and take the first word, which is
	 * always filled.
	 */
	unsigned int rnd[2] = { 0, 0 };

	datagen((void *)rnd, sizeof(rnd), NULL);

	/* Unsigned arithmetic keeps the seed in [0, TASK_WAITER_INITIAL). */
	t->seed = rnd[0] % TASK_WAITER_INITIAL;

	if (pipe(t->pipes)) {
		pr_perror("task_waiter_init failed");
		exit(1);
	}
}
*/ nanosleep(&req, &rem); req.tv_nsec += t->seed; } else break; } return; err: pr_perror("task_waiter_wait4 failed"); exit(errno); } void task_waiter_complete(task_waiter_t *t, unsigned int lockid) { if (write(t->pipes[1], &lockid, sizeof(lockid)) != sizeof(lockid)) { pr_perror("task_waiter_complete failed"); exit(1); } } void task_waiter_complete_current(task_waiter_t *t) { return task_waiter_complete(t, (int)sys_gettid()); } criu-3.6/test/zdtm/lib/lock.h000066400000000000000000000076221317335042600161430ustar00rootroot00000000000000#ifndef CR_LOCK_H_ #define CR_LOCK_H_ #include #include #include #include #include #include #include "asm/atomic.h" #define BUG_ON(condition) \ do { \ if ((condition)) { \ raise(SIGABRT); \ *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__; \ } \ } while (0) typedef struct { uint32_t raw; } futex_t; #define FUTEX_ABORT_FLAG (0x80000000) #define FUTEX_ABORT_RAW (-1U) static inline int sys_futex(unsigned int *uaddr, int op, unsigned int val, const struct timespec *timeout, int *uaddr2, unsigned int val3) { return syscall(__NR_futex, uaddr, op, val, timeout, uaddr2, val3); } /* Get current futex @f value */ static inline uint32_t futex_get(futex_t *f) { return atomic_get(&f->raw); } /* Set futex @f value to @v */ static inline void futex_set(futex_t *f, uint32_t v) { atomic_set(&f->raw, v); } /* Set futex @f to @v and wake up all waiters */ static inline void futex_add_and_wake(futex_t *f, uint32_t v) { atomic_add(v, &f->raw); BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); } #define futex_init(f) futex_set(f, 0) /* Wait on futex @__f value @__v become in condition @__c */ #define futex_wait_if_cond(__f, __v, __cond) \ do { \ int ret; \ uint32_t tmp; \ \ while (1) { \ tmp = (__f)->raw; \ if ((tmp & FUTEX_ABORT_FLAG) || \ (tmp __cond (__v))) \ break; \ ret = sys_futex(&(__f)->raw, FUTEX_WAIT,\ tmp, NULL, NULL, 0); \ if (ret < 0 && (errno == EAGAIN || errno == EINTR)) \ continue; \ BUG_ON(ret < 0 && errno != 
EWOULDBLOCK); \ } \ } while (0) /* Set futex @f to @v and wake up all waiters */ static inline void futex_set_and_wake(futex_t *f, uint32_t v) { atomic_set(&f->raw, v); BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); } /* Mark futex @f as wait abort needed and wake up all waiters */ static inline void futex_abort_and_wake(futex_t *f) { futex_set_and_wake(f, FUTEX_ABORT_RAW); } /* Decrement futex @f value and wake up all waiters */ static inline void futex_dec_and_wake(futex_t *f) { atomic_dec(&f->raw); BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); } /* Increment futex @f value and wake up all waiters */ static inline void futex_inc_and_wake(futex_t *f) { atomic_inc(&f->raw); BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); } /* Plain increment futex @f value */ static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); } /* Plain decrement futex @f value */ static inline void futex_dec(futex_t *f) { atomic_dec(&f->raw); } /* Wait until futex @f value become @v */ static inline void futex_wait_until(futex_t *f, uint32_t v) { futex_wait_if_cond(f, v, ==); } /* Wait while futex @f value is greater than @v */ static inline void futex_wait_while_gt(futex_t *f, uint32_t v) { futex_wait_if_cond(f, v, <=); } /* Wait while futex @f value is less than @v */ static inline void futex_wait_while_lt(futex_t *f, uint32_t v) { futex_wait_if_cond(f, v, >=); } /* Wait while futex @f value is @v */ static inline uint32_t futex_wait_while(futex_t *f, uint32_t v) { while (f->raw == v) { int ret = sys_futex(&f->raw, FUTEX_WAIT, v, NULL, NULL, 0); if (ret < 0 && (errno == EAGAIN || errno == EINTR)) continue; BUG_ON(ret < 0 && errno != EWOULDBLOCK); } return f->raw; } typedef struct { uint32_t raw; } mutex_t; static void inline mutex_init(mutex_t *m) { uint32_t c = 0; atomic_set(&m->raw, c); } static void inline mutex_lock(mutex_t *m) { uint32_t c; int ret; while ((c = atomic_inc(&m->raw))) { ret = sys_futex(&m->raw, 
FUTEX_WAIT, c + 1, NULL, NULL, 0); BUG_ON(ret < 0 && ret != -EWOULDBLOCK); } } static void inline mutex_unlock(mutex_t *m) { uint32_t c = 0; atomic_set(&m->raw, c); BUG_ON(sys_futex(&m->raw, FUTEX_WAKE, 1, NULL, NULL, 0) < 0); } #endif /* CR_LOCK_H_ */ criu-3.6/test/zdtm/lib/msg.c000066400000000000000000000025651317335042600157750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" int test_log_init(const char *fname, const char *suffix) { char path[PATH_MAX]; int logfd; snprintf(path, sizeof(path), "%s%s", fname, suffix); logfd = open(path, O_WRONLY | O_EXCL | O_CREAT | O_APPEND, 0644); if (logfd < 0) { pr_perror("Can't open file %s", path); return -1; } dup2(logfd, STDERR_FILENO); dup2(logfd, STDOUT_FILENO); close(logfd); setbuf(stdout, NULL); setbuf(stderr, NULL); return 0; } int zdtm_seccomp; void test_msg(const char *format, ...) { va_list arg; int off = 0; char buf[TEST_MSG_BUFFER_SIZE]; int __errno = errno; struct timeval tv; struct tm *tm; if (zdtm_seccomp) /* seccomp allows restricted set of syscall-s */ goto skip; gettimeofday(&tv, NULL); tm = localtime(&tv.tv_sec); if (tm == NULL) { fprintf(stderr, "ERROR in %s: localtime() failed: %m\n", __func__); } else { off += strftime(buf, sizeof(buf), "%H:%M:%S", tm); } off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); off += sprintf(buf + off, "%5d: ", getpid()); skip: va_start(arg, format); off += vsnprintf(buf + off, sizeof(buf) - off, format, arg); va_end(arg); fprintf(stderr, "%s", buf); errno = __errno; } criu-3.6/test/zdtm/lib/ns.c000066400000000000000000000225361317335042600156270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #include "ns.h" extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; char 
*root, *criu_path; char path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { fprintf(stderr, "ZDTM_ROOT isn't set\n"); return -1; } /* * In a new userns all mounts are locked to protect what is * under them. So we need to create another mount for the * new root. */ if (mount(root, root, NULL, MS_SLAVE , NULL)) { fprintf(stderr, "Can't bind-mount root: %m\n"); return -1; } if (mount(root, root, NULL, MS_BIND | MS_REC, NULL)) { fprintf(stderr, "Can't bind-mount root: %m\n"); return -1; } criu_path = getenv("ZDTM_CRIU"); if (criu_path) { snprintf(path, sizeof(path), "%s%s", root, criu_path); if (mount(criu_path, path, NULL, MS_BIND, NULL) || mount(NULL, path, NULL, MS_PRIVATE, NULL)) { pr_perror("Unable to mount %s", path); return -1; } } /* Move current working directory to the new root */ ret = readlink("/proc/self/cwd", path, sizeof(path) - 1); if (ret < 0) return -1; path[ret] = 0; dfd = open(path, O_RDONLY | O_DIRECTORY); if (dfd == -1) { fprintf(stderr, "open(.) failed: %m\n"); return -1; } if (chdir(root)) { fprintf(stderr, "chdir(%s) failed: %m\n", root); return -1; } if (mkdir("old", 0777) && errno != EEXIST) { fprintf(stderr, "mkdir(old) failed: %m\n"); return -1; } if (pivot_root(".", "./old")) { fprintf(stderr, "pivot_root(., ./old) failed: %m\n"); return -1; } if (mount("./old", "./old", NULL, MS_SLAVE | MS_REC , NULL)) { fprintf(stderr, "Can't bind-mount root: %m\n"); return -1; } /* * proc and sysfs can be mounted in an unprivileged namespace, * if they are already mounted when the user namespace is created. * So ./old must be umounted after mounting /proc and /sys. 
/*
 * Set up the environment of the namespace init task: switch uid/gid to 0 and
 * drop supplementary groups, bring the loopback interface up and prepare the
 * mount namespace.
 *
 * Returns 0 on success, -1 on failure.
 */
static int prepare_namespaces(void)
{
	if (setuid(0) != 0)
		goto ids_failed;
	if (setgid(0) != 0)
		goto ids_failed;
	if (setgroups(0, NULL) != 0)
		goto ids_failed;

	system("ip link set up dev lo");

	return prepare_mntns() ? -1 : 0;

ids_failed:
	fprintf(stderr, "set*id failed: %m\n");
	return -1;
}
%m\n"); } goto write_out; } if (status) fprintf(stderr, "%d return %d\n", pid, status); } return; write_out: /* fprintf can't be used in a sighandler due to glibc locks */ write(STDERR_FILENO, buf, MIN(len, sizeof(buf))); } #define STATUS_FD 255 static int ns_exec(void *_arg) { struct ns_exec_args *args = (struct ns_exec_args *) _arg; char buf[4096]; int ret; close(args->status_pipe[0]); setsid(); ret = dup2(args->status_pipe[1], STATUS_FD); if (ret < 0) { fprintf(stderr, "dup2() failed: %m\n"); return -1; } close(args->status_pipe[1]); read(STATUS_FD, buf, sizeof(buf)); shutdown(STATUS_FD, SHUT_RD); if (prepare_namespaces()) return -1; setenv("ZDTM_NEWNS", "2", 1); execvp(args->argv[0], args->argv); fprintf(stderr, "exec(%s) failed: %m\n", args->argv[0]); return -1; } int ns_init(int argc, char **argv) { struct sigaction sa = { .sa_handler = ns_sig_hand, .sa_flags = SA_RESTART, }; int ret, fd, status_pipe = STATUS_FD; char buf[128], *x; pid_t pid; bool reap; ret = fcntl(status_pipe, F_SETFD, FD_CLOEXEC); if (ret == -1) { fprintf(stderr, "fcntl failed %m\n"); exit(1); } reap = getenv("ZDTM_NOREAP") == NULL; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGTERM); if (reap) sigaddset(&sa.sa_mask, SIGCHLD); if (sigaction(SIGTERM, &sa, NULL)) { fprintf(stderr, "Can't set SIGTERM handler: %m\n"); exit(1); } x = malloc(strlen(pidfile) + 3); sprintf(x, "%sns", pidfile); pidfile = x; /* Start test */ pid = fork(); if (pid < 0) { fprintf(stderr, "fork() failed: %m\n"); exit(1); } else if (pid == 0) { close(status_pipe); unsetenv("ZDTM_NEWNS"); return 0; /* Continue normal test startup */ } ret = -1; if (waitpid(pid, &ret, 0) < 0) fprintf(stderr, "waitpid() failed: %m\n"); else if (ret) fprintf(stderr, "The test returned non-zero code %d\n", ret); if (reap && sigaction(SIGCHLD, &sa, NULL)) { fprintf(stderr, "Can't set SIGCHLD handler: %m\n"); exit(1); } while (reap && 1) { int status; pid = waitpid(-1, &status, WNOHANG); if (pid == 0) break; if (pid < 0) { fprintf(stderr, 
"waitpid() failed: %m\n"); exit (1); } if (status) fprintf(stderr, "%d return %d\n", pid, status); } /* Daemonize */ write(status_pipe, &ret, sizeof(ret)); close(status_pipe); if (ret) exit(ret); /* suspend/resume */ test_waitsig(); fd = open(pidfile, O_RDONLY); if (fd == -1) { fprintf(stderr, "open(%s) failed: %m\n", pidfile); exit(1); } ret = read(fd, buf, sizeof(buf) - 1); buf[ret] = '\0'; if (ret == -1) { fprintf(stderr, "read() failed: %m\n"); exit(1); } pid = atoi(buf); fprintf(stderr, "kill(%d, SIGTERM)\n", pid); if (pid > 0) kill(pid, SIGTERM); ret = 0; if (reap) { while (true) { pid_t child; ret = -1; child = waitpid(-1, &ret, 0); if (child < 0) { fprintf(stderr, "Unable to wait a test process: %m"); exit(1); } if (child == pid) { fprintf(stderr, "The test returned 0x%x", ret); exit(!(ret == 0)); } if (ret) fprintf(stderr, "The %d process exited with 0x%x", child, ret); } } else { waitpid(pid, NULL, 0); } exit(1); } #define UID_MAP "0 100000 100000\n100000 200000 50000" #define GID_MAP "0 400000 50000\n50000 500000 100000" void ns_create(int argc, char **argv) { pid_t pid; int ret, status; struct ns_exec_args args; int flags; char *pidf; args.argc = argc; args.argv = argv; ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, args.status_pipe); if (ret) { fprintf(stderr, "Pipe() failed %m\n"); exit(1); } flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWNET | CLONE_NEWIPC | SIGCHLD; if (getenv("ZDTM_USERNS")) flags |= CLONE_NEWUSER; pid = clone(ns_exec, args.stack_ptr, flags, &args); if (pid < 0) { fprintf(stderr, "clone() failed: %m\n"); exit(1); } close(args.status_pipe[1]); if (flags & CLONE_NEWUSER) { char pname[PATH_MAX]; int fd; snprintf(pname, sizeof(pname), "/proc/%d/uid_map", pid); fd = open(pname, O_WRONLY); if (fd < 0) { fprintf(stderr, "open(%s): %m\n", pname); exit(1); } if (write(fd, UID_MAP, sizeof(UID_MAP)) < 0) { fprintf(stderr, "write(" UID_MAP "): %m\n"); exit(1); } close(fd); snprintf(pname, sizeof(pname), "/proc/%d/gid_map", pid); fd 
= open(pname, O_WRONLY); if (fd < 0) { fprintf(stderr, "open(%s): %m\n", pname); exit(1); } if (write(fd, GID_MAP, sizeof(GID_MAP)) < 0) { fprintf(stderr, "write(" GID_MAP "): %m\n"); exit(1); } close(fd); } shutdown(args.status_pipe[0], SHUT_WR); pidf = pidfile; pidfile = malloc(strlen(pidfile) + 13); sprintf(pidfile, "%s%s", pidf, INPROGRESS); if (write_pidfile(pid)) { fprintf(stderr, "Preparations fail\n"); exit(1); } status = 1; ret = read(args.status_pipe[0], &status, sizeof(status)); if (ret != sizeof(status) || status) { fprintf(stderr, "The test failed (%d, %d)\n", ret, status); exit(1); } ret = read(args.status_pipe[0], &status, sizeof(status)); if (ret != 0) { fprintf(stderr, "Unexpected message from test\n"); exit(1); } unlink(pidfile); pidfile = pidf; if (write_pidfile(pid)) exit(1); exit(0); } criu-3.6/test/zdtm/lib/ns.h000066400000000000000000000004271317335042600156270ustar00rootroot00000000000000#ifndef __ZDTM_NS__ #define __ZDTM_NS__ #include "lock.h" extern futex_t sig_received; extern char *pidfile; extern void ns_create(int argc, char **argv); extern int ns_init(int argc, char **argv); extern void test_waitsig(void); extern void parseargs(int, char **); #endif criu-3.6/test/zdtm/lib/parseargs.c000066400000000000000000000063251317335042600171740ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" static struct long_opt *opt_head; static int help; TEST_OPTION(help, bool, "print help message and exit", 0); void __push_opt(struct long_opt *opt) { opt->next = opt_head; /* FIXME: barrier ? 
/*
 * Numeric option parsers.
 *
 * Each one converts @param with base auto-detection (decimal, 0x hex,
 * leading-0 octal) and stores the result through @arg, which must point to
 * an object of the matching type.
 *
 * Return 0 on success, -EINVAL if @param is NULL, empty or has trailing
 * garbage, and -ERANGE if the value does not fit the target type. @arg is
 * left untouched on failure.
 */
int parse_opt_int(char *param, void *arg)
{
	char *tail;
	long val;
	int narrowed;

	if (param == NULL || param[0] == '\0')
		return -EINVAL;

	errno = 0;
	val = strtol(param, &tail, 0);
	if (tail[0] != '\0')
		return -EINVAL;

	/* Catch both long overflow and values that do not fit an int. */
	narrowed = (int)val;
	if (errno == ERANGE || (long)narrowed != val)
		return -ERANGE;

	* (int *) arg = narrowed;
	return 0;
}

int parse_opt_uint(char *param, void *arg)
{
	char *tail;
	unsigned long val;
	unsigned int narrowed;

	if (param == NULL || param[0] == '\0')
		return -EINVAL;

	errno = 0;
	val = strtoul(param, &tail, 0);
	if (tail[0] != '\0')
		return -EINVAL;

	narrowed = (unsigned int)val;
	if (errno == ERANGE || (unsigned long)narrowed != val)
		return -ERANGE;

	* (unsigned int *) arg = narrowed;
	return 0;
}

int parse_opt_long(char *param, void *arg)
{
	char *tail;
	long val;

	if (param == NULL || param[0] == '\0')
		return -EINVAL;

	errno = 0;
	val = strtol(param, &tail, 0);
	if (tail[0] != '\0')
		return -EINVAL;
	if (errno == ERANGE)
		return -ERANGE;

	* (long *) arg = val;
	return 0;
}

int parse_opt_ulong(char *param, void *arg)
{
	char *tail;
	unsigned long val;

	if (param == NULL || param[0] == '\0')
		return -EINVAL;

	errno = 0;
	val = strtoul(param, &tail, 0);
	if (tail[0] != '\0')
		return -EINVAL;
	if (errno == ERANGE)
		return -ERANGE;

	* (unsigned long *) arg = val;
	return 0;
}
(strlen(argv[i]) < 2 || strncmp(argv[i], "--", 2)) { fprintf(stderr, "%s: options should start with --\n", argv[i]); helpexit(); } name = argv[i] + 2; value = strchr(name, '='); if (value) value++; for (opt = opt_head; opt; opt = opt->next) if (!strncmp(name, opt->name, value - name - 1)) { if (opt->parse_opt(value, opt->value)) { fprintf(stderr, "%s: failed to parse\n", argv[i]); helpexit(); } else /* -1 marks fulfilled requirement */ opt->is_required = - opt->is_required; break; } if (!opt) { fprintf(stderr, "%s: unknown option\n", argv[i]); helpexit(); } } if (help) { prdoc(); helpexit(); } for (opt = opt_head; opt; opt = opt->next) if (opt->is_required > 0) { fprintf(stderr, "mandatory flag --%s not given\n", opt->name); helpexit(); } } criu-3.6/test/zdtm/lib/parseargs.sh000077500000000000000000000032731317335042600173660ustar00rootroot00000000000000#!/bin/bash # # parse command line flags of the form --foo=bar and print out an eval-able line name=$0 function die() { echo "$name: $*" >&2 exit 1 } # eat our flags first while : ; do flag=$1 shift || break case $flag in --flags-req=*) # req'd flags oIFS="$IFS" IFS="," vars_req=(${flag#*=}) IFS="$oIFS" ;; --flags-opt=*) # optional flags oIFS="$IFS" IFS="," vars_opt=(${flag#*=}) IFS="$oIFS" ;; --name=*) # name to report errors as name=${flag#*=} ;; --flags-only) # report only flags show_flags=true show_args=false ;; --no-flags) # report only remaining args show_flags=false show_args=true ;; --) # end of our flags; external flags follow break ;; esac done # consume external flags while : ; do flag=$1 shift || break case $flag in --*=*) ;; --) # end of external flags; uninterpreted arguments follow break ;; *) # pass unrecognized arguments through args="$args '$flag'" continue ;; esac flagname=${flag%%=*} flagname=${flagname#--} flagval=${flag#*=} # check if this flag is declared case " ${vars_req[*]} ${vars_opt[*]} " in *" $flagname "*) ;; *) # pass unrecognized flags through args="$args '$flag'" continue ;; esac 
/*
 * Turn O_NONBLOCK on (@on != 0) or off (@on == 0) for descriptor @fd while
 * keeping its other file status flags.
 *
 * Returns the result of the failing or final fcntl() call: negative on
 * error, otherwise whatever F_SETFL returned.
 */
int set_nonblock(int fd, int on)
{
	const int cur = fcntl(fd, F_GETFL, 0);

	if (cur < 0)
		return cur;

	return fcntl(fd, F_SETFL,
		     on ? (cur | O_NONBLOCK) : (cur & ~O_NONBLOCK));
}
*buf; int rlen, wlen; while (1) { rlen = read(infd, buffer, length); if (rlen <= 0) return rlen; /* don't go reading until we're done with writing */ for (buf = buffer; rlen > 0; buf += wlen, rlen -= wlen) { wlen = write(outfd, buf, rlen); if (wlen < 0) return wlen; } } } criu-3.6/test/zdtm/lib/tcp.c000066400000000000000000000050251317335042600157670ustar00rootroot00000000000000#include #include #include /* for sockaddr_in and inet_ntoa() */ #include "zdtmtst.h" union sockaddr_inet { struct sockaddr_in v4; struct sockaddr_in6 v6; }; int tcp_init_server(int family, int *port) { union sockaddr_inet addr; int sock; int yes = 1, ret; memset(&addr,0,sizeof(addr)); if (family == AF_INET) { addr.v4.sin_family = family; inet_pton(family, "0.0.0.0", &(addr.v4.sin_addr)); } else if (family == AF_INET6){ addr.v6.sin6_family = family; inet_pton(family, "::0", &(addr.v6.sin6_addr)); } else return -1; sock = socket(family, SOCK_STREAM, IPPROTO_TCP); if (sock == -1) { pr_perror("socket() failed"); return -1; } if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(int)) == -1 ) { pr_perror("setsockopt() error"); return -1; } while (1) { if (family == AF_INET) addr.v4.sin_port = htons(*port); else if (family == AF_INET6) addr.v6.sin6_port = htons(*port); ret = bind(sock, (struct sockaddr *) &addr, sizeof(addr)); /* criu doesn't restore sock opts, so we need this hack */ if (ret == -1 && errno == EADDRINUSE) { test_msg("The port %d is already in use.\n", *port); (*port)++; continue; } break; } if (ret == -1) { pr_perror("bind() failed"); return -1; } if (listen(sock, 1) == -1) { pr_perror("listen() failed"); return -1; } return sock; } int tcp_accept_server(int sock) { struct sockaddr_in maddr; int sock2; socklen_t addrlen; #ifdef DEBUG test_msg ("Waiting for connection..........\n"); #endif addrlen = sizeof(maddr); sock2 = accept(sock,(struct sockaddr *) &maddr, &addrlen); if (sock2 == -1) { pr_perror("accept() failed"); return -1; } #ifdef DEBUG test_msg ("Connection!!\n"); 
#endif return sock2; } int tcp_init_client(int family, char *servIP, unsigned short servPort) { int sock; union sockaddr_inet servAddr; if ((sock = socket(family, SOCK_STREAM, IPPROTO_TCP)) < 0) { pr_perror("can't create socket"); return -1; } /* Construct the server address structure */ memset(&servAddr, 0, sizeof(servAddr)); if (family == AF_INET) { servAddr.v4.sin_family = AF_INET; servAddr.v4.sin_port = htons(servPort); inet_pton(AF_INET, servIP, &servAddr.v4.sin_addr); } else { servAddr.v6.sin6_family = AF_INET6; servAddr.v6.sin6_port = htons(servPort); inet_pton(AF_INET6, servIP, &servAddr.v6.sin6_addr); } if (connect(sock, (struct sockaddr *) &servAddr, sizeof(servAddr)) < 0) { pr_perror("can't connect to server"); return -1; } return sock; } criu-3.6/test/zdtm/lib/test.c000066400000000000000000000125201317335042600161560ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #include "lock.h" #include "ns.h" futex_t sig_received; static struct { futex_t stage; } *test_shared_state; enum { TEST_INIT_STAGE = 0, TEST_RUNNING_STAGE, TEST_FINI_STAGE, TEST_FAIL_STAGE, }; static int parent; static void sig_hand(int signo) { if (parent) futex_set_and_wake(&test_shared_state->stage, TEST_FAIL_STAGE); futex_set_and_wake(&sig_received, signo); } static char *outfile; TEST_OPTION(outfile, string, "output file", 1); char *pidfile; TEST_OPTION(pidfile, string, "file to store pid", 1); static pid_t master_pid = 0; int test_fork_id(int id) { return fork(); } static int cwd = -1; static void test_fini(void) { char path[PATH_MAX]; if (getpid() != master_pid) return; snprintf(path, sizeof(path), "%s%s", outfile, INPROGRESS); renameat(cwd, path, cwd, outfile); unlinkat(cwd, pidfile, 0); } static void setup_outfile() { if (!access(outfile, F_OK) || errno != ENOENT) { fprintf(stderr, "Output file %s appears to exist, aborting\n", outfile); 
exit(1); } cwd = open(".", O_RDONLY); if (cwd < 0) { fprintf(stderr, "Unable to open\n"); exit(1); } if (atexit(test_fini)) { fprintf(stderr, "Can't register exit function\n"); exit(1); } if (test_log_init(outfile, INPROGRESS)) exit(1); } static void redir_stdfds() { int nullfd; nullfd = open("/dev/null", O_RDWR); if (nullfd < 0) { pr_perror("Can't open /dev/null"); exit(1); } dup2(nullfd, STDIN_FILENO); if (nullfd > 2) close(nullfd); } void test_ext_init(int argc, char **argv) { parseargs(argc, argv); if (test_log_init(outfile, ".external")) exit(1); } int write_pidfile(int pid) { int fd = -1; char tmp[] = ".zdtm.pidfile.XXXXXX"; fd = mkstemp(tmp); if (fd == -1) { fprintf(stderr, "Can't create the file %s: %m\n", tmp); return -1; } if (fchmod(fd, 0666) < 0) { fprintf(stderr, "Can't fchmod %s: %m\n", tmp); goto err_c; } if (dprintf(fd, "%d", pid) == -1) { fprintf(stderr, "Can't write in the file %s: %m\n", tmp); goto err_c; } close(fd); if (rename(tmp, pidfile) < 0) { fprintf(stderr, "Can't rename %s to %s: %m\n", tmp, pidfile); goto err_u; } return 0; err_c: close(fd); err_u: unlink(tmp); return -1; } void test_init(int argc, char **argv) { pid_t pid; char *val; struct sigaction sa = { .sa_handler = sig_hand, .sa_flags = SA_RESTART, }; sigemptyset(&sa.sa_mask); parseargs(argc, argv); val = getenv("ZDTM_NEWNS"); if (val) { if (!strcmp(val, "1")) { ns_create(argc, argv); exit(1); } if (!strcmp(val, "2")) { test_log_init(outfile, "ns"); redir_stdfds(); ns_init(argc, argv); } } val = getenv("ZDTM_GROUPS"); if (val) { char *tok = NULL; unsigned int size = 0, groups[NGROUPS_MAX]; tok = strtok(val, " "); while (tok) { size++; groups[size - 1] = atoi(tok); tok = strtok(NULL, " "); } if (setgroups(size, groups)) { fprintf(stderr, "Can't set groups: %m"); exit(1); } } val = getenv("ZDTM_GID"); if (val && (setgid(atoi(val)) == -1)) { fprintf(stderr, "Can't set gid: %m"); exit(1); } val = getenv("ZDTM_UID"); if (val && (setuid(atoi(val)) == -1)) { fprintf(stderr, "Can't set 
gid: %m"); exit(1); } if (prctl(PR_SET_DUMPABLE, 1)) { fprintf(stderr, "Can't set the dumpable flag"); exit(1); } if (sigaction(SIGTERM, &sa, NULL)) { fprintf(stderr, "Can't set SIGTERM handler: %m\n"); exit(1); } if (sigaction(SIGCHLD, &sa, NULL)) { fprintf(stderr, "Can't set SIGCHLD handler: %m\n"); exit(1); } setup_outfile(); redir_stdfds(); test_shared_state = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, 0, 0); if (test_shared_state == MAP_FAILED) { pr_perror("Unable to map a shared memory"); exit(1); } futex_init(&test_shared_state->stage); futex_set(&test_shared_state->stage, TEST_INIT_STAGE); pid = fork(); if (pid < 0) { pr_perror("Daemonizing failed"); exit(1); } parent = 1; if (pid) { /* parent will exit when the child is ready */ futex_wait_while(&test_shared_state->stage, TEST_INIT_STAGE); if (futex_get(&test_shared_state->stage) == TEST_FAIL_STAGE) { int ret; if (waitpid(pid, &ret, 0) != pid) { pr_perror("Unable to wait %d", pid); exit(1); } if (WIFEXITED(ret)) { pr_err("Test exited unexpectedly with code %d\n", WEXITSTATUS(ret)); exit(1); } if (WIFSIGNALED(ret)) { pr_err("Test exited on unexpected signal %d\n", WTERMSIG(ret)); exit(1); } } if (write_pidfile(pid)) exit(1); _exit(0); } parent = 0; if (setsid() < 0) { pr_perror("Can't become session group leader"); exit(1); } /* record the test pid to remember the ownership of the pidfile */ master_pid = getpid(); sa.sa_handler = SIG_DFL; if (sigaction(SIGCHLD, &sa, NULL)) { pr_perror("Can't reset SIGCHLD handler"); exit(1); } srand48(time(NULL)); /* just in case we need it */ } void test_daemon() { futex_set_and_wake(&test_shared_state->stage, TEST_RUNNING_STAGE); } int test_go(void) { return !futex_get(&sig_received); } void test_waitsig(void) { futex_wait_while(&sig_received, 0); } criu-3.6/test/zdtm/lib/zdtmtst.h000066400000000000000000000112271317335042600167200ustar00rootroot00000000000000#ifndef _VIMITESU_H_ #define _VIMITESU_H_ #include #include #define INPROGRESS 
".inprogress" #ifndef PAGE_SIZE # define PAGE_SIZE (unsigned int)(sysconf(_SC_PAGESIZE)) #endif #ifndef PR_SET_CHILD_SUBREAPER # define PR_SET_CHILD_SUBREAPER 36 #endif /* set up test */ extern void test_ext_init(int argc, char **argv); extern void test_init(int argc, char **argv); #ifndef CLONE_NEWUTS #define CLONE_NEWUTS 0x04000000 #endif #ifndef CLONE_NEWIPC #define CLONE_NEWIPC 0x08000000 #endif #define TEST_MSG_BUFFER_SIZE 2048 /*wrapper for fork: init log offset*/ #define test_fork() test_fork_id(-1) extern int test_fork_id(int id); /* finish setting up the test, write out pid file, and go to background */ extern void test_daemon(void); /* store a message to a static buffer */ extern void test_msg(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); /* tell if SIGTERM hasn't been received yet */ extern int test_go(void); /* sleep until SIGTERM is delivered */ extern void test_waitsig(void); #include /* generate data with crc32 at the end of the buffer */ extern void datagen(uint8_t *buffer, unsigned length, uint32_t *crc); /* generate data without crc32 at the end of the buffer */ extern void datagen2(uint8_t *buffer, unsigned length, uint32_t *crc); /* check the data buffer against its crc32 */ extern int datachk(const uint8_t *buffer, unsigned length, uint32_t *crc); /* calculate crc for the data buffer*/ extern int datasum(const uint8_t *buffer, unsigned length, uint32_t *crc); /* streaming helpers */ extern int set_nonblock(int fd, int on); extern int pipe_in2out(int infd, int outfd, uint8_t *buffer, int length); /* command line args */ struct long_opt { const char *name; const char *type; const char *doc; int is_required; int (*parse_opt)(char *arg, void *value); void *value; struct long_opt *next; }; extern void __push_opt(struct long_opt *opt); #define TEST_OPTION(name, type, doc, is_required) \ param_check_##type(name, &(name)); \ static struct long_opt __long_opt_##name = { \ #name, #type, doc, is_required, parse_opt_##type, 
&name }; \ static void __init_opt_##name(void) __attribute__ ((constructor)); \ static void __init_opt_##name(void) \ { (void)__check_##name; __push_opt(&__long_opt_##name); } #define __param_check(name, p, type) \ static inline type *__check_##name(void) { return(p); } #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) extern void parseargs(int, char **); extern int parse_opt_bool(char *param, void *arg); #define param_check_bool(name, p) __param_check(name, p, int) extern int parse_opt_int(char *param, void *arg); #define param_check_int(name, p) __param_check(name, p, int) extern int parse_opt_uint(char *param, void *arg); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern int parse_opt_long(char *param, void *arg); #define param_check_long(name, p) __param_check(name, p, long) extern int parse_opt_ulong(char *param, void *arg); #define param_check_ulong(name, p) __param_check(name, p, unsigned long) extern int parse_opt_string(char *param, void *arg); #define param_check_string(name, p) __param_check(name, p, char *) extern int write_pidfile(int pid); #include #include #include #define __stringify_1(x) #x #define __stringify(x) __stringify_1(x) /* * Macro to define stack alignment. * aarch64 requires stack to be aligned to 16 bytes. */ #define __stack_aligned__ __attribute__((aligned(16))) /* message helpers */ extern int test_log_init(const char *outfile, const char *suffix); extern int zdtm_seccomp; #define pr_err(format, arg...) \ test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ## arg) #define pr_perror(format, arg...) \ test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", \ __FILE__, __LINE__, ## arg, errno, strerror(errno)) #define fail(format, arg...) \ test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", \ __FILE__, __LINE__, ## arg, errno, strerror(errno)) #define skip(format, arg...) 
\ test_msg("SKIP: %s:%d: " format "\n", \ __FILE__, __LINE__, ## arg) #define pass() test_msg("PASS\n") typedef struct { unsigned long seed; int pipes[2]; } task_waiter_t; extern void task_waiter_init(task_waiter_t *t); extern void task_waiter_fini(task_waiter_t *t); extern void task_waiter_wait4(task_waiter_t *t, unsigned int lockid); extern void task_waiter_complete(task_waiter_t *t, unsigned int lockid); extern void task_waiter_complete_current(task_waiter_t *t); extern int tcp_init_server(int family, int *port); extern int tcp_accept_server(int sock); extern int tcp_init_client(int family, char *servIP, unsigned short servPort); #endif /* _VIMITESU_H_ */ criu-3.6/test/zdtm/static/000077500000000000000000000000001317335042600155545ustar00rootroot00000000000000criu-3.6/test/zdtm/static/Makefile000066400000000000000000000265161317335042600172260ustar00rootroot00000000000000LIBDIR := ../lib LIB := $(LIBDIR)/libzdtmtst.a LDLIBS += $(LIB) CPPFLAGS += -I$(LIBDIR) TST_NOFILE := \ busyloop00 \ sleeping00 \ pid00 \ caps00 \ wait00 \ zombie00 \ fpu00 \ fpu01 \ arm-neon00 \ futex \ futex-rl \ mmx00 \ sse00 \ sse20 \ mprotect00 \ timers \ timerfd \ unbound_sock \ sched_prio00 \ sched_policy00 \ socket_listen \ socket_listen6 \ socket_udp \ socket6_udp \ socket_udp_shutdown \ sk-freebind \ sk-freebind-false \ socket_udplite \ socket_aio \ socket_close_data \ socket_snd_addr \ socket_dgram_data \ packet_sock \ packet_sock_mmap \ packet_sock_spkt \ sock_filter \ msgque \ inotify_system \ inotify_system_nodel \ shm \ shm-mp \ ptrace_sig \ pipe00 \ pipe01 \ pipe02 \ pthread00 \ pthread01 \ pthread02 \ vdso00 \ vdso01 \ vdso02 \ vdso-proxy \ utsname \ pstree \ sockets01 \ sockets02 \ sockets_spair \ socket_queues \ socket-tcp \ socket-tcp-reseted \ socket-tcp6 \ socket-tcp-local \ socket-tcp-nfconntrack \ socket-tcp6-local \ socket-tcpbuf \ socket-tcpbuf-local \ socket-tcpbuf6-local \ socket-tcpbuf6 \ socket-tcp-fin-wait1 \ socket-tcp6-fin-wait1 \ socket-tcp-fin-wait2 \ 
socket-tcp6-fin-wait2 \ socket-tcp-close-wait \ socket-tcp6-close-wait \ socket-tcp-last-ack \ socket-tcp6-last-ack \ socket-tcp-closing \ socket-tcp6-closing \ socket-tcp-closed \ socket-tcp-closed-last-ack \ socket-tcp6-closed \ socket-tcp-close0 \ socket-tcp-close1 \ socket-tcp-unconn \ socket-tcp6-unconn \ socket-tcp-syn-sent \ sock_opts00 \ sock_opts01 \ sk-unix-unconn \ ipc_namespace \ selfexe00 \ sem \ maps01 \ maps02 \ maps04 \ maps05 \ mlock_setuid \ xids00 \ groups \ pdeath_sig \ file_fown \ proc-self \ eventfs00 \ signalfd00 \ inotify_irmap \ fanotify00 \ uptime_grow \ session00 \ rlimits00 \ pty00 \ pty01 \ pty-console \ pty02 \ pty03 \ pty04 \ tty00 \ tty02 \ tty03 \ poll \ mountpoints \ netns \ netns-dev \ session01 \ session02 \ session03 \ socket-ext \ unhashed_proc \ cow00 \ child_opened_proc \ posix_timers \ sigpending \ sigaltstack \ sk-netlink \ mem-touch \ grow_map \ grow_map02 \ grow_map03 \ tun \ stopped \ stopped01 \ stopped02 \ stopped12 \ rtc \ clean_mntns \ mntns_rw_ro_rw \ dumpable01 \ dumpable02 \ remap_dead_pid \ remap_dead_pid_root \ scm00 \ scm01 \ scm02 \ scm03 \ scm04 \ aio00 \ aio01 \ fd \ apparmor \ seccomp_strict \ seccomp_filter \ seccomp_filter_tsync \ seccomp_filter_inheritance \ different_creds \ vsx \ bridge \ vfork00 \ oom_score_adj \ loginuid \ cgroupns \ helper_zombie_child \ clone_fs \ macvlan \ sit \ cr_veth \ sock_peercred \ s390x_mmap_high \ uffd-events \ thread_different_uid_gid \ # jobctl00 \ include ../Makefile.inc ifneq ($(SRCARCH),arm) ifneq ($(COMPAT_TEST),y) TST_NOFILE += maps03 endif endif ifeq ($(SRCARCH),s390) TST_NOFILE += s390x_regs_check \ s390x_gs_threads \ s390x_runtime_instr endif TST_FILE = \ maps06 \ write_read00 \ write_read01 \ write_read02 \ write_read10 \ maps00 \ link10 \ file_attr \ deleted_unix_sock \ sk-unix-rel \ deleted_dev \ unlink_fstat00 \ unlink_fstat01 \ unlink_fstat01+ \ unlink_fstat02 \ unlink_fstat03 \ ghost_holes00 \ ghost_holes01 \ ghost_holes02 \ unlink_largefile \ mtime_mmap \ 
fifo \ fifo-ghost \ fifo_ro \ fifo_wronly \ console \ vt \ unlink_fifo \ unlink_fifo_wronly \ unlink_mmap00 \ unlink_mmap01 \ unlink_mmap02 \ file_shared \ file_append \ cow01 \ fdt_shared \ sockets00 \ sockets03 \ sockets_dgram \ file_locks00 \ file_locks01 \ file_locks02 \ file_locks03 \ file_locks04 \ file_locks05 \ file_locks06 \ file_locks07 \ file_locks08 \ netns-nf \ maps_file_prot \ socket_close_data01 \ TST_DIR = \ cwd00 \ cwd01 \ cwd02 \ overmount_dev \ overmount_file \ overmount_fifo \ overmount_sock \ tempfs \ tempfs_overmounted \ tempfs_overmounted01 \ tempfs_ro \ tempfs_ro02 \ tempfs_subns \ mnt_ro_bind \ mount_paths \ bind-mount \ inotify00 \ inotify01 \ inotify02 \ cgroup00 \ rmdir_open \ cgroup01 \ cgroup02 \ cgroup03 \ cgroup04 \ cgroup_stray \ unlink_fstat04 \ mntns_remap \ mntns_open \ mntns_link_remap \ mntns_ghost \ mntns_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ mntns_shared_bind03 \ mntns_root_bind \ mntns_root_bind02 \ mntns_overmount \ mntns_shared_vs_private \ mnt_ext_manual \ mnt_ext_auto \ mnt_ext_master \ mnt_ext_dev \ mnt_tracefs \ mntns_deleted \ unlink_regular00 \ mnt_enablefs \ autofs \ del_standalone_un \ TST_DIR_FILE = \ chroot \ chroot-file \ binfmt_misc \ TST = \ $(TST_NOFILE) \ $(TST_FILE) \ $(TST_DIR) \ $(TST_DIR_FILE) \ env00 \ fifo-rowo-pair \ umask00 \ cmdlinenv00 \ shm-unaligned \ TST_STATE = \ conntracks \ route_rules \ AUX_SRC = get_smaps_bits.c ofd_file_locks.c SRC = $(TST:%=%.c) $(AUX_SRC) OBJ = $(SRC:%.c=%.o) DEP = $(SRC:%.c=%.d) PID = $(TST:%=%.pid) OUT = $(TST:%=%.out) STATE = $(TST_STATE:%=%.state) STATE_OUT = $(TST_STATE:%=%.out) all: $(TST) criu-rtc.so install: all .PHONY: all install $(TST_NOFILE:%=%.pid): %.pid: % $(/dev/null` 2>/dev/null || break; \ sleep 1; \ echo -n .; \ i=`expr $$i + 1`; \ done; \ echo; \ [ $$i -lt $(WAIT_TIME) ] wait_stop: i=0; \ while [ $$i -lt $(WAIT_TIME) ] ; do \ kill -0 `awk '{print}' *.pid 2>/dev/null` 2>/dev/null || break; \ sleep 1; \ i=`expr $$i + 1`; 
\ done $(TST): | $(LIB) aio00: LDLIBS += -laio different_creds: LDLIBS += -lcap file_locks06 file_locks07 file_locks08: ofd_file_locks.o futex: CFLAGS += -pthread futex: LDFLAGS += -pthread futex-rl: CFLAGS += -pthread futex-rl: LDFLAGS += -pthread jobctl00: LDLIBS += -lutil socket_listen: LDLIBS += -lrt -pthread socket_aio: LDLIBS += -lrt -pthread uptime_grow: LDLIBS += -lrt -pthread unlink_largefile: CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE inotify_system_nodel: CFLAGS += -DNODEL pthread00: LDLIBS += -pthread pthread01: LDLIBS += -pthread pthread02: LDLIBS += -pthread different_creds: LDLIBS += -pthread sigpending: LDLIBS += -pthread sigaltstack: LDLIBS += -pthread seccomp_filter_tsync: LDLIBS += -pthread shm: CFLAGS += -DNEW_IPC_NS msgque: CFLAGS += -DNEW_IPC_NS sem: CFLAGS += -DNEW_IPC_NS posix_timers: LDLIBS += -lrt -pthread remap_dead_pid_root: CFLAGS += -DREMAP_PID_ROOT socket-tcp6: CFLAGS += -D ZDTM_IPV6 socket-tcpbuf6: CFLAGS += -D ZDTM_IPV6 socket-tcpbuf-local: CFLAGS += -D ZDTM_TCP_LOCAL socket-tcpbuf6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp-local: CFLAGS += -D ZDTM_TCP_LOCAL socket-tcp-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_CONNTRACK socket_listen6: CFLAGS += -D ZDTM_IPV6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK mnt_ext_manual: CFLAGS += -D ZDTM_EXTMAP_MANUAL sigpending: LDLIBS += -lrt vdso01: LDLIBS += -lrt scm01: CFLAGS += -DKEEP_SENT_FD scm02: CFLAGS += -DSEND_BOTH scm04: CFLAGS += -DSEPARATE mntns_link_remap: CFLAGS += -DZDTM_LINK_REMAP mntns_shared_bind02: CFLAGS += -DSHARED_BIND02 mntns_root_bind02: CFLAGS += -DROOT_BIND02 maps02: get_smaps_bits.o mlock_setuid: get_smaps_bits.o inotify01: CFLAGS += -DINOTIFY01 unlink_fstat01+: CFLAGS += -DUNLINK_OVER unlink_fstat04: CFLAGS += -DUNLINK_FSTAT04 ghost_holes01: CFLAGS += -DTAIL_HOLE ghost_holes02: CFLAGS += -DHEAD_HOLE sk-freebind-false: 
CFLAGS += -DZDTM_FREEBIND_FALSE stopped01: CFLAGS += -DZDTM_STOPPED_KILL stopped02: CFLAGS += -DZDTM_STOPPED_TKILL stopped12: CFLAGS += -DZDTM_STOPPED_KILL -DZDTM_STOPPED_TKILL clone_fs: LDLIBS += -pthread # As generating dependencies won't work without proper includes, # we have to explicitly specify both .o and .d for this case: netns_sub_veth.o netns_sub_veth.d: CPPFLAGS += -I/usr/include/libnl3 netns_sub_veth: LDLIBS += -lnl-3 -l nl-route-3 socket-tcp-fin-wait1: CFLAGS += -D ZDTM_TCP_FIN_WAIT1 socket-tcp-fin-wait2: CFLAGS += -D ZDTM_TCP_FIN_WAIT2 socket-tcp6-fin-wait1: CFLAGS += -D ZDTM_TCP_FIN_WAIT1 -D ZDTM_IPV6 socket-tcp6-fin-wait2: CFLAGS += -D ZDTM_TCP_FIN_WAIT2 -D ZDTM_IPV6 socket-tcp-close-wait: CFLAGS += -D ZDTM_TCP_CLOSE_WAIT socket-tcp6-close-wait: CFLAGS += -D ZDTM_TCP_CLOSE_WAIT -D ZDTM_IPV6 socket-tcp-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV6 socket-tcp6-closing: CFLAGS += -D ZDTM_IPV6 socket-tcp6-unconn: CFLAGS += -D ZDTM_IPV6 pty-console: CFLAGS += -D ZDTM_DEV_CONSOLE shm-unaligned: CFLAGS += -DZDTM_SHM_UNALIGNED s390x_regs_check: LDFLAGS += -pthread s390x_gs_threads: LDFLAGS += -pthread thread_different_uid_gid: LDLIBS += -pthread -lcap $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) clean-more: $(RM) criu-rtc.so criu-rtc.pb-c.c criu-rtc.pb-c.h .PHONY: clean-more clean: clean-more rtc.c: criu-rtc.so criu-rtc.pb-c.c: criu-rtc.proto $(Q)echo $@ >> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ $(Q)protoc-c --proto_path=. --c_out=. 
criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ $(Q)$(CC) -g -Wall -shared -nostartfiles criu-rtc.c criu-rtc.pb-c.c -o criu-rtc.so -iquote ../../../criu/include -fPIC $(filter-out -m32,$(USERCFLAGS)) .PHONY: force start check_start stop wait_stop criu-3.6/test/zdtm/static/aio00.c000066400000000000000000000011611317335042600166270ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that plain io_setup works"; const char *test_author = "Pavel Emelianov "; int main(int argc, char **argv) { int ret; io_context_t ctx = 0; test_init(argc, argv); if (io_setup(1, &ctx) < 0) { pr_perror("Can't setup io ctx"); return 1; } test_daemon(); test_waitsig(); ret = io_getevents(ctx, 0, 1, NULL, NULL); if (ret != 0) { if (ret < 0) fail("IO ctx lost (%d)", ret); else fail("IO ctx screwed up (%d)", ret); } else pass(); return 0; } criu-3.6/test/zdtm/static/aio00.desc000066400000000000000000000000541317335042600173230ustar00rootroot00000000000000{'feature': 'aio_remap', 'flags': 'nouser'} criu-3.6/test/zdtm/static/aio01.c000066400000000000000000000051271317335042600166360ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check head and tail restore correct"; const char *test_author = "Kirill Tkhai "; struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). 
*/ unsigned tail; unsigned magic; unsigned compat_features; unsigned incompat_features; unsigned header_length; /* size of aio_ring */ struct io_event io_events[0]; }; /* 128 bytes + ring size */ int main(int argc, char **argv) { struct iocb iocb, *iocbp = &iocb; volatile struct aio_ring *ring; unsigned long ctx = 0; struct io_event event; unsigned tail[2], head[2]; unsigned nr[2]; int i, fd, ret; char buf[1]; test_init(argc, argv); memset(&iocb, 0, sizeof(iocb)); if (syscall(__NR_io_setup, 64, &ctx) < 0) { pr_perror("Can't setup io ctx"); return 1; } fd = open("/dev/null", O_WRONLY); if (fd < 0) { pr_perror("Can't open /dev/null"); return 1; } iocb.aio_fildes = fd; iocb.aio_buf = (unsigned long)buf; iocb.aio_nbytes = 1; iocb.aio_lio_opcode = IOCB_CMD_PWRITE; ring = (struct aio_ring *)ctx; nr[0] = ring->nr; for (i = 0; i < nr[0] + 2; i++) { if (syscall(__NR_io_submit, ctx, 1, &iocbp) != 1) { fail("Can't submit aio"); return 1; } if (!(i % 2)) continue; ret = syscall(__NR_io_getevents, ctx, 0, 1, &event, NULL); if (ret != 1) { fail("Can't get event"); return 1; } } tail[0] = *((volatile unsigned *)&ring->tail); head[0] = *((volatile unsigned *)&ring->head); test_msg("tail=%u, head=%u, nr=%u\n", tail[0], head[0], nr[0]); test_daemon(); test_waitsig(); tail[1] = *((volatile unsigned *)&ring->tail); head[1] = *((volatile unsigned *)&ring->head); nr[1] = *((volatile unsigned *)&ring->nr); test_msg("tail=%u, head=%u, nr=%u\n", tail[1], head[1], nr[1]); if (tail[0] != tail[1] || head[0] != head[1] || nr[0] != nr[1]) { fail("missmatch"); return 1; } if (syscall(__NR_io_submit, ctx, 1, &iocbp) != 1) { fail("Can't submit aio"); return 1; } tail[1] = *((volatile unsigned *)&ring->tail); head[1] = *((volatile unsigned *)&ring->head); nr[1] = *((volatile unsigned *)&ring->nr); test_msg("tail=%u, head=%u, nr=%u\n", tail[1], head[1], nr[1]); if (tail[1] == tail[0] + 1 && head[1] == head[0] && nr[1] == nr[0]) pass(); else fail("mismatch"); return 0; } 
criu-3.6/test/zdtm/static/aio01.desc000066400000000000000000000000541317335042600173240ustar00rootroot00000000000000{'feature': 'aio_remap', 'flags': 'nouser'} criu-3.6/test/zdtm/static/apparmor.c000066400000000000000000000030761317335042600175470ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that an apparmor profile is restored"; const char *test_author = "Tycho Andersen "; #define PROFILE "criu_test" int setprofile() { char profile[1024]; int fd, len; len = snprintf(profile, sizeof(profile), "changeprofile " PROFILE); if (len < 0 || len >= sizeof(profile)) { fail("bad sprintf\n"); return -1; } fd = open("/proc/self/attr/current", O_WRONLY); if (fd < 0) { fail("couldn't open fd\n"); return -1; } /* apparmor wants this in exactly one write, so we use write() here * vs. fprintf Just To Be Sure */ len = write(fd, profile, len); close(fd); if (len < 0) { fail("couldn't write profile\n"); return -1; } return 0; } int checkprofile() { FILE *f; char path[PATH_MAX], profile[1024]; int len; sprintf(path, "/proc/self/attr/current"); f = fopen(path, "r"); if (!f) { fail("couldn't open lsm current\n"); return -1; } len = fscanf(f, "%[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d\n", len); return -1; } if (strcmp(profile, PROFILE) != 0) { fail("bad profile .%s. 
expected .%s.\n", profile, PROFILE); return -1; } return 0; } int main(int argc, char **argv) { test_init(argc, argv); setprofile(); test_daemon(); test_waitsig(); if (checkprofile() == 0) pass(); return 0; } criu-3.6/test/zdtm/static/apparmor.checkskip000077500000000000000000000001561317335042600212700ustar00rootroot00000000000000#!/bin/bash test -d /sys/kernel/security/apparmor || exit 1 apparmor_parser -r `dirname $0`/apparmor.profile criu-3.6/test/zdtm/static/apparmor.desc000066400000000000000000000000441317335042600202330ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/apparmor.profile000066400000000000000000000001301317335042600207510ustar00rootroot00000000000000# vim:syntax=apparmor profile criu_test { /** rwmlkix, capability, unix, signal, } criu-3.6/test/zdtm/static/arm-neon00.c000066400000000000000000000022171317335042600175760ustar00rootroot00000000000000#include #include #include "zdtmtst.h" const char *test_doc = "Initialize VFP registers before a migration,\n" "check the VFP state is the same after a restore."; const char *test_author = "Alexander Karatshov "; #ifdef __arm__ int main(int argc, char ** argv) { srand(time(0)); int a = rand() % 100; int b = rand() % 100; int c = rand() % 100; int y1 = a + b*c; int y2; test_init(argc, argv); asm ( ".fpu neon \n" "vmov.32 d0[0], %0 \n" "vmov.32 d1[0], %1 \n" "vmov.32 d2[0], %2 \n" ".fpu softvfp \n" : : "r"(a), "r"(b), "r"(c) ); test_msg("Preparing to wait...\n"); test_daemon(); test_waitsig(); test_msg("Restored.\n"); asm ( ".fpu neon \n" "vmul.I32 d3, d1, d2 \n" "vadd.I32 d4, d0, d3 \n" "vmov.32 %0, d4[0] \n" ".fpu softvfp \n" : "=r"(y2) ); if (y1 != y2) fail("VFP restoration failed: result = %d, expected = %d (a = %d, b = %d, c = %d)\n", y2, y1, a, b, c); else pass(); return 0; } #else int main(int argc, char *argv[]) { test_init(argc, argv); skip("This test is supposed to run on an ARM machine!"); return 0; } #endif 
criu-3.6/test/zdtm/static/arm-neon00.desc000066400000000000000000000000241317335042600202640ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/auto_dev-ioctl.h000066400000000000000000000121021317335042600206370ustar00rootroot00000000000000/* * Copyright 2008 Red Hat, Inc. All rights reserved. * Copyright 2008 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. */ #ifndef _LINUX_AUTO_DEV_IOCTL_H #define _LINUX_AUTO_DEV_IOCTL_H #include #ifdef __KERNEL__ #include #else #include #endif /* __KERNEL__ */ #define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 #define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 #define AUTOFS_DEVID_LEN 16 #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) /* * An ioctl interface for autofs mount point control. */ struct args_protover { __u32 version; }; struct args_protosubver { __u32 sub_version; }; struct args_openmount { __u32 devid; }; struct args_ready { __u32 token; }; struct args_fail { __u32 token; __s32 status; }; struct args_setpipefd { __s32 pipefd; }; struct args_timeout { __u64 timeout; }; struct args_requester { __u32 uid; __u32 gid; }; struct args_expire { __u32 how; }; struct args_askumount { __u32 may_umount; }; struct args_ismountpoint { union { struct args_in { __u32 type; } in; struct args_out { __u32 devid; __u32 magic; } out; }; }; /* * All the ioctls use this structure. * When sending a path size must account for the total length * of the chunk of memory otherwise is is the size of the * structure. 
*/ struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; __u32 size; /* total size of data passed in * including this struct */ __s32 ioctlfd; /* automount command fd */ /* Command parameters */ union { struct args_protover protover; struct args_protosubver protosubver; struct args_openmount openmount; struct args_ready ready; struct args_fail fail; struct args_setpipefd setpipefd; struct args_timeout timeout; struct args_requester requester; struct args_expire expire; struct args_askumount askumount; struct args_ismountpoint ismountpoint; }; char path[0]; }; static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) { memset(in, 0, sizeof(struct autofs_dev_ioctl)); in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; in->size = sizeof(struct autofs_dev_ioctl); in->ioctlfd = -1; return; } /* * If you change this make sure you make the corresponding change * to autofs-dev-ioctl.c:lookup_ioctl() */ enum { /* Get various version info */ AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, AUTOFS_DEV_IOCTL_PROTOVER_CMD, AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, /* Open mount ioctl fd */ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, /* Close mount ioctl fd */ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, /* Mount/expire status returns */ AUTOFS_DEV_IOCTL_READY_CMD, AUTOFS_DEV_IOCTL_FAIL_CMD, /* Activate/deactivate autofs mount */ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, AUTOFS_DEV_IOCTL_CATATONIC_CMD, /* Expiry timeout */ AUTOFS_DEV_IOCTL_TIMEOUT_CMD, /* Get mount last requesting uid and gid */ AUTOFS_DEV_IOCTL_REQUESTER_CMD, /* Check for eligible expire candidates */ AUTOFS_DEV_IOCTL_EXPIRE_CMD, /* Request busy status */ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, /* Check if path is a mountpoint */ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, }; #define AUTOFS_IOCTL 0x93 #define AUTOFS_DEV_IOCTL_VERSION \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_PROTOVER \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct 
autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_PROTOSUBVER \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_OPENMOUNT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_READY \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_FAIL \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_SETPIPEFD \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_CATATONIC \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_TIMEOUT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_REQUESTER \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_EXPIRE \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_ASKUMOUNT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) #define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) #endif /* _LINUX_AUTO_DEV_IOCTL_H */ criu-3.6/test/zdtm/static/autofs.c000066400000000000000000000454221317335042600172300ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #include "auto_dev-ioctl.h" const char *test_doc = "Autofs (v5) migration test"; const char *test_author = "Stanislav Kinsburskii "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define AUTOFS_DEV "/dev/autofs" #define INDIRECT_MNT_DIR "mnt" int autofs_dev; task_waiter_t t; 
/*
 * Append printf-style formatted text to a heap-allocated string.
 *
 * @str:  string to extend; may be NULL, in which case a new string is
 *        created. Ownership is taken: on success the returned pointer
 *        replaces it, on failure it is freed.
 * @fmt:  printf-style format.
 * @args: arguments for @fmt; left untouched (a copy is consumed on every
 *        formatting attempt).
 *
 * Returns the (possibly relocated) string, or NULL if the allocation or
 * the formatting failed.
 */
static char *xvstrcat(char *str, const char *fmt, va_list args)
{
	size_t offset = 0, delta;
	char *new;
	va_list tmp;
	int ret;

	if (str)
		offset = strlen(str);

	/*
	 * First guess for the room needed. The "+ 1" guarantees space for
	 * the terminating NUL even when the format string is empty.
	 */
	delta = strlen(fmt) * 2 + 1;

	for (;;) {
		new = realloc(str, offset + delta);
		if (!new) {
			/* realloc failed. We must release former string */
			pr_err("Failed to allocate string\n");
			free(str);
			return NULL;
		}
		str = new;

		va_copy(tmp, args);
		ret = vsnprintf(str + offset, delta, fmt, tmp);
		va_end(tmp);

		if (ret < 0) {
			/* vsnprintf failed */
			pr_err("Failed to print string\n");
			free(str);
			return NULL;
		}

		/*
		 * A result shorter than the buffer means everything,
		 * including an empty expansion, has been written.
		 */
		if ((size_t)ret < delta)
			return str;

		/* vsnprintf reported how many bytes it needs (without NUL) */
		delta = (size_t)ret + 1;
	}
}
/*
 * Unmount @mountpoint, but only if the filesystem currently visible there
 * has the type @magic (an f_type value as reported by statfs()).
 *
 * Returns 0 when nothing had to be done or the unmount succeeded, and a
 * negative errno value when statfs() or umount() failed.
 */
static int umount_fs(const char *mountpoint, int magic)
{
	struct statfs fs_info;

	if (statfs(mountpoint, &fs_info) != 0) {
		pr_perror("%s: failed to statfs", mountpoint);
		return -errno;
	}

	/* Something else (or nothing) is mounted here: leave it alone */
	if (fs_info.f_type != magic)
		return 0;

	if (umount(mountpoint) >= 0)
		return 0;

	pr_perror("failed to umount %s tmpfs", mountpoint);
	return -errno;
}
long)st.st_blksize, (long long)p->fd_stat.st_blksize); ret++; } if (st.st_blocks != p->fd_stat.st_blocks) { pr_err("%s: st_blocks differs: %lld != %lld\n", p->mountpoint, (long long)st.st_blocks, (long long)p->fd_stat.st_blocks); ret++; } return ret; } static int check_automount(struct autofs_params *p) { int err; char *mountpoint; err = check_fd(p); if (err) { pr_err("%s: opened file descriptor wasn't migrated properly\n", p->mountpoint); return err; } if (p->type == AUTOFS_TYPE_DIRECT) mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); else if (p->type == AUTOFS_TYPE_INDIRECT) mountpoint = xsprintf("%s/%s/%s", dirname, p->mountpoint, INDIRECT_MNT_DIR); else { pr_err("Unknown autofs type: %d\n", p->type); return -EINVAL; } if (!mountpoint) { pr_err("failed to allocate string\n"); return -ENOMEM; } if (close(p->fd)) { pr_err("%s: failed to close fd %d\n", p->mountpoint, p->fd); return -errno; } err = umount_fs(mountpoint, TMPFS_MAGIC); if (err) return err; free(mountpoint); err = p->setup(p); if (err) { pr_err("autofs doesn't workafter restore\n"); return err; } if (close(p->fd)) { pr_perror("%s: failed to close fd %d", mountpoint, p->fd); return -errno; } return 0; } static int autofs_dev_open(void) { int fd; if (access(AUTOFS_DEV, R_OK | W_OK)) { pr_perror("Device /dev/autofs is not accessible"); return -1; } fd = open(AUTOFS_DEV, O_RDONLY); if (fd == -1) { pr_perror("failed to open /dev/autofs"); return -errno; } return fd; } static int autofs_open_mount(int devid, const char *mountpoint) { struct autofs_dev_ioctl *param; size_t size; int fd; size = sizeof(struct autofs_dev_ioctl) + strlen(mountpoint) + 1; param = malloc(size); init_autofs_dev_ioctl(param); param->size = size; param->ioctlfd = -1; param->openmount.devid = devid; strcpy(param->path, mountpoint); if (ioctl(autofs_dev, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0) { pr_perror("failed to open autofs mount %s", mountpoint); return -errno; } fd = param->ioctlfd; free(param); return fd; } static int 
autofs_report_result(int token, int devid, const char *mountpoint, int result) { int ioctl_fd; struct autofs_dev_ioctl param; int err; ioctl_fd = autofs_open_mount(devid, mountpoint); if (ioctl_fd < 0) { pr_err("failed to open autofs mountpoint %s\n", mountpoint); return ioctl_fd; } init_autofs_dev_ioctl(¶m); param.ioctlfd = ioctl_fd; if (result) { param.fail.token = token; param.fail.status = result; } else param.ready.token = token; err = ioctl(autofs_dev, result ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, ¶m); if (err) { pr_perror("failed to report result to autofs mountpoint %s", mountpoint); err = -errno; } close(ioctl_fd); return err; } static int mount_tmpfs(const char *mountpoint) { struct statfs buf; if (statfs(mountpoint, &buf)) { pr_perror("failed to statfs %s", mountpoint); return -errno; } if (buf.f_type == TMPFS_MAGIC) return 0; if (mount("autofs_test", mountpoint, "tmpfs", 0, "size=1M") < 0) { pr_perror("failed to mount tmpfs to %s", mountpoint); return -errno; } return 0; } static int autofs_mount_direct(const char *mountpoint, const struct autofs_v5_packet *packet) { int err; const char *direct_mnt = mountpoint; err = mount_tmpfs(direct_mnt); if (err) pr_err("%d: failed to mount direct autofs mountpoint\n", getpid()); return err; } static int autofs_mount_indirect(const char *mountpoint, const struct autofs_v5_packet *packet) { char *indirect_mnt; int err; indirect_mnt = xsprintf("%s/%s", mountpoint, packet->name); if (!indirect_mnt) { pr_err("failed to allocate indirect mount path\n"); return -ENOMEM; } if ((mkdir(indirect_mnt, 0755) < 0) && (errno != EEXIST)) { pr_perror("failed to create %s directory", indirect_mnt); return -errno; } err = mount_tmpfs(indirect_mnt); if (err) pr_err("%d: failed to mount indirect autofs mountpoint\n", getpid()); return err; } static int automountd_serve(const char *mountpoint, struct autofs_params *p, const union autofs_v5_packet_union *packet) { const struct autofs_v5_packet *v5_packet = &packet->v5_packet; 
int err, res; switch (packet->hdr.type) { case autofs_ptype_missing_indirect: res = autofs_mount_indirect(mountpoint, v5_packet); break; case autofs_ptype_missing_direct: res = autofs_mount_direct(mountpoint, v5_packet); break; case autofs_ptype_expire_indirect: pr_err("%d: expire request for indirect mount %s?", getpid(), v5_packet->name); return -EINVAL; case autofs_ptype_expire_direct: pr_err("%d: expire request for direct mount?", getpid()); return -EINVAL; default: pr_err("unknown request type: %d\n", packet->hdr.type); return -EINVAL; } err = autofs_report_result(v5_packet->wait_queue_token, v5_packet->dev, mountpoint, res); if (err) return err; return res; } static int automountd_loop(int pipe, const char *mountpoint, struct autofs_params *param) { union autofs_v5_packet_union *packet; ssize_t bytes; size_t psize = sizeof(*packet); int err = 0; packet = malloc(psize); if (!packet) { pr_err("failed to allocate autofs packet\n"); return -ENOMEM; } /* Allow SIGUSR2 to interrupt system call */ siginterrupt(SIGUSR2, 1); while (!stop && !err) { memset(packet, 0, psize); bytes = read(pipe, packet, psize); if (bytes < 0) { if (errno != EINTR) { pr_perror("failed to read packet"); return -errno; } continue; } if (bytes != psize) { pr_err("read less than expected: %zd < %zd\n", bytes, psize); return -EINVAL; } err = automountd_serve(mountpoint, param, packet); if (err) pr_err("request to autofs failed: %d\n", err); } return err; } static int automountd(struct autofs_params *p, int control_fd) { int pipes[2]; char *autofs_path; char *options; int ret = -1; char *type; my_type = p; if (p->onexit) atexit(p->onexit); autofs_path = xsprintf("%s/%s", dirname, p->mountpoint); if (!autofs_path) { pr_err("failed to allocate autofs path"); goto err; } if (pipe(pipes) < 0) { pr_perror("%d: failed to create pipe", getpid()); goto err; } if (setpgrp() < 0) { pr_perror("failed to become a process group leader"); goto err; } switch (p->type) { case AUTOFS_TYPE_DIRECT: type = 
"direct"; break; case AUTOFS_TYPE_INDIRECT: type = "indirect"; break; case AUTOFS_TYPE_OFFSET: type = "offset"; break; default: pr_err("unknown autofs type: %d\n", p->type); return -EINVAL; } options = xsprintf("fd=%d,pgrp=%d,minproto=5,maxproto=5,%s", pipes[1], getpgrp(), type); if (!options) { pr_err("failed to allocate autofs options\n"); goto err; } if (mkdir(autofs_path, 0600) < 0) { pr_perror("failed to create %s", autofs_path); test_msg("cwd: %s\n", get_current_dir_name()); goto err; } if (mount("autofs_test", autofs_path, "autofs", 0, options) < 0) { pr_perror("failed to mount autofs with options \"%s\"", options); goto err; } if (p->close_pipe) close(pipes[1]); ret = 0; if (write(control_fd, &ret, sizeof(ret)) != sizeof(ret)) { pr_perror("failed to send result"); goto err; } close(control_fd); task_waiter_complete(&t, getpid()); return automountd_loop(pipes[0], autofs_path, p); err: if (write(control_fd, &ret, sizeof(ret) != sizeof(ret))) { pr_perror("failed to send result"); return -errno; } return ret; } static int start_automounter(struct autofs_params *p) { int pid; int control_fd[2]; ssize_t bytes; int ret; if (pipe(control_fd) < 0) { pr_perror("failed to create control_fd pipe"); return -errno; } pid = test_fork(); switch (pid) { case -1: pr_perror("failed to fork"); return -1; case 0: close(control_fd[0]); exit(automountd(p, control_fd[1])); } task_waiter_wait4(&t, pid); p->pid = pid; close(control_fd[1]); bytes = read(control_fd[0], &ret, sizeof(ret)); close(control_fd[0]); if (bytes < 0) { pr_perror("failed to get start result"); return -errno; } if (bytes != sizeof(ret)) { pr_err("received less than expected: %zu. 
Child %d died?\n", bytes, p->pid); return -EINVAL; } return ret; } static void do_stop(int sig) { stop = 1; } static int reap_child(struct autofs_params *p) { int status; int pid = p->pid; if (kill(pid, SIGUSR2)) { pr_perror("failed to kill child %d", pid); return -errno; } if (waitpid(pid, &status, 0) == -1) { pr_perror("failed to collect child %d", pid); return -errno; } if (WIFSIGNALED(status)) { pr_err("Child was killed by %d\n", WTERMSIG(status)); return -1; } return WEXITSTATUS(status); } static int reap_catatonic(struct autofs_params *p) { char *mountpoint; int err; mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); if (!mountpoint) { pr_err("failed to allocate string\n"); return -ENOMEM; } err = umount_fs(mountpoint, AUTOFS_SUPER_MAGIC); if (!err) { if (rmdir(mountpoint) < 0) { skip("failed to remove %s directory: %s\n", mountpoint, strerror(errno)); err = -errno; } } return err; } static int setup_catatonic(struct autofs_params *p) { char *path; path = xsprintf("%s/%s/file", dirname, p->mountpoint); if (!path) { pr_err("failed to allocate path\n"); return -ENOMEM; } p->fd = open(path, O_CREAT | O_EXCL, 0600); if (p->fd >= 0) { pr_perror("%d: was able to open file %s on catatonic mount", getpid(), path); return -EINVAL; } free(path); return 0; } static int check_catatonic(struct autofs_params *p) { char *mountpoint; struct statfs buf; mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); if (!mountpoint) { pr_err("failed to allocate path\n"); return -ENOMEM; } if (statfs(mountpoint, &buf)) { pr_perror("%s: failed to statfs", mountpoint); return -errno; } if (buf.f_type != AUTOFS_SUPER_MAGIC) { pr_err("Non-autofs mount on path %s\n", mountpoint); return -EINVAL; } return setup_catatonic(p); } static int create_catatonic(struct autofs_params *p) { int err; int status; err = start_automounter(p); if (err) return err; if (kill(p->pid, SIGKILL)) { pr_perror("failed to kill child %d", p->pid); return -errno; } if (waitpid(p->pid, &status, 0) == -1) { 
pr_perror("failed to collect child %d", p->pid); return -errno; } return 0; } static void test_exit(void) { if (rmdir(dirname) < 0) skip("failed to remove %s directory: %s\n", dirname, strerror(errno)); } typedef enum { AUTOFS_START, AUTOFS_SETUP, AUTOFS_CHECK, AUTOFS_STOP } autfs_test_action; static int test_action(autfs_test_action act, struct autofs_params *p) { int ret = 0; while(p->mountpoint) { int (*action)(struct autofs_params *p); switch (act) { case AUTOFS_START: action = p->create; break; case AUTOFS_SETUP: action = p->setup; break; case AUTOFS_CHECK: action = p->check; break; case AUTOFS_STOP: action = p->reap; break; default: pr_err("unknown action: %d\n", act); return -1; } if (action && action(p)) ret++; p++; } return ret; } static void direct_exit(void) { struct autofs_params *p = my_type; char *mountpoint; mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); if (!mountpoint) { pr_err("failed to allocate string\n"); return; } if (umount_fs(mountpoint, TMPFS_MAGIC)) return; if (umount_fs(mountpoint, AUTOFS_SUPER_MAGIC)) return; if (rmdir(mountpoint) < 0) skip("failed to remove %s directory: %s\n", mountpoint, strerror(errno)); } static void indirect_exit(void) { struct autofs_params *p = my_type; char *mountpoint, *tmpfs; mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); if (!mountpoint) { pr_err("failed to allocate string\n"); return; } tmpfs = xsprintf("%s/%s/%s", dirname, p->mountpoint, INDIRECT_MNT_DIR); if (!tmpfs) { pr_err("failed to allocate string\n"); return; } if (!access(tmpfs, F_OK)) { if (umount_fs(tmpfs, TMPFS_MAGIC)) return; } if (umount_fs(mountpoint, AUTOFS_SUPER_MAGIC)) return; if (rmdir(mountpoint) < 0) skip("failed to remove %s directory: %s\n", mountpoint, strerror(errno)); } enum autofs_tests { AUTOFS_DIRECT, AUTOFS_INDIRECT, AUTOFS_CATATONIC, }; struct autofs_params autofs_types[] = { [AUTOFS_DIRECT] = { .mountpoint = "direct", .create = start_automounter, .setup = setup_direct, .check = check_automount, .reap = 
reap_child, .type = AUTOFS_TYPE_DIRECT, .fd = -1, .onexit = direct_exit, .close_pipe = 1, }, [AUTOFS_INDIRECT] = { .mountpoint = "indirect", .create = start_automounter, .setup = setup_indirect, .check = check_automount, .reap = reap_child, .type = AUTOFS_TYPE_INDIRECT, .fd = -1, .onexit = indirect_exit, .close_pipe = 0, }, [AUTOFS_CATATONIC] = { .mountpoint = "catatonic", .create = create_catatonic, .setup = setup_catatonic, .check = check_catatonic, .reap = reap_catatonic, .type = AUTOFS_TYPE_DIRECT, .onexit = NULL, .fd = -1, .close_pipe = 1, }, { NULL, NULL, NULL, NULL } }; int main(int argc, char **argv) { int ret = 0; test_init(argc, argv); task_waiter_init(&t); if (mkdir(dirname, 0777) < 0) { pr_perror("failed to create %s directory", dirname); return -1; } autofs_dev = autofs_dev_open(); if (autofs_dev < 0) return -1; if (signal(SIGUSR2, do_stop) == SIG_ERR) { pr_perror("Failed to set SIGUSR2 handler"); return -1; } if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { pr_perror("Failed to set SIGPIPE handler"); return -1; } if (test_action(AUTOFS_START, autofs_types)) { pr_err("AUTOFS_START action failed\n"); ret++; goto err; } close(autofs_dev); atexit(test_exit); if (test_action(AUTOFS_SETUP, autofs_types)) { pr_err("AUTOFS_SETUP action failed\n"); ret++; goto err; } test_daemon(); test_waitsig(); if (test_action(AUTOFS_CHECK, autofs_types)) { pr_err("AUTOFS_CHECK action failed\n"); ret++; } err: if (test_action(AUTOFS_STOP, autofs_types)) { pr_err("AUTOFS_STOP action failed\n"); ret++; } if (ret) { fail(); return ret; } pass(); return 0; } criu-3.6/test/zdtm/static/autofs.desc000066400000000000000000000000671317335042600177200ustar00rootroot00000000000000{'feature': 'autofs', 'flavor': 'ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/bind-mount.c000066400000000000000000000025651317335042600200040ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check bind-mounts"; const char 
*test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { char test_dir[PATH_MAX], test_bind[PATH_MAX]; char test_file[PATH_MAX], test_bind_file[PATH_MAX]; int fd; test_init(argc, argv); mkdir(dirname, 0700); snprintf(test_dir, sizeof(test_dir), "%s/test", dirname); snprintf(test_bind, sizeof(test_bind), "%s/bind", dirname); snprintf(test_file, sizeof(test_file), "%s/test/test.file", dirname); snprintf(test_bind_file, sizeof(test_bind_file), "%s/bind/test.file", dirname); mkdir(test_dir, 0700); mkdir(test_bind, 0700); if (mount(test_dir, test_bind, NULL, MS_BIND, NULL)) { pr_perror("Unable to mount %s to %s", test_dir, test_bind); return 1; } test_daemon(); test_waitsig(); fd = open(test_file, O_CREAT | O_WRONLY | O_EXCL, 0600); if (fd < 0) { pr_perror("Unable to open %s", test_file); return 1; } close(fd); if (access(test_bind_file, F_OK)) { pr_perror("%s doesn't exist", test_bind_file); return 1; } if (umount(test_bind)) { pr_perror("Unable to umount %s", test_bind); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/bind-mount.desc000066400000000000000000000000461317335042600204700ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/binfmt_misc.c000066400000000000000000000073221317335042600202160ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that binfmt_misc entries remain registered"; const char *test_author = "Kirill Tkhai $tname/${name}_magic fi if [ -e $tname/${name}_extention ]; then echo -1 > $tname/${name}_extention fi set -e umount "$tname" rmdir "$tname" criu-3.6/test/zdtm/static/bridge.c000066400000000000000000000047011317335042600171560ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "check that 
empty bridges are c/r'd correctly"; const char *test_author = "Tycho Andersen "; #define BRIDGE_NAME "zdtmbr0" int add_bridge(void) { if (system("ip link add " BRIDGE_NAME " type bridge")) return -1; if (system("ip addr add 10.0.55.55/32 dev " BRIDGE_NAME)) return -1; /* use a link local address so we can test scope_id change */ if (system("ip addr add fe80:4567::1/64 nodad dev " BRIDGE_NAME)) return -1; if (system("ip link set " BRIDGE_NAME " up")) return -1; return 0; } int del_bridge(void) { /* don't check for errors, let's try to make sure it's deleted */ system("ip link set " BRIDGE_NAME " down"); if (system("ip link del " BRIDGE_NAME)) return -1; return 0; } int main(int argc, char **argv) { int ret = 1; struct sockaddr_in6 addr; int sk; test_init(argc, argv); if (add_bridge() < 0) return 1; sk = socket(AF_INET6, SOCK_DGRAM, 0); if (sk < 0) { fail("can't get socket"); goto out; } memset(&addr, 0, sizeof(addr)); addr.sin6_port = htons(0); addr.sin6_family = AF_INET6; if (inet_pton(AF_INET6, "fe80:4567::1", &addr.sin6_addr) < 0) { fail("can't convert inet6 addr"); goto out; } addr.sin6_scope_id = if_nametoindex(BRIDGE_NAME); if (bind(sk, (struct sockaddr*)&addr, sizeof(addr)) < 0) { fail("can't bind"); goto out; } /* Here, we grep for inet because some of the IPV6 DAD stuff can be * racy, and all we really care about is that the bridge got restored * with the right MAC, since we know DAD will succeed eventually. * * (I got this race with zdtm.py, but not with zdtm.sh; not quite sure * what the environment difference is/was.) 
*/ if (system("ip addr list dev " BRIDGE_NAME " | grep inet | sort > bridge.dump.test")) { pr_perror("can't save net config"); fail("Can't save net config"); goto out; } test_daemon(); test_waitsig(); if (system("ip addr list dev " BRIDGE_NAME " | grep inet | sort > bridge.rst.test")) { fail("Can't get net config"); goto out; } if (system("diff bridge.rst.test bridge.dump.test")) { fail("Net config differs after restore"); goto out; } pass(); ret = 0; out: del_bridge(); return ret; } criu-3.6/test/zdtm/static/bridge.desc000066400000000000000000000003141317335042600176460ustar00rootroot00000000000000{ 'deps': [ '/bin/sh', '/usr/bin/sort', '/bin/grep', '/sbin/ip|/bin/ip', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} criu-3.6/test/zdtm/static/busyloop00.c000066400000000000000000000004131317335042600177320ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "Run busy loop while migrating"; const char *test_author = "Roman Kagan "; int main(int argc, char ** argv) { test_init(argc, argv); test_daemon(); while (test_go()) ; pass(); return 0; } criu-3.6/test/zdtm/static/caps00.c000066400000000000000000000072001317335042600170050ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that aps are preserved"; const char *test_author = "Pavel Emelianov "; struct cap_hdr { unsigned int version; int pid; }; struct cap_data { unsigned int eff; unsigned int prm; unsigned int inh; }; #define _LINUX_CAPABILITY_VERSION_3 0x20080522 #define _LINUX_CAPABILITY_U32S_3 2 #define CAP_CHOWN 0 #define CAP_DAC_OVERRIDE 1 int capget(struct cap_hdr *hdrp, struct cap_data *datap); int capset(struct cap_hdr *hdrp, const struct cap_data *datap); static int cap_last_cap = 63; #define NORM_CAPS(v, cap) v[1].cap &= (1LL << (cap_last_cap + 1 - 32)) - 1; int main(int argc, char **argv) { task_waiter_t t; int pid, result_pipe[2]; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; struct cap_data 
data_2[_LINUX_CAPABILITY_U32S_3]; char res = 'x'; FILE *f; test_init(argc, argv); task_waiter_init(&t); f = fopen("/proc/sys/kernel/cap_last_cap", "r"); if (f) { if (fscanf(f, "%d", &cap_last_cap) != 1) { pr_perror("Unable to read cal_last_cap"); return 1; } fclose(f); } else test_msg("/proc/sys/kernel/cap_last_cap is not available\n"); if (pipe(result_pipe)) { pr_perror("Can't create pipe"); return 1; } pid = test_fork(); if (pid == 0) { struct cap_hdr hdr; if (prctl(PR_CAPBSET_DROP, CAP_SETPCAP, 0, 0, 0)) { res = 'x'; task_waiter_complete_current(&t); goto bad; } hdr.version = _LINUX_CAPABILITY_VERSION_3; hdr.pid = 0; if (capget(&hdr, data) < 0) { pr_perror("capget"); return -1; } hdr.version = _LINUX_CAPABILITY_VERSION_3; hdr.pid = 0; data[0].eff &= ~((1 << CAP_CHOWN) | (1 << CAP_DAC_OVERRIDE)); data[0].prm &= ~(1 << CAP_DAC_OVERRIDE); if (capset(&hdr, data) < 0) { pr_perror("capset"); return -1; } task_waiter_complete_current(&t); task_waiter_wait4(&t, getppid()); hdr.version = _LINUX_CAPABILITY_VERSION_3; hdr.pid = 0; if (capget(&hdr, data_2) < 0) { pr_perror("second capget"); return -1; } NORM_CAPS(data, eff); NORM_CAPS(data, prm); NORM_CAPS(data, inh); NORM_CAPS(data_2, eff); NORM_CAPS(data_2, prm); NORM_CAPS(data_2, inh); if (data[0].eff != data_2[0].eff) { res = '1'; goto bad; } if (data[1].eff != data_2[1].eff) { res = '2'; goto bad; } if (data[0].prm != data_2[0].prm) { res = '3'; goto bad; } if (data[1].prm != data_2[1].prm) { res = '4'; goto bad; } if (data[0].inh != data_2[0].inh) { res = '3'; goto bad; } if (data[1].inh != data_2[1].inh) { res = '4'; goto bad; } if (prctl(PR_CAPBSET_READ, CAP_SETPCAP, 0, 0, 0) != 0) { res='5'; goto bad; } res = '0'; bad: write(result_pipe[1], &res, 1); if (res != '0') { write(result_pipe[1], data, sizeof(data)); write(result_pipe[1], data_2, sizeof(data_2)); } close(result_pipe[0]); close(result_pipe[1]); _exit(0); } task_waiter_wait4(&t, pid); test_daemon(); test_waitsig(); task_waiter_complete_current(&t); 
read(result_pipe[0], &res, 1); if (res == '0') pass(); else { read(result_pipe[0], data, sizeof(data)); read(result_pipe[0], data_2, sizeof(data_2)); test_msg("{eff,prm,inh}[]={%08x,%08x,%08x}, {%08x,%08x,%08x}\n", data[0].eff, data[0].prm, data[0].inh, data[1].eff, data[1].prm, data[1].inh); test_msg("{eff,prm,inh}[]={%08x,%08x,%08x}, {%08x,%08x,%08x}\n", data_2[0].eff, data_2[0].prm, data_2[0].inh, data_2[1].eff, data_2[1].prm, data_2[1].inh); fail("Fail: %c", res); } close(result_pipe[0]); close(result_pipe[1]); return 0; } criu-3.6/test/zdtm/static/caps00.desc000066400000000000000000000000221317335042600174740ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/cgroup00.c000066400000000000000000000066311317335042600173650ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that cgroups layout is preserved"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; #define SUBNAME "subcg00" #define SUBNAME2 SUBNAME"/subsubcg" static int cg_move(char *name) { int cgfd, l; char paux[256]; sprintf(paux, "%s/%s", dirname, name); mkdir(paux, 0600); sprintf(paux, "%s/%s/tasks", dirname, name); cgfd = open(paux, O_WRONLY); if (cgfd < 0) { pr_perror("Can't open tasks"); return -1; } l = write(cgfd, "0", 2); close(cgfd); if (l < 0) { pr_perror("Can't move self to subcg"); return -1; } return 0; } static int cg_check(char *name) { int found = 0; FILE *cgf; char paux[256], aux[128]; cgf = fopen("/proc/self/cgroup", "r"); if (cgf == NULL) return -1; sprintf(aux, "name=%s:/%s\n", cgname, name); while (fgets(paux, sizeof(paux), cgf)) { char *s; s = strchr(paux, ':') + 1; test_msg("CMP [%s] vs [%s]\n", s, aux); if (!strcmp(s, aux)) { found = 1; break; } } fclose(cgf); return found ? 
/*
 * cgroup layout test.
 *
 * A named cgroup hierarchy is mounted on dirname and the process moves
 * itself into SUBNAME. It then forks a child that stays in SUBNAME and a
 * grandchild that moves itself into SUBNAME2. After the restore every
 * task checks with cg_check() that it is still in the cgroup it had
 * before the dump; the kids send their verdict through p1/p2.
 *
 * NOTE(review): the function returns 0 on every path, including the
 * failure ones; the outcome is only conveyed through pass()/fail().
 */
int main(int argc, char **argv)
{
	char aux[64];
	int p1[2], p2[2], pr[2], status;

	test_init(argc, argv);

	/*
	 * Pipes to talk to two kids.
	 * First, they report that they are ready (int),
	 * then they report the restore status (int).
	 */

	pipe(p1);
	pipe(p2);

	/* "Restore happened" pipe */
	pipe(pr);

	if (mkdir(dirname, 0700) < 0) {
		pr_perror("Can't make dir");
		goto out;
	}

	/* mount a named hierarchy with no controllers attached */
	sprintf(aux, "none,name=%s", cgname);
	if (mount("none", dirname, "cgroup", 0, aux)) {
		pr_perror("Can't mount cgroups");
		goto out_rd;
	}

	if (cg_move(SUBNAME))
		goto out_rs;

	if (fork() == 0) {
		if (fork() == 0) {
			/*
			 * 2nd level kid -- moves into its own
			 * cgroup and triggers slow-path cg_set
			 * restore in criu
			 */

			close(p1[0]);
			close(p1[1]);
			close(p2[0]);
			close(pr[1]);

			/* first report: result of moving into SUBNAME2 */
			status = cg_move(SUBNAME2);
			write(p2[1], &status, sizeof(status));

			if (status == 0) {
				/*
				 * Nobody writes to pr: this read returns
				 * once all write ends of the pipe are closed.
				 */
				read(pr[0], &status, sizeof(status));

				/* second report: cgroup membership check */
				status = cg_check(SUBNAME2);
				write(p2[1], &status, sizeof(status));
			}

			exit(0);
		}

		/*
		 * 1st level kid -- inherits cgroup from
		 * parent and triggers fast-path cg_set
		 * restore in criu
		 */

		close(p1[0]);
		close(p2[0]);
		close(p2[1]);
		close(pr[1]);

		/* first report: ready (nothing can fail here) */
		status = 0;
		write(p1[1], &status, sizeof(status));

		/* wait until the parent closes its write end of pr */
		read(pr[0], &status, sizeof(status));

		/* second report: cgroup membership check */
		status = cg_check(SUBNAME);
		write(p1[1], &status, sizeof(status));

		exit(0);
	}

	close(p1[1]);
	close(p2[1]);
	close(pr[0]);

	/* status stays -1 if a kid died without reporting */
	status = -1;
	read(p1[0], &status, sizeof(status));
	if (status != 0)
		goto out_ks;

	status = -1;
	read(p2[0], &status, sizeof(status));
	if (status != 0)
		goto out_ks;

	test_daemon();
	test_waitsig();

	/* closing the write end wakes up both kids blocked in read(pr[0]) */
	close(pr[1]);

	if (cg_check(SUBNAME)) {
		fail("Top level task cg changed");
		goto out_rs;
	}

	status = -1;
	read(p1[0], &status, sizeof(status));
	if (status != 0) {
		fail("1st level task cg changed");
		goto out_rs;
	}

	status = -1;
	read(p2[0], &status, sizeof(status));
	if (status != 0) {
		fail("2nd level task cg changed");
		goto out_rs;
	}

	pass();

out_rs:
	umount(dirname);
out_rd:
	rmdir(dirname);
out:
	return 0;

out_ks:
	pr_perror("Error moving into cgroups");
	/*
	 * NOTE(review): pr[0] was already closed above, so this is a second
	 * close of the same descriptor; pr[1] (which would release the kids)
	 * looks like the intended target -- confirm.
	 */
	close(pr[0]);
	goto out_rs;
}
criu-3.6/test/zdtm/static/cgroup00.desc000066400000000000000000000000751317335042600200550ustar00rootroot00000000000000{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} criu-3.6/test/zdtm/static/cgroup00.hook000077500000000000000000000005101317335042600200740ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 set -e tname=$(mktemp -d cgclean.XXXXXX) mount -t cgroup none $tname -o "none,name=zdtmtst" echo "Cleaning $tname" set +e rmdir "$tname/subcg00/subsubcg/" rmdir "$tname/subcg00/" set -e echo "Left there is:" ls "$tname" umount "$tname" rmdir "$tname" criu-3.6/test/zdtm/static/cgroup01.c000066400000000000000000000042261317335042600173640ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that empty cgroups are preserved"; const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; static const char *subname = "subcg01"; static const char *empty = "empty"; int main(int argc, char **argv) { int cgfd, l, ret = 1, i; char aux[1024], paux[1024]; FILE *cgf; struct stat st; test_init(argc, argv); if (mkdir(dirname, 0700) < 0) { pr_perror("Can't make dir"); goto out; } sprintf(aux, "none,name=%s", cgname); if (mount("none", dirname, "cgroup", 0, aux)) { pr_perror("Can't mount cgroups"); goto out_rd; } sprintf(paux, "%s/%s", dirname, subname); mkdir(paux, 0600); l = sprintf(aux, "%d", getpid()); sprintf(paux, "%s/%s/tasks", dirname, subname); cgfd = open(paux, O_WRONLY); if (cgfd < 0) { pr_perror("Can't open tasks"); goto out_rs; } l = write(cgfd, aux, l); close(cgfd); if (l < 0) { pr_perror("Can't move self to subcg"); goto out_rs; } for (i = 0; i < 2; i++) { sprintf(paux, "%s/%s/%s.%d", dirname, subname, empty, i); if (mkdir(paux, 0600)) { pr_perror("mkdir %s", paux); goto out_rs; } } test_daemon(); test_waitsig(); cgf = 
fopen("/proc/self/mountinfo", "r"); if (cgf == NULL) { fail("No mountinfo file"); goto out_rs; } while (fgets(paux, sizeof(paux), cgf)) { char *s; s = strstr(paux, cgname); if (!s) continue; sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { sprintf(paux, "%s/%s/%s.%d", aux, subname, empty, i); if (stat(paux, &st)) { fail("couldn't stat %s\n", paux); ret = -1; goto out_close; } if (!S_ISDIR(st.st_mode)) { fail("%s is not a directory\n", paux); ret = -1; goto out_close; } } pass(); ret = 0; goto out_close; } fail("empty cgroup not found!\n"); out_close: fclose(cgf); out_rs: umount(dirname); out_rd: rmdir(dirname); out: return ret; } criu-3.6/test/zdtm/static/cgroup01.desc000066400000000000000000000000751317335042600200560ustar00rootroot00000000000000{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} criu-3.6/test/zdtm/static/cgroup01.hook000077500000000000000000000005471317335042600201070ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 set -e tname=$(mktemp -d cgclean.XXXXXX) mount -t cgroup none $tname -o "none,name=zdtmtst" echo "Cleaning $tname" set +e rmdir "$tname/subcg01/empty.0/" rmdir "$tname/subcg01/empty.1/" rmdir "$tname/subcg01/" set -e echo "Left there is:" ls "$tname" umount "$tname" rmdir "$tname" criu-3.6/test/zdtm/static/cgroup02.c000066400000000000000000000064351317335042600173710ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that empty cgroups are preserved"; const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; static const char *subname = "oldroot"; static const char *cgname2 = "zdtmtst.defaultroot"; int mount_and_add(const char *controller, const char *prefix, const char *path) { char aux[1024], paux[1024], subdir[1024]; 
int cgfd, l; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { pr_perror("Can't make dir"); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { pr_perror("Can't make dir"); return -1; } sprintf(aux, "none,name=%s", controller); if (mount("none", subdir, "cgroup", 0, aux)) { pr_perror("Can't mount cgroups"); goto err_rd; } sprintf(paux, "%s/%s", subdir, prefix); mkdir(paux, 0600); sprintf(paux, "%s/%s/%s", subdir, prefix, path); mkdir(paux, 0600); l = sprintf(aux, "%d", getpid()); sprintf(paux, "%s/%s/%s/tasks", subdir, prefix, path); cgfd = open(paux, O_WRONLY); if (cgfd < 0) { pr_perror("Can't open tasks"); goto err_rs; } l = write(cgfd, aux, l); close(cgfd); if (l < 0) { pr_perror("Can't move self to subcg"); goto err_rs; } return 0; err_rs: umount(dirname); err_rd: rmdir(dirname); return -1; } bool test_exists(char *mountinfo_line, char *path) { char aux[1024], paux[1024]; struct stat st; sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); test_msg("found cgroup at %s\n", aux); sprintf(paux, "%s/%s", aux, path); if (stat(paux, &st)) { return false; } if (!S_ISDIR(st.st_mode)) { return false; } return true; } int main(int argc, char **argv) { FILE *cgf; bool found_zdtmtstroot = false, found_newroot = false; char paux[1024]; int ret = -1; int fd; test_init(argc, argv); if (mount_and_add(cgname, "prefix", subname)) goto out; if (mount_and_add(cgname2, "prefix", subname)) { sprintf(paux, "%s/%s", dirname, cgname); umount(paux); rmdir(paux); goto out; } sprintf(paux, "%s/%s/prefix", dirname, cgname); fd = open(paux, O_DIRECTORY); if (fd < 0) goto out_umount; if (fchmod(fd, 0777) < 0) { fail("fchmod"); goto out_umount; } test_daemon(); test_waitsig(); if (close(fd) < 0) { fail("fd didn't survive"); goto out_umount; } cgf = fopen("/proc/self/mountinfo", "r"); if (cgf == NULL) { fail("No mountinfo file"); goto out_umount; } while (fgets(paux, sizeof(paux), cgf)) { char *s; s = strstr(paux, cgname); if (s && test_exists(paux, 
"prefix")) { found_zdtmtstroot = true; } s = strstr(paux, cgname2); if (s && test_exists(paux, "newroot")) { found_newroot = true; } } if (!found_zdtmtstroot) { fail("oldroot not rewritten to zdtmtstroot!\n"); goto out_close; } if (!found_newroot) { fail("oldroot not rewritten to newroot!\n"); goto out_close; } pass(); ret = 0; out_close: fclose(cgf); out_umount: sprintf(paux, "%s/%s", dirname, cgname); umount(paux); rmdir(paux); sprintf(paux, "%s/%s", dirname, cgname2); umount(paux); rmdir(paux); out: return ret; } criu-3.6/test/zdtm/static/cgroup02.desc000066400000000000000000000003071317335042600200550ustar00rootroot00000000000000{ 'dopts': '--manage-cgroups --cgroup-root name=zdtmtst:/prefix', 'flags': 'suid', 'flavor': 'h', 'ropts': '--manage-cgroups --cgroup-root /newroot --cgroup-root name=zdtmtst:/prefix'} criu-3.6/test/zdtm/static/cgroup02.hook000077500000000000000000000010651317335042600201040ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 set -e rmroots() { echo "Cleaning $tname ($1)" mount -t cgroup none $tname -o "$1" for d in "$tname/prefix" "$tname/newroot"; do test -d "$d" || continue # sort by line length for i in `find $d -type d | awk '{print length, $0}' | sort -rn | cut -d " " -f2-`; do echo $i rmdir $i done done echo "Left there is:" ls "$tname" umount "$tname" } tname=$(mktemp -d cgclean.XXXXXX) for ctl in $(cat /proc/self/cgroup | cut -d: -f2); do rmroots "$ctl" done rmdir $tname criu-3.6/test/zdtm/static/cgroup03.c000066400000000000000000000062751317335042600173740ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that global cgroup settings (+perms) are restored"; const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; int mount_and_add(const char *controller, const char *path) { 
char aux[1024], paux[1024], subdir[1024]; int cgfd, l; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { pr_perror("Can't make dir"); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { pr_perror("Can't make dir"); return -1; } sprintf(aux, "none,name=%s", controller); if (mount("none", subdir, "cgroup", 0, aux)) { pr_perror("Can't mount cgroups"); goto err_rd; } sprintf(paux, "%s/%s", subdir, path); mkdir(paux, 0600); l = sprintf(aux, "%d", getpid()); sprintf(paux, "%s/%s/tasks", subdir, path); cgfd = open(paux, O_WRONLY); if (cgfd < 0) { pr_perror("Can't open tasks"); goto err_rs; } l = write(cgfd, aux, l); close(cgfd); if (l < 0) { pr_perror("Can't move self to subcg"); goto err_rs; } return 0; err_rs: umount(dirname); err_rd: rmdir(dirname); return -1; } int chownmod(char *path, int flags) { int fd, ret = -1; fd = open(path, flags); if (fd < 0) { pr_perror("can't open %s", path); return -1; } if (fchown(fd, 1000, 1000) < 0) { pr_perror("can't chown %s", path); goto out; } if (fchmod(fd, 0777) < 0) { pr_perror("can't chmod %s", path); goto out; } ret = 0; out: close(fd); return ret; } int checkperms(char *path) { struct stat sb; if (stat(path, &sb) < 0) { pr_perror("can't stat %s", path); return -1; } if ((sb.st_mode & 0777) != 0777) { fail("mode for %s doesn't match (%o)\n", path, sb.st_mode); return -1; } if (sb.st_uid != 1000) { fail("uid for %s doesn't match (%d)\n", path, sb.st_uid); return -1; } if (sb.st_gid != 1000) { fail("gid for %s doesn't match (%d)\n", path, sb.st_gid); return -1; } return 0; } int main(int argc, char **argv) { int ret = -1; char path[PATH_MAX]; test_init(argc, argv); if (mount_and_add(cgname, "test") < 0) return -1; sprintf(path, "%s/%s/test", dirname, cgname); if (chownmod(path, O_DIRECTORY) < 0) goto out_umount; sprintf(path, "%s/%s/test/notify_on_release", dirname, cgname); if (chownmod(path, O_RDWR) < 0) goto out_umount; sprintf(path, "%s/%s/test/cgroup.procs", dirname, cgname); if 
(chownmod(path, O_RDWR) < 0) goto out_umount; test_daemon(); test_waitsig(); sprintf(path, "%s/%s/test", dirname, cgname); if (checkperms(path) < 0) goto out_umount; sprintf(path, "%s/%s/test/notify_on_release", dirname, cgname); if (checkperms(path) < 0) goto out_umount; sprintf(path, "%s/%s/test/cgroup.procs", dirname, cgname); if (checkperms(path) < 0) goto out_umount; pass(); ret = 0; out_umount: sprintf(path, "%s/%s/test", dirname, cgname); rmdir(path); sprintf(path, "%s/%s", dirname, cgname); umount(path); rmdir(path); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/cgroup03.desc000066400000000000000000000001021317335042600200470ustar00rootroot00000000000000{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} criu-3.6/test/zdtm/static/cgroup03.hook000077500000000000000000000003701317335042600201030ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 tname=$(mktemp -d cgclean.XXXXXX) mount -t cgroup none $tname -o "none,name=zdtmtst" echo "Cleaning $tname" set +e rmdir "$tname/test" set -e umount "$tname" rmdir "$tname" criu-3.6/test/zdtm/static/cgroup04.c000066400000000000000000000074241317335042600173720ustar00rootroot00000000000000 #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) const char *test_doc = "Check that some cgroups properties in kernel controllers are preserved"; const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; int write_value(const char *path, const char *value) { int fd, l; fd = open(path, O_WRONLY); if (fd < 0) { pr_perror("open %s", path); return -1; } l = write(fd, value, strlen(value)); close(fd); if (l < 0) { pr_perror("failed to write %s to %s", value, path); return -1; } return 0; } int mount_and_add(const char *controller, const char *path, const char *prop, 
const char *value) { char aux[1024], paux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { pr_perror("Can't make dir"); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { pr_perror("Can't make dir"); return -1; } if (mount("none", subdir, "cgroup", 0, controller)) { pr_perror("Can't mount cgroups"); goto err_rd; } sprintf(paux, "%s/%s", subdir, path); mkdir(paux, 0600); sprintf(paux, "%s/%s/%s", subdir, path, prop); if (write_value(paux, value) < 0) goto err_rs; sprintf(aux, "%d", getpid()); sprintf(paux, "%s/%s/tasks", subdir, path); if (write_value(paux, aux) < 0) goto err_rs; sprintf(paux, "%s/%s/special_prop_check", subdir, path); mkdir(paux, 0600); return 0; err_rs: umount(dirname); err_rd: rmdir(dirname); return -1; } bool checkval(char *path, char *val) { char buf[1024]; int fd, n; fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("open %s", path); return false; } n = read(fd, buf, sizeof(buf) - 1); close(fd); if (n < 0) { pr_perror("read"); return false; } buf[n] = 0; if (strcmp(val, buf)) { pr_err("got %s expected %s\n", buf, val); return false; } return true; } int main(int argc, char **argv) { int ret = -1, i; char buf[1024], path[PATH_MAX]; struct stat sb; char *deny[] = { "c *:* m", "b *:* m", "c 1:3 rwm", "c 1:5 rwm", "c 1:7 rwm", "c 5:0 rwm", "c 5:2 rwm", "c 1:8 rwm", "c 1:9 rwm", "c 136:* rwm", "c 10:229 rwm", }; test_init(argc, argv); if (mount_and_add("devices", cgname, "devices.deny", "a") < 0) goto out; /* need to allow /dev/null for restore */ sprintf(path, "%s/devices/%s/devices.allow", dirname, cgname); for (i = 0; i < ARRAY_SIZE(deny); i++) { if (write_value(path, deny[i]) < 0) goto out; } if (mount_and_add("memory", cgname, "memory.limit_in_bytes", "268435456") < 0) goto out; test_daemon(); test_waitsig(); buf[0] = 0; for (i = 0; i < ARRAY_SIZE(deny); i++) { strcat(buf, deny[i]); strcat(buf, "\n"); } sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); if 
(!checkval(path, buf)) { fail(); goto out; } sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); if (!checkval(path, "268435456\n")) { fail(); goto out; } sprintf(path, "%s/devices/%s/special_prop_check", dirname, cgname); if (stat(path, &sb) < 0) { fail("special_prop_check doesn't exist?"); goto out; } if (!S_ISDIR(sb.st_mode)) { fail("special_prop_check not a directory?"); goto out; } pass(); ret = 0; out: sprintf(path, "%s/devices/%s/special_prop_check", dirname, cgname); rmdir(path); sprintf(path, "%s/devices/%s", dirname, cgname); rmdir(path); sprintf(path, "%s/devices", dirname); umount(path); sprintf(path, "%s/memory/%s", dirname, cgname); rmdir(path); sprintf(path, "%s/memory", dirname); umount(path); return ret; } criu-3.6/test/zdtm/static/cgroup04.desc000066400000000000000000000001071317335042600200550ustar00rootroot00000000000000{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} criu-3.6/test/zdtm/static/cgroup04.hook000077500000000000000000000004331317335042600201040ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 tname=$(mktemp -d cgclean.XXXXXX) mount -t cgroup none $tname -o "devices" echo "Cleaning $tname" set +e rmdir "$tname/zdtmtst/special_prop_check" rmdir "$tname/zdtmtst" set -e umount "$tname" rmdir "$tname" criu-3.6/test/zdtm/static/cgroup_stray.c000066400000000000000000000102631317335042600204430ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that stray cgroups are c/r'd correctly"; const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; static int mount_ctrl(const char *controller) { char aux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { pr_perror("Can't make dir"); return -1; } 
sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { pr_perror("Can't make dir"); return -1; } sprintf(aux, "none,name=%s", controller); if (mount("none", subdir, "cgroup", 0, aux)) { pr_perror("Can't mount cgroups"); goto err_rd; } return 0; err_rd: rmdir(dirname); return -1; } static int add_to_cg(const char *controller, const char *path) { char aux[1024], paux[1024], subdir[1024]; int cgfd, l; sprintf(subdir, "%s/%s", dirname, controller); sprintf(paux, "%s/%s", subdir, path); mkdir(paux, 0600); l = sprintf(aux, "%d", getpid()); sprintf(paux, "%s/%s/tasks", subdir, path); cgfd = open(paux, O_WRONLY); if (cgfd < 0) { pr_perror("Can't open tasks %s", paux); return -1; } l = write(cgfd, aux, l); close(cgfd); if (l < 0) { pr_perror("Can't move self to subcg %s", path); return -1; } return 0; } static bool pid_in_cgroup(pid_t pid, const char *controller, const char *path) { char buf[2048]; FILE *f; bool ret = false; sprintf(buf, "/proc/%d/cgroup", pid); f = fopen(buf, "r"); if (!f) { pr_perror("fopen"); return false; } while (NULL != fgets(buf, sizeof(buf), f)) { char *pos, *pid_controller, *pid_path; /* chop off trailing \n */ buf[strlen(buf)-1] = '\0'; /* skip heirarchy no. 
*/ pos = strstr(buf, ":"); if (!pos) { pr_err("invalid /proc/pid/cgroups file"); goto out; } pos++; pid_controller = pos; pos = strstr(pos, ":"); if (!pos) { pr_err("invalid /proc/pid/cgroups file"); goto out; } *pos = '\0'; pos++; pid_path = pos; test_msg("comparing %s and %s\n", controller, pid_controller); if (strcmp(controller, pid_controller)) continue; if (strcmp(path, pid_path)) pr_err("task not in right cg for controller %s expected %s, got %s\n", controller, path, pid_path); else ret = true; goto out; } out: fclose(f); return ret; } int main(int argc, char **argv) { int ret = -1, sk_pair[2], sk, status; char path[PATH_MAX], c; pid_t pid = 0; test_init(argc, argv); if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } if (mount_ctrl(cgname) < 0) return -1; pid = fork(); if (pid < 0) { pr_perror("fork"); goto out_umount; } if (pid == 0) { close(sk_pair[0]); sk = sk_pair[1]; if (add_to_cg(cgname, "foo")) exit(1); if (write(sk, &c, 1) != 1) { pr_perror("write"); exit(1); } if (read(sk, &c, 1) != 1) { pr_perror("read %d", ret); exit(1); } sprintf(path, "name=%s", cgname); if (!pid_in_cgroup(getpid(), path, "/foo")) exit(1); exit(0); } close(sk_pair[1]); sk = sk_pair[0]; if (add_to_cg(cgname, "bar")) goto out_kill; if ((ret = read(sk, &c, 1)) != 1) { pr_perror("read %d", ret); goto out_kill; } test_daemon(); test_waitsig(); if (write(sk, &c, 1) != 1) { pr_perror("write"); goto out_kill; } sprintf(path, "name=%s", cgname); if (!pid_in_cgroup(getpid(), path, "/bar")) { fail("parent not in cgroup /bar"); goto out_kill; } if (pid != waitpid(pid, &status, 0)) { pr_perror("waitpid"); goto out_umount; } if (!WIFEXITED(status) || WEXITSTATUS(status)) { fail("exit status %d\n", status); goto out_umount; } pass(); ret = 0; out_kill: if (pid > 0) kill(pid, SIGKILL); out_umount: sprintf(path, "%s/%s/foo", dirname, cgname); rmdir(path); sprintf(path, "%s/%s/test", dirname, cgname); rmdir(path); sprintf(path, "%s/%s", dirname, cgname); 
umount(path); rmdir(path); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/cgroup_stray.desc000066400000000000000000000001461317335042600211360ustar00rootroot00000000000000{ 'feature': 'cgroupns', 'flags': 'suid', 'flavor': 'h ns', 'opts': '--manage-cgroups'} criu-3.6/test/zdtm/static/cgroupns.c000066400000000000000000000076211317335042600175660ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWCGROUP #define CLONE_NEWCGROUP 0x02000000 #endif const char *test_doc = "Check that cgroup NS is correctly handled."; const char *test_author = "Tycho Andersen "; /* we need dirname before test_init() here */ char *dirname = "cgroupns.test"; static const char *cgname = "zdtmtst"; int mount_and_add(const char *controller, const char *path) { char aux[1024], paux[1024], subdir[1024]; int cgfd, l; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { pr_perror("Can't make dir"); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { pr_perror("Can't make dir"); return -1; } sprintf(aux, "none,name=%s", controller); if (mount("none", subdir, "cgroup", 0, aux)) { pr_perror("Can't mount cgroups"); goto err_rd; } sprintf(paux, "%s/%s", subdir, path); mkdir(paux, 0600); l = sprintf(aux, "%d", getpid()); sprintf(paux, "%s/%s/tasks", subdir, path); cgfd = open(paux, O_WRONLY); if (cgfd < 0) { pr_perror("Can't open tasks"); goto err_rs; } l = write(cgfd, aux, l); close(cgfd); if (l < 0) { pr_perror("Can't move self to subcg"); goto err_rs; } return 0; err_rs: umount(dirname); err_rd: rmdir(dirname); return -1; } static bool pid_in_cgroup(pid_t pid, const char *controller, const char *path) { char buf[2048]; FILE *f; bool ret = false; sprintf(buf, "/proc/%d/cgroup", pid); f = fopen(buf, "r"); if (!f) { pr_perror("fopen"); return false; } while (NULL != fgets(buf, sizeof(buf), f)) { char *pos, *pid_controller, 
*pid_path; /* chop off trailing \n */ buf[strlen(buf)-1] = '\0'; /* skip heirarchy no. */ pos = strstr(buf, ":"); if (!pos) { pr_err("invalid /proc/pid/cgroups file"); goto out; } pos++; pid_controller = pos; pos = strstr(pos, ":"); if (!pos) { pr_err("invalid /proc/pid/cgroups file"); goto out; } *pos = '\0'; pos++; pid_path = pos; if (strcmp(controller, pid_controller)) continue; if (strcmp(path, pid_path)) pr_err("task not in right cg for controller %s expected %s, got %s\n", controller, path, pid_path); else ret = true; goto out; } out: fclose(f); return ret; } int main(int argc, char **argv) { int ret = -1, fd, status; char path[PATH_MAX]; pid_t pid; if (!getenv("ZDTM_NEWNS")) { if (mount_and_add(cgname, "test") < 0) return -1; if (unshare(CLONE_NEWCGROUP) < 0) { pr_perror("unshare"); goto out; } } test_init(argc, argv); test_daemon(); test_waitsig(); sprintf(path, "name=%s", cgname); /* first check that the task is in zdtmtst:/ */ if (!pid_in_cgroup(getpid(), path, "/")) { fail("pid not in cgroup /"); goto out; } /* now check that the task is in the right place in a ns by setnsing to * someone else's ns and looking there. 
*/ pid = fork(); if (pid < 0) { pr_perror("fork"); goto out; } if (pid == 0) { sprintf(path, "/proc/%d/ns/cgroup", 1); fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("open"); exit(1); } ret = setns(fd, CLONE_NEWCGROUP); close(fd); if (ret < 0) { pr_perror("setns"); exit(1); } sprintf(path, "name=%s", cgname); if (!pid_in_cgroup(getppid(), path, "/test")) { fail("pid not in cgroup %s", path); exit(1); } exit(0); } if (pid != waitpid(pid, &status, 0)) { pr_err("wrong pid"); goto out; } if (!WIFEXITED(status) || WEXITSTATUS(status)) { pr_err("got bad exit status %d\n", status); goto out; } ret = 0; pass(); out: sprintf(path, "%s/%s/test", dirname, cgname); rmdir(path); sprintf(path, "%s/%s", dirname, cgname); umount(path); rmdir(path); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/cgroupns.desc000066400000000000000000000001431317335042600202520ustar00rootroot00000000000000{ 'feature': 'cgroupns', 'flags': 'suid', 'flavor': 'h', 'opts': '--manage-cgroups'} criu-3.6/test/zdtm/static/child_opened_proc.c000066400000000000000000000020601317335042600213560ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that tree prior to files opening"; const char *test_author = "Stanislav Kinsbursky #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that out-of-root file survives"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); char *filename; TEST_OPTION(filename, string, "file name", 1); #define MSG "out-file-contents" static int make_file(char *name) { int fd; fd = open(name, O_RDWR | O_CREAT, 0666); if (fd < 0) return -1; if (write(fd, MSG, sizeof(MSG)) != sizeof(MSG)) return -1; return fd; } static int check_file(int fd) { char r[sizeof(MSG)]; lseek(fd, 0, SEEK_SET); if (read(fd, r, sizeof(r)) != sizeof(MSG)) return -1; if (memcmp(r, MSG, sizeof(MSG))) 
return -1; return 0; } #define SUCCESS 0 #define ERR_PIPES (char)0x7f /* bitmap of errors */ #define ERR_IN_FILE 1 #define ERR_ROOT 2 #define ERR_DIR 4 #define ERR_CHDIR 8 #define ERR_ROOT2 4 int main(int argc, char **argv) { int pid, pipe_prep[2], pipe_goon[2], pipe_res[2]; char res; int fd, fd2; test_init(argc, argv); pipe(pipe_prep); pipe(pipe_goon); pipe(pipe_res); pid = test_fork(); if (pid != 0) { close(pipe_prep[1]); close(pipe_goon[0]); close(pipe_res[1]); res = ERR_PIPES; read(pipe_prep[0], &res, 1); read(pipe_prep[0], &res, 1); /* wait when a descriptor will be closed */ if (res != SUCCESS) { if (res == ERR_PIPES) pr_perror("broken pipes"); else { if (res & ERR_IN_FILE) pr_perror("inside-root file fail"); if (res & ERR_ROOT) pr_perror("chroot fail"); if (res & ERR_DIR) pr_perror("mkdir fail"); if (res & ERR_CHDIR) pr_perror("chrid fail"); } return 0; } test_daemon(); test_waitsig(); close(pipe_goon[1]); res = ERR_PIPES; read(pipe_res[0], &res, 1); if (res == SUCCESS) pass(); else if (res == ERR_PIPES) fail("broken pipes"); else { if (res & ERR_IN_FILE) fail("opened file broken"); if (res & ERR_ROOT) fail("open in chroot succeeded"); if (res & ERR_ROOT2) fail("open in chroot might work"); } wait(NULL); return 0; } close(pipe_prep[0]); close(pipe_goon[1]); close(pipe_res[0]); fd = make_file(filename); if (fd < 0) { res = ERR_IN_FILE; goto err; } if (mkdir(dirname, 0700)) { res = ERR_DIR; goto err; } if (chroot(dirname)) { res = ERR_ROOT; goto err; } if (chdir("/")) { res = ERR_CHDIR; goto err; } res = SUCCESS; write(pipe_prep[1], &res, 1); close(pipe_prep[1]); read(pipe_goon[0], &res, 1); res = SUCCESS; if (check_file(fd)) res |= ERR_IN_FILE; fd2 = open(filename, O_RDWR); if (fd2 >= 0) { res |= ERR_ROOT; close(fd2); } else if (errno != ENOENT) res |= ERR_ROOT2; write(pipe_res[1], &res, 1); exit(0); err: write(pipe_prep[1], &res, 1); exit(0); } 
criu-3.6/test/zdtm/static/chroot-file.desc000066400000000000000000000000221317335042600206210ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/chroot.c000066400000000000000000000055451317335042600172270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that root didn't change"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); char *filename; TEST_OPTION(filename, string, "file name", 1); static char *filepath; #define MSG "chroot-file-contents" static int make_file(char *name) { int fd; fd = open(name, O_RDWR | O_CREAT, 0666); if (fd < 0) return -1; if (write(fd, MSG, sizeof(MSG)) != sizeof(MSG)) return -1; return fd; } static int check_file(int fd) { char r[sizeof(MSG)]; lseek(fd, 0, SEEK_SET); if (read(fd, r, sizeof(r)) != sizeof(MSG)) return -1; if (memcmp(r, MSG, sizeof(MSG))) return -1; return 0; } #define SUCCESS 0 #define ERR_PIPES (char)0x7f /* bitmap of errors */ #define ERR_IN_FILE 1 #define ERR_ROOT 2 #define ERR_DIR 4 #define ERR_OPEN 2 #define ERR_FILE2 4 int main(int argc, char **argv) { int pid, pipe_prep[2], pipe_goon[2], pipe_res[2]; char res; int fd, fd2; test_init(argc, argv); filepath = malloc(strlen(filename) + 2); sprintf(filepath, "/%s", filename); pipe(pipe_prep); pipe(pipe_goon); pipe(pipe_res); pid = test_fork(); if (pid != 0) { close(pipe_prep[1]); close(pipe_goon[0]); close(pipe_res[1]); res = ERR_PIPES; read(pipe_prep[0], &res, 1); read(pipe_prep[0], &res, 1); /* wait when pipe_prep[] will be closed */ if (res != SUCCESS) { if (res == ERR_PIPES) pr_perror("broken pipes"); else { if (res & ERR_IN_FILE) pr_perror("inside-root file fail"); if (res & ERR_ROOT) pr_perror("chroot fail"); if (res & ERR_DIR) pr_perror("mkdir fail"); } return 0; } test_daemon(); test_waitsig(); close(pipe_goon[1]); res = ERR_PIPES; read(pipe_res[0], &res, 1); if (res == SUCCESS) pass(); 
else if (res == ERR_PIPES) fail("broken pipes"); else { if (res & ERR_IN_FILE) fail("opened file broken"); if (res & ERR_OPEN) fail("open in chroot fail"); if (res & ERR_FILE2) fail("wrong file opened"); } wait(NULL); return 0; } close(pipe_prep[0]); close(pipe_goon[1]); close(pipe_res[0]); if (mkdir(dirname, 0700)) { res = ERR_DIR; goto err_nodir; } if (chroot(dirname)) { res = ERR_ROOT; goto err_noroot; } fd = make_file(filepath); if (fd < 0) { res = ERR_IN_FILE; goto err_nofile2; } res = SUCCESS; write(pipe_prep[1], &res, 1); close(pipe_prep[1]); read(pipe_goon[0], &res, 1); res = SUCCESS; if (check_file(fd)) res |= ERR_IN_FILE; fd2 = open(filepath, O_RDWR); if (fd2 < 0) res |= ERR_OPEN; else if (check_file(fd2)) res |= ERR_FILE2; write(pipe_res[1], &res, 1); exit(0); err_nofile2: err_noroot: err_nodir: write(pipe_prep[1], &res, 1); exit(0); } criu-3.6/test/zdtm/static/chroot.desc000066400000000000000000000000221317335042600177040ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/clean_mntns.c000066400000000000000000000007021317335042600202200ustar00rootroot00000000000000#include #include #include #include "zdtmtst.h" const char *test_doc = "Check that clean mntns works"; const char *test_author = "Pavel Emelianov "; int main(int argc, char **argv) { test_init(argc, argv); if (umount("/proc") < 0) pr_perror("Can't umount proc"); if (umount("/dev/pts") < 0) pr_perror("Can't umount devpts"); test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/clean_mntns.desc000066400000000000000000000000421317335042600207110ustar00rootroot00000000000000{'flavor': 'ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/clone_fs.c000066400000000000000000000035741317335042600175210ustar00rootroot00000000000000#include #include #include #include "zdtmtst.h" const char *test_doc = "Check that shared FS is migrated properly"; const char *test_author = "Stanislav Kinsburskiy "; enum kcmp_type { KCMP_FILE, KCMP_VM, KCMP_FILES, KCMP_FS, 
KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, KCMP_TYPES, }; static int kcmp(int type, pid_t pid1, pid_t pid2, unsigned long idx1, unsigned long idx2) { int ret; ret = syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2); switch (ret) { case 0: break; case 1: case 2: test_msg("FS for pids %d and %d doesn't match: %d\n", pid1, pid2, ret); break; case -1: pr_err("kcmp (type: %d, pid1: %d, pid2: %d, " "idx1: %ld, idx2: %ld) failed: %d\n", type, pid1, pid2, idx1, idx2, errno); break; default: pr_err("kcmp (type: %d, pid1: %d, pid2: %d, " "idx1: %ld, idx2: %ld) returned %d\n", type, pid1, pid2, idx1, idx2, ret); break; } return ret; } #define gettid(code) \ syscall(__NR_gettid) static pthread_mutex_t init_lock; static pthread_mutex_t exit_lock; static void *thread_func(void *tid2) { *(int *)tid2 = gettid(); pthread_mutex_unlock(&init_lock); pthread_mutex_lock(&exit_lock); return NULL; } int main(int argc, char **argv) { pid_t tid; int ret; pthread_t th; test_init(argc, argv); pthread_mutex_init(&init_lock, NULL); pthread_mutex_lock(&init_lock); pthread_mutex_init(&exit_lock, NULL); pthread_mutex_lock(&exit_lock); if (pthread_create(&th, NULL, thread_func, &tid)) { fail("Can't pthread_create"); return 1; } pthread_mutex_lock(&init_lock); ret = kcmp(KCMP_FS, gettid(), tid, 0, 0); if (ret) exit(1); test_daemon(); test_waitsig(); ret = kcmp(KCMP_FS, gettid(), tid, 0, 0); if (ret) { fail(); exit(1); } pthread_mutex_unlock(&exit_lock); pthread_join(th, NULL); pass(); return 0; } criu-3.6/test/zdtm/static/cmdlinenv00.c000066400000000000000000000052751317335042600200500ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test that env/cmdline/auxv restored well\n"; const char *test_author = "Cyrill Gorcunov 0) { if (*new != *old) return -1; new++; old++; size -= sizeof(*new); } return 0; } int main(int argc, char *argv[]) { char cmdline_orig[4096]; char cmdline[4096]; char env_orig[4096]; char 
env[4096]; char auxv_orig[1024]; char auxv[1024]; memset(cmdline_orig, 0, sizeof(cmdline_orig)); memset(cmdline, 0, sizeof(cmdline)); memset(env_orig, 0, sizeof(env_orig)); memset(env, 0, sizeof(env)); memset(auxv_orig, 0, sizeof(auxv_orig)); memset(auxv, 0, sizeof(auxv)); test_init(argc, argv); read_from_proc("/proc/self/cmdline", cmdline_orig, sizeof(cmdline_orig)); read_from_proc("/proc/self/environ", env_orig, sizeof(env_orig)); read_from_proc("/proc/self/auxv", auxv_orig, sizeof(auxv_orig)); test_msg("old cmdline: %s\n", cmdline_orig); test_msg("old environ: %s\n", env_orig); test_daemon(); test_waitsig(); read_from_proc("/proc/self/cmdline", cmdline, sizeof(cmdline)); read_from_proc("/proc/self/environ", env, sizeof(env)); read_from_proc("/proc/self/auxv", auxv, sizeof(auxv)); test_msg("new cmdline: %s\n", cmdline); test_msg("new environ: %s\n", env); if (strncmp(cmdline_orig, cmdline, sizeof(cmdline_orig))) { fail("cmdline corrupted on restore"); exit(1); } if (strncmp(env_orig, env, sizeof(env_orig))) { fail("envirion corrupted on restore"); exit(1); } if (cmp_auxv(auxv_orig, auxv, sizeof(auxv_orig))) { fail("auxv corrupted on restore"); exit(1); } pass(); return 0; } criu-3.6/test/zdtm/static/cmdlinenv00.desc000066400000000000000000000000221317335042600205250ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/conntracks000077500000000000000000000017141317335042600176520ustar00rootroot00000000000000#!/bin/bash export PATH=$PATH:${0%/*}/../../lib die() { echo "$0:${BASH_LINENO[0]}: $*" >&2 exit 1 } fail() { echo "FAIL: $0:${BASH_LINENO[0]}: $*" > "$outfile" exit 1 } do_or_fail() { local failmsg="$1" output shift output="$(eval $@ 2>&1)" || fail "$failmsg: $output" } do_start() { [ -f "$statefile" ] && die "state file $statefile aleady exists" do_or_fail "can't install a state match" \ iptables -A INPUT \ -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT do_or_fail "can't list the loaded iptables" \ iptables -L \> "$statefile" } 
do_stop() { do_or_fail "can't compare the iptables" \ iptables -L \| diff -u "$statefile" - rm -f "$statefile" echo "PASS" > $outfile } tmpargs="$(../lib/parseargs.sh --name=$0 \ --flags-req=statefile,outfile \ --flags-opt="start,stop" -- "$@")" || die "can't parse command line" eval "$tmpargs" [ -f "$outfile" ] && die "out file $outfile aleady exists" # expect "start" or "stop" do_$1 criu-3.6/test/zdtm/static/conntracks.desc000066400000000000000000000000241317335042600205550ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/console.c000066400000000000000000000021261317335042600173630ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check c/r for console device"; const char *test_author = "Cyrill Gorcunov "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { struct stat st1, st2; int fd; test_init(argc, argv); if (mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, makedev(5,1))) { pr_perror("Can't create console %s", filename); return 1; } fd = open(filename, O_RDONLY); if (fd < 0) { pr_perror("Open console %s failed", filename); return 1; } if (fstat(fd, &st1)) { pr_perror("Can't stat %s console", filename); return 1; } test_daemon(); test_waitsig(); if (fstat(fd, &st2)) { pr_perror("Can't stat %s console", filename); return 1; } if (st1.st_rdev != st2.st_rdev) { fail("Console rdev mismatch %x != %x on %s", (int)st1.st_rdev, (int)st2.st_rdev, filename); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/console.desc000066400000000000000000000000441317335042600200540ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/cow00.c000066400000000000000000000041231317335042600166500ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that cow memory are restored"; 
const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that cow memory are restored"; const char *test_author = "Andrey Vagin 5) break; p = (void **)(addr + i * PAGE_SIZE); test_msg("Read *%p = %p\n", p, p[0]); if (write(fd, &p, sizeof(p)) != sizeof(p)) { pr_perror("write"); return -1; } if (read(fd, &p, sizeof(p)) != sizeof(p)) { pr_perror("read"); return -1; } test_msg("Child %p\n", p); } close(fd_child); close(fd_parent); if (map_child_ret) *map_child_ret = map_child; if (map_parent_ret) *map_parent_ret = map_parent; // Return 0 for success, 1 if the pages differ. return map_child != map_parent; } static int child_prep(struct test_cases *test_cases, int fd) { int i; uint8_t *addr = test_cases->addr; for (i = 0; i < TEST_CASES; i++) { struct test_case *tc = test_cases->tc + i; if (tc->a_f_write_child) { tc->crc_child = ~1; datagen2(addr + i * PAGE_SIZE, PAGE_SIZE, &tc->crc_child); } if (tc->a_f_read_child) { uint32_t crc = ~1; datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); } } return 0; } static int child_check(struct test_cases *test_cases, int fd) { int i, ret = 0; uint8_t *addr = test_cases->addr; for (i = 0; i < TEST_CASES; i++) { uint32_t crc = ~1; struct test_case *tc = test_cases->tc + i; datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); if (crc != tc->crc_child) { errno = 0; fail("%s[%#x]: %p child data mismatch (expected [%04x] got [%04x])", test_cases->tname, i, addr + i * PAGE_SIZE, tc->crc_child, crc); ret |= 1; } } return ret; } static int parent_before_fork(struct test_cases *test_cases, int fd) { uint8_t *addr; int i; if (test_cases->init(test_cases)) return -1; addr = test_cases->addr; for (i = 0; i < TEST_CASES; i++) { struct test_case *tc = test_cases->tc + i; tc->num = i; if (tc->b_f_write) { tc->crc_parent = ~1; datagen2(addr + i * PAGE_SIZE, PAGE_SIZE, &tc->crc_parent); if (test_cases != &sep_tcs) tc->crc_child 
= tc->crc_parent; } if (tc->b_f_read) { uint32_t crc = ~1; datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); } } return 0; } static int parent_post_fork(struct test_cases *test_cases, int fd) { uint8_t *addr = test_cases->addr; int i; for (i = 0; i < TEST_CASES; i++) { struct test_case *tc = test_cases->tc + i; if (tc->a_f_write_parent) { tc->crc_parent = ~1; datagen2(addr + i * PAGE_SIZE, PAGE_SIZE, &tc->crc_parent); } if (tc->a_f_read_parent) { uint32_t crc = ~1; datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); } } return 0; } static int parent_check(struct test_cases *test_cases, int fd) { uint8_t *addr = test_cases->addr; int i, ret = 0; for (i = 0; i < TEST_CASES; i++) { struct test_case *tc = test_cases->tc + i; uint32_t crc = ~1; datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); if (crc != tc->crc_parent) { errno = 0; fail("%s[%#x]: %p parent data mismatch (expected [%04x] got [%04x])", test_cases->tname, i, addr + i * PAGE_SIZE, tc->crc_parent, crc); ret |= 1; } if (test_cases == &sep_tcs) continue; if (!tc->a_f_write_child && !tc->a_f_write_parent && tc->b_f_write) { uint64_t map_child, map_parent; int is_cow_ret; is_cow_ret = is_cow(addr + i * PAGE_SIZE, child_pid, getpid(), &map_child, &map_parent, fd); ret |= is_cow_ret; if (is_cow_ret == 1) { errno = 0; fail("%s[%#x]: %p is not COW-ed (pagemap of " "child=[%"PRIx64"], parent=[%"PRIx64"])", test_cases->tname, i, addr + i * PAGE_SIZE, map_child, map_parent); } } } return ret; } static int __init_cow(struct test_cases *tcs, int flags) { int i; void *addr; addr = mmap(NULL, PAGE_SIZE * (TEST_CASES + 2), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { pr_perror("Can't allocate memory"); return -1; } /* * Guard pages are used for preventing merging with other vma-s. * In parent cow-ed and coinciding regions can be merged, but * in child they cannot be, so COW will not be restored. 
FIXME */ mmap(addr, PAGE_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); addr += PAGE_SIZE; tcs->addr = addr; mmap(addr + PAGE_SIZE * TEST_CASES, PAGE_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | flags, -1, 0); test_msg("addr[%s]=%p\n", tcs->tname, tcs->addr); for (i = 0; i < TEST_CASES; i++) { struct test_case *tc = tcs->tc + i; tc->crc_parent = zero_crc; tc->crc_child = zero_crc; } return 0; } static int init_cow(struct test_cases *tcs) { return __init_cow(tcs, 0); } static int init_cow_gd(struct test_cases *tcs) { return __init_cow(tcs, MAP_GROWSDOWN); } static int init_sep(struct test_cases *tcs) { int i; tcs->addr = mmap(NULL, PAGE_SIZE * TEST_CASES, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (tcs->addr == MAP_FAILED) { pr_perror("Can't allocate memory"); return -1; } test_msg("addr[%s]=%p\n", tcs->tname, tcs->addr); for (i = 0; i < TEST_CASES; i++) { struct test_case *tc = tcs->tc + i; tc->crc_parent = zero_crc; tc->crc_child = zero_crc; } return 0; } static int init_file(struct test_cases *tcs) { int i, ret, fd; uint8_t buf[PAGE_SIZE]; uint32_t crc; fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); if (fd < 0) { pr_perror("Unable to create a test file"); return -1; } for (i = 0; i < TEST_CASES; i++) { struct test_case *tc = tcs->tc + i; crc = ~1; datagen2(buf, sizeof(buf), &crc); ret = write(fd, buf, sizeof(buf)); if (ret != sizeof(buf)) { pr_perror("Unable to write data in test file %s", filename); return -1; } tc->crc_parent = crc; tc->crc_child = crc; } tcs->addr = mmap(NULL, PAGE_SIZE * TEST_CASES, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0); if (tcs->addr == MAP_FAILED) { pr_perror("Can't allocate memory"); return -1; } test_msg("addr[%s]=%p\n", tcs->tname, tcs->addr); close(fd); return 0; } static int child(task_waiter_t *child_waiter, int fd) { int ret = 0; sep_tcs.addr = mmap(sep_tcs.addr, PAGE_SIZE * TEST_CASES, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 
0); if (sep_tcs.addr == MAP_FAILED) { pr_perror("Can't allocate memory"); return -1; } EXECUTE_ACTION(child_prep, fd); task_waiter_complete_current(child_waiter); while (1) { void **p; ret = read(fd, &p, sizeof(p)); if (ret == 0) break; if (ret != sizeof(p)) { pr_perror("read"); return -1; } test_msg("Read *%p = %p\n", p, p[0]); p = ((void **)p)[0]; if (write(fd, &p, sizeof(p)) != sizeof(p)) { pr_perror("write"); return -1; } ret = 0; } ret = EXECUTE_ACTION(child_check, fd); // Exit code of child process, so return 2 for a test error, 1 for a // test failure (child_check got mismatched checksums) and 0 for // success. return (ret < 0) ? 2 : (ret != 0); } int main(int argc, char ** argv) { uint8_t zero_page[PAGE_SIZE]; int status = -1, ret = 0; task_waiter_t child_waiter; int pfd[2], fd; test_init(argc, argv); task_waiter_init(&child_waiter); memset(zero_page, 0, sizeof(zero_page)); datasum(zero_page, sizeof(zero_page), &zero_crc); if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, pfd)) { pr_perror("pipe"); return 1; } if (EXECUTE_ACTION(parent_before_fork, -1)) return 2; child_pid = test_fork(); if (child_pid < 0) { pr_perror("Can't fork"); return 2; } if (child_pid == 0) { close(pfd[0]); return child(&child_waiter, pfd[1]); } close(pfd[1]); fd = pfd[0]; task_waiter_wait4(&child_waiter, child_pid); EXECUTE_ACTION(parent_post_fork, -1); test_daemon(); test_waitsig(); ret |= EXECUTE_ACTION(parent_check, fd); close(fd); wait(&status); unlink(filename); if (WIFEXITED(status) && WEXITSTATUS(status) != 2) ret |= WEXITSTATUS(status); else ret |= -1; if (ret == 0) pass(); // Exit code, so return 2 for a test error, 1 for a test failure and 0 // for success. return (ret < 0) ? 
2 : (ret != 0); } criu-3.6/test/zdtm/static/cow01.desc000066400000000000000000000000531317335042600173430ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid nolazy'} criu-3.6/test/zdtm/static/cr_veth.c000066400000000000000000000023321317335042600173520ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "check that veth C/R-s right"; const char *test_author = "Pavel Emelyanov "; #define IF_NAME "zdtmvthc0" static bool wait_for_veth(void) { int i; for (i = 0; i < 10; i++) { if (system("ip addr list dev " IF_NAME) == 0) return true; sleep(1); } return false; } int main(int argc, char **argv) { int ret = 1; test_init(argc, argv); if (!wait_for_veth()) { fail("failed to inject veth device\n"); return 1; } if (system("ip addr list dev " IF_NAME " | sed -e 's/@.*://' > cr_veth.dump.state")) { fail("can't save net config"); goto out; } test_daemon(); test_waitsig(); if (system("ip addr list dev " IF_NAME " | sed -e 's/@.*://' > cr_veth.rst.state")) { fail("can't get net config"); goto out; } if (system("diff cr_veth.rst.state cr_veth.dump.state")) { fail("Net config differs after restore"); goto out; } pass(); ret = 0; out: return ret; } criu-3.6/test/zdtm/static/cr_veth.checkskip000077500000000000000000000000601317335042600210730ustar00rootroot00000000000000#!/bin/bash unshare --net ip link add type veth criu-3.6/test/zdtm/static/cr_veth.desc000066400000000000000000000003011317335042600200400ustar00rootroot00000000000000{ 'deps': ['/bin/sh', '/bin/sed', '/bin/grep', '/sbin/ip|/bin/ip', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns', 'ropts': '--external veth[zdtmvthc0]:zdtmvthh0@zdtmbr0'} criu-3.6/test/zdtm/static/cr_veth.hook000077500000000000000000000014311317335042600200720ustar00rootroot00000000000000#!/bin/bash if [ "$1" == "--post-start" ]; then set -e PIDF="zdtm/static/cr_veth.pid.inprogress" while [ ! 
-f "$PIDF" ]; do sleep ".1" done TPID=$(cat $PIDF) ps xaf echo "-> $TPID" set -x ip l l ip link add zdtmvthc0 type veth peer name zdtmvthh0 ip link set zdtmvthc0 netns $TPID ip link del zdtmbr0 || true # Ignore the failure ip link add zdtmbr0 type bridge ip link set zdtmbr0 up ip link set zdtmvthh0 master zdtmbr0 elif [ "$1" == "--post-restore" ]; then ip link list zdtmvthh0 if ! ip link list zdtmvthh0 | fgrep -q 'master zdtmbr0'; then echo "Device missing or not in bridge" exit 1 fi echo "Device OK" elif [ "$1" == "--pre-restore" -o "$1" == "--clean" ]; then # Wait for the link to die ip l l while ip l l zdtmvthh0 ; do sleep ".5" done fi criu-3.6/test/zdtm/static/criu-rtc.c000066400000000000000000000044451317335042600174570ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "criu-plugin.h" #include "criu-log.h" #include "criu-rtc.pb-c.h" extern cr_plugin_dump_file_t cr_plugin_dump_file; extern cr_plugin_restore_file_t cr_plugin_restore_file; int cr_plugin_dump_file(int fd, int id) { CriuRtc e = CRIU_RTC__INIT; char img_path[PATH_MAX]; unsigned char buf[4096]; int img_fd, ret, len; unsigned long irqp; struct stat st, st_rtc; if (fstat(fd, &st) == -1) { pr_perror("fstat"); return -1; } ret = stat("/dev/rtc", &st_rtc); if (ret == -1) { pr_perror("fstat"); return -1; } if (major(st.st_rdev) != major(st_rtc.st_rdev) || minor(st.st_rdev) != 0) return -ENOTSUP; if (ioctl(fd, RTC_IRQP_READ, &irqp) == -1) { pr_perror("RTC_IRQP_READ"); return -1; } e.irqp = irqp; snprintf(img_path, sizeof(img_path), "rtc.%x", id); img_fd = openat(criu_get_image_dir(), img_path, O_WRONLY | O_CREAT); if (img_fd < 0) { pr_perror("Can't open %s", img_path); return -1; } len = criu_rtc__get_packed_size(&e); if (len > sizeof(buf)) return -1; criu_rtc__pack(&e, buf); ret = write(img_fd, buf, len); if (ret != len) { pr_perror("Unable to write in %s", img_path); close(img_fd); return -1; } close(img_fd); return 0; } int 
cr_plugin_restore_file(int id) { unsigned char buf[4096]; char img_path[PATH_MAX]; int img_fd, len, fd; CriuRtc *e; snprintf(img_path, sizeof(img_path), "rtc.%x", id); img_fd = openat(criu_get_image_dir(), img_path, O_RDONLY); if (img_fd < 0) { pr_perror("open(%s)", img_path); return -ENOTSUP; } len = read(img_fd, &buf, sizeof(buf)); if (len <= 0) { pr_perror("Unable to read from %s", img_path); close(img_fd); return -1; } close(img_fd); e = criu_rtc__unpack(NULL, len, buf); if (e == NULL) { pr_err("Unable to parse the RTC message %#x", id); return -1; } fd = open("/dev/rtc", O_RDWR); if (fd < 0) { pr_perror("open"); return -1; } if (ioctl(fd, RTC_IRQP_SET, e->irqp) == -1) { pr_perror("RTC_IRQP_SET"); close(fd); return -1; } criu_rtc__free_unpacked(e, NULL); if (ioctl(fd, RTC_PIE_ON, 0) == -1) { pr_perror("RTC_PIE_ON"); close(fd); return -1; } return fd; } criu-3.6/test/zdtm/static/criu-rtc.proto000066400000000000000000000001041317335042600203640ustar00rootroot00000000000000syntax = "proto2"; message criu_rtc { required uint64 IRQP = 1; } criu-3.6/test/zdtm/static/cwd00.c000066400000000000000000000024021317335042600166330ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that cwd didn't change"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { char cwd1[256], cwd2[256]; int fd; test_init(argc, argv); fd = open(".", O_DIRECTORY | O_RDONLY); if (fd == -1) { pr_perror("Unable to open the current dir"); exit(1); } if (mkdir(dirname, 0700)) { pr_perror("can't make directory %s", dirname); exit(1); } if (chdir(dirname)) { pr_perror("can't change directory to %s", dirname); goto cleanup; } if (!getcwd(cwd1, sizeof(cwd1))) { pr_perror("can't get cwd"); goto cleanup; } test_daemon(); test_waitsig(); if (!getcwd(cwd2, sizeof(cwd2))) { fail("can't get cwd: %m\n"); goto cleanup; } if 
(strcmp(cwd1, cwd2)) fail("%s != %s\n", cwd1, cwd2); else pass(); cleanup: /* return to the initial dir before writing out results */ if (fchdir(fd)) { pr_perror("can't restore cwd"); exit(1); } if (rmdir(dirname)) { pr_perror("can't remove directory %s", dirname); exit(1); } return 0; } criu-3.6/test/zdtm/static/cwd01.c000066400000000000000000000036061317335042600166430ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that removed cwd works"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { char cwd1[PATH_MAX], cwd2[PATH_MAX]; int pid, p[2], aux, aux2, fd; test_init(argc, argv); pipe(p); pid = fork(); if (pid == 0) { close(p[1]); read(p[0], &aux, sizeof(aux)); aux = rmdir(dirname); exit(aux ? 1 : 0); } fd = open(".", O_DIRECTORY | O_RDONLY); if (fd == -1) { pr_perror("Unable to open the current dir"); exit(1); } if (mkdir(dirname, 0700)) { pr_perror("can't make directory %s", dirname); exit(1); } if (chdir(dirname)) { pr_perror("can't change directory to %s", dirname); goto cleanup; } close(p[1]); close(p[0]); waitpid(pid, &aux, 0); if (!WIFEXITED(aux) || WEXITSTATUS(aux) != 0) { pr_perror("can't remove dir"); goto cleanup; } aux = readlink("/proc/self/cwd", cwd1, sizeof(cwd1)); if (aux < 0) { pr_perror("can't get cwd"); goto cleanup; } if (aux == sizeof(cwd1)) { pr_perror("A buffer is too small"); goto cleanup; } cwd1[aux] = '\0'; test_daemon(); test_waitsig(); aux2 = readlink("/proc/self/cwd", cwd2, sizeof(cwd2)); if (aux2 < 0) { fail("can't get cwd: %m\n"); goto cleanup; } if (aux2 == sizeof(cwd2)) { pr_perror("A buffer is too small"); goto cleanup; } cwd2[aux2] = '\0'; /* FIXME -- criu adds a suffix to removed cwd */ if (strncmp(cwd1, cwd2, aux)) fail("%s != %s\n", cwd1, cwd2); else pass(); cleanup: /* return to the initial dir before writing out results 
*/ if (fchdir(fd)) { pr_perror("can't restore cwd"); exit(1); } rmdir(dirname); return 0; } criu-3.6/test/zdtm/static/cwd02.c000066400000000000000000000032471317335042600166450ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that removed and opened cwd are kept"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { int cwd, fd, pid, p[2], aux; struct stat std, stf; test_init(argc, argv); pipe(p); pid = fork(); if (pid == 0) { close(p[1]); read(p[0], &aux, sizeof(aux)); aux = rmdir(dirname); exit(aux ? 1 : 0); } cwd = open(".", O_DIRECTORY | O_RDONLY); if (cwd == -1) { pr_perror("Unable to open the current dir"); exit(1); } if (mkdir(dirname, 0700)) { pr_perror("can't make directory %s", dirname); exit(1); } if ((fd = open(dirname, O_DIRECTORY)) < 0) { pr_perror("can't open dir %s", dirname); goto cleanup; } if (chdir(dirname)) { pr_perror("can't change directory to %s", dirname); goto cleanup; } close(p[1]); close(p[0]); waitpid(pid, &aux, 0); if (!WIFEXITED(aux) || WEXITSTATUS(aux) != 0) { pr_perror("can't remove dir"); goto cleanup; } test_daemon(); test_waitsig(); if (fstat(fd, &stf) < 0) { fail("dir fd closed\n"); goto cleanup; } if (stat("/proc/self/cwd", &std) < 0) { fail("cwd is not OK\n"); goto cleanup; } if (stf.st_ino != std.st_ino || stf.st_dev != std.st_dev) { fail("cwd and opened fd are not the same\n"); goto cleanup; } pass(); cleanup: /* return to the initial dir before writing out results */ if (fchdir(cwd)) { pr_perror("can't restore cwd"); exit(1); } rmdir(dirname); return 0; } criu-3.6/test/zdtm/static/del_standalone_un.c000066400000000000000000000043541317335042600214040ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that deleted unix sockets are 
restored correctly"; const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); static int fill_sock_name(struct sockaddr_un *name, const char *filename) { char *cwd; cwd = get_current_dir_name(); if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) return -1; name->sun_family = AF_LOCAL; sprintf(name->sun_path, "%s/%s", cwd, filename); return 0; } static int bind_and_listen(struct sockaddr_un *addr) { int sk; sk = socket(PF_UNIX, SOCK_STREAM, 0); if (sk < 0) { fail("socket"); return -1; } if (bind(sk, (struct sockaddr *) addr, sizeof(*addr))) { fail("bind %s", addr->sun_path); close(sk); return -1; } if (listen(sk, 1)) { fail("listen"); close(sk); return -1; } return sk; } int main(int argc, char **argv) { struct sockaddr_un addr; int sk1 = -1, sk2 = -1, ret = 1; struct stat sb; char filename[PATH_MAX], temp[PATH_MAX]; test_init(argc, argv); sprintf(filename, "%s/sock", dirname); sprintf(temp, "%s/temp", dirname); if (mkdir(dirname, 0755) < 0) { fail("mkdir"); goto out; } if (fill_sock_name(&addr, filename) < 0) { pr_err("filename \"%s\" is too long\n", filename); goto out; } sk1 = bind_and_listen(&addr); if (sk1 < 0) goto out; if (rename(filename, temp) < 0) { fail("rename"); goto out; } sk2 = bind_and_listen(&addr); if (sk2 < 0) goto out; if (rename(temp, filename) < 0) { fail("rename2"); goto out; } test_daemon(); test_waitsig(); if (getsockopt(sk1, 0, 0, NULL, 0) && errno != EOPNOTSUPP) { fail("socket 1 didn't survive restore"); goto out; } if (getsockopt(sk2, 0, 0, NULL, 0) && errno != EOPNOTSUPP) { fail("socket 2 didn't survive restore"); goto out; } if (stat(addr.sun_path, &sb) != 0) { fail("%s doesn't exist after restore\n", addr.sun_path); goto out; } pass(); ret = 0; out: if (sk1 > 0) close(sk1); if (sk2 > 0) close(sk2); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/del_standalone_un.desc000066400000000000000000000000271317335042600220710ustar00rootroot00000000000000{'flavor': 'h 
ns uns'} criu-3.6/test/zdtm/static/deleted_dev.c000066400000000000000000000030531317335042600201650ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a device special file " "open and unlinked before migration"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { int fd; struct stat st; /* /dev/null params - sure to exist in a VPS */ mode_t mode = S_IFCHR | 0700; dev_t dev = makedev(1, 3); test_init(argc, argv); if (mknod(filename, mode, dev)) { pr_perror("can't make device file \"%s\"", filename); exit(1); } fd = open(filename, O_RDWR); if (fd < 0) { pr_perror("can't open %s", filename); goto out; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); goto out; } test_daemon(); test_waitsig(); if (fstat(fd, &st) < 0) { fail("can't stat %s: %m", filename); goto out; } if (st.st_mode != mode || st.st_rdev != dev) { fail("%s is no longer the device file we had", filename); test_msg("mode %x want %x, dev %llx want %llx\n", st.st_mode, mode, (long long unsigned)st.st_rdev, (long long unsigned)dev); goto out; } if (close(fd) < 0) { fail("can't close %s: %m", filename); goto out; } if (unlink(filename) != -1 || errno != ENOENT) { fail("file %s should have been deleted before migration: unlink: %m\n", filename); goto out; } pass(); out: close(fd); unlink(filename); return 0; } criu-3.6/test/zdtm/static/deleted_dev.desc000066400000000000000000000000441317335042600206560ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/deleted_unix_sock.c000066400000000000000000000067541317335042600214240ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Create a unix socket, and destroy it before " "migration; check that the child can write 
to it " "and the parent can read from it after migration"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); static int fill_sock_name(struct sockaddr_un *name, const char *filename) { char *cwd; cwd = get_current_dir_name(); if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) return -1; name->sun_family = AF_LOCAL; sprintf(name->sun_path, "%s/%s", cwd, filename); return 0; } static int setup_srv_sock(void) { struct sockaddr_un name; int sock; if (fill_sock_name(&name, filename) < 0) { pr_perror("filename \"%s\" is too long", filename); return -1; } sock = socket(PF_LOCAL, SOCK_STREAM, 0); if (sock < 0) { pr_perror("can't create socket"); return -1; } if (bind(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) { pr_perror("can't bind to socket \"%s\"", filename); goto err; } if (listen(sock, 1) < 0) { pr_perror("can't listen on a socket \"%s\"", filename); goto err; } return sock; err: close(sock); return -1; } static int setup_clnt_sock(void) { struct sockaddr_un name; int sock; if (fill_sock_name(&name, filename) < 0) return -1; sock = socket(PF_LOCAL, SOCK_STREAM, 0); if (sock < 0) return -1; if (connect(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) goto err; return sock; err: close(sock); return -1; } int main(int argc, char ** argv) { int sock, acc_sock, ret; pid_t pid; uint32_t crc; uint8_t buf[1000]; test_init(argc, argv); sock = setup_srv_sock(); if (sock < 0) exit(1); pid = test_fork(); if (pid < 0) { pr_perror("can't fork"); exit(1); } if (pid == 0) { /* child writes to the unlinked socket and returns */ close(sock); sock = setup_clnt_sock(); if (sock < 0) _exit(1); test_waitsig(); crc = ~0; datagen(buf, sizeof(buf), &crc); if (write(sock, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write to socket"); exit(errno); } close(sock); exit(0); } acc_sock = accept(sock, NULL, NULL); if (acc_sock < 0) { pr_perror("can't accept() the connection on \"%s\"", filename); goto 
out_kill; } close(sock); sock = acc_sock; if (unlink(filename)) { pr_perror("can't unlink %s", filename); goto out_kill; } test_daemon(); test_waitsig(); if (kill(pid, SIGTERM)) { fail("terminating the child failed: %m\n"); goto out; } if (wait(&ret) != pid) { fail("wait() returned wrong pid %d: %m\n", pid); goto out; } if (WIFEXITED(ret)) { ret = WEXITSTATUS(ret); if (ret) { fail("child exited with nonzero code %d (%s)\n", ret, strerror(ret)); goto out; } } if (WIFSIGNALED(ret)) { fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); goto out; } if (read(sock, buf, sizeof(buf)) != sizeof(buf)) { fail("can't read %s: %m\n", filename); goto out; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("CRC mismatch\n"); goto out; } if (close(sock)) { fail("close failed: %m\n"); goto out; } if (unlink(filename) != -1 || errno != ENOENT) { fail("file %s should have been deleted before migration: unlink: %m\n", filename); goto out; } pass(); out_kill: kill(pid, SIGTERM); out: close(sock); return 0; } criu-3.6/test/zdtm/static/different_creds.c000066400000000000000000000053441317335042600210540ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that threads with different creds aren't checkpointed"; const char *test_author = "Tycho Andersen "; void *drop_caps_and_wait(void *arg) { int fd = *((int *) arg), i; void *retcode = (void *)0xdeadbeaf; cap_t caps; char c; typedef struct cap_set { cap_flag_value_t val; cap_flag_value_t new; cap_flag_t flag; cap_value_t bit; } cap_set_t; cap_set_t src[] = { { .val = CAP_CLEAR, .flag = CAP_EFFECTIVE, .bit = CAP_CHOWN, }, { .val = CAP_SET, .flag = CAP_EFFECTIVE, .bit = CAP_DAC_OVERRIDE, }, { .val = CAP_CLEAR, .flag = CAP_INHERITABLE, .bit = CAP_SETPCAP, }, { .val = CAP_SET, .flag = CAP_INHERITABLE, .bit = CAP_NET_BIND_SERVICE, }, }; caps = cap_get_proc(); if (!caps) { pr_perror("cap_get_proc"); return NULL; } 
for (i = 0; i < ARRAY_SIZE(src); i++) { if (cap_set_flag(caps, src[i].flag, 1, &src[i].bit, src[i].val) < 0) { pr_perror("Can't setup CAP %s", cap_to_name(src[i].bit)); goto die; } } if (cap_set_proc(caps) < 0) { pr_perror("cap_set_proc"); goto die; } if (write(fd, "a", 1) != 1) { pr_perror("Unable to send a status"); goto die; } if (read(fd, &c, 1) != 1) { pr_perror("Unable to read a status"); goto die; } for (i = 0; i < ARRAY_SIZE(src); i++) { if (cap_get_flag(caps, src[i].bit, src[i].flag, &src[i].new) < 0) { pr_perror("Can't get CAP %s", cap_to_name(src[i].bit)); goto die; } if (src[i].val != src[i].new) { pr_err("Val mismatch on CAP %s\n", cap_to_name(src[i].bit)); goto die; } } retcode = NULL; die: cap_free(caps); return retcode; } int main(int argc, char ** argv) { int pipefd[2]; pthread_t thr; char c; void *retcode; test_init(argc, argv); if (socketpair(AF_FILE, SOCK_SEQPACKET, 0, pipefd)) { pr_perror("pipe"); return -1; } if (pthread_create(&thr, NULL, drop_caps_and_wait, &pipefd[0])) { pr_perror("Unable to create thread"); return -1; } /* * Wait for child to signal us that it has droped caps. 
*/ if (read(pipefd[1], &c, 1) != 1) { pr_perror("read"); return 1; } test_daemon(); test_waitsig(); if (write(pipefd[1], &c, 1) != 1) { pr_perror("write"); return 1; } if (pthread_join(thr, &retcode)) { pr_perror("Unable to jount a thread"); return 1; } if (retcode != NULL) { fail("retcode returned %p", retcode); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/different_creds.desc000066400000000000000000000000411317335042600215350ustar00rootroot00000000000000{'flavor': 'h', 'flags': 'suid'} criu-3.6/test/zdtm/static/dumpable01.c000066400000000000000000000017701317335042600176570ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check dumpable flag handling (dumpable case)"; const char *test_author = "Filipe Brandenburger "; int main(int argc, char **argv) { int save_dumpable; int dumpable; test_init(argc, argv); save_dumpable = prctl(PR_GET_DUMPABLE); if (save_dumpable < 0) { pr_perror("error getting prctl(PR_GET_DUMPABLE) before dump"); return 1; } #ifdef DEBUG test_msg("DEBUG: before dump: dumpable=%d\n", save_dumpable); #endif /* Wait for criu dump and restore. 
*/ test_daemon(); test_waitsig(); dumpable = prctl(PR_GET_DUMPABLE); if (dumpable < 0) { pr_perror("error getting prctl(PR_GET_DUMPABLE) after restore"); return 1; } #ifdef DEBUG test_msg("DEBUG: after dump: dumpable=%d\n", dumpable); #endif if (dumpable != save_dumpable) { errno = 0; fail("dumpable flag was not preserved over migration"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/dumpable02.c000066400000000000000000000111071317335042600176530ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check dumpable flag handling (non-dumpable case)"; const char *test_author = "Filipe Brandenburger "; int dumpable_server() { char buf[256]; int ret; for (;;) { ret = read(0, buf, sizeof(buf)); if (ret == 0) break; ret = snprintf(buf, sizeof(buf), "DUMPABLE:%d\n", prctl(PR_GET_DUMPABLE)); write(1, buf, ret); } return 0; } int get_dumpable_from_pipes(int pipe_input, int pipe_output) { char buf[256]; int len; long value; char *endptr = NULL; /* input and output are from the child's point of view. */ write(pipe_input, "GET\n", 4); len = read(pipe_output, buf, sizeof(buf) - 1); if (len < 0) { pr_perror("error in parent reading from pipe"); return -1; } buf[len] = 0; if (memcmp(buf, "DUMPABLE:", 9) != 0) { pr_perror("child returned [%s]", buf); return -1; } value = strtol(&buf[9], &endptr, 10); if (!endptr || *endptr != '\n' || endptr != buf + len - 1) { pr_perror("child returned [%s]", buf); return -1; } return (int)value; } int main(int argc, char **argv) { int pipe_input[2]; int pipe_output[2]; int save_dumpable; int dumpable; int ret; pid_t pid; pid_t waited; int status; /* * Check if we are being re-executed to spawn the dumpable server. This * re-execution is what essentially causes the dumpable flag to be * cleared since we have execute but not read permissions to the * binary. 
*/ if (getenv("DUMPABLE_SERVER")) return dumpable_server(); /* * Otherwise, do normal startup and spawn a dumpable server. While we * are still running as root, chmod() the binary to give it execute but * not read permissions, that way when we execv() it as a non-root user * the kernel will drop our dumpable flag and reset it to the value in * /proc/sys/fs/suid_dumpable. */ ret = chmod(argv[0], 0111); if (ret < 0) { pr_perror("error chmodding %s", argv[0]); return 1; } test_init(argc, argv); ret = pipe(pipe_input); if (ret < 0) { pr_perror("error creating input pipe"); return 1; } ret = pipe(pipe_output); if (ret < 0) { pr_perror("error creating output pipe"); return 1; } pid = fork(); if (pid < 0) { pr_perror("error forking the dumpable server"); return 1; } if (pid == 0) { /* * Child process will execv() the dumpable server. Start by * reopening stdin and stdout to use the pipes, then set the * environment variable and execv() the same binary. */ close(0); close(1); ret = dup2(pipe_input[0], 0); if (ret < 0) { pr_perror("could not dup2 pipe into child's stdin"); return 1; } ret = dup2(pipe_output[1], 1); if (ret < 0) { pr_perror("could not dup2 pipe into child's stdout"); return 1; } close(pipe_output[0]); close(pipe_output[1]); close(pipe_input[0]); close(pipe_input[1]); ret = setenv("DUMPABLE_SERVER", "yes", 1); if (ret < 0) { pr_perror("could not set the DUMPABLE_SERVER env variable"); return 1; } ret = execl(argv[0], "dumpable_server", NULL); pr_perror("could not execv %s as a dumpable_server", argv[0]); return 1; } /* * Parent process, write to the pipe_input socket to ask the server * child to tell us what its dumpable flag value is on its side. */ close(pipe_input[0]); close(pipe_output[1]); save_dumpable = get_dumpable_from_pipes(pipe_input[1], pipe_output[0]); if (save_dumpable < 0) return 1; #ifdef DEBUG test_msg("DEBUG: before dump: dumpable=%d\n", save_dumpable); #endif /* Wait for dump and restore. 
*/ test_daemon(); test_waitsig(); dumpable = get_dumpable_from_pipes(pipe_input[1], pipe_output[0]); if (dumpable < 0) return 1; #ifdef DEBUG test_msg("DEBUG: after restore: dumpable=%d\n", dumpable); #endif if (dumpable != save_dumpable) { errno = 0; fail("dumpable flag was not preserved over migration"); return 1; } /* Closing the pipes will terminate the child server. */ close(pipe_input[1]); close(pipe_output[0]); waited = wait(&status); if (waited < 0) { pr_perror("error calling wait on the child"); return 1; } errno = 0; if (waited != pid) { pr_perror("waited pid %d did not match child pid %d", waited, pid); return 1; } if (!WIFEXITED(status)) { pr_perror("child dumpable server returned abnormally with status=%d", status); return 1; } if (WEXITSTATUS(status) != 0) { pr_perror("child dumpable server returned rc=%d", WEXITSTATUS(status)); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/dumpable02.desc000066400000000000000000000000461317335042600203470ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'nouser'} criu-3.6/test/zdtm/static/env00.c000066400000000000000000000013361317335042600166530ustar00rootroot00000000000000#include #include #include #include "zdtmtst.h" const char *test_doc = "Check that environment didn't change"; const char *test_author = "Pavel Emelianov "; char *envname; TEST_OPTION(envname, string, "environment variable name", 1); int main(int argc, char **argv) { char *env; test_init(argc, argv); if (setenv(envname, test_author, 1)) { pr_perror("Can't set env var \"%s\" to \"%s\"", envname, test_author); exit(1); } test_daemon(); test_waitsig(); env = getenv(envname); if (!env) { fail("can't get env var \"%s\": %m\n", envname); goto out; } if (strcmp(env, test_author)) fail("%s != %s\n", env, test_author); else pass(); out: return 0; } criu-3.6/test/zdtm/static/eventfs00.c000066400000000000000000000032211317335042600175300ustar00rootroot00000000000000#include #include #include #include #include #include #include #include 
#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef F_SETSIG #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. */ #endif const char *test_doc = "Check for eventfs"; const char *test_author = "Cyrill Gorcunov "; #define EVENTFD_INITIAL 30 #define EVENTFD_FINAL 90 int main(int argc, char *argv[]) { int efd, ret, epollfd; int pipefd[2]; uint64_t v = EVENTFD_INITIAL; struct epoll_event ev; test_init(argc, argv); epollfd = epoll_create(1); if (epollfd < 0) { fail("epoll_create"); exit(1); } efd = eventfd((unsigned int)v, EFD_NONBLOCK); if (efd < 0) { fail("eventfd"); exit(1); } memset(&ev, 0xff, sizeof(ev)); ev.events = EPOLLIN | EPOLLOUT; if (pipe(pipefd)) { fail("pipe"); exit(1); } if (epoll_ctl(epollfd, EPOLL_CTL_ADD, pipefd[0], &ev)) { fail("epoll_ctl"); exit(1); } test_msg("created eventfd with %"PRIu64"\n", v); ret = write(efd, &v, sizeof(v)); if (ret != sizeof(v)) { fail("write"); exit(1); } ret = write(efd, &v, sizeof(v)); if (ret != sizeof(v)) { fail("write"); exit(1); } test_daemon(); test_waitsig(); ret = read(efd, &v, sizeof(v)); if (ret != sizeof(v)) { fail("write"); exit(1); } if (v != EVENTFD_FINAL) { fail("EVENTFD_FINAL mismatch\n"); exit(1); } pass(); return 0; } criu-3.6/test/zdtm/static/fanotify00.c000066400000000000000000000167511317335042600177110ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifdef __x86_64__ # define __NR_fanotify_init 300 # define __NR_fanotify_mark 301 #elif defined(__PPC64__) # define __NR_fanotify_init 323 # define __NR_fanotify_mark 324 #elif __aarch64__ # define __NR_fanotify_init 262 # define __NR_fanotify_mark 263 #elif __s390x__ # define __NR_fanotify_init 332 # define __NR_fanotify_mark 333 #else # define __NR_fanotify_init 338 # define __NR_fanotify_mark 339 #endif const char *test_doc = "Check for 
fanotify delivery"; const char *test_author = "Cyrill Gorcunov "; const char fanotify_path[] = "fanotify-del-after-cr"; #define BUFF_SIZE (8192) struct fanotify_mark_inode { unsigned long i_ino; unsigned int s_dev; unsigned int mflags; unsigned int mask; unsigned int ignored_mask; unsigned int fhandle_bytes; unsigned int fhandle_type; unsigned char fhandle[512]; }; struct fanotify_mark_mount { unsigned int mnt_id; unsigned int mflags; unsigned int mask; unsigned int ignored_mask; }; struct fanotify_glob { unsigned int faflags; unsigned int evflags; }; struct fanotify_obj { struct fanotify_glob glob; struct fanotify_mark_inode inode; struct fanotify_mark_mount mount; }; static int fanotify_init(unsigned int flags, unsigned int event_f_flags) { return syscall(__NR_fanotify_init, flags, event_f_flags); } static int fanotify_mark(int fanotify_fd, unsigned int flags, unsigned long mask, int dfd, const char *pathname) { #ifdef __i386__ return syscall(__NR_fanotify_mark, fanotify_fd, flags, mask, 0, dfd, pathname); #else return syscall(__NR_fanotify_mark, fanotify_fd, flags, mask, dfd, pathname); #endif } #define fdinfo_field(str, field) !strncmp(str, field":", sizeof(field)) static void show_fanotify_obj(struct fanotify_obj *obj) { test_msg("fanotify obj at %p\n", obj); test_msg(" glob\n"); test_msg(" faflags: %x evflags: %x\n", obj->glob.faflags, obj->glob.evflags); test_msg(" inode\n"); test_msg(" i_ino: %lx s_dev: %x mflags: %x " "mask: %x ignored_mask: %x " "fhandle_bytes: %x fhandle_type: %x " "fhandle: %s", obj->inode.i_ino, obj->inode.s_dev, obj->inode.mflags, obj->inode.mask, obj->inode.ignored_mask, obj->inode.fhandle_bytes, obj->inode.fhandle_type, obj->inode.fhandle); test_msg(" mount\n"); test_msg(" mnt_id: %x mflags: %x mask: %x ignored_mask: %x\n", obj->mount.mnt_id, obj->mount.mflags, obj->mount.mask, obj->mount.ignored_mask); } static void copy_fhandle(char *tok, struct fanotify_mark_inode *inode) { int off = 0; while (*tok && (*tok > '0' || *tok < 'f')) 
{ inode->fhandle[off++] = *tok++; if (off >= sizeof(inode->fhandle) - 1) break; } inode->fhandle[off] = '\0'; } static int cmp_fanotify_obj(struct fanotify_obj *old, struct fanotify_obj *new) { /* * mnt_id and s_dev may change during container migration, * moreover the backend (say PLOOP) may be re-mounted during * c/r, so exclude them. */ if ((old->glob.faflags != new->glob.faflags) || (old->glob.evflags != new->glob.evflags) || (old->inode.i_ino != new->inode.i_ino) || (old->inode.mflags != new->inode.mflags) || (old->inode.mask != new->inode.mask) || (old->inode.ignored_mask != new->inode.ignored_mask)) return -1; if (memcmp(old->inode.fhandle, new->inode.fhandle, sizeof(new->inode.fhandle))) return -2; if ((old->mount.mflags != new->mount.mflags) || (old->mount.mask != new->mount.mask) || (old->mount.ignored_mask != new->mount.ignored_mask)) return -3; return 0; } int parse_fanotify_fdinfo(int fd, struct fanotify_obj *obj, unsigned int expected_to_meet) { unsigned int met = 0; char str[512]; FILE *f; int ret; sprintf(str, "/proc/self/fdinfo/%d", fd); f = fopen(str, "r"); if (!f) { pr_perror("Can't open fdinfo to parse"); return -1; } while (fgets(str, sizeof(str), f)) { if (fdinfo_field(str, "fanotify flags")) { ret = sscanf(str, "fanotify flags:%x event-flags:%x", &obj->glob.faflags, &obj->glob.evflags); if (ret != 2) goto parse_err; met++; continue; } if (fdinfo_field(str, "fanotify mnt_id")) { ret = sscanf(str, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x", &obj->mount.mnt_id, &obj->mount.mflags, &obj->mount.mask, &obj->mount.ignored_mask); if (ret != 4) goto parse_err; met++; continue; } if (fdinfo_field(str, "fanotify ino")) { int hoff; ret = sscanf(str, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x " "fhandle-bytes:%x fhandle-type:%x f_handle: %n", &obj->inode.i_ino, &obj->inode.s_dev, &obj->inode.mflags, &obj->inode.mask, &obj->inode.ignored_mask, &obj->inode.fhandle_bytes, &obj->inode.fhandle_type, &hoff); if (ret != 7) goto 
parse_err; copy_fhandle(&str[hoff], &obj->inode); met++; continue; } } if (expected_to_meet != met) { pr_perror("Expected to meet %d entries but got %d", expected_to_meet, met); return -1; } return 0; parse_err: pr_perror("Can't parse '%s'", str); return -1; } int main (int argc, char *argv[]) { struct fanotify_obj old = { }, new = { }; int fa_fd, fd, del_after; char buf[BUFF_SIZE]; ssize_t length; int ns = getenv("ZDTM_NEWNS") != NULL; test_init(argc, argv); if (ns) { if (mkdir("/tmp", 666) && errno != EEXIST) { pr_perror("Unable to create the /tmp directory"); return -1; } if (mount("zdtm", "/tmp", "tmpfs", 0, NULL)) { pr_perror("Unable to mount tmpfs into %s", "/tmp"); } } fa_fd = fanotify_init(FAN_NONBLOCK | FAN_CLASS_NOTIF | FAN_UNLIMITED_QUEUE, O_RDONLY | O_LARGEFILE); if (fa_fd < 0) { pr_perror("fanotify_init failed"); exit(1); } del_after = open(fanotify_path, O_CREAT | O_TRUNC); if (del_after < 0) { pr_perror("open failed"); exit(1); } if (fanotify_mark(fa_fd, FAN_MARK_ADD, FAN_MODIFY | FAN_ACCESS | FAN_OPEN | FAN_CLOSE, AT_FDCWD, fanotify_path)) { pr_perror("fanotify_mark failed"); exit(1); } if (fanotify_mark(fa_fd, FAN_MARK_ADD | FAN_MARK_MOUNT, FAN_ONDIR | FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/tmp")) { pr_perror("fanotify_mark failed"); exit(1); } if (fanotify_mark(fa_fd, FAN_MARK_ADD | FAN_MARK_MOUNT | FAN_MARK_IGNORED_MASK | FAN_MARK_IGNORED_SURV_MODIFY, FAN_MODIFY | FAN_ACCESS, AT_FDCWD, "/tmp")) { pr_perror("fanotify_mark failed"); exit(1); } if (parse_fanotify_fdinfo(fa_fd, &old, 3)) { pr_perror("parsing fanotify fdinfo failed"); exit(1); } show_fanotify_obj(&old); test_daemon(); test_waitsig(); fd = open("/", O_RDONLY); close(fd); fd = open(fanotify_path, O_RDWR); close(fd); if (unlink(fanotify_path)) { fail("can't unlink %s\n", fanotify_path); exit(1); } if (parse_fanotify_fdinfo(fa_fd, &new, 3)) { fail("parsing fanotify fdinfo failed\n"); exit(1); } show_fanotify_obj(&new); if (cmp_fanotify_obj(&old, &new)) { fail("fanotify mismatch on fdinfo 
level\n"); exit(1); } length = read(fa_fd, buf, sizeof(buf)); if (length <= 0) { fail("No events in fanotify queue\n"); exit(1); } if (fanotify_mark(fa_fd, FAN_MARK_REMOVE | FAN_MARK_MOUNT, FAN_ONDIR | FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/tmp")) { pr_perror("fanotify_mark failed"); exit(1); } pass(); return 0; } criu-3.6/test/zdtm/static/fanotify00.desc000066400000000000000000000000441317335042600203710ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/fd.c000066400000000000000000000033711317335042600163150ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" #include "lock.h" const char *test_doc = "Check that criu closes up all its descriptors"; const char *test_author = "Andrew Vagin "; int main(int argc, char **argv) { struct dirent *de; char pfd[PATH_MAX]; mutex_t *lock; int status; pid_t pid; DIR *d; test_init(argc, argv); lock = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (lock == MAP_FAILED) return 1; mutex_init(lock); mutex_lock(lock); pid = fork(); if (pid < 0) { pr_perror("fork()"); return 1; } if (pid == 0) { d = opendir("/proc/self/fd"); if (d == NULL) return 1; while ((de = readdir(d))) { int fd; if (de->d_name[0] == '.') continue; fd = atoi(de->d_name); if (dirfd(d) == fd) continue; close(fd); } closedir(d); mutex_unlock(lock); test_waitsig(); return 0; } mutex_lock(lock); test_daemon(); test_waitsig(); snprintf(pfd, sizeof(pfd), "/proc/%d/fd", pid); d = opendir(pfd); if (d == NULL) return 2; while ((de = readdir(d))) { int ret; if (de->d_name[0] == '.') continue; ret = readlinkat(dirfd(d), de->d_name, pfd, sizeof(pfd) - 1); if (ret < 0) { pr_perror("readlink"); ret = 0; } pfd[ret] = '\0'; fail("Unexpected fd: %s -> %s\n", de->d_name, pfd); return 1; } closedir(d); kill(pid, SIGTERM); if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid()"); return 1; } if (status != 0) { fail("%d:%d:%d:%d", 
WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/fdt_shared.c000066400000000000000000000064721317335042600200340ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check a shared file descriptor table."; const char *test_author = "Andrew Vagin "; char *filename; TEST_OPTION(filename, string, "file name", 1); #define STACK_SIZE 4096 #define TEST_FD 128 #define TEST_STRING "Hello World!" #define CHILDREN 4 static int fork_pfd[2]; static void forked() { char c = 0; if (write(fork_pfd[1], &c, 1) != 1) { pr_perror("Unable to send a signal to the parent"); exit(5); } } static void wait_children() { int i; char c; for (i = 0; i < CHILDREN; i++) { if (read(fork_pfd[0], &c, 1) != 1) { pr_perror("Unable to read a signal from a child"); exit(5); } } } static pid_t clone_child(int (*fn)(void *), int flags) { char stack[STACK_SIZE] __stack_aligned__; pid_t pid; pid = clone(fn, stack + STACK_SIZE, flags | SIGCHLD, NULL); if (pid == -1) { pr_perror("Unable to clone a new process"); return -1; } return pid; } static int child2(void *_arg) { char buf[sizeof(TEST_STRING)]; forked(); test_waitsig(); if (read(TEST_FD, buf, sizeof(TEST_STRING)) != sizeof(TEST_STRING)) { pr_perror("Unable to read from %d", TEST_FD); return 1; } return 0; } static int child3(void *_arg) { forked(); test_waitsig(); if (close(TEST_FD) != -1) { fail("%d is exist\n", TEST_FD); return 1; } return 0; } static int child(void *_arg) { char buf[sizeof(TEST_STRING)]; pid_t pid, pid2; int status; pid = clone_child(child2, CLONE_FILES); if (pid < 0) return 1; pid2 = clone_child(child3, 0); if (pid < 0) return 1; forked(); test_waitsig(); kill(pid2, SIGTERM); kill(pid, SIGTERM); waitpid(pid2, &status, 0); if (status) { fail("The child3 returned %d\n", status); return 1; } waitpid(pid, &status, 0); if (status) { fail("The child2 returned 
%d\n", status); return 1; } if (read(TEST_FD, buf, sizeof(TEST_STRING)) != sizeof(TEST_STRING)) { pr_perror("Unable to read from %d", TEST_FD); return 1; } if (close(TEST_FD) == -1) { pr_perror("Unable to close(%d)", TEST_FD); return 1; } return 0; } int main(int argc, char ** argv) { int status; pid_t pid, pid2; int fd, i; test_init(argc, argv); if (pipe(fork_pfd)) { pr_perror("pipe"); return 1; } pid = clone_child(child, CLONE_FILES); if (pid < 0) return 1; pid2 = clone_child(child2, CLONE_FILES); if (pid2 < 0) return 1; wait_children(); test_daemon(); test_waitsig(); fd = open(filename, O_RDWR | O_CREAT, 0666); if (fd == -1) { pr_perror("Can't open /dev/zero"); return -1; } for (i = 0; i < 3; i++) if (write(fd, TEST_STRING, sizeof(TEST_STRING)) != sizeof(TEST_STRING)) { pr_perror("Unable to write a test string"); return -1; } fd = dup2(fd, TEST_FD); if (fd == -1) { pr_perror("Can't dup fd %d to %d", fd, TEST_FD); return -1; } lseek(fd, 0, SEEK_SET); kill(pid2, SIGTERM); waitpid(pid2, &status, 0); kill(pid, SIGTERM); if (status) { fail("The child returned %d\n", status); return 1; } waitpid(pid, &status, 0); if (status) { fail("The child returned %d\n", status); return 1; } if (close(TEST_FD) == 0) { fail("%d was not closed\n", TEST_FD); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/fifo-ghost.c000066400000000000000000000025651317335042600177750ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that a ghost fifo with data restored"; const char *test_author = "Cyrill Gorcunov "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { int fd; int fd_ro; mode_t mode = S_IFIFO | 0700; uint8_t buf[256]; uint32_t crc; int ret; test_init(argc, argv); if (mknod(filename, mode, 0)) { pr_perror("can't make fifo \"%s\"", filename); exit(1); } fd = open(filename, O_RDWR); if (fd < 0) { pr_perror("can't open %s", filename); return 1; } fd_ro 
= open(filename, O_RDONLY); if (fd_ro < 0) { pr_perror("can't open %s", filename); return 1; } crc = ~0; datagen(buf, sizeof(buf), &crc); ret = write(fd, buf, sizeof(buf)); if (ret != sizeof(buf)) { pr_perror("write() failed"); return 1; } if (unlink(filename) < 0) { fail("can't unlink %s", filename); return 1; } close(fd); test_daemon(); test_waitsig(); ret = read(fd_ro, buf, sizeof(buf)); if (ret != sizeof(buf)) { pr_perror("read() failed"); return 1; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("data corrupted"); return 1; } if (close(fd_ro) < 0) { fail("can't close %s", filename); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/fifo-rowo-pair.c000066400000000000000000000060361317335042600205650ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test for fifo ro/wo with " "fake fifo needed on criu side"; const char *test_author = "Cyrill Gorcunov "; char *name_master; TEST_OPTION(name_master, string, "master fifo name", 1); char *name_slave; TEST_OPTION(name_slave, string, "slave fifo name", 1); #define TEST_VALUE (00100) #define exit_shot(pid, code) \ do { kill(pid, SIGKILL); exit(code); } while (0) #define exit_shot_parent(code) \ exit_shot(getppid(), 1) int main(int argc, char **argv) { task_waiter_t t; pid_t pid; int fd_master, fd_slave; int v, status; test_init(argc, argv); if (mknod(name_master, S_IFIFO | 0700, 0)) { pr_perror("can't make fifo \"%s\"", name_master); exit(1); } if (mknod(name_slave, S_IFIFO | 0700, 0)) { pr_perror("can't make fifo \"%s\"", name_slave); exit(1); } fd_slave = open(name_slave, O_RDWR); if (fd_slave < 0) { pr_perror("can't open %s", name_slave); exit(1); } task_waiter_init(&t); pid = test_fork(); if (pid == 0) { int new_slave; fd_master = open(name_master, O_WRONLY); if (fd_master < 0) { pr_perror("can't open %s", name_master); exit_shot_parent(1); } new_slave = dup2(fd_slave, 
64); if (new_slave < 0) { pr_perror("can't dup %s", name_slave); exit_shot_parent(1); } close(fd_slave); task_waiter_complete_current(&t); v = TEST_VALUE; if (write(new_slave, &v, sizeof(v)) != sizeof(v)) { pr_perror("write failed"); exit_shot_parent(1); } v = TEST_VALUE; if (write(fd_master, &v, sizeof(v)) != sizeof(v)) { pr_perror("write failed"); exit_shot_parent(1); } /* Don't exit until explicitly asked */ task_waiter_wait4(&t, getppid()); exit(0); } else if (pid < 0) { pr_perror("test_fork failed"); exit(1); } fd_master = open(name_master, O_RDONLY); if (fd_master < 0) { pr_perror("can't open %s", name_master); exit_shot(pid, 1); } /* Wait until data appear in kernel fifo buffer */ task_waiter_wait4(&t, pid); test_daemon(); test_waitsig(); if (read(fd_master, &v, sizeof(v)) != sizeof(v)) { pr_perror("read failed"); exit_shot(pid, 1); } task_waiter_complete_current(&t); if (v != TEST_VALUE) { fail("read data mismatch\n"); exit_shot(pid, 1); } if (read(fd_slave, &v, sizeof(v)) != sizeof(v)) { pr_perror("read failed"); exit_shot(pid, 1); } if (v != TEST_VALUE) { fail("read data mismatch\n"); exit_shot(pid, 1); } waitpid(pid, &status, P_ALL); if (unlink(name_master) < 0) pr_perror("can't unlink %s", name_master); if (unlink(name_slave) < 0) pr_perror("can't unlink %s", name_slave); if (!WIFEXITED(status)) { pr_perror("child %d is still running", pid); exit_shot(pid, 1); } errno = WEXITSTATUS(status); if (errno) { fail("Child exited with error %m"); exit(errno); } pass(); return 0; } criu-3.6/test/zdtm/static/fifo.c000066400000000000000000000030021317335042600166360ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a named pipe " "open"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); #define BUF_SIZE (16 * 4096) /* A fifo buffer has 16 slots by default */ int main(int argc, char **argv) { int fd; 
struct stat st; mode_t mode = S_IFIFO | 0700; uint8_t buf[BUF_SIZE]; uint32_t crc; int ret;; test_init(argc, argv); if (mknod(filename, mode, 0)) { pr_perror("can't make fifo \"%s\"", filename); exit(1); } fd = open(filename, O_RDWR); if (fd < 0) { pr_perror("can't open %s", filename); return 1; } crc = ~0; datagen(buf, BUF_SIZE, &crc); ret = write(fd, buf, BUF_SIZE); if (ret != BUF_SIZE) { pr_perror("write() failed"); return 1; } test_daemon(); test_waitsig(); ret = read(fd, buf, BUF_SIZE); if (ret != BUF_SIZE) { pr_perror("read() failed"); return 1; } crc = ~0; if (datachk(buf, BUF_SIZE, &crc)) { fail("data corrupted\n"); return 1; } if (close(fd) < 0) { fail("can't close %s: %m", filename); return 1; } if (stat(filename, &st) < 0) { fail("can't stat %s: %m", filename); return 1; } if (st.st_mode != mode) { fail("%s is no longer the fifo we had", filename); return 1; } if (unlink(filename) < 0) { fail("can't unlink %s: %m", filename); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/fifo_ro.c000066400000000000000000000032221317335042600173420ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that a fifo read-only descriptor is restored with data"; const char *test_author = "Andrew Vagin "; char *filename; TEST_OPTION(filename, string, "file name", 1); #define BUF_SIZE (16 * 4096) /* A fifo buffer has 16 slots by default */ int main(int argc, char **argv) { int fd; int fd_ro; struct stat st; mode_t mode = S_IFIFO | 0700; uint8_t buf[BUF_SIZE]; uint32_t crc; int ret;; test_init(argc, argv); if (mknod(filename, mode, 0)) { pr_perror("can't make fifo \"%s\"", filename); exit(1); } fd = open(filename, O_RDWR); if (fd < 0) { pr_perror("can't open %s", filename); return 1; } fd_ro = open(filename, O_RDONLY); if (fd_ro < 0) { pr_perror("can't open %s", filename); return 1; } crc = ~0; datagen(buf, BUF_SIZE, &crc); ret = write(fd, buf, BUF_SIZE); if (ret != BUF_SIZE) { 
pr_perror("write() failed"); return 1; } close(fd); test_daemon(); test_waitsig(); ret = read(fd_ro, buf, BUF_SIZE); if (ret != BUF_SIZE) { pr_perror("read() failed"); return 1; } crc = ~0; if (datachk(buf, BUF_SIZE, &crc)) { fail("data corrupted\n"); return 1; } if (close(fd_ro) < 0) { fail("can't close %s: %m", filename); return 1; } if (stat(filename, &st) < 0) { fail("can't stat %s: %m", filename); return 1; } if (st.st_mode != mode) { fail("%s is no longer the fifo we had", filename); return 1; } if (unlink(filename) < 0) { fail("can't unlink %s: %m", filename); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/fifo_wronly.c000066400000000000000000000042331317335042600202570ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a named pipe, " "opened in WRONLY mode"; #define BUF_SIZE 256 char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { task_waiter_t t; int fd, fd1; struct stat st; mode_t mode = S_IFIFO | 0600; int pid; int chret; test_init(argc, argv); task_waiter_init(&t); if (mknod(filename, mode, 0)) { pr_perror("can't make fifo \"%s\"", filename); exit(1); } pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); exit(1); } if (pid == 0) { char rbuf[BUF_SIZE]; int res; fd1 = open(filename, O_RDONLY); if (fd1 < 0) { pr_perror("open(%s, O_RDONLY) Failed", filename); chret = errno; return chret; } task_waiter_complete(&t, 1); res = read(fd1, rbuf, 7); if (res < 0) { pr_perror("read error %s", filename); chret = errno; return chret; } else if (res == 0) { pr_perror("read(%d, rbuf, 7) return 0", fd1); return 1; } if (close(fd1) < 0) { fail("can't close %d, %s: %m", fd1, filename); chret = errno; return chret; } } else { fd = open(filename, O_WRONLY); if (fd < 0) { pr_perror("open(%s, O_WRONLY) Failed", filename); kill(pid, SIGKILL); wait(NULL); return 1; } 
task_waiter_wait4(&t, 1); test_daemon(); test_waitsig(); if (write(fd, "string", 7) == -1) { pr_perror("write(%d, 'string', 7) Failed", fd); return 1; } wait(&chret); chret = WEXITSTATUS(chret); if (chret) { fail("child exited with non-zero code %d (%s)\n", chret, strerror(chret)); return 1; } if (close(fd) < 0) { fail("can't close %d, %s: %m", fd, filename); return 1; } if (stat(filename, &st) < 0) { fail("can't stat %s: %m", filename); return 1; } if (st.st_mode != mode) { fail("%s is no longer the fifo we had", filename); return 1; } if (unlink(filename) < 0) { fail("can't unlink %s: %m", filename); return 1; } } pass(); return 0; } criu-3.6/test/zdtm/static/file_append.c000066400000000000000000000020441317335042600201660ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check O_APPEND preserved"; const char *test_author = "Pavel Emelyanov "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { int fd, fd2, ret; char tmp[3]; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT | O_APPEND, 0644); if (fd == -1) return 1; fd2 = open(filename, O_RDWR, 0644); if (fd2 == -1) return 1; test_daemon(); test_waitsig(); if (write(fd2, "x", 1) != 1) { pr_perror("Can't write x"); return 1; } if (write(fd, "y", 1) != 1) { pr_perror("Can't write y"); return 1; } lseek(fd2, 0, SEEK_SET); ret = read(fd2, tmp, 3); if (ret != 2) { fail("Smth's wrong with file size"); return 1; } tmp[2] = '\0'; if (strcmp(tmp, "xy")) { fail("Smth's wron with file contents (%s)", tmp); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/file_attr.c000066400000000000000000000047061317335042600177000ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that attributes and content of an open, " "written to, and then unlinked file migrate " "correctly"; 
const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); #define DEF_PERMS 06604 /* -rwS--Sr--, really esoteric one */ unsigned int perms = DEF_PERMS; TEST_OPTION(perms, uint, "permissions to set on file " "(default " __stringify(DEF_PERMS) ")", 0); #define DEF_MTIME 123456 /* another really esoteric one */ unsigned int mtime = DEF_MTIME; TEST_OPTION(mtime, uint, "mtime to set on file " "(default " __stringify(DEF_MTIME) ")", 0); int main(int argc, char ** argv) { int fd; struct utimbuf ut; uint32_t crc; struct stat st; uint8_t buf[1000000]; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } crc = ~0; datagen(buf, sizeof(buf), &crc); if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write to %s", filename); exit(1); } ut = (struct utimbuf) { .actime = 0, .modtime = mtime, }; if (utime(filename, &ut)) { pr_perror("can't set modtime %d on %s", mtime, filename); exit(1); } if (fchmod(fd, perms)) { pr_perror("can't set perms %o on %s", perms, filename); exit(1); } if (unlink(filename)) { pr_perror("can't unlink %s", filename); exit(1); } test_daemon(); test_waitsig(); if (lseek(fd, 0, SEEK_SET) < 0) { fail("lseeking to the beginning of file failed: %m\n"); goto out; } if (read(fd, buf, sizeof(buf)) != sizeof(buf)) { fail("can't read %s: %m\n", filename); goto out; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("CRC mismatch\n"); goto out; } if (fstat(fd, &st) < 0) { fail("can't fstat %s: %m", filename); goto out; } if ((st.st_mode & 07777) != perms) { fail("permissions have changed"); goto out; } if (st.st_mtime != mtime) { fail("modification time has changed"); goto out; } if (close(fd)) { fail("close failed: %m\n"); goto out_noclose; } if (unlink(filename) != -1 || errno != ENOENT) { fail("file %s should have been deleted before migration: unlink: %m\n", filename); goto out_noclose; } pass(); out: 
close(fd); out_noclose: return 0; } criu-3.6/test/zdtm/static/file_fown.c000066400000000000000000000073051317335042600176750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef F_SETSIG #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. */ #endif const char *test_doc = "Check for signal delivery on file owners"; const char *test_author = "Cyrill Gorcunov "; struct params { int sigio; int pipe_flags[2]; int pipe_pid[2]; int pipe_sig[2]; } *shared; static void signal_handler_io(int status) { shared->sigio++; } static void fill_pipe_params(struct params *p, int *pipes) { p->pipe_flags[0] = fcntl(pipes[0], F_GETFL); p->pipe_flags[1] = fcntl(pipes[1], F_GETFL); test_msg("pipe_flags0 %08o\n", p->pipe_flags[0]); test_msg("pipe_flags1 %08o\n", p->pipe_flags[1]); p->pipe_pid[0] = fcntl(pipes[0], F_GETOWN); p->pipe_pid[1] = fcntl(pipes[1], F_GETOWN); p->pipe_sig[0] = fcntl(pipes[0], F_GETSIG); p->pipe_sig[1] = fcntl(pipes[1], F_GETSIG); } static int cmp_pipe_params(struct params *p1, struct params *p2) { int i; for (i = 0; i < 2; i++) { if (p1->pipe_flags[i] != p2->pipe_flags[i]) { fail("pipe flags failed [%d] expected %08o got %08o\n", i, p1->pipe_flags[i], p2->pipe_flags[i]); return -1; } if (p1->pipe_pid[i] != p2->pipe_pid[i]) { fail("pipe pid failed [%d] expected %d got %d\n", i, p1->pipe_pid[i], p2->pipe_pid[i]); return -1; } if (p1->pipe_sig[i] != p2->pipe_sig[i]) { fail("pipe sig failed [%d] expected %d got %d\n", i, p1->pipe_sig[i], p2->pipe_sig[i]); return -1; } } return 0; } int main(int argc, char *argv[]) { struct sigaction saio = { }; struct params obtained = { }; uid_t ruid, euid, suid; int status, pipes[2]; pid_t pid; test_init(argc, argv); shared = (void *)mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if ((void *)shared == MAP_FAILED) { fail("mmap failed"); exit(1); } if (getresuid(&ruid, 
&euid, &suid)) { fail("getresuid failed\n"); exit(1); } if (pipe(pipes)) { pr_perror("Can't create pipe"); exit(1); } saio.sa_handler = (sig_t)signal_handler_io; saio.sa_flags = SA_RESTART; if (sigaction(SIGIO, &saio, 0)) { fail("sigaction failed\n"); exit(1); } if (!getuid() && setresuid(-1, 1, -1)) { fail("setresuid failed\n"); exit(1); } if (fcntl(pipes[0], F_SETOWN, getpid()) || fcntl(pipes[1], F_SETOWN, getpid()) || fcntl(pipes[0], F_SETSIG, SIGIO) || fcntl(pipes[1], F_SETSIG, SIGIO) || fcntl(pipes[0], F_SETFL, fcntl(pipes[0], F_GETFL) | O_ASYNC) || fcntl(pipes[1], F_SETFL, fcntl(pipes[1], F_GETFL) | O_ASYNC)) { fail("fcntl failed\n"); exit(1); } fill_pipe_params(shared, pipes); if (setresuid(-1, euid, -1)) { fail("setresuid failed\n"); exit(1); } pid = test_fork(); if (pid < 0) { pr_perror("can't fork"); exit(1); } if (pid == 0) { struct params p = { }; test_waitsig(); fcntl(pipes[1], F_SETOWN, getpid()); fill_pipe_params(&p, pipes); if (write(pipes[1], &p, sizeof(p)) != sizeof(p)) { fail("write failed\n"); exit(1); } exit(0); } test_daemon(); test_waitsig(); kill(pid, SIGTERM); if (waitpid(pid, &status, P_ALL) == -1) { fail("waitpid failed\n"); exit(1); } if (read(pipes[0], &obtained, sizeof(obtained)) != sizeof(obtained)) { fail("read failed\n"); exit(1); } if (shared->sigio < 1) { fail("shared->sigio = %d (> 0 expected)\n", shared->sigio); exit(1); } shared->pipe_pid[1] = pid; if (cmp_pipe_params(shared, &obtained)) { fail("params comparison failed\n"); exit(1); } pass(); return 0; } criu-3.6/test/zdtm/static/file_fown.desc000066400000000000000000000000201317335042600203540ustar00rootroot00000000000000{'flavor': 'h'} criu-3.6/test/zdtm/static/file_locks00.c000066400000000000000000000101071317335042600201710ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that posix flocks are restored"; const char *test_author = "Qiang Huang "; char *filename; 
TEST_OPTION(filename, string, "file name", 1); char file0[PATH_MAX]; char file1[PATH_MAX]; static int lock_reg(int fd, int cmd, int type, int whence, off_t offset, off_t len) { struct flock64 lock; lock.l_type = type; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ lock.l_start = offset; /* byte offset, relative to l_whence */ lock.l_len = len; /* #bytes (0 means to EOF) */ errno = 0; return fcntl(fd, cmd, &lock); } #define set_read_lock(fd, whence, offset, len) \ lock_reg(fd, F_SETLK64, F_RDLCK, whence, offset, len) #define set_write_lock(fd, whence, offset, len) \ lock_reg(fd, F_SETLK64, F_WRLCK, whence, offset, len) static int check_read_lock(int fd, int whence, off_t offset, off_t len) { struct flock64 lock; int ret; lock.l_type = F_RDLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ lock.l_start = offset; /* byte offset, relative to l_whence */ lock.l_len = len; /* #bytes (0 means to EOF) */ lock.l_pid = -1; errno = 0; ret = fcntl(fd, F_GETLK64, &lock); if (ret == -1) { pr_perror("F_GETLK failed."); return -1; } if (lock.l_pid == -1) { /* Share lock should succeed. */ return 0; } fail("Read lock check failed."); return -1; } static int check_write_lock(int fd, int whence, off_t offset, off_t len) { struct flock64 lock; int ret; pid_t ppid = getppid(); lock.l_type = F_WRLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ lock.l_start = offset; /* byte offset, relative to l_whence */ lock.l_len = len; /* #bytes (0 means to EOF) */ lock.l_pid = -1; errno = 0; ret = fcntl(fd, F_GETLK64, &lock); if (ret == -1) { pr_perror("F_GETLK failed."); return -1; } if (lock.l_pid == -1) { fail("Write lock check failed."); return -1; } /* * It only succeed when the file lock's owner is exactly * the same as the file lock was dumped. 
*/ if (lock.l_pid == ppid) return 0; fail("Write lock check failed."); return -1; } static int check_file_locks() { int fd_0, fd_1; int ret0, ret1; fd_0 = open(file0, O_RDWR | O_CREAT, 0644); if (fd_0 < 0) { pr_perror("Unable to open file %s", file0); return -1; } ret0 = check_read_lock(fd_0, SEEK_SET, 0, 0); fd_1 = open(file1, O_RDWR | O_CREAT, 0644); if (fd_1 < 0) { close(fd_0); unlink(file0); pr_perror("Unable to open file %s", file1); return -1; } ret1 = check_write_lock(fd_1, SEEK_SET, 0, 0); close(fd_0); close(fd_1); return ret0 | ret1; } int main(int argc, char **argv) { int fd_0, fd_1, ret; pid_t pid; test_init(argc, argv); snprintf(file0, sizeof(file0), "%s.0", filename); snprintf(file1, sizeof(file0), "%s.1", filename); fd_0 = open(file0, O_RDWR | O_CREAT | O_EXCL, 0666); if (fd_0 < 0) { pr_perror("Unable to open file %s", file0); return -1; } fd_1 = open(file1, O_RDWR | O_CREAT | O_EXCL, 0666); if (fd_1 < 0) { close(fd_0); unlink(file0); pr_perror("Unable to open file %s", file1); return -1; } pid = fork(); if (pid < 0) { pr_perror("Can't fork"); return -1; } if (pid == 0) { /* child will check father's file locks */ test_waitsig(); if (check_file_locks()) { fail("Posix file lock check failed"); exit(1); } pass(); exit(0); } ret = set_read_lock(fd_0, SEEK_SET, 0, 0); if (ret == -1) { pr_perror("Failed to set read lock"); kill(pid, SIGTERM); return -1; } ret = set_write_lock(fd_1, SEEK_SET, 0, 0); if (ret == -1) { pr_perror("Failed to set write lock"); kill(pid, SIGTERM); return -1; } test_daemon(); test_waitsig(); kill(pid, SIGTERM); waitpid(pid, NULL, 0); close(fd_0); close(fd_1); unlink(file0); unlink(file1); return 0; } criu-3.6/test/zdtm/static/file_locks00.desc000066400000000000000000000000521317335042600206630ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_locks01.c000066400000000000000000000072041317335042600201760ustar00rootroot00000000000000#include #include #include #include #include 
#include #include #include #include "zdtmtst.h" #include "fs.h" #ifndef LOCK_MAND #define LOCK_MAND 32 #endif #ifndef LOCK_READ #define LOCK_READ 64 #endif const char *test_doc = "Check that flock locks are restored"; const char *test_author = "Qiang Huang "; char *filename; TEST_OPTION(filename, string, "file name", 1); char file0[PATH_MAX]; char file1[PATH_MAX]; char file2[PATH_MAX]; unsigned int inodes[3]; static mnt_info_t *m; dev_t dev; static int open_all_files(int *fd_0, int *fd_1, int *fd_2) { struct stat buf; snprintf(file0, sizeof(file0), "%s.0", filename); snprintf(file1, sizeof(file0), "%s.1", filename); snprintf(file2, sizeof(file0), "%s.2", filename); *fd_0 = open(file0, O_RDWR | O_CREAT | O_EXCL, 0666); if (*fd_0 < 0) { pr_perror("Unable to open file %s", file0); return -1; } fstat(*fd_0, &buf); inodes[0] = buf.st_ino; if (!strcmp(m->fsname, "btrfs")) dev = m->s_dev; else dev = buf.st_dev; *fd_1 = open(file1, O_RDWR | O_CREAT | O_EXCL, 0666); if (*fd_1 < 0) { close(*fd_0); unlink(file0); pr_perror("Unable to open file %s", file1); return -1; } fstat(*fd_1, &buf); inodes[1] = buf.st_ino; *fd_2 = open(file2, O_RDWR | O_CREAT | O_EXCL, 0666); if (*fd_2 < 0) { close(*fd_0); close(*fd_1); unlink(file0); unlink(file1); pr_perror("Unable to open file %s", file1); return -1; } fstat(*fd_2, &buf); inodes[2] = buf.st_ino; return 0; } static int check_file_locks() { FILE *fp_locks = NULL; char buf[100]; long long fl_id = 0; char fl_flag[10], fl_type[15], fl_option[10]; pid_t fl_owner; int maj, min; unsigned long i_no; long long start; char end[32]; int num; int count = 3; fp_locks = fopen("/proc/locks", "r"); if (!fp_locks) return -1; test_msg("C: %d/%d/%d\n", inodes[0], inodes[1], inodes[2]); while (fgets(buf, sizeof(buf), fp_locks)) { test_msg("c: %s", buf); if (strstr(buf, "->")) continue; num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl_id, fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no, &start, end); if (num < 10) { pr_perror("Invalid 
lock info."); break; } if (i_no != inodes[0] && i_no != inodes[1] && i_no != inodes[2]) continue; if (!strcmp(m->fsname, "btrfs")) { if (MKKDEV(major(maj), minor(min)) != dev) continue; } else { if (makedev(maj, min) != dev) continue; } if (!strcmp(fl_flag, "FLOCK") && !strcmp(fl_type, "ADVISORY")) { if (!strcmp(fl_option, "READ")) count--; else if (!strcmp(fl_option, "WRITE")) count--; } if (!strcmp(fl_flag, "FLOCK") && !strcmp(fl_type, "MSNFS") && !strcmp(fl_option, "READ")) count--; memset(fl_flag, 0, sizeof(fl_flag)); memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); } fclose(fp_locks); /* * If we find all three matched file locks, count would be 0, * return 0 for success. */ return count; } int main(int argc, char **argv) { int fd_0, fd_1, fd_2; test_init(argc, argv); m = get_cwd_mnt_info(); if (!m) { pr_perror("Can't fetch mountinfo"); return -1; } if (!strcmp(m->fsname, "btrfs")) m->s_dev = kdev_to_odev(m->s_dev); if (open_all_files(&fd_0, &fd_1, &fd_2)) return -1; flock(fd_0, LOCK_SH); flock(fd_1, LOCK_EX); flock(fd_2, LOCK_MAND | LOCK_READ); test_daemon(); test_waitsig(); if (check_file_locks()) fail("Flock file locks check failed"); else pass(); close(fd_0); close(fd_1); close(fd_2); unlink(file0); unlink(file1); unlink(file2); return 0; } criu-3.6/test/zdtm/static/file_locks01.desc000066400000000000000000000000521317335042600206640ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_locks02.c000066400000000000000000000033151317335042600201760ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that 'shared' flocks work"; const char *test_author = "Pavel Emelyanov "; char *filename; TEST_OPTION(filename, string, "file name", 1); static int check_file_locks(pid_t child) { FILE *fp_locks = NULL; char buf[100], fl_flag[16], fl_type[16], fl_option[16]; pid_t pid = getpid(); int found = 0, num, 
fl_owner; fp_locks = fopen("/proc/locks", "r"); if (!fp_locks) return -1; test_msg("C: %d\n", pid); while (fgets(buf, sizeof(buf), fp_locks)) { test_msg("c: %s", buf); if (strstr(buf, "->")) continue; num = sscanf(buf, "%*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; } if (fl_owner != pid && fl_owner != child) continue; if (!strcmp(fl_flag, "FLOCK") && !strcmp(fl_type, "ADVISORY") && !strcmp(fl_option, "WRITE")) found++; memset(fl_flag, 0, sizeof(fl_flag)); memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); } fclose(fp_locks); return found == 1; } int main(int argc, char **argv) { int fd, pid; test_init(argc, argv); fd = open(filename, O_CREAT | O_RDWR, 0600); if (fd < 0) { pr_perror("No file"); return -1; } flock(fd, LOCK_EX); pid = fork(); if (pid == 0) { test_waitsig(); exit(0); } test_daemon(); test_waitsig(); if (check_file_locks(pid)) pass(); else fail("Flock file locks check failed"); kill(pid, SIGTERM); waitpid(pid, NULL, 0); close(fd); unlink(filename); return 0; } criu-3.6/test/zdtm/static/file_locks02.desc000066400000000000000000000000521317335042600206650ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_locks03.c000066400000000000000000000033531317335042600202010ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that 'inherited' flocks work"; const char *test_author = "Pavel Emelyanov "; char *filename; TEST_OPTION(filename, string, "file name", 1); static int check_file_locks(int alt_pid) { FILE *fp_locks = NULL; char buf[100], fl_flag[16], fl_type[16], fl_option[16]; pid_t pid = getpid(); int found = 0, num, fl_owner; fp_locks = fopen("/proc/locks", "r"); if (!fp_locks) return -1; test_msg("C: %d/%d\n", pid, alt_pid); while (fgets(buf, sizeof(buf), fp_locks)) { test_msg("c: %s", buf); if 
(strstr(buf, "->")) continue; num = sscanf(buf, "%*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; } if (fl_owner != pid && fl_owner != alt_pid) continue; if (!strcmp(fl_flag, "FLOCK") && !strcmp(fl_type, "ADVISORY") && !strcmp(fl_option, "WRITE")) found++; memset(fl_flag, 0, sizeof(fl_flag)); memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); } fclose(fp_locks); return found == 1; } int main(int argc, char **argv) { int fd, pid; test_init(argc, argv); fd = open(filename, O_CREAT | O_RDWR, 0600); if (fd < 0) { pr_perror("No file"); return -1; } flock(fd, LOCK_EX); pid = fork(); if (pid == 0) { test_waitsig(); exit(0); } close(fd); test_daemon(); test_waitsig(); if (check_file_locks(pid)) pass(); else fail("Flock file locks check failed"); kill(pid, SIGTERM); waitpid(pid, NULL, 0); close(fd); unlink(filename); return 0; } criu-3.6/test/zdtm/static/file_locks03.desc000066400000000000000000000000521317335042600206660ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_locks04.c000066400000000000000000000043611317335042600202020ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that 'overlapping' flocks work"; const char *test_author = "Pavel Emelyanov "; char *filename; TEST_OPTION(filename, string, "file name", 1); static int check_file_locks(pid_t child_pid, int fd, int child_fd) { char path[PATH_MAX]; FILE *fp_locks = NULL; char buf[100], fl_flag[16], fl_type[16], fl_option[16]; int found = 0, num, fl_owner; sprintf(path, "/proc/%d/fdinfo/%d", child_pid, child_fd); fp_locks = fopen(path, "r"); if (!fp_locks) { pr_err("Can't open %s\n", path); return -1; } while (fgets(buf, sizeof(buf), fp_locks)) { if (strncmp(buf, "lock:\t", 6) != 0) continue; test_msg("c: %s", buf); num = sscanf(buf, "%*s %*d:%s %s %s 
%d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; } if (fl_owner != child_pid && fl_owner != getpid()) { pr_err("Wrong owner\n"); continue; } if (!strcmp(fl_flag, "FLOCK") && !strcmp(fl_type, "ADVISORY") && !strcmp(fl_option, "WRITE")) found++; memset(fl_flag, 0, sizeof(fl_flag)); memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); } fclose(fp_locks); if (flock(fd, LOCK_EX | LOCK_NB) == 0) return 0; return found == 1; } int main(int argc, char **argv) { int fd, child_fd, pid; test_init(argc, argv); fd = child_fd = open(filename, O_CREAT | O_RDWR, 0600); if (fd < 0) { pr_perror("No file"); return -1; } flock(fd, LOCK_EX); pid = fork(); if (pid == 0) { test_waitsig(); exit(0); } close(fd); fd = open(filename, O_RDONLY); if (fd < 0) { pr_perror("No file 2"); kill(pid, SIGTERM); waitpid(pid, NULL, 0); return -1; } if (flock(fd, LOCK_EX | LOCK_NB) == 0) { pr_perror("Bogus locks"); kill(pid, SIGTERM); waitpid(pid, NULL, 0); return -1; } test_daemon(); test_waitsig(); if (check_file_locks(pid, fd, child_fd) > 0) pass(); else fail("Flock file locks check failed"); kill(pid, SIGTERM); waitpid(pid, NULL, 0); close(fd); unlink(filename); return 0; } criu-3.6/test/zdtm/static/file_locks04.desc000066400000000000000000000001041317335042600206650ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks', 'feature': 'fdinfo_lock'} criu-3.6/test/zdtm/static/file_locks05.c000066400000000000000000000015111317335042600201750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Sanity check for criu lock-test quirk"; const char *test_author = "Pavel Emelyanov "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { int fd, fd2; test_init(argc, argv); fd = open(filename, O_CREAT | O_RDWR, 0600); if (fd < 0) { pr_perror("No file"); return -1; } fd2 = 
open(filename, O_RDWR); if (fd2 < 0) { pr_perror("No file2"); return -1; } flock(fd, LOCK_SH); test_daemon(); test_waitsig(); if (flock(fd2, LOCK_SH) == 0) pass(); else fail("Flock file locks check failed (%d)", errno); close(fd); close(fd2); unlink(filename); return 0; } criu-3.6/test/zdtm/static/file_locks05.desc000066400000000000000000000000521317335042600206700ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_locks06.c000066400000000000000000000022301317335042600201750ustar00rootroot00000000000000#include #include #include #include "ofd_file_locks.h" #include "zdtmtst.h" const char *test_doc = "Check that OFD lock for the whole file is restored"; const char *test_author = "Begunkov Pavel "; char *filename; TEST_OPTION(filename, string, "file name", 1); int init_lock(int *fd, struct flock64 *lck) { *fd = open(filename, O_RDWR | O_CREAT, 0666); if (*fd < 0) { pr_perror("Can't open file"); return -1; } lck->l_type = F_WRLCK; lck->l_whence = SEEK_SET; lck->l_start = 0; lck->l_len = 0; lck->l_pid = 0; if (fcntl(*fd, F_OFD_SETLK, lck) < 0) { pr_perror("Can't set ofd lock"); return -1; } return 0; } void cleanup(int *fd) { if (close(*fd)) pr_perror("Can't close fd\n"); if (unlink(filename)) pr_perror("Can't unlink file\n"); } int main(int argc, char **argv) { int fd; struct flock64 lck; test_init(argc, argv); if (init_lock(&fd, &lck)) return 1; test_daemon(); test_waitsig(); if (check_file_lock_restored(getpid(), fd, &lck) || check_lock_exists(filename, &lck) < 0) fail("OFD file locks check failed\n"); else pass(); cleanup(&fd); return 0; } criu-3.6/test/zdtm/static/file_locks06.checkskip000077500000000000000000000006401317335042600217250ustar00rootroot00000000000000#!/usr/bin/env python2 import fcntl import tempfile import struct import errno F_OFD_SETLK = 37 try: with tempfile.TemporaryFile() as fd: flock = struct.pack('hhllhh', fcntl.F_RDLCK, 0, 0, 0, 0, 0) fcntl.fcntl(fd.fileno(), F_OFD_SETLK, flock) except 
IOError as e: if e.errno == errno.EINVAL: print "I/O error({0}): {1}".format(e.errno, e.strerror) print "OFD locks are not supported." exit(1) exit(0) criu-3.6/test/zdtm/static/file_locks06.desc000066400000000000000000000000521317335042600206710ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_locks07.c000066400000000000000000000035221317335042600202030ustar00rootroot00000000000000#include #include #include #include "ofd_file_locks.h" #include "zdtmtst.h" const char *test_doc = "Check that 'overlapping' OFD read locks work"; const char *test_author = "Begunkov Pavel "; char *filename; TEST_OPTION(filename, string, "file name", 1); #define FILE_NUM 4 static int fds[FILE_NUM]; static struct flock64 lcks[FILE_NUM]; static short types[] = {F_RDLCK, F_RDLCK, F_RDLCK, F_RDLCK}; static off_t starts[] = {0, 10, 0, 70}; static off_t lens[] = {20, 30, 100, 200}; void fill_lock(struct flock64 *lock, off_t start, off_t len, short int type) { lock->l_start = start; lock->l_len = len; lock->l_type = type; lock->l_whence = SEEK_SET; lock->l_pid = 0; } int init_file_locks(void) { size_t i; for (i = 0; i < FILE_NUM; ++i) fill_lock(&lcks[i], starts[i], lens[i], types[i]); for (i = 0; i < FILE_NUM; ++i) { fds[i] = open(filename, O_RDWR | O_CREAT, 0666); if (fds[i] < 0) { pr_perror("Can't open file"); return -1; } } for (i = 0; i < FILE_NUM; ++i) if (fcntl(fds[i], F_OFD_SETLKW, &lcks[i]) < 0) { pr_perror("Can't set ofd lock"); return -1; } return 0; } void cleanup(void) { size_t i; for (i = 0; i < FILE_NUM; ++i) if (close(fds[i])) pr_perror("Can't close fd\n"); if (unlink(filename)) pr_perror("Can't unlink file failed\n"); } int check_file_locks_restored(void) { size_t i; int pid = getpid(); for (i = 0; i < FILE_NUM; ++i) { if (check_file_lock_restored(pid, fds[i], &lcks[i])) return -1; if (check_lock_exists(filename, &lcks[i]) < 0) return -1; } return 0; } int main(int argc, char **argv) { test_init(argc, argv); if 
(init_file_locks()) return -1; test_daemon(); test_waitsig(); if (check_file_locks_restored()) fail("OFD file locks check failed\n"); else pass(); cleanup(); return 0; } criu-3.6/test/zdtm/static/file_locks07.checkskip000077700000000000000000000000001317335042600260642file_locks06.checkskipustar00rootroot00000000000000criu-3.6/test/zdtm/static/file_locks07.desc000066400000000000000000000000521317335042600206720ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_locks08.c000066400000000000000000000033111317335042600202000ustar00rootroot00000000000000#include #include #include #include #include #include "ofd_file_locks.h" #include "zdtmtst.h" const char *test_doc = "Check that inherited OFD locks work"; const char *test_author = "Begunkov Pavel "; char *filename; TEST_OPTION(filename, string, "file name", 1); int init_file_lock(int *fd, struct flock64 *lck) { *fd = open(filename, O_RDWR | O_CREAT, 0666); if (*fd < 0) { pr_perror("Can't open file"); return -1; } lck->l_type = F_WRLCK; lck->l_whence = SEEK_SET; lck->l_start = 0; lck->l_len = 0; /* lock whole file */ lck->l_pid = 0; /* should be 0 for ofd lock */ if (fcntl(*fd, F_OFD_SETLKW, lck) < 0) { pr_perror("Can't set ofd lock"); return -1; } return 0; } void cleanup(int *fd) { if (close(*fd)) pr_perror("Can't close fd\n"); if (unlink(filename)) pr_perror("Can't unlink file\n"); } int main(int argc, char **argv) { int fd; int pid; int status; int ret = 0; task_waiter_t tw; struct flock64 lck; test_init(argc, argv); if (init_file_lock(&fd, &lck)) return -1; task_waiter_init(&tw); pid = fork(); if (pid == 0) { task_waiter_wait4(&tw, getppid()); if (check_file_lock_restored(getpid(), fd, &lck) || check_lock_exists(filename, &lck) < 0) ret = -1; exit(ret); } test_daemon(); test_waitsig(); task_waiter_complete_current(&tw); if (check_file_lock_restored(getpid(), fd, &lck) || check_lock_exists(filename, &lck) < 0) fail("OFD file locks check failed\n"); kill(pid, 
SIGTERM); ret = waitpid(pid, &status, 0); if (ret < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) fail("OFD file locks check failed\n"); else pass(); cleanup(&fd); return 0; } criu-3.6/test/zdtm/static/file_locks08.checkskip000077700000000000000000000000001317335042600260652file_locks06.checkskipustar00rootroot00000000000000criu-3.6/test/zdtm/static/file_locks08.desc000066400000000000000000000000521317335042600206730ustar00rootroot00000000000000{'flags': 'excl', 'opts': '--file-locks'} criu-3.6/test/zdtm/static/file_shared.c000066400000000000000000000037631317335042600201760ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" #define OFFSET 1000 #define OFFSET2 500 const char *test_doc = "Check shared struct file-s"; const char *test_author = "Andrey Vagin "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { pid_t pid; int fd, fd2, fd3, ret, status; off_t off; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT, 0644); if (fd == -1) return 1; fd2 = dup(fd); if (fd < 0) return 1; fd3 = open(filename, O_RDWR | O_CREAT, 0644); if (fd3 == -1) return 1; pid = test_fork(); if (pid == -1) return 1; else if (pid) { fcntl(fd2, F_SETFD, 1); test_daemon(); test_waitsig(); off = lseek(fd, OFFSET, SEEK_SET); if (off == (off_t) -1) return 1; off = lseek(fd3, OFFSET2, SEEK_SET); if (off == (off_t) -1) return 1; ret = kill(pid, SIGTERM); if (ret == -1) { pr_perror("kill() failed"); } ret = wait(&status); if (ret == -1) { pr_perror("wait() failed"); return 1; } if (!WIFEXITED(status) || WEXITSTATUS(status)) { fail("Child exited with non-zero status"); return 1; } off = lseek(fd2, 0, SEEK_CUR); if (off != OFFSET) { fail("offset1 fail\n"); return 1; } off = lseek(fd3, 0, SEEK_CUR); if (off != OFFSET2) { fail("offset2 fail\n"); return 1; } ret = fcntl(fd, F_GETFD, 0); if (ret != 0) { fail("fd cloexec broken\n"); return 1; } ret = fcntl(fd2, F_GETFD, 0); if (ret != 1) { 
fail("fd2 cloexec broken\n"); return 1; } } else { test_waitsig(); off = lseek(fd, 0, SEEK_CUR); if (off != OFFSET) { fail("offset3 fail\n"); return 1; } off = lseek(fd2, 0, SEEK_CUR); if (off != OFFSET) { fail("offset4 fail\n"); return 1; } off = lseek(fd3, 0, SEEK_CUR); if (off != OFFSET2) { fail("offset5 fail\n"); return 1; } return 0; } pass(); return 0; } criu-3.6/test/zdtm/static/fpu00.c000066400000000000000000000024671317335042600166630ustar00rootroot00000000000000#include #include "zdtmtst.h" const char *test_doc = "Start a calculation, leaving FPU in a certain state,\n" "before migration, continue after"; const char *test_author = "Pavel Emelianov "; #if defined(__i386__) || defined(__x86_64__) void start(float a, float b, float c, float d) { __asm__ volatile ( "flds %0\n" "fadds %1\n" "flds %2\n" "fadds %3\n" "fmulp %%st(1)\n" : : "m" (a), "m" (b), "m" (c), "m" (d) ); } float finish(void) { float res; __asm__ volatile ( "fstps %0\n" : "=m" (res) ); return res; } int chk_proc_fpu(void) { unsigned long fi; __asm__ volatile ( "mov $1, %%eax\n" "cpuid\n" : "=d" (fi) : : "eax" ); return fi & (1 << 0); } #endif int main(int argc, char ** argv) { #if defined(__i386__) || defined(__x86_64__) float a, b, c, d; float res1, res2; #endif test_init(argc, argv); #if defined(__i386__) || defined(__x86_64__) if (!chk_proc_fpu()) { skip("FPU not supported"); return 1; } a = drand48(); b = drand48(); c = drand48(); d = drand48(); start(a, b, c, d); res1 = finish(); start(a, b, c, d); test_daemon(); test_waitsig(); res2 = finish(); if (res1 != res2) fail("%f != %f\n", res1, res2); else pass(); #else skip("Unsupported arch"); #endif return 0; } criu-3.6/test/zdtm/static/fpu00.desc000066400000000000000000000000231317335042600173410ustar00rootroot00000000000000{'arch': 'x86_64'} criu-3.6/test/zdtm/static/fpu01.c000066400000000000000000000040631317335042600166560ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" #if defined(__i386__) || 
defined(__x86_64__) #include "cpuid.h" const char *test_doc = "Test if FPU data in YMM registers do survive the c/r"; const char *test_author = "Cyrill Gorcunov "; static int verify_cpu(void) { unsigned int eax, ebx, ecx, edx; /* Do we have xsave? */ cpuid(1, &eax, &ebx, &ecx, &edx); if (!(ecx & (1u << 27))) return -1; /* Is YMM here? */ cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); if ((eax & (0x4)) != 0x4) return -1; return 0; } #define __aligned __attribute__((aligned(64))) /* * These are random strings generated by pwgen. */ static __aligned unsigned char ymm1[32 + 1] = "code9Ee5sohphie1ae1kaeMahngoh5oe"; static __aligned unsigned char ymm2[32 + 1] = "Tacuthahhien9Fi7aGhaa5toGh6vi7Ch"; static __aligned unsigned char ymm3[32 + 1]; static __aligned unsigned char ymm4[32 + 1]; static int fpu_test(void) { int ret = 0; asm volatile("vmovapd %0, %%ymm0 \n" : : "m" (*ymm1) : "memory"); asm volatile("vmovapd %0, %%ymm7 \n" : : "m" (*ymm2) : "memory"); test_daemon(); test_waitsig(); asm volatile("vmovapd %%ymm0, %0 \n" : "=m" (*ymm3) : : "memory"); asm volatile("vmovapd %%ymm7, %0 \n" : "=m" (*ymm4) : : "memory"); if (memcmp(ymm1, ymm3, 32) || memcmp(ymm2, ymm4, 32)) { test_msg("Data mismatch ('%s' '%s' '%s' '%s')\n", ymm1, ymm2, ymm3, ymm4); ret = -1; } else { test_msg("Data match ('%s' '%s' '%s' '%s')\n", ymm1, ymm2, ymm3, ymm4); ret = 0; } return ret; } static int bare_run(void) { test_msg("Your cpu doesn't support ymm registers, skipping\n"); test_daemon(); test_waitsig(); return 0; } int main(int argc, char *argv[]) { int ret = 0; test_init(argc, argv); ret = verify_cpu() ? 
bare_run() : fpu_test(); if (!ret) pass(); else fail(); return 0; } #else int main(int argc, char *argv[]) { test_init(argc, argv); skip("Unsupported arch"); return 0; } #endif criu-3.6/test/zdtm/static/fpu01.desc000066400000000000000000000000231317335042600173420ustar00rootroot00000000000000{'arch': 'x86_64'} criu-3.6/test/zdtm/static/futex-rl.c000066400000000000000000000051371317335042600174740ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check the futex robust list c/r"; const char *test_author = "Cyrill Gorcunov "; struct args { task_waiter_t waiter; int result; }; static pid_t __gettid(void) { return syscall(__NR_gettid); } void *thread_fn(void *arg) { struct robust_list_head *head_orig = NULL, *head_new = NULL; size_t len_orig = 0, len_new = 0; struct args *args = arg; test_msg("Obtaining old RL\n"); if (syscall(__NR_get_robust_list, __gettid(), &head_orig, &len_orig)) { args->result = -1; fail("__NR_get_robust_list failed"); } test_msg("Complete\n"); task_waiter_complete(&args->waiter, 1); if (args->result == -1) goto out; task_waiter_wait4(&args->waiter, 2); test_msg("Obtaining new RL\n"); if (syscall(__NR_get_robust_list, __gettid(), &head_new, &len_new)) { args->result = -1; fail("__NR_get_robust_list failed"); } if (args->result == -1) goto out; if (head_orig != head_new || len_orig != len_new) { args->result = -1; fail("comparison failed"); } args->result = 0; out: return NULL; } int main(int argc, char **argv) { struct robust_list_head *head_orig = NULL, *head_new = NULL; size_t len_orig = 0, len_new = 0; pthread_t thread; struct args *args; test_init(argc, argv); args = (struct args *)mmap(NULL, sizeof(*args), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); if ((void *)args == MAP_FAILED) { fail("mmap failed\n"); exit(1); } test_msg("Obtaining old RL for thread-leader\n"); if (syscall(__NR_get_robust_list, __gettid(), &head_orig, &len_orig)) 
{ fail("__NR_get_robust_list failed"); exit(1); } task_waiter_init(&args->waiter); args->result = 0; test_msg("Createing thread\n"); if (pthread_create(&thread, NULL, thread_fn, (void *)args)) { fail("Can't create thread\n"); exit(1); } test_msg("Wait for thread work\n"); task_waiter_wait4(&args->waiter, 1); if (args->result == -1) { fail("thread failed\n"); exit(1); } test_msg("C/R cycle\n"); test_daemon(); test_waitsig(); task_waiter_complete(&args->waiter, 2); test_msg("Obtaining new RL for thread-leader\n"); if (syscall(__NR_get_robust_list, __gettid(), &head_new, &len_new)) { fail("__NR_get_robust_list failed"); exit(1); } if (head_orig != head_new || len_orig != len_new) { fail("comparison failed"); exit(1); } pthread_join(thread, NULL); if (args->result) fail(); else pass(); munmap((void *)args, sizeof(*args)); return 0; } criu-3.6/test/zdtm/static/futex.c000066400000000000000000000033601317335042600170550ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check (via pthread/NPTL) that futeces behave through migration"; const char *test_author = "Pavel Emelianov "; volatile int kid_passed; void *thread_fn(void *lock) { pthread_mutex_t *mutex; mutex = (pthread_mutex_t *)lock; pthread_mutex_lock(mutex); kid_passed++; pthread_mutex_unlock(mutex); return NULL; } #define DEF_NUM_THREADS 10 #define MAX_NUM_THREADS 50 int num_threads = DEF_NUM_THREADS; TEST_OPTION(num_threads, int, "number of threads " "(default " __stringify(DEF_NUM_THREADS) " maximum " __stringify(MAX_NUM_THREADS) ")", 0); int main(int argc, char **argv) { int i; pthread_t thr[num_threads]; pthread_mutex_t m; test_init(argc, argv); if (num_threads > MAX_NUM_THREADS) { pr_perror("%d threads it too much. 
max is %d", num_threads, MAX_NUM_THREADS); goto out; } pthread_mutex_init(&m, NULL); pthread_mutex_lock(&m); for (i = 0; i < num_threads; i++) if (pthread_create(&thr[i], NULL, thread_fn, &m)) { pr_perror("Can't create %d'th thread", i + 1); goto out_kill; } kid_passed = 0; test_daemon(); test_waitsig(); sleep(1); if (kid_passed != 0) fail("some kids broke through\n"); pthread_mutex_unlock(&m); for (i = 0; i < num_threads; i++) pthread_join(thr[i], NULL); if (pthread_mutex_trylock(&m)) { if (errno == EBUSY) fail("kids left my mutex locked\n"); else pr_perror("kids spoiled my mutex"); } if (kid_passed != num_threads) fail("some kids died during migration\n"); pass(); out: return 0; out_kill: for (i--; i >= 0; i--) { pthread_kill(thr[i], SIGKILL); pthread_join(thr[i], NULL); } goto out; } criu-3.6/test/zdtm/static/get_smaps_bits.c000066400000000000000000000047311317335042600207300ustar00rootroot00000000000000#include #include #include "zdtmtst.h" #ifndef MAP_HUGETLB # define MAP_HUGETLB 0x40000 #endif #ifndef MADV_HUGEPAGE # define MADV_HUGEPAGE 14 #endif #ifndef MADV_NOHUGEPAGE # define MADV_NOHUGEPAGE 15 #endif #ifndef MADV_DONTDUMP # define MADV_DONTDUMP 16 #endif static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; if (!buf[0]) return; tok = strtok(buf, " \n"); if (!tok) return; #define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1]) do { /* mmap() block */ if (_vmflag_match(tok, "gd")) *flags |= MAP_GROWSDOWN; else if (_vmflag_match(tok, "lo")) *flags |= MAP_LOCKED; else if (_vmflag_match(tok, "nr")) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; /* madvise() block */ if (_vmflag_match(tok, "sr")) *madv |= (1ul << MADV_SEQUENTIAL); else if (_vmflag_match(tok, "rr")) *madv |= (1ul << MADV_RANDOM); else if (_vmflag_match(tok, "dc")) *madv |= (1ul << MADV_DONTFORK); else if (_vmflag_match(tok, "dd")) *madv |= (1ul << MADV_DONTDUMP); else if (_vmflag_match(tok, "mg")) *madv |= (1ul 
<< MADV_MERGEABLE); else if (_vmflag_match(tok, "hg")) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); /* * Anything else is just ignored. */ } while ((tok = strtok(NULL, " \n"))); #undef _vmflag_match } #define is_hex_digit(c) \ (((c) >= '0' && (c) <= '9') || \ ((c) >= 'a' && (c) <= 'f') || \ ((c) >= 'A' && (c) <= 'F')) static int is_vma_range_fmt(char *line, unsigned long *start, unsigned long *end) { char *p = line; while (*line && is_hex_digit(*line)) line++; if (*line++ != '-') return 0; while (*line && is_hex_digit(*line)) line++; if (*line++ != ' ') return 0; sscanf(p, "%lx-%lx", start, end); return 1; } int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv) { unsigned long start = 0, end = 0; FILE *smaps = NULL; char buf[1024]; int found = 0; if (!where) return 0; smaps = fopen("/proc/self/smaps", "r"); if (!smaps) { pr_perror("Can't open smaps"); return -1; } while (fgets(buf, sizeof(buf), smaps)) { is_vma_range_fmt(buf, &start, &end); if (!strncmp(buf, "VmFlags: ", 9) && start == where) { found = 1; parse_vmflags(buf, flags, madv); break; } } fclose(smaps); if (!found) { pr_perror("VmFlags not found for %lx", where); return -1; } return 0; } criu-3.6/test/zdtm/static/get_smaps_bits.h000066400000000000000000000002771317335042600207360ustar00rootroot00000000000000#ifndef ZDTM_GET_SMAPS_BITS_H_ #define ZDTM_GET_SMAPS_BITS_H_ extern int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv); #endif /* ZDTM_GET_SMAPS_BITS_H_ */ criu-3.6/test/zdtm/static/ghost_holes00.c000066400000000000000000000057141317335042600204050ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test ghost with one hole in the middle"; char *filename; TEST_OPTION(filename, string, "file name", 1); /* Buffer that is suitable for hole size */ #define BUFSIZE 4096 static unsigned 
char buf4k[BUFSIZE]; #ifndef SEEK_DATA #define SEEK_DATA 3 #define SEEK_HOLE 4 #endif #ifdef HEAD_HOLE #define HH 1 #else #define HH 0 #endif #ifdef TAIL_HOLE #define TH 1 #else #define TH 0 #endif #define DATA1_BLK (HH) #define DATA1_OFF (DATA1_BLK * BUFSIZE) #define DATA2_BLK (HH + 2) #define DATA2_OFF (DATA2_BLK * BUFSIZE) #define FILE_BLOCKS (TH + HH + 1 /* mid hole */ + 2 /* data */) #define FILE_SIZE (FILE_BLOCKS * BUFSIZE) int main(int argc, char ** argv) { int fd; struct stat st; uint32_t crc; bool chk_hole = true; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); goto failed; } crc = ~0; datagen(buf4k, BUFSIZE, &crc); if (pwrite(fd, buf4k, BUFSIZE, DATA1_OFF) != BUFSIZE) { pr_perror("can't write data1"); goto failed; } crc = ~0; datagen(buf4k, BUFSIZE, &crc); if (pwrite(fd, buf4k, BUFSIZE, DATA2_OFF) != BUFSIZE) { pr_perror("can't write data2"); goto failed; } if (ftruncate(fd, FILE_SIZE)) { pr_perror("Can't fixup file size"); goto failed; } if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { test_msg("Won't check for hole\n"); chk_hole = false; } test_daemon(); test_waitsig(); if (fstat(fd, &st) < 0) { fail("can't stat after"); goto failed; } if (st.st_size != FILE_SIZE) { fail("file size changed to %ld", (long)st.st_size); goto failed; } test_msg("Blocks %u OK\n", FILE_BLOCKS); /* Data 1 */ if (pread(fd, buf4k, BUFSIZE, DATA1_OFF) != BUFSIZE) { fail("pread1 fail"); goto failed; } crc = ~0; if (datachk(buf4k, BUFSIZE, &crc)) { fail("datachk1 fail"); goto failed; } test_msg("Data @%u OK\n", DATA1_BLK); /* Data 2 */ if (pread(fd, buf4k, BUFSIZE, DATA2_OFF) != BUFSIZE) { fail("pread2 fail"); goto failed; } crc = ~0; if (datachk(buf4k, BUFSIZE, &crc)) { fail("datachk2 fail"); goto failed; } test_msg("Data @%u OK\n", DATA2_BLK); /* Hole */ if (chk_hole) { #ifdef HEAD_HOLE if (lseek(fd, 0, 
SEEK_HOLE) != 0) { fail("hh not found"); goto failed; } test_msg("Head hole OK\n"); #endif if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { fail("mh not found"); goto failed; } test_msg("Mid hole OK\n"); #ifdef TAIL_HOLE if (lseek(fd, DATA2_OFF, SEEK_HOLE) != DATA2_OFF + BUFSIZE) { fail("tail hole not found"); goto failed; } test_msg("Tail hole OK\n"); #endif } close(fd); pass(); return 0; failed: close(fd); return 1; } criu-3.6/test/zdtm/static/ghost_holes01.c000077700000000000000000000000001317335042600232142ghost_holes00.custar00rootroot00000000000000criu-3.6/test/zdtm/static/ghost_holes02.c000077700000000000000000000000001317335042600232152ghost_holes00.custar00rootroot00000000000000criu-3.6/test/zdtm/static/groups.c000066400000000000000000000020611317335042600172360ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that supplementary groups are supported"; const char *test_author = "Pavel Emelianov "; int main(int argc, char **argv) { int ng; unsigned int *grp, *grp2, i, max; test_init(argc, argv); ng = getgroups(0, NULL); if (ng < 0) { pr_perror("Can't get groups"); return -1; } grp = malloc((ng + 1) * sizeof(*grp)); ng = getgroups(ng, grp); if (ng < 0) { pr_perror("Can't get groups2"); return -1; } max = 0; for (i = 0; i < ng; i++) if (max < grp[i]) max = grp[i]; grp[ng++] = max + 1; if (setgroups(ng, grp) < 0) { pr_perror("Can't set groups"); return -1; } test_daemon(); test_waitsig(); grp2 = malloc(ng * sizeof(*grp2)); if (getgroups(ng, grp2) != ng) { fail("Nr groups changed"); return -1; } if (memcmp(grp, grp2, ng * sizeof(*grp))) { fail("Groups have changed"); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/groups.desc000066400000000000000000000000221317335042600177250ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/grow_map.c000066400000000000000000000032431317335042600175350ustar00rootroot00000000000000#include 
#include #include #include #include "zdtmtst.h" const char *test_doc = "Check that VMA-s with MAP_GROWSDOWN are restored correctly"; const char *test_author = "Andrew Vagin "; int main(int argc, char **argv) { char *start_addr, *fake_grow_down, *test_addr, *grow_down; volatile char *p; test_init(argc, argv); start_addr = mmap(NULL, PAGE_SIZE * 10, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (start_addr == MAP_FAILED) { pr_perror("Can't mal a new region"); return 1; } munmap(start_addr, PAGE_SIZE * 10); fake_grow_down = mmap(start_addr + PAGE_SIZE * 5, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_GROWSDOWN, -1, 0); if (fake_grow_down == MAP_FAILED) { pr_perror("Can't mal a new region"); return 1; } p = fake_grow_down; *p-- = 'c'; *p = 'b'; /* overlap the guard page of fake_grow_down */ test_addr = mmap(start_addr + PAGE_SIZE * 3, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (test_addr == MAP_FAILED) { pr_perror("Can't mal a new region"); return 1; } grow_down = mmap(start_addr + PAGE_SIZE * 2, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_GROWSDOWN, -1, 0); if (grow_down == MAP_FAILED) { pr_perror("Can't mal a new region"); return 1; } test_daemon(); test_waitsig(); munmap(test_addr, PAGE_SIZE); if (fake_grow_down[0] != 'c' || *(fake_grow_down - 1) != 'b') { fail("%c %c\n", fake_grow_down[0], *(fake_grow_down - 1)); return 1; } p = grow_down; *p-- = 'z'; *p = 'x'; pass(); return 0; } criu-3.6/test/zdtm/static/grow_map.desc000066400000000000000000000000241317335042600202230ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/grow_map02.c000066400000000000000000000027531317335042600177040ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that a few grow-down VMA-s are restored correctly"; const char *test_author = "Andrew Vagin "; int 
main(int argc, char **argv) { char *start_addr, *grow_down; test_init(argc, argv); start_addr = mmap(NULL, PAGE_SIZE * 10, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (start_addr == MAP_FAILED) { pr_perror("Can't mal a new region"); return 1; } munmap(start_addr, PAGE_SIZE * 10); grow_down = mmap(start_addr + PAGE_SIZE * 3, PAGE_SIZE * 3, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_GROWSDOWN, -1, 0); if (grow_down == MAP_FAILED) { pr_perror("Can't mal a new region"); return 1; } grow_down[0 * PAGE_SIZE] = 'x'; grow_down[1 * PAGE_SIZE] = 'y'; grow_down[2 * PAGE_SIZE] = 'z'; /* * Split the grow-down vma on three parts. * Only the irst one will have a guard page */ if (mprotect(grow_down + PAGE_SIZE, PAGE_SIZE, PROT_READ)) { pr_perror("Can't change set protection on a region of memory"); return 1; } test_daemon(); test_waitsig(); test_msg("%c %c %c\n", grow_down[0 * PAGE_SIZE], grow_down[1 * PAGE_SIZE], grow_down[2 * PAGE_SIZE]); if (grow_down[0 * PAGE_SIZE] != 'x') return 1; if (grow_down[1 * PAGE_SIZE] != 'y') return 1; if (grow_down[2 * PAGE_SIZE] != 'z') return 1; pass(); return 0; } criu-3.6/test/zdtm/static/grow_map02.desc000066400000000000000000000000241317335042600203650ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/grow_map03.c000066400000000000000000000017751317335042600177100ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that VMA-s with MAP_GROWSDOWN are restored correctly"; const char *test_author = "Andrew Vagin "; /* * This test case creates two consecutive grows down vmas with a hole * between them. 
*/ int main(int argc, char **argv) { char *start_addr, *addr1, *addr2; test_init(argc, argv); start_addr = mmap(NULL, PAGE_SIZE * 10, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (start_addr == MAP_FAILED) { pr_perror("Can't mal a new region"); return 1; } munmap(start_addr, PAGE_SIZE * 10); addr1 = mmap(start_addr + PAGE_SIZE * 5, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); addr2 = mmap(start_addr + PAGE_SIZE * 3, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); test_msg("%p %p\n", addr1, addr2); test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/grow_map03.desc000066400000000000000000000000241317335042600203660ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/helper_zombie_child.c000066400000000000000000000034101317335042600217050ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that a zombie with a helper parent is restored"; const char *test_author = "Tycho Andersen "; void setsid_and_fork(int sk) { siginfo_t infop; pid_t zombie; setsid(); zombie = fork(); if (zombie < 0) { fail("fork"); exit(1); } if (zombie == 0) exit(0); if (waitid(P_PID, zombie, &infop, WNOWAIT | WEXITED) < 0) { fail("waitid"); exit(1); } if (write(sk, &zombie, sizeof(zombie)) != sizeof(zombie)) { fail("write"); exit(1); } close(sk); exit(0); } int main(int argc, char **argv) { pid_t pid, zombie; int status, sk_pair[2]; if (setenv("ZDTM_NOREAP", "1", 1) < 0) { fail("setenv"); return 1; } test_init(argc, argv); if (socketpair(PF_LOCAL, SOCK_STREAM, 0, sk_pair)) { pr_perror("socketpair"); return 1; } pid = fork(); if (pid < 0) { fail("fork"); return 1; } if (pid == 0) { close(sk_pair[0]); setsid_and_fork(sk_pair[1]); } close(sk_pair[1]); if (read(sk_pair[0], &zombie, sizeof(zombie)) != sizeof(zombie)) { fail("read"); kill(pid, 
SIGKILL); return 1; } if (waitpid(pid, &status, 0) < 0) { fail("waitpid"); return 1; } if (!WIFEXITED(status) || WEXITSTATUS(status)) { fail("setsid_and_fork"); return 1; } if (kill(zombie, 0) < 0) { fail("zombie already dead?"); return 1; } test_daemon(); test_waitsig(); /* XXX: we don't restore zombies with the right uid right now; they're all root */ if (kill(zombie, 0) < 0 && errno != EPERM) { fail("zombie didn't survive restore"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/helper_zombie_child.desc000066400000000000000000000000251317335042600224000ustar00rootroot00000000000000{'flavor': 'ns uns'} criu-3.6/test/zdtm/static/inotify00.c000066400000000000000000000125001317335042600175370ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check for inotify delivery"; const char *test_author = "Cyrill Gorcunov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define TEST_FILE "inotify-removed" #define TEST_LINK "inotify-hardlink" #define BUFF_SIZE ((sizeof(struct inotify_event) + PATH_MAX)) static void decode_event_mask(char *buf, size_t size, unsigned int mask) { static const char *names[32] = { [ 0] = "IN_ACCESS", [ 1] = "IN_MODIFY", [ 2] = "IN_ATTRIB", [ 3] = "IN_CLOSE_WRITE", [ 4] = "IN_CLOSE_NOWRITE", [ 5] = "IN_OPEN", [ 6] = "IN_MOVED_FROM", [ 7] = "IN_MOVED_TO", [ 8] = "IN_CREATE", [ 9] = "IN_DELETE", [10] = "IN_DELETE_SELF", [11] = "IN_MOVE_SELF", [13] = "IN_UNMOUNT", [14] = "IN_Q_OVERFLOW", [15] = "IN_IGNORED", [24] = "IN_ONLYDIR", [25] = "IN_DONT_FOLLOW", [26] = "IN_EXCL_UNLINK", [29] = "IN_MASK_ADD", [30] = "IN_ISDIR", [31] = "IN_ONESHOT", }; size_t i, j; memset(buf, 0, size); for (i = 0, j = 0; i < 32 && j < size; i++) { if (!(mask & (1u << i))) continue; if (j) j += snprintf(&buf[j], size - j, " | %s", names[i]); else j += snprintf(&buf[j], size - 
j, "%s", names[i]); } } static int inotify_read_events(char *prefix, int inotify_fd, unsigned int *expected) { struct inotify_event *event; char buf[BUFF_SIZE * 8]; int ret, off, n = 0; while (1) { ret = read(inotify_fd, buf, sizeof(buf)); if (ret < 0) { if (errno != EAGAIN) { pr_perror("Can't read inotify queue"); return -1; } else { ret = 0; goto out; } } else if (ret == 0) break; for (off = 0; off < ret; n++, off += sizeof(*event) + event->len) { char emask[128]; event = (void *)(buf + off); decode_event_mask(emask, sizeof(emask), event->mask); test_msg("\t%-16s: event %#10x -> %s\n", prefix, event->mask, emask); if (expected) *expected &= ~event->mask; } } out: test_msg("\t%-16s: read %2d events\n", prefix, n); return ret; } int main (int argc, char *argv[]) { unsigned int mask = IN_DELETE | IN_CLOSE_WRITE | IN_DELETE_SELF | IN_CREATE; char test_file_path[PATH_MAX]; int fd, real_fd; unsigned int emask; test_init(argc, argv); if (mkdir(dirname, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)) { pr_perror("Can't create directory %s", dirname); exit(1); } #ifdef INOTIFY01 { pid_t pid; task_waiter_t t; task_waiter_init(&t); static char buf[PATH_MAX]; if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) { pr_perror("Unable to remount /"); return 1; } pid = fork(); if (pid < 0) { pr_perror("Can't fork a test process"); exit(1); } if (pid == 0) { int fd; prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0); if (unshare(CLONE_NEWNS)) { pr_perror("Unable to unshare mount namespace"); exit(1); } if (mount("zdtm", dirname, "tmpfs", 0, NULL)) { pr_perror("Unable to mount tmpfs"); exit(1); } fd = open(dirname, O_RDONLY); if (fd < 0) { pr_perror("Unable to open %s", dirname); exit(1); } dup2(fd, 100); task_waiter_complete_current(&t); while (1) sleep(1000); exit(1); } task_waiter_wait4(&t, pid); snprintf(buf, sizeof(buf), "/proc/%d/fd/100", pid); dirname = buf; } #endif fd = inotify_init1(IN_NONBLOCK); if (fd < 0) { pr_perror("inotify_init failed"); exit(1); } snprintf(test_file_path, 
sizeof(test_file_path), "%s/%s", dirname, TEST_FILE); real_fd = open(test_file_path, O_CREAT | O_TRUNC | O_RDWR, 0644); if (real_fd < 0) { pr_perror("Can't create %s", test_file_path); exit(1); } if (inotify_add_watch(fd, dirname, mask) < 0) { pr_perror("inotify_add_watch failed"); exit(1); } if (inotify_add_watch(fd, test_file_path, mask) < 0) { pr_perror("inotify_add_watch failed"); exit(1); } /* * At this moment we have a file inside testing * directory and a hardlink to it. The file and * hardlink are opened. */ #ifndef INOTIFY01 if (unlink(test_file_path)) { pr_perror("can't unlink %s", test_file_path); exit(1); } emask = IN_DELETE; inotify_read_events("unlink 02", fd, &emask); if (emask) { char emask_bits[128]; decode_event_mask(emask_bits, sizeof(emask_bits), emask); pr_perror("Unhandled events in emask %#x -> %s", emask, emask_bits); exit(1); } #endif test_daemon(); test_waitsig(); close(real_fd); emask = IN_CLOSE_WRITE; inotify_read_events("after", fd, &emask); if (emask) { char emask_bits[128]; decode_event_mask(emask_bits, sizeof(emask_bits), emask); fail("Unhandled events in emask %#x -> %s", emask, emask_bits); return 1; } #ifndef INOTIFY01 real_fd = open(test_file_path, O_CREAT | O_TRUNC | O_RDWR, 0644); if (real_fd < 0) { pr_perror("Can't create %s", test_file_path); exit(1); } close(real_fd); emask = IN_CREATE | IN_CLOSE_WRITE; inotify_read_events("after2", fd, &emask); if (emask) { char emask_bits[128]; decode_event_mask(emask_bits, sizeof(emask_bits), emask); fail("Unhandled events in emask %#x -> %s", emask, emask_bits); return 1; } #endif pass(); return 0; } criu-3.6/test/zdtm/static/inotify00.desc000066400000000000000000000000541317335042600202340ustar00rootroot00000000000000{'opts': '--link-remap', 'flags': 'nouser'} 
criu-3.6/test/zdtm/static/inotify01.c000077700000000000000000000000001317335042600215222inotify00.custar00rootroot00000000000000criu-3.6/test/zdtm/static/inotify01.desc000066400000000000000000000000731317335042600202360ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/inotify02.c000066400000000000000000000036231317335042600175470ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check for inotify file-handles storm"; const char *test_author = "Cyrill Gorcunov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); static int num_of_handles(int fd) { char path[64]; char buf[512]; int ret = 0; FILE *f; snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); f = fopen(path, "r"); if (!f) { pr_err("Can't open %s", path); return -1; } while (fgets(buf, sizeof(buf), f)) { if (memcmp(buf, "inotify ", 8)) continue; ret++; } fclose(f); return ret; } int main (int argc, char *argv[]) { const unsigned int mask = IN_DELETE | IN_CLOSE_WRITE | IN_DELETE_SELF | IN_CREATE; const int nr_dirs = 64; char temp[nr_dirs][16]; char path[PATH_MAX]; int fd, i; test_init(argc, argv); if (mkdir(dirname, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)) { pr_err("Can't create directory %s", dirname); exit(1); } fd = inotify_init1(IN_NONBLOCK); if (fd < 0) { pr_err("inotify_init failed"); exit(1); } for (i = 0; i < nr_dirs; i++) { snprintf(temp[i], sizeof(temp[0]), "d.%03d", i); snprintf(path, sizeof(path), "%s/%s", dirname, temp[i]); if (mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)) { pr_err("Can't create %s", path); exit(1); } if (inotify_add_watch(fd, path, mask) < 0) { pr_err("inotify_add_watch failed on %s", path); exit(1); } } test_daemon(); test_waitsig(); i = num_of_handles(fd); close(fd); if (i < nr_dirs) fail("Expected %d handles but got 
%d", nr_dirs, i); else pass(); return 0; } criu-3.6/test/zdtm/static/inotify02.desc000066400000000000000000000000241317335042600202330ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/inotify_irmap.c000066400000000000000000000026251317335042600205760ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check for irmap"; const char *test_author = "Pavel Emelyanov "; #define TDIR "/etc" char test_files[2][128] = {TDIR"/zdtm-test", TDIR"/zdtm-test1"}; #define BUFF_SIZE ((sizeof(struct inotify_event) + PATH_MAX)) int main (int argc, char *argv[]) { char buf[BUFF_SIZE]; int fd, wd, i; test_init(argc, argv); for (i = 0; i < 2; i++) { unlink(test_files[i]); if (creat(test_files[i], 0600) < 0) { pr_perror("Can't make test file"); exit(1); } } fd = inotify_init1(IN_NONBLOCK); if (fd < 0) { pr_perror("inotify_init failed"); goto err; } for (i = 0; i < 2; i++) { wd = inotify_add_watch(fd, test_files[i], IN_OPEN); if (wd < 0) { pr_perror("inotify_add_watch failed"); goto err; } } test_daemon(); test_waitsig(); for (i = 0; i < 2; i++) { memset(buf, 0, sizeof(buf)); wd = open(test_files[i], O_RDONLY); if (read(fd, buf, sizeof(buf)) <= 0) { fail("No events in queue"); goto err; } } close(wd); close(fd); for (i = 0; i < 2; i++) unlink(test_files[i]); pass(); return 0; err: for (i = 0; i < 2; i++) unlink(test_files[i]); return 1; } criu-3.6/test/zdtm/static/inotify_irmap.desc000066400000000000000000000001131317335042600212600ustar00rootroot00000000000000{'flags': 'suid', 'opts' : '--force-irmap --irmap-scan-path /zdtm/static'} criu-3.6/test/zdtm/static/inotify_irmap.hook000077500000000000000000000004571317335042600213200ustar00rootroot00000000000000#!/bin/sh umask 0000 [ "$1" = "--pre-restore" ] && { exit # emulate rsync rm -rf etc/zdtm-test touch etc/zdtm-test } [ "$1" = "--post-pre-dump" ] && { echo 'invalidate the irmap cache' 
mv etc/zdtm-test etc/zdtm-test2 mv etc/zdtm-test1 etc/zdtm-test mv etc/zdtm-test2 etc/zdtm-test1 } exit 0 criu-3.6/test/zdtm/static/inotify_system.c000066400000000000000000000212541317335042600210110ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Inotify on symlink should be checked"; #ifndef NODEL char filename[] = "file"; char linkname[] = "file.lnk"; const char *inot_dir = "./inotify"; #else char filename[] = "file.nodel"; char linkname[] = "file.nodel.lnk"; const char *inot_dir = "./inotify.nodel"; #endif #ifdef __NR_inotify_init #include #ifndef IN_DONT_FOLLOW /* Missed in SLES 10 header */ #define IN_DONT_FOLLOW 0x02000000 #endif #define EVENT_MAX 1024 /* size of the event structure, not counting name */ #define EVENT_SIZE (sizeof (struct inotify_event)) /* reasonable guess as to size of 1024 events */ #define EVENT_BUF_LEN (EVENT_MAX * (EVENT_SIZE + 16)) #define BUF_SIZE 256 #define min_value(a,b) (a #include typedef struct { int inot; int file; int link; int dir; } desc; void do_wait() { test_daemon(); test_waitsig(); } int createFiles(char *path, char *target, char *link) { int fd; fd = open(path,O_CREAT, 0644); if (fd < 0) { pr_perror("can't open %s", path); return -1; } close(fd); if (symlink(target, link) < 0) { pr_perror("can't symlink %s to %s", path, link); return -1; } return 0; } int addWatcher(int fd, const char *path) { int wd; wd = inotify_add_watch(fd, path, IN_ALL_EVENTS | IN_DONT_FOLLOW); if (wd < 0) { pr_perror("inotify_add_watch(%d, %s, IN_ALL_EVENTS) Failed, %s", fd, path, strerror(errno)); return -1; } return wd; } int fChmod(char *path) { if (chmod(path, 0755) < 0) { pr_perror("chmod(%s, 0755) Failed, %s", path, strerror(errno)); return -1; } return 0; } int fWriteClose(char *path) { int fd = open(path, O_RDWR | O_CREAT, 0700); if (fd == -1) { pr_perror("open(%s, O_RDWR|O_CREAT,0700) Failed, %s", path, 
strerror(errno)); return -1; } if (write(fd, "string", 7) == -1) { pr_perror("write(%d, %s, 1) Failed, %s", fd, path, strerror(errno)); return -1; } if (close(fd) == -1) { pr_perror("close(%s) Failed, %s", path, strerror(errno)); return -1; } return 0; } int fNoWriteClose(char *path) { char buf[BUF_SIZE]; int fd = open(path, O_RDONLY); if ( fd < 0 ) { pr_perror("open(%s, O_RDONLY) Failed, %s", path, strerror(errno)); return -1; } if (read(fd, buf, BUF_SIZE) == -1) { pr_perror("read error: %s", strerror(errno)); close(fd); return -1; } if (close(fd) == -1) { pr_perror("close(%s) Failed, %s", path, strerror(errno)); return -1; } return 0; } int fMove(char *from, char *to) { if (rename(from, to) == -1) { pr_perror("rename error (from: %s to: %s) : %s", from, to, strerror(errno)); return -1; } return 0; } desc init_env(const char *dir, char *file_path, char *link_path) { desc in_desc = {-1, -1, -1, -1}; if (mkdir(dir, 0777) < 0) { pr_perror("error in creating directory: %s, %s", dir, strerror(errno)); return in_desc; } in_desc.inot = inotify_init(); if (in_desc.inot < 0) { pr_perror("inotify_init () Failed, %s", strerror(errno)); rmdir(dir); return in_desc; } if (snprintf(file_path, BUF_SIZE, "%s/%s", dir, filename) >= BUF_SIZE) { pr_perror("filename %s is too long", filename); rmdir(dir); return in_desc; } if (snprintf(link_path, BUF_SIZE, "%s/%s", dir, linkname) >= BUF_SIZE) { pr_perror("filename %s is too long", linkname); rmdir(dir); return in_desc; } in_desc.dir = addWatcher(in_desc.inot, dir); if (createFiles(file_path, filename, link_path)) { return in_desc; } in_desc.link = addWatcher(in_desc.inot, link_path); in_desc.file = addWatcher(in_desc.inot, file_path); return in_desc; } int fDelete(char *path) { if (unlink(path) != 0) { pr_perror("unlink: (%s)", strerror(errno)); return -1; } return 0; } int fRemDir(const char *target) { if(rmdir(target)) { pr_perror("rmdir: (%s)", strerror(errno)); return -1; } return 0; } int test_actions(const char *dir, char 
*file_path, char *link_path) { if ( fChmod(link_path) == 0 && fWriteClose(link_path) == 0 && fNoWriteClose(link_path) == 0 && fMove(file_path, filename) == 0 && fMove(filename, file_path) == 0 #ifndef NODEL && fDelete(file_path) == 0 && fDelete(link_path) == 0 && fRemDir(dir) == 0 #endif ) { return 0; } return -1; } void dump_events(char *buf, int len) { int marker = 0; struct inotify_event *event; while (marker < len) { event = (struct inotify_event *) &buf[marker]; test_msg("\t%s (%x mask, %d len", handle_event(event->mask), event->mask, event->len); if (event->len) test_msg(", '%s' name", event->name); test_msg(")\n"); marker += EVENT_SIZE + event->len; } } int harmless(int mask) { switch (mask) { case IN_CLOSE_NOWRITE: case IN_ATTRIB: return 1; } return 0; } int errors(int exp_len, int len, char *etalon_buf, char *buf) { int marker=0; int error=0; while (marker < len){ struct inotify_event *event; struct inotify_event *exp_event; event = (struct inotify_event *) &buf[marker]; /* It's OK if some additional events are recevived */ if (marker < exp_len) exp_event = (struct inotify_event *) &etalon_buf[marker]; else { if (!harmless(event->mask)) { fail("got unexpected event %s (%x mask)\n", handle_event(event->mask), event->mask); error++; } goto next_event; } if (event->mask != exp_event->mask) { fail("Handled %s (%x mask), expected %s (%x mask)", handle_event(event->mask), event->mask, handle_event(exp_event->mask), exp_event->mask); error++; } if (event->len != exp_event->len) { fail("Incorrect length of field name."); error++; break; } else if (event->len && strncmp(event->name, exp_event->name, event->len)) { fail("Handled file name %s, expected %s", event->name, exp_event->name); error++; } next_event: marker += EVENT_SIZE + event->len; } return error; } int read_set(int inot_fd, char *event_set) { int len; if ((len = read(inot_fd, event_set, EVENT_BUF_LEN)) < 0) { pr_perror("read(%d, buf, %lu) Failed, errno=%d", inot_fd, (unsigned long)EVENT_BUF_LEN, errno); 
return -1; } return len; } void common_close(desc *descr) { if (descr->inot > 0) { close(descr->inot); descr->inot=-1; descr->file=-1; descr->dir=-1; descr->link=-1; } } int get_event_set(char *event_set, int wait) { int len; char link_path[BUF_SIZE]; char file_path[BUF_SIZE]; desc common_desc; common_desc = init_env(inot_dir, file_path, link_path); if ((common_desc.inot < 0) || (common_desc.file < 0) || \ (common_desc.dir < 0) || (common_desc.link < 0)) { common_close(&common_desc); return -1; } if(test_actions(inot_dir, file_path, link_path) < 0) { common_close(&common_desc); return -1; } if (wait) { do_wait(); } len = read_set(common_desc.inot, event_set); common_close(&common_desc); #ifdef NODEL if (! (fDelete(file_path) == 0 && fDelete(link_path) == 0 && fRemDir(inot_dir) == 0)) return -1; #endif return len; } int check(int len, char *event_set, int exp_len, char *etalon_event_set) { if ((exp_len < 0) || (len < 0)){ fail("Error in preparing event sets."); return -1; } if (len < exp_len) { fail("Events are lost. 
Read: %d, Expected: %d", len, exp_len); test_msg("expected events\n"); dump_events(etalon_event_set, exp_len); test_msg("real events\n"); dump_events(event_set, len); return -1; } if (errors(exp_len, len, etalon_event_set, event_set) == 0) { pass(); return 0; } return -1; } int main(int argc, char ** argv) { int exp_len=-1, len=-1; char etalon_event_set[EVENT_BUF_LEN]; char event_set[EVENT_BUF_LEN]; test_init(argc, argv); exp_len = get_event_set(etalon_event_set, 0); len = get_event_set(event_set, 1); if (check(len, event_set, exp_len, etalon_event_set)) { return 1; } return 0; } #else int main(int argc, char ** argv) { test_init(argc, argv); skip("Inotify not supported."); return 0; } #endif //__NR_inotify_init criu-3.6/test/zdtm/static/inotify_system.desc000066400000000000000000000000241317335042600214750ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/inotify_system_nodel.c000077700000000000000000000000001317335042600254122inotify_system.custar00rootroot00000000000000criu-3.6/test/zdtm/static/inotify_system_nodel.desc000066400000000000000000000000241317335042600226560ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/ipc_namespace.c000066400000000000000000000255121317335042600205140ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define CLONE_NEWIPC 0x08000000 extern int msgctl (int __msqid, int __cmd, struct msqid_ds *__buf); extern int semctl (int __semid, int __semnum, int __cmd, ...); extern int shmctl (int __shmid, int __cmd, struct shmid_ds *__buf); struct ipc_ids { int in_use; /* TODO: Check for 0 */ // unsigned short seq; // unsigned short seq_max; // struct rw_semaphore rw_mutex; // struct idr ipcs_idr; /* TODO */ }; struct ipc_ns { struct ipc_ids ids[3]; int sem_ctls[4]; // + int used_sems; // + int msg_ctlmax; // + int msg_ctlmnb; // + int msg_ctlmni; // + int msg_bytes; // + int msg_hdrs; // + int auto_msgmni; // + int 
msg_next_id; // + int sem_next_id; // + int shm_next_id; // + size_t shm_ctlmax; size_t shm_ctlall; int shm_ctlmni; int shm_tot; int shm_rmid_forced; // struct vfsmount *mq_mnt; // unsigned int mq_queues_count; unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ unsigned int mq_msg_default; /* initialized to DFLT_MSG */ unsigned int mq_msgsize_default; /* initialized to DFLT_MSGSIZE */ struct user_ns *user_ns; }; #define IPC_SEM_IDS 0 #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 const char *test_doc = "Check that ipc ns context migrated successfully"; const char *test_author = "Stanislav Kinsbursky "; struct ipc_ns ipc_before, ipc_after; static int read_ipc_sysctl(char *name, int *data, size_t size) { int fd; int ret; char buf[32]; fd = open(name, O_RDONLY); if (fd < 0) { pr_perror("Can't open %s", name); return fd; } ret = read(fd, buf, 32); if (ret < 0) { pr_perror("Can't read %s", name); ret = -errno; goto err; } *data = (int)strtoul(buf, NULL, 10); ret = 0; err: close(fd); return ret; } static int get_messages_info(struct ipc_ns *ipc) { struct msginfo info; int ret; ret = msgctl(0, MSG_INFO, (struct msqid_ds *)&info); if (ret < 0) { pr_perror("msgctl failed with %d", errno); return ret; } ipc->msg_ctlmax = info.msgmax; ipc->msg_ctlmnb = info.msgmnb; ipc->msg_ctlmni = info.msgmni; ipc->msg_bytes = info.msgtql; ipc->msg_hdrs = info.msgmap; ipc->ids[IPC_MSG_IDS].in_use = info.msgpool; if (read_ipc_sysctl("/proc/sys/kernel/auto_msgmni", &ipc->auto_msgmni, sizeof(ipc->auto_msgmni))) return -1; if (read_ipc_sysctl("/proc/sys/kernel/msg_next_id", &ipc->msg_next_id, sizeof(ipc->msg_next_id))) return -1; if (read_ipc_sysctl("/proc/sys/kernel/sem_next_id", &ipc->sem_next_id, sizeof(ipc->sem_next_id))) return -1; if (read_ipc_sysctl("/proc/sys/kernel/shm_next_id", &ipc->shm_next_id, sizeof(ipc->shm_next_id))) return -1; if 
(read_ipc_sysctl("/proc/sys/fs/mqueue/queues_max", (int *)&ipc->mq_queues_max, sizeof(ipc->mq_queues_max))) return -1; if (read_ipc_sysctl("/proc/sys/fs/mqueue/msg_max", (int *)&ipc->mq_msg_max, sizeof(ipc->mq_msg_max))) return -1; if (read_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_max", (int *)&ipc->mq_msgsize_max, sizeof(ipc->mq_msgsize_max))) return -1; if (read_ipc_sysctl("/proc/sys/fs/mqueue/msg_default", (int *)&ipc->mq_msg_default, sizeof(ipc->mq_msg_default))) return -1; if (read_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_default", (int *)&ipc->mq_msgsize_default, sizeof(ipc->mq_msgsize_default))) return -1; return 0; } static int get_semaphores_info(struct ipc_ns *ipc) { int err; struct seminfo info; err = semctl(0, 0, SEM_INFO, &info); if (err < 0) pr_perror("semctl failed with %d", errno); ipc->sem_ctls[0] = info.semmsl; ipc->sem_ctls[1] = info.semmns; ipc->sem_ctls[2] = info.semopm; ipc->sem_ctls[3] = info.semmni; ipc->used_sems = info.semaem; ipc->ids[IPC_SEM_IDS].in_use = info.semusz; return 0; } static int get_shared_memory_info(struct ipc_ns *ipc) { int ret; union { struct shminfo64 shminfo64; struct shm_info shminfo; struct shmid_ds shmid; } u; ret = shmctl(0, IPC_INFO, &u.shmid); if (ret < 0) pr_perror("semctl failed with %d", errno); ipc->shm_ctlmax = u.shminfo64.shmmax; ipc->shm_ctlall = u.shminfo64.shmall; ipc->shm_ctlmni = u.shminfo64.shmmni; ret = shmctl(0, SHM_INFO, &u.shmid); if (ret < 0) pr_perror("semctl failed with %d", errno); ipc->shm_tot = u.shminfo.shm_tot; ipc->ids[IPC_SHM_IDS].in_use = u.shminfo.used_ids; if (read_ipc_sysctl("/proc/sys/kernel/shm_rmid_forced", &ipc->shm_rmid_forced, sizeof(ipc->shm_rmid_forced))) return -1; return 0; } int fill_ipc_ns(struct ipc_ns *ipc) { int ret; ret = get_messages_info(ipc); if (ret < 0) { pr_perror("Failed to collect messages"); return ret; } ret = get_semaphores_info(ipc); if (ret < 0) { pr_perror("Failed to collect semaphores"); return ret; } ret = get_shared_memory_info(ipc); if (ret < 0) { 
pr_perror("Failed to collect shared memory"); return ret; } return 0; } static int rand_ipc_sysctl(char *name, unsigned int val) { int fd; int ret; char buf[32]; fd = open(name, O_WRONLY); if (fd < 0) { pr_perror("Can't open %s", name); return fd; } sprintf(buf, "%d\n", val); ret = write(fd, buf, strlen(buf)); if (ret < 0) { pr_perror("Can't write %u into %s", val, name); return -errno; } close(fd); return 0; } static int rand_ipc_sem(void) { int fd; int ret; char buf[128]; char *name = "/proc/sys/kernel/sem"; fd = open(name, O_WRONLY); if (fd < 0) { pr_perror("Can't open %s", name); return fd; } sprintf(buf, "%d %d %d %d\n", (unsigned)lrand48(), (unsigned)lrand48(), (unsigned)lrand48(), (unsigned)lrand48()); ret = write(fd, buf, 128); if (ret < 0) { pr_perror("Can't write %s: %d", name, errno); return -errno; } close(fd); return 0; } #define INT_MAX ((int)(~0U>>1)) static int rand_ipc_ns(void) { int ret; ret = rand_ipc_sem(); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/msgmax", (unsigned)lrand48()); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/msgmnb", (unsigned)lrand48()); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/msgmni", (unsigned)lrand48()); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/auto_msgmni", 0); if (!ret && (unsigned)lrand48() % 2) ret = rand_ipc_sysctl("/proc/sys/kernel/msg_next_id", (unsigned)lrand48() % ((unsigned)INT_MAX + 1)); if (!ret && (unsigned)lrand48() % 2) ret = rand_ipc_sysctl("/proc/sys/kernel/sem_next_id", (unsigned)lrand48() % ((unsigned)INT_MAX + 1)); if (!ret && (unsigned)lrand48() % 2) ret = rand_ipc_sysctl("/proc/sys/kernel/shm_next_id", (unsigned)lrand48() % ((unsigned)INT_MAX + 1)); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/shmmax", (unsigned)lrand48()); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/shmall", (unsigned)lrand48()); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/shmmni", (unsigned)lrand48()); if (!ret) ret = rand_ipc_sysctl("/proc/sys/kernel/shm_rmid_forced", (unsigned)lrand48() 
& 1); if (!ret) ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/queues_max", (((unsigned)lrand48()) % 1023) + 1); if (!ret) ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/msg_max", ((unsigned)lrand48() % 65536) + 1); if (!ret) ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_max", ((unsigned)lrand48() & (8192 * 128 - 1)) | 128); if (!ret) ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/msg_default", ((unsigned)lrand48() % 65536) + 1); if (!ret) ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_default", ((unsigned)lrand48() & (8192 * 128 - 1)) | 128); if (ret < 0) pr_perror("Failed to randomize ipc namespace tunables"); return ret; } static void show_ipc_entry(struct ipc_ns *old, struct ipc_ns *new) { int i; for (i = 0; i < 3; i++) { if (old->ids[i].in_use != new->ids[i].in_use) pr_perror("ids[%d].in_use differs: %d ---> %d", i, old->ids[i].in_use, new->ids[i].in_use); } for (i = 0; i < 4; i++) { if (old->sem_ctls[i] != new->sem_ctls[i]) pr_perror("sem_ctls[%d] differs: %d ---> %d", i, old->sem_ctls[i], new->sem_ctls[i]); } if (old->msg_ctlmax != new->msg_ctlmax) pr_perror("msg_ctlmax differs: %d ---> %d", old->msg_ctlmax, new->msg_ctlmax); if (old->msg_ctlmnb != new->msg_ctlmnb) pr_perror("msg_ctlmnb differs: %d ---> %d", old->msg_ctlmnb, new->msg_ctlmnb); if (old->msg_ctlmni != new->msg_ctlmni) pr_perror("msg_ctlmni differs: %d ---> %d", old->msg_ctlmni, new->msg_ctlmni); if (old->auto_msgmni != new->auto_msgmni) pr_perror("auto_msgmni differs: %d ---> %d", old->auto_msgmni, new->auto_msgmni); if (old->msg_next_id != new->msg_next_id) pr_perror("msg_next_id differs: %d ---> %d", old->msg_next_id, new->msg_next_id); if (old->sem_next_id != new->sem_next_id) pr_perror("sem_next_id differs: %d ---> %d", old->sem_next_id, new->sem_next_id); if (old->shm_next_id != new->shm_next_id) pr_perror("shm_next_id differs: %d ---> %d", old->shm_next_id, new->shm_next_id); if (old->shm_ctlmax != new->shm_ctlmax) pr_perror("shm_ctlmax differs: %zu ---> %zu", old->shm_ctlmax, 
new->shm_ctlmax); if (old->shm_ctlall != new->shm_ctlall) pr_perror("shm_ctlall differs: %zu ---> %zu", old->shm_ctlall, new->shm_ctlall); if (old->shm_ctlmni != new->shm_ctlmni) pr_perror("shm_ctlmni differs: %d ---> %d", old->shm_ctlmni, new->shm_ctlmni); if (old->shm_rmid_forced != new->shm_rmid_forced) pr_perror("shm_rmid_forced differs: %d ---> %d", old->shm_rmid_forced, new->shm_rmid_forced); if (old->mq_queues_max != new->mq_queues_max) pr_perror("mq_queues_max differs: %d ---> %d", old->mq_queues_max, new->mq_queues_max); if (old->mq_msg_max != new->mq_msg_max) pr_perror("mq_msg_max differs: %d ---> %d", old->mq_msg_max, new->mq_msg_max); if (old->mq_msgsize_max != new->mq_msgsize_max) pr_perror("mq_msgsize_max differs: %d ---> %d", old->mq_msgsize_max, new->mq_msgsize_max); if (old->mq_msg_default != new->mq_msg_default) pr_perror("mq_msg_default differs: %d ---> %d", old->mq_msg_default, new->mq_msg_default); if (old->mq_msgsize_default != new->mq_msgsize_default) pr_perror("mq_msgsize_default differs: %d ---> %d", old->mq_msgsize_default, new->mq_msgsize_default); } int main(int argc, char **argv) { int ret; test_init(argc, argv); ret = rand_ipc_ns(); if (ret) { pr_perror("Failed to randomize ipc ns before migration"); return -1; } ret = fill_ipc_ns(&ipc_before); if (ret) { pr_perror("Failed to collect ipc ns before migration"); return ret; } test_daemon(); test_waitsig(); ret = fill_ipc_ns(&ipc_after); if (ret) { pr_perror("Failed to collect ipc ns after migration"); return ret; } if (memcmp(&ipc_before, &ipc_after, sizeof(ipc_after))) { pr_perror("IPC's differ"); show_ipc_entry(&ipc_before, &ipc_after); return -EINVAL; } pass(); return 0; } criu-3.6/test/zdtm/static/ipc_namespace.desc000066400000000000000000000000431317335042600212000ustar00rootroot00000000000000{'flavor': 'ns', 'flags' : 'suid'} criu-3.6/test/zdtm/static/jobctl00.c000066400000000000000000000140251317335042600173370ustar00rootroot00000000000000#include #include #include #include 
#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that job control migrates correctly"; const char *test_author = "Roman Kagan "; #define JOBS_DEF 8 #define JOBS_MAX 64 unsigned int num_jobs = JOBS_DEF; TEST_OPTION(num_jobs, uint, "# \"jobs\" in a \"shell\" " "(default " __stringify(JOBS_DEF) ", max " __stringify(JOBS_MAX) ")", 0); #define PROCS_DEF 4 unsigned int num_procs = PROCS_DEF; TEST_OPTION(num_procs, uint, "# processes in a \"job\" " "(default " __stringify(PROCS_DEF) ")", 0); static const char wr_string[] = "All you need is love!\n"; static const char rd_string[] = "We all live in a yellow submarine\n"; static const char susp_char = '\032'; /* ^Z */ static volatile sig_atomic_t signo = 0; static void record_sig(int sig) { signo = sig; } static void record_and_raise_sig(int sig) { signo = sig; signal(sig, SIG_DFL); raise(sig); } static int wait4sig(int sig) { sigset_t mask, oldmask; sigemptyset(&mask); sigaddset(&mask, sig); sigaddset(&mask, SIGCHLD); /* to see our children die */ sigprocmask(SIG_BLOCK, &mask, &oldmask); while (!signo) sigsuspend (&oldmask); sigprocmask (SIG_UNBLOCK, &mask, NULL); return signo != sig; } static int is_fg(void) { pid_t pgid = getpgrp(); pid_t tcpgid = tcgetpgrp(1); return (pgid != -1) && (pgid == tcpgid); } static int reader(int sig) { char str[sizeof(rd_string) + 1]; return read(0, str, sizeof(str)) < 0 || strcmp(str, rd_string); } static int post_reader(int fd) { if (write(fd, rd_string, sizeof(rd_string) - 1) < 0) { fail("write failed: %m"); return -1; } return 0; } static int writer(int sig) { return write(1, wr_string, sizeof(wr_string) - 1) < 0; } static int post_writer(int fd) { char str[sizeof(wr_string) + 1]; if (read(0, str, sizeof(str)) < 0) { fail("read failed: %m"); return -1; } /* if (strcmp(str, wr_string)) { fail("read string mismatch"); return -1; } */ return 0; } static struct job_type { int sig; int (*action)(int sig); int (*post)(int fd); } job_types[] = { { SIGTTOU, 
writer, post_writer }, { SIGTTIN, reader, post_reader }, { SIGCONT, wait4sig, NULL }, }; static int process(int (*action)(int), int sig) { int ret; if (is_fg()) /* we must be in background on entry */ return 1; if (signal(sig, record_and_raise_sig) == SIG_ERR) return 2; kill(getppid(), SIGUSR2); /* tell the parent we're ready */ ret = action(sig); /* will be busy doing nothing for the duration of migration */ if (ret) return 3; if (!is_fg()) /* we must be in foreground now */ return 4; ret = signo != sig; /* have we got the desired signal? */ test_waitsig(); return ret; } static int job(int (*action)(int), int sig) { int i; if (setpgrp() < 0) return 1; for (i = num_procs; i; i--) { pid_t pid = fork(); if (pid < 0) kill(0, SIGKILL); /* kill the whole job */ if (pid == 0) /* the last is worker, others are sleepers */ exit(process(i == 1 ? action : wait4sig, sig)); /* wait for the child to grow up before going to next one * ignore return code as the child may get stopped and SIGCHILD * us */ wait4sig(SIGUSR2); signo = 0; /* rearm sighandler */ } kill(getppid(), SIGUSR2); /* tell the parent we're ready */ /* we (or our children) will get suspended somehow here, so the rest * will hopefully happen after migration */ for (i = num_procs; i; i--) { int ret; wait(&ret); if (!WIFEXITED(ret) || WEXITSTATUS(ret)) kill(0, SIGKILL); } return 0; } static int make_pty_pair(int *fdmaster, int *fdslave) { struct termios tio; if (openpty(fdmaster, fdslave, NULL, &tio, NULL) < 0) return -1; if (ioctl(*fdslave, TIOCSCTTY, NULL) < 0) return -1; tio.c_lflag |= (ICANON | ISIG | TOSTOP); if (tcsetattr(*fdslave, TCSANOW, &tio) < 0) return -1; return 0; } int start_jobs(pid_t *jobs, int njobs, int fdmaster, int fdslave) { int i; /* the children will signal readiness via SIGUSR2 or get stopped (or * exit :) and signal that via SIGCHLD */ if (signal(SIGUSR2, record_sig) == SIG_ERR || signal(SIGCHLD, record_sig) == SIG_ERR) { pr_perror("can't install signal handler"); return -1; } for (i = 0; i 
< njobs; i++) { int jtno = i % (sizeof(job_types) / sizeof(job_types[0])); jobs[i] = fork(); if (jobs[i] < 0) { /* we're busted - bail out */ pr_perror("fork failed"); goto killout; } if (jobs[i] == 0) { close(fdmaster); dup2(fdslave, 0); dup2(fdslave, 1); dup2(fdslave, 2); close(fdslave); exit(job(job_types[jtno].action, job_types[jtno].sig)); } /* wait for the child to grow up before proceeding */ wait4sig(SIGUSR2); signo = 0; /* rearm sighandler */ } return 0; killout: for (; i >= 0; i--) kill(-jobs[i], SIGKILL); return -1; } int finish_jobs(pid_t *jobs, int njobs, int fdmaster, int fdslave) { int i; for (i = num_jobs; i--; ) { int ret; int jtno = i % (sizeof(job_types) / sizeof(job_types[0])); if (tcsetpgrp(fdslave, jobs[i]) < 0) { fail("can't bring a job into foreground: %m"); goto killout; } kill(-jobs[i], SIGCONT); if (job_types[jtno].post && job_types[jtno].post(fdmaster)) goto killout; kill(-jobs[i], SIGTERM); waitpid(jobs[i], &ret, 0); if (!WIFEXITED(ret) || WEXITSTATUS(ret)) { fail("job didn't exit cleanly: %d", ret); goto killout; } } return 0; killout: for (; i >= 0; i--) kill(-jobs[i], SIGKILL); return -1; } int main(int argc, char ** argv) { int fdmaster, fdslave; pid_t jobs[JOBS_MAX] = {}; test_init(argc, argv); if (num_jobs > JOBS_MAX) { pr_perror("%d jobs is too many", num_jobs); exit(1); } if (make_pty_pair(&fdmaster, &fdslave) < 0) { pr_perror("can't make pty pair"); exit(1); } sleep(30); if (start_jobs(jobs, num_jobs, fdmaster, fdslave)) { pr_perror("failed to start jobs"); exit(1); } test_daemon(); test_waitsig(); if (finish_jobs(jobs, num_jobs, fdmaster, fdslave)) fail("failed to finish jobs"); else pass(); return 0; } 
criu-3.6/test/zdtm/static/lib/000077500000000000000000000000001317335042600163225ustar00rootroot00000000000000criu-3.6/test/zdtm/static/lib/criu-rtc.so000077700000000000000000000000001317335042600227162../criu-rtc.soustar00rootroot00000000000000criu-3.6/test/zdtm/static/link10.c000066400000000000000000000031241317335042600170160ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Migrate two hardlinked, open, and unlinked files"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd, fd2 = 0; struct stat stat, stat2; char filename2[256]; test_init(argc, argv); if (snprintf(filename2, sizeof(filename2), "%s.lnk", filename) >= sizeof(filename2)) { pr_perror("filename %s is too long", filename); exit(1); } fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } if (link(filename, filename2) < 0) { pr_perror("can't link %s to %s", filename, filename2); goto unlink; } fd2 = open(filename2, O_RDONLY); if (fd < 0) { pr_perror("can't open %s", filename2); goto unlink; } unlink(filename2); unlink(filename); test_daemon(); test_waitsig(); if (fstat(fd, &stat) < 0 || fstat(fd2, &stat2) < 0) { fail("fstat failed: %m"); goto out; } if (stat.st_ino != stat2.st_ino || stat.st_dev != stat2.st_dev) { fail("files are different: st_ino %lu != %lu or st_dev %lu != %lu", (long unsigned)stat.st_ino, (long unsigned)stat2.st_ino, (long unsigned)stat.st_dev, (long unsigned)stat2.st_dev); } pass(); out: close(fd); close(fd2); return 0; unlink: close(fd); close(fd2); unlink(filename2); unlink(filename); return 1; } criu-3.6/test/zdtm/static/loginuid.c000066400000000000000000000032051317335042600175320ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = 
"Check for /proc/self/loginuid restore"; const char *test_author = "Dmitry Safonov "; const char loginuid_self[] = "/proc/self/loginuid"; const uid_t test_value = 3; const uid_t INVALID_UID = (uid_t)-1; uid_t get_loginuid(const char *path, int *err) { int fd; ssize_t num; char buf[11]; *err = 0; fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("Failed to open %s", path); goto out; } num = read(fd, buf, 10); close(fd); if (num < 0) { pr_perror("Unable to read %s", path); goto out; } buf[num] = '\0'; return strtol(buf, NULL, 10); out: *err = -1; return 0; } int set_loginuid(const char *path, uid_t value) { int fd, ret = 0; char buf[11]; fd = open(path, O_RDWR); if (fd < 0) { pr_perror("Failed to open %s", path); return -1; } snprintf(buf, 11, "%u", value); if (write(fd, buf, 11) < 0) { pr_perror("Write %s to %s failed", buf, path); ret = -1; } close(fd); return ret; } int main(int argc, char *argv[]) { int ret; uid_t new_loginuid; /* unset before test */ if (set_loginuid(loginuid_self, INVALID_UID) < 0) return -1; test_init(argc, argv); if (set_loginuid(loginuid_self, test_value) < 0) return -1; test_daemon(); test_waitsig(); new_loginuid = get_loginuid(loginuid_self, &ret); if (ret < 0) return -1; if (new_loginuid != test_value) { fail("loginuid value %d is different after restore: %d\n", test_value, new_loginuid); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/loginuid.desc000066400000000000000000000000301317335042600202170ustar00rootroot00000000000000{'feature': 'loginuid'} criu-3.6/test/zdtm/static/macvlan.c000066400000000000000000000023641317335042600173460ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "check that macvlan interfaces are c/r'd correctly"; const char *test_author = "Tycho Andersen "; #define BRIDGE_NAME "zdtmbr0" #define IF_NAME "zdtmmvlan0" static bool wait_for_macvlan(void) { int i; for (i = 
0; i < 10; i++) { if (system("ip addr list dev " IF_NAME) == 0) return true; sleep(1); } return false; } int main(int argc, char **argv) { int ret = 1; test_init(argc, argv); if (!wait_for_macvlan()) { fail("failed to inject macvlan device\n"); return 1; } if (system("ip addr list dev " IF_NAME " > macvlan.dump.test")) { fail("can't save net config"); goto out; } test_daemon(); test_waitsig(); if (system("ip addr list dev " IF_NAME " > macvlan.rst.test")) { fail("can't get net config"); goto out; } if (system("diff macvlan.rst.test macvlan.dump.test")) { fail("Net config differs after restore"); goto out; } pass(); ret = 0; out: return ret; } criu-3.6/test/zdtm/static/macvlan.desc000066400000000000000000000004031317335042600200320ustar00rootroot00000000000000{ 'deps': [ '/bin/sh', '/usr/bin/sort', '/bin/grep', '/sbin/ip|/bin/ip', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns', 'ropts': '--external macvlan[zdtmmvlan0]:zdtmbr0'} criu-3.6/test/zdtm/static/macvlan.hook000077500000000000000000000012671317335042600200700ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--clean" -o "$1" == "--pre-restore" -o "$1" == "--post-start" ] || exit 0 if [ "$1" == "--post-start" ]; then set -e i=0 PIDF="zdtm/static/macvlan.pid.inprogress" while [ ! 
-f "$PIDF" ]; do i=$(($i+1)) if [ "$i" -eq "10" ]; then echo "failed to create macvlan test" exit 1 fi sleep 1 done TPID=$(cat $PIDF) ip link add zdtmbr0 type bridge ip addr add 10.0.55.55/32 dev zdtmbr0 ip link set zdtmbr0 up ip link add zdtmmvlan0 link zdtmbr0 type macvlan mode bridge ip addr add 10.0.55.56/32 dev zdtmmvlan0 ip link set zdtmmvlan0 netns $TPID else ip link del zdtmmvlan0 || true [ "$1" == "--clean" ] || exit 0 ip link del zdtmbr0 || true fi criu-3.6/test/zdtm/static/maps00.c000066400000000000000000000143541317335042600170270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Create all sorts of maps and compare /proc/pid/maps\n" "before and after migration\n"; const char *test_author = "Pavel Emelianov "; char *filename; TEST_OPTION(filename, string, "file name", 1); const static int map_prots[] = { PROT_NONE, PROT_READ, PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE | PROT_EXEC, }; #define NUM_MPROTS sizeof(map_prots) / sizeof(int) #define RW_PROT(x) ((x) & (PROT_READ | PROT_WRITE)) #define X_PROT(x) ((x) & PROT_EXEC) int check_prot(int src_prot, int dst_prot) { if (RW_PROT(src_prot) != RW_PROT(dst_prot)) return 0; /* If exec bit will be enabled may depend on NX capablity of CPUs of * source and destination nodes. 
In any case, migrated mapping should * not have less permissions than newly created one ** * A is a subset of B iff (A & B) == A */ return (X_PROT(dst_prot) & X_PROT(src_prot)) == X_PROT(dst_prot); } const static int map_flags[] = { MAP_PRIVATE, MAP_SHARED, MAP_PRIVATE | MAP_ANONYMOUS, MAP_SHARED | MAP_ANONYMOUS }; #define NUM_MFLAGS sizeof(map_flags) / sizeof(int) #define NUM_MAPS NUM_MPROTS * NUM_MFLAGS #define ONE_MAP_SIZE 0x2000 struct map { int prot; int prot_real; int flag; char filename[256]; int fd; void *ptr; }; static void init_map(struct map *map, int prot_no, int flag_no) { map->fd = -1; map->prot = map_prots[prot_no]; map->flag = map_flags[flag_no]; } static int make_map(struct map *map) { uint32_t crc; uint8_t buf[ONE_MAP_SIZE]; static int i = 0; if (!(map->flag & MAP_ANONYMOUS)) { /* need file */ if (snprintf(map->filename, sizeof(map->filename), "%s-%02d", filename, i++) >= sizeof(map->filename)) { pr_perror("filename %s is too long", filename); return -1; } map->fd = open(map->filename, O_RDWR | O_CREAT, 0600); if (map->fd < 0) { pr_perror("can't open %s", map->filename); return -1; } crc = ~0; datagen(buf, sizeof(buf), &crc); if (write(map->fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("failed to write %s", map->filename); return -1; } } map->ptr = mmap(NULL, ONE_MAP_SIZE, map->prot, map->flag, map->fd, 0); if (map->ptr == MAP_FAILED) { pr_perror("can't create mapping"); return -1; } if ((map->flag & MAP_ANONYMOUS) && (map->prot & PROT_WRITE)) { /* can't fill it with data otherwise */ crc = ~0; datagen(map->ptr, ONE_MAP_SIZE, &crc); } test_msg("map: ptr %p flag %8x prot %8x\n", map->ptr, map->flag, map->prot); return 0; } static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ static void segfault(int signo) { siglongjmp(segv_ret, 1); } /* * after test func should be placed check map, because size of test_func * is calculated as (check_map-test_func) */ int test_func() { return 1; } static int 
check_map(struct map *map) { int prot = PROT_WRITE | PROT_READ | PROT_EXEC; if (signal(SIGSEGV, segfault) == SIG_ERR) { fail("setting SIGSEGV handler failed: %m\n"); return -1; } if (!sigsetjmp(segv_ret, 1)) { uint32_t crc = ~0; if (datachk(map->ptr, ONE_MAP_SIZE, &crc)) /* perform read access */ if (!(map->flag & MAP_ANONYMOUS) || (map->prot & PROT_WRITE)) { /* anon maps could only be filled when r/w */ fail("CRC mismatch: ptr %p flag %8x prot %8x\n", map->ptr, map->flag, map->prot); return -1; } /* prot |= PROT_READ// need barrier before this line, because compiler change order commands. I finded one method: look at next lines*/ } else prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; if (signal(SIGSEGV, segfault) == SIG_ERR) { fail("setting SIGSEGV handler failed: %m\n"); return -1; } if (!sigsetjmp(segv_ret, 1)) { * (int *) (map->ptr) = 1234; /* perform write access */ } else prot &= !PROT_WRITE | PROT_READ | PROT_EXEC; if (signal(SIGSEGV, segfault) == SIG_ERR) { fail("restoring SIGSEGV handler failed: %m\n"); return -1; } if (!sigsetjmp(segv_ret, 1)) { if (map->prot & PROT_WRITE) { memcpy(map->ptr,test_func, getpagesize()); } else { if (!(map->flag & MAP_ANONYMOUS)) { lseek(map->fd,0,SEEK_SET); if (write(map->fd,test_func,check_map - test_func)filename); return -1; } } } if (!(map->flag & MAP_ANONYMOUS) || map->prot & PROT_WRITE) /* Function body has been copied into the mapping */ ((int (*)())map->ptr)(); /* perform exec access */ else /* No way to copy function body into mapping, * clear exec bit from effective protection */ prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; } else prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { fail("restoring SIGSEGV handler failed: %m\n"); return -1; } return prot; } static void destroy_map(struct map *map) { munmap(map->ptr, ONE_MAP_SIZE); if (map->fd >= 0) { close(map->fd); unlink(map->filename); } } #define MAPS_LEN 0x10000 int main(int argc, char ** argv) { struct map maps[NUM_MAPS] = {}, 
maps_compare[NUM_MAPS] = {}; int i, j, k; test_init(argc, argv); k = 0; for (i = 0; i < NUM_MPROTS; i++) for (j = 0; j < NUM_MFLAGS; j++) init_map(maps + k++, i, j); for (i = 0; i < NUM_MAPS; i++) if (make_map(maps + i)) goto err; test_daemon(); test_waitsig(); for (i = 0; i < NUM_MAPS; i++) if ((maps[i].prot_real=check_map(maps + i))<0) goto err; k=0; for (i = 0; i < NUM_MPROTS; i++) for (j = 0; j < NUM_MFLAGS; j++) init_map(maps_compare + k++, i, j); for (i = 0; i < NUM_MAPS; i++) if (make_map(maps_compare+ i)) goto err; for (i = 0; i < NUM_MAPS; i++) if ((maps_compare[i].prot_real=check_map(maps_compare + i))<0) goto err; for (i = 0; i< NUM_MAPS; i++) if (!check_prot(maps[i].prot_real, maps_compare[i].prot_real)){ fail("protection on %i (flag=%d prot=%d) maps has changed (prot=%d(expected %d))", i, maps[i].flag, maps[i].prot, maps[i].prot_real, maps_compare[i].prot_real); goto err; } pass(); for (i = 0; i < NUM_MAPS; i++) { destroy_map(maps + i); destroy_map(maps_compare + i); } return 0; err: return 1; } criu-3.6/test/zdtm/static/maps01.c000066400000000000000000000075071317335042600170320ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define MEM_SIZE (1LU << 30) #define MEM_OFFSET (1LU << 29) #define MEM_OFFSET2 (MEM_SIZE - PAGE_SIZE) #define MEM_OFFSET3 (20LU * PAGE_SIZE) const char *test_doc = "Test shared memory"; const char *test_author = "Andrew Vagin > 20); goto err; } p = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { pr_err("Failed to mmap %ld Mb shared anonymous R/W memory\n", MEM_SIZE >> 20); goto err; } p2 = mmap(NULL, MEM_OFFSET, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (p2 == MAP_FAILED) { pr_err("Failed to mmap %lu Mb anonymous memory\n", MEM_OFFSET >> 20); goto err; } pid = test_fork(); if (pid < 0) { pr_err("Fork failed with %d\n", pid); goto err; } else if (pid == 0) { void *p3; 
p3 = mmap(NULL, MEM_OFFSET3, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (p3 == MAP_FAILED) { pr_err("Failed to mmap %lu Mb anonymous R/W memory\n", MEM_OFFSET3 >> 20); goto err; } crc = ~0; datagen(m + MEM_OFFSET, PAGE_SIZE, &crc); crc = ~0; datagen(m + MEM_OFFSET2, PAGE_SIZE, &crc); crc = ~0; datagen(p + MEM_OFFSET + MEM_OFFSET3, PAGE_SIZE, &crc); crc = ~0; datagen(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); crc = ~0; datagen(p + MEM_OFFSET3, PAGE_SIZE, &crc); crc = ~0; datagen(p3, PAGE_SIZE, &crc); task_waiter_complete(&t, 1); test_waitsig(); crc = ~0; status = datachk(m + MEM_OFFSET, PAGE_SIZE, &crc); if (status) return 1; crc = ~0; status = datachk(m + MEM_OFFSET2, PAGE_SIZE, &crc); if (status) return 1; crc = ~0; status = datachk(m + PAGE_SIZE, PAGE_SIZE, &crc); if (status) return 1; crc = ~0; status = datachk(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); if (status) return 1; crc = ~0; status = datachk(p + MEM_OFFSET3, PAGE_SIZE, &crc); if (status) return 1; crc = ~0; status = datachk(p3, PAGE_SIZE, &crc); if (status) return 1; return 0; } task_waiter_wait4(&t, 1); munmap(p, MEM_OFFSET); p2 = mremap(p + MEM_OFFSET, MEM_OFFSET, MEM_OFFSET, MREMAP_FIXED | MREMAP_MAYMOVE, p2); if (p2 == MAP_FAILED) goto err; snprintf(path, PATH_MAX, "/proc/self/map_files/%lx-%lx", (unsigned long) m, (unsigned long) m + MEM_SIZE); fd = open(path, O_RDWR); if (fd == -1) { pr_perror("Can't open file %s", path); goto err; } m2 = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED, fd, MEM_OFFSET3); if (m2 == MAP_FAILED) { pr_perror("Can't map file %s", path); goto err; } close(fd); munmap(m, PAGE_SIZE); munmap(m + PAGE_SIZE * 10, PAGE_SIZE); munmap(m + MEM_OFFSET2, PAGE_SIZE); crc = ~0; datagen(m + PAGE_SIZE, PAGE_SIZE, &crc); crc = ~0; datagen(m2, PAGE_SIZE, &crc); test_daemon(); test_waitsig(); kill(pid, SIGTERM); wait(&status); if (WIFEXITED(status)) { if (WEXITSTATUS(status)) goto err; } else goto err; crc = ~0; if (datachk(m + MEM_OFFSET, 
PAGE_SIZE, &crc)) goto err; crc = ~0; if (datachk(m2, PAGE_SIZE, &crc)) goto err; crc = ~0; if (datachk(p2 + MEM_OFFSET3, PAGE_SIZE, &crc)) goto err; pass(); return 0; err: if (waitpid(-1, NULL, WNOHANG) == 0) { kill(pid, SIGTERM); wait(NULL); } return 1; } criu-3.6/test/zdtm/static/maps01.desc000066400000000000000000000000441317335042600175130ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/maps02.c000066400000000000000000000047601317335042600170310ustar00rootroot00000000000000#include #include "zdtmtst.h" #include "get_smaps_bits.h" #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif const char *test_doc = "Test shared memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { void *start; unsigned long orig_flags; unsigned long orig_madv; unsigned long new_flags; unsigned long new_madv; }; #define MEM_SIZE (8192) static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { pr_perror("mmap failed"); return -1; } if (madvise(m->start, MEM_SIZE, adv)) { if (errno == EINVAL) { test_msg("madvise failed, no kernel support\n"); munmap(m->start, MEM_SIZE); *m = (struct mmap_data){ }; } else { pr_perror("madvise failed"); return -1; } } return 0; } int main(int argc, char **argv) { struct mmap_data m[5] = { }; size_t i; test_init(argc, argv); test_msg("Alloc growsdown\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; test_msg("Alloc locked/sequential\n"); if (alloc_anon_mmap(&m[1], MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, MADV_SEQUENTIAL)) return -1; test_msg("Alloc noreserve/dontdump\n"); if (alloc_anon_mmap(&m[2], MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, MADV_DONTDUMP)) return -1; test_msg("Alloc hugetlb/hugepage\n"); if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; test_msg("Alloc dontfork/random|mergeable\n"); if 
(alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, MADV_MERGEABLE)) return -1; test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) return -1; } test_daemon(); test_waitsig(); test_msg("Fetch restored flags/adv\n"); for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].new_flags, &m[i].new_madv)) return -1; if (m[i].orig_flags != m[i].new_flags) { pr_perror("Flags are changed %lx %lx -> %lx (%zu)", (unsigned long)m[i].start, m[i].orig_flags, m[i].new_flags, i); fail(); return -1; } if (m[i].orig_madv != m[i].new_madv) { pr_perror("Madvs are changed %lx %lx -> %lx (%zu)", (unsigned long)m[i].start, m[i].orig_madv, m[i].new_madv, i); fail(); return -1; } } pass(); return 0; } criu-3.6/test/zdtm/static/maps03.c000066400000000000000000000015531317335042600170270ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" #if (LONG_MAX == 2147483647L) /* 32 bit */ #define TEST_SKIP_REASON "64-bit arch required" #include "skip-me.c" #else const char *test_doc = "Test for huge VMA area"; const char *test_author = "Cyrill Gorcunov "; int main(int argc, char **argv) { test_init(argc, argv); unsigned char *mem; test_msg("Alloc huge VMA\n"); mem = (void *)mmap(NULL, (10L << 30), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if ((void *)mem == MAP_FAILED) { pr_perror("mmap failed"); return -1; } mem[4L << 30] = 1; mem[8L << 30] = 2; test_daemon(); test_waitsig(); test_msg("Testing restored data\n"); if (mem[4L << 30] != 1 || mem[8L << 30] != 2) { fail("Data corrupted!\n"); exit(1); } pass(); return 0; } #endif criu-3.6/test/zdtm/static/maps03.desc000066400000000000000000000000241317335042600175130ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/maps04.c000066400000000000000000000020071317335042600170230ustar00rootroot00000000000000#include #include 
#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define MEM_SIZE (1L << 29) const char *test_doc = "Test big mappings"; const char *test_author = "Andrew Vagin #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Create a bunch of small VMAs and test they survive transferring\n"; const char *test_author = "Cyrill Gorcunov "; #define NR_MAPS 4096 #define NR_MAPS_1 (NR_MAPS + 0) #define NR_MAPS_2 (NR_MAPS + 1) #define MAPS_SIZE_1 (140 << 10) #define MAPS_SIZE_2 (8192) int main(int argc, char *argv[]) { void *map[NR_MAPS + 2] = { }, *addr; size_t i, summary; test_init(argc, argv); summary = NR_MAPS * 2 * 4096 + MAPS_SIZE_1 + MAPS_SIZE_2 + (1 << 20); addr = mmap(NULL, summary, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { pr_perror("Can't mmap"); return 1; } munmap(addr, summary); for (i = 0; i < NR_MAPS; i++) { map[i] = mmap(i > 0 ? map[i - 1] + 8192 : addr, 4096, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (map[i] == MAP_FAILED) { pr_perror("Can't mmap"); return 1; } else { /* Dirtify it */ int *v = (void *)map[i]; *v = i; } } map[NR_MAPS_1] = mmap(map[NR_MAPS_1 - 1] + 8192, MAPS_SIZE_1, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); if (map[NR_MAPS_1] == MAP_FAILED) { pr_perror("Can't mmap"); return 1; } else { /* Dirtify it */ int *v = (void *)map[NR_MAPS_1]; *v = i; test_msg("map-1: %p %p\n", map[NR_MAPS_1], map[NR_MAPS_1] + MAPS_SIZE_1); } map[NR_MAPS_2] = mmap(map[NR_MAPS_1] + MAPS_SIZE_1, MAPS_SIZE_2, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); if (map[NR_MAPS_2] == MAP_FAILED) { pr_perror("Can't mmap"); return 1; } else { /* Dirtify it */ int *v = (void *)map[NR_MAPS_2]; *v = i; test_msg("map-2: %p %p\n", map[NR_MAPS_2], map[NR_MAPS_2] + MAPS_SIZE_2); } test_daemon(); test_waitsig(); for (i = 0; i < NR_MAPS; i++) { 
int *v = (void *)map[i]; if (*v != i) { fail("Data corrupted at page %lu", (unsigned long)i); return 1; } } pass(); return 0; } criu-3.6/test/zdtm/static/maps06.c000066400000000000000000000013751317335042600170340ustar00rootroot00000000000000#include "zdtmtst.h" #include #include #include #include const char *test_doc = "Create a lot of file vma-s"; const char *test_author = "Andrei Vagin "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd, i; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT, 0666); if (fd < 0) return 1; ftruncate(fd, 4096); for (i = 0; i < 1024; i++) { if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0) == MAP_FAILED) return 1; if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) == MAP_FAILED) return 1; } test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/maps_file_prot.c000066400000000000000000000022341317335042600207240ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test mappings of same file with different prot"; const char *test_author = "Jamie Liu "; char *filename; TEST_OPTION(filename, string, "file name", 1); #define die(fmt, arg...) 
do { pr_perror(fmt, ## arg); return 1; } while (0) int main(int argc, char ** argv) { void *ro_map, *rw_map; int fd; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT, 0644); if (fd < 0) die("open failed"); if (ftruncate(fd, 2 * PAGE_SIZE)) die("ftruncate failed"); ro_map = mmap(NULL, 2 * PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); if (ro_map == MAP_FAILED) die("mmap failed"); rw_map = ro_map + PAGE_SIZE; if (mprotect(rw_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) die("mprotect failed"); close(fd); test_daemon(); test_waitsig(); /* Check that rw_map is still writeable */ *(volatile char *)rw_map = 1; if (mprotect(ro_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) { fail("mprotect after restore failed"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mem-touch.c000066400000000000000000000024431317335042600176210ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check changing memory"; const char *test_author = "Pavel Emelyanov "; #define MEM_PAGES 16 int main(int argc, char **argv) { void *mem; int i, fail = 0; unsigned rover = 1; unsigned backup[MEM_PAGES] = {}; srand(time(NULL)); test_init(argc, argv); mem = mmap(NULL, MEM_PAGES * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); if (mem == MAP_FAILED) return 1; test_msg("mem %p backup %p\n", mem, backup); test_daemon(); while (test_go()) { unsigned pfn; struct timespec req = { .tv_sec = 0, .tv_nsec = 100000, }; pfn = random() % MEM_PAGES; *(unsigned *)(mem + pfn * PAGE_SIZE) = rover; backup[pfn] = rover; test_msg("t %u %u\n", pfn, rover); rover++; nanosleep(&req, NULL); } test_waitsig(); test_msg("final rover %u\n", rover); for (i = 0; i < MEM_PAGES; i++) if (backup[i] != *(unsigned *)(mem + i * PAGE_SIZE)) { test_msg("Page %u differs want %u has %u\n", i, backup[i], *(unsigned *)(mem + i * PAGE_SIZE)); fail = 1; } else test_msg("Page %u matches %u\n", i, backup[i]); if (fail) fail("Memory corruption\n"); else 
pass(); return 0; } criu-3.6/test/zdtm/static/mem-touch.desc000066400000000000000000000000241317335042600203060ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/mlock_setuid.c000066400000000000000000000021011317335042600203740ustar00rootroot00000000000000#include #include #include #include "zdtmtst.h" #include "get_smaps_bits.h" #define MEM_SIZE (69632) int main(int argc, char **argv) { int ret; void *start; unsigned long new_flags = 0; unsigned long new_madv = 0; test_init(argc, argv); test_msg("Alloc vma of size %d\n", MEM_SIZE); start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (start == MAP_FAILED) { pr_perror("mmap failed"); return -1; } test_msg("Lock vma from %p to %lx\n", start, (unsigned long)start + MEM_SIZE); ret = mlock(start, MEM_SIZE); if (ret < 0) { pr_perror("mlock"); return -1; } test_daemon(); test_msg("Setuid to 18943\n"); ret = setuid(18943); if (ret < 0) { pr_perror("setuid"); return -1; } test_waitsig(); ret = get_smaps_bits((unsigned long)start, &new_flags, &new_madv); if (ret < 0) return -1; test_msg("Check smaps flags for MAP_LOCKED\n"); if (new_flags & MAP_LOCKED) { pass(); } else { fail("Vma is not locked after c/r\n"); return -1; } return 0; } criu-3.6/test/zdtm/static/mlock_setuid.desc000066400000000000000000000000441317335042600210740ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/mmx00.c000066400000000000000000000042641317335042600166670ustar00rootroot00000000000000#include #include #include "zdtmtst.h" const char *test_doc = "Start a calculation, leaving MMX in a certain state,\n" "before migration, continue after"; const char *test_author = "Pavel Emelianov "; #if defined(__i386__) || defined(__x86_64__) void start(uint8_t *bytes, uint16_t *words) { __asm__ volatile ( "movq %0, %%mm0\n" "movq %1, %%mm1\n" "movq %2, %%mm2\n" "movq %3, %%mm3\n" "paddb %%mm0, %%mm1\n" "psubw %%mm2, %%mm3\n" : : "m" (bytes[0]), "m" (bytes[8]), 
"m" (words[0]), "m" (words[4]) ); } void finish(uint8_t *bytes, uint16_t *words) { __asm__ volatile ( "movq %%mm1, %0\n" "movq %%mm3, %1\n" : "=m" (bytes[0]), "=m" (words[0]) ); } static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { __asm__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "c"(0)); } int chk_proc_mmx(void) { unsigned int eax, ebx, ecx, edx; cpuid(1, &eax, &ebx, &ecx, &edx); return edx & (1 << 23); } #endif int main(int argc, char **argv) { #if defined(__i386__) || defined(__x86_64__) uint8_t bytes[16]; uint16_t words[8]; uint32_t rnd[8]; int i; uint8_t resbytes1[8], resbytes2[8]; uint16_t reswords1[4], reswords2[4]; #endif test_init(argc, argv); #if defined(__i386__) || defined(__x86_64__) if (!chk_proc_mmx()) { skip("MMX not supported"); return 1; } for (i = 0; i < (sizeof(bytes) + sizeof(words)) / 4; i++) rnd[i] = mrand48(); memcpy((uint8_t *) bytes, (uint8_t *) rnd, sizeof(bytes)); memcpy((uint8_t *) words, (uint8_t *) rnd + sizeof(bytes), sizeof(words)); start(bytes, words); finish(resbytes1, reswords1); start(bytes, words); test_daemon(); test_waitsig(); finish(resbytes2, reswords2); if (memcmp((uint8_t *) resbytes1, (uint8_t *) resbytes2, sizeof(resbytes1))) fail("byte op mismatch\n"); else if (memcmp((uint8_t *) reswords1, (uint8_t *) reswords2, sizeof(reswords2))) fail("word op mismatch\n"); else pass(); #else skip("Unsupported arch"); #endif return 0; } criu-3.6/test/zdtm/static/mmx00.desc000066400000000000000000000000231317335042600173500ustar00rootroot00000000000000{'arch': 'x86_64'} criu-3.6/test/zdtm/static/mnt_enablefs.c000066400000000000000000000013601317335042600203550ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check enabled file systems (--enable-fs)"; const char *test_author = "Andrei Vagin "; char *dirname; TEST_OPTION(dirname, string, 
"directory name", 1); int main(int argc, char **argv) { char fname[PATH_MAX]; test_init(argc, argv); mkdir(dirname, 0777); if (mount("zdtm_nfsd", dirname, "nfsd", 0, NULL) == -1) { pr_perror("mount"); return -1; } snprintf(fname, sizeof(fname), "%s/exports", dirname); test_daemon(); test_waitsig(); if (access(fname, F_OK)) fail(); pass(); return 0; } criu-3.6/test/zdtm/static/mnt_enablefs.checkskip000077500000000000000000000001041317335042600220750ustar00rootroot00000000000000#!/bin/sh unshare -m --propagation private mount -t nfsd nfsd /mnt criu-3.6/test/zdtm/static/mnt_enablefs.desc000066400000000000000000000001421317335042600210460ustar00rootroot00000000000000{ 'feature': 'mnt_id', 'flags': 'suid', 'flavor': 'ns', 'opts': '--enable-fs nfsd'} criu-3.6/test/zdtm/static/mnt_ext_auto.c000066400000000000000000000111241317335042600204250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check --mnt-ext-map"; const char *test_author = "Andrew Vagin "; #ifdef ZDTM_EXTMAP_MANUAL char *dirname = "mnt_ext_manual.test"; char *dirname_private_shared_bind = "mnt_ext_manual_private_shared_bind.test"; char *dirname_bind = "mnt_ext_manual_bind.test"; char *dirname_slave_shared_bind = "mnt_ext_manual_slave_shared_bind.test"; char *dirname_slave_bind = "mnt_ext_manual_slave_bind.test"; #define DDIR "mtest" #else char *dirname = "mnt_ext_auto.test"; char *dirname_private_shared_bind = "mnt_ext_auto_private_shared_bind.test"; char *dirname_bind = "mnt_ext_auto_bind.test"; char *dirname_slave_shared_bind = "mnt_ext_auto_slave_shared_bind.test"; char *dirname_slave_bind = "mnt_ext_auto_slave_bind.test"; #define DDIR "atest" #endif TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char ** argv) { char src[PATH_MAX], dst[PATH_MAX], *root; char dst_bind[PATH_MAX], dst_private_shared_bind[PATH_MAX], dst_slave_shared_bind[PATH_MAX], dst_slave_bind[PATH_MAX]; char 
*dname = "/tmp/zdtm_ext_auto.XXXXXX"; struct stat sta, stb, bsta, bstb, ssbsta, sbsta, ssbstb, sbstb, psbsta, psbstb; char* zdtm_newns = getenv("ZDTM_NEWNS"); root = getenv("ZDTM_ROOT"); if (root == NULL) { pr_perror("root"); return 1; } sprintf(dst, "%s/%s", get_current_dir_name(), dirname); sprintf(dst_private_shared_bind, "%s/%s", get_current_dir_name(), dirname_private_shared_bind); sprintf(dst_bind, "%s/%s", get_current_dir_name(), dirname_bind); sprintf(dst_slave_shared_bind, "%s/%s", get_current_dir_name(), dirname_slave_shared_bind); sprintf(dst_slave_bind, "%s/%s", get_current_dir_name(), dirname_slave_bind); if (!zdtm_newns) { pr_perror("ZDTM_NEWNS is not set"); return 1; } else if (strcmp(zdtm_newns, "1")) { goto test; } mkdir(dname, 755); sprintf(src, "%s/%s", dname, DDIR); if (mount("zdtm_auto_ext_mnt", dname, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } mkdir(src, 755); unshare(CLONE_NEWNS); mkdir(dst, 755); if (mount(src, dst, NULL, MS_BIND, NULL)) { pr_perror("bind"); return 1; } mkdir(dst_private_shared_bind, 755); if (mount(dst, dst_private_shared_bind, NULL, MS_BIND, NULL)) { pr_perror("bind"); return 1; } if (mount("none", dst_private_shared_bind, NULL, MS_PRIVATE, NULL)) { pr_perror("bind"); return 1; } if (mount("none", dst_private_shared_bind, NULL, MS_SHARED, NULL)) { pr_perror("bind"); return 1; } mkdir(dst_bind, 755); if (mount(dst_private_shared_bind, dst_bind, NULL, MS_BIND, NULL)) { pr_perror("bind"); return 1; } mkdir(dst_slave_shared_bind, 755); if (mount(dst_bind, dst_slave_shared_bind, NULL, MS_BIND, NULL)) { pr_perror("bind"); return 1; } if (mount("none", dst_slave_shared_bind, NULL, MS_SLAVE, NULL)) { pr_perror("bind"); return 1; } if (mount("none", dst_slave_shared_bind, NULL, MS_SHARED, NULL)) { pr_perror("bind"); return 1; } mkdir(dst_slave_bind, 755); if (mount(dst_slave_shared_bind, dst_slave_bind, NULL, MS_BIND, NULL)) { pr_perror("bind"); return 1; } if (mount("none", dst_slave_bind, NULL, MS_SLAVE, NULL)) { 
pr_perror("bind"); return 1; } test: test_init(argc, argv); if (stat(dirname, &stb)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_private_shared_bind, &psbstb)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_bind, &bstb)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_slave_shared_bind, &ssbstb)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_slave_bind, &sbstb)) { pr_perror("stat"); sleep(100); return 1; } test_daemon(); test_waitsig(); if (stat(dirname, &sta)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_private_shared_bind, &psbsta)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_bind, &bsta)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_slave_shared_bind, &ssbsta)) { pr_perror("stat"); sleep(100); return 1; } if (stat(dirname_slave_bind, &sbsta)) { pr_perror("stat"); sleep(100); return 1; } if (sta.st_dev != stb.st_dev) { fail(); return 1; } if (psbsta.st_dev != psbstb.st_dev) { fail(); return 1; } if (bsta.st_dev != bstb.st_dev) { fail(); return 1; } if (ssbsta.st_dev != ssbstb.st_dev) { fail(); return 1; } if (sbsta.st_dev != sbstb.st_dev) { fail(); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mnt_ext_auto.desc000066400000000000000000000001101317335042600211120ustar00rootroot00000000000000{'flavor': 'ns uns', 'feature': 'mnt_id', 'opts': '--external mnt[]:s'} criu-3.6/test/zdtm/static/mnt_ext_dev.c000066400000000000000000000035311317335042600202360ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check mounts of external devices"; const char *test_author = "Andrei Vagin #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that mounts with external master peers are c/r'd"; const char *test_author = "Tycho Andersen "; char *dirname = "mnt_ext_auto.test"; 
TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char ** argv) { char src[PATH_MAX], dst[PATH_MAX], *root; char *dname = "/tmp/zdtm_ext_auto.XXXXXX"; root = getenv("ZDTM_ROOT"); if (root == NULL) { pr_perror("root"); return 1; } sprintf(dst, "%s/ext_mounts", getenv("ZDTM_ROOT")); if (strcmp(getenv("ZDTM_NEWNS"), "1")) goto test; mkdir(dname, 755); sprintf(src, "%s/test", dname); if (mount("zdtm_auto_ext_mnt", dname, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } mkdir(src, 755); mkdir(dst, 755); unshare(CLONE_NEWNS); if (mount(src, dst, NULL, MS_BIND, NULL)) { pr_perror("bind"); return 1; } if (mount(src, dst, NULL, MS_SLAVE, NULL)) { pr_perror("slave"); return 1; } test: test_init(argc, argv); test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/mnt_ext_master.desc000066400000000000000000000001571317335042600214500ustar00rootroot00000000000000{ 'feature': 'mnt_id', 'flavor': 'ns uns', 'opts': '--ext-mount-map auto --enable-external-masters'} criu-3.6/test/zdtm/static/mnt_ro_bind.c000066400000000000000000000033471317335042600202210ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check read-only bind-mounts"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define TEST_WORD "testtest" #define TEST_WORD2 "TESTTEST" int main(int argc, char **argv) { int fd, ret = 1; char rw_path[PATH_MAX], ro_path[PATH_MAX], rw_f[PATH_MAX], ro_f[PATH_MAX]; test_init(argc, argv); snprintf(rw_path, sizeof(rw_path), "%s/rw", dirname); snprintf(ro_path, sizeof(ro_path), "%s/ro", dirname); snprintf(rw_f, sizeof(rw_f), "%s/rw/test", dirname); snprintf(ro_f, sizeof(ro_f), "%s/ro/test", dirname); mkdir(dirname, 0700); if (mount("none", dirname, "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } mkdir(rw_path, 0700); mkdir(ro_path, 0700); if (mount("zdtm_rw", rw_path, "tmpfs", 
0, "") < 0) { fail("Can't mount tmpfs"); return 1; } if (mount(rw_path, ro_path, NULL, MS_BIND, NULL) < 0) { fail("Can't mount tmpfs"); return 1; } if (mount(NULL, ro_path, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY, NULL) < 0) { fail("Can't mount tmpfs"); return 1; } test_daemon(); test_waitsig(); fd = open(ro_f, O_CREAT | O_WRONLY, 0666); if (fd != -1 || errno != EROFS) { fail("%s is created", ro_f); goto err; } fd = open(rw_f, O_CREAT | O_WRONLY, 0666); if (fd < 0) { fail("Unable to create %s", rw_f); goto err; } close(fd); fd = open(ro_f, O_RDONLY); if (fd < 0) { fail("Unable to create %s", rw_f); goto err; } pass(); ret = 0; err: umount2(dirname, MNT_DETACH); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/mnt_ro_bind.desc000066400000000000000000000000461317335042600207060ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/mnt_tracefs.c000066400000000000000000000026551317335042600202350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test c/r of tracefs"; const char *test_author = "Tycho Andersen "; char *dirname = "mnt_tracefs.test"; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char ** argv) { char dst[PATH_MAX]; if (strcmp(getenv("ZDTM_NEWNS"), "1")) goto test; if (unshare(CLONE_NEWNS)) { pr_perror("unshare"); return 1; } sprintf(dst, "%s/%s", get_current_dir_name(), dirname); if (mkdir(dst, 755) < 0) { pr_perror("mkdir"); return 1; } if (mount("/sys/kernel/debug", dst, NULL, MS_BIND | MS_REC, NULL)) { rmdir(dst); pr_perror("mount"); return 1; } /* trigger the tracefs mount */ strcat(dst, "/tracing/README"); if (access(dst, F_OK) < 0) { umount(dst); rmdir(dst); pr_perror("access"); return 1; } test: test_init(argc, argv); test_daemon(); test_waitsig(); sprintf(dst, "%s/%s/tracing/README", get_current_dir_name(), dirname); /* EACCES is what we expect, since users can't actually /see/ 
this * filesystem, but CRIU needs to know how to remount it, so the restore * should succeed */ if (access(dst, F_OK) < 0 && errno != EACCES) { fail("couldn't access tracefs at %s", dst); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mnt_tracefs.checkskip000077500000000000000000000003071317335042600217520ustar00rootroot00000000000000#!/bin/bash # tracefs is automatically mounted under debugfs if the kernel has it, so we # just need to check for a file in the tracing directory. test -f /sys/kernel/debug/tracing/README || exit 1 criu-3.6/test/zdtm/static/mnt_tracefs.desc000066400000000000000000000001541317335042600207210ustar00rootroot00000000000000{ 'feature': 'mnt_id', 'flavor': 'uns', 'opts': '--ext-mount-map auto --enable-external-masters'} criu-3.6/test/zdtm/static/mnt_tracefs.hook000077500000000000000000000001211317335042600207400ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--clean" ] || exit 0 rmdir zdtm/static/mnt_tracefs.test criu-3.6/test/zdtm/static/mntns-deleted-dst000066400000000000000000000000001317335042600210200ustar00rootroot00000000000000criu-3.6/test/zdtm/static/mntns_deleted.c000066400000000000000000000041121317335042600205430ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif const char *test_doc = "Check the restore of deleted bindmounts"; const char *test_author = "Cyrill Gorcunov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define TEST_DIR_SRC "test-src" #define TEST_DIR_DST "test-dst" #define TEST_FILE_SRC "mntns-deleted-src" #define TEST_FILE_DST "mntns-deleted-dst" int main(int argc, char *argv[]) { char path_src[PATH_MAX], path_dst[PATH_MAX]; int fd1, fd2; test_init(argc, argv); if (mkdir(dirname, 0700)) { pr_perror("mkdir %s", dirname); exit(1); } if (mount("none", dirname, "tmpfs", MS_MGC_VAL, NULL)) { pr_perror("mount %s", dirname); 
return 1; } snprintf(path_src, sizeof(path_src), "%s/%s", dirname, TEST_DIR_SRC); snprintf(path_dst, sizeof(path_dst), "%s/%s", dirname, TEST_DIR_DST); rmdir(path_src); rmdir(path_dst); unlink(TEST_FILE_SRC); unlink(TEST_FILE_DST); if (mkdir(path_src, 0700) || mkdir(path_dst, 0700)) { pr_perror("mkdir"); return 1; } if ((fd1 = open(TEST_FILE_SRC, O_WRONLY | O_CREAT | O_TRUNC, 0600) < 0)) { pr_perror("touching %s", TEST_FILE_SRC); return 1; } close(fd1); if ((fd2 = open(TEST_FILE_DST, O_WRONLY | O_CREAT | O_TRUNC, 0600) < 0)) { pr_perror("touching %s", TEST_FILE_DST); return 1; } close(fd2); if (mount(path_src, path_dst, NULL, MS_BIND | MS_MGC_VAL, NULL)) { pr_perror("mount %s -> %s", path_src, path_dst); return 1; } if (mount(TEST_FILE_SRC, TEST_FILE_DST, NULL, MS_BIND | MS_MGC_VAL, NULL)) { pr_perror("mount %s -> %s", TEST_FILE_SRC, TEST_FILE_DST); return 1; } if (rmdir(path_src)) { pr_perror("rmdir %s", path_src); return 1; } if (unlink(TEST_FILE_SRC)) { pr_perror("unlink %s", TEST_FILE_SRC); return 1; } test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/mntns_deleted.desc000066400000000000000000000000731317335042600212410ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_ghost.c000066400000000000000000000035721317335042600202720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check ghost and link-remap files in a few mntns"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { task_waiter_t lock; pid_t pid = -1; int status = 1; test_init(argc, argv); task_waiter_init(&lock); pid = fork(); if (pid < 0) { pr_perror("fork"); return 1; } if (pid == 0) { int fd; DIR *d; struct dirent *de; if (unshare(CLONE_NEWNS)) { pr_perror("unshare"); 
return 1; } if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) { pr_perror("mount"); return 1; } if (mkdir(dirname, 0600) < 0) { pr_perror("mkdir"); return 1; } if (mount(dirname, dirname, NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } if (chdir(dirname)) return 1; fd = open("test.ghost", O_CREAT | O_WRONLY, 0600); if (fd < 0) { pr_perror("open"); return 1; } if (unlink("test.ghost")) { pr_perror("unlink"); return 1; } task_waiter_complete(&lock, 1); test_waitsig(); if (close(fd)) { pr_perror("close"); return 1; } d = opendir("."); if (d == NULL) { pr_perror("opendir"); return 1; } while ((de = readdir(d)) != NULL) { if (!strcmp(de->d_name, ".")) continue; if (!strcmp(de->d_name, "..")) continue; pr_err("%s\n", de->d_name); } closedir(d); return 0; } task_waiter_wait4(&lock, 1); test_daemon(); test_waitsig(); kill(pid, SIGTERM); wait(&status); if (status) { fail("Test died"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_ghost.desc000066400000000000000000000001231317335042600207530ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id', 'opts': '--link-remap'} criu-3.6/test/zdtm/static/mntns_link_ghost.c000077700000000000000000000000001317335042600247772mntns_link_remap.custar00rootroot00000000000000criu-3.6/test/zdtm/static/mntns_link_ghost.desc000066400000000000000000000000731317335042600217740ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_link_remap.c000066400000000000000000000116461317335042600212700ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif const char *test_doc = "Check ghost and link-remap files in a few mntns"; const char *test_author = "Andrew Vagin "; #define MPTS_FILE "F" char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define 
NS_STACK_SIZE 4096 /* All arguments should be above stack, because it grows down */ struct ns_exec_args { char stack[NS_STACK_SIZE] __stack_aligned__; char stack_ptr[0]; int fd; int sync; }; #define AWK_OK 13 #define AWK_FAIL 42 static int get_mntid(int fd) { char str[256]; int mnt_id = -1; FILE *f; snprintf(str, sizeof(str), "/proc/self/fdinfo/%d", fd); f = fopen(str, "r"); if (!f) { pr_perror("Can't open %s to parse", str); return -1; } while (fgets(str, sizeof(str), f)) { if (sscanf(str, "mnt_id: %d", &mnt_id) == 1) break; } fclose(f); return mnt_id; } int ns_child(void *_arg) { struct ns_exec_args *args = _arg; int fd2; int id1, id2; struct stat st1, st2; char lpath[PATH_MAX], fpath[PATH_MAX]; snprintf(fpath, sizeof(fpath), "%s/1", dirname); if (umount(fpath)) { pr_perror("umount"); return 1; } snprintf(lpath, sizeof(lpath), "%s/0/2", dirname); snprintf(fpath, sizeof(fpath), "%s/2", dirname); if (mkdir(fpath, 0600) < 0) { fail("Can't make zdtm_sys"); return 1; } if (mount(lpath, fpath, NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } snprintf(fpath, sizeof(fpath), "%s/0", dirname); if (umount(fpath)) { pr_perror("umount"); return 1; } snprintf(fpath, sizeof(fpath), "%s/2/%s", dirname, MPTS_FILE); fd2 = open(fpath, O_RDWR); if (fd2 < 0) { pr_perror("open"); return -1; } close(args->sync); test_waitsig(); id1 = get_mntid(args->fd); id2 = get_mntid(fd2); if (id1 <0 || id2 < 0) exit(1); if (fstat(args->fd, &st1) || fstat(fd2, &st2)) { pr_perror("stat"); exit(1); } test_msg("%d %d", id1, id2); #ifdef ZDTM_LINK_REMAP if (st1.st_nlink != 1) { #else if (st1.st_nlink != 0) { #endif pr_perror("Wrong number of links: %lu", (long unsigned)st1.st_nlink); exit(1); } if (id1 > 0 && id1 != id2 && st1.st_ino == st2.st_ino) exit(AWK_OK); else exit(AWK_FAIL); } int main(int argc, char **argv) { struct ns_exec_args args; pid_t pid = -1; char lpath[PATH_MAX], fpath[PATH_MAX]; char buf[256]; int p[2]; test_init(argc, argv); if (mkdir(dirname, 0600) < 0) { fail("Can't make 
zdtm_sys"); return 1; } if (mount("test", dirname, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } snprintf(fpath, sizeof(fpath), "%s/0", dirname); if (mkdir(fpath, 0600) < 0) { fail("Can't make zdtm_sys"); return 1; } if (mount("test", fpath, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } snprintf(lpath, sizeof(lpath), "%s/0/1", dirname); if (mkdir(lpath, 0600) < 0) { fail("Can't make zdtm_sys"); return 1; } snprintf(fpath, sizeof(fpath), "%s/1", dirname); if (mkdir(fpath, 0600) < 0) { fail("Can't make zdtm_sys"); return 1; } if (mount(lpath, fpath, NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } snprintf(lpath, sizeof(lpath), "%s/0/2", dirname); if (mkdir(lpath, 0600) < 0) { fail("Can't make zdtm_sys"); return 1; } if (pipe(p) == -1) { pr_perror("pipe"); return 1; } if (getenv("ZDTM_NOSUBNS") == NULL) { snprintf(fpath, sizeof(fpath), "%s/1/%s", dirname, MPTS_FILE); args.fd = open(fpath, O_CREAT | O_RDWR, 0600); if (args.fd < 0) { fail("Can't open file"); return 1; } snprintf(fpath, sizeof(fpath), "%s/0/1/%s", dirname, MPTS_FILE); snprintf(lpath, sizeof(fpath), "%s/0/2/%s", dirname, MPTS_FILE); if (link(fpath, lpath) == -1) { pr_perror("link"); return -1; } #ifdef ZDTM_LINK_REMAP snprintf(lpath, sizeof(fpath), "%s/0/%s", dirname, MPTS_FILE); if (link(fpath, lpath) == -1) { pr_perror("link"); return -1; } #endif args.sync = p[1]; pid = clone(ns_child, args.stack_ptr, CLONE_NEWNS | SIGCHLD, &args); if (pid < 0) { pr_perror("Unable to fork child"); return 1; } close(args.fd); } close(p[1]); read(p[0], buf, sizeof(buf)); snprintf(fpath, sizeof(fpath), "%s/0/1/%s", dirname, MPTS_FILE); if (unlink(fpath)) return 1; snprintf(fpath, sizeof(fpath), "%s/0/2/%s", dirname, MPTS_FILE); if (unlink(fpath)) return 1; test_daemon(); test_waitsig(); if (pid > 0) { kill(pid, SIGTERM); int status = 1; wait(&status); if (WIFEXITED(status)) { if (WEXITSTATUS(status) == AWK_OK) pass(); else if (WEXITSTATUS(status) == AWK_FAIL) fail("Mount ID not restored"); else 
fail("Failed to check mount IDs (%d)", WEXITSTATUS(status)); } else fail("Test died"); } umount2(dirname, MNT_DETACH); rmdir(dirname); return 0; } criu-3.6/test/zdtm/static/mntns_link_remap.desc000066400000000000000000000001231317335042600217500ustar00rootroot00000000000000{'flavor': 'uns ns', 'flags': 'suid', 'feature': 'mnt_id', 'opts': '--link-remap'} criu-3.6/test/zdtm/static/mntns_open.c000066400000000000000000000050041317335042600200770ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif const char *test_doc = "Check that mnt_id is repsected"; const char *test_author = "Pavel Emelianov "; #define MPTS_FILE "F" char *dirname; TEST_OPTION(dirname, string, "directory name", 1); char fpath[PATH_MAX]; #define NS_STACK_SIZE 4096 /* All arguments should be above stack, because it grows down */ struct ns_exec_args { char stack[NS_STACK_SIZE] __stack_aligned__; char stack_ptr[0]; int fd; }; #define AWK_OK 13 #define AWK_FAIL 42 static int get_mntid(int fd) { char str[256]; int mnt_id = -1; FILE *f; snprintf(str, sizeof(str), "/proc/self/fdinfo/%d", fd); f = fopen(str, "r"); if (!f) { pr_perror("Can't open %s to parse", str); return -1; } while (fgets(str, sizeof(str), f)) { if (sscanf(str, "mnt_id: %d", &mnt_id) == 1) break; } fclose(f); return mnt_id; } task_waiter_t t; int ns_child(void *_arg) { struct ns_exec_args *args = _arg; int fd2; int id1, id2; fd2 = open(fpath, O_RDWR); task_waiter_complete(&t, 1); test_waitsig(); id1 = get_mntid(args->fd); id2 = get_mntid(fd2); test_msg("%d %d", id1, id2); if (id1 <0 || id2 < 0) exit(1); if (id1 > 0 && id1 != id2) exit(AWK_OK); else exit(AWK_FAIL); } int main(int argc, char **argv) { struct ns_exec_args args; pid_t pid = -1; test_init(argc, argv); task_waiter_init(&t); snprintf(fpath, sizeof(fpath), "%s/%s", dirname, MPTS_FILE); if (mkdir(dirname, 0600) < 0) { 
fail("Can't make zdtm_sys"); return 1; } if (getenv("ZDTM_NOSUBNS") == NULL) { args.fd = open(fpath, O_CREAT | O_RDWR, 0600); if (args.fd < 0) { fail("Can't open file"); return 1; } pid = clone(ns_child, args.stack_ptr, CLONE_NEWNS | SIGCHLD, &args); if (pid < 0) { pr_perror("Unable to fork child"); return 1; } close(args.fd); } task_waiter_wait4(&t, 1); test_daemon(); test_waitsig(); if (pid > 0) { kill(pid, SIGTERM); int status = 1; wait(&status); if (WIFEXITED(status)) { if (WEXITSTATUS(status) == AWK_OK) pass(); else if (WEXITSTATUS(status) == AWK_FAIL) fail("Mount ID not restored"); else fail("Failed to check mount IDs (%d)", WEXITSTATUS(status)); } else fail("Test died"); } unlink(fpath); rmdir(dirname); return 0; } criu-3.6/test/zdtm/static/mntns_open.desc000066400000000000000000000000731317335042600205740ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_overmount.c000066400000000000000000000025111317335042600211740ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check two mounts in the same directory"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { char d1[PATH_MAX], d2[PATH_MAX], f1[PATH_MAX], f2[PATH_MAX]; struct stat st; test_init(argc, argv); snprintf(d1, sizeof(d1), "%s/1/", dirname); snprintf(d2, sizeof(d2), "%s/2/", dirname); if (mkdir(dirname, 0700) || mkdir(d1, 0777) || mkdir(d2, 0700)) { pr_perror("mkdir"); return 1; } if (mount("zdtm_d1", d1, "sysfs", 0, NULL) || mount(NULL, d1, NULL, MS_SHARED, NULL) || mount(d1, d2, NULL, MS_BIND, NULL) || mount(NULL, d2, NULL, MS_SLAVE, NULL)) { pr_perror("mount"); return 1; } snprintf(f1, sizeof(f1), "%s/devices", d1); snprintf(f2, sizeof(f2), "%s/devices", d2); if (mount("zdtm_d1", d1, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } 
test_daemon(); test_waitsig(); if (umount(d1)) { pr_perror("umount"); return 1; } if (stat(f1, &st) || stat(f2, &st)) { pr_perror("stat"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_overmount.desc000066400000000000000000000000671317335042600216740ustar00rootroot00000000000000{'flavor': 'ns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_remap.c000066400000000000000000000034511317335042600202460ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check a case when one mount overmount another one"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { task_waiter_t t; pid_t pid; test_init(argc, argv); mkdir(dirname, 0755); if (mount("zdtm", dirname, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } if (chdir(dirname)) { pr_perror("chdir"); return 1; } mkdir("1", 0755); mkdir("2", 0755); if (mount("1", "1", NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } if (mount(NULL, "1", NULL, MS_PRIVATE, NULL)) { pr_perror("mount"); return 1; } if (mount("zdtm", "2", "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } mkdir("1/a", 0755); mkdir("2/a", 0755); if (mount("1/a", "1/a", NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } if (mount(NULL, "1/a", NULL, MS_SHARED, NULL)) { pr_perror("mount"); return 1; } if (mount("1/a", "2/a", NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } mkdir("1/a/c", 0755); if (mount("zdtm", "1/a/c", "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } if (mount("2", "1", NULL, MS_MOVE, NULL)) { pr_perror("mount"); return 1; } task_waiter_init(&t); pid = fork(); if (pid < 0) return -1; if (pid == 0) { if (unshare(CLONE_NEWNS)) return 1; task_waiter_complete_current(&t); test_waitsig(); return 0; } task_waiter_wait4(&t, pid); test_daemon(); test_waitsig(); kill(pid, 
SIGTERM); wait(NULL); pass(); return 0; } criu-3.6/test/zdtm/static/mntns_remap.desc000066400000000000000000000000731317335042600207370ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_ro_root.c000066400000000000000000000022351317335042600206240ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check a case when a root is read-only for a sub-mntns"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { task_waiter_t lock; pid_t pid = -1; int status = 1; test_init(argc, argv); task_waiter_init(&lock); pid = fork(); if (pid < 0) { pr_perror("fork"); return 1; } if (pid == 0) { if (unshare(CLONE_NEWNS)) { pr_perror("unshare"); return 1; } if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { pr_perror("mount"); return 1; } task_waiter_complete(&lock, 1); test_waitsig(); return 0; } task_waiter_wait4(&lock, 1); test_daemon(); test_waitsig(); kill(pid, SIGTERM); wait(&status); if (status) { fail("Test died"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_ro_root.desc000066400000000000000000000000731317335042600213160ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_root_bind.c000066400000000000000000000047101317335042600211200ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif const char *test_doc = "Check bind-mouns of the root mount"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { char subdir1[PATH_MAX], 
path[PATH_MAX], bpath[PATH_MAX], spath[PATH_MAX], bspath[PATH_MAX]; char subdir2[PATH_MAX], bsubdir2[PATH_MAX]; pid_t pid; int status; task_waiter_t t; test_init(argc, argv); task_waiter_init(&t); mount(NULL, "/", NULL, MS_SHARED, NULL); snprintf(subdir1, sizeof(subdir1), "%s/subdir1", dirname); snprintf(path, sizeof(path), "%s/test", subdir1); snprintf(bpath, sizeof(bpath), "%s/test.bind", subdir1); snprintf(spath, sizeof(spath), "%s/test/sub", subdir1); snprintf(bspath, sizeof(bspath), "%s/test.bind/sub", subdir1); snprintf(subdir2, sizeof(subdir2), "%s/subdir2", dirname); snprintf(bsubdir2, sizeof(bsubdir2), "%s/bsubdir2", dirname); if (mkdir(dirname, 0700) || mkdir(subdir1, 0777) || mkdir(subdir2, 0777) || mkdir(bsubdir2, 0777) || mkdir(path, 0700) || mkdir(spath, 0700) || mkdir(bpath, 0700)) { pr_perror("mkdir"); return 1; } pid = fork(); if (pid < 0) { pr_perror("fork"); return 1; } if (pid == 0) { unshare(CLONE_NEWNS); if (mount(path, bpath, NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } task_waiter_complete(&t, 1); task_waiter_wait4(&t, 2); if (access(bspath, F_OK)) { fail("%s isn't accessiable", bspath); return 1; } if (umount2(bpath, MNT_DETACH)) { fail("umount"); return 1; } return 0; } task_waiter_wait4(&t, 1); if (mount("test", spath, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } #ifdef ROOT_BIND02 if (mount(subdir2, bsubdir2, NULL, MS_BIND, NULL)) { pr_perror("Unable to mount %s to %s", subdir2, bsubdir2); return 1; } #endif test_daemon(); test_waitsig(); task_waiter_complete(&t, 2); if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid %d", pid); return 1; } if (status) { pr_perror("%d/%d/%d/%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_root_bind.desc000066400000000000000000000000731317335042600216120ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} 
criu-3.6/test/zdtm/static/mntns_root_bind02.c000077700000000000000000000000001317335042600246152mntns_root_bind.custar00rootroot00000000000000criu-3.6/test/zdtm/static/mntns_root_bind02.desc000077700000000000000000000000001317335042600260052mntns_root_bind.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/mntns_rw_ro_rw.c000066400000000000000000000020431317335042600207760ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test read-only bind mounts"; const char *test_author = "Andrey Vagin "; int main(int argc, char **argv) { test_init(argc, argv); if (mount("/proc/sys/", "/proc/sys", NULL, MS_BIND, NULL)) { pr_perror("Unable to bind-mount /proc/sys"); return 1; } if (mount("/proc/sys/net", "/proc/sys/net", NULL, MS_BIND, NULL)) { pr_perror("Unable to bind-mount /proc/sys/net"); return 1; } if (mount("/proc/sys/", "/proc/sys", NULL, MS_RDONLY|MS_BIND|MS_REMOUNT, NULL)) { pr_perror("Unable to remount /proc/sys"); return 1; } test_daemon(); test_waitsig(); if (access("/proc/sys/net/ipv4/ip_forward", W_OK)) { fail("Unable to access /proc/sys/net/core/wmem_max"); return 1; } if (access("/proc/sys/kernel/ns_last_pid", W_OK) != -1 || errno != EROFS) { fail("Unable to access /proc/sys/kernel/pid_max"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_rw_ro_rw.desc000066400000000000000000000000461317335042600214730ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/mntns_shared_bind.c000066400000000000000000000044111317335042600214010ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif const char *test_doc = "Check shared non-root bind-mounts"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char 
**argv) { char path[PATH_MAX], bpath[PATH_MAX], spath[PATH_MAX]; pid_t pid; int status; task_waiter_t t; test_init(argc, argv); task_waiter_init(&t); snprintf(path, sizeof(path), "%s/test", dirname); snprintf(bpath, sizeof(bpath), "%s/test.bind", dirname); snprintf(spath, sizeof(spath), "%s/test/sub", dirname); if (mkdir(dirname, 0700)) { pr_perror("mkdir"); return 1; } if (mount(NULL, "/", NULL, MS_SHARED, NULL)) { pr_perror("mount"); return 1; } #ifdef SHARED_BIND02 /* */ if (mount(dirname, dirname, "tmpfs", 0, NULL) || mount(NULL, dirname, NULL, MS_SHARED, NULL)) { pr_perror("mount"); return 1; } #endif if (mkdir(path, 0700) || mkdir(spath, 0700) || mkdir(bpath, 0700)) { pr_perror("mkdir"); return 1; } pid = fork(); if (pid < 0) { pr_perror("fork"); return 1; } if (pid == 0) { unshare(CLONE_NEWNS); if (mount(path, bpath, NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } task_waiter_complete(&t, 1); task_waiter_wait4(&t, 2); if (umount(spath)) { task_waiter_complete(&t, 2); fail("umount"); return 1; } task_waiter_complete(&t, 3); task_waiter_wait4(&t, 4); return 0; } task_waiter_wait4(&t, 1); if (mount("test", spath, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } test_daemon(); test_waitsig(); task_waiter_complete(&t, 2); task_waiter_wait4(&t, 3); if (umount(bpath)) { task_waiter_complete(&t, 2); fail("umount"); return 1; } task_waiter_complete(&t, 4); if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid %d", pid); return 1; } if (status) { pr_perror("%d/%d/%d/%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_shared_bind.desc000066400000000000000000000000731317335042600220750ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} 
criu-3.6/test/zdtm/static/mntns_shared_bind02.c000077700000000000000000000000001317335042600253632mntns_shared_bind.custar00rootroot00000000000000criu-3.6/test/zdtm/static/mntns_shared_bind02.desc000066400000000000000000000000731317335042600222370ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_shared_bind03.c000066400000000000000000000041671317335042600215540ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif const char *test_doc = "Check shared non-root bind-mounts with different shared groups"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { test_init(argc, argv); if (mkdir(dirname, 0700)) { pr_perror("mkdir"); return 1; } if (chdir(dirname)) return 1; if (mkdir("1", 0700) || mkdir("2", 0700) || mkdir("3", 0700)) { pr_perror("mkdir"); return 1; } if (mkdir("A", 0700)) { pr_perror("mkdir"); return 1; } if (mkdir("B", 0700)) { pr_perror("mkdir"); return 1; } if (mount("1", "1", NULL, MS_BIND, NULL) || mount(NULL, "1", NULL, MS_PRIVATE, NULL) || mount(NULL, "1", NULL, MS_SHARED, NULL)) { pr_perror("mount"); return 1; } if (mount("1", "A", NULL, MS_BIND, NULL) || mount(NULL, "A", NULL, MS_PRIVATE, NULL) || mount(NULL, "A", NULL, MS_SHARED, NULL)) { pr_perror("mount"); return 1; } if (mount("1", "B", NULL, MS_BIND, NULL) || mount(NULL, "B", NULL, MS_SLAVE, NULL)) { pr_perror("mount"); return 1; } if (mkdir("1/D", 0700)) { pr_perror("mkdir"); return 1; } if (mount("1/D", "2", NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } if (mount("1", "3", NULL, MS_BIND, NULL)) { pr_perror("mount"); return 1; } test_daemon(); test_waitsig(); if (mkdir("1/D/test", 0700)) { pr_perror("mkdir"); return 1; } if (mount("zdtm_shared", "1/D/test", "tmpfs", 
0, NULL)) { pr_perror("mount"); return 1; } if (mount(NULL, "3", NULL, MS_PRIVATE, NULL)) { pr_perror("mount"); return 1; } if (umount("B/D/test")) { pr_perror("umount"); return 1; } if (umount("2/test")) { pr_perror("umount"); return 1; } if (umount("3/D/test")) { pr_perror("umount"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_shared_bind03.desc000066400000000000000000000000731317335042600222400ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mntns_shared_vs_private.c000066400000000000000000000037241317335042600226550ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif const char *test_doc = "Check a private mount in a shared mount"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { char path[PATH_MAX]; pid_t pid; int status, i; task_waiter_t t; test_init(argc, argv); task_waiter_init(&t); snprintf(path, sizeof(path), "%s/fs", dirname); if (mkdir(dirname, 0700)) { pr_perror("mkdir"); return 1; } if (mount(NULL, "/", NULL, MS_SHARED, NULL)) { pr_perror("mount"); return 1; } if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) { pr_perror("mount"); return 1; } if (mkdir(path, 0700)) { pr_perror("mkdir"); return 1; } if (mount("zdtm_fs", path, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } for (i = 0; i < 2; i++) { pid = fork(); if (pid < 0) { pr_perror("fork"); return 1; } if (pid == 0) { unshare(CLONE_NEWNS); task_waiter_complete(&t, 1); task_waiter_wait4(&t, 2); return 0; } } for (i = 0; i < 2; i++) task_waiter_wait4(&t, 1); test_daemon(); test_waitsig(); if (umount(path)) { pr_perror("Unable to umount %s", path); return 1; } if 
(umount(dirname)) { pr_perror("Unable to umount %s", dirname); return 1; } for (i = 0; i < 2; i++) { task_waiter_complete(&t, 2); if (waitpid(-1, &status, 0) < 0) { pr_perror("waitpid %d", pid); return 1; } if (status) { pr_err("%d/%d/%d/%d\n", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); return 1; } } pass(); return 0; } criu-3.6/test/zdtm/static/mntns_shared_vs_private.desc000066400000000000000000000000731317335042600233430ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mount_paths.c000066400000000000000000000021161317335042600202610ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that special charecters in paths are handled correctly"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define TEST_DIR "tmpfs \t \t\\\\\t test \t\t\\\\ \t\\" int main(int argc, char **argv) { int ret = 1; char buf[1024], test_dir[PATH_MAX], fname[PATH_MAX]; test_init(argc, argv); mkdir(dirname, 0700); snprintf(test_dir, sizeof(test_dir), "%s/%s", dirname, TEST_DIR); mkdir(test_dir, 0700); if (mount("", test_dir, "tmpfs", 0, NULL)) { pr_perror("mount"); return 1; } snprintf(fname, sizeof(buf), "%s/\\\t \\\\ \\tt", test_dir); if (mkdir(fname, 0700)) { pr_perror("mkdir"); return 1; } test_daemon(); test_waitsig(); if (access(fname, F_OK)) { fail(); goto err; } pass(); ret = 0; err: umount2(test_dir, MNT_DETACH); rmdir(test_dir); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/mount_paths.desc000066400000000000000000000000461317335042600207550ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/mountpoints.c000066400000000000000000000167371317335042600203350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include 
"zdtmtst.h" const char *test_doc = "Check that mountpoints (in mount namespace) are supported"; const char *test_author = "Pavel Emelianov "; #define MPTS_ROOT "/zdtm_mpts/" #define NS_STACK_SIZE 4096 /* All arguments should be above stack, because it grows down */ struct ns_exec_args { char stack[NS_STACK_SIZE] __stack_aligned__; char stack_ptr[0]; int status_pipe[2]; }; task_waiter_t t; int ns_child(void *_arg) { struct stat st; pid_t pid; int fd, ufd; mkdir(MPTS_ROOT"/dev/mntns2", 0600); if (mount("none", MPTS_ROOT"/dev/mntns2", "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } mkdir(MPTS_ROOT"/dev/mntns2/test", 0600); fd = open(MPTS_ROOT"/dev/mntns2/test/test.file", O_WRONLY | O_CREAT, 0666); if (fd < 0) return 1; ufd = open(MPTS_ROOT"/dev/mntns2/test/test.file.unlinked", O_WRONLY | O_CREAT, 0666); if (ufd < 0) return 1; unlink(MPTS_ROOT"/dev/mntns2/test/test.file.unlinked"); pid = fork(); task_waiter_complete(&t, 1); test_waitsig(); if (pid) { int status = 1;; kill(pid, SIGTERM); wait(&status); if (status) return 1; } if (stat(MPTS_ROOT"/dev/mntns2/test", &st)) { pr_perror("Can't stat /dev/share-1/test.share/test.share"); return 1; } return 0; } int main(int argc, char **argv) { int fd, tmpfs_fd, have_bfmtm = 0; struct ns_exec_args args; pid_t pid = -1; test_init(argc, argv); task_waiter_init(&t); rmdir(MPTS_ROOT); if (mkdir(MPTS_ROOT, 0600) < 0) { fail("Can't make zdtm_sys"); return 1; } if (mount("none", MPTS_ROOT, "sysfs", 0, "") < 0) { fail("Can't mount sysfs"); return 1; } if (mount("none", MPTS_ROOT"/dev", "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } tmpfs_fd = open(MPTS_ROOT"/dev/test", O_WRONLY | O_CREAT); if (write(tmpfs_fd, "hello", 5) <= 0) { pr_perror("write() failed"); return 1; } /* Check that over-mounted files are restored on tmpfs */ mkdir(MPTS_ROOT"/dev/overmount", 0600); fd = open(MPTS_ROOT"/dev/overmount/test.over", O_WRONLY | O_CREAT); if (fd == -1) { pr_perror("Unable to open "MPTS_ROOT"/dev/overmount"); 
return -1; } close(fd); if (mount("none", MPTS_ROOT"/dev/overmount", "tmpfs", 0, "") < 0) { pr_perror("Can't mount "MPTS_ROOT"/dev/overmount"); return 1; } mkdir(MPTS_ROOT"/dev/non-root", 0600); if (mount(MPTS_ROOT"/dev/non-root", MPTS_ROOT"/module", NULL, MS_BIND, NULL) < 0) { pr_perror("Can't bind-mount %s -> %s", MPTS_ROOT"/dev/tdir", MPTS_ROOT"/module"); } mkdir(MPTS_ROOT"/dev/non-root/test", 0600); mkdir(MPTS_ROOT"/dev/share-1", 0600); if (mount("none", MPTS_ROOT"/dev/share-1/", "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } if (mount("none", MPTS_ROOT"/dev/share-1/", NULL, MS_SHARED, NULL) < 0) { fail("Can't mount tmpfs"); return 1; } //#define CR_NEXT #ifdef CR_NEXT mkdir(MPTS_ROOT"/dev/share-1/alone", 0600); if (mount("none", MPTS_ROOT"/dev/share-1/alone", "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } #endif mkdir(MPTS_ROOT"/dev/share-2", 0600); if (mount(MPTS_ROOT"/dev/share-1", MPTS_ROOT"/dev/share-2", NULL, MS_BIND, NULL) < 0) { fail("Can't bind mount a tmpfs directory"); return 1; } mkdir(MPTS_ROOT"/dev/share-3", 0600); if (mount(MPTS_ROOT"/dev/share-1", MPTS_ROOT"/dev/share-3", NULL, MS_BIND, NULL) < 0) { fail("Can't bind mount a tmpfs directory"); return 1; } mkdir(MPTS_ROOT"/dev/slave", 0600); if (mount(MPTS_ROOT"/dev/share-1", MPTS_ROOT"/dev/slave", NULL, MS_BIND, NULL) < 0) { fail("Can't bind mount a tmpfs directory"); return 1; } if (mount("none", MPTS_ROOT"/dev/slave", NULL, MS_SLAVE, NULL) < 0) { fail("Can't mount tmpfs"); return 1; } mkdir(MPTS_ROOT"/dev/slave2", 0600); if (mount(MPTS_ROOT"/dev/share-3", MPTS_ROOT"/dev/slave2", NULL, MS_BIND, NULL) < 0) { fail("Can't bind mount a tmpfs directory"); return 1; } if (mount("none", MPTS_ROOT"/dev/slave2", NULL, MS_SLAVE, NULL) < 0) { fail("Can't mount tmpfs"); return 1; } mkdir(MPTS_ROOT"/dev/share-1/test.mnt.share", 0600); if (mount("none", MPTS_ROOT"/dev/share-1/test.mnt.share", "tmpfs", 0, "size=1G") < 0) { fail("Can't mount tmpfs"); return 1; } 
mkdir(MPTS_ROOT"/dev/share-1/test.mnt.share/test.share", 0600); if (umount(MPTS_ROOT"/dev/slave2/test.mnt.share")) { pr_perror("Can't umount "MPTS_ROOT"/dev/slave2/test.mnt.share"); return 1; } mkdir(MPTS_ROOT"/dev/slave/test.mnt.slave", 0600); if (mount("none", MPTS_ROOT"/dev/slave/test.mnt.slave", "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } mkdir(MPTS_ROOT"/dev/slave/test.mnt.slave/test.slave", 0600); fd = open(MPTS_ROOT"/dev/bmfile", O_CREAT | O_WRONLY); if (fd < 0) { pr_perror("Can't create " MPTS_ROOT "/dev/share-1/bmfile"); return 1; } close(fd); fd = open(MPTS_ROOT"/dev/bmfile-mount", O_CREAT | O_WRONLY); if (fd < 0) { pr_perror("Can't create " MPTS_ROOT "/dev/share-1/bmfile"); return 1; } close(fd); if (mount(MPTS_ROOT"/dev/bmfile", MPTS_ROOT"/dev/bmfile-mount", NULL, MS_BIND, NULL) < 0) { fail("Can't mount tmpfs"); return 1; } if (mount("none", MPTS_ROOT"/kernel", "proc", 0, "") < 0) { fail("Can't mount proc"); return 1; } if (mount("none", MPTS_ROOT"/kernel/sys/fs/binfmt_misc", "binfmt_misc", 0, "") == 0) have_bfmtm = 1; fd = open(MPTS_ROOT"/kernel/meminfo", O_RDONLY); if (fd == -1) return 1; if (getenv("ZDTM_NOSUBNS") == NULL) { pid = clone(ns_child, args.stack_ptr, CLONE_NEWNS | SIGCHLD, &args); if (pid < 0) { pr_perror("Unable to fork child"); return 1; } } task_waiter_wait4(&t, 1); test_daemon(); test_waitsig(); /* this checks both -- sys and proc presence */ if (access(MPTS_ROOT"/kernel/meminfo", F_OK)) { fail("No proc after restore"); return 1; } if (have_bfmtm && access(MPTS_ROOT"/kernel/sys/fs/binfmt_misc/register", F_OK)) { fail("No binfmt_misc after restore"); return 1; } if (umount(MPTS_ROOT"/dev/overmount") == -1) { pr_perror("Can't umount "MPTS_ROOT"/dev/overmount"); return -1; } if (access(MPTS_ROOT"/dev/overmount/test.over", F_OK)) { fail(MPTS_ROOT"/dev/overmount/test.over"); return -1; } { struct stat st1, st2; if (stat(MPTS_ROOT"/dev/share-1/test.mnt.share/test.share", &st1)) { pr_perror("Can't stat 
/dev/share-1/test.share/test.share"); return 1; } if (stat(MPTS_ROOT"/dev/share-2/test.mnt.share/test.share", &st2)) { pr_perror("Can't stat /dev/share-2/test.mnt.share/test.share"); return 1; } if (st1.st_ino != st2.st_ino) { fail("/dev/share-1 and /dev/share-1 is not shared"); return 1; } if (stat(MPTS_ROOT"/dev/slave/test.mnt.share/test.share", &st2)) { pr_perror("Can't stat /dev/slave/test.mnt.share/test.share"); return 1; } if (st1.st_ino != st2.st_ino) { fail("/dev/slave is not slave of /dev/share-1"); return 1; } if (stat(MPTS_ROOT"/dev/share-1/test.mnt.slave/test.slave", &st1) != -1 || errno != ENOENT) { pr_perror("/dev/share-1/test.mnt.slave/test.slave exists"); return 1; } if (stat(MPTS_ROOT"/dev/slave/test.mnt.slave/test.slave", &st2)) { pr_perror("Can't stat /dev/slave/test.mnt.slave/test.slave"); return 1; } if (stat(MPTS_ROOT"/dev/non-root/test", &st1)) { pr_perror("Can't stat /dev/non-root/test"); return 1; } } if (pid > 0) { kill(pid, SIGTERM); int status = 1; wait(&status); if (status) return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/mountpoints.desc000066400000000000000000000000741317335042600210140ustar00rootroot00000000000000{'flavor': 'ns', 'flags': 'suid excl', 'feature': 'mnt_id'} criu-3.6/test/zdtm/static/mprotect00.c000066400000000000000000000045541317335042600177250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that memory protection migrates correctly\n"; const char *test_author = "Roman Kagan "; const static int prots[] = { PROT_NONE, PROT_READ, /* PROT_WRITE, */ /* doesn't work w/o READ */ PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE | PROT_EXEC, }; #define NUM_MPROTS sizeof(prots) / sizeof(int) static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ static void segfault(int signo) { siglongjmp(segv_ret, 1); } static int check_prot(char *ptr, int prot) { if 
(signal(SIGSEGV, segfault) == SIG_ERR) { fail("setting SIGSEGV handler failed: %m\n"); return -1; } if (!sigsetjmp(segv_ret, 1)) { if (ptr[10] != 0) { fail("read value doesn't match what I wrote"); return -1; } if (!(prot & PROT_READ)) { fail("PROT_READ bypassed\n"); return -1; } } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_READ) { fail("PROT_READ rejected\n"); return -1; } if (!sigsetjmp(segv_ret, 1)) { ptr[20] = 67; if (!(prot & PROT_WRITE)) { fail("PROT_WRITE bypassed\n"); return -1; } } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_WRITE) { fail("PROT_WRITE rejected\n"); return -1; } if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { fail("restoring SIGSEGV handler failed: %m\n"); return -1; } return 0; } int main(int argc, char ** argv) { char *ptr, *ptr_aligned; int pagesize; int i; test_init(argc, argv); pagesize = sysconf(_SC_PAGESIZE); if (pagesize < 0) { pr_perror("can't get PAGE_SIZE"); exit(1); } ptr = mmap(NULL, pagesize * (NUM_MPROTS + 1), PROT_NONE, MAP_PRIVATE | MAP_ANON, 0, 0); if (ptr == MAP_FAILED) { pr_perror("calloc failed"); return -1; } ptr_aligned = (char *)(((unsigned long) ptr + pagesize - 1) & ~(pagesize - 1)); for (i = 0; i < NUM_MPROTS; i++) if (mprotect(ptr_aligned + pagesize * i, pagesize / 2, prots[i]) < 0) { pr_perror("mprotect failed"); exit(1); } test_daemon(); test_waitsig(); for (i = 0; i < NUM_MPROTS; i++) if (check_prot(ptr_aligned + pagesize * i, prots[i])) goto out; pass(); out: return 0; } criu-3.6/test/zdtm/static/msgque.c000066400000000000000000000057201317335042600172250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc="Tests sysv5 msg queues supporting by checkpointing"; const char *test_author="Stanislav Kinsbursky "; struct msg1 { long mtype; char mtext[30]; }; #define TEST_STRING "Test sysv5 msg" #define MSG_TYPE 1 #define ANOTHER_TEST_STRING 
"Yet another test sysv5 msg" #define ANOTHER_MSG_TYPE 26538 int main(int argc, char **argv) { key_t key; int msg, pid; struct msg1 msgbuf; int chret; test_init(argc, argv); key = ftok(argv[0], 822155650); if (key == -1) { pr_perror("Can't make key"); exit(1); } pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); exit(1); } msg = msgget(key, IPC_CREAT | IPC_EXCL | 0666); if (msg == -1) { msg = msgget(key, 0666); if (msg == -1) { pr_perror("Can't get queue"); goto err_kill; } } if (pid == 0) { test_waitsig(); if (msgrcv(msg, &msgbuf, sizeof(TEST_STRING), MSG_TYPE, IPC_NOWAIT) == -1) { fail("Child: msgrcv failed (%m)"); return -errno; } if (strncmp(TEST_STRING, msgbuf.mtext, sizeof(TEST_STRING))) { fail("Child: the source and received strings aren't equal"); return -errno; } test_msg("Child: received %s\n", msgbuf.mtext); msgbuf.mtype = ANOTHER_MSG_TYPE; memcpy(msgbuf.mtext, ANOTHER_TEST_STRING, sizeof(ANOTHER_TEST_STRING)); if (msgsnd(msg, &msgbuf, sizeof(ANOTHER_TEST_STRING), IPC_NOWAIT) != 0) { fail("Child: msgsnd failed (%m)"); return -errno; }; pass(); return 0; } else { msgbuf.mtype = MSG_TYPE; memcpy(msgbuf.mtext, TEST_STRING, sizeof(TEST_STRING)); if (msgsnd(msg, &msgbuf, sizeof(TEST_STRING), IPC_NOWAIT) != 0) { fail("Parent: msgsnd failed (%m)"); goto err_kill; }; msgbuf.mtype = ANOTHER_MSG_TYPE; memcpy(msgbuf.mtext, ANOTHER_TEST_STRING, sizeof(ANOTHER_TEST_STRING)); if (msgsnd(msg, &msgbuf, sizeof(ANOTHER_TEST_STRING), IPC_NOWAIT) != 0) { fail("child: msgsnd (2) failed (%m)"); return -errno; }; test_daemon(); test_waitsig(); kill(pid, SIGTERM); wait(&chret); chret = WEXITSTATUS(chret); if (chret) { fail("Parent: child exited with non-zero code %d (%s)\n", chret, strerror(chret)); goto out; } if (msgrcv(msg, &msgbuf, sizeof(ANOTHER_TEST_STRING), ANOTHER_MSG_TYPE, IPC_NOWAIT) == -1) { fail("Parent: msgrcv failed (%m)"); goto err; } if (strncmp(ANOTHER_TEST_STRING, msgbuf.mtext, sizeof(ANOTHER_TEST_STRING))) { fail("Parent: the source and received 
strings aren't equal"); goto err; } test_msg("Parent: received %s\n", msgbuf.mtext); pass(); } out: if (msgctl(msg, IPC_RMID, 0)) { fail("Failed to destroy message queue: %d\n", -errno); return -errno; } return chret; err_kill: kill(pid, SIGKILL); wait(NULL); err: chret = -errno; goto out; } criu-3.6/test/zdtm/static/msgque.desc000066400000000000000000000000251317335042600177120ustar00rootroot00000000000000{'flavor': 'ns uns'} criu-3.6/test/zdtm/static/mtime_mmap.c000066400000000000000000000043751317335042600200560ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "file mmaped for write and being written should change mtime\n" "and be migrated with correct new data"; char *filename; TEST_OPTION(filename, string, "file name", 1); #define FILE_SIZE (16 * 1024) int main(int argc, char **argv) { int fd; char buf[FILE_SIZE]; size_t count; int i; char *ptr; struct stat fst; time_t mtime_old, mtime_new; time_t ctime_old, ctime_new; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT, 0666); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } /* initialization */ count = sizeof(buf); memset(buf, 1, count); if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("failed to write %s", filename); exit(1); } if (fstat(fd, &fst) < 0) { pr_perror("can't get %s file info", filename); goto failed; } ptr = (char *)mmap(NULL, count, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (ptr == MAP_FAILED) { pr_perror("mmap() Failed, errno=%d : %s", errno, strerror(errno)); goto failed; } mtime_old = fst.st_mtime; ctime_old = fst.st_ctime; sleep(2); for (i = 0; i < count; i++) ptr[i]++; if (munmap(ptr, count)) { pr_perror("munmap Failed, errno=%d : %s", errno, strerror(errno)); goto failed; } if (fstat(fd, &fst) < 0) { pr_perror("can't get %s file info", filename); goto failed; } mtime_new = fst.st_mtime; /* time of last modification */ if (mtime_new <= 
mtime_old) { fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); goto failed; } ctime_new = fst.st_ctime; /* time of last status change */ if (ctime_new <= ctime_old) { fail("time of last status change of %s file wasn't changed\n", filename); goto failed; } test_daemon(); test_waitsig(); if (fstat(fd, &fst) < 0) { pr_perror("can't get %s file info", filename); goto failed; } /* time of last modification */ if (fst.st_mtime != mtime_new) { fail("After migration, mtime changed to %ld", fst.st_mtime); goto failed; } pass(); unlink(filename); close(fd); return 0; failed: return 1; } criu-3.6/test/zdtm/static/netns-dev.c000066400000000000000000000246431317335042600176340ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" #define LO_CONF_DIR_PATH "/proc/sys/net/ipv4/conf/lo" #define DEF_CONF_DIR_PATH "/proc/sys/net/ipv4/conf/default" #define ALL_CONF_DIR_PATH "/proc/sys/net/ipv4/conf/all" #define LO_CONF6_DIR_PATH "/proc/sys/net/ipv6/conf/lo" #define DEF_CONF6_DIR_PATH "/proc/sys/net/ipv6/conf/default" #define ALL_CONF6_DIR_PATH "/proc/sys/net/ipv6/conf/all" #define INT_MAX ((int)(~0U>>1)) #define INT_MIN (-INT_MAX - 1) char *devconfs4[] = { "accept_local", "accept_source_route", "arp_accept", "arp_announce", "arp_filter", "arp_ignore", "arp_notify", "bootp_relay", "disable_policy", "disable_xfrm", "drop_gratuitous_arp", "drop_unicast_in_l2_multicast", "force_igmp_version", "forwarding", "accept_redirects", "igmpv2_unsolicited_report_interval", "igmpv3_unsolicited_report_interval", "ignore_routes_with_linkdown", "log_martians", "mc_forwarding", "medium_id", "promote_secondaries", "proxy_arp", "proxy_arp_pvlan", "route_localnet", "rp_filter", "secure_redirects", "send_redirects", "shared_media", "src_valid_mark", "tag", NULL, }; struct range { int min; int max; }; struct range rand_range4[] = { {0, 1}, /* accept_local */ {-1, 0}, /* accept_source_route */ {0, 1}, /* arp_accept */ {0, 2}, /* arp_announce */ {0, 1}, /* 
arp_filter */ {0, 8}, /* arp_ignore */ {0, 1}, /* arp_notify */ {0, 1}, /* bootp_relay */ {0, 1}, /* disable_policy */ {0, 1}, /* disable_xfrm */ {0, 1}, /* drop_gratuitous_arp */ {0, 1}, /* drop_unicast_in_l2_multicast */ {0, INT_MAX}, /* force_igmp_version */ {0, 1}, /* forwarding */ {0, 1}, /* accept_redirects */ {0, INT_MAX}, /* igmpv2_unsolicited_report_interval */ {0, INT_MAX}, /* igmpv3_unsolicited_report_interval */ {0, 1}, /* ignore_routes_with_linkdown */ {0, 1}, /* log_martians */ {0, 1}, /* mc_forwarding */ {-1, INT_MAX}, /* medium_id */ {0, 1}, /* promote_secondaries */ {0, 1}, /* proxy_arp */ {0, 1}, /* proxy_arp_pvlan */ {0, 1}, /* route_localnet */ {0, 2}, /* rp_filter */ {0, 1}, /* secure_redirects */ {0, 1}, /* send_redirects */ {0, 1}, /* shared_media */ {0, 1}, /* src_valid_mark */ {INT_MIN, INT_MAX}, /* tag */ }; char *devconfs6[] = { "accept_dad", "accept_ra", "accept_ra_defrtr", "accept_ra_from_local", "accept_ra_min_hop_limit", "accept_ra_mtu", "accept_ra_pinfo", "accept_ra_rt_info_max_plen", "accept_ra_rtr_pref", "accept_source_route", "autoconf", "dad_transmits", "disable_ipv6", "drop_unicast_in_l2_multicast", "drop_unsolicited_na", "force_mld_version", "force_tllao", "forwarding", "accept_redirects", "hop_limit", "ignore_routes_with_linkdown", "keep_addr_on_down", "max_addresses", "max_desync_factor", "mldv1_unsolicited_report_interval", "mldv2_unsolicited_report_interval", "mtu", "ndisc_notify", "optimistic_dad", "proxy_ndp", "regen_max_retry", "router_probe_interval", "router_solicitation_delay", "router_solicitation_interval", "router_solicitations", "suppress_frag_ndisc", "temp_prefered_lft", "temp_valid_lft", "use_oif_addrs_only", "use_optimistic", "use_tempaddr", NULL, }; #define IPV6_MIN_MTU 1280 #define ROUTER_MAX 60 /* According to kernel docs do not make max_addresses too large */ #define MAX_ADDRESSES 128 struct range rand_range6[] = { {0, 2}, /* accept_dad */ {0, 2}, /* accept_ra */ {0, 1}, /* accept_ra_defrtr */ {0, 1}, /* 
accept_ra_from_local */ {0, INT_MAX}, /* accept_ra_min_hop_limit */ {0, 1}, /* accept_ra_mtu */ {0, 1}, /* accept_ra_pinfo */ {0, INT_MAX}, /* accept_ra_rt_info_max_plen */ {0, 1}, /* accept_ra_rtr_pref */ {-1, 0}, /* accept_source_route */ {0, 1}, /* autoconf */ {0, INT_MAX}, /* dad_transmits */ {0, 1}, /* disable_ipv6 */ {0, 1}, /* drop_unicast_in_l2_multicast */ {0, 1}, /* drop_unsolicited_na */ {0, 2}, /* force_mld_version */ {0, 1}, /* force_tllao */ {0, 1}, /* forwarding */ {0, 1}, /* accept_redirects */ {1, 255}, /* hop_limit */ {0, 1}, /* ignore_routes_with_linkdown */ {-1, 1}, /* keep_addr_on_down */ {0, MAX_ADDRESSES}, /* max_addresses */ {0, INT_MAX}, /* max_desync_factor */ {0, INT_MAX}, /* mldv1_unsolicited_report_interval */ {0, INT_MAX}, /* mldv2_unsolicited_report_interval */ {IPV6_MIN_MTU, IPV6_MIN_MTU}, /* mtu */ {0, 1}, /* ndisc_notify */ {0, 1}, /* optimistic_dad */ {0, 1}, /* proxy_ndp */ {0, INT_MAX}, /* regen_max_retry */ {0, ROUTER_MAX}, /* router_probe_interval */ {0, ROUTER_MAX}, /* router_solicitation_delay */ {0, ROUTER_MAX}, /* router_solicitation_interval */ {0, ROUTER_MAX}, /* router_solicitations */ {0, 1}, /* suppress_frag_ndisc */ {0, INT_MAX}, /* temp_prefered_lft */ {0, INT_MAX}, /* temp_valid_lft */ {0, 1}, /* use_oif_addrs_only */ {0, 1}, /* use_optimistic */ {0, 2}, /* use_tempaddr */ }; struct test_conf { int ipv4_conf[ARRAY_SIZE(devconfs4)]; int ipv4_conf_rand[ARRAY_SIZE(devconfs4)]; int ipv6_conf[ARRAY_SIZE(devconfs6)]; int ipv6_conf_rand[ARRAY_SIZE(devconfs6)]; char *dir4; char *dir6; } lo, def, all; static int save_conf(FILE *fp, int *conf, int *conf_rand, struct range *range, char *path) { int ret; /* * Save */ ret = fscanf(fp, "%d", conf); if (ret != 1) { pr_perror("fscanf"); return -1; } return 0; } static int rand_in_small_range(struct range *r) { return lrand48() % (r->max - r->min + 1) + r->min; } static int rand_in_range(struct range *r) { struct range small; int mid = r->max / 2 + r->min / 2; int half = r->max / 2 
- r->min / 2; if (half < INT_MAX / 2) return rand_in_small_range(r); if (lrand48() % 2) { small.min = r->min; small.max = mid; } else { small.min = mid + 1; small.max = r->max; } return rand_in_small_range(&small); } static int gen_conf(FILE *fp, int *conf, int *conf_rand, struct range *range, char *path) { int ret; /* * Set random value */ *conf_rand = rand_in_range(range); ret = fprintf(fp, "%d", *conf_rand); if (ret < 0) { pr_perror("fprintf"); return -1; } return 0; } #define MAX_MSEC_GRANULARITY 10 static int check_conf(FILE *fp, int *conf, int *conf_rand, struct range *range, char *path) { int ret; int val; /* * Check opt */ ret = fscanf(fp, "%d", &val); if (ret != 1) { pr_perror("fscanf"); return -1; } if (val != *conf_rand) { fail("Option \"%s\" changed from %d to %d", path, *conf_rand, val); if ((strstr(path, "mldv1_unsolicited_report_interval") || strstr(path, "mldv2_unsolicited_report_interval")) && val - *conf_rand < MAX_MSEC_GRANULARITY) return 0; return -1; } return 0; } static int restore_conf(FILE *fp, int *conf, int *conf_rand, struct range *range, char *path) { int ret; /* * Restore opt */ ret = fprintf(fp, "%d", *conf); if (ret < 0) { pr_perror("fprintf"); return -1; } return 0; } static int for_each_option_do(int (*f)(FILE *fp, int *conf, int *conf_rand, struct range *range, char *path), struct test_conf *tc) { int ret; int i; for (i = 0; devconfs4[i]; i++) { FILE *fp; char path[PATH_MAX]; ret = snprintf(path, sizeof(path), "%s/%s", tc->dir4, devconfs4[i]); if (ret < 0) { pr_perror("snprintf"); return -1; } ret = access(path, W_OK); if (ret < 0) continue; fp = fopen(path, "r+"); if (fp == NULL) { pr_perror("fopen"); return -1; } ret = (*f)(fp, &tc->ipv4_conf[i], &tc->ipv4_conf_rand[i], &rand_range4[i], path); if (ret < 0) return -1; fclose(fp); } for (i = 0; devconfs6[i]; i++) { FILE *fp; char path[PATH_MAX]; ret = snprintf(path, sizeof(path), "%s/%s", tc->dir6, devconfs6[i]); if (ret < 0) { pr_perror("snprintf"); return -1; } ret = access(path, 
W_OK); if (ret < 0) continue; fp = fopen(path, "r+"); if (fp == NULL) { pr_perror("fopen"); return -1; } ret = (*f)(fp, &tc->ipv6_conf[i], &tc->ipv6_conf_rand[i], &rand_range6[i], path); if (ret < 0) return -1; fclose(fp); } return 0; } #define IPV6ADDR_EXAMPLE "2607:f0d0:1002:0051:0000:0000:0000:0004" #define MAX_STR_CONF_LEN 200 static int set_stable_secret(struct test_conf *tc) { int ret; FILE *fp; char path[PATH_MAX]; ret = snprintf(path, sizeof(path), "%s/%s", tc->dir6, "stable_secret"); if (ret < 0) { pr_perror("snprintf"); return -1; } ret = access(path, W_OK); if (ret < 0) return 0; fp = fopen(path, "r+"); if (fp == NULL) { pr_perror("fopen"); return -1; } ret = fprintf(fp, IPV6ADDR_EXAMPLE); if (ret < 0) { pr_perror("fprintf"); fclose(fp); return -1; } fclose(fp); return 0; } static int check_stable_secret(struct test_conf *tc) { int ret; FILE *fp; char path[PATH_MAX]; char val[MAX_STR_CONF_LEN+1]; ret = snprintf(path, sizeof(path), "%s/%s", tc->dir6, "stable_secret"); if (ret < 0) { pr_perror("snprintf"); return -1; } ret = access(path, W_OK); if (ret < 0) return 0; fp = fopen(path, "r+"); if (fp == NULL) { pr_perror("fopen"); return -1; } ret = fscanf(fp, "%s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); return -1; } if (strcmp(val, IPV6ADDR_EXAMPLE)) { fail("Option \"%s\" changed from %s to %s", path, IPV6ADDR_EXAMPLE, val); fclose(fp); return -1; } fclose(fp); return 0; } int main(int argc, char **argv) { int ret; lo.dir4 = LO_CONF_DIR_PATH; def.dir4 = DEF_CONF_DIR_PATH; all.dir4 = ALL_CONF_DIR_PATH; lo.dir6 = LO_CONF6_DIR_PATH; def.dir6 = DEF_CONF6_DIR_PATH; all.dir6 = ALL_CONF6_DIR_PATH; test_init(argc, argv); ret = for_each_option_do(save_conf, &all); if (ret < 0) return -1; ret = for_each_option_do(save_conf, &def); if (ret < 0) return -1; ret = for_each_option_do(save_conf, &lo); if (ret < 0) return -1; ret = for_each_option_do(gen_conf, &all); if (ret < 0) return -1; ret = for_each_option_do(gen_conf, &def); if (ret < 0) return -1; ret 
= for_each_option_do(gen_conf, &lo); if (ret < 0) return -1; ret = set_stable_secret(&def); if (ret < 0) return -1; ret = set_stable_secret(&lo); if (ret < 0) return -1; test_daemon(); test_waitsig(); ret = for_each_option_do(check_conf, &all); if (ret < 0) return -1; ret = for_each_option_do(check_conf, &def); if (ret < 0) return -1; ret = for_each_option_do(check_conf, &lo); if (ret < 0) return -1; ret = for_each_option_do(restore_conf, &all); if (ret < 0) return -1; ret = for_each_option_do(restore_conf, &def); if (ret < 0) return -1; ret = for_each_option_do(restore_conf, &lo); if (ret < 0) return -1; ret = check_stable_secret(&def); if (ret < 0) return -1; ret = check_stable_secret(&lo); if (ret < 0) return -1; pass(); return 0; } criu-3.6/test/zdtm/static/netns-dev.desc000066400000000000000000000000461317335042600203170ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/netns-nf.c000066400000000000000000000016301317335042600174500ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that netfilter rules (some) are kept"; const char *test_author = "Pavel Emelianov "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { char cmd[128]; test_init(argc, argv); if (system("iptables -A INPUT -t filter --protocol icmp -j DROP")) { pr_perror("Can't set input rule"); return -1; } sprintf(cmd, "iptables -L > pre-%s", filename); if (system(cmd)) { pr_perror("Can't save iptables"); return -1; } test_daemon(); test_waitsig(); sprintf(cmd, "iptables -L > post-%s", filename); if (system(cmd)) { fail("Can't get iptables"); return -1; } sprintf(cmd, "diff pre-%s post-%s", filename, filename); if (system(cmd)) { fail("Iptables differ"); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/netns-nf.desc000066400000000000000000000006471317335042600201530ustar00rootroot00000000000000{ 'deps': [ '/bin/sh', '/sbin/iptables', 
/*
 * Check that network environment (links, addresses and routes) are
 * preserved.
 *
 * Configures lo with an address and two routes, dumps the output of
 * "ip link", "ip addr" and "ip route" into netns.dump.test before the
 * checkpoint/restore point and into netns.rst.test after it, and compares
 * the two files with diff.
 *
 * Returns 0 on success, -1 on failure.
 */
int main(int argc, char **argv)
{
	test_init(argc, argv);

	if (system("ip link set lo up")) {
		fail("Can't set lo up");
		return -1;
	}

	if (system("ip addr add 1.2.3.4 dev lo")) {
		fail("Can't add addr on lo");
		return -1;
	}

	if (system("ip route add 1.2.3.5 dev lo")) {
		fail("Can't add route via lo");
		return -1;
	}

	if (system("ip route add 1.2.3.6 via 1.2.3.5")) {
		fail("Can't add route via lo (2)");
		return -1;
	}

	/* Report a failed dump right away instead of sleeping first. */
	if (system("ip link > netns.dump.test && ip addr >> netns.dump.test && ip route >> netns.dump.test")) {
		fail("Can't save net config");
		return -1;
	}

	test_daemon();
	test_waitsig();

	if (system("ip link > netns.rst.test && ip addr >> netns.rst.test && ip route >> netns.rst.test")) {
		fail("Can't get net config");
		return -1;
	}

	if (system("diff netns.rst.test netns.dump.test")) {
		fail("Net config differs after restore");
		return -1;
	}

	pass();
	return 0;
}
char fl_flag[10], fl_type[15], fl_option[10], fl_end[32]; long long start; int num; if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); return -1; } if (strcmp(fl_flag, "OFDLCK")) return 1; lck->l_start = start; if (strcmp(fl_end, "EOF")) { unsigned long end; if (sscanf(fl_end, "%lu", &end) <= 0) { pr_err("Invalid lock entry\n"); return -1; } lck->l_len = end - lck->l_start + 1; } else { lck->l_len = 0; } if (strcmp(fl_option, "WRITE") == 0) lck->l_type = F_WRLCK; else lck->l_type = F_RDLCK; return 0; } static int read_fd_ofd_lock(int pid, int fd, struct flock64 *lck) { char path[PATH_MAX]; char buf[100]; int num; FILE *proc_file = NULL; sprintf(path, "/proc/%i/fdinfo/%i", pid, fd); proc_file = fopen(path, "r"); if (!proc_file) { pr_err("Can't open %s\n", path); return -1; } num = -1; while (fgets(buf, sizeof(buf), proc_file)) { num = parse_ofd_lock(buf, lck); if (num <= 0) break; } if (fclose(proc_file)) { pr_err("Can't close %s\n", path); return -1; } return num; } int check_lock_exists(const char *filename, struct flock64 *lck) { int ret = -1; int fd; fd = open(filename, O_RDWR, 0666); if (lck->l_type == F_RDLCK) { /* check, that there is no write lock */ ret = fcntl(fd, F_OFD_GETLK, lck); if (ret) { pr_err("fcntl failed (%i)\n", ret); goto out; } if (lck->l_type != F_UNLCK) { pr_err("OFD lock type do not match\n"); goto out; } } /* check, that lock is set */ lck->l_type = F_WRLCK; ret = fcntl(fd, F_OFD_GETLK, lck); if (ret) { pr_err("fcntl failed (%i)\n", ret); goto out; } if (lck->l_type == F_UNLCK) { pr_err("Lock not found\n"); goto out; } ret = 0; out: if (close(fd)) return -1; return ret; } static int check_file_locks_match(struct flock64 *orig_lck, struct flock64 *lck) { return orig_lck->l_start == lck->l_start && orig_lck->l_len == lck->l_len && orig_lck->l_type == 
/*
 * Write @value, formatted as a decimal string, to the file at @path.
 *
 * Only the formatted digits are written; the terminating NUL and the unused
 * tail of the buffer are never sent to the kernel.
 *
 * Returns 0 on success and -1 on failure.
 */
int set_oom_score_adj(const char *path, int value)
{
	int fd, len, ret = 0;
	char buf[16]; /* large enough for any int, sign and NUL included */

	fd = open(path, O_RDWR);
	if (fd < 0) {
		pr_perror("Failed to open %s", path);
		return -1;
	}

	len = snprintf(buf, sizeof(buf), "%d", value);
	if (len < 0 || (size_t)len >= sizeof(buf)) {
		pr_perror("Can't format value %d", value);
		ret = -1;
	} else if (write(fd, buf, len) != len) {
		pr_perror("Write %s to %s failed", buf, path);
		ret = -1;
	}

	close(fd);
	return ret;
}
int new_oom_score_adj; test_init(argc, argv); if (set_oom_score_adj(oom_score_adj_self, test_value) < 0) return -1; test_daemon(); test_waitsig(); new_oom_score_adj = get_oom_score_adj(oom_score_adj_self, &ret); if (ret < 0) return -1; if (new_oom_score_adj != test_value) { fail("OOM score value %d is different after restore: %d\n", test_value, new_oom_score_adj); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/overmount_dev.c000066400000000000000000000034301317335042600206140ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a device special file " "open in a directory which has been mounted over by " "another filesystem"; const char *test_author = "Roman Kagan "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { int fd; char path[256]; struct stat st; /* /dev/null params - sure to exist in a VPS */ mode_t mode = S_IFCHR | 0700; dev_t dev = makedev(1, 3); test_init(argc, argv); if (snprintf(path, sizeof(path), "%s/foo", dirname) >= sizeof(path)) { pr_perror("directory name \"%s\"is too long", dirname); exit(1); } if (mkdir(dirname, 0700)) { pr_perror("can't make directory %s", dirname); exit(1); } if (mknod(path, mode, dev)) { pr_perror("can't make device file \"%s\"", path); exit(1); } fd = open(path, O_RDWR); if (fd < 0) { pr_perror("can't open %s", path); goto rmdir; } if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { pr_perror("can't mount tmpfs over %s", dirname); goto cleanup; } test_daemon(); test_waitsig(); if (umount(dirname) < 0) { fail("can't umount %s: %m", dirname); goto cleanup; } if (close(fd) < 0) { fail("can't close %s: %m", path); goto unlink; } if (stat(path, &st) < 0) { fail("can't stat %s: %m", path); goto unlink; } if (st.st_mode != mode || st.st_rdev != dev) { fail("%s is no longer the device file we had", path); goto unlink; } if (unlink(path) < 0) { fail("can't 
unlink %s: %m", path); goto rmdir; } pass(); goto rmdir; cleanup: close(fd); unlink: unlink(path); rmdir: rmdir(dirname); return 0; } criu-3.6/test/zdtm/static/overmount_dev.desc000066400000000000000000000000521317335042600213050ustar00rootroot00000000000000{'flavor' : "ns", 'flags': 'suid crfail'} criu-3.6/test/zdtm/static/overmount_fifo.c000066400000000000000000000032021317335042600207560ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a named pipe " "open in a directory which has been mounted over by " "another filesystem"; const char *test_author = "Roman Kagan "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { int fd; char path[256]; struct stat st; mode_t mode = S_IFIFO | 0700; test_init(argc, argv); if (snprintf(path, sizeof(path), "%s/foo", dirname) >= sizeof(path)) { pr_perror("directory name \"%s\"is too long", dirname); exit(1); } if (mkdir(dirname, 0700)) { pr_perror("can't make directory %s", dirname); exit(1); } if (mknod(path, mode, 0)) { pr_perror("can't make fifo \"%s\"", path); exit(1); } fd = open(path, O_RDWR); if (fd < 0) { pr_perror("can't open %s", path); goto rmdir; } if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { pr_perror("can't mount tmpfs over %s", dirname); goto cleanup; } test_daemon(); test_waitsig(); if (umount(dirname) < 0) { fail("can't umount %s: %m", dirname); goto cleanup; } if (close(fd) < 0) { fail("can't close %s: %m", path); goto unlink; } if (stat(path, &st) < 0) { fail("can't stat %s: %m", path); goto unlink; } if (st.st_mode != mode) { fail("%s is no longer the fifo we had", path); goto unlink; } if (unlink(path) < 0) { fail("can't unlink %s: %m", path); goto rmdir; } pass(); goto rmdir; cleanup: close(fd); unlink: unlink(path); rmdir: rmdir(dirname); return 0; } 
criu-3.6/test/zdtm/static/overmount_fifo.desc000066400000000000000000000000561317335042600214560ustar00rootroot00000000000000{'flavor' : 'ns uns', 'flags': 'suid crfail'} criu-3.6/test/zdtm/static/overmount_file.c000066400000000000000000000025361317335042600207630ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can't migrate with a file open in a " "directory which has been mounted over by another " "filesystem"; const char *test_author = "Roman Kagan "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { int fd; char path[256]; test_init(argc, argv); if (snprintf(path, sizeof(path), "%s/foo", dirname) >= sizeof(path)) { pr_perror("directory name \"%s\"is too long", dirname); exit(1); } if (mkdir(dirname, 0700)) { pr_perror("can't make directory %s", dirname); exit(1); } fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", path); goto rmdir; } if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { pr_perror("can't mount tmpfs over %s", dirname); goto cleanup; } test_daemon(); test_waitsig(); if (umount(dirname) < 0) { fail("can't umount %s: %m", dirname); goto cleanup; } if (close(fd) < 0) { fail("can't close %s: %m", path); goto unlink; } if (unlink(path) < 0) { fail("can't unlink %s: %m", path); goto rmdir; } pass(); goto rmdir; cleanup: close(fd); unlink: unlink(path); rmdir: rmdir(dirname); return 0; } criu-3.6/test/zdtm/static/overmount_file.desc000066400000000000000000000000561317335042600214520ustar00rootroot00000000000000{'flavor' : 'ns uns', 'flags': 'suid crfail'} criu-3.6/test/zdtm/static/overmount_sock.c000066400000000000000000000073541317335042600210060ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a unix socket " "bound in a directory which has 
/*
 * Initialise @name as an AF_LOCAL address that points at @filename.
 *
 * Returns 0 on success, or -1 (leaving @name untouched) when @filename plus
 * its terminating NUL does not fit into sun_path.
 */
static int fill_sock_name(struct sockaddr_un *name, const char *filename)
{
	size_t len = strlen(filename);

	if (len < sizeof(name->sun_path)) {
		/* len + 1 copies the terminating NUL as well */
		memcpy(name->sun_path, filename, len + 1);
		name->sun_family = AF_LOCAL;
		return 0;
	}

	return -1;
}
_exit(errno); close(sock); _exit(0); } acc_sock = accept(sock, NULL, NULL); if (acc_sock < 0) { pr_perror("can't accept() the connection on \"%s\"", path); goto out_kill; } close(sock); sock = acc_sock; if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { pr_perror("can't mount tmpfs over %s", dirname); goto out_kill; } test_daemon(); test_waitsig(); if (kill(pid, SIGTERM)) { fail("terminating the child failed: %m\n"); goto out; } if (wait(&ret) != pid) { fail("wait() returned wrong pid %d: %m\n", pid); goto out; } if (WIFEXITED(ret)) { ret = WEXITSTATUS(ret); if (ret) { fail("child exited with nonzero code %d (%s)\n", ret, strerror(ret)); goto out; } } if (WIFSIGNALED(ret)) { fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); goto out; } if (read(sock, buf, sizeof(buf)) != sizeof(buf)) { fail("can't read %s: %m\n", path); goto out; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("CRC mismatch\n"); goto out; } if (umount(dirname) < 0) { fail("can't umount %s: %m", dirname); goto out; } if (close(sock) < 0) { fail("can't close %s: %m", path); goto out; } if (unlink(path) < 0) { fail("can't unlink %s: %m", path); goto out; } pass(); out_kill: kill(pid, SIGKILL); out: close(sock); unlink(path); rmdir(dirname); return 0; } criu-3.6/test/zdtm/static/overmount_sock.desc000066400000000000000000000000561317335042600214720ustar00rootroot00000000000000{'flavor' : 'ns uns', 'flags': 'suid crfail'} criu-3.6/test/zdtm/static/packet_sock.c000066400000000000000000000154231317335042600202130ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "static test for packet sockets"; const char *test_author = "Pavel Emelyanov "; /* * Description: * Create and bind several packet sockets, check thet getname * reports same result before and after c/r cycle. This is enough * for _basic_ packet functionality only, but still. 
/*
 * Compare the family, protocol and interface index of two packet socket
 * addresses. @n is the socket number used in the failure message.
 *
 * Returns 0 when all three fields match, 1 (after reporting the first
 * differing field) otherwise.
 */
static int test_sockaddr(int n, struct sockaddr_ll *have, struct sockaddr_ll *want)
{
	int mismatch = 1;

	if (have->sll_family != want->sll_family) {
		fail("%d Family mismatch %d/%d", n,
		     (int)have->sll_family, (int)want->sll_family);
	} else if (have->sll_protocol != want->sll_protocol) {
		fail("%d Proto mismatch %d/%d", n,
		     (int)have->sll_protocol, (int)want->sll_protocol);
	} else if (have->sll_ifindex != want->sll_ifindex) {
		fail("%d Index mismatch %d/%d", n,
		     have->sll_ifindex, want->sll_ifindex);
	} else {
		/* all the others are derivatives from dev */
		mismatch = 0;
	}

	return mismatch;
}
{ pr_perror("Can't get sockname 1"); return 1; } alen = sizeof(addr2); if (getsockname(sk2, (struct sockaddr *)&addr2, &alen) < 0) { pr_perror("Can't get sockname 2"); return 1; } ver = TPACKET_V2; if (setsockopt(sk1, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0) { pr_perror("Can't set version"); return 1; } yes = 1; if (setsockopt(sk1, SOL_PACKET, PACKET_AUXDATA, &yes, sizeof(yes)) < 0) { pr_perror("Can't set auxdata"); return 1; } memset(&ring, 0, sizeof(ring)); ring.tp_block_size = PAGE_SIZE; ring.tp_block_nr = 1; ring.tp_frame_size = 1024; ring.tp_frame_nr = (ring.tp_block_size / ring.tp_frame_size) * ring.tp_block_nr; if (setsockopt(sk1, SOL_PACKET, PACKET_RX_RING, &ring, sizeof(ring)) < 0) { pr_perror("Can't set rx ring"); return 1; } rsv = SK_RESERVE; if (setsockopt(sk2, SOL_PACKET, PACKET_RESERVE, &rsv, sizeof(rsv)) < 0) { pr_perror("Can't set reserve"); return 1; } yes = 1; if (setsockopt(sk2, SOL_PACKET, PACKET_ORIGDEV, &yes, sizeof(yes)) < 0) { pr_perror("Can't set origdev"); return 1; } yes = DEF_FANOUT; if (setsockopt(sk2, SOL_PACKET, PACKET_FANOUT, &yes, sizeof(yes)) < 0) { pr_perror("Can't configure fanout"); return 1; } memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = 1; mreq.mr_type = PACKET_MR_PROMISC; if (setsockopt(sk1, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { pr_perror("Can't add promisc member"); return 1; } memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = 1; mreq.mr_type = PACKET_MR_UNICAST; mreq.mr_alen = LO_ADDR_LEN; if (setsockopt(sk2, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { pr_perror("Can't add ucast member"); return 1; } memset(&ring, 0, sizeof(ring)); ring.tp_block_size = PAGE_SIZE; ring.tp_block_nr = 1; ring.tp_frame_size = 1024; ring.tp_frame_nr = (ring.tp_block_size / ring.tp_frame_size) * ring.tp_block_nr; if (setsockopt(sk2, SOL_PACKET, PACKET_TX_RING, &ring, sizeof(ring)) < 0) { pr_perror("Can't set tx ring"); return 1; } test_daemon(); test_waitsig(); alen = sizeof(addr); if 
(getsockname(sk1, (struct sockaddr *)&addr, &alen) < 0) { fail("Can't get sockname 1 rst"); return 1; } if (test_sockaddr(1, &addr, &addr1)) return 1; alen = sizeof(ver); if (getsockopt(sk1, SOL_PACKET, PACKET_VERSION, &ver, &alen) < 0) { fail("Can't get sockopt ver %m"); return 1; } if (ver != TPACKET_V2) { fail("Version mismatch have %d, want %d\n", ver, TPACKET_V2); return 1; } alen = sizeof(yes); if (getsockopt(sk1, SOL_PACKET, PACKET_AUXDATA, &yes, &alen) < 0) { fail("Can't get sockopt auxdata %m"); return 1; } if (yes != 1) { fail("Auxdata not ON"); return 1; } memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = 1; mreq.mr_type = PACKET_MR_PROMISC; if (setsockopt(sk1, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { fail("Promisc member not kept"); return 1; } alen = sizeof(yes); if (getsockopt(sk1, SOL_PACKET, PACKET_FANOUT, &yes, &alen) < 0) { fail("Can't read fanout back %m"); return 1; } if (yes != 0) { fail("Fanout screwed up to %x", yes); return 1; } alen = sizeof(addr); if (getsockname(sk2, (struct sockaddr *)&addr, &alen) < 0) { fail("Can't get sockname 2 rst"); return 1; } if (test_sockaddr(2, &addr, &addr2)) return 1; alen = sizeof(rsv); if (getsockopt(sk2, SOL_PACKET, PACKET_RESERVE, &rsv, &alen) < 0) { fail("Can't get sockopt rsv %m"); return 1; } alen = sizeof(yes); if (getsockopt(sk2, SOL_PACKET, PACKET_ORIGDEV, &yes, &alen) < 0) { fail("Can't get sockopt origdev %m"); return 1; } if (yes != 1) { fail("OrigDev not ON"); return 1; } if (rsv != SK_RESERVE) { fail("Reserve mismatch have %d, want %d\n", rsv, SK_RESERVE); return 1; } memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = 1; mreq.mr_type = PACKET_MR_UNICAST; mreq.mr_alen = LO_ADDR_LEN; if (setsockopt(sk2, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { fail("Ucast member not kept"); return 1; } alen = sizeof(yes); if (getsockopt(sk2, SOL_PACKET, PACKET_FANOUT, &yes, &alen) < 0) { fail("Can't read fanout2 back %m"); return 1; } if (yes != DEF_FANOUT) { 
/*
 * Look through /proc/self/maps for a mapping that starts at @addr and is
 * backed by the same device and inode as socket @sk.
 *
 * Calls pass() when such a mapping exists and fail() otherwise, including
 * when the socket cannot be stat-ed or the maps file cannot be opened.
 */
static void check_map_is_there(unsigned long addr, int sk)
{
	FILE *f;
	char line[256];
	struct stat ss;
	int found = 0;

	if (fstat(sk, &ss) < 0) {
		fail("Can't stat socket: %m");
		return;
	}

	f = fopen("/proc/self/maps", "r");
	if (!f) {
		fail("Can't open /proc/self/maps: %m");
		return;
	}

	while (fgets(line, sizeof(line), f) != NULL) {
		unsigned long start, ino;
		unsigned int maj, min;
		int parsed;

		parsed = sscanf(line, "%lx-%*x %*s %*s %x:%x %lu",
				&start, &maj, &min, &ino);

		/* drop the tail of an over-long line so it isn't parsed as a new one */
		if (!strchr(line, '\n')) {
			int c;

			do {
				c = fgetc(f);
			} while (c != EOF && c != '\n');
		}

		/* only compare when every field was really read */
		if (parsed != 4)
			continue;

		if (start == addr && ss.st_dev == makedev(maj, min) &&
		    ss.st_ino == ino) {
			found = 1;
			break;
		}
	}

	fclose(f);

	if (found)
		pass();
	else
		fail("No socket mapping found");
}
/*
 * Verify that socket @sk reports an AF_PACKET name whose sa_data string is
 * equal to @dev (pass "" for a socket that is expected to be unbound).
 *
 * Returns 0 when the name matches, -1 when getsockname() fails or either
 * the family or the device string differs.
 */
static int check_socket_binding(int sk, char *dev)
{
	struct sockaddr bound = { 0 };
	socklen_t bound_len = sizeof(bound);
	int matches;

	/* evaluated left to right; the first failing check ends the chain */
	matches = getsockname(sk, &bound, &bound_len) >= 0 &&
		  bound.sa_family == AF_PACKET &&
		  strcmp(bound.sa_data, dev) == 0;

	return matches ? 0 : -1;
}
criu-3.6/test/zdtm/static/packet_sock_spkt.desc000066400000000000000000000000501317335042600217360ustar00rootroot00000000000000{'flavor':'h uns ns', 'flags' : 'suid'} criu-3.6/test/zdtm/static/pdeath_sig.c000066400000000000000000000037021317335042600200310ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that pdeath sig is preserved"; const char *test_author = "Pavel Emelianov "; static int sigrecvd = 0; static void sigh(int s, siginfo_t *i, void *d) { sigrecvd = 1; } #ifndef PR_SET_PDEATH_SIGNAL #define PR_SET_PDEATH_SIGNAL 1 #endif int main(int argc, char **argv) { int pid, ret, pw[2], pr[2]; test_init(argc, argv); /* * Here's what will happen here: * * me -(fork)-> P -(fork)-> C * | | * +-------------->-(pw)->-+ * +-<-(pr)-<--------------+ * * We wait for C to prepare himself via pr. * After C/R we kill P and close pw to wake up * C. The we wait for it to report back via pr * which signals has he received. */ pipe(pw); pipe(pr); pid = fork(); if (pid == 0) { pid = fork(); if (pid == 0) { struct sigaction sa = {}; /* C */ close(pw[1]); close(pr[0]); sa.sa_sigaction = sigh; ret = sigaction(SIGUSR1, &sa, NULL); if (ret == 0) ret = prctl(PR_SET_PDEATH_SIGNAL, SIGUSR1, 0, 0, 0); write(pr[1], &ret, sizeof(ret)); read(pw[0], &ret, sizeof(ret)); write(pr[1], &sigrecvd, sizeof(sigrecvd)); } else { /* P, pid == C */ close(pw[0]); close(pw[1]); close(pr[0]); close(pr[1]); /* Just hang */ waitpid(pid, NULL, 0); } exit(0); } /* me, pid == P */ close(pw[0]); close(pr[1]); ret = -1; read(pr[0], &ret, sizeof(ret)); if (ret != 0) { pr_perror("C start error"); goto out; } /* * P didn't have time to close his pipes? * That's OK, CRIU should C/R these knots. 
*/ test_daemon(); test_waitsig(); out: kill(pid, SIGKILL); waitpid(pid, NULL, 0); close(pw[1]); if (ret == 0) { read(pr[0], &ret, sizeof(ret)); if (ret != 1) fail("USR1 isn't delivered"); else pass(); } return 0; } criu-3.6/test/zdtm/static/pid00.c000066400000000000000000000032601317335042600166350ustar00rootroot00000000000000#include #include #include #include "zdtmtst.h" const char *test_doc = "Check that p?pid and e?[ug]id didn't change"; const char *test_author = "Pavel Emelianov "; int setfsuid(uid_t fsuid); int setfsgid(uid_t fsgid); int main(int argc, char **argv) { int pid, s_p[2], f_p[2], r_p[3]; const uid_t w_ruid = 1, w_euid = 2, w_suid = 3, w_fsuid = w_euid; const gid_t w_rgid = 5, w_egid = 6, w_sgid = 7, w_fsgid = 8; uid_t rid, eid, sid, fsid; char res = 'x'; test_init(argc, argv); pipe(s_p); pipe(f_p); pipe(r_p); pid = fork(); if (pid == 0) { close(s_p[0]); close(f_p[1]); close(r_p[0]); setresgid(w_rgid, w_egid, w_sgid); setfsgid(w_fsgid); setresuid(w_ruid, w_euid, w_suid); /* fsuid change is impossible after above */ close(s_p[1]); read(f_p[0], &res, 1); close(f_p[0]); #define CHECK_ID(__t, __w, __e) do { \ if (__t##id != w_##__t##__w##id) { \ res = __e; \ goto bad; \ } \ } while (0) rid = eid = sid = fsid = 0; getresuid(&rid, &eid, &sid); fsid = setfsuid(w_euid); CHECK_ID(r, u, '1'); CHECK_ID(e, u, '2'); CHECK_ID(s, u, '3'); CHECK_ID(s, u, '3'); CHECK_ID(fs, u, '4'); rid = eid = sid = fsid = 0; getresgid(&rid, &eid, &sid); fsid = setfsgid(w_fsgid); CHECK_ID(r, g, '5'); CHECK_ID(e, g, '6'); CHECK_ID(s, g, '7'); CHECK_ID(fs, g, '8'); res = '0'; bad: write(r_p[1], &res, 1); close(r_p[1]); _exit(0); } close(f_p[0]); close(s_p[1]); close(r_p[1]); read(s_p[0], &res, 1); close(s_p[0]); test_daemon(); test_waitsig(); close(f_p[1]); read(r_p[0], &res, 1); if (res == '0') pass(); else fail("Fail: %c", res); return 0; } criu-3.6/test/zdtm/static/pid00.desc000066400000000000000000000000221317335042600173220ustar00rootroot00000000000000{'flags': 'suid'} 
criu-3.6/test/zdtm/static/pipe00.c000066400000000000000000000044011317335042600170140ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Lock inversion"; const char *test_author = "Andrey Vagin "; #define TEST_STRING "Hello world" int main(int argc, char ** argv) { int pipe1[2]; int pipe2[2]; int ret; pid_t pid; char buf[sizeof(TEST_STRING)]; task_waiter_t t; test_init(argc, argv); task_waiter_init(&t); ret = pipe(pipe1); if (ret) return 1; ret = pipe(pipe2); if (ret) return 1; pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); exit(1); } else if (pid == 0) { if (dup2(pipe1[1], 11) == -1 || dup2(pipe2[0], 12) == -1) { pr_perror("dup2 failed"); return 1; } } else { if (dup2(pipe1[0], 12) == -1 || dup2(pipe2[1], 11) == -1) { pr_perror("dup2 failed"); goto err; } } close(pipe2[0]); close(pipe2[1]); close(pipe1[0]); close(pipe1[1]); if (pid > 0) { int status; task_waiter_wait4(&t, 1); test_daemon(); test_waitsig(); ret = read(12, buf, sizeof(TEST_STRING)); if (ret != sizeof(TEST_STRING)) { pr_perror("read failed: %d", ret); goto err; } ret = write(11, TEST_STRING, sizeof(TEST_STRING)); if (ret != sizeof(TEST_STRING)) { pr_perror("write failed: %d", ret); goto err; } close(11); ret = read(12, buf, sizeof(TEST_STRING)); if (ret != sizeof(TEST_STRING)) { pr_perror("read failed: %d", ret); goto err; } if (strcmp(TEST_STRING, buf)) { pr_perror("data curruption"); goto err; } ret = wait(&status); if (ret == -1 || !WIFEXITED(status) || WEXITSTATUS(status)) { kill(pid, SIGKILL); goto err; } pass(); } else { task_waiter_complete(&t, 1); ret = write(11, TEST_STRING, sizeof(TEST_STRING)); if (ret != sizeof(TEST_STRING)) { pr_perror("write failed: %d", ret); return 1; } ret = read(12, buf, sizeof(TEST_STRING)); if (ret != sizeof(TEST_STRING)) { pr_perror("read failed: %d", ret); return 1; } ret = write(11, TEST_STRING, sizeof(TEST_STRING)); if (ret != sizeof(TEST_STRING)) { pr_perror("write failed: %d", 
ret); return 1; } close(11); if (strcmp(TEST_STRING, buf)) { pr_perror("data curruption"); return 1; } } return 0; err: pr_perror("FAIL"); return 1; } criu-3.6/test/zdtm/static/pipe01.c000066400000000000000000000044121317335042600170170ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test that all data can be restored"; const char *test_author = "Andrey Vagin "; #define TEST_STRING "Hello world" int main(int argc, char ** argv) { int pfd[2], pfd_dup[2], pfd_rop[2]; char path[PATH_MAX]; int ret; uint8_t buf[4096]; uint32_t crc; int flags, size = 0; test_init(argc, argv); crc = ~0; datagen(buf, sizeof(buf), &crc); ret = pipe(pfd); if (ret) { pr_perror("pipe() failed"); return 1; } pfd_dup[0] = dup(pfd[0]); pfd_dup[1] = dup(pfd[1]); snprintf(path, PATH_MAX, "/proc/self/fd/%d", pfd[0]); pfd_rop[0] = open(path, O_RDONLY); snprintf(path, PATH_MAX, "/proc/self/fd/%d", pfd[1]); pfd_rop[1] = open(path, O_WRONLY); if (pfd_rop[0] == -1 || pfd_rop[1] == -1 || pfd_dup[0] == -1 || pfd_dup[1] == -1) { pr_perror("dup() failed"); return 1; } flags = fcntl(pfd[1], F_GETFL, 0); if (flags == -1) { pr_perror("fcntl() failed"); return 1; } ret = fcntl(pfd[1], F_SETFL, flags | O_NONBLOCK); if (ret == -1) { pr_perror("fcntl() failed"); return 1; } while (1) { ret = write(pfd[1], buf, sizeof(buf)); if (ret == -1) { if (errno == EAGAIN) break; pr_perror("write() failed"); goto err; } size += ret; } test_daemon(); test_waitsig(); flags = fcntl(pfd[1], F_GETFL, 0); if (!(flags & O_NONBLOCK)) { pr_perror("O_NONBLOCK is absent"); goto err; } flags = fcntl(pfd_dup[1], F_GETFL, 0); if (!(flags & O_NONBLOCK)) { pr_perror("O_NONBLOCK is absent"); goto err; } flags = fcntl(pfd_rop[1], F_GETFL, 0); if (flags & O_NONBLOCK) { pr_perror("O_NONBLOCK appeared"); goto err; } if (close(pfd[1]) == -1) { pr_perror("close() failed"); goto err; } close(pfd_dup[1]); close(pfd_rop[1]); while (1) { ret = 
read(pfd[0], buf, sizeof(buf)); if (ret == 0) break; if (ret == -1) { goto err; pr_perror("read() failed"); } size -= ret; crc = ~0; ret = datachk(buf, sizeof(buf), &crc); if (ret) { fail("CRC mismatch\n"); goto err; } } if (size) goto err; pass(); return 0; err: fail(); return 1; } criu-3.6/test/zdtm/static/pipe02.c000066400000000000000000000021021317335042600170120ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Create two unshared descriptor for the one end of a pipe"; const char *test_author = "Andrey Vagin "; int main(int argc, char ** argv) { int p[2], fd; int ret; char path[PATH_MAX]; int flags; test_init(argc, argv); ret = pipe(p); if (ret) return 1; snprintf(path, sizeof(path), "/proc/self/fd/%d", p[0]); fd = open(path, O_RDONLY); if (fd == -1) { pr_perror("open"); return 1; }; if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK) == -1) { pr_perror("fcntl"); return 1; } test_daemon(); test_waitsig(); flags = fcntl(fd, F_GETFL, 0); if ((flags & O_NONBLOCK) == 0) { fail("O_NONBLOCK are not restored for %d", fd); return 1; } flags = fcntl(p[0], F_GETFL, 0); if ((flags & O_NONBLOCK) != 0) { fail("Unexpected O_NONBLOCK on %d", p[0]); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/poll.c000066400000000000000000000054711317335042600166750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check poll() timeouts"; const char *test_author = "Cyrill Gorcunov "; static void show_timestamp(char *prefix, unsigned long tv_sec, unsigned long tv_usec) { test_msg("%8s: sec %20lu nsec %20lu\n", prefix, tv_sec, tv_usec); } static void show_pollfd(struct pollfd *fds, size_t nfds) { size_t i; for (i = 0; i < nfds; i++) { test_msg("%2zu) fd: %2d events %2x revents %2x\n", i, fds[i].fd, fds[i].events, 
fds[i].revents); } } int main(int argc, char *argv[]) { struct timeval time1, time2; struct timespec delay; struct pollfd ufds[2]; int pipes[2], ret; int delta, status; task_waiter_t t; pid_t pid; char *deltaenv; test_init(argc, argv); task_waiter_init(&t); if (pipe(pipes)) { pr_perror("Can't create pipes"); exit(1); } memset(ufds, 0, sizeof(ufds)); ufds[0].fd = pipes[0]; ufds[0].events = POLLIN; ufds[1].fd = pipes[1]; ufds[1].events = POLLIN; show_pollfd(ufds, 2); if (gettimeofday(&time1, NULL)) { pr_perror("Can't get first delta"); exit(1); } show_timestamp("Init", time1.tv_sec, time1.tv_usec); pid = test_fork(); if (pid < 0) { pr_perror("Fork failed"); exit(1); } else if (pid == 0) { if (gettimeofday(&time1, NULL)) { pr_perror("Can't get from times"); exit(1); } show_timestamp("Start", time1.tv_sec, time1.tv_usec); task_waiter_complete(&t, 1); deltaenv = getenv("ZDTM_DELTA"); if (deltaenv) delta = atoi(deltaenv); else delta = 5; while (test_go()) { ret = poll(ufds, 2, delta * 1000); show_pollfd(ufds, 2); if (ret && errno != EINTR) { pr_perror("Poll-2 returned %d (events?!)", ret); exit(1); } if (gettimeofday(&time2, NULL)) { pr_perror("Can't get from times"); exit(1); } show_timestamp("Stop", time2.tv_sec, time2.tv_usec); show_timestamp("Diff", time2.tv_sec - time1.tv_sec, time2.tv_usec - time1.tv_usec); if ((time2.tv_sec - time1.tv_sec) > delta) { fail("Delta is too big %lu", (unsigned long)(time2.tv_sec - time1.tv_sec)); exit(1); } } exit(0); } task_waiter_wait4(&t, 1); /* Wait to make sure we're in poll internals */ delay.tv_sec = 1; delay.tv_nsec = 0; nanosleep(&delay, NULL); test_daemon(); test_waitsig(); kill(pid, SIGTERM); /* Return immediately if child run or stopped(by SIGSTOP) */ if (waitpid(pid, &status, 0) == -1) { pr_perror("Unable to wait child"); exit(1); } if (!WIFEXITED(status) || WEXITSTATUS(status)) { fail("Child exited with error"); exit(1); } pass(); return 0; } 
criu-3.6/test/zdtm/static/poll.desc000066400000000000000000000000201317335042600173520ustar00rootroot00000000000000{'flavor': 'h'} criu-3.6/test/zdtm/static/posix_timers.c000066400000000000000000000256021317335042600204520ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc ="Posix timers migration check"; const char *test_author = "Kinsbursky Stanislav "; sigset_t mask; #define WRONG_SIGNAL 1 #define WRONG_SI_PTR 2 #define FAIL_OVERRUN 4 #define MAX_TIMER_DISPLACEMENT 10 #define NO_PERIODIC #ifndef CLOCK_MONOTONIC_COARSE # define CLOCK_MONOTONIC_COARSE 6 #endif #ifndef CLOCK_BOOTTIME # define CLOCK_BOOTTIME 7 #endif #ifndef NO_PERIODIC static void realtime_periodic_handler(int sig, siginfo_t *si, void *uc); static void monotonic_periodic_handler(int sig, siginfo_t *si, void *uc); static void boottime_periodic_handler(int sig, siginfo_t *si, void *uc); #endif static void realtime_oneshot_handler(int sig, siginfo_t *si, void *uc); static void monotonic_oneshot_handler(int sig, siginfo_t *si, void *uc); static void boottime_oneshot_handler(int sig, siginfo_t *si, void *uc); enum { #ifndef NO_PERIODIC REALTIME_PERIODIC_INFO, MONOTONIC_PERIODIC_INFO, BOOTTIME_PERIODIC_INFO, #endif REALTIME_ONESHOT_INFO, MONOTONIC_ONESHOT_INFO, BOOTTIME_ONESHOT_INFO, }; static struct posix_timers_info { char clock; char *name; void (*handler)(int sig, siginfo_t *si, void *uc); int sig; int oneshot; int ms_int; struct sigaction sa; int handler_status; int handler_cnt; timer_t timerid; int overrun; struct timespec start, end; } posix_timers[] = { #ifndef NO_PERIODIC [REALTIME_PERIODIC_INFO] = { .clock = CLOCK_REALTIME, .name = "REALTIME (periodic)", .handler = realtime_periodic_handler, .sig = SIGALRM, .oneshot = 0, .ms_int = 1, }, [MONOTONIC_PERIODIC_INFO] = { .clock = CLOCK_MONOTONIC, .name = "MONOTONIC (periodic)", .handler = monotonic_periodic_handler, .sig = SIGINT, .oneshot = 0, .ms_int = 3, }, 
[BOOTTIME_PERIODIC_INFO] = { .clock = CLOCK_BOOTTIME, .name = "BOOTTIME (periodic)", .handler = boottime_periodic_handler, .sig = SIGWINCH, .oneshot = 0, .ms_int = 3, }, #endif [REALTIME_ONESHOT_INFO] = { .clock = CLOCK_REALTIME, .name = "REALTIME (oneshot)", .handler = realtime_oneshot_handler, .sig = SIGUSR1, .oneshot = 1, .ms_int = INT_MAX, }, [MONOTONIC_ONESHOT_INFO] = { .clock = CLOCK_MONOTONIC, .name = "MONOTONIC (oneshot)", .handler = monotonic_oneshot_handler, .sig = SIGUSR2, .oneshot = 1, .ms_int = INT_MAX, }, [BOOTTIME_ONESHOT_INFO] = { .clock = CLOCK_BOOTTIME, .name = "BOOTTIME (oneshot)", .handler = boottime_oneshot_handler, .sig = SIGPROF, .oneshot = 1, .ms_int = INT_MAX, }, { } }; static int check_handler_status(struct posix_timers_info *info, struct itimerspec *its, int ms_passed, int delta) { int displacement; int timer_ms; if (!info->handler_cnt && !info->oneshot) { fail("%s: Signal handler wasn't called\n", info->name); return -EINVAL; } if (info->handler_status) { if (info->handler_status & WRONG_SIGNAL) fail("%s: Handler: wrong signal received\n", info->name); if (info->handler_status & WRONG_SI_PTR) fail("%s: Handler: wrong timer address\n", info->name); if (info->handler_status & FAIL_OVERRUN) fail("%s: Handler: failed to get overrun count\n", info->name); return -1; } if (!info->oneshot && !its->it_value.tv_sec && !its->it_value.tv_nsec) { fail("%s: timer became unset\n", info->name); return -EFAULT; } if (info->oneshot && (its->it_interval.tv_sec || its->it_interval.tv_nsec)) { fail("%s: timer became periodic\n", info->name); return -EFAULT; } if (!info->oneshot && !its->it_interval.tv_sec && !its->it_interval.tv_nsec) { fail("%s: timer became oneshot\n", info->name); return -EFAULT; } if (info->oneshot) { int val = its->it_value.tv_sec * 1000 + its->it_value.tv_nsec / 1000 / 1000; if (info->handler_cnt) { if (val != 0) { fail("%s: timer continues ticking after expiration\n", info->name); return -EFAULT; } if (info->handler_cnt > 1) { 
fail("%s: timer expired %d times\n", info->name, info->handler_cnt); return -EFAULT; } if (info->ms_int > ms_passed) { fail("%s: timer expired too early\n", info->name); return -EFAULT; } return 0; } timer_ms = info->ms_int - val; } else timer_ms = (info->overrun + info->handler_cnt) * info->ms_int; displacement = (abs(ms_passed - timer_ms) - delta) * 100 / ms_passed; test_msg("%20s: cpt/rst : %-8d msec\n", info->name, delta); test_msg("%20s: Time passed (ms) : %-8d msec\n", info->name, ms_passed); test_msg("%20s: Timer results : %-8d msec\n", info->name, timer_ms); test_msg("%20s: Handler count : %d\n", info->name, info->handler_cnt); if (displacement > MAX_TIMER_DISPLACEMENT) { fail("%32s: Time displacement: %d%% (max alloved: %d%%)\n", info->name, displacement, MAX_TIMER_DISPLACEMENT); return -EFAULT; } return 0; } static int check_timers(int delta, struct timespec *sleep_start, struct timespec *sleep_end) { struct posix_timers_info *info = posix_timers; int ms_passed; int status = 0; struct itimerspec val, oldval; if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1) { fail("Failed to unlock signal\n"); return -errno; } while (info->handler) { memset(&val, 0, sizeof(val)); if (timer_settime(info->timerid, 0, &val, &oldval) == -1) { fail("%s: failed to reset timer\n", info->name); return -errno; } if (clock_gettime(info->clock, &info->end) == -1) { fail("Can't get %s end time\n", info->name); return -errno; } /* * Adjust with @total_sleep_time if needed. 
*/ if (info->clock == CLOCK_BOOTTIME) { info->start.tv_sec -= sleep_start->tv_sec; info->start.tv_nsec -= sleep_start->tv_nsec; info->end.tv_sec -= sleep_end->tv_sec; info->end.tv_nsec -= sleep_end->tv_nsec; } ms_passed = (info->end.tv_sec - info->start.tv_sec) * 1000 + (info->end.tv_nsec - info->start.tv_nsec) / (1000 * 1000); if (check_handler_status(info, &oldval, ms_passed, delta)) status--; info++; } return status; } static void generic_handler(struct posix_timers_info *info, struct posix_timers_info *real, int sig) { int overrun; if (info == NULL) info = &posix_timers[MONOTONIC_ONESHOT_INFO]; if (info != real) { real->handler_status |= WRONG_SI_PTR; return; } if (sig != info->sig) info->handler_status |= WRONG_SIGNAL; overrun = timer_getoverrun(info->timerid); if (overrun == -1) info->handler_status |= FAIL_OVERRUN; else info->overrun += overrun; info->handler_cnt++; } #ifndef NO_PERIODIC static void monotonic_periodic_handler(int sig, siginfo_t *si, void *uc) { generic_handler(si->si_value.sival_ptr, &posix_timers[MONOTONIC_PERIODIC_INFO], sig); } static void boottime_periodic_handler(int sig, siginfo_t *si, void *uc) { generic_handler(si->si_value.sival_ptr, &posix_timers[BOOTTIME_PERIODIC_INFO], sig); } #endif static void monotonic_oneshot_handler(int sig, siginfo_t *si, void *uc) { generic_handler(si->si_value.sival_ptr, &posix_timers[MONOTONIC_ONESHOT_INFO], sig); } static void boottime_oneshot_handler(int sig, siginfo_t *si, void *uc) { generic_handler(si->si_value.sival_ptr, &posix_timers[BOOTTIME_ONESHOT_INFO], sig); } #ifndef NO_PERIODIC static void realtime_periodic_handler(int sig, siginfo_t *si, void *uc) { generic_handler(si->si_value.sival_ptr, &posix_timers[REALTIME_PERIODIC_INFO], sig); } #endif static void realtime_oneshot_handler(int sig, siginfo_t *si, void *uc) { generic_handler(si->si_value.sival_ptr, &posix_timers[REALTIME_ONESHOT_INFO], sig); } static int setup_timers(void) { int i; int ret; struct posix_timers_info *info = 
posix_timers; struct sigevent sev; struct itimerspec its; sigemptyset(&mask); while(info->handler) { sigaddset(&mask, info->sig); info++; } if (sigprocmask(SIG_SETMASK, &mask, NULL) == -1) { pr_perror("Failed to unlock signal"); return -errno; } info = posix_timers; while(info->handler) { /* Add and delete fake timers to test restoring 'with holes' */ timer_t timeridt; for (i = 0; i < 10; i++) { ret = timer_create(CLOCK_REALTIME, NULL, &timeridt); if (ret < 0) { pr_perror("Can't create temporary posix timer %lx", (long) timeridt); return -errno; } ret = timer_delete(timeridt); if (ret < 0) { pr_perror("Can't remove temporaty posix timer %lx", (long) timeridt); return -errno; } } info->sa.sa_flags = SA_SIGINFO; info->sa.sa_sigaction = info->handler; sigemptyset(&info->sa.sa_mask); if (sigaction(info->sig, &info->sa, NULL) == -1) { pr_perror("Failed to set SIGALRM handler"); return -errno; } sev.sigev_notify = SIGEV_SIGNAL; sev.sigev_signo = info->sig; if (&posix_timers[MONOTONIC_ONESHOT_INFO] == info) sev.sigev_value.sival_ptr = NULL; else sev.sigev_value.sival_ptr = info; if (timer_create(info->clock, &sev, &info->timerid) == -1) { pr_perror("Can't create timer"); return -errno; } its.it_value.tv_sec = info->ms_int / 1000; its.it_value.tv_nsec = info->ms_int % 1000 * 1000 * 1000; if (!info->oneshot) { its.it_interval.tv_sec = its.it_value.tv_sec; its.it_interval.tv_nsec = its.it_value.tv_nsec; } else its.it_interval.tv_sec = its.it_interval.tv_nsec = 0; if (clock_gettime(info->clock, &info->start) == -1) { pr_perror("Can't get %s start time", info->name); return -errno; } if (timer_settime(info->timerid, 0, &its, NULL) == -1) { pr_perror("Can't set timer"); return -errno; } info++; } return 0; } /* * Figure out @total_sleep_time, ie time the system was in hardware * suspend mode, will need this value to exclude from boottime clock * testing. 
*/ static int get_total_sleep_time(struct timespec *tv, char *type) { struct timespec boottime_coarse; struct timespec boottime; if (clock_gettime(CLOCK_BOOTTIME, &boottime) == -1) { pr_perror("Can't get CLOCK_BOOTTIME %s time", type); return -errno; } if (clock_gettime(CLOCK_MONOTONIC_COARSE, &boottime_coarse) == -1) { pr_perror("Can't get CLOCK_MONOTONIC_COARSE %s time", type); return -errno; } tv->tv_sec = boottime.tv_sec - boottime_coarse.tv_sec; tv->tv_nsec = boottime.tv_nsec - boottime_coarse.tv_nsec; test_msg("(%6s) boottime %lu " "boottime-coarse %lu " "total_sleep_time %lu\n", type, (long)boottime.tv_sec, (long)boottime_coarse.tv_sec, (long)tv->tv_sec); return 0; } int main(int argc, char **argv) { struct timespec sleep_start, sleep_end; struct timespec start, end; int err; test_init(argc, argv); err = setup_timers(); if (err) return err; usleep(500 * 1000); clock_gettime(CLOCK_REALTIME, &start); err = get_total_sleep_time(&sleep_start, "start"); if (err) return err; test_daemon(); test_waitsig(); clock_gettime(CLOCK_REALTIME, &end); err = get_total_sleep_time(&sleep_end, "end"); if (err) return err; err = check_timers((end.tv_sec - start.tv_sec) * 1000 + (end.tv_nsec - start.tv_nsec) / 1000000, &sleep_start, &sleep_end); if (err) return err; pass(); return 0; } criu-3.6/test/zdtm/static/proc-self.c000066400000000000000000000027151317335042600176170ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check for /proc/self/ns path restore"; const char *test_author = "Cyrill Gorcunov "; const char nspath[] = "/proc/self/ns/net"; int read_fd_link(int lfd, char *buf, size_t size) { ssize_t ret; char t[32]; snprintf(t, sizeof(t), "/proc/self/fd/%d", lfd); ret = readlink(t, buf, size - 1); if (ret < 0) { pr_perror("Can't read link of fd %d", lfd); return -1; } buf[ret] = 0; return 0; } int main(int argc, char *argv[]) { char path_orig[64], path_new[64]; int 
fd_self, fd_new; test_init(argc, argv); memset(path_orig, 0, sizeof(path_orig)); memset(path_new, 0, sizeof(path_new)); fd_self = open(nspath, O_RDONLY); if (fd_self < 0) { pr_perror("Can't open %s", nspath); return -1; } test_daemon(); test_waitsig(); if (read_fd_link(fd_self, path_orig, sizeof(path_orig))) { pr_perror("Can't fill original path"); return -1; } fd_new = open(nspath, O_RDONLY); if (fd_new < 0) { pr_perror("Can't open %s", nspath); return -1; } if (read_fd_link(fd_new, path_new, sizeof(path_new))) { pr_perror("Can't fill new path"); return -1; } if (memcmp(path_orig, path_new, sizeof(path_orig))) { fail("Paths mismatch %s %s\n", path_orig, path_new); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/pstree.c000066400000000000000000000024201317335042600172200ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that environment didn't change"; const char *test_author = "Pavel Emelianov "; int main(int argc, char **argv) { char x; int pid, ppid; int sp[2], fp[2], rp[2]; test_init(argc, argv); if (pipe(sp) || pipe(fp) || pipe(rp)) { pr_perror("pipe"); return 1; } pid = fork(); if (pid == 0) { close(sp[0]); close(fp[1]); close(rp[0]); pid = getpid(); ppid = getppid(); close(sp[1]); if (read(fp[0], &x, 1)) { pr_perror("read"); return 1; } close(fp[0]); if (pid != getpid()) x = 'p'; else if (ppid != getppid()) x = 'P'; else x = '0'; if (write(rp[1], &x, 1) != 1) { pr_perror("write"); return 1; } close(rp[1]); _exit(0); } x = 'X'; close(sp[1]); close(fp[0]); close(rp[1]); if (read(sp[0], &x, 1)) { pr_perror("read"); return 1; } test_daemon(); test_waitsig(); close(fp[1]); if (read(rp[0], &x, 1) != 1) { pr_perror("read"); return 1; } close(rp[0]); if (x == 'X') fail("Sync failed"); else if (x == 'p') fail("Pid failed"); else if (x == 'P') fail("PPid failed"); else if (x != '0') fail("Shit happened"); else pass(); return 0; } 
criu-3.6/test/zdtm/static/pthread00.c000066400000000000000000000067721317335042600175230ustar00rootroot00000000000000/* * A simple testee program with threads */ #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define exit_group(code) \ syscall(__NR_exit_group, code) const char *test_doc = "Create a few pthreads/forks and compare TLS and mmap data on restore\n"; const char *test_author = "Cyrill Gorcunov = 0; i--) task_waiter_wait4(&waiter[i], 1); test_daemon(); test_waitsig(); for (i = 0; i < NR_WAITERS; i++) task_waiter_complete(&waiter[i], 2); test_msg("Waiting while all threads are joined\n"); pthread_join(th1, NULL); pthread_join(th2, NULL); if (IS_PASSED(map, 0) && IS_PASSED(map, 1) && IS_PASSED(map, 2) && IS_PASSED(map, 3) && IS_PASSED(map, 4) && IS_PASSED(map, 5)) pass(); else fail(); return 0; } criu-3.6/test/zdtm/static/pthread01.c000066400000000000000000000125041317335042600175120ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define gettid() pthread_self() const char *test_doc = "Create a few pthreads and test TLS + blocked signals\n"; const char *test_author = "Cyrill Gorcunov #include #include #include #include "zdtmtst.h" const char *test_doc = "Create a thread with a dead leader\n"; const char *test_author = "Andrew Vagin #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check ptrace, if the child process's stopped by signal"; const char *test_author = "Andrey Vagin "; typedef void (*sighandler_t)(int); int child_fd; int child_exit = 0; void sig_handler(int signo, siginfo_t *siginfo, void *data) { int pid, ret; test_msg("receive signal sig=%d from pid=%d\n", signo, siginfo->si_pid); pid = siginfo->si_pid; ret = write(child_fd, &pid, sizeof(pid)); if (ret != sizeof(pid)) pr_perror("write"); child_exit = 1; } int child(int 
fd) { int ret = 0; struct sigaction act = { .sa_sigaction = sig_handler, .sa_flags = SA_SIGINFO, }, old_act; sigemptyset(&act.sa_mask); child_fd = fd; ret = sigaction(SIGUSR2, &act, &old_act); if (ret < 0) { pr_perror("signal failed"); return 1; } ret = ptrace(PTRACE_TRACEME, 0, 0, 0); if (ret < 0) { pr_perror("ptrace failed"); return 1; } ret = write(child_fd, &ret, sizeof(ret)); while (!child_exit) ret = sleep(1); close(child_fd); return 0; } int main(int argc, char ** argv) { int ret, status = 0; pid_t pid, spid, cpid; int child_pipe[2]; siginfo_t siginfo; test_init(argc, argv); ret = pipe(child_pipe); if (ret < 0) { pr_perror("pipe failed"); return 1; } cpid = test_fork(); if (cpid < 0) { pr_perror("fork failed"); return 1; } else if (cpid == 0) { close(child_pipe[0]); return child(child_pipe[1]); } close(child_pipe[1]); test_msg("wait while child initialized"); ret = read(child_pipe[0], &status, sizeof(status)); if (ret != sizeof(status)) { pr_perror("read from child process failed"); return 1; } spid = test_fork(); if (spid < 0) { pr_perror("Can't fork signal process"); return 1; } else if (spid == 0) { test_msg("send signal to %d\n", cpid); ret = kill(cpid, SIGUSR2); if (ret < 0) { pr_perror("kill failed"); } return 0; } if (waitid(P_PID, spid, &siginfo, WEXITED | WNOWAIT)) { pr_perror("Unable to wait spid"); return 1; } if (waitid(P_PID, cpid, &siginfo, WSTOPPED | WNOWAIT)) { pr_perror("Unable to wait cpid"); return 1; } test_daemon(); test_waitsig(); while (1) { test_msg("waiting...\n"); pid = wait(&status); if (pid < 0) { if (errno != ECHILD) pr_perror("wait"); break; } if (WIFSTOPPED(status)) { test_msg("pid=%d stopsig=%d\n", pid, WSTOPSIG(status)); ret = ptrace(PTRACE_GETSIGINFO, pid, 0, &siginfo); if (ret < 0) { pr_perror("ptrace failed"); return 1; } else test_msg("pid=%d sends signal\n", siginfo.si_pid); ret = ptrace(PTRACE_CONT, pid, 0, WSTOPSIG(status)); if (ret < 0) pr_perror("ptrace failed"); ret = read(child_pipe[0], &status, sizeof(status)); if 
(ret != sizeof(status)) { pr_perror("read"); return 1; } if (spid != siginfo.si_pid) { fail("%d!=%d", cpid, siginfo.si_pid); return 1; } else if (status != siginfo.si_pid) { fail("%d!=%d", status, siginfo.si_pid); return 1; } } else if (WIFEXITED(status)) { test_msg("pid = %d status = %d\n", pid, WEXITSTATUS(status)); if (WEXITSTATUS(status)) return 1; } else if (WIFSIGNALED(status)) { test_msg("pid = %d signal = %d\n", pid, WTERMSIG(status)); return 1; } } pass(); return 0; } criu-3.6/test/zdtm/static/ptrace_sig.desc000066400000000000000000000000241317335042600205300ustar00rootroot00000000000000{'flags': 'crfail'} criu-3.6/test/zdtm/static/pty-console.c000077700000000000000000000000001317335042600213102pty01.custar00rootroot00000000000000criu-3.6/test/zdtm/static/pty-console.desc000066400000000000000000000000471317335042600206710ustar00rootroot00000000000000{'flags': 'suid', 'flavor' : 'ns uns'} criu-3.6/test/zdtm/static/pty00.c000066400000000000000000000051551317335042600167020ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #include #include "zdtmtst.h" #include #include #include #include #include #include #include const char *test_doc = "Check, that pseudoterminals are restored"; const char *test_author = "Andrey Vagin "; static unsigned int nr_sighups; static void signal_handler_sighup(int signum) { nr_sighups++; } int main(int argc, char ** argv) { int fdm, fds, ret, tty, i; char *slavename; char buf[4096]; const char teststr[] = "hello\n"; struct sigaction sa = { .sa_handler = signal_handler_sighup, .sa_flags = 0, }; test_init(argc, argv); /* * On closing control terminal we're expecting to * receive SIGHUP, so make sure it's delivered. 
*/ if (sigaction(SIGHUP, &sa, 0)) { fail("sigaction failed\n"); return 1; } fdm = open("/dev/ptmx", O_RDWR); if (fdm == -1) { pr_perror("open(%s) failed", "/dev/ptmx"); return 1; } grantpt(fdm); unlockpt(fdm); slavename = ptsname(fdm); fds = open(slavename, O_RDWR); if (fds == -1) { pr_perror("open(%s) failed", slavename); return 1; } tty = open("/dev/tty", O_RDWR); if (tty < 0) { pr_perror("open(%s) failed", "/dev/tty"); return 1; } /* Try to reproduce a deadlock */ if (dup2(fdm, 101) != 101) { pr_perror("dup( , 101) failed"); return 1; } close(fdm); fdm = 101; if (dup2(fds, 100) != 100) { pr_perror("dup( , 100) failed"); return 1; } close(fds); fds = 100; for (i = 0; i < 10; i++) { /* Check connectivity */ ret = write(fdm, teststr, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("write(fdm) failed"); return 1; } } test_daemon(); test_waitsig(); for (i = 0; i < 10; i++) { ret = read(fds, buf, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("read(fds) failed"); return 1; } } if (strncmp(teststr, buf, sizeof(teststr) - 1)) { fail("data mismatch"); return 1; } ret = write(fdm, teststr, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("write(fdm) failed"); return 1; } ret = read(tty, buf, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("read(tty) failed"); return 1; } if (strncmp(teststr, buf, sizeof(teststr) - 1)) { fail("data mismatch"); return 1; } if (nr_sighups != 0) { fail("Expected 0 SIGHUP before closing control terminal but got %d", nr_sighups); return 1; } close(fdm); close(fds); close(tty); if (nr_sighups != 1) { fail("Expected 1 SIGHUP after closing control terminal but got %d", nr_sighups); return 1; } else pass(); return 0; } criu-3.6/test/zdtm/static/pty01.c000066400000000000000000000047431317335042600167050ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #define _DEFAULT_SOURCE #include #include #include #include #include #include #include #include #include #include 
"zdtmtst.h" const char *test_doc = "Check two pts on ptmx"; const char *test_author = "Cyrill Gorcunov "; static const char teststr[] = "ping\n"; int main(int argc, char *argv[]) { char buf[sizeof(teststr)]; int master, slave1, slave2, ret; char *slavename; struct stat st; uid_t new_uid = 33333; gid_t new_gid = 44444; test_init(argc, argv); master = open("/dev/ptmx", O_RDWR); if (master == -1) { pr_perror("open(%s) failed", "/dev/ptmx"); return 1; } grantpt(master); unlockpt(master); slavename = ptsname(master); slave1 = open(slavename, O_RDWR); if (slave1 == -1) { pr_perror("open(%s) failed", slavename); return 1; } slave2 = open(slavename, O_RDWR); if (slave2 == -1) { pr_perror("open(%s) failed", slavename); return 1; } #ifdef ZDTM_DEV_CONSOLE { int fd; fd = open("/dev/console", O_CREAT | O_RDONLY, 0755); if (fd < 0) return -1; close(fd); if (mount(slavename, "/dev/console", NULL, MS_BIND, NULL)) return -1; } #endif if (fchown(slave1, new_uid, new_gid)) { pr_perror("Can't set uid/gid on %s", slavename); return 1; } test_daemon(); test_waitsig(); signal(SIGHUP, SIG_IGN); if (fstat(slave1, &st)) { pr_perror("Can't fetch stat on %s", slavename); return 1; } if (st.st_uid != new_uid || st.st_gid != new_gid) { fail("UID/GID mismatch (got %d/%d but %d/%d expected)", (int)st.st_uid, (int)st.st_gid, (int)new_uid, (int)new_gid); return 1; } ret = write(master, teststr, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("write(master) failed"); return 1; } ret = read(slave1, buf, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("read(slave1) failed"); return 1; } if (strncmp(teststr, buf, sizeof(teststr) - 1)) { fail("data mismatch"); return 1; } ret = write(master, teststr, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("write(master) failed"); return 1; } ret = read(slave2, buf, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("read(slave1) failed"); return 1; } if (strncmp(teststr, buf, sizeof(teststr) 
- 1)) { fail("data mismatch"); return 1; } close(master); close(slave1); close(slave2); pass(); return 0; } criu-3.6/test/zdtm/static/pty01.desc000066400000000000000000000000221317335042600173630ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/pty02.c000066400000000000000000000036251317335042600167040ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #include #include "zdtmtst.h" #include #include #include #include #include #include #include const char *test_doc = "Check forked master ptmx"; const char *test_author = "Cyrill Gorcunov "; static const char teststr[] = "ping\n"; #define exit_shot(pid, code) \ do { kill(pid, SIGKILL); exit(code); } while (0) #define exit_shot_parent(code) \ exit_shot(getppid(), 1) int main(int argc, char *argv[]) { char buf[sizeof(teststr)]; int master, slave, ret; char *slavename; task_waiter_t t; pid_t pid; test_init(argc, argv); master = open("/dev/ptmx", O_RDWR); if (master == -1) { pr_perror("open(%s) failed", "/dev/ptmx"); return 1; } grantpt(master); unlockpt(master); slavename = ptsname(master); slave = open(slavename, O_RDWR); if (slave == -1) { pr_perror("open(%s) failed", slavename); return 1; } task_waiter_init(&t); pid = test_fork(); if (pid == 0) { int new_master, ret; new_master = dup(master); if (new_master < 0) { pr_perror("can't dup master"); exit_shot_parent(1); } task_waiter_complete_current(&t); ret = write(new_master, teststr, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("write(new_master) failed (ret = %d)", ret); exit_shot_parent(1); } task_waiter_wait4(&t, 1); close(new_master); exit(0); } else if (pid < 0) { pr_perror("test_fork failed"); exit(1); } task_waiter_wait4(&t, pid); close(master); test_daemon(); test_waitsig(); signal(SIGHUP, SIG_IGN); ret = read(slave, buf, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("read(slave) failed (ret = %d)", ret); return 1; } if (strncmp(teststr, buf, sizeof(teststr) - 1)) { fail("data mismatch"); return 
1; } task_waiter_complete(&t, 1); close(slave); pass(); return 0; } criu-3.6/test/zdtm/static/pty03.c000066400000000000000000000030151317335042600166760ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #include #include "zdtmtst.h" #include #include #include #include #include #include #include #include const char *test_doc = "Check a non-opened control terminal"; const char *test_author = "Andrey Vagin "; static const char teststr[] = "ping\n"; int main(int argc, char *argv[]) { char buf[sizeof(teststr)]; int master, slave, ret; char *slavename; test_init(argc, argv); master = open("/dev/ptmx", O_RDWR); if (master == -1) { pr_perror("open(%s) failed", "/dev/ptmx"); return 1; } grantpt(master); unlockpt(master); slavename = ptsname(master); slave = open(slavename, O_RDWR); if (slave == -1) { pr_perror("open(%s) failed", slavename); return 1; } if (ioctl(slave, TIOCSCTTY, 1)) { pr_perror("Can't set a controll terminal"); return 1; } close(slave); test_daemon(); test_waitsig(); slave = open("/dev/tty", O_RDWR); if (slave == -1) { pr_perror("Can't open the controll terminal"); return -1; } signal(SIGHUP, SIG_IGN); ret = write(master, teststr, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("write(master) failed"); return 1; } ret = read(slave, buf, sizeof(teststr) - 1); if (ret != sizeof(teststr) - 1) { pr_perror("read(slave1) failed"); return 1; } if (strncmp(teststr, buf, sizeof(teststr) - 1)) { fail("data mismatch"); return 1; } close(master); close(slave); pass(); return 0; } criu-3.6/test/zdtm/static/pty03.desc000066400000000000000000000000241317335042600173670ustar00rootroot00000000000000{'flavor': 'h uns'} criu-3.6/test/zdtm/static/pty04.c000066400000000000000000000021751317335042600167050ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #include #include "zdtmtst.h" #include #include #include #include #include #include #include #include const char *test_doc = "Check two pts with a fake ptmx"; const char *test_author = "Cyrill 
Gorcunov "; int main(int argc, char *argv[]) { int master, slave1, slave2; char *slavename; test_init(argc, argv); master = open("/dev/ptmx", O_RDWR); if (master == -1) { pr_perror("open(%s) failed", "/dev/ptmx"); return 1; } grantpt(master); unlockpt(master); slavename = ptsname(master); slave1 = open(slavename, O_RDWR); if (slave1 == -1) { pr_perror("open(%s) failed", slavename); return 1; } slave2 = open(slavename, O_RDWR); if (slave2 == -1) { pr_perror("open(%s) failed", slavename); return 1; } if (ioctl(slave1, TIOCSCTTY, 1)) { pr_perror("Can't set a controll terminal"); return 1; } test_msg("Closing master\n"); signal(SIGHUP, SIG_IGN); close(master); test_daemon(); test_waitsig(); close(slave1); close(slave2); pass(); return 0; } criu-3.6/test/zdtm/static/remap_dead_pid.c000066400000000000000000000025661317335042600206460ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000 #endif #ifdef REMAP_PID_ROOT const char *proc_path = "/proc/%d"; #else const char *proc_path = "/proc/%d/mountinfo"; #endif const char *test_doc = "Check that dead pid's /proc entries are remapped correctly"; const char *test_author = "Tycho Andersen "; int main(int argc, char **argv) { pid_t pid; test_init(argc, argv); pid = fork(); if (pid < 0) { fail("fork() failed"); return -1; } if (pid == 0) { /* Child process just sleeps until it is killed. All we need * here is a process to open the mountinfo of. 
/*
 * Lower the soft value of every finite resource limit by one, go
 * through checkpoint/restore and verify that both the soft and the
 * hard value of every limit are unchanged afterwards.
 *
 * Returns 0 only when the test passed; any setup or verification
 * failure yields a non-zero exit status.
 */
int main(int argc, char **argv)
{
	int r, changed = 0;
	int ret = 1;
	struct rlimit rlims[RLIM_NLIMITS], trlim;

	test_init(argc, argv);

	for (r = 0; r < RLIM_NLIMITS; r++) {
		if (getrlimit(r, &rlims[r])) {
			pr_perror("Can't get rlimit");
			goto out;
		}

		/* Only finite limits with room to shrink are modified */
		if (rlims[r].rlim_cur > 1 &&
		    rlims[r].rlim_cur != RLIM_INFINITY) {
			rlims[r].rlim_cur--;

			if (setrlimit(r, &rlims[r])) {
				pr_perror("Can't set rlimit");
				goto out;
			}

			changed = 1;
		}
	}

	if (!changed) {
		pr_perror("Can't change any rlimir");
		goto out;
	}

	test_daemon();
	test_waitsig();

	for (r = 0; r < RLIM_NLIMITS; r++) {
		if (getrlimit(r, &trlim)) {
			fail("Can't get rlimit after rst");
			goto out;
		}

		if (rlims[r].rlim_cur != trlim.rlim_cur) {
			fail("Cur changed");
			goto out;
		}

		if (rlims[r].rlim_max != trlim.rlim_max) {
			fail("Max changed");
			goto out;
		}
	}

	pass();
	ret = 0;
out:
	return ret;
}
# Set up the routes under test and record them in $statefile.
#
# Looks up the device of the default route, adds a host route
# (1.2.3.4/32) through that device and a network route (1.2.0.0/16)
# via that host, then stores every route mentioning 1.2.3.4 so that
# do_stop can compare against it.
do_start()
{
	[ -f "$statefile" ] && die "state file $statefile aleady exists"

	# Get default route
	dev_name=`ip route list match 0.0.0.0/0 | sed 's/.*dev \([^ ]*\).*/\1/'`
	[ -n "$dev_name" ] || fail "dev_name is zero: " \
		"\$dev_name=\`ip route list match 0.0.0.0/0 | " \
		"sed 's/.*dev \([^ ]*\).*/\1/'"

	# Each route is added through its own do_or_fail call so that a
	# failure of either command is reported.  Chaining them with an
	# unescaped && would run the second command outside do_or_fail.
	do_or_fail "can't add routes" \
		ip r a 1.2.3.4/32 dev $dev_name
	do_or_fail "can't add routes" \
		ip r a 1.2.0.0/16 via 1.2.3.4
	do_or_fail "can't list created routes" \
		ip r \| grep "1.2.3.4" \> "$statefile"
}
start.tv_usec += start.tv_sec * 1000000; while (test_go() || to_pass--) { if (read(fd, &data, sizeof(unsigned long)) == -1) return 1; gettimeofday(&end, NULL); end.tv_usec += end.tv_sec * 1000000; delta = end.tv_usec - start.tv_usec; if (labs(delta - 1000000 / TEST_HZ ) > 100000) { pr_perror("delta = %ld", delta); fail--; if (fail == 0) return 1; } start = end; } pass(); return 0; } criu-3.6/test/zdtm/static/rtc.desc000066400000000000000000000001121317335042600171760ustar00rootroot00000000000000{'flavor': 'h', 'flags': 'suid crlib','arch': 'x86_64 aarch64 arm ppc64'} criu-3.6/test/zdtm/static/s390x_gs_threads.c000066400000000000000000000106651317335042600210210ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "lock.h" #include "zdtmtst.h" #define NR_THREADS 4 #define GS_ENABLE 0 #define GS_SET_BC_CB 2 #define GS_BROADCAST 4 #ifndef __NR_guarded_storage #define __NR_guarded_storage 378 #endif const char *test_doc = "Check the guarded storage broadcast"; /* Original test provided by Martin Schwidefsky */ const char *test_author = "Alice Frosi "; static unsigned long main_thread_tid; /* * This test case executes the following procedure: * * 1) The parent thread creates NR_THREADS child threads * * 2) For each thread (including the parent thread): * - Enable guarded-storage * - Set the guarded-storage broadcast control block and * specify gs_handler as Guarded-Storage-Event Parameter-List * address * * 3) Dump and restore * * 4) Guarded-storage broadcast event * - Child threads: Wait until main thread does GS broadcast * - Parent thread: Trigger GS broadcast * * 5) Verify that all GS works as expected and all threads have been * executed the gs_handler */ struct gs_cb { __u64 reserved; __u64 gsd; __u64 gssm; __u64 gs_epl_a; }; static futex_t futex; static futex_t futex2; /* * Load guarded-storage */ void load_guarded(unsigned long *mem); asm( ".global load_guarded\n" "load_guarded:\n" " .insn 
rxy,0xe3000000004c,%r2,0(%r2)\n" " br %r14\n" " .size load_guarded,.-load_guarded\n"); /* * Inline assembly to deal with interrupted context to the call of * the GS handler. Load guarded can be turned into a branch to this * function. */ void gs_handler_asm(void); asm( ".globl gs_handler_asm\n" "gs_handler_asm:\n" " lgr %r14,%r15\n" " aghi %r15,-320\n" " stmg %r0,%r14,192(%r15)\n" " stg %r14,312(%r14)\n" " la %r2,160(%r15)\n" " .insn rxy,0xe30000000049,0,160(%r15)\n" " lg %r14,24(%r2)\n" " lg %r14,40(%r14)\n" " la %r14,6(%r14)\n" " stg %r14,304(%r15)\n" " brasl %r14,gs_handler\n" " lmg %r0,%r15,192(%r15)\n" " br %r14\n" " .size gs_handler_asm,.-gs_handler_asm\n"); /* * GS handler called when GS event occurs */ void gs_handler(struct gs_cb *this_cb) { unsigned long tid = syscall(SYS_gettid); test_msg("gs_handler for thread %016lx\n", tid); futex_dec_and_wake(&futex2); } /* * Entry point for threads */ static void *thread_run(void *param) { unsigned long test = 0x1234000000; unsigned long *gs_epl; struct gs_cb *gs_cb; /* Enable guarded-storage */ if (syscall(__NR_guarded_storage, GS_ENABLE) != 0) { fail("Unable to enable guarded storage"); exit(1); } gs_epl = malloc(sizeof(unsigned long) * 6); gs_cb = malloc(sizeof(*gs_cb)); if (gs_epl == NULL || gs_cb == NULL) { fail("Error allocating memory\n"); exit(1); } gs_cb->gsd = 0x1234000000UL | 26; gs_cb->gssm = -1UL; gs_cb->gs_epl_a = (unsigned long) gs_epl; gs_epl[1] = (unsigned long) gs_handler_asm; /* Set the GS broadcast control block */ syscall(__NR_guarded_storage, GS_SET_BC_CB, gs_cb); futex_dec_and_wake(&futex); /* Wait for all threads to set the GS broadcast control block */ futex_wait_until(&futex, 0); test_msg("Thread %016lx staring loop\n", syscall(SYS_gettid)); /* * Designate a guarded-storage section until the main task * performs the GS_BROADCAST action and the following load_guarded * will provoke the switch to the gs handler */ while (1) load_guarded(&test); } int main(int argc, char *argv[]) { pthread_t 
tids[NR_THREADS]; int i; main_thread_tid = syscall(SYS_gettid); test_init(argc, argv); /* Enable guarded-storage */ if (syscall(__NR_guarded_storage, GS_ENABLE) != 0) { if (errno == ENOSYS) { test_daemon(); test_waitsig(); skip("No guarded storage support"); pass(); return 0; } fail("Unable to enable guarded storage"); return 1; } futex_set(&futex, NR_THREADS); for (i = 0; i < NR_THREADS; i++) pthread_create(tids + i, NULL, thread_run, NULL); test_msg("Waiting for thread startup\n"); /* Wait for all threads to set the GS broadcast control block */ futex_wait_until(&futex, 0); test_daemon(); test_waitsig(); test_msg("Doing broadcast\n"); futex_set(&futex2, NR_THREADS); /* * Triggers a GS event and force all the threads to execute * the gs handler */ syscall(__NR_guarded_storage, GS_BROADCAST); test_msg("Waiting for thread completion\n"); futex_wait_until(&futex2, 0); pass(); return 0; } criu-3.6/test/zdtm/static/s390x_mmap_high.c000066400000000000000000000026221317335042600206210ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" #define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ #define MAP_SIZE 0x1000 #define VAL 0x77 const char *test_doc = "Verify that tasks > 4TB can be checkpointed"; const char *test_author = "Michael Holzheu "; /* * Map memory at the very end of the 8 PB address space */ int main(int argc, char **argv) { void *addr = (void *) TASK_SIZE_LEVEL_4 - MAP_SIZE; char *buf; int i; test_init(argc, argv); /* * Skip test if kernel does not have the following fix: * * ee71d16d22 ("s390/mm: make TASK_SIZE independent from the number * of page table levels") */ if (munmap(addr, MAP_SIZE) == -1) { test_daemon(); test_waitsig(); skip("Detected kernel without 4 level TASK_SIZE fix"); pass(); return 0; } /* Map memory at the very end of the 8 PB address space */ buf = mmap(addr, MAP_SIZE, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (buf == MAP_FAILED) { pr_perror("Could not create mapping"); 
exit(1); } /* Initialize buffer with data */ memset(buf, VAL, MAP_SIZE); test_daemon(); test_waitsig(); /* Verify that we restored the data correctly */ for (i = 0; i < MAP_SIZE; i++) { if (buf[i] == VAL) continue; fail("%d: %d != %d\n", i, buf[i], VAL); goto out; } pass(); out: return 0; } criu-3.6/test/zdtm/static/s390x_mmap_high.desc000066400000000000000000000000221317335042600213050ustar00rootroot00000000000000{'arch': 's390x'} criu-3.6/test/zdtm/static/s390x_regs_check.c000066400000000000000000000273441317335042600207750ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that FP and VX registers do not change"; const char *test_author = "Michael Holzheu "; /* * This test case executes the following procedure: * * 1) Set registers to defined values * The main process creates one child process and within that process * NR_THREADS threads. Then the main process uses ptrace(SETREGS) to * set the registers in the child process and in all threads. * * 2) Detach from child and threads * Do this in order to allow criu to use ptrace for dumping. * * 3) Issue criu commands * Useful tests are: dump, dump --check-only, dump --leave-running * * 4) Check registers * Use ptrace(GETREGS) and compare with original values from step 1. 
* * This test can be used for two purposes: * * - Verify that "criu restore" sets the correct register sets * from "criu dump": * $ zdtmp.py run -t zdtm/static/s390x_regs_check * * - Verify that dumpee continues running with correct registers after * parasite injection: * $ zdtmp.py run --norst -t zdtm/static/s390x_regs_check * $ zdtmp.py run --norst --pre 2 -t zdtm/static/s390x_regs_check * $ zdtmp.py run --check-only -t zdtm/static/s390x_regs_check */ #define NR_THREADS 2 #define NR_THREADS_ALL (NR_THREADS + 1) static pid_t thread_pids[NR_THREADS_ALL]; static int pipefd[2]; /* * Generic structure to define a register set and test data */ struct reg_set { const char *name; /* Name of regset */ int nr; /* Number of regset */ void *data; /* Test data */ int len; /* Number of bytes of test data */ bool optional; /* Not all kernels/machines have this reg set */ bool available; /* Current kernel/machine has this reg set */ }; /* * s390 floating point registers */ struct prfpreg { uint32_t fpc; uint64_t fprs[16]; }; struct prfpreg prfpreg_data = { .fpc = 0, .fprs = { 0x0000000000000000, 0x1111111111111110, 0x2222222222222220, 0x3333333333333330, 0x4444444444444440, 0x5555555555555550, 0x6666666666666660, 0x7777777777777770, 0x8888888888888880, 0x9999999999999990, 0xaaaaaaaaaaaaaaa0, 0xbbbbbbbbbbbbbbb0, 0xccccccccccccccc0, 0xddddddddddddddd0, 0xeeeeeeeeeeeeeee0, 0xfffffffffffffff0, } }; struct reg_set reg_set_prfpreg = { .name = "PRFPREG", .nr = NT_PRFPREG, .data = &prfpreg_data, .len = sizeof(prfpreg_data), .optional = false, }; /* * s390 vector VXRS_LOW registers */ #define NT_S390_VXRS_LOW 0x309 struct vxrs_low { uint64_t regs[16]; }; struct vxrs_low vxrs_low_data = { .regs = { 0x0000000000000001, 0x1111111111111111, 0x2222222222222221, 0x3333333333333331, 0x4444444444444441, 0x5555555555555551, 0x6666666666666661, 0x7777777777777771, 0x8888888888888881, 0x9999999999999991, 0xaaaaaaaaaaaaaaa1, 0xbbbbbbbbbbbbbbb1, 0xccccccccccccccc1, 0xddddddddddddddd1, 
0xeeeeeeeeeeeeeee1, 0xfffffffffffffff1, } }; struct reg_set reg_set_vxrs_low = { .name = "VXRS_LOW", .nr = NT_S390_VXRS_LOW, .data = &vxrs_low_data, .len = sizeof(vxrs_low_data), .optional = true, }; /* * s390 vector VXRS_HIGH registers */ #define NT_S390_VXRS_HIGH 0x30a struct vxrs_high { uint64_t regs[32]; }; struct vxrs_high vxrs_high_data = { .regs = { 0x0000000000000002, 0x0000000000000002, 0x1111111111111112, 0x1111111111111112, 0x2222222222222222, 0x2222222222222222, 0x3333333333333332, 0x3333333333333332, 0x4444444444444442, 0x4444444444444442, 0x5555555555555552, 0x5555555555555552, 0x6666666666666662, 0x6666666666666662, 0x7777777777777772, 0x7777777777777772, 0x8888888888888882, 0x8888888888888882, 0x9999999999999992, 0x9999999999999992, 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, 0xbbbbbbbbbbbbbbb2, 0xbbbbbbbbbbbbbbb2, 0xccccccccccccccc2, 0xccccccccccccccc2, 0xddddddddddddddd2, 0xddddddddddddddd2, 0xeeeeeeeeeeeeeee2, 0xeeeeeeeeeeeeeee2, 0xfffffffffffffff2, 0xfffffffffffffff2, } }; struct reg_set reg_set_vxrs_high = { .name = "VXRS_HIGH", .nr = NT_S390_VXRS_HIGH, .data = &vxrs_high_data, .len = sizeof(vxrs_high_data), .optional = true, }; /* * s390 guarded-storage registers */ #define NT_S390_GS_CB 0x30b #define NT_S390_GS_BC 0x30c struct gs_cb { uint64_t regs[4]; }; struct gs_cb gs_cb_data = { .regs = { 0x0000000000000000, 0x000000123400001a, 0x5555555555555555, 0x000000014b58a010, } }; struct reg_set reg_set_gs_cb = { .name = "GS_CB", .nr = NT_S390_GS_CB, .data = &gs_cb_data, .len = sizeof(gs_cb_data), .optional = true, }; struct gs_cb gs_bc_data = { .regs = { 0x0000000000000000, 0x000000123400001a, 0xffffffffffffffff, 0x0000000aaaaaaaaa, } }; struct reg_set reg_set_gs_bc = { .name = "GS_BC_CB", .nr = NT_S390_GS_BC, .data = &gs_bc_data, .len = sizeof(gs_bc_data), .optional = true, }; /* * s390 runtime-instrumentation control block */ #define NT_S390_RI_CB 0x30d struct ri_cb { uint64_t regs[8]; }; struct ri_cb ri_cb_data = { .regs = { 0x000002aa13aae000, 
/*
 * Print hexdump for buffer with variable group parameter
 *
 * @tag:    optional prefix printed at the start of each output line
 *          (may be NULL)
 * @data:   buffer to dump
 * @grp:    number of bytes after which a separating blank is inserted;
 *          values <= 0 disable grouping
 * @count:  number of bytes of @data to dump
 * @indent: field width passed to the leading "%*s" of each line
 *
 * Sixteen bytes are emitted per line through test_msg(); each line
 * starts with the offset of its first byte.
 */
void util_hexdump_grp(const char *tag, const void *data, int grp,
		      int count, int indent)
{
	char str[1024], *ptr = str;
	/*
	 * Bytes are read as unsigned char: a plain char may be signed,
	 * in which case values >= 0x80 would be sign-extended by the
	 * integer promotion and printed as eight hex digits by "%02x".
	 */
	const unsigned char *buf = data;
	int i, first = 1;

	for (i = 0; i < count; i++) {
		if (first) {
			/* Start of a new output line: indent, tag, offset */
			ptr = str;
			ptr += sprintf(ptr, "%*s", indent, " ");
			if (tag)
				ptr += sprintf(ptr, "%s: ", tag);
			ptr += sprintf(ptr, "%08x: ", i);
			first = 0;
		}

		ptr += sprintf(ptr, "%02x", buf[i]);

		if (i % 16 == 15 || i + 1 == count) {
			/* Line is full or the data is exhausted: flush it */
			test_msg("%s\n", str);
			first = 1;
		} else if (grp > 0 && i % grp == grp - 1) {
			/* grp > 0 also guards the modulo against zero */
			ptr += sprintf(ptr, " ");
		}
	}
}
(set_regset(pid, reg_set_vec[i])) return -1; } return 0; } /* * Check if regset for pid has changed */ static int check_regset(pid_t pid, struct reg_set *reg_set) { struct iovec iov; char *data; if (!reg_set->available) return 0; data = calloc(reg_set->len, 1); if (!data) return -1; iov.iov_base = data; iov.iov_len = reg_set->len; if (ptrace(PTRACE_GETREGSET, pid, reg_set->nr, iov) != 0) { pr_perror("PTRACE_SETREGSET for %s failed for pid %d", reg_set->name, pid); free(data); return -1; } if (memcmp(data, reg_set->data, reg_set->len) != 0) { test_msg("RegSet %s changed for pid=%d\n", reg_set->name, pid); test_msg("Original values:\n"); util_hexdump(reg_set->name, reg_set->data, reg_set->len); test_msg("New values:\n"); util_hexdump(reg_set->name, data, reg_set->len); free(data); return -1; } free(data); return 0; } /* * Check all regsets */ static int check_regset_all(pid_t pid) { int i; for (i = 0; reg_set_vec[i] != NULL; i++) { if (check_regset(pid, reg_set_vec[i])) return -1; } return 0; } /* * Send error to father */ static void send_error(void) { int val = 0; if (write(pipefd[1], &val, sizeof(val)) == -1) pr_perror("write failed"); } /* * Write tid to pipe and then loop without changing registers */ static inline void send_tid_and_loop(int fd) { int tid = syscall(__NR_gettid); asm volatile( "lgr 2,%0\n" /* Arg 1: fd */ "la 3,%1\n" /* Arg 2: &tid */ "lghi 4,4\n" /* Arg 3: sizeof(int) */ "svc 4\n" /* __NR_write SVC: */ /* After SVC no more registers are changed */ "0: j 0b\n" /* Loop here */ : : "d" (fd), "Q" (tid) : "2", "3", "4"); } /* * Function for threads */ static void *thread_func(void *fd) { send_tid_and_loop(pipefd[1]); return NULL; } /* * Function executed by the child */ static void child_func(void) { pthread_t thread; int i; /* Close read end of pipe */ close(pipefd[0]); /* Create threads and send TID */ for (i = 0; i < NR_THREADS; i++) { if (pthread_create(&thread, NULL, thread_func, NULL) != 0) { pr_perror("Error create thread: %d", i); 
send_error(); } } /* Send tid and wait until get killed */ send_tid_and_loop(pipefd[1]); } /* * Attach to a thread */ static int ptrace_attach(pid_t pid) { if (ptrace(PTRACE_ATTACH, pid, 0, 0) == 0) { if (waitpid(pid, NULL, __WALL) < 0) { pr_perror("Waiting for thread %d failed", pid); return -1; } return 0; } pr_perror("Attach to thread %d failed", pid); return -1; } /* * Detach from a thread */ static int ptrace_detach(pid_t pid) { if (ptrace(PTRACE_DETACH, pid, 0, 0) == 0) return 0; pr_perror("Detach from thread %d failed", pid); return -1; } /* * Create child with threads and verify that registers are not corrupted */ int main(int argc, char *argv[]) { bool failed = false; pid_t child, pid; int i; test_init(argc, argv); test_msg("------------- START 1 PROCESS + %d THREADS ---------------\n", NR_THREADS); if (pipe(pipefd) == -1) { perror("pipe failed"); exit(EXIT_FAILURE); } child = fork(); if (child == 0) child_func(); /* Parent */ for (i = 0; i < NR_THREADS_ALL; i++) { if (read(pipefd[0], &pid, sizeof(pid_t)) == -1) { perror("Read from pipe failed"); failed = true; goto kill_all_threads; } if (pid == 0) { pr_err("Not all threads are started\n"); failed = true; goto kill_all_threads; } test_msg("STARTED: pid = %d\n", pid); thread_pids[i] = pid; } /* Close write end */ close(pipefd[1]); test_msg("---------------------- SET REGISTERS --------------------\n"); for (i = 0; i < NR_THREADS_ALL; i++) { pid = thread_pids[i]; test_msg("SET: pid = %d\n", pid); ptrace_attach(pid); set_regset_all(pid); ptrace_detach(pid); } test_daemon(); test_waitsig(); test_msg("-------------------- CHECK REGISTERS --------------------\n"); for (i = 0; i < NR_THREADS_ALL; i++) { pid = thread_pids[i]; test_msg("CHECK: pid = %d:\n", pid); ptrace_attach(pid); if (check_regset_all(pid) == 0) { test_msg(" -> OK\n"); } else { test_msg(" -> FAIL\n"); failed = true; } ptrace_detach(pid); } test_msg("----------------------- CLEANUP ------------------------\n"); kill_all_threads: for (i = 0; i < 
NR_THREADS_ALL; i++) { pid = thread_pids[i]; if (pid == 0) continue; test_msg("KILL: pid = %d\n", pid); kill(pid, SIGTERM); } if (failed) { fail("Registers changed"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/s390x_regs_check.desc000066400000000000000000000000221317335042600214510ustar00rootroot00000000000000{'arch': 's390x'} criu-3.6/test/zdtm/static/s390x_runtime_instr.c000066400000000000000000000111451317335042600215720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #ifndef __NR_s390_runtime_instr #define __NR_s390_runtime_instr 342 #endif #define NT_S390_RI_CB 0x30d #define BUF_SIZE (1024*1024) const char *test_doc = "Check runtime-instrumentation"; /* Original test provided by Martin Schwidefsky */ const char *test_author = "Alice Frosi "; struct runtime_instr_cb { unsigned long rca; unsigned long roa; unsigned long rla; unsigned int v : 1; unsigned int s : 1; unsigned int k : 1; unsigned int h : 1; unsigned int a : 1; unsigned int reserved1 : 3; unsigned int ps : 1; unsigned int qs : 1; unsigned int pc : 1; unsigned int qc : 1; unsigned int reserved2 : 1; unsigned int g : 1; unsigned int u : 1; unsigned int l : 1; unsigned int key : 4; unsigned int reserved3 : 8; unsigned int t : 1; unsigned int rgs : 3; unsigned int m : 4; unsigned int n : 1; unsigned int mae : 1; unsigned int reserved4 : 2; unsigned int c : 1; unsigned int r : 1; unsigned int b : 1; unsigned int j : 1; unsigned int e : 1; unsigned int x : 1; unsigned int reserved5 : 2; unsigned int bpxn : 1; unsigned int bpxt : 1; unsigned int bpti : 1; unsigned int bpni : 1; unsigned int reserved6 : 2; unsigned int d : 1; unsigned int f : 1; unsigned int ic : 4; unsigned int dc : 4; unsigned long reserved7; unsigned long sf; unsigned long rsic; unsigned long reserved8; }; /* * Return PSW mask */ static inline unsigned long 
extract_psw(void) { unsigned int reg1, reg2; asm volatile("epsw %0,%1" : "=d" (reg1), "=a" (reg2)); return (((unsigned long) reg1) << 32) | ((unsigned long) reg2); } /* * Enable runtime-instrumentation */ static inline void rion(void) { asm volatile (".word 0xaa01, 0x0000"); } /* * Disable runtime-instrumentation */ static inline void rioff(void) { asm volatile (".word 0xaa03, 0x0000"); } /* * Modify the current runtime-instrumentation control block */ static inline void mric(struct runtime_instr_cb *cb) { asm volatile(".insn rsy,0xeb0000000062,0,0,%0" : : "Q" (*cb)); } /* * Store the current runtime-instrumentation control block */ static inline void stric(struct runtime_instr_cb *cb) { asm volatile(".insn rsy,0xeb0000000061,0,0,%0" : "=Q" (*cb) : : "cc"); } /* * Ensure that runtime-intstrumentation is still working after C/R */ int main(int argc, char **argv) { struct runtime_instr_cb ricb, ricb_check; unsigned long *ricb_check_ptr = (unsigned long *) &ricb_check; unsigned long *ricb_ptr = (unsigned long *) &ricb; unsigned long psw_mask; void *buf; int i; test_init(argc, argv); buf = malloc(BUF_SIZE); memset(buf, 0, BUF_SIZE); memset(&ricb, 0, sizeof(ricb)); /* Initialize the default RI control block in the kernel */ if (syscall(__NR_s390_runtime_instr, 1, NULL) < 0) { if (errno == EOPNOTSUPP) { test_daemon(); test_waitsig(); skip("RI not supported"); pass(); return 0; } fail("Fail with error %d", errno); return -1; } /* Set buffer for RI */ ricb.rca = ricb.roa = (unsigned long) buf; ricb.rla = (unsigned long) buf + BUF_SIZE; mric(&ricb); /* Enable RI - afterwards the PSW will have RI bit set */ rion(); psw_mask = extract_psw(); /* Verify that the RI bit is set in the PSW */ if (!(psw_mask & PSW_MASK_RI)) { fail("Failed to enable RI"); return -1; } /* Collect RI records until we hit buffer-full condition */ while (ricb.rca < ricb.rla + 1) { for (i = 0; i < 10000; i++) asm volatile("" : : : "memory"); rioff(); stric(&ricb); rion(); } /* Disable RI */ rioff(); /* 
Save the current RI control block */ stric(&ricb); ricb_check = ricb; /* Re-enable RI for checkpoint */ rion(); /* Do C/R now */ test_daemon(); test_waitsig(); /* Verify that the RI bit is set in the PSW */ psw_mask = extract_psw(); if (!(psw_mask & PSW_MASK_RI)) { fail("RI bit in PSW not set"); return -1; } /* * Verify that the RI block has been restored correctly * and the buffer is unchanged */ rioff(); stric(&ricb); for (i = 0; i < 8; i++) { if (ricb_ptr[i] == ricb_check_ptr[i]) continue; /* Skip sf field because its value may change */ if (i == 6) continue; fail("%d:Got %016lx expected %016lx", i, ricb_ptr[i], ricb_check_ptr[i]); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/sched_policy00.c000066400000000000000000000031101317335042600205200ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check sched policy to be preserved"; const char *test_author = "Pavel Emelyanov "; static const int parm = 3; static int do_nothing(void) { while (1) sleep(10); return -1; } int main(int argc, char ** argv) { int pid, ret, err = 0; struct sched_param p; int old_rt_runtime_us = -1; FILE *file; test_init(argc, argv); pid = fork(); if (!pid) return do_nothing(); file = fopen("/sys/fs/cgroup/cpu/user.slice/cpu.rt_runtime_us", "r"); if (file) { ret = fscanf(file, "%d", &old_rt_runtime_us); fclose(file); if ((ret > 0) && (old_rt_runtime_us == 0)) { file = fopen("/sys/fs/cgroup/cpu/user.slice/cpu.rt_runtime_us", "w"); if (file) { fprintf(file, "100\n"); fclose(file); } } } p.sched_priority = parm; if (sched_setscheduler(pid, SCHED_RR, &p)) { pr_perror("Can't set policy"); kill(pid, SIGKILL); return -1; } test_daemon(); test_waitsig(); ret = sched_getscheduler(pid); if (ret != SCHED_RR) { fail("Broken/No policy"); err++; } ret = sched_getparam(pid, &p); if (ret < 0 || p.sched_priority != parm) { fail("Broken prio"); err++; } if (!err) pass(); kill(pid, SIGKILL); if 
(old_rt_runtime_us != -1) { file = fopen("/sys/fs/cgroup/cpu/user.slice/cpu.rt_runtime_us", "w"); if (file) { fprintf(file, "%d\n", old_rt_runtime_us); fclose(file); } } return err; } criu-3.6/test/zdtm/static/sched_policy00.desc000066400000000000000000000000441317335042600212170ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/sched_prio00.c000066400000000000000000000023531317335042600202020ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check sched prios to be preserved"; const char *test_author = "Pavel Emelyanov "; #define NRTASKS 3 static int do_nothing(void) { while (1) sleep(10); return -1; } static void kill_all(int *pid, int n) { int i; for (i = 0; i < n; i++) kill(pid[i], SIGKILL); } int main(int argc, char ** argv) { int pid[NRTASKS], i, parm[NRTASKS], ret; test_init(argc, argv); parm[0] = -20; parm[1] = 19; parm[2] = 1; for (i = 0; i < NRTASKS; i++) { pid[i] = fork(); if (!pid[i]) return do_nothing(); if (setpriority(PRIO_PROCESS, pid[i], parm[i])) { pr_perror("Can't set prio %d", i); kill_all(pid, i); return -1; } } test_daemon(); test_waitsig(); for (i = 0; i < NRTASKS; i++) { errno = 0; ret = getpriority(PRIO_PROCESS, pid[i]); if (errno) { fail("No prio for task %d", i); break; } if (ret != parm[i]) { fail("Broken nice for %d", i); break; } } if (i == NRTASKS) pass(); kill_all(pid, NRTASKS); return 0; } criu-3.6/test/zdtm/static/sched_prio00.desc000066400000000000000000000000441317335042600206710ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/scm00.c000066400000000000000000000054741317335042600166540ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that SCM_RIGHTS are preserved"; const char *test_author = "Pavel Emelyanov "; static int send_fd(int via, int fd) { struct msghdr h = 
/*
 * Receive one file descriptor sent with SCM_RIGHTS over the socket
 * @via.
 *
 * Returns the received descriptor (>= 0) on success, or a negative
 * code:
 *   -1  recvmsg() failed or returned no data
 *   -2  the payload or the ancillary data was truncated
 *   -3  the message carried no ancillary data
 *   -4  the ancillary data is not a SOL_SOCKET/SCM_RIGHTS message
 *       holding at least one descriptor
 */
static int recv_fd(int via)
{
	struct msghdr h = { 0 };
	struct cmsghdr *ch;
	struct iovec iov;
	/* The union gives the control buffer the alignment of a cmsghdr */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctl;
	char c;
	int *fdp;

	h.msg_control = ctl.buf;
	h.msg_controllen = sizeof(ctl.buf);

	h.msg_iov = &iov;
	h.msg_iovlen = 1;
	iov.iov_base = &c;
	iov.iov_len = sizeof(c);

	if (recvmsg(via, &h, 0) <= 0)
		return -1;

	/* MSG_CTRUNC reports truncated ancillary data, MSG_TRUNC payload */
	if (h.msg_flags & (MSG_TRUNC | MSG_CTRUNC))
		return -2;

	ch = CMSG_FIRSTHDR(&h);
	if (ch == NULL)
		return -3;

	if (ch->cmsg_level != SOL_SOCKET || ch->cmsg_type != SCM_RIGHTS)
		return -4;

	/* Make sure there really is a descriptor to read */
	if (ch->cmsg_len < CMSG_LEN(sizeof(int)))
		return -4;

	fdp = (int *)CMSG_DATA(ch);
	return *fdp;
}
/*
 * Pass the descriptors @fd1 and @fd2 over the socket @via as
 * SCM_RIGHTS ancillary data, together with a single zero byte of
 * payload.
 *
 * With SEPARATE defined each descriptor travels in its own control
 * message; otherwise both share one control message.
 *
 * Returns 0 on success and -1 if sendmsg() did not send anything.
 */
static int send_fd(int via, int fd1, int fd2)
{
#ifdef SEPARATE
	char ctl[2 * CMSG_SPACE(sizeof(int))];
#else
	char ctl[CMSG_SPACE(2 * sizeof(int))];
#endif
	char payload = '\0';
	struct iovec vec;
	struct msghdr msg = {};
	struct cmsghdr *cmsg;
	int *fds;

	/* One byte of ordinary data accompanies the descriptors */
	vec.iov_base = &payload;
	vec.iov_len = sizeof(payload);
	msg.msg_iov = &vec;
	msg.msg_iovlen = 1;

	memset(ctl, 0, sizeof(ctl));
	msg.msg_control = ctl;
	msg.msg_controllen = sizeof(ctl);

	/* First control message: always carries fd1 */
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	fds = (int *)CMSG_DATA(cmsg);
	fds[0] = fd1;

#ifdef SEPARATE
	/* cmsg_len has to be set before CMSG_NXTHDR() can step past it */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));

	cmsg = CMSG_NXTHDR(&msg, cmsg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	fds = (int *)CMSG_DATA(cmsg);
	fds[0] = fd2;
#else
	cmsg->cmsg_len = CMSG_LEN(2 * sizeof(int));
	fds[1] = fd2;
#endif

	return sendmsg(via, &msg, 0) <= 0 ? -1 : 0;
}
struct cmsghdr *ch; struct iovec iov; char buf[CMSG_SPACE(2 * sizeof(int))]; char c; int *fdp; h.msg_control = buf; h.msg_controllen = sizeof(buf); h.msg_iov = &iov; h.msg_iovlen = 1; iov.iov_base = &c; iov.iov_len = sizeof(c); if (recvmsg(via, &h, 0) <= 0) return -1; if (h.msg_flags & MSG_CTRUNC) { test_msg("CTR\n"); return -2; } /* No 2 SCM-s here, kernel merges them upon send */ ch = CMSG_FIRSTHDR(&h); if (h.msg_flags & MSG_TRUNC) return -2; if (ch == NULL) return -3; if (ch->cmsg_type != SCM_RIGHTS) return -4; fdp = (int *)CMSG_DATA(ch); *fd1 = fdp[0]; *fd2 = fdp[1]; return 0; } int main(int argc, char **argv) { int sk[2], p[2]; #define MSG "HELLO" char buf[8]; /* bigger than the MSG to check boundaries */ test_init(argc, argv); if (socketpair(PF_UNIX, SOCK_DGRAM, 0, sk) < 0) { pr_perror("Can't make unix pair"); exit(1); } if (pipe(p) < 0) { pr_perror("Can't make pipe"); exit(1); } if (send_fd(sk[0], p[0], p[1]) < 0) { pr_perror("Can't send descriptor"); exit(1); } close(p[0]); close(p[1]); p[0] = p[1] = -1; test_daemon(); test_waitsig(); if (recv_fd(sk[1], &p[0], &p[1]) < 0) { fail("Can't recv pipes back"); goto out; } if (write(p[1], MSG, sizeof(MSG)) != sizeof(MSG)) { fail("Pipe write-broken"); goto out; } if (read(p[0], buf, sizeof(buf)) != sizeof(MSG)) { fail("Pipe read-broken"); goto out; } if (strcmp(buf, MSG)) { buf[sizeof(buf) - 1] = '\0'; fail("Pipe read-broken (%s)", buf); goto out; } pass(); out: return 0; } criu-3.6/test/zdtm/static/scm04.c000077700000000000000000000000001317335042600177322scm03.custar00rootroot00000000000000criu-3.6/test/zdtm/static/seccomp_filter.c000066400000000000000000000067411317335042600207260ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #ifdef __NR_seccomp #include #include #endif #include "zdtmtst.h" const char *test_doc = "Check that SECCOMP_MODE_FILTER is restored"; const char *test_author = "Tycho Andersen "; #ifdef __NR_seccomp 
int get_seccomp_mode(pid_t pid) { FILE *f; char buf[PATH_MAX]; sprintf(buf, "/proc/%d/status", pid); f = fopen(buf, "r+"); if (!f) { pr_perror("fopen failed"); return -1; } while (NULL != fgets(buf, sizeof(buf), f)) { int mode; if (sscanf(buf, "Seccomp:\t%d", &mode) != 1) continue; fclose(f); return mode; } fclose(f); return -1; } int filter_syscall(int syscall_nr) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog bpf_prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), .filter = filter, }; if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf_prog) < 0) { pr_perror("prctl failed"); return -1; } return 0; } int main(int argc, char ** argv) { pid_t pid; int mode, status; int sk_pair[2], sk, ret; char c = 'K'; test_init(argc, argv); if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } pid = fork(); if (pid < 0) { pr_perror("fork"); return -1; } if (pid == 0) { sk = sk_pair[1]; close(sk_pair[0]); /* * Let's install a few filters separately to make sure the * chaining actually works. */ if (filter_syscall(__NR_ptrace) < 0) _exit(1); /* * The idea is to have a syscall that is used in restore_creds, * so we can make sure seccomp is actually suspended when that * is called. */ if (filter_syscall(__NR_setresuid) < 0) _exit(1); setuid(1000); zdtm_seccomp = 1; test_msg("SECCOMP_MODE_FILTER is enabled\n"); if (write(sk, &c, 1) != 1) { pr_perror("write"); _exit(1); } if (read(sk, &c, 1) != 1) { pr_perror("read"); _exit(1); } prctl(PR_SET_DUMPABLE, 1); if (write(sk, &c, 1) != 1) { pr_perror("write"); _exit(1); } if (read(sk, &c, 1) != 1) { pr_perror("read"); _exit(1); } /* We expect to be killed by our policy above. 
*/ ptrace(PTRACE_TRACEME); syscall(__NR_exit, 0); } sk = sk_pair[0]; close(sk_pair[1]); if ((ret = read(sk, &c, 1)) != 1) { pr_perror("read %d", ret); goto err; } test_daemon(); test_waitsig(); if (write(sk, &c, 1) != 1) { pr_perror("write"); goto err; } if ((ret = read(sk, &c, 1)) != 1) { pr_perror("read %d", ret); goto err; } mode = get_seccomp_mode(pid); if (write(sk, &c, 1) != 1) { pr_perror("write"); goto err; } if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid"); exit(1); } if (WTERMSIG(status) != SIGSYS) { pr_perror("expected SIGSYS, got %d", WTERMSIG(status)); exit(1); } if (mode != SECCOMP_MODE_FILTER) { fail("seccomp mode mismatch %d\n", mode); return 1; } pass(); return 0; err: kill(pid, SIGKILL); return 1; } #else /* __NR_seccomp */ #define TEST_SKIP_REASON "incompatible kernel (no seccomp)" #include "skip-me.c" #endif /* __NR_seccomp */ criu-3.6/test/zdtm/static/seccomp_filter.desc000066400000000000000000000000601317335042600214060ustar00rootroot00000000000000{'flags': 'suid', 'feature': 'seccomp_filters'} criu-3.6/test/zdtm/static/seccomp_filter_inheritance.c000066400000000000000000000062451317335042600232760ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #ifdef __NR_seccomp # include # include # include #endif #include "zdtmtst.h" const char *test_doc = "Check that SECCOMP_MODE_FILTER is restored"; const char *test_author = "Tycho Andersen "; #ifdef __NR_seccomp int get_seccomp_mode(pid_t pid) { FILE *f; char buf[PATH_MAX]; sprintf(buf, "/proc/%d/status", pid); f = fopen(buf, "r+"); if (!f) { pr_perror("fopen failed"); return -1; } while (NULL != fgets(buf, sizeof(buf), f)) { int mode; if (sscanf(buf, "Seccomp:\t%d", &mode) != 1) continue; fclose(f); return mode; } fclose(f); return -1; } int filter_syscall(int syscall_nr) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 
syscall_nr, 0, 1), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog bpf_prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), .filter = filter, }; if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf_prog) < 0) { pr_perror("prctl failed"); return -1; } return 0; } int main(int argc, char ** argv) { pid_t pid; int mode, status; int sk_pair[2], sk, ret; char c = 'K'; test_init(argc, argv); if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } pid = fork(); if (pid < 0) { pr_perror("fork"); return -1; } if (pid == 0) { pid_t pid2; sk = sk_pair[1]; close(sk_pair[0]); if (filter_syscall(__NR_ptrace) < 0) _exit(1); if (filter_syscall(__NR_fstat) < 0) _exit(1); zdtm_seccomp = 1; test_msg("SECCOMP_MODE_FILTER is enabled\n"); pid2 = fork(); if (pid2 < 0) _exit(1); if (!pid2) { if (write(sk, &c, 1) != 1) { pr_perror("write"); _exit(1); } if (read(sk, &c, 1) != 1) { pr_perror("read"); _exit(1); } /* We expect to be killed by our policy above. 
*/ ptrace(PTRACE_TRACEME); _exit(1); } if (waitpid(pid2, &status, 0) != pid2) { pr_perror("waitpid"); _exit(1); } if (WTERMSIG(status) != SIGSYS) { pr_err("expected SIGSYS, got %d\n", WTERMSIG(status)); _exit(1); } _exit(0); } sk = sk_pair[0]; close(sk_pair[1]); if ((ret = read(sk, &c, 1)) != 1) { pr_perror("read %d", ret); goto err; } test_daemon(); test_waitsig(); mode = get_seccomp_mode(pid); if (write(sk, &c, 1) != 1) { pr_perror("write"); goto err; } if (mode != SECCOMP_MODE_FILTER) { fail("seccomp mode mismatch %d\n", mode); return 1; } if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid"); _exit(1); } if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { fail("bad exit status"); return 1; } pass(); return 0; err: kill(pid, SIGKILL); return 1; } #else /* __NR_seccomp */ #define TEST_SKIP_REASON "incompatible kernel (no seccomp)" #include "skip-me.c" #endif /* __NR_seccomp */ criu-3.6/test/zdtm/static/seccomp_filter_inheritance.desc000066400000000000000000000000601317335042600237570ustar00rootroot00000000000000{'flags': 'suid', 'feature': 'seccomp_filters'} criu-3.6/test/zdtm/static/seccomp_filter_tsync.c000066400000000000000000000102271317335042600221400ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #ifdef __NR_seccomp # include # include # include # include #endif #include "zdtmtst.h" #ifndef SECCOMP_SET_MODE_FILTER #define SECCOMP_SET_MODE_FILTER 1 #endif #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif const char *test_doc = "Check that SECCOMP_FILTER_FLAG_TSYNC works correctly after restore"; const char *test_author = "Tycho Andersen "; #ifdef __NR_seccomp pthread_mutex_t getpid_wait; int get_seccomp_mode(pid_t pid) { FILE *f; char buf[PATH_MAX]; sprintf(buf, "/proc/%d/status", pid); f = fopen(buf, "r+"); if (!f) { pr_perror("fopen failed"); return -1; } while (NULL != fgets(buf, sizeof(buf), f)) { int mode; if (sscanf(buf, "Seccomp:\t%d", 
&mode) != 1) continue; fclose(f); return mode; } fclose(f); return -1; } int filter_syscall(int syscall_nr, unsigned int flags) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog bpf_prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), .filter = filter, }; if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &bpf_prog) < 0) { pr_perror("seccomp failed"); return -1; } return 0; } void *wait_and_getpid(void *arg) { pthread_mutex_lock(&getpid_wait); pthread_mutex_unlock(&getpid_wait); pthread_mutex_destroy(&getpid_wait); /* we expect the tg to get killed by the seccomp filter that was * installed via TSYNC */ ptrace(PTRACE_TRACEME); pthread_exit((void *)1); } int main(int argc, char ** argv) { pid_t pid; int mode, status; int sk_pair[2], sk, ret; char c = 'K'; test_init(argc, argv); if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } pid = fork(); if (pid < 0) { pr_perror("fork"); return -1; } if (pid == 0) { pthread_t th; void *p = NULL; if (pthread_mutex_init(&getpid_wait, NULL)) { pr_perror("pthread_mutex_init"); _exit(1); } sk = sk_pair[1]; close(sk_pair[0]); if (filter_syscall(__NR_getpid, 0) < 0) _exit(1); zdtm_seccomp = 1; pthread_mutex_lock(&getpid_wait); pthread_create(&th, NULL, wait_and_getpid, NULL); test_msg("SECCOMP_MODE_FILTER is enabled\n"); if (write(sk, &c, 1) != 1) { pr_perror("write"); _exit(1); } if (read(sk, &c, 1) != 1) { pr_perror("read"); _exit(1); } /* Now we have c/r'd with a shared filter, so let's install * another filter with TSYNC and make sure that it is * inherited. 
*/ if (filter_syscall(__NR_ptrace, SECCOMP_FILTER_FLAG_TSYNC) < 0) _exit(1); pthread_mutex_unlock(&getpid_wait); if (pthread_join(th, &p) != 0) { pr_perror("pthread_join"); exit(1); } /* Here we're abusing pthread exit slightly: if the thread gets * to call pthread_exit, the value of p is one, but if it gets * killed pthread_join doesn't set a value since the thread * didn't, so the value is null; we exit 0 to indicate success * as usual. */ syscall(__NR_exit, p); } sk = sk_pair[0]; close(sk_pair[1]); if ((ret = read(sk, &c, 1)) != 1) { pr_perror("read %d", ret); goto err; } test_daemon(); test_waitsig(); mode = get_seccomp_mode(pid); if (write(sk, &c, 1) != 1) { pr_perror("write"); goto err; } if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid"); exit(1); } if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { pr_err("expected 0 exit, got %d\n", WEXITSTATUS(status)); exit(1); } if (mode != SECCOMP_MODE_FILTER) { fail("seccomp mode mismatch %d\n", mode); return 1; } pass(); return 0; err: kill(pid, SIGKILL); return 1; } #else /* __NR_seccomp */ #define TEST_SKIP_REASON "incompatible kernel (no seccomp)" #include "skip-me.c" #endif /* __NR_seccomp */ criu-3.6/test/zdtm/static/seccomp_filter_tsync.desc000066400000000000000000000000601317335042600226260ustar00rootroot00000000000000{'flags': 'suid', 'feature': 'seccomp_filters'} criu-3.6/test/zdtm/static/seccomp_strict.c000066400000000000000000000043201317335042600207400ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #ifdef __NR_seccomp # include # include #endif #include "zdtmtst.h" const char *test_doc = "Check that SECCOMP_MODE_STRICT is restored"; const char *test_author = "Tycho Andersen "; #ifdef __NR_seccomp int get_seccomp_mode(pid_t pid) { FILE *f; char buf[PATH_MAX]; sprintf(buf, "/proc/%d/status", pid); f = fopen(buf, "r+"); if (!f) { pr_perror("fopen failed"); return -1; } while (NULL != fgets(buf, sizeof(buf), f)) { int mode; if 
(sscanf(buf, "Seccomp:\t%d", &mode) != 1) continue; fclose(f); return mode; } fclose(f); return -1; } int main(int argc, char ** argv) { pid_t pid; int mode, status; int sk_pair[2], sk; char c = 'K'; test_init(argc, argv); if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } pid = fork(); if (pid < 0) { pr_perror("fork"); return -1; } if (pid == 0) { sk = sk_pair[1]; close(sk_pair[0]); zdtm_seccomp = 1; if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) < 0) { pr_perror("prctl failed"); return -1; } test_msg("SECCOMP_MODE_STRICT is enabled\n"); if (write(sk, &c, 1) != 1) { pr_perror("write"); return -1; } if (read(sk, &c, 1) != 1) { _exit(1); pr_perror("read"); return -1; } syscall(__NR_exit, 0); } sk = sk_pair[0]; close(sk_pair[1]); if (read(sk, &c, 1) != 1) { pr_perror("read"); goto err; } test_daemon(); test_waitsig(); mode = get_seccomp_mode(pid); if (write(sk, &c, 1) != 1) { pr_perror("write"); goto err; } if (waitpid(pid, &status, 0) != pid) { pr_perror("waitpid"); exit(1); } if (status != 0) { pr_perror("The child exited with an unexpected code %d", status); exit(1); } if (mode != SECCOMP_MODE_STRICT) { fail("seccomp mode mismatch %d\n", mode); return 1; } pass(); return 0; err: kill(pid, SIGKILL); return 1; } #else /* __NR_seccomp */ #define TEST_SKIP_REASON "incompatible kernel (no seccomp)" #include "skip-me.c" #endif /* __NR_seccomp */ criu-3.6/test/zdtm/static/seccomp_strict.desc000066400000000000000000000000601317335042600214310ustar00rootroot00000000000000{'flags': 'suid', 'feature': 'seccomp_suspend'} criu-3.6/test/zdtm/static/selfexe00.c000066400000000000000000000022121317335042600175100ustar00rootroot00000000000000/* * A simple testee program with threads */ #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define gettid() pthread_self() const char *test_doc = "Check if /proc/self/exe points to same location after restore\n"; const char *test_author = 
"Cyrill Gorcunov #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc="Tests IPC semaphores migrates fine"; const char *test_author="Stanislav Kinsbursky "; static int sem_test(int id, struct sembuf *lock, struct sembuf *unlock, int lock_ops, int unlock_ops) { if (semop(id, lock, lock_ops) == -1) { fail("Failed to lock semaphore"); return -errno; } if (semop(id, unlock, unlock_ops) == -1) { fail("Failed to unlock semaphore"); return -errno; } return 0; } #define NSEMS 10 static int check_sem_by_key(int key, int num) { int id; struct sembuf lock[2] = { { .sem_num = num, .sem_op = 0, .sem_flg = 0, }, { .sem_num = num, .sem_op = 1, .sem_flg = 0, }, }; struct sembuf unlock[1] = { { .sem_num = num, .sem_op = -1, .sem_flg = 0, } }; int val; id = semget(key, NSEMS, 0777); if (id == -1) { fail("Can't get sem"); return -errno; } val = semctl(id, num, GETVAL); if (val < 0) { fail("Failed to get sem value"); return -errno; } return sem_test(id, lock, unlock, sizeof(lock)/sizeof(struct sembuf), sizeof(unlock)/sizeof(struct sembuf)); } static int check_sem_by_id(int id, int num, int val) { int curr; struct sembuf lock[] = { { .sem_num = num, .sem_op = val, .sem_flg = 0, }, }; struct sembuf unlock[] = { { .sem_num = num, .sem_op = - val * 2, .sem_flg = 0, } }; curr = semctl(id, num, GETVAL); if (curr < 0) { fail("Failed to get sem value"); return -errno; } if (curr != val) { fail("Sem has wrong value: %d instead of %d\n", curr, val); return -EFAULT; } return sem_test(id, lock, unlock, sizeof(lock)/sizeof(struct sembuf), sizeof(unlock)/sizeof(struct sembuf)); } int main(int argc, char **argv) { int id, key; int i; /* See man semctl */ union semun { int val; struct semid_ds *buf; unsigned short *array; struct seminfo *__buf; } val[NSEMS]; int ret, fail_count = 0; test_init(argc, argv); key = ftok(argv[0], 89063453); if (key == -1) { pr_perror("Can't make key"); return -1; } id = semget(key, NSEMS, 0777 
| IPC_CREAT | IPC_EXCL); if (id == -1) { fail_count++; pr_perror("Can't get sem array"); goto out; } for (i = 0; i < NSEMS; i++) { val[i].val = lrand48() & 0x7; if (semctl(id, i, SETVAL, val[i]) == -1) { fail_count++; pr_perror("Can't init sem %d", i); goto out_destroy; } } test_daemon(); test_waitsig(); for (i = 0; i < NSEMS; i++) { ret = check_sem_by_id(id, i, val[i].val); if (ret < 0) { fail_count++; fail("Check sem %d by id failed", i); goto out_destroy; } if (check_sem_by_key(key, i) < 0) { fail("Check sem %d by key failed", i); fail_count++; goto out_destroy; } val[i].val = semctl(id, 0, GETVAL); if (val[i].val < 0) { fail("Failed to get sem %d value", i); fail_count++; goto out_destroy; } if (val[i].val != 0) { fail("Non-zero sem %d value: %d", i, val[i].val); fail_count++; } } out_destroy: ret = semctl(id, 0, IPC_RMID); if (ret < 0) { fail("Destroy sem array failed"); fail_count++; } out: if (fail_count == 0) pass(); return fail_count; } criu-3.6/test/zdtm/static/sem.desc000066400000000000000000000000251317335042600171750ustar00rootroot00000000000000{'flavor': 'ns uns'} criu-3.6/test/zdtm/static/session00.c000066400000000000000000000104131317335042600175420ustar00rootroot00000000000000#include #include #include #include "zdtmtst.h" const char *test_doc = "Test that sid, pgid are restored"; const char *test_author = "Andrey Vagin "; #define DETACH 1 #define NEWSID 2 #define CHANGESID 4 #define DOUBLE_CHANGESID 8 struct testcase { int flags; pid_t pid; pid_t sid; }; static struct testcase testcases[] = { {DETACH, }, {NEWSID, }, {0, }, {DETACH|NEWSID, }, {CHANGESID, }, {DOUBLE_CHANGESID | CHANGESID, } }; /* 2 2 session00 4 4 \_ session00 # {NEWSID, }, 2 5 \_ session00 # {0, }, 8 8 \_ session00 2 9 | \_ session00 # {CHANGESID, } 10 10 \_ session00 11 11 \_ session00 2 12 \_ session00 # {DOUBLE_CHANGESID | CHANGESID, } 2 3 session00 # {DETACH, }, 6 7 session00 # {DETACH|NEWSID, }, */ #define NUM_CASES (sizeof(testcases) / sizeof(struct testcase)) static int 
fork_child(int i) { int p[2]; int status, ret; pid_t pid, sid; ret = pipe(p); if (ret) { pr_perror("pipe() failed"); return 1; } pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); return 1; } if (pid == 0) { if (testcases[i].flags & NEWSID) { sid = setsid(); if (sid == -1) { pr_perror("setsid failed"); write(p[1], &sid, sizeof(sid)); exit(1); } } if (testcases[i].flags & (DETACH | CHANGESID)) { pid = test_fork(); if (pid < 0) { write(p[1], &pid, sizeof(pid)); exit(1); } } if (pid != 0) { if (!(testcases[i].flags & CHANGESID)) exit(0); sid = setsid(); if (sid == -1) { pr_perror("setsid failed"); write(p[1], &sid, sizeof(sid)); exit(1); } close(p[1]); wait(NULL); if (getsid(getpid()) != sid) { fail("The process %d (%x) has SID=%d (expected %d)", pid, testcases[i].flags, sid, testcases[i].sid); exit(1); } exit(0); } if (testcases[i].flags & DOUBLE_CHANGESID) { pid = fork(); if (pid < 0) { write(p[1], &pid, sizeof(pid)); exit(1); } if (pid == 0) goto child; sid = setsid(); if (sid == -1) { pr_perror("setsid failed"); write(p[1], &sid, sizeof(sid)); exit(1); } close(p[1]); wait(NULL); if (getsid(getpid()) != sid) { fail("The process %d (%x) has SID=%d (expected %d)", pid, testcases[i].flags, sid, testcases[i].sid); exit(1); } exit(0); } child: pid = getpid(); write(p[1], &pid, sizeof(pid)); close(p[1]); test_waitsig(); pass(); exit(0); } close(p[1]); if (testcases[i].flags & DETACH) { pid_t ret; ret = wait(&status); if (ret != pid) { pr_perror("wait return %d instead of %d", ret, pid); kill(pid, SIGKILL); return 1; } } ret = read(p[0], &testcases[i].pid, sizeof(pid)); if (ret != sizeof(ret)) { pr_perror("read failed"); return 1; } /* wait when a child closes fd */ ret = read(p[0], &testcases[i].pid, sizeof(pid)); if (ret != 0) { pr_perror("read failed"); return 1; } close(p[0]); if (testcases[i].pid < 0) { pr_perror("child failed"); return 1; } testcases[i].sid = getsid(testcases[i].pid); return 0; } int main(int argc, char ** argv) { int i, ret, err = 0, 
status; pid_t pid; test_init(argc, argv); for (i = 0; i < NUM_CASES; i++) if (fork_child(i)) break; if (i != NUM_CASES) { int j; for (j = 0; j < i; j++) kill(testcases[j].pid, SIGTERM); return 1; } test_daemon(); test_waitsig(); for (i = 0; i < NUM_CASES; i++) { pid_t pid = testcases[i].pid; pid_t sid = getsid(pid); if (sid != testcases[i].sid) { fail("The process %d (%x) has SID=%d (expected %d)", pid, testcases[i].flags, sid, testcases[i].sid); err++; } ret = kill(pid, SIGKILL); if (ret == -1) { pr_perror("kill failed"); err++; } waitpid(pid, NULL, 0); if (testcases[i].flags & CHANGESID) { pid = wait(&status); if (pid == -1) { pr_perror("wait() failed"); err++; } if (!WIFEXITED(status) || WEXITSTATUS(status)) { fail("The process with pid %d returns %d\n", pid, status); err++; } } } pid = wait(&status); if (pid != -1 || errno != ECHILD) { pr_perror("%d isn't waited", pid); err++; } if (!err) pass(); return err > 0; } criu-3.6/test/zdtm/static/session00.desc000066400000000000000000000000251317335042600202340ustar00rootroot00000000000000{'flavor': 'ns uns'} criu-3.6/test/zdtm/static/session01.c000066400000000000000000000162151317335042600175510ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" #include "lock.h" const char *test_doc = "Test that sid, pgid are restored"; const char *test_author = "Andrey Vagin "; struct master { pid_t pid; pid_t ppid; pid_t sid; pid_t pgid; }; struct testcase { pid_t pid; pid_t ppid; pid_t sid; pid_t born_sid; pid_t pgid; int alive; struct master master; futex_t futex; }; enum { TEST_FORK, TEST_PGID, TEST_WAIT, TEST_MASTER, TEST_CHECK, TEST_EXIT, }; static struct testcase *testcases; static futex_t *fstate; static struct testcase __testcases[] = { { 2, 1, 2, 1, 2, 1 }, /* session00 */ { 4, 2, 4, 2, 4, 1 }, /* |\_session00 */ {15, 4, 4, 4, 15, 1 }, /* | |\_session00 */ {16, 4, 4, 4, 15, 1 }, /* | \_session00 */ {17, 4, 4, 4, 17, 0 }, /* | |\_session00 */ {18, 4, 4, 4, 17, 1 }, /* | 
\_session00 */ { 5, 2, 2, 2, 2, 1 }, /* |\_session00 */ { 8, 2, 8, 2, 8, 1 }, /* |\_session00 */ { 9, 8, 2, 2, 2, 1 }, /* | \_session00 */ {10, 2, 10, 2, 10, 1 }, /* |\_session00 */ {11, 10, 11, 2, 11, 1 }, /* | \_session00 */ {12, 11, 2, 2, 2, 1 }, /* | \_session00 */ {13, 2, 2, 2, 2, 0 }, /* \_session00 */ { 3, 13, 2, 2, 2, 1 }, /* session00 */ { 6, 2, 6, 2, 6, 0 }, /* \_session00 */ {14, 6, 6, 6, 6, 1 }, /* session00 */ }; #define TESTS (sizeof(__testcases) / sizeof(struct testcase)) #define check(n, a, b) do { if ((a) != (b)) { pr_perror("%s mismatch %d != %d", n, a, b); goto err; } } while (0) static int child(const int c); static int fork_children(struct testcase *t, int leader) { int i; pid_t cid; for (i = 0; i < TESTS; i++) { if (t->pid != testcases[i].ppid) continue; if (leader ^ (t->pid == testcases[i].born_sid)) continue; cid = test_fork_id(i); if (cid < 0) goto err; if (cid == 0) { test_msg("I'm %d with pid %d\n", i, getpid()); child(i); exit(0); } testcases[i].master.pid = cid; } return 0; err: return -1; } static int child(const int c) { int i; struct testcase *t = &testcases[c]; t->master.pid = getpid(); if (fork_children(t, 0)) goto err; if (t->pid == t->sid) { if (getpid() != getsid(getpid())) if (setsid() < 0) goto err; if (fork_children(t, 1)) goto err; } if (t->pid == t->pgid) { if (getpid() != getpgid(getpid())) if (setpgid(getpid(), getpid()) < 0) { pr_perror("setpgid() failed"); goto err; } t->master.pgid = t->master.pid; } futex_set_and_wake(&t->futex, c); if (c == 0) goto out; futex_wait_until(fstate, TEST_PGID); for (i = 0; i < TESTS; i++) { if (c == 0) break; if (t->pgid != testcases[i].pid) continue; if (getpgid(getpid()) != testcases[i].master.pid) if (setpgid(getpid(), testcases[i].master.pid) < 0) { pr_perror("setpgid() failed (%d) (%d)", c, i); goto err; } t->master.pgid = testcases[i].master.pid; break; } futex_set_and_wake(&t->futex, c); futex_wait_until(fstate, TEST_WAIT); for (i = 0; i < TESTS; i++) { if (t->pid != 
testcases[i].ppid) continue; if (testcases[i].alive) continue; test_msg("Wait porcess %d (pid %d)\n", i, testcases[i].master.pid); waitpid(testcases[i].master.pid, NULL, 0); } if (!t->alive) goto out; futex_set_and_wake(&t->futex, c); futex_wait_until(fstate, TEST_MASTER); /* Save the master copy */ t->master.ppid = getppid(); t->master.sid = getsid(getpid()); futex_set_and_wake(&t->futex, c); futex_wait_until(fstate, TEST_CHECK); check("pid", t->master.pid, getpid()); check("ppid", t->master.ppid, getppid()); check("sid", t->master.sid, getsid(getpid())); check("pgid", t->master.pgid, getpgid(getpid())); futex_set_and_wake(&t->futex, c); /* Wait while all test cases check results */ futex_wait_until(fstate, TEST_EXIT); out: return 0; err: futex_set_and_wake(&t->futex, -1); return 1; } int main(int argc, char ** argv) { int i, err, ret; void *ptr; BUG_ON(sizeof(*fstate) + sizeof(__testcases) > 4096); ptr = mmap(NULL, 4096, PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) return 1; fstate = ptr; futex_set(fstate, TEST_FORK); testcases = ptr + sizeof(*fstate); memcpy(testcases, &__testcases, sizeof(__testcases)); test_init(argc, argv); testcases[0].master.pid = getpid();; if (child(0)) goto err; for (i = 1; i < TESTS; i++) { ret = futex_wait_while(&testcases[i].futex, 0); if (ret < 0) return 1; futex_set(&testcases[i].futex, 0); } test_msg("TEST_PGID\n"); futex_set_and_wake(fstate, TEST_PGID); for (i = 1; i < TESTS; i++) { ret = futex_wait_while(&testcases[i].futex, 0); if (ret < 0) goto err; futex_set(&testcases[i].futex, 0); } test_msg("TEST_WAIT\n"); futex_set_and_wake(fstate, TEST_WAIT); for (i = 1; i < TESTS; i++) { if (!testcases[i].alive) continue; ret = futex_wait_while(&testcases[i].futex, 0); if (ret < 0) goto err; futex_set(&testcases[i].futex, 0); } for (i = 0; i < TESTS; i++) { if (testcases[0].pid != testcases[i].ppid) continue; if (testcases[i].alive) continue; test_msg("Wait porcess %d (pid %d)\n", i, 
testcases[i].master.pid); waitpid(testcases[i].master.pid, NULL, 0); } test_msg("TEST_MASTER\n"); futex_set_and_wake(fstate, TEST_MASTER); for (i = 1; i < TESTS; i++) { if (!testcases[i].alive) continue; ret = futex_wait_while(&testcases[i].futex, 0); if (ret < 0) goto err; futex_set(&testcases[i].futex, 0); test_msg("The process %d initilized\n", ret); } test_daemon(); test_waitsig(); err = 0; for (i = 1; i < TESTS; i++) { int j; struct testcase *t = testcases + i; pid_t sid, pgid; if (!t->alive) continue; for (j = 0; j < TESTS; j++) { struct testcase *p = testcases + j; /* sanity check */ if (p->pid == t->sid && t->master.sid != p->master.pid) { pr_perror("session mismatch (%d) %d != (%d) %d", i, t->master.sid, j, p->master.pid); err++; } if (p->pid == t->pgid && t->master.pgid != p->master.pid) { pr_perror("pgid mismatch (%d) %d != (%d) %d", i, t->master.pgid, j, p->master.pid); err++; } } sid = getsid(t->master.pid); if (t->master.sid != sid) { pr_perror("%d: session mismatch %d (expected %d)", i, sid, t->master.sid); err++; } pgid = getpgid(t->master.pid); if (t->master.pgid != pgid) { pr_perror("%d: pgid mismatch %d (expected %d)", i, t->master.pgid, pgid); err++; } } test_msg("TEST_CHECK\n"); futex_set_and_wake(fstate, TEST_CHECK); for (i = 1; i < TESTS; i++) { if (!testcases[i].alive) continue; ret = futex_wait_while(&testcases[i].futex, 0); if (ret < 0) goto err; futex_set(&testcases[i].futex, 0); if (ret < 0) { fail("Someone failed"); err++; continue; } test_msg("The process %u is restored correctly\n", (unsigned)ret); } test_msg("TEST_EXIT\n"); futex_set_and_wake(fstate, TEST_EXIT); if (!err) pass(); return 0; err: for (i = 1; i < TESTS; i++) { pid_t pid = testcases[i].master.pid; if (pid > 0) { ret = kill(pid, SIGKILL); test_msg("kill %d %s\n", pid, strerror(ret == -1 ? 
errno : 0)); } } return 1; } criu-3.6/test/zdtm/static/session01.desc000066400000000000000000000000251317335042600202350ustar00rootroot00000000000000{'flavor': 'ns uns'} criu-3.6/test/zdtm/static/session02.c000066400000000000000000000140361317335042600175510ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Create a crazy process tree"; const char *test_author = "Andrew Vagin "; struct process { pid_t pid; pid_t sid; int sks[2]; int dead; }; struct process *processes; int nr_processes = 20; int current = 0; static void cleanup() { int i; for (i = 0; i < nr_processes; i++) { if (processes[i].dead) continue; if (processes[i].pid <= 0) continue; kill(processes[i].pid, SIGKILL); } } enum commands { TEST_FORK, TEST_WAIT, TEST_SUBREAPER, TEST_SETSID, TEST_DIE }; struct command { enum commands cmd; int arg1; int arg2; }; static void handle_command(); static void mainloop() { while (1) handle_command(); } #define CLONE_STACK_SIZE 4096 /* All arguments should be above stack, because it grows down */ struct clone_args { char stack[CLONE_STACK_SIZE] __stack_aligned__; char stack_ptr[0]; int id; }; static int clone_func(void *_arg) { struct clone_args *args = (struct clone_args *) _arg; current = args->id; test_msg("%3d: Hello. 
My pid is %d\n", args->id, getpid()); mainloop(); exit(0); } static int make_child(int id, int flags) { struct clone_args args; pid_t cid; args.id = id; cid = clone(clone_func, args.stack_ptr, flags | SIGCHLD, &args); if (cid < 0) pr_perror("clone(%d, %d)", id, flags); processes[id].pid = cid; return cid; } static void handle_command() { int sk = processes[current].sks[0], ret, status = 0; struct command cmd; ret = read(sk, &cmd, sizeof(cmd)); if (ret != sizeof(cmd)) { pr_perror("Unable to get command"); goto err; } switch (cmd.cmd) { case TEST_FORK: { pid_t pid; pid = make_child(cmd.arg1, cmd.arg2); if (pid == -1) { status = -1; goto err; } test_msg("%3d: fork(%d, %x) = %d\n", current, cmd.arg1, cmd.arg2, pid); processes[cmd.arg1].pid = pid; } break; case TEST_WAIT: test_msg("%3d: wait(%d) = %d\n", current, cmd.arg1, processes[cmd.arg1].pid); if (waitpid(processes[cmd.arg1].pid, NULL, 0) == -1) { pr_perror("waitpid(%d)", processes[cmd.arg1].pid); status = -1; } break; case TEST_SUBREAPER: test_msg("%3d: subreaper(%d)\n", current, cmd.arg1); if (prctl(PR_SET_CHILD_SUBREAPER, cmd.arg1, 0, 0, 0) == -1) { pr_perror("PR_SET_CHILD_SUBREAPER"); status = -1; } break; case TEST_SETSID: test_msg("%3d: setsid()\n", current); if(setsid() == -1) { pr_perror("setsid"); status = -1; } break; case TEST_DIE: test_msg("%3d: die()\n", current); processes[current].dead = 1; shutdown(sk, SHUT_RDWR); exit(0); } ret = write(sk, &status, sizeof(status)); if (ret != sizeof(status)) { pr_perror("Unable to answer"); goto err; } if (status < 0) goto err; return; err: shutdown(sk, SHUT_RDWR); exit(1); } static int send_command(int id, enum commands op, int arg1, int arg2) { int sk = processes[id].sks[1], ret, status; struct command cmd = {op, arg1, arg2}; if (op == TEST_FORK) { if (processes[arg1].pid) { pr_perror("%d is busy", arg1); return -1; } } ret = write(sk, &cmd, sizeof(cmd)); if (ret != sizeof(cmd)) { pr_perror("Unable to send command"); goto err; } status = 0; ret = read(sk, 
&status, sizeof(status)); if (ret != sizeof(status) && !(status == 0 && op == TEST_DIE)) { pr_perror("Unable to get answer"); goto err; } if (status) { pr_perror("The command(%d, %d, %d) failed", op, arg1, arg2); goto err; } return 0; err: cleanup(); exit(1); } int main(int argc, char ** argv) { int pid, i; int fail_cnt = 0; test_init(argc, argv); processes = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (processes == NULL) { pr_perror("Unable to map share memory"); return 1; } for (i = 0; i < nr_processes; i++) { if (socketpair(PF_UNIX, SOCK_STREAM, 0, processes[i].sks) == -1) { pr_perror("socketpair"); return 1; } } pid = make_child(0, 0); if (pid < 0) return -1; /* * 5 5 \_ session02 ( 0) * 6 6 \_ session02 ( 1) * 8 7 | \_ session02 ( 3) * 15 12 | \_ session02 (10) * 10 10 \_ session02 ( 5) * 11 7 \_ session02 ( 6) * 13 12 \_ session02 ( 8) */ send_command(0, TEST_SUBREAPER, 1, 0); send_command(0, TEST_SETSID, 0, 0); send_command(0, TEST_FORK, 1, 0); send_command(1, TEST_FORK, 2, 0); send_command(2, TEST_SETSID, 0, 0); send_command(2, TEST_FORK, 3, CLONE_PARENT); send_command(2, TEST_DIE, 0, 0); send_command(1, TEST_WAIT, 2, 0); send_command(3, TEST_FORK, 4, 0); send_command(4, TEST_FORK, 5, 0); send_command(5, TEST_FORK, 6, 0); send_command(5, TEST_FORK, 7, 0); send_command(7, TEST_SETSID, 0, 0); send_command(7, TEST_FORK, 8, CLONE_PARENT); send_command(7, TEST_FORK, 9, CLONE_PARENT); send_command(7, TEST_DIE, 0, 0); send_command(5, TEST_WAIT, 7, 0); send_command(9, TEST_FORK, 10, 0); send_command(1, TEST_SUBREAPER, 1, 0); send_command(9, TEST_DIE, 0, 0); send_command(5, TEST_WAIT, 9, 0); send_command(1, TEST_SUBREAPER, 0, 0); send_command(4, TEST_DIE, 0, 0); send_command(3, TEST_WAIT, 4, 0); send_command(1, TEST_SETSID, 0, 0); send_command(5, TEST_SETSID, 0, 0); for (i = 0; i < nr_processes; i++) { if (processes[i].dead) continue; if (processes[i].pid == 0) continue; processes[i].sid = getsid(processes[i].pid); if 
(processes[i].sid == -1) { pr_perror("getsid(%d)", i); goto err; } } test_daemon(); test_waitsig(); for (i = 0; i < nr_processes; i++) { pid_t sid; if (processes[i].dead) continue; if (processes[i].pid == 0) continue; sid = getsid(processes[i].pid); if (sid == -1) { pr_perror("getsid(%d)", i); goto err; } if (sid != processes[i].sid) { fail("%d, %d: wrong sid %d (expected %d)", i, processes[i].pid, sid, processes[i].sid); fail_cnt++; } } if (fail_cnt) goto err; pass(); return 0; err: cleanup(); return 1; } criu-3.6/test/zdtm/static/session02.desc000066400000000000000000000000241317335042600202350ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/session03.c000066400000000000000000000151641317335042600175550ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Create a crazy process tree"; const char *test_author = "Andrew Vagin "; struct process { pid_t pid; pid_t sid; int sks[2]; int dead; int wait; }; #define MEM_SIZE (2 * PAGE_SIZE) #define PR_MAX (MEM_SIZE / sizeof(struct process)) struct process *processes; int nr_processes = 0; int current = 0; static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) { pid_t pid = siginfo->si_pid; if (siginfo->si_status == 2) waitpid(pid, NULL, WNOHANG); } static void cleanup() { int i, ret; for (i = 0; i < nr_processes; i++) { if (processes[i].dead) continue; if (processes[i].pid <= 0) continue; kill(processes[i].pid, SIGKILL); } while (1) { ret = wait(NULL); if (ret == -1) { if (errno == ECHILD) break; pr_perror("wait"); exit(1); } } } enum commands { TEST_FORK, TEST_DIE_WAIT, TEST_DIE, TEST_SUBREAPER, TEST_SETSID, TEST_MAX }; int cmd_weght[TEST_MAX] = {10, 3, 1, 10, 7}; int sum_weight = 0; static int get_rnd_op() { int i, m; if (sum_weight == 0) { for (i = 0; i < TEST_MAX; i++) sum_weight += cmd_weght[i]; } m = lrand48() % sum_weight; for (i = 0; i < TEST_MAX; i++) { if (m > 
cmd_weght[i]) { m -= cmd_weght[i]; continue; } return i; } return -1; } struct command { enum commands cmd; int arg1; int arg2; }; static void handle_command(); static void mainloop() { while (1) handle_command(); } #define CLONE_STACK_SIZE 4096 /* All arguments should be above stack, because it grows down */ struct clone_args { char stack[CLONE_STACK_SIZE] __stack_aligned__; char stack_ptr[0]; int id; }; static int clone_func(void *_arg) { struct clone_args *args = (struct clone_args *) _arg; current = args->id; test_msg("%3d: Hello. My pid is %d\n", args->id, getpid()); mainloop(); exit(0); } static int make_child(int id, int flags) { struct clone_args args; pid_t cid; args.id = id; cid = clone(clone_func, args.stack_ptr, flags | SIGCHLD, &args); if (cid < 0) pr_perror("clone(%d, %d)", id, flags); processes[id].pid = cid; return cid; } static void handle_command() { int sk = processes[current].sks[0], ret, status = 0; struct command cmd; ret = read(sk, &cmd, sizeof(cmd)); if (ret != sizeof(cmd)) { pr_perror("Unable to get command"); goto err; } switch (cmd.cmd) { case TEST_FORK: { pid_t pid; pid = make_child(cmd.arg1, cmd.arg2 ? 
CLONE_PARENT : 0); if (pid < 0) { status = -1; goto err; } test_msg("%3d: fork(%d, %x) = %d\n", current, cmd.arg1, cmd.arg2, pid); processes[cmd.arg1].pid = pid; } break; case TEST_SUBREAPER: test_msg("%3d: subreaper(%d)\n", current, cmd.arg1); if (prctl(PR_SET_CHILD_SUBREAPER, cmd.arg1, 0, 0, 0) == -1) { pr_perror("PR_SET_CHILD_SUBREAPER"); status = -1; } break; case TEST_SETSID: if (getsid(getpid()) == getpid()) break; test_msg("%3d: setsid()\n", current); if(setsid() == -1) { pr_perror("setsid"); status = -1; } break; case TEST_DIE_WAIT: test_msg("%3d: wait()\n", current); case TEST_DIE: test_msg("%3d: die()\n", current); processes[current].dead = 1; shutdown(sk, SHUT_RDWR); if (cmd.cmd == TEST_DIE_WAIT) exit(2); exit(0); default: pr_perror("Unknown operation %d", cmd.cmd); status = -1; break; } ret = write(sk, &status, sizeof(status)); if (ret != sizeof(status)) { pr_perror("Unable to answer"); goto err; } if (status < 0) goto err; return; err: shutdown(sk, SHUT_RDWR); exit(1); } static int send_command(int id, enum commands op, int arg) { int sk = processes[id].sks[1], ret, status; struct command cmd = {op, arg}; if (op == TEST_FORK) { cmd.arg1 = nr_processes; nr_processes++; if (nr_processes > PR_MAX) return -1; cmd.arg2 = arg; } ret = write(sk, &cmd, sizeof(cmd)); if (ret != sizeof(cmd)) { pr_perror("Unable to send command"); goto err; } status = 0; ret = read(sk, &status, sizeof(status)); if (ret != sizeof(status) && !(status == 0 && (op == TEST_DIE || op == TEST_DIE_WAIT))) { pr_perror("Unable to get answer"); goto err; } if (status) { pr_perror("The command(%d, %d) failed", op, arg); goto err; } return 0; err: cleanup(); exit(1); } int main(int argc, char ** argv) { struct sigaction act; int pid, i, ret; int fail_cnt = 0; test_init(argc, argv); if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) { pr_perror("PR_SET_CHILD_SUBREAPER"); return -1; } ret = sigaction(SIGCHLD, NULL, &act); if (ret < 0) { pr_perror("sigaction() failed"); return -1; } 
act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART; act.sa_sigaction = sigchld_handler; sigemptyset(&act.sa_mask); sigaddset(&act.sa_mask, SIGCHLD); ret = sigaction(SIGCHLD, &act, NULL); if (ret < 0) { pr_perror("sigaction() failed"); return -1; } processes = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (processes == NULL) { pr_perror("Unable to map share memory"); return 1; } for (i = 0; i < PR_MAX; i++) { if (socketpair(PF_UNIX, SOCK_STREAM, 0, processes[i].sks) == -1) { pr_perror("socketpair"); return 1; } } nr_processes++; pid = make_child(0, 0); if (pid < 0) return -1; while(nr_processes < PR_MAX) { int op, id; int flags = lrand48() % 2; op = get_rnd_op(); if (op == TEST_DIE || op == TEST_DIE_WAIT || op == TEST_SUBREAPER) { if (nr_processes == 1) continue; else id = lrand48() % (nr_processes - 1) + 1; } else if (op == TEST_FORK) { id = nr_processes * 9 / 10 + lrand48() % nr_processes / 10; while (processes[id].dead != 0) id--; } else id = lrand48() % nr_processes; if (processes[id].dead) continue; send_command(id, op, flags); } for (i = 0; i < nr_processes; i++) { if (processes[i].dead) continue; if (processes[i].pid == 0) continue; processes[i].sid = getsid(processes[i].pid); if (processes[i].sid == -1) { pr_perror("getsid(%d)", i); goto err; } } test_daemon(); test_waitsig(); for (i = 0; i < nr_processes; i++) { pid_t sid; if (processes[i].dead) continue; if (processes[i].pid == 0) continue; sid = getsid(processes[i].pid); if (sid == -1) { pr_perror("getsid(%d)", i); goto err; } if (sid != processes[i].sid) { fail("%d, %d: wrong sid %d (expected %d)", i, processes[i].pid, sid, processes[i].sid); fail_cnt++; } } if (fail_cnt) goto err; pass(); cleanup(); return 0; err: cleanup(); return 1; } criu-3.6/test/zdtm/static/session03.desc000066400000000000000000000000241317335042600202360ustar00rootroot00000000000000{'flags': 'noauto'} 
criu-3.6/test/zdtm/static/shm-mp.c000066400000000000000000000042201317335042600171170ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc="Tests mprotected SYSVIPC shmems"; const char *test_author="Pavel Emelyanov "; static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ static void segfault(int signo) { siglongjmp(segv_ret, 1); } static int check_prot(char *ptr, char val, int prot) { if (signal(SIGSEGV, segfault) == SIG_ERR) { fail("setting SIGSEGV handler failed: %m\n"); return -1; } if (!sigsetjmp(segv_ret, 1)) { if (*ptr != val) { fail("read value doesn't match what I wrote"); return -1; } if (!(prot & PROT_READ)) { fail("PROT_READ bypassed\n"); return -1; } } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_READ) { fail("PROT_READ rejected\n"); return -1; } if (!sigsetjmp(segv_ret, 1)) { *ptr = val; if (!(prot & PROT_WRITE)) { fail("PROT_WRITE bypassed\n"); return -1; } } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_WRITE) { fail("PROT_WRITE rejected\n"); return -1; } if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { fail("restoring SIGSEGV handler failed: %m\n"); return -1; } return 0; } int main(int argc, char **argv) { key_t key; int id, f = 0; char *mem; test_init(argc, argv); key = ftok(argv[0], 812135646); if (key == -1) { pr_perror("Can't make key"); goto out; } id = shmget(key, 2 * PAGE_SIZE, 0777 | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Can't make seg"); goto out; } mem = shmat(id, NULL, 0); if (mem == (void *)-1) { pr_perror("Can't shmat"); goto out_rm; } mem[0] = 'R'; mem[PAGE_SIZE] = 'W'; if (mprotect(mem, PAGE_SIZE, PROT_READ)) { pr_perror("Can't mprotect shmem"); goto out_dt; } test_daemon(); test_waitsig(); if (check_prot(mem, 'R', PROT_READ)) f++; if (check_prot(mem + PAGE_SIZE, 'W', PROT_READ | PROT_WRITE)) f++; if (!f) pass(); else fail("Some checks failed"); out_dt: 
shmdt(mem); out_rm: shmctl(id, IPC_RMID, NULL); out: return 0; } criu-3.6/test/zdtm/static/shm-mp.desc000066400000000000000000000000251317335042600176120ustar00rootroot00000000000000{'flavor': 'ns uns'} criu-3.6/test/zdtm/static/shm-unaligned.c000077700000000000000000000000001317335042600214012shm.custar00rootroot00000000000000criu-3.6/test/zdtm/static/shm-unaligned.desc000077700000000000000000000000001317335042600225712shm.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/shm.c000066400000000000000000000066421317335042600165170ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc="Tests detached shmems migrate fine"; const char *test_author="Stanislav Kinsbursky "; #define DEF_MEM_SIZE (40960) unsigned int shmem_size = DEF_MEM_SIZE; TEST_OPTION(shmem_size, uint, "Size of shared memory segment", 0); #define INIT_CRC (~0) static int fill_shm_seg(int id, size_t size) { uint8_t *mem; uint32_t crc = INIT_CRC; mem = shmat(id, NULL, 0); if (mem == (void *)-1) { pr_perror("Can't attach shm: %d", -errno); return -1; } datagen(mem, size, &crc); if (shmdt(mem) < 0) { pr_perror("Can't detach shm: %d", -errno); return -1; } return 0; } static int get_shm_seg(int key, size_t size, unsigned int flags) { int id; id = shmget(key, size, 0777 | flags); if (id == -1) { pr_perror("Can't get shm: %d", -errno); return -1; } return id; } static int prepare_shm(int key, size_t size) { int id; id = get_shm_seg(key, shmem_size, IPC_CREAT | IPC_EXCL); if (id == -1) { return -1; } if (fill_shm_seg(id, shmem_size) < 0) return -1; return id; } static int check_shm_id(int id, size_t size) { uint8_t *mem; uint32_t crc = INIT_CRC; mem = shmat(id, NULL, 0); if (mem == (void *)-1) { pr_perror("Can't attach shm: %d", -errno); return -1; } crc = INIT_CRC; if (datachk(mem, size, &crc)) { fail("shmem data are corrupted"); return -1; } if (shmdt(mem) < 0) { pr_perror("Can't detach 
shm: %d", -errno); return -1; } return 0; } static int check_shm_key(int key, size_t size) { int id; id = get_shm_seg(key, size, 0); if (id < 0) return -1; return check_shm_id(id, size); } int main(int argc, char **argv) { key_t key; int shm; int fail_count = 0; int ret = -1; void *mem; uint32_t crc = INIT_CRC; test_init(argc, argv); #ifdef ZDTM_SHM_UNALIGNED key = ftok(argv[0], 822155666); #else key = ftok(argv[0], 822155667); #endif if (key == -1) { pr_perror("Can't make key"); goto out; } shm = prepare_shm(key, shmem_size); if (shm == -1) { pr_perror("Can't prepare shm (1)"); goto out; } mem = shmat(shm, NULL, 0); if (mem == (void *)-1) { pr_perror("Can't shmat"); goto out; } test_daemon(); test_waitsig(); ret = check_shm_id(shm, shmem_size); if (ret < 0) { fail("ID check (1) failed\n"); fail_count++; goto out_shm; } ret = check_shm_key(key, shmem_size); if (ret < 0) { fail("KEY check failed\n"); fail_count++; goto out_shm; } if (datachk(mem, shmem_size, &crc)) { fail("shmem data is corrupted"); return -1; } if (shmdt(mem) < 0) { pr_perror("Can't detach shm"); return -1; } ret = shmctl(shm, IPC_RMID, NULL); if (ret < 0) { fail("Failed (1) to destroy segment: %d\n", -errno); fail_count++; goto out_shm; } /* * Code below checks that it's still possible to create new IPC SHM * segments */ shm = prepare_shm(key, shmem_size); if (shm == -1) { fail("Can't prepare shm (2)"); fail_count++; goto out; } ret = check_shm_id(shm, shmem_size); if (ret < 0) { fail("ID check (2) failed\n"); fail_count++; goto out_shm; } out_shm: ret = shmctl(shm, IPC_RMID, NULL); if (ret < 0) { fail("Failed (2) to destroy segment: %d\n", -errno); fail_count++; } if (fail_count == 0) pass(); out: return ret; } criu-3.6/test/zdtm/static/shm.desc000066400000000000000000000000251317335042600172000ustar00rootroot00000000000000{'flavor': 'ns uns'} criu-3.6/test/zdtm/static/sigaltstack.c000066400000000000000000000067341317335042600202430ustar00rootroot00000000000000#include #include #include #include 
#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check for alternate signal stack"; const char *test_author = "Cyrill Gorcunov "; static char stack_thread[SIGSTKSZ + TEST_MSG_BUFFER_SIZE] __stack_aligned__; static char stack_main[SIGSTKSZ + TEST_MSG_BUFFER_SIZE] __stack_aligned__; enum { SAS_MAIN_OLD, SAS_MAIN_NEW, SAS_THRD_OLD, SAS_THRD_NEW, SAS_MAX }; static stack_t sas_state[SAS_MAX]; static task_waiter_t t; #define exit_group(code) syscall(__NR_exit_group, code) #define gettid() syscall(__NR_gettid) static int sascmp(stack_t *old, stack_t *new) { return old->ss_size != new->ss_size || old->ss_sp != new->ss_sp || old->ss_flags != new->ss_flags; } static void show_ss(char *prefix, stack_t *s) { test_msg("%20s: at %p (size %8zu flags %#2x)\n", prefix, s->ss_sp, s->ss_size, s->ss_flags); } void thread_sigaction(int signo, siginfo_t *info, void *context) { if (sigaltstack(NULL, &sas_state[SAS_THRD_NEW])) pr_perror("thread sigaltstack"); show_ss("thread in sas", &sas_state[SAS_THRD_NEW]); task_waiter_complete(&t, 2); test_msg("Waiting in thread SAS\n"); task_waiter_wait4(&t, 3); test_msg("Leaving thread SAS\n"); } static void *thread_func(void *arg) { sas_state[SAS_THRD_OLD] = (stack_t) { .ss_size = sizeof(stack_thread) - 8, .ss_sp = stack_thread, .ss_flags = SS_ONSTACK, }; struct sigaction sa = { .sa_sigaction = thread_sigaction, .sa_flags = SA_RESTART | SA_ONSTACK, }; sigemptyset(&sa.sa_mask); if (sigaction(SIGUSR2, &sa, NULL)) { pr_perror("Can't set SIGUSR2 handler"); exit_group(-1); } task_waiter_wait4(&t, 1); if (sigaltstack(&sas_state[SAS_THRD_OLD], NULL)) { pr_perror("thread sigaltstack"); exit_group(-1); } syscall(__NR_tkill, gettid(), SIGUSR2); return NULL; } void leader_sigaction(int signo, siginfo_t *info, void *context) { if (sigaltstack(NULL, &sas_state[SAS_MAIN_NEW])) pr_perror("leader sigaltstack"); show_ss("leader in sas", &sas_state[SAS_MAIN_NEW]); } int main(int argc, char *argv[]) { pthread_t thread; 
sas_state[SAS_MAIN_OLD] = (stack_t) { .ss_size = sizeof(stack_main) - 8, .ss_sp = stack_main, .ss_flags = SS_ONSTACK, }; struct sigaction sa = { .sa_sigaction = leader_sigaction, .sa_flags = SA_RESTART | SA_ONSTACK, }; sigemptyset(&sa.sa_mask); test_init(argc, argv); task_waiter_init(&t); if (sigaction(SIGUSR1, &sa, NULL)) { pr_perror("Can't set SIGUSR1 handler"); exit(-1); } if (pthread_create(&thread, NULL, &thread_func, NULL)) { pr_perror("Can't create thread"); exit(-1); } if (sigaltstack(&sas_state[SAS_MAIN_OLD], NULL)) { pr_perror("sigaltstack"); exit(-1); } task_waiter_complete(&t, 1); task_waiter_wait4(&t, 2); test_daemon(); test_waitsig(); test_msg("Thread may leave SAS\n"); task_waiter_complete(&t, 3); syscall(__NR_tkill, gettid(), SIGUSR1); if (pthread_join(thread, NULL)) { fail("Error joining thread"); exit(-1); } task_waiter_fini(&t); show_ss("main old", &sas_state[SAS_MAIN_OLD]); show_ss("main new", &sas_state[SAS_MAIN_NEW]); show_ss("thrd old", &sas_state[SAS_THRD_OLD]); show_ss("thrd new", &sas_state[SAS_THRD_NEW]); if (sascmp(&sas_state[SAS_MAIN_OLD], &sas_state[SAS_MAIN_NEW]) || sascmp(&sas_state[SAS_THRD_OLD], &sas_state[SAS_THRD_NEW])) { fail("sas not restored"); } else pass(); return 0; } criu-3.6/test/zdtm/static/signalfd00.c000066400000000000000000000023341317335042600176510ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check for signalfd without signals"; const char *test_author = "Pavel Emelyanov "; int main(int argc, char *argv[]) { int fd, ret; sigset_t mask; siginfo_t info; test_init(argc, argv); sigemptyset(&mask); sigaddset(&mask, SIGUSR1); fd = signalfd(-1, &mask, SFD_NONBLOCK); if (fd < 0) { fail("Can't create signalfd"); exit(1); } sigemptyset(&mask); sigaddset(&mask, SIGUSR1); sigaddset(&mask, SIGUSR2); sigprocmask(SIG_BLOCK, &mask, NULL); test_daemon(); 
test_waitsig(); kill(getpid(), SIGUSR2); ret = read(fd, &info, sizeof(info)); if (ret >= 0) { fail("ghost signal"); exit(1); } kill(getpid(), SIGUSR1); ret = read(fd, &info, sizeof(info)); if (ret != sizeof(info)) { fail("no signal"); exit(1); } if (info.si_signo != SIGUSR1) { fail("wrong signal"); exit(1); } pass(); return 0; } criu-3.6/test/zdtm/static/sigpending.c000066400000000000000000000147511317335042600200570ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check pending signals"; const char *test_author = "Andrew Vagin "; static pid_t child; static int numsig; #define TESTSIG (SIGRTMAX) #define THREADSIG (SIGRTMIN) static siginfo_t share_infos[2]; static siginfo_t self_infos[64]; /* self */ static siginfo_t thread_infos[3]; /* thread */ static int share_nr; static int self_nr; static int thread_nr; #ifndef offsetof # define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #endif /* cr_siginfo is declared to get an offset of _sifields */ union cr_siginfo { struct { int si_signo; int si_errno; int si_code; union { int _pad[10]; /* ... */ } _sifields; } _info; siginfo_t info; }; typedef union cr_siginfo cr_siginfo_t; #define siginf_body(s) (&((cr_siginfo_t *)(s))->_info._sifields) #ifdef __i386__ /* * On x86_32 kernel puts only relevant union member when signal arrives, * leaving _si_fields to be filled with junk from stack. Check only * first 12 bytes: * // POSIX.1b signals. * struct * { * __pid_t si_pid; // Sending process ID. * __uid_t si_uid; // Real user ID of sending process. * sigval_t si_sigval; // Signal value. * } _rt; * Look at __copy_siginfo_to_user32() for more information. 
*/ # define _si_fields_sz 12 #else # define _si_fields_sz (sizeof(siginfo_t) - offsetof(cr_siginfo_t, _info._sifields)) #endif #define siginfo_filled (offsetof(cr_siginfo_t, _info._sifields) + _si_fields_sz) static pthread_mutex_t exit_lock; static pthread_mutex_t init_lock; static void sig_handler(int signal, siginfo_t *info, void *data) { uint32_t crc; test_msg("signo=%d si_code=%x\n", signal, info->si_code); if (test_go()) { pr_perror("The signal is received before unlocking"); return; } switch (signal) { case SIGCHLD: if ((info->si_code & CLD_EXITED) && (info->si_pid == child) && (info->si_status == 5)) numsig++; else { fail("Wrong siginfo"); exit(1); } return; } if (TESTSIG == signal || THREADSIG == signal) { siginfo_t *src; if (signal == TESTSIG) { src = &share_infos[share_nr]; share_nr++; } else if (getpid() == syscall(SYS_gettid)) { src = &self_infos[self_nr]; self_nr++; } else { src = &thread_infos[thread_nr]; thread_nr++; } crc = ~0; if (datachk((uint8_t *) siginf_body(info), _si_fields_sz, &crc)) { fail("CRC mismatch\n"); return; } if (memcmp(info, src, siginfo_filled)) { fail("Source and received info are differ\n"); return; } numsig++; return; } pr_perror("Unexpected signal"); exit(1); } static int thread_id; static void *thread_fn(void *args) { sigset_t blockmask, oldset, newset; struct sigaction act; memset(&oldset, 0, sizeof(oldset)); memset(&newset, 0, sizeof(oldset)); sigfillset(&blockmask); sigdelset(&blockmask, SIGTERM); if (sigprocmask(SIG_BLOCK, &blockmask, NULL) == -1) { pr_perror("sigprocmask"); return NULL; } if (sigprocmask(SIG_SETMASK, NULL, &oldset) == -1) { pr_perror("sigprocmask"); return NULL; } thread_id = syscall(SYS_gettid); act.sa_flags = SA_SIGINFO | SA_RESTART; act.sa_sigaction = sig_handler; sigemptyset(&act.sa_mask); sigaddset(&act.sa_mask, TESTSIG); sigaddset(&act.sa_mask, THREADSIG); if (sigaction(TESTSIG, &act, NULL)) { pr_perror("sigaction() failed"); return NULL; } pthread_mutex_unlock(&init_lock); 
pthread_mutex_lock(&exit_lock); if (sigprocmask(SIG_UNBLOCK, &blockmask, &newset) == -1) { pr_perror("sigprocmask"); return NULL; } sigdelset(&oldset, SIGTRAP); sigdelset(&newset, SIGTRAP); if (memcmp(&newset, &oldset, sizeof(newset))) { fail("The signal blocking mask was changed"); numsig = INT_MAX; } return NULL; } static int sent_sigs; int send_siginfo(int signo, pid_t pid, pid_t tid, int group, siginfo_t *info) { static int si_code = -10; uint32_t crc = ~0; info->si_code = si_code; si_code--; info->si_signo = signo; datagen((uint8_t *) siginf_body(info), _si_fields_sz, &crc); sent_sigs++; if (group) return syscall(SYS_rt_sigqueueinfo, pid, signo, info); else return syscall(SYS_rt_tgsigqueueinfo, pid, tid, signo, info); } int main(int argc, char ** argv) { sigset_t blockmask, oldset, newset; struct sigaction act; pthread_t pthrd; siginfo_t infop; int i; memset(&oldset, 0, sizeof(oldset)); memset(&newset, 0, sizeof(oldset)); test_init(argc, argv); pthread_mutex_init(&exit_lock, NULL); pthread_mutex_lock(&exit_lock); pthread_mutex_init(&init_lock, NULL); pthread_mutex_lock(&init_lock); if (pthread_create(&pthrd, NULL, thread_fn, NULL)) { pr_perror("Can't create a thread"); return 1; } pthread_mutex_lock(&init_lock); sigfillset(&blockmask); sigdelset(&blockmask, SIGTERM); if (sigprocmask(SIG_BLOCK, &blockmask, NULL) == -1) { pr_perror("sigprocmask"); return -1; } if (sigprocmask(SIG_BLOCK, NULL, &oldset) == -1) { pr_perror("sigprocmask"); return -1; } child = fork(); if (child == -1) { pr_perror("fork"); return -1; } if(child == 0) return 5; /* SIGCHLD */ if (waitid(P_PID, child, &infop, WNOWAIT | WEXITED)) { pr_perror("waitid"); return 1; } sent_sigs++; for (i = 0; i < sizeof(share_infos) / sizeof(siginfo_t); i++) { send_siginfo(TESTSIG, getpid(), -1, 1, share_infos + i); } for (i = 0; i < sizeof(self_infos) / sizeof(siginfo_t); i++) { send_siginfo(THREADSIG, getpid(), getpid(), 0, self_infos + i); } for (i = 0; i < sizeof(thread_infos) / sizeof(siginfo_t); i++) { 
send_siginfo(THREADSIG, getpid(), thread_id, 0, thread_infos + i); } act.sa_flags = SA_SIGINFO | SA_RESTART; act.sa_sigaction = sig_handler; sigemptyset(&act.sa_mask); if (sigaction(SIGCHLD, &act, NULL)) { pr_perror("sigaction() failed"); return -1; } sigaddset(&act.sa_mask, TESTSIG); sigaddset(&act.sa_mask, THREADSIG); if (sigaction(TESTSIG, &act, NULL)) { pr_perror("sigaction() failed"); return -1; } if (sigaction(THREADSIG, &act, NULL)) { pr_perror("sigaction() failed"); return -1; } test_daemon(); test_waitsig(); if (sigprocmask(SIG_UNBLOCK, &blockmask, &newset) == -1) { pr_perror("sigprocmask"); return -1; } pthread_mutex_unlock(&exit_lock); pthread_join(pthrd, NULL); sigdelset(&oldset, SIGTRAP); sigdelset(&newset, SIGTRAP); if (memcmp(&newset, &oldset, sizeof(newset))) { fail("The signal blocking mask was changed"); return 1; } if (numsig == sent_sigs) pass(); return 0; } criu-3.6/test/zdtm/static/sit.c000066400000000000000000000021731317335042600165220ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "check sit devices"; const char *test_author = "Pavel Emelyanov "; #define IF_NAME "zdtmsit0" #define LOCAL_ADDR "1.1.1.2" #define REMOT_ADDR "2.2.2.1" int main(int argc, char **argv) { int ret = 1; test_init(argc, argv); if (system("ip link add " IF_NAME " type sit ttl 13 local " LOCAL_ADDR " remote " REMOT_ADDR)) { pr_perror("Can't make sit device"); return 1; } if (system("ip -details addr list dev " IF_NAME " > sit.dump.test")) { fail("can't save net config"); goto out; } test_daemon(); test_waitsig(); if (system("ip -details addr list dev " IF_NAME " > sit.rst.test")) { fail("can't get net config"); goto out; } if (system("diff sit.rst.test sit.dump.test")) { fail("Net config differs after restore"); goto out; } pass(); ret = 0; out: return ret; } 
criu-3.6/test/zdtm/static/sit.desc000066400000000000000000000001561317335042600172150ustar00rootroot00000000000000{ 'deps': [ '/bin/sh', '/sbin/ip|/bin/ip', '/usr/bin/diff' ], 'flags': 'suid', 'flavor': 'ns uns' } criu-3.6/test/zdtm/static/sk-freebind-false.c000077700000000000000000000000001317335042600235312sk-freebind.custar00rootroot00000000000000criu-3.6/test/zdtm/static/sk-freebind.c000066400000000000000000000030431317335042600201110ustar00rootroot00000000000000#include #include #include #include #include /* for sockaddr_in and inet_ntoa() */ #include "zdtmtst.h" const char *test_doc = "Check that IP_FREEBIND is restored"; const char *test_author = "Andrew Vagin "; union sockaddr_inet { struct sockaddr_in v4; struct sockaddr_in6 v6; }; #ifdef ZDTM_FREEBIND_FALSE static const int fb_keep = 0; static const int port = 56789; #else static const int fb_keep = 1; static const int port = 56787; #endif int main(int argc, char **argv) { union sockaddr_inet addr; socklen_t len; int val, sock; test_init(argc, argv); addr.v6.sin6_family = AF_INET6; inet_pton(AF_INET6, "2001:db8::ff00:42:8329", &(addr.v6.sin6_addr)); addr.v6.sin6_port = htons(port); sock = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (sock == -1) { pr_perror("socket() failed"); return -1; } val = 1; if (setsockopt(sock, SOL_IP, IP_FREEBIND, &val, sizeof(int)) == -1 ) { pr_perror("setsockopt() error"); return -1; } if (bind(sock, (struct sockaddr *) &addr, sizeof(addr))) { pr_perror("bind()"); return -1; } if (!fb_keep) { val = 0; if (setsockopt(sock, SOL_IP, IP_FREEBIND, &val, sizeof(int)) == -1 ) { pr_perror("setsockopt() error"); return -1; } } test_daemon(); test_waitsig(); len = sizeof(int); if (getsockopt(sock, SOL_IP, IP_FREEBIND, &val, &len) == -1 ) { pr_perror("setsockopt() error"); return -1; } if (val != fb_keep) { fail("Unexpected value: %d", val); return -1; } pass(); return 0; } 
criu-3.6/test/zdtm/static/sk-netlink.c000066400000000000000000000064721317335042600200100ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" #ifndef SOL_NETLINK #define SOL_NETLINK 270 #endif #define UDEV_MONITOR_TEST 32 const char *test_doc = "Support of netlink sockets"; const char *test_author = "Andrew Vagin "; int main(int argc, char ** argv) { int ssk, bsk, csk, dsk; struct sockaddr_nl addr; struct msghdr msg; struct { struct nlmsghdr hdr; } req; struct iovec iov; char buf[4096]; test_init(argc, argv); ssk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); if (ssk < 0) { pr_perror("Can't create sock diag socket"); return -1; } bsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); if (bsk < 0) { pr_perror("Can't create sock diag socket"); return -1; } #if 0 int on, bbsk; bbsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); if (bbsk < 0) { pr_perror("Can't create sock diag socket"); return -1; } on = UDEV_MONITOR_TEST; setsockopt(bbsk, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); #endif csk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); if (csk < 0) { pr_perror("Can't create sock diag socket"); return -1; } dsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); if (dsk < 0) { pr_perror("Can't create sock diag socket"); return -1; } addr.nl_family = AF_NETLINK; addr.nl_groups = 0; addr.nl_pid = getpid(); if (bind(ssk, (struct sockaddr *) &addr, sizeof(struct sockaddr_nl))) { pr_perror("bind"); return 1; } addr.nl_groups = 1 << (UDEV_MONITOR_TEST - 1); addr.nl_pid = 0; if (bind(bsk, (struct sockaddr *) &addr, sizeof(struct sockaddr_nl))) { pr_perror("bind"); return 1; } addr.nl_pid = getpid();; addr.nl_groups = 1 << (UDEV_MONITOR_TEST - 1); if (connect(csk, (struct sockaddr *) &addr, sizeof(struct sockaddr_nl))) { pr_perror("connect"); return 1; } test_daemon(); test_waitsig(); req.hdr.nlmsg_len = sizeof(req); req.hdr.nlmsg_type = 0x1234; req.hdr.nlmsg_flags = NLM_F_DUMP | 
NLM_F_REQUEST; req.hdr.nlmsg_seq = 0xabcd; memset(&msg, 0, sizeof(msg)); msg.msg_namelen = 0; msg.msg_iov = &iov; msg.msg_iovlen = 1; iov.iov_base = (void *) &req; iov.iov_len = sizeof(req);; if (sendmsg(csk, &msg, 0) < 0) { pr_perror("Can't send request message"); return 1; } memset(&msg, 0, sizeof(msg)); msg.msg_namelen = 0; msg.msg_iov = &iov; msg.msg_iovlen = 1; iov.iov_base = buf; iov.iov_len = sizeof(buf); if (recvmsg(ssk, &msg, 0) < 0) { pr_perror("Can't recv request message"); return 1; } if (recvmsg(bsk, &msg, 0) < 0) { pr_perror("Can't recv request message"); return 1; } addr.nl_family = AF_NETLINK; addr.nl_groups = 0; addr.nl_pid = getpid(); memset(&msg, 0, sizeof(msg)); msg.msg_namelen = sizeof(addr); msg.msg_name = &addr; msg.msg_iov = &iov; msg.msg_iovlen = 1; iov.iov_base = (void *) &req; iov.iov_len = sizeof(req);; if (sendmsg(dsk, &msg, 0) < 0) { pr_perror("Can't send request message"); return 1; } memset(&msg, 0, sizeof(msg)); msg.msg_namelen = 0; msg.msg_iov = &iov; msg.msg_iovlen = 1; iov.iov_base = buf; iov.iov_len = sizeof(buf); if (recvmsg(ssk, &msg, 0) < 0) { pr_perror("Can't recv request message"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/sk-netlink.desc000066400000000000000000000000221317335042600204650ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/sk-unix-rel.c000066400000000000000000000040411317335042600200750ustar00rootroot00000000000000 #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test unix stream sockets with relative name\n"; const char *test_author = "Cyrill Gorcunov #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check unconnected unix sockets"; const char *test_author = "Vagin Andrew "; int main(int argc, char ** argv) { int sk, skc; int ret, len; char path[PATH_MAX]; struct sockaddr_un addr; socklen_t addrlen; test_init(argc, argv); sk = 
socket(AF_UNIX, SOCK_STREAM, 0); if (sk == -1) { pr_perror("socket"); return 1; } skc = socket(AF_UNIX, SOCK_STREAM, 0); if (skc == -1) { pr_perror("socket"); return 1; } len = snprintf(path, sizeof(path), "X/zdtm-%s-%d/X", argv[0], getpid()); addr.sun_family = AF_UNIX; strncpy(addr.sun_path, path, sizeof(addr.sun_path)); addrlen = sizeof(addr.sun_family) + len; addr.sun_path[0] = 0; addr.sun_path[len - 1] = 0; ret = bind(sk, (struct sockaddr *) &addr, addrlen); if (ret) { fail("bind\n"); return 1; } test_daemon(); test_waitsig(); if (listen(sk, 1) == -1) { pr_perror("listen"); return 1; } if (connect(skc, (struct sockaddr *) &addr, addrlen) == -1) { fail("Unable to connect"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/skip-me.c000066400000000000000000000002431317335042600172640ustar00rootroot00000000000000int main(int argc, char ** argv) { test_init(argc, argv); test_msg("Skipping test:" TEST_SKIP_REASON); test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/sleeping00.c000066400000000000000000000004231317335042600176650ustar00rootroot00000000000000#include #include "zdtmtst.h" const char *test_doc = "Suspend while migrating"; const char *test_author = "Roman Kagan "; int main(int argc, char ** argv) { test_init(argc, argv); test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/sock_filter.c000066400000000000000000000040671317335042600202330ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check socket filter"; const char *test_author = "Pavel Emelyanov "; #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER #endif #define SFLEN 14 int main(int argc, char **argv) { int sk; struct sock_fprog p; struct sock_filter f[SFLEN] = { { 0x28, 0, 0, 0x0000000c }, { 0x15, 0, 4, 0x00000800 }, { 0x20, 0, 0, 0x0000001a }, { 0x15, 8, 0, 0x7f000001 }, { 0x20, 0, 0, 0x0000001e }, { 0x15, 6, 7, 0x7f000001 }, { 0x15, 1, 0, 
0x00000806 }, { 0x15, 0, 5, 0x00008035 }, { 0x20, 0, 0, 0x0000001c }, { 0x15, 2, 0, 0x7f000001 }, { 0x20, 0, 0, 0x00000026 }, { 0x15, 0, 1, 0x7f000001 }, { 0x6, 0, 0, 0x0000ffff }, { 0x6, 0, 0, 0x00000000 }, }; struct sock_filter f2[SFLEN], f3[SFLEN]; socklen_t len; test_init(argc, argv); sk = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); if (sk < 0) { pr_perror("No socket"); return 1; } p.len = SFLEN; p.filter = f; if (setsockopt(sk, SOL_SOCKET, SO_ATTACH_FILTER, &p, sizeof(p))) { pr_perror("No filter"); return 1; } len = 0; if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) { pr_perror("No len"); return 1; } if (len != SFLEN) { pr_perror("Len mismatch"); return 1; } memset(f2, 0, sizeof(f2)); if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, f2, &len)) { perror("No filter"); return 1; } if (len != SFLEN) { pr_perror("Len mismatch2"); return 1; } test_daemon(); test_waitsig(); len = 0; if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) { fail("No len"); return 1; } if (len != SFLEN) { fail("Len mismatch"); return 1; } memset(f3, 0, sizeof(f3)); if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, f3, &len)) { fail("No filter"); return 1; } if (len != SFLEN) { fail("Len mismatch2"); return 1; } if (memcmp(f2, f3, sizeof(f2))) { fail("Filters mismatch"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/sock_opts00.c000066400000000000000000000033141317335042600200650ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check various socket options to work"; const char *test_author = "Pavel Emelyanov "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY #define NOPTS 7 int main(int argc, char ** argv) { int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; socklen_t len = sizeof(int); vname[0] = SO_PRIORITY; vname[1] = SO_RCVLOWAT; vname[2] = SO_MARK; vname[3] = SO_PASSCRED; vname[4] = SO_PASSSEC; vname[5] = SO_DONTROUTE; vname[6] = SO_NO_CHECK; test_init(argc, argv); sock = socket(PF_INET, 
SOCK_STREAM, 0); if (sock < 0) { pr_perror("can't create socket"); return 1; } for (i = 0; i < NOPTS; i++) { ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); if (ret) { pr_perror("can't get option %d", i); return 1; } val[i]++; ret = setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); if (ret) { pr_perror("can't set option %d", i); return 1; } ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); if (ret) { pr_perror("can't get option %d 2", i); return 1; } if (rval != val[i]) { if (rval + 1 == val[i]) { pr_perror("can't reset option %d want %d have %d", i, val[i], rval); return 1; } /* kernel tuned things up on set */ val[i] = rval; } } test_daemon(); test_waitsig(); for (i = 0; i < NOPTS; i++) { ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); if (ret) { pr_perror("can't get option %d again", i); return 1; } if (val[i] != rval) { fail("option %d changed", i); return 1; } } pass(); close(sock); return 0; } criu-3.6/test/zdtm/static/sock_opts00.desc000066400000000000000000000000221317335042600205520ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/sock_opts01.c000066400000000000000000000023271317335042600200710ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that SO_BINDTODEVICE option works"; const char *test_author = "Pavel Emelyanov "; int main(int argc, char ** argv) { int sock, ret; char dev[IFNAMSIZ], dev2[IFNAMSIZ]; socklen_t len, len2; test_init(argc, argv); sock = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("can't create socket"); return 1; } ret = setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 5); if (ret < 0) ret = setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, "lo", 3); if (ret < 0) { pr_perror("can't bind to eth0"); return 1; } len = sizeof(dev); ret = getsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, &dev, &len); if (ret < 0) { pr_perror("can't get dev binding"); return 1; } 
test_daemon(); test_waitsig(); len2 = sizeof(dev); ret = getsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, &dev2, &len2); if (ret < 0) { fail("can't get dev binding2"); return 1; } if ((len != len2) || strncmp(dev, dev2, len)) fail("wrong bound device"); else pass(); close(sock); return 0; } criu-3.6/test/zdtm/static/sock_opts01.desc000066400000000000000000000000221317335042600205530ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/sock_peercred.c000066400000000000000000000046421317335042600205360ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define STACK_SIZE (1024 * 1024) #define GID_INC 1 #define UID_INC 1 const char *test_doc = "Check peercred of a unix socket remains the same"; const char *test_author = "Kirill Tkhai "; static int child_func(void *fd_p) { int fd = (int)(unsigned long)fd_p; struct ucred ucred; socklen_t len; int sks[2]; if (setgid(getgid() + GID_INC) != 0) { pr_perror("Can't setgid()"); return 1; } if (setuid(getuid() + UID_INC) != 0) { pr_perror("Can't setuid()"); return 1; } if (socketpair(PF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sks) < 0) { pr_perror("Can't create socketpair"); return 1; } len = sizeof(ucred); if (getsockopt(sks[0], SOL_SOCKET, SO_PEERCRED, &ucred, &len) < 0) { pr_perror("Can't getsockopt()"); return 1; } if (ucred.pid != getpid() || ucred.uid != getuid() || ucred.gid != getgid()) { pr_perror("Wrong sockopts"); return 1; } /* If sks[1] == fd, the below closes it, but we don't care */ if (dup2(sks[0], fd) == -1) { pr_perror("Can't dup fd\n"); return 1; } return 0; } int main(int argc, char **argv) { struct ucred ucred; int fd, status; socklen_t len; char *stack; pid_t pid; test_init(argc, argv); /* * We do not know, which direction stack grows. * So just allocate 2 * STACK_SIZE for stack and * give clone() pointer to middle of this memory. 
*/ stack = malloc(2 * STACK_SIZE); if (!stack) { pr_err("malloc\n"); return 1; } /* Find unused fd */ for (fd = 0; fd < INT_MAX; fd++) { if (fcntl(fd, F_GETFD) == -1 && errno == EBADF) break; } if (fd == INT_MAX) { pr_err("INT_MAX happens...\n"); return 1; } pid = clone(child_func, stack + STACK_SIZE, CLONE_FILES|SIGCHLD, (void *)(unsigned long)fd); if (pid == -1) { pr_perror("clone"); return 1; } if (wait(&status) == -1 || status) { pr_perror("wait error: status=%d\n", status); return 1; } test_daemon(); test_waitsig(); len = sizeof(ucred); if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &ucred, &len) < 0) { fail("Can't getsockopt()"); return 1; } if (ucred.pid != pid || ucred.gid != getuid() + UID_INC || ucred.gid != getgid() + GID_INC) { fail("Wrong pid, uid or gid\n"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/sock_peercred.desc000066400000000000000000000000331317335042600212200ustar00rootroot00000000000000{ 'flags': 'suid noauto' } criu-3.6/test/zdtm/static/socket-ext.c000066400000000000000000000036561317335042600200200ustar00rootroot00000000000000 #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test external sockets\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include #include static int port = 8880; #define BUF_SIZE 4096 int fill_sock_buf(int fd) { int flags; int size; int ret; flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_err("Can't get flags"); return -1; } if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { pr_err("Can't set flags"); return -1; } size = 0; while (1) { char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) break; pr_err("write"); return -1; } size += ret; } if (fcntl(fd, F_SETFL, flags) == -1) { pr_err("Can't set flags"); return -1; } test_msg("snd_size = %d\n", size); return size; } static int 
clean_sk_buf(int fd) { int size, ret; char buf[BUF_SIZE]; size = 0; while (1) { ret = read(fd, buf, sizeof(buf)); if (ret == -1) { pr_err("read"); return -11; } if (ret == 0) break; size += ret; } test_msg("rcv_size = %d\n", size); return size; } #define TEST_MSG "Hello World!" int main(int argc, char **argv) { char *newns = getenv("ZDTM_NEWNS"); int fd, fd_s, ctl_fd; pid_t extpid; int pfd[2]; int ret = 0, snd_size = 0, rcv_size = 0; #ifndef ZDTM_TCP_LAST_ACK char buf[BUF_SIZE]; #endif if (newns) test_init(argc, argv); if (pipe(pfd)) { pr_perror("pipe() failed"); return 1; } extpid = fork(); if (extpid < 0) { pr_perror("fork() failed"); return 1; } else if (extpid == 0) { int size = 0; char c; if (!newns) test_ext_init(argc, argv); close(pfd[1]); if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { pr_perror("Can't read port\n"); return 1; } close(pfd[0]); fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (fd < 0) return 1; ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (ctl_fd < 0) return 1; /* == The preparation stage == */ if (read(ctl_fd, &size, sizeof(size)) != sizeof(size)) { pr_perror("read"); return 1; } if (shutdown(fd, SHUT_WR) == -1) { pr_perror("shutdown"); return 1; } if (write(ctl_fd, &size, sizeof(size)) != sizeof(size)) { pr_perror("write"); return 1; } /* == End of the preparation stage == */ /* Checkpoint/restore */ /* == The final stage == */ if (read(ctl_fd, &c, 1) != 0) { pr_perror("read"); return 1; } #ifdef ZDTM_TCP_LAST_ACK size = clean_sk_buf(fd); if (size < 0) return 1; #else if (read(fd, buf, sizeof(buf)) != sizeof(TEST_MSG) || strncmp(buf, TEST_MSG, sizeof(TEST_MSG))) { pr_perror("read"); return 1; } #endif if (write(ctl_fd, &size, sizeof(size)) != sizeof(size)) { pr_perror("write"); return 1; } /* == End of the final stage == */ close(ctl_fd); close(fd); return 0; } if (!newns) test_init(argc, argv); if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed"); return 1; } 
close(pfd[0]); if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { pr_perror("Can't send port"); return 1; } close(pfd[1]); /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection"); return 1; } ctl_fd = tcp_accept_server(fd_s); if (ctl_fd < 0) { pr_err("can't accept client connection"); return 1; } /* == The preparation stage == */ #ifdef ZDTM_TCP_LAST_ACK snd_size = fill_sock_buf(fd); if (snd_size <= 0) return 1; #endif if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { pr_perror("read"); return 1; } if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { pr_perror("read"); return 1; } /* == End of the preparation stage */ #ifdef ZDTM_TCP_LAST_ACK if (shutdown(fd, SHUT_WR) == -1) { pr_perror("shutdown"); return 1; } #endif test_daemon(); test_waitsig(); /* == The final stage == */ if (shutdown(ctl_fd, SHUT_WR) == -1) { pr_perror("shutdown"); return 1; } #ifndef ZDTM_TCP_LAST_ACK if (write(fd, TEST_MSG, sizeof(TEST_MSG)) != sizeof(TEST_MSG)) { pr_perror("write"); return 1; } if (shutdown(fd, SHUT_WR) == -1) { pr_perror("shutdown"); return 1; } #endif rcv_size = clean_sk_buf(fd); if (ret != rcv_size) { fail("The child sent %d bytes, but the parent received %d bytes\n", ret, rcv_size); return 1; } if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { pr_perror("read"); return 1; } /* == End of the final stage == */ if (ret != snd_size) { fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-close-wait.desc000066400000000000000000000001271317335042600225350ustar00rootroot00000000000000{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} 
criu-3.6/test/zdtm/static/socket-tcp-close-wait.hook000077700000000000000000000000001317335042600273052socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-close0.c000066400000000000000000000027551317335042600211700ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that tcp-close option closes connected tcp socket"; const char *test_author = "Pavel Begunkov "; static int port = 8880; static int check_socket_closed(int sk) { int err, buffer = 0; struct { __u8 tcpi_state; } info; socklen_t len = sizeof(info); err = getsockopt(sk, IPPROTO_TCP, TCP_INFO, (void *)&info, &len); if (err != 0) { pr_perror("Can't get socket state\n"); return -1; } else if (info.tcpi_state != TCP_CLOSE) { pr_err("Invalid socket state (%i)\n", (int)info.tcpi_state); return -1; } err = recv(sk, &buffer, sizeof(buffer), 0); if (!err || errno != ENOTCONN) { pr_perror("Invalid recv response\n"); return -1; } return 0; } int main(int argc, char **argv) { int fd, fd_s, clt; test_init(argc, argv); fd_s = tcp_init_server(AF_INET, &port); if (fd_s < 0) { pr_err("Server initializations failed\n"); return 1; } clt = tcp_init_client(AF_INET, "localhost", port); if (clt < 0) return 1; fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("Can't accept client connection\n"); return 1; } close(fd_s); test_daemon(); test_waitsig(); if (check_socket_closed(fd)) { fail("Server socket isn't closed\n"); return 1; } if (check_socket_closed(clt)) { fail("Client socket isn't closed\n"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-close0.desc000066400000000000000000000001131317335042600216460ustar00rootroot00000000000000{'dopts': '--tcp-established', 'ropts': '--tcp-close', 'flags': 'reqrst '} criu-3.6/test/zdtm/static/socket-tcp-close1.c000066400000000000000000000020041317335042600211540ustar00rootroot00000000000000#include #include #include #include #include #include 
"zdtmtst.h" const char *test_doc = "Check that tcp-close option doesn't close listening tcp socket"; const char *test_author = "Pavel Begunkov "; static int port = 8880; static int check_socket_state(int sk, int state) { int err; struct { __u8 tcpi_state; } info; socklen_t len = sizeof(info); err = getsockopt(sk, IPPROTO_TCP, TCP_INFO, (void *)&info, &len); if (err != 0) { pr_perror("Can't get socket state\n"); return -1; } return info.tcpi_state == state ? 0 : -1; } int main(int argc, char **argv) { int fd_s; test_init(argc, argv); fd_s = tcp_init_server(AF_INET, &port); if (fd_s < 0) { pr_err("Server initializations failed\n"); return 1; } test_daemon(); test_waitsig(); if (check_socket_state(fd_s, TCP_LISTEN)) { fail("Listen socket state is changed\n"); close(fd_s); return 1; } close(fd_s); pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-close1.desc000077700000000000000000000000001317335042600257462socket-tcp-close0.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-closed-last-ack.c000077700000000000000000000000001317335042600264222socket-tcp-closed.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-closed-last-ack.desc000066400000000000000000000012131317335042600234310ustar00rootroot00000000000000{ 'deps': [ '/bin/sh', '/sbin/iptables', '/usr/lib64/xtables/libxt_tcp.so|/lib/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so', '/usr/lib64/xtables/libxt_standard.so|/lib/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', 'feature' : 'tcp_half_closed', 'flavor': 'ns uns', } 
criu-3.6/test/zdtm/static/socket-tcp-closed-last-ack.hook000077700000000000000000000000001317335042600302042socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-closed.c000066400000000000000000000051201317335042600212410ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "Check closed tcp sockets\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include static int port = 8880; union sockaddr_inet { struct sockaddr addr; struct sockaddr_in v4; struct sockaddr_in6 v6; }; int main(int argc, char **argv) { int fd, fd_s, clt, sk; union sockaddr_inet src_addr, dst_addr, addr; socklen_t aux; char c = 5; #ifdef ZDTM_TCP_LAST_ACK char cmd[4096]; #endif test_init(argc, argv); sk = socket(ZDTM_FAMILY, SOCK_STREAM, 0); if (sk < 0) { pr_perror("socket"); return 1; } if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } clt = tcp_init_client(ZDTM_FAMILY, "localhost", port); if (clt < 0) return 1; /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); return 1; } close(fd_s); shutdown(clt, SHUT_WR); #ifdef ZDTM_TCP_LAST_ACK snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -A INPUT --dport %d -j DROP", port); if (system(cmd)) return -1; #endif shutdown(fd, SHUT_WR); if (ZDTM_FAMILY == AF_INET) aux = sizeof(struct sockaddr_in); else if (ZDTM_FAMILY == AF_INET6) aux = sizeof(struct sockaddr_in6); else return 1; if (getsockopt(clt, SOL_SOCKET, SO_PEERNAME, &dst_addr, &aux)) { pr_perror("SO_PEERNAME"); return 1; } if (getsockname(clt, &src_addr.addr, &aux)) { pr_perror("getsockname"); return 1; } test_daemon(); test_waitsig(); #ifdef ZDTM_TCP_LAST_ACK snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -D INPUT --dport %d -j 
DROP", port); if (system(cmd)) return -1; #endif if (read(fd, &c, 1) != 0) { fail("read"); return 1; } if (read(clt, &c, 1) != 0) { fail("read"); return 1; } if (write(clt, &c, 1) != -1) { fail("write"); return 1; } if (write(fd, &c, 1) != -1) { fail("write"); return 1; } if (getsockopt(clt, SOL_SOCKET, SO_PEERNAME, &addr, &aux)) { pr_perror("SO_PEERNAME"); return 1; } if (memcmp(&addr, &dst_addr, aux)) { pr_err("A destination address mismatch"); return 1; } if (getsockname(clt, &addr.addr, &aux)) { pr_perror("getsockname"); return 1; } if (memcmp(&addr, &src_addr, aux)) { pr_err("A source address mismatch"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-closed.desc000066400000000000000000000001541317335042600217370ustar00rootroot00000000000000{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed', 'flavor' : 'ns uns'} criu-3.6/test/zdtm/static/socket-tcp-closed.hook000077700000000000000000000000001317335042600265072socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-closing.c000066400000000000000000000075521317335042600214410ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "Check sockets in the TCP_CLOSING state\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include #include static int port = 8880; #define BUF_SIZE 4096 int fill_sock_buf(int fd) { int flags; int size; int ret; flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_err("Can't get flags"); return -1; } if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { pr_err("Can't set flags"); return -1; } size = 0; while (1) { char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) break; pr_err("write"); return -1; } size += ret; } if (fcntl(fd, F_SETFL, flags) == -1) { pr_err("Can't set flags"); 
return -1; } test_msg("snd_size = %d\n", size); return size; } static int clean_sk_buf(int fd) { int size, ret; char buf[BUF_SIZE]; size = 0; while (1) { ret = read(fd, buf, sizeof(buf)); if (ret == -1) { pr_err("read"); return -11; } if (ret == 0) break; size += ret; } test_msg("rcv_size = %d\n", size); return size; } int main(int argc, char **argv) { char *newns = getenv("ZDTM_NEWNS"); int fd, fd_s, ctl_fd; pid_t extpid; int pfd[2]; int ret, snd_size = 0, rcv_size = 0; if (newns) test_init(argc, argv); if (pipe(pfd)) { pr_err("pipe() failed"); return 1; } extpid = fork(); if (extpid < 0) { pr_err("fork() failed"); return 1; } else if (extpid == 0) { int size = 0; char c; if (!newns) test_ext_init(argc, argv); close(pfd[1]); if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { pr_err("Can't read port\n"); return 1; } fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (fd < 0) return 1; ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (ctl_fd < 0) return 1; size = fill_sock_buf(fd); if (size <= 0) return 1; if (shutdown(fd, SHUT_WR) == -1) { pr_err("shutdown"); return 1; } if (write(ctl_fd, &size, sizeof(size)) != sizeof(size)) { pr_err("write"); return 1; } if (read(ctl_fd, &c, 1) != 0) { pr_err("read"); return 1; } size = clean_sk_buf(fd); if (size < 0) return 1; write(ctl_fd, &size, sizeof(size)); close(fd); return 0; } if (!newns) test_init(argc, argv); if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed"); return 1; } close(pfd[0]); if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { pr_err("Can't send port"); return 1; } close(pfd[1]); /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection %m"); return 1; } ctl_fd = tcp_accept_server(fd_s); if (ctl_fd < 0) { pr_err("can't accept client connection %m"); return 1; } snd_size = fill_sock_buf(fd); if (snd_size <= 0) return 1; if (shutdown(fd, SHUT_WR) == -1) { 
pr_err("shutdown"); return 1; } if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { pr_err("read"); return 1; } rcv_size = clean_sk_buf(fd); if (ret != rcv_size) { fail("The child sent %d bytes, but the parent received %d bytes\n", ret, rcv_size); return 1; } sleep(1); test_daemon(); test_waitsig(); if (read(fd, &ret, sizeof(ret))) { pr_perror("read"); return 1; } if (shutdown(ctl_fd, SHUT_WR) == -1) { pr_err("shutdown"); return 1; } if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { pr_err("read"); return 1; } if (ret != snd_size) { fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-closing.desc000066400000000000000000000001271317335042600221240ustar00rootroot00000000000000{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} criu-3.6/test/zdtm/static/socket-tcp-closing.hook000077700000000000000000000000001317335042600266742socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-fin-wait1.c000066400000000000000000000073671317335042600216060ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "Check sockets in TCP_FIN_WAIT* states\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include #include static int port = 8880; #define TEST_MSG "Hello World!" 
#define BUF_SIZE 4096 int fill_sock_buf(int fd) { int flags; int size; int ret; flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_err("Can't get flags"); return -1; } if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { pr_err("Can't set flags"); return -1; } size = 0; while (1) { char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) break; pr_err("write"); return -1; } size += ret; } if (fcntl(fd, F_SETFL, flags) == -1) { pr_err("Can't set flags"); return -1; } test_msg("snd_size = %d\n", size); return size; } static int clean_sk_buf(int fd) { int size, ret; char buf[BUF_SIZE]; size = 0; while (1) { ret = read(fd, buf, sizeof(buf)); if (ret == -1) { pr_err("read"); return -11; } if (ret == 0) break; size += ret; } test_msg("rcv_size = %d\n", size); return size; } int main(int argc, char **argv) { char *newns = getenv("ZDTM_NEWNS"); int fd, fd_s, ctl_fd; pid_t extpid; int pfd[2]; int ret, snd_size = 0; char buf[BUF_SIZE]; if (newns) test_init(argc, argv); if (pipe(pfd)) { pr_err("pipe() failed"); return 1; } extpid = fork(); if (extpid < 0) { pr_err("fork() failed"); return 1; } else if (extpid == 0) { int size = 0; char c; if (!newns) test_ext_init(argc, argv); close(pfd[1]); if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { pr_err("Can't read port\n"); return 1; } fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (fd < 0) return 1; write(fd, TEST_MSG, 2); ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (ctl_fd < 0) return 1; if (read(ctl_fd, &c, 1) != 0) { pr_err("read"); return 1; } if (write(fd, TEST_MSG + 2, sizeof(TEST_MSG) - 2) != sizeof(TEST_MSG) - 2) { pr_err("write"); return 1; } if (shutdown(fd, SHUT_WR) == -1) { pr_err("shutdown"); return 1; } size = clean_sk_buf(fd); if (size < 0) return 1; write(ctl_fd, &size, sizeof(size)); close(fd); return 0; } if (!newns) test_init(argc, argv); if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server 
failed"); return 1; } close(pfd[0]); if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { pr_err("Can't send port"); return 1; } close(pfd[1]); /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection %m"); return 1; } ctl_fd = tcp_accept_server(fd_s); if (ctl_fd < 0) { pr_err("can't accept client connection %m"); return 1; } #if !defined(ZDTM_TCP_FIN_WAIT2) snd_size = fill_sock_buf(fd); if (snd_size <= 0) return 1; #endif if (shutdown(fd, SHUT_WR) == -1) { pr_err("shutdown"); return 1; } test_daemon(); test_waitsig(); if (shutdown(ctl_fd, SHUT_WR) == -1) { pr_err("shutdown"); return 1; } if (recv(fd, buf, sizeof(buf), MSG_WAITALL) != sizeof(TEST_MSG) || strncmp(buf, TEST_MSG, sizeof(TEST_MSG))) { pr_err("read"); return 1; } if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { pr_err("read"); return 1; } if (ret != snd_size) { fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-fin-wait1.desc000066400000000000000000000001271317335042600222650ustar00rootroot00000000000000{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} criu-3.6/test/zdtm/static/socket-tcp-fin-wait1.hook000077500000000000000000000036321317335042600223160ustar00rootroot00000000000000#!/usr/bin/env python2 import sys sys.path.append("../crit") import pycriu import os, os.path import json import difflib import subprocess if sys.argv[1] in ["--pre-dump", "--post-restore"]: pid = os.getenv("ZDTM_TEST_PID") try: subprocess.Popen(["nsenter", "-t", pid, "-n", "ss", "-t", "-a", "-n"]).wait() except OSError, e: pass if sys.argv[1] != "--post-restore": sys.exit(0) print "Check TCP images" def get_sockets(image_dir): fname = os.path.join(image_dir, "inetsk.img") if not os.access(fname, os.F_OK): return None f = open(fname) sockets = pycriu.images.load(f) sockets = sockets["entries"] for s in sockets: f 
= open(os.path.join(image_dir, "inetsk.img")) ids = pycriu.images.load(f) tcp_img = os.path.join(image_dir, "tcp-stream-%x.img" % int(s["ino"])) print tcp_img if os.access(tcp_img, os.F_OK): f = open(tcp_img) tcp = pycriu.images.load(f) s['tcp'] = tcp["entries"][0] s["tcp"].pop("extra", None) s["tcp"].pop("timestamp", None) s["tcp"].pop("snd_wl1", None) s["tcp"].pop("rcv_wnd", None) s["tcp"].pop("snd_wnd", None) s["tcp"].pop("max_window", None) s.pop("id", None) s.pop("ino") sockets.sort(lambda a, b: cmp(a["src_port"] + a["dst_port"], b["src_port"] + b["dst_port"])) return sockets path = os.getenv("ZDTM_IMG_DIR") prev = None exit_code = 0 for d in os.listdir(path): sockets = get_sockets(os.path.join(path, d)) if not prev: prev = sockets continue if prev == sockets: continue sockets_str = json.dumps(sockets, sys.stdout, indent=8, sort_keys=True) prev_str = json.dumps(prev, sys.stdout, indent=8, sort_keys=True) print "\n".join(difflib.unified_diff(prev_str.split("\n"), sockets_str.split("\n"))) sys.exit(exit_code) criu-3.6/test/zdtm/static/socket-tcp-fin-wait2.c000077700000000000000000000000001317335042600256022socket-tcp-fin-wait1.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-fin-wait2.desc000077700000000000000000000000001317335042600267722socket-tcp-fin-wait1.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-fin-wait2.hook000077700000000000000000000000001317335042600270362socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-last-ack.c000077700000000000000000000000001317335042600257512socket-tcp-close-wait.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-last-ack.desc000066400000000000000000000001461317335042600221660ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} 
criu-3.6/test/zdtm/static/socket-tcp-last-ack.hook000077700000000000000000000000001317335042600267352socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-local.c000077700000000000000000000000001317335042600232772socket-tcp.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-local.desc000066400000000000000000000001161317335042600215560ustar00rootroot00000000000000{'flavor': 'h ns uns', 'opts': '--tcp-established', 'flags': 'nouser samens'} criu-3.6/test/zdtm/static/socket-tcp-local.hook000077700000000000000000000000001317335042600263312socket-tcp-fin-wait2.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-nfconntrack.c000077700000000000000000000000001317335042600245132socket-tcp.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-nfconntrack.desc000066400000000000000000000000761317335042600227770ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} criu-3.6/test/zdtm/static/socket-tcp-reseted.c000066400000000000000000000035501317335042600214300ustar00rootroot00000000000000 #include "zdtmtst.h" #include #include #include /* for sockaddr_in and inet_ntoa() */ #include #include #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "Check, that a reseted TCP connection can be restored\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include static int port = 8880; int main(int argc, char **argv) { int fd, fd_s, clt; char cmd[4096], buf[10]; test_init(argc, argv); if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } clt = tcp_init_client(ZDTM_FAMILY, "localhost", port); if (clt < 0) { pr_perror("Unable to create a client socket"); return 1; } /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); return 1; 
} if (write(clt, "asd", 3) != 3) { pr_perror("Unable to write into a socket"); return 1; } snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -A INPUT --dport %d -j REJECT --reject-with tcp-reset", port); if (system(cmd)) return 1; if (write(fd, "asdas", 5) == -1) { pr_perror("Unable to write into a socket"); return 1; } snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -D INPUT --dport %d -j REJECT --reject-with tcp-reset", port); if (system(cmd)) return 1; test_daemon(); test_waitsig(); if (read(fd, buf, sizeof(buf)) != 3) { fail("Unable to read data from a socket"); return 1; } if (write(fd, buf, 3) != -1) { fail("Can write into a closed socket"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-reseted.desc000066400000000000000000000013361317335042600221240ustar00rootroot00000000000000{ 'deps': [ '/bin/sh', '/sbin/iptables', '/usr/lib64/xtables/libxt_tcp.so|/lib/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so', '/usr/lib64/xtables/libxt_standard.so|/lib/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so', '/usr/lib64/xtables/libipt_REJECT.so|/lib/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', 'feature' : 'tcp_half_closed' } criu-3.6/test/zdtm/static/socket-tcp-reseted.hook000077700000000000000000000000001317335042600266712socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-syn-sent.c000066400000000000000000000055431317335042600215610ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "Check unconnected tcp sockets\n"; const char *test_author = "Andrey Vagin #include 
#include #include #include #include #include #include #include #include static int port = 8880; union sockaddr_inet { struct sockaddr addr; struct sockaddr_in v4; struct sockaddr_in6 v6; }; int main(int argc, char **argv) { int fd, fd_s, sock, sk; union sockaddr_inet addr; char cmd[4096]; test_init(argc, argv); sk = socket(ZDTM_FAMILY, SOCK_STREAM, 0); if (sk < 0) { pr_perror("socket"); return 1; } if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } if ((sock = socket(ZDTM_FAMILY, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)) < 0) { pr_perror("can't create socket"); return -1; } /* Construct the server address structure */ memset(&addr, 0, sizeof(addr)); if (ZDTM_FAMILY == AF_INET) { addr.v4.sin_family = AF_INET; inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); } else { addr.v6.sin6_family = AF_INET6; inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); } if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) { pr_perror("can't connect to server"); return -1; } snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -A INPUT --dport %d -j DROP", port); if (system(cmd)) return -1; /* Construct the server address structure */ memset(&addr, 0, sizeof(addr)); if (ZDTM_FAMILY == AF_INET) { addr.v4.sin_family = AF_INET; addr.v4.sin_port = htons(port); inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); } else { addr.v6.sin6_family = AF_INET6; addr.v6.sin6_port = htons(port); inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); } errno = 0; if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) == 0 || errno != EINPROGRESS) { pr_perror("can't connect to server"); return -1; } test_daemon(); test_waitsig(); snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -D INPUT --dport %d -j DROP", port); if (system(cmd)) return -1; /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); return 1; } close(fd_s); 
fcntl(sock, F_SETFL, 0); char c = 5; if (write(sock, &c, 1) != 1) { fail("Unable to send data"); return 1; } c = 0; if (read(fd, &c, 1) != 1 || c != 5) { fail("Unable to recv data"); return 1; } c = 6; if (write(fd, &c, 1) != 1) { fail("Unable to send data"); return 1; } c = 0; if (read(sock, &c, 1) != 1 || c != 6) { fail("Unable to recv data"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-syn-sent.desc000066400000000000000000000010241317335042600222430ustar00rootroot00000000000000{ 'deps': [ '/bin/sh', '/sbin/iptables', '/usr/lib64/xtables/libxt_tcp.so|/lib/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so', '/usr/lib64/xtables/libxt_standard.so|/lib/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', 'feature' : 'tcp_half_closed' } criu-3.6/test/zdtm/static/socket-tcp-syn-sent.hook000077700000000000000000000000001317335042600270162socket-tcp-fin-wait1.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp-unconn.c000066400000000000000000000047751317335042600213070ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "Check unconnected tcp sockets\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include static int port = 8880; union sockaddr_inet { struct sockaddr addr; struct sockaddr_in v4; struct sockaddr_in6 v6; }; int main(int argc, char **argv) { int fd, fd_s, sock, sk; union sockaddr_inet addr, src_addr; socklen_t aux; test_init(argc, argv); sk = socket(ZDTM_FAMILY, SOCK_STREAM, 0); if (sk < 0) { pr_perror("socket"); return 1; } if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } if 
((sock = socket(ZDTM_FAMILY, SOCK_STREAM, IPPROTO_TCP)) < 0) { pr_perror("can't create socket"); return -1; } /* Construct the server address structure */ memset(&addr, 0, sizeof(addr)); if (ZDTM_FAMILY == AF_INET) { addr.v4.sin_family = AF_INET; inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); } else { addr.v6.sin6_family = AF_INET6; inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); } if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) { pr_perror("can't connect to server"); return -1; } aux = sizeof(src_addr); memset(&src_addr, 0, sizeof(src_addr)); if (getsockname(sock, &src_addr.addr, &aux)) { pr_perror("getsockname"); return 1; } test_daemon(); test_waitsig(); memset(&addr, 0, sizeof(addr)); if (getsockname(sock, &addr.addr, &aux)) { pr_perror("getsockname"); return 1; } if (memcmp(&addr, &src_addr, aux)) { pr_err("A source address mismatch"); return 1; } /* Construct the server address structure */ memset(&addr, 0, sizeof(addr)); if (ZDTM_FAMILY == AF_INET) { addr.v4.sin_family = AF_INET; addr.v4.sin_port = htons(port); inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); } else { addr.v6.sin6_family = AF_INET6; addr.v6.sin6_port = htons(port); inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); } if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) { pr_perror("can't connect to server"); return -1; } /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); return 1; } close(fd_s); pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp-unconn.desc000066400000000000000000000001271317335042600217660ustar00rootroot00000000000000{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} criu-3.6/test/zdtm/static/socket-tcp.c000066400000000000000000000073661317335042600200100ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = 
"Check, that a TCP connection can be restored\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include static int port = 8880; #define BUF_SIZE 4096 int read_data(int fd, unsigned char *buf, int size) { int cur = 0; int ret; while (cur != size) { ret = read(fd, buf + cur, size - cur); if (ret <= 0) return -1; cur += ret; } return 0; } int write_data(int fd, const unsigned char *buf, int size) { int cur = 0; int ret; while (cur != size) { ret = write(fd, buf + cur, size - cur); if (ret <= 0) return -1; cur += ret; } return 0; } int main(int argc, char **argv) { unsigned char buf[BUF_SIZE]; int fd, fd_s; pid_t extpid; uint32_t crc; int pfd[2]; int val; socklen_t optlen; #ifdef ZDTM_CONNTRACK unshare(CLONE_NEWNET); if (system("ip link set up dev lo")) return 1; if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) return 1; system("iptables -A INPUT -j DROP"); #endif #ifdef ZDTM_TCP_LOCAL test_init(argc, argv); #endif if (pipe(pfd)) { pr_perror("pipe() failed"); return 1; } extpid = fork(); if (extpid < 0) { pr_perror("fork() failed"); return 1; } else if (extpid == 0) { #ifndef ZDTM_TCP_LOCAL test_ext_init(argc, argv); #endif close(pfd[1]); if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { pr_perror("Can't read port"); return 1; } fd = tcp_init_client(ZDTM_FAMILY, "localhost", port); if (fd < 0) return 1; #ifdef STREAM while (1) { if (read_data(fd, buf, BUF_SIZE)) { pr_perror("read less then have to"); return 1; } if (datachk(buf, BUF_SIZE, &crc)) return 2; datagen(buf, BUF_SIZE, &crc); if (write_data(fd, buf, BUF_SIZE)) { pr_perror("can't write"); return 1; } } #else if (read_data(fd, buf, BUF_SIZE)) { pr_perror("read less then have to"); return 1; } if (datachk(buf, BUF_SIZE, &crc)) return 2; datagen(buf, BUF_SIZE, &crc); if (write_data(fd, buf, BUF_SIZE)) { pr_perror("can't write"); return 1; } #endif return 0; } #ifndef ZDTM_TCP_LOCAL test_init(argc, argv); 
#endif if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } close(pfd[0]); if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { pr_perror("Can't send port"); return 1; } close(pfd[1]); /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); return 1; } val = 1; if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val))) { pr_perror("setsockopt"); return 1; } test_daemon(); #ifdef STREAM while (test_go()) { datagen(buf, BUF_SIZE, &crc); if (write_data(fd, buf, BUF_SIZE)) { pr_perror("can't write"); return 1; } if (read_data(fd, buf, BUF_SIZE)) { pr_perror("read less then have to"); return 1; } if (datachk(buf, BUF_SIZE, &crc)) return 2; } kill(extpid, SIGKILL); #else test_waitsig(); datagen(buf, BUF_SIZE, &crc); if (write_data(fd, buf, BUF_SIZE)) { pr_perror("can't write"); return 1; } if (read_data(fd, buf, BUF_SIZE)) { pr_perror("read less then have to"); return 1; } if (datachk(buf, BUF_SIZE, &crc)) return 2; #endif optlen = sizeof(val); if (getsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, &optlen)) { pr_perror("getsockopt"); return 1; } if (val != 1) { fail("SO_REUSEADDR are not set for %d\n", fd); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcp.desc000066400000000000000000000001071317335042600204660ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} 
criu-3.6/test/zdtm/static/socket-tcp6-close-wait.c000077700000000000000000000000001317335042600264072socket-tcp-close-wait.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-close-wait.desc000077700000000000000000000000001317335042600275772socket-tcp-close-wait.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-closed.c000077700000000000000000000000001317335042600250132socket-tcp-closed.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-closed.desc000077700000000000000000000000001317335042600262032socket-tcp-closed.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-closing.c000077700000000000000000000000001317335042600253652socket-tcp-closing.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-closing.desc000077700000000000000000000000001317335042600265552socket-tcp-closing.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-closing.hook000077700000000000000000000000001317335042600266212socket-tcp-closing.hookustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-fin-wait1.c000077700000000000000000000000001317335042600256672socket-tcp-fin-wait1.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-fin-wait1.desc000077700000000000000000000000001317335042600270572socket-tcp-fin-wait1.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-fin-wait2.c000077700000000000000000000000001317335042600256712socket-tcp-fin-wait2.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-fin-wait2.desc000077700000000000000000000000001317335042600270602socket-tcp-fin-wait1.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-last-ack.c000077700000000000000000000000001317335042600254672socket-tcp-last-ack.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-last-ack.desc000077700000000000000000000000001317335042600266572socket-tcp-last-ack.descustar00rootroot00000000000000criu-3.6/test/zdtm/
static/socket-tcp6-local.c000077700000000000000000000000001317335042600233652socket-tcp.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-local.desc000077700000000000000000000000001317335042600256452socket-tcp-local.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-unconn.c000077700000000000000000000000001317335042600250712socket-tcp-unconn.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6-unconn.desc000077700000000000000000000000001317335042600262612socket-tcp-unconn.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6.c000077700000000000000000000000001317335042600222752socket-tcp.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcp6.desc000066400000000000000000000001071317335042600205540ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} criu-3.6/test/zdtm/static/socket-tcpbuf-local.c000077700000000000000000000000001317335042600244712socket-tcpbuf.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcpbuf-local.desc000066400000000000000000000001161317335042600222530ustar00rootroot00000000000000{'flavor': 'h ns uns', 'opts': '--tcp-established', 'flags': 'nouser samens'} criu-3.6/test/zdtm/static/socket-tcpbuf.c000066400000000000000000000132301317335042600204700ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "Check full tcp buffers with custom sizes\n"; const char *test_author = "Andrey Vagin #include #include #include #include #include #include #include #include #include static int port = 8880; #define BUF_SIZE 4096 #define TCP_MAX_BUF (100 << 20) static void read_safe(int fd, void *buf, size_t size) { if (read(fd, buf, size) != size) { pr_perror("Unable to read from %d", fd); exit(1); } } static void write_safe(int fd, void *buf, size_t size) { if (write(fd, buf, size) != size) { pr_perror("Unable to write 
to %d", fd); exit(1); } } static int fill_sock_buf(int fd) { int flags; int size; int ret; flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); return -1; } if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { pr_perror("Can't set flags"); return -1; } size = 0; while (1) { char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) break; pr_perror("write"); return -1; } size += ret; } if (fcntl(fd, F_SETFL, flags) == -1) { pr_perror("Can't set flags"); return -1; } return size; } static int clean_sk_buf(int fd, int limit) { int size, ret; char buf[BUF_SIZE]; size = 0; while (1) { ret = read(fd, buf, sizeof(buf)); if (ret == -1) { pr_perror("read"); return -11; } if (ret == 0) break; size += ret; if (limit && size >= limit) break; } return size; } int main(int argc, char **argv) { int fd, fd_s, ctl_fd; pid_t extpid; int pfd[2]; int sk_bsize; int ret, snd, snd_size, rcv_size = 0, rcv_buf_size; #ifdef ZDTM_TCP_LOCAL test_init(argc, argv); #endif if (pipe(pfd)) { pr_perror("pipe() failed"); return 1; } extpid = fork(); if (extpid < 0) { pr_perror("fork() failed"); return 1; } else if (extpid == 0) { int size; char c; #ifndef ZDTM_TCP_LOCAL test_ext_init(argc, argv); #endif close(pfd[1]); read_safe(pfd[0], &port, sizeof(port)); fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (fd < 0) return 1; ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); if (fd < 0) return 1; snd_size = fill_sock_buf(fd); if (snd_size <= 0) return 1; write_safe(ctl_fd, &snd_size, sizeof(snd_size)); read_safe(ctl_fd, &rcv_buf_size, sizeof(rcv_buf_size)); while (1) { /* heart beat */ read_safe(ctl_fd, &ret, sizeof(ret)); if (ret < 0) break; rcv_buf_size += ret; snd = fill_sock_buf(fd); if (snd < 0) return -1; snd_size += snd; if (rcv_buf_size / 2) { ret = clean_sk_buf(fd, rcv_buf_size / 2); if (ret <= 0) return 1; } else ret = 0; rcv_buf_size -= ret; rcv_size += ret; write_safe(ctl_fd, &snd, sizeof(snd)); } 
read_safe(ctl_fd, &ret, sizeof(ret)); rcv_buf_size += ret; write_safe(ctl_fd, &snd_size, sizeof(snd_size)); if (read(ctl_fd, &c, 1) != 0) { pr_perror("read"); return 1; } if (shutdown(fd, SHUT_WR) == -1) { pr_perror("shutdown"); return 1; } size = clean_sk_buf(fd, 0); if (size < 0) return 1; if (size != rcv_buf_size) { fail("the received buffer contains only %d bytes (%d)\n", size, rcv_buf_size); } rcv_size += size; write_safe(ctl_fd, &rcv_size, sizeof(rcv_size)); close(fd); return 0; } #ifndef ZDTM_TCP_LOCAL test_init(argc, argv); #endif if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } close(pfd[0]); write_safe(pfd[1], &port, sizeof(port)); close(pfd[1]); /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); return 1; } ctl_fd = tcp_accept_server(fd_s); if (ctl_fd < 0) { pr_err("can't accept client connection\n"); return 1; } sk_bsize = TCP_MAX_BUF; if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sk_bsize, sizeof(sk_bsize)) == -1) { pr_perror("Can't set snd buf"); return 1; } sk_bsize = TCP_MAX_BUF; if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sk_bsize, sizeof(sk_bsize)) == -1) { pr_perror("Can't set snd buf"); return 1; } snd_size = fill_sock_buf(fd); if (snd_size <= 0) return 1; read_safe(ctl_fd, &rcv_buf_size, sizeof(rcv_buf_size)); write_safe(ctl_fd, &snd_size, sizeof(snd_size)); test_daemon(); snd = 0; while (test_go()) { /* heart beat */ if (rcv_buf_size / 2) { ret = clean_sk_buf(fd, rcv_buf_size / 2); if (ret <= 0) return 1; } else ret = 0; rcv_size += ret; rcv_buf_size -= ret; write_safe(ctl_fd, &snd, sizeof(snd)); read_safe(ctl_fd, &ret, sizeof(ret)); rcv_buf_size += ret; snd = fill_sock_buf(fd); if (snd < 0) return -1; snd_size += snd; } ret = -1; write_safe(ctl_fd, &ret, sizeof(ret)); write_safe(ctl_fd, &snd, sizeof(ret)); read_safe(ctl_fd, &snd, sizeof(snd)); if (shutdown(ctl_fd, SHUT_WR) == -1) { pr_perror("shutdown"); 
return 1; } if (shutdown(fd, SHUT_WR) == -1) { pr_perror("shutdown"); return 1; } ret = clean_sk_buf(fd, 0); if (ret != rcv_buf_size) { fail("the received buffer contains only %d bytes (%d)\n", ret, rcv_buf_size); } rcv_size += ret; if (snd != rcv_size) { fail("The child sent %d bytes, but the parent received %d bytes\n", rcv_buf_size, rcv_size); return 1; } read_safe(ctl_fd, &ret, sizeof(ret)); if (ret != snd_size) { fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket-tcpbuf.desc000066400000000000000000000001071317335042600211630ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} criu-3.6/test/zdtm/static/socket-tcpbuf6-local.c000077700000000000000000000000001317335042600245572socket-tcpbuf.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcpbuf6-local.desc000077700000000000000000000000001317335042600270372socket-tcpbuf-local.descustar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcpbuf6.c000077700000000000000000000000001317335042600234672socket-tcpbuf.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket-tcpbuf6.desc000066400000000000000000000001071317335042600212510ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} criu-3.6/test/zdtm/static/socket6_udp.c000066400000000000000000000047331317335042600201550ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "Static test for IP6/UDP socket\n"; const char *test_author = "Cyrill Gorcunov \n"; #include #include #include #include #include #include #include #include #include #include /* for sockaddr_in and inet_ntoa() */ #include static int port = 8880; static char buf[64]; #define MSG1 "msg1" #define MSG2 "msg_2" int main(int argc, char **argv) { int ret, sk1, sk2; socklen_t len = sizeof(struct sockaddr_in6); struct sockaddr_in6 addr1, addr2, addr; test_init(argc, argv); sk1 
= socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (sk1 < 0) { pr_perror("Can't create socket"); return 1; } memset(&addr1, 0, sizeof(addr1)); addr1.sin6_family = AF_INET6; addr1.sin6_port = htons(port); inet_pton(AF_INET6, "::1", &addr1.sin6_addr); ret = bind(sk1, (struct sockaddr *)&addr1, len); if (ret < 0) { pr_perror("Can't bind socket"); return 1; } sk2 = socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (sk2 < 0) { pr_perror("Can't create socket"); return 1; } memset(&addr2, 0, sizeof(addr2)); addr2.sin6_family = AF_INET6; addr2.sin6_port = htons(port+1); inet_pton(AF_INET6, "::1", &addr2.sin6_addr); ret = bind(sk2, (struct sockaddr *)&addr2, len); if (ret < 0) { pr_perror("Can't bind socket"); return 1; } ret = connect(sk2, (struct sockaddr *)&addr1, len); if (ret < 0) { pr_perror("Can't connect"); return 1; } test_daemon(); test_waitsig(); ret = sendto(sk1, MSG1, sizeof(MSG1), 0, (struct sockaddr *)&addr2, len); if (ret < 0) { fail("Can't send"); return 1; } ret = send(sk2, MSG2, sizeof(MSG2), 0); if (ret < 0) { fail("Can't send C"); return 1; } ret = recvfrom(sk1, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { fail("Can't recv C"); return 1; } if (len != sizeof(struct sockaddr_in6) || memcmp(&addr2, &addr, len)) { fail("Wrong peer C"); return 1; } if (ret != sizeof(MSG2) || memcmp(buf, MSG2, ret)) { fail("Wrong message C"); return 1; } ret = recvfrom(sk2, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { fail("Can't recv"); return 1; } if (len != sizeof(struct sockaddr_in6) || memcmp(&addr1, &addr, len)) { fail("Wrong peer"); return 1; } if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { fail("Wrong message"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket_aio.c000066400000000000000000000051271317335042600200450ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "static test for AIO\n"; const char *test_author = "Andrew Vagin "; /* Description: * Create two tcp socket, server 
send asynchronous request on * read data and clietn write data after migration */ #include #include #include #include #include #include #include #include #include #include static int port = 8880; #define BUF_SIZE 1024 int main(int argc, char **argv) { char buf[BUF_SIZE]; int fd, fd_s; struct aiocb aiocb; int status; pid_t pid; int ret, res; const struct aiocb *aioary[1]; task_waiter_t child_waiter; test_init(argc, argv); task_waiter_init(&child_waiter); if ((fd_s = tcp_init_server(AF_INET, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } pid = test_fork(); if (pid < 0) { pr_perror("fork failed"); return 1; } if (pid == 0) { /* * Chiled is client of TCP connection */ close(fd_s); fd = tcp_init_client(AF_INET, "127.0.0.1", port); if (fd < 0) return 1; memset(&aiocb, 0, sizeof(struct aiocb)); aiocb.aio_fildes = fd; aiocb.aio_buf = buf; aiocb.aio_nbytes = BUF_SIZE; ret = aio_read(&aiocb); if (ret < 0) { pr_perror("aio_read failed"); return 1; } task_waiter_complete_current(&child_waiter); /* Wait for request completion */ aioary[0] = &aiocb; ret = aio_error(&aiocb); #ifdef DEBUG test_msg("."); #endif res = 0; again: if (aio_suspend(aioary, 1, NULL) < 0 && errno != EINTR) { pr_perror("aio_suspend failed"); res = 1; } ret = aio_error(&aiocb); if (!res && ret == EINPROGRESS) { #ifdef DEBUG test_msg("restart aio_suspend\n"); #endif goto again; } if (ret != 0) { pr_err("Error at aio_error(): %s\n", strerror(ret)); res = 1; } if (aio_return(&aiocb) != BUF_SIZE) { pr_perror("Error at aio_return()"); res = 1; } close(fd); return res; } /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); close(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); goto error; } task_waiter_wait4(&child_waiter, pid); test_daemon(); test_waitsig(); if (write(fd, buf, BUF_SIZE) < BUF_SIZE) { pr_perror("can't write"); goto error; } close(fd); if (wait(&status) < 0) { pr_perror("wait failed"); goto error; } if (WIFEXITED(status) && WEXITSTATUS(status) 
!= 0) { pr_err("child failed with exit code %d\n", WEXITSTATUS(status)); return 1; } pass(); return 0; error: kill(pid, SIGKILL); wait(&status); return -1; } criu-3.6/test/zdtm/static/socket_aio.desc000066400000000000000000000000621317335042600205320ustar00rootroot00000000000000{'opts': '--tcp-established', 'flags' : 'nouser'} criu-3.6/test/zdtm/static/socket_close_data.c000066400000000000000000000014211317335042600213640ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check one end of socketpair with data"; const char *test_author = "Andrew Vagin 0 ? ret : 0] = 0; if (ret != sizeof(MSG)) { fail("%d: %s", ret, buf); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket_close_data01.c000066400000000000000000000041111317335042600215240ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check data of bound socket and possibility to connect"; const char *test_author = "Kirill Tkhai 0 ? ret : 0] = 0; if (ret != sizeof(MSG)) { fail("%d: %s", ret, buf); ret = 1; goto unlink; } /* Test2: check it's still possible to connect to the bound socket */ if (fork() == 0) { exit(client("(iter2)")); } if (wait(&status) < 0) { fail("wait failed"); goto unlink; } if (WEXITSTATUS(status) != 0) { fail("connect failed"); goto unlink; } ret = 0; pass(); unlink: unlink(filename); return ret; } criu-3.6/test/zdtm/static/socket_dgram_data.c000066400000000000000000000031661317335042600213610ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that data in dgram socket are restored correctly"; const char *test_author = "Andrew Vagin 0 ? 
ret : 0] = 0; if (ret != sizeof(MSG)) { fail("%d: %s", ret, buf); return 1; } ret = read(srv, buf, sizeof(buf)); if (ret != -1 || errno != EAGAIN) { fail("unexpected data: %d", ret); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket_listen.c000066400000000000000000000042001317335042600205620ustar00rootroot00000000000000#include "zdtmtst.h" #ifdef ZDTM_IPV6 #define ZDTM_FAMILY AF_INET6 #else #define ZDTM_FAMILY AF_INET #endif const char *test_doc = "static test for listening socket\n"; const char *test_author = "Stanislav Kinsbursky "; /* Description: * Create two tcp socket, server send asynchronous request on * read data and clietn write data after migration */ #include #include #include #include #include #include #include static int port = 8880; #define BUF_SIZE 1024 static void sig_hand(int signo) {} int main(int argc, char **argv) { unsigned char buf[BUF_SIZE]; int fd, fd_s; int status; pid_t pid; int res; uint32_t crc; struct sigaction sa = { .sa_handler = sig_hand, /* don't set SA_RESTART */ }; test_init(argc, argv); if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { pr_err("initializing server failed\n"); return 1; } test_daemon(); test_waitsig(); sigemptyset(&sa.sa_mask); if (sigaction(SIGCHLD, &sa, NULL)) pr_perror("Can't set SIGCHLD handler"); pid = test_fork(); if (pid < 0) { pr_perror("fork failed"); return 1; } if (pid == 0) { /* * Chiled is client of TCP connection */ close(fd_s); fd = tcp_init_client(ZDTM_FAMILY, "localhost", port); if (fd < 0) return 1; res = read(fd, buf, BUF_SIZE); close(fd); if (res != BUF_SIZE) { pr_perror("read less then have to: %d instead of %d", res, BUF_SIZE); return -1; } if (datachk(buf, BUF_SIZE, &crc)) return -2; return 0; } /* * parent is server of TCP connection */ fd = tcp_accept_server(fd_s); close(fd_s); if (fd < 0) { pr_err("can't accept client connection\n"); goto error; } datagen(buf, BUF_SIZE, &crc); if (write(fd, buf, BUF_SIZE) < BUF_SIZE) { pr_perror("can't write"); goto error; } 
close(fd); if (wait(&status) < 0) { pr_perror("wait failed"); goto error; } if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { pr_err("child failed with exit code %d\n", WEXITSTATUS(status)); return 1; } pass(); return 0; error: kill(pid, SIGKILL); wait(&status); return -1; } criu-3.6/test/zdtm/static/socket_listen6.c000077700000000000000000000000001317335042600236612socket_listen.custar00rootroot00000000000000criu-3.6/test/zdtm/static/socket_queues.c000066400000000000000000000054221317335042600206020ustar00rootroot00000000000000 #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" /* FIXME Need gram sockets tests */ const char *test_doc = "Test unix sockets queues (2 messages in queue)\n"; const char *test_author = "Stanislav Kinsbursky \n"; #define SK_DATA_S1 "packet stream left" #define SK_DATA_S2 "packet stream right" #define SK_DATA_D1 "packet dgram left" #define SK_DATA_D2 "packet dgram right" int main(int argc, char *argv[]) { int ssk_pair_d[2]; int ssk_pair_s[2]; char buf_left[64], buf_right[64]; test_init(argc, argv); if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair_s) == -1) { fail("socketpair\n"); exit(1); } write(ssk_pair_s[0], SK_DATA_S1, sizeof(SK_DATA_S1)); write(ssk_pair_s[0], SK_DATA_S2, sizeof(SK_DATA_S2)); write(ssk_pair_s[1], SK_DATA_S2, sizeof(SK_DATA_S2)); write(ssk_pair_s[1], SK_DATA_S1, sizeof(SK_DATA_S1)); if (socketpair(AF_UNIX, SOCK_DGRAM, 0, ssk_pair_d) == -1) { fail("socketpair\n"); exit(1); } write(ssk_pair_d[0], SK_DATA_D1, sizeof(SK_DATA_D1)); write(ssk_pair_d[0], SK_DATA_D2, sizeof(SK_DATA_D2)); write(ssk_pair_d[1], SK_DATA_D2, sizeof(SK_DATA_D2)); write(ssk_pair_d[1], SK_DATA_D1, sizeof(SK_DATA_D1)); test_daemon(); test_waitsig(); read(ssk_pair_s[1], buf_left, strlen(SK_DATA_S1) + 1); if (strcmp(buf_left, SK_DATA_S1)) { fail("SK_DATA_S2: '%s\n", SK_DATA_S1); exit(1); } read(ssk_pair_s[1], buf_right, strlen(SK_DATA_S2) + 1); if (strcmp(buf_right, 
SK_DATA_S2)) { fail("data corrupted\n"); exit(1); } test_msg("stream1 : '%s' '%s'\n", buf_left, buf_right); read(ssk_pair_s[0], buf_left, strlen(SK_DATA_S2) + 1); if (strcmp(buf_left, SK_DATA_S2)) { fail("data corrupted\n"); exit(1); } read(ssk_pair_s[0], buf_right, strlen(SK_DATA_S1) + 1); if (strcmp(buf_right, SK_DATA_S1)) { fail("data corrupted\n"); exit(1); } test_msg("stream2 : '%s' '%s'\n", buf_left, buf_right); read(ssk_pair_d[1], buf_left, strlen(SK_DATA_D1) + 1); if (strcmp(buf_left, SK_DATA_D1)) { fail("data corrupted\n"); exit(1); } read(ssk_pair_d[1], buf_right, strlen(SK_DATA_D2) + 1); if (strcmp(buf_right, SK_DATA_D2)) { fail("data corrupted\n"); exit(1); } test_msg("dgram1 : '%s' '%s'\n", buf_left, buf_right); read(ssk_pair_d[0], buf_left, strlen(SK_DATA_D2) + 1); if (strcmp(buf_left, SK_DATA_D2)) { fail("data corrupted\n"); exit(1); } read(ssk_pair_d[0], buf_right,strlen(SK_DATA_D1) + 1); if (strcmp(buf_right, SK_DATA_D1)) { fail("data corrupted\n"); exit(1); } test_msg("dgram2 : '%s' '%s'\n", buf_left, buf_right); pass(); return 0; } criu-3.6/test/zdtm/static/socket_snd_addr.c000066400000000000000000000042551317335042600210540ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that sender addresses are restored"; const char *test_author = "Andrew Vagin 0 ? 
ret : 0] = 0; if (ret != sizeof(MSG)) { fail("%d: %s", ret, buf); return 1; } if (hdr.msg_namelen > sizeof(addr.sun_family) + 1) pr_perror("%d, %s", hdr.msg_namelen, addr.sun_path + 1); if (memcmp(addr.sun_path, sk_names[i], sizeof(SK_NAME))) { fail("A sender address is mismatch"); return 1; } } pass(); return 0; } criu-3.6/test/zdtm/static/socket_snd_addr.desc000066400000000000000000000000241317335042600215360ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/socket_udp.c000066400000000000000000000051271317335042600200650ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "static test for UDP socket\n"; const char *test_author = "Pavel Emelyanov \n"; /* Description: * Create two tcp socket, server send asynchronous request on * read data and clietn write data after migration */ #include #include #include #include #include #include #include #include #include #include /* for sockaddr_in and inet_ntoa() */ #include static int port = 8880; static char buf[8]; #define MSG1 "msg1" #define MSG2 "msg_2" int main(int argc, char **argv) { int ret, sk1, sk2; socklen_t len = sizeof(struct sockaddr_in); struct sockaddr_in addr1, addr2, addr; test_init(argc, argv); sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); if (sk1 < 0) { pr_perror("Can't create socket"); return 1; } memset(&addr1, 0, sizeof(addr1)); addr1.sin_family = AF_INET; addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); addr1.sin_port = htons(port); ret = bind(sk1, (struct sockaddr *)&addr1, len); if (ret < 0) { pr_perror("Can't bind socket"); return 1; } sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); if (sk2 < 0) { pr_perror("Can't create socket"); return 1; } memset(&addr2, 0, sizeof(addr1)); addr2.sin_family = AF_INET; addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); addr2.sin_port = htons(port + 1); ret = bind(sk2, (struct sockaddr *)&addr2, len); if (ret < 0) { pr_perror("Can't bind socket"); return 1; } ret = connect(sk2, (struct sockaddr *)&addr1, len); if (ret < 0) 
{ pr_perror("Can't connect"); return 1; } test_daemon(); test_waitsig(); ret = sendto(sk1, MSG1, sizeof(MSG1), 0, (struct sockaddr *)&addr2, len); if (ret < 0) { fail("Can't send"); return 1; } ret = send(sk2, MSG2, sizeof(MSG2), 0); if (ret < 0) { fail("Can't send C"); return 1; } ret = recvfrom(sk1, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { fail("Can't recv C"); return 1; } if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { fail("Wrong peer C"); return 1; } if (ret != sizeof(MSG2) || memcmp(buf, MSG2, ret)) { fail("Wrong message C"); return 1; } ret = recvfrom(sk2, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { fail("Can't recv"); return 1; } if (len != sizeof(struct sockaddr_in) || memcmp(&addr1, &addr, len)) { fail("Wrong peer"); return 1; } if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { fail("Wrong message"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket_udp_shutdown.c000066400000000000000000000054051317335042600220170ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include /* for sockaddr_in and inet_ntoa() */ #include #include "zdtmtst.h" const char *test_doc = "static test for UDP shutdown'ed socket"; const char *test_author = "Cyrill Gorcunov "; static int port = 8881; #define MSG1 "msg1" int main(int argc, char **argv) { socklen_t len = sizeof(struct sockaddr_in); struct sockaddr_in addr1, addr2, addr; int ret, sk1, sk2; char buf[512]; test_init(argc, argv); sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); if (sk1 < 0 || sk2 < 0) { pr_err("Can't create socket"); exit(1); return 1; } memset(&addr1, 0, sizeof(addr1)); memset(&addr2, 0, sizeof(addr1)); addr1.sin_family = AF_INET; addr1.sin_addr.s_addr = inet_addr("127.0.0.10"); addr1.sin_port = htons(port); addr2.sin_family = AF_INET; addr2.sin_addr.s_addr = inet_addr("127.0.0.10"); addr2.sin_port = 
htons(port + 1); if (bind(sk1, (struct sockaddr *)&addr1, len) < 0 || bind(sk2, (struct sockaddr *)&addr2, len) < 0) { pr_err("Can't bind socket"); return 1; } if (connect(sk1, (struct sockaddr *)&addr2, len) || connect(sk2, (struct sockaddr *)&addr1, len)) { pr_err("Can't connect"); return 1; } if (shutdown(sk1, SHUT_WR) || shutdown(sk2, SHUT_RD)) { pr_err("Can't shutdown\n"); return 1; } ret = sendto(sk2, MSG1, sizeof(MSG1), 0, (struct sockaddr *)&addr1, len); if (ret < 0) { pr_perror("Can't send"); return 1; } ret = recvfrom(sk1, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { pr_err("Can't receive data"); return 1; } if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { pr_err("Data received from wrong peer"); return 1; } if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { pr_err("Wrong message received"); return 1; } test_daemon(); test_waitsig(); ret = sendto(sk2, MSG1, sizeof(MSG1), 0, (struct sockaddr *)&addr1, len); if (ret < 0) { pr_perror("Can't send"); return 1; } ret = recvfrom(sk1, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { pr_err("Can't receive data"); return 1; } if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { pr_err("Data received from wrong peer"); return 1; } if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { pr_err("Wrong message received"); return 1; } ret = sendto(sk1, MSG1, sizeof(MSG1), 0, (struct sockaddr *)&addr2, len); if (ret >= 0) { fail("Sent to write-shutdown'ed socket"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/socket_udplite.c000066400000000000000000000051371317335042600207440ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "static test for UDP socket\n"; const char *test_author = "Pavel Emelyanov \n"; /* Description: * Create two tcp socket, server send asynchronous request on * read data and clietn write data after migration */ #include #include #include #include #include #include #include #include 
#include #include /* for sockaddr_in and inet_ntoa() */ #include static int port = 8890; static char buf[8]; #define MSG1 "msg1" #define MSG2 "msg_2" int main(int argc, char **argv) { int ret, sk1, sk2; socklen_t len = sizeof(struct sockaddr_in); struct sockaddr_in addr1, addr2, addr; test_init(argc, argv); sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); if (sk1 < 0) { pr_perror("Can't create socket"); return 1; } memset(&addr1, 0, sizeof(addr1)); addr1.sin_family = AF_INET; addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); addr1.sin_port = htons(port); ret = bind(sk1, (struct sockaddr *)&addr1, len); if (ret < 0) { pr_perror("Can't bind socket"); return 1; } sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); if (sk2 < 0) { pr_perror("Can't create socket"); return 1; } memset(&addr2, 0, sizeof(addr1)); addr2.sin_family = AF_INET; addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); addr2.sin_port = htons(port + 1); ret = bind(sk2, (struct sockaddr *)&addr2, len); if (ret < 0) { pr_perror("Can't bind socket"); return 1; } ret = connect(sk2, (struct sockaddr *)&addr1, len); if (ret < 0) { pr_perror("Can't connect"); return 1; } test_daemon(); test_waitsig(); ret = sendto(sk1, MSG1, sizeof(MSG1), 0, (struct sockaddr *)&addr2, len); if (ret < 0) { fail("Can't send"); return 1; } ret = send(sk2, MSG2, sizeof(MSG2), 0); if (ret < 0) { fail("Can't send C"); return 1; } ret = recvfrom(sk1, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { fail("Can't recv C"); return 1; } if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { fail("Wrong peer C"); return 1; } if (ret != sizeof(MSG2) || memcmp(buf, MSG2, ret)) { fail("Wrong message C"); return 1; } ret = recvfrom(sk2, buf, sizeof(buf), 0, (struct sockaddr *)&addr, &len); if (ret <= 0) { fail("Can't recv"); return 1; } if (len != sizeof(struct sockaddr_in) || memcmp(&addr1, &addr, len)) { fail("Wrong peer"); return 1; } if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { fail("Wrong 
message"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/sockets00.c000066400000000000000000000057521317335042600175440ustar00rootroot00000000000000 #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test unix stream sockets\n"; const char *test_author = "Cyrill Gorcunov #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test unix sockets shutdown"; const char *test_author = "Pavel Emelyanov "; #define fin(msg) do { pr_perror(msg); exit(1); } while (0) #define ffin(msg) do { fail(msg); exit(1); } while (0) #define TEST_MSG "test-message" static char buf[sizeof(TEST_MSG)]; int main(int argc, char *argv[]) { int spu[2], spb[2], dpu[2], dpb[2], dpd[2]; int ret; test_init(argc, argv); signal(SIGPIPE, SIG_IGN); /* spu -- stream pair, unidirectional shutdown */ if (socketpair(PF_UNIX, SOCK_STREAM, 0, spu) < 0) fin("no stream pair 1"); if (shutdown(spu[0], SHUT_RD) < 0) fin("no stream shutdown 1"); /* spb -- stream pair, bidirectional shutdown */ if (socketpair(PF_UNIX, SOCK_STREAM, 0, spb) < 0) fin("no stream pair 2"); if (shutdown(spb[0], SHUT_RDWR) < 0) fin("no stream shutdown 2"); /* dpu -- dgram pair, one end read shutdown */ if (socketpair(PF_UNIX, SOCK_DGRAM, 0, dpu) < 0) fin("no dgram pair 1"); if (shutdown(dpu[0], SHUT_RD) < 0) fin("no dgram shutdown 1"); /* dpb -- dgram pair, one end read-write shutdown */ if (socketpair(PF_UNIX, SOCK_DGRAM, 0, dpb) < 0) fin("no dgram pair 2"); if (shutdown(dpb[0], SHUT_RDWR) < 0) fin("no dgram shutdown 2"); /* dpd -- dgram pair, one end write shutdown with data */ if (socketpair(PF_UNIX, SOCK_DGRAM, 0, dpd) < 0) fin("no dgram pair 3"); if (write(dpd[0], TEST_MSG, sizeof(TEST_MSG)) < 0) fin("no dgram write"); if (shutdown(dpd[0], SHUT_WR) < 0) fin("no dgram shutdown 3"); test_daemon(); test_waitsig(); /* * spu -- 
check that one direction is blocked and * the other one is not */ ret = write(spu[0], TEST_MSG, sizeof(TEST_MSG)); if (ret < 0) ffin("SU shutdown broken 1"); ret = read(spu[1], buf, sizeof(buf)); if (ret < 0) ffin("SU shutdown broken 2"); ret = write(spu[1], TEST_MSG, sizeof(TEST_MSG)); if (ret >= 0) ffin("SU shutdown broken 3"); /* * spb -- check that both ends are off */ ret = write(spb[0], TEST_MSG, sizeof(TEST_MSG)); if (ret >= 0) ffin("SB shutdown broken 1"); ret = write(spb[1], TEST_MSG, sizeof(TEST_MSG)); if (ret >= 0) ffin("SB shutdown broken 2"); /* * dpu -- check that one direction works, and * the other does not */ ret = write(dpu[0], TEST_MSG, sizeof(TEST_MSG)); if (ret < 0) ffin("DU shutdown broken 1"); ret = read(dpu[1], buf, sizeof(buf)); if (ret < 0) ffin("DU shutdown broken 2"); ret = write(dpu[1], TEST_MSG, sizeof(TEST_MSG)); if (ret >= 0) ffin("DU shutdown broken 3"); /* * dpb -- check that both ends are read */ ret = write(dpb[0], TEST_MSG, sizeof(TEST_MSG)); if (ret >= 0) ffin("DB shutdown broken 1"); ret = write(dpb[1], TEST_MSG, sizeof(TEST_MSG)); if (ret >= 0) ffin("DB shutdown broken 2"); /* * dpd -- check that data is in there, but can't * feed more */ ret = read(dpd[1], buf, sizeof(buf)); if (ret < 0) ffin("DD shutdown nodata"); ret = write(dpd[0], TEST_MSG, sizeof(buf)); if (ret >= 0) ffin("DB shutdown broken"); pass(); return 0; } criu-3.6/test/zdtm/static/sockets02.c000066400000000000000000000024151317335042600175370ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test semi-closed unix stream connection\n"; const char *test_author = "Pavel Emelyanov \n"; int main(int argc, char *argv[]) { int ssk_pair[2], ret; char aux, data; test_init(argc, argv); data = (char)lrand48(); if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair) == -1) { fail("socketpair\n"); exit(1); } if (write(ssk_pair[1], &data, sizeof(data)) 
!= sizeof(data)) { fail("write\n"); exit(1); } close(ssk_pair[1]); test_daemon(); test_waitsig(); ret = read(ssk_pair[0], &aux, sizeof(aux)); if (ret != sizeof(data) && aux != data) { fail("Data loss (write %d, read %d)", data, aux); return 1; } errno = 0; ret = read(ssk_pair[0], &aux, sizeof(aux)); if (ret != 0 || errno != 0) { fail("Opened end in wrong state (%d/%d)", ret, errno); return 0; } errno = 0; ret = read(ssk_pair[1], &aux, sizeof(aux)); if (ret != -1 || errno != EBADF) { fail("Closed end in wrong state (%d/%d)", ret, errno); return 0; } pass(); return 0; } criu-3.6/test/zdtm/static/sockets03.c000066400000000000000000000041031317335042600175340ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test unix stream sockets with mismatch in shutdown state\n"; const char *test_author = "Andrey Ryabinin "; #define SK_DATA "packet" char *filename; TEST_OPTION(filename, string, "socket file name", 1); int main(int argc, char *argv[]) { int sk[3]; struct sockaddr_un addr; unsigned int addrlen; char path[PATH_MAX]; char buf[64]; char *cwd; int ret; test_init(argc, argv); signal(SIGPIPE, SIG_IGN); cwd = get_current_dir_name(); if (!cwd) { fail("getcwd\n"); exit(1); } snprintf(path, sizeof(path), "%s/%s", cwd, filename); unlink(path); addr.sun_family = AF_UNIX; strncpy(addr.sun_path, path, sizeof(addr.sun_path)); addrlen = sizeof(addr.sun_family) + strlen(path); sk[0] = socket(AF_UNIX, SOCK_STREAM, 0); sk[1] = socket(AF_UNIX, SOCK_STREAM, 0); if (sk[0] < 0 || sk[1] < 0) { fail("socket\n"); exit(1); } ret = bind(sk[0], (struct sockaddr *) &addr, addrlen); if (ret) { fail("bind\n"); exit(1); } ret = listen(sk[0], 16); if (ret) { fail("listen\n"); exit(1); } ret = shutdown(sk[1], SHUT_RD); if (ret) { fail("shutdown\n"); exit(1); } ret = connect(sk[1], (struct sockaddr *) &addr, addrlen); if (ret) { fail("connect\n"); exit(1); } sk[2] 
= accept(sk[0], NULL, NULL); if (sk[2] < 0) { fail("accept"); exit(1); } test_daemon(); test_waitsig(); if (write(sk[1], SK_DATA, sizeof(SK_DATA)) < 0) { fail("write\n"); exit(1); } if (read(sk[2], &buf, sizeof(buf)) < 0) { fail("read\n"); exit(1); } if (strncmp(buf, SK_DATA, sizeof(SK_DATA))) { fail("data corrupted\n"); exit(1); } if (write(sk[2], SK_DATA, sizeof(SK_DATA)) >= 0) { fail("successful write to shutdown receiver\n"); exit(1); } close(sk[0]); close(sk[1]); close(sk[2]); pass(); return 0; } criu-3.6/test/zdtm/static/sockets03.desc000066400000000000000000000000221317335042600202240ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/sockets_dgram.c000066400000000000000000000131171317335042600205500ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test unix dgram sockets\n"; const char *test_author = "Cyrill Gorcunov = sizeof(name_bound.sun_path)) { fail("too long path"); exit(1); } name_bound.sun_family = AF_UNIX; strncpy(name_bound.sun_path, path, sizeof(name_bound.sun_path)); snprintf(path, sizeof(path), "%s/%s.conn", dirname, filename); unlink(path); if (strlen(path) >= sizeof(name_conn.sun_path)) { fail("too long path"); exit(1); } name_conn.sun_family = AF_UNIX; strncpy(name_conn.sun_path, path, sizeof(name_conn.sun_path)); snprintf(path, sizeof(path), "%s/%s.bound-conn", dirname, filename); unlink(path); if (strlen(path) >= sizeof(name_bound_conn.sun_path)) { fail("too long path"); exit(1); } name_bound_conn.sun_family = AF_UNIX; strncpy(name_bound_conn.sun_path, path, sizeof(name_bound_conn.sun_path)); ret = bind(sk_dgram_bound_server, (struct sockaddr *) &name_bound, sizeof(name_bound)); if (ret) { fail("bind"); exit(1); } ret = bind(sk_dgram_conn_server, (struct sockaddr *) &name_conn, sizeof(name_conn)); if (ret) { fail("bind"); exit(1); } ret = connect(sk_dgram_conn_client, (struct sockaddr *) 
&name_conn, sizeof(name_conn)); if (ret) { fail("connect"); exit(1); } ret = connect(sk_dgram_conn_client2, (struct sockaddr *) &name_conn, sizeof(name_conn)); if (ret) { fail("connect"); exit(1); } ret = bind(sk_dgram_bound_conn, (struct sockaddr *) &name_bound_conn, sizeof(name_bound_conn)); if (ret) { fail("bind"); exit(1); } /* Note, it's already bound, so make it more idiotic! */ ret = connect(sk_dgram_bound_conn, (struct sockaddr *) &name_bound_conn, sizeof(name_bound_conn)); if (ret) { fail("connect"); exit(1); } memset(buf, 0, sizeof(buf)); sendto(sk_dgram_bound_client, SK_DATA_BOUND, sizeof(SK_DATA_BOUND), 0, (struct sockaddr *) &name_bound, sizeof(name_bound)); read(sk_dgram_bound_server, &buf, sizeof(buf)); if (strcmp(buf, SK_DATA_BOUND)) { fail("data corrupted\n"); exit(1); } test_msg("dgram-bound : '%s'\n", buf); memset(buf, 0, sizeof(buf)); write(sk_dgram_conn_client, SK_DATA_CONN, sizeof(SK_DATA_CONN)); read(sk_dgram_conn_server, &buf, sizeof(buf)); if (strcmp(buf, SK_DATA_CONN)) { fail("data corrupted\n"); exit(1); } test_msg("dgram-conn : '%s'\n", buf); memset(buf, 0, sizeof(buf)); write(sk_dgram_bound_conn, SK_DATA_BOUND_CONN, sizeof(SK_DATA_BOUND_CONN)); read(sk_dgram_bound_conn, &buf, sizeof(buf)); if (strcmp(buf, SK_DATA_BOUND_CONN)) { fail("data corrupted\n"); exit(1); } test_msg("dgram-bound-conn : '%s'\n", buf); test_daemon(); test_waitsig(); memset(buf, 0, sizeof(buf)); sendto(sk_dgram_bound_client, SK_DATA_BOUND, sizeof(SK_DATA_BOUND), 0, (struct sockaddr *) &name_bound, sizeof(name_bound)); read(sk_dgram_bound_server, &buf, sizeof(buf)); if (strcmp(buf, SK_DATA_BOUND)) { fail("data corrupted\n"); exit(1); } test_msg("dgram-bound : '%s'\n", buf); memset(buf, 0, sizeof(buf)); write(sk_dgram_conn_client, SK_DATA_CONN, sizeof(SK_DATA_CONN)); read(sk_dgram_conn_server, &buf, sizeof(buf)); if (strcmp(buf, SK_DATA_CONN)) { fail("data corrupted\n"); exit(1); } test_msg("dgram-conn : '%s'\n", buf); memset(buf, 0, sizeof(buf)); 
write(sk_dgram_bound_conn, SK_DATA_BOUND_CONN, sizeof(SK_DATA_BOUND_CONN)); read(sk_dgram_bound_conn, &buf, sizeof(buf)); if (strcmp(buf, SK_DATA_BOUND_CONN)) { fail("data corrupted\n"); exit(1); } test_msg("dgram-bound-conn : '%s'\n", buf); pass(); /* * Do cleanup work */ unlink(name_bound.sun_path); unlink(name_conn.sun_path); unlink(name_bound_conn.sun_path); return 0; } criu-3.6/test/zdtm/static/sockets_spair.c000066400000000000000000000021601317335042600205700ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test unix stream socketpair\n"; const char *test_author = "Cyrill Gorcunov #include #include "zdtmtst.h" const char *test_doc = "Start a calculation, leaving SSE in a certain state,\n" "before migration, continue after"; const char *test_author = "Pavel Emelianov "; #if defined(__i386__) || defined(__x86_64__) void start(float *in) { __asm__ volatile ( "movaps %0, %%xmm0\n" "movaps %1, %%xmm1\n" "addps %%xmm0, %%xmm1\n" "sqrtps %%xmm1, %%xmm2\n" : : "m" (in[0]), "m" (in[4]) ); } void finish(float *out) { __asm__ volatile ( "movaps %%xmm1, %0\n" "movaps %%xmm2, %1\n" : "=m" (out[0]), "=m" (out[4]) ); } static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { __asm__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "c"(0)); } int chk_proc_sse(void) { unsigned int eax, ebx, ecx, edx; cpuid(1, &eax, &ebx, &ecx, &edx); return edx & (1 << 25); } #endif int main(int argc, char **argv) { #if defined(__i386__) || defined(__x86_64__) float input[8] __attribute__((aligned(16))); float res1[8] __attribute__((aligned(16))); float res2[8] __attribute__((aligned(16))); int i; #endif test_init(argc, argv); #if defined(__i386__) || defined(__x86_64__) if (!chk_proc_sse()) { skip("SSE not supported"); return 1; } for (i = 0; i < sizeof(input) / sizeof(float); 
i++) input[i] = drand48(); start(input); finish(res1); start(input); finish(res1); test_daemon(); test_waitsig(); finish(res2); if (memcmp((uint8_t *) res1, (uint8_t *) res2, sizeof(res1))) fail("results differ\n"); else pass(); #else skip("Unsupported arch"); #endif return 0; } criu-3.6/test/zdtm/static/sse00.desc000066400000000000000000000000231317335042600173410ustar00rootroot00000000000000{'arch': 'x86_64'} criu-3.6/test/zdtm/static/sse20.c000066400000000000000000000033641317335042600166620ustar00rootroot00000000000000#include #include #include "zdtmtst.h" const char *test_doc = "Start a calculation, leaving SSE2 in a certain state,\n" "before migration, continue after"; const char *test_author = "Pavel Emelianov "; #if defined(__i386__) || defined(__x86_64__) void start(double *in) { __asm__ volatile ( "movapd %0, %%xmm0\n" "movapd %1, %%xmm1\n" "addpd %%xmm0, %%xmm1\n" "sqrtpd %%xmm1, %%xmm2\n" : : "m" (in[0]), "m" (in[2]) ); } void finish(double *out) { __asm__ volatile ( "movapd %%xmm1, %0\n" "movapd %%xmm2, %1\n" : "=m" (out[0]), "=m" (out[2]) ); } static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { __asm__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "c"(0)); } int chk_proc_sse2(void) { unsigned int eax, ebx, ecx, edx; cpuid(1, &eax, &ebx, &ecx, &edx); return edx & (1 << 26); } #endif int main(int argc, char **argv) { #if defined(__i386__) || defined(__x86_64__) double input[4] __attribute__((aligned(16))); double res1[4] __attribute__((aligned(16))); double res2[4] __attribute__((aligned(16))); int i; #endif test_init(argc, argv); #if defined(__i386__) || defined(__x86_64__) if (!chk_proc_sse2()) { skip("SSE2 not supported"); return 1; } for (i = 0; i < sizeof(input) / sizeof(double); i++) input[i] = drand48(); start(input); finish(res1); start(input); test_daemon(); test_waitsig(); finish(res2); if (memcmp((uint8_t *) res1, (uint8_t *) res2, sizeof(res1))) 
fail("results differ\n"); else pass(); #else skip("Unsupported arch"); #endif return 0; } criu-3.6/test/zdtm/static/sse20.desc000066400000000000000000000000231317335042600173430ustar00rootroot00000000000000{'arch': 'x86_64'} criu-3.6/test/zdtm/static/stopped.c000066400000000000000000000027571317335042600174110ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check, that stopped tasts are restored correctly"; const char *test_author = "Andrew Vagin "; int main(int argc, char **argv) { pid_t pid; siginfo_t infop; int p[2], ret, status; test_init(argc, argv); if (pipe(p)) { pr_perror("Unable to create pipe"); return 1; } pid = test_fork(); if (pid < 0) return -1; else if (pid == 0) { char c; close(p[1]); ret = read(p[0], &c, 1); if (ret != 1) { pr_perror("Unable to read: %d", ret); return 1; } return 0; } close(p[0]); kill(pid, SIGSTOP); if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { pr_perror("waitid"); return 1; } #ifdef ZDTM_STOPPED_TKILL syscall(__NR_tkill, pid, SIGSTOP); #endif #ifdef ZDTM_STOPPED_KILL kill(pid, SIGSTOP); #endif write(p[1], "0", 1); close(p[1]); test_daemon(); test_waitsig(); // Return immediately if child run or stopped(by SIGSTOP) if (waitpid(pid, &status, WUNTRACED | WCONTINUED) == -1) { pr_perror("Unable to wait child"); goto out; } if (WIFSTOPPED(status)) test_msg("The procces stopped\n"); else{ fail("The process doesn't stopped"); goto out; } kill(pid, SIGCONT); if (waitpid(pid, &status, 0) == -1) { pr_perror("Unable to wait child"); goto out; } if (WIFEXITED(status)) pass(); else fail("The process doesn't continue"); out: return 0; } 
criu-3.6/test/zdtm/static/stopped01.c000077700000000000000000000000001317335042600213542stopped.custar00rootroot00000000000000criu-3.6/test/zdtm/static/stopped02.c000077700000000000000000000000001317335042600213552stopped.custar00rootroot00000000000000criu-3.6/test/zdtm/static/stopped12.c000077700000000000000000000000001317335042600213562stopped.custar00rootroot00000000000000criu-3.6/test/zdtm/static/tempfs.c000066400000000000000000000042011317335042600172130ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check tmpfs mount"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define TEST_WORD "testtest" #define TEST_WORD2 "TESTTEST" int main(int argc, char **argv) { int fd, fdo, ret = 1; char buf[1024], fname[PATH_MAX], overmount[PATH_MAX]; test_init(argc, argv); mkdir(dirname, 0700); if (mount("none", dirname, "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } snprintf(fname, sizeof(buf), "%s/test.file", dirname); fdo = open(fname, O_RDWR | O_CREAT, 0644); if (fdo < 0) { pr_perror("open failed"); goto err; } if (write(fdo, TEST_WORD, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { pr_perror("write() failed"); goto err; } snprintf(overmount, sizeof(buf), "%s/test", dirname); mkdir(overmount, 0700); snprintf(fname, sizeof(buf), "%s/test.file", overmount); fd = open(fname, O_RDWR | O_CREAT, 0644); if (fd < 0) { pr_perror("open failed"); goto err; } if (write(fd, TEST_WORD2, sizeof(TEST_WORD2)) != sizeof(TEST_WORD2)) { pr_perror("write() failed"); goto err; } close(fd); if (mount("none", overmount, "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); goto err; } test_daemon(); test_waitsig(); if (umount(overmount) < 0) { fail("Can't mount tmpfs"); goto err; } lseek(fdo, 0, SEEK_SET); buf[sizeof(TEST_WORD) + 1] = '\0'; if (read(fdo, buf, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { fail("Read failed"); goto err; } 
close(fdo); if (strcmp(buf, TEST_WORD)) { fail("File corrupted"); goto err; } fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("open failed"); goto err; } buf[sizeof(TEST_WORD2) + 1] = '\0'; if (read(fd, buf, sizeof(TEST_WORD2)) != sizeof(TEST_WORD2)) { fail("Read failed"); goto err; } close(fd); if (strcmp(buf, TEST_WORD2)) { fail("File corrupted"); goto err; } pass(); ret = 0; err: umount2(dirname, MNT_DETACH); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/tempfs.desc000066400000000000000000000000461317335042600177120ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/tempfs_overmounted.c000066400000000000000000000012521317335042600216450ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check tmpfs mount"; const char *test_author = "Pavel Emelianov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { test_init(argc, argv); mkdir(dirname, 0700); if (mount("none", dirname, "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } if (mount("none", dirname, "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } test_daemon(); test_waitsig(); pass(); return 0; } criu-3.6/test/zdtm/static/tempfs_overmounted.desc000066400000000000000000000000551317335042600223410ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid crfail'} criu-3.6/test/zdtm/static/tempfs_overmounted01.c000066400000000000000000000044111317335042600220060ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check how file systems are dumped if some mount points are overmounted"; const char *test_author = "Andrei Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { task_waiter_t lock; int pid, status = -1; test_init(argc, argv); 
task_waiter_init(&lock); mkdir(dirname, 0700); pid = fork(); if (pid < 0) { pr_perror("fork"); return 1; } if (pid == 0) { if (mount("zdtm", dirname, "tmpfs", 0, "") < 0) { pr_err("Can't mount tmpfs"); return 1; } if (chdir(dirname)) { pr_err("chdir"); return 1; } /* * We don't know a direction in which criu enumerates mount, * so lets create two chains of mounts. */ /* Create a chain when a parent mount is overmounted */ mkdir("a", 0700); mkdir("b", 0700); if (mount("zdtm1", "a", "tmpfs", 0, "") || mount("a", "b", NULL, MS_BIND, "")) { pr_perror("Can't mount tmpfs"); return 1; } mkdir("a/b", 0700); mkdir("a/b/c", 0700); if (mount("a/b", "a", NULL, MS_BIND, "")) { pr_perror("mount"); return 1; } if (mount("b", "a/c", NULL, MS_MOVE, "")) { pr_perror("Can't mount tmpfs"); return 1; } /* create a second chain where a child mount is overmounted*/ if (mount("zdtm2", "b", "tmpfs", 0, "")) { pr_perror("can't mount tmpfs"); return 1; } mkdir("b/b", 0700); mkdir("b/b/z", 0700); if (mount("b", "b/b", NULL, MS_BIND, NULL) || mount("b/b/b", "b/b", NULL, MS_BIND, NULL)) { pr_perror("can't mount tmpfs"); return 1; } task_waiter_complete(&lock, 1); test_waitsig(); if (umount2("a", MNT_DETACH)) { pr_perror("umount"); return 1; } if (umount2("b/b", MNT_DETACH) || umount2("b/b", MNT_DETACH)) { pr_perror("umount"); return 1; } if (access("a/b/c", R_OK) || access("b/b/z", R_OK)) { pr_perror("access"); return 1; } return 0; } task_waiter_wait4(&lock, 1); test_daemon(); test_waitsig(); kill(pid, SIGTERM); wait(&status); if (status) { fail(); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/tempfs_overmounted01.desc000066400000000000000000000000461317335042600225020ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/tempfs_ro.c000066400000000000000000000026561317335042600177270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check read-only tmpfs 
mount"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define TEST_WORD "testtest" int main(int argc, char **argv) { int fd, ret = 1; char buf[1024], fname[PATH_MAX]; test_init(argc, argv); mkdir(dirname, 0700); if (mount("none", dirname, "tmpfs", 0, "") < 0) { fail("Can't mount tmpfs"); return 1; } snprintf(fname, sizeof(buf), "%s/test.file", dirname); fd = open(fname, O_RDWR | O_CREAT, 0644); if (fd < 0) { pr_perror("open failed"); goto err; } if (write(fd, TEST_WORD, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { pr_perror("write() failed"); goto err; } close(fd); if (mount(NULL, dirname, "tmpfs", MS_REMOUNT | MS_RDONLY, NULL) < 0) { fail("Can't mount tmpfs"); return 1; } test_daemon(); test_waitsig(); fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("open failed"); goto err; } buf[sizeof(TEST_WORD) + 1] = '\0'; if (read(fd, buf, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { fail("Read failed"); goto err; } close(fd); if (strcmp(buf, TEST_WORD)) { fail("File corrupted"); goto err; } pass(); ret = 0; err: umount2(dirname, MNT_DETACH); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/tempfs_ro.desc000066400000000000000000000000421317335042600204060ustar00rootroot00000000000000{'flavor': 'ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/tempfs_ro02.c000066400000000000000000000016511317335042600200630ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check read-only tmpfs mount"; const char *test_author = "Andrew Vagin "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define TEST_WORD "testtest" int main(int argc, char **argv) { int fd, ret = 1; char buf[1024], fname[PATH_MAX]; test_init(argc, argv); mkdir(dirname, 0700); if (mount("none", dirname, "tmpfs", MS_RDONLY, "") < 0) { fail("Can't mount tmpfs"); return 1; } snprintf(fname, sizeof(buf), "%s/test.file", dirname); test_daemon(); 
test_waitsig(); fd = open(fname, O_RDWR | O_CREAT, 0777); if (fd >= 0 || errno != EROFS) { pr_perror("open failed -> %d", fd); goto err; } pass(); ret = 0; err: umount2(dirname, MNT_DETACH); rmdir(dirname); return ret; } criu-3.6/test/zdtm/static/tempfs_ro02.desc000066400000000000000000000000461317335042600205540ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} criu-3.6/test/zdtm/static/tempfs_subns.c000066400000000000000000000046311317335042600204340ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check tmpfs in a non-root mntns"; const char *test_author = "Andrew Vagin #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define exit_group(code) \ syscall(__NR_exit_group, code) const char *test_doc = "Acquire UID/GID setting caps, create thread and drop thread to non-root by changing UID/GID\n"; const char *test_author = "Vitaly Ostrosablin "; unsigned int gid; unsigned int uid; pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; pthread_cond_t cond = PTHREAD_COND_INITIALIZER; task_waiter_t t; int done = 0; void *chg_uid_gid(void *arg) { cap_t newcaps; cap_t mycaps; int ret; test_msg("Aux thread runs as UID: %d; GID: %d\n", getuid(), getgid()); newcaps = cap_from_text("cap_setgid,cap_setuid=+eip"); if (!newcaps) { pr_perror("Failed to get capability struct\n"); exit(1); } ret = cap_set_proc(newcaps); if (ret) { pr_perror("Failed to set capabilities for the process\n"); exit(1); } mycaps = cap_get_proc(); if (!mycaps) { pr_perror("Failed to get child thread capabilities\n"); exit_group(2); } test_msg("Child capabilities: %s\n", cap_to_text(mycaps, NULL)); test_msg("Changing UID/GID in child thread to %d:%d\n", uid, gid); ret = syscall(SYS_setresgid, gid, gid, gid); if (ret >= 0) { syscall(SYS_setresuid, uid, uid, uid); } else if (ret < 0) { pr_perror("Failed to change UID/GID\n"); exit_group(2); } gid = 
getgid(); uid = getuid(); test_msg("Now aux thread runs as UID: %d; GID: %d\n", uid, gid); test_msg("Child thread is waiting for main thread's signal\n"); task_waiter_complete(&t, 1); pthread_mutex_lock(&mutex); while (!done) { pthread_cond_wait(&cond, &mutex); } pthread_mutex_unlock(&mutex); test_msg("Child thread returns\n"); return NULL; } int main(int argc, char **argv) { pthread_t diff_cred_thread; cap_t newcaps; int maingroup; int mainuser; int ret; test_init(argc, argv); task_waiter_init(&t); if (getuid() != 0) { fail("Test is expected to be run with root privileges\n"); exit(1); } test_msg("Acquiring CAP_SETGID and CAP_SETUID...\n"); newcaps = cap_from_text("cap_setgid,cap_setuid=+eip"); if (!newcaps) { pr_perror("Failed to get capability struct\n"); exit(1); } ret = cap_set_proc(newcaps); if (ret) { pr_perror("Failed to set capabilities for the process\n"); exit(1); } ret = prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0); if (ret) { pr_perror("Unable to set KEEPCAPS\n"); exit(1); } test_msg("Main thread runs as UID: %d; GID: %d\n", getuid(), getgid()); gid = 99; uid = 99; maingroup = 8; mainuser = 12; test_msg("Creating thread with different UID/GID\n"); ret = pthread_create(&diff_cred_thread, NULL, &chg_uid_gid, NULL); task_waiter_wait4(&t, 1); test_msg("Relinquishing root privileges\n"); ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); if (ret >= 0) { ret = syscall(SYS_setresuid, mainuser, mainuser, mainuser); } else if (ret < 0) { pr_perror("Failed to drop privileges\n"); exit(1); } test_msg("Now main thread runs as UID: %d; GID: %d\n", getuid(), getgid()); if (gid == getgid() || uid == getuid()) { pr_perror("Thread credentials match\n"); exit(1); } test_msg("Main thread is waiting for signal\n"); test_daemon(); test_waitsig(); if (gid == getgid() || uid == getuid()) { pr_perror("Thread credentials match after restore\n"); exit(1); } pthread_mutex_lock(&mutex); done = 1; pthread_cond_signal(&cond); pthread_mutex_unlock(&mutex); 
pthread_join(diff_cred_thread, NULL); test_msg("Threads joined\n"); pass(); return 0; } criu-3.6/test/zdtm/static/thread_different_uid_gid.desc000066400000000000000000000000221317335042600233670ustar00rootroot00000000000000{'flags': 'suid'} criu-3.6/test/zdtm/static/timerfd.c000066400000000000000000000066521317335042600173630ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Checks timerfd survives checkpoint/restore\n"; const char *test_author = "Cyrill Gorcunov "; #define TIMERFD_VNSEC 50000 #define TIMERFD_ISEC 4 struct timerfd_status { int clockid; uint64_t ticks; int settime_flags; struct itimerspec v; }; static void show_timerfd(char *prefix, struct timerfd_status *s) { test_msg("\t%s clockid %d ticks %llu settime_flags %d it_value(%llu, %llu) it_interval(%llu, %llu)\n", prefix, s->clockid, (unsigned long long)s->ticks, s->settime_flags, (unsigned long long)s->v.it_value.tv_sec, (unsigned long long)s->v.it_value.tv_nsec, (unsigned long long)s->v.it_interval.tv_sec, (unsigned long long)s->v.it_interval.tv_nsec); } static int parse_self_fdinfo(int fd, struct timerfd_status *s) { char buf[256]; int ret = -1; FILE *f; sprintf(buf, "/proc/self/fdinfo/%d", fd); f = fopen(buf, "r"); if (!f) { pr_perror("Can't open %s to parse", buf); return -1; } memset(s, 0, sizeof(*s)); /* * clockid: 0 * ticks: 0 * settime flags: 01 * it_value: (0, 49406829) * it_interval: (1, 0) */ while (fgets(buf, sizeof(buf), f)) { if (strncmp(buf, "clockid:", 8)) continue; if (sscanf(buf, "clockid: %d", &s->clockid) != 1) goto parse_err; if (!fgets(buf, sizeof(buf), f)) goto parse_err; if (sscanf(buf, "ticks: %llu", (unsigned long long *)&s->ticks) != 1) goto parse_err; if (!fgets(buf, sizeof(buf), f)) goto parse_err; if (sscanf(buf, "settime flags: 0%o", &s->settime_flags) != 1) goto parse_err; if (!fgets(buf, sizeof(buf), f)) goto parse_err; if (sscanf(buf, "it_value: (%llu, %llu)", (unsigned long long 
*)&s->v.it_value.tv_sec, (unsigned long long *)&s->v.it_value.tv_nsec) != 2) goto parse_err; if (!fgets(buf, sizeof(buf), f)) goto parse_err; if (sscanf(buf, "it_interval: (%llu, %llu)", (unsigned long long *)&s->v.it_interval.tv_sec, (unsigned long long *)&s->v.it_interval.tv_nsec) != 2) goto parse_err; ret = 0; break; } if (ret) goto parse_err; err: fclose(f); return ret; parse_err: pr_perror("Format error"); goto err; } static int check_timerfd(int fd, struct timerfd_status *old) { struct timerfd_status new; if (parse_self_fdinfo(fd, &new)) return -1; show_timerfd("restored", &new); if (old->clockid != new.clockid || old->settime_flags != new.settime_flags || old->ticks > new.ticks || old->v.it_value.tv_sec > new.v.it_value.tv_sec || old->v.it_interval.tv_sec != new.v.it_interval.tv_sec) return -1; return 0; } int main(int argc, char *argv[]) { struct timerfd_status old = { .clockid = CLOCK_MONOTONIC, .ticks = 0, .settime_flags = 0, .v = { .it_value = { .tv_sec = 0, .tv_nsec= TIMERFD_VNSEC, }, .it_interval = { .tv_sec = TIMERFD_ISEC, .tv_nsec= 0, }, }, }; int timerfd = 0, ret; test_init(argc, argv); timerfd = timerfd_create(old.clockid, 0); if (timerfd < 0) { pr_perror("timerfd_create failed"); return -1; } show_timerfd("setup", &old); if (timerfd_settime(timerfd, old.settime_flags, &old.v, NULL)) { pr_perror("timerfd_settime failed"); return -1; } sleep(1); test_daemon(); test_waitsig(); ret = check_timerfd(timerfd, &old); if (ret) fail(); else pass(); return ret; } criu-3.6/test/zdtm/static/timerfd.desc000066400000000000000000000000271317335042600200450ustar00rootroot00000000000000{'feature': 'timerfd'} criu-3.6/test/zdtm/static/timers.c000066400000000000000000000035571317335042600172350ustar00rootroot00000000000000#include #include #include #include "zdtmtst.h" const char *test_doc = "Checks timers keep ticking after migration\n"; const char *test_author = "Pavel Emelianov "; static struct { const int timer_type; const int signal; volatile sig_atomic_t count; 
} timer_tests[] = { /* from slowest to fastest */ { ITIMER_VIRTUAL, SIGVTALRM }, { ITIMER_PROF, SIGPROF }, { ITIMER_REAL, SIGALRM }, }; #define NUM_TIMERS (sizeof(timer_tests) / sizeof(timer_tests[0])) #define MAX_TIMER_COUNT 10 static void timer_tick(int sig) { int i; for (i = 0; i < NUM_TIMERS; i++) if (timer_tests[i].signal == sig) { /* don't go beyond MAX_TIMER_COUNT, to avoid overflow */ if (timer_tests[i].count < MAX_TIMER_COUNT) timer_tests[i].count++; break; } } static void setup_timers(void) { int i; struct itimerval tv = { .it_interval = { .tv_sec = 0, .tv_usec = 100000 }, .it_value = { .tv_sec = 0, .tv_usec = 100 }, }; for (i = 0; i < NUM_TIMERS; i++) { if (signal(timer_tests[i].signal, timer_tick) == SIG_ERR) { pr_perror("can't set signal handler %d", i); exit(1); } if (setitimer(timer_tests[i].timer_type, &tv, NULL) < 0) { pr_perror("can't set timer %d", i); exit(1); } } } static void check_timers(void) { int i; volatile unsigned int j; /* avoid optimizing the loop away */ for (i = 0; i < NUM_TIMERS; i++) /* reset counters first */ timer_tests[i].count = 0; /* waste some real and CPU time: run for MAX_TIMER_COUNT ticks or until * j overflows */ for (j = 1; j && timer_tests[0].count < MAX_TIMER_COUNT; j++); for (i = 0; i < NUM_TIMERS; i++) if (!timer_tests[i].count) { fail("timer %d stuck", i); return; } pass(); } int main(int argc, char **argv) { test_init(argc, argv); setup_timers(); test_daemon(); test_waitsig(); check_timers(); return 0; } criu-3.6/test/zdtm/static/tty00.c000066400000000000000000000034711317335042600167050ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #include #include "zdtmtst.h" #include #include #include #include #include #include #include #include const char *test_doc = "Check that a control terminal is restored"; const char *test_author = "Andrey Vagin "; static int sighup = 0; static void sighup_handler(int signo) { test_msg("SIGHUP is here\n"); sighup = 1; } int main(int argc, char ** argv) { int fdm, fds, status; 
task_waiter_t t; char *slavename; pid_t pid; test_init(argc, argv); task_waiter_init(&t); fdm = open("/dev/ptmx", O_RDWR); if (fdm == -1) { pr_perror("Can't open a master pseudoterminal"); return 1; } grantpt(fdm); unlockpt(fdm); slavename = ptsname(fdm); pid = test_fork(); if (pid < 0) { pr_perror("fork() failed"); return 1; } if (pid == 0) { close(fdm); signal(SIGHUP, sighup_handler); if (setsid() == -1) return 1; /* set up a controlling terminal */ fds = open(slavename, O_RDWR); if (fds == -1) { pr_perror("Can't open a slave pseudoterminal %s", slavename); return 1; } if (ioctl(fdm, TIOCSCTTY, 1) < 0) { pr_perror("Can't setup a controlling terminal"); return 1; } close(fds); task_waiter_complete_current(&t); test_waitsig(); if (sighup) return 0; return 1; } task_waiter_wait4(&t, pid); test_daemon(); test_waitsig(); close(fdm); if (kill(pid, SIGTERM) == -1) { pr_perror("kill failed"); return 1; } pid = waitpid(pid, &status, 0); if (pid < 0) return 1; if (WIFEXITED(status)) { if (WEXITSTATUS(status)) { fail("The child returned %d", WEXITSTATUS(status)); return 1; } } else { test_msg("The child has been killed by %d\n", WTERMSIG(status)); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/tty02.c000066400000000000000000000017421317335042600167060ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #include #include "zdtmtst.h" #include #include #include #include #include #include #include const char *test_doc = "Check a non-controling terminal"; const char *test_author = "Andrey Vagin "; int main(int argc, char ** argv) { int fdm, fds; char *slavename; pid_t sid; test_init(argc, argv); setsid(); fdm = open("/dev/ptmx", O_RDWR); if (fdm == -1) { pr_perror("Can't open a master pseudoterminal"); return 1; } grantpt(fdm); unlockpt(fdm); slavename = ptsname(fdm); /* set up a controlling terminal */ fds = open(slavename, O_RDWR | O_NOCTTY); if (fds == -1) { pr_perror("Can't open a slave pseudoterminal %s", slavename); return 1; } test_daemon(); test_waitsig(); 
if (ioctl(fds, TIOCGSID, &sid) != -1 || errno != ENOTTY) { fail("The tty is a controlling for someone"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/tty03.c000066400000000000000000000040561317335042600167100ustar00rootroot00000000000000#define _XOPEN_SOURCE 500 #include #include "zdtmtst.h" #include #include #include #include #include #include #include const char *test_doc = "Check a controlling terminal, if a proper fd belongs to another session leader"; const char *test_author = "Andrey Vagin "; int main(int argc, char ** argv) { int fdm, fds, exit_code = 1, status; task_waiter_t t; char *slavename; pid_t sid_b, sid_a, pid; int pfd[2]; test_init(argc, argv); task_waiter_init(&t); if (pipe(pfd) == -1) { pr_perror("pipe"); return 1; } fdm = open("/dev/ptmx", O_RDWR); if (fdm == -1) { pr_perror("Can't open a master pseudoterminal"); return 1; } grantpt(fdm); unlockpt(fdm); slavename = ptsname(fdm); pid = test_fork(); if (pid == 0) { if (setsid() == -1) { pr_perror("setsid"); return 1; } close(pfd[0]); /* set up a controlling terminal */ fds = open(slavename, O_RDWR | O_NOCTTY); if (fds == -1) { pr_perror("Can't open a slave pseudoterminal %s", slavename); return 1; } ioctl(fds, TIOCSCTTY, 1); pid = test_fork(); if (pid == 0) { if (setsid() == -1) { pr_perror("setsid"); return 1; } close(pfd[1]); task_waiter_complete(&t, 1); test_waitsig(); exit(0); } close(fds); close(pfd[1]); task_waiter_wait4(&t, 1); task_waiter_complete(&t, 0); test_waitsig(); kill(pid, SIGTERM); wait(&status); exit(status); } close(pfd[1]); if (read(pfd[0], &sid_a, 1) != 0) { pr_perror("read"); goto out; } if (ioctl(fdm, TIOCGSID, &sid_b) == -1) { pr_perror("The tty is not a controlling"); goto out; } task_waiter_wait4(&t, 0); test_daemon(); test_waitsig(); if (ioctl(fdm, TIOCGSID, &sid_a) == -1) { fail("The tty is not a controlling"); goto out; } if (sid_b != sid_a) { fail("The tty is controlling for someone else"); goto out; } exit_code = 0; out: kill(pid, SIGTERM); 
wait(&status); if (status == 0 && exit_code == 0) pass(); return exit_code; } criu-3.6/test/zdtm/static/tun.c000066400000000000000000000107131317335042600165300ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test TUN/TAP devices\n"; const char *test_author = "Pavel Emelianov "; #define TUN_DEVICE "/dev/net/tun" #ifndef IFF_MULTI_QUEUE #define IFF_MULTI_QUEUE 0x0100 #define IFF_ATTACH_QUEUE 0x0200 #define IFF_DETACH_QUEUE 0x0400 #define IFF_PERSIST 0x0800 #endif #ifndef TUNSETQUEUE #define TUNSETQUEUE _IOW('T', 217, int) #endif static int any_fail = 0; static int __open_tun(void) { int fd; fd = open(TUN_DEVICE, O_RDWR); if (fd < 0) pr_perror("Can't open tun file"); return fd; } static int set_tun_queue(int fd, unsigned flags) { struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = flags; if (ioctl(fd, TUNSETQUEUE, &ifr) < 0) { pr_perror("Can't set queue"); return -1; } return 0; } static int __attach_tun(int fd, char *name, unsigned flags) { struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); strcpy(ifr.ifr_name, name); ifr.ifr_flags = flags; if (ioctl(fd, TUNSETIFF, &ifr) < 0) { if (!(flags & IFF_TUN_EXCL)) pr_perror("Can't attach iff %s", name); return -1; } return fd; } static int open_tun(char *name, unsigned flags) { int fd; fd = __open_tun(); if (fd < 0) return -1; return __attach_tun(fd, name, flags); } static void check_tun(int fd, char *name, unsigned flags) { struct ifreq ifr; if (ioctl(fd, TUNGETIFF, &ifr) > 0) { any_fail = 1; fail("Attached tun %s file lost device", name); } if (strcmp(ifr.ifr_name, name)) { any_fail = 1; fail("Attached tun %s wrong device", name); } if ((ifr.ifr_flags & flags) != flags) { any_fail = 1; fail("Attached tun %s wrong device type", name); } } static int dev_get_hwaddr(int fd, char *a) { struct ifreq ifr; if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { pr_perror("Can't get hwaddr"); return -1; } memcpy(a, ifr.ifr_hwaddr.sa_data, 
ETH_ALEN); return 0; } int main(int argc, char **argv) { int fds[5], ret; char addr[ETH_ALEN], a2[ETH_ALEN]; test_init(argc, argv); /* fd[0] -- opened file */ fds[0] = __open_tun(); if (fds[0] < 0) { pr_perror("No file 0"); return -1; } /* fd[1] -- opened file with tun device */ fds[1] = open_tun("tunx0", IFF_TUN); if (fds[1] < 0) { pr_perror("No file 1"); return -1; } /* fd[2] and [3] -- two-queued device, with 3 detached */ fds[2] = open_tun("tunx1", IFF_TUN | IFF_MULTI_QUEUE); if (fds[2] < 0) { pr_perror("No file 2"); return -1; } fds[3] = open_tun("tunx1", IFF_TUN | IFF_MULTI_QUEUE); if (fds[3] < 0) { pr_perror("No file 3"); return -1; } ret = set_tun_queue(fds[3], IFF_DETACH_QUEUE); if (ret < 0) return -1; /* special case -- persistent device */ ret = open_tun("tunx2", IFF_TUN); if (ret < 0) { pr_perror("No persistent device"); return -1; } if (ioctl(ret, TUNSETPERSIST, 1) < 0) { pr_perror("Can't make persistent"); return -1; } /* and one tap in fd[4] */ fds[4] = open_tun("tapx0", IFF_TAP); if (fds[4] < 0) { pr_perror("No tap"); return -1; } if (dev_get_hwaddr(fds[4], addr) < 0) { pr_perror("No hwaddr for tap?"); return -1; } close(ret); test_daemon(); test_waitsig(); /* check fds[0] is not attached to device */ ret = __attach_tun(fds[0], "tunx3", IFF_TUN); if (ret < 0) { any_fail = 1; fail("Opened tun file broken"); } /* check that fds[1] has device */ check_tun(fds[1], "tunx0", IFF_TUN); /* check that fds[2] and [3] are at MQ device with */ check_tun(fds[2], "tunx1", IFF_TUN | IFF_MULTI_QUEUE); check_tun(fds[3], "tunx1", IFF_TUN | IFF_MULTI_QUEUE); ret = set_tun_queue(fds[2], IFF_DETACH_QUEUE); if (ret < 0) { any_fail = 1; fail("Queue not attached"); } ret = set_tun_queue(fds[3], IFF_ATTACH_QUEUE); if (ret < 0) { any_fail = 1; fail("Queue not detached"); } /* check persistent device */ ret = open_tun("tunx2", IFF_TUN | IFF_TUN_EXCL); if (ret >= 0) { any_fail = 1; fail("Persistent device lost"); } else { ret = open_tun("tunx2", IFF_TUN); if (ret < 0) 
pr_perror("Can't attach tun2"); else ioctl(ret, TUNSETPERSIST, 0); } check_tun(fds[4], "tapx0", IFF_TAP); if (dev_get_hwaddr(fds[4], a2) < 0) { pr_perror("No hwaddr for tap? (2)"); any_fail = 1; } else if (memcmp(addr, a2, sizeof(addr))) { fail("Address mismatch on tap %x:%x -> %x:%x", (int)addr[0], (int)addr[1], (int)a2[0], (int)a2[1]); any_fail = 1; } if (!any_fail) pass(); return 0; } criu-3.6/test/zdtm/static/tun.desc000066400000000000000000000000701317335042600172170ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'tun'} criu-3.6/test/zdtm/static/uffd-events.c000066400000000000000000000057131317335042600201540ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test uffd events"; const char *test_author = "Mike Rapoport "; #define NR_MAPS 5 #define MAP_SIZE (1 << 20) static void *map[NR_MAPS]; static int create_mappings(void) { uint32_t crc; int i; for (i = 0; i < NR_MAPS; i++) { map[i] = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (map[i] == MAP_FAILED) { fail("mmap failed"); return 1; } crc = i; datagen(map[i], MAP_SIZE, &crc); } return 0; } static int verify_zeroes(void *m) { int i; for (i = 0; i < MAP_SIZE; i += PAGE_SIZE) { char *p = m + i; if (*p != 0) return 1; } return 0; } static int check_madv_dn(int idx) { void *m = map[idx]; if (madvise(m, MAP_SIZE, MADV_DONTNEED)) { fail("madvise failed"); return 1; } if (verify_zeroes(m)) { fail("not zero"); return 1; } return 0; } static int check_mremap_grow(int idx) { void *m = map[idx]; uint32_t crc = idx; m = mremap(m, MAP_SIZE, MAP_SIZE * 2, MREMAP_MAYMOVE); if (m == MAP_FAILED) { fail("mremap failed"); return 1; } if (datachk(m, MAP_SIZE, &crc)) { fail("Mem corrupted"); return 1; } /* the new part of the mapping should be filled with zeroes */ m += MAP_SIZE; if (verify_zeroes(m)) { fail("not zeroes"); return 1; } return 
0; } static int check_swapped_mappings(int idx) { uint32_t crc = idx; void *m1 = map[idx]; void *m2 = map[idx + 1]; void *p = map[0]; p = mremap(m1, MAP_SIZE, MAP_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, p); if (p == MAP_FAILED) { fail("mremap failed"); return 1; } m1 = mremap(m2, MAP_SIZE, MAP_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, m1); if (m1 == MAP_FAILED) { fail("mremap failed"); return 1; } m2 = mremap(p, MAP_SIZE, MAP_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, m2); if (m2 == MAP_FAILED) { fail("mremap failed"); return 1; } if (datachk(m2, MAP_SIZE, &crc)) { fail("Mem corrupted"); return 1; } crc = idx + 1; if (datachk(m1, MAP_SIZE, &crc)) { fail("Mem corrupted"); return 1; } return 0; } int main(int argc, char ** argv) { uint32_t crc; int pid; test_init(argc, argv); if (create_mappings()) return -1; test_daemon(); test_waitsig(); /* run some page faults */ crc = 0; if (datachk(map[0], MAP_SIZE, &crc)) { fail("Mem corrupted"); return 1; } pid = fork(); if (pid < 0) { fail("Can't fork"); return 1; } /* check madvise(MADV_DONTNEED) */ if (check_madv_dn(1)) return 1; /* check growing mremap */ if (check_mremap_grow(2)) return 1; /* check swapped mappings */ if (check_swapped_mappings(3)) return 1; if (pid) { int status; waitpid(-1, &status, 0); if (status) { fail("child failed"); return status; } } pass(); return 0; } criu-3.6/test/zdtm/static/umask00.c000066400000000000000000000010021317335042600171710ustar00rootroot00000000000000#include #include "zdtmtst.h" const char *test_doc = "Check that umask didn't change"; const char *test_author = "Pavel Emelianov "; unsigned int mask; TEST_OPTION(mask, uint, "umask", 1); int main(int argc, char **argv) { unsigned int cur_mask, mask2; test_init(argc, argv); cur_mask = umask(mask); test_daemon(); test_waitsig(); mask2 = umask(0); if (mask != mask2) fail("mask changed: %o != %o\n", mask, mask2); else pass(); umask(cur_mask); return 0; } 
criu-3.6/test/zdtm/static/unbound_sock.c000066400000000000000000000014671317335042600204210ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Create a socket before migration, and bind to it after\n"; const char *test_author = "Roman Kagan "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY int main(int argc, char ** argv) { int sock; struct sockaddr_in name = { .sin_family = AF_INET, .sin_port = htons(TEST_PORT), .sin_addr.s_addr = htonl(TEST_ADDR), }; test_init(argc, argv); sock = socket(PF_INET, SOCK_STREAM, 0); if (sock < 0) { pr_perror("can't create socket"); return 1; } test_daemon(); test_waitsig(); if (bind(sock, (struct sockaddr *) &name, sizeof(name)) < 0) fail("can't bind to a socket: %m"); else pass(); close(sock); return 0; } criu-3.6/test/zdtm/static/unhashed_proc.c000066400000000000000000000026771317335042600205560ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Chdir into unhashed proc entry"; const char *test_author = "Konstantin Khlebnikov "; int main(int argc, char ** argv) { int pid, len; char cwd1[PATH_MAX], cwd2[PATH_MAX]; test_init(argc, argv); pid = fork(); if (pid < 0) { pr_perror("fork failed"); exit(1); } else if (!pid) { pause(); return 0; } sprintf(cwd1, "/proc/%d", pid); if (chdir(cwd1) < 0) { kill(pid, SIGKILL); pr_perror("chdir failed"); exit(1); } kill(pid, SIGKILL); waitpid(pid, NULL, 0); if (getcwd(cwd1, sizeof(cwd1))) { pr_perror("successful getcwd: %s", cwd1); exit(1); } else if (errno != ENOENT) { pr_perror("wrong errno"); exit(1); } len = readlink("/proc/self/cwd", cwd1, sizeof(cwd1)); if (len < 0) { pr_perror("can't read cwd symlink"); exit(1); } cwd1[len] = 0; test_daemon(); test_waitsig(); if (getcwd(cwd2, sizeof(cwd2))) { fail("successful getcwd: %s\n", cwd2); exit(1); } else if (errno != ENOENT) { fail("wrong errno: %m\n"); exit(1); } len = 
readlink("/proc/self/cwd", cwd2, sizeof(cwd2)-1); if (len < 0) { fail("can't read cwd symlink %m\n"); exit(1); } cwd2[len] = 0; if (strcmp(cwd1, cwd2)) test_msg("cwd differs: %s != %s\n", cwd1, cwd2); pass(); return 0; } criu-3.6/test/zdtm/static/unhashed_proc.desc000066400000000000000000000000551317335042600212360ustar00rootroot00000000000000{'flags': 'crfail', 'opts' : '--link-remap'} criu-3.6/test/zdtm/static/unlink_fifo.c000066400000000000000000000015751317335042600202330ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a named pipe " "open and then unlinked"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { int fd; mode_t mode = S_IFIFO | 0700; test_init(argc, argv); if (mknod(filename, mode, 0)) { pr_perror("can't make fifo \"%s\"", filename); exit(1); } fd = open(filename, O_RDWR); if (fd < 0) { pr_perror("can't open %s", filename); return 1; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); return 1; } test_daemon(); test_waitsig(); if (close(fd) < 0) { fail("can't close %s: %m", filename); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/unlink_fifo_wronly.c000066400000000000000000000021451317335042600216370ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that we can migrate with a named pipe, " "opened in WRONLY mode and then unlinked"; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { int fd, fd1; mode_t mode = S_IFIFO | 0600; test_init(argc, argv); if (mknod(filename, mode, 0)) { pr_perror("can't make fifo \"%s\"", filename); exit(1); } fd = open(filename, O_RDONLY | O_NONBLOCK); if (fd < 0) { pr_perror("open(%s, O_RDONLY | O_NONBLOCK) Failed", filename); return 1; } fd1 = open(filename, O_WRONLY); 
if (fd1 < 0) { pr_perror("open(%s, O_WRONLY) Failed", filename); return 1; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); return 1; } test_daemon(); test_waitsig(); if (close(fd) < 0) { fail("can't close (O_RDONLY | O_NONBLOCK) %s: %m", filename); return 1; } if (close(fd1) < 0) { fail("can't close (O_WRONLY) %s: %m", filename); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/unlink_fstat00.c000066400000000000000000000061411317335042600205630ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Open, unlink, change size, seek, migrate, check size"; #ifdef UNLINK_FSTAT04 char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #else char *filename; TEST_OPTION(filename, string, "file name", 1); #endif int main(int argc, char ** argv) { int fd; size_t fsize=1000; mode_t mode; uid_t uid; gid_t gid; uint8_t buf[fsize]; struct stat fst; uint32_t crc; #ifdef UNLINK_FSTAT04 char filename[PATH_MAX]; #endif test_init(argc, argv); #ifdef UNLINK_FSTAT04 snprintf(filename, sizeof(filename), "%s/test\\file'\n\"un%%linkfstat00", dirname); mkdir(dirname, 0700); #endif fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } #ifdef UNLINK_FSTAT04 if (chmod(dirname, 0500)) { pr_perror("chmod"); exit(1); } #endif if (fstat(fd, &fst) < 0) { pr_perror("can't get file info %s before", filename); goto failed; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); goto failed; } /* Change file size */ if (fst.st_size != 0) { pr_perror("%s file size eq %ld", filename, (long)fst.st_size); goto failed; } crc = ~0; datagen(buf, sizeof(buf), &crc); if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write %s", filename); goto failed; } /* Change file mode */ if ((fst.st_mode & S_IXOTH) == 0) mode = (fst.st_mode | S_IXOTH); else mode = (fst.st_mode ^ S_IXOTH); if 
(fchmod(fd, mode) < 0) { pr_perror("can't chmod %s", filename); goto failed; } if (getuid()) { uid = getuid(); gid = getgid(); } else { /* Change uid, gid */ if (fchown(fd, (uid = fst.st_uid + 1), (gid = fst.st_gid + 1)) < 0) { pr_perror("can't chown %s", filename); goto failed; } } if (lseek(fd, 0, SEEK_SET) != 0) { pr_perror("can't reposition to 0"); goto failed; } test_daemon(); test_waitsig(); if (fstat(fd, &fst) < 0) { pr_perror("can't get %s file info after", filename); goto failed; } /* Check file size */ if (fst.st_size != fsize) { fail("(via fstat): file size changed to %ld", (long)fst.st_size); goto failed; } fst.st_size = lseek(fd, 0, SEEK_END); if (fst.st_size != fsize) { fail("(via lseek): file size changed to %ld", (long)fst.st_size); goto failed; } /* Check mode */ if (fst.st_mode != mode) { fail("mode is changed to %o(%o)", fst.st_mode, mode); goto failed; } /* Check uid, gid */ if (fst.st_uid != uid || fst.st_gid != gid) { fail("u(g)id changed: uid=%d(%d), gid=%d(%d)", fst.st_uid, uid, fst.st_gid, gid); goto failed; } if (lseek(fd, 0, SEEK_SET) != 0) { pr_perror("can't reposition to 0"); goto failed; } if (read(fd, buf, sizeof(buf)) != sizeof(buf)) { fail("can't read %s: %m\n", filename); goto failed; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("CRC mismatch\n"); goto failed; } close(fd); pass(); return 0; failed: unlink(filename); close(fd); return 1; } criu-3.6/test/zdtm/static/unlink_fstat00.hook000077500000000000000000000003201317335042600212750ustar00rootroot00000000000000#!/bin/bash [ "$1" == "--fault" -a "$2" == "restore" ] || exit 0 if [ $(find -name 'unlink_fstat00*ghost' | wc -l ) -ne 0 ]; then echo "Dangling ghost file" exit 1 fi echo "Restore fault handled" exit 0 
criu-3.6/test/zdtm/static/unlink_fstat01+.c000077700000000000000000000000001317335042600236362unlink_fstat01.custar00rootroot00000000000000criu-3.6/test/zdtm/static/unlink_fstat01.c000066400000000000000000000032271317335042600205660ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Open, unlink, change size, migrate, check size"; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd; size_t fsize=1000; uint8_t buf[fsize]; struct stat fst; test_init(argc, argv); fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } if (fstat(fd, &fst) < 0) { pr_perror("can't get file info %s before", filename); goto failed; } if (fst.st_size != 0) { pr_perror("%s file size eq %ld", filename, (long)fst.st_size); goto failed; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); goto failed; } #ifdef UNLINK_OVER { int fdo; fdo = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fdo < 0) { pr_perror("can't open %s", filename); exit(1); } } #endif memset(buf, '0', sizeof(buf)); if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write %s", filename); goto failed; } test_daemon(); test_waitsig(); if (fstat(fd, &fst) < 0) { pr_perror("can't get %s file info after", filename); goto failed; } if (fst.st_size != fsize) { fail("(via fstat): file size changed to %lld", (long long)fst.st_size); goto failed; } fst.st_size = lseek(fd, 0, SEEK_END); if (fst.st_size != fsize) { fail("(via lseek): file size changed to %lld", (long long)fst.st_size); goto failed; } close(fd); pass(); return 0; failed: unlink(filename); close(fd); return 1; } criu-3.6/test/zdtm/static/unlink_fstat02.c000066400000000000000000000042651317335042600205720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc 
= "Open, link, unlink x2, change size, migrate, check size"; char *filename; TEST_OPTION(filename, string, "file name", 1); static char link_name[1024]; int main(int argc, char ** argv) { int fd[2]; size_t fsize=1000; uint8_t buf[fsize]; struct stat fst, fst2; test_init(argc, argv); fd[0] = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd[0] < 0) { pr_perror("can't open %s", filename); exit(1); } sprintf(link_name, "%s.link", filename); if (link(filename, link_name)) { pr_perror("can't link files"); goto failed0; } fd[1] = open(link_name, O_RDONLY); if (fd[1] < 0) { pr_perror("can't open %s", link_name); goto failed0; } if (fstat(fd[0], &fst) < 0) { pr_perror("can't get file info %s before", filename); goto failed; } if (fst.st_size != 0) { pr_perror("%s file size eq %lld", filename, (long long)fst.st_size); goto failed; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); goto failed; } if (unlink(link_name) < 0) { pr_perror("can't unlink %s", link_name); goto failed; } memset(buf, '0', sizeof(buf)); if (write(fd[0], buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write %s", filename); goto failed; } test_daemon(); test_waitsig(); if (fstat(fd[0], &fst) < 0) { pr_perror("can't get %s file info after", filename); goto failed; } if (fstat(fd[1], &fst2) < 0) { pr_perror("can't get %s file2 info after", link_name); goto failed; } if ((fst.st_dev != fst2.st_dev) || (fst.st_ino != fst2.st_ino)) { fail("files differ after restore\n"); goto failed; } if (fst.st_size != fsize) { fail("(via fstat): file size changed to %lld", (long long)fst.st_size); goto failed; } fst.st_size = lseek(fd[0], 0, SEEK_END); if (fst.st_size != fsize) { fail("(via lseek): file size changed to %lld", (long long)fst.st_size); goto failed; } close(fd[0]); close(fd[1]); pass(); return 0; failed: unlink(link_name); close(fd[1]); failed0: unlink(filename); close(fd[0]); return 1; } 
criu-3.6/test/zdtm/static/unlink_fstat03.c000066400000000000000000000041741317335042600205720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Open, link, unlink former, change size, migrate, check size"; char *filename; TEST_OPTION(filename, string, "file name", 1); static char link_name[1024]; int main(int argc, char ** argv) { int fd; size_t fsize=1000; uint8_t buf[fsize]; struct stat fst, fst2; struct statfs fsst; test_init(argc, argv); fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } sprintf(link_name, "%s.link", filename); if (link(filename, link_name)) { pr_perror("can't link files"); goto failed0; } if (fstat(fd, &fst) < 0) { pr_perror("can't get file info %s before", filename); goto failed; } if (fst.st_size != 0) { pr_perror("%s file size eq %lld", filename, (long long)fst.st_size); goto failed; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); goto failed; } memset(buf, '0', sizeof(buf)); if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write %s", filename); goto failed; } test_daemon(); test_waitsig(); if (statfs(link_name, &fsst) < 0) { pr_perror("statfs(%s)", link_name); goto failed; } if (fstat(fd, &fst2) < 0) { pr_perror("can't get %s file info after", filename); goto failed; } /* An NFS mount is restored with another st_dev */ if (fsst.f_type != NFS_SUPER_MAGIC && fst.st_dev != fst2.st_dev) { fail("files differ after restore\n"); goto failed; } if (fst.st_ino != fst2.st_ino) { fail("files differ after restore\n"); goto failed; } if (fst2.st_size != fsize) { fail("(via fstat): file size changed to %lld", (long long)fst.st_size); goto failed; } fst2.st_size = lseek(fd, 0, SEEK_END); if (fst2.st_size != fsize) { fail("(via lseek): file size changed to %lld", (long long)fst.st_size); goto failed; } close(fd); pass(); return 0; failed: 
unlink(link_name); failed0: unlink(filename); close(fd); return 1; } criu-3.6/test/zdtm/static/unlink_fstat03.desc000066400000000000000000000000541317335042600212570ustar00rootroot00000000000000{'opts': '--link-remap', 'flags': 'nouser'} criu-3.6/test/zdtm/static/unlink_fstat04.c000077700000000000000000000000001317335042600235652unlink_fstat00.custar00rootroot00000000000000criu-3.6/test/zdtm/static/unlink_fstat04.desc000066400000000000000000000000251317335042600212560ustar00rootroot00000000000000{ "flags" : "suid" } criu-3.6/test/zdtm/static/unlink_largefile.c000066400000000000000000000020521317335042600212310ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Checkpointing/restore of big (2Gb) unlinked files"; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd; char buf[1000000]; off64_t offset= 0x80002000ULL; size_t count; test_init(argc, argv); fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } if (lseek64(fd, offset, SEEK_SET) < 0) { pr_perror("can't lseek %s, offset= %llx", filename, (long long unsigned)offset); goto failed; } count = sizeof(buf); memset(buf, 0, count); if (write(fd, buf, count) != count) { pr_perror("can't write %s", filename); goto failed; } if (unlink(filename) < 0) { pr_perror("can't unlink %s", filename); goto failed; } test_daemon(); test_waitsig(); close(fd); pass(); return 0; failed: unlink(filename); close(fd); return 1; } criu-3.6/test/zdtm/static/unlink_largefile.desc000066400000000000000000000000241317335042600217220ustar00rootroot00000000000000{'flags': 'crfail'} criu-3.6/test/zdtm/static/unlink_mmap00.c000066400000000000000000000027771317335042600204070ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test mmaped and unlinked 
files"; char *filename; TEST_OPTION(filename, string, "file name", 1); #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif static void touch_file_page(int fd, unsigned long off, char c) { if (lseek(fd, off, SEEK_SET) != off) { pr_perror("Lseek fail"); exit(1); } if (write(fd, &c, 1) != 1) { pr_perror("Write fail"); exit(1); } } int main(int argc, char ** argv) { int fd; char *mem_a, *mem_b; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open file"); exit(1); } touch_file_page(fd, 0, 'a'); touch_file_page(fd, PAGE_SIZE, 'b'); touch_file_page(fd, 2 * PAGE_SIZE - 1, 'c'); /* for aligned file */ /* map with different prots to create 2 regions */ mem_a = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); mem_b = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, PAGE_SIZE); if (mem_a == MAP_FAILED || mem_b == MAP_FAILED) { pr_perror("can't map file"); exit(1); } if (unlink(filename) < 0) { pr_perror("can't unlink file"); exit(1); } close(fd); test_daemon(); test_waitsig(); if (mem_a[0] != 'a') fail("1st region fail"); else if (mem_b[0] != 'b' || mem_b[PAGE_SIZE - 1] != 'c') fail("2nd regin fail"); else pass(); return 0; } criu-3.6/test/zdtm/static/unlink_mmap00.desc000066400000000000000000000000241317335042600210620ustar00rootroot00000000000000{'flags': 'nouser'} criu-3.6/test/zdtm/static/unlink_mmap01.c000066400000000000000000000035431317335042600204000ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test mmaped and unlinked files (2, with hard links)"; char *filename; TEST_OPTION(filename, string, "file name", 1); static char linkname[4096]; #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif static void touch_file_page(int fd, unsigned long off, char c) { if (lseek(fd, off, SEEK_SET) != off) { pr_perror("Lseek fail"); exit(1); } if (write(fd, &c, 1) != 1) { 
pr_perror("Write fail"); exit(1); } } int main(int argc, char ** argv) { int fd; char *mem_a, *mem_b; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open file"); exit(1); } touch_file_page(fd, 0, 'a'); touch_file_page(fd, PAGE_SIZE - 1, 'b');/* for aligned file */ mem_a = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); if (mem_a == MAP_FAILED) { pr_perror("can't map file"); exit(1); } sprintf(linkname, "%s.lnk", filename); if (link(filename, linkname)) { pr_perror("can't link file"); exit(1); } if (unlink(filename) < 0) { pr_perror("can't unlink file"); exit(1); } close(fd); fd = open(linkname, O_RDWR); if (fd < 0) { pr_perror("can't open link"); exit(1); } mem_b = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); if (mem_b == MAP_FAILED) { pr_perror("can't map link"); exit(1); } if (unlink(linkname) < 0) { pr_perror("can't unlink link"); exit(1); } close(fd); test_daemon(); test_waitsig(); if (mem_a[0] != 'a' || mem_a[PAGE_SIZE - 1] != 'b') fail("1st region fail"); else if (mem_b[0] != 'a' || mem_b[PAGE_SIZE - 1] != 'b') fail("2nd regin fail"); else pass(); return 0; } criu-3.6/test/zdtm/static/unlink_mmap01.desc000066400000000000000000000000241317335042600210630ustar00rootroot00000000000000{'flags': 'nouser'} criu-3.6/test/zdtm/static/unlink_mmap02.c000066400000000000000000000027731317335042600204050ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Test mmaped, opened and unlinked files"; char *filename; TEST_OPTION(filename, string, "file name", 1); #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif static void touch_file_page(int fd, unsigned long off, char c) { if (lseek(fd, off, SEEK_SET) != off) { pr_perror("Lseek fail"); exit(1); } if (write(fd, &c, 1) != 1) { pr_perror("Write fail"); exit(1); } } int main(int argc, char ** argv) { int fd; char *mem_a, *mem_b; 
test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open file"); exit(1); } touch_file_page(fd, 2 * PAGE_SIZE - 1, 'c'); /* for aligned file */ /* map with different prots to create 2 regions */ mem_a = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); mem_b = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, PAGE_SIZE); if (mem_a == MAP_FAILED || mem_b == MAP_FAILED) { pr_perror("can't map file"); exit(1); } if (unlink(filename) < 0) { pr_perror("can't unlink file"); exit(1); } test_daemon(); test_waitsig(); touch_file_page(fd, 0, 'a'); touch_file_page(fd, PAGE_SIZE, 'b'); if (mem_a[0] != 'a') fail("1st region fail"); else if (mem_b[0] != 'b' || mem_b[PAGE_SIZE - 1] != 'c') fail("2nd regin fail"); else pass(); return 0; } criu-3.6/test/zdtm/static/unlink_mmap02.desc000066400000000000000000000000241317335042600210640ustar00rootroot00000000000000{'flags': 'nouser'} criu-3.6/test/zdtm/static/unlink_regular00.c000066400000000000000000000041411317335042600211010ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Checkpointing/restore of unlinked file inside unlinked directory"; const char *test_author = "Kirill Tkhai "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); #define SUBDIR "subdir" #define FNAME "testfile" #define MSG "Hello!!!111" int main(int argc, char ** argv) { char subdir[PATH_MAX], fname[PATH_MAX], lname[PATH_MAX]; char buf[sizeof(MSG) + 1]; int fd, ret = -1; test_init(argc, argv); memset(buf, 0, sizeof(buf)); if (mkdir(dirname, 0777)) { fail("can't create %s", dirname); exit(1); } if (mount("none", dirname, "tmpfs", 0, "") < 0) { fail("can't mount tmpfs to %s", dirname); goto rm_topdir; } sprintf(subdir, "%s/" SUBDIR, dirname); if (mkdir(subdir, 0777)) { fail("can't create %s", subdir); goto umount; } sprintf(fname, "%s/" SUBDIR "/" FNAME, 
dirname); sprintf(lname, "%s/" FNAME, dirname); fd = open(fname, O_RDWR | O_CREAT, 0644); if (fd < 0) { fail("can't open %s", fname); rmdir(subdir); goto umount; } if (link(fname, lname) < 0) { fail("can't link %s to %s", fname, lname); unlink(fname); rmdir(subdir); goto umount; } if (unlink(fname) || rmdir(subdir)) { fail("can't unlink %s or %s", fname, subdir); goto close_file; } if (write(fd, MSG, sizeof(MSG)) != sizeof(MSG)) { fail("can't write %s", fname); goto close_file; } test_daemon(); test_waitsig(); if (lseek(fd, 0, SEEK_SET) != 0) { fail("can't lseek %s", fname); goto close_file; } if (read(fd, buf, sizeof(MSG)) != sizeof(MSG)) { fail("can't read %s", fname); goto close_file; } if (strcmp(buf, MSG)) { fail("content differs: %s, %s, sizeof=%zu", buf, MSG, sizeof(MSG)); goto close_file; } ret = 0; pass(); close_file: close(fd); unlink(lname); umount: if (umount(dirname) < 0) pr_err("Can't umount\n"); rm_topdir: if (rmdir(dirname) < 0) pr_err("Can't rmdir()\n"); return ret; } criu-3.6/test/zdtm/static/unlink_regular00.desc000066400000000000000000000000761317335042600216000ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid', 'opts': '--link-remap'} criu-3.6/test/zdtm/static/uptime_grow.c000066400000000000000000000021461317335042600202640ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "test to ensure that monotonic clock doesn't decrease"; const char *test_author = "Evgeny Antysev "; #include #include # define tv_ge(a, b) \ (((a)->tv_sec == (b)->tv_sec) ? 
\ ((a)->tv_nsec >= (b)->tv_nsec) : \ ((a)->tv_sec > (b)->tv_sec)) int main(int argc, char **argv) { struct timespec tm_old, tm, ts; double diff_nsec; ts.tv_sec = 0; ts.tv_nsec = 1000000; test_init(argc, argv); if (clock_gettime(CLOCK_MONOTONIC, &tm_old)) { pr_perror("clock_gettime failed"); exit(1); } test_daemon(); while (test_go()) { if (clock_gettime(CLOCK_MONOTONIC, &tm)) { pr_perror("clock_gettime failed"); exit(1); } if (!tv_ge(&tm, &tm_old)) { diff_nsec = (tm_old.tv_sec - tm.tv_sec) * 1.0E9 +\ (tm_old.tv_nsec - tm.tv_nsec); fail("clock step backward for %e nsec\n", diff_nsec); exit(1); } tm_old = tm; /* Kernel can't suspend container by design if calls clock_gettime() in a loop, so we need to sleep between clock_gettime(). */ nanosleep(&ts, NULL); } pass(); return 0; } criu-3.6/test/zdtm/static/uptime_grow.desc000066400000000000000000000000241317335042600207510ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/static/utsname.c000066400000000000000000000015611317335042600173770ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that utsname hasn't changed"; const char *test_author = "Pavel Emelianov "; static struct utsname after; #define ZDTM_NODE "zdtm.nodename.ru" #define ZDTM_DOMAIN "zdtm.nodename.ru" int main(int argc, char **argv) { test_init(argc, argv); if (sethostname(ZDTM_NODE, sizeof(ZDTM_NODE))) { pr_perror("Unable to set hostname"); return 1; } if (setdomainname(ZDTM_DOMAIN, sizeof(ZDTM_DOMAIN))) { pr_perror("Unable to set domainname"); return 1; } test_daemon(); test_waitsig(); uname(&after); if (strcmp(ZDTM_NODE, after.nodename)) { fail("Nodename doesn't match"); return 1; } if (strcmp(ZDTM_DOMAIN, after.domainname)) { fail("Domainname doesn't match"); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/utsname.desc000066400000000000000000000000461317335042600200700ustar00rootroot00000000000000{'flavor': 'ns uns', 'flags': 'suid'} 
criu-3.6/test/zdtm/static/vdso-proxy.c000066400000000000000000000061431317335042600200560ustar00rootroot00000000000000#include #include #include "zdtmtst.h" const char *test_doc = "Compare mappings before/after C/R for vdso/vvar presence. Should run iterative under vdso proxy fault-injection.\n"; const char *test_author = "Dmitry Safonov "; #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) #define VDSO_BAD_ADDR (-1ul) #define MAX_VMAS 80 #define BUF_SIZE 1024 /* * After C/R with vdso trampolines insertion, there should * be added one or two vmas: vdso and possibly vvar. * We need to check that nr. vmas after C/R <= +2 new vmas. * Also previous vdso/vvar vma should still be present after C/R. */ struct vm_area { unsigned long start; unsigned long end; bool is_vvar_or_vdso; }; static char buf[BUF_SIZE]; static int parse_maps(struct vm_area *vmas) { FILE *maps; int i; maps = fopen("/proc/self/maps", "r"); if (maps == NULL) { pr_err("Failed to open maps file: %m\n"); return -1; } for (i = 0; i < MAX_VMAS; i++) { struct vm_area *v = &vmas[i]; char *end; if (fgets(buf, BUF_SIZE, maps) == NULL) break; v->start = strtoul(buf, &end, 16); v->end = strtoul(end + 1, NULL, 16); v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL; v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL; test_msg("[NOTE]\tVMA: [%#lx, %#lx]\n", v->start, v->end); } if (i == MAX_VMAS) { pr_err("Number of VMAs is bigger than reserved array's size\n"); return -1; } if (fclose(maps)) { pr_err("Failed to close maps file: %m\n"); return -1; } return i; } int compare_vmas(struct vm_area *vmax, struct vm_area *vmay) { if (vmax->start > vmay->start) return 1; if (vmax->start < vmay->start) return -1; if (vmax->end > vmay->end) return 1; if (vmax->end < vmay->end) return -1; return 0; } static int check_vvar_vdso(struct vm_area *before, struct vm_area *after) { int i, j = 0; for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) { int cmp = compare_vmas(&before[i], &after[j]); if (cmp == 0) 
continue; if (cmp < 0) {/* Lost mapping */ test_msg("[NOTE]\tLost mapping: %#lx-%#lx\n", before[i].start, before[i].end); j--; if (before[i].is_vvar_or_vdso) { fail("Lost vvar/vdso mapping"); return -1; } continue; } test_msg("[NOTE]\tNew mapping appeared: %#lx-%#lx\n", after[j].start, after[j].end); i--; } return 0; } static struct vm_area vmas_before[MAX_VMAS]; static struct vm_area vmas_after[MAX_VMAS]; int main(int argc, char *argv[]) { int nr_before, nr_after; test_init(argc, argv); test_msg("[NOTE]\tMappings before:\n"); nr_before = parse_maps(vmas_before); if (nr_before < 0) { pr_perror("Faied to parse maps"); return -1; } test_daemon(); test_waitsig(); test_msg("[NOTE]\tMappings after:\n"); nr_after = parse_maps(vmas_after); if (nr_after < 0) { pr_perror("Faied to parse maps"); return -1; } /* After restore vDSO/VVAR blobs must remain in the old place. */ if (check_vvar_vdso(vmas_before, vmas_after)) return -1; if (nr_before + 2 < nr_after) { fail("There is more than two (VVAR/vDSO) vmas added after C/R"); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/vdso00.c000066400000000000000000000012361317335042600170350ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check if we can use vDSO after restore\n"; const char *test_author = "Cyrill Gorcunov #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check if we can use vDSO using direct vDSO calls\n"; const char *test_author = "Cyrill Gorcunov offset == VDSO_BAD_ADDR && s->name[0] == '\0'; } enum { VDSO_SYMBOL_CLOCK_GETTIME, VDSO_SYMBOL_GETCPU, VDSO_SYMBOL_GETTIMEOFDAY, VDSO_SYMBOL_TIME, VDSO_SYMBOL_MAX }; const char *vdso_symbols[VDSO_SYMBOL_MAX] = { [VDSO_SYMBOL_CLOCK_GETTIME] = "__vdso_clock_gettime", [VDSO_SYMBOL_GETCPU] = "__vdso_getcpu", [VDSO_SYMBOL_GETTIMEOFDAY] = "__vdso_gettimeofday", [VDSO_SYMBOL_TIME] = "__vdso_time", }; 
struct vdso_symtable { unsigned long vma_start; unsigned long vma_end; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; }; #define VDSO_SYMTABLE_INIT \ { \ .vma_start = VDSO_BAD_ADDR, \ .vma_end = VDSO_BAD_ADDR, \ .symbols = { \ [0 ... VDSO_SYMBOL_MAX - 1] = \ (struct vdso_symbol)VDSO_SYMBOL_INIT, \ }, \ } static bool __ptr_oob(void *ptr, void *start, size_t size) { void *end = (void *)((unsigned long)start + size); return ptr > end || ptr < start; } static unsigned long elf_hash(const unsigned char *name) { unsigned long h = 0, g; while (*name) { h = (h << 4) + *name++; g = h & 0xf0000000ul; if (g) h ^= g >> 24; h &= ~g; } return h; } static int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t) { Phdr_t *dynamic = NULL, *load = NULL; Ehdr_t *ehdr = (void *)mem; Dyn_t *dyn_strtab = NULL; Dyn_t *dyn_symtab = NULL; Dyn_t *dyn_strsz = NULL; Dyn_t *dyn_syment = NULL; Dyn_t *dyn_hash = NULL; Word_t *hash = NULL; Phdr_t *phdr; Dyn_t *d; Word_t *bucket, *chain; Word_t nbucket, nchain; char *dynsymbol_names; unsigned int i, j, k; BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); test_msg("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size); /* * Make sure it's a file we support. */ if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { pr_perror("Elf header magic mismatch"); return -EINVAL; } /* * We need PT_LOAD and PT_DYNAMIC here. Each once. 
*/ phdr = (void *)&mem[ehdr->e_phoff]; for (i = 0; i < ehdr->e_phnum; i++, phdr++) { if (__ptr_oob(phdr, mem, size)) goto err_oob; switch (phdr->p_type) { case PT_DYNAMIC: if (dynamic) { pr_perror("Second PT_DYNAMIC header"); return -EINVAL; } dynamic = phdr; break; case PT_LOAD: if (load) { pr_perror("Second PT_LOAD header"); return -EINVAL; } load = phdr; break; } } if (!load || !dynamic) { pr_perror("One of obligated program headers is missed"); return -EINVAL; } test_msg("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr); /* * Dynamic section tags should provide us the rest of information * needed. Note that we're interested in a small set of tags. */ d = (void *)&mem[dynamic->p_offset]; for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) { if (__ptr_oob(d, mem, size)) goto err_oob; if (d->d_tag == DT_NULL) { break; } else if (d->d_tag == DT_STRTAB) { dyn_strtab = d; } else if (d->d_tag == DT_SYMTAB) { dyn_symtab = d; } else if (d->d_tag == DT_STRSZ) { dyn_strsz = d; } else if (d->d_tag == DT_SYMENT) { dyn_syment = d; } else if (d->d_tag == DT_HASH) { dyn_hash = d; } } if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) { pr_perror("Not all dynamic entries are present"); return -EINVAL; } dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr]; if (__ptr_oob(dynsymbol_names, mem, size)) goto err_oob; hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr]; if (__ptr_oob(hash, mem, size)) goto err_oob; nbucket = hash[0]; nchain = hash[1]; bucket = &hash[2]; chain = &hash[nbucket + 2]; test_msg("nbucket %lu nchain %lu bucket %p chain %p\n", (long)nbucket, (long)nchain, bucket, chain); for (i = 0; i < ARRAY_SIZE(vdso_symbols); i++) { k = elf_hash((const unsigned char *)vdso_symbols[i]); for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) { Sym_t *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr]; char *name; sym = &sym[j]; if (__ptr_oob(sym, mem, size)) 
continue; if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) continue; name = &dynsymbol_names[sym->st_name]; if (__ptr_oob(name, mem, size)) continue; if (strcmp(name, vdso_symbols[i])) continue; memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name)); t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; test_msg("symbol %s offset %lx\n", t->symbols[i].name, t->symbols[i].offset); break; } } return 0; err_oob: pr_perror("Corrupted Elf data"); return -EFAULT; } static int vdso_fill_self_symtable(struct vdso_symtable *s) { char buf[512]; int ret = -1; FILE *maps; *s = (struct vdso_symtable)VDSO_SYMTABLE_INIT; maps = fopen("/proc/self/maps", "r"); if (!maps) { pr_perror("Can't open self-vma"); return -1; } while (fgets(buf, sizeof(buf), maps)) { unsigned long start, end; if (!strstr(buf, "[vdso]")) continue; ret = sscanf(buf, "%lx-%lx", &start, &end); if (ret != 2) { ret = -1; pr_perror("Can't find vDSO bounds"); goto err; } s->vma_start = start; s->vma_end = end; ret = vdso_fill_symtable((void *)start, end - start, s); break; } test_msg("[vdso] %lx-%lx\n", s->vma_start, s->vma_end); err: fclose(maps); return ret; } static int vdso_clock_gettime_handler(void *func) { __vdso_clock_gettime_t *vdso_clock_gettime = func; struct timespec ts1, ts2; clock_gettime(CLOCK_REALTIME, &ts1); vdso_clock_gettime(CLOCK_REALTIME, &ts2); test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); if (labs(ts1.tv_sec - ts2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); return -1; } return 0; } static int vdso_getcpu_handler(void *func) { __vdso_getcpu_t *vdso_getcpu = func; unsigned cpu, node; vdso_getcpu(&cpu, &node, NULL); test_msg("vdso_getcpu: cpu %d node %d\n", cpu, node); return 0; } static int vdso_gettimeofday_handler(void *func) { __vdso_gettimeofday_t *vdso_gettimeofday = func; struct timeval tv1, tv2; struct timezone tz; gettimeofday(&tv1, &tz); vdso_gettimeofday(&tv2, 
&tz); test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); if (labs(tv1.tv_sec - tv2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); return -1; } return 0; } static int vdso_time_handler(void *func) { __vdso_time_t *vdso_time = func; time_t t1, t2; t1 = time(NULL); t2 = vdso_time(NULL); test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t1); if (labs(t1 - t2) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); return -1; } return 0; } static int call_handlers(struct vdso_symtable *symtable) { typedef int (handler_t)(void *func); handler_t *handlers[VDSO_SYMBOL_MAX] = { [VDSO_SYMBOL_CLOCK_GETTIME] = vdso_clock_gettime_handler, [VDSO_SYMBOL_GETCPU] = vdso_getcpu_handler, [VDSO_SYMBOL_GETTIMEOFDAY] = vdso_gettimeofday_handler, [VDSO_SYMBOL_TIME] = vdso_time_handler, }; size_t i; for (i = 0; i < ARRAY_SIZE(symtable->symbols); i++) { struct vdso_symbol *s = &symtable->symbols[i]; handler_t *func; if (vdso_symbol_empty(s) || i > ARRAY_SIZE(handlers)) continue; func = handlers[i]; if (func((void *)(s->offset + symtable->vma_start))) { pr_perror("Handler error"); return -1; } } return 0; } int main(int argc, char *argv[]) { struct vdso_symtable symtable; test_init(argc, argv); if (vdso_fill_self_symtable(&symtable)) { pr_perror("Faied to parse vdso"); return -1; } if (call_handlers(&symtable)) return -1; test_daemon(); test_waitsig(); /* * After restore the vDSO must remain in old place. */ if (call_handlers(&symtable)) { fail("Failed to call vdso handlers from symtable after C/R"); return -1; } pass(); return 0; } criu-3.6/test/zdtm/static/vdso01.desc000066400000000000000000000000231317335042600175230ustar00rootroot00000000000000{'arch': 'x86_64'} criu-3.6/test/zdtm/static/vdso02.c000066400000000000000000000107121317335042600170360ustar00rootroot00000000000000#include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Restoring task with unmapped vDSO blob. 
Poor man's test for C/R on vdso64_enabled=0 booted kernel.\n"; const char *test_author = "Dmitry Safonov "; #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) #define VDSO_BAD_ADDR (-1ul) #define VVAR_BAD_ADDR (-1ul) #define BUF_SZ 1024 struct vm_area { unsigned long start; unsigned long end; }; static int parse_vm_area(char *buf, struct vm_area *vma) { if (sscanf(buf, "%lx-%lx", &vma->start, &vma->end) == 2) return 0; pr_perror("Can't find VMA bounds"); return -1; } static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) { char buf[BUF_SZ]; int ret = -1; FILE *maps; vdso->start = VDSO_BAD_ADDR; vdso->end = VDSO_BAD_ADDR; vvar->start = VVAR_BAD_ADDR; vvar->end = VVAR_BAD_ADDR; if (snprintf(buf, BUF_SZ, "/proc/%d/maps", pid) < 0) { pr_perror("snprintf() failure for path"); return -1; } maps = fopen(buf, "r"); if (!maps) { pr_perror("Can't open maps for %d", pid); return -1; } while (fgets(buf, sizeof(buf), maps)) { if (strstr(buf, "[vdso]") && parse_vm_area(buf, vdso)) goto err; if (strstr(buf, "[vvar]") && parse_vm_area(buf, vvar)) goto err; } if (vdso->start != VDSO_BAD_ADDR) test_msg("[vdso] %lx-%lx\n", vdso->start, vdso->end); if (vvar->start != VVAR_BAD_ADDR) test_msg("[vvar] %lx-%lx\n", vvar->start, vvar->end); ret = 0; err: fclose(maps); return ret; } #ifdef __i386__ /* * On i386 syscalls for speed are optimized trough vdso, * call raw int80 as vdso is unmapped. 
*/ #define __NR32_munmap 91 #define __NR32_kill 37 #define __NR32_exit 1 struct syscall_args32 { uint32_t nr, arg0, arg1; }; static inline void do_full_int80(struct syscall_args32 *args) { asm volatile ( "int $0x80\n\t" : "+a" (args->nr), "+b" (args->arg0), "+c" (args->arg1)); } int sys_munmap(void *addr, size_t len) { struct syscall_args32 s = {0}; s.nr = __NR32_munmap; s.arg0 = (uint32_t)(uintptr_t)addr; s.arg1 = (uint32_t)len; do_full_int80(&s); return (int)s.nr; } int sys_kill(pid_t pid, int sig) { struct syscall_args32 s = {0}; s.nr = __NR32_kill; s.arg0 = (uint32_t)pid; s.arg1 = (uint32_t)sig; do_full_int80(&s); return (int)s.nr; } void sys_exit(int status) { struct syscall_args32 s = {0}; s.nr = __NR32_exit; s.arg0 = (uint32_t)status; do_full_int80(&s); } #else /* !__i386__ */ int sys_munmap(void *addr, size_t len) { return syscall(SYS_munmap, addr, len); } int sys_kill(pid_t pid, int sig) { return syscall(SYS_kill, pid, sig); } void sys_exit(int status) { syscall(SYS_exit, status); } #endif static int unmap_blobs(void) { struct vm_area vdso, vvar; int ret; if (find_blobs(getpid(), &vdso, &vvar)) return -1; if (vdso.start != VDSO_BAD_ADDR) { ret = sys_munmap((void*)vdso.start, vdso.end - vdso.start); if (ret) return ret; } if (vvar.start != VVAR_BAD_ADDR) { ret = sys_munmap((void*)vvar.start, vvar.end - vvar.start); if (ret) return ret; } return 0; } int main(int argc, char *argv[]) { struct vm_area vdso, vvar; pid_t child; int status, ret = -1; test_init(argc, argv); child = fork(); if (child < 0) { pr_perror("fork() failed"); exit(1); } if (child == 0) { child = getpid(); if (unmap_blobs() < 0) syscall(SYS_exit, 1); sys_kill(child, SIGSTOP); sys_exit(2); } waitpid(child, &status, WUNTRACED); if (WIFEXITED(status)) { int ret = WEXITSTATUS(status); pr_err("Child unexpectedly exited with %d\n", ret); goto out_kill; } else if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); pr_err("Child unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); goto 
out_kill; } else if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { pr_err("Child is unstoppable or was stopped by other means\n"); goto out_kill; } if (find_blobs(child, &vdso, &vvar)) goto out_kill; if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Found vvar or vdso blob(s) in child, which should have unmapped them\n"); goto out_kill; } test_daemon(); test_waitsig(); if (find_blobs(child, &vdso, &vvar)) goto out_kill; if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Child without vdso got it after C/R\n"); fail(); goto out_kill; } pass(); ret = 0; out_kill: kill(child, SIGKILL); return ret; } criu-3.6/test/zdtm/static/vfork00.c000066400000000000000000000027351317335042600172160ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Block migration by a pending (non-exec()-ed) vfork()"; const char *test_author = "Pavel Emelianov "; int main(int argc, char ** argv) { int ret = 0; pid_t pid; test_init(argc, argv); /* vfork() won't let us control the test, so fork() first, and vfork() * in the child */ pid = fork(); if (pid < 0) { pr_err("fork failed: %m"); exit(1); } if (pid == 0) { int ret2; pid = vfork(); if (pid < 0) ret = errno; /* wait for signal in _both_ branches */ test_waitsig(); /* vforked guy shouldn't return, hence we _exit() */ if (pid == 0) _exit(0); if (wait(&ret2) != pid) ret = errno; _exit(ret); } test_daemon(); test_waitsig(); /* signal the whole process group, because our child is suspended until * the grand-child has exec()-ed, but we don't know the pid of the * latter */ if (kill(0, SIGTERM)) { fail("terminating the children failed: %m"); exit(1); } if (wait(&ret) != pid) { fail("wait() returned wrong pid: %m"); exit(1); } if (WIFEXITED(ret)) { ret = WEXITSTATUS(ret); if (ret) { fail("child exited with nonzero code %d (%s)", ret, strerror(ret)); exit(1); } } if (WIFSIGNALED(ret)) { fail("child exited 
on unexpected signal %d", WTERMSIG(ret)); exit(1); } pass(); return 0; } criu-3.6/test/zdtm/static/vfork00.desc000066400000000000000000000000331317335042600176770ustar00rootroot00000000000000{'flags': 'noauto crfail'} criu-3.6/test/zdtm/static/vsx.c000066400000000000000000000317121317335042600165440ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" /* * This test is specific to PowerPC */ #ifndef _ARCH_PPC64 int main(int argc, char *argv[]) { test_init(argc, argv); skip("Unsupported arch"); return 0; } #else #include #include /* * This test verifies that data stored in the VSX regsiters are still there * once the restart is done. * * The test is filling the registers with dedicated values and then check * their content. */ const char *test_doc = "Test if data in vector registers do survive the c/r"; const char *test_author = "Laurent Dufour "; int is_test_doable(void) { unsigned long val; val = getauxval(AT_HWCAP); #define CHECK_FEATURE(f) do { \ if (!(val & f)) { \ test_msg("CPU feature " #f " is missing\n"); \ return 0; \ } \ } while(0) CHECK_FEATURE(PPC_FEATURE_64); CHECK_FEATURE(PPC_FEATURE_HAS_ALTIVEC); CHECK_FEATURE(PPC_FEATURE_HAS_VSX); return 1; } void fill_vsx(uint64_t *pt) { asm volatile( "lis 3, 0 \n" "lxvd2x 0, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 1, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 2, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 3, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 4, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 5, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 6, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 7, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 8, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 9, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 10, 3, %0 \n" "addi 3, 3, 16 \n" 
/* move to the next qword */ "lxvd2x 11, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 12, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 13, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 14, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 15, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 16, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 17, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 18, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 19, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 20, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 21, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 22, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 23, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 24, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 25, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 26, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 27, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 28, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 29, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 30, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 31, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 32, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 33, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 34, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 35, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 36, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 37, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 38, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 
39, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 40, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 41, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 42, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 43, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 44, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 45, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 46, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 47, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 48, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 49, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 50, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 51, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 52, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 53, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 54, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 55, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 56, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 57, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 58, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 59, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 60, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 61, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 62, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "lxvd2x 63, 3, %0 \n" : /* no output */ : "r" (pt) : "3"); } void read_vsx(uint64_t *pt) { asm volatile( "lis 3, 0 \n" "stxvd2x 0, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 1, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 2, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword 
*/ "stxvd2x 3, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 4, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 5, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 6, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 7, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 8, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 9, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 10, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 11, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 12, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 13, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 14, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 15, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 16, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 17, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 18, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 19, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 20, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 21, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 22, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 23, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 24, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 25, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 26, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 27, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 28, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 29, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 30, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 31, 3, 
%0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 32, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 33, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 34, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 35, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 36, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 37, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 38, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 39, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 40, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 41, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 42, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 43, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 44, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 45, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 46, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 47, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 48, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 49, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 50, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 51, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 52, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 53, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 54, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 55, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 56, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 57, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 58, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 59, 3, %0 \n" 
"addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 60, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 61, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 62, 3, %0 \n" "addi 3, 3, 16 \n" /* move to the next qword */ "stxvd2x 63, 3, %0 \n" : /* no output */ : "r" (pt) : "3"); } int main(int argc, char *argv[]) { /* A random buffer of 1024 bytes (64 * 128bit registers to fill) */ static const char ibuffer[128/8*64]= "sahwoleiGun9loosliz0Aech9aiph5eiIengei7Ogh8zu7ye" "Aeshie6vai0thaehool1ooK6ayaj3Neitahn8yeeh5ahfuiT" "uCeir1bife4ieceema8choo2Wengaec1seDaidohteipa4ai" "aequee7AiriejaeJar1giak8Gei2uathloh5uemaeG6EiSoo" "PhaenaethoPhej8nEecheegeihosho8Zohroo8ea6Juuheif" "nu2Hahvai1tuf0Zeeeveephu2EitaexiVaekieboac7Nushu" "aeTh6Quoo3iozeisaudaGheed0aPah2Schoog0eiChaeN5su" "xoo1phoic1mahXohSai1thoogo0oesooeaxai7eBahHahMue" "quiloh2ooPahpiujeithae0Dau0shuwicobinaaYooj0ajiw" "iiheeS4awoh3haevlaiGe8phaev3eiluaChaF6ieng4aith4" "aif3TooYo1aigoomZiuhai8eesoo4maiLahr3PoM8Eir5ooz" "Iequ9ahre4Op4bahaiso6ohnah8Shokimooch1Oafahf5aih" "xohphee1pi5Iecaiaigh7Eisah2uew5acie7wi6Zo0Eelah9" "woi8QueerohfeiThaBoh5jaic3peiPohAhng0bu5shoop7ca" "Qui5kodaika8quioahmohreeVe8loquaeeLi5ze3oceiHa0l" "roh8Ooxae7uish9ioog7ieS3aibeo2thOosiuvaiS5lohp4U" "emieG0eit6Bien8EzaiwiTh3geighaexshee8eHiec1TooH2" "Eeceacai0inaejieboo8NeishieweiraHooj9apeecooy0th" "daThei6aexeisahdsei3keik0diPheejchais6ezo0iep5Ae" "Wiqu6aepeing4ba8diek3aev9waYooveAebai9eef6Iex6vo" "Quee9MeitahMighoHuo3seveeMoh3ohtoxaib6ootaiF5EeT" "Ohb9eijoonoh6ich"; char obuffer[128/8*64]; int do_test; test_init(argc, argv); do_test = is_test_doable(); if (do_test) { memset(obuffer, 0xFF, sizeof(obuffer)); fill_vsx((uint64_t *)ibuffer); } test_daemon(); test_waitsig(); if (do_test) { read_vsx((uint64_t *)obuffer); if (!memcmp(ibuffer, obuffer, sizeof(ibuffer))) pass(); else { test_msg("Data mismatch\n"); fail(); } } else { test_msg("The CPU is missing some features.\n"); fail(); } return 0; } #endif /* 
_ARCH_PPC64 */ criu-3.6/test/zdtm/static/vsx.desc000066400000000000000000000000241317335042600172300ustar00rootroot00000000000000{'arch': 'ppc64le'} criu-3.6/test/zdtm/static/vt.c000066400000000000000000000022771317335042600163610ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check c/r of a virtual terminal"; const char *test_author = "Ruslan Kuprieiev "; char *filename; TEST_OPTION(filename, string, "file name", 1); #ifdef __s390x__ #define MINOR 64 /* ttyS0 */ #else #define MINOR 5 #endif int main(int argc, char **argv) { struct stat st1, st2; int fd; test_init(argc, argv); if (mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, makedev(4, MINOR))) { pr_perror("Can't create virtual terminal %s", filename); return 1; } fd = open(filename, O_RDONLY); if (fd < 0) { pr_perror("Open virtual terminal %s failed", filename); return 1; } if (fstat(fd, &st1)) { pr_perror("Can't stat %s virtual terminal", filename); return 1; } test_daemon(); test_waitsig(); if (fstat(fd, &st2)) { pr_perror("Can't stat %s virtual terminal", filename); return 1; } if (st1.st_rdev != st2.st_rdev) { fail("Virtual terminal rdev mismatch %x != %x on %s", (int)st1.st_rdev, (int)st2.st_rdev, filename); return 1; } pass(); return 0; } criu-3.6/test/zdtm/static/vt.desc000066400000000000000000000000441317335042600170430ustar00rootroot00000000000000{'flavor': 'h ns', 'flags': 'suid'} criu-3.6/test/zdtm/static/wait00.c000066400000000000000000000017641317335042600170340ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "See if we can wait() for a child after migration"; const char *test_author = "Roman Kagan "; int main(int argc, char ** argv) { int ret; pid_t pid; test_init(argc, argv); pid = fork(); if (pid < 0) { pr_perror("fork failed"); exit(1); } if (pid == 0) { test_waitsig(); _exit(0); } test_daemon(); test_waitsig(); if 
(kill(pid, SIGTERM)) { fail("terminating the child failed: %m\n"); goto out; } if (wait(&ret) != pid) { fail("wait() returned wrong pid: %m\n"); goto out; } if (WIFEXITED(ret)) { ret = WEXITSTATUS(ret); if (ret) { fail("child exited with nonzero code %d (%s)\n", ret, strerror(ret)); goto out; } } if (WIFSIGNALED(ret)) { fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); goto out; } pass(); out: return 0; } criu-3.6/test/zdtm/static/write_read00.c000066400000000000000000000020701317335042600202040ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Write file before migration, read after"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd; uint32_t crc; uint8_t buf[1000000]; test_init(argc, argv); fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } crc = ~0; datagen(buf, sizeof(buf), &crc); if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write %s", filename); exit(1); } close(fd); test_daemon(); test_waitsig(); fd = open(filename, O_RDONLY); if (fd < 0) { fail("can't open %s: %m\n", filename); exit(1); } if (read(fd, buf, sizeof(buf)) != sizeof(buf)) { fail("can't read %s: %m\n", filename); goto out; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("CRC mismatch\n"); goto out; } pass(); out: unlink(filename); return 0; } criu-3.6/test/zdtm/static/write_read01.c000066400000000000000000000023701317335042600202100ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Write and half way read file before migration, complete after"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd; int len; uint32_t crc = ~0; uint8_t buf[1000000]; test_init(argc, argv); fd = 
open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } crc = ~0; datagen(buf, sizeof(buf), &crc); if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { pr_perror("can't write %s", filename); exit(1); } close(fd); fd = open(filename, O_RDONLY); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } len = sizeof(buf) / 2; if (read(fd, buf, len) != len) { pr_perror("can't read %s", filename); exit(1); } test_daemon(); test_waitsig(); /* recover reading */ if (read(fd, buf + len, sizeof(buf) - len) != (sizeof(buf) - len)) { fail("can't read %s: %m\n", filename); goto out; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("CRC mismatch\n"); goto out; } pass(); out: unlink(filename); return 0; } criu-3.6/test/zdtm/static/write_read02.c000066400000000000000000000027661317335042600202220ustar00rootroot00000000000000#include #include #include #include #include "zdtmtst.h" const char *test_doc = "Write file half way before migration, complete and read after"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd, fd1; int len, full_len; uint32_t crc; uint8_t buf[1000000]; char str[32]; test_init(argc, argv); fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } crc = ~0; datagen(buf, sizeof(buf), &crc); full_len = sizeof(buf); // create standard file sprintf(str, "standard_%s", filename); fd1 = open(str, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (write(fd1, buf, full_len) != full_len) { pr_perror("can't write %s", str); exit(1); } close(fd1); len = sizeof(buf) / 2; if (write(fd, buf, len) != len) { pr_perror("can't write %s", filename); exit(1); } test_daemon(); test_waitsig(); if (write(fd, buf + len, sizeof(buf) - len) != (sizeof(buf) - len)) { fail("can't write %s: %m\n", filename); goto out; } close(fd); fd = open(filename, O_RDONLY); if (fd < 0) { 
fail("can't open %s: %m\n", filename); return 1; } if (read(fd, buf, full_len) != full_len) { fail("can't read %s: %m\n", filename); return 1; } crc = ~0; if (datachk(buf, full_len, &crc)) { fail("CRC mismatch\n"); return 1; } pass(); out: unlink(filename); return 0; } criu-3.6/test/zdtm/static/write_read10.c000066400000000000000000000046741317335042600202210ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Open r/w and unlink file, and fork before migration;\n" "check that the child can write to it and the parent\n" "can read from it after migration"; const char *test_author = "Roman Kagan "; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char ** argv) { int fd, child_fd, ret; pid_t pid; uint32_t crc; uint8_t buf[1000000]; task_waiter_t t; test_init(argc, argv); fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fd < 0) { pr_perror("can't open %s", filename); exit(1); } child_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); if (child_fd < 0) { pr_perror("can't open %s", filename); exit(1); } if (unlink(filename)) { pr_perror("can't unlink %s", filename); exit(1); } task_waiter_init(&t); pid = fork(); if (pid < 0) { pr_perror("can't fork"); exit(1); } if (pid == 0) { /* child writes to the unlinked file and returns */ close(fd); task_waiter_complete_current(&t); test_waitsig(); crc = ~0; datagen(buf, sizeof(buf), &crc); if (write(child_fd, buf, sizeof(buf)) != sizeof(buf)) _exit(errno); close(child_fd); _exit(0); } else task_waiter_wait4(&t, pid); close(child_fd); test_daemon(); test_waitsig(); if (kill(pid, SIGTERM)) { fail("terminating the child failed: %m\n"); goto out; } if (wait(&ret) != pid) { fail("wait() returned wrong pid %d: %m\n", pid); goto out; } if (WIFEXITED(ret)) { ret = WEXITSTATUS(ret); if (ret) { fail("child exited with nonzero code %d (%s)\n", ret, strerror(ret)); goto out; } } if (WIFSIGNALED(ret)) { 
fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); goto out; } if (lseek(fd, 0, SEEK_SET) < 0) { fail("lseeking to the beginning of file failed: %m\n"); goto out; } if (read(fd, buf, sizeof(buf)) != sizeof(buf)) { fail("can't read %s: %m\n", filename); goto out; } crc = ~0; if (datachk(buf, sizeof(buf), &crc)) { fail("CRC mismatch\n"); goto out; } if (close(fd)) { fail("close failed: %m\n"); goto out_noclose; } if (unlink(filename) != -1 || errno != ENOENT) { fail("file %s should have been deleted before migration: unlink: %m\n", filename); goto out_noclose; } pass(); out: close(fd); out_noclose: return 0; } criu-3.6/test/zdtm/static/xids00.c000066400000000000000000000044171317335042600170350ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Check that environment didn't change"; const char *test_author = "Pavel Emelianov "; int main(int argc, char **argv) { int tmp_pipe[2], i; int pids[2], syncfd[2], stat, fail = 0; test_init(argc, argv); pipe(tmp_pipe); pids[0] = test_fork(); if (pids[0] == 0) { close(tmp_pipe[0]); setsid(); close(tmp_pipe[1]); test_waitsig(); if (getpid() != getsid(0)) exit(1); if (getpid() != getpgid(0)) exit(2); test_msg("P1 OK\n"); exit(0); } close(tmp_pipe[1]); syncfd[0] = tmp_pipe[0]; pipe(tmp_pipe); pids[1] = test_fork(); if (pids[1] == 0) { int tmp_pipe_sub[2], pid; close(tmp_pipe[0]); setsid(); pipe(tmp_pipe_sub); pid = test_fork(); if (pid == 0) { close(tmp_pipe[1]); close(tmp_pipe_sub[0]); setpgid(0, 0); close(tmp_pipe_sub[1]); test_waitsig(); if (getsid(0) != getppid()) exit(1); if (getpgid(0) != getpid()) exit(1); exit(0); } close(tmp_pipe_sub[1]); read(tmp_pipe_sub[0], &stat, 1); close(tmp_pipe_sub[0]); close(tmp_pipe[1]); test_waitsig(); if (getpid() != getsid(0)) exit(1); if (getpid() != getpgid(0)) exit(2); kill(pid, SIGTERM); if (waitpid(pid, &stat, 0) < 0) { pr_perror("Unable to wait P2 %d", pid); exit(3); } else if (!WIFEXITED(stat) || 
WEXITSTATUS(stat)) { pr_perror("P2 stat %d/%d/%d/%d", WIFEXITED(stat), WEXITSTATUS(stat), WIFSIGNALED(stat), WTERMSIG(stat)); exit(3); } exit(0); } close(tmp_pipe[1]); syncfd[1] = tmp_pipe[0]; read(syncfd[0], &stat, 1); close(syncfd[0]); read(syncfd[1], &stat, 1); close(syncfd[1]); test_daemon(); test_waitsig(); for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) kill(pids[i], SIGTERM); for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) { if (waitpid(pids[i], &stat, 0) < 0) { pr_perror("Unable to wait %d", pids[i]); fail = 1; } else if (!WIFEXITED(stat) || WEXITSTATUS(stat)) { pr_perror("P%d stat %d/%d/%d/%d", i, WIFEXITED(stat), WEXITSTATUS(stat), WIFSIGNALED(stat), WTERMSIG(stat)); fail = 1; } } if (fail) fail("Something failed"); else pass(); return 0; } criu-3.6/test/zdtm/static/zombie00.c000066400000000000000000000042321317335042600173460ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "See if we can wait() for a zombified child after migration"; const char *test_author = "Roman Kagan "; struct zombie { int pid; int exited; int exitcode; }; #define NR_ZOMBIES 4 int main(int argc, char ** argv) { int i, status; struct zombie zombie[NR_ZOMBIES]; zombie[0].exited = 1; zombie[0].exitcode = 0; zombie[1].exited = 1; zombie[1].exitcode = 3; zombie[2].exited = 0; zombie[2].exitcode = SIGKILL; zombie[3].exited = 0; zombie[3].exitcode = SIGSEGV; test_init(argc, argv); for (i = 0; i < NR_ZOMBIES; i++) { zombie[i].pid = fork(); if (zombie[i].pid < 0) { pr_perror("fork failed"); exit(1); } if (zombie[i].pid == 0) { if (zombie[i].exited) _exit(zombie[i].exitcode); else if (zombie[i].exitcode == SIGSEGV) *(volatile int *)NULL = 0; else kill(getpid(), zombie[i].exitcode); _exit(13); /* just in case */ } test_msg("kid %d will %d/%d\n", zombie[i].pid, zombie[i].exited, zombie[i].exitcode); } /* * We must wait for zombies to appear, but we cannot use * wait4 here :( Use sleep. 
*/ for (i = 0; i < NR_ZOMBIES; i++) { siginfo_t siginfo; if (waitid(P_PID, zombie[i].pid, &siginfo, WNOWAIT | WEXITED)) { pr_perror("Unable to wait %d", zombie[i].pid); exit(1); } } test_daemon(); test_waitsig(); for (i = 0; i < NR_ZOMBIES; i++) { if (waitpid(zombie[i].pid, &status, 0) != zombie[i].pid) { fail("Exit with wrong pid\n"); exit(1); } if (zombie[i].exited) { if (!WIFEXITED(status)) { fail("Not exited, but should (%d)\n", zombie[i].pid); exit(1); } if (WEXITSTATUS(status) != zombie[i].exitcode) { fail("Exit with wrong status (%d)\n", zombie[i].pid); exit(1); } } else { if (!WIFSIGNALED(status)) { fail("Not killed, but should (%d)\n", zombie[i].pid); exit(1); } if (WTERMSIG(status) != zombie[i].exitcode) { fail("Killed with wrong signal (%d)\n", zombie[i].pid); exit(1); } } } pass(); return 0; } criu-3.6/test/zdtm/transition/000077500000000000000000000000001317335042600164575ustar00rootroot00000000000000criu-3.6/test/zdtm/transition/Makefile000066400000000000000000000030261317335042600201200ustar00rootroot00000000000000LIBDIR := ../lib LIB := $(LIBDIR)/libzdtmtst.a LDLIBS += $(LIB) CPPFLAGS += -I$(LIBDIR) TST_NOFILE = \ ipc \ ptrace \ epoll \ fork \ fork2 \ thread-bomb \ maps007 \ maps008 \ pipe_loop00 \ pipe_shared00 \ socket_loop00 \ netlink00 \ file_aio \ socket-tcp \ socket-tcp6 \ shmem \ lazy-thp \ TST_FILE = \ file_read \ unix_sock \ fifo_dyn \ fifo_loop \ TST = $(TST_NOFILE) $(TST_FILE) SRC = $(TST:%=%.c) OBJ = $(SRC:%.c=%.o) DEP = $(SRC:%.c=%.d) PID = $(TST:%=%.pid) OUT = $(TST:%=%.out) include ../Makefile.inc all: $(TST) install: all .PHONY: all install $(TST_NOFILE:%=%.pid): %.pid: % $(/dev/null` 2>/dev/null || break; \ sleep 1; \ done $(TST): | $(LIB) file_aio: LDLIBS += -lrt -pthread socket-tcp: CFLAGS += -D STREAM socket-tcp6: CFLAGS += -D ZDTM_IPV6 -D STREAM ptrace.o: CFLAGS += -pthread ptrace: LDFLAGS += -pthread fork2: CFLAGS += -D FORK2 thread-bomb.o: CFLAGS += -pthread thread-bomb: LDFLAGS += -pthread %: %.sh cp $< $@ chmod +x $@ $(LIB): 
force $(Q) $(MAKE) -C $(LIBDIR) .PHONY: force start check_start stop wait_stop criu-3.6/test/zdtm/transition/epoll.c000066400000000000000000000071301317335042600177370ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "migrate application using epoll"; #define MAX_SCALE 128 enum child_exit_codes { SUCCESS = 0, GETTIMEOFDAYERROR, WRITEERROR, MAX_EXIT_CODE }; static char *child_fail_reason[] = { "Success", "Can't get time", "Can't write" }; int scale = 13; TEST_OPTION(scale, int, "How many children should perform testing", 0); static int pids[MAX_SCALE]; static int fds[MAX_SCALE][2]; static volatile int stop = 0; static void killall(void) { int i; for (i = 0; i < scale; i++) { close(fds[i][0]); close(fds[i][1]); kill(pids[i], SIGUSR2); } } static void do_stop(int sig) { stop = 1; } static void run_child(int num) { int fd = fds[num][1]; uint32_t crc = ~0; size_t buf_size=512; uint8_t buf[buf_size]; struct timeval tv; struct timespec ts; int rv; close(fds[num][0]); datagen(buf, sizeof(buf), &crc); if (gettimeofday(&tv, NULL) < 0) { rv = GETTIMEOFDAYERROR; goto out; } srand(tv.tv_sec + tv.tv_usec); ts.tv_sec = 0; while (!stop) { ts.tv_nsec = rand() % 999999999; nanosleep(&ts, &ts); if (write(fd, buf, buf_size) < 0 && (!stop /* signal SIGUSR2 NOT received */ || (errno != EINTR && errno != EPIPE))) { fail("child write: %m\n"); rv = WRITEERROR; goto out; } } rv = SUCCESS; out: close(fds[num][1]); exit(rv); } int main(int argc, char **argv) { int rv, i; int counter = 0; int efd; size_t buf_size=512; char buf[buf_size]; struct epoll_event event = { .events = EPOLLIN }, *events; test_init(argc, argv); if (scale > MAX_SCALE) { pr_err("Too many children specified\n"); exit(1); } if (signal(SIGUSR2, do_stop) == SIG_ERR) { pr_perror("Can't setup signal handler"); exit(1); } if ((efd = epoll_create(scale)) < 0) { pr_perror("Can't create epoll"); 
exit(1); } for (i = 0; i < scale; i++) { if (pipe(fds[i]) < 0) { pr_perror("Can't create pipe[%d]", i); killall(); exit(1); } if (fcntl(fds[i][0], F_SETFL, O_NONBLOCK) < 0) { pr_perror("Can't set O_NONBLOCK flag on fd[%d]", i); killall(); exit(1); } event.data.fd = fds[i][0]; if (epoll_ctl(efd, EPOLL_CTL_ADD, fds[i][0], &event) < 0) { pr_perror("Can't add fd[%d]", i); killall(); exit(1); } if ((rv = test_fork()) < 0) { pr_perror("Can't fork[%d]", i); killall(); exit(1); } if (rv == 0) run_child(i); close(fds[i][1]); pids[i] = rv; } if ((events = (struct epoll_event*) malloc (sizeof(struct epoll_event)*scale)) == NULL) { pr_perror("Can't allocate memory"); killall(); exit(1); } test_daemon(); while (test_go()) { if ((rv = epoll_wait(efd, events, scale, rand() % 999)) < 0 && errno != EINTR) { pr_perror("epoll_wait error"); killall(); exit(1); } for (i = 0; i < rv; i++) { while (read(events[i].data.fd, buf, buf_size) > 0); if (errno != EAGAIN && errno != 0 && errno) { pr_perror("read error"); killall(); exit(1); } } } test_waitsig(); killall(); for (i = 0; i < scale; i++) { if (waitpid(pids[i], &rv, 0) < 0) { fail("waitpid error: %m\n"); counter++; continue; } else { rv = WEXITSTATUS(rv); if (rv < MAX_EXIT_CODE && rv > SUCCESS) { fail("Child failed: %s (%d)\n", child_fail_reason[rv], rv); counter++; } else if (rv != SUCCESS) { fail("Unknown exitcode from child: %d\n", rv); counter++; } } } if (counter == 0) pass(); return 0; } criu-3.6/test/zdtm/transition/fifo_dyn.c000066400000000000000000000056651317335042600204340ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "dynamic FIFO test"; #define PROCS_DEF 2 /* 0 - parent, 1 - child */ #define BUF_SIZE 256 unsigned int num_procs = PROCS_DEF; char *filename; TEST_OPTION(filename, string, "file name", 1); int main(int argc, char **argv) { int ret = 0; int readfd, writefd; mode_t mode = S_IFIFO | 0600; 
char path[PROCS_DEF][BUF_SIZE]; pid_t pid; int i; uint8_t buf[0x100000]; int chret; char *file_path; test_init(argc, argv); for (i = 0; i < PROCS_DEF; i++) { file_path = path[i]; if (snprintf(file_path, BUF_SIZE, "%s-%02d", filename, i) >= BUF_SIZE) { pr_perror("filename %s is too long", filename); exit(1); } if (mkfifo(file_path, mode)) { pr_perror("can't make fifo \"%s\"", file_path); exit(1); } } pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); kill(0, SIGKILL); exit(1); } if (pid == 0) { file_path = path[0]; readfd = open(file_path, O_RDONLY); if (readfd < 0) { pr_perror("open(%s, O_RDONLY) Failed", file_path); ret = errno; return ret; } file_path = path[1]; writefd = open(file_path, O_WRONLY); if (writefd < 0) { pr_perror("open(%s, O_WRONLY) Failed", file_path); ret = errno; return ret; } if (pipe_in2out(readfd, writefd, buf, sizeof(buf)) < 0) /* pass errno as exit code to the parent */ if (test_go() /* signal NOT delivered */ || (errno != EINTR && errno != EPIPE)) ret = errno; close(readfd); close(writefd); exit(ret); } file_path = path[0]; writefd = open(file_path, O_WRONLY); if (writefd < 0) { pr_perror("open(%s, O_WRONLY) Failed", file_path); kill(pid, SIGKILL); return 1; } file_path = path[1]; readfd = open(file_path, O_RDONLY); if (readfd < 0) { pr_perror("open(%s, O_RDONLY) Failed", file_path); kill(pid, SIGKILL); return 1; } test_daemon(); while (test_go()) { int len, rlen = 0, wlen; uint8_t rbuf[sizeof(buf)], *p; datagen(buf, sizeof(buf), NULL); wlen = write(writefd, buf, sizeof(buf)); if (wlen < 0) { if (errno == EINTR) continue; else { fail("write failed: %m\n"); ret = 1; break; } } for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { rlen = read(readfd, p, len); if (rlen <= 0) break; } if (rlen < 0 && errno == EINTR) continue; if (len > 0) { fail("read failed: %m\n"); ret = 1; break; } if (memcmp(buf, rbuf, wlen)) { fail("data mismatch\n"); ret = 1; break; } } close(writefd); test_waitsig(); wait(&chret); chret = 
WEXITSTATUS(chret); if (chret) { fail("child exited with non-zero code %d (%s)\n", chret, strerror(chret)); return 1; } if (!ret) pass(); close(readfd); for (i = 0; i < PROCS_DEF; i++) unlink(path[i]); return 0; } criu-3.6/test/zdtm/transition/fifo_dyn.desc000066400000000000000000000000241317335042600211100ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/transition/fifo_loop.c000066400000000000000000000075101317335042600206020ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Multi-process fifo loop"; #define BUF_SIZE 256 #define PROCS_DEF 4 unsigned int num_procs = PROCS_DEF; TEST_OPTION(num_procs, uint, "# processes to create " "(default " __stringify(PROCS_DEF) ")", 0); char *filename; TEST_OPTION(filename, string, "file name", 1); static int pids[PROCS_DEF]; volatile sig_atomic_t num_exited = 0; void inc_num_exited(int signo) { num_exited++; } int main(int argc, char **argv) { int ret = 0; int readfd, writefd; mode_t mode = S_IFIFO | 0644; char path[PROCS_DEF][BUF_SIZE]; pid_t pid; int i; uint8_t buf[0x100000]; char *file_path; test_init(argc, argv); for (i = 0; i < PROCS_DEF; i++) { file_path = path[i]; if (snprintf(file_path, BUF_SIZE, "%s-%02d", filename, i) >= BUF_SIZE) { pr_err("filename %s is too long\n", filename); exit(1); } if (mkfifo(file_path, mode)) { pr_perror("can't make fifo \"%s\"", file_path); exit(1); } } if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { pr_perror("can't set SIGCHLD handler"); exit(1); } for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); kill(0, SIGKILL); exit(1); } if (pid == 0) { file_path = path[i - 1]; readfd = open(file_path, O_RDONLY); if (readfd < 0) { pr_perror("open(%s, O_RDONLY) failed", file_path); ret = errno; return ret; } file_path = path[i]; writefd = open(file_path, O_WRONLY); if (writefd < 0) { 
pr_perror("open(%s, O_WRONLY) failed", file_path); ret = errno; return ret; } signal(SIGPIPE, SIG_IGN); if (pipe_in2out(readfd, writefd, buf, sizeof(buf)) < 0) /* pass errno as exit code to the parent */ if (test_go() /* signal NOT delivered */ || (errno != EINTR && errno != EPIPE)) ret = errno; close(readfd); close(writefd); exit(ret); } pids[i] = pid; } file_path = path[0]; writefd = open(file_path, O_WRONLY); if (writefd < 0) { pr_perror("open(%s, O_WRONLY) failed", file_path); kill(0, SIGKILL); exit(1); } file_path = path[i - 1]; readfd = open(file_path, O_RDONLY); if (readfd < 0) { pr_perror("open(%s, O_RDONLY) failed", file_path); kill(0, SIGKILL); exit(1); } if (num_exited) { pr_err("Some children died unexpectedly\n"); kill(0, SIGKILL); exit(1); } test_daemon(); while (test_go()) { int len, rlen = 0, wlen; uint8_t rbuf[sizeof(buf)], *p; datagen(buf, sizeof(buf), NULL); wlen = write(writefd, buf, sizeof(buf)); if (wlen < 0) { if (errno == EINTR) continue; else { fail("write failed: %m\n"); ret = 1; break; } } for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { rlen = read(readfd, p, len); if (rlen <= 0) break; } if (rlen < 0 && errno == EINTR) continue; if (len > 0) { fail("read failed: %m\n"); ret = 1; break; } if (memcmp(buf, rbuf, wlen)) { fail("data mismatch\n"); ret = 1; break; } } close(writefd); test_waitsig(); /* even if failed, wait for migration to complete */ if (kill(0, SIGTERM)) { fail("failed to send SIGTERM to my process group: %m\n"); return 1; /* shouldn't wait() in this case */ } close(readfd); for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ int chret; if (waitpid(pids[i], &chret, 0) < 0) { fail("waitpid error: %m\n"); ret = 1; continue; } chret = WEXITSTATUS(chret); if (chret) { fail("child %d exited with non-zero code %d (%s)\n", i, chret, strerror(chret)); ret = 1; continue; } } if (!ret) pass(); for (i = 0; i < PROCS_DEF; i++) unlink(path[i]); return 0; } 
criu-3.6/test/zdtm/transition/file_aio.c000066400000000000000000000035301317335042600203730ustar00rootroot00000000000000#include "zdtmtst.h" const char *test_doc = "test for AIO"; const char *test_author = "Andrew Vagin "; #include #include #include #include #include #include #include #include #include #define BUF_SIZE 1024 int main(int argc, char **argv) { test_init(argc, argv); char buf[BUF_SIZE]; int fd; struct aiocb aiocb; const struct aiocb *aioary[1]; char tmpfname[256]="/tmp/file_aio.XXXXXX"; int ret; fd = mkstemp(tmpfname); if (fd == -1) { pr_perror("mkstemp() failed"); exit(1); } unlink(tmpfname); if (write(fd, buf, BUF_SIZE) != BUF_SIZE) { pr_perror("Error at write()"); exit(1); } test_daemon(); while (test_go()) { memset(&aiocb, 0, sizeof(struct aiocb)); aiocb.aio_offset = 0; aiocb.aio_fildes = fd; aiocb.aio_buf = buf; aiocb.aio_nbytes = BUF_SIZE; ret = aio_read(&aiocb); if (ret < 0) { if ((errno == EINTR) && (!test_go())) break; pr_perror("aio_read failed"); return 1; } if (ret < 0) { pr_perror("aio_read failed"); exit(1); } /* Wait for request completion */ aioary[0] = &aiocb; again: ret = aio_suspend(aioary, 1, NULL); if (ret < 0) { if ((errno == EINTR) && (! 
test_go())) break; if (errno != EINTR) { pr_perror("aio_suspend failed"); return 1; } } ret = aio_error(&aiocb); if (ret == EINPROGRESS) { #ifdef DEBUG test_msg("restart aio_suspend\n"); #endif goto again; } if (ret != 0) { pr_err("Error at aio_error(): %s\n", strerror(ret)); return 1; } ret = aio_return(&aiocb); if (ret < 0) { if ((errno == EINTR) && (!test_go())) break; pr_perror("aio_return failed"); return 1; } if (ret != BUF_SIZE) { pr_perror("Error at aio_return()"); exit(1); } } close(fd); pass(); return 0; } criu-3.6/test/zdtm/transition/file_read.c000066400000000000000000000101001317335042600205250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Fill/read file continuously to check" "it's migrated at the right moment"; const char *test_author = "Pavel Emelianov "; #define MAX_SCALE 128 #define FILE_SIZE (16 * 1024) enum kids_exit_codes { SUCCESS = 0, FILE_CORRUPTED, MMAP_FAILED, OPEN_FAILED, WRITE_FAILED, READ_FAILED, FSYNC_FAILED, SEEK_FAILED, MAX_EXIT_CODE_VAL }; static char *kids_fail_reasons[] = { "Success", /* 1 */ "File corrupted", /* 2 */ "Map failed", /* 3 */ "Open (create) failed", /* 4 */ "Write failed", /* 5 */ "Read failed", /* 6 */ "Fsync failed", /* 7 */ "Lseek failed" }; int scale = 13; TEST_OPTION(scale, int, "How many children should perform testing", 0); char *filename; TEST_OPTION(filename, string, "file name", 1); static int pids[MAX_SCALE]; static volatile int stop = 0; static void killall(void) { int i; for (i = 0; i < MAX_SCALE; i++) kill(pids[i], SIGUSR2); } static void do_stop(int sig) { stop = 1; } static char *buf; static void prepare_buf(void) { int i; for (i = 0; i < FILE_SIZE; i++) buf[i] = rand(); } static int fill_file(int fd) { int rv, wr; if (lseek(fd, 0, SEEK_SET) == -1) return -2; wr = 0; while (1) { rv = write(fd, buf + wr, FILE_SIZE - wr); if (rv <= 0) return -1; wr += rv; if (wr == FILE_SIZE) break; } 
return 0; } static int check_file(int fd) { char rbuf[1024]; int rv, rd; if (lseek(fd, 0, SEEK_SET) == -1) return -2; rd = 0; while (1) { rv = read(fd, rbuf, 1024); if (rv <= 0) return -1; if (memcmp(buf + rd, rbuf, rv)) return 1; rd += rv; if (rd == FILE_SIZE) break; } return 0; } static void chew_some_file(int num) { int fd, rv; char chew_file[PATH_MAX]; buf = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); rv = MMAP_FAILED; if (buf == MAP_FAILED) goto out_exit; sprintf(chew_file, "chew_%s.%d", filename, num); fd = open(chew_file, O_CREAT | O_EXCL | O_RDWR, 0666); rv = OPEN_FAILED; if (fd == -1) goto out_unmap; while (!stop) { prepare_buf(); switch (fill_file(fd)) { case -1: rv = WRITE_FAILED; goto out_exit; case -2: rv = SEEK_FAILED; goto out_exit; } if (fsync(fd) == -1) { rv = FSYNC_FAILED; goto out_exit; } if (fsync(fd) == -1) { rv = FSYNC_FAILED; goto out_exit; } switch (check_file(fd)) { case -1: rv = READ_FAILED; goto out_exit; case -2: rv = SEEK_FAILED; goto out_exit; case 1: rv = FILE_CORRUPTED; int fd1; char str[PATH_MAX]; // create standard file sprintf(str, "standard_%s.%d", filename, num); fd1 = open(str, O_WRONLY | O_CREAT | O_TRUNC, 0666); if (write(fd1, buf, FILE_SIZE) != FILE_SIZE) pr_perror("can't write %s", str); close(fd1); goto out_exit; } } rv = SUCCESS; close(fd); unlink(chew_file); out_unmap: munmap(buf, FILE_SIZE); out_exit: exit(rv); } int main(int argc, char **argv) { int rv, i; int counter = 0; test_init(argc, argv); if (scale > MAX_SCALE) { pr_err("Too many children specified\n"); exit(-1); } if (signal(SIGUSR2, do_stop) == SIG_ERR) { pr_perror("Can't setup signal handler"); exit(-1); } for (i = 0; i < scale; i++) { rv = test_fork(); if (rv == -1) { pr_perror("Can't fork"); killall(); exit(-1); } if (rv == 0) chew_some_file(i); pids[i] = rv; } test_daemon(); test_waitsig(); killall(); for (i = 0; i < scale; i++) { if (waitpid(pids[i], &rv, 0) == -1) { fail("Can't wipe up the kid\n"); counter++; continue; } 
if (!WIFEXITED(rv)) { fail("Kid was killed\n"); counter++; } else { rv = WEXITSTATUS(rv); if (rv < MAX_EXIT_CODE_VAL && rv > SUCCESS) { fail("Kid failed: %s (%d)\n", kids_fail_reasons[rv], rv); counter++; } else if (rv != SUCCESS) { fail("Unknown exitcode from kid: %d\n", rv); counter++; } } } if (counter == 0) pass(); return 0; } criu-3.6/test/zdtm/transition/fork.c000066400000000000000000000030041317335042600175610ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Tests that forking tasks are handled properly"; const char *test_author = "Pavel Emelyanov "; char children[] = "0123456789"; int main(int argc, char **argv) { int pid, wpid, status; int p[2]; test_init(argc, argv); if (pipe(p)) { pr_perror("pipe"); return -1; } if (write(p[1], children, sizeof(children)) != sizeof(children)) { pr_perror("write"); return -1; } test_daemon(); while (test_go()) { char c = 0; int ret; ret = read(p[0], &children, sizeof(children)); if (ret <= 0) { pr_perror("read"); return 1; } for (; ret > 0; ret--) { pid = fork(); if (pid < 0) { fail("Can't fork"); goto out; } if (pid == 0) { #ifdef FORK2 usleep(10000); #endif if (write(p[1], &c, 1) != 1) { pr_perror("write"); return 1; } exit(0); } } while (1) { wpid = waitpid(-1, &status, WNOHANG); if (wpid < 0) { if (errno == ECHILD) break; pr_perror("waitpid"); return -1; } if (wpid == 0) break; if (!WIFEXITED(status)) { fail("Task %d didn't exit", wpid); goto out; } if (WEXITSTATUS(status) != 0) { fail("Task %d exited with wrong code", wpid); goto out; } } } pass(); out: return 0; } criu-3.6/test/zdtm/transition/fork2.c000077700000000000000000000000001317335042600207462fork.custar00rootroot00000000000000criu-3.6/test/zdtm/transition/ipc.c000066400000000000000000000072471317335042600174100ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include 
#include #include "zdtmtst.h" const char *test_doc="Tests ipc sems and shmems migrate fine"; const char *test_author="Pavel Emelianov "; static struct sembuf unlock = { .sem_op = 1, .sem_num = 0, .sem_flg = 0, }; static struct sembuf lock = { .sem_op = -1, .sem_num = 0, .sem_flg = 0, }; #define DEF_MEM_SIZE (40960) unsigned int shmem_size = DEF_MEM_SIZE; TEST_OPTION(shmem_size, uint, "Size of shared memory segment", 0); #define INIT_CRC (~0) #define POISON 0xac static inline void poison_area(int *mem) { memset(mem, POISON, shmem_size); } static int child(key_t key) { int sem, shm, ret, res = 0; uint8_t *mem; uint32_t crc; sem = semget(key, 1, 0777); if (sem == -1) return -1; shm = shmget(key, shmem_size, 0777); if (shm == -1) return -2; mem = shmat(shm, NULL, 0); if (mem == (uint8_t *)-1) return -3; while (test_go()) { ret = semop(sem, &lock, 1); if (ret) { if (errno == EINTR) continue; fail("Error in semop lock"); res = errno; break; } crc = INIT_CRC; datagen(mem, shmem_size, &crc); while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); if (ret) { fail("Error in semop unlock"); res = errno; break; } } shmdt(mem); return res; } int main(int argc, char **argv) { key_t key; int sem, shm, pid1, pid2; int fail_count = 0; uint8_t *mem; uint32_t crc; int ret; test_init(argc, argv); key = ftok(argv[0], 822155650); if (key == -1) { pr_perror("Can't make key"); goto out; } sem = semget(key, 1, 0777 | IPC_CREAT | IPC_EXCL); if (sem == -1) { pr_perror("Can't get sem"); goto out; } if (semctl(sem, 0, SETVAL, 1) == -1) { pr_perror("Can't init sem"); fail_count++; goto out_sem; } shm = shmget(key, shmem_size, 0777 | IPC_CREAT | IPC_EXCL); if (shm == -1) { pr_perror("Can't get shm"); fail_count++; goto out_sem; } mem = shmat(shm, NULL, 0); if (mem == (void *)-1) { pr_perror("Can't attach shm"); fail_count++; goto out_shm; } poison_area((int *)mem); pid1 = test_fork(); if (pid1 == -1) { pr_perror("Can't fork 1st time"); goto out_shdt; } else if (pid1 == 0) exit(child(key)); 
pid2 = test_fork(); if (pid2 == -1) { pr_perror("Can't fork 2nd time"); fail_count++; goto out_child; } else if (pid2 == 0) exit(child(key)); test_daemon(); while (test_go()) { ret = semop(sem, &lock, 1); if (ret) { if (errno == EINTR) continue; fail_count++; fail("Error in semop lock"); break; } if (mem[0] != POISON) { crc = INIT_CRC; if (datachk(mem, shmem_size, &crc)) { fail_count++; fail("Semaphore protection is broken or " "shmem pages are messed"); semop(sem, &unlock, 1); break; } poison_area((int *)mem); } while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); if (ret) { fail_count++; fail("Error in semop unlock"); break; } } test_waitsig(); kill(pid2, SIGTERM); waitpid(pid2, &ret, 0); if (!WIFEXITED(ret)) { fail_count++; pr_perror("Child 2 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; pr_perror("Child 2 couldn't inititalise"); } out_child: kill(pid1, SIGTERM); waitpid(pid1, &ret, 0); if (!WIFEXITED(ret)) { fail_count++; pr_perror("Child 1 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; pr_perror("Child 1 couldn't inititalise"); } out_shdt: shmdt(mem); out_shm: shmctl(shm, IPC_RMID, NULL); out_sem: semctl(sem, 1, IPC_RMID); if (fail_count == 0) pass(); out: return 0; } criu-3.6/test/zdtm/transition/ipc.desc000066400000000000000000000000211317335042600200630ustar00rootroot00000000000000{'flavor': 'ns'} criu-3.6/test/zdtm/transition/lazy-thp.c000066400000000000000000000023351317335042600203760ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #define N_PAGES 1024 const char *test_doc = "Test interaction between THP and lazy-pages"; /* The test is based on example by Adrian Reber */ const char *test_author = "Mike Rapoport "; int main(int argc, char ** argv) { char *mem, *org, *m; int count; test_init(argc, argv); /* we presume that malloc returns not page aliged address */ mem = malloc(PAGE_SIZE * N_PAGES); org = malloc(PAGE_SIZE); if 
(!mem || !org) { fail("malloc failed\n"); exit(1); } memset(mem, 0x42, PAGE_SIZE * N_PAGES); memset(org, 0x42, PAGE_SIZE); test_daemon(); while (test_go()) { for (count = 0; count < N_PAGES; count += 2) { m = mem + (count * PAGE_SIZE) + 128; *m = count; } for (count = 0; count < N_PAGES; count++) { m = mem+(count*PAGE_SIZE); org[128] = (count % 2 == 0) ? count : 0x42; if (memcmp(org, m, PAGE_SIZE)) { fail("memory corruption\n"); return 1; } } sleep(1); } pass(); return 0; } criu-3.6/test/zdtm/transition/maps007.c000066400000000000000000000076471317335042600200300ustar00rootroot00000000000000 #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #include "lock.h" #define MAP_SIZE (1UL << 20) #define MEM_SIZE (1UL << 29) const char *test_doc = "create random mappings and touch memory"; int sys_process_vm_readv(pid_t pid, void *addr, void *buf, int size) { struct iovec lvec = {.iov_base = buf, .iov_len = size }; struct iovec rvec = {.iov_base = addr, .iov_len = size }; /* workaround bug in glibc with sixth argument of syscall */ char nop[PAGE_SIZE]; memset(nop, 0, sizeof(nop)); return syscall(__NR_process_vm_readv, pid, &lvec, 1, &rvec, 1, 0); } /* The child follows the parents two steps behind. 
/*
 * Random-mapping transition test.
 *
 * After fork() both parent and child run the same loop: pick a random
 * page-aligned range inside a MEM_SIZE workspace, mmap() it with MAP_FIXED
 * and random protections, and scribble into it when it is writable.
 * The shared futex "delta" paces the two processes against each other and
 * "stop" is used for the final handshake. When the test is told to finish,
 * the parent compares its own workspace with the child's, page by page,
 * through sys_process_vm_readv().
 *
 * NOTE(review): the comparison presumably relies on both processes drawing
 * the same lrand48() sequence after fork(), so the number and order of
 * lrand48() calls in the loop must not change - confirm before refactoring.
 */
int main(int argc, char **argv)
{
	void *start, *end, *p;
	pid_t child;
	struct {
		futex_t delta;
		futex_t stop;
	} *shm;
	uint32_t v;
	unsigned long long count = 0;
	int i;

	test_init(argc, argv);

	/* shared memory for synchronization */
	shm = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
	if (shm == MAP_FAILED)
		return -1;

	/* allocate workspace */
	start = mmap(NULL, MEM_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (start == MAP_FAILED)
		return -1;

	test_msg("%p-%p\n", start, start + MEM_SIZE);

	end = start + MEM_SIZE;

	v = 0;
	futex_set(&shm->delta, v);
	futex_set(&shm->stop, 0);

	child = fork();
	if (child < 0) {
		pr_perror("fork");
		return 1;
	}

	/* From here on the same loop runs in both the parent and the child. */
	while (1) {
		void *ret;
		unsigned long size;
		int prot = PROT_NONE;

		if (child) {
			/* parent: bump delta once per iteration, capped at 2 * MAX_DELTA */
			if (!test_go())
				break;
			futex_wait_while_gt(&shm->delta, 2 * MAX_DELTA);
			futex_inc_and_wake(&shm->delta);
		} else {
			if (!futex_get(&shm->stop))
				/* shm->delta must be always bigger than MAX_DELTA */
				futex_wait_while_lt(&shm->delta, MAX_DELTA + 2);
			else if (count % 100 == 0)
				test_msg("count %llu delta %d\n", count, futex_get(&shm->delta)); /* heartbeat */

			/* child: leave once told to stop and delta has drained to MAX_DELTA */
			if (futex_get(&shm->stop) && atomic_get(&shm->delta.raw) == MAX_DELTA)
				break;
			futex_dec_and_wake(&shm->delta);
		}

		count++;
		/* the parent detaches only after MAX_DELTA + 1 iterations */
		if (child && count == MAX_DELTA + 1)
			test_daemon();

		/* random page-aligned start inside the workspace */
		p = start + ((lrand48() * PAGE_SIZE) % MEM_SIZE);
		/* random page-multiple length, clipped to the workspace end and to MAP_SIZE */
		size = lrand48() * PAGE_SIZE;
		size %= (end - p);
		size %= MAP_SIZE;
		if (size == 0)
			size = PAGE_SIZE;

		if (lrand48() % 2)
			prot |= PROT_READ;
		if (lrand48() % 2)
			prot |= PROT_EXEC;
		if (lrand48() % 2)
			prot |= PROT_WRITE;

		ret = mmap(p, size, prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
		if (ret == MAP_FAILED) {
			pr_perror("%p-%p", p, p + size);
			goto err;
		}

		if (!(prot & PROT_WRITE))
			continue;

		/* note: the loop bound is re-drawn from lrand48() on every iteration */
		for (i = 0; i < lrand48() % 50; i++) {
			char *t = p + (lrand48() * PAGE_SIZE) % (size);
			t[0] = lrand48();
		}
	}
	test_msg("count %llu\n", count);

	if (child == 0) {
		if (!test_go())
			pr_perror("unexpected state");
		/* tell the parent that the child has left the loop */
		futex_set_and_wake(&shm->stop, 2);
		test_waitsig();
		return 0;
	} else {
		int readable = 0, status = -1;

		/* stop the child */
		futex_set(&shm->stop, 1);
		futex_add_and_wake(&shm->delta, MAX_DELTA);

		/* wait until the child will be in the same point */
		futex_wait_until(&shm->stop, 2);

		/* check that child and parent have the identical content of memory */
		for (p = start; p < end; p += PAGE_SIZE) {
			char rbuf[PAGE_SIZE], lbuf[PAGE_SIZE];
			int rret, lret;

			lret = sys_process_vm_readv(getpid(), p, lbuf, PAGE_SIZE);
			rret = sys_process_vm_readv(child, p, rbuf, PAGE_SIZE);
			/* both reads must succeed or fail alike */
			if (rret != lret) {
				pr_perror("%p %d %d", p, lret, rret);
				goto err;
			}
			/* page could not be read in either process: nothing to compare */
			if (lret < 0)
				continue;
			readable++;
			if (memcmp(rbuf, lbuf, PAGE_SIZE)) {
				pr_perror("%p", p);
				goto err;
			}
		}
		test_msg("readable %d\n", readable);
		kill(child, SIGTERM);
		wait(&status);
		/* NOTE(review): this compares the raw wait status, not WEXITSTATUS() */
		if (status != 0) {
			pr_perror("Non-zero exit code: %d", status);
			goto err;
		}
		pass();
	}

	return 0;
err:
	/* crash both processes on purpose - presumably to leave their state for inspection */
	kill(child, SIGSEGV);
	*((volatile int *) 0) = 0;
	return 1;
}
/*
 * Let child @pid proceed to its final checks by sending it SIGTERM, then
 * collect it. Returns only if the child exited with status 0; otherwise
 * this process exits too, with the child's exit code, or with 1 if the
 * child could not be collected or did not terminate via exit().
 */
static void cont_and_wait_child(pid_t pid)
{
	int status = 0;

	kill(pid, SIGTERM);

	/* Without this check a failed waitpid() left status uninitialized. */
	if (waitpid(pid, &status, 0) != pid) {
		pr_perror("Can't wait for child %d", (int)pid);
		exit(1);
	}

	if (!WIFEXITED(status))
		exit(1);

	if (WEXITSTATUS(status))
		exit(WEXITSTATUS(status));
}
/*
 * munmap() that never fails silently: on error the reason is logged
 * together with the offending range and the process exits with 1.
 */
static void xmunmap(void *map, size_t size)
{
	if (munmap(map, size)) {
		/* pr_perror() appends errno and terminates the log line */
		pr_perror("Can't unmap %p size %zx", map, size);
		exit(1);
	}
}
off += MEM_PERIOD; } } static void mems_read_each_pgix(size_t pgix) { const size_t off = pgix * PAGE_SIZE; read_each_pg(mem1, mem1_size, off); read_each_pg(mem2, mem2_size, off); read_each_pg(mem3, mem3_size, off); } static void mems_datagen_each_pgix(size_t pgix, uint32_t *crc_epoch) { const size_t off = pgix * PAGE_SIZE; ++(*crc_epoch); datagen_each_pg(mem1, mem1_size, off, *crc_epoch); datagen_each_pg(mem2, mem2_size, off, *crc_epoch); datagen_each_pg(mem3, mem3_size, off, *crc_epoch); } static void mems_datachck_each_pgix(size_t pgix) { const size_t off = pgix * PAGE_SIZE; datachck_each_pg(mem1, mem1_size, off); datachck_each_pg(mem2, mem2_size, off); datachck_each_pg(mem3, mem3_size, off); } static int proc131_func(task_waiter_t *setup_waiter) { uint32_t crc_epoch = 0; pstree->proc131 = getpid(); mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); task_waiter_complete_current(setup_waiter); while (futex_get(&test_sync->datagen)) mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); futex_inc_and_wake(&test_sync->datagen_exit_cnt); test_waitsig(); mems_datachck_each_pgix(PROC131_PGIX); return 0; } static int proc13_func(task_waiter_t *setup_waiter) { size_t MEM1_HOLE_START = 2 * MEM_PERIOD; size_t MEM1_HOLE_SIZE = 1 * MEM_PERIOD; uint32_t crc_epoch = 0; pstree->proc13 = getpid(); xmunmap(mem1 + MEM1_HOLE_START, MEM1_HOLE_SIZE); xmunmap(mem2, mem2_size); xmunmap(mem3, mem3_size); mem2 = mem1 + MEM1_HOLE_START + MEM1_HOLE_SIZE; mem2_size = mem1_size - (mem2 - mem1); mem1_size = MEM1_HOLE_START; mem3 = mmap_ashmem(mem3_size); mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); fork_and_setup(proc131_func); task_waiter_complete_current(setup_waiter); while (futex_get(&test_sync->datagen)) mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); futex_inc_and_wake(&test_sync->datagen_exit_cnt); test_waitsig(); mems_datachck_each_pgix(PROC13_PGIX); chk_proc_mem_eq(pstree->proc13, mem1, mem1_size, pstree->proc131, mem1, mem1_size); chk_proc_mem_eq(pstree->proc13, mem2, mem2_size, 
pstree->proc131, mem2, mem2_size); chk_proc_mem_eq(pstree->proc13, mem3, mem3_size, pstree->proc131, mem3, mem3_size); cont_and_wait_child(pstree->proc131); return 0; } static int proc12_func(task_waiter_t *setup_waiter) { uint32_t crc_epoch = 0; pstree->proc12 = getpid(); mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); task_waiter_complete_current(setup_waiter); while (futex_get(&test_sync->datagen)) mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); futex_inc_and_wake(&test_sync->datagen_exit_cnt); test_waitsig(); mems_datachck_each_pgix(PROC12_PGIX); return 0; } static int proc111_func(task_waiter_t *setup_waiter) { uint32_t crc_epoch = 0; pstree->proc111 = getpid(); mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); task_waiter_complete_current(setup_waiter); while (futex_get(&test_sync->datagen)) mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); futex_inc_and_wake(&test_sync->datagen_exit_cnt); test_waitsig(); mems_datachck_each_pgix(PROC111_PGIX); return 0; } static int proc112_func(task_waiter_t *setup_waiter) { uint32_t crc_epoch = 0; pstree->proc112 = getpid(); mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); task_waiter_complete_current(setup_waiter); while (futex_get(&test_sync->datagen)) mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); futex_inc_and_wake(&test_sync->datagen_exit_cnt); test_waitsig(); mems_datachck_each_pgix(PROC112_PGIX); return 0; } static int proc11_func(task_waiter_t *setup_waiter) { const size_t MEM3_START_CUT = 1 * MEM_PERIOD; const size_t MEM3_END_CUT = 2 * MEM_PERIOD; void *mem3_old = mem3; size_t mem3_size_old = mem3_size; uint32_t crc_epoch = 0; pstree->proc11 = getpid(); xmunmap(mem3, MEM3_START_CUT); mem3 += MEM3_START_CUT; mem3_size -= MEM3_START_CUT; fork_and_setup(proc111_func); fork_and_setup(proc112_func); xmunmap(mem3 + mem3_size - MEM3_END_CUT, MEM3_END_CUT); mem3_size -= MEM3_END_CUT; mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); task_waiter_complete_current(setup_waiter); while (futex_get(&test_sync->datagen)) 
mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); futex_inc_and_wake(&test_sync->datagen_exit_cnt); test_waitsig(); mems_datachck_each_pgix(PROC11_PGIX); chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, pstree->proc111, mem1, mem1_size); chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, pstree->proc112, mem1, mem1_size); chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, pstree->proc111, mem2, mem2_size); chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, pstree->proc112, mem2, mem2_size); chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, pstree->proc111, mem3, mem3_size + MEM3_END_CUT); chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, pstree->proc112, mem3, mem3_size + MEM3_END_CUT); uint8_t *proc1_mem3 = mmap_proc_mem(pstree->proc1, (unsigned long)mem3_old, mem3_size_old); check_mem_eq(mem3, mem3_size, proc1_mem3 + MEM3_START_CUT, mem3_size); xmunmap(proc1_mem3, mem3_size_old); cont_and_wait_child(pstree->proc111); cont_and_wait_child(pstree->proc112); return 0; } #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MB(n) ((n) * (1UL << 20)) static int proc1_func(void) { uint32_t crc_epoch = 0; uint8_t *mem2_old = NULL; /* * Min mem size: * At least 5 mem periods for mem pages and vma holes. * At least 1 MB mem size not to test on tiny working set. 
*/ mem1_size = MEM_PERIOD * MAX(5, MB(1) / MEM_PERIOD + 1); mem2_size = mem1_size * 2; mem3_size = mem2_size * 3; futex_set(&test_sync->datagen, 1); pstree->proc1 = getpid(); mem1 = mmap_ashmem(mem1_size); mem2 = mmap_ashmem(mem2_size); mem3 = mmap_ashmem(mem3_size); mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); mems_read_each_pgix(ZERO_PGIX); fork_and_setup(proc11_func); fork_and_setup(proc12_func); fork_and_setup(proc13_func); xmunmap(mem1, mem1_size); if (mremap(mem2, mem2_size, mem1_size, MREMAP_MAYMOVE | MREMAP_FIXED, mem1) != mem1) { pr_perror("proc1 mem2 remap"); exit(1); } mem2_old = mem2; mem2 = NULL; test_daemon(); while (test_go()) mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); test_waitsig(); futex_set(&test_sync->datagen_exit_cnt, 0); futex_set(&test_sync->datagen, 0); futex_wait_while(&test_sync->datagen_exit_cnt, PROC_CNT); mems_datachck_each_pgix(PROC1_PGIX); chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, pstree->proc11, mem2_old, mem2_size); chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, pstree->proc12, mem2_old, mem2_size); chk_proc_mem_eq(pstree->proc1, mem3, mem3_size, pstree->proc12, mem3, mem3_size); cont_and_wait_child(pstree->proc11); cont_and_wait_child(pstree->proc12); cont_and_wait_child(pstree->proc13); pass(); return 0; } static void kill_pstree_from_root(void) { if (getpid() != pstree->proc1) return; kill(pstree->proc11, SIGKILL); kill(pstree->proc12, SIGKILL); kill(pstree->proc13, SIGKILL); kill(pstree->proc111, SIGKILL); kill(pstree->proc112, SIGKILL); kill(pstree->proc131, SIGKILL); } static void sigchld_hand(int signo, siginfo_t *info, void *ucontext) { if (info->si_code != CLD_EXITED) return; if (!info->si_status) return; /* * If we are not ps tree root then propagate child error to parent. * If we are ps tree root then also call all * atexit handlers set up by zdtm test framework and this test. * exit() is not async signal safe but it's ok for testing purposes. 
* exit() usage allows us to use very simple error handling * and pstree killing logic. */ exit(info->si_status); } int main(int argc, char **argv) { test_init(argc, argv); pstree = (struct pstree *)mmap_ashmem(PAGE_SIZE); test_sync = (struct test_sync *)mmap_ashmem(sizeof(*test_sync)); struct sigaction sa = { .sa_sigaction = sigchld_hand, .sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP }; sigemptyset(&sa.sa_mask); if (sigaction(SIGCHLD, &sa, NULL)) { pr_perror("SIGCHLD handler setup"); exit(1); }; if (atexit(kill_pstree_from_root)) { pr_err("Can't setup atexit cleanup func"); exit(1); } return proc1_func(); } criu-3.6/test/zdtm/transition/maps008.desc000066400000000000000000000000411317335042600205020ustar00rootroot00000000000000{'flavor': 'h', 'flags': 'suid'} criu-3.6/test/zdtm/transition/netlink00.c000066400000000000000000000153711317335042600204360ustar00rootroot00000000000000/* Description: testcase for netlink sockets migration. * e.g. * ip rule show * ip rule add * ip rule show * ip rule del * in a loop */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" #undef DEBUG //#define DEBUG const char *test_doc = "Netlink socket loop"; const char *test_author = "Andrew Vagin (avagin@parallels.com)"; //buffer to hold the RTNETLINK request struct { struct nlmsghdr nl; struct rtmsg rt; char buf[8192]; } req; // variables used for // socket communications int fd; struct sockaddr_nl la; struct sockaddr_nl pa; struct msghdr msg; struct iovec iov; int rtn; // buffer to hold the RTNETLINK reply(ies) char buf[8192]; char dsts[24] = "192.168.0.255"; int pn = 32;//network prefix // RTNETLINK message pointers & lengths // used when processing messages struct nlmsghdr *nlp; int nll; struct rtmsg *rtp; int rtl; struct rtattr *rtap; int send_request(); int recv_reply(); int form_request_add(); int form_request_del(); int read_reply(); typedef int (*cmd_t)(); #define CMD_NUM 2 cmd_t 
cmd[CMD_NUM]={form_request_add, form_request_del}; int main(int argc, char *argv[]) { int i; test_init(argc, argv); fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (fd<0){ pr_perror("socket"); goto out; } // setup local address & bind using // this address bzero(&la, sizeof(la)); la.nl_family = AF_NETLINK; la.nl_pid = getpid(); if (bind(fd, (struct sockaddr*) &la, sizeof(la))){ pr_perror("bind failed"); goto out; } //Preperation: form_request_del(); send_request(); recv_reply(); test_daemon(); while (test_go()){ for (i=0; i < CMD_NUM; i++){ cmd[i](); if (send_request() < 0){ fail("send_request failed"); goto out; }; if (recv_reply() < 0){ fail("RTNETLINK answers: %m"); goto out; }; #ifdef DEBUG if (read_reply() < 0){ fail("read_reply failed"); goto out; } #endif } } pass(); out: return 0; } int send_request() { // create the remote address // to communicate bzero(&pa, sizeof(pa)); pa.nl_family = AF_NETLINK; // initialize & create the struct msghdr supplied // to the sendmsg() function bzero(&msg, sizeof(msg)); msg.msg_name = (void *) &pa; msg.msg_namelen = sizeof(pa); // place the pointer & size of the RTNETLINK // message in the struct msghdr iov.iov_base = (void *) &req.nl; iov.iov_len = req.nl.nlmsg_len; msg.msg_iov = &iov; msg.msg_iovlen = 1; // send the RTNETLINK message to kernel rtn = sendmsg(fd, &msg, 0); if (rtn<0){ pr_perror("sendmsg failed"); return -1; } return 0; } int recv_reply() { char *p; // initialize the socket read buffer bzero(buf, sizeof(buf)); p = buf; nll = 0; // read from the socket until the NLMSG_DONE is // returned in the type of the RTNETLINK message // or if it was a monitoring socket while(1) { rtn = recv(fd, p, sizeof(buf) - nll, 0); if (rtn < 0) { pr_perror("recv failed"); return -1; } if (rtn == 0) { pr_err("EOF on netlink\n"); return -1; } nlp = (struct nlmsghdr *) p; if(nlp->nlmsg_type == NLMSG_DONE) return 0; if (nlp->nlmsg_type == NLMSG_ERROR) { struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(nlp); errno=-err->error; 
if (errno) { return -1; } return 0; } // increment the buffer pointer to place // next message p += rtn; // increment the total size by the size of // the last received message nll += rtn; if((la.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE) break; } return 0; } int read_reply() { //string to hold content of the route // table (i.e. one entry) char dsts[24], gws[24], ifs[16], ms[24]; // outer loop: loops thru all the NETLINK // headers that also include the route entry // header nlp = (struct nlmsghdr *) buf; for(; NLMSG_OK(nlp, nll); nlp = NLMSG_NEXT(nlp, nll)) { // get route entry header rtp = (struct rtmsg *) NLMSG_DATA(nlp); // we are only concerned about the // main route table if(rtp->rtm_table != RT_TABLE_MAIN) continue; // init all the strings bzero(dsts, sizeof(dsts)); bzero(gws, sizeof(gws)); bzero(ifs, sizeof(ifs)); bzero(ms, sizeof(ms)); // inner loop: loop thru all the attributes of // one route entry rtap = (struct rtattr *) RTM_RTA(rtp); rtl = RTM_PAYLOAD(nlp); for( ; RTA_OK(rtap, rtl); rtap = RTA_NEXT(rtap,rtl)) { switch(rtap->rta_type) { // destination IPv4 address case RTA_DST: inet_ntop(AF_INET, RTA_DATA(rtap), dsts, 24); break; // next hop IPv4 address case RTA_GATEWAY: inet_ntop(AF_INET, RTA_DATA(rtap), gws, 24); break; // unique ID associated with the network // interface case RTA_OIF: sprintf(ifs, "%d", *((int *) RTA_DATA(rtap))); default: break; } } sprintf(ms, "%d", rtp->rtm_dst_len); test_msg("dst %s/%s gw %s if %s\n", dsts, ms, gws, ifs); } return 0; } #define NLMSG_TAIL(nmsg) \ ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) int form_request_del() { bzero(&req, sizeof(req)); req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); rtap = NLMSG_TAIL(&req.nl); rtap->rta_type = RTA_DST; rtap->rta_len = RTA_LENGTH(4); inet_pton(AF_INET, dsts, ((char *)rtap) + sizeof(struct rtattr)); req.nl.nlmsg_len = NLMSG_ALIGN(req.nl.nlmsg_len) + RTA_ALIGN(rtap->rta_len); req.nl.nlmsg_flags = NLM_F_CREATE | NLM_F_ACK | 
NLM_F_REQUEST; req.nl.nlmsg_type = RTM_DELROUTE; req.rt.rtm_family = AF_INET; req.rt.rtm_table = RT_TABLE_MAIN; req.rt.rtm_protocol = RTPROT_STATIC; req.rt.rtm_scope = RT_SCOPE_UNIVERSE; req.rt.rtm_type = RTN_UNICAST; req.rt.rtm_dst_len = pn; return 0; } int form_request_add() { int ifcn = 1; //interface number bzero(&req, sizeof(req)); req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); rtap = NLMSG_TAIL(&req.nl); rtap->rta_type = RTA_DST; rtap->rta_len = RTA_LENGTH(4); inet_pton(AF_INET, dsts, ((char *)rtap) + sizeof(struct rtattr)); req.nl.nlmsg_len = NLMSG_ALIGN(req.nl.nlmsg_len) + RTA_ALIGN(rtap->rta_len); rtap = NLMSG_TAIL(&req.nl);; rtap->rta_type = RTA_OIF;//Output interface index rtap->rta_len = RTA_LENGTH(sizeof(int)); memcpy(((char *)rtap) + sizeof(struct rtattr), &ifcn, sizeof(int)); req.nl.nlmsg_len = NLMSG_ALIGN(req.nl.nlmsg_len) + RTA_ALIGN(rtap->rta_len); req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; req.nl.nlmsg_type = RTM_NEWROUTE; req.rt.rtm_family = AF_INET; req.rt.rtm_table = RT_TABLE_MAIN; req.rt.rtm_protocol = RTPROT_STATIC; req.rt.rtm_scope = RT_SCOPE_UNIVERSE; req.rt.rtm_type = RTN_UNICAST; req.rt.rtm_dst_len = pn; return 0; } criu-3.6/test/zdtm/transition/netlink00.desc000066400000000000000000000000241317335042600211170ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/transition/pipe_loop00.c000066400000000000000000000065501317335042600207570ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Multi-process pipe loop"; const char *test_author = "Pavel Emelianov "; #define PROCS_DEF 4 #define PROCS_MAX 64 unsigned int num_procs = PROCS_DEF; TEST_OPTION(num_procs, uint, "# processes to create " "(default " __stringify(PROCS_DEF) ", max " __stringify(PROCS_MAX) ")", 0); volatile sig_atomic_t num_exited = 0; void inc_num_exited(int signo) { num_exited++; } int main(int argc, char **argv) { int ret = 0; 
pid_t pid; int i; uint8_t buf[0x100000]; int pipes[PROCS_MAX * 2]; int in, out; test_init(argc, argv); if (num_procs > PROCS_MAX) { pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); exit(1); } for (i = 0; i < num_procs; i++) if (pipe(pipes + i * 2)) { pr_perror("Can't create pipes"); exit(1); } if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { pr_perror("can't set SIGCHLD handler"); exit(1); } for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); kill(0, SIGKILL); exit(1); } if (pid == 0) { int j; in = i * 2; out = in - 1; for (j = 0; j < num_procs * 2; j++) if (j != in && j != out) close(pipes[j]); signal(SIGPIPE, SIG_IGN); if (pipe_in2out(pipes[in], pipes[out], buf, sizeof(buf)) < 0) /* pass errno as exit code to the parent */ if (test_go() /* signal NOT delivered */ || (errno != EINTR && errno != EPIPE)) ret = errno; test_waitsig(); /* even if failed, wait for migration to complete */ close(pipes[in]); close(pipes[out]); exit(ret); } } for (i = 1; i < num_procs * 2 - 1; i++) close(pipes[i]); in = pipes[0]; out = pipes[num_procs * 2 - 1]; /* don't block on writing, _do_ block on reading */ if (set_nonblock(out,1) < 0) { pr_perror("setting O_NONBLOCK failed"); exit(1); } if (num_exited) { pr_err("Some children died unexpectedly\n"); kill(0, SIGKILL); exit(1); } test_daemon(); while (test_go()) { int len, rlen = 0, wlen; uint8_t rbuf[sizeof(buf)], *p; datagen(buf, sizeof(buf), NULL); wlen = write(out, buf, sizeof(buf)); if (wlen < 0) { if (errno == EINTR) continue; else { fail("write failed\n"); ret = 1; break; } } for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { rlen = read(in, p, len); if (rlen <= 0) break; } if (rlen < 0 && errno == EINTR) continue; if (len > 0) { fail("read failed: %m\n"); ret = 1; break; } if (memcmp(buf, rbuf, wlen)) { fail("data mismatch\n"); ret = 1; break; } } close(out); test_waitsig(); /* even if failed, wait for migration to complete */ if 
(kill(0, SIGTERM)) { fail("failed to send SIGTERM to my process group: %m\n"); goto out; /* shouldn't wait() in this case */ } for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ int chret; if (wait(&chret) < 0) { fail("can't wait for a child: %m\n"); ret = 1; continue; } chret = WEXITSTATUS(chret); if (chret) { fail("child %d exited with non-zero code %d (%s)\n", i, chret, strerror(chret)); ret = 1; continue; } } if (!ret) pass(); out: close(in); return 0; } criu-3.6/test/zdtm/transition/pipe_shared00.c000066400000000000000000000052061317335042600212510ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Multi-process pipe split"; const char *test_author = "Pavel Emelianov "; #define PROCS_DEF 4 #define PROCS_MAX 64 unsigned int num_procs = PROCS_DEF; TEST_OPTION(num_procs, uint, "# processes to create " "(default " __stringify(PROCS_DEF) ", max " __stringify(PROCS_MAX) ")", 0); volatile sig_atomic_t num_exited = 0; void inc_num_exited(int signo) { num_exited++; } #define SND_CHR 'y' int main(int argc, char **argv) { int ret = 0; pid_t pid; int i; uint8_t buf[PIPE_BUF * 100]; int pipes[2]; test_init(argc, argv); if (num_procs > PROCS_MAX) { pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); exit(1); } if (pipe(pipes)) { pr_perror("Can't create pipes"); exit(1); } if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { pr_perror("can't set SIGCHLD handler"); exit(1); } for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ pid = test_fork(); if (pid < 0) { pr_perror("can't fork"); kill(0, SIGKILL); exit(1); } if (pid == 0) { close(pipes[1]); while (test_go()) { int rlen = read(pipes[0], buf, sizeof(buf)); if (rlen == 0) break; else if (rlen < 0) { ret = errno; /* pass errno as exit code to the parent */ break; } for (i = 0; i < rlen && buf[i] == SND_CHR; i++) ; if (i < rlen) { ret = EILSEQ; break; } } test_waitsig(); /* even if failed, wait for 
migration to complete */ close(pipes[0]); exit(ret); } } close(pipes[0]); if (num_exited) { pr_err("Some children died unexpectedly\n"); kill(0, SIGKILL); exit(1); } test_daemon(); memset(buf, SND_CHR, sizeof(buf)); while(test_go()) if (write(pipes[1], buf, sizeof(buf)) < 0 && (errno != EINTR || test_go())) { /* only SIGTERM may stop us */ fail("write failed: %m\n"); ret = 1; break; } close(pipes[1]); test_waitsig(); /* even if failed, wait for migration to complete */ if (kill(0, SIGTERM)) { fail("failed to send SIGTERM to my process group: %m\n"); goto out; /* shouldn't wait() in this case */ } for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ int chret; if (wait(&chret) < 0) { fail("can't wait for a child: %m\n"); ret = 1; continue; } chret = WEXITSTATUS(chret); if (chret) { fail("child exited with non-zero code %d (%s)\n", chret, strerror(chret)); ret = 1; continue; } } if (!ret) pass(); out: return 0; } criu-3.6/test/zdtm/transition/ptrace.c000066400000000000000000000047311317335042600201060ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Tests that ptraced thread do not escape from tracing"; const char *test_author = "Pavel Emelianov "; #define NR_THREADS 2 unsigned int nr_threads = NR_THREADS; TEST_OPTION(nr_threads, uint, "Number of threads", 0); static void *thread(void *arg) { *(int *)arg = syscall(SYS_gettid); while (1) sleep(1); return NULL; } int main(int argc, char **argv) { int pid, status, i, stopped; #define PT_REGS_SIZE 4096 /* big enough for any arch */ #define PT_REGS_ALIGN 16 /* big enough for any arch */ char regs[PT_REGS_SIZE] __attribute__((aligned(PT_REGS_ALIGN))); int *pids; test_init(argc, argv); pids = (int *)mmap(NULL, sizeof(int) * nr_threads, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0); if (pids == MAP_FAILED) { pr_perror("Can't map"); exit(1); } memset(pids, 0, sizeof(int) * nr_threads); pid = 
fork(); if (pid < 0) { pr_perror("Can't fork"); goto out; } else if (pid == 0) { pthread_t pt[nr_threads]; for (i = 0; i < nr_threads - 1; i++) { if (pthread_create(&pt[i], NULL, thread, pids + i)) { pr_perror("Can't make thread"); goto out_th; } } thread(pids + i); out_th: for (i--; i >=0; i--) { pthread_kill(pt[i], SIGKILL); pthread_join(pt[i], NULL); } return 0; } for (i = 0; i < nr_threads; i++) { while (pids[i] == 0) sched_yield(); if (ptrace(PTRACE_ATTACH, pids[i], (char *)1, NULL) == -1) { pr_perror("Can't attach"); goto out_pt; } } test_daemon(); while (test_go()) { for (i = 0; i < nr_threads; i++) if (pids[i]) break; if (i == nr_threads) break; stopped = wait4(-1, &status, __WALL, NULL); if (stopped == -1) { pr_perror("Can't wait"); break; } if (WIFSTOPPED(status)) { if (ptrace(PTRACE_GETSIGINFO, stopped, NULL, regs)) { /* FAIL */ fail("Ptrace won't work"); break; } for (i = 0; i < nr_threads; i++) if (pids[i] == stopped) break; if (i == nr_threads) continue; pids[i] = 0; ptrace(PTRACE_DETACH, stopped, (char *)1, NULL); ptrace(PTRACE_CONT, stopped, (char *)1, NULL); continue; } } test_waitsig(); pass(); out_pt: kill(pid, SIGKILL); wait(NULL); out: munmap(pids, sizeof(int) * nr_threads); return 0; } criu-3.6/test/zdtm/transition/ptrace.desc000066400000000000000000000000241317335042600205710ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/transition/shmem.c000066400000000000000000000027601317335042600177410ustar00rootroot00000000000000#include #include #include #include #include #include #include "zdtmtst.h" const char *test_author = "Andrei Vagin "; #define MEM_SIZE (1<<25) int main(int argc, char **argv) { pid_t pid; void *addr; int *sum, status; long size; test_init(argc, argv); sum = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); if (sum == MAP_FAILED) return 1; addr = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); if (addr == MAP_FAILED) return 1; pid = fork(); if 
(pid < 0) return 1; if (pid == 0) { int i = 0; long size = PAGE_SIZE, old_size = MEM_SIZE; status = 0; while (test_go()) { addr = mremap(addr, old_size, size, MREMAP_MAYMOVE); status -= *((int *)(addr + size - PAGE_SIZE)); *((int *)(addr + size - PAGE_SIZE)) = i++; status += *((int *)(addr + size - PAGE_SIZE)); old_size = size; size += PAGE_SIZE; if (size > MEM_SIZE) size = PAGE_SIZE; } *sum = status; return 0; } test_daemon(); test_waitsig(); kill(pid, SIGTERM); status = -1; waitpid(pid, &status, 0); if (status) { pr_err("The child return non-zero code: %d\n", status); return 1; } status = 0; for (size = PAGE_SIZE; size <= MEM_SIZE; size += PAGE_SIZE) { status += *((int *)(addr + size - PAGE_SIZE)); } if (status != *sum) { fail("checksum mismatch: %x %x\n", status, *sum); return 1; } pass(); return 0; } criu-3.6/test/zdtm/transition/socket-tcp.c000077700000000000000000000000001317335042600246142../static/socket-tcp.custar00rootroot00000000000000criu-3.6/test/zdtm/transition/socket-tcp.desc000066400000000000000000000001071317335042600213710ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} criu-3.6/test/zdtm/transition/socket-tcp6.c000077700000000000000000000000001317335042600247022../static/socket-tcp.custar00rootroot00000000000000criu-3.6/test/zdtm/transition/socket-tcp6.desc000066400000000000000000000001071317335042600214570ustar00rootroot00000000000000{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} criu-3.6/test/zdtm/transition/socket_loop00.c000066400000000000000000000076131317335042600213130ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Multi-process socket loop"; const char *test_author = "Pavel Emelianov "; #define PROCS_DEF 4 #define PROCS_MAX 64 unsigned int num_procs = PROCS_DEF; TEST_OPTION(num_procs, uint, "# processes to create " "(default " __stringify(PROCS_DEF) 
", max " __stringify(PROCS_MAX) ")", 0); volatile sig_atomic_t num_exited = 0; void inc_num_exited(int signo) { num_exited++; } int main(int argc, char **argv) { int ret = 0; pid_t pid; int i; uint8_t buf[0x100000]; int socks[PROCS_MAX * 2]; int in, out; test_init(argc, argv); if (num_procs > PROCS_MAX) { pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); exit(1); } for (i = 0; i < num_procs; i++) if (socketpair(AF_LOCAL, SOCK_STREAM, 0, socks + i * 2)) { pr_perror("can't create sockets"); exit(1); } if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { pr_perror("can't set SIGCHLD handler"); exit(1); } for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ pid = test_fork(); if (pid < 0) { pr_perror("Can't fork"); kill(0, SIGKILL); exit(1); } if (pid == 0) { int j; in = i * 2; out = in - 1; for (j = 0; j < num_procs * 2; j++) if (j != in && j != out) close(socks[j]); signal(SIGPIPE, SIG_IGN); if (pipe_in2out(socks[in], socks[out], buf, sizeof(buf)) < 0) /* pass errno as exit code to the parent */ if (test_go() /* signal NOT delivered */ || (errno != EINTR && errno != EPIPE && errno != ECONNRESET)) ret = errno; test_waitsig(); /* even if failed, wait for migration to complete */ close(socks[in]); close(socks[out]); exit(ret); } } for (i = 1; i < num_procs * 2 - 1; i++) close(socks[i]); in = socks[0]; out = socks[num_procs * 2 - 1]; /* don't block on writing, _do_ block on reading */ if (set_nonblock(out,1) < 0) { pr_perror("setting O_NONBLOCK failed"); exit(1); } if (num_exited) { pr_err("Some children died unexpectedly\n"); kill(0, SIGKILL); exit(1); } test_daemon(); while (test_go()) { int len, rlen = 0, wlen; uint8_t rbuf[sizeof(buf)], *p; datagen(buf, sizeof(buf), NULL); wlen = write(out, buf, sizeof(buf)); if (wlen < 0) { if (errno == EINTR) continue; else { fail("write failed\n"); ret = 1; break; } } for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { rlen = read(in, p, len); if (rlen <= 0) break; } if (rlen < 0 && errno == EINTR) 
continue; if (len > 0) { fail("read failed: %m\n"); ret = 1; break; } if (memcmp(buf, rbuf, wlen)) { fail("data mismatch\n"); ret = 1; break; } } test_waitsig(); /* even if failed, wait for migration to complete */ /* We expect that write(2) in child may return error only after signal * has been received. Thus, send signal before closing parent fds. */ if (kill(0, SIGTERM)) { fail("failed to send SIGTERM to my process group: %m\n"); goto out; /* shouldn't wait() in this case */ } if (close(out)) fail("Failed to close parent fd 'out': %m\n"); /* If last child in the chain (from whom we read data) receives signal * after parent has finished reading but before calling write(2), this * child can block forever. To avoid this, close 'in' fd. */ if (close(in)) fail("failed to close parent fd 'in': %m\n"); for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ int chret; if (wait(&chret) < 0) { fail("can't wait for a child: %m\n"); ret = 1; continue; } chret = WEXITSTATUS(chret); if (chret) { fail("child %d exited with non-zero code %d (%s)\n", i, chret, strerror(chret)); ret = 1; continue; } } if (!ret) pass(); out: return 0; } criu-3.6/test/zdtm/transition/thread-bomb.c000066400000000000000000000014771317335042600210200ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "zdtmtst.h" #define exit_group(code) \ syscall(__NR_exit_group, code) static void *thread_fn(void *arg) { pthread_t t, p, *self; if (arg) { p = *(pthread_t *)arg; pthread_join(p, NULL); free(arg); } self = malloc(sizeof(*self)); *self = pthread_self(); pthread_create(&t, NULL, thread_fn, self); return NULL; } int main(int argc, char **argv) { char *val; int max_nr = 1024, i; val = getenv("ZDTM_THREAD_BOMB"); if (val) max_nr = atoi(val); test_msg("%d\n", max_nr); test_init(argc, argv); for (i = 0; i < max_nr; i++) { pthread_t p; pthread_create(&p, NULL, thread_fn, NULL); } test_daemon(); test_waitsig(); pass(); return 0; } 
criu-3.6/test/zdtm/transition/thread-bomb.desc000066400000000000000000000000241317335042600214770ustar00rootroot00000000000000{'flags': 'noauto'} criu-3.6/test/zdtm/transition/unix_sock.c000066400000000000000000000131701317335042600206270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "zdtmtst.h" const char *test_doc = "Multi-client - server app"; const char *test_author = "Roman Kagan "; #define PROCS_DEF 4 #define PROCS_MAX 64 unsigned int num_procs = PROCS_DEF; TEST_OPTION(num_procs, uint, "# processes to create " "(default " __stringify(PROCS_DEF) ", max " __stringify(PROCS_MAX) ")", 0); char *filename; TEST_OPTION(filename, string, "file name", 1); #define ACCEPT_TIMEOUT 100 /* max delay for the child to connect */ static int fill_sock_name(struct sockaddr_un *name, const char *filename) { if (strlen(filename) >= sizeof(name->sun_path)) return -1; name->sun_family = AF_LOCAL; strcpy(name->sun_path, filename); return 0; } static int setup_srv_sock(void) { struct sockaddr_un name; int sock; if (fill_sock_name(&name, filename) < 0) { pr_err("filename \"%s\" is too long\n", filename); return -1; } sock = socket(PF_LOCAL, SOCK_STREAM, 0); if (sock < 0) { pr_perror("can't create socket"); return -1; } if (bind(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) { pr_perror("can't bind to socket \"%s\"", filename); goto err; } if (fcntl(sock, F_SETFL, O_NONBLOCK) < 0) { pr_perror("can't make socket \"%s\" non-blocking", filename); goto err; } if (listen(sock, 1) < 0) { pr_perror("can't listen on a socket \"%s\"", filename); goto err; } return sock; err: close(sock); return -1; } static int accept_one_conn(int sock) { int acc_sock; fd_set fds; struct timeval timeout = { .tv_sec = ACCEPT_TIMEOUT, }; FD_ZERO(&fds); FD_SET(sock, &fds); switch (select(FD_SETSIZE, &fds, NULL, NULL, &timeout)) { case 1: break; case 0: pr_err("timeout accepting a connection\n"); return -1; 
default: pr_perror("error while waiting for a connection"); return -1; } acc_sock = accept(sock, NULL, NULL); if (acc_sock < 0) pr_perror("error accepting a connection"); return acc_sock; } static int setup_clnt_sock(void) { struct sockaddr_un name; int sock; int ret = 0; if (fill_sock_name(&name, filename) < 0) { pr_err("filename \"%s\" is too long\n", filename); return -1; } sock = socket(PF_LOCAL, SOCK_STREAM, 0); if (sock < 0) { ret = -errno; pr_perror("can't create socket"); return ret; } if (connect(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) { ret = -errno; pr_perror("can't connect"); goto err; } return sock; err: close(sock); return ret; } #define BUFLEN 1000 static int child(void) { int ret = 1; uint8_t buf[BUFLEN]; uint32_t crc = ~0; int sock = setup_clnt_sock(); if (sock < 0) { ret = -sock; goto out; } signal(SIGPIPE, SIG_IGN); while (test_go()) { datagen(buf, sizeof(buf), &crc); if (write(sock, buf, sizeof(buf)) < 0 && (test_go() /* signal NOT received */ || (errno != EINTR && errno != EPIPE && \ errno != ECONNRESET))) { ret = errno; fail("child write: %m\n"); goto out; } } ret = 0; out: close(sock); return ret; } int main(int argc, char **argv) { struct { pid_t pid; int sock; uint32_t crc; } child_desc[PROCS_MAX]; int i, nproc; int sock; uint8_t buf[BUFLEN]; fd_set active_fds, read_fds; test_init(argc, argv); if (num_procs > PROCS_MAX) { pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); exit(1); } sock = setup_srv_sock(); if (sock < 0) exit(1); FD_ZERO(&active_fds); for (nproc = 0; nproc < num_procs; nproc++) { child_desc[nproc].pid = test_fork(); if (child_desc[nproc].pid < 0) { pr_perror("can't fork"); goto cleanup; } if (child_desc[nproc].pid == 0) { close(sock); exit(child()); } child_desc[nproc].sock = accept_one_conn(sock); if (child_desc[nproc].sock < 0) { kill(child_desc[nproc].pid, SIGKILL); goto cleanup; } child_desc[nproc].crc = ~0; FD_SET(child_desc[nproc].sock, &active_fds); } close(sock); /* no more 
connections */ test_daemon(); while (test_go()) { read_fds = active_fds; if (select(FD_SETSIZE, &read_fds, NULL, NULL, NULL) < 0 && errno != EINTR) { fail("error waiting for data: %m"); goto out; } for (i = 0; i < num_procs; i++) if (FD_ISSET(child_desc[i].sock, &read_fds)) { if (read(child_desc[i].sock, buf, sizeof(buf)) < 0) { if(errno == EINTR) /* we're asked to stop */ break; else { fail("error reading data from socket: %m"); goto out; } } if (datachk(buf, sizeof(buf), &child_desc[i].crc)) { fail("CRC mismatch"); goto out; } } } out: test_waitsig(); if (kill(0, SIGTERM)) { fail("failed to send SIGTERM to my process group: %m\n"); goto cleanup; /* shouldn't wait() in this case */ } while (nproc-- > 0) { int chret; /* * Close socket to make sure that child's write() returns. * This is to avoid race when server stopped reading & sent * signal to child, child has checked for signal & found none * (not yet delivered), then called write(), blocking forever. */ if(close(child_desc[nproc].sock)) fail("Can't close server socket: %m\n"); if (wait(&chret) < 0) { fail("can't wait for a child: %m\n"); goto cleanup; } chret = WEXITSTATUS(chret); if (chret) { fail("child exited with non-zero code %d (%s)\n", chret, strerror(chret)); goto cleanup; } } pass(); cleanup: while (nproc-- > 0) { close(child_desc[nproc].sock); if (child_desc[nproc].pid > 0) kill(child_desc[nproc].pid, SIGKILL); } close(sock); unlink(filename); return 0; } criu-3.6/test/zdtm_ct.c000066400000000000000000000030771317335042600151260ustar00rootroot00000000000000#include #include #include #include #include #include #include int main(int argc, char **argv) { pid_t pid; int status; /* * pidns is used to avoid conflicts * mntns is used to mount /proc * net is used to avoid conflicts of parasite sockets */ if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) return 1; pid = fork(); if (pid == 0) { if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { fprintf(stderr, "mount(/, S_REC | 
MS_SLAVE)): %m"); return 1; } umount2("/proc", MNT_DETACH); umount2("/dev/pts", MNT_DETACH); if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { fprintf(stderr, "mount(/proc): %m"); return 1; } if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { fprintf(stderr, "mount(pts): %m"); return 1; } if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { fprintf(stderr, "mount(binfmt_misc): %m"); return 1; } if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { fprintf(stderr, "mount(ptmx): %m"); return 1; } if (system("ip link set up dev lo")) return 1; execv(argv[1], argv + 1); fprintf(stderr, "execve: %m"); return 1; } if (waitpid(pid, &status, 0) != pid) { fprintf(stderr, "waitpid: %m"); return 1; } if (WIFEXITED(status)) return WEXITSTATUS(status); else if (WIFSIGNALED(status)) kill(getpid(), WTERMSIG(status)); else fprintf(stderr, "Unexpected exit status: %x\n", status); return 1; } criu-3.6/test/zdtm_mount_cgroups000077500000000000000000000010001317335042600171660ustar00rootroot00000000000000#!/bin/sh # If a controller is created during dumping processes, criu may fail with error: # Error (cgroup.c:768): cg: Set 3 is not subset of 2 # so lets create all test controllers before executing tests. cat /proc/self/cgroup | grep -q zdtmtst.defaultroot && exit tdir=`mktemp -d zdtm.XXXXXX` for i in "zdtmtst" "zdtmtst.defaultroot"; do mount -t cgroup -o none,name=$i zdtm $tdir && # a fake group prevents destroying of a controller mkdir -p $tdir/holder && umount -l $tdir || exit 1 done rmdir $tdir