pax_global_header00006660000000000000000000000064146461170540014522gustar00rootroot0000000000000052 comment=c67e73e593c04092fb1d14053b535c2b08ca218f git-filter-repo-2.45.0/000077500000000000000000000000001464611705400146235ustar00rootroot00000000000000git-filter-repo-2.45.0/.gitattributes000066400000000000000000000001511464611705400175130ustar00rootroot00000000000000*.sh eol=lf *.py eol=lf /git-filter-repo eol=lf /contrib/filter-repo-demos/[a-z]* eol=lf /t/t9*/* eol=lf git-filter-repo-2.45.0/.github/000077500000000000000000000000001464611705400161635ustar00rootroot00000000000000git-filter-repo-2.45.0/.github/dependabot.yml000066400000000000000000000001721464611705400210130ustar00rootroot00000000000000--- version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "monthly"git-filter-repo-2.45.0/.github/workflows/000077500000000000000000000000001464611705400202205ustar00rootroot00000000000000git-filter-repo-2.45.0/.github/workflows/test.yml000066400000000000000000000017031464611705400217230ustar00rootroot00000000000000name: Run tests on: [push, pull_request] jobs: run-tests: strategy: matrix: os: [ 'windows', 'ubuntu', 'macos' ] fail-fast: false runs-on: ${{ matrix.os }}-latest steps: - uses: actions/checkout@v4 - name: Setup python uses: actions/setup-python@v5 with: python-version: 3.x - name: test shell: bash run: | # setup-python puts `python` into the `PATH`, not `python3`, yet # `git-filter-repo` expects `python3` in the `PATH`. Let's add # a shim. printf '#!/bin/sh\n\nexec python "$@"\n' >python3 && export PATH=$PWD:$PATH && if ! 
t/run_tests -q -v -x then mkdir failed && tar czf failed/failed.tar.gz t exit 1 fi - name: upload failed tests' directories if: failure() uses: actions/upload-artifact@v4 with: name: failed-${{ matrix.os }} path: failed git-filter-repo-2.45.0/.gitignore000066400000000000000000000001341464611705400166110ustar00rootroot00000000000000/Documentation/html/ /Documentation/man1/ /t/test-results /t/trash directory* /__pycache__/ git-filter-repo-2.45.0/COPYING000066400000000000000000000022071464611705400156570ustar00rootroot00000000000000git-filter-repo itself and most the files in this repository (exceptions noted below) are provided under the MIT license (see COPYING.mit). The usage of the MIT license probably makes filter-repo compatible with everything, but just in case, these files can also be used under whatever open source license[1] that git.git or libgit2 use now or in the future (currently GPL[2] and GPL-with-linking-exception[3]). Further, the examples (in contrib/filter-repo-demos/ and t/t9391/) can also be used under the same license that libgit2 provides their examples under (CC0, currently[4]). Exceptions: - The test harness (t/test-lib.sh, t/test-lib-functions.sh) is a slightly modified copy of git.git's test harness (the difference being that my copy doesn't require a built version of 'git' to be present). These are thus GPL2 (see COPYING.gpl), and are individually marked as such. 
[1] ...as defined by the Open Source Initiative (https://opensource.org/) [2] https://git.kernel.org/pub/scm/git/git.git/tree/COPYING [3] https://github.com/libgit2/libgit2/blob/master/COPYING [4] https://github.com/libgit2/libgit2/blob/master/examples/COPYING git-filter-repo-2.45.0/COPYING.gpl000066400000000000000000000432541464611705400164470ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. 
For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". 
Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. 
c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. 
git-filter-repo-2.45.0/COPYING.mit000066400000000000000000000020361464611705400164470ustar00rootroot00000000000000Copyright (c) 2009, 2018-2019 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. git-filter-repo-2.45.0/Documentation/000077500000000000000000000000001464611705400174345ustar00rootroot00000000000000git-filter-repo-2.45.0/Documentation/Contributing.md000066400000000000000000000104001464611705400224200ustar00rootroot00000000000000Welcome to the community! Contributions need to meet the bar for inclusion in git.git. Although filter-repo is not part of the git.git repository, I want to leave the option open for it to be merged in the future. 
As such, any contributions need to follow the same [guidelines for contribution to git.git](https://git.kernel.org/pub/scm/git/git.git/tree/Documentation/SubmittingPatches), with a few exceptions: * While I [hate](https://public-inbox.org/git/CABPp-BG2SkH0GrRYpHLfp2Wey91ThwQoTgf9UmPa9f5Szn+v3Q@mail.gmail.com/) [GitHub](https://public-inbox.org/git/CABPp-BEcpasV4vBTm0uxQ4Vzm88MQAX-ArDG4e9QU8tEoNsZWw@mail.gmail.com/) [PRs](https://public-inbox.org/git/CABPp-BEHy8c3raHwf9aFXvXN0smf_WwCcNiYxQBwh7W6An60qQ@mail.gmail.com/) (as others point out, [it's mind-boggling in a bad way that web-based Git hosting and code review systems do such a poor job](http://nhaehnle.blogspot.com/2020/06/they-want-to-be-small-they-want-to-be.html)), git-format-patch and git-send-email can be a beast and I have not yet found time to modify Dscho's excellent [GitGitGadget](https://github.com/gitgitgadget/gitgitgadget) to work with git-filter-repo. As such: * For very short single-commit changes, feel free to open GitHub PRs. * For more involved changes, if format-patch or send-email give you too much trouble, go ahead and open a GitHub PR and just mention that email didn't work out. * If emailing patches to the git list: * Include "filter-repo" at the start of the subject, e.g. "[filter-repo PATCH] Add packaging scripts for uploading to PyPI" instead of just "[PATCH] Add packaging scripts for uploading to PyPI" * CC me instead of the git maintainer * Git's [CodingGuidlines for python code](https://github.com/git/git/blob/v2.24.0/Documentation/CodingGuidelines#L482-L494) are only partially applicable: * python3 is a hard requirement; python2 is/was EOL at the end of 2019 and should not be used. (Commit 4d0264ab723c ("filter-repo: workaround python<2.7.9 exec bug", 2019-04-30) was the last version of filter-repo that worked with python2). * You can depend on anything in python 3.5 or earlier. 
I may bump this minimum version over time, but do want to generally work with the python3 version found in current enterprise Linux distributions. * In filter-repo, it's not just OK to use bytestrings, you are expected to use them a lot. Using unicode strings result in lots of ugly errors since input comes from filesystem names, commit messages, file contents, etc., none of which are guaranteed to be unicode. (Plus unicode strings require lots of effort to verify, encode, and decode -- slowing the filtering process down). I tried to work with unicode strings more broadly in the code base multiple times; but it's just a bad idea to use an abstraction that doesn't fit the data. * I generally like [PEP 8](https://www.python.org/dev/peps/pep-0008/), but used two-space indents for years before learning of it and have just continued that habit. For consistency, contributions should also use two-space indents and otherwise generally follow PEP 8. There are a few extra things I would like folks to keep in mind: * Please test line coverage if you add or modify code * `make test` will run the testsuite under [coverage3](https://pypi.org/project/coverage/) (which you will need to install), and report on line coverage. Line coverage of git-filter-repo needs to remain at 100%; line coverage of contrib and test scripts can be ignored. * Please do not be intimidated by detailed feedback: * In the git community, I have been contributing for years and have had hundreds of patches accepted but I still find that even when I try to make patches perfect I am not surprised when I have to spend as much or more time fixing up patches after submitting them than I did figuring out the patches in the first place. git folks tend to do thorough reviews, which has taught me a lot, and I try to do the same for filter-repo. Plus, as noted above, I want contributions from others to be acceptable in git.git itself. 
git-filter-repo-2.45.0/Documentation/converting-from-bfg-repo-cleaner.md000066400000000000000000000145511464611705400262110ustar00rootroot00000000000000# Cheat Sheet: Converting from BFG Repo Cleaner This document is aimed at folks who are familiar with BFG Repo Cleaner and want to learn how to convert over to using filter-repo. ## Table of Contents * [Half-hearted conversions](#half-hearted-conversions) * [Intention of "equivalent" commands](#intention-of-equivalent-commands) * [Basic Differences](#basic-differences) * [Cheat Sheet: Conversion of Examples from BFG](#cheat-sheet-conversion-of-examples-from-bfg) ## Half-hearted conversions You can switch most any BFG command to use filter-repo under the covers by just replacing the `java -jar bfg.jar` part of the command with [`bfg-ish`](../contrib/filter-repo-demos/bfg-ish). bfg-ish is a reasonable tool, and provides a number of bug fixes and features on top of bfg, but most of my focus is naturally on filter-repo which has a number of capabilities lacking in bfg-ish. ## Intention of "equivalent" commands BFG and filter-repo have a few differences, highlighted in the Basic Differences section below, that make it hard to get commands that behave identically. Rather than focusing on matching BFG output as exactly as possible, I treat the BFG examples as idiomatic ways to solve a certain type of problem with BFG, and express how one would idiomatically solve the same problem in filter-repo. Sometimes that means the results are not identical, but they are largely the same in each case. ## Basic Differences BFG operates directly on tree objects, which have no notion of their leading path. Thus, it has no way of differentiating between 'README.md' at the toplevel versus in some subdirectory. You simply operate on the basename of files and directories. This precludes doing things like renaming files and directories or other bigger restructures. By directly operating on trees, it also runs into problems with loose vs. 
packed objects, loose vs. packed refs, not understanding replace refs or grafts, and not understanding the index and working tree as another data source. With `git filter-repo`, you are essentially given an editing tool to operate on the [fast-export](https://git-scm.com/docs/git-fast-export) serialization of a repo, which operates on filenames including their full paths from the toplevel of the repo. Directories are not separately specified, so any directory-related filtering is done by checking the leading path of each file. Further, you aren't limited to the pre-defined filtering types, python callbacks which operate on the data structures from the fast-export stream can be provided to do just about anything you want. By leveraging fast-export and fast-import, filter-repo gains automatic handling of objects and refs whether they are packed or not, automatic handling of replace refs and grafts, and future features that may appear. It also tries hard to provide a full rewrite solution, so it takes care of additional important concerns such as updating the index and working tree and running an automatic gc for the user afterwards. The "protection" and "privacy" defaults in BFG are something I fundamentally disagreed with for a variety of reasons; see the comments at the top of the [bfg-ish](../contrib/filter-repo-demos/bfg-ish) script if you want details. The bfg-ish script implemented these protection and privacy options since it was designed to act like BFG, but still flipped the default to the opposite of what BFG chose. I left the "protection" and "non-private" features out of filter-repo entirely. 
This means a number of things with filter-repo: * any filters you specify will also be applied to HEAD, so that you don't have a weird disconnect from your history transformations only being applied to most commits * `[formerly OLDHASH]` references are not munged into commit messages; the replace refs that filter-repo adds are a much cleaner way of looking up commits by old commit hashes. * `Former-commit-id:` footers are not added to commit messages; the replace refs that filter-repo adds are a much cleaner way of looking up commits by old commit hashes. * History is not littered with `.REMOVED.git-id` files. BFG expects you to specify the repository to rewrite as its final argument, whereas filter-repo expects you to cd into the repo and then run filter-repo. ## Cheat Sheet: Conversion of Examples from BFG ### Stripping big blobs ```shell java -jar bfg.jar --strip-blobs-bigger-than 100M some-big-repo.git ``` becomes ```shell git filter-repo --strip-blobs-bigger-than 100M ``` ### Deleting files ```shell java -jar bfg.jar --delete-files id_{dsa,rsa} my-repo.git ``` becomes ```shell git filter-repo --use-base-name --path id_dsa --path id_rsa --invert-paths ``` ### Removing sensitive content ```shell java -jar bfg.jar --replace-text passwords.txt my-repo.git ``` becomes ```shell git filter-repo --replace-text passwords.txt ``` The `--replace-text` was a really clever idea that the BFG came up with and I just implemented mostly as-is within filter-repo. Sadly, BFG didn't document the format of files passed to --replace text very well, but I added more detail in the filter-repo documentation. There is one small but important difference between the two tools: if you use both "regex:" and "==>" on a single line to specify a regex search and replace, then filter-repo will use "\1", "\2", "\3", etc. for replacement strings whereas BFG used "$1", "$2", "$3", etc. 
The reason for this difference is simply that python used backslashes in its regex format while scala used dollar signs, and both tools wanted to just pass along the strings unmodified to the underlying language. (Since bfg-ish attempts to emulate the BFG, it accepts "$1", "$2" and so forth and translates them to "\1", "\2", etc. so that filter-repo/python will understand it.) ### Removing files and folders with a certain name ```shell java -jar bfg.jar --delete-folders .git --delete-files .git --no-blob-protection my-repo.git ``` becomes ```shell git filter-repo --invert-paths --path-glob '*/.git' --path .git ``` Yes, that glob will handle .git directories one or more directories deep; it's a git-style glob rather than a shell-style glob. Also, the `--path .git` was added because `--path-glob '*/.git'` won't match a directory named .git in the toplevel directory since it has a '/' character in the glob expression (though I would hope the repository doesn't have a tracked .git toplevel directory in its history). git-filter-repo-2.45.0/Documentation/converting-from-filter-branch.md000066400000000000000000000256311464611705400256220ustar00rootroot00000000000000# Cheat Sheet: Converting from filter-branch This document is aimed at folks who are familiar with filter-branch and want to learn how to convert over to using filter-repo. 
## Table of Contents * [Half-hearted conversions](#half-hearted-conversions) * [Intention of "equivalent" commands](#intention-of-equivalent-commands) * [Basic Differences](#basic-differences) * [Cheat Sheet: Conversion of Examples from the filter-branch manpage](#cheat-sheet-conversion-of-examples-from-the-filter-branch-manpage) * [Cheat Sheet: Additional conversion examples](#cheat-sheet-additional-conversion-examples) ## Half-hearted conversions You can switch nearly any `git filter-branch` command to use filter-repo under the covers by just replacing the `git filter-branch` part of the command with [`filter-lamely`](../contrib/filter-repo-demos/filter-lamely). The git.git regression testsuite passes when I swap out the filter-branch script with filter-lamely, for example. (However, the filter-branch tests are not very comprehensive, so don't rely on that too much.) Doing a half-hearted conversion has nearly all of the drawbacks of filter-branch and nearly none of the benefits of filter-repo, but it will make your command run a few times faster and makes for a very simple conversion. You'll get a lot more performance, safety, and features by just switching to direct filter-repo commands. ## Intention of "equivalent" commands filter-branch and filter-repo have different defaults, as highlighted in the Basic Differences section below. As such, getting a command which behaves identically is not possible. Also, sometimes the filter-branch manpage lies, e.g. it says "suppose you want to...from all commits" and then uses a command line like "git filter-branch ... HEAD", which only operates on commits in the current branch rather than on all commits. Rather than focusing on matching filter-branch output as exactly as possible, I treat the filter-branch examples as idiomatic ways to solve a certain type of problem with filter-branch, and express how one would idiomatically solve the same problem in filter-repo. 
Sometimes that means the results are not identical, but they are largely the same in each case. ## Basic Differences With `git filter-branch`, you have a git repository where every single commit (within the branches or revisions you specify) is checked out and then you run one or more shell commands to transform the working copy into your desired end state. With `git filter-repo`, you are essentially given an editing tool to operate on the [fast-export](https://git-scm.com/docs/git-fast-export) serialization of a repo. That means there is an input stream of all the contents of the repository, and rather than specifying filters in the form of commands to run, you usually employ a number of common pre-defined filters that provide various ways to slice, dice, or modify the repo based on its components (such as pathnames, file content, user names or emails, etc.) That makes common operations easier, even if it's not as versatile as shell callbacks. For cases where more complexity or special casing is needed, filter-repo provides python callbacks that can operate on the data structures populated from the fast-export stream to do just about anything you want. filter-branch defaults to working on a subset of the repository, and requires you to specify a branch or branches, meaning you need to specify `-- --all` to modify all commits. filter-repo by contrast defaults to rewriting everything, and you need to specify `--refs ` if you want to limit to just a certain set of branches or range of commits. (Though any `` that begin with a hyphen are not accepted by filter-repo as they look like the start of different options.) filter-repo also takes care of additional concerns automatically, like rewriting commit messages that reference old commit IDs to instead reference the rewritten commit IDs, pruning commits which do not start empty but become empty due to the specified filters, and automatically shrinking and gc'ing the repo at the end of the filtering operation. 
## Cheat Sheet: Conversion of Examples from the filter-branch manpage ### Removing a file The filter-branch manual provided three different examples of removing a single file, based on different levels of ease vs. carefulness and performance: ```shell git filter-branch --tree-filter 'rm filename' HEAD ``` ```shell git filter-branch --tree-filter 'rm -f filename' HEAD ``` ```shell git filter-branch --index-filter 'git rm --cached --ignore-unmatch filename' HEAD ``` All of these just become ```shell git filter-repo --invert-paths --path filename ``` ### Extracting a subdirectory Extracting a subdirectory via ```shell git filter-branch --subdirectory-filter foodir -- --all ``` is one of the easiest commands to convert; it just becomes ```shell git filter-repo --subdirectory-filter foodir ``` ### Moving the whole tree into a subdirectory Keeping all files but placing them in a new subdirectory via ```shell git filter-branch --index-filter \ 'git ls-files -s | sed "s-\t\"*-&newsubdir/-" | GIT_INDEX_FILE=$GIT_INDEX_FILE.new \ git update-index --index-info && mv "$GIT_INDEX_FILE.new" "$GIT_INDEX_FILE"' HEAD ``` (which happens to be GNU-specific and will fail with BSD userland in very subtle ways) becomes ```shell git filter-repo --to-subdirectory-filter newsubdir ``` (which works fine regardless of GNU vs BSD userland differences.) ### Re-grafting history The filter-branch manual provided one example with three different commands that could be used to achieve it, though the first of them had limited applicability (only when the repo had a single initial commit). 
These three examples were: ```shell git filter-branch --parent-filter 'sed "s/^\$/-p <graft-id>/"' HEAD ``` ```shell git filter-branch --parent-filter \ 'test $GIT_COMMIT = <commit-id> && echo "-p <graft-id>" || cat' HEAD ``` ```shell git replace --graft $commit-id $graft-id git filter-branch $graft-id..HEAD ``` git-replace did not exist when the original two examples were written, but it is clear that the last example is far easier to understand. As such, filter-repo just uses the same mechanism: ```shell git replace --graft $commit-id $graft-id git filter-repo --force ``` NOTE: --force should usually be avoided unless you have taken care to make sure you have a backup (or are running on a fresh clone of) your repo. It is needed in this case because filter-repo errors out when no arguments are specified, and because it usually first checks whether you are in a fresh clone before irrecoverably rewriting your repository (git-replace created a new graft and thus added something to your previously fresh clone). ### Removing commits by a certain author WARNING: This is a BAD example for BOTH filter-branch and filter-repo. It does not remove the changes the user made from the repo, it just removes the commit in question while smashing the changes from it into any subsequent commits as though the subsequent authors had been responsible for those changes as well. `git rebase` is likely to be a better fit for what you really want if you are looking at this example. 
(See also [this explanation of the differences between rebase and filter-repo](https://github.com/newren/git-filter-repo/issues/62#issuecomment-597725502)) This filter-branch example ```shell git filter-branch --commit-filter ' if [ "$GIT_AUTHOR_NAME" = "Darl McBribe" ]; then skip_commit "$@"; else git commit-tree "$@"; fi' HEAD ``` becomes ```shell git filter-repo --commit-callback ' if commit.author_name == b"Darl McBribe": commit.skip() ' ``` ### Rewriting commit messages -- removing text Removing git-svn-id: lines from commit messages via ```shell git filter-branch --msg-filter ' sed -e "/^git-svn-id:/d" ' ``` becomes ```shell git filter-repo --message-callback ' return re.sub(b"^git-svn-id:.*\n", b"", message, flags=re.MULTILINE) ' ``` ### Rewriting commit messages -- adding text Adding Acked-by lines to the last ten commits via ```shell git filter-branch --msg-filter ' cat && echo "Acked-by: Bugs Bunny <bunny@bugzilla.org>" ' master~10..master ``` becomes ```shell git filter-repo --message-callback ' return message + b"Acked-by: Bugs Bunny <bunny@bugzilla.org>\n" ' --refs master~10..master ``` ### Changing author/committer(/tagger?) information ```shell git filter-branch --env-filter ' if test "$GIT_AUTHOR_EMAIL" = "root@localhost" then GIT_AUTHOR_EMAIL=john@example.com fi if test "$GIT_COMMITTER_EMAIL" = "root@localhost" then GIT_COMMITTER_EMAIL=john@example.com fi ' -- --all ``` becomes either ```shell # Ensure '<john@example.com> <root@localhost>' is a line in .mailmap, then: git filter-repo --use-mailmap ``` or ```shell git filter-repo --email-callback ' return email if email != b"root@localhost" else b"john@example.com" ' ``` (and as a bonus both filter-repo alternatives will fix tagger emails too, unlike the filter-branch example) ### Restricting to a range The partial examples ```shell git filter-branch ... C..H ``` ```shell git filter-branch ... C..H ^D ``` ```shell git filter-branch ... D..H ^C ``` become ```shell git filter-repo ... --refs C..H ``` ```shell git filter-repo ... 
--refs C..H ^D ``` ```shell git filter-repo ... --refs D..H ^C ``` Note that filter-branch accepts `--not` among the revision specifiers, but that appears to python to be a flag name which breaks parsing. So, instead of e.g. `--not C` as we might use with filter-branch, we can specify `^C` to filter-repo. ## Cheat Sheet: Additional conversion examples ### Running a code formatter or linter on each file with some extension Running some program on a subset of files is relatively natural in filter-branch: ```shell git filter-branch --tree-filter ' git ls-files -z "*.c" \ | xargs -0 -n 1 clang-format -style=file -i ' ``` filter-repo decided not to provide a way to run an external program to do filtering, because most filter-branch uses of this ability are riddled with [safety problems](https://git-scm.com/docs/git-filter-branch#SAFETY) and [performance issues](https://git-scm.com/docs/git-filter-branch#PERFORMANCE). However, in special cases like this it's fairly safe. One can write a script that uses filter-repo as a library to achieve this, while also gaining filter-repo's automatic handling of other concerns like rewriting commit IDs in commit messages or pruning commits that become empty. In fact, one of the [contrib demos](../contrib/filter-repo-demos), [lint-history](../contrib/filter-repo-demos/lint-history), handles this exact type of situation already: ```shell lint-history --relevant 'return filename.endswith(b".c")' \ clang-format -style=file -i ``` git-filter-repo-2.45.0/Documentation/git-filter-repo.txt000066400000000000000000001677031464611705400232240ustar00rootroot00000000000000// This file is NOT the documentation; it's the *source code* for it. 
// Please follow the "user manual" link under // https://github.com/newren/git-filter-repo#how-do-i-use-it // to access the actual documentation, or view another site that // has compiled versions available, such as: // https://www.mankier.com/1/git-filter-repo git-filter-repo(1) ================== NAME ---- git-filter-repo - Rewrite repository history SYNOPSIS -------- [verse] 'git filter-repo' --analyze 'git filter-repo' [] [] [] [] [] [] [] [] DESCRIPTION ----------- Rapidly rewrite entire repository history using user-specified filters. This is a destructive operation which should not be used lightly; it writes new commits, trees, tags, and blobs corresponding to (but filtered from) the original objects in the repository, then deletes the original history and leaves only the new. See <> for more details on the ramifications of using this tool. Several different types of history rewrites are possible; examples include (but are not limited to): * stripping large files (or large directories or large extensions) * stripping unwanted files by path * extracting wanted paths and their history (stripping everything else) * restructuring the file layout (such as moving all files into a subdirectory in preparation for merging with another repo, making a subdirectory become the new toplevel directory, or merging two directories with independent filenames into one directory) * renaming tags (also often in preparation for merging with another repo) * replacing or removing sensitive text such as passwords * making mailmap rewriting of user names or emails permanent * making grafts or replacement refs permanent * rewriting commit messages Additionally, several concerns are handled automatically (many of these can be overridden, but they are all on by default): * rewriting (possibly abbreviated) hashes in commit messages to refer to the new post-rewrite commit hashes * pruning commits which become empty due to the above filters (also handles edge cases like pruning of merge 
commits which become degenerate and empty) * stripping of original history to avoid mixing old and new history * repacking the repository post-rewrite to shrink the repo for the user And additional facilities are available via a config option * creating replace-refs (see linkgit:git-replace[1]) for old commit hashes, which if manually pushed and fetched will allow users to continue to refer to new commits using (unabbreviated) old commit IDs Also, it's worth noting that there is an important safety mechanism: * abort if run from a repo that is not a fresh clone (to prevent accidental data loss from rewriting local history that doesn't exist anywhere else). See <>. For those who know that there is large unwanted stuff in their history and want help finding it, this command also * provides an option to analyze a repository and generate reports that can be useful in determining what to filter (or in determining whether a separate filtering command was successful). See also <>, <>, <>, and <>. OPTIONS ------- Analysis Options ~~~~~~~~~~~~~~~~ --analyze:: Analyze repository history and create a report that may be useful in determining what to filter in a subsequent run (or in determining if a previous filtering command did what you wanted). Will not modify your repo. Filtering based on paths (see also --filename-callback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ These options specify the paths to select. Note that much like git itself, renames are NOT followed so you may need to specify multiple paths, e.g. `--path olddir/ --path newdir/` --invert-paths:: Invert the selection of files from the specified --path-{match,glob,regex} options below, i.e. only select files matching none of those options. --path-match :: --path :: Exact paths (files or directories) to include in filtered history. Multiple --path options can be specified to get a union of paths. --path-glob :: Glob of paths to include in filtered history. 
Multiple --path-glob options can be specified to get a union of paths. --path-regex :: Regex of paths to include in filtered history. Multiple --path-regex options can be specified to get a union of paths. --use-base-name:: Match on file base name instead of full path from the top of the repo. Incompatible with --path-rename, and incompatible with matching against directory names. Renaming based on paths (see also --filename-callback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Note: if you combine path filtering with path renaming, be aware that a rename directive does not select paths, it only says how to rename paths that are selected with the filters. --path-rename :: --path-rename-match :: Path to rename; if filename or directory matches rename to . Multiple --path-rename options can be specified. Path shortcuts ~~~~~~~~~~~~~~ --paths-from-file :: Specify several path filtering and renaming directives, one per line. Lines with `==>` in them specify path renames, and lines can begin with `literal:` (the default), `glob:`, or `regex:` to specify different matching styles. Blank lines and lines starting with a `#` are ignored (if you have a filename that you want to filter on that starts with `literal:`, `#`, `glob:`, or `regex:`, then prefix the line with 'literal:'). --subdirectory-filter :: Only look at history that touches the given subdirectory and treat that directory as the project root. Equivalent to using `--path / --path-rename /:` --to-subdirectory-filter :: Treat the project root as instead being under . Equivalent to using `--path-rename :/` Content editing filters (see also --blob-callback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --replace-text :: A file with expressions that, if found, will be replaced. By default, each expression is treated as literal text, but `regex:` and `glob:` prefixes are supported. 
You can end the line with `==>` and some replacement text to choose a replacement choice other than the default of `***REMOVED***`. --strip-blobs-bigger-than :: Strip blobs (files) bigger than specified size (e.g. `5M`, `2G`, etc) --strip-blobs-with-ids :: Read git object ids from each line of the given file, and strip all of them from history Renaming of refs (see also --refname-callback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --tag-rename :: Rename tags starting with to start with . For example, --tag-rename foo:bar will rename tag foo-1.2.3 to bar-1.2.3; either or can be empty. Filtering of commit messages (see also --message-callback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --replace-message :: A file with expressions that, if found in commit or tag messages, will be replaced. This file uses the same syntax as --replace-text. --preserve-commit-hashes:: By default, since commits are rewritten and thus gain new hashes, references to old commit hashes in commit messages are replaced with new commit hashes (abbreviated to the same length as the old reference). Use this flag to turn off updating commit hashes in commit messages. --preserve-commit-encoding:: Do not reencode commit messages into UTF-8. By default, if the commit object specifies an encoding for the commit message, the message is re-encoded into UTF-8. Filtering of names & emails (see also --name-callback and --email-callback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --mailmap :: Use specified mailmap file (see linkgit:git-shortlog[1] for details on the format) when rewriting author, committer, and tagger names and emails. If the specified file is part of git history, historical versions of the file will be ignored; only the current contents are consulted. 
--use-mailmap:: Same as: '--mailmap .mailmap' Parent rewriting ~~~~~~~~~~~~~~~~ --replace-refs {delete-no-add, delete-and-add, update-no-add, update-or-add, update-and-add, old-default}:: How to handle replace refs (see git-replace(1)). Replace refs can be added during the history rewrite as a way to allow users to pass old commit IDs (from before git-filter-repo was run) to git commands and have git know how to translate those old commit IDs to the new (post-rewrite) commit IDs. Also, replace refs that existed before the rewrite can either be deleted or updated. The choices to pass to --replace-refs thus need to specify both what to do with existing refs and what to do with commit rewrites. Thus 'update-and-add' means to update existing replace refs, and for any commit rewrite (even if already pointed at by a replace ref) add a new refs/replace/ reference to map from the old commit ID to the new commit ID. The default is update-no-add, meaning update existing replace refs but do not add any new ones. There is also a special 'old-default' option for picking the default used in versions prior to git-filter-repo-2.45, namely 'update-and-add' upon the first run of git-filter-repo in a repository and 'update-or-add' if running git-filter-repo again on a repository. --prune-empty {always, auto, never}:: Whether to prune empty commits. 'auto' (the default) means only prune commits which become empty (not commits which were empty in the original repo, unless their parent was pruned). When the parent of a commit is pruned, the first non-pruned ancestor becomes the new parent. --prune-degenerate {always, auto, never}:: Since merge commits are needed for history topology, they are typically exempt from pruning. However, they can become degenerate with the pruning of other commits (having fewer than two parents, having one commit serve as both parents, or having one parent as the ancestor of the other.) If such merge commits have no file changes, they can be pruned. 
The default ('auto') is to only prune empty merge commits which become degenerate (not which started as such). --no-ff:: Even if the first parent is or becomes an ancestor of another parent, do not prune it. This modifies how --prune-degenerate behaves, and may be useful in projects who always use merge --no-ff. Generic callback code snippets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --filename-callback :: Python code body for processing filenames; see <>. --message-callback :: Python code body for processing messages (both commit messages and tag messages); see <>. --name-callback :: Python code body for processing names of people; see <>. --email-callback :: Python code body for processing emails addresses; see <>. --refname-callback :: Python code body for processing refnames; see <>. --blob-callback :: Python code body for processing blob objects; see <>. --commit-callback :: Python code body for processing commit objects; see <>. --tag-callback :: Python code body for processing tag objects; see <>. --reset-callback :: Python code body for processing reset objects; see <>. Location to filter from/to ~~~~~~~~~~~~~~~~~~~~~~~~~~ NOTE: Specifying alternate source or target locations implies --partial. However, unlike normal uses of --partial, this doesn't risk mixing old and new history since the old and new histories are in different repositories. --source :: Git repository to read from --target :: Git repository to overwrite with filtered history Miscellaneous options ~~~~~~~~~~~~~~~~~~~~~ --help:: -h:: Show a help message and exit. --force:: -f:: Ignore fresh clone checks and rewrite history (an irreversible operation, especially since it by default ends with an immediate pruning of reflogs and old objects). See <>. Note that when cloning repos on a local filesystem, it is better to pass `--no-local` to git clone than passing `--force` to git-filter-repo. --partial:: Do a partial history rewrite, resulting in the mixture of old and new history. 
This disables rewriting refs/remotes/origin/* to refs/heads/*, disables removing of the 'origin' remote, disables removing unexported refs, disables expiring the reflog, and disables the automatic post-filter gc. Also, this modifies --tag-rename and --refname-callback options such that instead of replacing old refs with new refnames, it will instead create new refs and keep the old ones around. Use with caution. --refs :: Limit history rewriting to the specified refs. Implies --partial. In addition to the normal caveats of --partial (mixing old and new history, no automatic remapping of refs/remotes/origin/* to refs/heads/*, etc.), this also may cause problems for pruning of degenerate empty merge commits when negative revisions are specified. --dry-run:: Do not change the repository. Run `git fast-export` and filter its output, and save both the original and the filtered version for comparison. This also disables rewriting commit messages due to not knowing new commit IDs and disables filtering of some empty commits due to inability to query the fast-import backend. --debug:: Print additional information about operations being performed and commands being run. (If used together with --dry-run, shows extra information about what would be run). --stdin:: Instead of running `git fast-export` and filtering its output, filter the fast-export stream from stdin. The stdin must be in the expected input format (e.g. it needs to include original-oid directives). --quiet:: Pass --quiet to other git commands called. OUTPUT ------ Every time filter-repo is run, files are created in the `.git/filter-repo/` directory. These files are overwritten unconditionally on every run. Commit map ~~~~~~~~~~ The `.git/filter-repo/commit-map` file contains a mapping of how all commits were (or were not) changed. 
* A header is the first line with the text "old" and "new" * Commit mappings are in no particular order * All commits in range of the rewrite will be listed, even commits that are unchanged (e.g. because the commit pre-dated when the large file(s) were introduced to the repo). * An all-zeros hash, or null SHA, represents a non-existent object. When in the "new" column, this means the commit was removed entirely. Reference map ~~~~~~~~~~~~~ The `.git/filter-repo/ref-map` file contains a mapping of which local references were changed. * A header is the first line with the text "old", "new" and "ref" * Reference mappings are in no particular order * An all-zeros hash, or null SHA, represents a non-existent object. When in the "new" column, this means the ref was removed entirely. [[FRESHCLONE]] FRESH CLONE SAFETY CHECK AND --FORCE ------------------------------------ Since filter-repo does irreversible rewriting of history, it is important to avoid making changes to a repo for which the user doesn't have a good backup. The primary defense mechanism is to simply educate users and rely on them to be good stewards of their data; thus there are several warnings in the documentation about how filter repo rewrites history. However, as a service to users, we would like to provide an additional safety check beyond the documentation. There isn't a good way to check if the user has a good backup, but we can ask a related question that is an imperfect but quite reasonable proxy: "Is this repository a fresh clone?" Unfortunately, that is also a question we can't get a perfect answer to; git provides no way to answer that question. However, there are approximately a dozen things that I found that seem to always be true of brand new clones (assuming they are either clones of remote repositories or are made with the `--no-local` flag), and I check for all of those. These checks can have both false positives and false negatives. 
Someone might have a perfectly good backup of their repo without it actually being a fresh clone -- but there's no way for filter-repo to know that. Conversely, someone could look at all things that filter-repo checks for in its safety checks and then just tweak their non-backed-up repository to satisfy those conditions (though it would take a fair amount of effort, and it's astronomically unlikely that a repo that isn't a fresh clone randomly happens to match all the criteria). In practice, the safety checks filter-repo uses seem to be really good at avoiding people accidentally running filter-repo on a repository that they shouldn't be running it on. It even caught me once when I did mean to run filter-repo but was in a different directory than I thought I was. In short, it's perfectly fine to use `--force` to override the safety checks as long as you're okay with filter-repo irreversibly rewriting the contents of the current repository. It is a really bad idea to get in the habit of always specifying `--force`; if you do, one day you will run one of your commands in the wrong directory like I did, and you won't have the safety check anymore to bail you out. Also, it is definitely NOT okay to recommend `--force` on forums, Q&A sites, or in emails to other users without first carefully explaining that `--force` means putting your repositories' data at risk. I am especially bothered by people who suggest the flag when it clearly is NOT needed; they are needlessly putting other peoples' data at risk. [[VERSATILITY]] VERSATILITY ----------- filter-repo has a hierarchy of capabilities on the spectrum from easy to use convenience flags that perform pre-defined types of filtering, to choices that provide lots of flexibility in controlling how filtering occurs. This spectrum includes the following: * Convenience flags making common types of history rewriting simple (e.g. 
--path, --strip-blobs-bigger-than, --replace-text, --mailmap) * Options which are shorthand for others or which provide greater control than others (e.g. --subdirectory-filter could just be written using both a path selection (--path) and a path rename (--path-rename) filter; --paths-from-file can handle all other --path* options and more such as regex renaming of paths) * Generic python callbacks for handling a certain type of data (the filename, message, name, email, and refname callbacks) * Generic python callbacks for handling fundamental git objects, allowing greater control over the combination of data types the object holds (the commit, tag, blob, and reset callbacks) * The ability to import filter-repo as a module in a python program and use its classes and functions for even greater control and flexibility while still leveraging lots of basic capabilities. One can even use this to write new tools with a completely different interface. For more information about callbacks, see <>. For examples on writing python programs that import filter-repo as a module to create new history rewriting tools, look at the contrib/filter-repo-demos/ directory. That directory includes, among other examples, a reimplementation of git-filter-branch which is faster than git-filter-branch, and a reimplementation of BFG Repo Cleaner with several bug fixes and new features. [[DISCUSSION]] DISCUSSION ---------- Using filter-repo is relatively simple, but rewriting history is part of a larger discussion in terms of collaboration. When you rewrite history, the old and new histories are no longer compatible; if you push this history somewhere for others to view, it will look as though you've done a rebase of all branches and tags. Make sure you are familiar with the "RECOVERING FROM UPSTREAM REBASE" section of linkgit:git-rebase[1] (and in particular, "The hard case") before proceeding, in addition to this section. 
Steps to use git-filter-repo as part of the bigger picture of doing a history rewrite are roughly as follows: 1. Create a clone of your repository (if you created special refs outside of refs/heads/ or refs/tags/, make sure to fetch those too). You may pass `--bare` or `--mirror` to `git clone`, if you prefer. You should pass `--no-local` if the repository you are cloning from is on the local filesystem. Avoid other flags; some might confuse the fresh clone check, and others could cause parts of the data to be missing that are needed for the rewrite. 2. (Optional) Run `git filter-repo --analyze`. This will create a directory of reports mentioning renames that have occurred in your repo and also listing sizes of objects aggregated by path/directory/extension/blob-id; this information may be useful in choosing how to filter your repo. It can also be useful to re-run --analyze after filtering to verify the changes look correct. 3. Run filter-repo with your desired filtering options. Many examples are given below. For more complex cases, note that doing the filtering in multiple steps (by running multiple filter-repo invocations in a sequence) is supported. If anything goes wrong here, simply delete your clone and restart. 4. Push your new repository to its new home (note that refs/remotes/origin/* will have been moved to refs/heads/* as the first part of filter-repo, so you can just deal with normal branches instead of remote tracking branches). While you can force push this to the same URL you cloned from, there are good reasons to consider pushing to a different location instead: * People who cloned from the original repo will have old history. 
When they fetch the new history you force pushed up, unless they do a `git reset --hard @{u}` on their branches or rebase their local work, git will think they have hundreds or thousands of commits with very similar commit messages as what exist upstream (but which include files you wanted excised from history), and allow the user to merge the two histories, resulting in what looks like two copies of each commit. If they then push this history back up, then everyone now has history with two copies of each commit and the bad files have returned. You're more likely to succeed in forcing people to get rid of the old history if they have to clone a new URL. * Rewriting history will rewrite tags; those who have already downloaded tags will not get the updated tags by default (see the "On Re-tagging" section of linkgit:git-tag[1]). Every user trying to use an existing clone will have to forcibly delete all tags and re-fetch them; it may be easier for them to just re-clone, which they are more likely to do with a new clone URL. * Rewriting history may delete some refs (e.g. branches that only had files that you wanted excised from history); unless you run git push with the `--mirror` or `--prune` options, those refs will continue to exist on the server. If folks then merge these branches into others, then people have started mixing old and new history. If users had already cloned these branches, removing them from the server isn't enough; you need all users to delete any local branches based on these refs and run fetch with the `--prune` option as well. Simply re-cloning from a new URL is easier. * The server may not allow you to force push over some refs. For example, code review systems may have special ref namespaces (e.g. refs/changes/, refs/pull/, refs/merge-requests/) that they have locked down. 5. 
If you still want to push your rewritten history back to the original url despite my warnings above, you'll have to manage it very carefully: * git-filter-repo deletes the "origin" remote to help avoid people accidentally repushing to the same repository, so you'll need to remind git what origin's url was. You'll have to look up the command for that. * You'll need to carefully synchronize with *everyone* who has cloned the repository, and will also need to carefully synchronize with *everything* (e.g. CI systems) that has cloned it. Every single clone will either need to be thrown away and re-cloned, or need to take all the steps outlined in item 4 as well as follow the necessary steps from "RECOVERING FROM UPSTREAM REBASE" section of linkgit:git-rebase[1]. If you miss fixing any clones, you'll risk mixing old and new history and end up with an even worse mess to clean up. * Finally, you'll need to consult any documentation from your hosting provider about how to remove any server-side references to the old commits (example: https://docs.gitlab.com/ee/user/project/repository/reducing_the_repo_size_using_git.html[GitLab's excellent docs on reducing repository size], or https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/removing-sensitive-data-from-a-repository#fully-removing-the-data-from-github[the first and second steps under "Fully removing the data from GitHub"]). 6. (Optional) Some additional considerations * filter-repo has a --replace-refs option to allow creating replace refs (see linkgit:git-replace[1]) for each rewritten commit ID, allowing you to use old (unabbreviated) commit hashes in the git command line to refer to the newly rewritten commits. If you want to use these replace refs, manually push them to the relevant clone URL and tell users to manually fetch them (e.g. by adjusting their fetch refspec, `git config --add remote.origin.fetch +refs/replace/*:refs/replace/*`). 
Sadly, replace refs are not yet widely understood; projects like jgit and libgit2 do not support them and existing repository managers (e.g. Gerrit, GitHub, GitLab) do not yet understand replace refs. Thus one can't use old commit hashes within the UI of these other systems. This may change in the future, but replace refs at least help users locally within the git command line interface. Also, be aware that commit-graphs are excessively cautious around replace refs and just turn off entirely if any are present, so after enough time has passed that old commit IDs become less relevant, users may want to locally delete the replace refs to regain the speedups from commit-graphs. * If you have a central repo, you may want to prevent people from pushing old commit IDs, in order to avoid mixing old and new history. Every repository manager does this differently, some provide specialized commands (e.g. https://gerrit-review.googlesource.com/Documentation/cmd-ban-commit.html), others require you to write hooks. [[EXAMPLES]] EXAMPLES -------- Path based filtering ~~~~~~~~~~~~~~~~~~~~ To only keep the 'README.md' file plus the directories 'guides' and 'tools/releases/': -------------------------------------------------- git filter-repo --path README.md --path guides/ --path tools/releases -------------------------------------------------- Directory names can be given with or without a trailing slash, and all filenames are relative to the toplevel of the repo. To keep all files except these paths, just add `--invert-paths`: -------------------------------------------------- git filter-repo --path README.md --path guides/ --path tools/releases --invert-paths -------------------------------------------------- If you want to have both an inclusion filter and an exclusion filter, just run filter-repo multiple times. 
For example, to keep the src/main subdirectory but exclude files under src/main named 'data', run: -------------------------------------------------- git filter-repo --path src/main/ git filter-repo --path-glob 'src/*/data' --invert-paths -------------------------------------------------- Note that the asterisk (`*`) will match across multiple directories, so the second command would remove e.g. src/main/org/whatever/data. Also, the second command by itself would also remove e.g. src/not-main/foo/data, but since src/not-main/ was removed by the first command, that's not an issue. Also, the use of quotes around the asterisk is sometimes important to avoid glob expansion by the shell. You can also select paths by regular expression (see https://docs.python.org/3/library/re.html#regular-expression-syntax). For example, to only include files from the repo whose name is in the format YYYY-MM-DD.txt and is found at least two subdirectories deep: -------------------------------------------------- git filter-repo --path-regex '^.*/.*/[0-9]{4}-[0-9]{2}-[0-9]{2}.txt$' -------------------------------------------------- If you want two directories to be renamed (and maybe merged if both are renamed to the same location), use --path-rename; for example, to rename both 'cmds/' and 'src/scripts/' to 'tools/': -------------------------------------------------- git filter-repo --path-rename cmds:tools --path-rename src/scripts/:tools/ -------------------------------------------------- As with `--path`, directories can be specified with or without a trailing slash for `--path-rename`. If you do a `--path-rename` to something that was already in use, it will be silently overwritten. However, if you try to rename multiple files to the same location (e.g. src/scripts/run_release.sh and cmds/run_release.sh both existed and had different content with the renames above), then you will be given an error. 
If you have such a case, you may want to add another rename command to move one of the paths somewhere else where it won't collide: -------------------------------------------------- git filter-repo --path-rename cmds/run_release.sh:tools/do_release.sh \ --path-rename cmds/:tools/ \ --path-rename src/scripts/:tools/ -------------------------------------------------- Also, `--path-rename` brings up ordering issues; all path arguments are applied in order. Thus, a command like -------------------------------------------------- git filter-repo --path-rename sources/:src/main/ --path src/main/ -------------------------------------------------- would make sense but reversing the two arguments would not (src/main/ is created by the rename so reversing the two would give you an empty repo). Also, note that the rename of cmds/run_release.sh a couple examples ago was done before the other renames. Note that path renaming does not do path filtering, thus the following command -------------------------------------------------- git filter-repo --path src/main/ --path-rename tools/:scripts/ -------------------------------------------------- would not result in the tools or scripts directories being present, because the single filter selected only src/main/. It's likely that you would instead want to run: -------------------------------------------------- git filter-repo --path src/main/ --path tools/ --path-rename tools/:scripts/ -------------------------------------------------- If you prefer to filter based solely on basename, use the `--use-base-name` flag (though this is incompatible with `--path-rename`). 
For example, to only include README.md and Makefile files from any directory: -------------------------------------------------- git filter-repo --use-base-name --path README.md --path Makefile -------------------------------------------------- If you wanted to delete all .DS_Store files in any directory, you could either use: -------------------------------------------------- git filter-repo --invert-paths --path '.DS_Store' --use-base-name -------------------------------------------------- or -------------------------------------------------- git filter-repo --invert-paths --path-glob '*/.DS_Store' --path '.DS_Store' -------------------------------------------------- (the `--path-glob` isn't sufficient by itself as it might miss a toplevel .DS_Store file; further while something like `--path-glob '*.DS_Store'` would work around that problem it would also grab files named `foo.DS_Store` or `bar/baz.DS_Store`) Finally, see also the `--filename-callback` from <<CALLBACKS>>. Filtering based on many paths ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you have a long list of files, directories, globs, or regular expressions to filter on, you can stick them in a file and use `--paths-from-file`; for example, with a file named stuff-i-want.txt with contents of -------------------------------------------------- # Blank lines and comment lines are ignored. 
# Examples similar to --path: README.md guides/ tools/releases # An example that is like --path-glob: glob:*.py # An example that is like --path-regex: regex:^.*/.*/[0-9]{4}-[0-9]{2}-[0-9]{2}.txt$ # An example of renaming a path tools/==>scripts/ # An example of using a regex to rename a path regex:(.*)/([^/]*)/([^/]*)\.text$==>\2/\1/\3.txt -------------------------------------------------- then you could run -------------------------------------------------- git filter-repo --paths-from-file stuff-i-want.txt -------------------------------------------------- to get a repo containing only the toplevel README.md file, the guides/ and tools/releases/ directories, all python files, files whose name was of the form YYYY-MM-DD.txt at least two subdirectories deep, and would rename tools/ to scripts/ and rename files like foo/bar/baz.text to bar/foo/baz.txt. Note the special line prefixes of `glob:` and `regex:` and the special string `==>` denoting renames. Sometimes you have a way of easily generating all the files you want. For example, if you know that none of the currently tracked files have any newlines or special characters in them (see core.quotePath from `git config --help`) so that `git ls-files` would print all files literally one per line, and you knew that you wanted to keep only the files that are currently tracked (thus deleting from all commits in history any files that only appear on other branches or that only appear in older commits), then you could use a pair of commands such as -------------------------------------------------- git ls-files >../paths-i-want.txt git filter-repo --paths-from-file ../paths-i-want.txt -------------------------------------------------- Similarly, you could use --paths-from-file to delete many files. 
For example, you could run `git filter-repo --analyze` to get reports, look in one such as .git/filter-repo/analysis/path-deleted-sizes.txt and copy all the filenames into a file such as /tmp/files-i-dont-want-anymore.txt and then run -------------------------------------------------- git filter-repo --invert-paths --paths-from-file /tmp/files-i-dont-want-anymore.txt -------------------------------------------------- to delete them all. Directory based shortcuts ~~~~~~~~~~~~~~~~~~~~~~~~~ Let's say you had a directory structure like the following: module/ foo.c bar.c otherDir/ blah.config stuff.txt zebra.jpg If you wanted just the module/ directory and you wanted it to become the new root so that your new directory structure looked like foo.c bar.c then you could run: -------------------------------------------------- git filter-repo --subdirectory-filter module/ -------------------------------------------------- If you wanted all the files from the original repo, but wanted to move everything under a subdirectory named my-module/, so that your new directory structure looked like my-module/ module/ foo.c bar.c otherDir/ blah.config stuff.txt zebra.jpg then you would instead run -------------------------------------------------- git filter-repo --to-subdirectory-filter my-module/ -------------------------------------------------- Content based filtering ~~~~~~~~~~~~~~~~~~~~~~~ If you want to filter out all files bigger than a certain size, you can use `--strip-blobs-bigger-than` with some size (K, M, and G suffixes are recognized), e.g.: -------------------------------------------------- git filter-repo --strip-blobs-bigger-than 10M -------------------------------------------------- If you want to strip out all files with specified git object ids (hashes), list the hashes in a file and run -------------------------------------------------- git filter-repo --strip-blobs-with-ids FILE_WITH_GIT_BLOB_IDS -------------------------------------------------- If you want 
to modify file contents, you can do so based on a list of expressions in a file, one per line. For example, with a file named expressions.txt containing -------------------------------------------------- p455w0rd foo==>bar glob:*666*==> regex:\bdriver\b==>pilot literal:MM/DD/YYYY==>YYYY-MM-DD regex:([0-9]{2})/([0-9]{2})/([0-9]{4})==>\3-\1-\2 -------------------------------------------------- then running -------------------------------------------------- git filter-repo --replace-text expressions.txt -------------------------------------------------- will go through and replace `p455w0rd` with `***REMOVED***`, `foo` with `bar`, any line containing `666` with a blank line, the word `driver` with `pilot` (but not if it has letters before or after; e.g. `drivers` will be unmodified), replace the exact text `MM/DD/YYYY` with `YYYY-MM-DD` and replace date strings of the form MM/DD/YYYY with ones of the form YYYY-MM-DD. In the expressions file, there are a few things to note: * Every line has a replacement, given by whatever is on the right of `==>`. If `==>` does not appear on the line, the default replacement is `***REMOVED***`. * Lines can start with `literal:`, `glob:`, or `regex:` to specify whether to do literal string matches, globs (see https://docs.python.org/3/library/fnmatch.html), or regular expressions (see https://docs.python.org/3/library/re.html#regular-expression-syntax). If none of these are specified, `literal:` is assumed. * If multiple matches are found, all are replaced. * globs and regexes are applied to the entire file, but without any special flags turned on. Some folks may be interested in adding `(?m)` to the regex to turn on MULTILINE mode, so that `^` and `$` match the beginning and ends of lines rather than the beginning and end of file. See https://docs.python.org/3/library/re.html for details. See also the `--blob-callback` from <<CALLBACKS>>. 
Updating commit/tag messages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you want to modify commit or tag messages, you can do so with the same syntax as `--replace-text`, explained above. For example, with a file named expressions.txt containing -------------------------------------------------- foo==>bar -------------------------------------------------- then running -------------------------------------------------- git filter-repo --replace-message expressions.txt -------------------------------------------------- will replace `foo` in commit or tag messages with `bar`. See also the `--message-callback` from <<CALLBACKS>>. Refname based filtering ~~~~~~~~~~~~~~~~~~~~~~~ To rename tags, use `--tag-rename`, e.g.: -------------------------------------------------- git filter-repo --tag-rename foo:bar -------------------------------------------------- This will rename any tags starting with `foo` to now start with `bar`. Either side of the colon could be blank, e.g. -------------------------------------------------- git filter-repo --tag-rename '':'my-module-' -------------------------------------------------- For more general refname modification, see `--refname-callback` from <<CALLBACKS>>. User and email based filtering ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To modify username and emails of commits, you can create a mailmap file in the format accepted by linkgit:git-shortlog[1]. For example, if you have a file named my-mailmap you can run -------------------------------------------------- git filter-repo --mailmap my-mailmap -------------------------------------------------- and if the current contents of that file are as follows (if the specified mailmap file is version controlled, historical versions of the file are ignored): -------------------------------------------------- Name For User <email@addre.ss> New Name And <new@ema.il> <old1@ema.il> New Name And <new@ema.il> Old Name And <old2@ema.il> -------------------------------------------------- then we can update username and/or emails based on the specified mapping. 
See also the `--name-callback` and `--email-callback` from <<CALLBACKS>>. Parent rewriting ~~~~~~~~~~~~~~~~ To replace $commit_A with $commit_B (e.g. make all commits which had $commit_A as a parent instead have $commit_B for that parent), and rewrite history to make it permanent: -------------------------------------------------- git replace $commit_A $commit_B git filter-repo --force -------------------------------------------------- To create a new commit with the same contents as $commit_A except with different parent(s) and then replace $commit_A with the new commit, and rewrite history to make it permanent: -------------------------------------------------- git replace --graft $commit_A $new_parent_or_parents git filter-repo --force -------------------------------------------------- The reason to specify --force is two-fold: filter-repo will error out if no arguments are specified, and the new graft commit would otherwise trigger the not-a-fresh-clone check. Partial history rewrites ~~~~~~~~~~~~~~~~~~~~~~~~ To rewrite the history on just one branch (which may cause it to no longer share any common history with other branches), use `--refs`. For example, to remove a file named 'extraneous.txt' from the 'master' branch: -------------------------------------------------- git filter-repo --invert-paths --path extraneous.txt --refs master -------------------------------------------------- To rewrite just some recent commits: -------------------------------------------------- git filter-repo --invert-paths --path extraneous.txt --refs master~3..master -------------------------------------------------- [[CALLBACKS]] CALLBACKS --------- For flexibility, filter-repo allows you to specify functions on the command line to further filter all changes. Please note that there are some API compatibility caveats associated with these callbacks that you should be aware of before using them; see the "API BACKWARD COMPATIBILITY CAVEAT" comment near the top of git-filter-repo source code. 
All callback functions are of the same general format. For a command line argument like -------------------------------------------------- --foo-callback 'BODY' -------------------------------------------------- the following code will be compiled and called: -------------------------------------------------- def foo_callback(foo): BODY -------------------------------------------------- Thus, you just need to make sure your _BODY_ modifies and returns _foo_ appropriately. One important thing to note for all callbacks is that filter-repo uses bytestrings (see https://docs.python.org/3/library/stdtypes.html#bytes) everywhere instead of strings. There are four callbacks that allow you to operate directly on raw objects that contain data that's easy to write in linkgit:git-fast-import[1] format: -------------------------------------------------- --blob-callback --commit-callback --tag-callback --reset-callback -------------------------------------------------- We'll come back to these later because it is often the case that the other callbacks are more convenient. The other callbacks operate on a small piece of the raw objects or operate on pieces across multiple types of raw object (e.g. author names and committer names and tagger names across commits and tags, or refnames across commits, tags, and resets, or messages across commits and tags). The convenience callbacks are: -------------------------------------------------- --filename-callback --message-callback --name-callback --email-callback --refname-callback -------------------------------------------------- in each you are expected to simply return a new value based on the one passed in. 
For example, -------------------------------------------------- git-filter-repo --name-callback 'return name.replace(b"Wiliam", b"William")' -------------------------------------------------- would result in the following function being called: -------------------------------------------------- def name_callback(name): return name.replace(b"Wiliam", b"William") -------------------------------------------------- The email callback is quite similar: -------------------------------------------------- git-filter-repo --email-callback 'return email.replace(b".cm", b".com")' -------------------------------------------------- The refname callback is also similar, but note that the refname passed in and returned are expected to be fully qualified (e.g. b"refs/heads/master" instead of just b"master" and b"refs/tags/v1.0.7" instead of b"1.0.7"): -------------------------------------------------- git-filter-repo --refname-callback ' # Change e.g. refs/heads/master to refs/heads/prefix-master rdir,rpath = os.path.split(refname) return rdir + b"/prefix-" + rpath' -------------------------------------------------- The message callback is quite similar to the previous three callbacks, though it operates on a bytestring that is likely more than one line: -------------------------------------------------- git-filter-repo --message-callback ' if b"Signed-off-by:" not in message: message += b"\nSigned-off-by: Me My <self@and.i>" return re.sub(b"[Ee]-?[Mm][Aa][Ii][Ll]", b"email", message)' -------------------------------------------------- The filename callback is slightly more interesting. Returning None means the file should be removed from all commits, returning the filename unmodified marks the file to be kept, and returning a different name means the file should be renamed. 
An example: -------------------------------------------------- git-filter-repo --filename-callback ' if b"/src/" in filename: # Remove all files with a directory named "src" in their path # (except when "src" appears at the toplevel). return None elif filename.startswith(b"tools/"): # Rename tools/ -> scripts/misc/ return b"scripts/misc/" + filename[6:] else: # Keep the filename and do not rename it return filename ' -------------------------------------------------- In contrast, the blob, reset, tag, and commit callbacks are not expected to return a value, but are instead expected to modify the object passed in. Major fields for these objects are (subject to API backward compatibility caveats mentioned previously): * Blob: `original_id` (original hash) and `data` * Reset: `ref` (name of reference) and `from_ref` (hash or integer mark) * Tag: `ref`, `from_ref`, `original_id`, `tagger_name`, `tagger_email`, `tagger_date`, `message` * Commit: `branch`, `original_id`, `author_name`, `author_email`, `author_date`, `committer_name`, `committer_email`, `committer_date`, `message`, `file_changes` (list of FileChange objects, each containing a `type`, `filename`, `mode`, and `blob_id`), `parents` (list of hashes or integer marks) An example of each: -------------------------------------------------- git filter-repo --blob-callback ' if len(blob.data) > 25: # Mark this blob for removal from all commits blob.skip() else: blob.data = blob.data.replace(b"Hello", b"Goodbye") ' -------------------------------------------------- -------------------------------------------------- git filter-repo --reset-callback 'reset.ref = reset.ref.replace(b"master", b"dev")' -------------------------------------------------- -------------------------------------------------- git filter-repo --tag-callback ' if tag.tagger_name == b"Jim Williams": # Omit this tag tag.skip() else: tag.message = tag.message + b"\n\nTag of %s by %s on %s" % (tag.ref, tag.tagger_email, tag.tagger_date)' 
-------------------------------------------------- -------------------------------------------------- git filter-repo --commit-callback ' # Remove executable files with three 6s in their name (including # from leading directories). # Also, undo deletion of sources/foo/bar.txt (change types are # either b"D" (deletion) or b"M" (add or modify); renames are # handled by deleting the old file and adding a new one) commit.file_changes = [ change for change in commit.file_changes if not (change.mode == b"100755" and change.filename.count(b"6") == 3) and not (change.type == b"D" and change.filename == b"sources/foo/bar.txt")] # Mark all .sh files as executable; modes in git are always one of # 100644 (normal file), 100755 (executable), 120000 (symlink), or # 160000 (submodule) for change in commit.file_changes: if change.filename.endswith(b".sh"): change.mode = b"100755" ' -------------------------------------------------- [[INTERNALS]] INTERNALS --------- You probably don't need to read this section unless you are just very curious or you are trying to do a very complex history rewrite. How filter-repo works ~~~~~~~~~~~~~~~~~~~~~ Roughly, filter-repo works by running -------------------------------------------------- git fast-export | filter | git fast-import -------------------------------------------------- where filter-repo not only launches the whole pipeline but also serves as the _filter_ in the middle. However, filter-repo does a few additional things on top in order to make it into a well-rounded filtering tool. A sequence that more accurately reflects what filter-repo runs is: 1. Verify we're in a fresh clone 2. `git fetch -u . refs/remotes/origin/*:refs/heads/*` 3. `git remote rm origin` 4. 
`git fast-export --show-original-ids --reference-excluded-parents --fake-missing-tagger --signed-tags=strip --tag-of-filtered-object=rewrite --use-done-feature --no-data --reencode=yes --mark-tags --all | filter | git -c core.ignorecase=false fast-import --date-format=raw-permissive --force --quiet` 5. `git update-ref --no-deref --stdin`, fed with a list of refs to nuke, and a list of replace refs to delete, create, or update. 6. `git reset --hard` 7. `git reflog expire --expire=now --all` 8. `git gc --prune=now` Some notes or exceptions on each of the above: 1. If we're not in a fresh clone, users will not be able to recover if they used the wrong command or ran in the wrong repo. (Though `--force` overrides this check, and it's also off if you've already run filter-repo once in this repo.) 2. Technically, we actually use a `git update-ref` command fed with a lot of input due to the fact that users can use `--force` when local branches might not match remote branches. But this fetch command catches the intent rather succinctly. 3. We don't want users accidentally pushing back to the original repo, as discussed in <<DISCUSSION>>. It also reminds users that since history has been rewritten, this repo is no longer compatible with the original. Finally, another minor benefit is this allows users to push with the `--mirror` option to their new home without accidentally sending remote tracking branches. 4. Some of these flags are always used but others are actually conditional. For example, filter-repo's `--replace-text` and `--blob-callback` options need to work on blobs so `--no-data` cannot be passed to fast-export. But when we don't need to work on blobs, passing `--no-data` speeds things up. Also, other flags may change the structure of the pipeline as well (e.g. `--dry-run` and `--debug`) 5. We use this step to write replace refs for accessing the newly written commit hashes using their previous names. 
Also, if refs were renamed by various steps, we need to delete the old refnames in order to avoid mixing old and new history. 6. Users also have old versions of files in their working tree and index; we want those cleaned up to match the rewritten history as well. Note that this step is skipped in bare repos. 7. Reflogs will hold on to old history, so we need to expire them. 8. We need to gc to avoid mixing new and old history. Also, it shrinks the repository for users, so they don't have to do extra work. (Odds are that they've only rewritten trees and commits and maybe a few blobs, so `--aggressive` isn't needed and would be too slow.) Information about these steps is printed out when `--debug` is passed to filter-repo. When doing a `--partial` history rewrite, steps 2, 3, 7, and 8 are unconditionally skipped, step 5 is skipped if `--replace-refs` is `update-no-add`, and just the nuke-unused-refs portion of step 5 is skipped if `--replace-refs` is something else. Limitations ~~~~~~~~~~~ Inherited limitations ^^^^^^^^^^^^^^^^^^^^^ Since git filter-repo calls fast-export and fast-import to do a lot of the heavy lifting, it inherits limitations from those systems: * extended commit headers, if any, are stripped * commits get rewritten meaning they will have new hashes; therefore, signatures on commits and tags cannot continue to work and instead are just removed (thus signed tags become annotated tags) * tags of commits are supported. Prior to git-2.24.0, tags of blobs and tags of tags are not supported (fast-export would die on such tags). tags of trees are not supported in any git version (since fast-export ignores tags of trees with a warning and fast-import provides no way to import them). 
* annotated and signed tags outside of the refs/tags/ namespace are not supported (their location will be mangled in weird ways) * fast-import will die on various forms of invalid input, such as a timezone with more than four digits * fast-export cannot reencode commit messages into UTF-8 if the commit message is not valid in its specified encoding (in such cases, it'll leave the commit message and the encoding header alone). * commits without an author will be given one matching the committer * tags without a tagger will be given a fake tagger * references that include commit cycles in their history (which can be created with linkgit:git-replace[1]) will not be flagged to the user as an error but will be silently deleted by fast-export as though the branch or tag contained no interesting files There are also some limitations due to the design of these systems: * Trying to insert additional files into the stream can be tricky; since fast-export only lists file changes in a merge relative to its first parent, if you insert additional files into a commit that is in the second (or third or fourth) parent history of a merge, then you also need to add it to the merge manually. (Similarly, if you change which parent is the first parent in a merge commit, you need to manually update the list of file changes to be relative to the new first parent.) * fast-export and fast-import work with exact file contents, not patches. (e.g. "Whatever the current contents of this file, update them to now have these contents") Because of this, removing the changes made in a single commit or inserting additional changes to a file in some commit and expecting them to propagate forward is not something that can be done with these tools. Use linkgit:git-rebase[1] for that. 
Intrinsic limitations ^^^^^^^^^^^^^^^^^^^^^ Some types of filtering have limitations that would affect any tool attempting to perform them; the most any tool can do is attempt to notify the user when it detects an issue: * When rewriting commit hashes in commit messages, there are a variety of cases when the hash will not be updated (whenever this happens, a note is written to `.git/filter-repo/suboptimal-issues`): ** if a commit hash does not correspond to a commit in the old repo ** if a commit hash corresponds to a commit that gets pruned ** if an abbreviated hash is not unique * Pruning of empty commits can cause a merge commit to lose an entire ancestry line and become a non-merge. If the merge commit had no changes then it can be pruned too, but if it still has changes it needs to be kept. This might cause minor confusion since the commit will likely have a commit message that makes it sound like a merge commit even though it's not. (Whenever a merge commit becomes a non-merge commit, a note is written to `.git/filter-repo/suboptimal-issues`) Issues specific to filter-repo ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * Multiple repositories in the wild have been observed which use a bogus timezone (`+051800`); google will find you some reports. The intended timezone wasn't clear or wasn't always the same. Replace with a different bogus timezone that fast-import will accept (`+0261`). * `--path-rename` can result in pathname collisions; to avoid excessive memory requirements of tracking which files are in all commits or looking up what files exist with either every commit or every usage of --path-rename, we just tell the user that they might clobber other changes if they aren't careful. We can check if the clobbering comes from another --path-rename without much overhead. (Perhaps in the future it's worth adding a slow mode to --path-rename that will do the more exhaustive checks?) 
* There is no mechanism for directly controlling which flags are passed to fast-export (or fast-import); only pre-defined flags can be turned on or off as a side-effect of other options. Direct control would make little sense because some options like `--full-tree` would require additional code in filter-repo (to parse new directives), and others such as `-M` or `-C` would break assumptions used in other places of filter-repo. * Partial-repo filtering, while supported, runs counter to filter-repo's "avoid mixing old and new history" design. This support has required improvements to core git as well (e.g. it depends upon the `--reference-excluded-parents` option to fast-export that was added specifically for this usage within filter-repo). The `--partial` and `--refs` options will continue to be supported since there are people with usecases for them; however, I am concerned that this inconsistency about mixing old and new history seems likely to lead to user mistakes. For now, I just hope that long explanations of caveats in the documentation of these options suffice to curtail any such problems. Comments on reversibility ^^^^^^^^^^^^^^^^^^^^^^^^^ Some people are interested in reversibility of a rewrite; e.g. rewrite history, possibly add some commits, then unrewrite and get the original history back plus a few new "unrewritten" commits. Obviously this is impossible if your rewrite involves throwing away information (e.g. filtering out files or replacing several different strings with `***REMOVED***`), but may be possible with some rewrites. 
filter-repo is likely to be a poor fit for this type of workflow for a few reasons: * most of the limitations inherited from fast-export and fast-import are of a type that cause reversibility issues * grafts and replace refs, if present, are used in the rewrite and made permanent * rewriting of commit hashes will probably be reversible, but it is possible for rewritten abbreviated hashes to not be unique even if the original abbreviated hashes were. * filter-repo defaults to several forms of irreversible rewriting that you may need to turn off (e.g. the last two bullet points above or reencoding commit messages into UTF-8); it's possible that additional forms of irreversible rewrites will be added in the future. * I assume that people use filter-repo for one-shot conversions, not ongoing data transfers. I explicitly reserve the right to change any API in filter-repo based on this presumption (and a comment to this effect is found in multiple places in the code and examples). You have been warned. SEE ALSO -------- linkgit:git-rebase[1], linkgit:git-filter-branch[1] GIT --- Part of the linkgit:git[1] suite git-filter-repo-2.45.0/INSTALL.md000066400000000000000000000207201464611705400162540ustar00rootroot00000000000000# Table of Contents * [Pre-requisites](#pre-requisites) * [Simple Installation](#simple-installation) * [Installation via Package Manager](#installation-via-package-manager) * [Detailed installation explanation for packagers](#detailed-installation-explanation-for-packagers) * [Installation via Makefile](#installation-via-makefile) * [Notes for Windows Users](#notes-for-windows-users) # Pre-requisites Instructions on this page assume you have already installed both [Git](https://git-scm.com) and [Python](https://www.python.org/) (though the [Notes for Windows Users](#notes-for-windows-users) has some tips on Python). 
# Simple Installation All you need to do is download one file: the [git-filter-repo script in this repository](git-filter-repo) ([direct link to raw file](https://raw.githubusercontent.com/newren/git-filter-repo/main/git-filter-repo)), making sure to preserve its name (`git-filter-repo`, with no extension). **That's it**. You're done. Then you can run any command you want, such as $ python3 git-filter-repo --analyze If you place the git-filter-repo script in your $PATH, then you can shorten commands by replacing `python3 git-filter-repo` with `git filter-repo`; the manual assumes this but you can use the longer form. If for some reason downloading a single file is too much of an installation hassle for you, or you really want some kind of "official installation", the other sections may have useful tips. Optionally, if you also want to use some of the contrib scripts, then you need to make sure you have a `git_filter_repo.py` file which is either a link to or copy of `git-filter-repo`, and you need to place that git_filter_repo.py file in $PYTHONPATH. # Installation via Package Manager If you want to install via some [package manager](https://alternativeto.net/software/yellowdog-updater-modified/?license=opensource), you can run $ PACKAGE_TOOL install git-filter-repo The following package managers have packaged git-filter-repo: [![Packaging status](https://repology.org/badge/vertical-allrepos/git-filter-repo.svg)](https://repology.org/project/git-filter-repo/versions) This list covers at least Windows (Scoop), Mac OS X (Homebrew), and Linux (most the rest). Note that I do not curate this list (and have no interest in doing so); https://repology.org tracks who packages these versions. # Detailed installation explanation for packagers filter-repo only consists of a few files that need to be installed: * git-filter-repo This is the _only_ thing needed for basic use. This can be installed in the directory pointed to by `git --exec-path`, or placed anywhere in $PATH. 
If your python3 executable is named "python" instead of "python3" (this particularly appears to affect a number of Windows users), then you'll also need to modify the first line of git-filter-repo to replace "python3" with "python". * git_filter_repo.py This is needed if you want to make use of one of the scripts in contrib/filter-repo-demos/, or want to write your own script making use of filter-repo as a python library. You can create this symlink to (or copy of) git-filter-repo named git_filter_repo.py and place it in your python site packages; `python -c "import site; print(site.getsitepackages())"` may help you find the appropriate location for your system. Alternatively, you can place this file anywhere within $PYTHONPATH. * git-filter-repo.1 This is needed if you want `git filter-repo --help` to succeed in displaying the manpage, when help.format is "man" (the default on Linux and Mac). This can be installed in the directory pointed to by `$(git --man-path)/man1/`, or placed anywhere in $MANDIR/man1/ where $MANDIR is some entry from $MANPATH. Note that `git filter-repo -h` will show a more limited built-in set of instructions regardless of whether the manpage is installed. * git-filter-repo.html This is needed if you want `git filter-repo --help` to succeed in displaying the html version of the help, when help.format is set to "html" (the default on Windows). This can be installed in the directory pointed to by `git --html-path`. Note that `git filter-repo -h` will show a more limited built-in set of instructions regardless of whether the html version of help is installed. So, installation might look something like the following: 1. If you don't have the necessary documentation files (because you are installing from a clone of filter-repo instead of from a tarball) then you can first run: `make snag_docs` (which just copies the generated documentation files from the `docs` branch) 2. 
Run the following ``` cp -a git-filter-repo $(git --exec-path) cp -a git-filter-repo.1 $(git --man-path)/man1 && mandb cp -a git-filter-repo.html $(git --html-path) ln -s $(git --exec-path)/git-filter-repo \ $(python -c "import site; print(site.getsitepackages()[-1])")/git_filter_repo.py ``` or you can use the provided Makefile, as noted below. # Installation via Makefile Installing should be doable by hand, but a Makefile is provided for those that prefer it. However, usage of the Makefile really requires overridding at least a couple of the directories with sane values, e.g. $ make prefix=/usr pythondir=/usr/lib64/python3.8/site-packages install Also, the Makefile will not edit the shebang line (the first line) of git-filter-repo if your python executable is not named "python3"; you'll still need to do that yourself. # Notes for Windows Users Windows likes to make things difficult. Common and historical issues: * **Non-functional Python stub**: Windows apparently ships with a [non-functional python](https://github.com/newren/git-filter-repo/issues/36#issuecomment-568933825). This can even manifest as [the app hanging](https://github.com/newren/git-filter-repo/issues/36) or [the system appearing to hang](https://github.com/newren/git-filter-repo/issues/312). Try installing [Python](https://docs.microsoft.com/en-us/windows/python/beginners) from the [Microsoft Store](https://apps.microsoft.com/store/search?publisher=Python%20Software%20Foundation) * **Modifying PATH, making the script executable**: For some reason lots of Windows users have a hard time modifying their PATH and/or making scripts executable. You can skip that step by just using `python3 git-filter-repo` instead of `git filter-repo` in your commands. * **Different python executable name**: It seems some users don't have a `python3` executable but one named something else like `python` or `python3.8` or whatever. 
You may need to edit the first line of the git-filter-repo script to specify the appropriate path. Or just don't bother and instead use the long form for executing filter-repo commands. Namely, replace the `git filter-repo` part of commands with `PYTHON_EXECUTABLE git-filter-repo`. (Where `PYTHON_EXECUTABLE` is something like `python` or `python3.8` or `C:\PATH\TO\INSTALLATION\OF\python3.exe` or whatever). * **Symlink issues**: git_filter_repo.py is supposed to be a symlink to git-filter-repo, so that it appears to have identical contents. If your system messed up the symlink (usually meaning it looks like a regular file with just one line), then delete git_filter_repo.py and replace it with a copy of git-filter-repo. * **Old GitBash limitations**: older versions of GitForWindows had an unfortunate shebang length limitation (see [git-for-windows issue #3165](https://github.com/git-for-windows/git/pull/3165)). If you're affected, just use the long form for invoking filter-repo commands, i.e. replace the `git filter-repo` part of commands with `python3 git-filter-repo`. 
For additional historical context, see: * [#371](https://github.com/newren/git-filter-repo/issues/371#issuecomment-1267116186) * [#360](https://github.com/newren/git-filter-repo/issues/360#issuecomment-1276813596) * [#312](https://github.com/newren/git-filter-repo/issues/312) * [#307](https://github.com/newren/git-filter-repo/issues/307) * [#225](https://github.com/newren/git-filter-repo/pull/225) * [#231](https://github.com/newren/git-filter-repo/pull/231) * [#124](https://github.com/newren/git-filter-repo/issues/124) * [#36](https://github.com/newren/git-filter-repo/issues/36) * [this git mailing list thread](https://lore.kernel.org/git/nycvar.QRO.7.76.6.2004251610300.18039@tvgsbejvaqbjf.bet/) git-filter-repo-2.45.0/Makefile000066400000000000000000000147671464611705400163020ustar00rootroot00000000000000# A bunch of installation-related paths people can override on the command line DESTDIR = / INSTALL = install prefix = $(HOME) bindir = $(prefix)/libexec/git-core localedir = $(prefix)/share/locale mandir = $(prefix)/share/man htmldir = $(prefix)/share/doc/git-doc pythondir = $(prefix)/lib64/python3.6/site-packages default: build build: @echo Nothing to do: filter-repo is a script which needs no compilation. test: time t/run_coverage # fixup_locale might matter once we actually have translations, but right now # we don't. It might not even matter then, because python has a fallback podir. fixup_locale: sed -ie s%@@LOCALEDIR@@%$(localedir)% git-filter-repo # People installing from tarball will already have man1/git-filter-repo.1 and # html/git-filter-repo.html. But let's support people installing from a git # clone too; for them, just cheat and snag a copy of the built docs that I # record in a different branch. 
snag_docs: Documentation/man1/git-filter-repo.1 Documentation/html/git-filter-repo.html Documentation/man1/git-filter-repo.1: mkdir -p Documentation/man1 git show origin/docs:man1/git-filter-repo.1 >Documentation/man1/git-filter-repo.1 Documentation/html/git-filter-repo.html: mkdir -p Documentation/html git show origin/docs:html/git-filter-repo.html >Documentation/html/git-filter-repo.html install: snag_docs #fixup_locale $(INSTALL) -Dm0755 git-filter-repo "$(DESTDIR)/$(bindir)/git-filter-repo" $(INSTALL) -dm0755 "$(DESTDIR)/$(pythondir)" ln -sf "$(bindir)/git-filter-repo" "$(DESTDIR)/$(pythondir)/git_filter_repo.py" $(INSTALL) -Dm0644 Documentation/man1/git-filter-repo.1 "$(DESTDIR)/$(mandir)/man1/git-filter-repo.1" $(INSTALL) -Dm0644 Documentation/html/git-filter-repo.html "$(DESTDIR)/$(htmldir)/git-filter-repo.html" if which mandb > /dev/null; then mandb; fi # # The remainder of the targets are meant for tasks for the maintainer; if they # don't work for you, I don't care. These tasks modify branches and upload # releases and whatnot, and presume a directory layout I have locally. 
# update_docs: # Set environment variables once export GIT_WORK_TREE=$(shell mktemp -d) \ export GIT_INDEX_FILE=$(shell mktemp) \ COMMIT=$(shell git rev-parse HEAD) \ && \ # Sanity check; we'll build docs in a clone of a git repo \ test -d ../git && \ # Sanity check; docs == origin/docs \ test -z "$(git rev-parse docs origin/docs | uniq -u)" && \ # Avoid spurious errors by forcing index to be well formatted, if empty \ git read-tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904 && # empty tree \ # Symlink git-filter-repo.txt documentation into git and build it \ ln -sf ../../git-filter-repo/Documentation/git-filter-repo.txt ../git/Documentation/ && \ make -C ../git/Documentation -j4 man html && \ # Take the built documentation and lay it out nicely \ mkdir $$GIT_WORK_TREE/html && \ mkdir $$GIT_WORK_TREE/man1 && \ cp -a ../git/Documentation/*.html $$GIT_WORK_TREE/html/ && \ cp -a ../git/Documentation/git-filter-repo.1 $$GIT_WORK_TREE/man1/ && \ dos2unix $$GIT_WORK_TREE/html/* && \ # Add new version of the documentation as a commit, if it differs \ git --work-tree $$GIT_WORK_TREE add . 
&& \ git diff --quiet docs || git write-tree \ | xargs git commit-tree -p docs -m "Update docs to $$COMMIT" \ | xargs git update-ref refs/heads/docs && \ # Remove temporary files \ rm -rf $$GIT_WORK_TREE && \ rm $$GIT_INDEX_FILE && \ # Push the new documentation upstream \ git push origin docs && \ # Notify of completion \ echo && \ echo === filter-repo docs branch updated === # Call like this: # make GITHUB_COM_TOKEN=$KEY TAGNAME=v2.23.0 release release: github_release pypi_release # Call like this: # make GITHUB_COM_TOKEN=$KEY TAGNAME=v2.23.0 github_release github_release: update_docs FILEBASE=git-filter-repo-$(shell echo $(TAGNAME) | tail -c +2) \ TMP_INDEX_FILE=$(shell mktemp) \ COMMIT=$(shell git rev-parse HEAD) \ && \ test -n "$(GITHUB_COM_TOKEN)" && \ test -n "$(TAGNAME)" && \ test -n "$$COMMIT" && \ # Make sure we don't have any staged or unstaged changes \ git diff --quiet --staged HEAD && git diff --quiet HEAD && \ # Make sure 'jq' is installed \ type -p jq && \ # Tag the release, push it to GitHub \ git tag -a -m "filter-repo $(TAGNAME)" $(TAGNAME) $$COMMIT && \ git push origin $(TAGNAME) && \ # Create the tarball \ GIT_INDEX_FILE=$$TMP_INDEX_FILE git read-tree $$COMMIT && \ git ls-tree -r docs | grep filter-repo \ | sed -e 's%\t%\tDocumentation/%' \ | GIT_INDEX_FILE=$$TMP_INDEX_FILE git update-index --index-info && \ GIT_INDEX_FILE=$$TMP_INDEX_FILE git write-tree \ | xargs git archive --prefix=$$FILEBASE/ \ | xz -c >$$FILEBASE.tar.xz && \ rm $$TMP_INDEX_FILE && \ # Make GitHub mark our new tag as an official release \ curl -s -H "Authorization: token $(GITHUB_COM_TOKEN)" -X POST \ https://api.github.com/repos/newren/git-filter-repo/releases \ --data "{ \ \"tag_name\": \"$(TAGNAME)\", \ \"target_commitish\": \"$$COMMIT\", \ \"name\": \"$(TAGNAME)\", \ \"body\": \"filter-repo $(TAGNAME)\" \ }" | jq -r .id >asset_id && \ # Upload our tarball \ cat asset_id | xargs -I ASSET_ID curl -s -H "Authorization: token $(GITHUB_COM_TOKEN)" -H "Content-Type: 
application/octet-stream" --data-binary @$$FILEBASE.tar.xz https://uploads.github.com/repos/newren/git-filter-repo/releases/ASSET_ID/assets?name=$$FILEBASE.tar.xz && \ # Remove temporary file(s) \ rm asset_id && \ # Notify of completion \ echo && \ echo === filter-repo $(TAGNAME) created and uploaded to GitHub === pypi_release: # Has an implicit dependency on github_release because... # Upload to PyPI, automatically picking tag created by github_release python3 -m venv venv venv/bin/pip install --upgrade pip venv/bin/pip install build twine venv/bin/pyproject-build # Note: hope you remember password for pypi, but username is 'newren' venv/bin/twine upload dist/* # Remove temporary file(s) rm -rf dist/ venv/ git_filter_repo.egg-info/ # NOTE TO FUTURE SELF: If you accidentally push a bad release, you can remove # all but the git-filter-repo-$VERSION.tar.xz asset with # git push --delete origin $TAGNAME # To remove the git-filter-repo-$VERSION.tar.xz asset as well: # curl -s -H "Authorization: token $GITHUB_COM_TOKEN" -X GET \ # https://api.github.com/repos/newren/git-filter-repo/releases # and look for the "id", then run # curl -s -H "Authorization: token $GITHUB_COM_TOKEN" -X DELETE \ # https://api.github.com/repos/newren/git-filter-repo/releases/$ID git-filter-repo-2.45.0/README.md000066400000000000000000000705131464611705400161100ustar00rootroot00000000000000git filter-repo is a versatile tool for rewriting history, which includes [capabilities I have not found anywhere else](#design-rationale-behind-filter-repo). It roughly falls into the same space of tool as [git filter-branch](https://git-scm.com/docs/git-filter-branch) but without the capitulation-inducing poor [performance](https://public-inbox.org/git/CABPp-BGOz8nks0+Tdw5GyGqxeYR-3FF6FT5JcgVqZDYVRQ6qog@mail.gmail.com/), with far more capabilities, and with a design that scales usability-wise beyond trivial rewriting cases. 
[git filter-repo is now recommended by the git project](https://git-scm.com/docs/git-filter-branch#_warning) instead of git filter-branch. While most users will probably just use filter-repo as a simple command line tool (and likely only use a few of its flags), at its core filter-repo contains a library for creating history rewriting tools. As such, users with specialized needs can leverage it to quickly create [entirely new history rewriting tools](contrib/filter-repo-demos). # Table of Contents * [Prerequisites](#prerequisites) * [How do I install it?](#how-do-i-install-it) * [How do I use it?](#how-do-i-use-it) * [Why filter-repo instead of other alternatives?](#why-filter-repo-instead-of-other-alternatives) * [filter-branch](#filter-branch) * [BFG Repo Cleaner](#bfg-repo-cleaner) * [Simple example, with comparisons](#simple-example-with-comparisons) * [Solving this with filter-repo](#solving-this-with-filter-repo) * [Solving this with BFG Repo Cleaner](#solving-this-with-bfg-repo-cleaner) * [Solving this with filter-branch](#solving-this-with-filter-branch) * [Solving this with fast-export/fast-import](#solving-this-with-fast-exportfast-import) * [Design rationale behind filter-repo](#design-rationale-behind-filter-repo) * [How do I contribute?](#how-do-i-contribute) * [Is there a Code of Conduct?](#is-there-a-code-of-conduct) * [Upstream Improvements](#upstream-improvements) # Prerequisites filter-repo requires: * git >= 2.22.0 at a minimum; [some features](#upstream-improvements) require git >= 2.24.0 or later * python3 >= 3.5 # How do I install it? `git-filter-repo` is a single-file python script, which was done to make installation for basic use on many systems trivial: just place that file into your $PATH. See [INSTALL.md](INSTALL.md) for things beyond basic usage or special cases. 
The more involved instructions are only needed if one of the following apply: * you do not find the above comment about trivial installation intuitively obvious * you are working with a python3 executable named something other than "python3" * you want to install documentation (beyond the builtin docs shown with -h) * you want to run some of the [contrib](contrib/filter-repo-demos/) examples * you want to create your own python filtering scripts using filter-repo as a module/library # How do I use it? For comprehensive documentation: * see the [user manual](https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html) * alternative formating of the user manual is available on various external sites ([example](https://www.mankier.com/1/git-filter-repo)), for those that don't like the htmlpreview.github.io layout, though it may only be up-to-date as of the latest release If you prefer learning from examples: * there is a [cheat sheet for converting filter-branch commands](Documentation/converting-from-filter-branch.md#cheat-sheet-conversion-of-examples-from-the-filter-branch-manpage), which covers every example from the filter-branch manual * there is a [cheat sheet for converting BFG Repo Cleaner commands](Documentation/converting-from-bfg-repo-cleaner.md#cheat-sheet-conversion-of-examples-from-bfg), which covers every example from the BFG website * the [simple example](#simple-example-with-comparisons) below may be of interest * the user manual has an extensive [examples section](https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES) # Why filter-repo instead of other alternatives? 
This was covered in more detail in a [Git Rev News article on filter-repo](https://git.github.io/rev_news/2019/08/21/edition-54/#an-introduction-to-git-filter-repo--written-by-elijah-newren), but some highlights for the main competitors: ## filter-branch * filter-branch is [extremely to unusably slow](https://public-inbox.org/git/CABPp-BGOz8nks0+Tdw5GyGqxeYR-3FF6FT5JcgVqZDYVRQ6qog@mail.gmail.com/) ([multiple orders of magnitude slower than it should be](https://git-scm.com/docs/git-filter-branch#PERFORMANCE)) for non-trivial repositories. * [filter-branch is riddled with gotchas](https://git-scm.com/docs/git-filter-branch#SAFETY) that can silently corrupt your rewrite or at least thwart your "cleanup" efforts by giving you something more problematic and messy than what you started with. * filter-branch is [very onerous](#simple-example-with-comparisons) [to use](https://github.com/newren/git-filter-repo/blob/a6a6a1b0f62d365bbe2e76f823e1621857ec4dbd/contrib/filter-repo-demos/filter-lamely#L9-L61) for any rewrite which is even slightly non-trivial. * the git project has stated that the above issues with filter-branch cannot be backward compatibly fixed; they recommend that you [stop using filter-branch](https://git-scm.com/docs/git-filter-branch#_warning) * die-hard fans of filter-branch may be interested in [filter-lamely](contrib/filter-repo-demos/filter-lamely) (a.k.a. [filter-branch-ish](contrib/filter-repo-demos/filter-branch-ish)), a reimplementation of filter-branch based on filter-repo which is more performant (though not nearly as fast or safe as filter-repo). * a [cheat sheet](Documentation/converting-from-filter-branch.md#cheat-sheet-conversion-of-examples-from-the-filter-branch-manpage) is available showing how to convert example commands from the manual of filter-branch into filter-repo commands. ## BFG Repo Cleaner * great tool for its time, but while it makes some things simple, it is limited to a few kinds of rewrites. 
* its architecture is not amenable to handling more types of rewrites. * its architecture presents some shortcomings and bugs even for its intended usecase. * fans of bfg may be interested in [bfg-ish](contrib/filter-repo-demos/bfg-ish), a reimplementation of bfg based on filter-repo which includes several new features and bugfixes relative to bfg. * a [cheat sheet](Documentation/converting-from-bfg-repo-cleaner.md#cheat-sheet-conversion-of-examples-from-bfg) is available showing how to convert example commands from the manual of BFG Repo Cleaner into filter-repo commands. # Simple example, with comparisons Let's say that we want to extract a piece of a repository, with the intent on merging just that piece into some other bigger repo. For extraction, we want to: * extract the history of a single directory, src/. This means that only paths under src/ remain in the repo, and any commits that only touched paths outside this directory will be removed. * rename all files to have a new leading directory, my-module/ (e.g. so that src/foo.c becomes my-module/src/foo.c) * rename any tags in the extracted repository to have a 'my-module-' prefix (to avoid any conflicts when we later merge this repo into something else) ## Solving this with filter-repo Doing this with filter-repo is as simple as the following command: ```shell git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-' ``` (the single quotes are unnecessary, but make it clearer to a human that we are replacing the empty string as a prefix with `my-module-`) ## Solving this with BFG Repo Cleaner BFG Repo Cleaner is not capable of this kind of rewrite; in fact, all three types of wanted changes are outside of its capabilities. 
## Solving this with filter-branch filter-branch comes with a pile of caveats (more on that below) even once you figure out the necessary invocation(s): ```shell git filter-branch \ --tree-filter 'mkdir -p my-module && \ git ls-files \ | grep -v ^src/ \ | xargs git rm -f -q && \ ls -d * \ | grep -v my-module \ | xargs -I files mv files my-module/' \ --tag-name-filter 'echo "my-module-$(cat)"' \ --prune-empty -- --all git clone file://$(pwd) newcopy cd newcopy git for-each-ref --format="delete %(refname)" refs/tags/ \ | grep -v refs/tags/my-module- \ | git update-ref --stdin git gc --prune=now ``` Some might notice that the above filter-branch invocation will be really slow due to using --tree-filter; you could alternatively use the --index-filter option of filter-branch, changing the above commands to: ```shell git filter-branch \ --index-filter 'git ls-files \ | grep -v ^src/ \ | xargs git rm -q --cached; git ls-files -s \ | sed "s%$(printf \\t)%&my-module/%" \ | git update-index --index-info; git ls-files \ | grep -v ^my-module/ \ | xargs git rm -q --cached' \ --tag-name-filter 'echo "my-module-$(cat)"' \ --prune-empty -- --all git clone file://$(pwd) newcopy cd newcopy git for-each-ref --format="delete %(refname)" refs/tags/ \ | grep -v refs/tags/my-module- \ | git update-ref --stdin git gc --prune=now ``` However, for either filter-branch command there are a pile of caveats. First, some may be wondering why I list five commands here for filter-branch. Despite the use of --all and --tag-name-filter, and filter-branch's manpage claiming that a clone is enough to get rid of old objects, the extra steps to delete the other tags and do another gc are still required to clean out the old objects and avoid mixing new and old history before pushing somewhere. 
Other caveats: * Commit messages are not rewritten; so if some of your commit messages refer to prior commits by (abbreviated) sha1, after the rewrite those messages will now refer to commits that are no longer part of the history. It would be better to rewrite those (abbreviated) sha1 references to refer to the new commit ids. * The --prune-empty flag sometimes misses commits that should be pruned, and it will also prune commits that *started* empty rather than just ended empty due to filtering. For repositories that intentionally use empty commits for versioning and publishing related purposes, this can be detrimental. * The commands above are OS-specific. GNU vs. BSD issues for sed, xargs, and other commands often trip up users; I think I failed to get most folks to use --index-filter since the only example in the filter-branch manpage that both uses it and shows how to move everything into a subdirectory is linux-specific, and it is not obvious to the reader that it has a portability issue since it silently misbehaves rather than failing loudly. * The --index-filter version of the filter-branch command may be two to three times faster than the --tree-filter version, but both filter-branch commands are going to be multiple orders of magnitude slower than filter-repo. 
* Both commands assume all filenames are composed entirely of ascii characters (even special ascii characters such as tabs or double quotes will wreak havoc and likely result in missing files or misnamed files) ## Solving this with fast-export/fast-import One can kind of hack this together with something like: ```shell git fast-export --no-data --reencode=yes --mark-tags --fake-missing-tagger \ --signed-tags=strip --tag-of-filtered-object=rewrite --all \ | grep -vP '^M [0-9]+ [0-9a-f]+ (?!src/)' \ | grep -vP '^D (?!src/)' \ | perl -pe 's%^(M [0-9]+ [0-9a-f]+ )(.*)$%\1my-module/\2%' \ | perl -pe 's%^(D )(.*)$%\1my-module/\2%' \ | perl -pe s%refs/tags/%refs/tags/my-module-% \ | git -c core.ignorecase=false fast-import --date-format=raw-permissive \ --force --quiet git for-each-ref --format="delete %(refname)" refs/tags/ \ | grep -v refs/tags/my-module- \ | git update-ref --stdin git reset --hard git reflog expire --expire=now --all git gc --prune=now ``` But this comes with some nasty caveats and limitations: * The various greps and regex replacements operate on the entire fast-export stream and thus might accidentally corrupt unintended portions of it, such as commit messages. If you needed to edit file contents and thus dropped the --no-data flag, it could also end up corrupting file contents. * This command assumes all filenames in the repository are composed entirely of ascii characters, and also exclude special characters such as tabs or double quotes. If such a special filename exists within the old src/ directory, it will be pruned even though it was intended to be kept. (In slightly different repository rewrites, this type of editing also risks corrupting filenames with special characters by adding extra double quotes near the end of the filename and in some leading directory name.) * This command will leave behind huge numbers of useless empty commits, and has no realistic way of pruning them. 
(And if you tried to combine this technique with another tool to prune the empty commits, then you now have no way to distinguish between commits which were made empty by the filtering that you want to remove, and commits which were empty before the filtering process and which you thus may want to keep.) * Commit messages which reference other commits by hash will now reference old commits that no longer exist. Attempting to edit the commit messages to update them is extraordinarily difficult to add to this kind of direct rewrite. # Design rationale behind filter-repo None of the existing repository filtering tools did what I wanted; they all came up short for my needs. No tool provided any of the first eight traits below I wanted, and no tool provided more than two of the last four traits either: 1. [Starting report] Provide user an analysis of their repo to help them get started on what to prune or rename, instead of expecting them to guess or find other tools to figure it out. (Triggered, e.g. by running the first time with a special flag, such as --analyze.) 1. [Keep vs. remove] Instead of just providing a way for users to easily remove selected paths, also provide flags for users to only *keep* certain paths. Sure, users could workaround this by specifying to remove all paths other than the ones they want to keep, but the need to specify all paths that *ever* existed in **any** version of the repository could sometimes be quite painful. For filter-branch, using pipelines like `git ls-files | grep -v ... | xargs -r git rm` might be a reasonable workaround but can get unwieldy and isn't as straightforward for users; plus those commands are often operating-system specific (can you spot the GNUism in the snippet I provided?). 1. [Renaming] It should be easy to rename paths. For example, in addition to allowing one to treat some subdirectory as the root of the repository, also provide options for users to make the root of the repository just become a subdirectory. 
And more generally allow files and directories to be easily renamed. Provide sanity checks if renaming causes multiple files to exist at the same path. (And add special handling so that if a commit merely copied oldname->newname without modification, then filtering oldname->newname doesn't trigger the sanity check and die on that commit.) 1. [More intelligent safety] Writing copies of the original refs to a special namespace within the repo does not provide a user-friendly recovery mechanism. Many would struggle to recover using that. Almost everyone I've ever seen do a repository filtering operation has done so with a fresh clone, because wiping out the clone in case of error is a vastly easier recovery mechanism. Strongly encourage that workflow by [detecting and bailing if we're not in a fresh clone](https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#FRESHCLONE), unless the user overrides with --force. 1. [Auto shrink] Automatically remove old cruft and repack the repository for the user after filtering (unless overridden); this simplifies things for the user, helps avoid mixing old and new history together, and avoids problems where the multi-step process for shrinking the repo documented in the manpage doesn't actually work in some cases. (I'm looking at you, filter-branch.) 1. [Clean separation] Avoid confusing users (and prevent accidental re-pushing of old stuff) due to mixing old repo and rewritten repo together. (This is particularly a problem with filter-branch when using the --tag-name-filter option, and sometimes also an issue when only filtering a subset of branches.) 1. 
[Versatility] Provide the user the ability to extend the tool or even write new tools that leverage existing capabilities, and provide this extensibility in a way that (a) avoids the need to fork separate processes (which would destroy performance), (b) avoids making the user specify OS-dependent shell commands (which would prevent users from sharing commands with each other), (c) takes advantage of rich data structures (because hashes, dicts, lists, and arrays are prohibitively difficult in shell) and (d) provides reasonable string manipulation capabilities (which are sorely lacking in shell). 1. [Old commit references] Provide a way for users to use old commit IDs with the new repository (in particular via mapping from old to new hashes with refs/replace/ references). 1. [Commit message consistency] If commit messages refer to other commits by ID (e.g. "this reverts commit 01234567890abcdef", "In commit 0013deadbeef9a..."), those commit messages should be rewritten to refer to the new commit IDs. 1. [Become-empty pruning] Commits which become empty due to filtering should be pruned. If the parent of a commit is pruned, the first non-pruned ancestor needs to become the new parent. If no non-pruned ancestor exists and the commit was not a merge, then it becomes a new root commit. If no non-pruned ancestor exists and the commit was a merge, then the merge will have one less parent (and thus make it likely to become a non-merge commit which would itself be pruned if it had no file changes of its own). One special thing to note here is that we prune commits which become empty, NOT commits which start empty. Some projects intentionally create empty commits for versioning or publishing reasons, and these should not be removed. (As a special case, commits which started empty but whose parent was pruned away will also be considered to have "become empty".) 1. 
[Become-degenerate pruning] Pruning of commits which become empty can potentially cause topology changes, and there are lots of special cases. Normally, merge commits are not removed since they are needed to preserve the graph topology, but the pruning of parents and other ancestors can ultimately result in the loss of one or more parents. A simple case was already noted above: if a merge commit loses enough parents to become a non-merge commit and it has no file changes, then it too can be pruned. Merge commits can also have a topology that becomes degenerate: it could end up with the merge_base serving as both parents (if all intervening commits from the original repo were pruned), or it could end up with one parent which is an ancestor of its other parent. In such cases, if the merge has no file changes of its own, then the merge commit can also be pruned. However, much as we do with empty pruning we do not prune merge commits that started degenerate (which indicates it may have been intentional, such as with --no-ff merges) but only merge commits that become degenerate and have no file changes of their own. 1. [Speed] Filtering should be reasonably fast # How do I contribute? See the [contributing guidelines](Documentation/Contributing.md). # Is there a Code of Conduct? Participants in the filter-repo community are expected to adhere to the same standards as for the git project, so the [git Code of Conduct](https://git.kernel.org/pub/scm/git/git.git/tree/CODE_OF_CONDUCT.md) applies. 
# Upstream Improvements Work on filter-repo and [its predecessor](https://public-inbox.org/git/51419b2c0904072035u1182b507o836a67ac308d32b9@mail.gmail.com/) has also driven numerous improvements to fast-export and fast-import (and occasionally other commands) in core git, based on things filter-repo needs to do its work: * git-2.28.0 * [fast-import: add new --date-format=raw-permissive format]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=d42a2fb72f) * git-2.24.0 * [fast-export: handle nested tags]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=941790d7de) * [t9350: add tests for tags of things other than a commit]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=8d7d33c1ce) * [fast-export: allow user to request tags be marked with --mark-tags]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=a1638cfe12) * [fast-export: add support for --import-marks-if-exists]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=208d69246e) * [fast-import: add support for new 'alias' command]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=b8f50e5b60) * [fast-import: allow tags to be identified by mark labels]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=f73b2aba05) * [fast-import: fix handling of deleted tags]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=3164e6bd24) * [fast-export: fix exporting a tag and nothing else]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=af2abd870b) * [git-fast-import.txt: clarify that multiple merge commits are allowed]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=d1387d3895) * git-2.23.0 * [t9350: fix encoding test to actually test reencoding]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=32615ce762) * [fast-import: support 'encoding' commit header]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=3edfcc65fd) * [fast-export: avoid stripping encoding header if we cannot reencode]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=ccbfc96dc4) * [fast-export: 
differentiate between explicitly UTF-8 and implicitly UTF-8]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=57a8be2cb0) * [fast-export: do automatic reencoding of commit messages only if requested]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=e80001f8fd) * git-2.22.0 * [log,diff-tree: add --combined-all-paths option]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=d76ce4f734) * [t9300: demonstrate bug with get-mark and empty orphan commits]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=62edbec7de) * [git-fast-import.txt: fix wording about where ls command can appear]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=a63c54a019) * [fast-import: check most prominent commands first]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=5056bb7646) * [fast-import: only allow cat-blob requests where it makes sense]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=7ffde293f2) * [fast-import: fix erroneous handling of get-mark with empty orphan commits]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=cf7b857a77) * [Honor core.precomposeUnicode in more places]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=8e712ef6fc) * git-2.21.0 * [fast-export: convert sha1 to oid]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=843b9e6d48) * [git-fast-import.txt: fix documentation for --quiet option]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=f55c979b14) * [git-fast-export.txt: clarify misleading documentation about rev-list args]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=4532be7cba) * [fast-export: use value from correct enum]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=b93b81e799) * [fast-export: avoid dying when filtering by paths and old tags exist]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=1f30c904b3) * [fast-export: move commit rewriting logic into a function for reuse]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=f129c4275c) * [fast-export: when using 
paths, avoid corrupt stream with non-existent mark]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=cd13762d8f) * [fast-export: ensure we export requested refs]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=fdf31b6369) * [fast-export: add --reference-excluded-parents option]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=530ca19c02) * [fast-import: remove unmaintained duplicate documentation]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=25dd3e4889) * [fast-export: add a --show-original-ids option to show original names]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=a965bb3116) * [git-show-ref.txt: fix order of flags]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=bd8d6f0def) * git-2.20.0 * [update-ref: fix type of update_flags variable to match its usage]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=e4c34855a2) * [update-ref: allow --no-deref with --stdin]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=d345e9fbe7) * git-1.7.3 * [fast-export: Fix dropping of files with --import-marks and path limiting]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=4087a02e45) * [fast-export: Add a --full-tree option]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=7f40ab0916) * [fast-export: Fix output order of D/F changes]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=060df62422) * [fast-import: Improve robustness when D->F changes provided in wrong order]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=253fb5f889) * git-1.6.4: * [fast-export: Set revs.topo_order before calling setup_revisions]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=668f3aa776) * [fast-export: Omit tags that tag trees]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=02c48cd69b) * [fast-export: Make sure we show actual ref names instead of "(null)"]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=2374502c6c) * [fast-export: Do parent rewriting to avoid dropping relevant commits]( 
https://git.kernel.org/pub/scm/git/git.git/commit/?id=32164131db) * [fast-export: Add a --tag-of-filtered-object option for newly dangling tags]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=2d8ad46919) * [Add new fast-export testcases]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=25e0ca5dd6) * [fast-export: Document the fact that git-rev-list arguments are accepted]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=8af15d282e) * git-1.6.3: * [git-filter-branch: avoid collisions with variables in eval'ed commands]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=d5b0c97d13) * [Correct missing SP characters in grammar comment at top of fast-import.c]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=98e1a4186a) * [fast-export: Avoid dropping files from commits]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=ebeec7dbc5) * git-1.6.1.4: * [fast-export: ensure we traverse commits in topological order]( https://git.kernel.org/pub/scm/git/git.git/commit/?id=784f8affe4) git-filter-repo-2.45.0/contrib/000077500000000000000000000000001464611705400162635ustar00rootroot00000000000000git-filter-repo-2.45.0/contrib/filter-repo-demos/000077500000000000000000000000001464611705400216205ustar00rootroot00000000000000git-filter-repo-2.45.0/contrib/filter-repo-demos/README.md000066400000000000000000000043501464611705400231010ustar00rootroot00000000000000## Background filter-repo is not merely a history rewriting tool, it also contains a library that can be used to write new history rewriting tools. This directory contains several examples showing the breadth of different things that could be done. ## Quick overview Command                         |Description -------|----------- barebones-example |Simple example with no modifications to filter-repo behavior, just showing what to import and run. insert-beginning |Add a new file (e.g. LICENSE/COPYING) to the beginning of history. 
signed-off-by |Add a Signed-off-by tag to a range of commits lint-history |Run some lint command on all non-binary files in history. clean-ignore |Delete files from history which match current gitignore rules. filter-lamely (or filter‑branch‑ish) |A nearly bug compatible re-implementation of filter-branch (the git testsuite passes using it instead of filter-branch), with some performance tricks to make it several times faster (though it's still glacially slow compared to filter-repo). bfg-ish |A re-implementation of most of BFG Repo Cleaner, with new features and bug fixes. convert-svnexternals |Insert Git submodules according to SVN externals. ## Purpose Please note that the point of these examples is not to provide new complete tools, but simply to demonstrate that extremely varied history rewriting tools can be created which automatically inherit lots of useful base functionality: rewriting hashes in commit messages, pruning commits that become empty, handling filenames with funny characters, non-standard encodings, handling of replace refs, etc. (Additional examples of using filter-repo as a library can also be found in [the testsuite](../../t/t9391/).) My sincerest hope is that these examples provide lots of useful functionality, but that each is missing at least one critical piece for your usecase. Go forth and extend and improve. ## Usage All the examples require a symlink to git-filter-repo in your PYTHONPATH named git_filter_repo.py in order to run; also, all have a --help flag to get a description of their usage and flags. git-filter-repo-2.45.0/contrib/filter-repo-demos/barebones-example000077500000000000000000000014411464611705400251370ustar00rootroot00000000000000#!/usr/bin/env python3 """ This is a simple program that behaves identically to git-filter-repo. Its entire purpose is just to show what to import and run to get the normal git-filter-repo behavior, to serve as a starting point for you to figure out what you want to modify. 
""" """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. """ import sys try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") args = fr.FilteringOptions.parse_args(sys.argv[1:]) if args.analyze: fr.RepoAnalyze.run(args) else: filter = fr.RepoFilter(args) filter.run() git-filter-repo-2.45.0/contrib/filter-repo-demos/bfg-ish000077500000000000000000000552431464611705400230760ustar00rootroot00000000000000#!/usr/bin/env python3 """ This is a re-implementation of BFG Repo Cleaner, with some changes... New features: * pruning unwanted objects streamlined (automatic repack) and made more robust (BFG makes user repack manually, and while it provides instructions on how to do so, it won't successfully remove large objects in cases like unpacked refs, loose objects, or use of --no-blob-protection; the robustness details are bugfixes, so are covered below.) * pruning of commits which become empty (or become degenerate and empty) * creation of new replace refs so folks can access new commits using old (unabbreviated) commit hashes * respects and uses grafts and replace refs in the rewrite to make them permanent (this is half new feature, half bug fix; thus also mentioned in bugfixes below) * auto-update of commit encoding to utf-8 (as per fast-export's default; could pass --preserve-commit-encoding to FilteringOptions.parse_args() if this isn't wanted...) Bug fixes: * Works for both packfiles and loose objects (With BFG, if you don't repack before running, large blobs may be retained.) (With BFG, any files larger than core.bigFileThreshold are thus hard to remove since they will not be packed by a gc or a repack.) * Works for both packed-refs and loose refs (As per BFG issue #221, BFG fails to properly walk history unless packed.) 
* Works with replace refs (BFG operates directly on packfiles and packed-refs, and does not understand replace refs; see BFG issue #82) * Updates both index and working tree at end of rewrite (With BFG and --no-blob-protection, these are still left out-of-date. This is a doubly-whammy principle-of-least-astonishment violation: (1) users are likely to accidentally commit the "staged" changes, re-introducing the large blobs or removed passwords, (2) even if they don't commit the changes the index holding them will prevent gc from shrinking the repo. Fixing these two glaring problems not only makes --no-blob-protection safe to recommend, it makes it safe to make it the default.) * Fixes the "protection" defaults (With BFG, it won't rewrite the tree for HEAD; it can't reasonably switch to doing so because of the bugs mentioned above with updating the index and working tree. However, this behavior comes with a surprise for users: if HEAD is "protected" because users should manually update it first, why isn't that also true of the other branches? In my opinion, there's no user-facing distinction that makes sense for such a difference in handling. "Protecting" HEAD can also be an error-prone requirement for users -- why do they have to manually edit all files the same way --replace-text is doing and why do they have to risk dirty diffs if they get it slightly different (or a useless and ugly empty commit if they manage to get it right)? Finally, a third reason this was in my opinion a bad default was that it works really poorly in conjunction with other types of history rewrites, e.g. --subdirectory-filter, --to-subdirectory-filter, --convert-to-git-lfs, --path-rename, etc. For all three of these reasons, and the fixes mentioned above to make it safe, --no-blob-protection is made the default.) 
* Implements privacy improvements, defaulting to true (As per BFG #139, one of the BFG maintainers notes problematic issues with the "privacy" handling in BFG, suggesting options which could be added to improve the story. I implemented those options, except that I felt --private should be the default and made the various non-private choices individual options; see the --use-* options.) Other changes: * Removed the --convert-to-git-lfs option (As per BFG issues #116 and #215, and git-lfs issue #1589, handling LFS conversion is poor in BFG and not recommended; other tools are suggested even by the BFG authors.) * Removed the --strip-biggest-blobs option (I philosophically disagree with offering such an option when no mechanism is provided to see what the N biggest blobs are. How is the user supposed to select N? Even if they know they have three files which have been large, they may be unaware of others in history. Even if there aren't any other files in history and the user requests to remove the largest three blobs, it might not be what they want: one of the files might have had multiple versions, in which case their request would only remove some versions of the largest file from history and leave all versions of the second and third largest files as well as all but three versions of the largest file. Finally, on a more minor note, what is done in the case of a tie -- remove more than N, less than N, or just pick one of the objects tieing for Nth largest at random? It's ill-defined.) ...even with all these improvements, I think filter-repo is the better tool, and thus I suggest folks use it. I have no plans to improve bfg-ish further. However, bfg-ish serves as a nice demonstration of the ability to use filter-repo to write different filtering tools, which was its purpose. """ """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. 
""" import argparse import fnmatch import os import re import subprocess import tempfile try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") subproc = fr.subproc def java_to_fnmatch_glob(extended_glob): if not extended_glob: return None curly_re = re.compile(br'(.*){([^{}]*)}(.*)') m = curly_re.match(extended_glob) if not m: return [extended_glob] all_answers = [java_to_fnmatch_glob(m.group(1)+x+m.group(3)) for x in m.group(2).split(b',')] return [item for sublist in all_answers for item in sublist] class BFG_ish: def __init__(self): self.blob_sizes = {} self.filtered_blobs = {} self.cat_file_proc = None self.replacement_rules = None self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)') self.args = None def parse_options(self): usage = 'bfg-ish [options] []' parser = argparse.ArgumentParser(description="bfg-ish 1.13.0", usage=usage) parser.add_argument('--strip-blobs-bigger-than', '-b', metavar='', help=("strip blobs bigger than X (e.g. '128K', '1M', etc)")) #parser.add_argument('--strip-biggest-blobs', '-B', metavar='NUM', # help=("strip the top NUM biggest blobs")) parser.add_argument('--strip-blobs-with-ids', '-bi', metavar='', help=("strip blobs with the specified Git object ids")) parser.add_argument('--delete-files', '-D', metavar='', type=os.fsencode, help=("delete files with the specified names (e.g. '*.class', '*.{txt,log}' - matches on file name, not path within repo)")) parser.add_argument('--delete-folders', metavar='', type=os.fsencode, help=("delete folders with the specified names (e.g. '.svn', '*-tmp' - matches on folder name, not path within repo)")) parser.add_argument('--replace-text', '-rt', metavar='', help=("filter content of files, replacing matched text. 
Match expressions should be listed in the file, one expression per line - by default, each expression is treated as a literal, but 'regex:' & 'glob:' prefixes are supported, with '==>' to specify a replacement string other than the default of '***REMOVED***'.")) parser.add_argument('--filter-content-including', '-fi', metavar='', type=os.fsencode, help=("do file-content filtering on files that match the specified expression (eg '*.{txt,properties}')")) parser.add_argument('--filter-content-excluding', '-fe', metavar='', type=os.fsencode, help=("don't do file-content filtering on files that match the specified expression (eg '*.{xml,pdf}')")) parser.add_argument('--filter-content-size-threshold', '-fs', metavar='', default=1048576, type=int, help=("only do file-content filtering on files smaller than (default is 1048576 bytes)")) parser.add_argument('--preserve-ref-tips', '--protect-blobs-from', '-p', metavar='', nargs='+', help=("Do not filter the trees for final commit of the specified refs, only in the history before those commits (by default, filtering options affect all commits, even those at ref tips). This is not recommended.")) parser.add_argument('--no-blob-protection', action='store_true', help=("allow the BFG to modify even your *latest* commit. Not only is this highly recommended, it is the default. As such, this option does not actually do anything and is provided solely for compatibility with BFG. 
To undo this option, use --preserve-ref-tips and specify HEAD or the current branch name")) parser.add_argument('--use-formerly-log-text', action='store_true', help=("when updating commit hashes in commit messages also add a [formerly OLDHASH] text, possibly violating commit message line length guidelines and providing an inferior way to lookup old hashes (replace references are much preferred as git itself will understand them)")) parser.add_argument('--use-formerly-commit-footer', action='store_true', help=("append a `Former-commit-id:` footer to commit messages. This is an inferior way to lookup old hashes (replace references are much preferred as git itself will understand them)")) parser.add_argument('--use-replace-blobs', action='store_true', help=("replace any removed file by a `.REMOVED.git-id` file. Makes history ugly as it litters it with replacement files for each one you want removed, but has a small chance of being useful if you find you pruned something incorrectly.")) parser.add_argument('--private', action='store_true', help=("this option does nothing and is provided solely for compatibility with bfg; to undo it, use the --use-* options")) parser.add_argument('--massive-non-file-objects-sized-up-to', metavar='', help=("this option does nothing and is provided solely for compatibility with bfg")) parser.add_argument('repo', type=os.fsencode, help=("file path for Git repository to clean")) args = parser.parse_args() # Sanity check on args.repo if not os.path.isdir(args.repo): raise SystemExit("Repo not found: {}".format(os.fsdecode(args.repo))) dirname, basename = os.path.split(args.repo) if not basename: dirname, basename = os.path.split(dirname) if not dirname: dirname = b'.' 
if basename == b".git": raise SystemExit("For non-bare repos, please specify the toplevel directory ({}) for repo" .format(os.fsdecode(dirname))) return args def convert_replace_text(self, filename): tmpfile, newname = tempfile.mkstemp() os.close(tmpfile) with open(newname, 'bw') as outfile: with open(filename, 'br') as infile: for line in infile: if line.startswith(b'regex:'): beg, end = line.split(b'==>') end = re.sub(br'\$([0-9])', br'\\\1', end) outfile.write(b'%s==>%s\n' % (beg, end)) elif line.startswith(b'glob:'): outfile.write(b'glob:' + java_to_fnmatch_glob(line[5:])) else: outfile.write(line) return newname def path_wanted(self, filename): if not self.args.delete_files and not self.args.delete_folders: return filename paths = filename.split(b'/') dirs = paths[0:-1] basename = paths[-1] if self.args.delete_files and any(fnmatch.fnmatch(basename, x) for x in self.args.delete_files): return False if self.args.delete_folders and any(any(fnmatch.fnmatch(dirname, x) for dirname in dirs) for x in self.args.delete_folders): return False return True def should_filter_path(self, filename): def matches(basename, glob_list): return any(fnmatch.fnmatch(basename, x) for x in glob_list) basename = os.path.basename(filename) if self.args.filter_content_including and \ not matches(basename, self.args.filter_content_including): return False if self.args.filter_content_excluding and \ matches(basename, self.args.filter_content_excluding): return False return True def filter_relevant_blobs(self, commit): for change in commit.file_changes: if change.type == b'D': continue # deleted files have no remaining content to filter if change.mode in (b'120000', b'160000'): continue # symlinks and submodules aren't text files we can filter if change.blob_id in self.filtered_blobs: change.blob_id = self.filtered_blobs[change.blob_id] continue if self.args.filter_content_size_threshold: size = self.blob_sizes[change.blob_id] if size >= self.args.filter_content_size_threshold: continue if 
not self.should_filter_path(change.filename): continue self.cat_file_proc.stdin.write(change.blob_id + b'\n') self.cat_file_proc.stdin.flush() objhash, objtype, objsize = self.cat_file_proc.stdout.readline().split() # FIXME: This next line assumes the file fits in memory; though the way # fr.Blob works we kind of have that assumption baked in elsewhere too... contents = self.cat_file_proc.stdout.read(int(objsize)) if not any(x == b"0" for x in contents[0:8192]): # not binaries for literal, replacement in self.replacement_rules['literals']: contents = contents.replace(literal, replacement) for regex, replacement in self.replacement_rules['regexes']: contents = regex.sub(replacement, contents) self.cat_file_proc.stdout.read(1) # Read trailing newline blob = fr.Blob(contents) self.filter.insert(blob) self.filtered_blobs[change.blob_id] = blob.id change.blob_id = blob.id def munge_message(self, message, metadata): def replace_hash(matchobj): oldhash = matchobj.group(1) newhash = metadata['commit_rename_func'](oldhash) if newhash != oldhash and self.args.use_formerly_log_text: newhash = b'%s [formerly %s]' % (newhash, oldhash) return newhash return self._hash_re.sub(replace_hash, message) def commit_update(self, commit, metadata): # Strip out unwanted files new_file_changes = [] for change in commit.file_changes: if not self.path_wanted(change.filename): if not self.args.use_replace_blobs: continue blob = fr.Blob(change.blob_id) self.filter.insert(blob) change.blob_id = blob.id change.filename += b'.REMOVED.git-id' new_file_changes.append(change) commit.file_changes = new_file_changes # Filter text of relevant files if self.replacement_rules: self.filter_relevant_blobs(commit) # Replace commit hashes in commit message with 'newhash [formerly oldhash]' if self.args.use_formerly_log_text: commit.message = self.munge_message(commit.message, metadata) # Add a 'Former-commit-id:' footer if self.args.use_formerly_commit_footer: if not commit.message.endswith(b'\n'): 
commit.message += b'\n' lastline = commit.message.splitlines()[-1] if not re.match(b'\n[A-Za-z0-9-_]*: ', lastline): commit.message += b'\n' commit.message += b'Former-commit-id: %s' % commit.original_id def get_preservation_info(self, ref_tips): if not ref_tips: return [] cmd = 'git rev-parse --symbolic-full-name'.split() p = subproc.Popen(cmd + ref_tips, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) ret = p.wait() output = p.stdout.read() if ret != 0: raise SystemExit("Failed to translate --preserve-ref-tips arguments into refs\n"+fr.decode(output)) refs = output.splitlines() ref_trees = [b'%s^{tree}' % ref for ref in refs] output = subproc.check_output(['git', 'rev-parse'] + ref_trees) trees = output.splitlines() return dict(zip(refs, trees)) def revert_tree_changes(self, preserve_refs): # FIXME: Since this function essentially creates a new commit (with the # original tree) to replace the commit at the ref tip (which has a # filtered tree), I should update the created refs/replace/ object to # point to the newest commit. Also, the double reset (see comment near # where revert_tree_changes is called) seems kinda lame. It'd be easy # enough to fix these issues, but I'm very unmotivated since # --preserve-ref-tips/--protect-blobs-from is a design mistake. 
updates = {} for ref, tree in preserve_refs.items(): output = subproc.check_output('git cat-file -p'.split()+[ref]) lines = output.splitlines() if not lines[0].startswith(b'tree '): raise SystemExit("Error: --preserve-ref-tips only works with commit refs") num = 1 parents = [] while lines[num].startswith(b'parent '): parents.append(lines[num][7:]) num += 1 assert lines[num].startswith(b'author ') author_info = [x.strip() for x in re.split(b'[<>]', lines[num][7:])] aenv = 'GIT_AUTHOR_NAME GIT_AUTHOR_EMAIL GIT_AUTHOR_DATE'.split() assert lines[num+1].startswith(b'committer ') committer_info = [x.strip() for x in re.split(b'[<>]', lines[num+1][10:])] cenv = 'GIT_COMMITTER_NAME GIT_COMMITTER_EMAIL GIT_COMMITTER_DATE'.split() new_env = {**os.environ.copy(), **dict(zip(aenv, author_info)), **dict(zip(cenv, committer_info))} assert lines[num+2] == b'' commit_msg = b'\n'.join(lines[num+3:])+b'\n' p_s = [val for pair in zip(['-p',]*len(parents), parents) for val in pair] p = subproc.Popen('git commit-tree'.split() + p_s + [tree], stdin = subprocess.PIPE, stdout = subprocess.PIPE, env = new_env) p.stdin.write(commit_msg) p.stdin.close() if p.wait() != 0: raise SystemExit("Error: failed to write preserve commit for {} [{}]" .format(ref, tree)) updates[ref] = p.stdout.read().strip() p = subproc.Popen('git update-ref --stdin'.split(), stdin = subprocess.PIPE) for ref, newvalue in updates.items(): p.stdin.write(b'update %s %s\n' % (ref, newvalue)) p.stdin.close() if p.wait() != 0: raise SystemExit("Error: failed to write preserve commits") def run(self): bfg_args = self.parse_options() preserve_refs = self.get_preservation_info(bfg_args.preserve_ref_tips) work_dir = os.getcwd() os.chdir(bfg_args.repo) bfg_args.delete_files = java_to_fnmatch_glob(bfg_args.delete_files) bfg_args.delete_folders = java_to_fnmatch_glob(bfg_args.delete_folders) bfg_args.filter_content_including = \ java_to_fnmatch_glob(bfg_args.filter_content_including) bfg_args.filter_content_excluding = \ 
java_to_fnmatch_glob(bfg_args.filter_content_excluding) if bfg_args.replace_text and bfg_args.filter_content_size_threshold: # FIXME (perf): It would be much more performant and probably make more # sense to have a `git cat-file --batch-check` process running and query # it for blob sizes, since we may only need a small subset of blob sizes # rather than the sizes of all objects in the git database. self.blob_sizes, packed_sizes = fr.GitUtils.get_blob_sizes() extra_args = [] if bfg_args.strip_blobs_bigger_than: extra_args = ['--strip-blobs-bigger-than', bfg_args.strip_blobs_bigger_than] if bfg_args.strip_blobs_with_ids: extra_args = ['--strip-blobs-with-ids', bfg_args.strip_blobs_with_ids] if bfg_args.use_formerly_log_text: extra_args += ['--preserve-commit-hashes'] new_replace_file = None if bfg_args.replace_text: if not os.path.isabs(bfg_args.replace_text): bfg_args.replace_text = os.path.join(work_dir, bfg_args.replace_text) new_replace_file = self.convert_replace_text(bfg_args.replace_text) rules = fr.FilteringOptions.get_replace_text(new_replace_file) self.replacement_rules = rules self.cat_file_proc = subproc.Popen(['git', 'cat-file', '--batch'], stdin = subprocess.PIPE, stdout = subprocess.PIPE) self.args = bfg_args # Setting partial prevents: # * remapping origin remote tracking branches to regular branches # * deletion of the origin remote # * nuking unused refs # * nuking reflogs # * repacking # While these are arguably desirable things, BFG documentation assumes # the first two aren't done, so for compatibility turn them all off. # The third is irrelevant since BFG has no mechanism for renaming refs, # and we'll manually add the fourth and fifth back in below by calling # RepoFilter.cleanup(). 
fr_args = fr.FilteringOptions.parse_args(['--partial', '--force'] + extra_args) self.filter = fr.RepoFilter(fr_args, commit_callback=self.commit_update) self.filter.run() if new_replace_file: os.remove(new_replace_file) self.cat_file_proc.stdin.close() self.cat_file_proc.wait() need_another_reset = False if preserve_refs: self.revert_tree_changes(preserve_refs) # If the repository is not bare, self.filter.run() already did a reset # for us. However, if we are preserving refs (and the repository isn't # bare), we need another since we possibly updated HEAD after that # reset (FIXME: two resets is kinda ugly; would be nice to just do # one). if not fr.GitUtils.is_repository_bare('.'): need_another_reset = True if not os.path.isabs(os.fsdecode(bfg_args.repo)): bfg_args.repo = os.fsencode(os.path.join(work_dir, os.fsdecode(bfg_args.repo))) fr.RepoFilter.cleanup(bfg_args.repo, repack=True, reset=need_another_reset) if __name__ == '__main__': bfg = BFG_ish() bfg.run() # Show the same message BFG does, even if we don't copy the rest of its # progress output. Make this program feel slightly more authentically BFG. # :-) print(''' -- You can rewrite history in Git - don't let Trump do it for real! Trump's administration has lied consistently, to make people give up on ever being told the truth. Don't give up: https://www.rescue.org/topic/refugees-america -- ''') git-filter-repo-2.45.0/contrib/filter-repo-demos/clean-ignore000077500000000000000000000050521464611705400241130ustar00rootroot00000000000000#!/usr/bin/env python3 """ This is a simple program that will delete files from history which match current gitignore rules, while also: 1) pruning commits which become empty 2) pruning merge commits which become degenerate and have no changes relative to its remaining relevant parent 3) rewriting commit hashes in commit messages to reference new commit IDs. """ """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. 
""" import argparse import os import subprocess import sys try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") class CheckIgnores: def __init__(self): self.ignored = set() self.okay = set() cmd = 'git check-ignore --stdin --verbose --non-matching --no-index -z' self.check_ignore_process = subprocess.Popen(cmd.split(), stdin=subprocess.PIPE, stdout=subprocess.PIPE) def __del__(self): if self.check_ignore_process: self.check_ignore_process.stdin.close() def get_ignored(self, filenames): ignored = set() for name in filenames: if name in self.ignored: ignored.add(name) elif name in self.okay: continue else: self.check_ignore_process.stdin.write(name+b'\0') self.check_ignore_process.stdin.flush() result = os.read(self.check_ignore_process.stdout.fileno(), 65535).rstrip(b'\0') (source, linenum, pattern, pathname) = result.split(b"\0") if name != pathname: raise SystemExit("Error: Passed {} but got {}".format(name, pathname)) if not source and not linenum and not pattern: self.okay.add(name) else: if pattern[0:1] == b"!": self.okay.add(name) else: self.ignored.add(name) ignored.add(name) return ignored def skip_ignores(self, commit, metadata): filenames = [x.filename for x in commit.file_changes] bad = self.get_ignored(filenames) commit.file_changes = [x for x in commit.file_changes if x.filename not in bad] def main(): checker = CheckIgnores() args = fr.FilteringOptions.parse_args(sys.argv[1:], error_on_empty = False) filter = fr.RepoFilter(args, commit_callback=checker.skip_ignores) filter.run() if __name__ == '__main__': main() git-filter-repo-2.45.0/contrib/filter-repo-demos/convert-svnexternals000066400000000000000000000517061464611705400257660ustar00rootroot00000000000000#!/usr/bin/env python3 """ This is a program that will insert Git submodules according to SVN externals 
definitions (svn:externals properties) from the original Subversion repository throughout the history. Information about the externals is obtained from the ".gitsvnextmodules" file created during SVN-to-Git conversion by SubGit (https://subgit.com/). Its config option "translate.externals=true" had to be used therefore. Actual modifications: - Insert gitlinks (mode 160000) into the tree. - Add .gitmodules file with relevant sections. - Remove sections converted to submodules from .gitsvnextmodules file and delete it if empty. .gitsvnextmodules example: [submodule "somedir/extdir"] path = somedir/extdir owner = somedir url = https://svn.example.com/somesvnrepo/trunk revision = 1234 branch = / fetch = :refs/remotes/git-svn remote = svn type = dir Resulting addition in "somedir" tree (cat-file pretty-print format): 160000 commit 1234123412341234123412341234123412341234 extdir Resulting .gitmodules entry: [submodule "somedir/extdir"] path = somedir/extdir url = https://git.example.com/somegitrepo.git SVN-to-Git mapping file: Can be created from SubGit's "refs/svn/map". One line per mapping in following format: TAB TAB TAB TAB - Leading '#' can be used for comments. - must not contain a trailing slash. - has to be "commit" to be usable, but can be "missing" if does not exist in the repository anymore. Adopted from git-cat-file output. Example: https://svn.example.com/somesvnrepo/trunk 1234 https://git.example.com/somegitrepo.git 1234123412341234123412341234123412341234 commit Features: - Repeatedly added/removed externals will be handled properly. - Externals replaced by directly added files and vice versa will be handled properly. Caveats: - This script must NOT be run repeatedly. A second invocation would lead to a different result in case the externals could only be converted partially. - Inconsistent SVN repositories (with failing checkout) not handled, i.e. 
- normal directory and external with the same path - external path not existing for the given revision - No attention was paid to non-ASCII and special characters in gitlink paths, might cause problems. - There is no error handling for mandatory options missing in .gitsvnextmodules file. The script would crash in case of such buggy files, but that shouldn't happen in practice. TODO: - Add external files directly. - Alternatively add external directories directly instead of using a submodule. """ """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. """ import argparse import os import sys import shutil import subprocess import configparser from urllib.parse import urlsplit try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") svn_root_url = "" svn_git_mappings = [] def parse_args(): """ Parse and return arguments for this script. Also do some argument sanity checks and adaptions. """ parser = argparse.ArgumentParser( description="Add Git submodules according to svn:externals from .gitsvnextmodules. " "As preparation for this conversion process, an analysis can be performed.") parser.add_argument('--force', '-f', action='store_true', help="Rewrite repository history even if the current repo does not " "look like a fresh clone.") parser.add_argument('--refs', nargs='+', help="Limit history rewriting to the specified refs. Option is directly " "forwarded to git-filter-repo, see there for details and caveats. 
" "Use for debugging purposes only!") parser.add_argument('--svn-root-url', help="Root URL of the corresponding SVN repository, " "needed for conversion of relative to absolute external URLs.") analysis = parser.add_argument_group(title="Analysis") analysis.add_argument('--analyze', action='store_true', help="Analyze repository history and create auxiliary files for conversion process.") analysis.add_argument('--report-dir', type=os.fsencode, help="Directory to write report, defaults to GIT_DIR/filter-repo/svnexternals, " "refuses to run if exists, --force delete existing dir first.") conversion = parser.add_argument_group(title="Conversion") conversion.add_argument('--svn-git-mapfiles', type=os.fsencode, nargs='+', metavar='MAPFILE', help="Files with SVN-to-Git revision mappings for SVN externals conversion.") args = parser.parse_args() if args.analyze and args.svn_git_mapfiles: raise SystemExit("Error: --svn-git-mapfiles makes no sense with --analyze.") if not args.analyze and not args.svn_git_mapfiles: raise SystemExit("Error: --svn-git-mapfiles is required for the conversion process.") return args def read_mappings(mapfiles): """ Read files with SVN-to-Git mappings and return a list of mappings from it. """ mappings = [] for mapfile in mapfiles: with open(mapfile, "rb") as f: for line in f: line = line.rstrip(b'\r\n') # Skip blank and comment lines if not line or line.startswith(b'#'): continue # Convert to string for use with configparser later line = line.decode() # Parse the line fields = line.split('\t', 4) mapping = {'svn_url': fields[0], 'svn_rev': int(fields[1]), 'git_url': fields[2], 'git_commit': fields[3], 'state': fields[4]} mappings.append(mapping) return mappings cat_file_process = None def parse_config(blob_id): """ Create a configparser object for a .gitsvnextmodules/.gitmodules file from its blob ID. 
""" parsed_config = configparser.ConfigParser() if blob_id is not None: # Get the blob contents cat_file_process.stdin.write(blob_id + b'\n') cat_file_process.stdin.flush() objhash, objtype, objsize = cat_file_process.stdout.readline().split() contents_plus_newline = cat_file_process.stdout.read(int(objsize)+1) # Parse it parsed_config.read_string(contents_plus_newline.decode()) return parsed_config def create_blob(parsed_config): """ Create a filter-repo blob object from a .gitsvnextmodules/.gitmodules configparser object according to Git config style. """ lines = [] for sec in parsed_config.sections(): lines.append("[" + sec + "]\n") for opt in parsed_config.options(sec): lines.append("\t" + opt + " = " + parsed_config[sec][opt] + "\n") return fr.Blob(''.join(lines).encode()) def get_git_url(svn_url): """ Get the Git URL for a corresponding SVN URL. """ for entry in svn_git_mappings: if entry['svn_url'] == svn_url: return entry['git_url'] else: return None def get_git_commit_hash(svn_url, svn_rev): """ Get the Git commit hash for its corresponding SVN URL+revision. The mapping is not restricted to the exact revision, but also uses the next lower revision found. Needed when the revision was set to that of the root URL instead of to that of the specific subdirectory (e.g. trunk). TortoiseSVN behaves so when setting the external to HEAD. """ ent = None rev = 0 for entry in svn_git_mappings: if (entry['svn_url'] == svn_url and entry['svn_rev'] <= svn_rev and entry['svn_rev'] > rev): ent = entry rev = entry['svn_rev'] if ent is not None and ent['state'] == "commit": return ent['git_commit'] else: return None def get_absolute_svn_url(svnext_url, svn_root_url): """ Convert a relative svn:externals URL to an absolute one. If the format is unsupported, return the URL unchanged with success=False. If no root URL is given or the URL is absolute already, return it unchanged. In all cases, even if returned "unchanged", trailing slashes are removed. 
""" # Remove trailing slash(es) svnext_url = svnext_url.rstrip("/") svn_root_url = svn_root_url.rstrip("/") # Normalize URLs in relative format svn_root_parsed = urlsplit(svn_root_url) if svnext_url.startswith(("../", "^/../")): # unsupported return (False, svnext_url) elif not svn_root_url: pass # unchanged elif svnext_url.startswith("^/"): svnext_url = svn_root_url + svnext_url[1:] elif svnext_url.startswith("//"): svnext_url = svn_root_parsed.scheme + ":" + svnext_url elif svnext_url.startswith("/"): svnext_url = svn_root_parsed.scheme + "://" + svn_root_parsed.netloc + svnext_url return True, svnext_url def parse_revision_value(value): """ Parse the value of key 'revision' from a .gitsvnextmodules file and return it as integer. Used to handle non-numeric values like 1k, 2k, 3k etc. added by SubGit instead of 1024, 2048, 3072 etc., likewise 1m, 2m, ..., 1g, ... """ suffix = value[-1] if suffix in "kmg": mult = {"k": 1024, "m": 1024**2, "g": 1024**3} return int(value[0:-1]) * mult[suffix] else: return int(value) def add_submodule_tree_entry(commit, parsed_config, section): """ Add a submodule entry to the tree of a Git commit. SVN externals information obtained from parsed .gitsvnextmodules file. 
""" # Skip type=file (SVN file external), not possible as submodule if parsed_config[section]['type'] != 'dir': return False success, svn_url = get_absolute_svn_url(parsed_config[section]['url'], svn_root_url) # Skip unsupported URL format if not success: return False # Get SVN revision if parsed_config.has_option(section, 'revision'): svn_rev = parse_revision_value(parsed_config[section]['revision']) else: # TODO: revision has to be guessed according to commit timestamp, skip for now return False # SVN url+revision mapping to Git commit git_hash = get_git_commit_hash(svn_url, svn_rev) # Skip missing or unusable mapping if git_hash is None: return False git_hash = git_hash.encode() dirname = parsed_config[section]['path'].encode() # Add gitlink to tree commit.file_changes.append(fr.FileChange(b'M', dirname, git_hash, b'160000')) return True def get_commit_map_path(): """ Return path to commit-map file. """ git_dir = fr.GitUtils.determine_git_dir(b'.') return os.path.join(git_dir, b'filter-repo', b'commit-map') def parse_commit_map(commit_map_file): """ Parse commit-map file and return a dictionary. """ parsed_map = {} with open(commit_map_file, "rb") as f: for line in f: line = line.rstrip(b'\r\n') # Skip blank lines if not line: continue # Store old/new commits, also the "old"/"new" header in the first line old, new = line.split() parsed_map[old] = new return parsed_map def merge_commit_maps(old_commit_map, new_commit_map): """ Merge old and new commit-map by omitting intermediate commits. Return the merged dictionary. """ merged_map = {} for (key, old_val) in old_commit_map.items(): new_val = new_commit_map[old_val] if old_val in new_commit_map else old_val merged_map[key] = new_val return merged_map def write_commit_map(commit_map, commit_map_file): """ Write commit-map dictionary to file. 
""" with open(commit_map_file, 'wb') as f: for (old, new) in commit_map.items(): f.write(b'%-40s %s\n' % (old, new)) def create_report_dir(args): """ Create the directory for analysis report. """ if args.report_dir: reportdir = args.report_dir else: git_dir = fr.GitUtils.determine_git_dir(b'.') # Create the report directory as necessary results_tmp_dir = os.path.join(git_dir, b'filter-repo') if not os.path.isdir(results_tmp_dir): os.mkdir(results_tmp_dir) reportdir = os.path.join(results_tmp_dir, b'svnexternals') if os.path.isdir(reportdir): if args.force: sys.stdout.write("Warning: Removing recursively: \"%s\"" % fr.decode(reportdir)) shutil.rmtree(reportdir) else: sys.stdout.write("Error: dir already exists (use --force to delete): \"%s\"\n" % fr.decode(reportdir)) sys.exit(1) os.mkdir(reportdir) return reportdir analysis = {'dir_ext_orig': [], 'dir_ext_abs': [], 'file_ext_orig': [], 'file_ext_abs': []} def write_analysis(reportdir): """ Prepare analysis and write it to files in report directory. """ analysis['dir_ext_orig'].sort() analysis['dir_ext_abs'].sort() analysis['file_ext_orig'].sort() analysis['file_ext_abs'].sort() sys.stdout.write("Writing reports to %s..." % fr.decode(reportdir)) sys.stdout.flush() with open(os.path.join(reportdir, b"dir-externals-original.txt"), 'wb') as f: for url in analysis['dir_ext_orig']: f.write(("%s\n" % url).encode()) with open(os.path.join(reportdir, b"dir-externals-absolute.txt"), 'wb') as f: for url in analysis['dir_ext_abs']: f.write(("%s\n" % url).encode()) with open(os.path.join(reportdir, b"file-externals-original.txt"), 'wb') as f: for url in analysis['file_ext_orig']: f.write(("%s\n" % url).encode()) with open(os.path.join(reportdir, b"file-externals-absolute.txt"), 'wb') as f: for url in analysis['file_ext_abs']: f.write(("%s\n" % url).encode()) sys.stdout.write("done.\n") def analyze_externals(commit, metadata): """ Generate/extend analysis of SVN externals for a Git commit. Used as filter-repo commit callback. 
""" for change in commit.file_changes: if change.filename == b'.gitsvnextmodules' and change.type == b'M': gitsvnextmodules = parse_config(change.blob_id) for sec in gitsvnextmodules.sections(): url = gitsvnextmodules[sec]['url'] success, abs_url = get_absolute_svn_url(url, svn_root_url) # List of svn:externals URLs, also add the URL to the absolute list if # conversion was not successful if gitsvnextmodules[sec]['type'] == 'dir': if url not in analysis['dir_ext_orig']: analysis['dir_ext_orig'].append(url) if abs_url not in analysis['dir_ext_abs']: analysis['dir_ext_abs'].append(abs_url) else: if url not in analysis['file_ext_orig']: analysis['file_ext_orig'].append(url) if abs_url not in analysis['file_ext_abs']: analysis['file_ext_abs'].append(abs_url) def insert_submodules(commit, metadata): """ Insert submodules for a Git commit. Used as filter-repo commit callback. Since .gitsvnextmodules just contains the svn:externals state for the given commit, we cannot derive specific changes from that file. So we can only add/modify the gitlinks according to .gitsvnextmodules (without knowing whether adding a new or modifying an existing or even "modifying" an unchanged submodule, but none of that really matters). We do not have information about deleted externals, those will be handled in a separate filter run afterwards. The .gitmodules file however will already be correct in this function because we don't need to know about specific changes to add, modify or delete it. 
""" for change in commit.file_changes: if change.filename == b'.gitsvnextmodules' and change.type in (b'M', b'D'): gitsvnextmodules = parse_config(change.blob_id) gitmodules = configparser.ConfigParser() # Add gitlinks to the tree and prepare .gitmodules file content for sec in gitsvnextmodules.sections(): if add_submodule_tree_entry(commit, gitsvnextmodules, sec): # Gitlink added # -> Add this entry to .gitmodules as well # Create the section name string manually, do not rely on # .gitsvnextmodules to always use the proper section name. sec_name = 'submodule "' + gitsvnextmodules[sec]['path'] + '"' gitmodules[sec_name] = {} # submodule..path gitmodules[sec_name]['path'] = gitsvnextmodules[sec]['path'] # submodule..url success, svn_url = get_absolute_svn_url(gitsvnextmodules[sec]['url'], svn_root_url) git_url = get_git_url(svn_url) if git_url is not None: gitmodules[sec_name]['url'] = git_url else: # Abort, but this will not happen in practice, catched in # add_submodule_tree_entry() via get_git_commit_hash() already. raise SystemExit("Error: No Git URL found in mapping although a commit hash could be found.") # Write blob and adapt tree for .gitmodules if gitmodules.sections(): # Create a blob object from the content and add it to the tree. blob = create_blob(gitmodules) filter.insert(blob) commit.file_changes.append(fr.FileChange(b'M', b'.gitmodules', blob.id, b'100644')) else: # Delete the file, even if a "git rm" of all submodules keeps it empty. commit.file_changes.append(fr.FileChange(b'D', b'.gitmodules')) def delete_submodules(commit, metadata): """ Delete submodules from a Git commit. Used as filter-repo commit callback. Delete all submodules (inserted in the previous filter run) without an entry in .gitsvnextmodules, these were real deletions of externals, which couldn't be detected before. Only the tree entries have to be removed because the .gitmodules file is already in correct state from previous filter run. 
""" for change in commit.file_changes: if change.filename == b'.gitsvnextmodules' and change.type in (b'M', b'D'): gitsvnextmodules = parse_config(change.blob_id) # Search for all submodules in the tree output = subprocess.check_output('git ls-tree -d -r -z'.split() + [commit.original_id]) for line in output.split(b'\x00'): if not line: continue mode_objtype_objid, dirname = line.split(b'\t', 1) mode, objtype, objid = mode_objtype_objid.split(b' ') if mode == b'160000' and objtype == b'commit': # Submodule found # -> Delete it if there is no corresponding entry in # .gitsvnextmodules, keep/reinsert it otherwise for sec in gitsvnextmodules.sections(): if gitsvnextmodules[sec]['path'].encode() == dirname: # Reinsert it, might have been deleted in previous commits if add_submodule_tree_entry(commit, gitsvnextmodules, sec): # And remove the config section because this external has been # converted gitsvnextmodules.remove_section(sec) break else: # Delete it commit.file_changes.append(fr.FileChange(b'D', dirname)) # Rewrite .gitsvnextmodules to contain the unhandled externals only, # delete it if empty (all externals converted). if gitsvnextmodules.sections(): # Create a blob object from the content and replace the original one. 
blob = create_blob(gitsvnextmodules) filter.insert(blob) change.blob_id = blob.id else: if change.type == b'M': # File became empty, delete it commit.file_changes.append(fr.FileChange(b'D', b'.gitsvnextmodules')) break # avoid endless for loop #else: # File was empty already, delete command already present in stream my_args = parse_args() # Use passed URL without trailing slash(es) if my_args.svn_root_url: svn_root_url = my_args.svn_root_url.rstrip("/") # Arguments forwarded to filter-repo extra_args = [] if my_args.force: extra_args = ['--force'] if my_args.refs: extra_args += ['--refs'] + my_args.refs cat_file_process = subprocess.Popen(['git', 'cat-file', '--batch'], stdin = subprocess.PIPE, stdout = subprocess.PIPE) if my_args.analyze: # Analysis reportdir = create_report_dir(my_args) fr_args = fr.FilteringOptions.parse_args(['--dry-run'] + extra_args) filter = fr.RepoFilter(fr_args, commit_callback=analyze_externals) filter.run() write_analysis(reportdir) else: # Conversion svn_git_mappings = read_mappings(my_args.svn_git_mapfiles) # There are no references to commit hashes in commit messages because this # script runs on a Git repository converted from a Subversion repository. 
fr_args = fr.FilteringOptions.parse_args(['--preserve-commit-hashes', '--preserve-commit-encoding', '--replace-refs', 'update-no-add'] + extra_args) filter = fr.RepoFilter(fr_args, commit_callback=insert_submodules) filter.run() # Store commit-map after first run first_commit_map = parse_commit_map(get_commit_map_path()) filter = fr.RepoFilter(fr_args, commit_callback=delete_submodules) filter.run() # Update commit-map after second run, based on original IDs second_commit_map = parse_commit_map(get_commit_map_path()) merged_commit_map = merge_commit_maps(first_commit_map, second_commit_map) write_commit_map(merged_commit_map, get_commit_map_path()) cat_file_process.stdin.close() cat_file_process.wait() git-filter-repo-2.45.0/contrib/filter-repo-demos/filter-branch-ish000077700000000000000000000000001464611705400275302filter-lamelyustar00rootroot00000000000000git-filter-repo-2.45.0/contrib/filter-repo-demos/filter-lamely000077500000000000000000000707441464611705400243300ustar00rootroot00000000000000#!/usr/bin/env python3 """This is a bug compatible-ish[1] reimplementation of filter-branch, which happens to be faster. The goal is _only_ to show filter-repo's flexibility in re-implementing other types of history rewriting commands. It is not meant for actual end-user use, because filter-branch (and thus filter-lamely) is an abomination of user interfaces: * it is difficult to learn, except for a few exceedingly trivial rewrites * it is difficult to use; even for expert users like me I often have to spend significant time to craft the filters to do what is needed * it is painfully slow to use: the slow execution (even if filter-lamely is several times faster than filter-branch it will still be far slower than filter-repo) is doubly problematic because users have to retry their commands often to see if they've crafted the right filters, so the real execution time is much worse than what benchmarks typically show. 
(Benchmarks don't include how long it took to come up with the right command.) * it provides really bad feedback: broken filters often modify history incorrectly rather than providing errors; even when errors are printed, it takes forever before the errors are shown, the errors are lost in a sea of output, and no context about which commits were involved are saved. * users cannot share commands they come up with very well, because BSD vs. GNU userland differences will result in errors -- causing the above problems to be repeated and/or resulting in silent corruption of repos * the usability defaults are atrocious... * partial history rewrites * backup to refs/original/ * no automatic post-run cleanup * not pruning empty commits * not rewriting commit hashes in commit messages * ...and the atrocious defaults combine for even worse effects: * users mix up old and new history, push both, things get merged, and then they have even more of a mess with banned objects still floating around * since users can run arbitrary commands in the filters, relying on the local repo to keep a backup of itself seems suspect * refs/original/ doesn't correctly back up tags (it dereferences them), so it isn't a safe mechanism for recovery even if all goes well * even if the backups in refs/original/ were good, many users don't know how to restore using that mechanism. So they clone before filtering and just nuke the clone if the filtering goes poorly. * --tag-name-filter writes out new tags but leaves the old ones around, making claims like "just clone the repo to get rid of the old history" a farce. It also makes it hard to extricate old vs. 
new bits of history, as if the default to partial history rewrites wasn't bad enough * since filtering can result in lots of empty commits, filter-branch at least provides an option to nuke all empty commits, but naturally that includes the empty commits that were intentionally added to the original reposository as opposed to just commits that become empty due to filtering. And, for good measure, filter-branch's --prune-empty actually still misses some commits that become empty. * it's extremely difficult in filter-branch to rewrite commit hashes in commit messages sanely. It requires using undocumented capabilities and even then is going to be extremely painful and slow. As long as --commit-filter isn't used, I could do it in filter-lamely with just a one-line change, but the point was demonstrating compatibility with a horrible tool, not showing how we can make it ever so slightly less awful. [1] Replacing git-filter-branch with this script will still pass all the git-v2.22.0 regression tests. However, I know those tests aren't thorough enough and that I did break backward compatibility in some cases. But, assuming people are crazy enough to want filter-branch to continue to exist, I assert that filter-lamely would be a better filter-branch due to its improved speed. I won't maintain or improve filter-lamely though, because the only proper thing to do with filter-branch is attempt to rewrite our collective history so that people are unaware of its existence. People should use filter-repo instead. Intentional differences from git-filter-branch: * (Perf) --tree-filter and --index-filter only operate on files that have changed since the previous commit, which significantly reduces the amount of work needed. This requires special efforts to correctly handle deletes when the filters attempt to rename files, but provides significant perf improvements. 
There is a vanishingly small chance that someone out there is depending on rewriting all files in every commit and does so differently depending on topology of commits instead of contents of files and is thus adversely affected by this change. I doubt it, though. * I vastly simplified the map() function to just ignore writing out the mapping; I've never seen anyone explicitly use it, and filter-repo handles remapping to ancestors without it. I dare you to find anyone that was reading the $workdir/../map/ directory and using it in their filtering. * When git-replace was introduced, --parent-filter became obsolete and deprecated IMO. As such, I didn't bother reimplementing. If I were to reimplement it, I'd just do an extra loop over commits and invoke git-replace based on the --parent-filter output or something similar to that. * I took a bit of liberty in the implementation of --state-branch; I still pass the regression tests, but I kind of violated the spirit of the option. I may actually circle back and fix this, if I add such a similarly named option to filter-repo. """ """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. """ import argparse import datetime import os import shutil import subprocess import sys try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. 
Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") subproc = fr.subproc class UserInterfaceNightmare: def __init__(self): args = UserInterfaceNightmare.parse_args() # Fix up args.refs if not args.refs: args.refs = ["HEAD"] elif args.refs[0] == '--': args.refs = args.refs[1:] # Make sure args.d is an absolute path if not args.d.startswith(b'/'): args.d = os.path.abspath(args.d) # Save the args self.args = args self._orig_refs = {} self._special_delete_mode = b'deadbeefdeadbeefdeadbeefdeadbeefdeadbeef' self._commit_filter_functions = b''' EMPTY_TREE=$(git hash-object -t tree /dev/null) # if you run 'skip_commit "$@"' in a commit filter, it will print # the (mapped) parents, effectively skipping the commit. skip_commit() { shift; while [ -n "$1" ]; do shift; echo "$1"; shift; done; } # map is lame; just fake it. map() { echo "$1" } # if you run 'git_commit_non_empty_tree "$@"' in a commit filter, # it will skip commits that leave the tree untouched, commit the other. git_commit_non_empty_tree() { if test $# = 3 && test "$1" = $(git rev-parse "$3^{tree}"); then echo "$3" elif test $# = 1 && test "$1" = $EMPTY_TREE; then : else git commit-tree "$@" fi } ''' @staticmethod def parse_args(): parser = argparse.ArgumentParser( description='Mimic filter-branch functionality, for those who ' 'lamely have not upgraded their scripts to filter-repo') parser.add_argument('--setup', metavar='', help=("Common commands to be included before every other filter")) parser.add_argument('--subdirectory-filter', metavar='', help=("Only include paths under the given directory and rewrite " "that directory to be the new project root.")) parser.add_argument('--env-filter', metavar='', help=("Modify the name/email/date of either author or committer")) parser.add_argument('--tree-filter', metavar='', help=("Command to rewrite the tree and its contents. 
The working " "directory will be set to the root of the checked out tree. " "New files are auto-added, disappeared, etc.")) parser.add_argument('--index-filter', metavar='', help=("Command to rewrite the index. Similar to the tree filter, " "but there are no working tree files which makes it " "faster. Commonly used with `git rm --cached " "--ignore-unmatch` and `git update-index --index-info`")) parser.add_argument('--parent-filter', metavar='', help=("Bail with an error; deprecated years ago")) parser.add_argument('--remap-to-ancestor', action='store_true', # Does nothing, this option is always on. Only exists # because filter-branch once allowed it to be off and # so some tests pass this option. help=argparse.SUPPRESS) parser.add_argument('--msg-filter', metavar='', help=("Command to run for modifying commit and tag messages which " "are received on standard input; standard output will be used " "as the new message.")) parser.add_argument('--commit-filter', metavar='', help=("A command to perform the commit. It will be called with " "arguments of the form \" [(-p )...]" "\" and the log message on stdin. The commit id is expected " "on stdout. The simplest commit filter would be 'git " "commit-tree $@'")) parser.add_argument('--tag-name-filter', metavar='', help=("This filter is rewriting tag names. 
It will be called " "with tag names on stdin and expect a new tag name on stdout.")) parser.add_argument('--prune-empty', action='store_true', help=("Prune empty commits, even commits that were intentionally " "added as empty commits in the original repository and really " "shouldn't be removed.")) parser.add_argument('--original', metavar='', type=os.fsencode, default=b'refs/original/', help=("Alter misguided backup strategy to store refs under " " instead of refs/original/")) parser.add_argument('-d', metavar='', default='.git-rewrite', type=os.fsencode, help=("Alter the temporary directory used for rewriting")) parser.add_argument('--force', '-f', action='store_true', help=("Run even if there is an existing temporary directory or " "an existing backup (e.g. under refs/original/)")) parser.add_argument('--state-branch', metavar='', help=("Do nothing; filter-lamely is enough faster than " "filter-branch that it doesn't need incrementalism.")) parser.add_argument('refs', metavar='rev-list options', nargs=argparse.REMAINDER, help=("Arguments for git rev-list. All positive refs included by " "these options are rewritten. Sane people specify things like " "--all, though that annoyingly requires prefacing with --")) args = parser.parse_args() # Make setup apply to all the other shell filters if args.setup: if args.env_filter: args.env_filter = args.setup + "\n" + args.env_filter if args.tree_filter: args.tree_filter = args.setup + "\n" + args.tree_filter if args.index_filter: args.index_filter = args.setup + "\n" + args.index_filter if args.msg_filter: args.msg_filter = args.setup + "\n" + args.msg_filter if args.commit_filter: args.commit_filter = args.setup + "\n" + args.commit_filter if args.tag_name_filter: args.tag_name_filter = args.setup + "\n" + args.tag_name_filter return args @staticmethod def _get_dereferenced_refs(): # [BUG-COMPAT] We could leave out the --dereference and the '^{}' handling # and fix a nasty bug from filter-branch. 
But, as stated elsewhere, the # goal is not to provide sane behavior, but to match what filter-branch # does. cur_refs = {} cmd = 'git show-ref --head --dereference' output = subproc.check_output(cmd.split()) for line in output.splitlines(): objhash, refname = line.split() if refname.endswith(b'^{}'): refname = refname[0:-3] cur_refs[refname] = objhash return cur_refs def _get_and_check_orig_refs(self): self._orig_refs = self._get_dereferenced_refs() if any(ref.startswith(self.args.original) for ref in self._orig_refs): if self.args.force: cmds = b''.join([b"delete %s\n" % r for r in sorted(self._orig_refs) if r.startswith(self.args.original)]) subproc.check_output('git update-ref --no-deref --stdin'.split(), input = cmds) else: raise SystemExit("Error: {} already exists. Force overwriting with -f" .format(fr.decode(self.args.original))) def _write_original_refs(self): new_refs = self._get_dereferenced_refs() exported_refs, imported_refs = self.filter.get_exported_and_imported_refs() overwritten = imported_refs & exported_refs cmds = b''.join([b"update %s%s %s\n" % (self.args.original, r, self._orig_refs[r]) for r in sorted(overwritten) if r not in new_refs or self._orig_refs[r] != new_refs[r]]) subproc.check_output('git update-ref --no-deref --stdin'.split(), input = cmds) def _setup(self): if self.args.force and os.path.exists(self.args.d): shutil.rmtree(self.args.d) if os.path.exists(self.args.d): raise SystemExit("Error: {} already exists; use --force to bypass." .format(self.args.d)) self._get_and_check_orig_refs() os.makedirs(self.args.d) self.index_file = os.path.join(self.args.d, b'temp_index') self.tmp_tree = os.path.join(self.args.d, b't') os.makedirs(self.tmp_tree) # Hack (stupid regression tests depending on implementation details # instead of verifying user-visible and intended functionality...) 
if self.args.d.endswith(b'/dfoo'): with open(os.path.join(self.args.d, b'backup-refs'), 'w') as f: f.write('drepo\n') # End hack def _cleanup(self): shutil.rmtree(self.args.d) def _check_for_unsupported_args(self): if self.args.parent_filter: raise SystemExit("Error: --parent-filter was deprecated years ago with git-replace(1). Use it instead.") def get_extended_refs(self): if not self.args.tag_name_filter: return self.args.refs if '--all' in self.args.refs or '--tags' in self.args.refs: # No need to follow tags pointing at refs we are exporting if we are # already exporting all tags; besides, if we do so fast export will # buggily export such tags multiple times, and fast-import will scream # "error: multiple updates for ref 'refs/tags/$WHATEVER' not allowed" return self.args.refs # filter-branch treats --tag-name-filter as an implicit "follow-tags"-ish # behavior. So, we need to determine which tags point to commits we are # rewriting. output = subproc.check_output(['git', 'rev-list'] + self.args.refs) all_commits = set(output.splitlines()) cmd = 'git show-ref --tags --dereference'.split() output = subproc.check_output(cmd) # In ideal world, follow_tags would be a list of tags which point at one # of the commits in all_commits. But since filter-branch is insane and # we need to match its insanity, we instead store the tags as the values # of a dict, with the keys being the new name for the given tags. The # reason for this is due to problems with multiple tags mapping to the # same name and filter-branch not wanting to error out on this obviously # broken condition, as noted below. 
follow_tags = {} for line in output.splitlines(): objhash, refname = line.split() if refname.endswith(b'^{}'): refname = refname[0:-3] refname = fr.decode(refname) if refname in self.args.refs: # Don't specify the same tag multiple times, or fast export will # buggily export it multiple times, and fast-import will scream that # "error: multiple updates for ref 'refs/tags/$WHATEVER' not allowed" continue if objhash in all_commits: newname = self.tag_rename(refname.encode()) # [BUG-COMPAT] What if multiple tags map to the same newname, you ask? # Well, a sane program would detect that and give the user an error. # fast-import does precisely that. We could do it too, but providing # sane behavior goes against the core principle of filter-lamely: # # dispense with sane behavior; do what filter-branch does instead # # And filter-branch has a testcase that relies on no error being # shown to the user with only an update corresponding to the tag # which was originally alphabetically last being performed. We rely # on show-ref printing tags in alphabetical order to match that lame # functionality from filter-branch. follow_tags[newname] = refname return self.args.refs + list(follow_tags.values()) def _populate_full_index(self, commit): subproc.check_call(['git', 'read-tree', commit]) def _populate_index(self, file_changes): subproc.check_call('git read-tree --empty'.split()) # [BUG-COMPAT??] filter-branch tests are weird, and filter-branch itself # manually sets GIT_ALLOW_NULL_SHA1, so to pass the same tests we need to # as well. os.environ['GIT_ALLOW_NULL_SHA1'] = '1' p = subproc.Popen('git update-index -z --index-info'.split(), stdin = subprocess.PIPE) for change in file_changes: if change.type == b'D': # We need to write something out to the index for the delete in # case they are renaming all files (e.g. moving into a subdirectory); # they need to be able to rename what is deleted so it actually deletes # the right thing. 
p.stdin.write(b'160000 %s\t%s\x00' % (self._special_delete_mode, change.filename)) else: p.stdin.write(b'%s %s\t%s\x00' % (change.mode, change.blob_id, change.filename)) p.stdin.close() if p.wait() != 0: raise SystemExit("Failed to setup index for tree or index filter") del os.environ['GIT_ALLOW_NULL_SHA1'] def _update_file_changes_from_index(self, commit): new_changes = {} output = subproc.check_output('git ls-files -sz'.split()) for line in output.split(b'\x00'): if not line: continue mode_thru_stage, filename = line.split(b'\t', 1) mode, objid, stage = mode_thru_stage.split(b' ') if mode == b'160000' and objid == self._special_delete_mode: new_changes[filename] = fr.FileChange(b'D', filename) elif set(objid) == set(b'0'): # [BUG-COMPAT??] Despite filter-branch setting GIT_ALLOW_NULL_SHA1 # before calling read-tree, it expects errors to be thrown if any null # shas remain. Crazy filter-branch. raise SystemExit("Error: file {} has broken id {}" .format(fr.decode(filename), fr.decode(objid))) else: new_changes[filename] = fr.FileChange(b'M', filename, objid, mode) commit.file_changes = list(new_changes.values()) def _env_variables(self, commit): # Define GIT_COMMIT and GIT_{AUTHOR,COMMITTER}_{NAME,EMAIL,DATE} envvars = b'' envvars += b'export GIT_COMMIT="%s"\n' % commit.original_id envvars += b'export GIT_AUTHOR_NAME="%s"\n' % commit.author_name envvars += b'export GIT_AUTHOR_EMAIL="%s"\n' % commit.author_email envvars += b'export GIT_AUTHOR_DATE="@%s"\n' % commit.author_date envvars += b'export GIT_COMMITTER_NAME="%s"\n' % commit.committer_name envvars += b'export GIT_COMMITTER_EMAIL="%s"\n' % commit.committer_email envvars += b'export GIT_COMMITTER_DATE="@%s"\n' % commit.committer_date return envvars def fixup_commit(self, commit, metadata): if self.args.msg_filter: commit.message = subproc.check_output(self.args.msg_filter, shell=True, input = commit.message) if self.args.env_filter and not self.args.commit_filter: envvars = self._env_variables(commit) 
echo_results = b''' echo "${GIT_AUTHOR_NAME}" echo "${GIT_AUTHOR_EMAIL}" echo "${GIT_AUTHOR_DATE}" echo "${GIT_COMMITTER_NAME}" echo "${GIT_COMMITTER_EMAIL}" echo "${GIT_COMMITTER_DATE}" ''' shell_snippet = envvars + self.args.env_filter.encode() + echo_results output = subproc.check_output(['/bin/sh', '-c', shell_snippet]).strip() last = output.splitlines()[-6:] commit.author_name = last[0] commit.author_email = last[1] assert(last[2][0:1] == b'@') commit.author_date = last[2][1:] commit.committer_name = last[3] commit.committer_email = last[4] assert(last[5][0:1] == b'@') commit.committer_date = last[5][1:] if not (self.args.tree_filter or self.args.index_filter or self.args.commit_filter): return # os.environ needs its arguments to be strings because it will call # .encode on them. So lame when we already know the necessary bytes, # but whatever...just call fr.decode() and be done with it. os.environ['GIT_INDEX_FILE'] = fr.decode(self.index_file) os.environ['GIT_WORK_TREE'] = fr.decode(self.tmp_tree) if self.args.tree_filter or self.args.index_filter: full_tree = False deletion_changes = [x for x in commit.file_changes if x.type == b'D'] if len(commit.parents) >= 1 and not isinstance(commit.parents[0], int): # When a commit's parent is a commit hash rather than an integer, # it means that we are doing a partial history rewrite with an # excluded revision range. In such a case, the first non-excluded # commit (i.e. this commit) won't be building on a bunch of history # that was filtered, so we filter the entire tree for that commit # rather than just the files it modified relative to its parent. 
full_tree = True self._populate_full_index(commit.parents[0]) else: self._populate_index(commit.file_changes) if self.args.tree_filter: # Make sure self.tmp_tree is a new clean directory and we're in it if os.path.exists(self.tmp_tree): shutil.rmtree(self.tmp_tree) os.makedirs(self.tmp_tree) # Put the files there subproc.check_call('git checkout-index --all'.split()) # Call the tree filter subproc.call(self.args.tree_filter, shell=True, cwd=self.tmp_tree) # Add the files, then move out of the directory subproc.check_call('git add -A'.split()) if self.args.index_filter: subproc.call(self.args.index_filter, shell=True, cwd=self.tmp_tree) self._update_file_changes_from_index(commit) if full_tree: commit.file_changes.insert(0, fr.FileChange(b'DELETEALL')) elif deletion_changes and self.args.tree_filter: orig_deletions = set(x.filename for x in deletion_changes) # Populate tmp_tree with all the deleted files, each containing its # original name shutil.rmtree(self.tmp_tree) os.makedirs(self.tmp_tree) for change in deletion_changes: dirname, basename = os.path.split(change.filename) realdir = os.path.join(self.tmp_tree, dirname) if not os.path.exists(realdir): os.makedirs(realdir) with open(os.path.join(realdir, basename), 'bw') as f: f.write(change.filename) # Call the tree filter subproc.call(self.args.tree_filter, shell=True, cwd=self.tmp_tree) # Get the updated file deletions updated_deletion_paths = set() for dirname, subdirs, files in os.walk(self.tmp_tree): for basename in files: filename = os.path.join(dirname, basename) with open(filename, 'br') as f: orig_name = f.read() if orig_name in orig_deletions: updated_deletion_paths.add(filename[len(self.tmp_tree)+1:]) # ...and finally add them to the list commit.file_changes += [fr.FileChange(b'D', filename) for filename in updated_deletion_paths] if self.args.commit_filter: # Define author and committer info for commit_filter envvars = self._env_variables(commit) if self.args.env_filter: envvars += 
self.args.env_filter.encode() + b'\n' # Get tree and parents we need to pass cmd = b'git rev-parse %s^{tree}' % commit.original_id tree = subproc.check_output(cmd.split()).strip() parent_pairs = zip(['-p']*len(commit.parents), commit.parents) # Define the command to run combined_shell_snippet = (self._commit_filter_functions + envvars + self.args.commit_filter.encode()) cmd = ['/bin/sh', '-c', combined_shell_snippet, "git commit-tree", tree] cmd += [item for pair in parent_pairs for item in pair] # Run it and get the new commit new_commit = subproc.check_output(cmd, input = commit.message).strip() commit.skip(new_commit) reset = fr.Reset(commit.branch, new_commit) self.filter.insert(reset) del os.environ['GIT_WORK_TREE'] del os.environ['GIT_INDEX_FILE'] def tag_rename(self, refname): if not self.args.tag_name_filter or not refname.startswith(b'refs/tags/'): return refname newname = subproc.check_output(self.args.tag_name_filter, shell=True, input=refname[10:]).strip() return b'refs/tags/' + newname def deref_tags(self, tag, metadata): '''[BUG-COMPAT] fast-export and fast-import nicely and naturally handle tag objects. Trying to break this and destroy the correct handling of tags requires extra work. In particular, De-referencing tags and thus forcing all tags to be lightweight is something that would only be done by someone who was insane, or someone who was trying to mimic filter-branch's functionality. But then, perhaps I repeat myself. Anyway, let's mimic yet another insanity of filter-branch here... 
''' if self.args.tag_name_filter: return tag.skip() reset = fr.Reset(tag.ref, tag.from_ref) self.filter.insert(reset, direct_insertion = False) def muck_stuff_up(self): self._check_for_unsupported_args() self._setup() extra_args = [] if self.args.subdirectory_filter: extra_args = ['--subdirectory-filter', self.args.subdirectory_filter] self.args.prune_empty = True fr_args = fr.FilteringOptions.parse_args(['--preserve-commit-hashes', '--preserve-commit-encoding', '--partial', '--force'] + extra_args) fr_args.prune_empty = 'always' if self.args.prune_empty else 'never' fr_args.refs = self.get_extended_refs() self.filter = fr.RepoFilter(fr_args, commit_callback=self.fixup_commit, refname_callback=self.tag_rename, tag_callback=self.deref_tags) self.filter.run() self._write_original_refs() self._cleanup() overrides = ('GIT_TEST_DISALLOW_ABBREVIATED_OPTIONS', 'I_PROMISE_TO_UPGRADE_TO_FILTER_REPO') if not any(x in os.environ for x in overrides) and sys.argv[1:] != ['--help']: print(""" WARNING: While filter-lamely is a better filter-branch than filter-branch, it is vastly inferior to filter-repo. Please use filter-repo instead. (You can squelch this warning and five second pause with export {}=1 )""".format(overrides[-1])) import time time.sleep(5) filter_branch = UserInterfaceNightmare() filter_branch.muck_stuff_up() git-filter-repo-2.45.0/contrib/filter-repo-demos/insert-beginning000077500000000000000000000047501464611705400250160ustar00rootroot00000000000000#!/usr/bin/env python3 """ This is a simple program that will insert some regular file into the root commit(s) of history, e.g. adding a file named LICENSE or COPYING to the first commit. It also rewrites commit hashes in commit messages to update them based on these changes. """ """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. 
""" # Technically, this program could be replaced by a one-liner: # git filter-repo --force --commit-callback "if not commit.parents: commit.file_changes.append(FileChange(b'M', $RELATIVE_TO_PROJECT_ROOT_PATHNAME, b'$(git hash-object -w $FILENAME)', b'100644'))" # but let's do it as a full-fledged program that imports git_filter_repo # anyway... import argparse import os import subprocess try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") parser = argparse.ArgumentParser( description='Add a file to the root commit(s) of history') parser.add_argument('--file', type=os.fsencode, help=("Relative-path to file whose contents should be added to root commit(s)")) args = parser.parse_args() if not args.file: raise SystemExit("Error: Need to specify the --file option") fhash = subprocess.check_output(['git', 'hash-object', '-w', args.file]).strip() fmode = b'100755' if os.access(args.file, os.X_OK) else b'100644' # FIXME: I've assumed the file wasn't a directory or symlink... def fixup_commits(commit, metadata): if len(commit.parents) == 0: commit.file_changes.append(fr.FileChange(b'M', args.file, fhash, fmode)) # FIXME: What if the history already had a file matching the given name, # but which didn't exist until later in history? Is the intent for the # user to keep the other version that existed when it existed, or to # overwrite the version for all of history with the specified file? 
I # don't know, but if it's the latter, we'd need to add an 'else' clause # like the following: #else: # commit.file_changes = [x for x in commit.file_changes # if x.filename != args.file] fr_args = fr.FilteringOptions.parse_args(['--preserve-commit-encoding', '--force', '--replace-refs', 'update-no-add']) filter = fr.RepoFilter(fr_args, commit_callback=fixup_commits) filter.run() git-filter-repo-2.45.0/contrib/filter-repo-demos/lint-history000077500000000000000000000162351464611705400242220ustar00rootroot00000000000000#!/usr/bin/env python3 """ This is a simple program that will run a linting program on all non-binary files in history. It also rewrites commit hashes in commit messages to refer to the new commits with the rewritten files. You call it like this: lint-history my-lint-command --arg whatever --another-arg and it will repeatedly call my-lint-command --arg whatever --another-arg $TEMPORARY_FILE with $TEMPORARY_FILE having contents of some file from history. NOTE: Several people have taken and modified this script for a variety of special cases (linting python files, linting jupyter notebooks, just linting java files, etc.) and posted their modifications at https://github.com/newren/git-filter-repo/issues/45 Feel free to take a look and adopt some of their ideas. Most of these modifications are probably strictly unnecessary since you could just make a lint-script that takes the filename, checks that it matches what you want, and then calls the real linter. But I guess folks don't like making an intermediate script. So I eventually added the --relevant flag for picking out certain files providing yet another way to handle it. """ """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. """ # Technically, if you are only running on all non-binary files and don't care # about filenames, then this program could be replaced by a "one-liner"; e.g. 
# git filter-repo --force --blob-callback ' # if not b"\0" in blob.data[0:8192]: # filename = ".git/info/tmpfile" # with open(filename, "wb") as f: # f.write(blob.data) # subprocess.check_call(["lint_program", "--some", "arg", filename]) # with open(filename, "rb") as f: # blob.data = f.read() # os.remove(filename) # ' # but let's do it as a full-fledged program that imports git_filter_repo # and show how to also do it with filename handling... import argparse import os import subprocess import tempfile try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") example_text = '''CALLBACK When you pass --relevant 'BODY', the following style of function will be compiled and called: def is_relevant(filename): BODY Where filename is the full relative path from the toplevel of the repository. Thus, to only run on files with a ".txt" extension you would run lint-history --relevant 'return filename.endswith(b".txt")' ... EXAMPLES To run dos2unix on all non-binary files in history: lint-history dos2unix To run eslint --fix on all .js files in history: lint-history --relevant 'return filename.endswith(b".js")' eslint --fix INTERNALS Linting of files in history will be done by writing the files to a temporary directory before running the linting program; the location of this temporary directory can be controlled via the TMPDIR environment variable as per https://docs.python.org/3/library/tempfile.html#tempfile.mkdtemp. ''' parser = argparse.ArgumentParser(description='Run a program (e.g. code formatter or linter) on files in history', epilog = example_text, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--relevant', metavar="FUNCTION_BODY", help=("Python code for determining whether to apply linter to a " "given filename. Implies --filenames-important. 
See CALLBACK " "below.")) parser.add_argument('--filenames-important', action='store_true', help=("By default, contents are written to a temporary file with a " "random name. If the linting program needs to know the file " "basename to operate correctly (e.g. because it needs to know " "the file's extension), then pass this argument")) parser.add_argument('--refs', nargs='+', help=("Limit history rewriting to the specified refs. " "Implies --partial of git-filter-repo (and all its " "implications).")) parser.add_argument('command', nargs=argparse.REMAINDER, help=("Lint command to run, other than the filename at the end")) lint_args = parser.parse_args() if not lint_args.command: raise SystemExit("Error: Need to specify a lint command") if len(lint_args.command) > 1 and lint_args.command[0] == '--': lint_args.command.pop(0) tmpdir = None blobs_handled = {} cat_file_process = None def lint_with_real_filenames(commit, metadata): for change in commit.file_changes: if change.blob_id in blobs_handled: change.blob_id = blobs_handled[change.blob_id] elif change.type == b'D': continue elif not is_relevant(change.filename): continue else: # Get the old blob contents cat_file_process.stdin.write(change.blob_id + b'\n') cat_file_process.stdin.flush() objhash, objtype, objsize = cat_file_process.stdout.readline().split() contents_plus_newline = cat_file_process.stdout.read(int(objsize)+1) # Write it out to a file with the same basename filename = os.path.join(tmpdir, os.path.basename(change.filename)) with open(filename, "wb") as f: f.write(contents_plus_newline[:-1]) # Lint the file subprocess.check_call(lint_args.command + [filename.decode('utf-8')]) # Get the new contents with open(filename, "rb") as f: blob = fr.Blob(f.read()) # Insert the new file into the filter's stream, and remove the tempfile filter.insert(blob) os.remove(filename) # Record our handling of the blob and use it for this change blobs_handled[change.blob_id] = blob.id change.blob_id = blob.id def 
lint_non_binary_blobs(blob, metadata): if not b"\0" in blob.data[0:8192]: filename = '.git/info/tmpfile' with open(filename, "wb") as f: f.write(blob.data) subprocess.check_call(lint_args.command + [filename]) with open(filename, "rb") as f: blob.data = f.read() os.remove(filename) if lint_args.filenames_important and not lint_args.relevant: lint_args.relevant = 'return True' if lint_args.relevant: body = lint_args.relevant exec('def is_relevant(filename):\n '+'\n '.join(body.splitlines()), globals()) lint_args.filenames_important = True input_args = [] if lint_args.refs: input_args = ["--refs",] + lint_args.refs args = fr.FilteringOptions.parse_args(input_args, error_on_empty = False) args.force = True if lint_args.filenames_important: tmpdir = tempfile.mkdtemp().encode() cat_file_process = subprocess.Popen(['git', 'cat-file', '--batch'], stdin = subprocess.PIPE, stdout = subprocess.PIPE) filter = fr.RepoFilter(args, commit_callback=lint_with_real_filenames) filter.run() cat_file_process.stdin.close() cat_file_process.wait() else: if not os.path.exists('.git/info'): os.makedirs('.git/info') filter = fr.RepoFilter(args, blob_callback=lint_non_binary_blobs) filter.run() git-filter-repo-2.45.0/contrib/filter-repo-demos/signed-off-by000077500000000000000000000046431464611705400242060ustar00rootroot00000000000000#!/usr/bin/env python3 """ This is a simple program that will add Signed-off-by: tags to a range of commits. Example usage, to add a signed-off-by trailer to every commit that is not in next but is in any of master, develop, or maint: signed-off-by master develop maint ^next More likely called as: signed-off-by master~4..master There's no real reason to use this script since `rebase --signoff` exists; it's mostly just a demonstration of what could be done. """ """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. 
""" import argparse import re import subprocess try: import git_filter_repo as fr except ImportError: raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?") parser = argparse.ArgumentParser( description="Add 'Signed-off-by:' tags to a range of commits") parser.add_argument('rev_list_args', metavar='rev-list args', nargs=argparse.REMAINDER, help=("Range of commits (need to include ref tips) to work on")) myargs = parser.parse_args() user_name = subprocess.check_output('git config user.name'.split()).rstrip() user_email = subprocess.check_output('git config user.email'.split()).rstrip() trailer = b'Signed-off-by: %s <%s>' % (user_name, user_email) def add_signed_off_by_trailer(commit, metadata): if trailer in commit.message: return # We want to add the trailer, but we want it to be separated from any # existing paragraphs by a blank line. However, if the commit message # already ends with trailers, then we want all trailers to be on adjacent # lines. if not commit.message.endswith(b'\n'): commit.message += b'\n' lastline = commit.message.splitlines()[-1] if not re.match(b'[A-Za-z0-9-_]*: ', lastline): commit.message += b'\n' commit.message += trailer # Setting source and target to anything prevents: # * remapping origin remote tracking branches to regular branches # * deletion of the origin remote # * nuking unused refs # * nuking reflogs # * repacking # so we cheat and set source and target both to '.' 
args = fr.FilteringOptions.parse_args(['--force', '--refs'] + myargs.rev_list_args) args.refs = myargs.rev_list_args filter = fr.RepoFilter(args, commit_callback=add_signed_off_by_trailer) filter.run() git-filter-repo-2.45.0/git-filter-repo000077500000000000000000005072641464611705400176000ustar00rootroot00000000000000#!/usr/bin/env python3 """ git-filter-repo filters git repositories, similar to git filter-branch, BFG repo cleaner, and others. The basic idea is that it works by running git fast-export | filter | git fast-import where this program not only launches the whole pipeline but also serves as the 'filter' in the middle. It does a few additional things on top as well in order to make it into a well-rounded filtering tool. git-filter-repo can also be used as a library for more involved filtering operations; however: ***** API BACKWARD COMPATIBILITY CAVEAT ***** Programs using git-filter-repo as a library can reach pretty far into its internals, but I am not prepared to guarantee backward compatibility of all APIs. I suspect changes will be rare, but I reserve the right to change any API. Since it is assumed that repository filtering is something one would do very rarely, and in particular that it's a one-shot operation, this should not be a problem in practice for anyone. However, if you want to re-use a program you have written that uses git-filter-repo as a library (or makes use of one of its --*-callback arguments), you should either make sure you are using the same version of git and git-filter-repo, or make sure to re-test it. If there are particular pieces of the API you are concerned about, and there is not already a testcase for it in t9391-lib-usage.sh or t9392-python-callback.sh, please contribute a testcase. That will not prevent me from changing the API, but it will allow you to look at the history of a testcase to see whether and how the API changed. 
***** END API BACKWARD COMPATIBILITY CAVEAT ***** """ import argparse import collections import fnmatch import gettext import io import os import platform import re import shutil import subprocess import sys import time import textwrap from datetime import tzinfo, timedelta, datetime __all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress", "Checkpoint", "FastExportParser", "ProgressWriter", "string_to_date", "date_to_string", "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"] # The globals to make visible to callbacks. They will see all our imports for # free, as well as our public API. public_globals = ["__builtins__", "argparse", "collections", "fnmatch", "gettext", "io", "os", "platform", "re", "shutil", "subprocess", "sys", "time", "textwrap", "tzinfo", "timedelta", "datetime"] + __all__ deleted_hash = b'0'*40 write_marks = True date_format_permissive = True def gettext_poison(msg): if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover return "# GETTEXT POISON #" return gettext.gettext(msg) _ = gettext_poison def setup_gettext(): TEXTDOMAIN="git-filter-repo" podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@" if not os.path.isdir(podir): # pragma: no cover podir = None # Python has its own fallback; use that ## This looks like the most straightforward translation of the relevant ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm: #import locale #locale.setlocale(locale.LC_MESSAGES, ""); #locale.setlocale(locale.LC_TIME, ""); #locale.textdomain(TEXTDOMAIN); #locale.bindtextdomain(TEXTDOMAIN, podir); ## but the python docs suggest using the gettext module (which doesn't ## have setlocale()) instead, so: gettext.textdomain(TEXTDOMAIN); gettext.bindtextdomain(TEXTDOMAIN, podir); def _timedelta_to_seconds(delta): """ Converts timedelta to seconds """ offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000 return round(offset) class FixedTimeZone(tzinfo): """ Fixed offset in minutes east 
from UTC. """ tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$') def __init__(self, offset_string): tzinfo.__init__(self) sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups() factor = -1 if (sign and sign == b'-') else 1 self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm))) self._offset_string = offset_string def utcoffset(self, dt): return self._offset def tzname(self, dt): return self._offset_string def dst(self, dt): return timedelta(0) def string_to_date(datestring): (unix_timestamp, tz_offset) = datestring.split() return datetime.fromtimestamp(int(unix_timestamp), FixedTimeZone(tz_offset)) def date_to_string(dateobj): epoch = datetime.fromtimestamp(0, dateobj.tzinfo) return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)), dateobj.tzinfo.tzname(0))) def decode(bytestr): 'Try to convert bytestr to utf-8 for outputting as an error message.' return bytestr.decode('utf-8', 'backslashreplace') def glob_to_regex(glob_bytestr): 'Translate glob_bytestr into a regex on bytestrings' # fnmatch.translate is idiotic and won't accept bytestrings if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr)) # Create regex operating on string regex = fnmatch.translate(decode(glob_bytestr)) # FIXME: This is an ugly hack... # fnmatch.translate tries to do multi-line matching and wants the glob to # match up to the end of the input, which isn't relevant for us, so we # have to modify the regex. fnmatch.translate has used different regex # constructs to achieve this with different python versions, so we have # to check for each of them and then fix it up. It would be much better # if fnmatch.translate could just take some flags to allow us to specify # what we want rather than employing this hackery, but since it # doesn't... 
if regex.endswith(r'\Z(?ms)'): # pragma: no cover regex = regex[0:-7] elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover regex = regex[4:-3] # Finally, convert back to regex operating on bytestr return regex.encode() class PathQuoting: _unescape = {b'a': b'\a', b'b': b'\b', b'f': b'\f', b'n': b'\n', b'r': b'\r', b't': b'\t', b'v': b'\v', b'"': b'"', b'\\':b'\\'} _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})') _escape = [bytes([x]) for x in range(127)]+[ b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] _reverse = dict(map(reversed, _unescape.items())) for x in _reverse: _escape[ord(x)] = b'\\'+_reverse[x] _special_chars = [len(x) > 1 for x in _escape] @staticmethod def unescape_sequence(orig): seq = orig.group(1) return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)]) @staticmethod def dequote(quoted_string): if quoted_string.startswith(b'"'): assert quoted_string.endswith(b'"') return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence, quoted_string[1:-1]) return quoted_string @staticmethod def enquote(unquoted_string): # Option 1: Quoting when fast-export would: # pqsc = PathQuoting._special_chars # if any(pqsc[x] for x in set(unquoted_string)): # Option 2, perf hack: do minimal amount of quoting required by fast-import if unquoted_string.startswith(b'"') or b'\n' in unquoted_string: pqe = PathQuoting._escape return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"' return unquoted_string class AncestryGraph(object): """ A class that maintains a direct acycle graph of commits for the purpose of determining if one commit is the ancestor of another. """ def __init__(self): self.cur_value = 0 # A mapping from the external identifers given to us to the simple integers # we use in self.graph self.value = {} # A tuple of (depth, list-of-ancestors). Values and keys in this graph are # all integers from the self.value dict. 
    # …The depth of a commit is one more
    # than the max depth of any of its ancestors.
    self.graph = {}

    # Cached results from previous calls to is_ancestor().
    self._cached_is_ancestor = {}

  def record_external_commits(self, external_commits):
    """
    Record in graph that each commit in external_commits exists, and is
    treated as a root commit with no parents.
    """
    for c in external_commits:
      if c not in self.value:
        # Assign the next integer id and register the commit at depth 1
        # with an empty parent list (i.e. as a root).
        self.cur_value += 1
        self.value[c] = self.cur_value
        self.graph[self.cur_value] = (1, [])

  def add_commit_and_parents(self, commit, parents):
    """
    Record in graph that commit has the given parents. parents _MUST_ have
    been first recorded. commit _MUST_ not have been recorded yet.
    """
    assert all(p in self.value for p in parents)
    assert commit not in self.value

    # Get values for commit and parents
    self.cur_value += 1
    self.value[commit] = self.cur_value
    graph_parents = [self.value[x] for x in parents]

    # Determine depth for commit, then insert the info into the graph
    depth = 1
    if parents:
      depth += max(self.graph[p][0] for p in graph_parents)
    self.graph[self.cur_value] = (depth, graph_parents)

  def is_ancestor(self, possible_ancestor, check):
    """ Return whether possible_ancestor is an ancestor of check """
    # Iterative DFS from 'check' back through parents, looking for 'a'.
    # Depth pruning: an ancestor at depth <= a_depth cannot have 'a' among
    # its own ancestors, so that subtree is skipped.
    a, b = self.value[possible_ancestor], self.value[check]
    original_pair = (a,b)
    a_depth = self.graph[a][0]
    ancestors = [b]
    visited = set()
    while ancestors:
      ancestor = ancestors.pop()
      prev_pair = (a, ancestor)
      # Reuse any previously-computed answer for (a, ancestor).
      if prev_pair in self._cached_is_ancestor:
        if not self._cached_is_ancestor[prev_pair]:
          continue
        self._cached_is_ancestor[original_pair] = True
        return True
      if ancestor in visited:
        continue
      visited.add(ancestor)
      depth, more_ancestors = self.graph[ancestor]
      if ancestor == a:
        self._cached_is_ancestor[original_pair] = True
        return True
      elif depth <= a_depth:
        # Everything in this subtree is at least as shallow as 'a'; prune.
        continue
      ancestors.extend(more_ancestors)
    self._cached_is_ancestor[original_pair] = False
    return False

class MailmapInfo(object):
  # Parses a .mailmap-style file; self.changes maps
  # (commit_name, commit_email) -> (proper_name, proper_email).
  def __init__(self, filename):
    self.changes = {}
    self._parse_file(filename)

  def _parse_file(self,
filename): name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*') comment_re = re.compile(br'\s*#.*') if not os.access(filename, os.R_OK): raise SystemExit(_("Cannot read %s") % decode(filename)) with open(filename, 'br') as f: count = 0 for line in f: count += 1 err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line) # Remove comments line = comment_re.sub(b'', line) # Remove leading and trailing whitespace line = line.strip() if not line: continue m = name_and_email_re.match(line) if not m: raise SystemExit(err) proper_name, proper_email = m.groups() if len(line) == m.end(): self.changes[(None, proper_email)] = (proper_name, proper_email) continue rest = line[m.end():] m = name_and_email_re.match(rest) if m: commit_name, commit_email = m.groups() if len(rest) != m.end(): raise SystemExit(err) else: commit_name, commit_email = rest, None self.changes[(commit_name, commit_email)] = (proper_name, proper_email) def translate(self, name, email): ''' Given a name and email, return the expected new name and email from the mailmap if there is a translation rule for it, otherwise just return the given name and email.''' for old, new in self.changes.items(): old_name, old_email = old new_name, new_email = new if (old_email is None or email.lower() == old_email.lower()) and ( name == old_name or not old_name): return (new_name or name, new_email or email) return (name, email) class ProgressWriter(object): def __init__(self): self._last_progress_update = time.time() self._last_message = None def show(self, msg): self._last_message = msg now = time.time() if now - self._last_progress_update > .1: self._last_progress_update = now sys.stdout.write("\r{}".format(msg)) sys.stdout.flush() def finish(self): self._last_progress_update = 0 if self._last_message: self.show(self._last_message) sys.stdout.write("\n") class _IDs(object): """ A class that maintains the 'name domain' of all the 'marks' (short int id for a blob/commit git object). 
  The reason this mechanism is necessary is because the text of fast-export
  may refer to an object using a different mark than the mark that was
  assigned to that object using IDS.new().  This class allows you to translate
  the fast-export marks (old) to the marks assigned from IDS.new() (new).

  Note that there are two reasons why the marks may differ: (1) The user
  manually creates Blob or Commit objects (for insertion into the stream)
  (2) We're reading the data from two different repositories and trying to
  combine the data (git fast-export will number ids from 1...n, and having
  two 1's, two 2's, two 3's, causes issues).
  """

  def __init__(self):
    """
    Init
    """
    # The id for the next created blob/commit object
    self._next_id = 1

    # A map of old-ids to new-ids (1:1 map)
    self._translation = {}

    # A map of new-ids to every old-id that points to the new-id (1:N map)
    self._reverse_translation = {}

  def has_renames(self):
    """
    Return whether there have been ids remapped to new values
    """
    return bool(self._translation)

  def new(self):
    """
    Should be called whenever a new blob or commit object is created. The
    returned value should be used as the id/mark for that object.
    """
    rv = self._next_id
    self._next_id += 1
    return rv

  def record_rename(self, old_id, new_id, handle_transitivity = False):
    """
    Record that old_id is being renamed to new_id.
    """
    # A rename to the same id is a no-op; only record real remappings.
    if old_id != new_id:
      # old_id -> new_id
      self._translation[old_id] = new_id

      # Transitivity will be needed if new commits are being inserted mid-way
      # through a branch.
      if handle_transitivity:
        # Anything that points to old_id should point to new_id
        if old_id in self._reverse_translation:
          for id_ in self._reverse_translation[old_id]:
            self._translation[id_] = new_id

      # Record that new_id is pointed to by old_id
      if new_id not in self._reverse_translation:
        self._reverse_translation[new_id] = []
      self._reverse_translation[new_id].append(old_id)

  def translate(self, old_id):
    """
    If old_id has been mapped to an alternate id, return the alternate id.
""" if old_id in self._translation: return self._translation[old_id] else: return old_id def __str__(self): """ Convert IDs to string; used for debugging """ rv = "Current count: %d\nTranslation:\n" % self._next_id for k in sorted(self._translation): rv += " %d -> %s\n" % (k, self._translation[k]) rv += "Reverse translation:\n" for k in sorted(self._reverse_translation): rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n" return rv class _GitElement(object): """ The base class for all git elements that we create. """ def __init__(self): # A string that describes what type of Git element this is self.type = None # A flag telling us if this Git element has been dumped # (i.e. printed) or skipped. Typically elements that have been # dumped or skipped will not be dumped again. self.dumped = 0 def dump(self, file_): """ This version should never be called. Derived classes need to override! We should note that subclasses should implement this method such that the output would match the format produced by fast-export. """ raise SystemExit(_("Unimplemented function: %s") % type(self).__name__ +".dump()") # pragma: no cover def __bytes__(self): """ Convert GitElement to bytestring; used for debugging """ old_dumped = self.dumped writeme = io.BytesIO() self.dump(writeme) output_lines = writeme.getvalue().splitlines() writeme.close() self.dumped = old_dumped return b"%s:\n %s" % (type(self).__name__.encode(), b"\n ".join(output_lines)) def skip(self, new_id=None): """ Ensures this element will not be written to output """ self.dumped = 2 class _GitElementWithId(_GitElement): """ The base class for Git elements that have IDs (commits and blobs) """ def __init__(self): _GitElement.__init__(self) # The mark (short, portable id) for this element self.id = _IDS.new() # The previous mark for this element self.old_id = None def skip(self, new_id=None): """ This element will no longer be automatically written to output. 
    When a commit gets skipped, its ID will need to be translated to
    that of its parent.
    """
    self.dumped = 2
    # Redirect anything that referenced this element to new_id (typically
    # the skipped commit's surviving parent).
    _IDS.record_rename(self.old_id or self.id, new_id)

class Blob(_GitElementWithId):
  """
  This class defines our representation of git blob elements (i.e. our
  way of representing file contents).
  """

  def __init__(self, data, original_id = None):
    _GitElementWithId.__init__(self)

    # Denote that this is a blob
    self.type = 'blob'

    # Record original id
    self.original_id = original_id

    # Stores the blob's data
    assert(type(data) == bytes)
    self.data = data

  def dump(self, file_):
    """
    Write this blob element to a file.
    """
    self.dumped = 1
    # Maintain the global original-hash <-> mark maps as elements are dumped.
    HASH_TO_ID[self.original_id] = self.id
    ID_TO_HASH[self.id] = self.original_id

    file_.write(b'blob\n')
    file_.write(b'mark :%d\n' % self.id)
    file_.write(b'data %d\n%s' % (len(self.data), self.data))
    file_.write(b'\n')

class Reset(_GitElement):
  """
  This class defines our representation of git reset elements.  A reset
  event is the creation (or recreation) of a named branch, optionally
  starting from a specific revision).
  """

  def __init__(self, ref, from_ref = None):
    _GitElement.__init__(self)

    # Denote that this is a reset
    self.type = 'reset'

    # The name of the branch being (re)created
    self.ref = ref

    # Some reference to the branch/commit we are resetting from
    self.from_ref = from_ref

  def dump(self, file_):
    """
    Write this reset element to a file
    """
    self.dumped = 1

    file_.write(b'reset %s\n' % self.ref)
    if self.from_ref:
      # from_ref may be an int (a mark) or a bytestring (a ref/hash).
      if isinstance(self.from_ref, int):
        file_.write(b'from :%d\n' % self.from_ref)
      else:
        file_.write(b'from %s\n' % self.from_ref)
    file_.write(b'\n')

class FileChange(_GitElement):
  """
  This class defines our representation of file change elements. File change
  elements are components within a Commit element.
""" def __init__(self, type_, filename = None, id_ = None, mode = None): _GitElement.__init__(self) # Denote the type of file-change (b'M' for modify, b'D' for delete, etc) # We could # assert(type(type_) == bytes) # here but I don't just due to worries about performance overhead... self.type = type_ # Record the name of the file being changed self.filename = filename # Record the mode (mode describes type of file entry (non-executable, # executable, or symlink)). self.mode = mode # blob_id is the id (mark) of the affected blob self.blob_id = id_ if type_ == b'DELETEALL': assert filename is None and id_ is None and mode is None self.filename = b'' # Just so PathQuoting.enquote doesn't die else: assert filename is not None if type_ == b'M': assert id_ is not None and mode is not None elif type_ == b'D': assert id_ is None and mode is None elif type_ == b'R': # pragma: no cover (now avoid fast-export renames) assert mode is None if id_ is None: raise SystemExit(_("new name needed for rename of %s") % filename) self.filename = (self.filename, id_) self.blob_id = None def dump(self, file_): """ Write this file-change element to a file """ skipped_blob = (self.type == b'M' and self.blob_id is None) if skipped_blob: return self.dumped = 1 quoted_filename = PathQuoting.enquote(self.filename) if self.type == b'M' and isinstance(self.blob_id, int): file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename)) elif self.type == b'M': file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename)) elif self.type == b'D': file_.write(b'D %s\n' % quoted_filename) elif self.type == b'DELETEALL': file_.write(b'deleteall\n') else: raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover class Commit(_GitElementWithId): """ This class defines our representation of commit elements. Commit elements contain all the information associated with a commit. 
""" def __init__(self, branch, author_name, author_email, author_date, committer_name, committer_email, committer_date, message, file_changes, parents, original_id = None, encoding = None, # encoding for message; None implies UTF-8 **kwargs): _GitElementWithId.__init__(self) self.old_id = self.id # Denote that this is a commit element self.type = 'commit' # Record the affected branch self.branch = branch # Record original id self.original_id = original_id # Record author's name self.author_name = author_name # Record author's email self.author_email = author_email # Record date of authoring self.author_date = author_date # Record committer's name self.committer_name = committer_name # Record committer's email self.committer_email = committer_email # Record date the commit was made self.committer_date = committer_date # Record commit message and its encoding self.encoding = encoding self.message = message # List of file-changes associated with this commit. Note that file-changes # are also represented as git elements self.file_changes = file_changes self.parents = parents def dump(self, file_): """ Write this commit element to a file. """ self.dumped = 1 HASH_TO_ID[self.original_id] = self.id ID_TO_HASH[self.id] = self.original_id # Make output to fast-import slightly easier for humans to read if the # message has no trailing newline of its own; cosmetic, but a nice touch... 
extra_newline = b'\n' if self.message.endswith(b'\n') or not (self.parents or self.file_changes): extra_newline = b'' if not self.parents: file_.write(b'reset %s\n' % self.branch) file_.write((b'commit %s\n' b'mark :%d\n' b'author %s <%s> %s\n' b'committer %s <%s> %s\n' ) % ( self.branch, self.id, self.author_name, self.author_email, self.author_date, self.committer_name, self.committer_email, self.committer_date )) if self.encoding: file_.write(b'encoding %s\n' % self.encoding) file_.write(b'data %d\n%s%s' % (len(self.message), self.message, extra_newline)) for i, parent in enumerate(self.parents): file_.write(b'from ' if i==0 else b'merge ') if isinstance(parent, int): file_.write(b':%d\n' % parent) else: file_.write(b'%s\n' % parent) for change in self.file_changes: change.dump(file_) if not self.parents and not self.file_changes: # Workaround a bug in pre-git-2.22 versions of fast-import with # the get-mark directive. file_.write(b'\n') file_.write(b'\n') def first_parent(self): """ Return first parent commit """ if self.parents: return self.parents[0] return None def skip(self, new_id=None): _SKIPPED_COMMITS.add(self.old_id or self.id) _GitElementWithId.skip(self, new_id) class Tag(_GitElementWithId): """ This class defines our representation of annotated tag elements. 
""" def __init__(self, ref, from_ref, tagger_name, tagger_email, tagger_date, tag_msg, original_id = None): _GitElementWithId.__init__(self) self.old_id = self.id # Denote that this is a tag element self.type = 'tag' # Store the name of the tag self.ref = ref # Store the entity being tagged (this should be a commit) self.from_ref = from_ref # Record original id self.original_id = original_id # Store the name of the tagger self.tagger_name = tagger_name # Store the email of the tagger self.tagger_email = tagger_email # Store the date self.tagger_date = tagger_date # Store the tag message self.message = tag_msg def dump(self, file_): """ Write this tag element to a file """ self.dumped = 1 HASH_TO_ID[self.original_id] = self.id ID_TO_HASH[self.id] = self.original_id file_.write(b'tag %s\n' % self.ref) if (write_marks and self.id): file_.write(b'mark :%d\n' % self.id) markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n' file_.write(markfmt % self.from_ref) if self.tagger_name: file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) file_.write(self.tagger_date) file_.write(b'\n') file_.write(b'data %d\n%s' % (len(self.message), self.message)) file_.write(b'\n') class Progress(_GitElement): """ This class defines our representation of progress elements. The progress element only contains a progress message, which is printed by fast-import when it processes the progress output. """ def __init__(self, message): _GitElement.__init__(self) # Denote that this is a progress element self.type = 'progress' # Store the progress message self.message = message def dump(self, file_): """ Write this progress element to a file """ self.dumped = 1 file_.write(b'progress %s\n' % self.message) file_.write(b'\n') class Checkpoint(_GitElement): """ This class defines our representation of checkpoint elements. 
  These elements represent events which force fast-import to close the
  current packfile, start a new one, and to save out all current branch
  refs, tags and marks.
  """

  def __init__(self):
    _GitElement.__init__(self)

    # Denote that this is a checkpoint element
    self.type = 'checkpoint'

  def dump(self, file_):
    """
    Write this checkpoint element to a file
    """
    self.dumped = 1

    file_.write(b'checkpoint\n')
    file_.write(b'\n')

class LiteralCommand(_GitElement):
  """
  This class defines our representation of commands. The literal command
  includes only a single line, and is not processed in any special way.
  """

  def __init__(self, line):
    _GitElement.__init__(self)

    # Denote that this is a literal element
    self.type = 'literal'

    # Store the command
    self.line = line

  def dump(self, file_):
    """
    Write this literal element to a file
    """
    self.dumped = 1
    # Passed through verbatim; self.line is expected to already contain its
    # own trailing newline (no extra formatting is added).
    file_.write(self.line)

class Alias(_GitElement):
  """
  This class defines our representation of fast-import alias elements.  An
  alias element is the setting of one mark to the same sha1sum as another,
  usually because the newer mark corresponded to a pruned commit.
  """

  def __init__(self, ref, to_ref):
    _GitElement.__init__(self)
    # Denote that this is an alias
    self.type = 'alias'

    # Both ref and to_ref are integer marks; ref becomes an alias for to_ref.
    self.ref = ref
    self.to_ref = to_ref

  def dump(self, file_):
    """
    Write this alias element to a file
    """
    self.dumped = 1

    file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref))

class FastExportParser(object):
  """
  A class for parsing and handling the output from fast-export. This
  class allows the user to register callbacks when various types of
  data are encountered in the fast-export output. The basic idea is that,
  FastExportParser takes fast-export output, creates the various objects
  as it encounters them, the user gets to use/modify these objects via
  callbacks, and finally FastExportParser outputs the modified objects
  in fast-import format (presumably so they can be used to create a new
  repo).
""" def __init__(self, tag_callback = None, commit_callback = None, blob_callback = None, progress_callback = None, reset_callback = None, checkpoint_callback = None, done_callback = None): # Members below simply store callback functions for the various git # elements self._tag_callback = tag_callback self._blob_callback = blob_callback self._reset_callback = reset_callback self._commit_callback = commit_callback self._progress_callback = progress_callback self._checkpoint_callback = checkpoint_callback self._done_callback = done_callback # Keep track of which refs appear from the export, and which make it to # the import (pruning of empty commits, renaming of refs, and creating # new manual objects and inserting them can cause these to differ). self._exported_refs = set() self._imported_refs = set() # A list of the branches we've seen, plus the last known commit they # pointed to. An entry in latest_*commit will be deleted if we get a # reset for that branch. These are used because of fast-import's weird # decision to allow having an implicit parent via naming the branch # instead of requiring branches to be specified via 'from' directives. self._latest_commit = {} self._latest_orig_commit = {} # A handle to the input source for the fast-export data self._input = None # A handle to the output file for the output we generate (we call dump # on many of the git elements we create). 
self._output = None # Stores the contents of the current line of input being parsed self._currentline = '' # Compile some regexes and cache those self._mark_re = re.compile(br'mark :(\d+)\n$') self._parent_regexes = {} parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n') for parent_refname in (b'from', b'merge'): ans = [re.compile(parent_refname+x) for x in parent_regex_rules] self._parent_regexes[parent_refname] = ans self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"') self._refline_regexes = {} for refline_name in (b'reset', b'commit', b'tag', b'progress'): self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$') self._user_regexes = {} for user in (b'author', b'committer', b'tagger'): self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$') def _advance_currentline(self): """ Grab the next line of input """ self._currentline = self._input.readline() def _parse_optional_mark(self): """ If the current line contains a mark, parse it and advance to the next line; return None otherwise """ mark = None matches = self._mark_re.match(self._currentline) if matches: mark = int(matches.group(1)) self._advance_currentline() return mark def _parse_optional_parent_ref(self, refname): """ If the current line contains a reference to a parent commit, then parse it and advance the current line; otherwise return None. Note that the name of the reference ('from', 'merge') must match the refname arg. 
""" orig_baseref, baseref = None, None rule, altrule = self._parent_regexes[refname] matches = rule.match(self._currentline) if matches: orig_baseref = int(matches.group(1)) # We translate the parent commit mark to what it needs to be in # our mark namespace baseref = _IDS.translate(orig_baseref) self._advance_currentline() else: matches = altrule.match(self._currentline) if matches: orig_baseref = matches.group(1) baseref = orig_baseref self._advance_currentline() return orig_baseref, baseref def _parse_optional_filechange(self): """ If the current line contains a file-change object, then parse it and advance the current line; otherwise return None. We only care about file changes of type b'M' and b'D' (these are the only types of file-changes that fast-export will provide). """ filechange = None changetype = self._currentline[0:1] if changetype == b'M': (changetype, mode, idnum, path) = self._currentline.split(None, 3) if idnum[0:1] == b':': idnum = idnum[1:] path = path.rstrip(b'\n') # We translate the idnum to our id system if len(idnum) != 40: idnum = _IDS.translate( int(idnum) ) if idnum is not None: if path.startswith(b'"'): path = PathQuoting.dequote(path) filechange = FileChange(b'M', path, idnum, mode) else: filechange = b'skipped' self._advance_currentline() elif changetype == b'D': (changetype, path) = self._currentline.split(None, 1) path = path.rstrip(b'\n') if path.startswith(b'"'): path = PathQuoting.dequote(path) filechange = FileChange(b'D', path) self._advance_currentline() elif changetype == b'R': # pragma: no cover (now avoid fast-export renames) rest = self._currentline[2:-1] if rest.startswith(b'"'): m = self._quoted_string_re.match(rest) if not m: raise SystemExit(_("Couldn't parse rename source")) orig = PathQuoting.dequote(m.group(0)) new = rest[m.end()+1:] else: orig, new = rest.split(b' ', 1) if new.startswith(b'"'): new = PathQuoting.dequote(new) filechange = FileChange(b'R', orig, new) self._advance_currentline() return filechange def 
_parse_original_id(self):
    # Strip the 'original-oid ' prefix and trailing newline, returning the
    # original object hash as a bytestring.
    original_id = self._currentline[len(b'original-oid '):].rstrip()
    self._advance_currentline()
    return original_id

  def _parse_encoding(self):
    # Strip the 'encoding ' prefix and trailing newline, returning the
    # commit-message encoding as a bytestring.
    encoding = self._currentline[len(b'encoding '):].rstrip()
    self._advance_currentline()
    return encoding

  def _parse_ref_line(self, refname):
    """
    Parses string data (often a branch name) from current-line. The name
    of the string data must match the refname arg.  The program will crash
    if current-line does not match, so current-line will always be advanced
    if this method returns.
    """
    matches = self._refline_regexes[refname].match(self._currentline)
    if not matches:
      raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") %
                       ({'refname': refname, 'line':self._currentline})
                       ) # pragma: no cover
    ref = matches.group(1)
    self._advance_currentline()
    return ref

  def _parse_user(self, usertype):
    """
    Get user name, email, datestamp from current-line. Current-line will
    be advanced.
    """
    user_regex = self._user_regexes[usertype]
    (name, email, when) = user_regex.match(self._currentline).groups()
    self._advance_currentline()
    return (name, email, when)

  def _parse_data(self):
    """
    Reads data from _input. Current-line will be advanced until it is
    beyond the data.
    """
    fields = self._currentline.split()
    assert fields[0] == b'data'
    size = int(fields[1])
    # Read the payload by byte count, then skip the optional blank line
    # fast-export emits after a data block.
    data = self._input.read(size)
    self._advance_currentline()
    if self._currentline == b'\n':
      self._advance_currentline()
    return data

  def _parse_blob(self):
    """
    Parse input data into a Blob object. Once the Blob has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond this blob's data. The Blob will be
    dumped to _output once everything else is done (unless it has been
    skipped by the callback).
""" # Parse the Blob self._advance_currentline() id_ = self._parse_optional_mark() original_id = None if self._currentline.startswith(b'original-oid'): original_id = self._parse_original_id(); data = self._parse_data() if self._currentline == b'\n': self._advance_currentline() # Create the blob blob = Blob(data, original_id) # If fast-export text had a mark for this blob, need to make sure this # mark translates to the blob's true id. if id_: blob.old_id = id_ _IDS.record_rename(id_, blob.id) # Call any user callback to allow them to use/modify the blob if self._blob_callback: self._blob_callback(blob) # Now print the resulting blob if not blob.dumped: blob.dump(self._output) def _parse_reset(self): """ Parse input data into a Reset object. Once the Reset has been created, it will be handed off to the appropriate callbacks. Current-line will be advanced until it is beyond the reset data. The Reset will be dumped to _output once everything else is done (unless it has been skipped by the callback). """ # Parse the Reset ref = self._parse_ref_line(b'reset') self._exported_refs.add(ref) ignoreme, from_ref = self._parse_optional_parent_ref(b'from') if self._currentline == b'\n': self._advance_currentline() # fast-export likes to print extraneous resets that serve no purpose. # While we could continue processing such resets, that is a waste of # resources. Also, we want to avoid recording that this ref was # seen in such cases, since this ref could be rewritten to nothing. 
if not from_ref: self._latest_commit.pop(ref, None) self._latest_orig_commit.pop(ref, None) return # Create the reset reset = Reset(ref, from_ref) # Call any user callback to allow them to modify the reset if self._reset_callback: self._reset_callback(reset) # Update metadata self._latest_commit[reset.ref] = reset.from_ref self._latest_orig_commit[reset.ref] = reset.from_ref # Now print the resulting reset if not reset.dumped: self._imported_refs.add(reset.ref) reset.dump(self._output) def _parse_commit(self): """ Parse input data into a Commit object. Once the Commit has been created, it will be handed off to the appropriate callbacks. Current-line will be advanced until it is beyond the commit data. The Commit will be dumped to _output once everything else is done (unless it has been skipped by the callback OR the callback has removed all file-changes from the commit). """ # Parse the Commit. This may look involved, but it's pretty simple; it only # looks bad because a commit object contains many pieces of data. branch = self._parse_ref_line(b'commit') self._exported_refs.add(branch) id_ = self._parse_optional_mark() original_id = None if self._currentline.startswith(b'original-oid'): original_id = self._parse_original_id(); author_name = None author_email = None if self._currentline.startswith(b'author'): (author_name, author_email, author_date) = self._parse_user(b'author') (committer_name, committer_email, committer_date) = \ self._parse_user(b'committer') if not author_name and not author_email: (author_name, author_email, author_date) = \ (committer_name, committer_email, committer_date) encoding = None if self._currentline.startswith(b'encoding '): encoding = self._parse_encoding() commit_msg = self._parse_data() pinfo = [self._parse_optional_parent_ref(b'from')] # Due to empty pruning, we can have real 'from' and 'merge' lines that # due to commit rewriting map to a parent of None. 
We need to record # 'from' if its non-None, and we need to parse all 'merge' lines. while self._currentline.startswith(b'merge '): pinfo.append(self._parse_optional_parent_ref(b'merge')) orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)] # No parents is oddly represented as [None] instead of [], due to the # special 'from' handling. Convert it here to a more canonical form. if parents == [None]: parents = [] if orig_parents == [None]: orig_parents = [] # fast-import format is kinda stupid in that it allows implicit parents # based on the branch name instead of requiring them to be specified by # 'from' directives. The only way to get no parent is by using a reset # directive first, which clears the latest_commit_for_this_branch tracking. if not orig_parents and self._latest_commit.get(branch): parents = [self._latest_commit[branch]] if not orig_parents and self._latest_orig_commit.get(branch): orig_parents = [self._latest_orig_commit[branch]] # Get the list of file changes file_changes = [] file_change = self._parse_optional_filechange() had_file_changes = file_change is not None while file_change: if not (type(file_change) == bytes and file_change == b'skipped'): file_changes.append(file_change) file_change = self._parse_optional_filechange() if self._currentline == b'\n': self._advance_currentline() # Okay, now we can finally create the Commit object commit = Commit(branch, author_name, author_email, author_date, committer_name, committer_email, committer_date, commit_msg, file_changes, parents, original_id, encoding) # If fast-export text had a mark for this commit, need to make sure this # mark translates to the commit's true id. 
if id_: commit.old_id = id_ _IDS.record_rename(id_, commit.id) # Call any user callback to allow them to modify the commit aux_info = {'orig_parents': orig_parents, 'had_file_changes': had_file_changes} if self._commit_callback: self._commit_callback(commit, aux_info) # Now print the resulting commit, or if prunable skip it self._latest_orig_commit[branch] = commit.id if not (commit.old_id or commit.id) in _SKIPPED_COMMITS: self._latest_commit[branch] = commit.id if not commit.dumped: self._imported_refs.add(commit.branch) commit.dump(self._output) def _parse_tag(self): """ Parse input data into a Tag object. Once the Tag has been created, it will be handed off to the appropriate callbacks. Current-line will be advanced until it is beyond the tag data. The Tag will be dumped to _output once everything else is done (unless it has been skipped by the callback). """ # Parse the Tag tag = self._parse_ref_line(b'tag') self._exported_refs.add(b'refs/tags/'+tag) id_ = self._parse_optional_mark() ignoreme, from_ref = self._parse_optional_parent_ref(b'from') original_id = None if self._currentline.startswith(b'original-oid'): original_id = self._parse_original_id(); tagger_name, tagger_email, tagger_date = None, None, None if self._currentline.startswith(b'tagger'): (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger') tag_msg = self._parse_data() if self._currentline == b'\n': self._advance_currentline() # Create the tag tag = Tag(tag, from_ref, tagger_name, tagger_email, tagger_date, tag_msg, original_id) # If fast-export text had a mark for this tag, need to make sure this # mark translates to the tag's true id. 
if id_: tag.old_id = id_ _IDS.record_rename(id_, tag.id) # Call any user callback to allow them to modify the tag if self._tag_callback: self._tag_callback(tag) # The tag might not point at anything that still exists (self.from_ref # will be None if the commit it pointed to and all its ancestors were # pruned due to being empty) if tag.from_ref: # Print out this tag's information if not tag.dumped: self._imported_refs.add(b'refs/tags/'+tag.ref) tag.dump(self._output) else: tag.skip() def _parse_progress(self): """ Parse input data into a Progress object. Once the Progress has been created, it will be handed off to the appropriate callbacks. Current-line will be advanced until it is beyond the progress data. The Progress will be dumped to _output once everything else is done (unless it has been skipped by the callback). """ # Parse the Progress message = self._parse_ref_line(b'progress') if self._currentline == b'\n': self._advance_currentline() # Create the progress message progress = Progress(message) # Call any user callback to allow them to modify the progress messsage if self._progress_callback: self._progress_callback(progress) # NOTE: By default, we do NOT print the progress message; git # fast-import would write it to fast_import_pipes which could mess with # our parsing of output from the 'ls' and 'get-mark' directives we send # to fast-import. If users want these messages, they need to process # and handle them in the appropriate callback above. def _parse_checkpoint(self): """ Parse input data into a Checkpoint object. Once the Checkpoint has been created, it will be handed off to the appropriate callbacks. Current-line will be advanced until it is beyond the checkpoint data. The Checkpoint will be dumped to _output once everything else is done (unless it has been skipped by the callback). 
""" # Parse the Checkpoint self._advance_currentline() if self._currentline == b'\n': self._advance_currentline() # Create the checkpoint checkpoint = Checkpoint() # Call any user callback to allow them to drop the checkpoint if self._checkpoint_callback: self._checkpoint_callback(checkpoint) # NOTE: By default, we do NOT print the checkpoint message; although it # we would only realistically get them with --stdin, the fact that we # are filtering makes me think the checkpointing is less likely to be # reasonable. In fact, I don't think it's necessary in general. If # users do want it, they should process it in the checkpoint_callback. def _parse_literal_command(self): """ Parse literal command. Then just dump the line as is. """ # Create the literal command object command = LiteralCommand(self._currentline) self._advance_currentline() # Now print the resulting literal command if not command.dumped: command.dump(self._output) def insert(self, obj): assert not obj.dumped obj.dump(self._output) if type(obj) == Commit: self._imported_refs.add(obj.branch) elif type(obj) in (Reset, Tag): self._imported_refs.add(obj.ref) def run(self, input, output): """ This method filters fast export output. """ # Set input. If no args provided, use stdin. 
self._input = input self._output = output # Run over the input and do the filtering self._advance_currentline() while self._currentline: if self._currentline.startswith(b'blob'): self._parse_blob() elif self._currentline.startswith(b'reset'): self._parse_reset() elif self._currentline.startswith(b'commit'): self._parse_commit() elif self._currentline.startswith(b'tag'): self._parse_tag() elif self._currentline.startswith(b'progress'): self._parse_progress() elif self._currentline.startswith(b'checkpoint'): self._parse_checkpoint() elif self._currentline.startswith(b'feature'): self._parse_literal_command() elif self._currentline.startswith(b'option'): self._parse_literal_command() elif self._currentline.startswith(b'done'): if self._done_callback: self._done_callback() self._parse_literal_command() # Prevent confusion from others writing additional stuff that'll just # be ignored self._output.close() elif self._currentline.startswith(b'#'): self._parse_literal_command() elif self._currentline.startswith(b'get-mark') or \ self._currentline.startswith(b'cat-blob') or \ self._currentline.startswith(b'ls'): raise SystemExit(_("Unsupported command: '%s'") % self._currentline) else: raise SystemExit(_("Could not parse line: '%s'") % self._currentline) def get_exported_and_imported_refs(self): return self._exported_refs, self._imported_refs def record_id_rename(old_id, new_id): """ Register a new translation """ handle_transitivity = True _IDS.record_rename(old_id, new_id, handle_transitivity) # Internal globals _IDS = _IDs() _SKIPPED_COMMITS = set() HASH_TO_ID = {} ID_TO_HASH = {} class SubprocessWrapper(object): @staticmethod def decodify(args): if type(args) == str: return args else: assert type(args) == list return [decode(x) if type(x)==bytes else x for x in args] @staticmethod def call(*args, **kwargs): if 'cwd' in kwargs: kwargs['cwd'] = decode(kwargs['cwd']) return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs) @staticmethod def check_output(*args, 
**kwargs): if 'cwd' in kwargs: kwargs['cwd'] = decode(kwargs['cwd']) return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs) @staticmethod def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely if 'cwd' in kwargs: kwargs['cwd'] = decode(kwargs['cwd']) return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs) @staticmethod def Popen(*args, **kwargs): if 'cwd' in kwargs: kwargs['cwd'] = decode(kwargs['cwd']) return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs) subproc = subprocess if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ: subproc = SubprocessWrapper class GitUtils(object): @staticmethod def get_commit_count(repo, *args): """ Return the number of commits that have been made on repo. """ if not args: args = ['--all'] if len(args) == 1 and isinstance(args[0], list): args = args[0] p = subproc.Popen(["git", "rev-list", "--count"] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo) if p.wait() != 0: raise SystemExit(_("%s does not appear to be a valid git repository") % decode(repo)) return int(p.stdout.read()) @staticmethod def get_total_objects(repo): """ Return the number of objects (both packed and unpacked) """ p1 = subproc.Popen(["git", "count-objects", "-v"], stdout=subprocess.PIPE, cwd=repo) lines = p1.stdout.read().splitlines() # Return unpacked objects + packed-objects return int(lines[0].split()[1]) + int(lines[2].split()[1]) @staticmethod def is_repository_bare(repo_working_dir): out = subproc.check_output('git rev-parse --is-bare-repository'.split(), cwd=repo_working_dir) return (out.strip() == b'true') @staticmethod def determine_git_dir(repo_working_dir): d = subproc.check_output('git rev-parse --git-dir'.split(), cwd=repo_working_dir).strip() if repo_working_dir==b'.' 
or d.startswith(b'/'): return d return os.path.join(repo_working_dir, d) @staticmethod def get_refs(repo_working_dir): try: output = subproc.check_output('git show-ref'.split(), cwd=repo_working_dir) except subprocess.CalledProcessError as e: # If error code is 1, there just aren't any refs; i.e. new repo. # If error code is other than 1, some other error (e.g. not a git repo) if e.returncode != 1: raise SystemExit('fatal: {}'.format(e)) output = '' return dict(reversed(x.split()) for x in output.splitlines()) @staticmethod def get_blob_sizes(quiet = False): blob_size_progress = ProgressWriter() num_blobs = 0 processed_blobs_msg = _("Processed %d blob sizes") # Get sizes of blobs by sha1 cmd = '--batch-check=%(objectname) %(objecttype) ' + \ '%(objectsize) %(objectsize:disk)' cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd], bufsize = -1, stdout = subprocess.PIPE) unpacked_size = {} packed_size = {} for line in cf.stdout: sha, objtype, objsize, objdisksize = line.split() objsize, objdisksize = int(objsize), int(objdisksize) if objtype == b'blob': unpacked_size[sha] = objsize packed_size[sha] = objdisksize num_blobs += 1 if not quiet: blob_size_progress.show(processed_blobs_msg % num_blobs) cf.wait() if not quiet: blob_size_progress.finish() return unpacked_size, packed_size @staticmethod def get_file_changes(repo, parent_hash, commit_hash): """ Return a FileChanges list with the differences between parent_hash and commit_hash """ file_changes = [] cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash] output = subproc.check_output(cmd, cwd=repo) for line in output.splitlines(): fileinfo, path = line.split(b'\t', 1) if path.startswith(b'"'): path = PathQuoting.dequote(path) oldmode, mode, oldhash, newhash, changetype = fileinfo.split() if changetype == b'D': file_changes.append(FileChange(b'D', path)) elif changetype in (b'A', b'M', b'T'): identifier = HASH_TO_ID.get(newhash, newhash) file_changes.append(FileChange(b'M', path, identifier, 
mode)) else: # pragma: no cover raise SystemExit("Unknown change type for line {}".format(line)) return file_changes @staticmethod def print_my_version(): with open(__file__, 'br') as f: contents = f.read() # If people replaced @@LOCALEDIR@@ string to point at their local # directory, undo it so we can get original source version. contents = re.sub(br'\A#\!.*', br'#!/usr/bin/env python3', contents) contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"', br'\1@@LOCALEDIR@@"', contents) cmd = 'git hash-object --stdin'.split() version = subproc.check_output(cmd, input=contents).strip() print(decode(version[0:12])) class FilteringOptions(object): default_replace_text = b'***REMOVED***' class AppendFilter(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): user_path = values suffix = option_string[len('--path-'):] or 'match' if suffix.startswith('rename'): mod_type = 'rename' match_type = option_string[len('--path-rename-'):] or 'match' values = values.split(b':') if len(values) != 2: raise SystemExit(_("Error: --path-rename expects one colon in its" " argument: .")) if values[0] and values[1] and not ( values[0].endswith(b'/') == values[1].endswith(b'/')): raise SystemExit(_("Error: With --path-rename, if OLD_NAME and " "NEW_NAME are both non-empty and either ends " "with a slash then both must.")) if any(v.startswith(b'/') for v in values): raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) components = values[0].split(b'/') + values[1].split(b'/') else: mod_type = 'filter' match_type = suffix components = values.split(b'/') if values.startswith(b'/'): raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) for illegal_path in [b'.', b'..']: if illegal_path in components: raise SystemExit(_("Error: Invalid path component '%s' found in '%s'") % (decode(illegal_path), decode(user_path))) if match_type == 'regex': values = re.compile(values) items = getattr(namespace, self.dest, []) or [] items.append((mod_type, 
match_type, values)) if (match_type, mod_type) == ('glob', 'filter'): if not values.endswith(b'*'): extension = b'*' if values.endswith(b'/') else b'/*' items.append((mod_type, match_type, values+extension)) setattr(namespace, self.dest, items) class HelperFilter(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): af = FilteringOptions.AppendFilter(dest='path_changes', option_strings=None) dirname = values if values[-1:] == b'/' else values+b'/' if option_string == '--subdirectory-filter': af(parser, namespace, dirname, '--path-match') af(parser, namespace, dirname+b':', '--path-rename') elif option_string == '--to-subdirectory-filter': af(parser, namespace, b':'+dirname, '--path-rename') else: raise SystemExit(_("Error: HelperFilter given invalid option_string: %s") % option_string) # pragma: no cover class FileWithPathsFilter(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): if not namespace.path_changes: namespace.path_changes = [] namespace.path_changes += FilteringOptions.get_paths_from_file(values) @staticmethod def create_arg_parser(): # Include usage in the summary, so we can put the description first summary = _('''Rewrite (or analyze) repository history git-filter-repo destructively rewrites history (unless --analyze or --dry-run are given) according to specified rules. It refuses to do any rewriting unless either run from a clean fresh clone, or --force was given. Basic Usage: git-filter-repo --analyze git-filter-repo [FILTER/RENAME/CONTROL OPTIONS] See EXAMPLES section for details. ''').rstrip() # Provide a long helpful examples section example_text = _('''CALLBACKS All callback functions are of the same general format. 
For a command line argument like --foo-callback 'BODY' the following code will be compiled and called: def foo_callback(foo): BODY Thus, to replace 'Jon' with 'John' in author/committer/tagger names: git filter-repo --name-callback 'return name.replace(b"Jon", b"John")' To remove all 'Tested-by' tags in commit (or tag) messages: git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)' To remove all .DS_Store files: git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename' Note that if BODY resolves to a filename, then the contents of that file will be used as the BODY in the callback function. For more detailed examples and explanations AND caveats, see https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS EXAMPLES To get a bunch of reports mentioning renames that have occurred in your repo and listing sizes of objects aggregated by any of path, directory, extension, or blob-id: git filter-repo --analyze (These reports can help you choose how to filter your repo; it can be useful to re-run this command after filtering to regenerate the report and verify the changes look correct.) To extract the history that touched just 'guides' and 'tools/releases': git filter-repo --path guides/ --path tools/releases To remove foo.zip and bar/baz/zips from every revision in history: git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths To replace the text 'password' with 'p455w0rd': git filter-repo --replace-text <(echo "password==>p455w0rd") To use the current version of the .mailmap file to update authors, committers, and taggers throughout history and make it permanent: git filter-repo --use-mailmap To extract the history of 'src/', rename all files to have a new leading directory 'my-module' (e.g. 
src/foo.java -> my-module/src/foo.java), and add a 'my-module-' prefix to all tags: git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-' For more detailed examples and explanations, see https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''') # Create the basic parser parser = argparse.ArgumentParser(description=summary, usage = argparse.SUPPRESS, add_help = False, epilog = example_text, formatter_class=argparse.RawDescriptionHelpFormatter) analyze = parser.add_argument_group(title=_("Analysis")) analyze.add_argument('--analyze', action='store_true', help=_("Analyze repository history and create a report that may be " "useful in determining what to filter in a subsequent run. " "Will not modify your repo.")) analyze.add_argument('--report-dir', metavar='DIR_OR_FILE', type=os.fsencode, dest='report_dir', help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis," "refuses to run if exists, --force delete existing dir first.")) path = parser.add_argument_group(title=_("Filtering based on paths " "(see also --filename-callback)"), description=textwrap.dedent(_(""" These options specify the paths to select. Note that much like git itself, renames are NOT followed so you may need to specify multiple paths, e.g. `--path olddir/ --path newdir/` """[1:]))) path.add_argument('--invert-paths', action='store_false', dest='inclusive', help=_("Invert the selection of files from the specified " "--path-{match,glob,regex} options below, i.e. only select " "files matching none of those options.")) path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE', type=os.fsencode, action=FilteringOptions.AppendFilter, dest='path_changes', help=_("Exact paths (files or directories) to include in filtered " "history. 
Multiple --path options can be specified to get " "a union of paths.")) path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode, action=FilteringOptions.AppendFilter, dest='path_changes', help=_("Glob of paths to include in filtered history. Multiple " "--path-glob options can be specified to get a union of " "paths.")) path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode, action=FilteringOptions.AppendFilter, dest='path_changes', help=_("Regex of paths to include in filtered history. Multiple " "--path-regex options can be specified to get a union of " "paths")) path.add_argument('--use-base-name', action='store_true', help=_("Match on file base name instead of full path from the top " "of the repo. Incompatible with --path-rename, and " "incompatible with matching against directory names.")) rename = parser.add_argument_group(title=_("Renaming based on paths " "(see also --filename-callback)")) rename.add_argument('--path-rename', '--path-rename-match', metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode, action=FilteringOptions.AppendFilter, help=_("Path to rename; if filename or directory matches OLD_NAME " "rename to NEW_NAME. Multiple --path-rename options can be " "specified. NOTE: If you combine filtering options with " "renaming ones, do not rely on a rename argument to select " "paths; you also need a filter to select them.")) helpers = parser.add_argument_group(title=_("Path shortcuts")) helpers.add_argument('--paths', help=argparse.SUPPRESS, metavar='IGNORE') helpers.add_argument('--paths-from-file', metavar='FILENAME', type=os.fsencode, action=FilteringOptions.FileWithPathsFilter, dest='path_changes', help=_("Specify several path filtering and renaming directives, one " "per line. Lines with '==>' in them specify path renames, " "and lines can begin with 'literal:' (the default), 'glob:', " "or 'regex:' to specify different matching styles. 
Blank " "lines and lines starting with a '#' are ignored.")) helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY', action=FilteringOptions.HelperFilter, type=os.fsencode, help=_("Only look at history that touches the given subdirectory " "and treat that directory as the project root. Equivalent " "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'")) helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY', action=FilteringOptions.HelperFilter, type=os.fsencode, help=_("Treat the project root as instead being under DIRECTORY. " "Equivalent to using '--path-rename :DIRECTORY/'")) contents = parser.add_argument_group(title=_("Content editing filters " "(see also --blob-callback)")) contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE', help=_("A file with expressions that, if found, will be replaced. " "By default, each expression is treated as literal text, " "but 'regex:' and 'glob:' prefixes are supported. You can " "end the line with '==>' and some replacement text to " "choose a replacement choice other than the default of '{}'." .format(decode(FilteringOptions.default_replace_text)))) contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE', dest='max_blob_size', default=0, help=_("Strip blobs (files) bigger than specified size (e.g. '5M', " "'2G', etc)")) contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME', help=_("Read git object ids from each line of the given file, and " "strip all of them from history")) refrename = parser.add_argument_group(title=_("Renaming of refs " "(see also --refname-callback)")) refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode, help=_("Rename tags starting with OLD to start with NEW. 
For " "example, --tag-rename foo:bar will rename tag foo-1.2.3 " "to bar-1.2.3; either OLD or NEW can be empty.")) messages = parser.add_argument_group(title=_("Filtering of commit messages " "(see also --message-callback)")) messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE', help=_("A file with expressions that, if found in commit messages, " "will be replaced. This file uses the same syntax as " "--replace-text.")) messages.add_argument('--preserve-commit-hashes', action='store_true', help=_("By default, since commits are rewritten and thus gain new " "hashes, references to old commit hashes in commit messages " "are replaced with new commit hashes (abbreviated to the same " "length as the old reference). Use this flag to turn off " "updating commit hashes in commit messages.")) messages.add_argument('--preserve-commit-encoding', action='store_true', help=_("Do not reencode commit messages into UTF-8. By default, if " "the commit object specifies an encoding for the commit " "message, the message is re-encoded into UTF-8.")) people = parser.add_argument_group(title=_("Filtering of names & emails " "(see also --name-callback " "and --email-callback)")) people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME', type=os.fsencode, help=_("Use specified mailmap file (see git-shortlog(1) for " "details on the format) when rewriting author, committer, " "and tagger names and emails. If the specified file is " "part of git history, historical versions of the file will " "be ignored; only the current contents are consulted.")) people.add_argument('--use-mailmap', dest='mailmap', action='store_const', const=b'.mailmap', help=_("Same as: '--mailmap .mailmap' ")) parents = parser.add_argument_group(title=_("Parent rewriting")) parents.add_argument('--replace-refs', default=None, choices=['delete-no-add', 'delete-and-add', 'update-no-add', 'update-or-add', 'update-and-add', 'old-default'], help=_("How to handle replace refs (see git-replace(1)). 
Replace " "refs can be added during the history rewrite as a way to " "allow users to pass old commit IDs (from before " "git-filter-repo was run) to git commands and have git know " "how to translate those old commit IDs to the new " "(post-rewrite) commit IDs. Also, replace refs that existed " "before the rewrite can either be deleted or updated. The " "choices to pass to --replace-refs thus need to specify both " "what to do with existing refs and what to do with commit " "rewrites. Thus 'update-and-add' means to update existing " "replace refs, and for any commit rewrite (even if already " "pointed at by a replace ref) add a new refs/replace/ reference " "to map from the old commit ID to the new commit ID. The " "default is update-no-add, meaning update existing replace refs " "but do not add any new ones. There is also a special " "'old-default' option for picking the default used in versions " "prior to git-filter-repo-2.45, namely 'update-and-add' upon " "the first run of git-filter-repo in a repository and " "'update-or-add' if running git-filter-repo again on a " "repository.")) parents.add_argument('--prune-empty', default='auto', choices=['always', 'auto', 'never'], help=_("Whether to prune empty commits. 'auto' (the default) means " "only prune commits which become empty (not commits which were " "empty in the original repo, unless their parent was pruned). " "When the parent of a commit is pruned, the first non-pruned " "ancestor becomes the new parent.")) parents.add_argument('--prune-degenerate', default='auto', choices=['always', 'auto', 'never'], help=_("Since merge commits are needed for history topology, they " "are typically exempt from pruning. However, they can become " "degenerate with the pruning of other commits (having fewer " "than two parents, having one commit serve as both parents, or " "having one parent as the ancestor of the other.) If such " "merge commits have no file changes, they can be pruned. 
The " "default ('auto') is to only prune empty merge commits which " "become degenerate (not which started as such).")) parents.add_argument('--no-ff', action='store_true', help=_("Even if the first parent is or becomes an ancestor of another " "parent, do not prune it. This modifies how " "--prune-degenerate behaves, and may be useful in projects who " "always use merge --no-ff.")) callback = parser.add_argument_group(title=_("Generic callback code snippets")) callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing filenames; see CALLBACKS " "sections below.")) callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing messages (both commit " "messages and tag messages); see CALLBACKS section below.")) callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing names of people; see " "CALLBACKS section below.")) callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing emails addresses; see " "CALLBACKS section below.")) callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing refnames; see CALLBACKS " "section below.")) callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing blob objects; see " "CALLBACKS section below.")) callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing commit objects; see " "CALLBACKS section below.")) callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing tag objects; see CALLBACKS " "section below.")) callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE", help=_("Python code body for processing reset objects; see " "CALLBACKS section below.")) desc = _( 
"Specifying alternate source or target locations implies --partial,\n" "except that the normal default for --replace-refs is used. However,\n" "unlike normal uses of --partial, this doesn't risk mixing old and new\n" "history since the old and new histories are in different repositories.") location = parser.add_argument_group(title=_("Location to filter from/to"), description=desc) location.add_argument('--source', type=os.fsencode, help=_("Git repository to read from")) location.add_argument('--target', type=os.fsencode, help=_("Git repository to overwrite with filtered history")) order = parser.add_argument_group(title=_("Ordering of commits")) order.add_argument('--date-order', action='store_true', help=_("Processes commits in commit timestamp order.")) misc = parser.add_argument_group(title=_("Miscellaneous options")) misc.add_argument('--help', '-h', action='store_true', help=_("Show this help message and exit.")) misc.add_argument('--version', action='store_true', help=_("Display filter-repo's version and exit.")) misc.add_argument('--force', '-f', action='store_true', help=_("Rewrite repository history even if the current repo does not " "look like a fresh clone. History rewriting is irreversible " "(and includes immediate pruning of reflogs and old objects), " "so be cautious about using this flag.")) misc.add_argument('--partial', action='store_true', help=_("Do a partial history rewrite, resulting in the mixture of " "old and new history. This disables rewriting " "refs/remotes/origin/* to refs/heads/*, disables removing " "of the 'origin' remote, disables removing unexported refs, " "disables expiring the reflog, and disables the automatic " "post-filter gc. Also, this modifies --tag-rename and " "--refname-callback options such that instead of replacing " "old refs with new refnames, it will instead create new " "refs and keep the old ones around. 
Use with caution.")) # WARNING: --refs presents a problem with become-degenerate pruning: # * Excluding a commit also excludes its ancestors so when some other # commit has an excluded ancestor as a parent we have no way of # knowing what it is an ancestor of without doing a special # full-graph walk. misc.add_argument('--refs', nargs='+', help=_("Limit history rewriting to the specified refs. Implies " "--partial. In addition to the normal caveats of --partial " "(mixing old and new history, no automatic remapping of " "refs/remotes/origin/* to refs/heads/*, etc.), this also may " "cause problems for pruning of degenerate empty merge " "commits when negative revisions are specified.")) misc.add_argument('--dry-run', action='store_true', help=_("Do not change the repository. Run `git fast-export` and " "filter its output, and save both the original and the " "filtered version for comparison. This also disables " "rewriting commit messages due to not knowing new commit " "IDs and disables filtering of some empty commits due to " "inability to query the fast-import backend." )) misc.add_argument('--debug', action='store_true', help=_("Print additional information about operations being " "performed and commands being run. When used together " "with --dry-run, also show extra information about what " "would be run.")) # WARNING: --state-branch has some problems: # * It does not work well with manually inserted objects (user creating # Blob() or Commit() or Tag() objects and calling # RepoFilter.insert(obj) on them). # * It does not work well with multiple source or multiple target repos # * It doesn't work so well with pruning become-empty commits (though # --refs doesn't work so well with it either) # These are probably fixable, given some work (e.g. re-importing the # graph at the beginning to get the AncestryGraph right, doing our own # export of marks instead of using fast-export --export-marks, etc.), but # for now just hide the option. 
misc.add_argument('--state-branch', #help=_("Enable incremental filtering by saving the mapping of old " # "to new objects to the specified branch upon exit, and" # "loading that mapping from that branch (if it exists) " # "upon startup.")) help=argparse.SUPPRESS) misc.add_argument('--stdin', action='store_true', help=_("Instead of running `git fast-export` and filtering its " "output, filter the fast-export stream from stdin. The " "stdin must be in the expected input format (e.g. it needs " "to include original-oid directives).")) misc.add_argument('--quiet', action='store_true', help=_("Pass --quiet to other git commands called")) return parser @staticmethod def sanity_check_args(args): if args.analyze and args.path_changes: raise SystemExit(_("Error: --analyze is incompatible with --path* flags; " "it's a read-only operation.")) if args.analyze and args.stdin: raise SystemExit(_("Error: --analyze is incompatible with --stdin.")) # If no path_changes are found, initialize with empty list but mark as # not inclusive so that all files match if args.path_changes == None: args.path_changes = [] args.inclusive = False else: # Similarly, if we have no filtering paths, then no path should be # filtered out. Based on how newname() works, the easiest way to # achieve that is setting args.inclusive to False. if not any(x[0] == 'filter' for x in args.path_changes): args.inclusive = False # Also check for incompatible --use-base-name and --path-rename flags. 
if args.use_base_name: if any(x[0] == 'rename' for x in args.path_changes): raise SystemExit(_("Error: --use-base-name and --path-rename are " "incompatible.")) # Also throw some sanity checks on git version here; # PERF: remove these checks once new enough git versions are common p = subproc.Popen('git fast-export -h'.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = p.stdout.read() if b'--anonymize-map' not in output: # pragma: no cover global date_format_permissive date_format_permissive = False if not any(x in output for x in [b'--mark-tags',b'--[no-]mark-tags']): # pragma: no cover global write_marks write_marks = False if args.state_branch: # We need a version of git-fast-export with --mark-tags raise SystemExit(_("Error: need git >= 2.24.0")) if not any(x in output for x in [b'--reencode', b'--[no-]reencode']): # pragma: no cover if args.preserve_commit_encoding: # We need a version of git-fast-export with --reencode raise SystemExit(_("Error: need git >= 2.23.0")) else: # Set args.preserve_commit_encoding to None which we'll check for later # to avoid passing --reencode=yes to fast-export (that option was the # default prior to git-2.23) args.preserve_commit_encoding = None # If we don't have fast-exoprt --reencode, we may also be missing # diff-tree --combined-all-paths, which is even more important... 
p = subproc.Popen('git diff-tree -h'.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = p.stdout.read() if b'--combined-all-paths' not in output: # We need a version of git-diff-tree with --combined-all-paths raise SystemExit(_("Error: need git >= 2.22.0")) # End of sanity checks on git version if args.max_blob_size: suffix = args.max_blob_size[-1] if suffix not in '1234567890': mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3} if suffix not in mult: raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than" " argument %s") % args.max_blob_size) args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix] else: args.max_blob_size = int(args.max_blob_size) @staticmethod def get_replace_text(filename): replace_literals = [] replace_regexes = [] with open(filename, 'br') as f: for line in f: line = line.rstrip(b'\r\n') # Determine the replacement replacement = FilteringOptions.default_replace_text if b'==>' in line: line, replacement = line.rsplit(b'==>', 1) # See if we need to match via regex regex = None if line.startswith(b'regex:'): regex = line[6:] elif line.startswith(b'glob:'): regex = glob_to_regex(line[5:]) if regex: replace_regexes.append((re.compile(regex), replacement)) else: # Otherwise, find the literal we need to replace if line.startswith(b'literal:'): line = line[8:] if not line: continue replace_literals.append((line, replacement)) return {'literals': replace_literals, 'regexes': replace_regexes} @staticmethod def get_paths_from_file(filename): new_path_changes = [] with open(filename, 'br') as f: for line in f: line = line.rstrip(b'\r\n') # Skip blank lines if not line: continue # Skip comment lines if line.startswith(b'#'): continue # Determine the replacement match_type, repl = 'literal', None if b'==>' in line: line, repl = line.rsplit(b'==>', 1) # See if we need to match via regex match_type = 'match' # a.k.a. 
'literal' if line.startswith(b'regex:'): match_type = 'regex' match = re.compile(line[6:]) elif line.startswith(b'glob:'): match_type = 'glob' match = line[5:] if repl: raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename))) else: if line.startswith(b'literal:'): match = line[8:] else: match = line if repl is not None: if match and repl and match.endswith(b'/') != repl.endswith(b'/'): raise SystemExit(_("Error: When rename directories, if OLDNAME " "and NEW_NAME are both non-empty and either " "ends with a slash then both must.")) # Record the filter or rename if repl is not None: new_path_changes.append(['rename', match_type, (match, repl)]) else: new_path_changes.append(['filter', match_type, match]) if match_type == 'glob' and not match.endswith(b'*'): extension = b'*' if match.endswith(b'/') else b'/*' new_path_changes.append(['filter', match_type, match+extension]) return new_path_changes @staticmethod def default_options(): return FilteringOptions.parse_args([], error_on_empty = False) @staticmethod def parse_args(input_args, error_on_empty = True): parser = FilteringOptions.create_arg_parser() if not input_args and error_on_empty: parser.print_usage() raise SystemExit(_("No arguments specified.")) args = parser.parse_args(input_args) if args.help: parser.print_help() raise SystemExit() if args.paths: raise SystemExit("Error: Option `--paths` unrecognized; did you mean --path or --paths-from-file?") if args.version: GitUtils.print_my_version() raise SystemExit() FilteringOptions.sanity_check_args(args) if args.mailmap: args.mailmap = MailmapInfo(args.mailmap) if args.replace_text: args.replace_text = FilteringOptions.get_replace_text(args.replace_text) if args.replace_message: args.replace_message = FilteringOptions.get_replace_text(args.replace_message) if args.strip_blobs_with_ids: with open(args.strip_blobs_with_ids, 'br') as f: args.strip_blobs_with_ids = set(f.read().split()) else: 
args.strip_blobs_with_ids = set() if (args.partial or args.refs) and not args.replace_refs: args.replace_refs = 'update-no-add' args.repack = not (args.partial or args.refs) if args.refs or args.source or args.target: args.partial = True if not args.refs: args.refs = ['--all'] return args class RepoAnalyze(object): # First, several helper functions for analyze_commit() @staticmethod def equiv_class(stats, filename): return stats['equivalence'].get(filename, (filename,)) @staticmethod def setup_equivalence_for_rename(stats, oldname, newname): # if A is renamed to B and B is renamed to C, then the user thinks of # A, B, and C as all being different names for the same 'file'. We record # this as an equivalence class: # stats['equivalence'][name] = (A,B,C) # for name being each of A, B, and C. old_tuple = stats['equivalence'].get(oldname, ()) if newname in old_tuple: return elif old_tuple: new_tuple = tuple(list(old_tuple)+[newname]) else: new_tuple = (oldname, newname) for f in new_tuple: stats['equivalence'][f] = new_tuple @staticmethod def setup_or_update_rename_history(stats, commit, oldname, newname): rename_commits = stats['rename_history'].get(oldname, set()) rename_commits.add(commit) stats['rename_history'][oldname] = rename_commits @staticmethod def handle_renames(stats, commit, change_types, filenames): for index, change_type in enumerate(change_types): if change_type == ord(b'R'): oldname, newname = filenames[index], filenames[-1] RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname) RepoAnalyze.setup_or_update_rename_history(stats, commit, oldname, newname) @staticmethod def handle_file(stats, graph, commit, modes, shas, filenames): mode, sha, filename = modes[-1], shas[-1], filenames[-1] # Figure out kind of deletions to undo for this file, and update lists # of all-names-by-sha and all-filenames delmode = 'tree_deletions' if mode != b'040000': delmode = 'file_deletions' stats['names'][sha].add(filename) stats['allnames'].add(filename) # If 
the file (or equivalence class of files) was recorded as deleted, # clearly it isn't anymore equiv = RepoAnalyze.equiv_class(stats, filename) for f in equiv: stats[delmode].pop(f, None) # If we get a modify/add for a path that was renamed, we may need to break # the equivalence class. However, if the modify/add was on a branch that # doesn't have the rename in its history, we are still okay. need_to_break_equivalence = False if equiv[-1] != filename: for rename_commit in stats['rename_history'][filename]: if graph.is_ancestor(rename_commit, commit): need_to_break_equivalence = True if need_to_break_equivalence: for f in equiv: if f in stats['equivalence']: del stats['equivalence'][f] @staticmethod def analyze_commit(stats, graph, commit, parents, date, file_changes): graph.add_commit_and_parents(commit, parents) for change in file_changes: modes, shas, change_types, filenames = change if len(parents) == 1 and change_types.startswith(b'R'): change_types = b'R' # remove the rename score; we don't care if modes[-1] == b'160000': continue elif modes[-1] == b'000000': # Track when files/directories are deleted for f in RepoAnalyze.equiv_class(stats, filenames[-1]): if any(x == b'040000' for x in modes[0:-1]): stats['tree_deletions'][f] = date else: stats['file_deletions'][f] = date elif change_types.strip(b'AMT') == b'': RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'': RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) elif change_types.strip(b'RAMT') == b'': RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) RepoAnalyze.handle_renames(stats, commit, change_types, filenames) else: raise SystemExit(_("Unhandled change type(s): %(change_type)s " "(in commit %(commit)s)") % ({'change_type': change_types, 'commit': commit}) ) # pragma: no cover @staticmethod def gather_data(args): unpacked_size, packed_size = GitUtils.get_blob_sizes() stats = 
{'names': collections.defaultdict(set), 'allnames' : set(), 'file_deletions': {}, 'tree_deletions': {}, 'equivalence': {}, 'rename_history': collections.defaultdict(set), 'unpacked_size': unpacked_size, 'packed_size': packed_size, 'num_commits': 0} # Setup the rev-list/diff-tree process processed_commits_msg = _("Processed %d commits") commit_parse_progress = ProgressWriter() num_commits = 0 cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) + ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' + ' --date=short -M -t -c --raw --combined-all-paths') dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) f = dtp.stdout line = f.readline() if not line: raise SystemExit(_("Nothing to analyze; repository is empty.")) cont = bool(line) graph = AncestryGraph() while cont: commit = line.rstrip() parents = f.readline().split() date = f.readline().rstrip() # We expect a blank line next; if we get a non-blank line then # this commit modified no files and we need to move on to the next. # If there is no line, we've reached end-of-input. line = f.readline() if not line: cont = False line = line.rstrip() # If we haven't reached end of input, and we got a blank line meaning # a commit that has modified files, then get the file changes associated # with this commit. 
file_changes = [] if cont and not line: cont = False for line in f: if not line.startswith(b':'): cont = True break n = 1+max(1, len(parents)) assert line.startswith(b':'*(n-1)) relevant = line[n-1:-1] splits = relevant.split(None, n) modes = splits[0:n] splits = splits[n].split(None, n) shas = splits[0:n] splits = splits[n].split(b'\t') change_types = splits[0] filenames = [PathQuoting.dequote(x) for x in splits[1:]] file_changes.append([modes, shas, change_types, filenames]) # If someone is trying to analyze a subset of the history, make sure # to avoid dying on commits with parents that we haven't seen before if args.refs: graph.record_external_commits([p for p in parents if not p in graph.value]) # Analyze this commit and update progress RepoAnalyze.analyze_commit(stats, graph, commit, parents, date, file_changes) num_commits += 1 commit_parse_progress.show(processed_commits_msg % num_commits) # Show the final commits processed message and record the number of commits commit_parse_progress.finish() stats['num_commits'] = num_commits # Close the output, ensure rev-list|diff-tree pipeline completed successfully dtp.stdout.close() if dtp.wait(): raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover return stats @staticmethod def write_report(reportdir, stats): def datestr(datetimestr): return datetimestr if datetimestr else _('').encode() def dirnames(path): while True: path = os.path.dirname(path) yield path if path == b'': break # Compute aggregate size information for paths, extensions, and dirs total_size = {'packed': 0, 'unpacked': 0} path_size = {'packed': collections.defaultdict(int), 'unpacked': collections.defaultdict(int)} ext_size = {'packed': collections.defaultdict(int), 'unpacked': collections.defaultdict(int)} dir_size = {'packed': collections.defaultdict(int), 'unpacked': collections.defaultdict(int)} for sha in stats['names']: size = {'packed': stats['packed_size'][sha], 'unpacked': 
stats['unpacked_size'][sha]} for which in ('packed', 'unpacked'): for name in stats['names'][sha]: total_size[which] += size[which] path_size[which][name] += size[which] basename, ext = os.path.splitext(name) ext_size[which][ext] += size[which] for dirname in dirnames(name): dir_size[which][dirname] += size[which] # Determine if and when extensions and directories were deleted ext_deleted_data = {} for name in stats['allnames']: when = stats['file_deletions'].get(name, None) # Update the extension basename, ext = os.path.splitext(name) if when is None: ext_deleted_data[ext] = None elif ext in ext_deleted_data: if ext_deleted_data[ext] is not None: ext_deleted_data[ext] = max(ext_deleted_data[ext], when) else: ext_deleted_data[ext] = when dir_deleted_data = {} for name in dir_size['packed']: dir_deleted_data[name] = stats['tree_deletions'].get(name, None) with open(os.path.join(reportdir, b"README"), 'bw') as f: # Give a basic overview of this file f.write(b"== %s ==\n" % _("Overall Statistics").encode()) f.write((" %s: %d\n" % (_("Number of commits"), stats['num_commits'])).encode()) f.write((" %s: %d\n" % (_("Number of filenames"), len(path_size['packed']))).encode()) f.write((" %s: %d\n" % (_("Number of directories"), len(dir_size['packed']))).encode()) f.write((" %s: %d\n" % (_("Number of file extensions"), len(ext_size['packed']))).encode()) f.write(b"\n") f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"), total_size['unpacked'])).encode()) f.write((" %s: %d\n" % (_("Total packed size (bytes)"), total_size['packed'])).encode()) f.write(b"\n") # Mention issues with the report f.write(("== %s ==\n" % _("Caveats")).encode()) f.write(("=== %s ===\n" % _("Sizes")).encode()) f.write(textwrap.dedent(_(""" Packed size represents what size your repository would be if no trees, commits, tags, or other metadata were included (though it may fail to represent de-duplication; see below). 
It also represents the current packing, which may be suboptimal if you haven't gc'ed for a while. Unpacked size represents what size your repository would be if no trees, commits, tags, or other metadata were included AND if no files were packed; i.e., without delta-ing or compression. Both unpacked and packed sizes can be slightly misleading. Deleting a blob from history not save as much space as the unpacked size, because it is obviously normally stored in packed form. Also, deleting a blob from history may not save as much space as its packed size either, because another blob could be stored as a delta against that blob, so when you remove one blob another blob's packed size may grow. Also, the sum of the packed sizes can add up to more than the repository size; if the same contents appeared in the repository in multiple places, git will automatically de-dupe and store only one copy, while the way sizes are added in this analysis adds the size for each file path that has those contents. Further, if a file is ever reverted to a previous version's contents, the previous version's size will be counted multiple times in this analysis, even though git will only store it once. """)[1:]).encode()) f.write(b"\n") f.write(("=== %s ===\n" % _("Deletions")).encode()) f.write(textwrap.dedent(_(""" Whether a file is deleted is not a binary quality, since it can be deleted on some branches but still exist in others. Also, it might exist in an old tag, but have been deleted in versions newer than that. More thorough tracking could be done, including looking at merge commits where one side of history deleted and the other modified, in order to give a more holistic picture of deletions. However, that algorithm would not only be more complex to implement, it'd also be quite difficult to present and interpret by users. 
Since --analyze is just about getting a high-level rough picture of history, it instead implements the simplistic rule that is good enough for 98% of cases: A file is marked as deleted if the last commit in the fast-export stream that mentions the file lists it as deleted. This makes it dependent on topological ordering, but generally gives the "right" answer. """)[1:]).encode()) f.write(b"\n") f.write(("=== %s ===\n" % _("Renames")).encode()) f.write(textwrap.dedent(_(""" Renames share the same non-binary nature that deletions do, plus additional challenges: * If the renamed file is renamed again, instead of just two names for a path you can have three or more. * Rename pairs of the form (oldname, newname) that we consider to be different names of the "same file" might only be valid over certain commit ranges. For example, if a new commit reintroduces a file named oldname, then new versions of oldname aren't the "same file" anymore. We could try to portray this to the user, but it's easier for the user to just break the pairing and only report unbroken rename pairings to the user. * The ability for users to rename files differently in different branches means that our chains of renames will not necessarily be linear but may branch out. """)[1:]).encode()) f.write(b"\n") # Equivalence classes for names, so if folks only want to keep a # certain set of paths, they know the old names they want to include # too. 
with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f: seen = set() for pathname,equiv_group in sorted(stats['equivalence'].items(), key=lambda x:(x[1], x[0])): if equiv_group in seen: continue seen.add(equiv_group) f.write(("{} ->\n ".format(decode(equiv_group[0])) + "\n ".join(decode(x) for x in equiv_group[1:]) + "\n").encode()) # List directories in reverse sorted order of unpacked size with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f: msg = "=== %s ===\n" % _("Deleted directories by reverse size") f.write(msg.encode()) msg = _("Format: unpacked size, packed size, date deleted, directory name\n") f.write(msg.encode()) for dirname, size in sorted(dir_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): if (dir_deleted_data[dirname]): f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], size, datestr(dir_deleted_data[dirname]), dirname or _('').encode())) with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f: f.write(("=== %s ===\n" % _("All directories by reverse size")).encode()) msg = _("Format: unpacked size, packed size, date deleted, directory name\n") f.write(msg.encode()) for dirname, size in sorted(dir_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], size, datestr(dir_deleted_data[dirname]), dirname or _("").encode())) # List extensions in reverse sorted order of unpacked size with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f: msg = "=== %s ===\n" % _("Deleted extensions by reverse size") f.write(msg.encode()) msg = _("Format: unpacked size, packed size, date deleted, extension name\n") f.write(msg.encode()) for extname, size in sorted(ext_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): if (ext_deleted_data[extname]): f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], size, datestr(ext_deleted_data[extname]), extname or 
_('').encode())) with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f: f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode()) msg = _("Format: unpacked size, packed size, date deleted, extension name\n") f.write(msg.encode()) for extname, size in sorted(ext_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], size, datestr(ext_deleted_data[extname]), extname or _('').encode())) # List files in reverse sorted order of unpacked size with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f: msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size") f.write(msg.encode()) msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n") f.write(msg.encode()) for pathname, size in sorted(path_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): when = stats['file_deletions'].get(pathname, None) if when: f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], size, datestr(when), pathname)) with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f: msg = "=== %s ===\n" % _("All paths by reverse accumulated size") f.write(msg.encode()) msg = _("Format: unpacked size, packed size, date deleted, path name\n") f.write(msg.encode()) for pathname, size in sorted(path_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): when = stats['file_deletions'].get(pathname, None) f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], size, datestr(when), pathname)) # List of filenames and sizes in descending order with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f: f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode()) f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode()) for sha, size in sorted(stats['packed_size'].items(), key=lambda x:(x[1],x[0]), reverse=True): if sha 
not in stats['names']: # Some objects in the repository might not be referenced, or not # referenced by the branches/tags the user cares about; skip them. continue names_with_sha = stats['names'][sha] if len(names_with_sha) == 1: names_with_sha = names_with_sha.pop() else: names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']' f.write(b" %s %10d %10d %s\n" % (sha, stats['unpacked_size'][sha], size, names_with_sha)) @staticmethod def run(args): if args.report_dir: reportdir = args.report_dir else: git_dir = GitUtils.determine_git_dir(b'.') # Create the report directory as necessary results_tmp_dir = os.path.join(git_dir, b'filter-repo') if not os.path.isdir(results_tmp_dir): os.mkdir(results_tmp_dir) reportdir = os.path.join(results_tmp_dir, b"analysis") if os.path.isdir(reportdir): if args.force: sys.stdout.write(_("Warning: Removing recursively: \"%s\"") % decode(reportdir)) shutil.rmtree(reportdir) else: sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir)) sys.exit(1) os.mkdir(reportdir) # Gather the data we need stats = RepoAnalyze.gather_data(args) # Write the reports sys.stdout.write(_("Writing reports to %s...") % decode(reportdir)) sys.stdout.flush() RepoAnalyze.write_report(reportdir, stats) sys.stdout.write(_("done.\n")) class InputFileBackup: def __init__(self, input_file, output_file): self.input_file = input_file self.output_file = output_file def close(self): self.input_file.close() self.output_file.close() def read(self, size): output = self.input_file.read(size) self.output_file.write(output) return output def readline(self): line = self.input_file.readline() self.output_file.write(line) return line class DualFileWriter: def __init__(self, file1, file2): self.file1 = file1 self.file2 = file2 def write(self, *args): self.file1.write(*args) self.file2.write(*args) def flush(self): self.file1.flush() self.file2.flush() def close(self): self.file1.close() self.file2.close() class 
RepoFilter(object): def __init__(self, args, filename_callback = None, message_callback = None, name_callback = None, email_callback = None, refname_callback = None, blob_callback = None, commit_callback = None, tag_callback = None, reset_callback = None, done_callback = None): self._args = args # Repo we are exporting self._repo_working_dir = None # Store callbacks for acting on objects printed by FastExport self._blob_callback = blob_callback self._commit_callback = commit_callback self._tag_callback = tag_callback self._reset_callback = reset_callback self._done_callback = done_callback # Store callbacks for acting on slices of FastExport objects self._filename_callback = filename_callback # filenames from commits self._message_callback = message_callback # commit OR tag message self._name_callback = name_callback # author, committer, tagger self._email_callback = email_callback # author, committer, tagger self._refname_callback = refname_callback # from commit/tag/reset self._handle_arg_callbacks() # Defaults for input self._input = None self._fep = None # Fast Export Process self._fe_orig = None # Path to where original fast-export output stored self._fe_filt = None # Path to where filtered fast-export output stored self._parser = None # FastExportParser object we are working with # Defaults for output self._output = None self._fip = None # Fast Import Process self._import_pipes = None self._managed_output = True # A tuple of (depth, list-of-ancestors). Commits and ancestors are # identified by their id (their 'mark' in fast-export or fast-import # speak). The depth of a commit is one more than the max depth of any # of its ancestors. 
self._graph = AncestryGraph() # Another one, for ancestry of commits in the original repo self._orig_graph = AncestryGraph() # Names of files that were tweaked in any commit; such paths could lead # to subsequent commits being empty self._files_tweaked = set() # A set of commit hash pairs (oldhash, newhash) which used to be merge # commits but due to filtering were turned into non-merge commits. # The commits probably have suboptimal commit messages (e.g. "Merge branch # next into master"). self._commits_no_longer_merges = [] # A dict of original_ids to new_ids; filtering commits means getting # new commit hash (sha1sums), and we record the mapping both for # diagnostic purposes and so we can rewrite commit messages. Note that # the new_id can be None rather than a commit hash if the original # commit became empty and was pruned or was otherwise dropped. self._commit_renames = {} # A set of original_ids for which we have not yet gotten the # new_ids; we use OrderedDict because we need to know the order of # insertion, but the values are always ignored (and set to None). # If there was an OrderedSet class, I'd use it instead. self._pending_renames = collections.OrderedDict() # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix). # # It's common for commit messages to refer to commits by abbreviated # commit hashes, as short as 7 characters. To facilitate translating # such short hashes, we have a mapping of prefixes to full old hashes. self._commit_short_old_hashes = collections.defaultdict(set) # A set of commit hash references appearing in commit messages which # mapped to a valid commit that was removed entirely in the filtering # process. The commit message will continue to reference the # now-missing commit hash, since there was nothing to map it to. self._commits_referenced_but_removed = set() # Progress handling (number of commits parsed, etc.) 
self._progress_writer = ProgressWriter() self._num_commits = 0 # Size of blobs in the repo self._unpacked_size = {} # Other vars self._sanity_checks_handled = False self._finalize_handled = False self._orig_refs = None self._newnames = {} # Cache a few message translations for performance reasons self._parsed_message = _("Parsed %d commits") # Compile some regexes and cache those self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)') def _handle_arg_callbacks(self): def make_callback(argname, str): callback_globals = {g: globals()[g] for g in public_globals} callback_locals = {} exec('def callback({}, _do_not_use_this_var = None):\n'.format(argname)+ ' '+'\n '.join(str.splitlines()), callback_globals, callback_locals) return callback_locals['callback'] def handle(type): callback_field = '_{}_callback'.format(type) code_string = getattr(self._args, type+'_callback') if code_string: if os.path.exists(code_string): with open(code_string, 'r', encoding='utf-8') as f: code_string = f.read() if getattr(self, callback_field): raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter " "AND pass --%s-callback" % (type, type))) if 'return ' not in code_string and \ type not in ('blob', 'commit', 'tag', 'reset'): raise SystemExit(_("Error: --%s-callback should have a return statement") % type) setattr(self, callback_field, make_callback(type, code_string)) handle('filename') handle('message') handle('name') handle('email') handle('refname') handle('blob') handle('commit') handle('tag') handle('reset') def _run_sanity_checks(self): self._sanity_checks_handled = True if not self._managed_output: if not self._args.replace_refs: # If not _managed_output we don't want to make extra changes to the # repo, so set default to no-op 'update-no-add' self._args.replace_refs = 'update-no-add' return if self._args.debug: print("[DEBUG] Passed arguments:\n{}".format(self._args)) # Determine basic repository information target_working_dir = self._args.target or b'.' 
self._orig_refs = GitUtils.get_refs(target_working_dir) is_bare = GitUtils.is_repository_bare(target_working_dir) # Determine if this is second or later run of filter-repo tmp_dir = self.results_tmp_dir(create_if_missing=False) already_ran = os.path.isfile(os.path.join(tmp_dir, b'already_ran')) # Default for --replace-refs if not self._args.replace_refs: self._args.replace_refs = 'update-no-add' if self._args.replace_refs == 'old-default': self._args.replace_refs = ('update-or-add' if already_ran else 'update-and-add') # Do sanity checks from the correct directory if not self._args.force and not already_ran: cwd = os.getcwd() os.chdir(target_working_dir) RepoFilter.sanity_check(self._orig_refs, is_bare) os.chdir(cwd) @staticmethod def sanity_check(refs, is_bare): def abort(reason): try: cmd = 'git config remote.origin.url' output = subproc.check_output(cmd.split()).strip() except subprocess.CalledProcessError as e: output = None msg = "" if output and os.path.isdir(output): msg = _("Note: when cloning local repositories, you need to pass\n" " --no-local to git clone to avoid this issue.\n") raise SystemExit( _("Aborting: Refusing to destructively overwrite repo history since\n" "this does not look like a fresh clone.\n" " (%s)\n%s" "Please operate on a fresh clone instead. If you want to proceed\n" "anyway, use --force.") % (reason, msg)) # Make sure repo is fully packed, just like a fresh clone would be. # Note that transfer.unpackLimit defaults to 100, meaning that a # repository with no packs and less than 100 objects should be considered # fully packed. 
    # A fresh clone is fully packed (transfer.unpackLimit defaults to 100, so
    # <100 objects may legitimately be loose with no packs at all).
    output = subproc.check_output('git count-objects -v'.split())
    stats = dict(x.split(b': ') for x in output.splitlines())
    num_packs = int(stats[b'packs'])
    num_loose_objects = int(stats[b'count'])
    if num_packs > 1 or \
       (num_packs == 1 and num_loose_objects > 0) or \
       num_loose_objects >= 100:
      abort(_("expected freshly packed repo"))

    # Make sure there is precisely one remote, named "origin"...or that this
    # is a new bare repo with no packs and no remotes
    output = subproc.check_output('git remote'.split()).strip()
    if not (output == b"origin" or (num_packs == 0 and not output)):
      abort(_("expected one remote, origin"))

    # Avoid letting people running with weird setups and overwriting GIT_DIR
    # elsewhere
    git_dir = GitUtils.determine_git_dir(b'.')
    if is_bare and git_dir != b'.':
      abort(_("GIT_DIR must be ."))
    elif not is_bare and git_dir != b'.git':
      abort(_("GIT_DIR must be .git"))

    # Make sure that all reflogs have precisely one entry (i.e. the clone);
    # more entries imply local history we would destroy.
    reflog_dir=os.path.join(git_dir, b'logs')
    for root, dirs, files in os.walk(reflog_dir):
      for filename in files:
        pathname = os.path.join(root, filename)
        with open(pathname, 'br') as f:
          if len(f.read().splitlines()) > 1:
            shortpath = pathname[len(reflog_dir)+1:]
            abort(_("expected at most one entry in the reflog for %s") %
                  decode(shortpath))

    # Make sure there are no stashed changes
    if b'refs/stash' in refs:
      abort(_("has stashed changes"))

    # Do extra checks in non-bare repos
    if not is_bare:
      # Avoid uncommitted, unstaged, or untracked changes
      if subproc.call('git diff --staged --quiet'.split()):
        abort(_("you have uncommitted changes"))
      if subproc.call('git diff --quiet'.split()):
        abort(_("you have unstaged changes"))
      if len(subproc.check_output('git ls-files -o'.split())) > 0:
        abort(_("you have untracked changes"))

      # Avoid unpushed changes: every local branch must exist at, and match,
      # its origin counterpart.
      for refname, rev in refs.items():
        if not refname.startswith(b'refs/heads/'):
          continue
        origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/')
        if origin_ref not in refs:
          abort(_('%s exists, but %s not found') % (decode(refname),
                                                    decode(origin_ref)))
        if rev != refs[origin_ref]:
          abort(_('%s does not match %s') % (decode(refname),
                                             decode(origin_ref)))

      # Make sure there is only one worktree
      output = subproc.check_output('git worktree list'.split())
      if len(output.splitlines()) > 1:
        abort(_('you have multiple worktrees'))

  @staticmethod
  def cleanup(repo, repack, reset, run_quietly=False, show_debuginfo=False):
    ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now.
        if reset then do a reset --hard. Optionally also curb output if
        run_quietly is True, or go the opposite direction and show extra
        output if show_debuginfo is True. '''
    assert not (run_quietly and show_debuginfo)
    if (repack and not run_quietly and not show_debuginfo):
      print(_("Repacking your repo and cleaning out old unneeded objects"))
    quiet_flags = '--quiet' if run_quietly else ''
    cleanup_cmds = []
    if repack:
      cleanup_cmds = ['git reflog expire --expire=now --all'.split(),
                      'git gc {} --prune=now'.format(quiet_flags).split()]
    if reset:
      # reset must run before the reflog expire / gc commands
      cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split())
    location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else ''
    for cmd in cleanup_cmds:
      if show_debuginfo:
        print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd)))
      subproc.call(cmd, cwd=repo)

  def _get_rename(self, old_hash):
    # Return the new commit hash corresponding to old_hash, or None if not
    # (yet) known.  May consume pending renames from the fast-import pipe.
    # If we already know the rename, just return it
    new_hash = self._commit_renames.get(old_hash, None)
    if new_hash:
      return new_hash

    # If it's not in the remaining pending renames, we don't know it
    if old_hash is not None and old_hash not in self._pending_renames:
      return None

    # Read through the pending renames until we find it or we've read them all,
    # and return whatever we might find
    self._flush_renames(old_hash)
    return self._commit_renames.get(old_hash, None)

  def _flush_renames(self, old_hash=None, limit=0):
    # Parse through self._pending_renames until we have read enough.  We have
    # read enough if:
    #   self._pending_renames is empty
    #   old_hash != None and we found a rename for old_hash
    #   limit > 0 and len(self._pending_renames) started less than 2*limit
    #   limit > 0 and len(self._pending_renames) < limit
    if limit and len(self._pending_renames) < 2 * limit:
      return
    fi_input, fi_output = self._import_pipes
    while self._pending_renames:
      # One line of fast-import output is expected per queued get-mark
      # request; popitem(last=False) preserves FIFO order to match.
      orig_id, ignore = self._pending_renames.popitem(last=False)
      new_id = fi_output.readline().rstrip()
      self._commit_renames[orig_id] = new_id
      if old_hash == orig_id:
        return
      if limit and len(self._pending_renames) < limit:
        return

  def _translate_commit_hash(self, matchobj_or_oldhash):
    # Map an old (possibly abbreviated) commit hash -- given either as bytes
    # or as a regex match object whose group(1) is the hash -- to the
    # corresponding new hash, truncated to the same length as the input.
    # Unresolvable references are recorded in
    # self._commits_referenced_but_removed and returned unchanged.
    old_hash = matchobj_or_oldhash
    if not isinstance(matchobj_or_oldhash, bytes):
      old_hash = matchobj_or_oldhash.group(1)
    orig_len = len(old_hash)
    new_hash = self._get_rename(old_hash)
    if new_hash is None:
      # Try to expand an abbreviated hash via the 7-char prefix index
      if old_hash[0:7] not in self._commit_short_old_hashes:
        self._commits_referenced_but_removed.add(old_hash)
        return old_hash
      possibilities = self._commit_short_old_hashes[old_hash[0:7]]
      matches = [x for x in possibilities
                 if x[0:orig_len] == old_hash]
      if len(matches) != 1:
        # Zero or ambiguous expansions; leave the reference as-is
        self._commits_referenced_but_removed.add(old_hash)
        return old_hash
      old_hash = matches[0]
      new_hash = self._get_rename(old_hash)
      assert new_hash is not None
    return new_hash[0:orig_len]

  def _trim_extra_parents(self, orig_parents, parents):
    '''Due to pruning of empty commits, some parents could be non-existent
       (None) or otherwise redundant.  Remove the non-existent parents, and
       remove redundant parents so long as that doesn't transform a merge
       commit into a non-merge commit.

       Returns a tuple:
         (parents, new_first_parent_if_would_become_non_merge)'''

    always_prune = (self._args.prune_degenerate == 'always')

    # Pruning of empty commits means multiple things:
    #   * An original parent of this commit may have been pruned causing the
    #     need to rewrite the reported parent to the nearest ancestor.  We
    #     want to know when we're dealing with such a parent.
    #   * Further, there may be no "nearest ancestor" if the entire history
    #     of that parent was also pruned.  (Detectable by the parent being
    #     'None')
    # Remove all parents rewritten to None, and keep track of which parents
    # were rewritten to an ancestor.
    tmp = zip(parents,
              orig_parents,
              [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents])
    tmp2 = [x for x in tmp if x[0] is not None]
    if not tmp2:
      # All ancestors have been pruned; we have no parents.
      return [], None
    parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)]

    # We can't have redundant parents if we don't have at least 2 parents
    if len(parents) < 2:
      return parents, None

    # Don't remove redundant parents if user doesn't want us to
    if self._args.prune_degenerate == 'never':
      return parents, None

    # Remove duplicate parents (if both sides of history have lots of commits
    # which become empty due to pruning, the most recent ancestor on both
    # sides may be the same commit), except only remove parents that have
    # been rewritten due to previous empty pruning.
    seen = set()
    seen_add = seen.add
    # Deleting duplicate rewritten parents means keeping parents if either
    # they have not been seen or they are ones that have not been rewritten.
    parents_copy = parents
    uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents)
            if not (p in seen or seen_add(p)) or not is_rewritten[i]]
    parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)]
    if len(parents) < 2:
      # Deduplication dropped us to one parent; report the original parent
      # list plus the sole survivor so the caller can decide about pruning.
      return parents_copy, parents[0]

    # Flatten unnecessary merges.  (If one side of history is entirely
    # empty commits that were pruned, we may end up attempting to
    # merge a commit with its ancestor.  Remove parents that are an
    # ancestor of another parent.)
    num_parents = len(parents)
    to_remove = []
    for cur in range(num_parents):
      if not is_rewritten[cur]:
        continue
      for other in range(num_parents):
        if cur == other:
          continue
        if not self._graph.is_ancestor(parents[cur], parents[other]):
          continue
        # parents[cur] is an ancestor of parents[other], so parents[cur]
        # seems redundant.  However, if it was intentionally redundant
        # (e.g. a no-ff merge) in the original, then we want to keep it.
        if not always_prune and \
           self._orig_graph.is_ancestor(orig_parents[cur],
                                        orig_parents[other]):
          continue
        # Some folks want their history to have all first parents be merge
        # commits (except for any root commits), and always do a merge --no-ff.
        # For such folks, don't remove the first parent even if it's an
        # ancestor of other commits.
        if self._args.no_ff and cur == 0:
          continue
        # Okay so the cur-th parent is an ancestor of the other-th parent,
        # and it wasn't that way in the original repository; mark the
        # cur-th parent as removable.
        to_remove.append(cur)
        break # cur removed, so skip rest of others -- i.e. check cur+=1
    # Pop from the end so earlier indices stay valid
    for x in reversed(to_remove):
      parents.pop(x)
    if len(parents) < 2:
      return parents_copy, parents[0]

    return parents, None

  def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents):
    # Return True if commit is empty (or degenerate) and should be pruned,
    # False otherwise.  The hard cases query fast-import over
    # self._import_pipes to compare this commit's file_changes against its
    # (possibly rewritten) first parent.
    parents = commit.parents

    if self._args.prune_empty == 'never':
      return False
    always_prune = (self._args.prune_empty == 'always')

    # For merge commits, unless there are prunable (redundant) parents, we
    # do not want to prune
    if len(parents) >= 2 and not new_1st_parent:
      return False

    if len(parents) < 2:
      # Special logic for commits that started empty...
      if not had_file_changes and not always_prune:
        had_parents_pruned = (len(parents) < len(orig_parents) or
                              (len(orig_parents) == 1 and
                               orig_parents[0] in _SKIPPED_COMMITS))
        # If the commit remains empty and had parents which were pruned,
        # then prune this commit; otherwise, retain it
        return (not commit.file_changes and had_parents_pruned)

      # We can only get here if the commit didn't start empty, so if it's
      # empty now, it obviously became empty
      if not commit.file_changes:
        return True

    # If there are no parents of this commit and we didn't match the case
    # above, then this commit cannot be pruned.  Since we have no parent(s)
    # to compare to, abort now to prevent future checks from failing.
    if not parents:
      return False

    # Similarly, we cannot handle the hard cases if we don't have a pipe
    # to communicate with fast-import
    if not self._import_pipes:
      return False

    # If there have not been renames/remappings of IDs (due to insertion of
    # new blobs), then we can sometimes know things aren't prunable with a
    # simple check
    if not _IDS.has_renames():
      # non-merge commits can only be empty if blob/file-change editing caused
      # all file changes in the commit to have the same file contents as
      # the parent.
      changed_files = set(change.filename for change in commit.file_changes)
      if len(orig_parents) < 2 and changed_files - self._files_tweaked:
        return False

    # Finally, the hard case: due to either blob rewriting, or due to pruning
    # of empty commits wiping out the first parent history back to the merge
    # base, the list of file_changes we have may not actually differ from our
    # (new) first parent's version of the files, i.e. this would actually be
    # an empty commit.  Check by comparing the contents of this commit to its
    # (remaining) parent.
    #
    # NOTE on why this works, for the case of original first parent history
    # having been pruned away due to being empty:
    #     The first parent history having been pruned away due to being
    #     empty implies the original first parent would have a tree (after
    #     filtering) that matched the merge base's tree.  Since
    #     file_changes has the changes needed to go from what would have
    #     been the first parent to our new commit, and what would have been
    #     our first parent has a tree that matches the merge base, then if
    #     the new first parent has a tree matching the versions of files in
    #     file_changes, then this new commit is empty and thus prunable.
    fi_input, fi_output = self._import_pipes
    self._flush_renames()  # Avoid fi_output having other stuff present
    # Optimization note: we could have two loops over file_changes, the
    # first doing all the self._output.write() calls, and the second doing
    # the rest.  But I'm worried about fast-import blocking on fi_output
    # buffers filling up so I instead read from it as I go.
    for change in commit.file_changes:
      parent = new_1st_parent or commit.parents[0] # exists due to above checks
      quoted_filename = PathQuoting.enquote(change.filename)
      # An int parent is a fast-import mark; otherwise it's a raw hash
      if isinstance(parent, int):
        self._output.write(b"ls :%d %s\n" % (parent, quoted_filename))
      else:
        self._output.write(b"ls %s %s\n" % (parent, quoted_filename))
      self._output.flush()
      parent_version = fi_output.readline().split()
      if change.type == b'D':
        # A deletion is a no-op only if the parent lacks the file too
        if parent_version != [b'missing', quoted_filename]:
          return False
      else:
        # Resolve a mark to its blob sha so we can compare with 'ls' output
        blob_sha = change.blob_id
        if isinstance(change.blob_id, int):
          self._output.write(b"get-mark :%d\n" % change.blob_id)
          self._output.flush()
          blob_sha = fi_output.readline().rstrip()
        if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]:
          return False

    return True

  def _record_remapping(self, commit, orig_parents):
    # Queue up a get-mark request so the old-hash -> new-hash mapping for
    # this commit can be read back (lazily, via _flush_renames) later.
    new_id = None
    # Record the mapping of old commit hash to new one
    if commit.original_id and self._import_pipes:
      fi_input, fi_output = self._import_pipes
      self._output.write(b"get-mark :%d\n" % commit.id)
      self._output.flush()
      orig_id = commit.original_id
      self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
      # Note that we have queued up an id for later reading; flush a
      # few of the older ones if we have too many queued up
      self._pending_renames[orig_id] = None
      self._flush_renames(None, limit=40)
    # Also, record if this was a merge commit that turned into a non-merge
    # commit.
    # NOTE(review): new_id is still None at this point; the new hash is
    # presumably resolved later from _commit_renames -- confirm with the
    # suboptimal-issues writer.
    if len(orig_parents) >= 2 and len(commit.parents) < 2:
      self._commits_no_longer_merges.append((commit.original_id, new_id))

  def callback_metadata(self, extra_items = dict()):
    # Build the metadata dict passed to user-supplied callbacks
    return {'commit_rename_func': self._translate_commit_hash,
            'ancestry_graph': self._graph,
            'original_ancestry_graph': self._orig_graph,
            **extra_items}

  def _tweak_blob(self, blob):
    # Apply size/id-based stripping, --replace-text rules, and the user's
    # blob callback to a single blob.
    if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size:
      blob.skip()

    if blob.original_id in self._args.strip_blobs_with_ids:
      blob.skip()

    if ( self._args.replace_text
        # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data)
        and not b"\0" in blob.data[0:8192]
    ):
      for literal, replacement in self._args.replace_text['literals']:
        blob.data = blob.data.replace(literal, replacement)
      for regex, replacement in self._args.replace_text['regexes']:
        blob.data = regex.sub(replacement, blob.data)

    if self._blob_callback:
      self._blob_callback(blob, self.callback_metadata())

  def _filter_files(self, commit):
    # Apply path filtering/renaming (and the filename callback) to
    # commit.file_changes, dropping excluded files, resolving rename
    # collisions, and stripping changes for overly-large/unwanted blobs.
    def filename_matches(path_expression, pathname):
      ''' Returns whether path_expression matches pathname or a leading
          directory thereof, allowing path_expression to not have a trailing
          slash even if it is meant to match a leading directory. '''
      if path_expression == b'':
        return True
      n = len(path_expression)
      if (pathname.startswith(path_expression) and
          (path_expression[n-1:n] == b'/' or
           len(pathname) == n or
           pathname[n:n+1] == b'/')):
        return True
      return False

    def newname(path_changes, pathname, use_base_name, filtering_is_inclusive):
      ''' Applies filtering and rename changes from path_changes to pathname,
          returning any of None (file isn't wanted), original filename (file
          is wanted with original name), or new filename. '''
      wanted = False
      full_pathname = pathname
      if use_base_name:
        pathname = os.path.basename(pathname)
      for (mod_type, match_type, path_exp) in path_changes:
        if mod_type == 'filter' and not wanted:
          assert match_type in ('match', 'glob', 'regex')
          if match_type == 'match' and filename_matches(path_exp, pathname):
            wanted = True
          if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp):
            wanted = True
          if match_type == 'regex' and path_exp.search(pathname):
            wanted = True
        elif mod_type == 'rename':
          match, repl = path_exp
          assert match_type in ('match','regex') # glob was translated to regex
          if match_type == 'match' and filename_matches(match, full_pathname):
            full_pathname = full_pathname.replace(match, repl, 1)
          if match_type == 'regex':
            full_pathname = match.sub(repl, full_pathname)
      return full_pathname if (wanted == filtering_is_inclusive) else None

    args = self._args
    new_file_changes = {}  # Assumes no renames or copies, otherwise collisions
    for change in commit.file_changes:
      # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and
      # parse that output, we'll need to modify this block; `--full-tree`
      # issues a deleteall directive which has no filename, and thus this
      # block would normally strip it.  Of course, FileChange() and
      # _parse_optional_filechange() would need updates too.
      if change.type == b'DELETEALL':
        new_file_changes[b''] = change
        continue
      # Cache newname() results; renames are deterministic per input path
      if change.filename in self._newnames:
        change.filename = self._newnames[change.filename]
      else:
        original_filename = change.filename
        change.filename = newname(args.path_changes, change.filename,
                                  args.use_base_name, args.inclusive)
        if self._filename_callback:
          change.filename = self._filename_callback(change.filename)
        self._newnames[original_filename] = change.filename
      if not change.filename:
        continue # Filtering criteria excluded this file; move on to next one
      if change.filename in new_file_changes:
        # Getting here means that path renaming is in effect, and caused one
        # path to collide with another.  That's usually bad, but can be okay
        # under two circumstances:
        #   1) Sometimes people have a file named OLDFILE in old revisions of
        #      history, and they rename to NEWFILE, and would like to rewrite
        #      history so that all revisions refer to it as NEWFILE.  As such,
        #      we can allow a collision when (at least) one of the two paths
        #      is a deletion.  Note that if OLDFILE and NEWFILE are unrelated
        #      this also allows the rewrite to continue, which makes sense
        #      since OLDFILE is no longer in the way.
        #   2) If OLDFILE and NEWFILE are exactly equal, then writing them
        #      both to the same location poses no problem; we only need one
        #      file.  (This could come up if someone copied a file in some
        #      commit, then later either deleted the file or kept it exactly
        #      in sync with the original with any changes, and then decides
        #      they want to rewrite history to only have one of the two files)
        colliding_change = new_file_changes[change.filename]
        if change.type == b'D':
          # We can just throw this one away and keep the other
          continue
        elif change.type == b'M' and (
            change.mode == colliding_change.mode and
            change.blob_id == colliding_change.blob_id):
          # The two are identical, so we can throw this one away and keep other
          continue
        elif new_file_changes[change.filename].type != b'D':
          raise SystemExit(_("File renaming caused colliding pathnames!\n") +
                           _("  Commit: {}\n").format(commit.original_id) +
                           _("  Filename: {}").format(change.filename))
      # Strip files that are too large
      if self._args.max_blob_size and \
         self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size:
        continue
      if self._args.strip_blobs_with_ids and \
         change.blob_id in self._args.strip_blobs_with_ids:
        continue
      # Otherwise, record the change
      new_file_changes[change.filename] = change
    commit.file_changes = [v for k,v in sorted(new_file_changes.items())]

  def _tweak_commit(self, commit, aux_info):
    # The main per-commit pipeline: rewrite message/identity/branch, filter
    # files, record ancestry, prune degenerate parents, invoke the commit
    # callback, then either emit the commit or skip it as prunable.
    if self._args.replace_message:
      for literal, replacement in self._args.replace_message['literals']:
        commit.message = commit.message.replace(literal, replacement)
      for regex, replacement in self._args.replace_message['regexes']:
        commit.message = regex.sub(replacement, commit.message)
    if self._message_callback:
      commit.message = self._message_callback(commit.message)

    # Change the commit message according to callback
    if not self._args.preserve_commit_hashes:
      commit.message = self._hash_re.sub(self._translate_commit_hash,
                                         commit.message)

    # Change the author & committer according to mailmap rules
    args = self._args
    if args.mailmap:
      commit.author_name, commit.author_email = \
          args.mailmap.translate(commit.author_name, commit.author_email)
      commit.committer_name, commit.committer_email = \
          args.mailmap.translate(commit.committer_name,
                                 commit.committer_email)
    # Change author & committer according to callbacks
    if self._name_callback:
      commit.author_name = self._name_callback(commit.author_name)
      commit.committer_name = self._name_callback(commit.committer_name)
    if self._email_callback:
      commit.author_email = self._email_callback(commit.author_email)
      commit.committer_email = self._email_callback(commit.committer_email)

    # Sometimes the 'branch' given is a tag; if so, rename it as requested so
    # we don't get any old tagnames
    if self._args.tag_rename:
      commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch)
    if self._refname_callback:
      commit.branch = self._refname_callback(commit.branch)

    # Filter or rename the list of file changes
    orig_file_changes = set(commit.file_changes)
    self._filter_files(commit)

    # Record ancestry graph
    parents, orig_parents = commit.parents, aux_info['orig_parents']
    if self._args.state_branch:
      external_parents = parents
    else:
      external_parents = [p for p in parents if not isinstance(p, int)]
    self._graph.record_external_commits(external_parents)
    self._orig_graph.record_external_commits(external_parents)
    self._graph.add_commit_and_parents(commit.id, parents)
    self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents)

    # Prune parents (due to pruning of empty commits) if relevant
    old_1st_parent = parents[0] if parents else None
    parents, new_1st_parent = self._trim_extra_parents(orig_parents, parents)
    commit.parents = parents

    # If parents were pruned, then we need our file changes to be relative
    # to the new first parent
    if parents and old_1st_parent != parents[0]:
      commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir,
                                                      ID_TO_HASH[parents[0]],
                                                      commit.original_id)
      orig_file_changes = set(commit.file_changes)
      self._filter_files(commit)

    # Find out which files were modified by the callbacks.  Such paths could
    # lead to subsequent commits being empty (e.g. if removing a line containing
    # a password from every version of a file that had the password, and some
    # later commit did nothing more than remove that line)
    final_file_changes = set(commit.file_changes)
    if self._args.replace_text or self._blob_callback:
      differences = orig_file_changes.union(final_file_changes)
    else:
      differences = orig_file_changes.symmetric_difference(final_file_changes)
    self._files_tweaked.update(x.filename for x in differences)

    # Call the user-defined callback, if any
    if self._commit_callback:
      self._commit_callback(commit, self.callback_metadata(aux_info))

    # Now print the resulting commit, or if prunable skip it
    if not commit.dumped:
      if not self._prunable(commit, new_1st_parent,
                            aux_info['had_file_changes'], orig_parents):
        self._insert_into_stream(commit)
        self._record_remapping(commit, orig_parents)
      else:
        # Prune the commit: redirect its descendants (and its branch) to the
        # surviving parent, or to deleted_hash if no parent survives.
        rewrite_to = new_1st_parent or commit.first_parent()
        commit.skip(new_id = rewrite_to)
        if self._args.state_branch:
          alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash)
          self._insert_into_stream(alias)
        reset = Reset(commit.branch, rewrite_to or deleted_hash)
        self._insert_into_stream(reset)
        self._commit_renames[commit.original_id] = None

    # Show progress
    self._num_commits += 1
    if not self._args.quiet:
      self._progress_writer.show(self._parsed_message % self._num_commits)

  @staticmethod
  def _do_tag_rename(rename_pair, tagname):
    # rename_pair is b'OLD:NEW'; rewrite refs/tags/OLD* -> refs/tags/NEW*
    old, new = rename_pair.split(b':', 1)
    old, new = b'refs/tags/'+old, b'refs/tags/'+new
    if tagname.startswith(old):
      return tagname.replace(old, new, 1)
    return tagname

  def _tweak_tag(self, tag):
    # Rewrite an annotated tag's message, name, and tagger identity via
    # --replace-message, --tag-rename, mailmap, and the relevant callbacks.
    # Tweak the tag message according to callbacks
    if self._args.replace_message:
      for literal, replacement in self._args.replace_message['literals']:
        tag.message = tag.message.replace(literal, replacement)
      for regex, replacement in self._args.replace_message['regexes']:
        tag.message = regex.sub(replacement, tag.message)
    if self._message_callback:
      tag.message = self._message_callback(tag.message)

    # Tweak the tag name according to tag-name-related callbacks
    tag_prefix = b'refs/tags/'
    fullref = tag_prefix+tag.ref
    if self._args.tag_rename:
      fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref)
    if self._refname_callback:
      fullref = self._refname_callback(fullref)
      if not fullref.startswith(tag_prefix):
        msg = "Error: fast-import requires tags to be in refs/tags/ namespace."
        msg += "\n       {} renamed to {}".format(tag_prefix+tag.ref, fullref)
        raise SystemExit(msg)
    tag.ref = fullref[len(tag_prefix):]

    # Tweak the tagger according to callbacks
    if self._args.mailmap:
      tag.tagger_name, tag.tagger_email = \
          self._args.mailmap.translate(tag.tagger_name, tag.tagger_email)
    if self._name_callback:
      tag.tagger_name = self._name_callback(tag.tagger_name)
    if self._email_callback:
      tag.tagger_email = self._email_callback(tag.tagger_email)

    # Call general purpose tag callback
    if self._tag_callback:
      self._tag_callback(tag, self.callback_metadata())

  def _tweak_reset(self, reset):
    # Apply --tag-rename and refname/reset callbacks to a reset directive
    if self._args.tag_rename:
      reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref)
    if self._refname_callback:
      reset.ref = self._refname_callback(reset.ref)
    if self._reset_callback:
      self._reset_callback(reset, self.callback_metadata())

  def results_tmp_dir(self, create_if_missing=True):
    # Return (and optionally create) $GIT_DIR/filter-repo in the target repo
    target_working_dir = self._args.target or b'.'
    git_dir = GitUtils.determine_git_dir(target_working_dir)
    d = os.path.join(git_dir, b'filter-repo')
    if create_if_missing and not os.path.isdir(d):
      os.mkdir(d)
    return d

  def _load_marks_file(self, marks_basename):
    # Extract marks_basename from the --state-branch (if it exists) into
    # results_tmp_dir(), bump _IDS._next_id past any saved mark numbers,
    # and return the path to the (possibly empty) marks file.
    full_branch = 'refs/heads/{}'.format(self._args.state_branch)
    marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
    working_dir = self._args.target or b'.'
    cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
    contents = b''
    if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
      cmd = ['git', '-C', working_dir, 'show',
             '%s:%s' % (full_branch, decode(marks_basename))]
      try:
        contents = subproc.check_output(cmd)
      except subprocess.CalledProcessError as e: # pragma: no cover
        raise SystemExit(_("Failed loading %s from %s") %
                         (decode(marks_basename), full_branch))
    if contents:
      # Marks lines look like ':<num> <hash>'; ensure new ids don't collide
      biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines())
      _IDS._next_id = max(_IDS._next_id, biggest_id+1)
    with open(marks_file, 'bw') as f:
      f.write(contents)
    return marks_file

  def _save_marks_files(self):
    # Commit the source-marks and target-marks files from results_tmp_dir()
    # onto the --state-branch so a later run can resume incrementally.
    basenames = [b'source-marks', b'target-marks']
    working_dir = self._args.target or b'.'

    # Check whether the branch exists
    parent = []
    full_branch = 'refs/heads/{}'.format(self._args.state_branch)
    cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
    if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
      parent = ['-p', full_branch]

    # Run 'git hash-object $MARKS_FILE' for each marks file, save result
    blob_hashes = {}
    for marks_basename in basenames:
      marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
      if not os.path.isfile(marks_file): # pragma: no cover
        raise SystemExit(_("Failed to find %s to save to %s")
                         % (marks_file, self._args.state_branch))
      cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file]
      blob_hashes[marks_basename] = subproc.check_output(cmd).strip()

    # Run 'git mktree' to create a tree out of it
    p = subproc.Popen(['git', '-C', working_dir, 'mktree'],
                      stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    for b in basenames:
      p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b))
    p.stdin.close()
    p.wait()
    tree = p.stdout.read().strip()

    # Create the new commit
    cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files',
            tree] + parent)
    commit = subproc.check_output(cmd).strip()
    subproc.call(['git', '-C', working_dir, 'update-ref',
                  full_branch, commit])

  def importer_only(self):
    # Configure this RepoFilter as an import-only endpoint (no fast-export)
    self._run_sanity_checks()
    self._setup_output()

  def set_output(self, outputRepoFilter):
    # Make this RepoFilter an exporter that feeds outputRepoFilter's importer
    assert outputRepoFilter._output

    # set_output implies this RepoFilter is doing exporting, though may not
    # be the only one.
    self._setup_input(use_done_feature = False)

    # Set our output management up to pipe to outputRepoFilter's locations
    self._managed_output = False
    self._output = outputRepoFilter._output
    self._import_pipes = outputRepoFilter._import_pipes

    # Handle sanity checks, though currently none needed for export-only cases
    self._run_sanity_checks()

  def _setup_input(self, use_done_feature):
    # Set self._input to either stdin (--stdin) or the stdout of a
    # 'git fast-export' child process built from the current arguments.
    if self._args.stdin:
      self._input = sys.stdin.detach()
      sys.stdin = None # Make sure no one tries to accidentally use it
      self._fe_orig = None
    else:
      # Blob contents are only needed if something will rewrite them and
      # source == target (otherwise blobs must flow through to the target)
      skip_blobs = (self._blob_callback is None and
                    self._args.replace_text is None and
                    self._args.source == self._args.target)
      extra_flags = []
      if skip_blobs:
        extra_flags.append('--no-data')
        if self._args.max_blob_size:
          # Without blob data we need on-disk sizes to enforce the limit
          self._unpacked_size, packed_size = GitUtils.get_blob_sizes()
      if use_done_feature:
        extra_flags.append('--use-done-feature')
      # NOTE(review): write_marks and date_format_permissive (in
      # _setup_output) appear to be module-level capability flags defined
      # outside this excerpt -- confirm in the full file.
      if write_marks:
        extra_flags.append(b'--mark-tags')
      if self._args.state_branch:
        assert(write_marks)
        source_marks_file = self._load_marks_file(b'source-marks')
        extra_flags.extend([b'--export-marks='+source_marks_file,
                            b'--import-marks='+source_marks_file])
      if self._args.preserve_commit_encoding is not None: # pragma: no cover
        reencode = 'no' if self._args.preserve_commit_encoding else 'yes'
        extra_flags.append('--reencode='+reencode)
      if self._args.date_order:
        extra_flags.append('--date-order')
      location = ['-C', self._args.source] if self._args.source else []
      fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids',
                 '--signed-tags=strip', '--tag-of-filtered-object=rewrite',
                 '--fake-missing-tagger', '--reference-excluded-parents'
                 ] + extra_flags + self._args.refs
      self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
      self._input = self._fep.stdout
      if self._args.dry_run or self._args.debug:
        # Tee the fast-export stream to a file for inspection
        self._fe_orig = os.path.join(self.results_tmp_dir(),
                                     b'fast-export.original')
        output = open(self._fe_orig, 'bw')
        self._input = InputFileBackup(self._input, output)
        if self._args.debug:
          tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd]
          print("[DEBUG] Running: {}".format(' '.join(tmp)))
          print("  (saving a copy of the output at {})"
                .format(decode(self._fe_orig)))

  def _setup_output(self):
    # Set self._output to a 'git fast-import' child's stdin (normal runs)
    # and/or a fast-export.filtered dump file (--dry-run / --debug).
    if not self._args.dry_run:
      location = ['-C', self._args.target] if self._args.target else []
      fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false',
                 'fast-import', '--force', '--quiet']
      if date_format_permissive:
        fip_cmd.append('--date-format=raw-permissive')
      if self._args.state_branch:
        target_marks_file = self._load_marks_file(b'target-marks')
        fip_cmd.extend([b'--export-marks='+target_marks_file,
                        b'--import-marks='+target_marks_file])
      self._fip = subproc.Popen(fip_cmd, bufsize=-1,
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
      self._import_pipes = (self._fip.stdin, self._fip.stdout)
    if self._args.dry_run or self._args.debug:
      self._fe_filt = os.path.join(self.results_tmp_dir(),
                                   b'fast-export.filtered')
      self._output = open(self._fe_filt, 'bw')
    else:
      self._output = self._fip.stdin
    if self._args.debug and not self._args.dry_run:
      # Write to both fast-import and the debug dump file
      self._output = DualFileWriter(self._fip.stdin, self._output)
      tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd]
      print("[DEBUG] Running: {}".format(' '.join(tmp)))
      print("  (using the following file as input: {})"
            .format(decode(self._fe_filt)))

  def _migrate_origin_to_heads(self):
    # Rename refs/remotes/origin/* to refs/heads/* (dropping origin/HEAD),
    # then remove the 'origin' remote, so the rewrite operates on local refs.
    refs_to_migrate = set(x for x in self._orig_refs
                          if x.startswith(b'refs/remotes/origin/'))
    if not refs_to_migrate:
      return
    if self._args.debug:
      print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
    target_working_dir = self._args.target or b'.'
    p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
                      stdin=subprocess.PIPE, cwd=target_working_dir)
    for ref in refs_to_migrate:
      if ref == b'refs/remotes/origin/HEAD':
        # origin/HEAD is a symref duplicate; just delete it
        p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
        del self._orig_refs[ref]
        continue
      newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
      if newref not in self._orig_refs:
        p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref]))
      p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
      self._orig_refs[newref] = self._orig_refs[ref]
      del self._orig_refs[ref]
    p.stdin.close()
    if p.wait():
      raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover

    # Now remove the 'origin' remote itself
    if self._args.debug:
      print("[DEBUG] Removing 'origin' remote (rewritten history will no ")
      print("        longer be related; consider re-pushing it elsewhere.")
    subproc.call('git remote rm origin'.split(), cwd=target_working_dir)

  def _final_commands(self):
    # Invoke the user's done callback (if any) and finish progress output
    self._finalize_handled = True
    self._done_callback and self._done_callback()

    if not self._args.quiet:
      self._progress_writer.finish()

  def _ref_update(self, target_working_dir):
    # Batch all post-rewrite ref updates through one 'git update-ref --stdin':
    # delete refs whose history was entirely pruned, and delete/update/add
    # refs/replace/* per --replace-refs.
    # Start the update-ref process
    p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
                      stdin=subprocess.PIPE,
                      cwd=target_working_dir)

    # Remove replace_refs from _orig_refs
    replace_refs = {k:v for k, v in self._orig_refs.items()
                    if k.startswith(b'refs/replace/')}
    reverse_replace_refs = collections.defaultdict(list)
    for k,v in replace_refs.items():
      reverse_replace_refs[v].append(k)
    all(map(self._orig_refs.pop, replace_refs))

    # Remove unused refs
    exported_refs, imported_refs = self.get_exported_and_imported_refs()
    refs_to_nuke = exported_refs - imported_refs
    if self._args.partial:
      refs_to_nuke = set()
    if refs_to_nuke and self._args.debug:
      print("[DEBUG] Deleting the following refs:\n  "+
            decode(b"\n  ".join(refs_to_nuke)))
    p.stdin.write(b''.join([b"delete %s\n" % x
                            for x in refs_to_nuke]))

    # Delete or update and add replace_refs; note that fast-export automatically
    # handles 'update-no-add', we only need to take action for the other four
    # choices for replace_refs.
    self._flush_renames()
    actual_renames = {k:v for k,v in self._commit_renames.items() if k != v}
    if self._args.replace_refs in ['delete-no-add', 'delete-and-add']:
      # Delete old replace refs, if unwanted
      replace_refs_to_nuke = set(replace_refs)
      if self._args.replace_refs == 'delete-and-add':
        # git-update-ref won't allow us to update a ref twice, so be careful
        # to avoid deleting refs we'll later update
        replace_refs_to_nuke = replace_refs_to_nuke.difference(
                                 [b'refs/replace/'+x for x in actual_renames])
      p.stdin.write(b''.join([b"delete %s\n" % x
                              for x in replace_refs_to_nuke]))
    if self._args.replace_refs in ['delete-and-add', 'update-or-add',
                                   'update-and-add']:
      # Add new replace refs
      update_only = (self._args.replace_refs == 'update-or-add')
      p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new)
                              for old,new in actual_renames.items()
                              if new and not (update_only and
                                              old in reverse_replace_refs)]))

    # Complete the update-ref process
    p.stdin.close()
    if p.wait():
      raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover

  def _record_metadata(self, metadata_dir, orig_refs):
    # Write the commit-map, ref-map, suboptimal-issues, and already_ran
    # files into metadata_dir ($GIT_DIR/filter-repo).
    self._flush_renames()
    with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f:
      f.write(("%-40s %s\n" % (_("old"), _("new"))).encode())
      for (old,new) in self._commit_renames.items():
        # A None new-hash means the commit was pruned entirely
        msg = b'%s %s\n' % (old, new if new != None else deleted_hash)
        f.write(msg)

    exported_refs, imported_refs = self.get_exported_and_imported_refs()

    batch_check_process = None
    batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$')
    with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f:
      f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode())
      for refname, old_hash in orig_refs.items():
        if refname not in exported_refs:
          continue
        if refname not in imported_refs:
          new_hash = deleted_hash
        elif old_hash in self._commit_renames:
          new_hash = self._commit_renames[old_hash]
          new_hash = new_hash if new_hash != None else deleted_hash
        else: # Must be either an annotated tag, or a ref whose tip was pruned
          if not batch_check_process:
            # Lazily spawn one cat-file process for all such lookups
            cmd = 'git cat-file --batch-check'.split()
            target_working_dir = self._args.target or b'.'
            batch_check_process = subproc.Popen(cmd,
                                                stdin=subprocess.PIPE,
                                                stdout=subprocess.PIPE,
                                                cwd=target_working_dir)
          batch_check_process.stdin.write(refname+b"\n")
          batch_check_process.stdin.flush()
          line = batch_check_process.stdout.readline()
          m = batch_check_output_re.match(line)
          if m and m.group(2) in (b'tag', b'commit'):
            new_hash = m.group(1)
          elif line.endswith(b' missing\n'):
            new_hash = deleted_hash
          else:
            raise SystemExit(_("Failed to find new id for %(refname)s "
                               "(old id was %(old_hash)s)")
                             % ({'refname': refname, 'old_hash': old_hash})
                             ) # pragma: no cover
        f.write(b'%s %s %s\n' % (old_hash, new_hash, refname))
      if self._args.source or self._args.target:
        # Record refs that only exist in the target repo as newly created
        new_refs = GitUtils.get_refs(self._args.target or b'.')
        for ref, new_hash in new_refs.items():
          if ref not in orig_refs and not ref.startswith(b'refs/replace/'):
            old_hash = b'0'*len(new_hash)
            f.write(b'%s %s %s\n' % (old_hash, new_hash, ref))
    if batch_check_process:
      batch_check_process.stdin.close()
      batch_check_process.wait()

    with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f:
      issues_found = False
      if self._commits_no_longer_merges:
        issues_found = True

        f.write(textwrap.dedent(_('''
          The following commits used to be merge commits but due to filtering
          are now regular commits; they likely have suboptimal commit messages
          (e.g. "Merge branch next into master").  Original commit hash on the
          left, commit hash after filtering/rewriting on the right:
          ''')[1:]).encode())
        for oldhash, newhash in self._commits_no_longer_merges:
          f.write('  {} {}\n'.format(oldhash, newhash).encode())
        f.write(b'\n')

      if self._commits_referenced_but_removed:
        issues_found = True
        f.write(textwrap.dedent(_('''
          The following commits were filtered out, but referenced in another
          commit message.  The reference to the now-nonexistent commit hash
          (or a substring thereof) was left as-is in any commit messages:
          ''')[1:]).encode())
        for bad_commit_reference in self._commits_referenced_but_removed:
          f.write('  {}\n'.format(bad_commit_reference).encode())
        f.write(b'\n')

      if not issues_found:
        f.write(_("No filtering problems encountered.\n").encode())

    with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f:
      f.write(_("This file exists to allow you to filter again without --force.\n").encode())

  def finish(self):
    ''' Alternative to run() when there is no input of our own to parse,
        meaning that run only really needs to close the handle to fast-import
        and let it finish, thus making a call to "run" feel like a misnomer.
''' assert not self._input assert self._managed_output self.run() def insert(self, obj, direct_insertion = False): if not direct_insertion: if type(obj) == Blob: self._tweak_blob(obj) elif type(obj) == Commit: aux_info = {'orig_parents': obj.parents, 'had_file_changes': bool(obj.file_changes)} self._tweak_commit(obj, aux_info) elif type(obj) == Reset: self._tweak_reset(obj) elif type(obj) == Tag: self._tweak_tag(obj) self._insert_into_stream(obj) def _insert_into_stream(self, obj): if not obj.dumped: if self._parser: self._parser.insert(obj) else: obj.dump(self._output) def get_exported_and_imported_refs(self): return self._parser.get_exported_and_imported_refs() def run(self): start = time.time() if not self._input and not self._output: self._run_sanity_checks() if not self._args.dry_run and not self._args.partial: self._migrate_origin_to_heads() self._setup_input(use_done_feature = True) self._setup_output() assert self._sanity_checks_handled if self._input: # Create and run the filter self._repo_working_dir = self._args.source or b'.' 
self._parser = FastExportParser(blob_callback = self._tweak_blob, commit_callback = self._tweak_commit, tag_callback = self._tweak_tag, reset_callback = self._tweak_reset, done_callback = self._final_commands) self._parser.run(self._input, self._output) if not self._finalize_handled: self._final_commands() # Make sure fast-export completed successfully if not self._args.stdin and self._fep.wait(): raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover self._input.close() # If we're not the manager of self._output, we should avoid post-run cleanup if not self._managed_output: return # Close the output and ensure fast-import successfully completes self._output.close() if not self._args.dry_run and self._fip.wait(): raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover # With fast-export and fast-import complete, update state if requested if self._args.state_branch: self._save_marks_files() # Notify user how long it took, before doing a gc and such msg = "New history written in {:.2f} seconds..." if self._args.repack: msg = "New history written in {:.2f} seconds; now repacking/cleaning..." print(msg.format(time.time()-start)) # Exit early, if requested if self._args.dry_run: print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed.")) if self._fe_orig: print(_(" Requested filtering can be seen by comparing:")) print(" " + decode(self._fe_orig)) else: print(_(" Requested filtering can be seen at:")) print(" " + decode(self._fe_filt)) return target_working_dir = self._args.target or b'.' if self._input: self._ref_update(target_working_dir) # Write out data about run self._record_metadata(self.results_tmp_dir(), self._orig_refs) # Final cleanup: # If we need a repack, then nuke the reflogs and repack. 
# If we need a reset, do a reset --hard reset = not GitUtils.is_repository_bare(target_working_dir) RepoFilter.cleanup(target_working_dir, self._args.repack, reset, run_quietly=self._args.quiet, show_debuginfo=self._args.debug) # Let user know how long it took print(_("Completely finished after {:.2f} seconds.") .format(time.time()-start)) def main(): setup_gettext() args = FilteringOptions.parse_args(sys.argv[1:]) if args.analyze: RepoAnalyze.run(args) else: filter = RepoFilter(args) filter.run() if __name__ == '__main__': main() git-filter-repo-2.45.0/git_filter_repo.py000077700000000000000000000000001464611705400233012git-filter-repoustar00rootroot00000000000000git-filter-repo-2.45.0/pyproject.toml000066400000000000000000000026511464611705400175430ustar00rootroot00000000000000[project] name = "git-filter-repo" description = "Quickly rewrite git repository history" authors = [ {name = "Elijah Newren", email = "newren@gmail.com"} ] readme = "README.md" classifiers = [ "Development Status :: 4 - Beta", "Operating System :: OS Independent", "Programming Language :: Python", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] license.text = "MIT" requires-python = ">= 3.5" dynamic = ["version"] [project.urls] Homepage = "https://github.com/newren/git-filter-repo" Issues = "https://github.com/newren/git-filter-repo/issues/" Source = "https://github.com/newren/git-filter-repo" [project.scripts] git-filter-repo = "git_filter_repo:main" [build-system] requires = ["setuptools>=61", 
"setuptools_scm>=8.0", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools] py-modules = ["git_filter_repo"] [tool.setuptools_scm] # This section intentionally left blank git-filter-repo-2.45.0/t/000077500000000000000000000000001464611705400150665ustar00rootroot00000000000000git-filter-repo-2.45.0/t/run_coverage000077500000000000000000000010561464611705400174750ustar00rootroot00000000000000#!/bin/bash set -eu orig_dir=$(cd $(dirname $0) && pwd -P) tmpdir=$(mktemp -d) cat <$tmpdir/.coveragerc [run] parallel=true data_file=$tmpdir/.coverage EOF cat <$tmpdir/sitecustomize.py import coverage coverage.process_startup() EOF export COVERAGE_PROCESS_START=$tmpdir/.coveragerc export PYTHONPATH=$tmpdir: # Produce a coverage report, even if the tests fail set +e $orig_dir/run_tests exitcode=$? set -e cd $tmpdir coverage3 combine -q coverage3 html -d $orig_dir/report coverage3 report -m cd $orig_dir rm -rf $tmpdir exit $exitcode git-filter-repo-2.45.0/t/run_tests000077500000000000000000000011041464611705400170360ustar00rootroot00000000000000#!/bin/bash set -eu cd $(dirname $0) # Put git_filter_repo.py on the front of PYTHONPATH export PYTHONPATH="$PWD/..${PYTHONPATH:+:$PYTHONPATH}" # We pretend filenames are unicode for two reasons: (1) because it exercises # more code, and (2) this setting will detect accidental use of unicode strings # for file/directory names when it should always be bytestrings. export PRETEND_UNICODE_ARGS=1 export TEST_SHELL_PATH=/bin/sh failed=0 for t in t[0-9]*.sh do printf '\n\n== %s ==\n' "$t" bash $t "$@" || failed=$(($failed+1)) done if [ 0 -lt $failed ] then exit 1 fi git-filter-repo-2.45.0/t/t9390-filter-repo.sh000077500000000000000000001564051464611705400204560ustar00rootroot00000000000000#!/bin/bash test_description='Basic filter-repo tests' . 
./test-lib.sh export PATH=$(dirname $TEST_DIRECTORY):$PATH # Put git-filter-repo in PATH DATA="$TEST_DIRECTORY/t9390" SQ="'" filter_testcase() { INPUT=$1 OUTPUT=$2 shift 2 REST=("$@") NAME="check: $INPUT -> $OUTPUT using '${REST[@]}'" test_expect_success "$NAME" ' # Clean up from previous run git pack-refs --all && rm .git/packed-refs && # Run the example cat $DATA/$INPUT | git filter-repo --stdin --quiet --force --replace-refs delete-no-add "${REST[@]}" && # Compare the resulting repo to expected value git fast-export --use-done-feature --all >compare && test_cmp $DATA/$OUTPUT compare ' } filter_testcase basic basic-filename --path filename filter_testcase basic basic-twenty --path twenty filter_testcase basic basic-ten --path ten filter_testcase basic basic-numbers --path ten --path twenty filter_testcase basic basic-filename --invert-paths --path-glob 't*en*' filter_testcase basic basic-numbers --invert-paths --path-regex 'f.*e.*e' filter_testcase basic basic-mailmap --mailmap ../t9390/sample-mailmap filter_testcase basic basic-replace --replace-text ../t9390/sample-replace filter_testcase basic basic-message --replace-message ../t9390/sample-message filter_testcase empty empty-keepme --path keepme filter_testcase empty more-empty-keepme --path keepme --prune-empty=always \ --prune-degenerate=always filter_testcase empty less-empty-keepme --path keepme --prune-empty=never \ --prune-degenerate=never filter_testcase degenerate degenerate-keepme --path moduleA/keepme filter_testcase degenerate degenerate-moduleA --path moduleA filter_testcase degenerate degenerate-globme --path-glob *me filter_testcase degenerate degenerate-keepme-noff --path moduleA/keepme --no-ff filter_testcase unusual unusual-filtered --path '' filter_testcase unusual unusual-mailmap --mailmap ../t9390/sample-mailmap setup_path_rename() { test -d path_rename && return test_create_repo path_rename && ( cd path_rename && mkdir sequences values && test_seq 1 10 >sequences/tiny && test_seq 100 110 
>sequences/intermediate && test_seq 1000 1010 >sequences/large && test_seq 1000 1010 >values/large && test_seq 10000 10010 >values/huge && git add sequences values && git commit -m initial && git mv sequences/tiny sequences/small && cp sequences/intermediate sequences/medium && echo 10011 >values/huge && git add sequences values && git commit -m updates && git rm sequences/intermediate && echo 11 >sequences/small && git add sequences/small && git commit -m changes && echo 1011 >sequences/medium && git add sequences/medium && git commit -m final ) } test_expect_success '--path-rename sequences/tiny:sequences/small' ' setup_path_rename && ( git clone file://"$(pwd)"/path_rename path_rename_single && cd path_rename_single && git filter-repo --path-rename sequences/tiny:sequences/small && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 7 filenames && ! grep sequences/tiny filenames && git rev-parse HEAD~3:sequences/small ) ' test_expect_success '--path-rename sequences:numbers' ' setup_path_rename && ( git clone file://"$(pwd)"/path_rename path_rename_dir && cd path_rename_dir && git filter-repo --path-rename sequences:numbers && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 8 filenames && ! grep sequences/ filenames && grep numbers/ filenames && grep values/ filenames ) ' test_expect_success '--path-rename-prefix values:numbers' ' setup_path_rename && ( git clone file://"$(pwd)"/path_rename path_rename_dir_2 && cd path_rename_dir_2 && git filter-repo --path-rename values/:numbers/ && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 8 filenames && ! 
grep values/ filenames && grep sequences/ filenames && grep numbers/ filenames ) ' test_expect_success '--path-rename squashing' ' setup_path_rename && ( git clone file://"$(pwd)"/path_rename path_rename_squash && cd path_rename_squash && git filter-repo \ --path-rename sequences/tiny:sequences/small \ --path-rename sequences:numbers \ --path-rename values:numbers \ --path-rename numbers/intermediate:numbers/medium && git log --format=%n --name-only | sort | uniq >filenames && # Just small, medium, large, huge, and a blank line... test_line_count = 5 filenames && ! grep sequences/ filenames && ! grep values/ filenames && grep numbers/ filenames ) ' test_expect_success '--path-rename inability to squash' ' setup_path_rename && ( git clone file://"$(pwd)"/path_rename path_rename_bad_squash && cd path_rename_bad_squash && test_must_fail git filter-repo \ --path-rename values/large:values/big \ --path-rename values/huge:values/big 2>../err && test_i18ngrep "File renaming caused colliding pathnames" ../err ) ' test_expect_success '--paths-from-file' ' setup_path_rename && ( git clone file://"$(pwd)"/path_rename paths_from_file && cd paths_from_file && cat >../path_changes <<-EOF && literal:values/huge values/huge==>values/gargantuan glob:*rge # Comments and blank lines are ignored regex:.*med.* regex:^([^/]*)/(.*)ge$==>\2/\1/ge EOF git filter-repo --paths-from-file ../path_changes && git log --format=%n --name-only | sort | uniq >filenames && # intermediate, medium, two larges, gargantuan, and a blank line test_line_count = 6 filenames && ! grep sequences/tiny filenames && grep sequences/intermediate filenames && grep lar/sequences/ge filenames && grep lar/values/ge filenames && grep values/gargantuan filenames && ! 
grep sequences/small filenames && grep sequences/medium filenames && rm ../path_changes ) ' test_expect_success '--paths does not mean --paths-from-file' ' setup_path_rename && ( git clone file://"$(pwd)"/path_rename paths_misuse && cd paths_misuse && test_must_fail git filter-repo --paths values/large 2>../err && grep "Error: Option.*--paths.*unrecognized; did you" ../err && rm ../err ) ' create_path_filtering_and_renaming() { test -d path_filtering_and_renaming && return test_create_repo path_filtering_and_renaming && ( cd path_filtering_and_renaming && >.gitignore && mkdir -p src/main/java/com/org/{foo,bar} && mkdir -p src/main/resources && test_seq 1 10 >src/main/java/com/org/foo/uptoten && test_seq 11 20 >src/main/java/com/org/bar/uptotwenty && test_seq 1 7 >src/main/java/com/org/uptoseven && test_seq 1 5 >src/main/resources/uptofive && git add . && git commit -m Initial ) } test_expect_success 'Mixing filtering and renaming paths, not enough filters' ' create_path_filtering_and_renaming && git clone --no-local path_filtering_and_renaming \ path_filtering_and_renaming_1 && ( cd path_filtering_and_renaming_1 && git filter-repo --path .gitignore \ --path src/main/resources \ --path-rename src/main/java/com/org/foo/:src/main/java/com/org/ && cat <<-EOF >expect && .gitignore src/main/resources/uptofive EOF git ls-files >actual && test_cmp expect actual ) ' test_expect_success 'Mixing filtering and renaming paths, enough filters' ' create_path_filtering_and_renaming && git clone --no-local path_filtering_and_renaming \ path_filtering_and_renaming_2 && ( cd path_filtering_and_renaming_2 && git filter-repo --path .gitignore \ --path src/main/resources \ --path src/main/java/com/org/foo/ \ --path-rename src/main/java/com/org/foo/:src/main/java/com/org/ && cat <<-EOF >expect && .gitignore src/main/java/com/org/uptoten src/main/resources/uptofive EOF git ls-files >actual && test_cmp expect actual ) ' test_expect_success 'Mixing filtering and to-subdirectory-filter' ' 
create_path_filtering_and_renaming && git clone --no-local path_filtering_and_renaming \ path_filtering_and_renaming_3 && ( cd path_filtering_and_renaming_3 && git filter-repo --path src/main/resources \ --to-subdirectory-filter my-module && cat <<-EOF >expect && my-module/src/main/resources/uptofive EOF git ls-files >actual && test_cmp expect actual ) ' setup_metasyntactic_repo() { test -d metasyntactic && return test_create_repo metasyntactic && ( cd metasyntactic && weird_name=$(printf "file\tna\nme") && echo "funny" >"$weird_name" && mkdir numbers && test_seq 1 10 >numbers/small && test_seq 100 110 >numbers/medium && git add "$weird_name" numbers && git commit -m initial && git tag v1.0 && git tag -a -m v1.1 v1.1 && mkdir words && echo foo >words/important && echo bar >words/whimsical && echo baz >words/sequences && git add words && git commit -m some.words && git branch another_branch && git tag v2.0 && echo spam >words/to && echo eggs >words/know && git add words git rm "$weird_name" && git commit -m more.words && git tag -a -m "Look, ma, I made a tag" v3.0 ) } test_expect_success FUNNYNAMES '--tag-rename' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic tag_rename && cd tag_rename && git filter-repo \ --tag-rename "":"myrepo-" \ --path words && test_must_fail git cat-file -t v1.0 && test_must_fail git cat-file -t v1.1 && test_must_fail git cat-file -t v2.0 && test_must_fail git cat-file -t v3.0 && test_must_fail git cat-file -t myrepo-v1.0 && test_must_fail git cat-file -t myrepo-v1.1 && test $(git cat-file -t myrepo-v2.0) = commit && test $(git cat-file -t myrepo-v3.0) = tag ) ' test_expect_success 'tag of tag before relevant portion of history' ' test_create_repo filtered_tag_of_tag && ( cd filtered_tag_of_tag && echo contents >file && git add file && git commit -m "Initial" && git tag -a -m "Inner Tag" inner_tag HEAD && git tag -a -m "Outer Tag" outer_tag inner_tag && mkdir subdir && echo stuff >subdir/whatever && git add subdir && 
git commit -m "Add file in subdir" && git filter-repo --force --subdirectory-filter subdir && git show-ref >refs && ! grep refs/tags refs && git log --all --oneline >commits && test_line_count = 1 commits ) ' test_expect_success FUNNYNAMES '--subdirectory-filter' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic subdir_filter && cd subdir_filter && git filter-repo \ --subdirectory-filter words && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 10 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 6 filenames && grep ^important$ filenames && test_must_fail git cat-file -t v1.0 && test_must_fail git cat-file -t v1.1 && test $(git cat-file -t v2.0) = commit && test $(git cat-file -t v3.0) = tag ) ' test_expect_success FUNNYNAMES '--subdirectory-filter with trailing slash' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic subdir_filter_2 && cd subdir_filter_2 && git filter-repo \ --subdirectory-filter words/ && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 10 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 6 filenames && grep ^important$ filenames && test_must_fail git cat-file -t v1.0 && test_must_fail git cat-file -t v1.1 && test $(git cat-file -t v2.0) = commit && test $(git cat-file -t v3.0) = tag ) ' test_expect_success FUNNYNAMES '--to-subdirectory-filter' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic to_subdir_filter && cd to_subdir_filter && git filter-repo \ --to-subdirectory-filter mysubdir/ && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 22 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 9 filenames && grep "^\"mysubdir/file\\\\tna\\\\nme\"$" filenames && grep ^mysubdir/words/important$ filenames && test $(git cat-file -t v1.0) = commit && test $(git cat-file -t v1.1) = tag 
&& test $(git cat-file -t v2.0) = commit && test $(git cat-file -t v3.0) = tag ) ' test_expect_success FUNNYNAMES '--use-base-name' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic use_base_name && cd use_base_name && git filter-repo --path small --path important --use-base-name && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 10 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 3 filenames && grep ^numbers/small$ filenames && grep ^words/important$ filenames && test $(git cat-file -t v1.0) = commit && test $(git cat-file -t v1.1) = tag && test $(git cat-file -t v2.0) = commit && test $(git cat-file -t v3.0) = tag ) ' test_expect_success FUNNYNAMES 'refs/replace/ to skip a parent' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic replace_skip_ref && cd replace_skip_ref && git tag -d v2.0 && git replace HEAD~1 HEAD~2 && git filter-repo --replace-refs delete-no-add --path "" --force && test $(git rev-list --count HEAD) = 2 && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 16 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 9 filenames && test $(git cat-file -t v1.0) = commit && test $(git cat-file -t v1.1) = tag && test_must_fail git cat-file -t v2.0 && test $(git cat-file -t v3.0) = tag ) ' test_expect_success FUNNYNAMES 'refs/replace/ to add more initial history' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic replace_add_refs && cd replace_add_refs && git checkout --orphan new_root && rm .git/index && git add numbers/small && git clean -fd && git commit -m new.root && git replace --graft master~2 new_root && git checkout master && git --no-replace-objects cat-file -p master~2 >grandparent && ! 
grep parent grandparent && git filter-repo --replace-refs delete-no-add --path "" --force && git --no-replace-objects cat-file -p master~2 >new-grandparent && grep parent new-grandparent && test $(git rev-list --count HEAD) = 4 && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 22 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 9 filenames && test $(git cat-file -t v1.0) = commit && test $(git cat-file -t v1.1) = tag && test $(git cat-file -t v2.0) = commit && test $(git cat-file -t v3.0) = tag ) ' test_expect_success FUNNYNAMES 'creation/deletion/updating of replace refs' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic replace_handling && # Same setup as "refs/replace/ to skip a parent", so we # do not have to check that replacement refs were used # correctly in the rewrite, just that replacement refs were # deleted, added, or updated correctly. cd replace_handling && git tag -d v2.0 && master=$(git rev-parse master) && master_1=$(git rev-parse master~1) && master_2=$(git rev-parse master~2) && git replace HEAD~1 HEAD~2 && cd .. && mkdir -p test_replace_refs && cd test_replace_refs && rsync -a --delete ../replace_handling/ ./ && git filter-repo --replace-refs delete-no-add --path-rename numbers:counting && git show-ref >output && ! 
grep refs/replace/ output && rsync -a --delete ../replace_handling/ ./ && git filter-repo --replace-refs delete-and-add --path-rename numbers:counting && echo "$(git rev-parse master) refs/replace/$master" >out && echo "$(git rev-parse master~1) refs/replace/$master_1" >>out && echo "$(git rev-parse master~1) refs/replace/$master_2" >>out && sort -k 2 out >expect && git show-ref | grep refs/replace/ >output && test_cmp output expect && rsync -a --delete ../replace_handling/ ./ && git filter-repo --replace-refs update-no-add --path-rename numbers:counting && echo "$(git rev-parse master~1) refs/replace/$master_1" >expect && git show-ref | grep refs/replace/ >output && test_cmp output expect && rsync -a --delete ../replace_handling/ ./ && git filter-repo --replace-refs update-or-add --path-rename numbers:counting && echo "$(git rev-parse master) refs/replace/$master" >>out && echo "$(git rev-parse master~1) refs/replace/$master_1" >>out && sort -k 2 out >expect && git show-ref | grep refs/replace/ >output && test_cmp output expect && rsync -a --delete ../replace_handling/ ./ && git filter-repo --replace-refs update-and-add --path-rename numbers:counting && echo "$(git rev-parse master) refs/replace/$master" >>out && echo "$(git rev-parse master~1) refs/replace/$master_1" >>out && echo "$(git rev-parse master~1) refs/replace/$master_2" >>out && sort -k 2 out >expect && git show-ref | grep refs/replace/ >output && test_cmp output expect && rsync -a --delete ../replace_handling/ ./ && git filter-repo --replace-refs old-default --path-rename numbers:counting && echo "$(git rev-parse master) refs/replace/$master" >>out && echo "$(git rev-parse master~1) refs/replace/$master_1" >>out && echo "$(git rev-parse master~1) refs/replace/$master_2" >>out && sort -k 2 out >expect && git show-ref | grep refs/replace/ >output && test_cmp output expect && # Test the default rsync -a --delete ../replace_handling/ ./ && git filter-repo --path-rename numbers:counting && echo "$(git 
rev-parse master~1) refs/replace/$master_1" >expect && git show-ref | grep refs/replace/ >output && test_cmp output expect ) ' test_expect_success FUNNYNAMES '--debug' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic debug && cd debug && git filter-repo --path words --debug && test $(git rev-list --count HEAD) = 2 && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 12 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 6 filenames && test_path_is_file .git/filter-repo/fast-export.original && grep "^commit " .git/filter-repo/fast-export.original >out && test_line_count = 3 out && test_path_is_file .git/filter-repo/fast-export.filtered && grep "^commit " .git/filter-repo/fast-export.filtered >out && test_line_count = 2 out ) ' test_expect_success FUNNYNAMES '--dry-run' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic dry_run && cd dry_run && git filter-repo --path words --dry-run && git show-ref | grep master >out && test_line_count = 2 out && awk "{print \$1}" out | uniq >out2 && test_line_count = 1 out2 && test $(git rev-list --count HEAD) = 3 && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 19 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 9 filenames && test_path_is_file .git/filter-repo/fast-export.original && grep "^commit " .git/filter-repo/fast-export.original >out && test_line_count = 3 out && test_path_is_file .git/filter-repo/fast-export.filtered && grep "^commit " .git/filter-repo/fast-export.filtered >out && test_line_count = 2 out ) ' test_expect_success FUNNYNAMES '--dry-run --debug' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic dry_run_debug && cd dry_run_debug && git filter-repo --path words --dry-run --debug && git show-ref | grep master >out && test_line_count = 2 out && awk "{print \$1}" out | uniq >out2 && test_line_count = 1 out2 && 
test $(git rev-list --count HEAD) = 3 && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 19 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 9 filenames && test_path_is_file .git/filter-repo/fast-export.original && grep "^commit " .git/filter-repo/fast-export.original >out && test_line_count = 3 out && test_path_is_file .git/filter-repo/fast-export.filtered && grep "^commit " .git/filter-repo/fast-export.filtered >out && test_line_count = 2 out ) ' test_expect_success FUNNYNAMES '--dry-run --stdin' ' setup_metasyntactic_repo && ( git clone file://"$(pwd)"/metasyntactic dry_run_stdin && cd dry_run_stdin && git fast-export --all | git filter-repo --path words --dry-run --stdin && git show-ref | grep master >out && test_line_count = 2 out && awk "{print \$1}" out | uniq >out2 && test_line_count = 1 out2 && test $(git rev-list --count HEAD) = 3 && git cat-file --batch-check --batch-all-objects >all-objs && test_line_count = 19 all-objs && git log --format=%n --name-only | sort | uniq >filenames && test_line_count = 9 filenames && test_path_is_missing .git/filter-repo/fast-export.original && test_path_is_file .git/filter-repo/fast-export.filtered && grep "^commit " .git/filter-repo/fast-export.filtered >out && test_line_count = 2 out ) ' setup_analyze_me() { test -d analyze_me && return test_create_repo analyze_me && ( cd analyze_me && mkdir numbers words && test_seq 1 10 >numbers/small.num && test_seq 100 110 >numbers/medium.num && echo spam >words/to && echo eggs >words/know && echo rename a lot >fickle && git add numbers words fickle && test_tick && git commit -m initial && git branch modify-fickle && git branch other && git mv fickle capricious && test_tick && git commit -m "rename on main branch" && git checkout other && echo random other change >whatever && git add whatever && git mv fickle capricious && test_tick && git commit -m "rename on other branch" && git checkout master && git merge 
--no-commit other && git mv capricious mercurial && test_tick && git commit && git mv words sequence && test_tick && git commit -m now.sequence && git rm -rf numbers && test_tick && git commit -m remove.words && mkdir words && echo no >words/know && git add words/know && test_tick && git commit -m "Recreated file previously renamed" && echo "160000 deadbeefdeadbeefdeadbeefdeadbeefdeadbeefQfake_submodule" | q_to_tab | git update-index --index-info && test_tick && git commit -m "Add a fake submodule" && test_tick && git commit --allow-empty -m "Final commit, empty" && git checkout modify-fickle && echo "more stuff" >>fickle && test_tick && git commit -am "another more stuff commit" && git checkout modify-fickle && echo "more stuff" >>fickle && test_tick && git commit -am "another more stuff commit" && test_tick && git commit --allow-empty -m "Final commit, empty" && git checkout master && # Add a random extra unreferenced object echo foobar | git hash-object --stdin -w ) } test_expect_success C_LOCALE_OUTPUT '--analyze' ' setup_analyze_me && ( cd analyze_me && git filter-repo --analyze && # It should not work again without a --force test_must_fail git filter-repo --analyze && # With a --force, another run should succeed git filter-repo --analyze --force && test -d .git/filter-repo/analysis && cd .git/filter-repo/analysis && cat >expect <<-EOF && fickle -> capricious mercurial words/to -> sequence/to EOF test_cmp expect renames.txt && cat >expect <<-EOF && == Overall Statistics == Number of commits: 12 Number of filenames: 10 Number of directories: 4 Number of file extensions: 2 Total unpacked size (bytes): 206 Total packed size (bytes): 387 EOF head -n 9 README >actual && test_cmp expect actual && cat >expect <<-\EOF && === Files by sha and associated pathnames in reverse size === Format: sha, unpacked size, packed size, filename(s) object stored as a89c82a2d4b713a125a4323d25adda062cc0013d 44 48 numbers/medium.num c58ae2ffaf8352bd9860bf4bbb6ea78238dca846 35 41 fickle 
ccff62141ec7bae42e01a3dcb7615b38aa9fa5b3 24 40 fickle f00c965d8307308469e537302baa73048488f162 21 37 numbers/small.num 2aa69a2a708eed00cb390e30f6bcc3eed773f390 20 36 whatever 51b95456de9274c9a95f756742808dfd480b9b35 13 29 [capricious, fickle, mercurial] 732c85a1b3d7ce40ec8f78fd9ffea32e9f45fae0 5 20 [sequence/know, words/know] 34b6a0c9d02cb6ef7f409f248c0c1224ce9dd373 5 20 [sequence/to, words/to] 7ecb56eb3fa3fa6f19dd48bca9f971950b119ede 3 18 words/know EOF test_cmp expect blob-shas-and-paths.txt && cat >expect <<-EOF && === All directories by reverse size === Format: unpacked size, packed size, date deleted, directory name 206 387 65 85 2005-04-07 numbers 13 58 words 10 40 sequence EOF test_cmp expect directories-all-sizes.txt && cat >expect <<-EOF && === Deleted directories by reverse size === Format: unpacked size, packed size, date deleted, directory name 65 85 2005-04-07 numbers EOF test_cmp expect directories-deleted-sizes.txt && cat >expect <<-EOF && === All extensions by reverse size === Format: unpacked size, packed size, date deleted, extension name 141 302 65 85 2005-04-07 .num EOF test_cmp expect extensions-all-sizes.txt && cat >expect <<-EOF && === Deleted extensions by reverse size === Format: unpacked size, packed size, date deleted, extension name 65 85 2005-04-07 .num EOF test_cmp expect extensions-deleted-sizes.txt && cat >expect <<-EOF && === All paths by reverse accumulated size === Format: unpacked size, packed size, date deleted, path name 72 110 fickle 44 48 2005-04-07 numbers/medium.num 8 38 words/know 21 37 2005-04-07 numbers/small.num 20 36 whatever 13 29 mercurial 13 29 capricious 5 20 words/to 5 20 sequence/to 5 20 sequence/know EOF test_cmp expect path-all-sizes.txt && cat >expect <<-EOF && === Deleted paths by reverse accumulated size === Format: unpacked size, packed size, date deleted, path name(s) 44 48 2005-04-07 numbers/medium.num 21 37 2005-04-07 numbers/small.num EOF test_cmp expect path-deleted-sizes.txt ) ' test_expect_success 
C_LOCALE_OUTPUT '--analyze --report-dir' ' setup_analyze_me && ( cd analyze_me && rm -rf .git/filter-repo && git filter-repo --analyze --report-dir foobar && # It should not work again without a --force test_must_fail git filter-repo --analyze --report-dir foobar && # With a --force, though, it should overwrite git filter-repo --analyze --report-dir foobar --force && test ! -d .git/filter-repo/analysis && test -d foobar && cd foobar && # Very simple tests because already tested above. test_path_is_file renames.txt && test_path_is_file README && test_path_is_file blob-shas-and-paths.txt && test_path_is_file directories-all-sizes.txt && test_path_is_file directories-deleted-sizes.txt && test_path_is_file extensions-all-sizes.txt && test_path_is_file extensions-deleted-sizes.txt && test_path_is_file path-all-sizes.txt && test_path_is_file path-deleted-sizes.txt ) ' test_expect_success '--replace-text all options' ' setup_analyze_me && ( git clone file://"$(pwd)"/analyze_me replace_text && cd replace_text && cat >../replace-rules <<-\EOF && other change==>variation literal:spam==>foodstuff glob:ran*m==>haphazard regex:1(.[0-9])==>2\1 EOF git filter-repo --replace-text ../replace-rules && test_seq 200 210 >expect && git show HEAD~4:numbers/medium.num >actual && test_cmp expect actual && echo "haphazard ***REMOVED*** variation" >expect && test_cmp expect whatever ) ' test_expect_success '--replace-text binary zero_byte-0_char' ' ( set -e set -u REPO=replace-text-detect-binary FILE=mangle.bin OLD_STR=replace-from NEW_STR=replace-with # used with printf, contains a zero byte and a "0" character, binary OLD_CONTENT_FORMAT="${OLD_STR}\\0${OLD_STR}\\n0\\n" # expect content unchanged due to binary NEW_CONTENT_FORMAT="${OLD_CONTENT_FORMAT}" rm -rf "${REPO}" git init "${REPO}" cd "${REPO}" echo "${OLD_STR}==>${NEW_STR}" >../replace-rules printf "${NEW_CONTENT_FORMAT}" > ../expect printf "${OLD_CONTENT_FORMAT}" > "${FILE}" git add "${FILE}" git commit -m 'test' git filter-repo 
--force --replace-text ../replace-rules test_cmp ../expect "${FILE}" ) ' test_expect_success '--replace-text binary zero_byte-no_0_char' ' ( set -e set -u REPO=replace-text-detect-binary FILE=mangle.bin OLD_STR=replace-from NEW_STR=replace-with # used with printf, contains a zero byte but no "0" character, binary OLD_CONTENT_FORMAT="${OLD_STR}\\0${OLD_STR}\\n" # expect content unchanged due to binary NEW_CONTENT_FORMAT="${OLD_CONTENT_FORMAT}" rm -rf "${REPO}" git init "${REPO}" cd "${REPO}" echo "${OLD_STR}==>${NEW_STR}" >../replace-rules printf "${NEW_CONTENT_FORMAT}" > ../expect printf "${OLD_CONTENT_FORMAT}" > "${FILE}" git add "${FILE}" git commit -m 'test' git filter-repo --force --replace-text ../replace-rules test_cmp ../expect "${FILE}" ) ' test_expect_success '--replace-text text-file no_zero_byte-zero_char' ' ( set -e set -u REPO=replace-text-detect-binary FILE=mangle.bin OLD_STR=replace-from NEW_STR=replace-with # used with printf, contains no zero byte but contains a "0" character, text OLD_CONTENT_FORMAT="${OLD_STR}0\\n0${OLD_STR}\\n0\\n" # expect content changed due to text NEW_CONTENT_FORMAT="${NEW_STR}0\\n0${NEW_STR}\\n0\\n" rm -rf "${REPO}" git init "${REPO}" cd "${REPO}" echo "${OLD_STR}==>${NEW_STR}" >../replace-rules printf "${NEW_CONTENT_FORMAT}" > ../expect printf "${OLD_CONTENT_FORMAT}" > "${FILE}" git add "${FILE}" git commit -m 'test' git filter-repo --force --replace-text ../replace-rules test_cmp ../expect "${FILE}" ) ' test_expect_success '--strip-blobs-bigger-than' ' setup_analyze_me && ( git clone file://"$(pwd)"/analyze_me strip_big_blobs && cd strip_big_blobs && # Verify certain files are present initially git log --format=%n --name-only | sort | uniq >../filenames && test_line_count = 11 ../filenames && git rev-parse HEAD~7:numbers/medium.num && git rev-parse HEAD~7:numbers/small.num && git rev-parse HEAD~4:mercurial && test -f mercurial && # Make one of the current files be "really big" test_seq 1 1000 >mercurial && git add 
mercurial && git commit --amend && # Strip "really big" files git filter-repo --force --strip-blobs-bigger-than 3K --prune-empty never && git log --format=%n --name-only | sort | uniq >../filenames && test_line_count = 11 ../filenames && # The "mercurial" file should still be around... git rev-parse HEAD~4:mercurial && git rev-parse HEAD:mercurial && # ...but only with its old, smaller contents test_line_count = 1 mercurial && # Strip files that are too big, verify they are gone git filter-repo --strip-blobs-bigger-than 40 && git log --format=%n --name-only | sort | uniq >../filenames && test_line_count = 10 ../filenames && test_must_fail git rev-parse HEAD~7:numbers/medium.num && # Do it again, this time with --replace-text since that means # we are operating without --no-data and have to go through # a different codepath. (The search/replace terms are bogus) cat >../replace-rules <<-\EOF && not found==>was found EOF git filter-repo --strip-blobs-bigger-than 20 --replace-text ../replace-rules && git log --format=%n --name-only | sort | uniq >../filenames && test_line_count = 9 ../filenames && test_must_fail git rev-parse HEAD~7:numbers/medium.num && test_must_fail git rev-parse HEAD~7:numbers/small.num && # Remove the temporary auxiliary files rm ../replace-rules && rm ../filenames ) ' test_expect_success '--strip-blobs-with-ids' ' setup_analyze_me && ( git clone file://"$(pwd)"/analyze_me strip_blobs_with_ids && cd strip_blobs_with_ids && # Verify certain files are present initially git log --format=%n --name-only | sort | uniq >../filenames && test_line_count = 11 ../filenames && grep fake_submodule ../filenames && # Strip "a certain file" files echo deadbeefdeadbeefdeadbeefdeadbeefdeadbeef >../input && git filter-repo --strip-blobs-with-ids ../input && git log --format=%n --name-only | sort | uniq >../filenames && test_line_count = 10 ../filenames && # Make sure fake_submodule was removed ! 
grep fake_submodule ../filenames && # Do it again, this time with --replace-text since that means # we are operating without --no-data and have to go through # a different codepath. (The search/replace terms are bogus) cat >../bad-ids <<-\EOF && 34b6a0c9d02cb6ef7f409f248c0c1224ce9dd373 51b95456de9274c9a95f756742808dfd480b9b35 EOF cat >../replace-rules <<-\EOF && not found==>was found EOF git filter-repo --strip-blobs-with-ids ../bad-ids --replace-text ../replace-rules && git log --format=%n --name-only | sort | uniq >../filenames && test_line_count = 6 ../filenames && ! grep sequence/to ../filenames && ! grep words/to ../filenames && ! grep capricious ../filenames && ! grep fickle ../filenames && ! grep mercurial ../filenames && # Remove the temporary auxiliary files rm ../bad-ids && rm ../replace-rules && rm ../filenames ) ' setup_commit_message_rewriting() { test -d commit_msg && return test_create_repo commit_msg && ( cd commit_msg && echo two guys walking into a >bar && git add bar && git commit -m initial && test_commit another && name=$(git rev-parse HEAD) && echo hello >world && git add world && git commit -m "Commit referencing ${name:0:8}" && git revert HEAD && for i in $(test_seq 1 200) do git commit --allow-empty -m "another commit" done && echo foo >bar && git add bar && git commit -m bar && git revert --no-commit HEAD && echo foo >baz && git add baz && git commit ) } test_expect_success 'commit message rewrite' ' setup_commit_message_rewriting && ( git clone file://"$(pwd)"/commit_msg commit_msg_clone && cd commit_msg_clone && git filter-repo --invert-paths --path bar && git log --oneline >changes && test_line_count = 204 changes && # If a commit we reference is rewritten, we expect the # reference to be rewritten. 
name=$(git rev-parse HEAD~203) && echo "Commit referencing ${name:0:8}" >expect && git log --no-walk --format=%s HEAD~202 >actual && test_cmp expect actual && # If a commit we reference was pruned, then the reference # has nothing to be rewritten to. Verify that the commit # ID it points to does not exist. latest=$(git log --no-walk | grep reverts | awk "{print \$4}" | tr -d '.') && test -n "$latest" && test_must_fail git cat-file -e "$latest" ) ' test_expect_success 'commit hash unchanged if requested' ' setup_commit_message_rewriting && ( git clone file://"$(pwd)"/commit_msg commit_msg_clone_2 && cd commit_msg_clone_2 && name=$(git rev-parse HEAD~204) && git filter-repo --invert-paths --path bar --preserve-commit-hashes && git log --oneline >changes && test_line_count = 204 changes && echo "Commit referencing ${name:0:8}" >expect && git log --no-walk --format=%s HEAD~202 >actual && test_cmp expect actual ) ' test_expect_success 'commit message encoding preserved if requested' ' ( git init commit_message_encoding && cd commit_message_encoding && cat >input <<-\EOF && feature done commit refs/heads/develop mark :1 original-oid deadbeefdeadbeefdeadbeefdeadbeefdeadbeef author Just Me 1234567890 -0200 committer Just Me 1234567890 -0200 encoding iso-8859-7 data 5 EOF printf "Pi: \360\n\ndone\n" >>input && cat input | git fast-import --quiet && git rev-parse develop >expect && git filter-repo --preserve-commit-encoding --force && git rev-parse develop >actual && test_cmp expect actual ) ' test_expect_success 'commit message rewrite unsuccessful' ' ( git init commit_msg_not_found && cd commit_msg_not_found && cat >input <<-\EOF && feature done commit refs/heads/develop mark :1 original-oid deadbeefdeadbeefdeadbeefdeadbeefdeadbeef author Just Me 1234567890 -0200 committer Just Me 1234567890 -0200 data 2 A commit refs/heads/develop mark :2 original-oid deadbeefcafedeadbeefcafedeadbeefcafecafe author Just Me 1234567890 -0200 committer Just Me 1234567890 -0200 data 2 B 
commit refs/heads/develop mark :3 original-oid 0000000000000000000000000000000000000004 author Just Me 3980014290 -0200 committer Just Me 3980014290 -0200 data 93 Four score and seven years ago, commit deadbeef ("B", 2009-02-13) messed up. This fixes it. done EOF cat input | git filter-repo --stdin --path salutation --force && git log --oneline develop >changes && test_line_count = 3 changes && git log develop >out && grep deadbeef out ) ' test_expect_success 'startup sanity checks' ' setup_analyze_me && ( git clone file://"$(pwd)"/analyze_me startup_sanity_checks && cd startup_sanity_checks && echo foobar | git hash-object -w --stdin && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "expected freshly packed repo" err && git prune && git remote add another_remote /dev/null && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "expected one remote, origin" err && git remote rm another_remote && git remote rename origin another_remote && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "expected one remote, origin" err && git remote rename another_remote origin && cd words && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "GIT_DIR must be .git" err && rm err && cd .. && git config core.bare true && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "GIT_DIR must be ." 
err && git config core.bare false && git update-ref -m "Just Testing" refs/heads/master HEAD && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "expected at most one entry in the reflog" err && git reflog expire --expire=now && echo yes >>words/know && git stash save random change && rm -rf .git/logs/ && git gc && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "has stashed changes" err && git update-ref -d refs/stash && echo yes >>words/know && git add words/know && git gc --prune=now && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "you have uncommitted changes" err && git checkout HEAD words/know && echo yes >>words/know && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "you have unstaged changes" err && git checkout -- words/know && test_must_fail git filter-repo --path numbers 2>err && test_i18ngrep "you have untracked changes" err && rm err && git worktree add ../other-worktree HEAD && test_must_fail git filter-repo --path numbers 2>../err && test_i18ngrep "you have multiple worktrees" ../err && rm -rf ../err && git worktree remove ../other-worktree && git update-ref -d refs/remotes/origin/master && test_must_fail git filter-repo --path numbers 2>../err && test_i18ngrep "refs/heads/master exists, but refs/remotes/origin/master not found" ../err && git update-ref -m restoring refs/remotes/origin/master refs/heads/master && rm ../err && rm .git/logs/refs/remotes/origin/master && git update-ref -m funsies refs/remotes/origin/master refs/heads/master~1 && test_must_fail git filter-repo --path numbers 2>../err && test_i18ngrep "refs/heads/master does not match refs/remotes/origin/master" ../err && rm ../err && cd ../ && git -C analyze_me gc && echo foobar | git -C analyze_me hash-object -w --stdin && git clone analyze_me startup_sanity_checks2 && cd startup_sanity_checks2 && echo foobar | git hash-object -w --stdin && test_must_fail git filter-repo --path numbers 2>../err && 
test_i18ngrep "expected freshly packed repo" ../err && test_i18ngrep "when cloning local repositories" ../err && rm ../err ) ' test_expect_success 'other startup error cases and requests for help' ' ( # prevent MSYS2 (Git for Windows) from converting the colon to # a semicolon when encountering parameters that look like # Unix-style, colon-separated path lists (such as `foo:.`) MSYS_NO_PATHCONV=1 && export MSYS_NO_PATHCONV git init startup_errors && cd startup_errors && git filter-repo -h >out && test_i18ngrep "filter-repo destructively rewrites history" out && test_must_fail git filter-repo 2>err && test_i18ngrep "No arguments specified." err && test_must_fail git filter-repo --analyze 2>err && test_i18ngrep "Nothing to analyze; repository is empty" err && ( GIT_CEILING_DIRECTORIES=$(pwd) && export GIT_CEILING_DIRECTORIES && mkdir not_a_repo && cd not_a_repo && test_must_fail git filter-repo --dry-run 2>err && test_i18ngrep "returned non-zero exit status" err && rm err && cd .. && rmdir not_a_repo ) && test_must_fail git filter-repo --analyze --path foobar 2>err && test_i18ngrep ": --analyze is incompatible with --path" err && test_must_fail git filter-repo --analyze --stdin 2>err && test_i18ngrep ": --analyze is incompatible with --stdin" err && test_must_fail git filter-repo --path-rename foo:bar --use-base-name 2>err && test_i18ngrep ": --use-base-name and --path-rename are incompatible" err && test_must_fail git filter-repo --path-rename foo:bar/ 2>err && test_i18ngrep "either ends with a slash then both must." err && echo "foo==>bar/" >input && test_must_fail git filter-repo --paths-from-file input 2>err && test_i18ngrep "either ends with a slash then both must." 
err && echo "glob:*.py==>newname" >input && test_must_fail git filter-repo --paths-from-file input 2>err && test_i18ngrep "renaming globs makes no sense" err && test_must_fail git filter-repo --strip-blobs-bigger-than 3GiB 2>err && test_i18ngrep "could not parse.*3GiB" err && test_must_fail git filter-repo --path-rename foo/bar:. 2>err && test_i18ngrep "Invalid path component .\.. found in .foo/bar:\." err && test_must_fail git filter-repo --path /foo/bar 2>err && test_i18ngrep "Pathnames cannot begin with a ./" err && test_must_fail git filter-repo --path-rename foo:/bar 2>err && test_i18ngrep "Pathnames cannot begin with a ./" err && test_must_fail git filter-repo --path-rename /foo:bar 2>err && test_i18ngrep "Pathnames cannot begin with a ./" err && test_must_fail git filter-repo --path-rename foo 2>err && test_i18ngrep "Error: --path-rename expects one colon in its argument" err && test_must_fail git filter-repo --subdirectory-filter /foo 2>err && test_i18ngrep "Pathnames cannot begin with a ./" err && test_must_fail git filter-repo --subdirectory-filter /foo 2>err && test_i18ngrep "Pathnames cannot begin with a ./" err ) ' test_expect_success 'invalid fast-import directives' ' ( git init invalid_directives && cd invalid_directives && echo "get-mark :15" | \ test_must_fail git filter-repo --stdin --force 2>err && test_i18ngrep "Unsupported command" err && echo "invalid-directive" | \ test_must_fail git filter-repo --stdin --force 2>err && test_i18ngrep "Could not parse line" err ) ' test_expect_success 'mailmap sanity checks' ' setup_analyze_me && ( git clone file://"$(pwd)"/analyze_me mailmap_sanity_checks && cd mailmap_sanity_checks && fake=$(pwd)/fake && test_must_fail git filter-repo --mailmap "$fake"/path 2>../err && test_i18ngrep "Cannot read $fake/path" ../err && echo "Total Bogus" >../whoopsies && test_must_fail git filter-repo --mailmap ../whoopsies 2>../err && test_i18ngrep "Unparseable mailmap file" ../err && rm ../err && rm ../whoopsies && echo "Me 
Myself Extraneous" >../whoopsies && test_must_fail git filter-repo --mailmap ../whoopsies 2>../err && test_i18ngrep "Unparseable mailmap file" ../err && rm ../err && rm ../whoopsies ) ' test_expect_success 'incremental import' ' setup_analyze_me && ( git clone file://"$(pwd)"/analyze_me incremental && cd incremental && original=$(git rev-parse master) && git fast-export --reference-excluded-parents master~2..master \ | git filter-repo --stdin --refname-callback "return b\"develop\"" && test "$(git rev-parse develop)" = "$original" ) ' test_expect_success '--target' ' setup_analyze_me && git init target && ( cd target && git checkout -b other && echo hello >world && git add world && git commit -m init && git checkout -b unique ) && git -C target rev-parse unique >target/expect && git filter-repo --source analyze_me --target target --path fake_submodule --force --debug && test 2 = $(git -C target rev-list --count master) && test_must_fail git -C target rev-parse other && git -C target rev-parse unique >target/actual && test_cmp target/expect target/actual ' test_expect_success '--date-order' ' test_create_repo date_order && ( cd date_order && git fast-import --quiet <$DATA/date-order && # First, verify that without date-order, C is before B cat <<-EOF >expect-normal && Initial A C B D merge EOF git filter-repo --force --message-callback " with open(\"messages.txt\", \"ab\") as f: f.write(message) return message " && test_cmp expect-normal messages.txt && # Next, verify that with date-order, C and B are reversed rm messages.txt && cat <<-EOF >expect && Initial A B C D merge EOF git filter-repo --date-order --force --message-callback " with open(\"messages.txt\", \"ab\") as f: f.write(message) return message " && test_cmp expect messages.txt ) ' test_expect_success '--refs' ' setup_analyze_me && git init refs && ( cd refs && git checkout -b other && echo hello >world && git add world && git commit -m init ) && git -C refs rev-parse other >refs/expect && git -C 
analyze_me rev-parse master >refs/expect && git filter-repo --source analyze_me --target refs --refs master --force && git -C refs rev-parse other >refs/actual && git -C refs rev-parse master >refs/actual && test_cmp refs/expect refs/actual ' test_expect_success '--refs and --replace-text' ' # This test exists to make sure we do not assume that parents in # filter-repo code are always represented by integers (or marks); # they sometimes are represented as hashes. setup_path_rename && ( git clone file://"$(pwd)"/path_rename refs_and_replace_text && cd refs_and_replace_text && git rev-parse --short=10 HEAD~1 >myparent && echo "10==>TEN" >input && git filter-repo --force --replace-text input --refs $(cat myparent)..master && cat <<-EOF >expect && TEN11 EOF test_cmp expect sequences/medium && git rev-list --count HEAD >actual && echo 4 >expect && test_cmp expect actual && git rev-parse --short=10 HEAD~1 >actual && test_cmp myparent actual ) ' test_expect_success 'reset to specific refs' ' test_create_repo reset_to_specific_refs && ( cd reset_to_specific_refs && git commit --allow-empty -m initial && INITIAL=$(git rev-parse HEAD) && echo "$INITIAL refs/heads/develop" >expect && cat >input <<-INPUT_END && reset refs/heads/develop from $INITIAL reset refs/heads/master from 0000000000000000000000000000000000000000 INPUT_END cat input | git filter-repo --force --stdin && git show-ref >actual && test_cmp expect actual ) ' setup_handle_funny_characters() { test -d funny_chars && return test_create_repo funny_chars && ( cd funny_chars && git symbolic-ref HEAD refs/heads/españa && printf "بتتكلم بالهندي؟\n" >señor && printf "Αυτά μου φαίνονται αλαμπουρνέζικα.\n" >>señor && printf "זה סינית בשבילי\n" >>señor && printf "ちんぷんかんぷん\n" >>señor && printf "За мене тоа е шпанско село\n" >>señor && printf "看起来像天书。\n" >>señor && printf "انگار ژاپنی حرف می زنه\n" >>señor && printf "Это для меня китайская грамота.\n" >>señor && printf "To mi je španska vas\n" >>señor && printf "Konuya 
Fransız kaldım\n" >>señor && printf "עס איז די שפּראַך פון גיבבעריש\n" >>señor && printf "Not even UTF-8:\xe0\x80\x80\x00\n" >>señor && cp señor señora && cp señor señorita && git add . && export GIT_AUTHOR_NAME="Nguyễn Arnfjörð Gábor" && export GIT_COMMITTER_NAME=$GIT_AUTHOR_NAME && export GIT_AUTHOR_EMAIL="emails@are.ascii" && export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" && git commit -m "€$£₽₪" && git tag -a -m "₪₽£€$" סְפָרַד ) } test_expect_success 'handle funny characters' ' setup_handle_funny_characters && ( git clone file://"$(pwd)"/funny_chars funny_chars_checks && cd funny_chars_checks && file_sha=$(git rev-parse :0:señor) && former_head_sha=$(git rev-parse HEAD) && git filter-repo --replace-refs old-default --to-subdirectory-filter títulos && cat <<-EOF >expect && 100644 $file_sha 0 "t\303\255tulos/se\303\261or" 100644 $file_sha 0 "t\303\255tulos/se\303\261ora" 100644 $file_sha 0 "t\303\255tulos/se\303\261orita" EOF git ls-files -s >actual && test_cmp expect actual && commit_sha=$(git rev-parse HEAD) && tag_sha=$(git rev-parse סְפָרַד) && cat <<-EOF >expect && $commit_sha refs/heads/españa $commit_sha refs/replace/$former_head_sha $tag_sha refs/tags/סְפָרַד EOF git show-ref >actual && test_cmp expect actual && echo "€$£₽₪" >expect && git cat-file -p HEAD | tail -n 1 >actual && echo "₪₽£€$" >expect && git cat-file -p סְפָרַד | tail -n 1 >actual ) ' test_expect_success '--state-branch with changing renames' ' test_create_repo state_branch_renames_export test_create_repo state_branch_renames && ( cd state_branch_renames && git fast-import --quiet <$DATA/basic-numbers && git branch -d A && git branch -d B && git tag -d v1.0 && ORIG=$(git rev-parse master) && git reset --hard master~1 && git filter-repo --path-rename ten:zehn \ --state-branch state_info \ --target ../state_branch_renames_export && cd ../state_branch_renames_export && git log --format=%s --name-status >actual && cat <<-EOF >expect && Merge branch ${SQ}A${SQ} into B add twenty M twenty add 
ten M zehn Initial A twenty A zehn EOF test_cmp expect actual && cd ../state_branch_renames && git reset --hard $ORIG && git filter-repo --path-rename twenty:veinte \ --state-branch state_info \ --target ../state_branch_renames_export && cd ../state_branch_renames_export && git log --format=%s --name-status >actual && cat <<-EOF >expect && whatever A ten A veinte Merge branch ${SQ}A${SQ} into B add twenty M twenty add ten M zehn Initial A twenty A zehn EOF test_cmp expect actual ) ' test_expect_success '--state-branch with expanding paths and refs' ' test_create_repo state_branch_more_paths_export test_create_repo state_branch_more_paths && ( cd state_branch_more_paths && git fast-import --quiet <$DATA/basic-numbers && git reset --hard master~1 && git filter-repo --path ten --state-branch state_info \ --target ../state_branch_more_paths_export \ --refs master && cd ../state_branch_more_paths_export && echo 2 >expect && git rev-list --count master >actual && test_cmp expect actual && test_must_fail git rev-parse master~1:twenty && test_must_fail git rev-parse master:twenty && cd ../state_branch_more_paths && git reset --hard v1.0 && git filter-repo --path ten --path twenty \ --state-branch state_info \ --target ../state_branch_more_paths_export && cd ../state_branch_more_paths_export && echo 3 >expect && git rev-list --count master >actual && test_cmp expect actual && test_must_fail git rev-parse master~2:twenty && git rev-parse master:twenty ) ' test_expect_success FUNNYNAMES 'degenerate merge with non-matching filenames' ' test_create_repo degenerate_merge_differing_filenames && ( cd degenerate_merge_differing_filenames && touch "foo \"quote\" bar" && git add "foo \"quote\" bar" && git commit -m "Add foo \"quote\" bar" git branch A && git checkout --orphan B && git reset --hard && mkdir -p pkg/list && test_commit pkg/list/whatever && test_commit unwanted_file && git checkout A && git merge --allow-unrelated-histories --no-commit B && >pkg/list/wanted && git add 
pkg/list/wanted && git rm -f pkg/list/whatever.t && git commit && git filter-repo --force --path pkg/list && ! test_path_is_file pkg/list/whatever.t && git ls-files >actual && echo pkg/list/wanted >expect && test_cmp expect actual ) ' test_expect_success 'degenerate merge with typechange' ' test_create_repo degenerate_merge_with_typechange && ( cd degenerate_merge_with_typechange && touch irrelevant_file && git add irrelevant_file && git commit -m "Irrelevant, unwanted file" git branch A && git checkout --orphan B && git reset --hard && echo hello >world && git add world && git commit -m "greeting" && echo goodbye >planet && git add planet && git commit -m "farewell" && git checkout A && git merge --allow-unrelated-histories --no-commit B && rm world && ln -s planet world && git add world && git commit && git filter-repo --force --path world && test_path_is_missing irrelevant_file && test_path_is_missing planet && echo world >expect && git ls-files >actual && test_cmp expect actual && git log --oneline HEAD >input && test_line_count = 2 input ) ' test_expect_success 'Filtering a blob to make it match previous version' ' test_create_repo remove_unique_bits_of_blob && ( cd remove_unique_bits_of_blob && test_write_lines foo baz >metasyntactic_names && git add metasyntactic_names && git commit -m init && test_write_lines foo bar baz >metasyntactic_names && git add metasyntactic_names && git commit -m second && git filter-repo --force --blob-callback "blob.data = blob.data.replace(b\"\\nbar\", b\"\")" echo 1 >expect && git rev-list --count HEAD >actual && test_cmp expect actual ) ' test_expect_success 'tweaking just a tag' ' test_create_repo tweaking_just_a_tag && ( cd tweaking_just_a_tag && test_commit foo && git tag -a -m "Here is a tag" mytag && git filter-repo --force --refs mytag ^mytag^{commit} --name-callback "return name.replace(b\"Mitter\", b\"L D\")" && git cat-file -p mytag | grep C.O.L.D ) ' test_expect_success '--version' ' git filter-repo --version >actual 
&& git hash-object ../../git-filter-repo | cut -c 1-12 >expect && test_cmp expect actual ' test_expect_success 'empty author ident' ' test_create_repo empty_author_ident && ( cd empty_author_ident && git init && cat <<-EOF | git fast-import --quiet && feature done blob mark :1 data 8 initial reset refs/heads/develop commit refs/heads/develop mark :2 author 1535228562 -0700 committer Full Name 1535228562 -0700 data 8 Initial M 100644 :1 filename done EOF git filter-repo --force --path-rename filename:stuff && git log --format=%an develop >actual && echo >expect && test_cmp expect actual ) ' test_done git-filter-repo-2.45.0/t/t9390/000077500000000000000000000000001464611705400156565ustar00rootroot00000000000000git-filter-repo-2.45.0/t/t9390/basic000066400000000000000000000040421464611705400166620ustar00rootroot00000000000000feature done # Simple repo with three files, a merge where each side touches exactly one # file, and a commit at the end touching all three. Note that the original-oid # directives are very fake, but make it easy to recognize what original shas # are. blob mark :1 original-oid 0000000000000000000000000000000000000001 data 8 initial blob mark :2 original-oid 0000000000000000000000000000000000000002 data 8 ten-mod blob mark :3 original-oid 0000000000000000000000000000000000000003 data 11 twenty-mod blob mark :4 original-oid 0000000000000000000000000000000000000004 data 6 final reset refs/heads/master commit refs/heads/master mark :5 original-oid 0000000000000000000000000000000000000009 author Little O. Me 1535228562 -0700 committer Little O. Me 1535228562 -0700 data 8 Initial M 100644 :1 filename M 100644 :1 ten M 100644 :1 twenty commit refs/heads/B mark :6 original-oid 000000000000000000000000000000000000000B author Little 'ol Me 1535229544 -0700 committer Little 'ol Me 1535229544 -0700 data 11 add twenty from :5 M 100644 :3 twenty commit refs/heads/A mark :7 original-oid 000000000000000000000000000000000000000A author Little O. 
Me 1535229523 -0700 committer Little O. Me 1535229523 -0700 data 8 add ten from :5 M 100644 :2 ten commit refs/heads/master mark :8 original-oid 000000000000000000000000000000000000000C author Lit.e Me 1535229559 -0700 committer Lit.e Me 1535229580 -0700 data 24 Merge branch 'A' into B from :6 merge :7 M 100644 :2 ten commit refs/heads/master mark :9 original-oid 000000000000000000000000000000000000000D author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 9 whatever from :8 M 100644 :4 filename M 100644 :4 ten M 100644 :4 twenty tag v1.0 from :9 original-oid 000000000000000000000000000000000000000E tagger Little John 1535229618 -0700 data 5 v1.0 reset refs/heads/master from :9 done git-filter-repo-2.45.0/t/t9390/basic-filename000066400000000000000000000010721464611705400204400ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/A commit refs/heads/A mark :2 author Little O. Me 1535228562 -0700 committer Little O. Me 1535228562 -0700 data 8 Initial M 100644 :1 filename blob mark :3 data 6 final commit refs/heads/master mark :4 author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 9 whatever from :2 M 100644 :3 filename reset refs/heads/B from :2 tag v1.0 from :4 tagger Little John 1535229618 -0700 data 5 v1.0 done git-filter-repo-2.45.0/t/t9390/basic-mailmap000066400000000000000000000023431464611705400203020ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/B commit refs/heads/B mark :2 author Little 'ol Me 1535228562 -0700 committer Little 'ol Me 1535228562 -0700 data 8 Initial M 100644 :1 filename M 100644 :1 ten M 100644 :1 twenty blob mark :3 data 11 twenty-mod commit refs/heads/B mark :4 author Little 'ol Me 1535229544 -0700 committer Little 'ol Me 1535229544 -0700 data 11 add twenty from :2 M 100644 :3 twenty blob mark :5 data 8 ten-mod commit refs/heads/A mark :6 author Little 'ol Me 1535229523 -0700 committer Little 'ol Me 1535229523 -0700 data 
8 add ten from :2 M 100644 :5 ten commit refs/heads/master mark :7 author Little 'ol Me 1535229559 -0700 committer Little 'ol Me 1535229580 -0700 data 24 Merge branch 'A' into B from :4 merge :6 M 100644 :5 ten blob mark :8 data 6 final commit refs/heads/master mark :9 author Little 'ol Me 1535229601 -0700 committer Little 'ol Me 1535229601 -0700 data 9 whatever from :7 M 100644 :8 filename M 100644 :8 ten M 100644 :8 twenty tag v1.0 from :9 tagger Little John 1535229618 -0700 data 5 v1.0 done git-filter-repo-2.45.0/t/t9390/basic-message000066400000000000000000000023761464611705400203140ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/B commit refs/heads/B mark :2 author Little O. Me 1535228562 -0700 committer Little O. Me 1535228562 -0700 data 9 Modified M 100644 :1 filename M 100644 :1 ten M 100644 :1 twenty blob mark :3 data 11 twenty-mod commit refs/heads/B mark :4 author Little 'ol Me 1535229544 -0700 committer Little 'ol Me 1535229544 -0700 data 18 add the number 20 from :2 M 100644 :3 twenty blob mark :5 data 8 ten-mod commit refs/heads/A mark :6 author Little O. Me 1535229523 -0700 committer Little O. Me 1535229523 -0700 data 8 add ten from :2 M 100644 :5 ten commit refs/heads/master mark :7 author Lit.e Me 1535229559 -0700 committer Lit.e Me 1535229580 -0700 data 24 Merge branch 'A' into B from :4 merge :6 M 100644 :5 ten blob mark :8 data 6 final commit refs/heads/master mark :9 author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 9 whatever from :7 M 100644 :8 filename M 100644 :8 ten M 100644 :8 twenty tag v1.0 from :9 tagger Little John 1535229618 -0700 data 15 version one :) done git-filter-repo-2.45.0/t/t9390/basic-numbers000066400000000000000000000023011464611705400203270ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/B commit refs/heads/B mark :2 author Little O. Me 1535228562 -0700 committer Little O. 
Me 1535228562 -0700 data 8 Initial M 100644 :1 ten M 100644 :1 twenty blob mark :3 data 11 twenty-mod commit refs/heads/B mark :4 author Little 'ol Me 1535229544 -0700 committer Little 'ol Me 1535229544 -0700 data 11 add twenty from :2 M 100644 :3 twenty blob mark :5 data 8 ten-mod commit refs/heads/A mark :6 author Little O. Me 1535229523 -0700 committer Little O. Me 1535229523 -0700 data 8 add ten from :2 M 100644 :5 ten commit refs/heads/master mark :7 author Lit.e Me 1535229559 -0700 committer Lit.e Me 1535229580 -0700 data 24 Merge branch 'A' into B from :4 merge :6 M 100644 :5 ten blob mark :8 data 6 final commit refs/heads/master mark :9 author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 9 whatever from :7 M 100644 :8 ten M 100644 :8 twenty tag v1.0 from :9 tagger Little John 1535229618 -0700 data 5 v1.0 done git-filter-repo-2.45.0/t/t9390/basic-replace000066400000000000000000000024161464611705400202760ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/B commit refs/heads/B mark :2 author Little O. Me 1535228562 -0700 committer Little O. Me 1535228562 -0700 data 8 Initial M 100644 :1 filename M 100644 :1 ten M 100644 :1 twenty blob mark :3 data 28 twenty-modified-by-gremlins commit refs/heads/B mark :4 author Little 'ol Me 1535229544 -0700 committer Little 'ol Me 1535229544 -0700 data 11 add twenty from :2 M 100644 :3 twenty blob mark :5 data 25 ten-modified-by-gremlins commit refs/heads/A mark :6 author Little O. Me 1535229523 -0700 committer Little O. 
Me 1535229523 -0700 data 8 add ten from :2 M 100644 :5 ten commit refs/heads/master mark :7 author Lit.e Me 1535229559 -0700 committer Lit.e Me 1535229580 -0700 data 24 Merge branch 'A' into B from :4 merge :6 M 100644 :5 ten blob mark :8 data 6 final commit refs/heads/master mark :9 author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 9 whatever from :7 M 100644 :8 filename M 100644 :8 ten M 100644 :8 twenty tag v1.0 from :9 tagger Little John 1535229618 -0700 data 5 v1.0 done git-filter-repo-2.45.0/t/t9390/basic-ten000066400000000000000000000013661464611705400174540ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/B commit refs/heads/B mark :2 author Little O. Me 1535228562 -0700 committer Little O. Me 1535228562 -0700 data 8 Initial M 100644 :1 ten blob mark :3 data 8 ten-mod commit refs/heads/A mark :4 author Little O. Me 1535229523 -0700 committer Little O. Me 1535229523 -0700 data 8 add ten from :2 M 100644 :3 ten blob mark :5 data 6 final commit refs/heads/master mark :6 author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 9 whatever from :4 M 100644 :5 ten tag v1.0 from :6 tagger Little John 1535229618 -0700 data 5 v1.0 done git-filter-repo-2.45.0/t/t9390/basic-twenty000066400000000000000000000013731464611705400202160ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/A commit refs/heads/A mark :2 author Little O. Me 1535228562 -0700 committer Little O. 
Me 1535228562 -0700 data 8 Initial M 100644 :1 twenty blob mark :3 data 11 twenty-mod commit refs/heads/B mark :4 author Little 'ol Me 1535229544 -0700 committer Little 'ol Me 1535229544 -0700 data 11 add twenty from :2 M 100644 :3 twenty blob mark :5 data 6 final commit refs/heads/master mark :6 author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 9 whatever from :4 M 100644 :5 twenty tag v1.0 from :6 tagger Little John 1535229618 -0700 data 5 v1.0 done git-filter-repo-2.45.0/t/t9390/date-order000066400000000000000000000020611464611705400176260ustar00rootroot00000000000000feature done blob mark :1 data 8 initial reset refs/heads/master commit refs/heads/master mark :2 author Little O. Me 1535228562 -0700 committer Little O. Me 1535228562 -0700 data 8 Initial M 100644 :1 filename commit refs/heads/master mark :3 author Little Me 1535229601 -0700 committer Little Me 1535229601 -0700 data 2 A from :2 commit refs/heads/master mark :4 author Little Me 1535229602 -0700 committer Little Me 1535229602 -0700 data 2 B from :2 commit refs/heads/master mark :5 author Little Me 1535229603 -0700 committer Little Me 1535229603 -0700 data 2 C from :3 commit refs/heads/master mark :6 author Little Me 1535229604 -0700 committer Little Me 1535229604 -0700 data 2 D from :4 commit refs/heads/master mark :7 author Little Me 1535229605 -0700 committer Little Me 1535229605 -0700 data 6 merge from :5 merge :6 done git-filter-repo-2.45.0/t/t9390/degenerate000066400000000000000000000223661464611705400177150ustar00rootroot00000000000000feature done # Simple repo with only three files, with a bunch of cases of dealing with # topology changes possibly causing merge commits to need to be pruned. # # As with case1, the original-oid directives are very fake, but if an error # is hit that shows one of these, it makes it really easy to know where it # came from. 
# # Expressed with shorthand, log history in the format # Commit Name(Parent(s)): files changed # for this case looks like the following: # W(V): moduleA/keepme # V(U,U): moduleB/nukeme # U(T): moduleA/sometimes # T(S): moduleA/keepme # S(R,R): moduleA/sometimes # R(R): moduleB/nukeme # Q(P): moduleA/keepme # P(N,M): moduleA/sometimes # O(M,N): moduleA/sometimes # N(C): moduleB/nukeme # M(L): moduleB/nukeme # L(K): moduleA/keepme # K(J): moduleB/nukeme # J(D,H): moduleA/sometimes # I(H,D): moduleA/sometimes # backwards-ish merge # H(G): moduleB/nukeme # G(F): moduleA/keepme # F(D): moduleB/nukeme # D(B,C): moduleA/sometimes # C(A): moduleB/nukeme # B(A): moduleB/nukeme # A(): moduleA/keepme # # This involved case is intended to test the following: # * Merge becoming non-merge due to both parents becoming same commit # * Two sub-cases: it has changes of its own, or it doesn't # * Merge becoming merge of commit with its own ancestor # * Two cases: and it has changes, and it doesn't have changes # * Two cases: first parent is the ancestor, second parent is the ancestor # * Merge starting as merge of commit with its own ancestor # * Two cases: has changes, doesn't have changes # * Two cases: first parent, or second parent blob mark :1 original-oid 0000000000000000000000000000000000000001 data 10 keepme v1 blob mark :2 original-oid 0000000000000000000000000000000000000002 data 10 nukeme v1 blob mark :3 original-oid 0000000000000000000000000000000000000003 data 10 nukeme v2 blob mark :4 original-oid 0000000000000000000000000000000000000004 data 13 sometimes v1 blob mark :5 original-oid 0000000000000000000000000000000000000005 data 10 nukeme v3 blob mark :6 original-oid 0000000000000000000000000000000000000006 data 10 keepme v2 blob mark :7 original-oid 0000000000000000000000000000000000000007 data 10 nukem4 v4 blob mark :8 original-oid 0000000000000000000000000000000000000008 data 13 sometimes v2 blob mark :9 original-oid 0000000000000000000000000000000000000009 data 13 
sometimes v3 blob mark :10 original-oid 000000000000000000000000000000000000000A data 10 nukeme v4 blob mark :11 original-oid 000000000000000000000000000000000000000B data 10 keepme v3 blob mark :12 original-oid 000000000000000000000000000000000000000C data 10 nukeme v5 blob mark :13 original-oid 000000000000000000000000000000000000000D data 10 nukeme v6 blob mark :14 original-oid 000000000000000000000000000000000000000E data 13 sometimes v4 blob mark :15 original-oid 000000000000000000000000000000000000000F data 13 sometimes v5 blob mark :16 original-oid 0000000000000000000000000000000000000010 data 10 keepme v4 blob mark :17 original-oid 0000000000000000000000000000000000000011 data 10 nukeme v7 blob mark :18 original-oid 0000000000000000000000000000000000000012 data 13 sometimes v6 blob mark :19 original-oid 0000000000000000000000000000000000000013 data 10 keepme v5 blob mark :20 original-oid 0000000000000000000000000000000000000014 data 13 sometimes v7 blob mark :21 original-oid 0000000000000000000000000000000000000015 data 10 nukeme v8 blob mark :22 original-oid 0000000000000000000000000000000000000016 data 10 keepme v6 commit refs/heads/master mark :26 original-oid 0000000000000000000000000000000000000020 author Full Name 2000000000 +0100 committer Full Name 2000000000 +0100 data 2 A M 100644 :1 moduleA/keepme commit refs/heads/master mark :27 original-oid 0000000000000000000000000000000000000021 author Full Name 2000010000 +0100 committer Full Name 2000010000 +0100 data 2 B from :26 M 100644 :2 moduleB/nukeme commit refs/heads/master mark :28 original-oid 0000000000000000000000000000000000000022 author Full Name 2000020000 +0100 committer Full Name 2000020000 +0100 data 2 C from :26 M 100644 :3 moduleB/nukeme commit refs/heads/master mark :29 original-oid 0000000000000000000000000000000000000023 author Full Name 2000030000 +0100 committer Full Name 2000030000 +0100 data 29 D: Merge commit 'C' into 'B' from :27 merge :28 M 100644 :4 moduleA/sometimes commit 
refs/heads/master mark :30 original-oid 0000000000000000000000000000000000000024 author Full Name 2000040000 +0100 committer Full Name 2000040000 +0100 data 2 F from :29 M 100644 :5 moduleB/nukeme commit refs/heads/master mark :31 original-oid 0000000000000000000000000000000000000025 author Full Name 2000050000 +0100 committer Full Name 2000050000 +0100 data 2 G from :30 M 100644 :6 moduleA/keepme commit refs/heads/master mark :32 original-oid 0000000000000000000000000000000000000026 author Full Name 2000060000 +0100 committer Full Name 2000060000 +0100 data 2 H from :31 M 100644 :7 moduleB/nukeme commit refs/heads/branchI mark :33 original-oid 0000000000000000000000000000000000000027 author Full Name 2000070000 +0100 committer Full Name 2000070000 +0100 data 29 I: Merge commit 'D' into 'H' from :32 merge :29 M 100644 :8 moduleA/sometimes commit refs/heads/master mark :34 original-oid 0000000000000000000000000000000000000028 author Full Name 2000080000 +0100 committer Full Name 2000080000 +0100 data 29 J: Merge commit 'H' into 'D' from :29 merge :32 M 100644 :9 moduleA/sometimes commit refs/heads/master mark :35 original-oid 0000000000000000000000000000000000000029 author Full Name 2000090000 +0100 committer Full Name 2000090000 +0100 data 2 K from :34 M 100644 :10 moduleB/nukeme commit refs/heads/master mark :36 original-oid 000000000000000000000000000000000000002A author Full Name 2000092000 +0100 committer Full Name 2000092000 +0100 data 2 L from :35 M 100644 :11 moduleA/keepme commit refs/heads/master mark :37 original-oid 000000000000000000000000000000000000002B author Full Name 2000094000 +0100 committer Full Name 2000094000 +0100 data 2 M from :36 M 100644 :12 moduleB/nukeme commit refs/heads/master mark :38 original-oid 000000000000000000000000000000000000002C author Full Name 2000096000 +0100 committer Full Name 2000096000 +0100 data 2 N from :28 M 100644 :13 moduleB/nukeme commit refs/heads/branchO mark :39 original-oid 
000000000000000000000000000000000000002D author Full Name 2000098000 +0100 committer Full Name 2000098000 +0100 data 29 O: Merge commit 'N' into 'M' from :37 merge :38 D moduleA/sometimes commit refs/heads/master mark :40 original-oid 000000000000000000000000000000000000002E author Full Name 2000099000 +0100 committer Full Name 2000099000 +0100 data 29 P: Merge commit 'M' into 'N' from :38 merge :37 M 100644 :15 moduleA/sometimes commit refs/heads/master mark :41 original-oid 0000000000000000000000000000000000000030 author Full Name 3000000000 +0100 committer Full Name 3000000000 +0100 data 2 Q from :40 M 100644 :16 moduleA/keepme commit refs/heads/master mark :42 original-oid 0000000000000000000000000000000000000031 author Full Name 3000010000 +0100 committer Full Name 3000010000 +0100 data 2 R from :41 M 100644 :17 moduleB/nukeme commit refs/heads/master mark :43 original-oid 0000000000000000000000000000000000000032 author Full Name 3000020000 +0100 committer Full Name 3000020000 +0100 data 29 S: Merge commit 'R' into 'R' from :42 merge :42 M 100644 :18 moduleA/sometimes commit refs/heads/master mark :44 original-oid 0000000000000000000000000000000000000033 author Full Name 3000030000 +0100 committer Full Name 3000030000 +0100 data 2 T from :43 M 100644 :19 moduleA/keepme commit refs/heads/master mark :45 original-oid 0000000000000000000000000000000000000034 author Full Name 3000040000 +0100 committer Full Name 3000040000 +0100 data 2 U from :44 M 100644 :20 moduleA/sometimes commit refs/heads/master mark :46 original-oid 0000000000000000000000000000000000000035 author Full Name 3000050000 +0100 committer Full Name 3000050000 +0100 data 29 V: Merge commit 'U' into 'U' from :45 merge :45 M 100644 :21 moduleB/nukeme commit refs/heads/master mark :47 original-oid 0000000000000000000000000000000000000036 author Full Name 3000060000 +0100 committer Full Name 3000060000 +0100 data 2 W from :46 M 100644 :22 moduleA/keepme done 
git-filter-repo-2.45.0/t/t9390/degenerate-globme000066400000000000000000000113061464611705400211500ustar00rootroot00000000000000feature done blob mark :1 data 10 keepme v1 reset refs/heads/master commit refs/heads/master mark :2 author Full Name 2000000000 +0100 committer Full Name 2000000000 +0100 data 2 A M 100644 :1 moduleA/keepme blob mark :3 data 10 nukeme v1 commit refs/heads/master mark :4 author Full Name 2000010000 +0100 committer Full Name 2000010000 +0100 data 2 B from :2 M 100644 :3 moduleB/nukeme blob mark :5 data 10 nukeme v2 commit refs/heads/master mark :6 author Full Name 2000020000 +0100 committer Full Name 2000020000 +0100 data 2 C from :2 M 100644 :5 moduleB/nukeme commit refs/heads/master mark :7 author Full Name 2000030000 +0100 committer Full Name 2000030000 +0100 data 29 D: Merge commit 'C' into 'B' from :4 merge :6 blob mark :8 data 10 nukeme v3 commit refs/heads/master mark :9 author Full Name 2000040000 +0100 committer Full Name 2000040000 +0100 data 2 F from :7 M 100644 :8 moduleB/nukeme blob mark :10 data 10 keepme v2 commit refs/heads/master mark :11 author Full Name 2000050000 +0100 committer Full Name 2000050000 +0100 data 2 G from :9 M 100644 :10 moduleA/keepme blob mark :12 data 10 nukem4 v4 commit refs/heads/master mark :13 author Full Name 2000060000 +0100 committer Full Name 2000060000 +0100 data 2 H from :11 M 100644 :12 moduleB/nukeme commit refs/heads/branchI mark :14 author Full Name 2000070000 +0100 committer Full Name 2000070000 +0100 data 29 I: Merge commit 'D' into 'H' from :13 merge :7 commit refs/heads/master mark :15 author Full Name 2000080000 +0100 committer Full Name 2000080000 +0100 data 29 J: Merge commit 'H' into 'D' from :7 merge :13 blob mark :16 data 10 nukeme v4 commit refs/heads/master mark :17 author Full Name 2000090000 +0100 committer Full Name 2000090000 +0100 data 2 K from :15 M 100644 :16 moduleB/nukeme blob mark :18 data 10 keepme v3 commit refs/heads/master mark :19 author Full Name 2000092000 +0100 
committer Full Name 2000092000 +0100 data 2 L from :17 M 100644 :18 moduleA/keepme blob mark :20 data 10 nukeme v5 commit refs/heads/master mark :21 author Full Name 2000094000 +0100 committer Full Name 2000094000 +0100 data 2 M from :19 M 100644 :20 moduleB/nukeme blob mark :22 data 10 nukeme v6 commit refs/heads/master mark :23 author Full Name 2000096000 +0100 committer Full Name 2000096000 +0100 data 2 N from :6 M 100644 :22 moduleB/nukeme commit refs/heads/branchO mark :24 author Full Name 2000098000 +0100 committer Full Name 2000098000 +0100 data 29 O: Merge commit 'N' into 'M' from :21 merge :23 commit refs/heads/master mark :25 author Full Name 2000099000 +0100 committer Full Name 2000099000 +0100 data 29 P: Merge commit 'M' into 'N' from :23 merge :21 blob mark :26 data 10 keepme v4 commit refs/heads/master mark :27 author Full Name 3000000000 +0100 committer Full Name 3000000000 +0100 data 2 Q from :25 M 100644 :26 moduleA/keepme blob mark :28 data 10 nukeme v7 commit refs/heads/master mark :29 author Full Name 3000010000 +0100 committer Full Name 3000010000 +0100 data 2 R from :27 M 100644 :28 moduleB/nukeme commit refs/heads/master mark :30 author Full Name 3000020000 +0100 committer Full Name 3000020000 +0100 data 29 S: Merge commit 'R' into 'R' from :29 merge :29 blob mark :31 data 10 keepme v5 commit refs/heads/master mark :32 author Full Name 3000030000 +0100 committer Full Name 3000030000 +0100 data 2 T from :30 M 100644 :31 moduleA/keepme blob mark :33 data 10 nukeme v8 commit refs/heads/master mark :34 author Full Name 3000050000 +0100 committer Full Name 3000050000 +0100 data 29 V: Merge commit 'U' into 'U' from :32 merge :32 M 100644 :33 moduleB/nukeme blob mark :35 data 10 keepme v6 commit refs/heads/master mark :36 author Full Name 3000060000 +0100 committer Full Name 3000060000 +0100 data 2 W from :34 M 100644 :35 moduleA/keepme done 
git-filter-repo-2.45.0/t/t9390/degenerate-keepme000066400000000000000000000034551464611705400211570ustar00rootroot00000000000000feature done blob mark :1 data 10 keepme v1 reset refs/heads/branchO commit refs/heads/branchO mark :2 author Full Name 2000000000 +0100 committer Full Name 2000000000 +0100 data 2 A M 100644 :1 moduleA/keepme blob mark :3 data 10 keepme v2 commit refs/heads/branchO mark :4 author Full Name 2000050000 +0100 committer Full Name 2000050000 +0100 data 2 G from :2 M 100644 :3 moduleA/keepme commit refs/heads/branchI mark :5 author Full Name 2000070000 +0100 committer Full Name 2000070000 +0100 data 29 I: Merge commit 'D' into 'H' from :4 merge :2 commit refs/heads/branchO mark :6 author Full Name 2000080000 +0100 committer Full Name 2000080000 +0100 data 29 J: Merge commit 'H' into 'D' from :2 merge :4 blob mark :7 data 10 keepme v3 commit refs/heads/branchO mark :8 author Full Name 2000092000 +0100 committer Full Name 2000092000 +0100 data 2 L from :6 M 100644 :7 moduleA/keepme blob mark :9 data 10 keepme v4 commit refs/heads/master mark :10 author Full Name 3000000000 +0100 committer Full Name 3000000000 +0100 data 2 Q from :8 M 100644 :9 moduleA/keepme blob mark :11 data 10 keepme v5 commit refs/heads/master mark :12 author Full Name 3000030000 +0100 committer Full Name 3000030000 +0100 data 2 T from :10 M 100644 :11 moduleA/keepme blob mark :13 data 10 keepme v6 commit refs/heads/master mark :14 author Full Name 3000060000 +0100 committer Full Name 3000060000 +0100 data 2 W from :12 M 100644 :13 moduleA/keepme done git-filter-repo-2.45.0/t/t9390/degenerate-keepme-noff000066400000000000000000000037741464611705400221110ustar00rootroot00000000000000feature done blob mark :1 data 10 keepme v1 reset refs/heads/master commit refs/heads/master mark :2 author Full Name 2000000000 +0100 committer Full Name 2000000000 +0100 data 2 A M 100644 :1 moduleA/keepme blob mark :3 data 10 keepme v2 commit refs/heads/branchO mark :4 author Full Name 
2000050000 +0100 committer Full Name 2000050000 +0100 data 2 G from :2 M 100644 :3 moduleA/keepme commit refs/heads/branchI mark :5 author Full Name 2000070000 +0100 committer Full Name 2000070000 +0100 data 29 I: Merge commit 'D' into 'H' from :4 merge :2 commit refs/heads/branchO mark :6 author Full Name 2000080000 +0100 committer Full Name 2000080000 +0100 data 29 J: Merge commit 'H' into 'D' from :2 merge :4 blob mark :7 data 10 keepme v3 commit refs/heads/branchO mark :8 author Full Name 2000092000 +0100 committer Full Name 2000092000 +0100 data 2 L from :6 M 100644 :7 moduleA/keepme commit refs/heads/master mark :9 author Full Name 2000099000 +0100 committer Full Name 2000099000 +0100 data 29 P: Merge commit 'M' into 'N' from :2 merge :8 blob mark :10 data 10 keepme v4 commit refs/heads/master mark :11 author Full Name 3000000000 +0100 committer Full Name 3000000000 +0100 data 2 Q from :9 M 100644 :10 moduleA/keepme blob mark :12 data 10 keepme v5 commit refs/heads/master mark :13 author Full Name 3000030000 +0100 committer Full Name 3000030000 +0100 data 2 T from :11 M 100644 :12 moduleA/keepme blob mark :14 data 10 keepme v6 commit refs/heads/master mark :15 author Full Name 3000060000 +0100 committer Full Name 3000060000 +0100 data 2 W from :13 M 100644 :14 moduleA/keepme done git-filter-repo-2.45.0/t/t9390/degenerate-moduleA000066400000000000000000000066221464611705400212760ustar00rootroot00000000000000feature done blob mark :1 data 10 keepme v1 reset refs/heads/master commit refs/heads/master mark :2 author Full Name 2000000000 +0100 committer Full Name 2000000000 +0100 data 2 A M 100644 :1 moduleA/keepme blob mark :3 data 13 sometimes v1 commit refs/heads/master mark :4 author Full Name 2000030000 +0100 committer Full Name 2000030000 +0100 data 29 D: Merge commit 'C' into 'B' from :2 merge :2 M 100644 :3 moduleA/sometimes blob mark :5 data 10 keepme v2 commit refs/heads/master mark :6 author Full Name 2000050000 +0100 committer Full Name 2000050000 
+0100 data 2 G from :4 M 100644 :5 moduleA/keepme blob mark :7 data 13 sometimes v2 commit refs/heads/branchI mark :8 author Full Name 2000070000 +0100 committer Full Name 2000070000 +0100 data 29 I: Merge commit 'D' into 'H' from :6 merge :4 M 100644 :7 moduleA/sometimes blob mark :9 data 13 sometimes v3 commit refs/heads/master mark :10 author Full Name 2000080000 +0100 committer Full Name 2000080000 +0100 data 29 J: Merge commit 'H' into 'D' from :4 merge :6 M 100644 :9 moduleA/sometimes blob mark :11 data 10 keepme v3 commit refs/heads/master mark :12 author Full Name 2000092000 +0100 committer Full Name 2000092000 +0100 data 2 L from :10 M 100644 :11 moduleA/keepme commit refs/heads/branchO mark :13 author Full Name 2000098000 +0100 committer Full Name 2000098000 +0100 data 29 O: Merge commit 'N' into 'M' from :12 merge :2 D moduleA/sometimes blob mark :14 data 13 sometimes v5 commit refs/heads/master mark :15 author Full Name 2000099000 +0100 committer Full Name 2000099000 +0100 data 29 P: Merge commit 'M' into 'N' from :2 merge :12 M 100644 :14 moduleA/sometimes blob mark :16 data 10 keepme v4 commit refs/heads/master mark :17 author Full Name 3000000000 +0100 committer Full Name 3000000000 +0100 data 2 Q from :15 M 100644 :16 moduleA/keepme blob mark :18 data 13 sometimes v6 commit refs/heads/master mark :19 author Full Name 3000020000 +0100 committer Full Name 3000020000 +0100 data 29 S: Merge commit 'R' into 'R' from :17 merge :17 M 100644 :18 moduleA/sometimes blob mark :20 data 10 keepme v5 commit refs/heads/master mark :21 author Full Name 3000030000 +0100 committer Full Name 3000030000 +0100 data 2 T from :19 M 100644 :20 moduleA/keepme blob mark :22 data 13 sometimes v7 commit refs/heads/master mark :23 author Full Name 3000040000 +0100 committer Full Name 3000040000 +0100 data 2 U from :21 M 100644 :22 moduleA/sometimes commit refs/heads/master mark :24 author Full Name 3000050000 +0100 committer Full Name 3000050000 +0100 data 29 V: Merge commit 
'U' into 'U' from :23 merge :23 blob mark :25 data 10 keepme v6 commit refs/heads/master mark :26 author Full Name 3000060000 +0100 committer Full Name 3000060000 +0100 data 2 W from :24 M 100644 :25 moduleA/keepme done git-filter-repo-2.45.0/t/t9390/empty000066400000000000000000000060401464611705400167370ustar00rootroot00000000000000feature done # Simple repo with only two files, with a whole bunch of cases dealing with # empty pruning, particularly commits that start empty. # # As with case1, the original-oid directives are very fake, but if an error # is hit that shows one of these, it makes it really easy to know where it # came from. blob mark :1 original-oid 0000000000000000000000000000000000000001 data 10 nukeme v1 blob mark :2 original-oid 0000000000000000000000000000000000000002 data 10 keepme v1 blob mark :3 original-oid 0000000000000000000000000000000000000003 data 10 nukeme v2 blob mark :4 original-oid 0000000000000000000000000000000000000004 data 10 keepme v2 commit refs/heads/master mark :5 original-oid 0000000000000000000000000000000000000010 author Full Name 1000000000 +0100 committer Full Name 1000000000 +0100 data 2 A commit refs/heads/master mark :6 original-oid 0000000000000000000000000000000000000011 author Full Name 1000010000 +0100 committer Full Name 1000010000 +0100 data 2 B from :5 reset refs/heads/master commit refs/heads/master mark :7 original-oid 0000000000000000000000000000000000000012 author Full Name 1000020000 +0100 committer Full Name 1000020000 +0100 data 2 C M 100644 :1 nukeme commit refs/heads/master mark :8 original-oid 0000000000000000000000000000000000000013 author Full Name 1000030000 +0100 committer Full Name 1000030000 +0100 data 2 D from :7 commit refs/heads/master mark :9 original-oid 0000000000000000000000000000000000000014 author Full Name 1000040000 +0100 committer Full Name 1000040000 +0100 data 29 E: Merge commit 'D' into 'B' from :6 merge :8 M 100644 :2 keepme commit refs/heads/master mark :10 original-oid 
0000000000000000000000000000000000000015 author Full Name 1000050000 +0100 committer Full Name 1000050000 +0100 data 29 F: Merge commit 'D' into 'B' from :6 merge :8 commit refs/heads/master mark :11 original-oid 0000000000000000000000000000000000000016 author Full Name 1000060000 +0100 committer Full Name 1000060000 +0100 data 2 G from :9 M 100644 :3 nukeme commit refs/heads/master mark :12 original-oid 0000000000000000000000000000000000000017 author Full Name 1000070000 +0100 committer Full Name 1000070000 +0100 data 2 H from :11 commit refs/heads/master mark :13 original-oid 0000000000000000000000000000000000000018 author Full Name 1000080000 +0100 committer Full Name 1000080000 +0100 data 2 I from :10 M 100644 :4 keepme commit refs/heads/master mark :14 original-oid 0000000000000000000000000000000000000019 author Full Name 1000090000 +0100 committer Full Name 1000090000 +0100 data 29 J: Merge commit 'I' into 'H' from :12 merge :13 done git-filter-repo-2.45.0/t/t9390/empty-keepme000066400000000000000000000020331464611705400202010ustar00rootroot00000000000000feature done reset refs/heads/master commit refs/heads/master mark :1 author Full Name 1000000000 +0100 committer Full Name 1000000000 +0100 data 2 A commit refs/heads/master mark :2 author Full Name 1000010000 +0100 committer Full Name 1000010000 +0100 data 2 B from :1 blob mark :3 data 10 keepme v1 commit refs/heads/master mark :4 author Full Name 1000040000 +0100 committer Full Name 1000040000 +0100 data 29 E: Merge commit 'D' into 'B' from :2 M 100644 :3 keepme blob mark :5 data 10 keepme v2 commit refs/heads/master mark :6 author Full Name 1000080000 +0100 committer Full Name 1000080000 +0100 data 2 I from :2 M 100644 :5 keepme commit refs/heads/master mark :7 author Full Name 1000090000 +0100 committer Full Name 1000090000 +0100 data 29 J: Merge commit 'I' into 'H' from :4 merge :6 done 
git-filter-repo-2.45.0/t/t9390/less-empty-keepme000066400000000000000000000036601464611705400211540ustar00rootroot00000000000000feature done reset refs/heads/master commit refs/heads/master mark :1 author Full Name 1000000000 +0100 committer Full Name 1000000000 +0100 data 2 A commit refs/heads/master mark :2 author Full Name 1000010000 +0100 committer Full Name 1000010000 +0100 data 2 B from :1 reset refs/heads/master commit refs/heads/master mark :3 author Full Name 1000020000 +0100 committer Full Name 1000020000 +0100 data 2 C commit refs/heads/master mark :4 author Full Name 1000030000 +0100 committer Full Name 1000030000 +0100 data 2 D from :3 blob mark :5 data 10 keepme v1 commit refs/heads/master mark :6 author Full Name 1000040000 +0100 committer Full Name 1000040000 +0100 data 29 E: Merge commit 'D' into 'B' from :2 merge :4 M 100644 :5 keepme commit refs/heads/master mark :7 author Full Name 1000060000 +0100 committer Full Name 1000060000 +0100 data 2 G from :6 commit refs/heads/master mark :8 author Full Name 1000070000 +0100 committer Full Name 1000070000 +0100 data 2 H from :7 commit refs/heads/master mark :9 author Full Name 1000050000 +0100 committer Full Name 1000050000 +0100 data 29 F: Merge commit 'D' into 'B' from :2 merge :4 blob mark :10 data 10 keepme v2 commit refs/heads/master mark :11 author Full Name 1000080000 +0100 committer Full Name 1000080000 +0100 data 2 I from :9 M 100644 :10 keepme commit refs/heads/master mark :12 author Full Name 1000090000 +0100 committer Full Name 1000090000 +0100 data 29 J: Merge commit 'I' into 'H' from :8 merge :11 done git-filter-repo-2.45.0/t/t9390/more-empty-keepme000066400000000000000000000013271464611705400211460ustar00rootroot00000000000000feature done blob mark :1 data 10 keepme v1 reset refs/heads/master commit refs/heads/master mark :2 author Full Name 1000040000 +0100 committer Full Name 1000040000 +0100 data 29 E: Merge commit 'D' into 'B' M 100644 :1 keepme blob mark :3 data 10 keepme v2 reset 
refs/heads/master commit refs/heads/master mark :4 author Full Name 1000080000 +0100 committer Full Name 1000080000 +0100 data 2 I M 100644 :3 keepme commit refs/heads/master mark :5 author Full Name 1000090000 +0100 committer Full Name 1000090000 +0100 data 29 J: Merge commit 'I' into 'H' from :2 merge :4 done git-filter-repo-2.45.0/t/t9390/sample-mailmap000066400000000000000000000004261464611705400205020ustar00rootroot00000000000000Little 'ol Me # Here is a comment Little 'ol Me Little O. Me Little 'ol Me Little 'ol Me Little Me Little John little.john <> git-filter-repo-2.45.0/t/t9390/sample-message000066400000000000000000000001231464611705400205000ustar00rootroot00000000000000Initial==>Modified regex:tw.nty==>the number 20 v1.0==>version one! regex:!$==> :) git-filter-repo-2.45.0/t/t9390/sample-replace000066400000000000000000000000331464611705400204670ustar00rootroot00000000000000mod==>modified-by-gremlins git-filter-repo-2.45.0/t/t9390/unusual000066400000000000000000000013741464611705400173020ustar00rootroot00000000000000option git quiet feature done # Input in a format filter-repo isn't generally expected to receive (either # because we don't pass certain flags to fast-export or repos don't have the # weird features or whatever other reason), but which we want to test for # completeness. progress I am starting the import, yo. 
checkpoint blob mark :1 original-oid 0000000000000000000000000000000000000001 data 5 hello commit refs/heads/master mark :2 original-oid 0000000000000000000000000000000000000002 committer Srinivasa Ramanujan 1535228562 +051800 data 8 Initial M 100644 :1 greeting reset refs/heads/develop from :2 tag v1.0 from :2 original-oid 0000000000000000000000000000000000000003 tagger little.john <> 1535229618 -0700 data 4 v1.0 done git-filter-repo-2.45.0/t/t9390/unusual-filtered000066400000000000000000000005761464611705400211010ustar00rootroot00000000000000feature done blob mark :1 data 5 hello reset refs/heads/develop commit refs/heads/develop mark :2 author Srinivasa Ramanujan 1535228562 +051800 committer Srinivasa Ramanujan 1535228562 +051800 data 8 Initial M 100644 :1 greeting reset refs/heads/master from :2 tag v1.0 from :2 tagger little.john <> 1535229618 -0700 data 4 v1.0 done git-filter-repo-2.45.0/t/t9390/unusual-mailmap000066400000000000000000000006161464611705400207160ustar00rootroot00000000000000feature done blob mark :1 data 5 hello reset refs/heads/develop commit refs/heads/develop mark :2 author Srinivasa Ramanujan 1535228562 +051800 committer Srinivasa Ramanujan 1535228562 +051800 data 8 Initial M 100644 :1 greeting reset refs/heads/master from :2 tag v1.0 from :2 tagger Little John 1535229618 -0700 data 4 v1.0 done git-filter-repo-2.45.0/t/t9391-filter-repo-lib-usage.sh000077500000000000000000000145651464611705400223250ustar00rootroot00000000000000#!/bin/bash test_description='Usage of git-filter-repo as a library' . 
./test-lib.sh # for git_filter_repo.py import case "$(uname -s)" in MINGW*|MSYS) export PYTHONPATH=$(cygpath -am $TEST_DIRECTORY/..)\;$PYTHONPATH ;; *) export PYTHONPATH=$(dirname $TEST_DIRECTORY):$PYTHONPATH ;; esac # Avoid writing git_filter_repo.pyc file export PYTHONDONTWRITEBYTECODE=1 export CONTRIB_DIR=$TEST_DIRECTORY/../contrib/filter-repo-demos DATA="$TEST_DIRECTORY/t9391" setup() { git init $1 && ( cd $1 && echo hello > world && git add world && test_tick && git commit -m initial && printf "The launch code is 1-2-3-4." > secret && git add secret && test_tick && git commit -m "Sssh. Dont tell no one" && echo A file that you cant trust > file.doc && echo there >> world && git add file.doc world && test_tick && printf "Random useless changes\n\nLet us be like the marketing group. Marketing is staffed with pansies" | git commit -F - && echo Do not use a preposition to end a setence with > advice && git add advice && test_tick && GIT_AUTHOR_NAME="Copy N. Paste" git commit -m "hypocrisy is fun" && echo Avoid cliches like the plague >> advice && test_tick && GIT_AUTHOR_EMAIL="foo@my.crp" git commit -m "it is still fun" advice && echo " \$Id: A bunch of junk$" > foobar.c && git add foobar.c && test_tick && git commit -m "Brain damage" ) } test_expect_success 'commit_info.py' ' setup commit_info && ( cd commit_info && $TEST_DIRECTORY/t9391/commit_info.py && test 0e5a1029 = $(git rev-parse --short=8 --verify refs/heads/master) ) ' test_expect_success 'file_filter.py' ' setup file_filter && ( cd file_filter && $TEST_DIRECTORY/t9391/file_filter.py && test ee59e2b4 = $(git rev-parse --short=8 --verify refs/heads/master) ) ' test_expect_success 'print_progress.py' ' setup print_progress && ( cd print_progress && MASTER=$(git rev-parse --verify master) && $TEST_DIRECTORY/t9391/print_progress.py . 
new && test $MASTER = $(git rev-parse --verify refs/heads/master) ) ' test_expect_success 'rename-master-to-develop.py' ' setup rename_master_to_develop && ( cd rename_master_to_develop && MASTER=$(git rev-parse --verify master) && $TEST_DIRECTORY/t9391/rename-master-to-develop.py && test $MASTER = $(git rev-parse --verify refs/heads/develop) ) ' test_expect_success 'strip-cvs-keywords.py' ' setup strip_cvs_keywords && ( cd strip_cvs_keywords && $TEST_DIRECTORY/t9391/strip-cvs-keywords.py test 2306fc7c = $(git rev-parse --short=8 --verify refs/heads/master) ) ' test_expect_success 'setup two extra repositories' ' mkdir repo1 && cd repo1 && git init && echo hello > world && git add world && test_tick && git commit -m "Commit A" && echo goodbye > world && git add world && test_tick && git commit -m "Commit C" && cd .. && mkdir repo2 && cd repo2 && git init && echo foo > bar && git add bar && test_tick && git commit -m "Commit B" && echo fooey > bar && git add bar && test_tick && git commit -m "Commit D" && cd .. 
' test_expect_success 'splice_repos.py' ' git init splice_repos && $TEST_DIRECTORY/t9391/splice_repos.py repo1 repo2 splice_repos && test 4 = $(git -C splice_repos rev-list master | wc -l) ' test_expect_success 'create_fast_export_output.py' ' git init create_fast_export_output && (cd create_fast_export_output && $TEST_DIRECTORY/t9391/create_fast_export_output.py && test e5e0569b = $(git rev-parse --short=8 --verify refs/heads/master) && test 122ead00 = $(git rev-parse --short=8 --verify refs/heads/devel) && test f36143f9 = $(git rev-parse --short=8 --verify refs/tags/v1.0)) ' test_expect_success 'unusual.py' ' setup unusual && ( cd unusual && cat $TEST_DIRECTORY/t9390/unusual | \ $TEST_DIRECTORY/t9391/unusual.py >output && grep "Decipher this: .oy ,tropmi eht gnitrats ma I" output && grep "Found 2 blobs/commits and 4 other objects" output ) ' test_expect_success 'erroneous.py' ' setup erroneous && ( cd erroneous && test_must_fail $TEST_DIRECTORY/t9391/erroneous.py 2>../err && test_i18ngrep "Error: Cannot pass a tag_callback to RepoFilter AND pass --tag-callback" ../err ) ' test_expect_success 'other error cases' ' GIT_CEILING_DIRECTORIES=$(pwd) && export GIT_CEILING_DIRECTORIES && ( mkdir other && cd other && ! python3 -c "import git_filter_repo as fr; fr.GitUtils.get_commit_count(b\".\", [\"HEAD\"])" 2>err && test_i18ngrep "\. does not appear to be a valid git repository" err ) ' test_lazy_prereq DOS2UNIX ' dos2unix -h test $? 
-ne 127 ' test_expect_success 'lint-history' ' test_create_repo lint-history && ( cd lint-history && git config core.autocrlf false && echo initial >content && git add content && git commit -m "initial" && printf "CRLF is stupid\r\n" >content && git add content && git commit -m "make a statement" && printf "CRLF is stupid\n" >content && git add content && git commit -m "oops, that was embarassing" && if test_have_prereq DOS2UNIX then $CONTRIB_DIR/lint-history --filenames-important dos2unix && echo 2 >expect && git rev-list --count HEAD >actual && test_cmp expect actual fi ) ' test_expect_success !WINDOWS 'lint-history --refs' ' test_create_repo lint-history-only-some-refs && ( cd lint-history-only-some-refs && test_commit a somefile bad && test_commit b notherfile baaad && test_commit c whatever baaaaaad && git checkout -b mybranch HEAD~1 && test_commit d somefile baaaaaaaad && test_commit e whatever "baaaaaaaaaad to the bone" && cat <<-EOF >linter.sh && #!/bin/bash cat \$1 | tr -d a >tmp mv tmp \$1 EOF chmod u+x linter.sh && PATH=$PATH:. $CONTRIB_DIR/lint-history --refs master..mybranch -- linter.sh && echo bd >expect && echo bd to the bone >long-expect && # Verify master is untouched git checkout master && ! test_cmp somefile expect && ! test_cmp notherfile expect && ! test_cmp whatever expect && # Verify that files touched on the branch are tweaked git checkout mybranch && test_cmp somefile expect && ! 
test_cmp notherfile expect && test_cmp whatever long-expect ) ' test_expect_success 'clean-ignore with emoji in filenames' ' test_create_repo clean-ignore && ( cd clean-ignore && git fast-import --quiet <$DATA/emoji-repo && git reset --hard && $CONTRIB_DIR/clean-ignore --force && printf ".gitignore\nfilename\n" >expect && git ls-files >actual && test_cmp expect actual ) ' test_done git-filter-repo-2.45.0/t/t9391/000077500000000000000000000000001464611705400156575ustar00rootroot00000000000000git-filter-repo-2.45.0/t/t9391/commit_info.py000077500000000000000000000017441464611705400205450ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo """ import re import datetime import git_filter_repo as fr def change_up_them_commits(commit, metadata): # Change the commit author if commit.author_name == b"Copy N. Paste": commit.author_name = b"Ima L. Oser" commit.author_email = b"aloser@my.corp" # Fix the author email commit.author_email = re.sub(b"@my.crp", b"@my.corp", commit.author_email) # Fix the committer date (bad timezone conversion in initial import) oldtime = fr.string_to_date(commit.committer_date) newtime = oldtime + datetime.timedelta(hours=-5) commit.committer_date = fr.date_to_string(newtime) # Fix the commit message commit.message = re.sub(b"Marketing is staffed with pansies", b"", commit.message) args = fr.FilteringOptions.parse_args(['--force']) filter = fr.RepoFilter(args, commit_callback = change_up_them_commits) filter.run() git-filter-repo-2.45.0/t/t9391/create_fast_export_output.py000077500000000000000000000102741464611705400235410ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. 
""" import git_filter_repo as fr from git_filter_repo import Blob, Reset, FileChange, Commit, Tag, FixedTimeZone from git_filter_repo import Progress, Checkpoint from datetime import datetime, timedelta args = fr.FilteringOptions.default_options() out = fr.RepoFilter(args) out.importer_only() world = Blob(b"Hello") out.insert(world) bar = Blob(b"foo\n") out.insert(bar) master = Reset(b"refs/heads/master") out.insert(master) changes = [FileChange(b'M', b'world', world.id, mode=b"100644"), FileChange(b'M', b'bar', bar.id, mode=b"100644")] when = datetime(year=2005, month=4, day=7, hour=15, minute=16, second=10, tzinfo=FixedTimeZone(b"-0700")) when_string = fr.date_to_string(when) commit1 = Commit(b"refs/heads/master", b"A U Thor", b"au@thor.email", when_string, b"Com M. Iter", b"comm@iter.email", when_string, b"My first commit! Wooot!\n\nLonger description", changes, parents = []) out.insert(commit1) world = Blob(b"Hello\nHi") out.insert(world) world_link = Blob(b"world") out.insert(world_link) changes = [FileChange(b'M', b'world', world.id, mode=b"100644"), FileChange(b'M', b'planet', world_link.id, mode=b"120000")] when += timedelta(days=3, hours=4, minutes=6) when_string = fr.date_to_string(when) commit2 = Commit(b"refs/heads/master", b"A U Thor", b"au@thor.email", when_string, b"Com M. Iter", b"comm@iter.email", when_string, b"Make a symlink to world called planet, modify world", changes, parents = [commit1.id]) out.insert(commit2) script = Blob(b"#!/bin/sh\n\necho Hello") out.insert(script) changes = [FileChange(b'M', b'runme', script.id, mode=b"100755"), FileChange(b'D', b'bar')] when_string = b"1234567890 -0700" commit3 = Commit(b"refs/heads/master", b"A U Thor", b"au@thor.email", when_string, b"Com M. 
Iter", b"comm@iter.email", when_string, b"Add runme script, remove bar", changes, parents = [commit2.id]) out.insert(commit3) progress = Progress(b"Done with the master branch now...") out.insert(progress) checkpoint = Checkpoint() out.insert(checkpoint) devel = Reset(b"refs/heads/devel", commit1.id) out.insert(devel) world = Blob(b"Hello\nGoodbye") out.insert(world) changes = [FileChange(b'DELETEALL'), FileChange(b'M', b'world', world.id, mode=b"100644"), FileChange(b'M', b'bar', bar.id, mode=b"100644")] when = datetime(2006, 8, 17, tzinfo=FixedTimeZone(b"+0200")) when_string = fr.date_to_string(when) commit4 = Commit(b"refs/heads/devel", b"A U Thor", b"au@thor.email", when_string, b"Com M. Iter", b"comm@iter.email", when_string, b"Modify world", changes, parents = [commit1.id]) out.insert(commit4) world = Blob(b"Hello\nHi\nGoodbye") out.insert(world) when = fr.string_to_date(commit3.author_date) + timedelta(days=47) when_string = fr.date_to_string(when) # git fast-import requires file changes to be listed in terms of differences # to the first parent. Thus, despite the fact that runme and planet have # not changed and bar was not modified in the devel side, we have to list them # all anyway. changes = [FileChange(b'M', b'world', world.id, mode=b"100644"), FileChange(b'D', b'bar'), FileChange(b'M', b'runme', script.id, mode=b"100755"), FileChange(b'M', b'planet', world_link.id, mode=b"120000")] commit5 = Commit(b"refs/heads/devel", b"A U Thor", b"au@thor.email", when_string, b"Com M. Iter", b"comm@iter.email", when_string, b"Merge branch 'master'\n", changes, parents = [commit4.id, commit3.id]) out.insert(commit5) mytag = Tag(b"refs/tags/v1.0", commit5.id, b"His R. 
Highness", b"royalty@my.kingdom", when_string, b"I bequeath to my peons this royal software") out.insert(mytag) out.finish() git-filter-repo-2.45.0/t/t9391/emoji-repo000066400000000000000000000005401464611705400176470ustar00rootroot00000000000000feature done blob mark :1 data 8 initial blob mark :2 data 5 lock blob mark :3 data 11 *.bak 🔒 reset refs/heads/master commit refs/heads/master mark :4 author Little O. Me 1535228562 -0700 committer Little O. Me 1535228562 -0700 data 10 My commit M 100644 :1 filename M 100644 :2 🔒 M 100644 :3 .gitignore done git-filter-repo-2.45.0/t/t9391/erroneous.py000077500000000000000000000005631464611705400202610ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo """ import git_filter_repo as fr def handle_tag(tag): print("Tagger: "+''.join(tag.tagger_name)) args = fr.FilteringOptions.parse_args(['--force', '--tag-callback', 'pass']) filter = fr.RepoFilter(args, tag_callback = handle_tag) filter.run() git-filter-repo-2.45.0/t/t9391/file_filter.py000077500000000000000000000014341464611705400205220ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. """ import sys import git_filter_repo as fr def drop_file_by_contents(blob, metadata): bad_file_contents = b'The launch code is 1-2-3-4.' 
if blob.data == bad_file_contents: blob.skip() def drop_files_by_name(commit, metadata): new_file_changes = [] for change in commit.file_changes: if not change.filename.endswith(b'.doc'): new_file_changes.append(change) commit.file_changes = new_file_changes sys.argv.append('--force') args = fr.FilteringOptions.parse_args(sys.argv[1:]) filter = fr.RepoFilter(args, blob_callback = drop_file_by_contents, commit_callback = drop_files_by_name) filter.run() git-filter-repo-2.45.0/t/t9391/print_progress.py000077500000000000000000000021421464611705400213130ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo """ import sys import git_filter_repo as fr if len(sys.argv) != 3: raise SystemExit("Syntax:\n %s SOURCE_REPO TARGET_REPO") source_repo = sys.argv[1].encode() target_repo = sys.argv[2].encode() total_objects = fr.GitUtils.get_total_objects(source_repo) # blobs+trees total_commits = fr.GitUtils.get_commit_count(source_repo) object_count = 0 commit_count = 0 def print_progress(): global object_count, commit_count, total_objects, total_commits print("\rRewriting commits... %d/%d (%d objects)" % (commit_count, total_commits, object_count), end='') def my_blob_callback(blob, metadata): global object_count object_count += 1 print_progress() def my_commit_callback(commit, metadata): global commit_count commit_count += 1 print_progress() args = fr.FilteringOptions.parse_args(['--force', '--quiet']) filter = fr.RepoFilter(args, blob_callback = my_blob_callback, commit_callback = my_commit_callback) filter.run() git-filter-repo-2.45.0/t/t9391/rename-master-to-develop.py000077500000000000000000000006571464611705400230600ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. 
""" import git_filter_repo as fr def my_commit_callback(commit, metadata): if commit.branch == b"refs/heads/master": commit.branch = b"refs/heads/develop" args = fr.FilteringOptions.default_options() args.force = True filter = fr.RepoFilter(args, commit_callback = my_commit_callback) filter.run() git-filter-repo-2.45.0/t/t9391/splice_repos.py000077500000000000000000000053021464611705400207230ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. Also, note that splicing repos may need some special care as fast-export only shows the files that changed relative to the first parent, so there may be gotchas if you are to splice near merge commits; this example does not try to handle any such special cases. """ import re import sys import git_filter_repo as fr class InterleaveRepositories: def __init__(self, repo1, repo2, output_dir): self.repo1 = repo1 self.repo2 = repo2 self.output_dir = output_dir self.commit_map = {} self.last_commit = None def skip_reset(self, reset, metadata): reset.skip() def hold_commit(self, commit, metadata): commit.skip(new_id = commit.id) letter = re.match(b'Commit (.)', commit.message).group(1) self.commit_map[letter] = commit def weave_commit(self, commit, metadata): letter = re.match(b'Commit (.)', commit.message).group(1) prev_letter = bytes([ord(letter)-1]) # Splice in any extra commits needed if prev_letter in self.commit_map: new_commit = self.commit_map[prev_letter] new_commit.dumped = 0 new_commit.parents = [self.last_commit] if self.last_commit else [] # direct_insertion=True to avoid weave_commit being called recursively # on the same commit self.out.insert(new_commit, direct_insertion = True) commit.parents = [new_commit.id] # Dump our commit now self.out.insert(commit, direct_insertion = True) # Make sure that commits that depended on new_commit.id will now depend # on commit.id if prev_letter in self.commit_map: self.last_commit = 
commit.id fr.record_id_rename(new_commit.id, commit.id) def run(self): blob = fr.Blob(b'public gpg key contents') tag = fr.Tag(b'gpg-pubkey', blob.id, b'Ima Tagger', b'ima@tagg.er', b'1136199845 +0300', b'Very important explanation and stuff') args = fr.FilteringOptions.parse_args(['--target', self.output_dir]) out = fr.RepoFilter(args) out.importer_only() self.out = out i1args = fr.FilteringOptions.parse_args(['--source', self.repo1]) i1 = fr.RepoFilter(i1args, reset_callback = self.skip_reset, commit_callback = self.hold_commit) i1.set_output(out) i1.run() i2args = fr.FilteringOptions.parse_args(['--source', self.repo2]) i2 = fr.RepoFilter(i2args, commit_callback = self.weave_commit) i2.set_output(out) i2.run() out.insert(blob) out.insert(tag) out.finish() splicer = InterleaveRepositories(sys.argv[1], sys.argv[2], sys.argv[3]) splicer.run() git-filter-repo-2.45.0/t/t9391/strip-cvs-keywords.py000077500000000000000000000012251464611705400220330ustar00rootroot00000000000000#!/usr/bin/env python3 """ Please see the ***** API BACKWARD COMPATIBILITY CAVEAT ***** near the top of git-filter-repo. """ import re import git_filter_repo as fr def strip_cvs_keywords(blob, metadata): # FIXME: Should first check if blob is a text file to avoid ruining # binaries. Could use python.magic here, or just output blob.data to # the unix 'file' command pattern = br'\$(Id|Date|Source|Header|CVSHeader|Author|Revision):.*\$' replacement = br'$\1$' blob.data = re.sub(pattern, replacement, blob.data) args = fr.FilteringOptions.parse_args(['--force']) filter = fr.RepoFilter(args, blob_callback = strip_cvs_keywords) filter.run() git-filter-repo-2.45.0/t/t9391/unusual.py000077500000000000000000000107641464611705400177400ustar00rootroot00000000000000#!/usr/bin/env python3 # Please: DO NOT USE THIS AS AN EXAMPLE. 
# # This file is NOT for demonstration of how to use git-filter-repo as a # libary; it exists to test corner cases or otherwise unusual inputs, and # to verify some invariants that git-filter-repo currently aims to maintain # (these invariants might be different in future versions of # git-filter-repo). As such, it reaches deep into the internals and does # weird things that you should probably avoid in your usage of # git-filter-repo. Any code in this testcase is much more likely to have # API breaks than other files in t9391. import collections import os import random import io import sys import textwrap import git_filter_repo as fr total_objects = {'common': 0, 'uncommon': 0} def track_everything(obj, *_ignored): if type(obj) == fr.Blob or type(obj) == fr.Commit: total_objects['common'] += 1 else: total_objects['uncommon'] += 1 if type(obj) == fr.Reset: def assert_not_reached(x): raise SystemExit("should have been skipped!") obj.dump = assert_not_reached obj.skip() if hasattr(obj, 'id') and type(obj) != fr.Tag: # The creation of myblob should cause objects in stream to get their ids # increased by 1; this shouldn't be depended upon as API by external # projects, I'm just verifying an invariant of the current code. assert fr._IDS._reverse_translation[obj.id] == [obj.id - 1] def handle_progress(progress): print(b"Decipher this: "+bytes(reversed(progress.message))) track_everything(progress) def handle_checkpoint(checkpoint_object): # Flip a coin; see if we want to pass the checkpoint through. if random.randint(0,1) == 0: checkpoint_object.dump(parser._output) track_everything(checkpoint_object) mystr = b'This is the contents of the blob' compare = b"Blob:\n blob\n mark :1\n data %d\n %s" % (len(mystr), mystr) # Next line's only purpose is testing code coverage of something that helps # debugging git-filter-repo; it is NOT something external folks should depend # upon. 
myblob = fr.Blob(mystr) assert bytes(myblob) == compare # Everyone should be using RepoFilter objects, not FastExportParser. But for # testing purposes... parser = fr.FastExportParser(blob_callback = track_everything, reset_callback = track_everything, commit_callback = track_everything, tag_callback = track_everything, progress_callback = handle_progress, checkpoint_callback = handle_checkpoint) parser.run(input = sys.stdin.detach(), output = open(os.devnull, 'bw')) # DO NOT depend upon or use _IDS directly you external script writers. I'm # only testing here for code coverage; the capacity exists to help debug # git-filter-repo itself, not for external folks to use. assert str(fr._IDS).startswith("Current count: 5") print("Found {} blobs/commits and {} other objects" .format(total_objects['common'], total_objects['uncommon'])) stream = io.BytesIO(textwrap.dedent(''' blob mark :1 data 5 hello commit refs/heads/A mark :2 author Just Me 1234567890 -0200 committer Just Me 1234567890 -0200 data 2 A commit refs/heads/B mark :3 author Just Me 1234567890 -0200 committer Just Me 1234567890 -0200 data 2 B from :2 M 100644 :1 greeting reset refs/heads/B from :3 commit refs/heads/C mark :4 author Just Me 1234567890 -0200 committer Just Me 1234567890 -0200 data 2 C from :3 M 100644 :1 salutation '''[1:]).encode()) counts = collections.Counter() def look_for_reset(obj, metadata): print("Processing {}".format(obj)) counts[type(obj)] += 1 if type(obj) == fr.Reset: assert obj.ref == b'refs/heads/B' # Use all kinds of internals that external scripts should NOT use and which # are likely to break in the future, just to verify a few invariants... 
args = fr.FilteringOptions.parse_args(['--stdin', '--dry-run', '--path', 'salutation']) filter = fr.RepoFilter(args, blob_callback = look_for_reset, reset_callback = look_for_reset, commit_callback = look_for_reset, tag_callback = look_for_reset) filter._input = stream filter._setup_output() filter._sanity_checks_handled = True filter.run() assert counts == collections.Counter({fr.Blob: 1, fr.Commit: 3, fr.Reset: 1}) git-filter-repo-2.45.0/t/t9392-python-callback.sh000077500000000000000000000131631464611705400212740ustar00rootroot00000000000000#!/bin/bash test_description='Usage of git-filter-repo with python callbacks' . ./test-lib.sh export PATH=$(dirname $TEST_DIRECTORY):$PATH # Put git-filter-repo in PATH setup() { git init $1 && ( cd $1 && echo hello > world && git add world && test_tick && git commit -m initial && printf "The launch code is 1-2-3-4." > secret && git add secret && test_tick && git commit -m "Sssh. Dont tell no one" && echo A file that you cant trust > file.doc && echo there >> world && git add file.doc world && test_tick && printf "Random useless changes\n\nLet us be like the marketing group. Marketing is staffed with pansies" | git commit -F - && echo Do not use a preposition to end a setence with > advice && git add advice && test_tick && GIT_AUTHOR_NAME="Copy N. Paste" git commit -m "hypocrisy is fun" && echo Avoid cliches like the plague >> advice && test_tick && GIT_AUTHOR_EMAIL="foo@my.crp" git commit -m "it is still fun" advice && echo " \$Id: A bunch of junk$" > foobar.c && git add foobar.c && test_tick && git commit -m "Brain damage" && git tag v1.0 HEAD~3 && git tag -a -m 'Super duper snazzy release' v2.0 HEAD~1 && git branch testing master && # Make it look like a fresh clone (avoid need for --force) git gc && git remote add origin . 
&& git update-ref refs/remotes/origin/master refs/heads/master git update-ref refs/remotes/origin/testing refs/heads/testing ) } test_expect_success '--filename-callback' ' setup filename-callback && ( cd filename-callback && git filter-repo --filename-callback "return None if filename.endswith(b\".doc\") else b\"src/\"+filename" && git log --format=%n --name-only | sort | uniq | grep -v ^$ > f && ! grep file.doc f && COMPARE=$(wc -l filtered_f && test_line_count = $COMPARE filtered_f ) ' test_expect_success '--message-callback' ' setup message-callback && ( cd message-callback && git filter-repo --message-callback "return b\"TLDR: \"+message[0:5]" && git log --format=%s >log-messages && grep TLDR:...... log-messages >modified-messages && test_line_count = 6 modified-messages ) ' test_expect_success '--name-callback' ' setup name-callback && ( cd name-callback && git filter-repo --name-callback "return name.replace(b\"N.\", b\"And\")" && git log --format=%an >log-person-names && grep Copy.And.Paste log-person-names ) ' test_expect_success '--email-callback' ' setup email-callback && ( cd email-callback && git filter-repo --email-callback "return email.replace(b\".com\", b\".org\")" && git log --format=%ae%n%ce >log-emails && ! 
grep .com log-emails && grep .org log-emails ) ' test_expect_success '--refname-callback' ' setup refname-callback && ( cd refname-callback && git filter-repo --refname-callback " dir,path = os.path.split(refname) return dir+b\"/prefix-\"+path" && git show-ref | grep refs/heads/prefix-master && git show-ref | grep refs/tags/prefix-v1.0 && git show-ref | grep refs/tags/prefix-v2.0 ) ' test_expect_success '--refname-callback sanity check' ' setup refname-sanity-check && ( cd refname-sanity-check && test_must_fail git filter-repo --refname-callback "return re.sub(b\"tags\", b\"other-tags\", refname)" 2>../err && test_i18ngrep "fast-import requires tags to be in refs/tags/ namespace" ../err && rm ../err ) ' test_expect_success '--blob-callback' ' setup blob-callback && ( cd blob-callback && git log --format=%n --name-only | sort | uniq | grep -v ^$ > f && test_line_count = 5 f && rm f && git filter-repo --blob-callback "if len(blob.data) > 25: blob.skip()" && git log --format=%n --name-only | sort | uniq | grep -v ^$ > f && test_line_count = 2 f ) ' test_expect_success '--commit-callback' ' setup commit-callback && ( cd commit-callback && git filter-repo --commit-callback " commit.committer_name = commit.author_name commit.committer_email = commit.author_email commit.committer_date = commit.author_date for change in commit.file_changes: change.mode = b\"100755\" " && git log --format=%ae%n%ce >log-emails && ! grep committer@example.com log-emails && git log --raw | grep ^: >file-changes && ! grep 100644 file-changes && grep 100755 file-changes ) ' test_expect_success '--tag-callback' ' setup tag-callback && ( cd tag-callback && git filter-repo --tag-callback " tag.tagger_name = b\"Dr. \"+tag.tagger_name tag.message = b\"Awesome sauce \"+tag.message " && git cat-file -p v2.0 | grep ^tagger.Dr\\. 
&& git cat-file -p v2.0 | grep ^Awesome.sauce.Super ) ' test_expect_success '--reset-callback' ' setup reset-callback && ( cd reset-callback && git filter-repo --reset-callback "reset.from_ref = 3" && test $(git rev-parse testing) = $(git rev-parse master~3) ) ' test_expect_success 'callback has return statement sanity check' ' setup callback_return_sanity && ( cd callback_return_sanity && test_must_fail git filter-repo --filename-callback "filename + b\".txt\"" 2>../err&& test_i18ngrep "Error: --filename-callback should have a return statement" ../err && rm ../err ) ' test_expect_success 'Callback read from a file' ' setup name-callback-from-file && ( cd name-callback-from-file && echo "return name.replace(b\"N.\", b\"And\")" >../name-func && git filter-repo --name-callback ../name-func && git log --format=%an >log-person-names && grep Copy.And.Paste log-person-names ) ' test_done git-filter-repo-2.45.0/t/test-lib-functions.sh000066400000000000000000000744041464611705400211640ustar00rootroot00000000000000# Library of functions shared by all tests scripts, included by # test-lib.sh. # # Copyright (c) 2005 Junio C Hamano # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see http://www.gnu.org/licenses/ . # The semantics of the editor variables are that of invoking # sh -c "$EDITOR \"$@\"" files ... 
# # If our trash directory contains shell metacharacters, they will be # interpreted if we just set $EDITOR directly, so do a little dance with # environment variables to work around this. # # In particular, quoting isn't enough, as the path may contain the same quote # that we're using. test_set_editor () { FAKE_EDITOR="$1" export FAKE_EDITOR EDITOR='"$FAKE_EDITOR"' export EDITOR } test_set_index_version () { GIT_INDEX_VERSION="$1" export GIT_INDEX_VERSION } test_decode_color () { awk ' function name(n) { if (n == 0) return "RESET"; if (n == 1) return "BOLD"; if (n == 2) return "FAINT"; if (n == 3) return "ITALIC"; if (n == 7) return "REVERSE"; if (n == 30) return "BLACK"; if (n == 31) return "RED"; if (n == 32) return "GREEN"; if (n == 33) return "YELLOW"; if (n == 34) return "BLUE"; if (n == 35) return "MAGENTA"; if (n == 36) return "CYAN"; if (n == 37) return "WHITE"; if (n == 40) return "BLACK"; if (n == 41) return "BRED"; if (n == 42) return "BGREEN"; if (n == 43) return "BYELLOW"; if (n == 44) return "BBLUE"; if (n == 45) return "BMAGENTA"; if (n == 46) return "BCYAN"; if (n == 47) return "BWHITE"; } { while (match($0, /\033\[[0-9;]*m/) != 0) { printf "%s<", substr($0, 1, RSTART-1); codes = substr($0, RSTART+2, RLENGTH-3); if (length(codes) == 0) printf "%s", name(0) else { n = split(codes, ary, ";"); sep = ""; for (i = 1; i <= n; i++) { printf "%s%s", sep, name(ary[i]); sep = ";" } } printf ">"; $0 = substr($0, RSTART + RLENGTH, length($0) - RSTART - RLENGTH + 1); } print } ' } lf_to_nul () { perl -pe 'y/\012/\000/' } nul_to_q () { perl -pe 'y/\000/Q/' } q_to_nul () { perl -pe 'y/Q/\000/' } q_to_cr () { tr Q '\015' } q_to_tab () { tr Q '\011' } qz_to_tab_space () { tr QZ '\011\040' } append_cr () { sed -e 's/$/Q/' | tr Q '\015' } remove_cr () { tr '\015' Q | sed -e 's/Q$//' } # Generate an output of $1 bytes of all zeroes (NULs, not ASCII zeroes). # If $1 is 'infinity', output forever or until the receiving pipe stops reading, # whichever comes first. 
generate_zero_bytes () { test-tool genzeros "$@" } # In some bourne shell implementations, the "unset" builtin returns # nonzero status when a variable to be unset was not set in the first # place. # # Use sane_unset when that should not be considered an error. sane_unset () { unset "$@" return 0 } test_tick () { if test -z "${test_tick+set}" then test_tick=1112911993 else test_tick=$(($test_tick + 60)) fi GIT_COMMITTER_DATE="$test_tick -0700" GIT_AUTHOR_DATE="$test_tick -0700" export GIT_COMMITTER_DATE GIT_AUTHOR_DATE } # Stop execution and start a shell. This is useful for debugging tests. # # Be sure to remove all invocations of this command before submitting. test_pause () { "$SHELL_PATH" <&6 >&5 2>&7 } # Wrap git with a debugger. Adding this to a command can make it easier # to understand what is going on in a failing test. # # Examples: # debug git checkout master # debug --debugger=nemiver git $ARGS # debug -d "valgrind --tool=memcheck --track-origins=yes" git $ARGS debug () { case "$1" in -d) GIT_DEBUGGER="$2" && shift 2 ;; --debugger=*) GIT_DEBUGGER="${1#*=}" && shift 1 ;; *) GIT_DEBUGGER=1 ;; esac && GIT_DEBUGGER="${GIT_DEBUGGER}" "$@" <&6 >&5 2>&7 } # Call test_commit with the arguments # [-C ] [ [ []]]" # # This will commit a file with the given contents and the given commit # message, and tag the resulting commit with the given tag name. # # , , and all default to . # # If the first argument is "-C", the second argument is used as a path for # the git invocations. 
test_commit () { notick= && signoff= && indir= && while test $# != 0 do case "$1" in --notick) notick=yes ;; --signoff) signoff="$1" ;; -C) indir="$2" shift ;; *) break ;; esac shift done && indir=${indir:+"$indir"/} && file=${2:-"$1.t"} && echo "${3-$1}" > "$indir$file" && git ${indir:+ -C "$indir"} add "$file" && if test -z "$notick" then test_tick fi && git ${indir:+ -C "$indir"} commit $signoff -m "$1" && git ${indir:+ -C "$indir"} tag "${4:-$1}" } # Call test_merge with the arguments " ", where # can be a tag pointing to the commit-to-merge. test_merge () { test_tick && git merge -m "$1" "$2" && git tag "$1" } # This function helps systems where core.filemode=false is set. # Use it instead of plain 'chmod +x' to set or unset the executable bit # of a file in the working directory and add it to the index. test_chmod () { chmod "$@" && git update-index --add "--chmod=$@" } # Get the modebits from a file. test_modebits () { ls -l "$1" | sed -e 's|^\(..........\).*|\1|' } # Unset a configuration variable, but don't fail if it doesn't exist. test_unconfig () { config_dir= if test "$1" = -C then shift config_dir=$1 shift fi git ${config_dir:+-C "$config_dir"} config --unset-all "$@" config_status=$? case "$config_status" in 5) # ok, nothing to unset config_status=0 ;; esac return $config_status } # Set git config, automatically unsetting it after the test is over. test_config () { config_dir= if test "$1" = -C then shift config_dir=$1 shift fi test_when_finished "test_unconfig ${config_dir:+-C '$config_dir'} '$1'" && git ${config_dir:+-C "$config_dir"} config "$@" } test_config_global () { test_when_finished "test_unconfig --global '$1'" && git config --global "$@" } write_script () { { echo "#!${2-"$SHELL_PATH"}" && cat } >"$1" && chmod +x "$1" } # Use test_set_prereq to tell that a particular prerequisite is available. # The prerequisite can later be checked for in two ways: # # - Explicitly using test_have_prereq. 
# # - Implicitly by specifying the prerequisite tag in the calls to # test_expect_{success,failure,code}. # # The single parameter is the prerequisite tag (a simple word, in all # capital letters by convention). test_unset_prereq () { ! test_have_prereq "$1" || satisfied_prereq="${satisfied_prereq% $1 *} ${satisfied_prereq#* $1 }" } test_set_prereq () { case "$1" in !*) test_unset_prereq "${1#!}" ;; *) satisfied_prereq="$satisfied_prereq$1 " ;; esac } satisfied_prereq=" " lazily_testable_prereq= lazily_tested_prereq= # Usage: test_lazy_prereq PREREQ 'script' test_lazy_prereq () { lazily_testable_prereq="$lazily_testable_prereq$1 " eval test_prereq_lazily_$1=\$2 } test_run_lazy_prereq_ () { script=' mkdir -p "$TRASH_DIRECTORY/prereq-test-dir" && ( cd "$TRASH_DIRECTORY/prereq-test-dir" &&'"$2"' )' say >&3 "checking prerequisite: $1" say >&3 "$script" test_eval_ "$script" eval_ret=$? rm -rf "$TRASH_DIRECTORY/prereq-test-dir" if test "$eval_ret" = 0; then say >&3 "prerequisite $1 ok" else say >&3 "prerequisite $1 not satisfied" fi return $eval_ret } test_have_prereq () { # prerequisites can be concatenated with ',' save_IFS=$IFS IFS=, set -- $* IFS=$save_IFS total_prereq=0 ok_prereq=0 missing_prereq= for prerequisite do case "$prerequisite" in !*) negative_prereq=t prerequisite=${prerequisite#!} ;; *) negative_prereq= esac case " $lazily_tested_prereq " in *" $prerequisite "*) ;; *) case " $lazily_testable_prereq " in *" $prerequisite "*) eval "script=\$test_prereq_lazily_$prerequisite" && if test_run_lazy_prereq_ "$prerequisite" "$script" then test_set_prereq $prerequisite fi lazily_tested_prereq="$lazily_tested_prereq$prerequisite " esac ;; esac total_prereq=$(($total_prereq + 1)) case "$satisfied_prereq" in *" $prerequisite "*) satisfied_this_prereq=t ;; *) satisfied_this_prereq= esac case "$satisfied_this_prereq,$negative_prereq" in t,|,t) ok_prereq=$(($ok_prereq + 1)) ;; *) # Keep a list of missing prerequisites; restore # the negative marker if necessary. 
prerequisite=${negative_prereq:+!}$prerequisite if test -z "$missing_prereq" then missing_prereq=$prerequisite else missing_prereq="$prerequisite,$missing_prereq" fi esac done test $total_prereq = $ok_prereq } test_declared_prereq () { case ",$test_prereq," in *,$1,*) return 0 ;; esac return 1 } test_verify_prereq () { test -z "$test_prereq" || expr >/dev/null "$test_prereq" : '[A-Z0-9_,!]*$' || BUG "'$test_prereq' does not look like a prereq" } test_expect_failure () { test_start_ test "$#" = 3 && { test_prereq=$1; shift; } || test_prereq= test "$#" = 2 || BUG "not 2 or 3 parameters to test-expect-failure" test_verify_prereq export test_prereq if ! test_skip "$@" then say >&3 "checking known breakage: $2" if test_run_ "$2" expecting_failure then test_known_broken_ok_ "$1" else test_known_broken_failure_ "$1" fi fi test_finish_ } test_expect_success () { test_start_ test "$#" = 3 && { test_prereq=$1; shift; } || test_prereq= test "$#" = 2 || BUG "not 2 or 3 parameters to test-expect-success" test_verify_prereq export test_prereq if ! test_skip "$@" then say >&3 "expecting success: $2" if test_run_ "$2" then test_ok_ "$1" else test_failure_ "$@" fi fi test_finish_ } # test_external runs external test scripts that provide continuous # test output about their progress, and succeeds/fails on # zero/non-zero exit code. It outputs the test output on stdout even # in non-verbose mode, and announces the external script with "# run # : ..." before running it. When providing relative paths, keep in # mind that all scripts run in "trash directory". # Usage: test_external description command arguments... # Example: test_external 'Perl API' perl ../path/to/test.pl test_external () { test "$#" = 4 && { test_prereq=$1; shift; } || test_prereq= test "$#" = 3 || BUG "not 3 or 4 parameters to test_external" descr="$1" shift test_verify_prereq export test_prereq if ! test_skip "$descr" "$@" then # Announce the script to reduce confusion about the # test output that follows. 
say_color "" "# run $test_count: $descr ($*)" # Export TEST_DIRECTORY, TRASH_DIRECTORY and GIT_TEST_LONG # to be able to use them in script export TEST_DIRECTORY TRASH_DIRECTORY GIT_TEST_LONG # Run command; redirect its stderr to &4 as in # test_run_, but keep its stdout on our stdout even in # non-verbose mode. "$@" 2>&4 if test "$?" = 0 then if test $test_external_has_tap -eq 0; then test_ok_ "$descr" else say_color "" "# test_external test $descr was ok" test_success=$(($test_success + 1)) fi else if test $test_external_has_tap -eq 0; then test_failure_ "$descr" "$@" else say_color error "# test_external test $descr failed: $@" test_failure=$(($test_failure + 1)) fi fi fi } # Like test_external, but in addition tests that the command generated # no output on stderr. test_external_without_stderr () { # The temporary file has no (and must have no) security # implications. tmp=${TMPDIR:-/tmp} stderr="$tmp/git-external-stderr.$$.tmp" test_external "$@" 4> "$stderr" test -f "$stderr" || error "Internal error: $stderr disappeared." descr="no stderr: $1" shift say >&3 "# expecting no stderr from previous command" if test ! -s "$stderr" then rm "$stderr" if test $test_external_has_tap -eq 0; then test_ok_ "$descr" else say_color "" "# test_external_without_stderr test $descr was ok" test_success=$(($test_success + 1)) fi else if test "$verbose" = t then output=$(echo; echo "# Stderr is:"; cat "$stderr") else output= fi # rm first in case test_failure exits. rm "$stderr" if test $test_external_has_tap -eq 0; then test_failure_ "$descr" "$@" "$output" else say_color error "# test_external_without_stderr test $descr failed: $@: $output" test_failure=$(($test_failure + 1)) fi fi } # debugging-friendly alternatives to "test [-f|-d|-e]" # The commands test the existence or non-existence of $1. $2 can be # given to provide a more precise diagnosis. test_path_is_file () { if ! test -f "$1" then echo "File $1 doesn't exist. $2" false fi } test_path_is_dir () { if ! 
test -d "$1" then echo "Directory $1 doesn't exist. $2" false fi } test_path_exists () { if ! test -e "$1" then echo "Path $1 doesn't exist. $2" false fi } # Check if the directory exists and is empty as expected, barf otherwise. test_dir_is_empty () { test_path_is_dir "$1" && if test -n "$(ls -a1 "$1" | egrep -v '^\.\.?$')" then echo "Directory '$1' is not empty, it contains:" ls -la "$1" return 1 fi } test_path_is_missing () { if test -e "$1" then echo "Path exists:" ls -ld "$1" if test $# -ge 1 then echo "$*" fi false fi } # test_line_count checks that a file has the number of lines it # ought to. For example: # # test_expect_success 'produce exactly one line of output' ' # do something >output && # test_line_count = 1 output # ' # # is like "test $(wc -l [,<...>]: # Don't treat an exit caused by the given signal as error. # Multiple signals can be specified as a comma separated list. # Currently recognized signal names are: sigpipe, success. # (Don't use 'success', use 'test_might_fail' instead.) test_must_fail () { case "$1" in ok=*) _test_ok=${1#ok=} shift ;; *) _test_ok= ;; esac "$@" 2>&7 exit_code=$? if test $exit_code -eq 0 && ! list_contains "$_test_ok" success then echo >&4 "test_must_fail: command succeeded: $*" return 1 elif test_match_signal 13 $exit_code && list_contains "$_test_ok" sigpipe then return 0 elif test $exit_code -gt 129 && test $exit_code -le 192 then echo >&4 "test_must_fail: died by signal $(($exit_code - 128)): $*" return 1 elif test $exit_code -eq 127 then echo >&4 "test_must_fail: command not found: $*" return 1 elif test $exit_code -eq 126 then echo >&4 "test_must_fail: valgrind error: $*" return 1 fi return 0 } 7>&2 2>&4 # Similar to test_must_fail, but tolerates success, too. 
This is # meant to be used in contexts like: # # test_expect_success 'some command works without configuration' ' # test_might_fail git config --unset all.configuration && # do something # ' # # Writing "git config --unset all.configuration || :" would be wrong, # because we want to notice if it fails due to segv. # # Accepts the same options as test_must_fail. test_might_fail () { test_must_fail ok=success "$@" 2>&7 } 7>&2 2>&4 # Similar to test_must_fail and test_might_fail, but check that a # given command exited with a given exit code. Meant to be used as: # # test_expect_success 'Merge with d/f conflicts' ' # test_expect_code 1 git merge "merge msg" B master # ' test_expect_code () { want_code=$1 shift "$@" 2>&7 exit_code=$? if test $exit_code = $want_code then return 0 fi echo >&4 "test_expect_code: command exited with $exit_code, we wanted $want_code $*" return 1 } 7>&2 2>&4 # test_cmp is a helper function to compare actual and expected output. # You can use it like: # # test_expect_success 'foo works' ' # echo expected >expected && # foo >actual && # test_cmp expected actual # ' # # This could be written as either "cmp" or "diff -u", but: # - cmp's output is not nearly as easy to read as diff -u # - not all diff versions understand "-u" test_cmp() { $GIT_TEST_CMP "$@" } # Check that the given config key has the expected value. # # test_cmp_config [-C ] # [...] # # for example to check that the value of core.bar is foo # # test_cmp_config foo core.bar # test_cmp_config() { local GD && if test "$1" = "-C" then shift && GD="-C $1" && shift fi && printf "%s\n" "$1" >expect.config && shift && git $GD config "$@" >actual.config && test_cmp expect.config actual.config } # test_cmp_bin - helper to compare binary files test_cmp_bin() { cmp "$@" } # Use this instead of test_cmp to compare files that contain expected and # actual output from git commands that can be translated. 
When running # under GIT_TEST_GETTEXT_POISON this pretends that the command produced expected # results. test_i18ncmp () { ! test_have_prereq C_LOCALE_OUTPUT || test_cmp "$@" } # Use this instead of "grep expected-string actual" to see if the # output from a git command that can be translated either contains an # expected string, or does not contain an unwanted one. When running # under GIT_TEST_GETTEXT_POISON this pretends that the command produced expected # results. test_i18ngrep () { eval "last_arg=\${$#}" test -f "$last_arg" || BUG "test_i18ngrep requires a file to read as the last parameter" if test $# -lt 2 || { test "x!" = "x$1" && test $# -lt 3 ; } then BUG "too few parameters to test_i18ngrep" fi if test_have_prereq !C_LOCALE_OUTPUT then # pretend success return 0 fi if test "x!" = "x$1" then shift ! grep "$@" && return 0 echo >&4 "error: '! grep $@' did find a match in:" else grep "$@" && return 0 echo >&4 "error: 'grep $@' didn't find a match in:" fi if test -s "$last_arg" then cat >&4 "$last_arg" else echo >&4 "" fi return 1 } # Call any command "$@" but be more verbose about its # failure. This is handy for commands like "test" which do # not output anything when they fail. verbose () { "$@" && return 0 echo >&4 "command failed: $(git rev-parse --sq-quote "$@")" return 1 } # Check if the file expected to be empty is indeed empty, and barfs # otherwise. 
test_must_be_empty () { test_path_is_file "$1" && if test -s "$1" then echo "'$1' is not empty, it contains:" cat "$1" return 1 fi } # Tests that its two parameters refer to the same revision test_cmp_rev () { if test $# != 2 then error "bug in the test script: test_cmp_rev requires two revisions, but got $#" else local r1 r2 r1=$(git rev-parse --verify "$1") && r2=$(git rev-parse --verify "$2") && if test "$r1" != "$r2" then cat >&4 <<-EOF error: two revisions point to different objects: '$1': $r1 '$2': $r2 EOF return 1 fi fi } # Print a sequence of integers in increasing order, either with # two arguments (start and end): # # test_seq 1 5 -- outputs 1 2 3 4 5 one line at a time # # or with one argument (end), in which case it starts counting # from 1. test_seq () { case $# in 1) set 1 "$@" ;; 2) ;; *) BUG "not 1 or 2 parameters to test_seq" ;; esac test_seq_counter__=$1 while test "$test_seq_counter__" -le "$2" do echo "$test_seq_counter__" test_seq_counter__=$(( $test_seq_counter__ + 1 )) done } # This function can be used to schedule some commands to be run # unconditionally at the end of the test to restore sanity: # # test_expect_success 'test core.capslock' ' # git config core.capslock true && # test_when_finished "git config --unset core.capslock" && # hello world # ' # # That would be roughly equivalent to # # test_expect_success 'test core.capslock' ' # git config core.capslock true && # hello world # git config --unset core.capslock # ' # # except that the greeting and config --unset must both succeed for # the test to pass. # # Note that under --immediate mode, no clean-up is done to help diagnose # what went wrong. test_when_finished () { # We cannot detect when we are in a subshell in general, but by # doing so on Bash is better than nothing (the test will # silently pass on other shells). 
test "${BASH_SUBSHELL-0}" = 0 || BUG "test_when_finished does nothing in a subshell" test_cleanup="{ $* } && (exit \"\$eval_ret\"); eval_ret=\$?; $test_cleanup" } # Most tests can use the created repository, but some may need to create more. # Usage: test_create_repo test_create_repo () { test "$#" = 1 || BUG "not 1 parameter to test-create-repo" repo="$1" mkdir -p "$repo" ( cd "$repo" || error "Cannot setup test environment" "${GIT_TEST_INSTALLED:-$GIT_EXEC_PATH}/git$X" init \ "--template=$GIT_BUILD_DIR/templates/blt/" >&3 2>&4 || error "cannot run git init -- have you built things yet?" test -d .git/hooks && mv .git/hooks .git/hooks-disabled || true ) || exit } # This function helps on symlink challenged file systems when it is not # important that the file system entry is a symbolic link. # Use test_ln_s_add instead of "ln -s x y && git add y" to add a # symbolic link entry y to the index. test_ln_s_add () { if test_have_prereq SYMLINKS then ln -s "$1" "$2" && git update-index --add "$2" else printf '%s' "$1" >"$2" && ln_s_obj=$(git hash-object -w "$2") && git update-index --add --cacheinfo 120000 $ln_s_obj "$2" && # pick up stat info from the file git update-index "$2" fi } # This function writes out its parameters, one per line test_write_lines () { printf "%s\n" "$@" } perl () { command "$PERL_PATH" "$@" 2>&7 } 7>&2 2>&4 # Is the value one of the various ways to spell a boolean true/false? test_normalize_bool () { git -c magic.variable="$1" config --bool magic.variable 2>/dev/null } # Given a variable $1, normalize the value of it to one of "true", # "false", or "auto" and store the result to it. # # test_tristate GIT_TEST_HTTPD # # A variable set to an empty string is set to 'false'. # A variable set to 'false' or 'auto' keeps its value. # Anything else is set to 'true'. # An unset variable defaults to 'auto'. 
# # The last rule is to allow people to set the variable to an empty # string and export it to decline testing the particular feature # for versions both before and after this change. We used to treat # both unset and empty variable as a signal for "do not test" and # took any non-empty string as "please test". test_tristate () { if eval "test x\"\${$1+isset}\" = xisset" then # explicitly set eval " case \"\$$1\" in '') $1=false ;; auto) ;; *) $1=\$(test_normalize_bool \$$1 || echo true) ;; esac " else eval "$1=auto" fi } # Exit the test suite, either by skipping all remaining tests or by # exiting with an error. If "$1" is "auto", we then we assume we were # opportunistically trying to set up some tests and we skip. If it is # "true", then we report a failure. # # The error/skip message should be given by $2. # test_skip_or_die () { case "$1" in auto) skip_all=$2 test_done ;; true) error "$2" ;; *) error "BUG: test tristate is '$1' (real error: $2)" esac } # The following mingw_* functions obey POSIX shell syntax, but are actually # bash scripts, and are meant to be used only with bash on Windows. # A test_cmp function that treats LF and CRLF equal and avoids to fork # diff when possible. mingw_test_cmp () { # Read text into shell variables and compare them. If the results # are different, use regular diff to report the difference. local test_cmp_a= test_cmp_b= # When text came from stdin (one argument is '-') we must feed it # to diff. local stdin_for_diff= # Since it is difficult to detect the difference between an # empty input file and a failure to read the files, we go straight # to diff if one of the inputs is empty. 
if test -s "$1" && test -s "$2" then # regular case: both files non-empty mingw_read_file_strip_cr_ test_cmp_a <"$1" mingw_read_file_strip_cr_ test_cmp_b <"$2" elif test -s "$1" && test "$2" = - then # read 2nd file from stdin mingw_read_file_strip_cr_ test_cmp_a <"$1" mingw_read_file_strip_cr_ test_cmp_b stdin_for_diff='<<<"$test_cmp_b"' elif test "$1" = - && test -s "$2" then # read 1st file from stdin mingw_read_file_strip_cr_ test_cmp_a mingw_read_file_strip_cr_ test_cmp_b <"$2" stdin_for_diff='<<<"$test_cmp_a"' fi test -n "$test_cmp_a" && test -n "$test_cmp_b" && test "$test_cmp_a" = "$test_cmp_b" || eval "diff -u \"\$@\" $stdin_for_diff" } # $1 is the name of the shell variable to fill in mingw_read_file_strip_cr_ () { # Read line-wise using LF as the line separator # and use IFS to strip CR. local line while : do if IFS=$'\r' read -r -d $'\n' line then # good line=$line$'\n' else # we get here at EOF, but also if the last line # was not terminated by LF; in the latter case, # some text was read if test -z "$line" then # EOF, really break fi fi eval "$1=\$$1\$line" done } # Like "env FOO=BAR some-program", but run inside a subshell, which means # it also works for shell functions (though those functions cannot impact # the environment outside of the test_env invocation). test_env () { ( while test $# -gt 0 do case "$1" in *=*) eval "${1%%=*}=\${1#*=}" eval "export ${1%%=*}" shift ;; *) "$@" 2>&7 exit ;; esac done ) } 7>&2 2>&4 # Returns true if the numeric exit code in "$2" represents the expected signal # in "$1". Signals should be given numerically. test_match_signal () { if test "$2" = "$((128 + $1))" then # POSIX return 0 elif test "$2" = "$((256 + $1))" then # ksh return 0 fi return 1 } # Read up to "$1" bytes (or to EOF) from stdin and write them to stdout. test_copy_bytes () { perl -e ' my $len = $ARGV[1]; while ($len > 0) { my $s; my $nread = sysread(STDIN, $s, $len); die "cannot read: $!" 
unless defined($nread); last unless $nread; print $s; $len -= $nread; } ' - "$1" } # run "$@" inside a non-git directory nongit () { test -d non-repo || mkdir non-repo || return 1 ( GIT_CEILING_DIRECTORIES=$(pwd) && export GIT_CEILING_DIRECTORIES && cd non-repo && "$@" 2>&7 ) } 7>&2 2>&4 # convert stdin to pktline representation; note that empty input becomes an # empty packet, not a flush packet (for that you can just print 0000 yourself). packetize() { cat >packetize.tmp && len=$(wc -c /dev/null then BUG 'bad hash algorithm' fi && eval "test_oid_${k}_$tag=\"\$v\"" done } # Look up a per-hash value based on a key ($1). The value must have been loaded # by test_oid_init or test_oid_cache. test_oid () { local var="test_oid_${test_hash_algo}_$1" && # If the variable is unset, we must be missing an entry for this # key-hash pair, so exit with an error. if eval "test -z \"\${$var+set}\"" then BUG "undefined key '$1'" fi && eval "printf '%s' \"\${$var}\"" } # Choose a port number based on the test script's number and store it in # the given variable name, unless that variable already contains a number. test_set_port () { local var=$1 port if test $# -ne 1 || test -z "$var" then BUG "test_set_port requires a variable name" fi eval port=\$$var case "$port" in "") # No port is set in the given env var, use the test # number as port number instead. # Remove not only the leading 't', but all leading zeros # as well, so the arithmetic below won't (mis)interpret # a test number like '0123' as an octal value. port=${this_test#${this_test%%[1-9]*}} if test "${port:-0}" -lt 1024 then # root-only port, use a larger one instead. port=$(($port + 10000)) fi ;; *[!0-9]*|0*) error >&7 "invalid port number: $port" ;; *) # The user has specified the port. ;; esac # Make sure that parallel '--stress' test jobs get different # ports. 
port=$(($port + ${GIT_TEST_STRESS_JOB_NR:-0})) eval $var=$port } git-filter-repo-2.45.0/t/test-lib.sh000066400000000000000000001022601464611705400171460ustar00rootroot00000000000000# Test framework for git. See t/README for usage. # # Copyright (c) 2005 Junio C Hamano # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see http://www.gnu.org/licenses/ . # Test the binaries we have just built. The tests are kept in # t/ subdirectory and are run in 'trash directory' subdirectory. if test -z "$TEST_DIRECTORY" then # We allow tests to override this, in case they want to run tests # outside of t/, e.g. for running tests on the test library # itself. TEST_DIRECTORY=$(pwd) else # ensure that TEST_DIRECTORY is an absolute path so that it # is valid even if the current working directory is changed TEST_DIRECTORY=$(cd "$TEST_DIRECTORY" && pwd) || exit 1 fi if test -z "$TEST_OUTPUT_DIRECTORY" then # Similarly, override this to store the test-results subdir # elsewhere TEST_OUTPUT_DIRECTORY=$TEST_DIRECTORY fi GIT_BUILD_DIR="$TEST_DIRECTORY"/.. # If we were built with ASAN, it may complain about leaks # of program-lifetime variables. Disable it by default to lower # the noise level. This needs to happen at the start of the script, # before we even do our "did we build git yet" check (since we don't # want that one to complain to stderr). 
: ${ASAN_OPTIONS=detect_leaks=0:abort_on_error=1} export ASAN_OPTIONS # If LSAN is in effect we _do_ want leak checking, but we still # want to abort so that we notice the problems. : ${LSAN_OPTIONS=abort_on_error=1} export LSAN_OPTIONS PERL_PATH=${PERL_PATH:-perl} export PERL_PATH SHELL_PATH GIT_TEST_INSTALLED=${GIT_TEST_INSTALLED:-$(dirname $(type -p git))} GIT_TEST_CHAIN_LINT=0 DIFF='diff' # Parse options while taking care to leave $@ intact, so we will still # have all the original command line options when executing the test # script again for '--tee' and '--verbose-log' below. store_arg_to= prev_opt= for opt do if test -n "$store_arg_to" then eval $store_arg_to=\$opt store_arg_to= prev_opt= continue fi case "$opt" in -d|--d|--de|--deb|--debu|--debug) debug=t ;; -i|--i|--im|--imm|--imme|--immed|--immedi|--immedia|--immediat|--immediate) immediate=t ;; -l|--l|--lo|--lon|--long|--long-|--long-t|--long-te|--long-tes|--long-test|--long-tests) GIT_TEST_LONG=t; export GIT_TEST_LONG ;; -r) store_arg_to=run_list ;; --run=*) run_list=${opt#--*=} ;; -h|--h|--he|--hel|--help) help=t ;; -v|--v|--ve|--ver|--verb|--verbo|--verbos|--verbose) verbose=t ;; --verbose-only=*) verbose_only=${opt#--*=} ;; -q|--q|--qu|--qui|--quie|--quiet) # Ignore --quiet under a TAP::Harness. Saying how many tests # passed without the ok/not ok details is always an error. 
test -z "$HARNESS_ACTIVE" && quiet=t ;; --with-dashes) with_dashes=t ;; --no-bin-wrappers) no_bin_wrappers=t ;; --no-color) color= ;; --va|--val|--valg|--valgr|--valgri|--valgrin|--valgrind) valgrind=memcheck tee=t ;; --valgrind=*) valgrind=${opt#--*=} tee=t ;; --valgrind-only=*) valgrind_only=${opt#--*=} tee=t ;; --tee) tee=t ;; --root=*) root=${opt#--*=} ;; --chain-lint) GIT_TEST_CHAIN_LINT=1 ;; --no-chain-lint) GIT_TEST_CHAIN_LINT=0 ;; -x) trace=t ;; -V|--verbose-log) verbose_log=t tee=t ;; --write-junit-xml) write_junit_xml=t ;; --stress) stress=t ;; --stress=*) stress=${opt#--*=} case "$stress" in *[!0-9]*|0*|"") echo "error: --stress= requires the number of jobs to run" >&2 exit 1 ;; *) # Good. ;; esac ;; --stress-limit=*) stress_limit=${opt#--*=} case "$stress_limit" in *[!0-9]*|0*|"") echo "error: --stress-limit= requires the number of repetitions" >&2 exit 1 ;; *) # Good. ;; esac ;; *) echo "error: unknown test option '$opt'" >&2; exit 1 ;; esac prev_opt=$opt done if test -n "$store_arg_to" then echo "error: $prev_opt requires an argument" >&2 exit 1 fi if test -n "$valgrind_only" then test -z "$valgrind" && valgrind=memcheck test -z "$verbose" && verbose_only="$valgrind_only" elif test -n "$valgrind" then test -z "$verbose_log" && verbose=t fi if test -n "$stress" then verbose=t trace=t immediate=t fi TEST_STRESS_JOB_SFX="${GIT_TEST_STRESS_JOB_NR:+.stress-$GIT_TEST_STRESS_JOB_NR}" TEST_NAME="$(basename "$0" .sh)" TEST_RESULTS_DIR="$TEST_OUTPUT_DIRECTORY/test-results" TEST_RESULTS_BASE="$TEST_RESULTS_DIR/$TEST_NAME$TEST_STRESS_JOB_SFX" TRASH_DIRECTORY="trash directory.$TEST_NAME$TEST_STRESS_JOB_SFX" test -n "$root" && TRASH_DIRECTORY="$root/$TRASH_DIRECTORY" case "$TRASH_DIRECTORY" in /*) ;; # absolute path is good *) TRASH_DIRECTORY="$TEST_OUTPUT_DIRECTORY/$TRASH_DIRECTORY" ;; esac # If --stress was passed, run this test repeatedly in several parallel loops. if test "$GIT_TEST_STRESS_STARTED" = "done" then : # Don't stress test again. 
elif test -n "$stress" then if test "$stress" != t then job_count=$stress elif test -n "$GIT_TEST_STRESS_LOAD" then job_count="$GIT_TEST_STRESS_LOAD" elif job_count=$(getconf _NPROCESSORS_ONLN 2>/dev/null) && test -n "$job_count" then job_count=$((2 * $job_count)) else job_count=8 fi mkdir -p "$TEST_RESULTS_DIR" stressfail="$TEST_RESULTS_BASE.stress-failed" rm -f "$stressfail" stress_exit=0 trap ' kill $job_pids 2>/dev/null wait stress_exit=1 ' TERM INT HUP job_pids= job_nr=0 while test $job_nr -lt "$job_count" do ( GIT_TEST_STRESS_STARTED=done GIT_TEST_STRESS_JOB_NR=$job_nr export GIT_TEST_STRESS_STARTED GIT_TEST_STRESS_JOB_NR trap ' kill $test_pid 2>/dev/null wait exit 1 ' TERM INT cnt=1 while ! test -e "$stressfail" && { test -z "$stress_limit" || test $cnt -le $stress_limit ; } do $TEST_SHELL_PATH "$0" "$@" >"$TEST_RESULTS_BASE.stress-$job_nr.out" 2>&1 & test_pid=$! if wait $test_pid then printf "OK %2d.%d\n" $GIT_TEST_STRESS_JOB_NR $cnt else echo $GIT_TEST_STRESS_JOB_NR >>"$stressfail" printf "FAIL %2d.%d\n" $GIT_TEST_STRESS_JOB_NR $cnt fi cnt=$(($cnt + 1)) done ) & job_pids="$job_pids $!" job_nr=$(($job_nr + 1)) done wait if test -f "$stressfail" then stress_exit=1 echo "Log(s) of failed test run(s):" for failed_job_nr in $(sort -n "$stressfail") do echo "Contents of '$TEST_RESULTS_BASE.stress-$failed_job_nr.out':" cat "$TEST_RESULTS_BASE.stress-$failed_job_nr.out" done rm -rf "$TRASH_DIRECTORY.stress-failed" # Move the last one. mv "$TRASH_DIRECTORY.stress-$failed_job_nr" "$TRASH_DIRECTORY.stress-failed" fi exit $stress_exit fi # if --tee was passed, write the output not only to the terminal, but # additionally to the file test-results/$BASENAME.out, too. if test "$GIT_TEST_TEE_STARTED" = "done" then : # do not redirect again elif test -n "$tee" then mkdir -p "$TEST_RESULTS_DIR" # Make this filename available to the sub-process in case it is using # --verbose-log. 
GIT_TEST_TEE_OUTPUT_FILE=$TEST_RESULTS_BASE.out export GIT_TEST_TEE_OUTPUT_FILE # Truncate before calling "tee -a" to get rid of the results # from any previous runs. >"$GIT_TEST_TEE_OUTPUT_FILE" (GIT_TEST_TEE_STARTED=done ${TEST_SHELL_PATH} "$0" "$@" 2>&1; echo $? >"$TEST_RESULTS_BASE.exit") | tee -a "$GIT_TEST_TEE_OUTPUT_FILE" test "$(cat "$TEST_RESULTS_BASE.exit")" = 0 exit fi if test -n "$trace" && test -n "$test_untraceable" then # '-x' tracing requested, but this test script can't be reliably # traced, unless it is run with a Bash version supporting # BASH_XTRACEFD (introduced in Bash v4.1). # # Perform this version check _after_ the test script was # potentially re-executed with $TEST_SHELL_PATH for '--tee' or # '--verbose-log', so the right shell is checked and the # warning is issued only once. if test -n "$BASH_VERSION" && eval ' test ${BASH_VERSINFO[0]} -gt 4 || { test ${BASH_VERSINFO[0]} -eq 4 && test ${BASH_VERSINFO[1]} -ge 1 } ' then : Executed by a Bash version supporting BASH_XTRACEFD. Good. else echo >&2 "warning: ignoring -x; '$0' is untraceable without BASH_XTRACEFD" trace= fi fi if test -n "$trace" && test -z "$verbose_log" then verbose=t fi # For repeatability, reset the environment to known value. # TERM is sanitized below, after saving color control sequences. LANG=C LC_ALL=C PAGER=cat TZ=UTC export LANG LC_ALL PAGER TZ EDITOR=: # GIT_TEST_GETTEXT_POISON should not influence git commands executed # during initialization of test-lib and the test repo. Back it up, # unset and then restore after initialization is finished. if test -n "$GIT_TEST_GETTEXT_POISON" then GIT_TEST_GETTEXT_POISON_ORIG=$GIT_TEST_GETTEXT_POISON unset GIT_TEST_GETTEXT_POISON fi # A call to "unset" with no arguments causes at least Solaris 10 # /usr/xpg4/bin/sh and /bin/ksh to bail out. So keep the unsets # deriving from the command substitution clustered with the other # ones. 
unset VISUAL EMAIL LANGUAGE COLUMNS $("$PERL_PATH" -e ' my @env = keys %ENV; my $ok = join("|", qw( TRACE DEBUG TEST .*_TEST PROVE VALGRIND UNZIP PERF_ CURL_VERBOSE TRACE_CURL )); my @vars = grep(/^GIT_/ && !/^GIT_($ok)/o, @env); print join("\n", @vars); ') unset XDG_CACHE_HOME unset XDG_CONFIG_HOME unset GITPERLLIB GIT_AUTHOR_EMAIL=author@example.com GIT_AUTHOR_NAME='A U Thor' GIT_COMMITTER_EMAIL=committer@example.com GIT_COMMITTER_NAME='C O Mitter' GIT_MERGE_VERBOSITY=5 GIT_MERGE_AUTOEDIT=no export GIT_MERGE_VERBOSITY GIT_MERGE_AUTOEDIT export GIT_AUTHOR_EMAIL GIT_AUTHOR_NAME export GIT_COMMITTER_EMAIL GIT_COMMITTER_NAME export EDITOR # Tests using GIT_TRACE typically don't want : output GIT_TRACE_BARE=1 export GIT_TRACE_BARE check_var_migration () { # the warnings and hints given from this helper depends # on end-user settings, which will disrupt the self-test # done on the test framework itself. case "$GIT_TEST_FRAMEWORK_SELFTEST" in t) return ;; esac old_name=$1 new_name=$2 eval "old_isset=\${${old_name}:+isset}" eval "new_isset=\${${new_name}:+isset}" case "$old_isset,$new_isset" in isset,) echo >&2 "warning: $old_name is now $new_name" echo >&2 "hint: set $new_name too during the transition period" eval "$new_name=\$$old_name" ;; isset,isset) # do this later # echo >&2 "warning: $old_name is now $new_name" # echo >&2 "hint: remove $old_name" ;; esac } check_var_migration GIT_FSMONITOR_TEST GIT_TEST_FSMONITOR check_var_migration TEST_GIT_INDEX_VERSION GIT_TEST_INDEX_VERSION check_var_migration GIT_FORCE_PRELOAD_TEST GIT_TEST_PRELOAD_INDEX # Use specific version of the index file format if test -n "${GIT_TEST_INDEX_VERSION:+isset}" then GIT_INDEX_VERSION="$GIT_TEST_INDEX_VERSION" export GIT_INDEX_VERSION fi # Add libc MALLOC and MALLOC_PERTURB test # only if we are not executing the test with valgrind if test -n "$valgrind" || test -n "$TEST_NO_MALLOC_CHECK" then setup_malloc_check () { : nothing } teardown_malloc_check () { : nothing } else setup_malloc_check 
() { MALLOC_CHECK_=3 MALLOC_PERTURB_=165 export MALLOC_CHECK_ MALLOC_PERTURB_ } teardown_malloc_check () { unset MALLOC_CHECK_ MALLOC_PERTURB_ } fi # Protect ourselves from common misconfiguration to export # CDPATH into the environment unset CDPATH unset GREP_OPTIONS unset UNZIP case $(echo $GIT_TRACE |tr "[A-Z]" "[a-z]") in 1|2|true) GIT_TRACE=4 ;; esac # Convenience # # A regexp to match 5, 35 and 40 hexdigits _x05='[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]' _x35="$_x05$_x05$_x05$_x05$_x05$_x05$_x05" _x40="$_x35$_x05" # Zero SHA-1 _z40=0000000000000000000000000000000000000000 OID_REGEX="$_x40" ZERO_OID=$_z40 EMPTY_TREE=4b825dc642cb6eb9a060e54bf8d69288fbee4904 EMPTY_BLOB=e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 # Line feed LF=' ' # UTF-8 ZERO WIDTH NON-JOINER, which HFS+ ignores # when case-folding filenames u200c=$(printf '\342\200\214') export _x05 _x35 _x40 _z40 LF u200c EMPTY_TREE EMPTY_BLOB ZERO_OID OID_REGEX # Each test should start with something like this, after copyright notices: # # test_description='Description of this test... # This test checks if command xyzzy does the right thing... # ' # . ./test-lib.sh test "x$TERM" != "xdumb" && ( test -t 1 && tput bold >/dev/null 2>&1 && tput setaf 1 >/dev/null 2>&1 && tput sgr0 >/dev/null 2>&1 ) && color=t if test -n "$color" then # Save the color control sequences now rather than run tput # each time say_color() is called. This is done for two # reasons: # * TERM will be changed to dumb # * HOME will be changed to a temporary directory and tput # might need to read ~/.terminfo from the original HOME # directory to get the control sequences # Note: This approach assumes the control sequences don't end # in a newline for any terminal of interest (command # substitutions strip trailing newlines). Given that most # (all?) terminals in common use are related to ECMA-48, this # shouldn't be a problem. 
say_color_error=$(tput bold; tput setaf 1) # bold red say_color_skip=$(tput setaf 4) # blue say_color_warn=$(tput setaf 3) # brown/yellow say_color_pass=$(tput setaf 2) # green say_color_info=$(tput setaf 6) # cyan say_color_reset=$(tput sgr0) say_color_="" # no formatting for normal text say_color () { test -z "$1" && test -n "$quiet" && return eval "say_color_color=\$say_color_$1" shift printf "%s\\n" "$say_color_color$*$say_color_reset" } else say_color() { test -z "$1" && test -n "$quiet" && return shift printf "%s\n" "$*" } fi TERM=dumb export TERM error () { say_color error "error: $*" GIT_EXIT_OK=t exit 1 } BUG () { error >&7 "bug in the test script: $*" } say () { say_color info "$*" } if test -n "$HARNESS_ACTIVE" then if test "$verbose" = t || test -n "$verbose_only" then printf 'Bail out! %s\n' \ 'verbose mode forbidden under TAP harness; try --verbose-log' exit 1 fi fi test "${test_description}" != "" || error "Test script did not set test_description." if test "$help" = "t" then printf '%s\n' "$test_description" exit 0 fi exec 5>&1 exec 6<&0 exec 7>&2 if test "$verbose_log" = "t" then exec 3>>"$GIT_TEST_TEE_OUTPUT_FILE" 4>&3 elif test "$verbose" = "t" then exec 4>&2 3>&1 else exec 4>/dev/null 3>/dev/null fi # Send any "-x" output directly to stderr to avoid polluting tests # which capture stderr. We can do this unconditionally since it # has no effect if tracing isn't turned on. # # Note that this sets up the trace fd as soon as we assign the variable, so it # must come after the creation of descriptor 4 above. Likewise, we must never # unset this, as it has the side effect of closing descriptor 4, which we # use to show verbose tests to the user. # # Note also that we don't need or want to export it. The tracing is local to # this shell, and we would not want to influence any shells we exec. BASH_XTRACEFD=4 test_failure=0 test_count=0 test_fixed=0 test_broken=0 test_success=0 test_external_has_tap=0 die () { code=$? 
if test -n "$GIT_EXIT_OK" then exit $code else echo >&5 "FATAL: Unexpected exit with code $code" exit 1 fi } GIT_EXIT_OK= trap 'die' EXIT trap 'exit $?' INT TERM HUP # The user-facing functions are loaded from a separate file so that # test_perf subshells can have them too . "$TEST_DIRECTORY/test-lib-functions.sh" # You are not expected to call test_ok_ and test_failure_ directly, use # the test_expect_* functions instead. test_ok_ () { if test -n "$write_junit_xml" then write_junit_xml_testcase "$*" fi test_success=$(($test_success + 1)) say_color "" "ok $test_count - $@" } test_failure_ () { if test -n "$write_junit_xml" then junit_insert="" junit_insert="$junit_insert $(xml_attr_encode \ "$(if test -n "$GIT_TEST_TEE_OUTPUT_FILE" then test-tool path-utils skip-n-bytes \ "$GIT_TEST_TEE_OUTPUT_FILE" $GIT_TEST_TEE_OFFSET else printf '%s\n' "$@" | sed 1d fi)")" junit_insert="$junit_insert" if test -n "$GIT_TEST_TEE_OUTPUT_FILE" then junit_insert="$junit_insert$(xml_attr_encode \ "$(cat "$GIT_TEST_TEE_OUTPUT_FILE")")" fi write_junit_xml_testcase "$1" " $junit_insert" fi test_failure=$(($test_failure + 1)) say_color error "not ok $test_count - $1" shift printf '%s\n' "$*" | sed -e 's/^/# /' test "$immediate" = "" || { GIT_EXIT_OK=t; exit 1; } } test_known_broken_ok_ () { if test -n "$write_junit_xml" then write_junit_xml_testcase "$* (breakage fixed)" fi test_fixed=$(($test_fixed+1)) say_color error "ok $test_count - $@ # TODO known breakage vanished" } test_known_broken_failure_ () { if test -n "$write_junit_xml" then write_junit_xml_testcase "$* (known breakage)" fi test_broken=$(($test_broken+1)) say_color warn "not ok $test_count - $@ # TODO known breakage" } test_debug () { test "$debug" = "" || eval "$1" } match_pattern_list () { arg="$1" shift test -z "$*" && return 1 for pattern_ do case "$arg" in $pattern_) return 0 esac done return 1 } match_test_selector_list () { title="$1" shift arg="$1" shift test -z "$1" && return 0 # Both commas and whitespace are 
accepted as separators. OLDIFS=$IFS IFS=' ,' set -- $1 IFS=$OLDIFS # If the first selector is negative we include by default. include= case "$1" in !*) include=t ;; esac for selector do orig_selector=$selector positive=t case "$selector" in !*) positive= selector=${selector##?} ;; esac test -z "$selector" && continue case "$selector" in *-*) if expr "z${selector%%-*}" : "z[0-9]*[^0-9]" >/dev/null then echo "error: $title: invalid non-numeric in range" \ "start: '$orig_selector'" >&2 exit 1 fi if expr "z${selector#*-}" : "z[0-9]*[^0-9]" >/dev/null then echo "error: $title: invalid non-numeric in range" \ "end: '$orig_selector'" >&2 exit 1 fi ;; *) if expr "z$selector" : "z[0-9]*[^0-9]" >/dev/null then echo "error: $title: invalid non-numeric in test" \ "selector: '$orig_selector'" >&2 exit 1 fi esac # Short cut for "obvious" cases test -z "$include" && test -z "$positive" && continue test -n "$include" && test -n "$positive" && continue case "$selector" in -*) if test $arg -le ${selector#-} then include=$positive fi ;; *-) if test $arg -ge ${selector%-} then include=$positive fi ;; *-*) if test ${selector%%-*} -le $arg \ && test $arg -le ${selector#*-} then include=$positive fi ;; *) if test $arg -eq $selector then include=$positive fi ;; esac done test -n "$include" } maybe_teardown_verbose () { test -z "$verbose_only" && return exec 4>/dev/null 3>/dev/null verbose= } last_verbose=t maybe_setup_verbose () { test -z "$verbose_only" && return if match_pattern_list $test_count $verbose_only then exec 4>&2 3>&1 # Emit a delimiting blank line when going from # non-verbose to verbose. Within verbose mode the # delimiter is printed by test_expect_*. The choice # of the initial $last_verbose is such that before # test 1, we do not print it. 
test -z "$last_verbose" && echo >&3 "" verbose=t else exec 4>/dev/null 3>/dev/null verbose= fi last_verbose=$verbose } maybe_teardown_valgrind () { test -z "$GIT_VALGRIND" && return GIT_VALGRIND_ENABLED= } maybe_setup_valgrind () { test -z "$GIT_VALGRIND" && return if test -z "$valgrind_only" then GIT_VALGRIND_ENABLED=t return fi GIT_VALGRIND_ENABLED= if match_pattern_list $test_count $valgrind_only then GIT_VALGRIND_ENABLED=t fi } want_trace () { test "$trace" = t && { test "$verbose" = t || test "$verbose_log" = t } } # This is a separate function because some tests use # "return" to end a test_expect_success block early # (and we want to make sure we run any cleanup like # "set +x"). test_eval_inner_ () { # Do not add anything extra (including LF) after '$*' eval " want_trace && set -x $*" } test_eval_ () { # If "-x" tracing is in effect, then we want to avoid polluting stderr # with non-test commands. But once in "set -x" mode, we cannot prevent # the shell from printing the "set +x" to turn it off (nor the saving # of $? before that). But we can make sure that the output goes to # /dev/null. # # There are a few subtleties here: # # - we have to redirect descriptor 4 in addition to 2, to cover # BASH_XTRACEFD # # - the actual eval has to come before the redirection block (since # it needs to see descriptor 4 to set up its stderr) # # - likewise, any error message we print must be outside the block to # access descriptor 4 # # - checking $? has to come immediately after the eval, but it must # be _inside_ the block to avoid polluting the "set -x" output # test_eval_inner_ "$@" &3 2>&4 { test_eval_ret_=$? 
if want_trace then set +x fi } 2>/dev/null 4>&2 if test "$test_eval_ret_" != 0 && want_trace then say_color error >&4 "error: last command exited with \$?=$test_eval_ret_" fi return $test_eval_ret_ } test_run_ () { test_cleanup=: expecting_failure=$2 if test "${GIT_TEST_CHAIN_LINT:-1}" != 0; then # turn off tracing for this test-eval, as it simply creates # confusing noise in the "-x" output trace_tmp=$trace trace= # 117 is magic because it is unlikely to match the exit # code of other programs if $(printf '%s\n' "$1" | sed -f "$GIT_BUILD_DIR/t/chainlint.sed" | grep -q '?![A-Z][A-Z]*?!') || test "OK-117" != "$(test_eval_ "(exit 117) && $1${LF}${LF}echo OK-\$?" 3>&1)" then BUG "broken &&-chain or run-away HERE-DOC: $1" fi trace=$trace_tmp fi setup_malloc_check test_eval_ "$1" eval_ret=$? teardown_malloc_check if test -z "$immediate" || test $eval_ret = 0 || test -n "$expecting_failure" && test "$test_cleanup" != ":" then setup_malloc_check test_eval_ "$test_cleanup" teardown_malloc_check fi if test "$verbose" = "t" && test -n "$HARNESS_ACTIVE" then echo "" fi return "$eval_ret" } test_start_ () { test_count=$(($test_count+1)) maybe_setup_verbose maybe_setup_valgrind if test -n "$write_junit_xml" then junit_start=$(test-tool date getnanos) fi } test_finish_ () { echo >&3 "" maybe_teardown_valgrind maybe_teardown_verbose if test -n "$GIT_TEST_TEE_OFFSET" then GIT_TEST_TEE_OFFSET=$(test-tool path-utils file-size \ "$GIT_TEST_TEE_OUTPUT_FILE") fi } test_skip () { to_skip= skipped_reason= if match_pattern_list $this_test.$test_count $GIT_SKIP_TESTS then to_skip=t skipped_reason="GIT_SKIP_TESTS" fi if test -z "$to_skip" && test -n "$test_prereq" && ! test_have_prereq "$test_prereq" then to_skip=t of_prereq= if test "$missing_prereq" != "$test_prereq" then of_prereq=" of $test_prereq" fi skipped_reason="missing $missing_prereq${of_prereq}" fi if test -z "$to_skip" && test -n "$run_list" && ! 
match_test_selector_list '--run' $test_count "$run_list" then to_skip=t skipped_reason="--run" fi case "$to_skip" in t) if test -n "$write_junit_xml" then message="$(xml_attr_encode "$skipped_reason")" write_junit_xml_testcase "$1" \ " " fi say_color skip >&3 "skipping test: $@" say_color skip "ok $test_count # skip $1 ($skipped_reason)" : true ;; *) false ;; esac } # stub; perf-lib overrides it test_at_end_hook_ () { : } write_junit_xml () { case "$1" in --truncate) >"$junit_xml_path" junit_have_testcase= shift ;; esac printf '%s\n' "$@" >>"$junit_xml_path" } xml_attr_encode () { printf '%s\n' "$@" | test-tool xml-encode } write_junit_xml_testcase () { junit_attrs="name=\"$(xml_attr_encode "$this_test.$test_count $1")\"" shift junit_attrs="$junit_attrs classname=\"$this_test\"" junit_attrs="$junit_attrs time=\"$(test-tool \ date getnanos $junit_start)\"" write_junit_xml "$(printf '%s\n' \ " " "$@" " ")" junit_have_testcase=t } test_done () { GIT_EXIT_OK=t if test -n "$write_junit_xml" && test -n "$junit_xml_path" then test -n "$junit_have_testcase" || { junit_start=$(test-tool date getnanos) write_junit_xml_testcase "all tests skipped" } # adjust the overall time junit_time=$(test-tool date getnanos $junit_suite_start) sed "s/]*/& time=\"$junit_time\"/" \ <"$junit_xml_path" >"$junit_xml_path.new" mv "$junit_xml_path.new" "$junit_xml_path" write_junit_xml " " "" fi if test -z "$HARNESS_ACTIVE" then mkdir -p "$TEST_RESULTS_DIR" cat >"$TEST_RESULTS_BASE.counts" <<-EOF total $test_count success $test_success fixed $test_fixed broken $test_broken failed $test_failure EOF fi if test "$test_fixed" != 0 then say_color error "# $test_fixed known breakage(s) vanished; please update test(s)" fi if test "$test_broken" != 0 then say_color warn "# still have $test_broken known breakage(s)" fi if test "$test_broken" != 0 || test "$test_fixed" != 0 then test_remaining=$(( $test_count - $test_broken - $test_fixed )) msg="remaining $test_remaining test(s)" else 
test_remaining=$test_count msg="$test_count test(s)" fi case "$test_failure" in 0) if test $test_external_has_tap -eq 0 then if test $test_remaining -gt 0 then say_color pass "# passed all $msg" fi # Maybe print SKIP message test -z "$skip_all" || skip_all="# SKIP $skip_all" case "$test_count" in 0) say "1..$test_count${skip_all:+ $skip_all}" ;; *) test -z "$skip_all" || say_color warn "$skip_all" say "1..$test_count" ;; esac fi if test -z "$debug" then test -d "$TRASH_DIRECTORY" || error "Tests passed but trash directory already removed before test cleanup; aborting" cd "$TRASH_DIRECTORY/.." && rm -fr "$TRASH_DIRECTORY" || { # try again in a bit sleep 5; rm -fr "$TRASH_DIRECTORY" } || error "Tests passed but test cleanup failed; aborting" fi test_at_end_hook_ exit 0 ;; *) if test $test_external_has_tap -eq 0 then say_color error "# failed $test_failure among $msg" say "1..$test_count" fi exit 1 ;; esac } if test -z "$GIT_TEST_CMP" then if test -n "$GIT_TEST_CMP_USE_COPIED_CONTEXT" then GIT_TEST_CMP="$DIFF -c" else GIT_TEST_CMP="$DIFF -u" fi fi # Test repository rm -fr "$TRASH_DIRECTORY" || { GIT_EXIT_OK=t echo >&5 "FATAL: Cannot prepare test area" exit 1 } HOME="$TRASH_DIRECTORY" GNUPGHOME="$HOME/gnupg-home-not-used" export HOME GNUPGHOME if test -z "$TEST_NO_CREATE_REPO" then test_create_repo "$TRASH_DIRECTORY" else mkdir -p "$TRASH_DIRECTORY" fi # Use -P to resolve symlinks in our working directory so that the cwd # in subprocesses like git equals our $PWD (for pathname comparisons). 
cd -P "$TRASH_DIRECTORY" || exit 1 this_test=${0##*/} this_test=${this_test%%-*} if match_pattern_list "$this_test" $GIT_SKIP_TESTS then say_color info >&3 "skipping test $this_test altogether" skip_all="skip all tests in $this_test" test_done fi if test -n "$write_junit_xml" then junit_xml_dir="$TEST_OUTPUT_DIRECTORY/out" mkdir -p "$junit_xml_dir" junit_xml_base=${0##*/} junit_xml_path="$junit_xml_dir/TEST-${junit_xml_base%.sh}.xml" junit_attrs="name=\"${junit_xml_base%.sh}\"" junit_attrs="$junit_attrs timestamp=\"$(TZ=UTC \ date +%Y-%m-%dT%H:%M:%S)\"" write_junit_xml --truncate "" " " junit_suite_start=$(test-tool date getnanos) if test -n "$GIT_TEST_TEE_OUTPUT_FILE" then GIT_TEST_TEE_OFFSET=0 fi fi # Provide an implementation of the 'yes' utility; the upper bound # limit is there to help Windows that cannot stop this loop from # wasting cycles when the downstream stops reading, so do not be # tempted to turn it into an infinite loop. cf. 6129c930 ("test-lib: # limit the output of the yes utility", 2016-02-02) yes () { if test $# = 0 then y=y else y="$*" fi i=0 while test $i -lt 99 do echo "$y" i=$(($i+1)) done } # Fix some commands on Windows, and other OS-specific things uname_s=$(uname -s) case $uname_s in *MINGW*) # Windows has its own (incompatible) sort and find sort () { /usr/bin/sort "$@" } find () { /usr/bin/find "$@" } # git sees Windows-style pwd pwd () { builtin pwd -W } # no POSIX permissions # backslashes in pathspec are converted to '/' # exec does not inherit the PID test_set_prereq MINGW test_set_prereq NATIVE_CRLF test_set_prereq SED_STRIPS_CR test_set_prereq GREP_STRIPS_CR test_set_prereq WINDOWS GIT_TEST_CMP=mingw_test_cmp ;; *CYGWIN*) test_set_prereq POSIXPERM test_set_prereq EXECKEEPSPID test_set_prereq CYGWIN test_set_prereq SED_STRIPS_CR test_set_prereq GREP_STRIPS_CR test_set_prereq WINDOWS ;; *) test_set_prereq POSIXPERM test_set_prereq BSLASHPSPEC test_set_prereq EXECKEEPSPID ;; esac ( COLUMNS=1 && test $COLUMNS = 1 ) && test_set_prereq 
COLUMNS_CAN_BE_1 test -z "$NO_PERL" && test_set_prereq PERL test -z "$NO_PTHREADS" && test_set_prereq PTHREADS test -z "$NO_PYTHON" && test_set_prereq PYTHON test -n "$USE_LIBPCRE1$USE_LIBPCRE2" && test_set_prereq PCRE test -n "$USE_LIBPCRE1" && test_set_prereq LIBPCRE1 test -n "$USE_LIBPCRE2" && test_set_prereq LIBPCRE2 test -z "$NO_GETTEXT" && test_set_prereq GETTEXT if test -n "$GIT_TEST_GETTEXT_POISON_ORIG" then GIT_TEST_GETTEXT_POISON=$GIT_TEST_GETTEXT_POISON_ORIG unset GIT_TEST_GETTEXT_POISON_ORIG fi # Can we rely on git's output in the C locale? if test -z "$GIT_TEST_GETTEXT_POISON" then test_set_prereq C_LOCALE_OUTPUT fi if test -z "$GIT_TEST_CHECK_CACHE_TREE" then GIT_TEST_CHECK_CACHE_TREE=true export GIT_TEST_CHECK_CACHE_TREE fi test_lazy_prereq PIPE ' # test whether the filesystem supports FIFOs test_have_prereq !MINGW,!CYGWIN && rm -f testfifo && mkfifo testfifo ' test_lazy_prereq SYMLINKS ' # test whether the filesystem supports symbolic links ln -s x y && test -h y ' test_lazy_prereq FILEMODE ' test "$(git config --bool core.filemode)" = true ' test_lazy_prereq CASE_INSENSITIVE_FS ' echo good >CamelCase && echo bad >camelcase && test "$(cat CamelCase)" != good ' test_lazy_prereq FUNNYNAMES ' test_have_prereq !MINGW && touch -- \ "FUNNYNAMES tab embedded" \ "FUNNYNAMES \"quote embedded\"" \ "FUNNYNAMES newline embedded" 2>/dev/null && rm -- \ "FUNNYNAMES tab embedded" \ "FUNNYNAMES \"quote embedded\"" \ "FUNNYNAMES newline embedded" 2>/dev/null ' test_lazy_prereq UTF8_NFD_TO_NFC ' # check whether FS converts nfd unicode to nfc auml=$(printf "\303\244") aumlcdiar=$(printf "\141\314\210") >"$auml" && test -f "$aumlcdiar" ' test_lazy_prereq AUTOIDENT ' sane_unset GIT_AUTHOR_NAME && sane_unset GIT_AUTHOR_EMAIL && git var GIT_AUTHOR_IDENT ' test_lazy_prereq EXPENSIVE ' test -n "$GIT_TEST_LONG" ' test_lazy_prereq EXPENSIVE_ON_WINDOWS ' test_have_prereq EXPENSIVE || test_have_prereq !MINGW,!CYGWIN ' test_lazy_prereq USR_BIN_TIME ' test -x /usr/bin/time ' 
test_lazy_prereq NOT_ROOT ' uid=$(id -u) && test "$uid" != 0 ' test_lazy_prereq JGIT ' type jgit ' # SANITY is about "can you correctly predict what the filesystem would # do by only looking at the permission bits of the files and # directories?" A typical example of !SANITY is running the test # suite as root, where a test may expect "chmod -r file && cat file" # to fail because file is supposed to be unreadable after a successful # chmod. In an environment (i.e. combination of what filesystem is # being used and who is running the tests) that lacks SANITY, you may # be able to delete or create a file when the containing directory # doesn't have write permissions, or access a file even if the # containing directory doesn't have read or execute permissions. test_lazy_prereq SANITY ' mkdir SANETESTD.1 SANETESTD.2 && chmod +w SANETESTD.1 SANETESTD.2 && >SANETESTD.1/x 2>SANETESTD.2/x && chmod -w SANETESTD.1 && chmod -r SANETESTD.1/x && chmod -rx SANETESTD.2 || BUG "cannot prepare SANETESTD" ! test -r SANETESTD.1/x && ! rm SANETESTD.1/x && ! test -f SANETESTD.2/x status=$? chmod +rwx SANETESTD.1 SANETESTD.2 && rm -rf SANETESTD.1 SANETESTD.2 || BUG "cannot clean SANETESTD" return $status ' test FreeBSD != $uname_s || GIT_UNZIP=${GIT_UNZIP:-/usr/local/bin/unzip} GIT_UNZIP=${GIT_UNZIP:-unzip} test_lazy_prereq UNZIP ' "$GIT_UNZIP" -v test $? 
-ne 127 ' run_with_limited_cmdline () { (ulimit -s 128 && "$@") } test_lazy_prereq CMDLINE_LIMIT ' test_have_prereq !MINGW,!CYGWIN && run_with_limited_cmdline true ' run_with_limited_stack () { (ulimit -s 128 && "$@") } test_lazy_prereq ULIMIT_STACK_SIZE ' test_have_prereq !MINGW,!CYGWIN && run_with_limited_stack true ' build_option () { git version --build-options | sed -ne "s/^$1: //p" } test_lazy_prereq LONG_IS_64BIT ' test 8 -le "$(build_option sizeof-long)" ' test_lazy_prereq TIME_IS_64BIT 'test-tool date is64bit' test_lazy_prereq TIME_T_IS_64BIT 'test-tool date time_t-is64bit' test_lazy_prereq CURL ' curl --version ' # SHA1 is a test if the hash algorithm in use is SHA-1. This is both for tests # which will not work with other hash algorithms and tests that work but don't # test anything meaningful (e.g. special values which cause short collisions). test_lazy_prereq SHA1 ' test $(git hash-object /dev/null) = e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 ' test_lazy_prereq REBASE_P ' test -z "$GIT_TEST_SKIP_REBASE_P" '