pax_global_header00006660000000000000000000000064147542147670014533gustar00rootroot0000000000000052 comment=559491629cb630950e8c4954712d87471da2c11b pcre2-ocaml-8.0.3/000077500000000000000000000000001475421476700136475ustar00rootroot00000000000000pcre2-ocaml-8.0.3/.clang-format000066400000000000000000000000231475421476700162150ustar00rootroot00000000000000BasedOnStyle: LLVM pcre2-ocaml-8.0.3/.editorconfig000066400000000000000000000005451475421476700163300ustar00rootroot00000000000000# EditorConfig: https://EditorConfig.org # Top-most EditorConfig file root = true # Default settings for all files [*] charset = utf-8 end_of_line = lf insert_final_newline = true trim_trailing_whitespace = true indent_style = space indent_size = 2 max_line_length = 80 # Makefile [Makefile] # Makefiles require tabs instead of spaces indent_style = tab pcre2-ocaml-8.0.3/.github/000077500000000000000000000000001475421476700152075ustar00rootroot00000000000000pcre2-ocaml-8.0.3/.github/workflows/000077500000000000000000000000001475421476700172445ustar00rootroot00000000000000pcre2-ocaml-8.0.3/.github/workflows/main.yml000066400000000000000000000011561475421476700207160ustar00rootroot00000000000000name: Builds, tests & co on: - pull_request - push - workflow_dispatch permissions: read-all jobs: build: strategy: fail-fast: false matrix: os: - ubuntu-latest - macos-latest runs-on: ${{ matrix.os }} steps: - name: Checkout tree uses: actions/checkout@v4 - name: Set-up OCaml ${{ matrix.ocaml-compiler }} uses: ocaml/setup-ocaml@v3 with: ocaml-compiler: 5 - run: opam install . 
--deps-only --with-test - run: opam exec -- dune build - run: opam exec -- dune runtest # vim: filetype=yaml pcre2-ocaml-8.0.3/.gitignore000066400000000000000000000000401475421476700156310ustar00rootroot00000000000000.*.swp .merlin *.install _build pcre2-ocaml-8.0.3/.ocamlformat000066400000000000000000000001521475421476700161520ustar00rootroot00000000000000version = 0.27.0 profile = conventional # Default overrides wrap-comments = true parse-docstrings = true pcre2-ocaml-8.0.3/CHANGES.md000066400000000000000000000012511475421476700152400ustar00rootroot00000000000000# Changelog ## 8.0.3 (2025-02-15) - ugh: forgot to make the function caml_alloc_some (for ocaml [4.08, 4.11] support) static. It clashes with the same function from the package `pcre` ## 8.0.2 (2024-12-26) - Thanks to @nojb, try to get it working for ocaml [4.08, 4.11] ## 8.0.1 (2024-12-20) - Merged all changes from old `pcre-ocaml`. - Fixed a bug in the `full_split` function where non-capturing groups were not identified as such. ## 7.5.3 (2024-12-23) * @mmottl fixed bug in `full_split` ## 7.5.2 (2023-09-06) - Fixed bug in `full_split`, added first unit-test for same ## 7.5.1 (2023-09-01) - Created pcre2-ocaml bindings based on original pcre-ocaml project pcre2-ocaml-8.0.3/LICENSE.md000066400000000000000000000654271475421476700152710ustar00rootroot00000000000000Copyright (c) 1999- Markus Mottl The Library is distributed under the terms of the GNU Lesser General Public License version 2.1 (included below). As a special exception to the GNU Lesser General Public License, you may link, statically or dynamically, a "work that uses the Library" with a publicly distributed version of the Library to produce an executable file containing portions of the Library, and distribute that executable file under terms of your choice, without any of the additional requirements listed in clause 6 of the GNU Lesser General Public License. 
By "a publicly distributed version of the Library", we mean either the unmodified Library as distributed by the authors, or a modified version of the Library that is distributed under the conditions defined in clause 2 of the GNU Lesser General Public License. This exception does not however invalidate any other reasons why the executable file might be covered by the GNU Lesser General Public License. --------------------------------------------------------------------------- ### GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] ### Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. 
Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. 
For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. ### TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION **0.** This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. 
For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. **1.** You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. **2.** You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: - **a)** The modified work must itself be a software library. - **b)** You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. - **c)** You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. 
- **d)** If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 
**3.** You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. **4.** You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. **5.** A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. 
However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. **6.** As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. 
If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: - **a)** Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) - **b)** Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. - **c)** Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. - **d)** If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. - **e)** Verify that the user has already received a copy of these materials or that you have already sent this user a copy. 
For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. **7.** You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: - **a)** Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. - **b)** Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. **8.** You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. 
However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. **9.** You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. **10.** Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. **11.** If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. 
If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. **12.** If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. **13.** The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. 
If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. **14.** If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. **NO WARRANTY** **15.** BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. **16.** IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
### END OF TERMS AND CONDITIONS ### How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. one line to give the library's name and an idea of what it does. Copyright (C) year name of author This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. signature of Ty Coon, 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
pcre2-ocaml-8.0.3/Makefile000066400000000000000000000001341475421476700153050ustar00rootroot00000000000000.PHONY: all clean doc all: dune build @install clean: dune clean doc: dune build @doc pcre2-ocaml-8.0.3/README.md000066400000000000000000000103311475421476700151240ustar00rootroot00000000000000# PCRE2-OCaml - Perl Compatibility Regular Expressions for OCaml Fork of the original [pcre-ocaml project](https://github.com/mmottl/pcre-ocaml) for PCRE2 support. These are the bindings as needed by the [Haxe compiler](https://github.com/HaxeFoundation/haxe). I do not plan on maintaining this repository. This [OCaml](http://www.ocaml.org) library interfaces with the C library [PCRE2](http://www.pcre.org), providing Perl-compatible regular expressions for string matching. ## Features PCRE2-OCaml offers: - Pattern searching - Subpattern extraction - String splitting by patterns - Pattern substitution Reasons to choose PCRE2-OCaml: - The PCRE2 library by Philip Hazel is mature and stable, implementing nearly all Perl regular expression features. High-level OCaml functions (split, replace, etc.) are compatible with Perl functions, as much as OCaml allows. Some developers find Perl-style regex syntax more intuitive and powerful than the Emacs-style regex used in OCaml's `Str` module. - PCRE2-OCaml is reentrant and thread-safe, unlike the `Str` module. This reentrancy offers convenience, eliminating concerns about library state. - High-level replacement and substitution functions in OCaml are faster than those in the `Str` module. When compiled to native code, they can even outperform Perl's C-based functions. - Returned data is unique, allowing safe destructive updates without side effects. - The library interface uses labels and default arguments for enhanced programming comfort. ## Usage Please run: ``` $ odig odoc pcre2 ``` Or (maybe?): ``` $ dune build @doc ``` Functions support two flag types: 1. 
**Convenience flags**: Readable and concise, translated internally on each call. Example: ```ocaml let rex = Pcre2.regexp ~flags:[`ANCHORED; `CASELESS] "some pattern" in (* ... *) ``` These are easy to use but may incur overhead in loops. For performance optimization, consider the next approach. 2. **Internal flags**: Predefined and translated from convenience flags for optimal loop performance. Example: ```ocaml let iflags = Pcre2.cflags [`ANCHORED; `CASELESS] in for i = 1 to 1000 do let rex = Pcre2.regexp ~iflags "some pattern constructed at runtime" in (* ... *) done ``` Translating flags outside loops saves cycles. Avoid creating regex in loops: ```ocaml for i = 1 to 1000 do let chunks = Pcre2.split ~pat:"[ \t]+" "foo bar" in (* ... *) done ``` Instead, predefine the regex: ```ocaml let rex = Pcre2.regexp "[ \t]+" in for i = 1 to 1000 do let chunks = Pcre2.split ~rex "foo bar" in (* ... *) done ``` Functions use optional arguments with intuitive defaults. For instance, `Pcre2.split` defaults to whitespace as the pattern. The `examples` directory contains applications demonstrating PCRE2-OCaml's functionality. ## Restartable (Partial) Pattern Matching PCRE2 includes a DFA match function for restarting partial matches with new input, exposed via `pcre2_dfa_exec`. While not suitable for extracting submatches or splitting strings, it's useful for streaming and search tasks. Example of a partial match restarted: ```ocaml utop # open Pcre2;; utop # let rex = regexp "12+3";; val rex : regexp = utop # let workspace = Array.make 40 0;; val workspace : int array = [|0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0|] utop # pcre2_dfa_match ~rex ~flags:[`PARTIAL_SOFT] ~workspace "12222";; Exception: Pcre2.Error Partial. utop # pcre2_dfa_match ~rex ~flags:[`PARTIAL_SOFT; `DFA_RESTART] ~workspace "2222222";; Exception: Pcre2.Error Partial. 
utop # pcre2_dfa_exec ~rex ~flags:[`PARTIAL_SOFT; `DFA_RESTART] ~workspace "2222222";; Exception: Pcre2.Error Partial. utop # pcre2_dfa_exec ~rex ~flags:[`PARTIAL_SOFT; `DFA_RESTART] ~workspace "223xxxx";; - : int array = [|0; 3; 0|] ``` Refer to the `pcre2_dfa_exec` documentation and the `dfa_restart` example for more information. ## Contact Information and Contributing Submit bug reports, feature requests, and contributions via the [GitHub issue tracker](https://github.com/camlp5/pcre2-ocaml/issues). For the latest information, visit: pcre2-ocaml-8.0.3/dune000066400000000000000000000002441475421476700145250ustar00rootroot00000000000000(env (dev (flags (:standard -w -9 -principal)) (c_flags (:standard -Wall -pedantic -Wextra -Wunused))) (release (ocamlopt_flags (:standard -O3)))) pcre2-ocaml-8.0.3/dune-project000066400000000000000000000012711475421476700161720ustar00rootroot00000000000000(lang dune 2.7) (name pcre2) (generate_opam_files true) (source (github camlp5/pcre2-ocaml)) (license "LGPL-2.1-or-later WITH OCaml-LGPL-linking-exception") (homepage "https://github.com/camlp5/pcre2-ocaml") (maintainers "Chet Murthy ") (authors "Markus Mottl ") (package (name pcre2) (synopsis "Bindings to the Perl Compatibility Regular Expressions library (version 2)") (description "pcre2-ocaml offers library functions for string pattern matching and\nsubstitution, similar to the functionality offered by the Perl language.") (depends (ocaml (>= 4.08)) dune-configurator (conf-libpcre2-8 :build) (ounit2 :with-test))) pcre2-ocaml-8.0.3/examples/000077500000000000000000000000001475421476700154655ustar00rootroot00000000000000pcre2-ocaml-8.0.3/examples/Makefile000066400000000000000000000002161475421476700171240ustar00rootroot00000000000000TARGETS = $(addsuffix .bc, cloc count_hash dfa_restart pcre2grep subst) .PHONY: all clean all: @dune build $(TARGETS) clean: @dune clean 
pcre2-ocaml-8.0.3/examples/README.md000066400000000000000000000022311475421476700167420ustar00rootroot00000000000000# Examples ## `cloc` This program reads C source code from `stdin` and outputs it to `stdout` with comments and empty lines removed. It's useful for counting lines of code. ## `count_hash` This program reads text from `stdin`, counts occurrences of identical words separated by whitespace, and prints the result to `stdout`. ## `pcre2grep` A grep-like program using Perl-compatible regular expressions. Start the program with the `-help` argument to see its functionality. ## `subst` Substitutes text in files using Perl-compatible regular expressions and substitution patterns. Start the program with the `-help` argument to see its functionality. Example invocation: ```sh subst '([Tt])ermcap' '$1ermCap' < /etc/termcap ``` ## `dfa_restart` Tests the DFA matching function and its partial match restart capability. Given a pattern, it accepts input incrementally, restarting the prior partial match until the pattern either succeeds or fails. 
Example interaction: ```sh $ dfa_restart.exe 'abc12+3' > abc partial match, provide more input: > 122222 partial match, provide more input: > 222 partial match, provide more input: > 3 match completed: "[|0;1;0|]" ``` pcre2-ocaml-8.0.3/examples/cloc.ml000066400000000000000000000010111475421476700167300ustar00rootroot00000000000000open Pcre2 let read_whole_channel ch = let size = 4096 in let strbuf = Bytes.create size in let buf = Buffer.create 65536 in let len = ref size in while !len <> 0 do len := input ch strbuf 0 size; Buffer.add_subbytes buf strbuf 0 !len done; Buffer.contents buf let () = let str = read_whole_channel stdin in let str = qreplace ~pat:"/\\*(.|\n)*?\\*/" str in let str = qreplace_first ~pat:"^(\n|\\s)+" str in let str = qreplace ~pat:"\n+((\n|\\s)\n)*" ~templ:"\n" str in print_string str pcre2-ocaml-8.0.3/examples/count_hash.ml000066400000000000000000000003701475421476700201520ustar00rootroot00000000000000open Hashtbl let hash = create 1973 let add_string s = try incr (find hash s) with Not_found -> add hash s (ref 1) ;; Pcre2.foreach_line (fun line -> List.iter add_string (Pcre2.split line)); iter (fun k v -> Printf.printf "%4d\t%s\n" !v k) hash pcre2-ocaml-8.0.3/examples/dfa_restart.ml000066400000000000000000000023761475421476700203250ustar00rootroot00000000000000open Pcre2 open Printf let show_array arr = Array.map string_of_int arr |> Array.to_list |> String.concat ";" |> sprintf "[|%s|]" let new_workspace () = Array.make 50 0 let () = let pat = if Array.length Sys.argv > 1 then Sys.argv.(1) else ( eprintf "%s: expected pattern argument\n" Sys.argv.(0); exit 1) in let rex = regexp pat in let rec find_match flags workspace = print_string "> "; let line, eof = try (read_line (), false) with End_of_file -> ("", true) in match pcre2_dfa_match ~rex ~flags ~workspace line with | res -> printf "match completed: %S\n" (show_array res); if not eof then ( printf "\n *input & workspace reset*\n"; find_match [ `PARTIAL_SOFT ] (new_workspace ())) | 
exception Error Partial -> printf "partial match, provide more input:\n"; find_match [ `DFA_RESTART; `PARTIAL_SOFT ] workspace | exception exn -> (match exn with | Not_found -> eprintf "pattern match failed\n" | Error WorkspaceSize -> eprintf "need larger workspace vector\n" | Error (InternalError s) -> eprintf "internal error: %s\n" s | exn -> raise exn); exit 1 in find_match [ `PARTIAL_SOFT ] (new_workspace ()) pcre2-ocaml-8.0.3/examples/dune000066400000000000000000000001501475421476700163370ustar00rootroot00000000000000(executables (names cloc count_hash pcre2grep subst dfa_restart) (libraries pcre2) (modes byte exe)) pcre2-ocaml-8.0.3/examples/pcre2grep.ml000066400000000000000000000066601475421476700177200ustar00rootroot00000000000000open Pcre2 open Printf let filenames = ref true and filenames_only = ref false and count_only = ref false and invert = ref false and number = ref false and silent = ref false and whole_lines = ref false let parse_args () = let ignore_case = ref false and pat = ref None and files = ref [] in let c = ("-c", Arg.Set count_only, "Count lines only.") and h = ( "-h", Arg.Clear filenames, "Suppress printing of filenames when searching multiple files." ) and i = ("-i", Arg.Set ignore_case, "Ignore case.") and l = ( "-l", Arg.Set filenames_only, "Only print names of files containing matching lines (once)." ) and n = ("-n", Arg.Set number, "Precede each line by its line number in the file.") and s = ( "-s", Arg.Set silent, "Display nothing but error messages. Exit status indicates match." ) and v = ("-v", Arg.Set invert, "Invert sense of the match: finds nonmatching lines.") and x = ( "-x", Arg.Set whole_lines, "Force the pattern to be anchored and to match the entire line." 
) and usage = "Usage: pcre2grep [options] pattern [file] ...\n\n\ Searches files for character patterns.\n" and anon_arg arg = if !pat = None then pat := Some arg else files := arg :: !files in let args = [ c; h; i; l; n; s; v; x ] in Arg.parse args anon_arg usage; let flags = let flag_list = if !ignore_case then [ `CASELESS ] else [] in if !whole_lines then `ANCHORED :: flag_list else flag_list in let rex = match !pat with | Some pat -> regexp ~flags pat | None -> eprintf "%s: not enough arguments!\n" Sys.argv.(0); Arg.usage args usage; exit 2 in (rex, List.rev !files) let _ = let rex, files = parse_args () and rfl = rflags [] in let _, ovector = make_ovector rex in let pcre2grep file name = let ret_code = ref 1 and linenumber = ref 0 and count = ref 0 and stdin_print_name () = match name with | Some filename -> print_endline filename | None -> print_endline "" and print_name () = match name with Some name -> printf "%s:" name | None -> () in let try_match line = let matched = try unsafe_pcre2_match rfl rex ~pos:0 ~subj_start:0 ~subj:line ovector None; if !whole_lines && ovector.(1) <> String.length line then false else true with Not_found -> false in incr linenumber; if matched <> !invert then ( if !count_only then incr count else if !filenames_only then ( stdin_print_name (); raise Exit) else if !silent then raise Exit else ( print_name (); if !number then printf "%d:" !linenumber; print_endline line); ret_code := 0) in try foreach_line ~ic:file try_match; if !count_only then ( print_name (); printf "%d\n" !count); !ret_code with Exit -> 0 in if files = [] then exit (pcre2grep stdin None); if List.length files = 1 then filenames := false; if !filenames_only then filenames := true; let collect ret_code filename = try let file = open_in filename in let frc = pcre2grep file (if !filenames then Some filename else None) in close_in file; if frc = 0 && ret_code = 1 then 0 else ret_code with Sys_error msg -> prerr_endline msg; 2 in exit (List.fold_left collect 1 files) 
pcre2-ocaml-8.0.3/examples/subst.ml000066400000000000000000000036121475421476700171610ustar00rootroot00000000000000open Pcre2 let parse_args () = let quick = ref false and first = ref false and ignore_case = ref false and offset = ref 0 and pat = ref None and substr = ref None in let q = ( "-q", Arg.Set quick, "Quick replacement. Interpretes substitution as plain text." ) and f = ("-f", Arg.Set first, "Replace first occurrence in line only.") and i = ("-i", Arg.Set ignore_case, "Ignore case.") and ofs = ("-ofs", Arg.Int (fun n -> offset := n), "Start matching at column n.") and usage = "Usage: subst [-q] [-f] [-i] [-ofs offset] pattern substitution\n\n\ Reads lines from standard input and replaces occurrences of\n\ the PERL-style regular expression \"pattern\" with \"substitution\",\n\ printing the result to standard output.\n\ In default mode the contents of \"substitution\" will be interpreted\n\ similarly to its equivalent in PERL.\n" and anon_arg arg = match (!pat, !substr) with | None, _ -> pat := Some arg | _, None -> substr := Some arg | _ -> raise (Arg.Bad "too many arguments!") in let args = [ q; f; i; ofs ] in Arg.parse args anon_arg usage; let flags = if !ignore_case then [ `CASELESS ] else [] in let rex, sstr = match (!pat, !substr) with | Some rex, Some sstr -> (regexp ~flags rex, sstr) | _ -> prerr_endline (Sys.argv.(0) ^ ": not enough arguments!"); Arg.usage args usage; exit 1 in match (!quick, !first) with | false, false -> fun s -> replace ~rex ~pos:!offset ~templ:sstr s | true, false -> fun s -> qreplace ~rex ~pos:!offset ~templ:sstr s | false, true -> fun s -> replace_first ~rex ~pos:!offset ~templ:sstr s | true, true -> fun s -> qreplace_first ~rex ~pos:!offset ~templ:sstr s let _ = let substitute = parse_args () in foreach_line (fun line -> try print_endline (substitute line) with Invalid_argument _ -> print_endline line) 
pcre2-ocaml-8.0.3/pa_ppx_test/000077500000000000000000000000001475421476700161755ustar00rootroot00000000000000pcre2-ocaml-8.0.3/pa_ppx_test/Makefile000066400000000000000000000003611475421476700176350ustar00rootroot00000000000000NOT_OCAMLFIND=not-ocamlfind bootstrap: ../test/pcre2_tests.ml ../test/%.ml: %.ml $(NOT_OCAMLFIND) preprocess -package pa_ppx_regexp,camlp5.pr_o -ppopt -pa_ppx_regexp-nostatic -syntax camlp5o $< > $@.NEW && \ mv $@.NEW $@ .SUFFIXES: .ml pcre2-ocaml-8.0.3/pa_ppx_test/pcre2_tests.ml000066400000000000000000000173071475421476700207740ustar00rootroot00000000000000(**pp -syntax camlp5o -package pa_ppx.deriving_plugins.std *) open OUnit2 let test_special_char_regexps ctxt = () ; assert_equal "\n" ([%match {|\n$|}/s exc pcre2 strings] "\n") ; assert_equal "" ([%subst {|\n+$|} / {||} /s pcre2] "\n\n") let test_pcre2_simple_match ctxt = () ; assert_equal "abc" (Pcre2.get_substring ([%match "abc"/exc raw pcre2] "abc") 0) ; assert_equal (Some "abc") ([%match "abc"/pcre2] "abc") ; assert_equal (Some "abc") ([%match "abc"/strings pcre2] "abc") ; assert_equal true ([%match "abc"/pred pcre2] "abc") ; assert_equal false ([%match "abc"/pred pcre2] "abd") ; assert_equal None ([%match "abc"/pcre2] "abd") ; assert_raises Not_found (fun () -> [%match "abc"/exc pcre2] "abd") ; assert_raises Not_found (fun () -> [%match "abc"/exc strings pcre2] "abd") ; assert_equal None ([%match "abc"/strings pcre2] "abd") ; assert_equal "abc" ([%match "abc"/exc strings pcre2] "abc") ; assert_equal ("abc", Some "b") ([%match "a(b)c"/exc strings pcre2] "abc") ; assert_equal ("ac", None) ([%match "a(?:(b)?)c"/exc strings pcre2] "ac") ; assert_equal "abc" (Pcre2.get_substring ([%match "ABC"/exc raw i pcre2] "abc") 0) ; assert_equal ("abc", Some "a", Some "b", Some "c") ([%match "(a)(b)(c)"/exc strings pcre2] "abc") let test_pcre2_selective_match ctxt = () ; assert_equal ("abc", Some "b") ([%match "a(b)c"/exc strings (!0,1) pcre2] "abc") ; assert_equal ("abc", "b") ([%match "a(b)c"/exc 
strings (!0,!1) pcre2] "abc") ; assert_equal "b" ([%match "a(b)c"/exc strings !1 pcre2] "abc") ; assert_equal (Some ("abc", "b")) ([%match "a(b)c"/ strings (!0,!1) pcre2] "abc") ; assert_equal ("ac", None) ([%match "a(b)?c"/exc strings (!0,1) pcre2] "ac") ; assert_raises Not_found (fun _ -> [%match "a(b)?c"/exc strings (!0,!1) pcre2] "ac") ; assert_equal None ([%match "a(b)?c"/ strings (!0,!1) pcre2] "ac") let test_pcre2_search ctxt = () ; assert_equal "abc" ([%match "abc"/exc strings pcre2] "zzzabc") ; assert_equal None ([%match "^abc"/strings pcre2] "zzzabc") let show_string_option = function None -> "None" | Some s -> Printf.sprintf "Some %s" s let test_pcre2_single ctxt = let printer = show_string_option in () ; assert_equal ~printer None ([%match ".+"/pcre2] "\n\n") ; assert_equal ~printer None ([%match ".+" / m pcre2 strings] "\n\n") ; assert_equal ~printer None ([%match ".+"/ pcre2 strings] "\n\n") ; assert_equal ~printer (Some "\n\n") ([%match ".+"/s pcre2 strings] "\n\n") ; assert_equal ~printer None ([%match ".+"/m pcre2 strings] "\n\n") ; let printer x = x in () ; assert_equal ~printer "\n\n" ([%match ".+" / s exc pcre2 strings] "\n\n") ; assert_equal ~printer "<>\ndef" ([%subst ".+" / {|<<$0>>|} / pcre2] "abc\ndef") ; assert_equal ~printer "<>" ([%subst ".+" / {|<<$0>>|}/s pcre2] "abc\ndef") ; assert_equal ~printer "<>\ndef" ([%subst ".+" / {|<<$0>>|}/m pcre2] "abc\ndef") ; assert_equal ~printer "<>\ndef" ([%subst ".*" / {|<<$0>>|} /pcre2] "abc\ndef") ; assert_equal ~printer "<><<>>\n<><<>>" ([%subst ".*" / {|<<$0>>|} / g pcre2] "abc\ndef") ; assert_equal ~printer "<>\n<>" ([%subst ".+" / {|<<$0>>|} / g pcre2] "abc\ndef") ; assert_equal ~printer "<>a\nc<>" ([%subst "a.c" / {|<<$0>>|} / g pcre2] "abca\ncaec") ; assert_equal ~printer "<><><>" ([%subst "a.c" / {|<<$0>>|} / g s pcre2] "abca\ncaec") let test_pcre2_multiline ctxt = () ; assert_equal (Some "bar") ([%match ".+$"/ strings pcre2] "foo\nbar") ; assert_equal (Some "foo") ([%match ".+$"/ m strings 
pcre2] "foo\nbar") let test_pcre2_simple_split ctxt = () ; assert_equal ["bb"] ([%split "a"/pcre2] "bb") let test_pcre2_delim_split_raw ctxt = let open Pcre2 in () ; assert_equal [Delim "a"; Text "b"; Delim "a"; Text "b"] ([%split "a"/pcre2 raw] "ababa") ; assert_equal [Delim "a"; Text "b"; Delim "a"; Delim "a"; Text "b"] ([%split "a"/pcre2 raw] "abaaba") ; assert_equal [Delim "a"; NoGroup; Text "b"; Delim "ac"; Group (1, "c"); Text "b"; Delim "a"; NoGroup] ([%split "a(c)?"/pcre2 raw] "abacba") ; assert_equal [Delim "ac"; Group (1, "c"); Text "b"; Delim "ac"; Group (1, "c"); Text "b"; Delim "ac"; Group (1, "c")] ([%split "a(c)"/pcre2 raw] "acbacbac") ; assert_equal [Delim "ac"; Group (1, "c"); Text "b"; Delim "ac"; Group (1, "c"); Text "b"; Delim "ac"; Group (1, "c")] ([%split "a(c)"/pcre2 raw] "acbacbac") ; assert_equal [Delim "a"; NoGroup; Text "b"; Delim "ac"; Group (1, "c"); Text "b"; Delim "a"; NoGroup] ([%split "a(c)?"/pcre2 raw] "abacba") ; assert_equal [Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd"] ([%split {|(x)|(u)|} / raw pcre2] "abxcd") ; assert_equal [Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd"; Delim "u"; NoGroup; Group (2, "u")] ([%split {|(x)|(u)|} / raw pcre2] "abxcdu") let test_pcre2_string_pattern ctxt = () ; assert_equal "$b" ([%pattern {|$$$1|} /pcre2] ([%match "a(b)c"/exc pcre2 raw] "abc")) ; assert_equal "b" ([%pattern {|${01}|} /pcre2] ([%match "a(b)c"/exc pcre2 raw] "abc")) ; assert_equal "bx" (let s = "x" in [%pattern {|${01}${s}|} /pcre2] ([%match "a(b)c"/exc pcre2 raw] "abc")) ; assert_equal {|"bx|} (let s = "x" in [%pattern {|"${01}${s}|} /pcre2] ([%match "a(b)c"/exc pcre2 raw] "abc")) ; assert_equal {|"x|} (let s = "x" in [%pattern {|"${s}|} /pcre2]) let test_pcre2_expr_pattern ctxt = () ; assert_equal "abc" ([%pattern "$0$" / e pcre2] ([%match "abc"/exc pcre2 raw] "abc")) ; assert_equal "abcx" ([%pattern {|$0$ ^ "x"|} / e pcre2] ([%match "abc"/exc pcre2 raw] "abc")) ; assert_equal "abcx" (let x = "x" in 
[%pattern {|$0$ ^ x|} / e pcre2] ([%match "abc"/exc pcre2 raw] "abc")) ; assert_equal "x" (let x = "x" in [%pattern {|"" ^ x|} / e pcre2]) let test_pcre2_subst ctxt = () ; assert_equal "$b" ([%subst "a(b)c" / {|$$$1|} /pcre2] "abc") ; assert_equal "$b" ([%subst "A(B)C" / {|$$$1|} / i pcre2] "abc") ; assert_equal "$babc" ([%subst "A(B)C" / {|$$$1|} / i pcre2] "abcabc") ; assert_equal "$b$b" ([%subst "A(B)C" / {|$$$1|} / g i pcre2] "abcabc") ; assert_equal "$b$b" ([%subst "A(B)C" / {|"$" ^ $1$|} / e g i pcre2] "abcabc") ; assert_equal "$$" ([%subst "A(B)C" / {|"$"|} / e g i pcre2] "abcabc") ; assert_equal "$$" ([%subst "A(B)C" / {|$$|} / g i pcre2] "abcabc") let test_pcre2_ocamlfind_bits ctxt = () ; assert_equal ~printer:show_string_option (Some "-syntax camlp5o ") (snd ([%match {|^\(\*\*pp (.*?)\*\)|} / exc strings pcre2] {|(**pp -syntax camlp5o *) |})) let pcre2_envsubst envlookup s = let f s1 s2 = if s1 <> "" then envlookup s1 else if s2 <> "" then envlookup s2 else assert false in [%subst {|(?:\$\(([^)]+)\)|\$\{([^}]+)\})|} / {| f $1$ $2$ |} / g e pcre2] s let test_pcre2_envsubst_via_replace ctxt = let f = function "A" -> "res1" | "B" -> "res2" | _ -> failwith "unexpected arg in envsubst" in assert_equal "...res1...res2..." 
(pcre2_envsubst f {|...$(A)...${B}...|}) let suite = "Test pa_ppx_regexp" >::: [ "pcre2 simple_match" >:: test_pcre2_simple_match ; "pcre2 selective_match" >:: test_pcre2_selective_match ; "pcre2 search" >:: test_pcre2_search ; "pcre2 single" >:: test_pcre2_single ; "pcre2 multiline" >:: test_pcre2_multiline ; "pcre2 simple_split" >:: test_pcre2_simple_split ; "pcre2 delim_split raw" >:: test_pcre2_delim_split_raw ; "pcre2 string_pattern" >:: test_pcre2_string_pattern ; "pcre2 expr_pattern" >:: test_pcre2_expr_pattern ; "pcre2 subst" >:: test_pcre2_subst ; "pcre2 ocamlfind bits" >:: test_pcre2_ocamlfind_bits ; "pcre2 envsubst via replace" >:: test_pcre2_envsubst_via_replace ; "pcre only_regexps" >:: test_special_char_regexps ] let _ = if not !Sys.interactive then run_test_tt_main suite else () pcre2-ocaml-8.0.3/pcre2.opam000066400000000000000000000020231475421476700155350ustar00rootroot00000000000000# This file is generated by dune, edit dune-project instead Version: "8.0.3" opam-version: "2.0" synopsis: "Bindings to the Perl Compatibility Regular Expressions library (version 2)" description: """ pcre2-ocaml offers library functions for string pattern matching and substitution, similar to the functionality offered by the Perl language.""" maintainer: ["Chet Murthy "] authors: ["Markus Mottl "] license: "LGPL-2.1-or-later WITH OCaml-LGPL-linking-exception" homepage: "https://github.com/camlp5/pcre2-ocaml" bug-reports: "https://github.com/camlp5/pcre2-ocaml/issues" depends: [ "dune" {>= "2.7"} "ocaml" {>= "4.08"} "dune-configurator" "conf-libpcre2-8" {build} "ounit2" {with-test} "odoc" {with-doc} ] build: [ ["dune" "subst"] {dev} [ "dune" "build" "-p" name "-j" jobs "@install" "@runtest" {with-test} "@doc" {with-doc} ] ] dev-repo: "git+https://github.com/camlp5/pcre2-ocaml.git" url { src: "" checksum: [ "sha512=" ] } 
pcre2-ocaml-8.0.3/src/000077500000000000000000000000001475421476700144365ustar00rootroot00000000000000pcre2-ocaml-8.0.3/src/Makefile000066400000000000000000000001531475421476700160750ustar00rootroot00000000000000TARGETS = pcre2.cma libpcre2_stubs.a .PHONY: all clean all: @dune build $(TARGETS) clean: @dune clean pcre2-ocaml-8.0.3/src/config/000077500000000000000000000000001475421476700157035ustar00rootroot00000000000000pcre2-ocaml-8.0.3/src/config/Makefile000066400000000000000000000001341475421476700173410ustar00rootroot00000000000000TARGETS = discover.bc .PHONY: all clean all: @dune build $(TARGETS) clean: @dune clean pcre2-ocaml-8.0.3/src/config/discover.ml000066400000000000000000000007461475421476700200620ustar00rootroot00000000000000let () = let module C = Configurator.V1 in C.main ~name:"pcre2" (fun c -> let default : C.Pkg_config.package_conf = { libs = [ "-lpcre2-8" ]; cflags = [] } in let conf = match C.Pkg_config.get c with | None -> default | Some pc -> Option.value (C.Pkg_config.query pc ~package:"libpcre2-8") ~default in C.Flags.write_sexp "c_flags.sexp" conf.cflags; C.Flags.write_sexp "c_library_flags.sexp" conf.libs) pcre2-ocaml-8.0.3/src/config/dune000066400000000000000000000001211475421476700165530ustar00rootroot00000000000000(executables (names discover) (libraries dune.configurator) (modes byte exe)) pcre2-ocaml-8.0.3/src/dune000066400000000000000000000004641475421476700153200ustar00rootroot00000000000000(library (public_name pcre2) (foreign_stubs (language c) (names pcre2_stubs) (flags (:standard) (:include c_flags.sexp) -O2 -fPIC -DPIC)) (c_library_flags (:include c_library_flags.sexp))) (rule (targets c_flags.sexp c_library_flags.sexp) (action (run ./config/discover.exe))) pcre2-ocaml-8.0.3/src/pcre2.ml000066400000000000000000001214671475421476700160160ustar00rootroot00000000000000(* PCRE2-OCAML - Perl Compatibility Regular Expressions for OCaml Copyright © 1999- Markus Mottl This library is free software; you can redistribute it and/or modify 
it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *) (* Public exceptions and their registration with the C runtime *) open Printf type error = | Partial | BadPattern of string * int | BadUTF | BadUTFOffset | MatchLimit | DepthLimit | WorkspaceSize | InternalError of string let string_of_error = function | Partial -> "Partial" | BadPattern (msg, pos) -> sprintf "Pcre2.BadPattern(%S, pos=%i)" msg pos | BadUTF -> "BadUTF" | BadUTFOffset -> "BadUTFOffset" | MatchLimit -> "MatchLimit" | DepthLimit -> "DepthLimit" | WorkspaceSize -> "WorkspaceSize" | InternalError msg -> sprintf "InternalError(%S)" msg exception Error of error exception Backtrack exception Regexp_or of string * error let string_of_exn = function | Error error -> Some (sprintf "Pcre2.Error(%s)" (string_of_error error)) | Backtrack -> Some "Pcre2.Backtrack" | Regexp_or (pat, error) -> Some (sprintf "Pcre2.Regexp_or(pat=%S, %s)" pat (string_of_error error)) | _not_from_pcre -> None let () = Printexc.register_printer string_of_exn (* Puts exceptions into global C-variables for fast retrieval *) external pcre2_ocaml_init : unit -> unit = "pcre2_ocaml_init" (* Registers exceptions with the C runtime and caches polymorphic variants *) let () = Callback.register_exception "Pcre2.Error" (Error (InternalError "")); Callback.register_exception "Pcre2.Backtrack" Backtrack; pcre2_ocaml_init () (* Compilation and runtime flags and their conversion 
functions *) type icflag = int64 type irflag = int64 (* Compilation flags *) type cflag = [ `ALLOW_EMPTY_CLASS | `ALT_BSUX | `ALT_CIRCUMFLEX | `ALT_VERBNAMES | `ANCHORED | `AUTO_CALLOUT | `CASELESS | `DOLLAR_ENDONLY | `DOTALL | `DUPNAMES | `ENDANCHORED | `EXTENDED | `EXTENDED_MORE | `FIRSTLINE | `LITERAL | `MATCH_INVALID_UTF | `MATCH_UNSET_BACKREF | `MULTILINE | `NEVER_BACKSLASH_C | `NEVER_UCP | `NEVER_UTF | `NO_AUTO_CAPTURE | `NO_AUTO_POSSESS | `NO_DOTSTAR_ANCHOR | `NO_START_OPTIMIZE | `NO_UTF_CHECK | `UCP | `UNGREEDY | `USE_OFFSET_LIMIT | `UTF ] let int_of_cflag = function | `ALLOW_EMPTY_CLASS -> 0x00000001L | `ALT_BSUX -> 0x00000002L | `AUTO_CALLOUT -> 0x00000004L | `CASELESS -> 0x00000008L | `DOLLAR_ENDONLY -> 0x00000010L | `DOTALL -> 0x00000020L | `DUPNAMES -> 0x00000040L | `EXTENDED -> 0x00000080L | `FIRSTLINE -> 0x00000100L | `MATCH_UNSET_BACKREF -> 0x00000200L | `MULTILINE -> 0x00000400L | `NEVER_UCP -> 0x00000800L | `NEVER_UTF -> 0x00001000L | `NO_AUTO_CAPTURE -> 0x00002000L | `NO_AUTO_POSSESS -> 0x00004000L | `NO_DOTSTAR_ANCHOR -> 0x00008000L | `NO_START_OPTIMIZE -> 0x00010000L | `UCP -> 0x00020000L | `UNGREEDY -> 0x00040000L | `UTF -> 0x00080000L | `NEVER_BACKSLASH_C -> 0x00100000L | `ALT_CIRCUMFLEX -> 0x00200000L | `ALT_VERBNAMES -> 0x00400000L | `USE_OFFSET_LIMIT -> 0x00800000L | `EXTENDED_MORE -> 0x01000000L | `LITERAL -> 0x02000000L | `MATCH_INVALID_UTF -> 0x04000000L | `ENDANCHORED -> 0x20000000L | `NO_UTF_CHECK -> 0x40000000L | `ANCHORED -> 0x80000000L let coll_icflag icflag flag = Int64.logor (int_of_cflag flag) icflag let cflags flags = List.fold_left coll_icflag 0L flags let cflag_of_int = function | 0x00000001L -> `ALLOW_EMPTY_CLASS | 0x00000002L -> `ALT_BSUX | 0x00000004L -> `AUTO_CALLOUT | 0x00000008L -> `CASELESS | 0x00000010L -> `DOLLAR_ENDONLY | 0x00000020L -> `DOTALL | 0x00000040L -> `DUPNAMES | 0x00000080L -> `EXTENDED | 0x00000100L -> `FIRSTLINE | 0x00000200L -> `MATCH_UNSET_BACKREF | 0x00000400L -> `MULTILINE | 0x00000800L -> 
`NEVER_UCP | 0x00001000L -> `NEVER_UTF | 0x00002000L -> `NO_AUTO_CAPTURE | 0x00004000L -> `NO_AUTO_POSSESS | 0x00008000L -> `NO_DOTSTAR_ANCHOR | 0x00010000L -> `NO_START_OPTIMIZE | 0x00020000L -> `UCP | 0x00040000L -> `UNGREEDY | 0x00080000L -> `UTF | 0x00100000L -> `NEVER_BACKSLASH_C | 0x00200000L -> `ALT_CIRCUMFLEX | 0x00400000L -> `ALT_VERBNAMES | 0x00800000L -> `USE_OFFSET_LIMIT | 0x01000000L -> `EXTENDED_MORE | 0x02000000L -> `LITERAL | 0x04000000L -> `MATCH_INVALID_UTF | 0x20000000L -> `ENDANCHORED | 0x40000000L -> `NO_UTF_CHECK | 0x80000000L -> `ANCHORED | _ -> failwith "Pcre2.cflag_list: unknown compilation flag" let all_cflags = [ 0x00000001L; 0x00000002L; 0x00000004L; 0x00000008L; 0x00000010L; 0x00000020L; 0x00000040L; 0x00000080L; 0x00000100L; 0x00000200L; 0x00000400L; 0x00000800L; 0x00001000L; 0x00002000L; 0x00004000L; 0x00008000L; 0x00010000L; 0x00020000L; 0x00040000L; 0x00080000L; 0x00100000L; 0x00200000L; 0x00400000L; 0x00800000L; 0x01000000L; 0x02000000L; 0x04000000L; 0x20000000L; 0x40000000L; 0x80000000L; ] let cflag_list icflags = let coll flag_list flag = if Int64.equal (Int64.logand icflags flag) 0L then flag_list else cflag_of_int flag :: flag_list in List.fold_left coll [] all_cflags (* Runtime flags *) type rflag = [ `ANCHORED | `COPY_MATCHED_SUBJECT | `DFA_RESTART | `DFA_SHORTEST | `ENDANCHORED | `NOTBOL | `NOTEOL | `NOTEMPTY | `NOTEMPTY_ATSTART | `NO_JIT | `NO_UTF_CHECK | `PARTIAL_HARD | `PARTIAL_SOFT ] let int_of_rflag = function | `NOTBOL -> 0x00000001L | `NOTEOL -> 0x00000002L | `NOTEMPTY -> 0x00000004L | `NOTEMPTY_ATSTART -> 0x00000008L | `PARTIAL_SOFT -> 0x00000010L | `PARTIAL_HARD -> 0x00000020L | `DFA_RESTART -> 0x00000040L | `DFA_SHORTEST -> 0x00000080L | `NO_JIT -> 0x00002000L | `COPY_MATCHED_SUBJECT -> 0x00004000L | `ENDANCHORED -> 0x20000000L | `NO_UTF_CHECK -> 0x40000000L | `ANCHORED -> 0x80000000L let coll_irflag irflag flag = Int64.logor (int_of_rflag flag) irflag let rflags flags = List.fold_left coll_irflag 0L flags let 
rflag_of_int = function | 0x00000001L -> `NOTBOL | 0x00000002L -> `NOTEOL | 0x00000004L -> `NOTEMPTY | 0x00000008L -> `NOTEMPTY_ATSTART | 0x00000010L -> `PARTIAL_SOFT | 0x00000020L -> `PARTIAL_HARD | 0x00000040L -> `DFA_RESTART | 0x00000080L -> `DFA_SHORTEST | 0x00002000L -> `NO_JIT | 0x00004000L -> `COPY_MATCHED_SUBJECT | 0x20000000L -> `ENDANCHORED | 0x40000000L -> `NO_UTF_CHECK | 0x80000000L -> `ANCHORED | _ -> failwith "Pcre2.rflag_list: unknown runtime flag" let all_rflags = [ 0x00000001L; 0x00000002L; 0x00000004L; 0x00000008L; 0x00000010L; 0x00000020L; 0x00000040L; 0x00000080L; 0x00002000L; 0x00004000L; 0x20000000L; 0x40000000L; 0x80000000L; ] let rflag_list irflags = let coll flag_list flag = if Int64.equal (Int64.logand irflags flag) 0L then flag_list else rflag_of_int flag :: flag_list in List.fold_left coll [] all_rflags (* Information on the PCRE2-configuration (build-time options) *) external pcre2_version : unit -> string = "pcre2_version_stub" external pcre2_config_unicode : unit -> bool = "pcre2_config_unicode_stub" [@@noalloc] external pcre2_config_newline : unit -> char = "pcre2_config_newline_stub" [@@noalloc] external pcre2_config_link_size : unit -> (int[@untagged]) = "pcre2_config_link_size_stub_bc" "pcre2_config_link_size_stub" [@@noalloc] external pcre2_config_match_limit : unit -> (int[@untagged]) = "pcre2_config_match_limit_stub_bc" "pcre2_config_match_limit_stub" [@@noalloc] external pcre2_config_depth_limit : unit -> (int[@untagged]) = "pcre2_config_depth_limit_stub_bc" "pcre2_config_depth_limit_stub" [@@noalloc] external pcre2_config_stackrecurse : unit -> bool = "pcre2_config_stackrecurse_stub" [@@noalloc] let version = pcre2_version () let config_unicode = pcre2_config_unicode () let config_newline = pcre2_config_newline () let config_link_size = pcre2_config_link_size () let config_match_limit = pcre2_config_match_limit () let config_depth_limit = pcre2_config_depth_limit () let config_stackrecurse = pcre2_config_stackrecurse () (* 
Information on patterns *) type firstcodeunit_info = [ `Char of char | `Start_only | `ANCHORED ] type regexp external options : regexp -> (icflag[@unboxed]) = "pcre2_argoptions_stub_bc" "pcre2_argoptions_stub" external size : regexp -> (int[@untagged]) = "pcre2_size_stub_bc" "pcre2_size_stub" external capturecount : regexp -> (int[@untagged]) = "pcre2_capturecount_stub_bc" "pcre2_capturecount_stub" external backrefmax : regexp -> (int[@untagged]) = "pcre2_backrefmax_stub_bc" "pcre2_backrefmax_stub" external namecount : regexp -> (int[@untagged]) = "pcre2_namecount_stub_bc" "pcre2_namecount_stub" external nameentrysize : regexp -> (int[@untagged]) = "pcre2_nameentrysize_stub_bc" "pcre2_nameentrysize_stub" external names : regexp -> string array = "pcre2_names_stub" external firstcodeunit : regexp -> firstcodeunit_info = "pcre2_firstcodeunit_stub" external lastcodeunit : regexp -> char option = "pcre2_lastcodeunit_stub" (* Compilation of patterns *) type chtables external maketables : unit -> chtables = "pcre2_maketables_stub" external compile : (icflag[@unboxed]) -> chtables option -> string -> regexp = "pcre2_compile_stub_bc" "pcre2_compile_stub" (* external get_match_limit : regexp -> int option = "pcre2_get_match_limit_stub" *) (* Internal use only! *) external set_imp_match_limit : regexp -> (int[@untagged]) -> regexp = "pcre2_set_imp_match_limit_stub_bc" "pcre2_set_imp_match_limit_stub" [@@noalloc] (* external get_depth_limit : regexp -> int option = "pcre2_get_depth_limit_stub" *) (* Internal use only! 
*) external set_imp_depth_limit : regexp -> (int[@untagged]) -> regexp = "pcre2_set_imp_depth_limit_stub_bc" "pcre2_set_imp_depth_limit_stub" [@@noalloc] (* TODO implement jit using new pcre2_jit_compile api *) let regexp (* ?(jit_compile = false) *) ?limit ?depth_limit ?(iflags = 0L) ?flags ?chtables pat = let rex = match flags with | Some flag_list -> compile (cflags flag_list) chtables pat | _ -> compile iflags chtables pat in let rex = match limit with None -> rex | Some lim -> set_imp_match_limit rex lim in match depth_limit with None -> rex | Some lim -> set_imp_depth_limit rex lim let regexp_or (* ?jit_compile *) ?limit ?depth_limit ?(iflags = 0L) ?flags ?chtables pats = let check pat = try ignore (regexp ~iflags ?flags ?chtables pat) with Error error -> raise (Regexp_or (pat, error)) in List.iter check pats; let big_pat = let cnv pat = "(?:" ^ pat ^ ")" in String.concat "|" (List.rev (List.rev_map cnv pats)) in regexp (* ?jit_compile *) ?limit ?depth_limit ~iflags ?flags ?chtables big_pat let bytes_unsafe_blit_string str str_ofs bts bts_ofs len = let str_bts = Bytes.unsafe_of_string str in Bytes.unsafe_blit str_bts str_ofs bts bts_ofs len let string_unsafe_sub str ofs len = let res = Bytes.create len in bytes_unsafe_blit_string str ofs res 0 len; Bytes.unsafe_to_string res let quote s = let len = String.length s in let buf = Bytes.create (len lsl 1) in let pos = ref 0 in for i = 0 to len - 1 do match String.unsafe_get s i with | ('\\' | '^' | '$' | '.' | '[' | '|' | '(' | ')' | '?' 
| '*' | '+' | '{') as c ->
        Bytes.unsafe_set buf !pos '\\';
        incr pos;
        Bytes.unsafe_set buf !pos c;
        incr pos
    | c ->
        Bytes.unsafe_set buf !pos c;
        incr pos
  done;
  string_unsafe_sub (Bytes.unsafe_to_string buf) 0 !pos

(* Matching of patterns and subpattern extraction *)

(* Default regular expression when none is provided by the user *)
let def_rex = regexp (* ~jit_compile:true *) "\\s+"

(* A match result: the subject string paired with its offset vector. *)
type substrings = string * int array

type callout_data = {
  callout_number : int;
  substrings : substrings;
  start_match : int;
  current_position : int;
  capture_top : int;
  capture_last : int;
  pattern_position : int;
  next_item_length : int;
}

type callout = callout_data -> unit

(* Subject string of a match result. *)
let get_subject sstrs = fst sstrs

(* Number of entries described by the offset vector (its length over 3). *)
let num_of_subs sstrs = Array.length (snd sstrs) / 3

(* Validates [str_num] and returns the index of its start slot in [ovector]
   together with the start offset stored there. *)
let get_offset_start ovector str_num =
  let count = Array.length ovector / 3 in
  if str_num < 0 || str_num >= count then
    invalid_arg "Pcre2.get_offset_start: illegal offset";
  let offset = str_num lsl 1 in
  (offset, Array.unsafe_get ovector offset)

(* Extracts the text between [start] and the end offset stored right after
   slot [offset]; a negative [start] is reported as [Not_found]. *)
let get_substring_aux (subj, ovector) offset start =
  if start >= 0 then
    let stop = Array.unsafe_get ovector (offset + 1) in
    string_unsafe_sub subj start (stop - start)
  else raise Not_found

(* Range-checked extraction of substring number [str_num]. *)
let get_substring ((_, ovector) as substrings) str_num =
  let slot, start = get_offset_start ovector str_num in
  get_substring_aux substrings slot start

(* Range-checked lookup of the (start, end) offsets of substring [str_num]. *)
let get_substring_ofs (_subj, ovector) str_num =
  match get_offset_start ovector str_num with
  | _, start when start < 0 -> raise Not_found
  | slot, start -> (start, Array.unsafe_get ovector (slot + 1))

(* Like [get_substring] but without the range check; yields the empty string
   where [get_substring_aux] raises [Not_found]. *)
let unsafe_get_substring ((_, ovector) as substrings) str_num =
  let slot = str_num lsl 1 in
  match get_substring_aux substrings slot (Array.unsafe_get ovector slot) with
  | str -> str
  | exception Not_found -> ""

(* All substrings as an array; entry 0 (the whole match) is left out when
   [full_match] is false. *)
let get_substrings ?(full_match = true) ((_, ovector) as substrings) =
  let count = Array.length ovector / 3 in
  let skip = if full_match then 0 else 1 in
  Array.init (count - skip) (fun n ->
      unsafe_get_substring substrings (n + skip))

let unsafe_get_opt_substring ((_, ovector) as
substrings) str_num =
  let offset = str_num lsl 1 in
  try
    let start = Array.unsafe_get ovector offset in
    let str = get_substring_aux substrings offset start in
    Some str
  with Not_found -> None

(* Optional variant of the substring array: entries for which no text can be
   extracted are [None]. Entry 0 is left out when [full_match] is false. *)
let get_opt_substrings ?(full_match = true) ((_, ovector) as substrings) =
  let count = Array.length ovector / 3 in
  let skip = if full_match then 0 else 1 in
  Array.init (count - skip) (fun n ->
      unsafe_get_opt_substring substrings (n + skip))

external get_stringnumber : regexp -> string -> (int[@untagged])
  = "pcre2_substring_number_from_name_stub_bc"
    "pcre2_substring_number_from_name_stub"

(* Named lookups: resolve the name to a group number, then defer to the
   numbered accessor. *)
let get_named_substring rex name substrings =
  let group_nr = get_stringnumber rex name in
  get_substring substrings group_nr

let get_named_substring_ofs rex name substrings =
  let group_nr = get_stringnumber rex name in
  get_substring_ofs substrings group_nr

external unsafe_pcre2_match :
  (irflag[@unboxed]) ->
  regexp ->
  pos:(int[@untagged]) ->
  subj_start:(int[@untagged]) ->
  subj:string ->
  int array ->
  callout option ->
  unit = "pcre2_match_stub_bc" "pcre2_match_stub"

(* Allocates a zeroed offset vector for [rex]. With g = capture count + 1 the
   result is (2 * g, array of length 3 * g). *)
let make_ovector rex =
  let groups = capturecount rex + 1 in
  let offsets_len = groups lsl 1 in
  (offsets_len, Array.make (groups + offsets_len) 0)

external unsafe_pcre2_dfa_match :
  (irflag[@unboxed]) ->
  regexp ->
  pos:(int[@untagged]) ->
  subj_start:(int[@untagged]) ->
  subj:string ->
  int array ->
  callout option ->
  workspace:int array ->
  unit = "pcre2_dfa_match_stub_bc" "pcre2_match_stub0"

(* DFA matching front end: resolves pattern and flags from the optional
   arguments, runs the stub on a fresh offset vector and returns that
   vector. *)
let pcre2_dfa_match ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
    ?callout ?(workspace = Array.make 20 0) subj =
  let rex = match pat with None -> rex | Some str -> regexp str in
  let iflags = match flags with None -> iflags | Some flags -> rflags flags in
  let ovector = snd (make_ovector rex) in
  unsafe_pcre2_dfa_match iflags rex ~pos ~subj_start:0 ~subj ovector callout
    ~workspace;
  ovector

let pcre2_match ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?callout
    subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags =
match flags with Some flags -> rflags flags | _ -> iflags in
  let _, ovector = make_ovector rex in
  unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  ovector

(* Runs [pcre2_match] and pairs the subject with the resulting offset vector,
   i.e. builds a [substrings] value. *)
let exec ?iflags ?flags ?rex ?pat ?pos ?callout subj =
  (subj, pcre2_match ?iflags ?flags ?rex ?pat ?pos ?callout subj)

(* Matches again in the subject of a previous result. The search starts at
   the end offset of that result (slot 1 of its offset vector) plus [pos];
   [Invalid_argument] is raised when this lies outside the subject. *)
let next_match ?iflags ?flags ?rex ?pat ?(pos = 0) ?callout (subj, ovector) =
  let pos = Array.unsafe_get ovector 1 + pos in
  let subj_len = String.length subj in
  if pos < 0 || pos > subj_len then
    invalid_arg "Pcre2.next_match: illegal offset";
  (subj, pcre2_match ?iflags ?flags ?rex ?pat ~pos ?callout subj)

(* Writes the list elements into [ar] at indices n, n - 1, ... and returns
   [ar]; no bounds checks are performed. *)
let rec copy_lst ar n = function
  | [] -> ar
  | h :: t ->
      Array.unsafe_set ar n h;
      copy_lst ar (n - 1) t

(* Collects all successive matches in [subj] as an array in match order.
   [Not_found] from the very first match attempt propagates to the caller;
   [Not_found] from later attempts ends the collection. *)
let exec_all ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?pos ?callout subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let ((_, ovector) as sstrs) = exec ~iflags ~rex ?pos ?callout subj in
  let null_flags = Int64.logor iflags 0x00000004L in
  (* `NOTEMPTY *)
  let subj_len = String.length subj in
  (* [pos] is the end of the most recent match [sstrs]; [lst] holds the [n]
     earlier matches, newest first. *)
  let rec loop pos ((subj, ovector) as sstrs) n lst =
    let maybe_ovector =
      try
        let first = Array.unsafe_get ovector 0 in
        (* The previous match was empty at [pos]: stop at the end of the
           subject, otherwise retry at the same position with the extra
           NOTEMPTY bit so the search can make progress. *)
        if first = pos && Array.unsafe_get ovector 1 = pos then
          if pos = subj_len then None
          else Some (pcre2_match ~iflags:null_flags ~rex ~pos ?callout subj)
        else Some (pcre2_match ~iflags ~rex ~pos ?callout subj)
      with Not_found -> None
    in
    match maybe_ovector with
    | Some ovector ->
        let new_pos = Array.unsafe_get ovector 1 in
        loop new_pos (subj, ovector) (n + 1) (sstrs :: lst)
    | None ->
        (* The array is pre-filled with the latest match, which therefore ends
           up in the last cell; the earlier ones are copied below it. *)
        copy_lst (Array.make (n + 1) sstrs) (n - 1) lst
  in
  loop (Array.unsafe_get ovector 1) sstrs 0 []

(* First match, returned as an array of substrings. *)
let extract ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  get_substrings ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj)

(* First match, returned as an array of optional substrings. *)
let extract_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  get_opt_substrings ?full_match
    (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj)

let
extract_all ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj
  |> Array.map (fun sstrs -> get_substrings ?full_match sstrs)

(* All matches, each converted to an array of optional substrings. *)
let extract_all_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj
  |> Array.map (fun sstrs -> get_opt_substrings ?full_match sstrs)

(* Tells whether the pattern matches: [Not_found] from the matcher becomes
   [false], any other exception propagates. *)
let pmatch ?iflags ?flags ?rex ?pat ?pos ?callout subj =
  match pcre2_match ?iflags ?flags ?rex ?pat ?pos ?callout subj with
  | _ -> true
  | exception Not_found -> false

(* String substitution *)

(* Elements of a substitution pattern *)
type subst =
  | SubstString of int * int (* Denotes a substring in the substitution *)
  | Backref of int (* nth backreference ($0 is program name!) *)
  | Match (* The whole matched string *)
  | PreMatch (* The string before the match *)
  | PostMatch (* The string after the match *)
  | LastParenMatch
(* The last matched group *)

(* Information on substitution patterns *)
type substitution =
  string (* The substitution string *)
  * int (* Highest group number of backreferences *)
  * bool (* Makes use of "LastParenMatch" *)
  * subst list
(* The list of substitution elements *)

(* Only used internally in "subst" *)
exception FoundAt of int

let zero = Char.code '0'

let subst str =
  let max_br = ref 0 in
  let with_lp = ref false in
  let lix = String.length str - 1 in
  let rec loop acc n =
    if lix < n then acc
    else
      try
        for i = n to lix do
          if String.unsafe_get str i = '$' then raise (FoundAt i)
        done;
        SubstString (n, lix - n + 1) :: acc
      with FoundAt i -> (
        if i = lix then SubstString (n, lix - n + 1) :: acc
        else
          let i1 = i + 1 in
          let acc = if n = i then acc else SubstString (n, i - n) :: acc in
          match String.unsafe_get str i1 with
          | '0' ..
'9' as c -> (
              (* Dollar followed by digits: accumulate the decimal group
                 number; FoundAt marks the first non-digit character. *)
              let subpat_nr = ref (Char.code c - zero) in
              try
                for j = i1 + 1 to lix do
                  let c = String.unsafe_get str j in
                  if c >= '0' && c <= '9' then
                    subpat_nr := (10 * !subpat_nr) + Char.code c - zero
                  else raise (FoundAt j)
                done;
                (* The digits ran up to the end of the template. *)
                max_br := max !subpat_nr !max_br;
                Backref !subpat_nr :: acc
              with FoundAt j ->
                max_br := max !subpat_nr !max_br;
                loop (Backref !subpat_nr :: acc) j)
          (* Dollar followed by an exclamation mark: both characters are
             dropped and nothing is emitted. *)
          | '!' -> loop acc (i1 + 1)
          (* Doubled dollar: a one-character literal taken from the second
             dollar sign. *)
          | '$' -> loop (SubstString (i1, 1) :: acc) (i1 + 1)
          | '&' -> loop (Match :: acc) (i1 + 1)
          | '`' -> loop (PreMatch :: acc) (i1 + 1)
          | '\'' -> loop (PostMatch :: acc) (i1 + 1)
          | '+' ->
              with_lp := true;
              loop (LastParenMatch :: acc) (i1 + 1)
          (* Any other character: the dollar sign is dropped and scanning
             resumes at the character that followed it. *)
          | _ -> loop acc i1)
  in
  (* Elements are prepended while scanning, so [subst_lst] lists them in
     reverse template order. *)
  let subst_lst = loop [] 0 in
  (str, !max_br, !with_lp, subst_lst)

(* Substitution for the empty template. *)
let def_subst = subst ""

(* Calculates a list of tuples (str, offset, len) which contain substrings to be
   copied on substitutions. Internal use only! *)
let calc_trans_lst subgroups2 ovector subj templ subst_lst =
  let prefix_len = Array.unsafe_get ovector 0 in
  let last = Array.unsafe_get ovector 1 in
  (* Fold step: the accumulator is (total length, pieces); pieces of length
     zero are not recorded. *)
  let coll ((res_len, trans_lst) as accu) =
    let return_lst ((_str, _ix, len) as el) =
      if len = 0 then accu else (res_len + len, el :: trans_lst)
    in
    function
    | SubstString (ix, len) -> return_lst (templ, ix, len)
    | Backref 0 ->
        (* Group number 0 in a template stands for the program name. *)
        let prog_name = Sys.argv.(0) in
        return_lst (prog_name, 0, String.length prog_name)
    | Backref n ->
        let offset = n lsl 1 in
        let start = Array.unsafe_get ovector offset in
        let len = Array.unsafe_get ovector (offset + 1) - start in
        return_lst (subj, start, len)
    | Match -> return_lst (subj, prefix_len, last - prefix_len)
    | PreMatch -> return_lst (subj, 0, prefix_len)
    | PostMatch -> return_lst (subj, last, String.length subj - last)
    | LastParenMatch ->
        (* Walk the offset pairs downwards from the last group until one
           with a non-negative start offset is found. *)
        let subgroups2_2 = subgroups2 - 2 in
        let pos = ref subgroups2_2 in
        let ix = ref (Array.unsafe_get ovector subgroups2_2) in
        while !ix < 0 do
          let pos_2 = !pos - 2 in
          pos := pos_2;
          ix := Array.unsafe_get ovector pos_2
        done;
        return_lst (subj, !ix, Array.unsafe_get ovector (!pos + 1) - !ix)
in
  List.fold_left coll (0, []) subst_lst

(* Replaces every match of the pattern in [subj], from [pos] on, with the
   expansion of the substitution template ([templ] when given, [itempl]
   otherwise). Raises [Invalid_argument] for a [pos] outside the subject and
   [Failure] when the template refers to groups the pattern does not have. *)
let replace ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
    ?(itempl = def_subst) ?templ ?callout subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let templ, max_br, with_lp, subst_lst =
    match templ with Some str -> subst str | _ -> itempl
  in
  let subj_len = String.length subj in
  if pos < 0 || pos > subj_len then invalid_arg "Pcre2.replace: illegal offset";
  let subgroups2, ovector = make_ovector rex in
  (* Number of capture groups, not counting the whole match. *)
  let nsubs = (subgroups2 lsr 1) - 1 in
  if max_br > nsubs then
    failwith "Pcre2.replace: backreference denotes nonexistent subpattern";
  if with_lp && nsubs = 0 then failwith "Pcre2.replace: no backreferences";
  (* [full_len] is the length of the output produced so far for the region
     starting at [pos]; [trans_lsts] holds the pieces for that region, newest
     first, each tagged with its total length. *)
  let rec loop full_len trans_lsts cur_pos =
    if
      cur_pos > subj_len
      ||
      try
        unsafe_pcre2_match iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector
          callout;
        false
      with Not_found -> true
    then (
      (* No further match: assemble the result from the untouched prefix,
         the collected pieces and the unmatched remainder. *)
      let postfix_len = max (subj_len - cur_pos) 0 in
      let left = pos + full_len in
      let res = Bytes.create (left + postfix_len) in
      bytes_unsafe_blit_string subj 0 res 0 pos;
      bytes_unsafe_blit_string subj cur_pos res left postfix_len;
      (* Within one segment the pieces are written front to back ... *)
      let inner_coll ofs (templ, ix, len) =
        bytes_unsafe_blit_string templ ix res ofs len;
        ofs + len
      in
      (* ... while the segments themselves are placed from [left] backwards,
         since the list has the newest segment first. *)
      let coll ofs (res_len, trans_lst) =
        let new_ofs = ofs - res_len in
        let _ = List.fold_left inner_coll new_ofs trans_lst in
        new_ofs
      in
      let _ = List.fold_left coll left trans_lsts in
      Bytes.unsafe_to_string res)
    else
      let first = Array.unsafe_get ovector 0 in
      (* Length of the unmatched text between the scan position and the
         start of this match. *)
      let len = first - cur_pos in
      let ((res_len, _) as trans_lst_el) =
        calc_trans_lst subgroups2 ovector subj templ subst_lst
      in
      let trans_lsts =
        if len > 0 then
          trans_lst_el :: (len, [ (subj, cur_pos, len) ]) :: trans_lsts
        else trans_lst_el :: trans_lsts
      in
      let full_len = full_len + len + res_len in
      let next = first + 1 in
      let last = Array.unsafe_get ovector 1 in
      (* [last < next] means the match did not extend past its start: copy
         one subject character (when there is one) and resume one position
         later so the loop advances. *)
      if last < next then
        if first < subj_len then
          let new_trans_lsts =
            (1, [ (subj, cur_pos + len, 1) ]) :: trans_lsts
in
          loop (full_len + 1) new_trans_lsts next
        else loop full_len trans_lsts next
      else loop full_len trans_lsts last
  in
  loop 0 [] pos

(* Replaces every match of the pattern in [subj], from [pos] on, with the
   string [templ] taken literally (no template expansion is performed here).
   Raises [Invalid_argument] for a [pos] outside the subject. *)
let qreplace ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
    ?(templ = "") ?callout subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let subj_len = String.length subj in
  if pos < 0 || pos > subj_len then invalid_arg "Pcre2.qreplace: illegal offset";
  let templ_len = String.length templ in
  let _, ovector = make_ovector rex in
  (* [subst_lst] lists the output pieces newest first: [None] stands for one
     copy of [templ], [Some (str, ofs, len)] for a slice to copy verbatim. *)
  let rec loop full_len subst_lst cur_pos =
    if
      cur_pos > subj_len
      ||
      try
        unsafe_pcre2_match iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector
          callout;
        false
      with Not_found -> true
    then (
      (* No further match: build the result, filling the replaced region
         from its end towards [pos]. *)
      let postfix_len = max (subj_len - cur_pos) 0 in
      let left = pos + full_len in
      let res = Bytes.create (left + postfix_len) in
      bytes_unsafe_blit_string subj 0 res 0 pos;
      bytes_unsafe_blit_string subj cur_pos res left postfix_len;
      let coll ofs = function
        | Some (substr, ix, len) ->
            let new_ofs = ofs - len in
            bytes_unsafe_blit_string substr ix res new_ofs len;
            new_ofs
        | None ->
            let new_ofs = ofs - templ_len in
            bytes_unsafe_blit_string templ 0 res new_ofs templ_len;
            new_ofs
      in
      let _ = List.fold_left coll left subst_lst in
      Bytes.unsafe_to_string res)
    else
      let first = Array.unsafe_get ovector 0 in
      (* Unmatched text between the scan position and this match. *)
      let len = first - cur_pos in
      let subst_lst =
        if len > 0 then None :: Some (subj, cur_pos, len) :: subst_lst
        else None :: subst_lst
      in
      let last = Array.unsafe_get ovector 1 in
      let full_len = full_len + len + templ_len in
      let next = first + 1 in
      (* A match that did not extend past its start: copy one subject
         character when available and resume one position later. *)
      if last < next then
        if first < subj_len then
          loop (full_len + 1) (Some (subj, cur_pos + len, 1) :: subst_lst) next
        else loop full_len subst_lst next
      else loop full_len subst_lst last
  in
  loop 0 [] pos

let substitute_substrings ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
    ?(pos = 0) ?callout ~subst subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags =
    match flags with
    | Some flags ->
rflags flags
    | _ -> iflags
  in
  let subj_len = String.length subj in
  if pos < 0 || pos > subj_len then
    invalid_arg "Pcre2.substitute: illegal offset";
  (* One offset vector is allocated and handed to every match attempt and to
     every call of [subst]. *)
  let _, ovector = make_ovector rex in
  (* [subst_lst] lists the output pieces (string, offset, length) newest
     first; [full_len] is their total length. *)
  let rec loop full_len subst_lst cur_pos =
    if
      cur_pos > subj_len
      ||
      try
        unsafe_pcre2_match iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector
          callout;
        false
      with Not_found -> true
    then (
      (* No further match: build the result, filling the replaced region
         from its end towards [pos]. *)
      let postfix_len = max (subj_len - cur_pos) 0 in
      let left = pos + full_len in
      let res = Bytes.create (left + postfix_len) in
      bytes_unsafe_blit_string subj 0 res 0 pos;
      bytes_unsafe_blit_string subj cur_pos res left postfix_len;
      let coll ofs (templ, ix, len) =
        let new_ofs = ofs - len in
        bytes_unsafe_blit_string templ ix res new_ofs len;
        new_ofs
      in
      let _ = List.fold_left coll left subst_lst in
      Bytes.unsafe_to_string res)
    else
      let first = Array.unsafe_get ovector 0 in
      (* Unmatched text between the scan position and this match. *)
      let len = first - cur_pos in
      (* The replacement text is computed by the caller-supplied function
         from the current match. *)
      let templ = subst (subj, ovector) in
      let templ_len = String.length templ in
      let subst_lst =
        if len > 0 then
          (templ, 0, templ_len) :: (subj, cur_pos, len) :: subst_lst
        else (templ, 0, templ_len) :: subst_lst
      in
      let last = Array.unsafe_get ovector 1 in
      let full_len = full_len + len + templ_len in
      let next = first + 1 in
      (* A match that did not extend past its start: copy one subject
         character when available and resume one position later. *)
      if last < next then
        if first < subj_len then
          loop (full_len + 1) ((subj, cur_pos + len, 1) :: subst_lst) next
        else loop full_len subst_lst next
      else loop full_len subst_lst last
  in
  loop 0 [] pos

(* Variant of [substitute_substrings] whose callback receives only the text
   of the whole match. *)
let substitute ?iflags ?flags ?rex ?pat ?pos ?callout ~subst:str_subst subj =
  let subst (subj, ovector) =
    let first = Array.unsafe_get ovector 0 in
    let last = Array.unsafe_get ovector 1 in
    str_subst (string_unsafe_sub subj first (last - first))
  in
  substitute_substrings ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj

let replace_first ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
    ?(itempl = def_subst) ?templ ?callout subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let templ, max_br, with_lp, subst_lst
=
    match templ with Some str -> subst str | _ -> itempl
  in
  let subgroups2, ovector = make_ovector rex in
  (* Number of capture groups, not counting the whole match. *)
  let nsubs = (subgroups2 lsr 1) - 1 in
  if max_br > nsubs then
    failwith
      "Pcre2.replace_first: backreference denotes nonexistent subpattern";
  if with_lp && nsubs = 0 then failwith "Pcre2.replace_first: no backreferences";
  (* NOTE(review): unlike [replace], [pos] is not range-checked in this
     function before it is handed to the stub. *)
  try
    unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
    let res_len, trans_lst =
      calc_trans_lst subgroups2 ovector subj templ subst_lst
    in
    let first = Array.unsafe_get ovector 0 in
    let last = Array.unsafe_get ovector 1 in
    let rest = String.length subj - last in
    (* Result = text before the match + expanded template + text after. *)
    let res = Bytes.create (first + res_len + rest) in
    bytes_unsafe_blit_string subj 0 res 0 first;
    let coll ofs (templ, ix, len) =
      bytes_unsafe_blit_string templ ix res ofs len;
      ofs + len
    in
    let ofs = List.fold_left coll first trans_lst in
    bytes_unsafe_blit_string subj last res ofs rest;
    Bytes.unsafe_to_string res
  with Not_found -> subj

(* Replaces only the first match with [templ] taken literally; the subject is
   returned unchanged when the matcher raises [Not_found]. *)
let qreplace_first ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
    ?(templ = "") ?callout subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let _, ovector = make_ovector rex in
  try
    unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
    let first = Array.unsafe_get ovector 0 in
    let last = Array.unsafe_get ovector 1 in
    let len = String.length templ in
    let rest = String.length subj - last in
    (* Offset in the result at which the text after the match begins. *)
    let postfix_start = first + len in
    let res = Bytes.create (postfix_start + rest) in
    bytes_unsafe_blit_string subj 0 res 0 first;
    bytes_unsafe_blit_string templ 0 res first len;
    bytes_unsafe_blit_string subj last res postfix_start rest;
    Bytes.unsafe_to_string res
  with Not_found -> subj

let substitute_substrings_first ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
    ?(pos = 0) ?callout ~subst subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let _, ovector =
make_ovector rex in
  try
    unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
    let subj_len = String.length subj in
    let prefix_len = Array.unsafe_get ovector 0 in
    let last = Array.unsafe_get ovector 1 in
    (* The replacement text is computed by the caller-supplied function. *)
    let templ = subst (subj, ovector) in
    let postfix_len = subj_len - last in
    let templ_len = String.length templ in
    let postfix_start = prefix_len + templ_len in
    (* Result = text before the match + replacement + text after. *)
    let res = Bytes.create (postfix_start + postfix_len) in
    bytes_unsafe_blit_string subj 0 res 0 prefix_len;
    bytes_unsafe_blit_string templ 0 res prefix_len templ_len;
    bytes_unsafe_blit_string subj last res postfix_start postfix_len;
    Bytes.unsafe_to_string res
  with Not_found -> subj

(* Variant of [substitute_substrings_first] whose callback receives only the
   text of the whole match. *)
let substitute_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst:str_subst
    subj =
  let subst (subj, ovector) =
    let first = Array.unsafe_get ovector 0 in
    let last = Array.unsafe_get ovector 1 in
    str_subst (string_unsafe_sub subj first (last - first))
  in
  substitute_substrings_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj

(* Splitting *)

(* Worker behind [split]. The pieces are accumulated in reverse order; an
   empty subject gives the empty list and [max = 1] gives the subject as the
   only piece. *)
let internal_psplit flags rex max pos callout subj =
  let subj_len = String.length subj in
  if subj_len = 0 then []
  else if max = 1 then [ subj ]
  else
    let subgroups2, ovector = make_ovector rex in

    (* Adds contents of subgroups to the string accumulator *)
    let handle_subgroups strs =
      let strs = ref strs in
      (* Start at slot 2: slots 0 and 1 describe the whole match. *)
      let i = ref 2 in
      while !i < subgroups2 do
        let first = Array.unsafe_get ovector !i in
        incr i;
        let last = Array.unsafe_get ovector !i in
        (* A group with a negative start offset contributes an empty
           string. *)
        let str =
          if first < 0 then "" else string_unsafe_sub subj first (last - first)
        in
        strs := str :: !strs;
        incr i
      done;
      !strs
    in

    (* Performs the recursive split *)
    let rec loop strs cnt pos prematch =
      let len = subj_len - pos in
      if len < 0 then strs
      else if
        (* Checks termination due to max restriction *)
        cnt = 0
      then
        if
          prematch
          &&
          try
            unsafe_pcre2_match flags rex ~pos ~subj_start:pos ~subj ovector
              callout;
            true
          with Not_found -> false
        then
          let last = Array.unsafe_get ovector 1 in
          let strs = handle_subgroups strs in
          string_unsafe_sub subj last
(subj_len - last) :: strs
        else string_unsafe_sub subj pos len :: strs
        (* Calculates next accumulator state for splitting *)
      else if
        try
          unsafe_pcre2_match flags rex ~pos ~subj_start:pos ~subj ovector
            callout;
          false
        with Not_found -> true
      then
        (* No further delimiter: the remainder is the last piece. *)
        string_unsafe_sub subj pos len :: strs
      else
        let first = Array.unsafe_get ovector 0 in
        let last = Array.unsafe_get ovector 1 in
        if first = pos then
          if last = pos then
            (* Empty match right at the current position. *)
            let strs = if prematch then handle_subgroups strs else strs in
            if len = 0 then "" :: strs
            else if
              try
                unsafe_pcre2_match
                  (* `ANCHORED | `NOTEMPTY *)
                  (Int64.logor flags 0x80000004L)
                  rex ~pos ~subj_start:pos ~subj ovector callout;
                true
              with Not_found -> false
            then
              (* A non-empty delimiter starts here as well: use it. *)
              let new_strs = handle_subgroups ("" :: strs) in
              loop new_strs (cnt - 1) (Array.unsafe_get ovector 1) false
            else
              (* Otherwise split off a single character and move on. *)
              let new_strs = string_unsafe_sub subj pos 1 :: strs in
              loop new_strs (cnt - 1) (pos + 1) true
          else if prematch then loop (handle_subgroups strs) cnt last false
          else loop (handle_subgroups ("" :: strs)) (cnt - 1) last false
        else
          (* Delimiter found further on: the text before it is a piece. *)
          let new_strs = string_unsafe_sub subj pos (first - pos) :: strs in
          loop (handle_subgroups new_strs) (cnt - 1) last false
    in
    loop [] (max - 1) pos false

(* Drops the empty strings at the head of the list. *)
let rec strip_all_empty = function "" :: t -> strip_all_empty t | l -> l

external isspace : char -> bool = "pcre2_isspace_stub" [@@noalloc]

(* Index of the first character at or after [ix] for which [isspace] is false,
   or [len] when there is none. *)
let rec find_no_space ix len str =
  if ix = len || not (isspace (String.unsafe_get str ix)) then ix
  else find_no_space (ix + 1) len str

(* Splits [subj] into a list of strings. When neither [pat] nor [rex] is
   given, leading whitespace from [pos] on is skipped and [def_rex] is used;
   note that an illegal [pos] is reported with [Failure] in that branch. *)
let split ?(iflags = 0L) ?flags ?rex ?pat ?(pos = 0) ?(max = 0) ?callout subj =
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let res =
    match (pat, rex) with
    | Some str, _ -> internal_psplit iflags (regexp str) max pos callout subj
    | _, Some rex -> internal_psplit iflags rex max pos callout subj
    | _ ->
        (* special case for Perl-splitting semantics *)
        let len = String.length subj in
        if pos > len || pos < 0 then failwith "Pcre2.split: illegal offset";
        let new_pos = find_no_space pos len subj in
        internal_psplit iflags def_rex max
new_pos callout subj
  in
  (* [res] is in reverse order, so with [max = 0] the empty strings stripped
     here are the trailing ones of the final list. *)
  List.rev (if max = 0 then strip_all_empty res else res)

(* Array version of [split]. *)
let asplit ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj =
  Array.of_list (split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj)

(* Full splitting *)

type split_result =
  | Text of string
  | Delim of string
  | Group of int * string
  | NoGroup

(* Drops the [Delim] elements at the head of the list. *)
let rec strip_all_empty_full = function
  | Delim _ :: rest -> strip_all_empty_full rest
  | l -> l

(* Like [split], but the result also records delimiters and capture groups.
   An empty subject gives the empty list and [max = 1] gives the subject as a
   single [Text]. *)
let full_split ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
    ?(max = 0) ?callout subj =
  let rex = match pat with Some str -> regexp str | _ -> rex in
  let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  let subj_len = String.length subj in
  if subj_len = 0 then []
  else if max = 1 then [ Text subj ]
  else
    let subgroups2, ovector = make_ovector rex in

    (* Adds contents of subgroups to the string accumulator *)
    let handle_subgroups strs =
      let strs = ref strs in
      (* Start at slot 2: slots 0 and 1 describe the whole match. *)
      let i = ref 2 in
      while !i < subgroups2 do
        let group_nr = !i lsr 1 in
        let first = Array.unsafe_get ovector !i in
        incr i;
        let last = Array.unsafe_get ovector !i in
        (* A group with a negative start offset is reported as [NoGroup]. *)
        let str =
          if first < 0 then NoGroup
          else
            let group_str = string_unsafe_sub subj first (last - first) in
            Group (group_nr, group_str)
        in
        strs := str :: !strs;
        incr i
      done;
      !strs
    in

    (* Performs the recursive split *)
    let rec loop strs cnt pos prematch =
      let len = subj_len - pos in
      if len < 0 then strs
      else if
        (* Checks termination due to max restriction *)
        cnt = 0
      then
        if
          prematch
          &&
          try
            unsafe_pcre2_match iflags rex ~pos ~subj_start:pos ~subj ovector
              callout;
            true
          with Not_found -> false
        then
          let first = Array.unsafe_get ovector 0 in
          let last = Array.unsafe_get ovector 1 in
          let delim = Delim (string_unsafe_sub subj first (last - first)) in
          Text (string_unsafe_sub subj last (subj_len - last))
          :: handle_subgroups (delim :: strs)
        else if len = 0 then strs
        else Text (string_unsafe_sub subj pos len) :: strs
        (* Calculates next accumulator state for splitting *)
      else if
        try
          unsafe_pcre2_match iflags rex ~pos ~subj_start:pos
~subj ovector callout;
          false
        with Not_found -> true
      then
        (* No further delimiter: the remainder, if any, is the last text. *)
        if len = 0 then strs else Text (string_unsafe_sub subj pos len) :: strs
      else
        let first = Array.unsafe_get ovector 0 in
        let last = Array.unsafe_get ovector 1 in
        if first = pos then
          if last = pos then
            (* Empty match right at the current position. *)
            if len = 0 then handle_subgroups (Delim "" :: strs)
            else
              let empty_groups = handle_subgroups [] in
              if
                try
                  unsafe_pcre2_match
                    (* `ANCHORED | `NOTEMPTY *)
                    (Int64.logor iflags 0x80000004L)
                    rex ~pos ~subj_start:pos ~subj ovector callout;
                  true
                with Not_found -> false
              then
                let first = Array.unsafe_get ovector 0 in
                let last = Array.unsafe_get ovector 1 in
                let delim =
                  Delim (string_unsafe_sub subj first (last - first))
                in
                let new_strs =
                  let tmp_strs =
                    if prematch then strs else empty_groups @ (Delim "" :: strs)
                  in
                  handle_subgroups (delim :: tmp_strs)
                in
                loop new_strs (cnt - 1) last false
              else
                let new_strs =
                  (Text (string_unsafe_sub subj pos 1) :: empty_groups)
                  @ (Delim "" :: strs)
                in
                loop new_strs (cnt - 1) (pos + 1) true
          else
            let delim = Delim (string_unsafe_sub subj first (last - first)) in
            loop (handle_subgroups (delim :: strs)) cnt last false
        else
          let delim = Delim (string_unsafe_sub subj first (last - first)) in
          let pre_strs =
            Text (string_unsafe_sub subj pos (first - pos)) :: strs
          in
          loop (handle_subgroups (delim :: pre_strs)) (cnt - 1) last false
    in
    let res = loop [] (max - 1) pos true in
    List.rev (if max = 0 then strip_all_empty_full res else res)

(* Additional convenience functions useful in combination with this library *)

(* Applies [f] to every line read from [ic] (standard input by default) until
   the end of the channel is reached. *)
let foreach_line ?(ic = stdin) f =
  try
    while true do
      f (input_line ic)
    done
  with End_of_file -> ()

(* Opens each file in [filenames] in turn and calls [f name channel] on it.
   The channel is closed once [f] returns. When [f] raises, the channel is
   closed with [close_in_noerr] so that a failure while closing cannot
   replace the exception raised by [f], which is then re-raised. An error
   from the regular [close_in] after a successful call propagates as is and
   does not trigger a second close. *)
let foreach_file filenames f =
  let do_with_file filename =
    let file = open_in filename in
    match f filename file with
    | () -> close_in file
    | exception exn ->
        close_in_noerr file;
        raise exn
  in
  List.iter do_with_file filenames
pcre2-ocaml-8.0.3/src/pcre2.mli000066400000000000000000001120141475421476700161530ustar00rootroot00000000000000(* PCRE2-OCAML - Perl Compatibility Regular Expressions for OCaml Copyright © 1999- Markus
Mottl

   This library is free software; you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published by the
   Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   This library is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this library; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *)

(** Perl Compatibility Regular Expressions for OCaml

    {e %%VERSION%% - {{:%%PKG_HOMEPAGE%%} homepage}} *)

(** {1 Exceptions} *)

type error =
  | Partial  (** String only matched the pattern partially *)
  | BadPattern of string * int
      (** [BadPattern (msg, pos)] regular expression is malformed. The reason
          is in [msg], the position of the error in the pattern in [pos]. *)
  | BadUTF  (** UTF string being matched is invalid *)
  | BadUTFOffset
      (** Gets raised when a UTF string being matched with offset is invalid.
      *)
  | MatchLimit
      (** Maximum allowed number of match attempts with backtracking or
          recursion is reached during matching. ALL FUNCTIONS CALLING THE
          MATCHING ENGINE MAY RAISE IT!!! *)
  | DepthLimit
      (** Presumably the counterpart of [MatchLimit] for the depth limit of
          nested backtracking (see [config_depth_limit] and the [depth_limit]
          argument of [regexp]) - to be confirmed against the C stubs. *)
  | WorkspaceSize
      (** Raised by {!pcre2_dfa_match} when the provided workspace array is too
          small. See documentation on {!pcre2_dfa_match} for details on
          workspace array sizing. *)
  | InternalError of string
      (** [InternalError msg] C-library exhibits unknown/undefined behaviour.
          The reason is in [msg]. *)

exception Error of error
(** Exception indicating PCRE errors. *)

exception Backtrack
(** [Backtrack] used in callout functions to force backtracking.
*) exception Regexp_or of string * error (** [Regexp_or (pat, error)] gets raised for sub-pattern [pat] by [regexp_or] if it failed to compile. *) (** {1 Compilation and runtime flags and their conversion functions} *) type icflag (** Internal representation of compilation flags *) and irflag (** Internal representation of runtime flags *) and cflag = [ `ALLOW_EMPTY_CLASS (** Allow empty classes *) | `ALT_BSUX (** Alternative handling of \u, \U, and \x *) | `ALT_CIRCUMFLEX (** Alternative handling of ^ in multiline mode *) | `ALT_VERBNAMES (** Process backslashes in verb names *) | `ANCHORED (** Pattern matches only at start of string *) | `AUTO_CALLOUT (** Automatically inserts callouts with id 255 before each pattern item *) | `CASELESS (** Case insensitive matching *) | `DOLLAR_ENDONLY (** '$' in pattern matches only at end of string *) | `DOTALL (** '.' matches all characters (newlines, too) *) | `DUPNAMES (** Allow duplicate names for subpatterns *) | `ENDANCHORED (** Pattern can match only at end of subject *) | `EXTENDED (** Ignores whitespace and PERL-comments. Behaves like the '/x'-option in PERL *) | `EXTENDED_MORE | `FIRSTLINE (** Unanchored patterns must match before/at first NL *) | `LITERAL (** Pattern characters are all literal *) | `MATCH_INVALID_UTF (** Enable support for matching invalid UTF *) | `MATCH_UNSET_BACKREF (** Match unset backreferences *) | `MULTILINE (** '^' and '$' match before/after newlines, not just at the beginning/end of a string *) | `NEVER_BACKSLASH_C (** Lock out the use of \C in patterns *) | `NEVER_UCP (** Lock out UCP, e.g. via (\*UCP) *) | `NEVER_UTF (** Lock out UTF, e.g. 
via (\*UTF) *) | `NO_AUTO_CAPTURE (** Disables the use of numbered capturing parentheses *) | `NO_AUTO_POSSESS (** Disable auto-possessification *) | `NO_DOTSTAR_ANCHOR (** Disable automatic anchoring for .* *) | `NO_START_OPTIMIZE (** Disable match-time start optimizations *) | `NO_UTF_CHECK (** Do not check the pattern for UTF validity (only relevant if UTF is set) WARNING: with this flag enabled, invalid UTF strings may cause a crash, loop, or give incorrect results *) | `UCP (** Use Unicode properties for \d, \w, etc. *) | `UNGREEDY (** Quantifiers not greedy anymore, only if followed by '?' *) | `USE_OFFSET_LIMIT (** Enable offset limit for unanchored matching *) | `UTF (** Treat pattern and subjects as UTF strings *) ] (** Compilation flags *) val cflags : cflag list -> icflag (** [cflags cflag_list] converts a list of compilation flags to their internal representation. *) val cflag_list : icflag -> cflag list (** [cflag_list cflags] converts internal representation of compilation flags to a list. *) type rflag = [ `ANCHORED (** Match only at the first position *) | `COPY_MATCHED_SUBJECT (** On success, make a private subject copy *) | `DFA_RESTART (** Causes matching to proceed presuming the subject string is further to one partially matched previously using the same int-array working set. May only be used with {!pcre2_dfa_match} or {!unsafe_pcre2_dfa_match}, and should always be paired with [`PARTIAL]. 
*) | `DFA_SHORTEST (** Return only the shortest match *) | `ENDANCHORED (** Pattern can match only at end of subject *) | `NOTBOL (** Beginning of string is not treated as beginning of line *) | `NOTEOL (** End of string is not treated as end of line *) | `NOTEMPTY (** An empty string is not a valid match *) | `NOTEMPTY_ATSTART (** An empty string at the start of the subject is not a valid match *) | `NO_JIT (** Do not use JIT matching *) | `NO_UTF_CHECK (** Do not check the subject for UTF validity (only relevant if PCRE2_UTF was set at compile time) *) | `PARTIAL_HARD (** Throw Pcre2.Partial for a partial match even if there is a full match *) | `PARTIAL_SOFT (** Throw Pcre2.Partial for a partial match if no full matches are found *) ] (** Runtime flags *) val rflags : rflag list -> irflag (** [rflags rflag_list] converts a list of runtime flags to their internal representation. *) val rflag_list : irflag -> rflag list (** [rflag_list rflags] converts internal representation of runtime flags to a list. 
*)

(** {1 Information on the PCRE2-configuration (build-time options)} *)

(** Version information *)
val version : string
(** Version of the PCRE2-C-library *)

val config_unicode : bool
(** Indicates whether unicode support is enabled *)

val config_newline : char
(** Character used as newline *)

val config_link_size : int
(** Number of bytes used for internal linkage of regular expressions *)

val config_match_limit : int
(** Default limit for calls to internal matching function *)

val config_depth_limit : int
(** Default limit for depth of nested backtracking *)

val config_stackrecurse : bool
(** Indicates use of stack recursion in matching function *)

(** {1 Information on patterns} *)

type firstcodeunit_info =
  [ `Char of char  (** Fixed first character *)
  | `Start_only  (** Pattern matches at beginning and end of newlines *)
  | `ANCHORED  (** Pattern is anchored *) ]
(** Information on matching of "first chars" in patterns *)

type regexp
(** Compiled regular expressions *)

val options : regexp -> icflag
(** [options regexp]
    @return compilation flags of [regexp]. *)

val size : regexp -> int
(** [size regexp]
    @return memory size of [regexp]. *)

val capturecount : regexp -> int
(** [capturecount regexp]
    @return number of capturing subpatterns in [regexp]. *)

val backrefmax : regexp -> int
(** [backrefmax regexp]
    @return number of highest backreference in [regexp]. *)

val namecount : regexp -> int
(** [namecount regexp]
    @return number of named subpatterns in [regexp]. *)

val nameentrysize : regexp -> int
(** [nameentrysize regexp]
    @return
      size of longest name of named subpatterns in [regexp] + 3. *)

val names : regexp -> string array
(** [names regexp]
    @return array of names of named substrings in [regexp]. *)

val firstcodeunit : regexp -> firstcodeunit_info
(** [firstcodeunit regexp]
    @return firstcodeunit info on [regexp]. *)

val lastcodeunit : regexp -> char option
(** [lastcodeunit regexp]
    @return
      some last matching character of [regexp] if available, [None] otherwise.
*)

val get_stringnumber : regexp -> string -> int
(** [get_stringnumber rex name]
    @return
      the index of the named substring [name] in regular expression [rex]. This
      index can then be used with [get_substring].
    @raise Invalid_argument if there is no such named substring. *)

(* val get_match_limit : regexp -> int option *)
(** [get_match_limit rex]
    @return some match limit of regular expression [rex] or [None]. *)

(* val get_depth_limit : regexp -> int option *)
(** [get_depth_limit rex]
    @return some depth limit of regular expression [rex] or [None]. *)

(** {1 Compilation of patterns} *)

type chtables
(** Alternative set of char tables for pattern matching *)

val maketables : unit -> chtables
(** Generates new set of char tables for the current locale. *)

val regexp :
  ?limit:
    (* ?jit_compile : bool -> *)
    int ->
  ?depth_limit:int ->
  ?iflags:icflag ->
  ?flags:cflag list ->
  ?chtables:chtables ->
  string ->
  regexp
(** [regexp ?limit ?depth_limit ?iflags ?flags ?chtables pattern] compiles
    [pattern] with [flags] when given, with [iflags] otherwise, and with char
    tables [chtables]. If [limit] is specified, this sets a limit to the amount
    of recursion and backtracking (only lower than the builtin default!). If
    this limit is exceeded, [MatchLimit] will be raised during matching.

    @param limit default = no extra limit other than default
    @param depth_limit default = no extra depth_limit other than default
    @param iflags default = no extra flags
    @param flags default = ignored
    @param chtables default = builtin char tables
    @return
      the regular expression.

      For detailed documentation on how you can specify PERL-style regular
      expressions (= patterns), please consult the PCRE2-documentation ("man
      pcre2pattern") or PERL-manuals.
@see www.perl.com *) val regexp_or : ?limit: (* ?jit_compile : bool -> *) int -> ?depth_limit:int -> ?iflags:icflag -> ?flags:cflag list -> ?chtables:chtables -> string list -> regexp (** [regexp_or ?limit ?depth_limit ?iflags ?flags ?chtables patterns] like {!val-regexp}, but combines [patterns] as alternatives (or-patterns) into one regular expression. *) val quote : string -> string (** [quote str] @return the quoted string of [str]. *) (** {1 Subpattern extraction} *) type substrings (** Information on substrings after pattern matching *) val get_subject : substrings -> string (** [get_subject substrings] @return the subject string of [substrings]. *) val num_of_subs : substrings -> int (** [num_of_subs substrings] @return number of strings in [substrings] (whole match inclusive). *) val get_substring : substrings -> int -> string (** [get_substring substrings n] @return the [n]th substring (0 is whole match) of [substrings]. @raise Invalid_argument if [n] is not in the range of the number of substrings. @raise Not_found if the corresponding subpattern did not capture a substring. *) val get_substring_ofs : substrings -> int -> int * int (** [get_substring_ofs substrings n] @return the offset tuple of the [n]th substring of [substrings] (0 is whole match). @raise Invalid_argument if [n] is not in the range of the number of substrings. @raise Not_found if the corresponding subpattern did not capture a substring. *) val get_substrings : ?full_match:bool -> substrings -> string array (** [get_substrings ?full_match substrings] @return the array of substrings in [substrings]. It includes the full match at index 0 when [full_match] is [true], the captured substrings only when it is [false]. If a subpattern did not capture a substring, the empty string is returned in the corresponding position instead. 
@param full_match default = true *) val get_opt_substrings : ?full_match:bool -> substrings -> string option array (** [get_opt_substrings ?full_match substrings] @return the array of optional substrings in [substrings]. It includes [Some full_match_str] at index 0 when [full_match] is [true], [Some captured_substrings] only when it is [false]. If a subpattern did not capture a substring, [None] is returned in the corresponding position instead. @param full_match default = true *) val get_named_substring : regexp -> string -> substrings -> string (** [get_named_substring rex name substrings] @return the named substring [name] in regular expression [rex] and [substrings]. @raise Invalid_argument if there is no such named substring. @raise Not_found if the corresponding subpattern did not capture a substring. *) val get_named_substring_ofs : regexp -> string -> substrings -> int * int (** [get_named_substring_ofs rex name substrings] @return the offset tuple of the named substring [name] in regular expression [rex] and [substrings]. @raise Invalid_argument if there is no such named substring. @raise Not_found if the corresponding subpattern did not capture a substring. *) (** {1 Callouts} *) type callout_data = { callout_number : int; (** Callout number *) substrings : substrings; (** Substrings matched so far *) start_match : int; (** Subject start offset of current match attempt *) current_position : int; (** Subject offset of current match pointer *) capture_top : int; (** Number of the highest captured substring so far *) capture_last : int; (** Number of the most recently captured substring *) pattern_position : int; (** Offset of next match item in pattern string *) next_item_length : int; (** Length of next match item in pattern string *) } (** Type of callout functions *) type callout = callout_data -> unit (** Callouts are referred to in patterns as "(?Cn)" where "n" is a [callout_number] ranging from 0 to 255. 
Substrings captured so far are accessible as usual via [substrings]. You will have to consider [capture_top] and [capture_last] to know about the current state of valid substrings. By raising exception [Backtrack] within a callout function, the user can force the pattern matching engine to backtrack to other possible solutions. Other exceptions will terminate matching immediately and return control to OCaml. *) (** {1 Matching of patterns and subpattern extraction} *) val pcre2_match : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> string -> int array (** [pcre2_match ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return an array of offsets that describe the position of matched subpatterns in the string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. The array also contains additional workspace needed by the match engine. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val pcre2_dfa_match : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> ?workspace:int array -> string -> int array (** [pcre2_dfa_match ?iflags ?flags ?rex ?pat ?pos ?callout ?workspace subj] invokes the "alternative" DFA matching function. @return an array of offsets that describe the position of matched subpatterns in the string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. The array also contains additional workspace needed by the match engine. Uses [flags] when given, the precompiled [iflags] otherwise. Requires a sufficiently-large [workspace] array. Callouts are handled by [callout]. 
Note that the returned array of offsets is quite different from those
    returned by {!pcre2_match} et al. The motivating use case for the DFA match
    function is to be able to restart a partial match with N additional input
    segments. Because the match function/workspace does not store segments seen
    previously, the offsets returned when a match completes will refer only to
    the matching portion of the last subject string provided. Thus, returned
    offsets from this function should not be used to support extracting
    captured submatches. If you need to capture submatches from a series of
    inputs incrementally matched with this function, you'll need to concatenate
    those inputs that yield a successful match here and re-run the same pattern
    against that single subject string.

    Aside from an absolute minimum of [20], PCRE does not provide any guidance
    regarding the size of workspace array needed by any given pattern.
    Therefore, it is wise to appropriately handle the possible [WorkspaceSize]
    error. If raised, you can allocate a new, larger workspace array and begin
    the DFA matching process again.

    @param iflags default = no extra flags
    @param flags default = ignored
    @param rex default = matches whitespace
    @param pat default = ignored
    @param pos default = 0
    @param callout default = ignore callouts
    @param workspace default = fresh array of length [20]
    @raise Not_found if the pattern match has failed
    @raise Error
      Partial if the pattern has matched partially; a subsequent exec call with
      the same pattern and workspace (adding the [DFA_RESTART] flag) can be
      made to either further advance or complete the partial match.
@raise Error WorkspaceSize if the workspace array is too small to accommodate the DFA state required by the supplied pattern *) val exec : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> string -> substrings (** [exec ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return substring information on string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val exec_all : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> string -> substrings array (** [exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return an array of substring information of all matching substrings in string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val next_match : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> substrings -> substrings (** [next_match ?iflags ?flags ?rex ?pat ?pos ?callout substrs] @return substring information on the match that follows on the last match denoted by [substrs], jumping over [pos] characters (also backwards!), using pattern [pat] when given, regular expression [rex] otherwise. 
Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are
      handled by [callout].

    @param iflags default = no extra flags
    @param flags default = ignored
    @param rex default = matches whitespace
    @param pat default = ignored
    @param pos default = 0
    @param callout default = ignore callouts
    @raise Not_found if pattern does not match.
    @raise Invalid_argument
      if [pos] lets matching start outside of the subject string. *)

val extract :
  ?iflags:irflag ->
  ?flags:rflag list ->
  ?rex:regexp ->
  ?pat:string ->
  ?pos:int ->
  ?full_match:bool ->
  ?callout:callout ->
  string ->
  string array
(** [extract ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj]
    @return
      the array of substrings that match [subj] starting at position [pos],
      using pattern [pat] when given, regular expression [rex] otherwise. Uses
      [flags] when given, the precompiled [iflags] otherwise. It includes the
      full match at index 0 when [full_match] is [true], the captured
      substrings only when it is [false]. Callouts are handled by [callout]. If
      a subpattern did not capture a substring, the empty string is returned in
      the corresponding position instead.

    @param iflags default = no extra flags
    @param flags default = ignored
    @param rex default = matches whitespace
    @param pat default = ignored
    @param pos default = 0
    @param full_match default = true
    @param callout default = ignore callouts
    @raise Not_found if pattern does not match. *)

val extract_opt :
  ?iflags:irflag ->
  ?flags:rflag list ->
  ?rex:regexp ->
  ?pat:string ->
  ?pos:int ->
  ?full_match:bool ->
  ?callout:callout ->
  string ->
  string option array
(** [extract_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj]
    @return
      the array of optional substrings that match [subj] starting at position
      [pos], using pattern [pat] when given, regular expression [rex]
      otherwise. Uses [flags] when given, the precompiled [iflags] otherwise.
      It includes [Some full_match_str] at index 0 when [full_match] is [true],
      [Some captured_substrings] only when it is [false].
Callouts are handled by [callout]. If a subpattern did not capture a substring, [None] is returned in the corresponding position instead. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param full_match default = true @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val extract_all : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?full_match:bool -> ?callout:callout -> string -> string array array (** [extract_all ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj] @return an array of arrays of all matching substrings that match [subj] starting at position [pos], using pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. It includes the full match at index 0 of the extracted string arrays when [full_match] is [true], the captured substrings only when it is [false]. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param full_match default = true @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val extract_all_opt : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?full_match:bool -> ?callout:callout -> string -> string option array array (** [extract_all_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj] @return an array of arrays of all optional matching substrings that match [subj] starting at position [pos], using pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. 
It includes [Some full_match_str] at index 0 of the extracted string arrays when [full_match] is [true], [Some captured_substrings] only when it is [false]. Callouts are handled by [callout]. If a subpattern did not capture a substring, [None] is returned in the corresponding position instead. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param full_match default = true @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val pmatch : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> string -> bool (** [pmatch ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return [true] if [subj] is matched by pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos]. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) (** {1 String substitution} *) type substitution (** Information on substitution patterns *) val subst : string -> substitution (** [subst str] converts the string [str] representing a substitution pattern to the internal representation The contents of the substitution string [str] can be normal text mixed with any of the following (mostly as in PERL): - {e $\[0-9\]+} - a "$" immediately followed by an arbitrary number. "$0" stands for the name of the executable, any other number for the n-th backreference. - {e $&} - the whole matched pattern - {e $`} - the text before the match - {e $'} - the text after the match - {e $+} - the last group that matched - {e $$} - a single "$" - {e $!} - delimiter which does not appear in the substitution. 
Can be used to part "$[0-9]+" from an immediately following other number. *) val replace : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?itempl:substitution -> ?templ:string -> ?callout:callout -> string -> string (** [replace ?iflags ?flags ?rex ?pat ?pos ?itempl ?templ ?callout subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the substitution string [templ] when given, [itempl] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param itempl default = empty string @param templ default = ignored @param callout default = ignore callouts @raise Failure if there are backreferences to nonexistent subpatterns. *) val qreplace : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?templ:string -> ?callout:callout -> string -> string (** [qreplace ?iflags ?flags ?rex ?pat ?pos ?templ ?callout subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the string [templ]. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param templ default = ignored @param callout default = ignore callouts *) val substitute_substrings : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> subst:(substrings -> string) -> string -> string (** [substitute_substrings ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the substrings of the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) val substitute : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> subst:(string -> string) -> string -> string (** [substitute ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) val replace_first : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?itempl:substitution -> ?templ:string -> ?callout:callout -> string -> string (** [replace_first ?iflags ?flags ?rex ?pat ?pos ?itempl ?templ ?callout subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the substitution string [templ] when given, [itempl] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param itempl default = empty string @param templ default = ignored @param callout default = ignore callouts @raise Failure if there are backreferences to nonexistent subpatterns. *) val qreplace_first : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?templ:string -> ?callout:callout -> string -> string (** [qreplace_first ?iflags ?flags ?rex ?pat ?pos ?templ ?callout subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the string [templ]. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param templ default = ignored @param callout default = ignore callouts *) val substitute_substrings_first : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> subst:(substrings -> string) -> string -> string (** [substitute_substrings_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the substrings of the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) val substitute_first : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?callout:callout -> subst:(string -> string) -> string -> string (** [substitute_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) (** {1 Splitting} *) val split : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?max:int -> ?callout:callout -> string -> string list (** [split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj] splits [subj] into a list of at most [max] strings, using as delimiter pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos]. Uses [flags] when given, the precompiled [iflags] otherwise. If [max] is zero, trailing empty fields are stripped. If it is negative, it is treated as arbitrarily large. If neither [pat] nor [rex] are specified, leading whitespace will be stripped! Should behave exactly as in PERL. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param max default = 0 @param callout default = ignore callouts *) val asplit : ?iflags:irflag -> ?flags:rflag list -> ?rex:regexp -> ?pat:string -> ?pos:int -> ?max:int -> ?callout:callout -> string -> string array (** [asplit ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj] same as {!Pcre2.split} but return an array instead of a list. 
*)

(** Result of a {!Pcre2.full_split} *)
type split_result =
  | Text of string  (** Text part of split string *)
  | Delim of string  (** Delimiter part of split string *)
  | Group of int * string
      (** Subgroup of matched delimiter (subgroup_nr, subgroup_str) *)
  | NoGroup  (** Unmatched subgroup *)

val full_split :
  ?iflags:irflag ->
  ?flags:rflag list ->
  ?rex:regexp ->
  ?pat:string ->
  ?pos:int ->
  ?max:int ->
  ?callout:callout ->
  string ->
  split_result list
(** [full_split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj] splits [subj]
    into a list of at most [max] elements of type "split_result", using as
    delimiter pattern [pat] when given, regular expression [rex] otherwise,
    starting at position [pos]. Uses [flags] when given, the precompiled
    [iflags] otherwise. If [max] is zero, trailing empty fields are stripped.
    If it is negative, it is treated as arbitrarily large. Should behave
    exactly as in PERL. Callouts are handled by [callout].

    @param iflags default = no extra flags
    @param flags default = ignored
    @param rex default = matches whitespace
    @param pat default = ignored
    @param pos default = 0
    @param max default = 0
    @param callout default = ignore callouts *)

(** {1 Additional convenience functions} *)

val foreach_line : ?ic:in_channel -> (string -> unit) -> unit
(** [foreach_line ?ic f] applies [f] to each line in inchannel [ic] until the
    end-of-file is reached.

    @param ic default = stdin *)

val foreach_file : string list -> (string -> in_channel -> unit) -> unit
(** [foreach_file filenames f] opens each file in the list [filenames] for
    input and applies [f] to each filename and the corresponding channel.
    Channels are closed after each operation (even when exceptions occur - they
    get reraised afterwards!). *)

(** {1 {b UNSAFE STUFF - USE WITH CAUTION!}} *)

val unsafe_pcre2_match :
  irflag ->
  regexp ->
  pos:int ->
  subj_start:int ->
  subj:string ->
  int array ->
  callout option ->
  unit
(** [unsafe_pcre2_match flags rex ~pos ~subj_start ~subj offset_vector callout].
You should read the C-source to know what happens. If you do not understand
    it - {b don't use this function!} *)

val make_ovector : regexp -> int * int array
(** [make_ovector regexp] calculates the tuple (subgroups2, ovector) which is
    the number of subgroup offsets and the offset array. *)

val unsafe_pcre2_dfa_match :
  irflag ->
  regexp ->
  pos:int ->
  subj_start:int ->
  subj:string ->
  int array ->
  callout option ->
  workspace:int array ->
  unit
(** [unsafe_pcre2_dfa_match flags rex ~pos ~subj_start ~subj offset_vector
    callout ~workspace]. You should read the C-source to know what happens. If
    you do not understand it - {b don't use this function!} *)
pcre2-ocaml-8.0.3/src/pcre2_stubs.c000066400000000000000000000654461475421476700170540ustar00rootroot00000000000000/* PCRE2-OCAML - Perl Compatibility Regular Expressions for OCaml
   Copyright © 1999- Markus Mottl
   This library is free software; you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at
   your option) any later version.
   This library is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
   for more details.
You should have received a copy of the GNU Lesser General Public License
   along with this library; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

/* Windows portability shims */
#if defined(_WIN32)
#define snprintf _snprintf
#if defined(_DLL)
#define PCREextern __declspec(dllexport)
#else
#define PCREextern
#endif
#endif

/* Pointer type used to write into OCaml integer arrays: `long long *` on
   64-bit Windows, `long *` everywhere else */
#if defined(_WIN64)
typedef long long *caml_int_ptr;
#else
typedef long *caml_int_ptr;
#endif

/* Marks intentionally unused parameters (GCC >= 3 only, otherwise empty) */
#if __GNUC__ >= 3
#define __unused __attribute__((unused))
#else
#define __unused
#endif

/* NOTE(review): the header names of the #include directives below are missing
   in this copy of the file - restore them from the upstream sources before
   building. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Compatibility layer for OCaml 4.x releases older than 4.12: supplies the
   option-related macros used by this file and a file-local
   [caml_alloc_some] */
#if (OCAML_VERSION_MAJOR == 4) && (OCAML_VERSION_MINOR < 12)
#define Val_none (Val_long(0))
#define Some_val(v) Field(v, 0)
#define Tag_some 0
#define Is_none(v) ((v) == Val_none)
#define Is_some(v) Is_block(v)

/* Allocates a one-field block with tag 0 holding [v], i.e. [Some v] */
CAMLexport static value caml_alloc_some(value v) {
  CAMLparam1(v);
  value some = caml_alloc_small(1, 0);
  Field(some, 0) = v;
  CAMLreturn(some);
}
#endif

/* Selects the 8-bit code unit flavour of the PCRE2 API; defined before the
   next #include (presumably the PCRE2 header - its name is missing here) */
#define PCRE2_CODE_UNIT_WIDTH 8
#include

typedef const unsigned char *chartables; /* Type of chartable sets */

/* Contents of callout data */
struct cod {
  long subj_start;       /* Start of subject string */
  value *v_substrings_p; /* Pointer to substrings matched so far */
  value *v_cof_p;        /* Pointer to callout function */
  value v_exn;           /* Possible exception raised by callout function */
};

/* Cache for exceptions */
static const value *pcre2_exc_Error = NULL;     /* Exception [Error] */
static const value *pcre2_exc_Backtrack = NULL; /* Exception [Backtrack] */

/* Cache for polymorphic variants */
static value var_Start_only; /* Variant [`Start_only] */
static value var_ANCHORED;   /* Variant [`ANCHORED] */
static value var_Char;       /* Variant [`Char char] */

/* Data associated with OCaml values of PCRE regular expression */
struct pcre2_ocaml_regexp {
  pcre2_code *rex;               /* Compiled pattern */
  pcre2_match_context *mcontext; /* Match context of the pattern */
};

/* Gives access to the C structure stored in the custom block [v] */
#define Pcre2_ocaml_regexp_val(v)                                              \
  ((struct pcre2_ocaml_regexp *)Data_custom_val(v))

#define get_rex(v)
Pcre2_ocaml_regexp_val(v)->rex #define get_mcontext(v) Pcre2_ocaml_regexp_val(v)->mcontext #define set_rex(v, r) Pcre2_ocaml_regexp_val(v)->rex = r #define set_mcontext(v, c) Pcre2_ocaml_regexp_val(v)->mcontext = c /* Data associated with OCaml values of PCRE tables */ struct pcre2_ocaml_tables { chartables tables; }; #define Pcre2_ocaml_tables_val(v) \ ((struct pcre2_ocaml_tables *)Data_custom_val(v)) #define get_tables(v) Pcre2_ocaml_tables_val(v)->tables #define set_tables(v, t) Pcre2_ocaml_tables_val(v)->tables = t /* Converts subject offsets from C-integers to OCaml-Integers. This is a bit tricky, because there are 32- and 64-bit platforms around and OCaml chooses the larger possibility for representing integers when available (also in arrays) - not so the PCRE! */ static inline void copy_ovector(long subj_start, const size_t *ovec_src, caml_int_ptr ovec_dst, uint32_t subgroups2) { if (subj_start == 0) while (subgroups2--) { *ovec_dst = (*ovec_src == PCRE2_UNSET) ? Val_int(-1) : Val_int(*ovec_src); --ovec_src; --ovec_dst; } else while (subgroups2--) { *ovec_dst = (*ovec_src == PCRE2_UNSET) ? 
Val_int(-1) : Val_long(*ovec_src + subj_start); --ovec_src; --ovec_dst; } } /* Callout handler */ static int pcre2_callout_handler(pcre2_callout_block *cb, struct cod *cod) { if (cod != NULL) { /* Callout is available */ value v_res; /* Set up parameter array */ value v_callout_data = caml_alloc_small(8, 0); const value v_substrings = *cod->v_substrings_p; const uint32_t capture_top = cb->capture_top; uint32_t subgroups2 = capture_top << 1; const uint32_t subgroups2_1 = subgroups2 - 1; const size_t *ovec_src = cb->offset_vector + subgroups2_1; caml_int_ptr ovec_dst = (long *)&Field(Field(v_substrings, 1), 0) + subgroups2_1; long subj_start = cod->subj_start; copy_ovector(subj_start, ovec_src, ovec_dst, subgroups2); Field(v_callout_data, 0) = Val_int(cb->callout_number); Field(v_callout_data, 1) = v_substrings; Field(v_callout_data, 2) = Val_int(cb->start_match + subj_start); Field(v_callout_data, 3) = Val_int(cb->current_position + subj_start); Field(v_callout_data, 4) = Val_int(capture_top); Field(v_callout_data, 5) = Val_int(cb->capture_last); Field(v_callout_data, 6) = Val_int(cb->pattern_position); Field(v_callout_data, 7) = Val_int(cb->next_item_length); /* Perform callout */ v_res = caml_callback_exn(*cod->v_cof_p, v_callout_data); if (Is_exception_result(v_res)) { /* Callout raised an exception */ const value v_exn = Extract_exception(v_res); if (Field(v_exn, 0) == *pcre2_exc_Backtrack) return 1; cod->v_exn = v_exn; return PCRE2_ERROR_CALLOUT; } } return 0; } /* Fetches the named OCaml-values + caches them and calculates + caches the variant hash values */ CAMLprim value pcre2_ocaml_init(value __unused v_unit) { pcre2_exc_Error = caml_named_value("Pcre2.Error"); pcre2_exc_Backtrack = caml_named_value("Pcre2.Backtrack"); var_Start_only = caml_hash_variant("Start_only"); var_ANCHORED = caml_hash_variant("ANCHORED"); var_Char = caml_hash_variant("Char"); return Val_unit; } /* Finalizing deallocation function for chartable sets */ static void 
pcre2_dealloc_tables(value v_tables) { #if PCRE2_MINOR >= 34 pcre2_maketables_free(NULL, get_tables(v_tables)); #else free((void *)get_tables(v_tables)); #endif } /* Finalizing deallocation function for compiled regular expressions */ static void pcre2_dealloc_regexp(value v_rex) { pcre2_code_free(get_rex(v_rex)); pcre2_match_context_free(get_mcontext(v_rex)); } /* Raising exceptions */ CAMLnoreturn_start static inline void raise_pcre2_error(value v_arg) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_partial(void) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_bad_utf(void) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_bad_utf_offset(void) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_match_limit(void) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_depth_limit(void) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_workspace_size(void) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_bad_pattern(int code, size_t pos) CAMLnoreturn_end; CAMLnoreturn_start static inline void raise_internal_error(char *msg) CAMLnoreturn_end; static inline void raise_pcre2_error(value v_arg) { caml_raise_with_arg(*pcre2_exc_Error, v_arg); } static inline void raise_partial(void) { raise_pcre2_error(Val_int(0)); } static inline void raise_bad_utf(void) { raise_pcre2_error(Val_int(1)); } static inline void raise_bad_utf_offset(void) { raise_pcre2_error(Val_int(2)); } static inline void raise_match_limit(void) { raise_pcre2_error(Val_int(3)); } static inline void raise_depth_limit(void) { raise_pcre2_error(Val_int(4)); } static inline void raise_workspace_size(void) { raise_pcre2_error(Val_int(5)); } static inline void raise_bad_pattern(int code, size_t pos) { CAMLparam0(); CAMLlocal1(v_msg); value v_arg; v_msg = caml_alloc_string(128); pcre2_get_error_message(code, (PCRE2_UCHAR *)String_val(v_msg), 128); v_arg = caml_alloc_small(2, 0); Field(v_arg, 0) = v_msg; Field(v_arg, 1) = 
Val_int(pos); raise_pcre2_error(v_arg); CAMLnoreturn; } static inline void raise_internal_error(char *msg) { CAMLparam0(); CAMLlocal1(v_msg); value v_arg; v_msg = caml_copy_string(msg); v_arg = caml_alloc_small(1, 1); Field(v_arg, 0) = v_msg; raise_pcre2_error(v_arg); CAMLnoreturn; } /* PCRE pattern compilation */ static struct custom_operations regexp_ops = { "pcre2_ocaml_regexp", pcre2_dealloc_regexp, custom_compare_default, custom_hash_default, custom_serialize_default, custom_deserialize_default, custom_compare_ext_default, custom_fixed_length_default}; /* Makes compiled regular expression from compilation options, an optional value of chartables and the pattern string */ CAMLprim value pcre2_compile_stub(int64_t v_opt, value v_tables, value v_pat) { value v_rex; /* Final result -> value of type [regexp] */ size_t regexp_size, ocaml_regexp_size = sizeof(struct pcre2_ocaml_regexp); int error_code = 0; /* error code for potential error */ size_t error_ofs = 0; /* offset in the pattern at which error occurred */ size_t length = caml_string_length(v_pat); pcre2_compile_context *ccontext = NULL; /* If v_tables = [None], then pointer to tables is NULL, otherwise set it to the appropriate value */ if (Is_some(v_tables)) { ccontext = pcre2_compile_context_create(NULL); pcre2_set_character_tables(ccontext, get_tables(Field(v_tables, 0))); } /* Compiles the pattern */ pcre2_code *regexp = pcre2_compile((PCRE2_SPTR)String_val(v_pat), length, v_opt, &error_code, &error_ofs, ccontext); pcre2_compile_context_free(ccontext); /* Raises appropriate exception with [BadPattern] if the pattern could not be compiled */ if (regexp == NULL) raise_bad_pattern(error_code, error_ofs); /* It's unknown at this point whether JIT compilation is going to be used, but we have to decide on a size. Tests with some simple patterns indicate a roughly 50% increase in size when studying without JIT. A factor of two times hence seems like a reasonable bound to use here. 
*/ pcre2_pattern_info(regexp, PCRE2_INFO_SIZE, ®exp_size); v_rex = caml_alloc_custom_mem(®exp_ops, ocaml_regexp_size, 2 * regexp_size); set_rex(v_rex, regexp); set_mcontext(v_rex, pcre2_match_context_create(NULL)); return v_rex; } CAMLprim value pcre2_compile_stub_bc(value v_opt, value v_tables, value v_pat) { return pcre2_compile_stub(Int64_val(v_opt), v_tables, v_pat); } /* Gets the depth limit of a regular expression if it exists */ /* CAMLprim value pcre2_get_depth_limit_stub(value v_rex); */ /* Gets the match limit of a regular expression if it exists */ /* CAMLprim value pcre2_get_match_limit_stub(value v_rex); */ /* Sets a match limit for a regular expression imperatively */ CAMLprim value pcre2_set_imp_match_limit_stub(value v_rex, intnat v_lim) { pcre2_match_context *mcontext = get_mcontext(v_rex); pcre2_set_match_limit(mcontext, v_lim); return v_rex; } CAMLprim value pcre2_set_imp_match_limit_stub_bc(value v_rex, value v_lim) { return pcre2_set_imp_match_limit_stub(v_rex, Int_val(v_lim)); } /* Sets a depth limit for a regular expression imperatively */ CAMLprim value pcre2_set_imp_depth_limit_stub(value v_rex, intnat v_lim) { pcre2_match_context *mcontext = get_mcontext(v_rex); pcre2_set_depth_limit(mcontext, v_lim); return v_rex; } CAMLprim value pcre2_set_imp_depth_limit_stub_bc(value v_rex, value v_lim) { return pcre2_set_imp_depth_limit_stub(v_rex, Int_val(v_lim)); } /* Performs the call to the pcre2_pattern_info function */ static inline int pcre2_pattern_info_stub(value v_rex, int what, void *where) { return pcre2_pattern_info(get_rex(v_rex), what, where); } /* Some stubs for info-functions */ /* Generic macro for getting integer results from pcre2_pattern_info */ #define MAKE_INTNAT_INFO(tp, name, option) \ CAMLprim intnat pcre2_##name##_stub(value v_rex) { \ tp options; \ const int ret = \ pcre2_pattern_info_stub(v_rex, PCRE2_INFO_##option, &options); \ if (ret != 0) \ raise_internal_error("pcre2_##name##_stub"); \ return options; \ } \ \ CAMLprim 
value pcre2_##name##_stub_bc(value v_rex) { \ return Val_int(pcre2_##name##_stub(v_rex)); \ } MAKE_INTNAT_INFO(size_t, size, SIZE) MAKE_INTNAT_INFO(int, capturecount, CAPTURECOUNT) MAKE_INTNAT_INFO(int, backrefmax, BACKREFMAX) MAKE_INTNAT_INFO(int, namecount, NAMECOUNT) MAKE_INTNAT_INFO(int, nameentrysize, NAMEENTRYSIZE) CAMLprim int64_t pcre2_argoptions_stub(value v_rex) { uint32_t options; const int ret = pcre2_pattern_info_stub(v_rex, PCRE2_INFO_ARGOPTIONS, &options); if (ret != 0) raise_internal_error("pcre2_argoptions_stub"); return (int64_t)options; } CAMLprim value pcre2_argoptions_stub_bc(value v_rex) { CAMLparam1(v_rex); CAMLreturn(caml_copy_int64(pcre2_argoptions_stub(v_rex))); } CAMLprim value pcre2_firstcodeunit_stub(value v_rex) { uint32_t firstcodetype; const int ret = pcre2_pattern_info_stub(v_rex, PCRE2_INFO_FIRSTCODETYPE, &firstcodetype); if (ret != 0) raise_internal_error("pcre2_firstcodeunit_stub"); switch (firstcodetype) { case 2: return var_Start_only; break; /* [`Start_only] */ case 0: return var_ANCHORED; break; /* [`ANCHORED] */ case 1: { uint32_t firstcodeunit; const int ret = pcre2_pattern_info_stub(v_rex, PCRE2_INFO_FIRSTCODEUNIT, &firstcodeunit); if (ret != 0) raise_internal_error("pcre2_firstcodeunit_stub"); value v_firstbyte; /* Allocates the non-constant constructor [`Char of char] and fills in the appropriate value */ v_firstbyte = caml_alloc_small(2, 0); Field(v_firstbyte, 0) = var_Char; Field(v_firstbyte, 1) = Val_int(firstcodeunit); return v_firstbyte; break; } default: /* Should not happen */ raise_internal_error("pcre2_firstcodeunit_stub"); } } CAMLprim value pcre2_lastcodeunit_stub(value v_rex) { uint32_t lastcodetype; const int ret = pcre2_pattern_info_stub(v_rex, PCRE2_INFO_LASTCODETYPE, &lastcodetype); if (ret != 0) raise_internal_error("pcre2_lastcodeunit_stub"); if (lastcodetype == 0) return Val_none; if (lastcodetype != 1) raise_internal_error("pcre2_lastcodeunit_stub"); else { uint32_t lastcodeunit; const int ret = 
pcre2_pattern_info_stub(v_rex, PCRE2_INFO_LASTCODEUNIT, &lastcodeunit); if (ret != 0) raise_internal_error("pcre2_lastcodeunit_stub"); return caml_alloc_some(Val_int(lastcodeunit)); } } CAMLnoreturn_start static inline void handle_match_error(char *loc, const int ret) CAMLnoreturn_end; static inline void handle_match_error(char *loc, const int ret) { switch (ret) { /* Dedicated exceptions */ case PCRE2_ERROR_NOMATCH: caml_raise_not_found(); case PCRE2_ERROR_PARTIAL: raise_partial(); case PCRE2_ERROR_MATCHLIMIT: raise_match_limit(); case PCRE2_ERROR_BADUTFOFFSET: raise_bad_utf_offset(); case PCRE2_ERROR_DEPTHLIMIT: raise_depth_limit(); case PCRE2_ERROR_DFA_WSSIZE: raise_workspace_size(); default: { if (PCRE2_ERROR_UTF8_ERR21 <= ret && ret <= PCRE2_ERROR_UTF8_ERR1) raise_bad_utf(); /* Unknown error */ char err_buf[100]; snprintf(err_buf, 100, "%s: unhandled PCRE2 error code: %d", loc, ret); raise_internal_error(err_buf); } } } static inline void handle_pcre2_match_result(size_t *ovec, value v_ovec, size_t ovec_len, long subj_start, uint32_t ret) { caml_int_ptr ocaml_ovec = (caml_int_ptr)&Field(v_ovec, 0); const uint32_t subgroups2 = ret * 2; const uint32_t subgroups2_1 = subgroups2 - 1; const size_t *ovec_src = ovec + subgroups2_1; caml_int_ptr ovec_clear_stop = ocaml_ovec + (ovec_len * 2) / 3; caml_int_ptr ovec_dst = ocaml_ovec + subgroups2_1; copy_ovector(subj_start, ovec_src, ovec_dst, subgroups2); while (++ovec_dst < ovec_clear_stop) *ovec_dst = -1; } /* Executes a pattern match with runtime options, a regular expression, a matching position, the start of the subject string, a subject string, a number of subgroup offsets, an offset vector and an optional callout function */ CAMLprim value pcre2_match_stub0(int64_t v_opt, value v_rex, intnat v_pos, intnat v_subj_start, value v_subj, value v_ovec, value v_maybe_cof, value v_workspace) { int ret; int is_dfa = v_workspace != (value)NULL; long pos = v_pos, subj_start = v_subj_start; size_t ovec_len = 
Wosize_val(v_ovec), len = caml_string_length(v_subj); if (pos > (long)len || pos < subj_start) caml_invalid_argument("Pcre2.pcre2_match_stub: illegal position"); if (subj_start > (long)len || subj_start < 0) caml_invalid_argument("Pcre2.pcre2_match_stub: illegal subject start"); pos -= subj_start; len -= subj_start; { const pcre2_code *code = get_rex(v_rex); /* Compiled pattern */ pcre2_match_context *mcontext = get_mcontext(v_rex); /* Match context */ PCRE2_SPTR ocaml_subj = (PCRE2_SPTR)String_val(v_subj) + subj_start; /* Subject string */ pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(code, NULL); /* Special case when no callout functions specified */ if (Is_none(v_maybe_cof)) { /* Performs the match */ if (is_dfa) ret = pcre2_dfa_match(code, ocaml_subj, len, pos, v_opt, match_data, mcontext, (int *)&Field(v_workspace, 0), Wosize_val(v_workspace)); else ret = pcre2_match(code, ocaml_subj, len, pos, v_opt, match_data, mcontext); size_t *ovec = pcre2_get_ovector_pointer(match_data); if (ret < 0) { pcre2_match_data_free(match_data); handle_match_error("pcre2_match_stub", ret); } else { handle_pcre2_match_result(ovec, v_ovec, ovec_len, subj_start, ret); } } /* There are callout functions */ else { value v_cof = Field(v_maybe_cof, 0); value v_substrings; PCRE2_UCHAR *subj = caml_stat_alloc(sizeof(char) * len); int workspace_len = 0; int *workspace = NULL; struct cod cod = {0, (value *)NULL, (value *)NULL, (value)NULL}; pcre2_match_context *new_mcontext = pcre2_match_context_copy(mcontext); pcre2_set_callout( new_mcontext, (int (*)(pcre2_callout_block_8 *, void *))&pcre2_callout_handler, &cod); cod.subj_start = subj_start; memcpy(subj, ocaml_subj, len); Begin_roots4(v_rex, v_cof, v_substrings, v_ovec); Begin_roots1(v_subj); v_substrings = caml_alloc_small(2, 0); End_roots(); Field(v_substrings, 0) = v_subj; Field(v_substrings, 1) = v_ovec; cod.v_substrings_p = &v_substrings; cod.v_cof_p = &v_cof; if (is_dfa) { workspace_len = 
Wosize_val(v_workspace); workspace = caml_stat_alloc(sizeof(int) * workspace_len); ret = pcre2_dfa_match(code, subj, len, pos, v_opt, match_data, new_mcontext, (int *)&Field(v_workspace, 0), workspace_len); } else ret = pcre2_match(code, subj, len, pos, v_opt, match_data, new_mcontext); caml_stat_free(subj); End_roots(); pcre2_match_context_free(new_mcontext); size_t *ovec = pcre2_get_ovector_pointer(match_data); if (ret < 0) { if (is_dfa) caml_stat_free(workspace); pcre2_match_data_free(match_data); if (ret == PCRE2_ERROR_CALLOUT) caml_raise(cod.v_exn); else handle_match_error("pcre2_match_stub(callout)", ret); } else { handle_pcre2_match_result(ovec, v_ovec, ovec_len, subj_start, ret); if (is_dfa) { caml_int_ptr ocaml_workspace_dst = (caml_int_ptr)&Field(v_workspace, 0); const int *workspace_src = workspace; const int *workspace_src_stop = workspace + workspace_len; while (workspace_src != workspace_src_stop) { *ocaml_workspace_dst = *workspace_src; ocaml_workspace_dst++; workspace_src++; } caml_stat_free(workspace); } } } pcre2_match_data_free(match_data); } return Val_unit; } CAMLprim value pcre2_match_stub(int64_t v_opt, value v_rex, intnat v_pos, intnat v_subj_start, value v_subj, value v_ovec, value v_maybe_cof) { return pcre2_match_stub0(v_opt, v_rex, v_pos, v_subj_start, v_subj, v_ovec, v_maybe_cof, (value)NULL); } /* Byte-code hook for pcre2_match_stub Needed, because there are more than 5 arguments */ CAMLprim value pcre2_match_stub_bc(value *argv, int __unused argn) { return pcre2_match_stub0(Int64_val(argv[0]), argv[1], Int_val(argv[2]), Int_val(argv[3]), argv[4], argv[5], argv[6], (value)NULL); } /* Byte-code hook for pcre2_dfa_match_stub Needed, because there are more than 5 arguments */ CAMLprim value pcre2_dfa_match_stub_bc(value *argv, int __unused argn) { return pcre2_match_stub0(Int64_val(argv[0]), argv[1], Int_val(argv[2]), Int_val(argv[3]), argv[4], argv[5], argv[6], argv[7]); } static struct custom_operations tables_ops = { 
"pcre2_ocaml_tables", pcre2_dealloc_tables, custom_compare_default, custom_hash_default, custom_serialize_default, custom_deserialize_default, custom_compare_ext_default, custom_fixed_length_default}; /* Generates a new set of chartables for the current locale (see man page of PCRE */ CAMLprim value pcre2_maketables_stub(value __unused v_unit) { /* According to testing with `malloc_size`, it seems that a typical set of tables will require about 1536 bytes of memory. This may or may not be true on other platforms or for all versions of PCRE. Since there is apparently no reliable way of finding out, 1536 is probably a good default value. */ size_t tables_size = sizeof(struct pcre2_ocaml_tables); const value v_tables = caml_alloc_custom_mem(&tables_ops, tables_size, 1536); set_tables(v_tables, pcre2_maketables(NULL)); return v_tables; } /* Wraps around the isspace-function */ CAMLprim value pcre2_isspace_stub(value v_c) { return Val_bool(isspace(Int_val(v_c))); } /* Returns number of substring associated with a name */ CAMLprim intnat pcre2_substring_number_from_name_stub(value v_rex, value v_name) { const int ret = pcre2_substring_number_from_name( get_rex(v_rex), (PCRE2_SPTR)String_val(v_name)); if (ret == PCRE2_ERROR_NOSUBSTRING) caml_invalid_argument("Named string not found"); return ret; } CAMLprim value pcre2_substring_number_from_name_stub_bc(value v_rex, value v_name) { return Val_int(pcre2_substring_number_from_name_stub(v_rex, v_name)); } /* Returns array of names of named substrings in a regexp */ CAMLprim value pcre2_names_stub(value v_rex) { CAMLparam1(v_rex); CAMLlocal1(v_res); uint32_t name_count; uint32_t entry_size; const char *tbl_ptr; uint32_t i; int ret = pcre2_pattern_info_stub(v_rex, PCRE2_INFO_NAMECOUNT, &name_count); if (ret != 0) raise_internal_error("pcre2_names_stub: namecount"); ret = pcre2_pattern_info_stub(v_rex, PCRE2_INFO_NAMEENTRYSIZE, &entry_size); if (ret != 0) raise_internal_error("pcre2_names_stub: nameentrysize"); ret = 
pcre2_pattern_info_stub(v_rex, PCRE2_INFO_NAMETABLE, &tbl_ptr); if (ret != 0) raise_internal_error("pcre2_names_stub: nametable"); v_res = caml_alloc(name_count, 0); for (i = 0; i < name_count; ++i) { value v_name = caml_copy_string(tbl_ptr + 2); Store_field(v_res, i, v_name); tbl_ptr += entry_size; } CAMLreturn(v_res); } /* Generic stub for getting integer results from pcre2_config */ static inline int pcre2_config_int(int what) { int ret; pcre2_config(what, (void *)&ret); return ret; } /* Generic stub for getting long integer results from pcre2_config */ static inline long pcre2_config_long(int what) { long ret; pcre2_config(what, (void *)&ret); return ret; } /* Some stubs for config-functions */ /* Makes OCaml-string from PCRE-version */ CAMLprim value pcre2_version_stub(value __unused v_unit) { CAMLparam1(v_unit); CAMLlocal1(v_version); v_version = caml_alloc_string(32); pcre2_config(PCRE2_CONFIG_VERSION, (void *)String_val(v_version)); CAMLreturn(v_version); } /* Returns boolean indicating unicode support */ CAMLprim value pcre2_config_unicode_stub(value __unused v_unit) { return Val_bool(pcre2_config_int(PCRE2_CONFIG_UNICODE)); } /* Returns character used as newline */ CAMLprim value pcre2_config_newline_stub(value __unused v_unit) { return Val_int(pcre2_config_int(PCRE2_CONFIG_NEWLINE)); } /* Returns number of bytes used for internal linkage of regular expressions */ CAMLprim intnat pcre2_config_link_size_stub(value __unused v_unit) { return pcre2_config_int(PCRE2_CONFIG_LINKSIZE); } CAMLprim value pcre2_config_link_size_stub_bc(value v_unit) { return Val_int(pcre2_config_link_size_stub(v_unit)); } /* Returns default limit for calls to internal matching function */ CAMLprim intnat pcre2_config_match_limit_stub(value __unused v_unit) { return pcre2_config_long(PCRE2_CONFIG_MATCHLIMIT); } CAMLprim value pcre2_config_match_limit_stub_bc(value v_unit) { return Val_int(pcre2_config_match_limit_stub(v_unit)); } /* Returns default limit for depth of nested 
backtracking */ CAMLprim intnat pcre2_config_depth_limit_stub(value __unused v_unit) { return pcre2_config_long(PCRE2_CONFIG_DEPTHLIMIT); } CAMLprim value pcre2_config_depth_limit_stub_bc(value v_unit) { return Val_int(pcre2_config_depth_limit_stub(v_unit)); } /* Returns boolean indicating use of stack recursion */ CAMLprim intnat pcre2_config_stackrecurse_stub(value __unused v_unit) { return Val_bool(pcre2_config_int(PCRE2_CONFIG_STACKRECURSE)); } pcre2-ocaml-8.0.3/test/000077500000000000000000000000001475421476700146265ustar00rootroot00000000000000pcre2-ocaml-8.0.3/test/dune000066400000000000000000000003171475421476700155050ustar00rootroot00000000000000(test (name old_pcre2_tests) (modules Old_pcre2_tests) (libraries pcre2 ounit2)) (test (name pcre2_tests) (modules Pcre2_tests) (libraries pcre2 ounit2)) (env (dev (flags (:standard -w -27)))) pcre2-ocaml-8.0.3/test/old_pcre2_tests.ml000066400000000000000000000010451475421476700202530ustar00rootroot00000000000000open OUnit2 open Pcre2 let simple_test ctxt = assert_equal 0 0; assert_equal [ Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd" ] (full_split ~pat:"(x)|(u)" "abxcd"); assert_equal [ Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd"; Delim "u"; NoGroup; Group (2, "u"); Text "ef"; ] (full_split ~pat:"(x)|(u)" "abxcduef") let suite = "Test pcre" >::: [ "simple_test" >:: simple_test ] let _ = if not !Sys.interactive then run_test_tt_main suite else () pcre2-ocaml-8.0.3/test/pcre2_tests.ml000066400000000000000000000525021475421476700174210ustar00rootroot00000000000000(**pp -syntax camlp5o -package pa_ppx.deriving_plugins.std *) open OUnit2 let test_special_char_regexps ctxt = (); assert_equal "\n" ((let __re__ = Pcre2.regexp ~flags:[`DOTALL] "\\n$" in fun __subj__ -> (fun __g__ -> Pcre2.get_substring __g__ 0) (Pcre2.exec ~rex:__re__ __subj__)) "\n"); assert_equal "" (Pcre2.substitute_substrings_first ~rex:(Pcre2.regexp ~flags:[`DOTALL] "\\n+$") ~subst:(fun __g__ -> String.concat "" []) "\n\n") let 
test_pcre2_simple_match ctxt =
  (* Matches of the literal pattern abc: group 0 extraction, option-wrapped
     results, pmatch, and Not_found on a non-matching subject *)
  ();
  (* Whole match fetched from the substrings returned by exec *)
  assert_equal "abc"
    (Pcre2.get_substring
       ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc") 0);
  (* Option form: Not_found from exec or get_substring becomes None *)
  assert_equal (Some "abc")
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "abc");
  assert_equal (Some "abc")
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "abc");
  (* Boolean form *)
  assert_equal true
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ -> Pcre2.pmatch ~rex:__re__ __subj__) "abc");
  assert_equal false
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ -> Pcre2.pmatch ~rex:__re__ __subj__) "abd");
  (* Subject abd does not match *)
  assert_equal None
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "abd");
  (* Without the option wrapper the failure surfaces as Not_found *)
  assert_raises Not_found
    (fun () ->
      (let __re__ = Pcre2.regexp ~flags:[] "abc" in
       fun __subj__ ->
         (fun __g__ -> Pcre2.get_substring __g__ 0)
           (Pcre2.exec ~rex:__re__ __subj__)) "abd");
  assert_raises Not_found
    (fun () ->
      (let __re__ = Pcre2.regexp ~flags:[] "abc" in
       fun __subj__ ->
         (fun __g__ -> Pcre2.get_substring __g__ 0)
           (Pcre2.exec ~rex:__re__ __subj__)) "abd");
  assert_equal None
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "abd");
  assert_equal "abc"
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ ->
        (fun __g__ -> Pcre2.get_substring __g__ 0)
          (Pcre2.exec
~rex:__re__ __subj__)) "abc");
  (* Group 1 is read through a try, so an unset group yields None *)
  assert_equal ("abc", Some "b")
    ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
      fun __subj__ ->
        (fun __g__ ->
          Pcre2.get_substring __g__ 0,
          (try Some (Pcre2.get_substring __g__ 1) with Not_found -> None))
          (Pcre2.exec ~rex:__re__ __subj__)) "abc");
  assert_equal ("ac", None)
    ((let __re__ = Pcre2.regexp ~flags:[] "a(?:(b)?)c" in
      fun __subj__ ->
        (fun __g__ ->
          Pcre2.get_substring __g__ 0,
          (try Some (Pcre2.get_substring __g__ 1) with Not_found -> None))
          (Pcre2.exec ~rex:__re__ __subj__)) "ac");
  (* Upper-case pattern against a lower-case subject with the CASELESS flag *)
  assert_equal "abc"
    (Pcre2.get_substring
       ((let __re__ = Pcre2.regexp ~flags:[`CASELESS] "ABC" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc") 0);
  (* Three groups, each read individually *)
  assert_equal ("abc", Some "a", Some "b", Some "c")
    ((let __re__ = Pcre2.regexp ~flags:[] "(a)(b)(c)" in
      fun __subj__ ->
        (fun __g__ ->
          Pcre2.get_substring __g__ 0,
          (try Some (Pcre2.get_substring __g__ 1) with Not_found -> None),
          (try Some (Pcre2.get_substring __g__ 2) with Not_found -> None),
          (try Some (Pcre2.get_substring __g__ 3) with Not_found -> None))
          (Pcre2.exec ~rex:__re__ __subj__)) "abc")

(* Selecting individual groups: guarded reads give options, unguarded reads
   raise Not_found for an unset group *)
let test_pcre2_selective_match ctxt =
  ();
  assert_equal ("abc", Some "b")
    ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
      fun __subj__ ->
        (fun __g__ ->
          Pcre2.get_substring __g__ 0,
          (try Some (Pcre2.get_substring __g__ 1) with Not_found -> None))
          (Pcre2.exec ~rex:__re__ __subj__)) "abc");
  assert_equal ("abc", "b")
    ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
      fun __subj__ ->
        (fun __g__ -> Pcre2.get_substring __g__ 0, Pcre2.get_substring __g__ 1)
          (Pcre2.exec ~rex:__re__ __subj__)) "abc");
  assert_equal "b"
    ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
      fun __subj__ ->
        (fun __g__ -> Pcre2.get_substring __g__ 1)
          (Pcre2.exec ~rex:__re__ __subj__)) "abc");
  assert_equal (Some ("abc", "b"))
    ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
      fun __subj__ ->
        match Option.map
          (fun __g__ -> Pcre2.get_substring __g__ 0, Pcre2.get_substring __g__ 1)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
with exception Not_found -> None | rv -> rv) "abc");
  (* Optional group that does not take part in the match *)
  assert_equal ("ac", None)
    ((let __re__ = Pcre2.regexp ~flags:[] "a(b)?c" in
      fun __subj__ ->
        (fun __g__ ->
          Pcre2.get_substring __g__ 0,
          (try Some (Pcre2.get_substring __g__ 1) with Not_found -> None))
          (Pcre2.exec ~rex:__re__ __subj__)) "ac");
  (* The unguarded read of group 1 raises *)
  assert_raises Not_found
    (fun _ ->
      (let __re__ = Pcre2.regexp ~flags:[] "a(b)?c" in
       fun __subj__ ->
         (fun __g__ -> Pcre2.get_substring __g__ 0, Pcre2.get_substring __g__ 1)
           (Pcre2.exec ~rex:__re__ __subj__)) "ac");
  (* The same read inside the option form collapses to None *)
  assert_equal None
    ((let __re__ = Pcre2.regexp ~flags:[] "a(b)?c" in
      fun __subj__ ->
        match Option.map
          (fun __g__ -> Pcre2.get_substring __g__ 0, Pcre2.get_substring __g__ 1)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "ac")

(* exec finds a match that starts after the beginning of the subject unless
   the pattern is anchored with a caret *)
let test_pcre2_search ctxt =
  ();
  assert_equal "abc"
    ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
      fun __subj__ ->
        (fun __g__ -> Pcre2.get_substring __g__ 0)
          (Pcre2.exec ~rex:__re__ __subj__)) "zzzabc");
  assert_equal None
    ((let __re__ = Pcre2.regexp ~flags:[] "^abc" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "zzzabc")

(* Printer for string options, passed to assert_equal as ~printer *)
let show_string_option = function
    None -> "None"
  | Some s -> Printf.sprintf "Some %s" s

(* Behaviour of the dot with and without the DOTALL and MULTILINE flags *)
let test_pcre2_single ctxt =
  let printer = show_string_option in
  ();
  (* The subject consists of two newlines only *)
  assert_equal ~printer None
    ((let __re__ = Pcre2.regexp ~flags:[] ".+" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "\n\n");
  assert_equal ~printer None
    ((let __re__ = Pcre2.regexp ~flags:[`MULTILINE] ".+" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "\n\n");
  assert_equal ~printer
None
    ((let __re__ = Pcre2.regexp ~flags:[] ".+" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "\n\n");
  (* Only with DOTALL does the dot match the newlines *)
  assert_equal ~printer (Some "\n\n")
    ((let __re__ = Pcre2.regexp ~flags:[`DOTALL] ".+" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "\n\n");
  assert_equal ~printer None
    ((let __re__ = Pcre2.regexp ~flags:[`MULTILINE] ".+" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "\n\n");
  let printer x = x in
  ();
  assert_equal ~printer "\n\n"
    ((let __re__ = Pcre2.regexp ~flags:[`DOTALL] ".+" in
      fun __subj__ ->
        (fun __g__ -> Pcre2.get_substring __g__ 0)
          (Pcre2.exec ~rex:__re__ __subj__)) "\n\n");
  (* The substitutions below wrap every match in doubled angle brackets.
     The expected strings had lost each run of the form less-than, letters,
     greater-than to markup stripping; they are restored here from the
     pattern, the subject and the substitution function. *)
  assert_equal ~printer "<<abc>>\ndef"
    (Pcre2.substitute_substrings_first ~rex:(Pcre2.regexp ~flags:[] ".+")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abc\ndef");
  assert_equal ~printer "<<abc\ndef>>"
    (Pcre2.substitute_substrings_first
       ~rex:(Pcre2.regexp ~flags:[`DOTALL] ".+")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abc\ndef");
  assert_equal ~printer "<<abc>>\ndef"
    (Pcre2.substitute_substrings_first
       ~rex:(Pcre2.regexp ~flags:[`MULTILINE] ".+")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abc\ndef");
  assert_equal ~printer "<<abc>>\ndef"
    (Pcre2.substitute_substrings_first ~rex:(Pcre2.regexp ~flags:[] ".*")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abc\ndef");
  (* Replacing every match: the star pattern also matches the empty string
     at the end of each line *)
  assert_equal ~printer "<<abc>><<>>\n<<def>><<>>"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[] ".*")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abc\ndef");
  assert_equal ~printer "<<abc>>\n<<def>>"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[] ".+")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abc\ndef");
  (* Without DOTALL the dot does not match the newline in the middle *)
  assert_equal ~printer "<<abc>>a\nc<<aec>>"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[] "a.c")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abca\ncaec");
  assert_equal ~printer "<<abc>><<a\nc>><<aec>>"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[`DOTALL] "a.c")
       ~subst:(fun __g__ ->
         String.concat ""
           ["<<";
            begin match Pcre2.get_substring __g__ 0 with
              exception Not_found -> "" | s -> s end;
            ">>"])
       "abca\ncaec")

(* The dollar anchor with and without the MULTILINE flag *)
let test_pcre2_multiline ctxt =
  ();
  assert_equal (Some "bar")
    ((let __re__ = Pcre2.regexp ~flags:[] ".+$" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "foo\nbar");
  assert_equal (Some "foo")
    ((let __re__ = Pcre2.regexp ~flags:[`MULTILINE] ".+$" in
      fun __subj__ ->
        match Option.map (fun __g__ -> Pcre2.get_substring __g__ 0)
          (try Some (Pcre2.exec ~rex:__re__ __subj__) with Not_found -> None)
        with exception Not_found -> None | rv -> rv) "foo\nbar")

(* Splitting a subject that does not contain the delimiter *)
let test_pcre2_simple_split ctxt =
  ();
  assert_equal ["bb"]
    ((let __re__ = Pcre2.regexp ~flags:[] "a" in
      fun __subj__ -> Pcre2.split ~rex:__re__ __subj__) "bb")

(* full_split results, including the Group and NoGroup markers *)
let test_pcre2_delim_split_raw ctxt =
  let open Pcre2 in
  begin
    ();
    assert_equal [Delim "a"; Text "b"; Delim "a"; Text "b"]
      ((let __re__ = Pcre2.regexp ~flags:[] "a" in
        fun __subj__ ->
Pcre2.full_split ~rex:__re__ __subj__) "ababa");
    (* Two adjacent delimiters *)
    assert_equal [Delim "a"; Text "b"; Delim "a"; Delim "a"; Text "b"]
      ((let __re__ = Pcre2.regexp ~flags:[] "a" in
        fun __subj__ -> Pcre2.full_split ~rex:__re__ __subj__) "abaaba");
    (* Optional group: NoGroup where it did not take part in the match *)
    assert_equal
      [Delim "a"; NoGroup; Text "b"; Delim "ac"; Group (1, "c"); Text "b";
       Delim "a"; NoGroup]
      ((let __re__ = Pcre2.regexp ~flags:[] "a(c)?" in
        fun __subj__ -> Pcre2.full_split ~rex:__re__ __subj__) "abacba");
    (* Mandatory group: every delimiter is followed by its Group entry *)
    assert_equal
      [Delim "ac"; Group (1, "c"); Text "b"; Delim "ac"; Group (1, "c");
       Text "b"; Delim "ac"; Group (1, "c")]
      ((let __re__ = Pcre2.regexp ~flags:[] "a(c)" in
        fun __subj__ -> Pcre2.full_split ~rex:__re__ __subj__) "acbacbac");
    assert_equal
      [Delim "ac"; Group (1, "c"); Text "b"; Delim "ac"; Group (1, "c");
       Text "b"; Delim "ac"; Group (1, "c")]
      ((let __re__ = Pcre2.regexp ~flags:[] "a(c)" in
        fun __subj__ -> Pcre2.full_split ~rex:__re__ __subj__) "acbacbac");
    assert_equal
      [Delim "a"; NoGroup; Text "b"; Delim "ac"; Group (1, "c"); Text "b";
       Delim "a"; NoGroup]
      ((let __re__ = Pcre2.regexp ~flags:[] "a(c)?"
in fun __subj__ -> Pcre2.full_split ~rex:__re__ __subj__) "abacba");
    (* Alternation of two groups: the group that did not match is NoGroup *)
    assert_equal [Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd"]
      ((let __re__ = Pcre2.regexp ~flags:[] "(x)|(u)" in
        fun __subj__ -> Pcre2.full_split ~rex:__re__ __subj__) "abxcd");
    assert_equal
      [Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd"; Delim "u";
       NoGroup; Group (2, "u")]
      ((let __re__ = Pcre2.regexp ~flags:[] "(x)|(u)" in
        fun __subj__ -> Pcre2.full_split ~rex:__re__ __subj__) "abxcdu")
  end

(* Strings assembled from literal pieces, captured groups and local values;
   a group that is not set contributes the empty string *)
let test_pcre2_string_pattern ctxt =
  ();
  assert_equal "$b"
    ((fun __g__ ->
       String.concat ""
         ["$"; "";
          match Pcre2.get_substring __g__ 1 with
            exception Not_found -> "" | s -> s])
       ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc"));
  (* The group number is written with a leading zero here *)
  assert_equal "b"
    ((fun __g__ ->
       String.concat ""
         [match Pcre2.get_substring __g__ 01 with
            exception Not_found -> "" | s -> s])
       ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc"));
  (* The trailing s is the outer binding, not the one in the match arm *)
  assert_equal "bx"
    (let s = "x" in
     (fun __g__ ->
       String.concat ""
         [begin match Pcre2.get_substring __g__ 01 with
            exception Not_found -> "" | s -> s end;
          ""; s])
       ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc"));
  assert_equal "\"bx"
    (let s = "x" in
     (fun __g__ ->
       String.concat ""
         ["\"";
          begin match Pcre2.get_substring __g__ 01 with
            exception Not_found -> "" | s -> s end;
          ""; s])
       ((let __re__ = Pcre2.regexp ~flags:[] "a(b)c" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc"));
  (* No regular expression involved *)
  assert_equal "\"x" (let s = "x" in String.concat "" ["\""; s])

(* The same idea with ordinary expressions instead of String.concat *)
let test_pcre2_expr_pattern ctxt =
  ();
  assert_equal "abc"
    ((fun __g__ ->
       match Pcre2.get_substring __g__ 0 with
         exception Not_found -> "" | s -> s)
       ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc"));
  assert_equal "abcx"
    ((fun __g__ ->
       (match Pcre2.get_substring __g__ 0 with
          exception Not_found -> "" | s -> s) ^ "x")
       ((let __re__ =
Pcre2.regexp ~flags:[] "abc" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc"));
  assert_equal "abcx"
    (let x = "x" in
     (fun __g__ ->
       (match Pcre2.get_substring __g__ 0 with
          exception Not_found -> "" | s -> s) ^ x)
       ((let __re__ = Pcre2.regexp ~flags:[] "abc" in
         fun __subj__ -> Pcre2.exec ~rex:__re__ __subj__) "abc"));
  (* No regular expression involved *)
  assert_equal "x" (let x = "x" in "" ^ x)

(* Substitution of the first match and of all matches; the replacement is
   computed from the substrings of each match *)
let test_pcre2_subst ctxt =
  ();
  assert_equal "$b"
    (Pcre2.substitute_substrings_first ~rex:(Pcre2.regexp ~flags:[] "a(b)c")
       ~subst:(fun __g__ ->
         String.concat ""
           ["$"; "";
            match Pcre2.get_substring __g__ 1 with
              exception Not_found -> "" | s -> s])
       "abc");
  assert_equal "$b"
    (Pcre2.substitute_substrings_first
       ~rex:(Pcre2.regexp ~flags:[`CASELESS] "A(B)C")
       ~subst:(fun __g__ ->
         String.concat ""
           ["$"; "";
            match Pcre2.get_substring __g__ 1 with
              exception Not_found -> "" | s -> s])
       "abc");
  (* Only the first of the two occurrences is replaced *)
  assert_equal "$babc"
    (Pcre2.substitute_substrings_first
       ~rex:(Pcre2.regexp ~flags:[`CASELESS] "A(B)C")
       ~subst:(fun __g__ ->
         String.concat ""
           ["$"; "";
            match Pcre2.get_substring __g__ 1 with
              exception Not_found -> "" | s -> s])
       "abcabc");
  (* Both occurrences are replaced *)
  assert_equal "$b$b"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[`CASELESS] "A(B)C")
       ~subst:(fun __g__ ->
         String.concat ""
           ["$"; "";
            match Pcre2.get_substring __g__ 1 with
              exception Not_found -> "" | s -> s])
       "abcabc");
  assert_equal "$b$b"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[`CASELESS] "A(B)C")
       ~subst:(fun __g__ ->
         "$" ^ (match Pcre2.get_substring __g__ 1 with
                  exception Not_found -> "" | s -> s))
       "abcabc");
  (* Replacement functions that ignore the match *)
  assert_equal "$$"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[`CASELESS] "A(B)C")
       ~subst:(fun __g__ -> "$")
       "abcabc");
  assert_equal "$$"
    (Pcre2.substitute_substrings ~rex:(Pcre2.regexp ~flags:[`CASELESS] "A(B)C")
       ~subst:(fun __g__ -> String.concat "" ["$"])
       "abcabc")

(* Extracts the text captured by the group of the pattern below from a
   header line *)
let test_pcre2_ocamlfind_bits ctxt =
  ();
  assert_equal ~printer:show_string_option (Some "-syntax camlp5o ")
    (snd
       ((let __re__ = Pcre2.regexp ~flags:[] "^\\(\\*\\*pp (.*?)\\*\\)" in
         fun
__subj__ ->
           (fun __g__ ->
             Pcre2.get_substring __g__ 0,
             (try Some (Pcre2.get_substring __g__ 1) with Not_found -> None))
             (Pcre2.exec ~rex:__re__ __subj__))
          "(**pp -syntax camlp5o *)\n"))

(* Replaces every match of the pattern below in s by the result of
   envlookup applied to whichever of the two groups is non-empty: group 1
   belongs to the parenthesised alternative, group 2 to the braced one.
   The assert false branch is reached only when both groups are empty. *)
let pcre2_envsubst envlookup s =
  let f s1 s2 =
    if s1 <> "" then envlookup s1
    else if s2 <> "" then envlookup s2
    else assert false in
  Pcre2.substitute_substrings
    ~rex:(Pcre2.regexp ~flags:[] "(?:\\$\\(([^)]+)\\)|\\$\\{([^}]+)\\})")
    ~subst:(fun __g__ ->
      f
        (match Pcre2.get_substring __g__ 1 with
           exception Not_found -> "" | s -> s)
        (match Pcre2.get_substring __g__ 2 with
           exception Not_found -> "" | s -> s))
    s

(* Runs pcre2_envsubst with a lookup that knows the names A and B and fails
   on anything else *)
let test_pcre2_envsubst_via_replace ctxt =
  let f = function
      "A" -> "res1"
    | "B" -> "res2"
    | _ -> failwith "unexpected arg in envsubst" in
  assert_equal "...res1...res2..." (pcre2_envsubst f "...$(A)...${B}...")

(* The OUnit suite: thirteen labelled test cases *)
let suite =
  "Test pa_ppx_regexp" >:::
  ["pcre2 simple_match" >:: test_pcre2_simple_match;
   "pcre2 selective_match" >:: test_pcre2_selective_match;
   "pcre2 search" >:: test_pcre2_search;
   "pcre2 single" >:: test_pcre2_single;
   "pcre2 multiline" >:: test_pcre2_multiline;
   "pcre2 simple_split" >:: test_pcre2_simple_split;
   "pcre2 delim_split raw" >:: test_pcre2_delim_split_raw;
   "pcre2 string_pattern" >:: test_pcre2_string_pattern;
   "pcre2 expr_pattern" >:: test_pcre2_expr_pattern;
   "pcre2 subst" >:: test_pcre2_subst;
   "pcre2 ocamlfind bits" >:: test_pcre2_ocamlfind_bits;
   "pcre2 envsubst via replace" >:: test_pcre2_envsubst_via_replace;
   "pcre only_regexps" >:: test_special_char_regexps]

(* Runs the suite unless the file is loaded into an interactive toplevel *)
let _ = if not !(Sys.interactive) then run_test_tt_main suite