pax_global_header00006660000000000000000000000064135554062200014514gustar00rootroot0000000000000052 comment=45a2781669d4dc1616f9d07510b366cbe0ae99d4 pcre-ocaml-7.4.3/000077500000000000000000000000001355540622000135515ustar00rootroot00000000000000pcre-ocaml-7.4.3/.gitignore000066400000000000000000000000401355540622000155330ustar00rootroot00000000000000.*.swp .merlin *.install _build pcre-ocaml-7.4.3/CHANGES.md000066400000000000000000000020101355540622000151340ustar00rootroot00000000000000### 7.4.3 (2019-10-27) * Switched from `caml_alloc_custom` to `caml_alloc_custom_mem`. This should improve memory usage and GC performance. * Switched to OPAM file generation via `dune-project` ### 7.4.2 (2019-10-11) * Fixed warnings in C-stubs ### 7.4.1 (2019-02-21) * Fixed pattern execution bug due to DFA implementation ### 7.4.0 (2019-02-05) * Added DFA support New functions: * pcre_dfa_exec * unsafe_pcre_dfa_exec Thanks to Chas Emerick for this contribution! ### 7.3.5 (2018-10-25) * Switched to dune, dune-release, and OPAM 2.0 ### 7.3.4 (2017-11-22) * Improved finalization of regular expressions and tables for better performance ### 7.3.3 (2017-10-17) * Fixed external declaration bug in internal regexp compile function ### 7.3.2 (2017-10-10) * Improved compatibility with MSVC ### 7.3.1 (2017-10-08) * Used untagged integers when declaring external functions ### 7.3.0 (2017-07-27) * Switched to jbuilder and topkg pcre-ocaml-7.4.3/LICENSE.md000066400000000000000000000654331355540622000151700ustar00rootroot00000000000000Copyright (c) 1999- Markus Mottl The Library is distributed under the terms of the GNU Lesser General Public License version 2.1 (included below). 
As a special exception to the GNU Lesser General Public License, you may link, statically or dynamically, a "work that uses the Library" with a publicly distributed version of the Library to produce an executable file containing portions of the Library, and distribute that executable file under terms of your choice, without any of the additional requirements listed in clause 6 of the GNU Lesser General Public License. By "a publicly distributed version of the Library", we mean either the unmodified Library as distributed by the authors, or a modified version of the Library that is distributed under the conditions defined in clause 3 of the GNU Lesser General Public License. This exception does not however invalidate any other reasons why the executable file might be covered by the GNU Lesser General Public License. --------------------------------------------------------------------------- ### GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] ### Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. 
You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. 
Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. 
In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. ### TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION **0.** This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. 
(Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. **1.** You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. **2.** You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: - **a)** The modified work must itself be a software library. - **b)** You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. - **c)** You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. 
- **d)** If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 
**3.** You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. **4.** You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. **5.** A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. 
However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. **6.** As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. 
If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: - **a)** Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) - **b)** Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. - **c)** Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. - **d)** If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. - **e)** Verify that the user has already received a copy of these materials or that you have already sent this user a copy. 
For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. **7.** You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: - **a)** Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. - **b)** Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. **8.** You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. 
However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. **9.** You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. **10.** Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. **11.** If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. 
If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. **12.** If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. **13.** The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. 
If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. **14.** If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. **NO WARRANTY** **15.** BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. **16.** IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
### END OF TERMS AND CONDITIONS ### How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. one line to give the library's name and an idea of what it does. Copyright (C) year name of author This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. signature of Ty Coon, 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
pcre-ocaml-7.4.3/Makefile000066400000000000000000000001341355540622000152070ustar00rootroot00000000000000.PHONY: all clean doc all: dune build @install clean: dune clean doc: dune build @doc pcre-ocaml-7.4.3/README.md000066400000000000000000000131231355540622000150300ustar00rootroot00000000000000## PCRE-OCaml - Perl Compatibility Regular Expressions for OCaml This [OCaml](http://www.ocaml.org)-library interfaces the C-library [PCRE](http://www.pcre.org) (Perl-compatibility Regular Expressions). It can be used for string matching with "PERL"-style regular expressions. ### Features PCRE-OCaml offers the following functionality for operating on strings: * Searching for patterns * Extracting subpatterns * Splitting strings according to patterns * Pattern substitution Other reasons to use PCRE-OCaml: * The PCRE-library by Philip Hazel has been under development for many years and is fairly advanced and stable. It implements just about all of the functionality that can be found in PERL regular expressions. The higher-level functions written in OCaml (split, replace, etc.), too, are compatible with the corresponding PERL-functions to the extent that OCaml allows. Most people find the syntax of PERL-style regular expressions more straightforward and powerful than the Emacs-style regular expressions used in the `Str`-module in the standard OCaml distribution. * PCRE-OCaml is reentrant and thus thread-safe, which is not the case for the `Str`-module in the OCaml standard library. Using reentrant libraries also means more convenience for programmers. They do not have to reason about states the library might be in. * The high-level functions for replacement and substitution, which are all implemented in OCaml, are much faster than the ones in the `Str`-module. In fact, when compiled to native code, they even seem to be significantly faster than those found in PERL (PERL is written in C). * You can rely on the data returned being unique. 
In other words: if the result of a function is a string, you can safely use destructive updates on it without having to fear side effects. * The interface to the library makes use of labels and default arguments to give you a high degree of programming comfort. ### Usage Please consult the [API](https://mmottl.github.io/pcre-ocaml/api/pcre) for details. A general concept the user may need to understand is that most functions allow for two different kinds of flags: 1. "Convenience"-flags that make for readable and concise code, but which need to be translated to an internal representation on each call. Example: ```ocaml let rex = Pcre.regexp ~flags:[`ANCHORED; `CASELESS] "some pattern" in (* ... *) ``` This makes it easy to pass flags on the fly. They will be translated to the internal format automatically. However, if this happens to be in a loop, this translation will occur on each iteration. If you really need to save as much performance as possible, you should use the next approach. 2. "Internal" flags that need to be defined and translated from "convenience"-flags before function calls, but which allow for optimum performance in loops. Example: ```ocaml let iflags = Pcre.cflags [`ANCHORED; `CASELESS] in for i = 1 to 1000 do let rex = Pcre.regexp ~iflags "some pattern constructed at runtime" in (* ... *) done ``` Factoring out the translation of flags for regular expressions may save some cycles, but don't expect too much. You can save more CPU time by lifting the creation of regular expressions out of loops. Example of what not to do: ```ocaml for i = 1 to 1000 do let chunks = Pcre.split ~pat:"[ \t]+" "foo bar" in (* ... *) done ``` Better: ```ocaml let rex = Pcre.regexp "[ \t]+" in for i = 1 to 1000 do let chunks = Pcre.split ~rex "foo bar" in (* ... *) done ``` The provided functions use optional arguments with intuitive defaults. For example, the `Pcre.split`-function will assume whitespace as the pattern. 
The `examples`-directory contains a few example applications demonstrating the functionality of PCRE-OCaml. #### Restartable (partial) pattern matching PCRE includes an "alternative" DFA match function that allows one to restart a partial match with additional input. This is exposed by `pcre-ocaml` via the `pcre_dfa_exec` function. While this cannot be used for "higher-level" operations like extracting submatches or splitting subject strings, it can be very useful in certain streaming and search use cases. This `utop` interaction demonstrates the basic workflow of a partial match that is then restarted multiple times before completing successfully: ```ocaml utop # open Pcre;; utop # let rex = regexp "12+3";; val rex : regexp = <abstr> utop # let workspace = Array.make 40 0;; val workspace : int array = [|0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0|] utop # pcre_dfa_exec ~rex ~flags:[`PARTIAL] ~workspace "12222";; Exception: Pcre.Error Partial. utop # pcre_dfa_exec ~rex ~flags:[`PARTIAL; `DFA_RESTART] ~workspace "2222222";; Exception: Pcre.Error Partial. utop # pcre_dfa_exec ~rex ~flags:[`PARTIAL; `DFA_RESTART] ~workspace "2222222";; Exception: Pcre.Error Partial. utop # pcre_dfa_exec ~rex ~flags:[`PARTIAL; `DFA_RESTART] ~workspace "223xxxx";; - : int array = [|0; 3; 0|] ``` Please refer to the documentation of `pcre_dfa_exec` and check out the `dfa_restart` example for more info. ### Contact Information and Contributing Please submit bug reports, feature requests, contributions and similar to the [GitHub issue tracker](https://github.com/mmottl/pcre-ocaml/issues). 
Up-to-date information is available at: pcre-ocaml-7.4.3/dune000066400000000000000000000002401355540622000144230ustar00rootroot00000000000000(env (dev (flags (:standard -w -9 -principal)) (c_flags (:standard -Wall -pedantic -Wextra -Wunused))) (release (ocamlopt_flags (:standard -O3))) ) pcre-ocaml-7.4.3/dune-project000066400000000000000000000013431355540622000160740ustar00rootroot00000000000000(lang dune 1.10) (name pcre) (generate_opam_files true) (source (github mmottl/pcre-ocaml)) (license "LGPL-2.1+ with OCaml linking exception") (homepage "https://mmottl.github.io/pcre-ocaml") (documentation "https://mmottl.github.io/pcre-ocaml/api") (maintainers "Markus Mottl ") (authors "Markus Mottl ") (package (name pcre) (synopsis "Bindings to the Perl Compatibility Regular Expressions library") (description "\ pcre-ocaml offers library functions for string pattern matching and substitution, similar to the functionality offered by the Perl language.") (depends (ocaml (>= 4.08)) (dune (>= 1.10)) (conf-libpcre :build) (base :build) base-bytes ) ) pcre-ocaml-7.4.3/examples/000077500000000000000000000000001355540622000153675ustar00rootroot00000000000000pcre-ocaml-7.4.3/examples/Makefile000066400000000000000000000002151355540622000170250ustar00rootroot00000000000000TARGETS = $(addsuffix .bc, cloc count_hash dfa_restart pcregrep subst) .PHONY: all clean all: @dune build $(TARGETS) clean: @dune clean pcre-ocaml-7.4.3/examples/README.md000066400000000000000000000022161355540622000166470ustar00rootroot00000000000000## Examples ### cloc This program reads C-sources from stdin and prints them to stdout with comments and empty lines removed. Useful for counting LOCs. ### count_hash This program reads text from stdin, counts all equal words that are separated by whitespace and prints the result to stdout. ### pcregrep A grep-like program using Perl-compatible regular expressions. Start the program with argument `-help` to see what it does! 
### subst Substitutes text in files using Perl-compatible regular expressions and substitution patterns. Start the program with argument `-help` to see what it does! Example invocation: ```sh subst '([Tt])ermcap' '$1ermCap' < /etc/termcap ``` ### dfa_restart Exercises the availability of the DFA matching function and its partial match restart capability. Given a pattern, will accept input incrementally, restarting the prior partial match until the pattern succeeds in matching completely, or fails. Example interaction: ``` $ dfa_restart.exe 'abc12+3' > abc partial match, provide more input: > 122222 partial match, provide more input: > 222 partial match, provide more input: > 3 match completed: "[|0;1;0|]" ``` pcre-ocaml-7.4.3/examples/cloc.ml000066400000000000000000000010101355540622000166310ustar00rootroot00000000000000open Pcre let read_whole_channel ch = let size = 4096 in let strbuf = Bytes.create size in let buf = Buffer.create 65536 in let len = ref size in while !len <> 0 do len := input ch strbuf 0 size; Buffer.add_subbytes buf strbuf 0 !len done; Buffer.contents buf let () = let str = read_whole_channel stdin in let str = qreplace ~pat:"/\\*(.|\n)*?\\*/" str in let str = qreplace_first ~pat:"^(\n|\\s)+" str in let str = qreplace ~pat:"\n+((\n|\\s)\n)*" ~templ:"\n" str in print_string str pcre-ocaml-7.4.3/examples/count_hash.ml000066400000000000000000000003641355540622000200570ustar00rootroot00000000000000open Hashtbl let hash = create 1973 let add_string s = try incr (find hash s) with Not_found -> add hash s (ref 1);; Pcre.foreach_line (fun line -> List.iter add_string (Pcre.split line)); iter (fun k v -> Printf.printf "%4d\t%s\n" !v k) hash pcre-ocaml-7.4.3/examples/dfa_restart.ml000066400000000000000000000024101355540622000202140ustar00rootroot00000000000000open Pcre open Printf let show_array arr = Array.map string_of_int arr |> Array.to_list |> String.concat ";" |> sprintf "[|%s|]" let new_workspace () = Array.make 50 0 let () = let pat = if 
Array.length Sys.argv > 1 then Sys.argv.(1) else begin eprintf "%s: expected pattern argument\n" Sys.argv.(0); exit 1 end in let rex = regexp pat in let rec find_match flags workspace = print_string "> "; let line, eof = try read_line (), false with End_of_file -> "", true in match pcre_dfa_exec ~rex ~flags ~workspace line with | res -> printf "match completed: %S\n" (show_array res); if not eof then begin printf "\n *input & workspace reset*\n"; find_match [`PARTIAL] (new_workspace ()) end | exception (Error Partial) -> printf "partial match, provide more input:\n"; find_match [`DFA_RESTART; `PARTIAL] workspace | exception exn -> begin match exn with | Not_found -> eprintf "pattern match failed\n" | Error WorkspaceSize -> eprintf "need larger workspace vector\n" | Error InternalError s -> eprintf "internal error: %s\n" s | exn -> raise exn end; exit 1 in find_match [`PARTIAL] (new_workspace ()) pcre-ocaml-7.4.3/examples/dune000066400000000000000000000001271355540622000162450ustar00rootroot00000000000000(executables (names cloc count_hash pcregrep subst dfa_restart) (libraries pcre) ) pcre-ocaml-7.4.3/examples/pcregrep.ml000066400000000000000000000066131355540622000175360ustar00rootroot00000000000000open Pcre open Printf let filenames = ref true and filenames_only = ref false and count_only = ref false and invert = ref false and number = ref false and silent = ref false and whole_lines = ref false let parse_args () = let ignore_case = ref false and pat = ref None and files = ref [] in let c = "-c", Arg.Set count_only, "Count lines only." and h = "-h", Arg.Clear filenames, "Suppress printing of filenames when searching multiple files." and i = "-i", Arg.Set ignore_case, "Ignore case." and l = "-l", Arg.Set filenames_only, "Only print names of files containing matching lines (once)." and n = "-n", Arg.Set number, "Precede each line by its line number in the file." and s = "-s", Arg.Set silent, "Display nothing but error messages. Exit status indicates match." 
and v = "-v", Arg.Set invert, "Invert sense of the match: finds nonmatching lines." and x = "-x", Arg.Set whole_lines, "Force the pattern to be anchored and to match the entire line." and usage = "Usage: pcregrep [options] pattern [file] ...\n\n\ Searches files for character patterns.\n" and anon_arg arg = if !pat = None then pat := Some arg else files := arg :: !files in let args = [c; h; i; l; n; s; v; x] in Arg.parse args anon_arg usage; let flags = let flag_list = if !ignore_case then [`CASELESS] else [] in if !whole_lines then `ANCHORED :: flag_list else flag_list in let rex = match !pat with | Some pat -> regexp ~flags pat | None -> eprintf "%s: not enough arguments!\n" Sys.argv.(0); Arg.usage args usage; exit 2 in rex, List.rev !files let _ = let rex, files = parse_args () and rfl = rflags [] in let _, ovector = make_ovector rex in let pcregrep file name = let ret_code = ref 1 and linenumber = ref 0 and count = ref 0 and stdin_print_name () = match name with | Some filename -> print_endline filename | None -> print_endline "" and print_name () = match name with Some name -> printf "%s:" name | None -> () in let try_match line = let matched = try unsafe_pcre_exec rfl rex ~pos:0 ~subj_start:0 ~subj:line ovector None; if !whole_lines && ovector.(1) <> String.length line then false else true with Not_found -> false in incr linenumber; if matched <> !invert then begin if !count_only then incr count else if !filenames_only then begin stdin_print_name (); raise Exit end else if !silent then raise Exit else begin print_name (); if !number then printf "%d:" !linenumber; print_endline line end; ret_code := 0 end in try foreach_line ~ic:file try_match; if !count_only then begin print_name (); printf "%d\n" !count end; !ret_code with Exit -> 0 in if files = [] then exit (pcregrep stdin None); if List.length files = 1 then filenames := false; if !filenames_only then filenames := true; let collect ret_code filename = try let file = open_in filename in let frc = pcregrep 
file (if !filenames then Some filename else None) in close_in file; if frc = 0 && ret_code = 1 then 0 else ret_code with Sys_error msg -> prerr_endline msg; 2 in exit (List.fold_left collect 1 files) pcre-ocaml-7.4.3/examples/subst.ml000066400000000000000000000036171355540622000170700ustar00rootroot00000000000000open Pcre let parse_args () = let quick = ref false and first = ref false and ignore_case = ref false and offset = ref 0 and pat = ref None and substr = ref None in let q = "-q", Arg.Set quick, "Quick replacement. Interpretes substitution as plain text." and f = "-f", Arg.Set first, "Replace first occurrence in line only." and i = "-i", Arg.Set ignore_case, "Ignore case." and ofs = "-ofs", Arg.Int (fun n -> offset := n), "Start matching at column n." and usage = "Usage: subst [-q] [-f] [-i] [-ofs offset] pattern substitution\n\n\ Reads lines from standard input and replaces occurrences of\n\ the PERL-style regular expression \"pattern\" with \"substitution\",\n\ printing the result to standard output.\n\ In default mode the contents of \"substitution\" will be interpreted\n\ similarly to its equivalent in PERL.\n" and anon_arg arg = match !pat, !substr with | None, _ -> pat := Some arg | _, None -> substr := Some arg | _ -> raise (Arg.Bad "too many arguments!") in let args = [q; f; i; ofs] in Arg.parse args anon_arg usage; let flags = if !ignore_case then [`CASELESS] else [] in let rex, sstr = match !pat, !substr with | Some rex, Some sstr -> regexp ~flags rex, sstr | _ -> prerr_endline (Sys.argv.(0) ^ ": not enough arguments!"); Arg.usage args usage; exit 1 in match !quick, !first with | false, false -> fun s -> replace ~rex ~pos:!offset ~templ:sstr s | true, false -> fun s -> qreplace ~rex ~pos:!offset ~templ:sstr s | false, true -> fun s -> replace_first ~rex ~pos:!offset ~templ:sstr s | true, true -> fun s -> qreplace_first ~rex ~pos:!offset ~templ:sstr s let _ = let substitute = parse_args () in foreach_line (fun line -> try print_endline (substitute 
line) with Invalid_argument _ -> print_endline line) pcre-ocaml-7.4.3/pcre.opam000066400000000000000000000017151355540622000153640ustar00rootroot00000000000000# This file is generated by dune, edit dune-project instead opam-version: "2.0" build: [ ["dune" "subst"] {pinned} ["dune" "build" "-p" name "-j" jobs] ["dune" "runtest" "-p" name "-j" jobs] {with-test} ["dune" "build" "-p" name "@doc"] {with-doc} ] maintainer: ["Markus Mottl "] authors: ["Markus Mottl "] bug-reports: "https://github.com/mmottl/pcre-ocaml/issues" homepage: "https://mmottl.github.io/pcre-ocaml" doc: "https://mmottl.github.io/pcre-ocaml/api" license: "LGPL-2.1+ with OCaml linking exception" dev-repo: "git+https://github.com/mmottl/pcre-ocaml.git" synopsis: "Bindings to the Perl Compatibility Regular Expressions library" description: """ pcre-ocaml offers library functions for string pattern matching and substitution, similar to the functionality offered by the Perl language.""" depends: [ "ocaml" {>= "4.08"} "dune" {>= "1.10"} "conf-libpcre" {build} "base" {build} "base-bytes" ] pcre-ocaml-7.4.3/pre-v7.3.0-CHANGES.txt000066400000000000000000000610041355540622000170000ustar00rootroot000000000000002016-02-25: Minor version release v7.2.3: Fixed callout bug introduced with v7.2.0. Thanks to Raman Varabets for the bug report! 2016-02-23: Fixed linking problem with old versions of PCRE (< 8.20). Fixed backward compatibility issue with OCaml <= 3.12. 2016-02-22: Fixed a subgroup matching bug. Thanks to Cheng Lou for the bug report! 2015-08-21: Made GC less aggressive reclaiming regexps and chartables. 2014-12-10: Fixed another limit handling bug in the full_split function. 2014-12-02: Fixed a limit handling bug in the full_split function. Thanks to Rudi Grinberg for the report! 2014-10-23: Fixed string handling for new OCaml version 4.02 (String/Bytes modules). Requires new findlib version (>= 1.5). 2014-07-06: Moved to GitHub. 
2014-06-04: Multiple bug fixes: * Allocation bug when performing callouts * Unprotected root when performing callouts * More portable offset copying in the C-stubs * Fixed a PERL-compatibility bug in the splitting routines The bug fixes required a minor API-change in an unsafe function, which is almost surely not directly called by any users. 2012-07-20: Downgraded findlib version requirement to support the Debian testing branch. Added --with-pcre-config flag to configure location of PCRE configuration generator. 2012-07-15: New major release version 7.0.0: * Upgraded to OCaml 4.00 * Switched to Oasis for packaging * Switched to OCamlBuild for the build process * Rewrote README in Markdown * Added stricter compilation flags * Minor bugfixes 2012-01-04: Fixed native code debug build target by updating OCamlMakefile. Thanks to Stéphane Glondu for the patch! 2011-12-15: Fixed a Windows portability bug in the C-bindings. Thanks to Evgenii Lepikhin for the patch! 2011-11-09: Updated OCamlMakefile to fix linking order. 2011-01-16: Added support for limit recursion flag. Thanks to Delphin Lecucq for the patch! 2010-10-31: Improved Windows support with MSVC. Thanks to Sylvain Le Gall for the patch! 2010-04-01: Added new function: * regexp_or 2009-06-20: Fixed bug in configuration functions that could lead to a segfault. Thanks to Gerd Stolpmann for the patch! 2009-05-07: Changed API wrt. error handling and thus made a major release. Improved behavior in the presence of recursion limit errors. Thanks to Martin Jambon for this patch! 2009-04-23: Fixed build problem on MinGW. Thanks to Gerd Stolpmann for the patch! 2009-03-08: Fixed build problem on Mac OS X with macports. Thanks to Ralph Douglass for the initial patch. Updated OCamlMakefile. Improved Godi-distribution. 2008-05-06: Fixed build problem with newer versions of PCRE. 2008-03-14: Synced with Jane Street tree. 
2008-01-25: Added new function: * names This function returns the names of all named substrings in a regular expression. Thanks to Benedikt Grundmann for the patch! 2007-07-12: Improved build scripts for Windows. Thanks to Christophe Troestler for the patch! 2007-07-12: Improved documentation for Win32 builds, and added some build scripts usable on Windows. Thanks to Christophe Troestler for this contribution! 2007-04-23: callback_exn -> caml_callback_exn. Updated OCamlMakefile. 2006-11-22: Updated OCamlMakefile. 2006-06-11: Updated to pcre-5.0! New representation for callbacks: they now take only one argument (a record of the callback data). Added partial matching and auto callouts. 2006-01-16: Updated OCamlMakefile. Removed a superfluous binding. 2005-08-18: Fixed a small compilation problem on rare platforms by upgrading OCamlMakefile. 2005-06-08: Relaxed license. Fixed copyright headers. 2005-05-31: Fixed some uncleanliness reported by Saffire (FFI-type checker). 2004-09-17: Fixed a bug concerning null patterns in exec_all (extract_all and extract_all_opt are also affected). Updated OCamlMakefile. 2004-05-19: Updated Makefile.mingw. Thanks to Jeff Henrikson for the patch! Updated OCamlMakefile. 2004-04-29: Changed behaviour of "get_substring"!!! It now does not return the empty string anymore if an accessed substring was not captured. It raises the exception "Not_found" instead. Three new functions: * get_opt_substrings * extract_opt * extract_all_opt These behave like the functions without "opt", but return "Some substring" if a substring was captured, and "None" otherwise. 2004-04-27: Updated OCamlMakefile. 2004-04-24: Updated OCamlMakefile. 2004-03-28: Changed interface to build-time configuration functions of PCRE. Updated OCamlMakefile. 2004-02-21: Added CAMLprim in the C-interface where appropriate. 2004-02-08: Fixed a minor bug concerning returning unit values from C. 2004-01-31: Added pcre_make.win32/Makefile.mingw. 
Thanks to Jeffrey Henrikson for this contribution! Updated OCamlMakefile. 2004-01-13: Updated pcre_make.win32/pcre.h. 2003-12-30: Fixed documentation. 2003-12-21: Updated to pcre-4.5! New function: * config_stackrecurse New exception: * BadUTF8Offset Updated OCamlMakefile. 2003-12-19: Fixed a small (but probably unnoticeable) bug concerning allocation of optional values in the C-stubs. 2003-12-12: Updated OCamlMakefile. Renamed stubs (invisible to users). 2003-11-16: Updated pcre_make.win32/pcre.h to reflect newest PCRE-version. May help Windows users. 2003-10-08: Upgraded to pcre-4.4! New flag for compiling patterns: NO_UTF8_CHECK New exception: BadUTF8 Updated OCamlMakefile. 2003-09-30: Fixed a bug in the documentation. Updated OCamlMakefile. 2003-06-17: Fixed a bug in the documentation. Updated OCamlMakefile. 2003-05-29: Updated to pcre-4.3! Major change: callouts are now fully supported! This allows the matching engine to call OCaml-code while matching - quite powerful! Please see the interface specification for more information. Many small changes (improvements) in C-code. Updated documentation. 2003-04-08: Updated OCamlMakefile. Reformatted documentation. 2003-03-20: Added new function "get_subject". Patched OCamlMakefile. 2003-03-18: Major update: upgraded to pcre-4.1! Better UTF8-support. New flag "NO_AUTO_CAPTURE". New values: * config_utf8 * config_newline * config_link_size * config_match_limit Renamed all occurrences of "firstchar" to "firstbyte". New functions: * studysize * namecount * nameentrysize * get_stringnumber Updated OCamlMakefile. 2003-01-07: Updated OCamlMakefile to make use of "findlib". Added support for UTF-8 character encodings. Better installation and documentation for Win32. Thanks to Artem Prisyznuk for the above patches! 2002-12-14: Fixed a bug concerning zero-sized matches affecting "replace", "qreplace", "substitute_substrings" and "substitute". Updated OCamlMakefile. 2002-12-08: Improved documentation of "pcre_exec". 
2002-11-24: Fixed a bug in "full_split" concerning matched subgroups. 2002-11-12: Added a new function "extract_all" (see interface documentation). 2002-08-16: Fixed a bug in the "split"-function: The Perl-splitting semantics was not completely adhered to: leading whitespace was stripped after the matching process rather than before, which led to incompatible behaviour when a maximum bound was used. Thanks to Yutaka Oiwa for the bug report! Updated OCamlMakefile. 2002-07-31: Fixed a bug in the following functions: * replace * qreplace * substitute_substrings Transformed most parts of the library to make it slightly more efficient and simple. The interface is still the same! Updated OCamlMakefile + documentation. 2002-07-15: Fixed a mistake in the documentation. 2002-05-05: Fixed a bug with the generation of byte-code libraries that dynamically link the PCRE. 2002-05-01: Removed C-library from distribution! Users must install it on their own now. Reorganized whole distribution and updated OCamlMakefile again for better support of dynamic and/or static libraries. 2002-04-30: Updated OcamlMakefile: it does not ask for confirmation during installation anymore! 2002-03-06: Upgraded to pcre-3.9. This should not change anything for OCaml-users. 2002-03-01: Updated OcamlMakefile. 2002-02-24: Separated compilation of library and examples to prevent confusion when the library needs to be installed before one can build the examples. 2002-02-15: Fixed a bug in the pcre_exec-function which was introduced ten days ago during correction of another bug (thanks to Gerd Stolpmann for the report!). 2002-02-15: Added the option to compile the library statically only. Updated INSTALL-notes to explain possible installation problems associated with support of dynamic linking. Added META-file for findlib. 2002-02-12: Rewrote interface documentation to support OCamldoc. Fixed a portability bug with shared libraries. 2002-02-10: Removed project from Sourceforge for simpler maintenance. 
2002-02-07: Important news: library is linked dynamically now by default. 2002-02-06: Fixed a stupid bug affecting the following functions (thanks to Jacek Chrzaszcz for the bug report!): * num_of_subs * get_substring * get_substring_ofs * get_substrings * extract Also done: converted literal pattern strings in the library and the cloc-example so that the escape char (backslash) does not cause warnings anymore with the new OCaml-release. The latter is more paranoid about unknown escape combinations, requiring the user to add extra backslashes. 2002-01-07: Fixed a stupid bug: the position argument (offset) was not correctly handled in replacement and substitution functions, leading to wrong results. As it seems, people seldom use arguments other than zero... The "subst"-example now assumes that offsets other than zero should not lead to an error if they exceed the line length. The line is just copied instead. 2001-12-28: Added README.win32 (courtesy of John W. Small). Updated README so that it is generated by Hevea. 2001-11-19: Upgraded to the newest release of the underlying C-library (PCRE-3.7). Added a tiny patch to satisfy the Visual C++ compiler under W2K (thanks to John W. Small!). Updated contact address. 2001-11-17: Updated OcamlMakefile. 2001-09-15: Upgraded to the newest release of the underlying C-library (PCRE-3.5). Added a new function: substitute_substrings It is similar to "substitute", but takes the full substring information of the match rather than the matching string. Thanks to Patrick M. Doane for proposing this missing feature! 2001-09-07: Updated OcamlMakefile 2001-08-27: Fixed a bug in the splitting function: leading whitespace was accidentally always removed when using a regular expression rather than a pattern. This behaviour should only happen for the default whitespace pattern, which is used if you do not specify any pattern or regexp in the function call. 
2001-06-30: Removed "Printexc.catch" from examples: it is going to be deprecated in upcoming OCaml-release. 2001-05-22: Fixed typo in documentation. 2001-04-25: Added a new function: asplit It is identical to "split" with the exception that it returns a string array instead of a string list. This makes it easier for the user to access strings by index. Added a new option to "get_substrings" and "extract": full_match When "full_match" is true (default: yes), then the resulting string array will contain the full match at index 0, otherwise the result will only contain captured substrings. Removed superfluous comments in "pcre.ml": they are already present in the interface documentation anyway. 2001-04-08: Small patch that makes this library compile on OpenBSD, too. 2001-01-30: Made Makefile more general (allows simpler addition of further examples). 2001-01-24: Updated OcamlMakefile: made default definition of "OCAMLLIBPATH" backwards compatible again: some people do not use the CVS-version of OCaml, which supports the "-where"-option. People with the new compiler will not notice any effect. 2001-01-06: Added a new function: exec_all It allows you to execute pattern matching over a whole string until no more matches can be found: then it returns the array of all matching "substrings". You can extract subpatterns of each of those matching substrings again with the usual functions. Fixed a minor inconsistency in "next_match". 2000-12-23: Updated OcamlMakefile: makes use of the new "-where"-keyword to find the path to the standard library if it is not defined. 2000-12-14: Pedantry in the C-interface: added "const" qualifiers. 2000-12-09: Made some functions tail-recursive (very unlikely to cause any noticeable effect for most people). 2000-12-02: Cleaned up the code a bit for distribution on SourceForge. Speed of some operations should be very slightly improved, too. 
2000-11-16: Fixed a stupid bug that could crash your programs under rare circumstances (when you use faulty regular expressions). Added an internally used exception (InternalError) to the interface of the library. This allows the user to handle the case when the C-engine exhibits undefined behaviour (should never happen, anyway). 2000-09-27: Upgraded to the newest release of the underlying C-library (PCRE-3.4). See "pcre-C/ChangeLog" for more information (mostly minor bugfixes). Renamed "pgrep" to "pcregrep" to prevent name hiding on Solaris (change as suggested in the C-library distribution). 2000-06-24: Updated OcamlMakefile 2000-06-13: Updated OcamlMakefile 2000-06-12: Mini-optimisation: lifted a pure value out of a function body. 2000-06-11: Updated OcamlMakefile 2000-06-08: Added installation routine + updated OcamlMakefile again: This upgrade makes installation much easier! Read the updated INSTALL-file! 2000-06-07: Updated to new OcamlMakefile 2000-06-05: Added a new function: get_substring_ofs substrings n This allows you to get the offset positions of the matching pattern and the substrings directly from a value of type "substring". See the interface documentation for details. 2000-05-15: Upgraded to the newest release of the underlying C-library (PCRE-3.2). This does not add new features, but is said to fix some PERL-compatibility bugs and improves portability. 2000-05-04: Minor update of C-interface: Use the new "hash_variant"-function to compute the hash value of variants (used Callback.register before - a bit clumsy). 2000-04-24: New release: compiles with OCaml-3.00. Lots of changes in interface! OCaml-3.00 introduced some syntax changes for labels. Additionally, no keywords are allowed for them. All this required quite some renamings. I tried to stick to the new labels of the "str"-library as close as makes sense. Sorry for this inconvenience, but I think that adhering to some "standard" is a Good Thing... 
For further information on the changes, look at the documentation of the interface file. 2000-04-23: Minor cleanup of C-interface: Made local functions + variables static and moved a check for error codes (lint should be happier now). 2000-04-01: Changed linking of the library again (marginally): The new linking semantics for byte code libraries allows passing of flags that are remembered. This then only requires linking executables against "pcre.cma" - the C-library ("libpcre.a") will be linked in automatically. 2000-03-30: Cleaned up the C-interface: Removed a possible bug in the allocation of firstchar values. Saner handling of polymorphic variants (not hard-wired integers anymore - registered via callbacks). Better GC-settings: much nicer to the GC now (fewer full cycles - hardly more memory consumption). Should make programs run slightly faster when regular expressions are often allocated. Changed linking of the library to suit the semantics of the upcoming OCaml-major release (backward compatible). 2000-02-07: Changed interface of function "foreach_file": No idea why I implemented a tuple in the interface instead of a curried function. Must have been the drugs... ;-) Former interface: val foreach_file : string list -> (string * in_channel -> unit) -> unit Current interface: val foreach_file : string list -> (string -> in_channel -> unit) -> unit Additionally, this function now also closes the file channel in case of an exception. The exception gets reraised, of course. 2000-02-05: Eliminated a PERL-incompatibility of the "split"-function: When neither the pattern nor the compiled regular expression is specified, "split" defaults to a whitespace pattern. However, PERL additionally strips leading whitespace, but *only* when in "default" mode. This last "feature" (?) was overlooked during the introduction of labels and default arguments. 2000-02-05: Updated the C-library to the newest release (pcre-3.0). 
This is a major release, but the changes to the OCaml-part are rather small: just the functionality concerning information on compiled patterns has changed. Instead of the former "info"-function, several ones are provided. I think this is the most convenient way to handle this, because it does not force the user to do explicit pattern matching on results: due to the number of new info-options this would have been rather confusing. Because the old PCRE-function "pcre_info" is obsolete, I chose to drop its OCaml-interface completely. I do not think that it is an often used part, anyway. See the interface file and the ChangeLog + documentation of the C-library to see what has changed in detail. The other few changes just marginally affect efficiency. One bug was present in the last release in the "info"-part (polymorphic variants were not correctly represented). This has changed anyway and should work fine now. 2000-01-10: "foreach_line" uses stdin as labeled (label: 'in') default argument now. 1999-12-29: Lots of changes: * Uses the new features of OCaml 2.99: Labeled parameters and default arguments for much more convenience. E.g. write split pos:1 "foo bar" to get the list ["oo"; "bar"]. Polymorphic variants for passing options: this change allows, for example, using the data constructor "ANCHORED" for both compiling flags and at matching time instead of "C_ANCHORED" and "R_ANCHORED". * Got rid of all the superfluous shortcuts, like "bounded_psplit" and the like. Labeled parameters are much more readable and convenient. * Removed the functions compatible with the "Str"-module. It's probably rather confusing for people to see two different kinds of implementations. * Renamed functions due to the use of labels and the removal of the obsolete compatibility functions. * Updated documentation with more details on using the library. 1999-12-21: Small change (2 chars...) to make the C-interface compile with the newest release (OCaml 2.99). 
1999-09-27: Fixed a bug which occurs when replacements or substitutions in strings are longer than the subject string itself. This concerns functions "replace_all", "qreplace_all" and "substitute_all" + their variants. THANKS to GERD STOLPMANN (Gerd.Stolpmann@darmstadt.netsurf.de) for finding another bug! 1999-09-21: Fixed some bugs in "pcre_intf.c": "Store_field" was used inappropriately throughout the file, which could lead to segfaults in the garbage collector. This was corrected by using the "Field"-macro where appropriate. Protected an input parameter from being reclaimed too early. Initialized a field before throwing an exception to please the GC. The GC should be happy now... THANKS to GERD STOLPMANN (Gerd.Stolpmann@darmstadt.netsurf.de) for the patch! 1999-08-31: Updated to pcre-2.08 - this should fix some bugs. See "pcre-C/ChangeLog" for details. Renamed directory "pcre-C-2.07" to "pcre-C" to allow for easier upgrading. Added CVS-info to sources. 1999-08-24: Updated to pcre-2.07. Splitting is 100% PERL-compatible now. New runtime option: NOTEMPTY (see interface for details). Two additional examples: cloc and count_hash. New functions: sregexpo, sregexp, sregexp_case_fold. 1999-08-02: Exception handling more regular now: Raises [Invalid_argument] instead of [Failure] where appropriate. Raises [InternalError] if C-library exhibits undefined behaviour (has never happened so far). Updated comments. 1999-07-30: Bugfix in "pcre_intf.c": "pcre_ocaml" should work now on 64-bit architectures... 1999-07-29: Small fix in "pcre_intf.c": strict compilers (gcc is not very strict) otherwise complain about undefined behaviour in a certain line. Explicitly mention all include files. Removed unused variable. Also moved a line for efficiency... 1999-07-28: First release. 
pcre-ocaml-7.4.3/src/000077500000000000000000000000001355540622000143405ustar00rootroot00000000000000pcre-ocaml-7.4.3/src/Makefile000066400000000000000000000001511355540622000157750ustar00rootroot00000000000000TARGETS = pcre.cma libpcre_stubs.a .PHONY: all clean all: @dune build $(TARGETS) clean: @dune clean pcre-ocaml-7.4.3/src/config/000077500000000000000000000000001355540622000156055ustar00rootroot00000000000000pcre-ocaml-7.4.3/src/config/Makefile000066400000000000000000000001341355540622000172430ustar00rootroot00000000000000TARGETS = discover.bc .PHONY: all clean all: @dune build $(TARGETS) clean: @dune clean pcre-ocaml-7.4.3/src/config/discover.ml000066400000000000000000000007131355540622000177560ustar00rootroot00000000000000open Base let () = let module C = Configurator.V1 in C.main ~name:"pcre" (fun c -> let default : C.Pkg_config.package_conf = { libs = ["-lpcre"]; cflags = [] } in let conf = Option.value_map (C.Pkg_config.get c) ~default ~f:(fun pc -> Option.value (C.Pkg_config.query pc ~package:"libpcre") ~default) in C.Flags.write_sexp "c_flags.sexp" conf.cflags; C.Flags.write_sexp "c_library_flags.sexp" conf.libs) pcre-ocaml-7.4.3/src/config/dune000066400000000000000000000001071355540622000164610ustar00rootroot00000000000000(executables (names discover) (libraries base dune.configurator) ) pcre-ocaml-7.4.3/src/dune000066400000000000000000000005121355540622000152140ustar00rootroot00000000000000(library (public_name pcre) (c_names pcre_stubs) (c_flags (:standard) (:include c_flags.sexp) -O2 -fPIC -DPIC -Wno-keyword-macro ) (c_library_flags (:include c_library_flags.sexp)) ) (rule (targets c_flags.sexp c_library_flags.sexp) (deps (:discover config/discover.exe)) (action (run %{discover})) ) pcre-ocaml-7.4.3/src/pcre.ml000066400000000000000000001136241355540622000156320ustar00rootroot00000000000000(* PCRE-OCAML - Perl Compatibility Regular Expressions for OCaml Copyright (C) 1999- Markus Mottl email: markus.mottl@gmail.com WWW: http://www.ocaml.info This 
library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *) (* Public exceptions and their registration with the C runtime *) (* Error conditions carried by the [Error] exception *) type error = | Partial | BadPartial | BadPattern of string * int | BadUTF8 | BadUTF8Offset | MatchLimit | RecursionLimit | WorkspaceSize | InternalError of string exception Error of error exception Backtrack (* Carries a pattern string together with the error raised for it; see [regexp_or] *) exception Regexp_or of string * error (* Puts exceptions into global C-variables for fast retrieval *) external pcre_ocaml_init : unit -> unit = "pcre_ocaml_init" (* Registers exceptions with the C runtime and caches polymorphic variants *) let () = Callback.register_exception "Pcre.Error" (Error (InternalError "")); Callback.register_exception "Pcre.Backtrack" Backtrack; pcre_ocaml_init () (* Compilation and runtime flags and their conversion functions *) (* Flag sets are plain integers; [cflags] and [rflags] build them by or-ing per-flag values *) type icflag = int type irflag = int (* Compilation flags *) type cflag = [ | `CASELESS | `MULTILINE | `DOTALL | `EXTENDED | `ANCHORED | `DOLLAR_ENDONLY | `EXTRA | `UNGREEDY | `UTF8 | `NO_UTF8_CHECK | `NO_AUTO_CAPTURE | `AUTO_CALLOUT | `FIRSTLINE ] (* Integer value of a single compilation flag *) let int_of_cflag = function | `CASELESS -> 0x0001 | `MULTILINE -> 0x0002 | `DOTALL -> 0x0004 | `EXTENDED -> 0x0008 | `ANCHORED -> 0x0010 | `DOLLAR_ENDONLY -> 0x0020 | `EXTRA -> 0x0040 | `UNGREEDY -> 0x0200 | `UTF8 -> 0x0800 | `NO_AUTO_CAPTURE -> 0x1000 | `NO_UTF8_CHECK -> 0x2000 | `AUTO_CALLOUT -> 0x4000 | `FIRSTLINE
-> 0x40000 (* Or-s the value of [flag] into the accumulator [icflag] *) let coll_icflag icflag flag = int_of_cflag flag lor icflag (* Combines a list of compilation flags into one integer *) let cflags flags = List.fold_left coll_icflag 0 flags (* Inverse of [int_of_cflag]; fails on any other integer *) let cflag_of_int = function | 0x0001 -> `CASELESS | 0x0002 -> `MULTILINE | 0x0004 -> `DOTALL | 0x0008 -> `EXTENDED | 0x0010 -> `ANCHORED | 0x0020 -> `DOLLAR_ENDONLY | 0x0040 -> `EXTRA | 0x0200 -> `UNGREEDY | 0x0800 -> `UTF8 | 0x1000 -> `NO_AUTO_CAPTURE | 0x2000 -> `NO_UTF8_CHECK | 0x4000 -> `AUTO_CALLOUT | 0x40000 -> `FIRSTLINE | _ -> failwith "Pcre.cflag_list: unknown compilation flag" (* Every integer value accepted by [cflag_of_int] *) let all_cflags = [ 0x0001; 0x0002; 0x0004; 0x0008; 0x0010; 0x0020; 0x0040; 0x0200; 0x0800; 0x1000; 0x2000; 0x4000; 0x40000; ] (* Lists the flags whose bits are set in [icflags]; flags are prepended, so the result is in reverse order of [all_cflags] *) let cflag_list icflags = let coll flag_list flag = if icflags land flag <> 0 then cflag_of_int flag :: flag_list else flag_list in List.fold_left coll [] all_cflags (* Runtime flags *) type rflag = [ | `ANCHORED | `NOTBOL | `NOTEOL | `NOTEMPTY | `PARTIAL | `DFA_RESTART ] (* Integer value of a single runtime flag *) let int_of_rflag = function | `ANCHORED -> 0x00010 | `NOTBOL -> 0x00080 | `NOTEOL -> 0x00100 | `NOTEMPTY -> 0x00400 | `PARTIAL -> 0x08000 | `DFA_RESTART -> 0x20000 (* Or-s the value of [flag] into the accumulator [irflag] *) let coll_irflag irflag flag = int_of_rflag flag lor irflag (* Combines a list of runtime flags into one integer *) let rflags flags = List.fold_left coll_irflag 0 flags (* Inverse of [int_of_rflag]; fails on any other integer *) let rflag_of_int = function | 0x00010 -> `ANCHORED | 0x00080 -> `NOTBOL | 0x00100 -> `NOTEOL | 0x00400 -> `NOTEMPTY | 0x08000 -> `PARTIAL | 0x20000 -> `DFA_RESTART | _ -> failwith "Pcre.rflag_list: unknown runtime flag" (* Every integer value accepted by [rflag_of_int] *) let all_rflags = [0x0010; 0x0080; 0x0100; 0x0400; 0x8000; 0x20000] (* Lists the flags whose bits are set in [irflags]; flags are prepended, so the result is in reverse order of [all_rflags] *) let rflag_list irflags = let coll flag_list flag = if irflags land flag <> 0 then rflag_of_int flag :: flag_list else flag_list in List.fold_left coll [] all_rflags (* Information on the PCRE-configuration (build-time options) *) external pcre_version : unit -> string = "pcre_version_stub" external pcre_config_utf8 : unit -> bool = "pcre_config_utf8_stub" [@@noalloc] external pcre_config_newline : unit -> char = "pcre_config_newline_stub" [@@noalloc] external pcre_config_link_size : unit -> (int [@untagged]) =
"pcre_config_link_size_stub_bc" "pcre_config_link_size_stub" [@@noalloc] external pcre_config_match_limit : unit -> (int [@untagged]) = "pcre_config_match_limit_stub_bc" "pcre_config_match_limit_stub" [@@noalloc] external pcre_config_match_limit_recursion : unit -> (int [@untagged]) = "pcre_config_match_limit_recursion_stub_bc" "pcre_config_match_limit_recursion_stub" [@@noalloc] external pcre_config_stackrecurse : unit -> bool = "pcre_config_stackrecurse_stub" [@@noalloc] (* Each configuration value is queried from the C stubs once, when this module is initialised *) let version = pcre_version () let config_utf8 = pcre_config_utf8 () let config_newline = pcre_config_newline () let config_link_size = pcre_config_link_size () let config_match_limit = pcre_config_match_limit () let config_match_limit_recursion = pcre_config_match_limit_recursion () let config_stackrecurse = pcre_config_stackrecurse () (* Information on patterns *) type firstbyte_info = [ `Char of char | `Start_only | `ANCHORED ] type study_stat = [ `Not_studied | `Studied | `Optimal ] (* Abstract handle for a compiled pattern, produced by [compile] *) type regexp external options : regexp -> (icflag [@untagged]) = "pcre_options_stub_bc" "pcre_options_stub" external size : regexp -> (int [@untagged]) = "pcre_size_stub_bc" "pcre_size_stub" external studysize : regexp -> (int [@untagged]) = "pcre_studysize_stub_bc" "pcre_studysize_stub" external capturecount : regexp -> (int [@untagged]) = "pcre_capturecount_stub_bc" "pcre_capturecount_stub" external backrefmax : regexp -> (int [@untagged]) = "pcre_backrefmax_stub_bc" "pcre_backrefmax_stub" external namecount : regexp -> (int [@untagged]) = "pcre_namecount_stub_bc" "pcre_namecount_stub" external nameentrysize : regexp -> (int [@untagged]) = "pcre_nameentrysize_stub_bc" "pcre_nameentrysize_stub" external names : regexp -> string array = "pcre_names_stub" external firstbyte : regexp -> firstbyte_info = "pcre_firstbyte_stub" external firsttable : regexp -> string option = "pcre_firsttable_stub" external lastliteral : regexp -> char option = "pcre_lastliteral_stub" external study_stat : regexp -> study_stat =
"pcre_study_stat_stub" [@@noalloc] (* Compilation of patterns *) (* Abstract character tables, created by [maketables] and optionally passed to [compile] *) type chtables external maketables : unit -> chtables = "pcre_maketables_stub" (* Internal use only! *) external pcre_study : regexp -> unit = "pcre_study_stub" external compile : (icflag [@untagged]) -> chtables option -> string -> regexp = "pcre_compile_stub_bc" "pcre_compile_stub" external get_match_limit : regexp -> int option = "pcre_get_match_limit_stub" external get_match_limit_recursion : regexp -> int option = "pcre_get_match_limit_recursion_stub" (* Internal use only! *) external set_imp_match_limit : regexp -> (int [@untagged]) -> regexp = "pcre_set_imp_match_limit_stub_bc" "pcre_set_imp_match_limit_stub" [@@noalloc] (* Internal use only! *) external set_imp_match_limit_recursion : regexp -> (int [@untagged]) -> regexp = "pcre_set_imp_match_limit_recursion_stub_bc" "pcre_set_imp_match_limit_recursion_stub" [@@noalloc] (* Compiles [pat]. A [flags] list, when given, takes precedence over [iflags]. The pattern is studied unless [study] is false; afterwards the optional [limit] and [limit_recursion] are applied. *) let regexp ?(study = true) ?limit ?limit_recursion ?(iflags = 0) ?flags ?chtables pat = let rex = match flags with | Some flag_list -> compile (cflags flag_list) chtables pat | _ -> compile iflags chtables pat in if study then pcre_study rex; let rex = match limit with | None -> rex | Some lim -> set_imp_match_limit rex lim in match limit_recursion with | None -> rex | Some lim -> set_imp_match_limit_recursion rex lim (* Compiles every pattern of [pats] on its own first (without studying) so that a failure is reported as [Regexp_or (pat, error)] for the offending pattern; then compiles the alternation of all patterns, each wrapped in a (?:...) group *) let regexp_or ?study ?limit ?limit_recursion ?(iflags = 0) ?flags ?chtables pats = let check pat = try ignore (regexp ~study:false ~iflags ?flags ?chtables pat) with Error error -> raise (Regexp_or (pat, error)) in List.iter check pats; let big_pat = let cnv pat = "(?:" ^ pat ^ ")" in String.concat "|" (List.rev (List.rev_map cnv pats)) in regexp ?study ?limit ?limit_recursion ~iflags ?flags ?chtables big_pat (* Copies [len] bytes of the string [str], starting at [str_ofs], into [bts] at [bts_ofs]; no bounds are checked *) let bytes_unsafe_blit_string str str_ofs bts bts_ofs len = let str_bts = Bytes.unsafe_of_string str in Bytes.unsafe_blit str_bts str_ofs bts bts_ofs len (* Substring of [str] of length [len] starting at [ofs]; no bounds are checked *) let string_unsafe_sub str ofs len = let res = Bytes.create len in bytes_unsafe_blit_string str ofs res 0
len; Bytes.unsafe_to_string res (* Returns [s] with every metacharacter listed in the match below preceded by a backslash; the buffer is sized for the worst case in which every character is escaped *) let quote s = let len = String.length s in let buf = Bytes.create (len lsl 1) in let pos = ref 0 in for i = 0 to len - 1 do match String.unsafe_get s i with | '\\' | '^' | '$' | '.' | '[' | '|' | '(' | ')' | '?' | '*' | '+' | '{' as c -> Bytes.unsafe_set buf !pos '\\'; incr pos; Bytes.unsafe_set buf !pos c; incr pos | c -> Bytes.unsafe_set buf !pos c; incr pos done; string_unsafe_sub (Bytes.unsafe_to_string buf) 0 !pos (* Matching of patterns and subpattern extraction *) (* Default regular expression when none is provided by the user *) let def_rex = regexp "\\s+" (* Subject string paired with the offset vector filled in by a match *) type substrings = string * int array type callout_data = { callout_number : int; substrings : substrings; start_match : int; current_position : int; capture_top : int; capture_last : int; pattern_position : int; next_item_length : int; } type callout = callout_data -> unit let get_subject (subj, _) = subj (* The offset vector holds three slots per group (see [make_ovector]) *) let num_of_subs (_, ovector) = Array.length ovector / 3 (* Validates the group number [str_num] and returns the index of its start slot together with the start offset stored there *) let get_offset_start ovector str_num = if str_num < 0 || str_num >= Array.length ovector / 3 then invalid_arg "Pcre.get_offset_start: illegal offset"; let offset = str_num lsl 1 in offset, Array.unsafe_get ovector offset (* Extracts the text between [start] and the end offset stored after slot [offset]; a negative [start] raises [Not_found] *) let get_substring_aux (subj, ovector) offset start = if start < 0 then raise Not_found else string_unsafe_sub subj start (Array.unsafe_get ovector (offset + 1) - start) let get_substring (_, ovector as substrings) str_num = let offset, start = get_offset_start ovector str_num in get_substring_aux substrings offset start (* Like [get_substring] but returns the (start, end) offsets instead of the text *) let get_substring_ofs (_subj, ovector) str_num = let offset, start = get_offset_start ovector str_num in if start < 0 then raise Not_found else start, Array.unsafe_get ovector (offset + 1) (* No range check on [str_num]; a group with a negative start offset yields the empty string *) let unsafe_get_substring (_, ovector as substrings) str_num = let offset = str_num lsl 1 in try get_substring_aux substrings offset (Array.unsafe_get ovector offset) with Not_found -> "" (* All substrings as an array; group 0 is left out when [full_match] is false *) let get_substrings ?(full_match = true) (_, ovector as substrings) = if full_match then Array.init (Array.length
ovector / 3) (unsafe_get_substring substrings) else let len = (Array.length ovector / 3) - 1 in Array.init len (fun n -> unsafe_get_substring substrings (n + 1)) (* No range check on [str_num]; a group with a negative start offset yields [None] *) let unsafe_get_opt_substring (_, ovector as substrings) str_num = let offset = str_num lsl 1 in try let start = Array.unsafe_get ovector offset in let str = get_substring_aux substrings offset start in Some str with Not_found -> None (* Option-valued counterpart of [get_substrings] *) let get_opt_substrings ?(full_match = true) (_, ovector as substrings) = if full_match then Array.init (Array.length ovector / 3) (unsafe_get_opt_substring substrings) else let len = (Array.length ovector / 3) - 1 in Array.init len (fun n -> unsafe_get_opt_substring substrings (n + 1)) external get_stringnumber : regexp -> string -> (int [@untagged]) = "pcre_get_stringnumber_stub_bc" "pcre_get_stringnumber_stub" (* Named variants: the group number is looked up with [get_stringnumber] first *) let get_named_substring rex name substrings = get_substring substrings (get_stringnumber rex name) let get_named_substring_ofs rex name substrings = get_substring_ofs substrings (get_stringnumber rex name) external unsafe_pcre_exec : (irflag [@untagged]) -> regexp -> pos : (int [@untagged]) -> subj_start : (int [@untagged]) -> subj : string -> int array -> callout option -> unit = "pcre_exec_stub_bc" "pcre_exec_stub" (* Allocates a zero-filled offset vector with three slots per group, where the groups are the capture groups of [rex] plus the whole match; also returns twice the group count *) let make_ovector rex = let subgroups1 = capturecount rex + 1 in let subgroups2 = subgroups1 lsl 1 in subgroups2, Array.make (subgroups1 + subgroups2) 0 external unsafe_pcre_dfa_exec : (irflag [@untagged]) -> regexp -> pos : (int [@untagged]) -> subj_start : (int [@untagged]) -> subj : string -> int array -> callout option -> workspace : int array -> unit = "pcre_dfa_exec_stub_bc" "pcre_exec_stub0" (* DFA matching: [pat] overrides [rex] and [flags] overrides [iflags]; the default workspace has 20 elements; returns the offset vector that was passed to the matcher *) let pcre_dfa_exec ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?callout ?(workspace = Array.make 20 0) subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let _, ovector = make_ovector rex in unsafe_pcre_dfa_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout
~workspace; ovector (* Runs the matcher once: [pat] overrides [rex] and [flags] overrides [iflags]; returns a freshly allocated offset vector *) let pcre_exec ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?callout subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let _, ovector = make_ovector rex in unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout; ovector (* Like [pcre_exec] but pairs the subject with the offset vector *) let exec ?iflags ?flags ?rex ?pat ?pos ?callout subj = subj, pcre_exec ?iflags ?flags ?rex ?pat ?pos ?callout subj (* Matches again starting at the end offset of the previous match (slot 1 of its offset vector) plus [pos] *) let next_match ?iflags ?flags ?rex ?pat ?(pos = 0) ?callout (subj, ovector) = let pos = Array.unsafe_get ovector 1 + pos in let subj_len = String.length subj in if pos < 0 || pos > subj_len then invalid_arg "Pcre.next_match: illegal offset"; subj, pcre_exec ?iflags ?flags ?rex ?pat ~pos ?callout subj (* Writes the list elements into [ar] from index [n] downwards and returns [ar] *) let rec copy_lst ar n = function | [] -> ar | h :: t -> Array.unsafe_set ar n h; copy_lst ar (n - 1) t (* Collects all successive matches in [subj] into an array; the first match is not guarded, so [Not_found] escapes when there is no match at all *) let exec_all ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?pos ?callout subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let (_, ovector as sstrs) = exec ~iflags ~rex ?pos ?callout subj in (* 0x0400 is the `NOTEMPTY value of [int_of_rflag]; it is used for the attempt that follows an empty match at the current position *) let null_flags = iflags lor 0x0400 in let subj_len = String.length subj in let rec loop pos (subj, ovector as sstrs) n lst = let maybe_ovector = try let first = Array.unsafe_get ovector 0 in if first = pos && Array.unsafe_get ovector 1 = pos then if pos = subj_len then None else Some (pcre_exec ~iflags:null_flags ~rex ~pos ?callout subj) else Some (pcre_exec ~iflags ~rex ~pos ?callout subj) with Not_found -> None in match maybe_ovector with | Some ovector -> let new_pos = Array.unsafe_get ovector 1 in loop new_pos (subj, ovector) (n + 1) (sstrs :: lst) | None -> copy_lst (Array.make (n + 1) sstrs) (n - 1) lst in loop (Array.unsafe_get ovector 1) sstrs 0 [] (* Matches once and returns the substrings as an array *) let extract ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj = get_substrings ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj) let extract_opt ?iflags ?flags
?rex ?pat ?pos ?full_match ?callout subj = get_opt_substrings ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj) (* Applies [get_substrings] to every match found by [exec_all] *) let extract_all ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj = let many_sstrs = exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj in Array.map (get_substrings ?full_match) many_sstrs (* Applies [get_opt_substrings] to every match found by [exec_all] *) let extract_all_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj = let many_sstrs = exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj in Array.map (get_opt_substrings ?full_match) many_sstrs (* True if the matcher succeeds; [Not_found] from the matcher is turned into [false] *) let pmatch ?iflags ?flags ?rex ?pat ?pos ?callout subj = try ignore (pcre_exec ?iflags ?flags ?rex ?pat ?pos ?callout subj); true with Not_found -> false (* String substitution *) (* Elements of a substitution pattern *) type subst = | SubstString of int * int (* Denotes a substring in the substitution *) | Backref of int (* nth backreference ($0 is program name!) *) | Match (* The whole matched string *) | PreMatch (* The string before the match *) | PostMatch (* The string after the match *) | LastParenMatch (* The last matched group *) (* Information on substitution patterns *) type substitution = string (* The substitution string *) * int (* Highest group number of backreferences *) * bool (* Makes use of "LastParenMatch" *) * subst list (* The list of substitution elements *) (* Only used internally in "subst" *) exception FoundAt of int let zero = Char.code '0' (* Parses the template [str] into a [substitution]. Elements are consed onto the accumulator while scanning left to right, so the resulting element list is in reverse template order. *) let subst str = let max_br = ref 0 in let with_lp = ref false in let lix = String.length str - 1 in let rec loop acc n = if lix < n then acc else try for i = n to lix do if String.unsafe_get str i = '$' then raise (FoundAt i) done; SubstString (n, lix - n + 1) :: acc with FoundAt i -> if i = lix then SubstString (n, lix - n + 1) :: acc else let i1 = i + 1 in let acc = if n = i then acc else SubstString (n, i - n) :: acc in match String.unsafe_get str i1 with | '0'..'9' as c -> let subpat_nr = ref (Char.code c - zero) in (try for j = i1 + 1 to lix do let c = String.unsafe_get str j in
if c >= '0' && c <= '9' then subpat_nr := 10 * !subpat_nr + Char.code c - zero else raise (FoundAt j) done; max_br := max !subpat_nr !max_br; Backref !subpat_nr :: acc with FoundAt j -> max_br := max !subpat_nr !max_br; loop (Backref !subpat_nr :: acc) j) | '!' -> loop acc (i1 + 1) | '$' -> loop (SubstString (i1, 1) :: acc) (i1 + 1) | '&' -> loop (Match :: acc) (i1 + 1) | '`' -> loop (PreMatch :: acc) (i1 + 1) | '\'' -> loop (PostMatch :: acc) (i1 + 1) | '+' -> with_lp := true; loop (LastParenMatch :: acc) (i1 + 1) | _ -> loop acc i1 in let subst_lst = loop [] 0 in str, !max_br, !with_lp, subst_lst (* Parsed form of the empty template; default for [itempl] *) let def_subst = subst "" (* Calculates a list of tuples (str, offset, len) which contain substrings to be copied on substitutions. Internal use only! *) let calc_trans_lst subgroups2 ovector subj templ subst_lst = let prefix_len = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in (* Pieces of length zero are skipped; the accumulator pairs the total length with the pieces collected so far *) let coll (res_len, trans_lst as accu) = let return_lst (_str, _ix, len as el) = if len = 0 then accu else res_len + len, el :: trans_lst in function | SubstString (ix, len) -> return_lst (templ, ix, len) | Backref 0 -> let prog_name = Sys.argv.(0) in return_lst (prog_name, 0, String.length prog_name) | Backref n -> let offset = n lsl 1 in let start = Array.unsafe_get ovector offset in let len = Array.unsafe_get ovector (offset + 1) - start in return_lst (subj, start, len) | Match -> return_lst (subj, prefix_len, last - prefix_len) | PreMatch -> return_lst (subj, 0, prefix_len) | PostMatch -> return_lst (subj, last, String.length subj - last) | LastParenMatch -> (* Walks backwards from the last group until one with a non-negative start offset is found *) let subgroups2_2 = subgroups2 - 2 in let pos = ref subgroups2_2 in let ix = ref (Array.unsafe_get ovector subgroups2_2) in while !ix < 0 do let pos_2 = !pos - 2 in pos := pos_2; ix := Array.unsafe_get ovector pos_2 done; return_lst (subj, !ix, Array.unsafe_get ovector (!pos + 1) - !ix) in List.fold_left coll (0, []) subst_lst (* Replaces every match in [subj] from [pos] on, using the parsed template [itempl], or [templ] parsed with [subst] when it is given *) let replace ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?(itempl = def_subst)
?templ ?callout subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let templ, max_br, with_lp, subst_lst = match templ with | Some str -> subst str | _ -> itempl in let subj_len = String.length subj in if pos < 0 || pos > subj_len then invalid_arg "Pcre.replace: illegal offset"; let subgroups2, ovector = make_ovector rex in let nsubs = (subgroups2 lsr 1) - 1 in if max_br > nsubs then failwith "Pcre.replace: backreference denotes nonexistent subpattern"; if with_lp && nsubs = 0 then failwith "Pcre.replace: no backreferences"; (* Matches repeatedly from [cur_pos], collecting the pieces of the result in reverse order; once no further match exists the result buffer is filled from right to left out of those pieces *) let rec loop full_len trans_lsts cur_pos = if cur_pos > subj_len || try unsafe_pcre_exec iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector callout; false with Not_found -> true then let postfix_len = max (subj_len - cur_pos) 0 in let left = pos + full_len in let res = Bytes.create (left + postfix_len) in bytes_unsafe_blit_string subj 0 res 0 pos; bytes_unsafe_blit_string subj cur_pos res left postfix_len; let inner_coll ofs (templ, ix, len) = bytes_unsafe_blit_string templ ix res ofs len; ofs + len in let coll ofs (res_len, trans_lst) = let new_ofs = ofs - res_len in let _ = List.fold_left inner_coll new_ofs trans_lst in new_ofs in let _ = List.fold_left coll left trans_lsts in Bytes.unsafe_to_string res else let first = Array.unsafe_get ovector 0 in let len = first - cur_pos in let res_len, _ as trans_lst_el = calc_trans_lst subgroups2 ovector subj templ subst_lst in let trans_lsts = if len > 0 then trans_lst_el :: (len, [(subj, cur_pos, len)]) :: trans_lsts else trans_lst_el :: trans_lsts in let full_len = full_len + len + res_len in let next = first + 1 in let last = Array.unsafe_get ovector 1 in (* Empty match: copy one subject character, if any is left, and continue one position further so that the loop advances *) if last < next then if first < subj_len then let new_trans_lsts = (1, [(subj, cur_pos + len, 1)]) :: trans_lsts in loop (full_len + 1) new_trans_lsts next else loop full_len trans_lsts next else loop full_len trans_lsts last in loop 0 [] pos (* Like [replace], but [templ] is copied verbatim for every match instead of being parsed by [subst] *) let qreplace
?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?(templ = "") ?callout subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let subj_len = String.length subj in if pos < 0 || pos > subj_len then invalid_arg "Pcre.qreplace: illegal offset"; let templ_len = String.length templ in let _, ovector = make_ovector rex in let rec loop full_len subst_lst cur_pos = if cur_pos > subj_len || try unsafe_pcre_exec iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector callout; false with Not_found -> true then let postfix_len = max (subj_len - cur_pos) 0 in let left = pos + full_len in let res = Bytes.create (left + postfix_len) in bytes_unsafe_blit_string subj 0 res 0 pos; bytes_unsafe_blit_string subj cur_pos res left postfix_len; (* The piece list is in reverse order and is written from right to left: [None] stands for one copy of [templ], [Some] for a piece copied verbatim *) let coll ofs = function | Some (substr, ix, len) -> let new_ofs = ofs - len in bytes_unsafe_blit_string substr ix res new_ofs len; new_ofs | None -> let new_ofs = ofs - templ_len in bytes_unsafe_blit_string templ 0 res new_ofs templ_len; new_ofs in let _ = List.fold_left coll left subst_lst in Bytes.unsafe_to_string res else let first = Array.unsafe_get ovector 0 in let len = first - cur_pos in let subst_lst = if len > 0 then None :: Some (subj, cur_pos, len) :: subst_lst else None :: subst_lst in let last = Array.unsafe_get ovector 1 in let full_len = full_len + len + templ_len in let next = first + 1 in (* Empty match: copy one subject character, if any is left, and continue one position further *) if last < next then if first < subj_len then loop (full_len + 1) (Some (subj, cur_pos + len, 1) :: subst_lst) next else loop full_len subst_lst next else loop full_len subst_lst last in loop 0 [] pos (* Replaces every match by the string that [subst] returns when applied to the substrings of that match *) let substitute_substrings ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?callout ~subst subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let subj_len = String.length subj in if pos < 0 || pos > subj_len then invalid_arg "Pcre.substitute: illegal offset"; let _,
ovector = make_ovector rex in (* The same [ovector] array is reused for every match, so the substrings value handed to [subst] is only meaningful until the next match attempt *) let rec loop full_len subst_lst cur_pos = if cur_pos > subj_len || try unsafe_pcre_exec iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector callout; false with Not_found -> true then let postfix_len = max (subj_len - cur_pos) 0 in let left = pos + full_len in let res = Bytes.create (left + postfix_len) in bytes_unsafe_blit_string subj 0 res 0 pos; bytes_unsafe_blit_string subj cur_pos res left postfix_len; let coll ofs (templ, ix, len) = let new_ofs = ofs - len in bytes_unsafe_blit_string templ ix res new_ofs len; new_ofs in let _ = List.fold_left coll left subst_lst in Bytes.unsafe_to_string res else let first = Array.unsafe_get ovector 0 in let len = first - cur_pos in let templ = subst (subj, ovector) in let templ_len = String.length templ in let subst_lst = if len > 0 then (templ, 0, templ_len) :: (subj, cur_pos, len) :: subst_lst else (templ, 0, templ_len) :: subst_lst in let last = Array.unsafe_get ovector 1 in let full_len = full_len + len + templ_len in let next = first + 1 in (* Empty match: copy one subject character, if any is left, and continue one position further *) if last < next then if first < subj_len then loop (full_len + 1) ((subj, cur_pos + len, 1) :: subst_lst) next else loop full_len subst_lst next else loop full_len subst_lst last in loop 0 [] pos (* Variant of [substitute_substrings] whose callback [subst] receives only the text of the whole match *) let substitute ?iflags ?flags ?rex ?pat ?pos ?callout ~subst:str_subst subj = let subst (subj, ovector) = let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in str_subst (string_unsafe_sub subj first (last - first)) in substitute_substrings ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj (* Like [replace], but a single match attempt is made and only that match is replaced *) let replace_first ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?(itempl = def_subst) ?templ ?callout subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let templ, max_br, with_lp, subst_lst = match templ with | Some str -> subst str | _ -> itempl in let subgroups2, ovector = make_ovector rex in let nsubs = (subgroups2 lsr 1) - 1 in if max_br > nsubs
then failwith "Pcre.replace_first: backreference denotes nonexistent subpattern"; if with_lp && nsubs = 0 then failwith "Pcre.replace_first: no backreferences"; (* The result is the text before the match, the expanded template, then the text after the match; [subj] itself is returned when the matcher raises [Not_found] *) try unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout; let res_len, trans_lst = calc_trans_lst subgroups2 ovector subj templ subst_lst in let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in let rest = String.length subj - last in let res = Bytes.create (first + res_len + rest) in bytes_unsafe_blit_string subj 0 res 0 first; let coll ofs (templ, ix, len) = bytes_unsafe_blit_string templ ix res ofs len; ofs + len in let ofs = List.fold_left coll first trans_lst in bytes_unsafe_blit_string subj last res ofs rest; Bytes.unsafe_to_string res with Not_found -> subj (* Replaces the first match with [templ] copied verbatim; [subj] itself is returned when the matcher raises [Not_found] *) let qreplace_first ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?(templ = "") ?callout subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let _, ovector = make_ovector rex in try unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout; let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in let len = String.length templ in let rest = String.length subj - last in let postfix_start = first + len in let res = Bytes.create (postfix_start + rest) in bytes_unsafe_blit_string subj 0 res 0 first; bytes_unsafe_blit_string templ 0 res first len; bytes_unsafe_blit_string subj last res postfix_start rest; Bytes.unsafe_to_string res with Not_found -> subj (* Replaces the first match with the string that [subst] returns for its substrings; [subj] itself is returned when the matcher raises [Not_found] *) let substitute_substrings_first ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?callout ~subst subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let _, ovector = make_ovector rex in try unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout; let subj_len = String.length subj in let prefix_len = Array.unsafe_get
ovector 0 in let last = Array.unsafe_get ovector 1 in let templ = subst (subj, ovector) in let postfix_len = subj_len - last in let templ_len = String.length templ in let postfix_start = prefix_len + templ_len in let res = Bytes.create (postfix_start + postfix_len) in bytes_unsafe_blit_string subj 0 res 0 prefix_len; bytes_unsafe_blit_string templ 0 res prefix_len templ_len; bytes_unsafe_blit_string subj last res postfix_start postfix_len; Bytes.unsafe_to_string res with Not_found -> subj (* Variant of [substitute_substrings_first] whose callback [subst] receives only the text of the whole match *) let substitute_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst:str_subst subj = let subst (subj, ovector) = let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in str_subst (string_unsafe_sub subj first (last - first)) in substitute_substrings_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj (* Splitting *) (* Worker behind [split]: pieces are consed onto the accumulator, so the returned list is in reverse order and the caller reverses it. An empty subject yields the empty list and [max = 1] yields the subject as the only piece. *) let internal_psplit flags rex max pos callout subj = let subj_len = String.length subj in if subj_len = 0 then [] else if max = 1 then [subj] else let subgroups2, ovector = make_ovector rex in (* Adds contents of subgroups to the string accumulator *) let handle_subgroups strs = let strs = ref strs in let i = ref 2 in while !i < subgroups2 do let first = Array.unsafe_get ovector !i in incr i; let last = Array.unsafe_get ovector !i in let str = if first < 0 then "" else string_unsafe_sub subj first (last - first) in strs := str :: !strs; incr i done; !strs in (* Performs the recursive split *) let rec loop strs cnt pos prematch = let len = subj_len - pos in if len < 0 then strs else (* Checks termination due to max restriction *) if cnt = 0 then if prematch && try unsafe_pcre_exec flags rex ~pos ~subj_start:pos ~subj ovector callout; true with Not_found -> false then let last = Array.unsafe_get ovector 1 in let strs = handle_subgroups strs in string_unsafe_sub subj last (subj_len - last) :: strs else string_unsafe_sub subj pos len :: strs (* Calculates next accumulator state for splitting *) else if try unsafe_pcre_exec flags rex ~pos
~subj_start:pos ~subj ovector callout; false with Not_found -> true then string_unsafe_sub subj pos len :: strs else let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in if first = pos then if last = pos then let strs = if prematch then handle_subgroups strs else strs in if len = 0 then "" :: strs else (* 0x0410 combines the `ANCHORED and `NOTEMPTY values of [int_of_rflag]: the match is retried at the same position with both flags added *) if try unsafe_pcre_exec (flags lor 0x0410) rex ~pos ~subj_start:pos ~subj ovector callout; true with Not_found -> false then let new_strs = handle_subgroups ("" :: strs) in loop new_strs (cnt - 1) (Array.unsafe_get ovector 1) false else let new_strs = string_unsafe_sub subj pos 1 :: strs in loop new_strs (cnt - 1) (pos + 1) true else if prematch then loop (handle_subgroups strs) cnt last false else loop (handle_subgroups ("" :: strs)) (cnt - 1) last false else let new_strs = string_unsafe_sub subj pos (first - pos) :: strs in loop (handle_subgroups new_strs) (cnt - 1) last false in loop [] (max - 1) pos false (* Drops leading empty strings; [split] applies it to the still reversed piece list, which removes the trailing empty pieces of the final result *) let rec strip_all_empty = function "" :: t -> strip_all_empty t | l -> l external isspace : char -> bool = "pcre_isspace_stub" [@@noalloc] (* Index of the first character at or after [ix] for which [isspace] is false, or [len] if there is none *) let rec find_no_space ix len str = if ix = len || not (isspace (String.unsafe_get str ix)) then ix else find_no_space (ix + 1) len str (* Splits [subj] into a list of strings. [pat] takes precedence over [rex]; when neither is given, [def_rex] is used and splitting starts at the first character at or after [pos] for which [isspace] is false. Trailing empty pieces are removed when [max] is 0. Note that an illegal [pos] is reported with [Failure] here, and only in the default-pattern case. *) let split ?(iflags = 0) ?flags ?rex ?pat ?(pos = 0) ?(max = 0) ?callout subj = let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let res = match pat, rex with | Some str, _ -> internal_psplit iflags (regexp str) max pos callout subj | _, Some rex -> internal_psplit iflags rex max pos callout subj | _ -> (* special case for Perl-splitting semantics *) let len = String.length subj in if pos > len || pos < 0 then failwith "Pcre.split: illegal offset"; let new_pos = find_no_space pos len subj in internal_psplit iflags def_rex max new_pos callout subj in List.rev (if max = 0 then strip_all_empty res else res) (* Same as [split], with the result converted to an array *) let asplit ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj = Array.of_list (split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj) (*
Full splitting *) (* Result elements of [full_split]: text between delimiters, the delimiter text, a group number with its text, and a placeholder for a group whose start offset is negative *) type split_result = Text of string | Delim of string | Group of int * string | NoGroup (* Drops leading [Delim] elements; [full_split] applies it to the still reversed result list *) let rec strip_all_empty_full = function | Delim _ :: rest -> strip_all_empty_full rest | l -> l (* Splits [subj] like [split] does, but also reports delimiters and groups as [split_result] elements. An empty subject yields the empty list and [max = 1] yields the subject as a single [Text]. *) let full_split ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0) ?(max = 0) ?callout subj = let rex = match pat with Some str -> regexp str | _ -> rex in let iflags = match flags with Some flags -> rflags flags | _ -> iflags in let subj_len = String.length subj in if subj_len = 0 then [] else if max = 1 then [Text (subj)] else let subgroups2, ovector = make_ovector rex in (* Adds contents of subgroups to the string accumulator *) let handle_subgroups strs = let strs = ref strs in let i = ref 2 in while !i < subgroups2 do let group_nr = !i lsr 1 in let first = Array.unsafe_get ovector !i in incr i; let last = Array.unsafe_get ovector !i in let str = if first < 0 then NoGroup else let group_str = string_unsafe_sub subj first (last - first) in Group (group_nr, group_str) in strs := str :: !strs; incr i done; !strs in (* Performs the recursive split *) let rec loop strs cnt pos prematch = let len = subj_len - pos in if len < 0 then strs else (* Checks termination due to max restriction *) if cnt = 0 then if prematch && try unsafe_pcre_exec iflags rex ~pos ~subj_start:pos ~subj ovector callout; true with Not_found -> false then let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in let delim = Delim (string_unsafe_sub subj first (last - first)) in Text (string_unsafe_sub subj last (subj_len - last)) :: handle_subgroups (delim :: strs) else if len = 0 then strs else Text (string_unsafe_sub subj pos len) :: strs (* Calculates next accumulator state for splitting *) else if try unsafe_pcre_exec iflags rex ~pos ~subj_start:pos ~subj ovector callout; false with Not_found -> true then if len = 0 then strs else Text (string_unsafe_sub subj pos len) :: strs else let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in if
first = pos then if last = pos then if len = 0 then handle_subgroups (Delim "" :: strs) else let empty_groups = handle_subgroups [] in if try unsafe_pcre_exec (iflags lor 0x0410) rex ~pos ~subj_start:pos ~subj ovector callout; true with Not_found -> false then let first = Array.unsafe_get ovector 0 in let last = Array.unsafe_get ovector 1 in let delim = Delim (string_unsafe_sub subj first (last - first)) in let new_strs = handle_subgroups ( delim :: (if prematch then strs else empty_groups @ (Delim "" :: strs))) in loop new_strs (cnt - 1) last false else let new_strs = Text (string_unsafe_sub subj pos 1) :: empty_groups @ Delim "" :: strs in loop new_strs (cnt - 1) (pos + 1) true else let delim = Delim (string_unsafe_sub subj first (last - first)) in loop (handle_subgroups (delim :: strs)) cnt last false else let delim = Delim (string_unsafe_sub subj first (last - first)) in let pre_strs = Text (string_unsafe_sub subj pos (first - pos)) :: strs in loop (handle_subgroups (delim :: pre_strs)) (cnt - 1) last false in let res = loop [] (max - 1) pos true in List.rev (if max = 0 then strip_all_empty_full res else res) (* Additional convenience functions useful in combination with this library *) let foreach_line ?(ic = stdin) f = try while true do f (input_line ic) done with End_of_file -> () let foreach_file filenames f = let do_with_file filename = let file = open_in filename in try f filename file; close_in file with exn -> close_in file; raise exn in List.iter do_with_file filenames pcre-ocaml-7.4.3/src/pcre.mli000066400000000000000000001122701355540622000157770ustar00rootroot00000000000000(* PCRE-OCAML - Perl Compatibility Regular Expressions for OCaml Copyright (C) 1999- Markus Mottl email: markus.mottl@gmail.com WWW: http://www.ocaml.info This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your 
option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *) (** Perl Compatibility Regular Expressions for OCaml {e %%VERSION%% - {{:%%PKG_HOMEPAGE%%}homepage}} *) (** {6 Exceptions} *) type error = | Partial (** String only matched the pattern partially *) | BadPartial (** Pattern contains items that cannot be used together with partial matching. *) | BadPattern of string * int (** [BadPattern (msg, pos)] regular expression is malformed. The reason is in [msg], the position of the error in the pattern in [pos]. *) | BadUTF8 (** UTF8 string being matched is invalid *) | BadUTF8Offset (** Gets raised when a UTF8 string being matched with offset is invalid. *) | MatchLimit (** Maximum allowed number of match attempts with backtracking or recursion is reached during matching. ALL FUNCTIONS CALLING THE MATCHING ENGINE MAY RAISE IT!!! *) | RecursionLimit | WorkspaceSize (** Raised by {!pcre_dfa_exec} when the provided workspace array is too small. See documentation on {!pcre_dfa_exec} for details on workspace array sizing. *) | InternalError of string (** [InternalError msg] C-library exhibits unknown/undefined behaviour. The reason is in [msg]. *) (** Exception indicating PCRE errors. *) exception Error of error (** [Backtrack] used in callout functions to force backtracking. *) exception Backtrack (** [Regexp_or (pat, error)] gets raised for sub-pattern [pat] by [regexp_or] if it failed to compile. 
*) exception Regexp_or of string * error (** {6 Compilation and runtime flags and their conversion functions} *) (** Internal representation of compilation flags *) type icflag (** Internal representation of runtime flags *) and irflag (** Compilation flags *) and cflag = [ `CASELESS (** Case insensitive matching *) | `MULTILINE (** '^' and '$' match before/after newlines, not just at the beginning/end of a string *) | `DOTALL (** '.' matches all characters (newlines, too) *) | `EXTENDED (** Ignores whitespace and PERL-comments. Behaves like the '/x'-option in PERL *) | `ANCHORED (** Pattern matches only at start of string *) | `DOLLAR_ENDONLY (** '$' in pattern matches only at end of string *) | `EXTRA (** Reserved for future extensions of PCRE *) | `UNGREEDY (** Quantifiers not greedy anymore, only if followed by '?' *) | `UTF8 (** Treats patterns and strings as UTF8 characters. *) | `NO_UTF8_CHECK (** Turns off validity checks on UTF8 strings for efficiency reasons. WARNING: invalid UTF8 strings may cause a crash then! *) | `NO_AUTO_CAPTURE (** Disables the use of numbered capturing parentheses *) | `AUTO_CALLOUT (** Automatically inserts callouts with id 255 before each pattern item *) | `FIRSTLINE (** Unanchored patterns must match before/at first NL *) ] val cflags : cflag list -> icflag (** [cflags cflag_list] converts a list of compilation flags to their internal representation. *) val cflag_list : icflag -> cflag list (** [cflag_list cflags] converts internal representation of compilation flags to a list. 
*) (** Runtime flags *) type rflag = [ `ANCHORED (** Treats pattern as if it were anchored *) | `NOTBOL (** Beginning of string is not treated as beginning of line *) | `NOTEOL (** End of string is not treated as end of line *) | `NOTEMPTY (** Empty strings are not considered to be a valid match *) | `PARTIAL (** Turns on partial matching *) | `DFA_RESTART (** Causes matching to proceed presuming the subject string is further to one partially matched previously using the same int-array working set. May only be used with {!pcre_dfa_exec} or {!unsafe_pcre_dfa_exec}, and should always be paired with {!`PARTIAL}. *) ] val rflags : rflag list -> irflag (** [rflags rflag_list] converts a list of runtime flags to their internal representation. *) val rflag_list : irflag -> rflag list (** [rflag_list rflags] converts internal representation of runtime flags to a list. *) (** {6 Information on the PCRE-configuration (build-time options)} *) (** Version information *) val version : string (** Version of the PCRE-C-library *) (** Indicates whether UTF8-support is enabled *) val config_utf8 : bool (** Character used as newline *) val config_newline : char (** Number of bytes used for internal linkage of regular expressions *) val config_link_size : int (** Default limit for calls to internal matching function *) val config_match_limit : int (** Default limit recursion for calls to internal matching function *) val config_match_limit_recursion : int (** Indicates use of stack recursion in matching function *) val config_stackrecurse : bool (** {6 Information on patterns} *) (** Information on matching of "first chars" in patterns *) type firstbyte_info = [ `Char of char (** Fixed first character *) | `Start_only (** Pattern matches at beginning and end of newlines *) | `ANCHORED (** Pattern is anchored *) ] (** Information on the study status of patterns *) type study_stat = [ `Not_studied (** Pattern has not yet been studied *) | `Studied (** Pattern has been studied 
successfully *) | `Optimal (** Pattern could not be improved by studying *) ] type regexp (** Compiled regular expressions *) (** [options regexp] @return compilation flags of [regexp]. *) val options : regexp -> icflag (** [size regexp] @return memory size of [regexp]. *) val size : regexp -> int (** [studysize regexp] @return memory size of study information of [regexp]. *) val studysize : regexp -> int (** [capturecount regexp] @return number of capturing subpatterns in [regexp]. *) val capturecount : regexp -> int (** [backrefmax regexp] @return number of highest backreference in [regexp]. *) val backrefmax : regexp -> int (** [namecount regexp] @return number of named subpatterns in [regexp]. *) val namecount : regexp -> int (** [nameentrysize regexp] @return size of longest name of named subpatterns in [regexp] + 3. *) val nameentrysize : regexp -> int (** [names regexp] @return array of names of named substrings in [regexp]. *) val names : regexp -> string array (** [firstbyte regexp] @return firstbyte info on [regexp]. *) val firstbyte : regexp -> firstbyte_info (** [firsttable regexp] @return some 256-bit (32-byte) fixed set table in form of a string for [regexp] if available, [None] otherwise. *) val firsttable : regexp -> string option (** [lastliteral regexp] @return some last matching character of [regexp] if available, [None] otherwise. *) val lastliteral : regexp -> char option (** [study_stat regexp] @return study status of [regexp]. *) val study_stat : regexp -> study_stat val get_stringnumber : regexp -> string -> int (** [get_stringnumber rex name] @return the index of the named substring [name] in regular expression [rex]. This index can then be used with [get_substring]. @raise Invalid_argument if there is no such named substring. *) val get_match_limit : regexp -> int option (** [get_match_limit rex] @return some match limit of regular expression [rex] or [None]. 
*) val get_match_limit_recursion : regexp -> int option (** [get_match_limit_recursion rex] @return some recursion match limit of regular expression [rex] or [None]. *) (** {6 Compilation of patterns} *) type chtables (** Alternative set of char tables for pattern matching *) val maketables : unit -> chtables (** Generates new set of char tables for the current locale. *) val regexp : ?study : bool -> ?limit : int -> ?limit_recursion : int -> ?iflags : icflag -> ?flags : cflag list -> ?chtables : chtables -> string -> regexp (** [regexp ?study ?limit ?limit_recursion ?iflags ?flags ?chtables pattern] compiles [pattern] with [flags] when given, with [iflags] otherwise, and with char tables [chtables]. If [study] is true, then the resulting regular expression will be studied. If [limit] is specified, this sets a limit to the amount of recursion and backtracking (only lower than the builtin default!). If this limit is exceeded, [MatchLimit] will be raised during matching. @param study default = true @param limit default = no extra limit other than default @param limit_recursion default = no extra limit_recursion other than default @param iflags default = no extra flags @param flags default = ignored @param chtables default = builtin char tables @return the regular expression. For detailed documentation on how you can specify PERL-style regular expressions (= patterns), please consult the PCRE-documentation ("man pcrepattern") or PERL-manuals. @see www.perl.com *) val regexp_or : ?study : bool -> ?limit : int -> ?limit_recursion : int -> ?iflags : icflag -> ?flags : cflag list -> ?chtables : chtables -> string list -> regexp (** [regexp_or ?study ?limit ?limit_recursion ?iflags ?flags ?chtables patterns] like {!regexp}, but combines [patterns] as alternatives (or-patterns) into one regular expression. *) val quote : string -> string (** [quote str] @return the quoted string of [str]. 
*) (** {6 Subpattern extraction} *) type substrings (** Information on substrings after pattern matching *) val get_subject : substrings -> string (** [get_subject substrings] @return the subject string of [substrings]. *) val num_of_subs : substrings -> int (** [num_of_subs substrings] @return number of strings in [substrings] (whole match inclusive). *) val get_substring : substrings -> int -> string (** [get_substring substrings n] @return the [n]th substring (0 is whole match) of [substrings]. @raise Invalid_argument if [n] is not in the range of the number of substrings. @raise Not_found if the corresponding subpattern did not capture a substring. *) val get_substring_ofs : substrings -> int -> int * int (** [get_substring_ofs substrings n] @return the offset tuple of the [n]th substring of [substrings] (0 is whole match). @raise Invalid_argument if [n] is not in the range of the number of substrings. @raise Not_found if the corresponding subpattern did not capture a substring. *) val get_substrings : ?full_match : bool -> substrings -> string array (** [get_substrings ?full_match substrings] @return the array of substrings in [substrings]. It includes the full match at index 0 when [full_match] is [true], the captured substrings only when it is [false]. If a subpattern did not capture a substring, the empty string is returned in the corresponding position instead. @param full_match default = true *) val get_opt_substrings : ?full_match : bool -> substrings -> string option array (** [get_opt_substrings ?full_match substrings] @return the array of optional substrings in [substrings]. It includes [Some full_match_str] at index 0 when [full_match] is [true], [Some captured_substrings] only when it is [false]. If a subpattern did not capture a substring, [None] is returned in the corresponding position instead. 
@param full_match default = true *) val get_named_substring : regexp -> string -> substrings -> string (** [get_named_substring rex name substrings] @return the named substring [name] in regular expression [rex] and [substrings]. @raise Invalid_argument if there is no such named substring. @raise Not_found if the corresponding subpattern did not capture a substring. *) val get_named_substring_ofs : regexp -> string -> substrings -> int * int (** [get_named_substring_ofs rex name substrings] @return the offset tuple of the named substring [name] in regular expression [rex] and [substrings]. @raise Invalid_argument if there is no such named substring. @raise Not_found if the corresponding subpattern did not capture a substring. *) (** {6 Callouts} *) type callout_data = { callout_number : int; (** Callout number *) substrings : substrings; (** Substrings matched so far *) start_match : int; (** Subject start offset of current match attempt *) current_position : int; (** Subject offset of current match pointer *) capture_top : int; (** Number of the highest captured substring so far *) capture_last : int; (** Number of the most recently captured substring *) pattern_position : int; (** Offset of next match item in pattern string *) next_item_length : int; (** Length of next match item in pattern string *) } (** Type of callout functions *) type callout = callout_data -> unit (** Callouts are referred to in patterns as "(?Cn)" where "n" is a [callout_number] ranging from 0 to 255. Substrings captured so far are accessible as usual via [substrings]. You will have to consider [capture_top] and [capture_last] to know about the current state of valid substrings. By raising exception [Backtrack] within a callout function, the user can force the pattern matching engine to backtrack to other possible solutions. Other exceptions will terminate matching immediately and return control to OCaml. 
*) (** {6 Matching of patterns and subpattern extraction} *) val pcre_exec : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> string -> int array (** [pcre_exec ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return an array of offsets that describe the position of matched subpatterns in the string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. The array also contains additional workspace needed by the match engine. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val pcre_dfa_exec : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> ?workspace : int array -> string -> int array (** [pcre_dfa_exec ?iflags ?flags ?rex ?pat ?pos ?callout ?workspace subj] invokes the "alternative" DFA matching function. @return an array of offsets that describe the position of matched subpatterns in the string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. The array also contains additional workspace needed by the match engine. Uses [flags] when given, the precompiled [iflags] otherwise. Requires a sufficiently-large [workspace] array. Callouts are handled by [callout]. Note that the returned array of offsets are quite different from those returned by {!pcre_exec} et al. The motivating use case for the DFA match function is to be able to restart a partial match with N additional input segments. 
Because the match function/workspace does not store segments seen previously, the offsets returned when a match completes will refer only to the matching portion of the last subject string provided. Thus, returned offsets from this function should not be used to support extracting captured submatches. If you need to capture submatches from a series of inputs incrementally matched with this function, you'll need to concatenate those inputs that yield a successful match here and re-run the same pattern against that single subject string. Aside from an absolute minimum of [20], PCRE does not provide any guidance regarding the size of workspace array needed by any given pattern. Therefore, it is wise to appropriately handle the possible [WorkspaceSize] error. If raised, you can allocate a new, larger workspace array and begin the DFA matching process again. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @param workspace default = fresh array of length [20] @raise Not_found if the pattern match has failed @raise Error Partial if the pattern has matched partially; a subsequent exec call with the same pattern and workspace (adding the [DFA_RESTART] flag) can be made to either further advance or complete the partial match. @raise Error WorkspaceSize if the workspace array is too small to accommodate the DFA state required by the supplied pattern *) val exec : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> string -> substrings (** [exec ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return substring information on string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val exec_all : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> string -> substrings array (** [exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return an array of substring information of all matching substrings in string [subj] starting at position [pos] with pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val next_match : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> substrings -> substrings (** [next_match ?iflags ?flags ?rex ?pat ?pos ?callout substrs] @return substring information on the match that follows on the last match denoted by [substrs], jumping over [pos] characters (also backwards!), using pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts @raise Not_found if pattern does not match. @raise Invalid_argument if [pos] lets matching start outside of the subject string. 
*) val extract : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?full_match : bool -> ?callout : callout -> string -> string array (** [extract ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj] @return the array of substrings that match [subj] starting at position [pos], using pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. It includes the full match at index 0 when [full_match] is [true], the captured substrings only when it is [false]. Callouts are handled by [callout]. If a subpattern did not capture a substring, the empty string is returned in the corresponding position instead. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param full_match default = true @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val extract_opt : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?full_match : bool -> ?callout : callout -> string -> string option array (** [extract_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj] @return the array of optional substrings that match [subj] starting at position [pos], using pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. It includes [Some full_match_str] at index 0 when [full_match] is [true], [Some captured-substrings] only when it is [false]. Callouts are handled by [callout]. If a subpattern did not capture a substring, [None] is returned in the corresponding position instead. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param full_match default = true @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val extract_all : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?full_match : bool -> ?callout : callout -> string -> string array array (** [extract_all ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj] @return an array of arrays of all matching substrings that match [subj] starting at position [pos], using pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. It includes the full match at index 0 of the extracted string arrays when [full_match] is [true], the captured substrings only when it is [false]. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param full_match default = true @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val extract_all_opt : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?full_match : bool -> ?callout : callout -> string -> string option array array (** [extract_all_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj] @return an array of arrays of all optional matching substrings that match [subj] starting at position [pos], using pattern [pat] when given, regular expression [rex] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. It includes [Some full_match_str] at index 0 of the extracted string arrays when [full_match] is [true], [Some captured_substrings] only when it is [false]. Callouts are handled by [callout]. 
If a subpattern did not capture a substring, [None] is returned in the corresponding position instead. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param full_match default = true @param callout default = ignore callouts @raise Not_found if pattern does not match. *) val pmatch : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> string -> bool (** [pmatch ?iflags ?flags ?rex ?pat ?pos ?callout subj] @return [true] if [subj] is matched by pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos]. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) (** {6 String substitution} *) (** Information on substitution patterns *) type substitution val subst : string -> substitution (** [subst str] converts the string [str] representing a substitution pattern to the internal representation The contents of the substitution string [str] can be normal text mixed with any of the following (mostly as in PERL): - {e $\[0-9\]+} - a "$" immediately followed by an arbitrary number. "$0" stands for the name of the executable, any other number for the n-th backreference. - {e $&} - the whole matched pattern - {e $`} - the text before the match - {e $'} - the text after the match - {e $+} - the last group that matched - {e $$} - a single "$" - {e $!} - delimiter which does not appear in the substitution. Can be used to part "$[0-9]+" from an immediately following other number. 
*) val replace : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?itempl : substitution -> ?templ : string -> ?callout : callout -> string -> string (** [replace ?iflags ?flags ?rex ?pat ?pos ?itempl ?templ ?callout subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the substitution string [templ] when given, [itempl] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param itempl default = empty string @param templ default = ignored @param callout default = ignore callouts @raise Failure if there are backreferences to nonexistent subpatterns. *) val qreplace : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?templ : string -> ?callout : callout -> string -> string (** [qreplace ?iflags ?flags ?rex ?pat ?pos ?templ ?callout subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the string [templ]. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param templ default = ignored @param callout default = ignore callouts *) val substitute_substrings : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> subst : (substrings -> string) -> string -> string (** [substitute_substrings ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the substrings of the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) val substitute : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> subst : (string -> string) -> string -> string (** [substitute ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces all substrings of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) val replace_first : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?itempl : substitution -> ?templ : string -> ?callout : callout -> string -> string (** [replace_first ?iflags ?flags ?rex ?pat ?pos ?itempl ?templ ?callout subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the substitution string [templ] when given, [itempl] otherwise. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param itempl default = empty string @param templ default = ignored @param callout default = ignore callouts @raise Failure if there are backreferences to nonexistent subpatterns. *) val qreplace_first : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?templ : string -> ?callout : callout -> string -> string (** [qreplace_first ?iflags ?flags ?rex ?pat ?pos ?templ ?callout subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the string [templ]. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param templ default = ignored @param callout default = ignore callouts *) val substitute_substrings_first : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> subst : (substrings -> string) -> string -> string (** [substitute_substrings_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the substrings of the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) val substitute_first : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?callout : callout -> subst : (string -> string) -> string -> string (** [substitute_first ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj] replaces the first substring of [subj] matching pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos] with the result of function [subst] applied to the match. Uses [flags] when given, the precompiled [iflags] otherwise. Callouts are handled by [callout]. 
@param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param callout default = ignore callouts *) (** {6 Splitting} *) val split : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?max : int -> ?callout : callout -> string -> string list (** [split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj] splits [subj] into a list of at most [max] strings, using as delimiter pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos]. Uses [flags] when given, the precompiled [iflags] otherwise. If [max] is zero, trailing empty fields are stripped. If it is negative, it is treated as arbitrarily large. If neither [pat] nor [rex] are specified, leading whitespace will be stripped! Should behave exactly as in PERL. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param max default = 0 @param callout default = ignore callouts *) val asplit : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?max : int -> ?callout : callout -> string -> string array (** [asplit ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj] same as {!Pcre.split} but @return an array instead of a list. 
*) (** Result of a {!Pcre.full_split} *) type split_result = Text of string (** Text part of split string *) | Delim of string (** Delimiter part of split string *) | Group of int * string (** Subgroup of matched delimiter (subgroup_nr, subgroup_str) *) | NoGroup (** Unmatched subgroup *) val full_split : ?iflags : irflag -> ?flags : rflag list -> ?rex : regexp -> ?pat : string -> ?pos : int -> ?max : int -> ?callout : callout -> string -> split_result list (** [full_split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj] splits [subj] into a list of at most [max] elements of type "split_result", using as delimiter pattern [pat] when given, regular expression [rex] otherwise, starting at position [pos]. Uses [flags] when given, the precompiled [iflags] otherwise. If [max] is zero, trailing empty fields are stripped. If it is negative, it is treated as arbitrarily large. Should behave exactly as in PERL. Callouts are handled by [callout]. @param iflags default = no extra flags @param flags default = ignored @param rex default = matches whitespace @param pat default = ignored @param pos default = 0 @param max default = 0 @param callout default = ignore callouts *) (** {6 Additional convenience functions} *) val foreach_line : ?ic : in_channel -> (string -> unit) -> unit (** [foreach_line ?ic f] applies [f] to each line in inchannel [ic] until the end-of-file is reached. @param ic default = stdin *) val foreach_file : string list -> (string -> in_channel -> unit) -> unit (** [foreach_file filenames f] opens each file in the list [filenames] for input and applies [f] to each filename and the corresponding channel. Channels are closed after each operation (even when exceptions occur - they get reraised afterwards!). *) (** {6 {b UNSAFE STUFF - USE WITH CAUTION!}} *) val unsafe_pcre_exec : irflag -> regexp -> pos : int -> subj_start : int -> subj : string -> int array -> callout option -> unit (** [unsafe_pcre_exec flags rex ~pos ~subj_start ~subj offset_vector callout]. 
You should read the C-source to know what happens. If you do not understand it - {b don't use this function!} *) val make_ovector : regexp -> int * int array (** [make_ovector regexp] calculates the tuple (subgroups2, ovector) which is the number of subgroup offsets and the offset array. *) val unsafe_pcre_dfa_exec : irflag -> regexp -> pos : int -> subj_start : int -> subj : string -> int array -> callout option -> workspace : int array -> unit (** [unsafe_pcre_dfa_exec flags rex ~pos ~subj_start ~subj offset_vector callout ~workspace]. You should read the C-source to know what happens. If you do not understand it - {b don't use this function!} *) pcre-ocaml-7.4.3/src/pcre_stubs.c000066400000000000000000000672461355540622000166740ustar00rootroot00000000000000/* PCRE-OCAML - Perl Compatibility Regular Expressions for OCaml Copyright (C) 1999- Markus Mottl email: markus.mottl@gmail.com WWW: http://www.ocaml.info This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #if defined(_WIN32) # define snprintf _snprintf # if defined(_DLL) # define PCREextern __declspec(dllexport) # else # define PCREextern # endif #endif #if _WIN64 typedef long long *caml_int_ptr; #else typedef long *caml_int_ptr; #endif #if __GNUC__ >= 3 # define inline inline __attribute__ ((always_inline)) # define __unused __attribute__ ((unused)) #else # define __unused # define inline #endif #include #include #include #include #include #include #include #include #include #include /* Error codes as defined for pcre 7.9, undefined in pcre 4.5 */ #ifndef PCRE_ERROR_PARTIAL #define PCRE_ERROR_PARTIAL (-12) #endif #ifndef PCRE_ERROR_BADPARTIAL #define PCRE_ERROR_BADPARTIAL (-13) #endif #ifndef PCRE_ERROR_RECURSIONLIMIT #define PCRE_ERROR_RECURSIONLIMIT (-21) #endif typedef const unsigned char *chartables; /* Type of chartable sets */ /* Contents of callout data */ struct cod { long subj_start; /* Start of subject string */ value *v_substrings_p; /* Pointer to substrings matched so far */ value *v_cof_p; /* Pointer to callout function */ value v_exn; /* Possible exception raised by callout function */ }; /* Cache for exceptions */ static const value *pcre_exc_Error = NULL; /* Exception [Error] */ static const value *pcre_exc_Backtrack = NULL; /* Exception [Backtrack] */ /* Cache for polymorphic variants */ static value var_Start_only; /* Variant [`Start_only] */ static value var_ANCHORED; /* Variant [`ANCHORED] */ static value var_Char; /* Variant [`Char char] */ static value var_Not_studied; /* Variant [`Not_studied] */ static value var_Studied; /* Variant [`Studied] */ static value var_Optimal; /* Variant [`Optimal] */ static value None = Val_int(0); /* Data associated with OCaml values of PCRE regular expression */ struct pcre_ocaml_regexp { pcre *rex; 
pcre_extra *extra; int studied; }; #define Pcre_ocaml_regexp_val(v) \ ((struct pcre_ocaml_regexp *) Data_custom_val(v)) #define get_rex(v) Pcre_ocaml_regexp_val(v)->rex #define get_extra(v) Pcre_ocaml_regexp_val(v)->extra #define get_studied(v) Pcre_ocaml_regexp_val(v)->studied #define set_rex(v, r) Pcre_ocaml_regexp_val(v)->rex = r #define set_extra(v, e) Pcre_ocaml_regexp_val(v)->extra = e #define set_studied(v, s) Pcre_ocaml_regexp_val(v)->studied = s /* Data associated with OCaml values of PCRE tables */ struct pcre_ocaml_tables { chartables tables; }; #define Pcre_ocaml_tables_val(v) \ ((struct pcre_ocaml_tables *) Data_custom_val(v)) #define get_tables(v) Pcre_ocaml_tables_val(v)->tables #define set_tables(v, t) Pcre_ocaml_tables_val(v)->tables = t /* Converts subject offsets from C-integers to OCaml-Integers. This is a bit tricky, because there are 32- and 64-bit platforms around and OCaml chooses the larger possibility for representing integers when available (also in arrays) - not so the PCRE! 
*/
/* Stores [subgroups2] offsets as tagged OCaml integers, walking BACKWARDS:
   [ovec_src] and [ovec_dst] must point at the LAST element to be converted,
   and both are decremented after every store.  When [subj_start] is
   non-zero it is added to each offset before tagging. */
static inline void copy_ovector(
  long subj_start, const int *ovec_src, caml_int_ptr ovec_dst, int subgroups2)
{
  if (subj_start == 0)
    while (subgroups2--) {
      *ovec_dst = Val_int(*ovec_src);
      --ovec_src; --ovec_dst;
    }
  else
    while (subgroups2--) {
      *ovec_dst = Val_long(*ovec_src + subj_start);
      --ovec_src; --ovec_dst;
    }
}

/* Callout handler

   Installed as [pcre_callout] by [pcre_ocaml_init].  Does nothing and
   returns 0 when the callout block carries no [struct cod].  Otherwise it
   builds an 8-field OCaml record describing the callout and applies the
   OCaml callout function to it.

   Returns 0 if the callback returned normally, 1 if it raised the cached
   [Pcre.Backtrack] exception, and PCRE_ERROR_CALLOUT for any other
   exception, which is saved in [cod->v_exn]. */
static int pcre_callout_handler(pcre_callout_block* cb)
{
  struct cod *cod = (struct cod *) cb->callout_data;

  if (cod != NULL) {
    /* Callout is available */
    value v_res;

    /* Set up parameter array; the substrings value is read through
       [cod->v_substrings_p] only after this allocation */
    value v_callout_data = caml_alloc_small(8, 0);

    const value v_substrings = *cod->v_substrings_p;

    const int capture_top = cb->capture_top;
    int subgroups2 = capture_top << 1;
    const int subgroups2_1 = subgroups2 - 1;

    /* Both cursors start at the last offset, as copy_ovector requires */
    const int *ovec_src = cb->offset_vector + subgroups2_1;
    caml_int_ptr ovec_dst = &Field(Field(v_substrings, 1), 0) + subgroups2_1;
    long subj_start = cod->subj_start;

    copy_ovector(subj_start, ovec_src, ovec_dst, subgroups2);

    Field(v_callout_data, 0) = Val_int(cb->callout_number);
    Field(v_callout_data, 1) = v_substrings;
    Field(v_callout_data, 2) = Val_int(cb->start_match + subj_start);
    Field(v_callout_data, 3) = Val_int(cb->current_position + subj_start);
    Field(v_callout_data, 4) = Val_int(capture_top);
    Field(v_callout_data, 5) = Val_int(cb->capture_last);
    Field(v_callout_data, 6) = Val_int(cb->pattern_position);
    Field(v_callout_data, 7) = Val_int(cb->next_item_length);

    /* Perform callout */
    v_res = caml_callback_exn(*cod->v_cof_p, v_callout_data);

    if (Is_exception_result(v_res)) {
      /* Callout raised an exception */
      const value v_exn = Extract_exception(v_res);
      if (Field(v_exn, 0) == *pcre_exc_Backtrack) return 1;
      cod->v_exn = v_exn;
      return PCRE_ERROR_CALLOUT;
    }
  }

  return 0;
}

/* Fetches the named OCaml-values + caches them and
   calculates + caches the variant hash values.
   Also installs [pcre_callout_handler] as the global PCRE callout. */
CAMLprim value pcre_ocaml_init(value __unused v_unit)
{
  pcre_exc_Error     = caml_named_value("Pcre.Error");
  pcre_exc_Backtrack = caml_named_value("Pcre.Backtrack");

  var_Start_only  = caml_hash_variant("Start_only");
  var_ANCHORED    = caml_hash_variant("ANCHORED");
  var_Char        = caml_hash_variant("Char");
  var_Not_studied = caml_hash_variant("Not_studied");
  var_Studied     = caml_hash_variant("Studied");
  var_Optimal     = caml_hash_variant("Optimal");

  pcre_callout = &pcre_callout_handler;

  return Val_unit;
}

/* Finalizing deallocation function for chartable sets */
static void pcre_dealloc_tables(value v_tables)
{ (pcre_free)((void *) get_tables(v_tables)); }

/* Finalizing deallocation function for compiled regular expressions;
   releases the extra data (when present) and then the compiled pattern */
static void pcre_dealloc_regexp(value v_rex)
{
  void *extra = get_extra(v_rex);
  if (extra != NULL)
    /* pcre_free_study is used whenever this PCRE defines
       PCRE_STUDY_JIT_COMPILE, plain pcre_free otherwise */
#ifdef PCRE_STUDY_JIT_COMPILE
    pcre_free_study(extra);
#else
    pcre_free(extra);
#endif
  (pcre_free)(get_rex(v_rex));
}


/* Raising exceptions

   Every helper below raises the cached [Pcre.Error] exception and never
   returns.  The constant error constructors are passed as [Val_int(n)]. */

CAMLnoreturn_start
static inline void raise_pcre_error(value v_arg)
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_partial()
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_bad_partial()
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_bad_utf8()
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_bad_utf8_offset()
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_match_limit()
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_recursion_limit()
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_workspace_size()
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_bad_pattern(const char *msg, int pos)
CAMLnoreturn_end;

CAMLnoreturn_start
static inline void raise_internal_error(char *msg)
CAMLnoreturn_end;

/* Raises [Pcre.Error v_arg] */
static inline void raise_pcre_error(value v_arg)
{ caml_raise_with_arg(*pcre_exc_Error, v_arg); }

/* Raises [Pcre.Error] with constant constructor 0 */
static inline void raise_partial()
{ raise_pcre_error(Val_int(0)); }

/* Raises [Pcre.Error] with constant constructor 1 */
static inline void raise_bad_partial()
{ raise_pcre_error(Val_int(1)); }

/* Raises [Pcre.Error] with constant constructor 2 */
static inline void raise_bad_utf8()
{ raise_pcre_error(Val_int(2)); }

/* Raises [Pcre.Error] with constant constructor 3 */
static inline void raise_bad_utf8_offset()
{ raise_pcre_error(Val_int(3)); }

static
inline void raise_match_limit() { raise_pcre_error(Val_int(4)); } static inline void raise_recursion_limit() { raise_pcre_error(Val_int(5)); } static inline void raise_workspace_size() { raise_pcre_error(Val_int(6)); } static inline void raise_bad_pattern(const char *msg, int pos) { CAMLparam0(); CAMLlocal1(v_msg); value v_arg; v_msg = caml_copy_string(msg); v_arg = caml_alloc_small(2, 0); Field(v_arg, 0) = v_msg; Field(v_arg, 1) = Val_int(pos); raise_pcre_error(v_arg); CAMLnoreturn; } static inline void raise_internal_error(char *msg) { CAMLparam0(); CAMLlocal1(v_msg); value v_arg; v_msg = caml_copy_string(msg); v_arg = caml_alloc_small(1, 1); Field(v_arg, 0) = v_msg; raise_pcre_error(v_arg); CAMLnoreturn; } /* PCRE pattern compilation */ static struct custom_operations regexp_ops = { "pcre_ocaml_regexp", pcre_dealloc_regexp, custom_compare_default, custom_hash_default, custom_serialize_default, custom_deserialize_default, custom_compare_ext_default, custom_fixed_length_default }; /* Makes compiled regular expression from compilation options, an optional value of chartables and the pattern string */ CAMLprim value pcre_compile_stub(intnat v_opt, value v_tables, value v_pat) { value v_rex; /* Final result -> value of type [regexp] */ size_t regexp_size, ocaml_regexp_size = sizeof(struct pcre_ocaml_regexp); const char *error = NULL; /* pointer to possible error message */ int error_ofs = 0; /* offset in the pattern at which error occurred */ /* If v_tables = [None], then pointer to tables is NULL, otherwise set it to the appropriate value */ chartables tables = (v_tables == None) ? 
NULL : get_tables(Field(v_tables, 0)); /* Compiles the pattern */ pcre *regexp = pcre_compile(String_val(v_pat), v_opt, &error, &error_ofs, tables); /* Raises appropriate exception with [BadPattern] if the pattern could not be compiled */ if (regexp == NULL) raise_bad_pattern(error, error_ofs); /* It's unknown at this point whether the user will study the pattern later (probably), or if JIT compilation is going to be used, but we have to decide on a size. Tests with some simple patterns indicate a roughly 50% increase in size when studying without JIT. A factor of two times hence seems like a reasonable bound to use here. */ pcre_fullinfo(regexp, NULL, PCRE_INFO_SIZE, ®exp_size); v_rex = caml_alloc_custom_mem(®exp_ops, ocaml_regexp_size, 2*regexp_size); set_rex(v_rex, regexp); set_extra(v_rex, NULL); set_studied(v_rex, 0); return v_rex; } CAMLprim value pcre_compile_stub_bc(value v_opt, value v_tables, value v_pat) { return pcre_compile_stub(Int_val(v_opt), v_tables, v_pat); } /* Studies a regexp */ CAMLprim value pcre_study_stub(value v_rex) { /* If it has not yet been studied */ if (! 
get_studied(v_rex)) { const char *error = NULL; pcre_extra *extra = pcre_study(get_rex(v_rex), 0, &error); if (error != NULL) caml_invalid_argument((char *) error); set_extra(v_rex, extra); set_studied(v_rex, 1); } return v_rex; } /* Gets the match limit recursion of a regular expression if it exists */ CAMLprim value pcre_get_match_limit_recursion_stub(value v_rex) { pcre_extra *extra = get_extra(v_rex); if (extra == NULL) return None; if (extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) { value v_lim = Val_int(extra->match_limit_recursion); value v_res = caml_alloc_small(1, 0); Field(v_res, 0) = v_lim; return v_res; } return None; } /* Gets the match limit of a regular expression if it exists */ CAMLprim value pcre_get_match_limit_stub(value v_rex) { pcre_extra *extra = get_extra(v_rex); if (extra == NULL) return None; if (extra->flags & PCRE_EXTRA_MATCH_LIMIT) { value v_lim = Val_int(extra->match_limit); value v_res = caml_alloc_small(1, 0); Field(v_res, 0) = v_lim; return v_res; } return None; } /* Sets a match limit for a regular expression imperatively */ CAMLprim value pcre_set_imp_match_limit_stub(value v_rex, intnat v_lim) { pcre_extra *extra = get_extra(v_rex); if (extra == NULL) { extra = pcre_malloc(sizeof(pcre_extra)); extra->flags = PCRE_EXTRA_MATCH_LIMIT; set_extra(v_rex, extra); } else { unsigned long *flags_ptr = &extra->flags; *flags_ptr = PCRE_EXTRA_MATCH_LIMIT | *flags_ptr; } extra->match_limit = v_lim; return v_rex; } CAMLprim value pcre_set_imp_match_limit_stub_bc(value v_rex, value v_lim) { return pcre_set_imp_match_limit_stub(v_rex, Int_val(v_lim)); } /* Sets a match limit recursion for a regular expression imperatively */ CAMLprim value pcre_set_imp_match_limit_recursion_stub( value v_rex, intnat v_lim) { pcre_extra *extra = get_extra(v_rex); if (extra == NULL) { extra = pcre_malloc(sizeof(pcre_extra)); extra->flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION; set_extra(v_rex, extra); } else { unsigned long *flags_ptr = &extra->flags; *flags_ptr = 
PCRE_EXTRA_MATCH_LIMIT_RECURSION | *flags_ptr; } extra->match_limit_recursion = v_lim; return v_rex; } CAMLprim value pcre_set_imp_match_limit_recursion_stub_bc( value v_rex, value v_lim) { return pcre_set_imp_match_limit_recursion_stub(v_rex, Int_val(v_lim)); } /* Performs the call to the pcre_fullinfo function */ static inline int pcre_fullinfo_stub(value v_rex, int what, void *where) { return pcre_fullinfo(get_rex(v_rex), get_extra(v_rex), what, where); } /* Some stubs for info-functions */ /* Generic macro for getting integer results from pcre_fullinfo */ #define make_intnat_info(tp, name, option) \ CAMLprim intnat pcre_##name##_stub(value v_rex) \ { \ tp options; \ const int ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_##option, &options); \ if (ret != 0) raise_internal_error("pcre_##name##_stub"); \ return options; \ } \ \ CAMLprim value pcre_##name##_stub_bc(value v_rex) \ { return Val_int(pcre_##name##_stub(v_rex)); } make_intnat_info(unsigned long, options, OPTIONS) make_intnat_info(size_t, size, SIZE) make_intnat_info(size_t, studysize, STUDYSIZE) make_intnat_info(int, capturecount, CAPTURECOUNT) make_intnat_info(int, backrefmax, BACKREFMAX) make_intnat_info(int, namecount, NAMECOUNT) make_intnat_info(int, nameentrysize, NAMEENTRYSIZE) CAMLprim value pcre_firstbyte_stub(value v_rex) { int firstbyte; const int ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_FIRSTBYTE, &firstbyte); if (ret != 0) raise_internal_error("pcre_firstbyte_stub"); switch (firstbyte) { case -1 : return var_Start_only; break; /* [`Start_only] */ case -2 : return var_ANCHORED; break; /* [`ANCHORED] */ default : if (firstbyte < 0 ) /* Should not happen */ raise_internal_error("pcre_firstbyte_stub"); else { value v_firstbyte; /* Allocates the non-constant constructor [`Char of char] and fills in the appropriate value */ v_firstbyte = caml_alloc_small(2, 0); Field(v_firstbyte, 0) = var_Char; Field(v_firstbyte, 1) = Val_int(firstbyte); return v_firstbyte; } } } CAMLprim value 
pcre_firsttable_stub(value v_rex)
{
  /* Returns [Some table] holding a copy of the 32-byte first-character
     table reported by PCRE_INFO_FIRSTTABLE, or [None] if there is none */
  const unsigned char *first_table;
  value v_some, v_table;
  char *dst;

  if (pcre_fullinfo_stub(v_rex, PCRE_INFO_FIRSTTABLE,
                         (void *) &first_table) != 0)
    raise_internal_error("pcre_firsttable_stub");

  if (first_table == NULL) return None;

  /* The regexp is registered as a root while the string is allocated */
  Begin_roots1(v_rex);
    v_table = caml_alloc_string(32);
  End_roots();

  dst = String_val(v_table);
  memcpy(dst, first_table, 32);

  Begin_roots1(v_table);
    /* Allocates [Some string] from firsttable */
    v_some = caml_alloc_small(1, 0);
  End_roots();

  Field(v_some, 0) = v_table;

  return v_some;
}

/* Returns [Some c] for the last literal byte reported by
   PCRE_INFO_LASTLITERAL, or [None] when PCRE reports -1 */
CAMLprim value pcre_lastliteral_stub(value v_rex)
{
  int last_lit;
  value v_some;

  if (pcre_fullinfo_stub(v_rex, PCRE_INFO_LASTLITERAL, &last_lit) != 0)
    raise_internal_error("pcre_lastliteral_stub");

  if (last_lit == -1) return None;

  /* Any other negative value is unexpected */
  if (last_lit < 0) raise_internal_error("pcre_lastliteral_stub");

  /* Allocates [Some char] */
  v_some = caml_alloc_small(1, 0);
  Field(v_some, 0) = Val_int(last_lit);
  return v_some;
}

CAMLprim value pcre_study_stat_stub(value v_rex)
{
  /* Generates the appropriate constant constructor [`Optimal] or
     [`Studied] if regexp has already been studied */
  if (get_studied(v_rex))
    return (get_extra(v_rex) == NULL) ?
var_Optimal : var_Studied; return var_Not_studied; /* otherwise [`Not_studied] */ } CAMLnoreturn_start static inline void handle_exec_error(char *loc, const int ret) CAMLnoreturn_end; static inline void handle_exec_error(char *loc, const int ret) { switch (ret) { /* Dedicated exceptions */ case PCRE_ERROR_NOMATCH : caml_raise_not_found(); case PCRE_ERROR_PARTIAL : raise_partial(); case PCRE_ERROR_MATCHLIMIT : raise_match_limit(); case PCRE_ERROR_BADPARTIAL : raise_bad_partial(); case PCRE_ERROR_BADUTF8 : raise_bad_utf8(); case PCRE_ERROR_BADUTF8_OFFSET : raise_bad_utf8_offset(); case PCRE_ERROR_RECURSIONLIMIT : raise_recursion_limit(); case PCRE_ERROR_DFA_WSSIZE : raise_workspace_size(); /* Unknown error */ default : { char err_buf[100]; snprintf(err_buf, 100, "%s: unhandled PCRE error code: %d", loc, ret); raise_internal_error(err_buf); } } } static inline void handle_pcre_exec_result( int *ovec, value v_ovec, long ovec_len, long subj_start, int ret) { caml_int_ptr ocaml_ovec = (caml_int_ptr) &Field(v_ovec, 0); const int subgroups2 = ret * 2; const int subgroups2_1 = subgroups2 - 1; const int *ovec_src = ovec + subgroups2_1; caml_int_ptr ovec_clear_stop = ocaml_ovec + (ovec_len * 2) / 3; caml_int_ptr ovec_dst = ocaml_ovec + subgroups2_1; copy_ovector(subj_start, ovec_src, ovec_dst, subgroups2); while (++ovec_dst < ovec_clear_stop) *ovec_dst = -1; } /* Executes a pattern match with runtime options, a regular expression, a matching position, the start of the the subject string, a subject string, a number of subgroup offsets, an offset vector and an optional callout function */ CAMLprim value pcre_exec_stub0( intnat v_opt, value v_rex, intnat v_pos, intnat v_subj_start, value v_subj, value v_ovec, value v_maybe_cof, value v_workspace) { int ret; int is_dfa = v_workspace != (value) NULL; long pos = v_pos, len = caml_string_length(v_subj), subj_start = v_subj_start; long ovec_len = Wosize_val(v_ovec); if (pos > len || pos < subj_start) 
caml_invalid_argument("Pcre.pcre_exec_stub: illegal position"); if (subj_start > len || subj_start < 0) caml_invalid_argument("Pcre.pcre_exec_stub: illegal subject start"); pos -= subj_start; len -= subj_start; { const pcre *code = get_rex(v_rex); /* Compiled pattern */ const pcre_extra *extra = get_extra(v_rex); /* Extra info */ const char *ocaml_subj = String_val(v_subj) + subj_start; /* Subject string */ const int opt = v_opt; /* Runtime options */ /* Special case when no callout functions specified */ if (v_maybe_cof == None) { int *ovec = (int *) &Field(v_ovec, 0); /* Performs the match */ if (is_dfa) ret = pcre_dfa_exec(code, extra, ocaml_subj, len, pos, opt, ovec, ovec_len, (int *) &Field(v_workspace, 0), Wosize_val(v_workspace)); else ret = pcre_exec(code, extra, ocaml_subj, len, pos, opt, ovec, ovec_len); if (ret < 0) handle_exec_error("pcre_exec_stub", ret); else handle_pcre_exec_result(ovec, v_ovec, ovec_len, subj_start, ret); } /* There are callout functions */ else { value v_cof = Field(v_maybe_cof, 0); value v_substrings; char *subj = caml_stat_alloc(sizeof(char) * len); int *ovec = caml_stat_alloc(sizeof(int) * ovec_len); int workspace_len; int *workspace; struct cod cod = { 0, (value *) NULL, (value *) NULL, (value) NULL }; struct pcre_extra new_extra = #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION # ifdef PCRE_EXTRA_MARK # ifdef PCRE_EXTRA_EXECUTABLE_JIT { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, NULL, NULL, 0, NULL, NULL }; # else { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, NULL, NULL, 0, NULL }; # endif # else { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, NULL, NULL, 0 }; # endif #else { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, NULL, NULL }; #endif cod.subj_start = subj_start; memcpy(subj, ocaml_subj, len); Begin_roots4(v_rex, v_cof, v_substrings, v_ovec); Begin_roots1(v_subj); v_substrings = caml_alloc_small(2, 0); End_roots(); Field(v_substrings, 0) = v_subj; Field(v_substrings, 1) = v_ovec; cod.v_substrings_p = &v_substrings; cod.v_cof_p = &v_cof; new_extra.callout_data = &cod; if 
(extra != NULL) { new_extra.flags = PCRE_EXTRA_CALLOUT_DATA | extra->flags; new_extra.study_data = extra->study_data; new_extra.match_limit = extra->match_limit; new_extra.tables = extra->tables; #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION new_extra.match_limit_recursion = extra->match_limit_recursion; #endif } if (is_dfa) { workspace_len = Wosize_val(v_workspace); workspace = caml_stat_alloc(sizeof(int) * workspace_len); ret = pcre_dfa_exec(code, extra, subj, len, pos, opt, ovec, ovec_len, (int *) &Field(v_workspace, 0), workspace_len); } else ret = pcre_exec(code, &new_extra, subj, len, pos, opt, ovec, ovec_len); caml_stat_free(subj); End_roots(); if (ret < 0) { if (is_dfa) caml_stat_free(workspace); caml_stat_free(ovec); if (ret == PCRE_ERROR_CALLOUT) caml_raise(cod.v_exn); else handle_exec_error("pcre_exec_stub(callout)", ret); } else { handle_pcre_exec_result(ovec, v_ovec, ovec_len, subj_start, ret); if (is_dfa) { caml_int_ptr ocaml_workspace_dst = (caml_int_ptr) &Field(v_workspace, 0); const int *workspace_src = workspace; const int *workspace_src_stop = workspace + workspace_len; while (workspace_src != workspace_src_stop) { *ocaml_workspace_dst = *workspace_src; ocaml_workspace_dst++; workspace_src++; } caml_stat_free(workspace); } caml_stat_free(ovec); } } } return Val_unit; } CAMLprim value pcre_exec_stub( intnat v_opt, value v_rex, intnat v_pos, intnat v_subj_start, value v_subj, value v_ovec, value v_maybe_cof) { return pcre_exec_stub0(v_opt, v_rex, v_pos, v_subj_start, v_subj, v_ovec, v_maybe_cof, (value) NULL); } /* Byte-code hook for pcre_exec_stub Needed, because there are more than 5 arguments */ CAMLprim value pcre_exec_stub_bc(value *argv, int __unused argn) { return pcre_exec_stub0( Int_val(argv[0]), argv[1], Int_val(argv[2]), Int_val(argv[3]), argv[4], argv[5], argv[6], (value) NULL); } /* Byte-code hook for pcre_dfa_exec_stub Needed, because there are more than 5 arguments */ CAMLprim value pcre_dfa_exec_stub_bc(value *argv, int __unused argn) { 
return pcre_exec_stub0( Int_val(argv[0]), argv[1], Int_val(argv[2]), Int_val(argv[3]), argv[4], argv[5], argv[6], argv[7]); } static struct custom_operations tables_ops = { "pcre_ocaml_tables", pcre_dealloc_tables, custom_compare_default, custom_hash_default, custom_serialize_default, custom_deserialize_default, custom_compare_ext_default, custom_fixed_length_default }; /* Generates a new set of chartables for the current locale (see man page of PCRE */ CAMLprim value pcre_maketables_stub(value __unused v_unit) { /* According to testing with `malloc_size`, it seems that a typical set of tables will require about 1536 bytes of memory. This may or may not be true on other platforms or for all versions of PCRE. Since there is apparently no reliable way of finding out, 1536 is probably a good default value. */ size_t tables_size = sizeof(struct pcre_ocaml_tables); const value v_tables = caml_alloc_custom_mem(&tables_ops, tables_size, 1536); set_tables(v_tables, pcre_maketables()); return v_tables; } /* Wraps around the isspace-function */ CAMLprim value pcre_isspace_stub(value v_c) { return Val_bool(isspace(Int_val(v_c))); } /* Returns number of substring associated with a name */ CAMLprim intnat pcre_get_stringnumber_stub(value v_rex, value v_name) { const int ret = pcre_get_stringnumber(get_rex(v_rex), String_val(v_name)); if (ret == PCRE_ERROR_NOSUBSTRING) caml_invalid_argument("Named string not found"); return ret; } CAMLprim value pcre_get_stringnumber_stub_bc(value v_rex, value v_name) { return Val_int(pcre_get_stringnumber_stub(v_rex, v_name)); } /* Returns array of names of named substrings in a regexp */ CAMLprim value pcre_names_stub(value v_rex) { CAMLparam0(); CAMLlocal1(v_res); int name_count; int entry_size; const char *tbl_ptr; int i; int ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_NAMECOUNT, &name_count); if (ret != 0) raise_internal_error("pcre_names_stub: namecount"); ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_NAMEENTRYSIZE, &entry_size); if (ret != 0) 
raise_internal_error("pcre_names_stub: nameentrysize"); ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_NAMETABLE, &tbl_ptr); if (ret != 0) raise_internal_error("pcre_names_stub: nametable"); v_res = caml_alloc(name_count, 0); for (i = 0; i < name_count; ++i) { value v_name = caml_copy_string(tbl_ptr + 2); Store_field(v_res, i, v_name); tbl_ptr += entry_size; } CAMLreturn(v_res); } /* Generic stub for getting integer results from pcre_config */ static inline int pcre_config_int(int what) { int ret; pcre_config(what, (void *) &ret); return ret; } /* Generic stub for getting long integer results from pcre_config */ static inline int pcre_config_long(int what) { long ret; pcre_config(what, (void *) &ret); return ret; } /* Some stubs for config-functions */ /* Makes OCaml-string from PCRE-version */ CAMLprim value pcre_version_stub(value __unused v_unit) { return caml_copy_string((char *) pcre_version()); } /* Returns boolean indicating UTF8-support */ CAMLprim value pcre_config_utf8_stub(value __unused v_unit) { return Val_bool(pcre_config_int(PCRE_CONFIG_UTF8)); } /* Returns character used as newline */ CAMLprim value pcre_config_newline_stub(value __unused v_unit) { return Val_int(pcre_config_int(PCRE_CONFIG_NEWLINE)); } /* Returns number of bytes used for internal linkage of regular expressions */ CAMLprim intnat pcre_config_link_size_stub(value __unused v_unit) { return pcre_config_int(PCRE_CONFIG_LINK_SIZE); } CAMLprim value pcre_config_link_size_stub_bc(value v_unit) { return Val_int(pcre_config_link_size_stub(v_unit)); } /* Returns default limit for calls to internal matching function */ CAMLprim intnat pcre_config_match_limit_stub(value __unused v_unit) { return pcre_config_long(PCRE_CONFIG_MATCH_LIMIT); } CAMLprim value pcre_config_match_limit_stub_bc(value v_unit) { return Val_int(pcre_config_match_limit_stub(v_unit)); } /* Returns default limit for recursive calls to internal matching function */ CAMLprim intnat pcre_config_match_limit_recursion_stub(value 
__unused v_unit) { return pcre_config_long(PCRE_CONFIG_MATCH_LIMIT_RECURSION); } CAMLprim value pcre_config_match_limit_recursion_stub_bc(value v_unit) { return Val_int(pcre_config_match_limit_recursion_stub(v_unit)); } /* Returns boolean indicating use of stack recursion */ CAMLprim value pcre_config_stackrecurse_stub(value __unused v_unit) { return Val_bool(pcre_config_int(PCRE_CONFIG_STACKRECURSE)); }