weighted-regexp-0.3.1.2/ 0000755 0000000 0000000 00000000000 11716565266 013150 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/weighted-regexp.cabal 0000644 0000000 0000000 00000011005 11716565266 017221 0 ustar 00 0000000 0000000 Name: weighted-regexp
Version: 0.3.1.2
Cabal-Version: >= 1.6
Synopsis: Weighted Regular Expression Matcher
Description:
Haskell implementation of a weighted regular expression
matcher with linear worst-case time and space bounds.
Category: Text, Parsing
License: BSD3
License-File: LICENSE
Author: Thomas Wilke, Frank Huch, Sebastian Fischer
Maintainer: Sebastian Fischer
Bug-Reports: http://github.com/sebfisch/haskell-regexp/issues
Homepage: http://sebfisch.github.com/haskell-regexp
Build-Type: Simple
Stability: experimental
Extra-Source-Files: README.markdown CHANGES.markdown
Library
Build-Tools: happy >= 1.17
Build-Depends: base >= 3 && < 5,
array >= 0.1 && < 0.5
HS-Source-Dirs: src
Exposed-Modules: Text.RegExp,
Text.RegExp.Matching.Leftmost,
Text.RegExp.Matching.Longest,
Text.RegExp.Matching.LeftLong,
Text.RegExp.Internal,
Data.Semiring,
Data.Semiring.Properties
Other-Modules: Text.RegExp.Data,
Text.RegExp.Parser,
Text.RegExp.Matching,
Text.RegExp.Matching.Leftmost.Type,
Text.RegExp.Matching.Longest.Type,
Text.RegExp.Matching.LeftLong.Type
Extensions: RankNTypes,
BangPatterns,
FlexibleContexts,
FlexibleInstances,
MultiParamTypeClasses,
NoMonomorphismRestriction,
GeneralizedNewtypeDeriving
Flag QuickCheck
Description: Build executable to run QuickCheck tests
Default: False
Executable quickcheck-re
Main-Is: quickcheck.lhs
If flag(QuickCheck)
Build-Depends: base >= 3 && < 5, QuickCheck < 2
Else
Buildable: False
HS-Source-Dirs: src
Other-Modules: Text.RegExp,
Text.RegExp.Matching.Leftmost,
Text.RegExp.Matching.Longest,
Text.RegExp.Matching.LeftLong,
Data.Semiring,
Data.Semiring.Properties
Text.RegExp.Internal,
Text.RegExp.Data,
Text.RegExp.Parser,
Text.RegExp.Matching,
Text.RegExp.Matching.Leftmost.Type,
Text.RegExp.Matching.Longest.Type,
Text.RegExp.Matching.LeftLong.Type
Extensions: RankNTypes,
BangPatterns
FlexibleContexts,
FlexibleInstances,
MultiParamTypeClasses,
NoMonomorphismRestriction,
GeneralizedNewtypeDeriving,
OverloadedStrings,
ScopedTypeVariables
GHC-Options: -fhpc -fno-warn-missing-methods -fno-warn-orphans
Flag Criterion
Description: Build executable to run Criterion benchmarks
Default: False
Executable criterion-re
Main-Is: criterion.lhs
If flag(Criterion)
Build-Depends: base >= 3 && < 5, criterion >= 0.5 && < 0.6
Else
Buildable: False
HS-Source-Dirs: src
Other-Modules: Text.RegExp,
Text.RegExp.Matching.Leftmost,
Text.RegExp.Matching.Longest,
Text.RegExp.Matching.LeftLong,
Data.Semiring,
Text.RegExp.Internal,
Text.RegExp.Data,
Text.RegExp.Parser,
Text.RegExp.Matching,
Text.RegExp.Matching.Leftmost.Type,
Text.RegExp.Matching.Longest.Type,
Text.RegExp.Matching.LeftLong.Type
Extensions: RankNTypes,
BangPatterns,
FlexibleContexts,
FlexibleInstances,
MultiParamTypeClasses,
NoMonomorphismRestriction,
GeneralizedNewtypeDeriving,
OverloadedStrings
GHC-Options:
Source-Repository head
type: git
location: git://github.com/sebfisch/haskell-regexp.git
weighted-regexp-0.3.1.2/Setup.hs 0000644 0000000 0000000 00000000075 11716565266 014606 0 ustar 00 0000000 0000000 import Distribution.Simple
main :: IO ()
main = defaultMain
weighted-regexp-0.3.1.2/CHANGES.markdown 0000644 0000000 0000000 00000010101 11716565266 015755 0 ustar 00 0000000 0000000 % Changelog for [`weighted-regexp`]
[`weighted-regexp`]: http://hackage.haskell.org/package/weighted-regexp
# 0.3.1.1
## Use `BangPatterns` language extension
Added the `BangPatterns` language extension to the cabal file because
without it the generated parser fails to build using GHC 7.2.
# 0.3.1
## Expose internal data types and matching functions
Added new module `Text.RegExp.Internal` that exposes internal data
types and matching functions. Users probably don't want to use it
unless they implement their own matching functions.
# 0.3.0.1
## Conditional build dependencies
Moved build dependencies for QuickCheck and Criterion test programs
under a conditional so they are only pulled in if one actually
compiles these programs using the flags `-fQuickCheck` or
`-fCriterion`. (Thank you Brent!)
# 0.3.0.0
## Implemented workaround for [GHC ticket 4227]
Currently, GHC can SPECIALIZE functions only where they are defined.
The types `Leftmost`, `Longest`, and `LeftLong` are now defined in
separate modules to bring them into the scope of the matching
functions. Specialization makes the matching functions almost three
times faster for the types mentioned above.
This workaround allows to specialize the matching functions for types
defined in this package. Users, however, must use the matching
functions unspecialized for their own types.
Along with this change, the constructors of the matching types are no
longer exported.
# 0.2.0.0
## More general types for matching functions
The functions `fullMatch` and `partialMatch` now both have the type
Weight a b w => RegExp a -> [b] -> w
whereas previously the signatures have been:
fullMatch :: Semiring w => RegExp c -> [c] -> w
partialMatch :: Weight c (Int,c) w => RegExp c -> [c] -> w
The change allows users to provide custom symbol weights in full
matchings and to do partial matchings with arbitrary symbols weights
instead of having to use only characters and their positions.
This generalization leads to a slight performance penalty in small
examples but has a negligible effect when matching large inputs.
## Renamed `accept` to `acceptFull`, added `acceptPartial`
Based on the more general `partialMatch` function, the function
`acceptPartial` was added for the `Bool` semiring. The `accept`
function has been appropriately renamed.
## Strict numeric semiring
The lazy definition of arithmetic operations for the `Numeric`
semiring has been dropped in favour of the more efficient standard
implementation. As a consequence, `matchingCount` no longer works with
infinite regular expressions.
## SPECIALIZE pragmas prevent memory leak
The generalization of the matching functions leads to a memory leak
that can be avoided by specializing them for concrete
semirings. Corresponding pragmas have been added for `Bool` and for
`Numeric` types but not for the more complex semirings defined in the
extra matching modules. It is unclear what is the best way to
specialize them too because the pragma must be placed in the module
where the matching functions are defined but, there, not all semirings
are in scope. See [GHC ticket 4227].
[GHC ticket 4227]: http://hackage.haskell.org/trac/ghc/ticket/4227
## Fixed mistake in Criterion benchmarks
In the group of partial matchings, the benchmark for `Bool`
accidentally used full matching. It now uses partial matching which,
unsurprisingly, is slower.
# 0.1.1.0
## added `noMatch`
`Text.RegExp` now provides a combinator
noMatch :: RegExp c
which is an identity of `alt`. With this combinator, regular
expressions form a semiring with
zero = noMatch
one = eps
(.+.) = alt
(.*.) = seq_
A corresponding `Semiring` instance is not defined due to the lack of
an appropriate `Eq` instance.
## added `perm`
`Text.RegExp` now provides a combinator
perm :: [RegExp c] -> RegExp c
that matches the given regular expressions in sequence. Each
expression must be matched exactly once but in arbitrary order. For
example, the regular expression
perm (map char "abc")
is equivalent to `abc|acb|bca|bac|cba|cab` and represented as
`a(bc|cb)|b(ca|ac)|c(ba|ab)`.
weighted-regexp-0.3.1.2/README.markdown 0000644 0000000 0000000 00000025367 11716565266 015666 0 ustar 00 0000000 0000000 % Weighted RegExp Matching
Efficient regular expression matching can be beautifully
simple. Revisiting ideas from theoretical computer science, it can be
implemented with linear worst-case time and space bounds in the purely
functional programming language [Haskell].
[Haskell]: http://hackage.haskell.org/platform/
[semirings]: http://en.wikipedia.org/wiki/Semiring
# Background
Since Plato wrote about philosophy in the form of [dialogues], authors
have used this literary form to convey their ideas. The 15th
[International Conference on Functional Programming][ICFP] features an
article on Regular Expressions written as a play, [A Play on Regular
Expressions][paper], which is meant to be [elegant, instructive, and
fun][Pearl]. The play discusses an efficient, purely functional
algorithm for matching regular expressions. By generalizing from
Booleans to arbitrary [semirings], this algorithm implements various
matching policies for weighted regular expressions.
[dialogues]: http://en.wikipedia.org/wiki/Socratic_dialogue
[ICFP]: http://www.icfpconference.org/icfp2010/
[Pearl]: http://web.cecs.pdx.edu/~apt/icfp09_cfp.html#pearls
[paper]: regexp-play.pdf
# Installation
An implementation of the ideas discussed in the Play on Regular
Expressions is available as a Haskell library. It is implemented in
pure Haskell rather than as a binding to an external library so you do
not need to install an external regular expression library to use it.
|
However, you need Haskell in order to use this library. By installing
the [Haskell Platform][Haskell] you get a Haskell compiler with an
interactive environment as well as the package manager `cabal-install`
and various pre-installed packages.
|
|
You can install the [`weighted-regexp`] library by typing the following
into a terminal:
[`weighted-regexp`]: http://hackage.haskell.org/package/weighted-regexp
bash# cabal update
bash# cabal install weighted-regexp
|
This will install the current version. Differences between versions
are listed in the [changelog].
[changelog]: http://sebfisch.github.com/haskell-regexp/CHANGES.html
# Correctness
The matching algorithm computes the same result as a simple inductive
specification (given in the [Play on Regular Expressions][paper]) but
is [more efficient](#performance) than a direct translation of this
specification into Haskell. Although the ideas behind the algorithm
are not new but based on proven results from theoretical computer
science, there is no correctness proof for the equivalence of the
Haskell implementation of the algorithm with its specification. The
equivalence is therefore confirmed by testing.
It is difficult (and tedious) to write tests manually that cover all
interesting apsects of regular expression matching. Therefore,
[QuickCheck] is used to generate tests automatically and [Haskell
Program Coverage (HPC)][HPC] is used to monitor test coverage.
[QuickCheck]: http://www.cse.chalmers.se/~rjmh/QuickCheck/
[HPC]: http://www.haskell.org/ghc/docs/latest/html/users_guide/hpc.html
You can install the `weighted-regexp` library along with a test
program as follows:
bash# cabal install weighted-regexp -fQuickCheck
Using the `QuickCheck` flag results in an additional program that you
can use to test the implementation. The program tests
* the algebraic laws of semirings for all defined semirings, and
* the equivalence of the matching algorithm with the specification
both for full and partial matchings.
For testing the equivalence, QuickCheck generates random regular
expressions and compares the result of the matching algorithm with the
result of its specification on random words. Moreover, the program
tests
* the parser that provides common syntactic sugar like bounded
repetitions and character classes,
* the use of the library to recognize non-regular languages using
infinite regular expressions, and
* a combinator for parsing permutation sequences, that is, sequences
of regular expressions in arbitrary order.
For a more detailed description of the tested properties consider the
[source code][quickcheck.lhs] of the test program. In order to
generate an HPC report you need to download the sources of the
`weighted-regexp` package. But you may as well consult the
[pregenerated coverage report][coverage] instead of generating one
yourself.
[quickcheck.lhs]: http://github.com/sebfisch/haskell-regexp/blob/master/src/quickcheck.lhs
[coverage]: http://sebfisch.github.com/haskell-regexp/quickcheck/hpc_index.html
# Performance
The matching algorithm provided by this library is usually slower than
other libraries like [pcre] but has a better asymptotic
complexity. There are no corner cases for which matching takes forever
or eats all available memory. More specifically, the worst-case run
time for matching a word against a regular expression is linearly
bounded by the length of the word and the size of the regular
expression. It is in *O(nm)* if *n* is the length of the word and *m*
the size of the expression. The memory requirements are independent of
the length of the word and linear in the size of the regular
expression, that is, in *O(m)*. Therefore, this library provides
similar asymptotic complexity guarantees as Google's [re2].
[pcre]: http://www.pcre.org/
[re2]: http://code.google.com/p/re2/
Here are timings that have been obtained (on a MacBook) with the
current version of the library.
input regexp run time memory
------------------- --------------------- -------------- --------
100 MB of a's `.*` 8s (12 MB/s) 1 MB
5000 a's `(a?){5000}a{5000}` 13s 5 MB
~2M a's and b's `.*a.{20}a.*` 3.6s 1 MB
The first example measures the search speed for a simple regular
expression with a long string. There is room for improvement. No time
has been invested yet to improve the performance of the library with
regard to constant factors.
The second example demonstrates the good asymptotic complexity of the
algorithm. Unlike a backtracking implementation like [pcre] the
library finishes in reasonable time. However, the memory requirements
are higher than usual and on closer inspection one can see that almost
10 of 13 seconds are spent during garbage collection. This example
uses a large regular expression which leads to a lot of garbage in the
matching algorithm.
The third example pushes automata based approaches to the limit
because the deterministic finite automaton that corresponds to the
regular expression is exponentially large. The input has been chosen
to not match the expression but is otherwise random and probably
explores many different states of the automaton. The matching
algorithm produces states on the fly and discards them, hence, it is
fast in this example, in fact, faster than re2[^cpp].
[^cpp]: The following C++ program uses the [re2] library and needs
*4s* to search for `a.{20}a` in a string of ~2M random a's ad b's:
Unlike the Haskell program, this program keeps the whole input,
that is, the result of `getline`, in memory. Can [re2] match input
on the fly?
The benchmarks above all use large input and two of them are
specifically designed as corner cases of typical matching
algorithms. The run time of matching more common regular expressions
against short input has been measured using [Criterion] in order to
get statistically robust results.
[Criterion]: http://www.serpentine.com/blog/2009/09/29/criterion-a-new-benchmarking-library-for-haskell/
You can install the `weighted-regexp` package with the `Criterion` flag to generate a program that executes the benchmarks described below:
bash# cabal install weighted-regexp -fCriterion
You can call `criterion-re --help` to see how to use the generated
program. It tests three different examples:
* a unique full match with a regular expression for phone numbers,
* an ambiguous full match with a regular expression for sequences of
HTML elements, and
* a partial match with a regular expression for protein sequences in
RNA.
For a more detailed explanation consider the [source
code][criterion.lhs] of the benchmark program.
[criterion.lhs]: http://github.com/sebfisch/haskell-regexp/blob/master/src/criterion.lhs
matching acceptance #matchings leftmost longest leftmost longest
--------------- ----------- ----------- ---------- ---------- -----------------
unique full [3.8 us] [4.8 us]
ambiguous full [11.7 us] [13.4 us]
partial [20.4 us] [27.2 us] [26.2 us] [27.5 us]
Click on the numbers for a more detailed distribution of run times.
[3.8 us]: http://sebfisch.github.com/haskell-regexp/criterion/full-accept-phone-densities-800x600.png
[4.8 us]: http://sebfisch.github.com/haskell-regexp/criterion/full-count-phone-densities-800x600.png
[11.7 us]: http://sebfisch.github.com/haskell-regexp/criterion/full-accept-html-densities-800x600.png
[13.4 us]: http://sebfisch.github.com/haskell-regexp/criterion/full-count-html-densities-800x600.png
[20.4 us]: http://sebfisch.github.com/haskell-regexp/criterion/partial-accept-rna-densities-800x600.png
[27.2 us]: http://sebfisch.github.com/haskell-regexp/criterion/partial-leftmost-rna-densities-800x600.png
[26.2 us]: http://sebfisch.github.com/haskell-regexp/criterion/partial-longest-rna-densities-800x600.png
[27.5 us]: http://sebfisch.github.com/haskell-regexp/criterion/partial-leftlong-rna-densities-800x600.png
# Collaboration
|
The source code of this library is on [github]. You can collaborate by
using it in your projects, report bugs and ask for new features in the
[issue tracker], or provide patches that implement pending issues.
|
[github]: http://github.com/sebfisch/haskell-regexp
[issue tracker]: http://github.com/sebfisch/haskell-regexp/issues
The algorithm discussed in the [Play on Regular Expressions][paper]
has been implemented in different languages. In a series of two
[blog][blog] [posts][posts], Carl Friedrich Bolz describes a Python
implementation that uses a Just In Time (JIT) compiler to achieve
impressive performance. He compares his version with corresponding C++
and Java programs.
[blog]: http://morepypy.blogspot.com/2010/05/efficient-and-elegant-regular.html
[posts]: http://morepypy.blogspot.com/2010/06/jit-for-regular-expression-matching.html
For questions and feedback email [Sebastian
Fischer](mailto:mail@sebfisch.de). weighted-regexp-0.3.1.2/LICENSE 0000644 0000000 0000000 00000002767 11716565266 014171 0 ustar 00 0000000 0000000 Copyright (c) 2010, Thomas Wilke, Frank Huch, Sebastian Fischer
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the author nor the names of his contributors
may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
weighted-regexp-0.3.1.2/src/ 0000755 0000000 0000000 00000000000 11716565266 013737 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/criterion.lhs 0000644 0000000 0000000 00000004222 11716565266 016445 0 ustar 00 0000000 0000000
> {-# LANGUAGE OverloadedStrings #-}
We use Criterion to run a number of micro benchmarks that match
different regular expressions against strings.
> import Text.RegExp
> import Text.RegExp.Matching.Leftmost as Leftmost
> import Text.RegExp.Matching.Longest as Longest
> import Text.RegExp.Matching.LeftLong as LeftLong
>
> import Criterion.Main
>
> main :: IO ()
> main = defaultMain
> [ bgroup "full"
> [ bgroup mode
> [ bench name $ call re str
> | (name, re, str) <-
> [ ("phone", phone're, phone'str)
> , ("html" , html're , html'str)
> ]
> ]
> | (mode, call) <-
> [ ("accept", whnf . acceptFull)
> , ("count" , whnf . (matchingCount :: RegExp Char -> String -> Int))
> ]
> ]
> , bgroup "partial"
> [ bgroup mode
> [ bench name $ call re str
> | (name, re, str) <-
> [ ("rna", rna're, rna'str)
> ]
> ]
> | (mode, call) <-
> [ ("accept" , whnf . acceptPartial)
> , ("leftmost", whnf . Leftmost.matching)
> , ("longest" , whnf . Longest.matching)
> , ("leftlong", whnf . LeftLong.matching)
> ]
> ]
> ]
The following regular expression for phone numbers matches uniquely
against phone numbers like the one given below.
> phone're :: RegExp Char
> phone're = "[0-9]+(-[0-9]+)*"
>
> phone'str :: String
> phone'str = "0431-880-7267"
As an example for an ambiguous match we match the following regular
expression wich reminds one of HTML documents.
> html're :: RegExp Char
> html're = "(<\\w*>.*\\w*>)*"
This expressions matches the string below in two different ways.
> html'str :: String
> html'str = "some
text
"
To benchmark partial matchings we search for a protein sequence in an
RNA sequence. Protein sequences start with `AUG`, followed by codons
(triplets) built from the bases adenin (`A`), cytosine (`C`), guanin
(`G`), and uracil (`U`), and end with `UAG`, `UGA`, or `UAA`.
> rna're :: RegExp Char
> rna're = "AUG([ACGU][ACGU][ACGU])*(UAG|UGA|UAA)"
For example, the following RNA sequence contains the protein sequence
`AUGACACUUGAAUGA`.
> rna'str :: String
> rna'str = "UUACGGAUGACACUUGAAUGACUGA"
weighted-regexp-0.3.1.2/src/quickcheck.lhs 0000644 0000000 0000000 00000032641 11716565266 016567 0 ustar 00 0000000 0000000
> {-# LANGUAGE GeneralizedNewtypeDeriving #-}
> {-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-}
> {-# LANGUAGE ScopedTypeVariables #-}
> {-# LANGUAGE OverloadedStrings #-}
> {-# OPTIONS_GHC -fno-warn-missing-methods -fno-warn-orphans #-}
We specify a `Monoid` instance for a `newtype` of lists.
> import Data.Monoid ( Monoid(..) )
We use QuickCheck version 1 for testing because version 2 cannot be
used in batch mode.
> import Test.QuickCheck
> import Test.QuickCheck.Batch
> import Control.Monad ( ap, replicateM )
> import Data.Char ( chr, ord )
> import Data.List ( permutations )
We import the semiring properties in order to check them for the
defined instances. We also define our own `sum` function for
semirings.
> import Data.Semiring.Properties
> import Prelude hiding ( sum )
Finally, we need the `RegExp` datatype, the `symWeight` function from
the `Weight` class, and the different semirings used for matching.
> import Text.RegExp
> import Text.RegExp.Data
> import Text.RegExp.Matching.Leftmost.Type ( Leftmost(..) )
> import Text.RegExp.Matching.Longest.Type ( Longest(..) )
> import Text.RegExp.Matching.LeftLong.Type ( LeftLong(..) )
> import Text.RegExp.Matching.Leftmost ( getLeftmost )
> import Text.RegExp.Matching.Longest ( getLongest )
> import Text.RegExp.Matching.LeftLong ( getLeftLong )
> import qualified Text.RegExp.Matching.Leftmost as Leftmost
> import qualified Text.RegExp.Matching.Longest as Longest
> import qualified Text.RegExp.Matching.LeftLong as LeftLong
The `main` function runs all tests defined in this program.
> main :: IO ()
> main =
> do runChecks "semiring laws (Bool)" (semiring'laws :: Checks Bool)
> runChecks "semiring laws (Int)" (semiring'laws :: Checks (Numeric Int))
> runChecks "semiring laws (Leftmost)" (semiring'laws :: Checks Leftmost)
> runChecks "semiring laws (Longest)" (semiring'laws :: Checks Longest)
> runChecks "semiring laws (LeftLong)" semiring'laws'LeftLong
> runTests (pad "full match") options $
> checks (full'match'spec acceptFull id :: Checks Bool) ++
> checks (full'match'spec matchingCount getNumeric
> :: Checks (Numeric Int))
> runTests (pad "partial match") options $
> checks (partial'match'spec acceptPartial id :: Checks Bool) ++
> checks (indexed'match'spec Leftmost.matching getLeftmost) ++
> checks (partial'match'spec Longest.matching getLongest) ++
> checks (indexed'match'spec LeftLong.matching getLeftLong)
> runTests (pad "parse printed regexp") options [run parse'printed]
> runChecks "lazy infinite regexps" infinite'regexp'checks
> runTests "permutation parsing" options [run perm'parser'check]
> where
> options = defOpt { no_of_tests = 1000, length_of_tests = 60 }
> runChecks s = runTests (pad s) options . checks
> pad s = replicate (25-length s) ' ' ++ s
The `Arbitrary` instance for numeric types wraps the underlying
instance. We also provide one for `Char` which is not predefined.
> instance (Num a, Arbitrary a) => Arbitrary (Numeric a) where
> arbitrary = Numeric `fmap` arbitrary
>
> instance Arbitrary Char where
> arbitrary = elements "abcde \\|*+?.[]{}"
We provide generic `Semiring` instances for the semirings used for
matching.
> instance Arbitrary Leftmost where
> arbitrary = frequency [ (1, return zero)
> , (1, return one)
> , (3, (Leftmost . abs) `fmap` arbitrary) ]
>
> instance Arbitrary Longest where
> arbitrary = frequency [ (1, return zero)
> , (1, return one)
> , (3, (Longest . succ . abs) `fmap` arbitrary) ]
>
> instance Arbitrary LeftLong where
> arbitrary = frequency [ (1, return zero)
> , (1, return one)
> , (3, do x <- abs `fmap` arbitrary
> y <- abs `fmap` arbitrary
> return $ LeftLong (min x y) (max x y)) ]
We define a list of `Checks` for the semiring laws.
> semiring'laws :: (Arbitrary s, Show s, Semiring s) => Checks s
> semiring'laws = mconcat [ prop2 plus'comm
> , prop1 left'zero
> , prop3 add'assoc
> , prop1 left'one
> , prop1 right'one
> , prop3 mul'assoc
> , prop3 left'distr
> , prop3 right'distr
> , prop1 left'ann
> , prop1 right'ann
> ]
`Checks` is a `newtype` for a list of batch tests with a phantom type
that can be used in definitions of the properties.
> newtype Checks a = Checks { checks :: [TestOptions -> IO TestResult] }
> deriving ( Monoid )
We define the auxiliary functions to create semiring properties with
different arities.
> prop1 :: (Arbitrary s, Show s, Testable a) => (s -> a) -> Checks s
> prop1 prop = Checks [run prop]
>
> prop2 :: (Arbitrary s, Show s, Testable a) => (s -> s -> a) -> Checks s
> prop2 prop = Checks [run prop]
>
> prop3 :: (Arbitrary s, Show s, Testable a) => (s-> s -> s -> a) -> Checks s
> prop3 prop = Checks [run prop]
The `LeftLong` type satisfies the distributive laws only with a
precondition on all involved multiplications: multiplied matches must
be adjacent and the start position must be smaller than the end
position. This precondition is satisfied for all multiplications
during regular expression matching.
We define a variant of `semiring'laws` with this precondition on the
distributive laws.
> semiring'laws'LeftLong :: Checks LeftLong
> semiring'laws'LeftLong = mconcat
> [ prop2 plus'comm
> , prop1 left'zero
> , prop3 add'assoc
> , prop1 left'one
> , prop1 right'one
> , prop3 mul'assoc
> , prop3 left'distr'LeftLong
> , prop3 right'distr'LeftLong
> , prop1 left'ann
> , prop1 right'ann
> ]
For testing the distributive laws, we adjust the randomly generated
`LeftLong` values such that the arguments of multiplications are
adjacent.
> left'distr'LeftLong :: LeftLong -> LeftLong -> LeftLong -> Bool
> left'distr'LeftLong a b c = left'distr a (shift a b) (shift a c)
> where
> shift (LeftLong _ x) (LeftLong y z) = LeftLong (x+1) (z+x+1-y)
> shift _ x = x
>
> right'distr'LeftLong :: LeftLong -> LeftLong -> LeftLong -> Bool
> right'distr'LeftLong a b c = right'distr (shift a c) (shift b c) c
> where
> shift (LeftLong x y) (LeftLong z _) = LeftLong (x+z-1-y) (z-1)
> shift x _ = x
Now we turn to the correctness of the `match` function. In order to
check it, we compare it with a executable specification which is
correct by definition:
> full'match'spec :: (Show a, Weight Char Char s)
> => (RegExp Char -> String -> a)
> -> (s -> a)
> -> Checks s
> full'match'spec = match'spec fullMatchSpec
>
> partial'match'spec :: (Show a, Weight Char Char s)
> => (RegExp Char -> String -> a)
> -> (s -> a)
> -> Checks s
> partial'match'spec = match'spec partialMatchSpec
>
> indexed'match'spec :: (Show a, Weight Char (Int,Char) s)
> => (RegExp Char -> String -> a)
> -> (s -> a)
> -> Checks s
> indexed'match'spec = match'spec (\r -> partialMatchSpec r . zip [(0::Int)..])
>
> match'spec :: (Show a, Semiring s)
> => (RegExp Char -> String -> s)
> -> (RegExp Char -> String -> a)
> -> (s -> a)
> -> Checks s
> match'spec spec convmatch conv =
> Checks [run (check'match'spec spec convmatch conv)]
>
> check'match'spec :: (Show a, Semiring s)
> => (RegExp Char -> String -> s)
> -> (RegExp Char -> String -> a)
> -> (s -> a)
> -> RegExp Char -> String -> Bool
> check'match'spec spec convmatch conv r s =
> show (convmatch r s') == show (conv (spec r s'))
> where s' = take 5 s
To make this work, we need an `Arbitrary` instance for regular
expressions.
> instance Arbitrary (RegExp Char) where
> arbitrary = sized regexp
>
> regexp :: Int -> Gen (RegExp Char)
> regexp 0 = frequency [ (1, return eps)
> , (4, char `fmap` simpleChar) ]
> regexp n = frequency [ (3, regexp 0)
> , (1, alt `fmap` subexp `ap` subexp)
> , (2, seq_ `fmap` subexp `ap` subexp)
> , (1, rep `fmap` regexp (n-1))
> , (2, fromString `fmap` parsedRegExp n) ]
> where subexp = regexp (n `div` 2)
>
> simpleChar :: Gen Char
> simpleChar = elements "abcde"
>
> parsedRegExp :: Int -> Gen String
> parsedRegExp n = frequency [ (4, symClass)
> , (2, (++"?") `fmap` subexp)
> , (2, (++"+") `fmap` subexp)
> , (1, mkBrep1 =<< subexp)
> , (1, mkBrep2 =<< subexp) ]
> where
> subexp = (($"") . showParen True . shows)
> `fmap` (resize (n-1) arbitrary :: Gen (RegExp Char))
>
> mkBrep1 r = do x <- elements [0..3] :: Gen Int
> return $ r ++ "{" ++ show x ++ "}"
>
> mkBrep2 r = do x <- elements [0..2] :: Gen Int
> y <- elements [0..2] :: Gen Int
> return $ r ++ "{" ++ show x ++ "," ++ show (x+y) ++ "}"
>
> symClass :: Gen String
> symClass = frequency [ (1, specialChar)
> , (2, do n <- choose (0,3)
> cs <- replicateM n charClass
> s <- (["","^"]!!) `fmap` choose (0,1)
> return $ "[" ++ s ++ concat cs ++ "]") ]
> where
> specialChar = elements (map (:[]) "." ++
> map (\c -> '\\':[c]) "abcdewWdDsS \\|*+?.[]{}^")
> charClass = oneof [ (:[]) `fmap` simpleChar
> , specialChar
> , do x <- simpleChar
> y <- simpleChar
> return $ x : '-' : [chr (ord x+ord y-ord 'a')] ]
The specification of the matching function is defined inductively on
the structure of a regular expression. It uses exhaustive search to
find all possibilities to match a regexp against a word.
> fullMatchSpec :: Weight a b s => RegExp a -> [b] -> s
> fullMatchSpec (RegExp r) = matchSpec (reg (weighted r))
>
> matchSpec :: Semiring s => Reg s c -> [c] -> s
> matchSpec Eps u = if null u then one else zero
> matchSpec (Sym _ f) u = case u of [c] -> f c; _ -> zero
> matchSpec (Alt p q) u = matchSpec (reg p) u .+. matchSpec (reg q) u
> matchSpec (Seq p q) u =
> sum [ matchSpec (reg p) u1 .*. matchSpec (reg q) u2 | (u1,u2) <- split u ]
> matchSpec (Rep p) u =
> sum [ prod [ matchSpec (reg p) ui | ui <- ps] | ps <- parts u ]
>
> sum, prod :: Semiring s => [s] -> s
> sum = foldr (.+.) zero
> prod = foldr (.*.) one
>
> split :: [a] -> [([a],[a])]
> split [] = [([],[])]
> split (c:cs) = ([],c:cs) : [ (c:s1,s2) | (s1,s2) <- split cs ]
>
> parts :: [a] -> [[[a]]]
> parts [] = [[]]
> parts [c] = [[[c]]]
> parts (c:cs) = concat [ [(c:p):ps,[c]:p:ps] | p:ps <- parts cs ]
We can perform a similar test for partial instead of full matches.
> partialMatchSpec :: Weight a b s => RegExp a -> [b] -> s
> partialMatchSpec (RegExp r) =
> matchSpec (reg (arb `seqW` weighted r `seqW` arb))
> where RegExp arb = rep anySym
As a check for the parser, we check whether the representation
generated by the `Show` instance of regular expressions can be parsed
back and yields the original expression.
> parse'printed :: RegExp Char -> Bool
> parse'printed r = fromString (show r) == r
We can also match infinite regular expressions lazily to recognize
context-free or even context-sensitive languages.
> infinite'regexp'checks :: Checks Bool
> infinite'regexp'checks = Checks [run context'free, run context'sensitive]
As an example for a context-free language, we recognize the language
${a^nb^n | n >= 0}$.
> context'free :: String -> Bool
> context'free s = isInAnBn s == (anbn =~ s)
>
> isInAnBn :: String -> Bool
> isInAnBn s = all (=='a') xs && all (=='b') ys && length xs == length ys
> where (xs,ys) = break (=='b') s
>
> anbn :: RegExp Char
> anbn = eps `alt` seq_ "a" (anbn `seq_` "b")
As an example for a context-sensitive language we use the language
${a^nb^nc^n | n >= 0}$. To show that the alphabet cannot only contain
characters, we use numbers instead of characters.
> context'sensitive :: [Int] -> Bool
> context'sensitive s = isInAnBnCn s == acceptFull anbncn s
>
> isInAnBnCn :: [Int] -> Bool
> isInAnBnCn s = all (==1) xs && all (==2) ys && all (==3) zs
> && length xs == length ys && length ys == length zs
> where (xs,l) = break (==2) s
> (ys,zs) = break (==3) l
>
> anbncn :: RegExp Int
> anbncn = mkAnBnCn 0
> where
> mkAnBnCn n = brep (n,n) (sym 2) `seq_` brep (n,n) (sym 3)
> `alt` seq_ (sym 1) (mkAnBnCn (n+1))
The library provides a combinator that matches a list of regular
expressions in sequence, each occurring once in any order.
> perm'parser'check :: String -> Bool
> perm'parser'check cs = all (acceptFull (perm (map char s))) (permutations s)
> where s = take 5 cs
We restrict the test to at most 5! (that is five factorial)
permutations because otherwise there are too many. Note that it is
possible to match much longer permutations:
ghci> accept (perm (map char ['a'..'z'])) $ reverse ['a'..'z']
True
(0.05 secs, 8706356 bytes)
But matching `perm (map char ['a'..'z'])` against *all* permutations
of ['a'..'z'] takes too long.
weighted-regexp-0.3.1.2/src/Data/ 0000755 0000000 0000000 00000000000 11716565266 014610 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Data/Semiring.hs 0000644 0000000 0000000 00000004244 11716565266 016725 0 ustar 00 0000000 0000000 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
-- |
-- Module : Data.Semiring
-- Copyright : Thomas Wilke, Frank Huch, Sebastian Fischer
-- License : BSD3
-- Maintainer : Sebastian Fischer
-- Stability : experimental
--
-- This library provides a type class for semirings and instances for
-- standard data types.
--
module Data.Semiring (
Semiring(..), fromBool,
Numeric(..)
) where
infixr 6 .+.
infixr 7 .*.
-- |
-- A semiring is an additive commutative monoid with identity 'zero':
--
-- > a .+. b == b .+. a
-- > zero .+. a == a
-- > (a .+. b) .+. c == a .+. (b .+. c)
--
-- A semiring is a multiplicative monoid with identity 'one':
--
-- > one .*. a == a
-- > a .*. one == a
-- > (a .*. b) .*. c == a .*. (b .*. c)
--
-- Multiplication distributes over addition:
--
-- > a .*. (b .+. c) == (a .*. b) .+. (a .*. c)
-- > (a .+. b) .*. c == (a .*. c) .+. (b .*. c)
--
-- 'zero' annihilates a semiring with respect to multiplication:
--
-- > zero .*. a == zero
-- > a .*. zero == zero
--
-- All laws should hold with respect to the required `Eq` instance.
--
-- For example, the Booleans form a semiring.
--
-- * @False@ is an identity of disjunction which is commutative and
-- associative,
--
-- * @True@ is an identity of conjunction which is associative,
--
-- * conjunction distributes over disjunction, and
--
-- * @False@ annihilates the Booleans with respect to conjunction.
--
class Eq s => Semiring s where
zero, one :: s
(.+.), (.*.) :: s -> s -> s
-- | Auxiliary function to convert Booleans to an arbitrary semiring.
--
fromBool :: Semiring s => Bool -> s
fromBool False = zero
fromBool True = one
instance Semiring Bool where
zero = False; one = True; (.+.) = (||); (.*.) = (&&)
-- |
-- Wrapper for numeric types.
--
-- Every numeric type that satisfies the semiring laws (as all
-- predefined numeric types do) is a semiring.
--
newtype Numeric a = Numeric { getNumeric :: a }
deriving (Eq,Num)
instance Show a => Show (Numeric a) where
show = show . getNumeric
instance Num a => Semiring (Numeric a) where
zero = 0; one = 1; (.+.) = (+); (.*.) = (*)
weighted-regexp-0.3.1.2/src/Data/Semiring/ 0000755 0000000 0000000 00000000000 11716565266 016365 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Data/Semiring/Properties.hs 0000644 0000000 0000000 00000003140 11716565266 021053 0 ustar 00 0000000 0000000 -- |
-- Module : Data.Semiring.Properties
-- Copyright : Sebastian Fischer
-- License : BSD3
--
-- This library provides properties for the 'Semiring' type class that
-- can be checked using libraries like QuickCheck or SmallCheck.
--
module Data.Semiring.Properties (
module Data.Semiring, module Data.Semiring.Properties
) where
import Data.Semiring
-- | > a .+. b == b .+. a
plus'comm :: Semiring s => s -> s -> Bool
plus'comm a b = a .+. b == b .+. a
-- | > zero .+. a == a
left'zero :: Semiring s => s -> Bool
left'zero a = zero .+. a == a
-- | > (a .+. b) .+. c == a .+. (b .+. c)
add'assoc :: Semiring s => s -> s -> s -> Bool
add'assoc a b c = (a .+. b) .+. c == a .+. (b .+. c)
-- | > one .*. a == a
left'one :: Semiring s => s -> Bool
left'one a = one .*. a == a
-- | > a .*. one == a
right'one :: Semiring s => s -> Bool
right'one a = a .*. one == a
-- | > (a .*. b) .*. c == a .*. (b .*. c)
mul'assoc :: Semiring s => s -> s -> s -> Bool
mul'assoc a b c = (a .*. b) .*. c == a .*. (b .*. c)
-- | > a .*. (b .+. c) == (a .*. b) .+. (a .*. c)
left'distr :: Semiring s => s -> s -> s -> Bool
left'distr a b c = a .*. (b .+. c) == (a .*. b) .+. (a .*. c)
-- | > (a .+. b) .*. c == (a .*. c) .+. (b .*. c)
right'distr :: Semiring s => s -> s -> s -> Bool
right'distr a b c = (a .+. b) .*. c == (a .*. c) .+. (b .*. c)
-- | > zero .*. a == zero
left'ann :: Semiring s => s -> Bool
left'ann a = zero .*. a == zero
-- | > a .*. zero == zero
right'ann :: Semiring s => s -> Bool
right'ann a = a .*. zero == zero
weighted-regexp-0.3.1.2/src/Text/ 0000755 0000000 0000000 00000000000 11716565266 014663 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Text/RegExp.hs 0000644 0000000 0000000 00000007641 11716565266 016421 0 ustar 00 0000000 0000000 {-# LANGUAGE FlexibleInstances #-}
{-# OPTIONS -fno-warn-orphans #-}
-- |
-- Module : Text.RegExp
-- Copyright : Thomas Wilke, Frank Huch, and Sebastian Fischer
-- License : BSD3
-- Maintainer : Sebastian Fischer
-- Stability : experimental
--
-- This library provides a simple and fast regular expression matcher
-- that is implemented in Haskell without binding to external
-- libraries.
--
-- There are different ways to implement regular expression
-- matching. Backtracking algorithms are simple but need bookkeeping
-- overhead for nondeterministic search. One can use deterministic
-- finite automata (DFA, see
-- ) to match regular
-- expressions faster. But for certain regular expressions these DFA
-- are exponentially large which sometimes leads to prohibitive memory
-- requirements.
--
-- We use a smart and simple algorithm to generate a DFA from a
-- regular expression and do not generate the DFA completely but on
-- the fly while parsing. This leads to a linear-time deterministic
-- algorithm with constant space requirements. More specifically, the
-- run time is limited by the product of the sizes of the regular
-- expression and the string and the memory is limited by the size of
-- the regular expression.
--
module Text.RegExp (
module Data.Semiring, Weight(..),
-- * Constructing regular expressions
RegExp, fromString,
eps, char, sym, psym, anySym, noMatch, alt, seq_, rep, rep1, opt, brep,
perm,
-- * Matching
(=~), acceptFull, acceptPartial, matchingCount, fullMatch, partialMatch
) where
import Data.Semiring
import qualified Data.String
import Text.RegExp.Data
import Text.RegExp.Parser
import Text.RegExp.Matching
-- |
-- Parses a regular expression from its string representation. If the
-- 'OverloadedStrings' language extension is enabled, string literals
-- can be used as regular expressions without using 'fromString'
-- explicitly. Implicit conversion is especially useful in combination
-- with functions like '=~' that take a value of type @RegExp Char@ as
-- argument.
--
-- Here are some examples of supported regular expressions along with
-- an explanation what they mean:
--
-- * @a@ matches the character @a@
--
-- * @[abc]@ matches any of the characters @a@, @b@, or @c@. It is
-- equivalent to @(a|b|c)@, but @|@ can be used to specify
-- alternatives between arbitrary regular expressions, not only
-- characters.
--
-- * @[^abc]@ matches anything but the characters @a@, @b@, or @c@.
--
-- * @\\d@ matches a digit and is equivalent to @[0-9]@. Moreover,
-- @\\D@ matches any non-digit character, @\\s@ and @\\S@ match
-- space and non-space characters and @\\w@ and @\\W@ match word
-- characters and non-word characters, that is, @\\w@ abbreviates
-- @[a-zA-Z_]@.
--
-- * @a?@ matches the empty word or the character @a@, @a*@ matches
-- zero or more occurrences of @a@, and @a+@ matches one or more
-- @a@'s.
--
-- * @.@ (the dot) matches one arbitrary character.
--
-- * @a{4,7}@ matches four to seven occurrences of @a@, @a{2}@
-- matches two.
--
fromString :: String -> RegExp Char
fromString = Data.String.fromString
instance Data.String.IsString (RegExp Char) where
fromString = parse
-- |
-- Matches a sequence of the given regular expressions in any
-- order. For example, the regular expression
--
-- @
-- perm (map char \"abc\")
-- @
--
-- has the same meaning as
--
-- @
-- abc|acb|bca|bac|cba|cab
-- @
--
-- and is represented as
--
-- @
-- a(bc|cb)|b(ca|ac)|c(ba|ab)
-- @
--
perm :: [RegExp c] -> RegExp c
perm [] = eps
perm [r] = r
perm rs = go rs []
where
go [p] qs = p `seq_` perm qs
go (p:ps) qs = (p `seq_` perm (ps ++ qs)) `alt` go ps (p:qs)
-- |
-- Alias for 'acceptFull' specialized for Strings. Useful in combination
-- with the 'IsString' instance for 'RegExp' 'Char'
--
(=~) :: RegExp Char -> String -> Bool
(=~) = acceptFull
weighted-regexp-0.3.1.2/src/Text/RegExp/ 0000755 0000000 0000000 00000000000 11716565266 016055 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Text/RegExp/Data.hs 0000644 0000000 0000000 00000013773 11716565266 017275 0 ustar 00 0000000 0000000 {-# LANGUAGE RankNTypes #-}
{-# LANGUAGE MultiParamTypeClasses, FlexibleInstances #-}
module Text.RegExp.Data where
import Data.Semiring
-- |
-- Regular expressions are represented as values of type 'RegExp' @c@
-- where @c@ is the character type of the underlying alphabet. Values
-- of type @RegExp@ @c@ can be matched against lists of type @[c]@.
--
newtype RegExp c = RegExp (forall w . Semiring w => RegW w c)
data RegW w c = RegW { active :: !Bool,
empty :: !w,
final_ :: !w,
reg :: !(Reg w c) }
final :: Semiring w => RegW w c -> w
final r = if active r then final_ r else zero
data Reg w c = Eps
| Sym String (c -> w)
| Alt (RegW w c) (RegW w c)
| Seq (RegW w c) (RegW w c)
| Rep (RegW w c)
class Semiring w => Weight a b w where
symWeight :: (a -> w) -> b -> w
defaultSymWeight :: (a -> w) -> a -> w
defaultSymWeight = id
instance Weight c c Bool where
symWeight = defaultSymWeight
instance Num a => Weight c c (Numeric a) where
symWeight = defaultSymWeight
weighted :: Weight a b w => RegW w a -> RegW w b
weighted (RegW a e f r) =
case r of
Eps -> RegW a e f Eps
Sym s p -> RegW a e f (Sym s (symWeight p))
Alt p q -> RegW a e f (Alt (weighted p) (weighted q))
Seq p q -> RegW a e f (Seq (weighted p) (weighted q))
Rep p -> RegW a e f (Rep (weighted p))
-- |
-- Matches the empty word. 'eps' has no direct string representation
-- but is used to implement other constructs such as optional
-- components like @a?@.
--
eps :: RegExp c
eps = RegExp epsW
epsW :: Semiring w => RegW w c
epsW = RegW False one zero Eps
-- | Matches the given character.
--
char :: Char -> RegExp Char
char c = psym (quote c) (c==)
-- | Matches the given symbol.
--
sym :: (Eq c, Show c) => c -> RegExp c
sym c = psym (show c) (c==)
quote :: Char -> String
quote c | c `elem` " \\|*+?.[]{}^" = '\\' : [c]
| otherwise = [c]
-- | Matches a symbol that satisfies the given predicate.
--
psym :: String -> (c -> Bool) -> RegExp c
psym s p = RegExp (symW s (fromBool . p))
symW :: Semiring w => String -> (c -> w) -> RegW w c
symW s p = RegW False zero zero $ Sym s p
-- | Matches an arbitrary symbol.
--
anySym :: RegExp c
anySym = psym "." (const True)
-- | Does not match anything. 'noMatch' is an identity for 'alt'.
--
noMatch :: RegExp c
noMatch = psym "[]" (const False)
-- |
-- Matches either of two regular expressions. For example @a+b@
-- matches either the character @a@ or the character @b@.
--
alt :: RegExp c -> RegExp c -> RegExp c
alt (RegExp p) (RegExp q) =
RegExp (RegW False (empty p .+. empty q) zero (Alt p q))
altW :: Semiring w => RegW w c -> RegW w c -> RegW w c
altW p q = RegW (active p || active q)
(empty p .+. empty q)
(final p .+. final q)
(Alt p q)
-- |
-- Matches the sequence of two regular expressions. For example the
-- regular expressions @ab@ matches the word @ab@.
--
seq_ :: RegExp c -> RegExp c -> RegExp c
seq_ (RegExp p) (RegExp q) =
RegExp (RegW False (empty p .*. empty q) zero (Seq p q))
seqW :: Semiring w => RegW w c -> RegW w c -> RegW w c
seqW p q = RegW (active p || active q)
(empty p .*. empty q)
(final p .*. empty q .+. final q)
(Seq p q)
-- | Matches zero or more occurrences of the given regular
-- expression. For example @a*@ matches the character @a@ zero or
-- more times.
--
rep :: RegExp c -> RegExp c
rep (RegExp r) = RegExp (RegW False one zero (Rep r))
repW :: Semiring w => RegW w c -> RegW w c
repW r = RegW (active r) one (final r) (Rep r)
-- | Matches one or more occurrences of the given regular
-- expression. For example @a+@ matches the character @a@ one or
-- more times.
--
rep1 :: RegExp c -> RegExp c
rep1 r = r `seq_` rep r
-- |
-- Matches the given regular expression or the empty word. Optional
-- expressions are usually written @a?@ but could also be written
-- @(|a)@, that is, as alternative between 'eps' and @a@.
--
opt :: RegExp c -> RegExp c
opt r = eps `alt` r
-- |
-- Matches a regular expression a given number of times. For example,
-- the regular expression @a{4,7}@ matches the character @a@ four to
-- seven times. If the minimal and maximal occurences are identical,
-- one can be left out, that is, @a{2}@ matches two occurrences of the
-- character @a@.
--
-- Numerical bounds are implemented via translation into ordinary
-- regular expressions. For example, @a{4,7}@ is translated into
-- @aaaa(a(a(a)?)?)?@.
--
brep :: (Int,Int) -> RegExp c -> RegExp c
brep (n,m) r
| n < 0 || m < 0 || n > m = error msg
| n == 0 && m == 0 = eps
| n == m = foldr1 seq_ (replicate n r)
| otherwise = foldr seq_ rest (replicate n r)
where
rest = foldr nestopt (opt r) (replicate (m-n-1) r)
nestopt p q = opt (seq_ p q)
msg = "Text.RegExp.brep: invalid repetition bounds: " ++ show (n,m)
regW :: Semiring w => RegExp c -> RegW w c
regW (RegExp r) = r
instance Show (RegExp Char) where
showsPrec n r = showsPrec n (regW r :: RegW Bool Char)
instance Show (RegW Bool Char) where
showsPrec n r = showsPrec n (reg r)
instance Show (Reg Bool Char) where
showsPrec _ Eps = showString "()"
showsPrec _ (Sym s _) = showString s
showsPrec n (Alt p q) = showParen (n > 0)
$ showsPrec 1 p
. showString "|"
. shows q
showsPrec n (Seq p q) = showParen (n > 1)
$ showsPrec 2 p
. showsPrec 1 q
showsPrec _ (Rep r) = showsPrec 2 r . showString "*"
instance Eq (RegExp Char) where
p == q = regW p == (regW q :: RegW Bool Char)
instance Eq (RegW Bool Char) where
p == q = reg p == reg q
instance Eq (Reg Bool Char) where
Eps == Eps = True
Sym s _ == Sym t _ = s==t
Alt a b == Alt c d = a==c && b==d
Seq a b == Seq c d = a==c && b==d
Rep a == Rep b = a==b
_ == _ = False
weighted-regexp-0.3.1.2/src/Text/RegExp/Internal.hs 0000644 0000000 0000000 00000001032 11716565266 020161 0 ustar 00 0000000 0000000 -- |
-- Module : Text.RegExp
-- Copyright : Thomas Wilke, Frank Huch, and Sebastian Fischer
-- License : BSD3
-- Maintainer : Sebastian Fischer
-- Stability : experimental
--
-- This module exports internal data types and matching functions. You
-- do not need to import it unless you want to write your own matching
-- algorithms.
--
module Text.RegExp.Internal (
module Text.RegExp.Data, module Text.RegExp.Matching
) where
import Text.RegExp.Data
import Text.RegExp.Matching
weighted-regexp-0.3.1.2/src/Text/RegExp/Matching.hs 0000644 0000000 0000000 00000007444 11716565266 020154 0 ustar 00 0000000 0000000 {-# LANGUAGE MultiParamTypeClasses, FlexibleContexts #-}
module Text.RegExp.Matching where
import Data.Semiring
import Text.RegExp.Data
import Text.RegExp.Matching.Leftmost.Type
import Text.RegExp.Matching.Longest.Type
import Text.RegExp.Matching.LeftLong.Type
-- |
-- Checks whether a regular expression matches the given word. For
-- example, @acceptFull (fromString \"b|abc\") \"b\"@ yields @True@
-- because the first alternative of @b|abc@ matches the string
-- @\"b\"@.
--
acceptFull :: RegExp c -> [c] -> Bool
acceptFull r = fullMatch r
-- |
-- Checks whether a regular expression matches a subword of the given
-- word. For example, @acceptPartial (fromString \"b\") \"abc\"@
-- yields @True@ because @\"abc\"@ contains the substring @\"b\"@.
--
acceptPartial :: RegExp c -> [c] -> Bool
acceptPartial r = partialMatch r
-- |
-- Computes in how many ways a word can be matched against a regular
-- expression.
--
matchingCount :: Num a => RegExp c -> [c] -> a
matchingCount r = getNumeric . fullMatch r
{-# SPECIALIZE matchingCount :: RegExp c -> [c] -> Int #-}
-- |
-- Matches a regular expression against a word computing a weight in
-- an arbitrary semiring.
--
-- The symbols can have associated weights associated by the
-- 'symWeight' function of the 'Weight' class. This function also
-- allows to adjust the type of the used alphabet such that, for
-- example, positional information can be taken into account by
-- 'zip'ping the word with positions.
--
fullMatch :: Weight a b w => RegExp a -> [b] -> w
fullMatch (RegExp r) = matchW (weighted r)
{-# SPECIALIZE fullMatch :: RegExp c -> [c] -> Bool #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [c] -> Numeric Int #-}
{-# SPECIALIZE fullMatch :: Num a => RegExp c -> [c] -> Numeric a #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [(Int,c)] -> Leftmost #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [c] -> Longest #-}
{-# SPECIALIZE fullMatch :: RegExp c -> [(Int,c)] -> LeftLong #-}
-- |
-- Matches a regular expression against substrings of a word computing
-- a weight in an arbitrary semiring. Similar to 'fullMatch' the
-- 'Weight' class is used to associate weights to the symbols of the
-- regular expression.
--
partialMatch :: Weight a b w => RegExp a -> [b] -> w
partialMatch (RegExp r) = matchW (arb `seqW` weighted r `seqW` arb)
where RegExp arb = rep anySym
{-# SPECIALIZE partialMatch :: RegExp c -> [c] -> Bool #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [c] -> Numeric Int #-}
{-# SPECIALIZE partialMatch :: Num a => RegExp c -> [c] -> Numeric a #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [(Int,c)] -> Leftmost #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [c] -> Longest #-}
{-# SPECIALIZE partialMatch :: RegExp c -> [(Int,c)] -> LeftLong #-}
matchW :: Semiring w => RegW w c -> [c] -> w
matchW r [] = empty r
matchW r (c:cs) = final (foldl (shiftW zero) (shiftW one r c) cs)
{-# SPECIALIZE matchW :: RegW Bool c -> [c] -> Bool #-}
{-# SPECIALIZE matchW :: RegW (Numeric Int) c -> [c] -> Numeric Int #-}
{-# SPECIALIZE matchW :: Num a => RegW (Numeric a) c -> [c] -> Numeric a #-}
{-# SPECIALIZE matchW :: RegW Leftmost (Int,c) -> [(Int,c)] -> Leftmost #-}
{-# SPECIALIZE matchW :: RegW Longest c -> [c] -> Longest #-}
{-# SPECIALIZE matchW :: RegW LeftLong (Int,c) -> [(Int,c)] -> LeftLong #-}
shiftW :: Semiring w => w -> RegW w c -> c -> RegW w c
shiftW w r c | active r || w /= zero = shift w (reg r) c
| otherwise = r
shift :: Semiring w => w -> Reg w c -> c -> RegW w c
shift _ Eps _ = epsW
shift w (Sym s f) c = let w' = w .*. f c
in (symW s f) { active = w' /= zero, final_ = w' }
shift w (Alt p q) c = altW (shiftW w p c) (shiftW w q c)
shift w (Seq p q) c = seqW (shiftW w p c)
(shiftW (w .*. empty p .+. final p) q c)
shift w (Rep r) c = repW (shiftW (w .+. final r) r c)
weighted-regexp-0.3.1.2/src/Text/RegExp/Parser.y 0000644 0000000 0000000 00000007156 11716565266 017514 0 ustar 00 0000000 0000000 {
{-# LANGUAGE NoMonomorphismRestriction #-}
{-# OPTIONS -fno-warn-incomplete-patterns -fno-warn-missing-signatures #-}
module Text.RegExp.Parser ( parse ) where
import Text.RegExp.Data
( eps, char, psym, anySym, alt, seq_, rep, rep1, opt, brep )
import Data.Char ( isSpace, toLower, isAlphaNum, isDigit )
}
%name parseTokens
%tokentype { Token }
%error { parseError }
%token
sym { Sym $$ }
'*' { Ast }
seq { Seq }
'|' { Bar }
'(' { L }
')' { R }
'+' { Pls }
'?' { Que }
bnd { Bnd $$ }
cls { Cls $$ }
'.' { Dot }
%right '|'
%right seq
%right '*' '+' '?' bnd
%%
RegExp : {- empty -} { eps }
| sym { char $1 }
| RegExp '*' { rep $1 }
| RegExp seq RegExp { seq_ $1 $3 }
| RegExp '|' RegExp { alt $1 $3 }
| '(' RegExp ')' { $2 }
| RegExp '+' { rep1 $1 }
| RegExp '?' { opt $1 }
| RegExp bnd { brep $2 $1 }
| cls { uncurry psym $1 }
| '.' { anySym }
{
parse = parseTokens . scan
data Token = Seq | Sym Char | Ast | Bar | L | R
| Pls | Que | Bnd (Int,Int)
| Cls (String,Char -> Bool) | Dot
token :: Char -> Token
token '*' = Ast
token '|' = Bar
token '(' = L
token ')' = R
token '?' = Que
token '+' = Pls
token '.' = Dot
token c = Sym c
scan :: String -> [Token]
scan = insertSeqs . process
insertSeqs :: [Token] -> [Token]
insertSeqs [] = []
insertSeqs [t] = [t]
insertSeqs (a:ts@(b:_))
| lseq a && rseq b = a : Seq : insertSeqs ts
| otherwise = a : insertSeqs ts
lseq :: Token -> Bool
lseq Bar = False
lseq L = False
lseq _ = True
rseq :: Token -> Bool
rseq (Sym _) = True
rseq L = True
rseq (Cls _) = True
rseq Dot = True
rseq _ = False
process :: String -> [Token]
process [] = []
process ('\\':c:cs) = Cls (['\\',c],symClassPred c) : process cs
process ('{':cs) = case reads cs of
(n,'}':s1) : _ -> Bnd (n,n) : process s1
(n,',':s1) : _ ->
case reads s1 of
(m,'}':s2) : _ -> Bnd (n,m) : process s2
_ -> token '{' : process cs
_ -> token '{' : process cs
process ('[':'^':cs) = Cls (('[':'^':s),not.p) : process xs
where (s,p,xs) = processCls cs
process ('[' :cs) = Cls ('[':s,p) : process xs
where (s,p,xs) = processCls cs
process (c:cs) = token c : process cs
processCls :: String -> (String, Char -> Bool, String)
processCls [] = parseError []
processCls (']':cs) = ("]", const False, cs)
processCls ('\\':c:cs)
| isSymClassChar c = ('\\':c:s, \x -> symClassPred c x || p x, xs)
where (s,p,xs) = processCls cs
processCls ('\\':c:cs) = ('\\':c:s, \x -> x==c || p x, xs)
where (s,p,xs) = processCls cs
processCls (c:'-':e:cs) | e /= ']'
= (c:'-':e:s, \d -> (c<=d && d<=e) || p d, xs)
where (s,p,xs) = processCls cs
processCls (c:cs) = (c:s, \b -> b==c || p b, xs)
where (s,p,xs) = processCls cs
isSymClassChar :: Char -> Bool
isSymClassChar = (`elem`"wWdDsS")
symClassPred :: Char -> Char -> Bool
symClassPred 'w' = isWordChar
symClassPred 'd' = isDigit
symClassPred 's' = isSpace
symClassPred 'W' = not . isWordChar
symClassPred 'D' = not . isDigit
symClassPred 'S' = not . isSpace
symClassPred c = (c==)
isWordChar :: Char -> Bool
isWordChar c = c == '_' || isAlphaNum c
parseError :: [Token] -> a
parseError _ = error "cannot parse regular expression"
}
weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/ 0000755 0000000 0000000 00000000000 11716565266 017607 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/LeftLong.hs 0000644 0000000 0000000 00000003134 11716565266 021656 0 ustar 00 0000000 0000000 -- |
-- Module : Text.RegExp.Matching.LeftLong
-- Copyright : Thomas Wilke, Frank Huch, and Sebastian Fischer
-- License : BSD3
-- Maintainer : Sebastian Fischer
-- Stability : experimental
--
-- This module implements leftmost longest matching based on weighted
-- regular expressions. It should be imported qualified as the
-- interface resembles that provided by other matching modules.
--
module Text.RegExp.Matching.LeftLong (
matching,
Matching, matchingIndex, matchingLength,
LeftLong, getLeftLong
) where
import Text.RegExp
import Text.RegExp.Matching.LeftLong.Type
-- |
-- Subwords of words that match a regular expression are represented
-- as values of type 'Matching'.
--
data Matching = Matching {
-- | Start index of the matching subword in the queried word.
matchingIndex :: !Int,
-- | Length of the matching subword.
matchingLength :: !Int
}
deriving Eq
instance Show Matching
where
showsPrec _ m = showString ""
-- |
-- Returns the leftmost longest of all non-empty matchings for a
-- regular expression in a given word. If the empty word is the only
-- matching its position is zero.
--
matching :: RegExp c -> [c] -> Maybe Matching
matching r = getLeftLong . partialMatch r . zip [(0::Int)..]
getLeftLong :: LeftLong -> Maybe Matching
getLeftLong Zero = Nothing
getLeftLong One = Just $ Matching 0 0
getLeftLong (LeftLong x y) = Just $ Matching x (y-x+1)
weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/Leftmost.hs 0000644 0000000 0000000 00000002610 11716565266 021737 0 ustar 00 0000000 0000000 -- |
-- Module : Text.RegExp.Matching.Leftmost
-- Copyright : Thomas Wilke, Frank Huch, and Sebastian Fischer
-- License : BSD3
-- Maintainer : Sebastian Fischer
-- Stability : experimental
--
-- This module implements leftmost matching based on weighted regular
-- expressions. It should be imported qualified as the interface
-- resembles that provided by other matching modules.
--
module Text.RegExp.Matching.Leftmost (
matching,
Matching, matchingIndex,
Leftmost, getLeftmost
) where
import Text.RegExp
import Text.RegExp.Matching.Leftmost.Type
-- |
-- A 'Matching' records the leftmost start index of a matching subword.
--
data Matching = Matching {
-- | Start index of the matching subword in the queried word.
matchingIndex :: !Int
}
deriving Eq
instance Show Matching
where
showsPrec _ m = showString ""
-- |
-- Returns the leftmost of all non-empty matchings for a regular
-- expression in a given word. If the empty word is the only matching
-- its position is zero.
--
matching :: RegExp c -> [c] -> Maybe Matching
matching r = getLeftmost . partialMatch r . zip [(0::Int)..]
getLeftmost :: Leftmost -> Maybe Matching
getLeftmost Zero = Nothing
getLeftmost One = Just $ Matching 0
getLeftmost (Leftmost x) = Just $ Matching x
weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/Longest.hs 0000644 0000000 0000000 00000002424 11716565266 021560 0 ustar 00 0000000 0000000 -- |
-- Module : Text.RegExp.Matching.Longest
-- Copyright : Thomas Wilke, Frank Huch, and Sebastian Fischer
-- License : BSD3
-- Maintainer : Sebastian Fischer
-- Stability : experimental
--
-- This module implements longest matching based on weighted regular
-- expressions. It should be imported qualified as the interface
-- resembles that provided by other matching modules.
--
module Text.RegExp.Matching.Longest (
matching,
Matching, matchingLength,
Longest, getLongest
) where
import Text.RegExp
import Text.RegExp.Matching.Longest.Type
-- |
-- A 'Matching' records the largest length of a matching subword.
--
data Matching = Matching {
-- | Length of the matching subword in the queried word.
matchingLength :: !Int
}
deriving Eq
instance Show Matching
where
showsPrec _ m = showString ""
-- |
-- Returns the longest of all matchings for a regular expression in a
-- given word.
--
matching :: RegExp c -> [c] -> Maybe Matching
matching r = getLongest . partialMatch r
getLongest :: Longest -> Maybe Matching
getLongest Zero = Nothing
getLongest One = Just $ Matching 0
getLongest (Longest x) = Just $ Matching x
weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/Leftmost/ 0000755 0000000 0000000 00000000000 11716565266 021404 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/Leftmost/Type.hs 0000644 0000000 0000000 00000001447 11716565266 022667 0 ustar 00 0000000 0000000 {-# LANGUAGE MultiParamTypeClasses, FlexibleInstances #-}
module Text.RegExp.Matching.Leftmost.Type where
import Data.Semiring
import Text.RegExp.Data
-- | Semiring used for leftmost matching.
--
data Leftmost = Zero | One | Leftmost !Int
deriving (Eq,Show)
instance Semiring Leftmost where
zero = Zero; one = One
Zero .+. y = y
x .+. Zero = x
One .+. y = y
x .+. One = x
Leftmost a .+. Leftmost b = Leftmost (min a b)
Zero .*. _ = Zero
_ .*. Zero = Zero
One .*. y = y
x .*. One = x
Leftmost a .*. Leftmost b = Leftmost (min a b)
instance Weight c (Int,c) Leftmost where
symWeight p (n,c) = p c .*. Leftmost n
weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/LeftLong/ 0000755 0000000 0000000 00000000000 11716565266 021321 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/LeftLong/Type.hs 0000644 0000000 0000000 00000002352 11716565266 022600 0 ustar 00 0000000 0000000 {-# LANGUAGE MultiParamTypeClasses, FlexibleInstances #-}
module Text.RegExp.Matching.LeftLong.Type where
import Data.Semiring
import Text.RegExp.Data
-- |
-- Semiring used for leftmost longest matching.
--
-- The `LeftLong` type satisfies the distributive laws only with a
-- precondition on all involved multiplications: multiplied matches
-- must be adjacent and the start position must be smaller than the
-- end position. This precondition is satisfied for all
-- multiplications during regular expression matching.
--
data LeftLong = Zero | One | LeftLong !Int !Int
deriving (Eq,Show)
instance Semiring LeftLong where
zero = Zero; one = One
Zero .+. y = y
x .+. Zero = x
One .+. y = y
x .+. One = x
LeftLong a b .+. LeftLong c d
| a=d = LeftLong a b
| otherwise = LeftLong c d
Zero .*. _ = Zero
_ .*. Zero = Zero
One .*. y = y
x .*. One = x
LeftLong a _ .*. LeftLong _ b = LeftLong a b
instance Weight c (Int,c) LeftLong where
symWeight p (n,c) = p c .*. LeftLong n n
weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/Longest/ 0000755 0000000 0000000 00000000000 11716565266 021222 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/src/Text/RegExp/Matching/Longest/Type.hs 0000644 0000000 0000000 00000001374 11716565266 022504 0 ustar 00 0000000 0000000 {-# LANGUAGE MultiParamTypeClasses, FlexibleInstances #-}
module Text.RegExp.Matching.Longest.Type where
import Data.Semiring
import Text.RegExp.Data
-- | Semiring used for longest matching.
--
data Longest = Zero | One | Longest !Int
deriving (Eq,Show)
instance Semiring Longest where
zero = Zero; one = One
Zero .+. y = y
x .+. Zero = x
One .+. y = y
x .+. One = x
Longest a .+. Longest b = Longest (max a b)
Zero .*. _ = Zero
_ .*. Zero = Zero
One .*. y = y
x .*. One = x
Longest a .*. Longest b = Longest (a+b)
instance Weight c c Longest where
symWeight p c = p c .*. Longest 1
weighted-regexp-0.3.1.2/dist/ 0000755 0000000 0000000 00000000000 11716565266 014113 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/dist/build/ 0000755 0000000 0000000 00000000000 11716565266 015212 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/dist/build/Text/ 0000755 0000000 0000000 00000000000 11716565266 016136 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/dist/build/Text/RegExp/ 0000755 0000000 0000000 00000000000 11716565266 017330 5 ustar 00 0000000 0000000 weighted-regexp-0.3.1.2/dist/build/Text/RegExp/Parser.hs 0000644 0000000 0000000 00000042071 11716565266 021124 0 ustar 00 0000000 0000000 {-# OPTIONS_GHC -fno-warn-overlapping-patterns #-}
{-# OPTIONS -fglasgow-exts -cpp #-}
{-# LANGUAGE NoMonomorphismRestriction #-}
{-# OPTIONS -fno-warn-incomplete-patterns -fno-warn-missing-signatures #-}
module Text.RegExp.Parser ( parse ) where
import Text.RegExp.Data
( eps, char, psym, anySym, alt, seq_, rep, rep1, opt, brep )
import Data.Char ( isSpace, toLower, isAlphaNum, isDigit )
import qualified Data.Array as Happy_Data_Array
import qualified GHC.Exts as Happy_GHC_Exts
-- parser produced by Happy Version 1.18.5
newtype HappyAbsSyn t4 = HappyAbsSyn HappyAny
#if __GLASGOW_HASKELL__ >= 607
type HappyAny = Happy_GHC_Exts.Any
#else
type HappyAny = forall a . a
#endif
happyIn4 :: t4 -> (HappyAbsSyn t4)
happyIn4 x = Happy_GHC_Exts.unsafeCoerce# x
{-# INLINE happyIn4 #-}
happyOut4 :: (HappyAbsSyn t4) -> t4
happyOut4 x = Happy_GHC_Exts.unsafeCoerce# x
{-# INLINE happyOut4 #-}
happyInTok :: (Token) -> (HappyAbsSyn t4)
happyInTok x = Happy_GHC_Exts.unsafeCoerce# x
{-# INLINE happyInTok #-}
happyOutTok :: (HappyAbsSyn t4) -> (Token)
happyOutTok x = Happy_GHC_Exts.unsafeCoerce# x
{-# INLINE happyOutTok #-}
happyActOffsets :: HappyAddr
happyActOffsets = HappyA# "\x04\x00\x00\x00\xff\xff\x00\x00\x04\x00\x00\x00\x00\x00\x0e\x00\x00\x00\x04\x00\x04\x00\x00\x00\x00\x00\x00\x00\x16\x00\x19\x00\x00\x00\x00\x00"#
happyGotoOffsets :: HappyAddr
happyGotoOffsets = HappyA# "\x13\x00\x00\x00\x00\x00\x00\x00\x0d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"#
happyDefActions :: HappyAddr
happyDefActions = HappyA# "\xfe\xff\x00\x00\x00\x00\xfd\xff\xfe\xff\xf5\xff\xf4\xff\x00\x00\xfc\xff\xfe\xff\xfe\xff\xf8\xff\xf7\xff\xf6\xff\xfa\xff\xfb\xff\xf9\xff"#
happyCheck :: HappyAddr
happyCheck = HappyA# "\xff\xff\x02\x00\x03\x00\x04\x00\xff\xff\x01\x00\x07\x00\x08\x00\x09\x00\x05\x00\x00\x00\x0c\x00\x00\x00\x00\x00\x0a\x00\x0b\x00\x02\x00\x03\x00\x04\x00\x00\x00\x06\x00\x07\x00\x08\x00\x09\x00\x02\x00\x03\x00\x04\x00\x02\x00\x03\x00\x07\x00\x08\x00\x09\x00\x07\x00\x08\x00\x09\x00\xff\xff\xff\xff\xff\xff"#
happyTable :: HappyAddr
happyTable = HappyA# "\x00\x00\x09\x00\x0a\x00\x0b\x00\x00\x00\x04\x00\x0c\x00\x0d\x00\x0e\x00\x05\x00\x0e\x00\xff\xff\x0f\x00\x07\x00\x06\x00\x07\x00\x09\x00\x0a\x00\x0b\x00\x02\x00\x11\x00\x0c\x00\x0d\x00\x0e\x00\x09\x00\x0a\x00\x0b\x00\x09\x00\x0a\x00\x0c\x00\x0d\x00\x0e\x00\x0c\x00\x0d\x00\x0e\x00\x00\x00\x00\x00\x00\x00"#
happyReduceArr = Happy_Data_Array.array (1, 11) [
(1 , happyReduce_1),
(2 , happyReduce_2),
(3 , happyReduce_3),
(4 , happyReduce_4),
(5 , happyReduce_5),
(6 , happyReduce_6),
(7 , happyReduce_7),
(8 , happyReduce_8),
(9 , happyReduce_9),
(10 , happyReduce_10),
(11 , happyReduce_11)
]
happy_n_terms = 13 :: Int
happy_n_nonterms = 1 :: Int
happyReduce_1 = happySpecReduce_0 0# happyReduction_1
happyReduction_1 = happyIn4
(eps
)
happyReduce_2 = happySpecReduce_1 0# happyReduction_2
happyReduction_2 happy_x_1
= case happyOutTok happy_x_1 of { (Sym happy_var_1) ->
happyIn4
(char happy_var_1
)}
happyReduce_3 = happySpecReduce_2 0# happyReduction_3
happyReduction_3 happy_x_2
happy_x_1
= case happyOut4 happy_x_1 of { happy_var_1 ->
happyIn4
(rep happy_var_1
)}
happyReduce_4 = happySpecReduce_3 0# happyReduction_4
happyReduction_4 happy_x_3
happy_x_2
happy_x_1
= case happyOut4 happy_x_1 of { happy_var_1 ->
case happyOut4 happy_x_3 of { happy_var_3 ->
happyIn4
(seq_ happy_var_1 happy_var_3
)}}
happyReduce_5 = happySpecReduce_3 0# happyReduction_5
happyReduction_5 happy_x_3
happy_x_2
happy_x_1
= case happyOut4 happy_x_1 of { happy_var_1 ->
case happyOut4 happy_x_3 of { happy_var_3 ->
happyIn4
(alt happy_var_1 happy_var_3
)}}
happyReduce_6 = happySpecReduce_3 0# happyReduction_6
happyReduction_6 happy_x_3
happy_x_2
happy_x_1
= case happyOut4 happy_x_2 of { happy_var_2 ->
happyIn4
(happy_var_2
)}
happyReduce_7 = happySpecReduce_2 0# happyReduction_7
happyReduction_7 happy_x_2
happy_x_1
= case happyOut4 happy_x_1 of { happy_var_1 ->
happyIn4
(rep1 happy_var_1
)}
happyReduce_8 = happySpecReduce_2 0# happyReduction_8
happyReduction_8 happy_x_2
happy_x_1
= case happyOut4 happy_x_1 of { happy_var_1 ->
happyIn4
(opt happy_var_1
)}
happyReduce_9 = happySpecReduce_2 0# happyReduction_9
happyReduction_9 happy_x_2
happy_x_1
= case happyOut4 happy_x_1 of { happy_var_1 ->
case happyOutTok happy_x_2 of { (Bnd happy_var_2) ->
happyIn4
(brep happy_var_2 happy_var_1
)}}
happyReduce_10 = happySpecReduce_1 0# happyReduction_10
happyReduction_10 happy_x_1
= case happyOutTok happy_x_1 of { (Cls happy_var_1) ->
happyIn4
(uncurry psym happy_var_1
)}
happyReduce_11 = happySpecReduce_1 0# happyReduction_11
happyReduction_11 happy_x_1
= happyIn4
(anySym
)
happyNewToken action sts stk [] =
happyDoAction 12# notHappyAtAll action sts stk []
happyNewToken action sts stk (tk:tks) =
let cont i = happyDoAction i tk action sts stk tks in
case tk of {
Sym happy_dollar_dollar -> cont 1#;
Ast -> cont 2#;
Seq -> cont 3#;
Bar -> cont 4#;
L -> cont 5#;
R -> cont 6#;
Pls -> cont 7#;
Que -> cont 8#;
Bnd happy_dollar_dollar -> cont 9#;
Cls happy_dollar_dollar -> cont 10#;
Dot -> cont 11#;
_ -> happyError' (tk:tks)
}
happyError_ tk tks = happyError' (tk:tks)
newtype HappyIdentity a = HappyIdentity a
happyIdentity = HappyIdentity
happyRunIdentity (HappyIdentity a) = a
instance Monad HappyIdentity where
return = HappyIdentity
(HappyIdentity p) >>= q = q p
happyThen :: () => HappyIdentity a -> (a -> HappyIdentity b) -> HappyIdentity b
happyThen = (>>=)
happyReturn :: () => a -> HappyIdentity a
happyReturn = (return)
happyThen1 m k tks = (>>=) m (\a -> k a tks)
happyReturn1 :: () => a -> b -> HappyIdentity a
happyReturn1 = \a tks -> (return) a
happyError' :: () => [(Token)] -> HappyIdentity a
happyError' = HappyIdentity . parseError
parseTokens tks = happyRunIdentity happySomeParser where
happySomeParser = happyThen (happyParse 0# tks) (\x -> happyReturn (happyOut4 x))
happySeq = happyDontSeq
parse = parseTokens . scan
data Token = Seq | Sym Char | Ast | Bar | L | R
| Pls | Que | Bnd (Int,Int)
| Cls (String,Char -> Bool) | Dot
token :: Char -> Token
token '*' = Ast
token '|' = Bar
token '(' = L
token ')' = R
token '?' = Que
token '+' = Pls
token '.' = Dot
token c = Sym c
scan :: String -> [Token]
scan = insertSeqs . process
insertSeqs :: [Token] -> [Token]
insertSeqs [] = []
insertSeqs [t] = [t]
insertSeqs (a:ts@(b:_))
| lseq a && rseq b = a : Seq : insertSeqs ts
| otherwise = a : insertSeqs ts
lseq :: Token -> Bool
lseq Bar = False
lseq L = False
lseq _ = True
rseq :: Token -> Bool
rseq (Sym _) = True
rseq L = True
rseq (Cls _) = True
rseq Dot = True
rseq _ = False
process :: String -> [Token]
process [] = []
process ('\\':c:cs) = Cls (['\\',c],symClassPred c) : process cs
process ('{':cs) = case reads cs of
(n,'}':s1) : _ -> Bnd (n,n) : process s1
(n,',':s1) : _ ->
case reads s1 of
(m,'}':s2) : _ -> Bnd (n,m) : process s2
_ -> token '{' : process cs
_ -> token '{' : process cs
process ('[':'^':cs) = Cls (('[':'^':s),not.p) : process xs
where (s,p,xs) = processCls cs
process ('[' :cs) = Cls ('[':s,p) : process xs
where (s,p,xs) = processCls cs
process (c:cs) = token c : process cs
processCls :: String -> (String, Char -> Bool, String)
processCls [] = parseError []
processCls (']':cs) = ("]", const False, cs)
processCls ('\\':c:cs)
| isSymClassChar c = ('\\':c:s, \x -> symClassPred c x || p x, xs)
where (s,p,xs) = processCls cs
processCls ('\\':c:cs) = ('\\':c:s, \x -> x==c || p x, xs)
where (s,p,xs) = processCls cs
processCls (c:'-':e:cs) | e /= ']'
= (c:'-':e:s, \d -> (c<=d && d<=e) || p d, xs)
where (s,p,xs) = processCls cs
processCls (c:cs) = (c:s, \b -> b==c || p b, xs)
where (s,p,xs) = processCls cs
isSymClassChar :: Char -> Bool
isSymClassChar = (`elem`"wWdDsS")
symClassPred :: Char -> Char -> Bool
symClassPred 'w' = isWordChar
symClassPred 'd' = isDigit
symClassPred 's' = isSpace
symClassPred 'W' = not . isWordChar
symClassPred 'D' = not . isDigit
symClassPred 'S' = not . isSpace
symClassPred c = (c==)
isWordChar :: Char -> Bool
isWordChar c = c == '_' || isAlphaNum c
parseError :: [Token] -> a
parseError _ = error "cannot parse regular expression"
{-# LINE 1 "templates/GenericTemplate.hs" #-}
{-# LINE 1 "templates/GenericTemplate.hs" #-}
{-# LINE 1 "" #-}
{-# LINE 1 "" #-}
{-# LINE 1 "templates/GenericTemplate.hs" #-}
-- Id: GenericTemplate.hs,v 1.26 2005/01/14 14:47:22 simonmar Exp
{-# LINE 30 "templates/GenericTemplate.hs" #-}
data Happy_IntList = HappyCons Happy_GHC_Exts.Int# Happy_IntList
{-# LINE 51 "templates/GenericTemplate.hs" #-}
{-# LINE 61 "templates/GenericTemplate.hs" #-}
{-# LINE 70 "templates/GenericTemplate.hs" #-}
infixr 9 `HappyStk`
data HappyStk a = HappyStk a (HappyStk a)
-----------------------------------------------------------------------------
-- starting the parse
happyParse start_state = happyNewToken start_state notHappyAtAll notHappyAtAll
-----------------------------------------------------------------------------
-- Accepting the parse
-- If the current token is 0#, it means we've just accepted a partial
-- parse (a %partial parser). We must ignore the saved token on the top of
-- the stack in this case.
happyAccept 0# tk st sts (_ `HappyStk` ans `HappyStk` _) =
happyReturn1 ans
happyAccept j tk st sts (HappyStk ans _) =
(happyTcHack j (happyTcHack st)) (happyReturn1 ans)
-----------------------------------------------------------------------------
-- Arrays only: do the next action
happyDoAction i tk st
= {- nothing -}
case action of
0# -> {- nothing -}
happyFail i tk st
-1# -> {- nothing -}
happyAccept i tk st
n | (n Happy_GHC_Exts.<# (0# :: Happy_GHC_Exts.Int#)) -> {- nothing -}
(happyReduceArr Happy_Data_Array.! rule) i tk st
where rule = (Happy_GHC_Exts.I# ((Happy_GHC_Exts.negateInt# ((n Happy_GHC_Exts.+# (1# :: Happy_GHC_Exts.Int#))))))
n -> {- nothing -}
happyShift new_state i tk st
where !(new_state) = (n Happy_GHC_Exts.-# (1# :: Happy_GHC_Exts.Int#))
where !(off) = indexShortOffAddr happyActOffsets st
!(off_i) = (off Happy_GHC_Exts.+# i)
check = if (off_i Happy_GHC_Exts.>=# (0# :: Happy_GHC_Exts.Int#))
then (indexShortOffAddr happyCheck off_i Happy_GHC_Exts.==# i)
else False
!(action)
| check = indexShortOffAddr happyTable off_i
| otherwise = indexShortOffAddr happyDefActions st
{-# LINE 130 "templates/GenericTemplate.hs" #-}
indexShortOffAddr (HappyA# arr) off =
Happy_GHC_Exts.narrow16Int# i
where
!i = Happy_GHC_Exts.word2Int# (Happy_GHC_Exts.or# (Happy_GHC_Exts.uncheckedShiftL# high 8#) low)
!high = Happy_GHC_Exts.int2Word# (Happy_GHC_Exts.ord# (Happy_GHC_Exts.indexCharOffAddr# arr (off' Happy_GHC_Exts.+# 1#)))
!low = Happy_GHC_Exts.int2Word# (Happy_GHC_Exts.ord# (Happy_GHC_Exts.indexCharOffAddr# arr off'))
!off' = off Happy_GHC_Exts.*# 2#
data HappyAddr = HappyA# Happy_GHC_Exts.Addr#
-----------------------------------------------------------------------------
-- HappyState data type (not arrays)
{-# LINE 163 "templates/GenericTemplate.hs" #-}
-----------------------------------------------------------------------------
-- Shifting a token
happyShift new_state 0# tk st sts stk@(x `HappyStk` _) =
let !(i) = (case Happy_GHC_Exts.unsafeCoerce# x of { (Happy_GHC_Exts.I# (i)) -> i }) in
-- trace "shifting the error token" $
happyDoAction i tk new_state (HappyCons (st) (sts)) (stk)
happyShift new_state i tk st sts stk =
happyNewToken new_state (HappyCons (st) (sts)) ((happyInTok (tk))`HappyStk`stk)
-- happyReduce is specialised for the common cases.
happySpecReduce_0 i fn 0# tk st sts stk
= happyFail 0# tk st sts stk
happySpecReduce_0 nt fn j tk st@((action)) sts stk
= happyGoto nt j tk st (HappyCons (st) (sts)) (fn `HappyStk` stk)
happySpecReduce_1 i fn 0# tk st sts stk
= happyFail 0# tk st sts stk
happySpecReduce_1 nt fn j tk _ sts@((HappyCons (st@(action)) (_))) (v1`HappyStk`stk')
= let r = fn v1 in
happySeq r (happyGoto nt j tk st sts (r `HappyStk` stk'))
happySpecReduce_2 i fn 0# tk st sts stk
= happyFail 0# tk st sts stk
happySpecReduce_2 nt fn j tk _ (HappyCons (_) (sts@((HappyCons (st@(action)) (_))))) (v1`HappyStk`v2`HappyStk`stk')
= let r = fn v1 v2 in
happySeq r (happyGoto nt j tk st sts (r `HappyStk` stk'))
happySpecReduce_3 i fn 0# tk st sts stk
= happyFail 0# tk st sts stk
happySpecReduce_3 nt fn j tk _ (HappyCons (_) ((HappyCons (_) (sts@((HappyCons (st@(action)) (_))))))) (v1`HappyStk`v2`HappyStk`v3`HappyStk`stk')
= let r = fn v1 v2 v3 in
happySeq r (happyGoto nt j tk st sts (r `HappyStk` stk'))
happyReduce k i fn 0# tk st sts stk
= happyFail 0# tk st sts stk
happyReduce k nt fn j tk st sts stk
= case happyDrop (k Happy_GHC_Exts.-# (1# :: Happy_GHC_Exts.Int#)) sts of
sts1@((HappyCons (st1@(action)) (_))) ->
let r = fn stk in -- it doesn't hurt to always seq here...
happyDoSeq r (happyGoto nt j tk st1 sts1 r)
happyMonadReduce k nt fn 0# tk st sts stk
= happyFail 0# tk st sts stk
happyMonadReduce k nt fn j tk st sts stk =
happyThen1 (fn stk tk) (\r -> happyGoto nt j tk st1 sts1 (r `HappyStk` drop_stk))
where !(sts1@((HappyCons (st1@(action)) (_)))) = happyDrop k (HappyCons (st) (sts))
drop_stk = happyDropStk k stk
happyMonad2Reduce k nt fn 0# tk st sts stk
= happyFail 0# tk st sts stk
happyMonad2Reduce k nt fn j tk st sts stk =
happyThen1 (fn stk tk) (\r -> happyNewToken new_state sts1 (r `HappyStk` drop_stk))
where !(sts1@((HappyCons (st1@(action)) (_)))) = happyDrop k (HappyCons (st) (sts))
drop_stk = happyDropStk k stk
!(off) = indexShortOffAddr happyGotoOffsets st1
!(off_i) = (off Happy_GHC_Exts.+# nt)
!(new_state) = indexShortOffAddr happyTable off_i
happyDrop 0# l = l
happyDrop n (HappyCons (_) (t)) = happyDrop (n Happy_GHC_Exts.-# (1# :: Happy_GHC_Exts.Int#)) t
happyDropStk 0# l = l
happyDropStk n (x `HappyStk` xs) = happyDropStk (n Happy_GHC_Exts.-# (1#::Happy_GHC_Exts.Int#)) xs
-----------------------------------------------------------------------------
-- Moving to a new state after a reduction
happyGoto nt j tk st =
{- nothing -}
happyDoAction j tk new_state
where !(off) = indexShortOffAddr happyGotoOffsets st
!(off_i) = (off Happy_GHC_Exts.+# nt)
!(new_state) = indexShortOffAddr happyTable off_i
-----------------------------------------------------------------------------
-- Error recovery (0# is the error token)
-- parse error if we are in recovery and we fail again
happyFail 0# tk old_st _ stk =
-- trace "failing" $
happyError_ tk
{- We don't need state discarding for our restricted implementation of
"error". In fact, it can cause some bogus parses, so I've disabled it
for now --SDM
-- discard a state
happyFail 0# tk old_st (HappyCons ((action)) (sts))
(saved_tok `HappyStk` _ `HappyStk` stk) =
-- trace ("discarding state, depth " ++ show (length stk)) $
happyDoAction 0# tk action sts ((saved_tok`HappyStk`stk))
-}
-- Enter error recovery: generate an error token,
-- save the old token and carry on.
happyFail i tk (action) sts stk =
-- trace "entering error recovery" $
happyDoAction 0# tk action sts ( (Happy_GHC_Exts.unsafeCoerce# (Happy_GHC_Exts.I# (i))) `HappyStk` stk)
-- Internal happy errors:
notHappyAtAll = error "Internal Happy error\n"
-----------------------------------------------------------------------------
-- Hack to get the typechecker to accept our action functions
happyTcHack :: Happy_GHC_Exts.Int# -> a -> a
happyTcHack x y = y
{-# INLINE happyTcHack #-}
-----------------------------------------------------------------------------
-- Seq-ing. If the --strict flag is given, then Happy emits
-- happySeq = happyDoSeq
-- otherwise it emits
-- happySeq = happyDontSeq
happyDoSeq, happyDontSeq :: a -> b -> b
happyDoSeq a b = a `seq` b
happyDontSeq a b = b
-----------------------------------------------------------------------------
-- Don't inline any functions from the template. GHC has a nasty habit
-- of deciding to inline happyGoto everywhere, which increases the size of
-- the generated parser quite a bit.
{-# NOINLINE happyDoAction #-}
{-# NOINLINE happyTable #-}
{-# NOINLINE happyCheck #-}
{-# NOINLINE happyActOffsets #-}
{-# NOINLINE happyGotoOffsets #-}
{-# NOINLINE happyDefActions #-}
{-# NOINLINE happyShift #-}
{-# NOINLINE happySpecReduce_0 #-}
{-# NOINLINE happySpecReduce_1 #-}
{-# NOINLINE happySpecReduce_2 #-}
{-# NOINLINE happySpecReduce_3 #-}
{-# NOINLINE happyReduce #-}
{-# NOINLINE happyMonadReduce #-}
{-# NOINLINE happyGoto #-}
{-# NOINLINE happyFail #-}
-- end of Happy Template.