statistics-0.16.2.1/0000755000000000000000000000000007346545000012322 5ustar0000000000000000statistics-0.16.2.1/LICENSE0000644000000000000000000000246107346545000013332 0ustar0000000000000000Copyright (c) 2009, 2010 Bryan O'Sullivan All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. statistics-0.16.2.1/README.markdown0000644000000000000000000000200707346545000015022 0ustar0000000000000000# Statistics: efficient, general purpose statistics This package provides the Statistics module, a Haskell library for working with statistical data in a space- and time-efficient way. Where possible, we give citations and computational complexity estimates for the algorithms used. # Performance This library has been carefully optimised for high performance. To obtain the best runtime efficiency, it is imperative to compile libraries and applications that use this library using a high level of optimisation. # Get involved! Please report bugs via the [github issue tracker](https://github.com/bos/statistics/issues). Master [git mirror](https://github.com/bos/statistics): * `git clone git://github.com/bos/statistics.git` There's also a [Mercurial mirror](https://bitbucket.org/bos/statistics): * `hg clone https://bitbucket.org/bos/statistics` (You can create and contribute changes using either Mercurial or git.) # Authors This library is written and maintained by Bryan O'Sullivan, . statistics-0.16.2.1/Setup.lhs0000644000000000000000000000011407346545000014126 0ustar0000000000000000#!/usr/bin/env runhaskell > import Distribution.Simple > main = defaultMain statistics-0.16.2.1/Statistics/0000755000000000000000000000000007346545000014454 5ustar0000000000000000statistics-0.16.2.1/Statistics/Autocorrelation.hs0000644000000000000000000000321307346545000020161 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} -- | -- Module : Statistics.Autocorrelation -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Functions for computing autocovariance and autocorrelation of a -- sample. 
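--
-- A short usage sketch (the sample values are illustrative
-- assumptions, not output checked against this package's tests):
--
-- > import qualified Data.Vector.Unboxed as U
-- >
-- > acv         = autocovariance  (U.fromList [1,2,3,4,5 :: Double])
-- > (r, lo, hi) = autocorrelation (U.fromList [1,2,3,4,5 :: Double])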
module Statistics.Autocorrelation ( autocovariance , autocorrelation ) where import Prelude hiding (sum) import Statistics.Function (square) import Statistics.Sample (mean) import Statistics.Sample.Internal (sum) import qualified Data.Vector.Generic as G -- | Compute the autocovariance of a sample, i.e. the covariance of -- the sample against a shifted version of itself. autocovariance :: (G.Vector v Double, G.Vector v Int) => v Double -> v Double autocovariance a = G.map f . G.enumFromTo 0 $ l-2 where f k = sum (G.zipWith (*) (G.take (l-k) c) (G.slice k (l-k) c)) / fromIntegral l c = G.map (subtract (mean a)) a l = G.length a -- | Compute the autocorrelation function of a sample, and the upper -- and lower bounds of confidence intervals for each element. -- -- /Note/: The calculation of the 95% confidence interval assumes a -- stationary Gaussian process. autocorrelation :: (G.Vector v Double, G.Vector v Int) => v Double -> (v Double, v Double, v Double) autocorrelation a = (r, ci (-), ci (+)) where r = G.map (/ G.head c) c where c = autocovariance a dllse = G.map f . G.scanl1 (+) . G.map square $ r where f v = 1.96 * sqrt ((v * 2 + 1) / l) l = fromIntegral (G.length a) ci f = G.cons 1 . G.tail . G.map (f (-1/l)) $ dllse statistics-0.16.2.1/Statistics/ConfidenceInt.hs0000644000000000000000000000636707346545000017534 0ustar0000000000000000{-# LANGUAGE ViewPatterns #-} -- | Calculation of confidence intervals module Statistics.ConfidenceInt ( poissonCI , poissonNormalCI , binomialCI , naiveBinomialCI -- * References -- $references ) where import Statistics.Distribution import Statistics.Distribution.ChiSquared import Statistics.Distribution.Beta import Statistics.Types -- | Calculate confidence intervals for Poisson-distributed value -- using normal approximation poissonNormalCI :: Int -> Estimate NormalErr Double poissonNormalCI n | n < 0 = error "Statistics.ConfidenceInt.poissonNormalCI negative number of trials" | otherwise = estimateNormErr n' (sqrt n') where n' = fromIntegral n -- | Calculate confidence intervals for Poisson-distributed value for -- single measurement. These are exact confidence intervals poissonCI :: CL Double -> Int -> Estimate ConfInt Double poissonCI cl@(significanceLevel -> p) n | n < 0 = error "Statistics.ConfidenceInt.poissonCI: negative number of trials" | n == 0 = estimateFromInterval m (0 ,m2) cl | otherwise = estimateFromInterval m (m1,m2) cl where m = fromIntegral n m1 = 0.5 * quantile (chiSquared (2*n )) (p/2) m2 = 0.5 * complQuantile (chiSquared (2*n+2)) (p/2) -- | Calculate confidence interval using normal approximation. Note -- that this approximation breaks down when /p/ is either close to 0 -- or to 1. In particular if @np < 5@ or @1 - np < 5@ this -- approximation shouldn't be used. naiveBinomialCI :: Int -- ^ Number of trials -> Int -- ^ Number of successes -> Estimate NormalErr Double naiveBinomialCI n k | n <= 0 || k < 0 = error "Statistics.ConfidenceInt.naiveBinomialCI: negative number of events" | k > n = error "Statistics.ConfidenceInt.naiveBinomialCI: more successes than trials" | otherwise = estimateNormErr p σ where p = fromIntegral k / fromIntegral n σ = sqrt $ p * (1 - p) / fromIntegral n -- | Clopper-Pearson confidence interval also known as exact -- confidence intervals. 
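--
-- A usage sketch (assuming 'cl95' from "Statistics.Types", which this
-- module imports; the counts are illustrative):
--
-- > binomialCI cl95 100 37  -- exact 95% interval for 37 successes in 100 trials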
binomialCI :: CL Double -> Int -- ^ Number of trials -> Int -- ^ Number of successes -> Estimate ConfInt Double binomialCI cl@(significanceLevel -> p) ni ki | ni <= 0 || ki < 0 = error "Statistics.ConfidenceInt.binomialCI: negative number of events" | ki > ni = error "Statistics.ConfidenceInt.binomialCI: more successes than trials" | ki == 0 = estimateFromInterval eff (0, ub) cl | ni == ki = estimateFromInterval eff (lb,0 ) cl | otherwise = estimateFromInterval eff (lb,ub) cl where k = fromIntegral ki n = fromIntegral ni eff = k / n lb = quantile (betaDistr k (n - k + 1)) (p/2) ub = complQuantile (betaDistr (k + 1) (n - k) ) (p/2) -- $references -- -- * Clopper, C.; Pearson, E. S. (1934). "The use of confidence or -- fiducial limits illustrated in the case of the -- binomial". Biometrika 26: 404–413. doi:10.1093/biomet/26.4.404 -- -- * Brown, Lawrence D.; Cai, T. Tony; DasGupta, Anirban -- (2001). "Interval Estimation for a Binomial Proportion". Statistical -- Science 16 (2): 101–133. doi:10.1214/ss/1009213286. MR 1861069. -- Zbl 02068924. statistics-0.16.2.1/Statistics/Correlation.hs0000644000000000000000000000370107346545000017272 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE BangPatterns #-} -- | -- Module : Statistics.Correlation.Pearson -- module Statistics.Correlation ( -- * Pearson correlation pearson , pearsonMatByRow -- * Spearman correlation , spearman , spearmanMatByRow ) where import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U import Statistics.Matrix import Statistics.Sample import Statistics.Test.Internal (rankUnsorted) ---------------------------------------------------------------- -- Pearson ---------------------------------------------------------------- -- | Pearson correlation for sample of pairs. Exactly same as -- 'Statistics.Sample.correlation' pearson :: (G.Vector v (Double, Double), G.Vector v Double) => v (Double, Double) -> Double pearson = correlation {-# INLINE pearson #-} -- | Compute pairwise Pearson correlation between rows of a matrix pearsonMatByRow :: Matrix -> Matrix pearsonMatByRow m = generateSym (rows m) (\i j -> pearson $ row m i `U.zip` row m j) {-# INLINE pearsonMatByRow #-} ---------------------------------------------------------------- -- Spearman ---------------------------------------------------------------- -- | compute Spearman correlation between two samples spearman :: ( Ord a , Ord b , G.Vector v a , G.Vector v b , G.Vector v (a, b) , G.Vector v Int , G.Vector v Double , G.Vector v (Double, Double) , G.Vector v (Int, a) , G.Vector v (Int, b) ) => v (a, b) -> Double spearman xy = pearson $ G.zip (rankUnsorted x) (rankUnsorted y) where (x, y) = G.unzip xy {-# INLINE spearman #-} -- | compute pairwise Spearman correlation between rows of a matrix spearmanMatByRow :: Matrix -> Matrix spearmanMatByRow = pearsonMatByRow . fromRows . fmap rankUnsorted . toRows {-# INLINE spearmanMatByRow #-} statistics-0.16.2.1/Statistics/Correlation/0000755000000000000000000000000007346545000016735 5ustar0000000000000000statistics-0.16.2.1/Statistics/Correlation/Kendall.hs0000644000000000000000000001206707346545000020651 0ustar0000000000000000{-# LANGUAGE BangPatterns, FlexibleContexts #-} -- | -- Module : Statistics.Correlation.Kendall -- -- Fast O(NlogN) implementation of -- . -- -- This module implements Kendall's tau form b which allows ties in the data. -- This is the same formula used by other statistical packages, e.g., R, matlab. 
--
-- > \tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}
--
-- where n_0 = n(n-1)\/2, n_1 = number of pairs tied for the first quantity,
-- n_2 = number of pairs tied for the second quantity,
-- n_c = number of concordant pairs, n_d = number of discordant pairs.

module Statistics.Correlation.Kendall
    ( kendall
    -- * References
    -- $references
    ) where

import Control.Monad.ST (ST, runST)
import Data.Bits (shiftR)
import Data.Function (on)
import Data.STRef
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as GM

-- | /O(nlogn)/ Compute Kendall's tau from a vector of paired data.
-- Returns NaN when the number of pairs is <= 1.
kendall :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
kendall xy'
  | G.length xy' <= 1 = 0/0
  | otherwise = runST $ do
      xy <- G.thaw xy'
      let n = GM.length xy
      n_dRef <- newSTRef 0
      I.sort xy
      tieX  <- numOfTiesBy ((==) `on` fst) xy
      tieXY <- numOfTiesBy (==) xy
      tmp <- GM.new n
      mergeSort (compare `on` snd) xy tmp n_dRef
      tieY <- numOfTiesBy ((==) `on` snd) xy
      n_d  <- readSTRef n_dRef
      let n_0 = (fromIntegral n * (fromIntegral n - 1)) `shiftR` 1 :: Integer
          n_c = n_0 - n_d - tieX - tieY + tieXY
      return $ fromIntegral (n_c - n_d) /
               (sqrt . fromIntegral) ((n_0 - tieX) * (n_0 - tieY))
{-# INLINE kendall #-}

-- Calculate the number of tied pairs in a sorted vector.
numOfTiesBy :: GM.MVector v a
            => (a -> a -> Bool) -> v s a -> ST s Integer
numOfTiesBy f xs = do
    count <- newSTRef (0 :: Integer)
    loop count (1 :: Int) (0 :: Int)
    readSTRef count
  where
    n = GM.length xs
    loop c !acc !i
      | i >= n - 1 = modifySTRef' c (+ g acc)
      | otherwise  = do
          x1 <- GM.unsafeRead xs i
          x2 <- GM.unsafeRead xs (i+1)
          if f x1 x2
            then loop c (acc+1) (i+1)
            else modifySTRef' c (+ g acc) >> loop c 1 (i+1)
    g x = fromIntegral ((x * (x - 1)) `shiftR` 1)
{-# INLINE numOfTiesBy #-}

-- Implementation of Knight's merge sort (adapted from vector-algorithms).
-- This function is used to count the number of discordant pairs.
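--
-- A worked miniature of the counting rule (an illustrative sketch, not
-- part of the upstream documentation): merging the sorted halves [1,4]
-- and [2,3], the element 2 is written out while 4 is still pending in
-- the lower half, so the count grows by the number of pending lower
-- elements (here 1); the same happens when 3 is written, giving the
-- two inversions (4,2) and (4,3).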
mergeSort :: GM.MVector v e => (e -> e -> Ordering) -> v s e -> v s e -> STRef s Integer -> ST s () mergeSort cmp src buf count = loop 0 (GM.length src - 1) where loop l u | u == l = return () | u - l == 1 = do eL <- GM.unsafeRead src l eU <- GM.unsafeRead src u case cmp eL eU of GT -> do GM.unsafeWrite src l eU GM.unsafeWrite src u eL modifySTRef' count (+1) _ -> return () | otherwise = do let mid = (u + l) `shiftR` 1 loop l mid loop mid u merge cmp (GM.unsafeSlice l (u-l+1) src) buf (mid - l) count {-# INLINE mergeSort #-} merge :: GM.MVector v e => (e -> e -> Ordering) -> v s e -> v s e -> Int -> STRef s Integer -> ST s () merge cmp src buf mid count = do GM.unsafeCopy tmp lower eTmp <- GM.unsafeRead tmp 0 eUpp <- GM.unsafeRead upper 0 loop tmp 0 eTmp upper 0 eUpp 0 where lower = GM.unsafeSlice 0 mid src upper = GM.unsafeSlice mid (GM.length src - mid) src tmp = GM.unsafeSlice 0 mid buf wroteHigh low iLow eLow high iHigh iIns | iHigh >= GM.length high = GM.unsafeCopy (GM.unsafeSlice iIns (GM.length low - iLow) src) (GM.unsafeSlice iLow (GM.length low - iLow) low) | otherwise = do eHigh <- GM.unsafeRead high iHigh loop low iLow eLow high iHigh eHigh iIns wroteLow low iLow high iHigh eHigh iIns | iLow >= GM.length low = return () | otherwise = do eLow <- GM.unsafeRead low iLow loop low iLow eLow high iHigh eHigh iIns loop !low !iLow !eLow !high !iHigh !eHigh !iIns = case cmp eHigh eLow of LT -> do GM.unsafeWrite src iIns eHigh modifySTRef' count (+ fromIntegral (GM.length low - iLow)) wroteHigh low iLow eLow high (iHigh+1) (iIns+1) _ -> do GM.unsafeWrite src iIns eLow wroteLow low (iLow+1) high iHigh eHigh (iIns+1) {-# INLINE merge #-} -- $references -- -- * William R. Knight. (1966) A computer method for calculating Kendall's Tau -- with ungrouped data. /Journal of the American Statistical Association/, -- Vol. 61, No. 314, Part 1, pp. 436-439. -- statistics-0.16.2.1/Statistics/Distribution.hs0000644000000000000000000002102207346545000017464 0ustar0000000000000000{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} -- | -- Module : Statistics.Distribution -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Type classes for probability distributions module Statistics.Distribution ( -- * Type classes Distribution(..) , DiscreteDistr(..) , ContDistr(..) -- ** Distribution statistics , MaybeMean(..) , Mean(..) , MaybeVariance(..) , Variance(..) , MaybeEntropy(..) , Entropy(..) , FromSample(..) -- ** Random number generation , ContGen(..) , DiscreteGen(..) , genContinuous -- * Helper functions , findRoot , sumProbabilities ) where import Prelude hiding (sum) import Statistics.Function (square) import Statistics.Sample.Internal (sum) import System.Random.Stateful (StatefulGen, uniformDouble01M) import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Generic as G -- | Type class common to all distributions. Only c.d.f. could be -- defined for both discrete and continuous distributions. class Distribution d where -- | Cumulative distribution function. The probability that a -- random variable /X/ is less or equal than /x/, -- i.e. P(/X/≤/x/). 
Cumulative should be defined for -- infinities as well: -- -- > cumulative d +∞ = 1 -- > cumulative d -∞ = 0 cumulative :: d -> Double -> Double cumulative d x = 1 - complCumulative d x -- | One's complement of cumulative distribution: -- -- > complCumulative d x = 1 - cumulative d x -- -- It's useful when one is interested in P(/X/>/x/) and -- expression on the right side begin to lose precision. This -- function have default implementation but implementors are -- encouraged to provide more precise implementation. complCumulative :: d -> Double -> Double complCumulative d x = 1 - cumulative d x {-# MINIMAL (cumulative | complCumulative) #-} -- | Discrete probability distribution. class Distribution d => DiscreteDistr d where -- | Probability of n-th outcome. probability :: d -> Int -> Double probability d = exp . logProbability d -- | Logarithm of probability of n-th outcome logProbability :: d -> Int -> Double logProbability d = log . probability d {-# MINIMAL (probability | logProbability) #-} -- | Continuous probability distribution. -- -- Minimal complete definition is 'quantile' and either 'density' or -- 'logDensity'. class Distribution d => ContDistr d where -- | Probability density function. Probability that random -- variable /X/ lies in the infinitesimal interval -- [/x/,/x+/δ/x/) equal to /density(x)/⋅δ/x/ density :: d -> Double -> Double density d = exp . logDensity d -- | Natural logarithm of density. logDensity :: d -> Double -> Double logDensity d = log . density d -- | Inverse of the cumulative distribution function. The value -- /x/ for which P(/X/≤/x/) = /p/. If probability is outside -- of [0,1] range function should call 'error' quantile :: d -> Double -> Double quantile d x = complQuantile d (1 - x) -- | 1-complement of @quantile@: -- -- > complQuantile x ≡ quantile (1 - x) complQuantile :: d -> Double -> Double complQuantile d x = quantile d (1 - x) {-# MINIMAL (density | logDensity), (quantile | complQuantile) #-} -- | Type class for distributions with mean. 'maybeMean' should return -- 'Nothing' if it's undefined for current value of data class Distribution d => MaybeMean d where maybeMean :: d -> Maybe Double -- | Type class for distributions with mean. If a distribution has -- finite mean for all valid values of parameters it should be -- instance of this type class. class MaybeMean d => Mean d where mean :: d -> Double -- | Type class for distributions with variance. If variance is -- undefined for some parameter values both 'maybeVariance' and -- 'maybeStdDev' should return Nothing. -- -- Minimal complete definition is 'maybeVariance' or 'maybeStdDev' class MaybeMean d => MaybeVariance d where maybeVariance :: d -> Maybe Double maybeVariance = fmap square . maybeStdDev maybeStdDev :: d -> Maybe Double maybeStdDev = fmap sqrt . maybeVariance {-# MINIMAL (maybeVariance | maybeStdDev) #-} -- | Type class for distributions with variance. If distribution have -- finite variance for all valid parameter values it should be -- instance of this type class. -- -- Minimal complete definition is 'variance' or 'stdDev' class (Mean d, MaybeVariance d) => Variance d where variance :: d -> Double variance d = square (stdDev d) stdDev :: d -> Double stdDev = sqrt . variance {-# MINIMAL (variance | stdDev) #-} -- | Type class for distributions with entropy, meaning Shannon entropy -- in the case of a discrete distribution, or differential entropy in the -- case of a continuous one. 'maybeEntropy' should return 'Nothing' if -- entropy is undefined for the chosen parameter values. 
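--
-- Instances in this package whose entropy is always defined simply
-- delegate to 'entropy' (a sketch; @SomeDistr@ is a placeholder type):
--
-- > instance MaybeEntropy SomeDistr where
-- >   maybeEntropy = Just . entropy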
class (Distribution d) => MaybeEntropy d where -- | Returns the entropy of a distribution, in nats, if such is defined. maybeEntropy :: d -> Maybe Double -- | Type class for distributions with entropy, meaning Shannon -- entropy in the case of a discrete distribution, or differential -- entropy in the case of a continuous one. If the distribution has -- well-defined entropy for all valid parameter values then it -- should be an instance of this type class. class (MaybeEntropy d) => Entropy d where -- | Returns the entropy of a distribution, in nats. entropy :: d -> Double -- | Generate discrete random variates which have given -- distribution. class Distribution d => ContGen d where genContVar :: (StatefulGen g m) => d -> g -> m Double -- | Generate discrete random variates which have given -- distribution. 'ContGen' is superclass because it's always possible -- to generate real-valued variates from integer values class (DiscreteDistr d, ContGen d) => DiscreteGen d where genDiscreteVar :: (StatefulGen g m) => d -> g -> m Int -- | Estimate distribution from sample. First parameter in sample is -- distribution type and second is element type. class FromSample d a where -- | Estimate distribution from sample. Returns 'Nothing' if there is -- not enough data, or if no usable fit results from the method -- used, e.g., the estimated distribution parameters would be -- invalid or inaccurate. fromSample :: G.Vector v a => v a -> Maybe d -- | Generate variates from continuous distribution using inverse -- transform rule. genContinuous :: (ContDistr d, StatefulGen g m) => d -> g -> m Double genContinuous d gen = do x <- uniformDouble01M gen return $! quantile d x data P = P {-# UNPACK #-} !Double {-# UNPACK #-} !Double -- | Approximate the value of /X/ for which P(/x/>/X/)=/p/. -- -- This method uses a combination of Newton-Raphson iteration and -- bisection with the given guess as a starting point. The upper and -- lower bounds specify the interval in which the probability -- distribution reaches the value /p/. findRoot :: ContDistr d => d -- ^ Distribution -> Double -- ^ Probability /p/ -> Double -- ^ Initial guess -> Double -- ^ Lower bound on interval -> Double -- ^ Upper bound on interval -> Double findRoot d prob = loop 0 1 where loop !(i::Int) !dx !x !lo !hi | abs dx <= accuracy || i >= maxIters = x | otherwise = loop (i+1) dx'' x'' lo' hi' where err = cumulative d x - prob P lo' hi' | err < 0 = P x hi | otherwise = P lo x pdf = density d x P dx' x' | pdf /= 0 = P (err / pdf) (x - dx) | otherwise = P dx x P dx'' x'' | x' < lo' || x' > hi' || pdf == 0 = let y = (lo' + hi') / 2 in P (y-x) y | otherwise = P dx' x' accuracy = 1e-15 maxIters = 150 -- | Sum probabilities in inclusive interval. sumProbabilities :: DiscreteDistr d => d -> Int -> Int -> Double sumProbabilities d low hi = -- Return value is forced to be less than 1 to guard against roundoff errors. -- ATTENTION! this check should be removed for testing or it could mask bugs. min 1 . sum . 
U.map (probability d) $ U.enumFromTo low hi statistics-0.16.2.1/Statistics/Distribution/0000755000000000000000000000000007346545000017133 5ustar0000000000000000statistics-0.16.2.1/Statistics/Distribution/Beta.hs0000644000000000000000000001277307346545000020354 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} ----------------------------------------------------------------------------- -- | -- Module : Statistics.Distribution.Beta -- Copyright : (C) 2012 Edward Kmett, -- License : BSD-style (see the file LICENSE) -- -- Maintainer : Edward Kmett -- Stability : provisional -- Portability : DeriveDataTypeable -- ---------------------------------------------------------------------------- module Statistics.Distribution.Beta ( BetaDistribution -- * Constructor , betaDistr , betaDistrE , improperBetaDistr , improperBetaDistrE -- * Accessors , bdAlpha , bdBeta ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.SpecFunctions ( incompleteBeta, invIncompleteBeta, logBeta, digamma, log1p) import Numeric.MathFunctions.Constants (m_NaN,m_neg_inf) import qualified Statistics.Distribution as D import Statistics.Internal -- | The beta distribution data BetaDistribution = BD { bdAlpha :: {-# UNPACK #-} !Double -- ^ Alpha shape parameter , bdBeta :: {-# UNPACK #-} !Double -- ^ Beta shape parameter } deriving (Eq, Typeable, Data, Generic) instance Show BetaDistribution where showsPrec n (BD a b) = defaultShow2 "improperBetaDistr" a b n instance Read BetaDistribution where readPrec = defaultReadPrecM2 "improperBetaDistr" improperBetaDistrE instance ToJSON BetaDistribution instance FromJSON BetaDistribution where parseJSON (Object v) = do a <- v .: "bdAlpha" b <- v .: "bdBeta" maybe (fail $ errMsgI a b) return $ improperBetaDistrE a b parseJSON _ = empty instance Binary BetaDistribution where put (BD a b) = put a >> put b get = do a <- get b <- get maybe (fail $ errMsgI a b) return $ improperBetaDistrE a b -- | Create beta distribution. Both shape parameters must be positive. betaDistr :: Double -- ^ Shape parameter alpha -> Double -- ^ Shape parameter beta -> BetaDistribution betaDistr a b = maybe (error $ errMsg a b) id $ betaDistrE a b -- | Create beta distribution. Both shape parameters must be positive. betaDistrE :: Double -- ^ Shape parameter alpha -> Double -- ^ Shape parameter beta -> Maybe BetaDistribution betaDistrE a b | a > 0 && b > 0 = Just (BD a b) | otherwise = Nothing errMsg :: Double -> Double -> String errMsg a b = "Statistics.Distribution.Beta.betaDistr: " ++ "shape parameters must be positive. Got a = " ++ show a ++ " b = " ++ show b -- | Create beta distribution. Both shape parameters must be -- non-negative. So it allows to construct improper beta distribution -- which could be used as improper prior. improperBetaDistr :: Double -- ^ Shape parameter alpha -> Double -- ^ Shape parameter beta -> BetaDistribution improperBetaDistr a b = maybe (error $ errMsgI a b) id $ improperBetaDistrE a b -- | Create beta distribution. Both shape parameters must be -- non-negative. So it allows to construct improper beta distribution -- which could be used as improper prior. 
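--
-- For instance (illustrative):
--
-- > improperBetaDistrE 0 0       -- Just an improper distribution
-- > improperBetaDistrE (-1) 2    -- Nothing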
improperBetaDistrE :: Double -- ^ Shape parameter alpha -> Double -- ^ Shape parameter beta -> Maybe BetaDistribution improperBetaDistrE a b | a >= 0 && b >= 0 = Just (BD a b) | otherwise = Nothing errMsgI :: Double -> Double -> String errMsgI a b = "Statistics.Distribution.Beta.betaDistr: " ++ "shape parameters must be non-negative. Got a = " ++ show a ++ " b = " ++ show b instance D.Distribution BetaDistribution where cumulative (BD a b) x | x <= 0 = 0 | x >= 1 = 1 | otherwise = incompleteBeta a b x complCumulative (BD a b) x | x <= 0 = 1 | x >= 1 = 0 -- For small x we use direct computation to avoid precision loss -- when computing (1-x) | x < 0.5 = 1 - incompleteBeta a b x -- Otherwise we use property of incomplete beta: -- > I(x,a,b) = 1 - I(1-x,b,a) | otherwise = incompleteBeta b a (1-x) instance D.Mean BetaDistribution where mean (BD a b) = a / (a + b) instance D.MaybeMean BetaDistribution where maybeMean = Just . D.mean instance D.Variance BetaDistribution where variance (BD a b) = a*b / (apb*apb*(apb+1)) where apb = a + b instance D.MaybeVariance BetaDistribution where maybeVariance = Just . D.variance instance D.Entropy BetaDistribution where entropy (BD a b) = logBeta a b - (a-1) * digamma a - (b-1) * digamma b + (a+b-2) * digamma (a+b) instance D.MaybeEntropy BetaDistribution where maybeEntropy = Just . D.entropy instance D.ContDistr BetaDistribution where density (BD a b) x | a <= 0 || b <= 0 = m_NaN | x <= 0 = 0 | x >= 1 = 0 | otherwise = exp $ (a-1)*log x + (b-1) * log1p (-x) - logBeta a b logDensity (BD a b) x | a <= 0 || b <= 0 = m_NaN | x <= 0 = m_neg_inf | x >= 1 = m_neg_inf | otherwise = (a-1)*log x + (b-1)*log1p (-x) - logBeta a b quantile (BD a b) p | p == 0 = 0 | p == 1 = 1 | p > 0 && p < 1 = invIncompleteBeta a b p | otherwise = error $ "Statistics.Distribution.Gamma.quantile: p must be in [0,1] range. Got: "++show p instance D.ContGen BetaDistribution where genContVar = D.genContinuous statistics-0.16.2.1/Statistics/Distribution/Binomial.hs0000644000000000000000000001323707346545000021227 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.Binomial -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The binomial distribution. This is the discrete probability -- distribution of the number of successes in a sequence of /n/ -- independent yes\/no experiments, each of which yields success with -- probability /p/. module Statistics.Distribution.Binomial ( BinomialDistribution -- * Constructors , binomial , binomialE -- * Accessors , bdTrials , bdProbability ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.SpecFunctions (choose,logChoose,incompleteBeta,log1p) import Numeric.MathFunctions.Constants (m_epsilon,m_tiny) import qualified Statistics.Distribution as D import qualified Statistics.Distribution.Poisson.Internal as I import Statistics.Internal -- | The binomial distribution. data BinomialDistribution = BD { bdTrials :: {-# UNPACK #-} !Int -- ^ Number of trials. , bdProbability :: {-# UNPACK #-} !Double -- ^ Probability. 
} deriving (Eq, Typeable, Data, Generic) instance Show BinomialDistribution where showsPrec i (BD n p) = defaultShow2 "binomial" n p i instance Read BinomialDistribution where readPrec = defaultReadPrecM2 "binomial" binomialE instance ToJSON BinomialDistribution instance FromJSON BinomialDistribution where parseJSON (Object v) = do n <- v .: "bdTrials" p <- v .: "bdProbability" maybe (fail $ errMsg n p) return $ binomialE n p parseJSON _ = empty instance Binary BinomialDistribution where put (BD x y) = put x >> put y get = do n <- get p <- get maybe (fail $ errMsg n p) return $ binomialE n p instance D.Distribution BinomialDistribution where cumulative = cumulative complCumulative = complCumulative instance D.DiscreteDistr BinomialDistribution where probability = probability logProbability = logProbability instance D.Mean BinomialDistribution where mean = mean instance D.Variance BinomialDistribution where variance = variance instance D.MaybeMean BinomialDistribution where maybeMean = Just . D.mean instance D.MaybeVariance BinomialDistribution where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy BinomialDistribution where entropy (BD n p) | n == 0 = 0 | n <= 100 = directEntropy (BD n p) | otherwise = I.poissonEntropy (fromIntegral n * p) instance D.MaybeEntropy BinomialDistribution where maybeEntropy = Just . D.entropy -- This could be slow for big n probability :: BinomialDistribution -> Int -> Double probability (BD n p) k | k < 0 || k > n = 0 | n == 0 = 1 -- choose could overflow Double for n >= 1030 so we switch to -- log-domain to calculate probability -- -- We also want to avoid underflow when computing p^k & -- (1-p)^(n-k). | n < 1000 , pK >= m_tiny , pNK >= m_tiny = choose n k * pK * pNK | otherwise = exp $ logChoose n k + log p * k' + log1p (-p) * nk' where pK = p^k pNK = (1-p)^(n-k) k' = fromIntegral k nk' = fromIntegral $ n - k logProbability :: BinomialDistribution -> Int -> Double logProbability (BD n p) k | k < 0 || k > n = (-1)/0 | n == 0 = 0 | otherwise = logChoose n k + log p * k' + log1p (-p) * nk' where k' = fromIntegral k nk' = fromIntegral $ n - k cumulative :: BinomialDistribution -> Double -> Double cumulative (BD n p) x | isNaN x = error "Statistics.Distribution.Binomial.cumulative: NaN input" | isInfinite x = if x > 0 then 1 else 0 | k < 0 = 0 | k >= n = 1 | otherwise = incompleteBeta (fromIntegral (n-k)) (fromIntegral (k+1)) (1 - p) where k = floor x complCumulative :: BinomialDistribution -> Double -> Double complCumulative (BD n p) x | isNaN x = error "Statistics.Distribution.Binomial.complCumulative: NaN input" | isInfinite x = if x > 0 then 0 else 1 | k < 0 = 1 | k >= n = 0 | otherwise = incompleteBeta (fromIntegral (k+1)) (fromIntegral (n-k)) p where k = floor x mean :: BinomialDistribution -> Double mean (BD n p) = fromIntegral n * p variance :: BinomialDistribution -> Double variance (BD n p) = fromIntegral n * p * (1 - p) directEntropy :: BinomialDistribution -> Double directEntropy d@(BD n _) = negate . sum $ takeWhile (< negate m_epsilon) $ dropWhile (not . (< negate m_epsilon)) $ [ let x = probability d k in x * log x | k <- [0..n]] -- | Construct binomial distribution. Number of trials must be -- non-negative and probability must be in [0,1] range binomial :: Int -- ^ Number of trials. -> Double -- ^ Probability. -> BinomialDistribution binomial n p = maybe (error $ errMsg n p) id $ binomialE n p -- | Construct binomial distribution. 
Number of trials must be -- non-negative and probability must be in [0,1] range binomialE :: Int -- ^ Number of trials. -> Double -- ^ Probability. -> Maybe BinomialDistribution binomialE n p | n < 0 = Nothing | p >= 0 && p <= 1 = Just (BD n p) | otherwise = Nothing errMsg :: Int -> Double -> String errMsg n p = "Statistics.Distribution.Binomial.binomial: n=" ++ show n ++ " p=" ++ show p ++ "but n>=0 and p in [0,1]" statistics-0.16.2.1/Statistics/Distribution/CauchyLorentz.hs0000644000000000000000000001047707346545000022272 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.CauchyLorentz -- Copyright : (c) 2011 Aleksey Khudyakov -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The Cauchy-Lorentz distribution. It's also known as Lorentz -- distribution or Breit–Wigner distribution. -- -- It doesn't have mean and variance. module Statistics.Distribution.CauchyLorentz ( CauchyDistribution , cauchyDistribMedian , cauchyDistribScale -- * Constructors , cauchyDistribution , cauchyDistributionE , standardCauchy ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Maybe (fromMaybe) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import qualified Statistics.Distribution as D import Statistics.Internal -- | Cauchy-Lorentz distribution. data CauchyDistribution = CD { -- | Central value of Cauchy-Lorentz distribution which is its -- mode and median. Distribution doesn't have mean so function -- is named after median. cauchyDistribMedian :: {-# UNPACK #-} !Double -- | Scale parameter of Cauchy-Lorentz distribution. It's -- different from variance and specify half width at half -- maximum (HWHM). , cauchyDistribScale :: {-# UNPACK #-} !Double } deriving (Eq, Typeable, Data, Generic) instance Show CauchyDistribution where showsPrec i (CD m s) = defaultShow2 "cauchyDistribution" m s i instance Read CauchyDistribution where readPrec = defaultReadPrecM2 "cauchyDistribution" cauchyDistributionE instance ToJSON CauchyDistribution instance FromJSON CauchyDistribution where parseJSON (Object v) = do m <- v .: "cauchyDistribMedian" s <- v .: "cauchyDistribScale" maybe (fail $ errMsg m s) return $ cauchyDistributionE m s parseJSON _ = empty instance Binary CauchyDistribution where put (CD m s) = put m >> put s get = do m <- get s <- get maybe (error $ errMsg m s) return $ cauchyDistributionE m s -- | Cauchy distribution cauchyDistribution :: Double -- ^ Central point -> Double -- ^ Scale parameter (FWHM) -> CauchyDistribution cauchyDistribution m s = fromMaybe (error $ errMsg m s) $ cauchyDistributionE m s -- | Cauchy distribution cauchyDistributionE :: Double -- ^ Central point -> Double -- ^ Scale parameter (FWHM) -> Maybe CauchyDistribution cauchyDistributionE m s | s > 0 = Just (CD m s) | otherwise = Nothing errMsg :: Double -> Double -> String errMsg _ s = "Statistics.Distribution.CauchyLorentz.cauchyDistribution: FWHM must be positive. Got " ++ show s -- | Standard Cauchy distribution. 
It's centered at 0 and have 1 FWHM standardCauchy :: CauchyDistribution standardCauchy = CD 0 1 instance D.Distribution CauchyDistribution where cumulative (CD m s) x | y < -1 = atan (-1/y) / pi | otherwise = 0.5 + atan y / pi where y = (x - m) / s complCumulative (CD m s) x | y > 1 = atan (1/y) / pi | otherwise = 0.5 - atan y / pi where y = (x - m) / s instance D.ContDistr CauchyDistribution where density (CD m s) x = (1 / pi) / (s * (1 + y*y)) where y = (x - m) / s quantile (CD m s) p | p == 0 = -1 / 0 | p == 1 = 1 / 0 | p == 0.5 = m | p < 0 = err | p < 0.5 = m - s / tan( pi * p ) | p < 1 = m + s / tan( pi * (1 - p) ) | otherwise = err where err = error $ "Statistics.Distribution.CauchyLorentz.quantile: p must be in [0,1] range. Got: "++show p complQuantile (CD m s) p | p == 0 = 1 / 0 | p == 1 = -1 / 0 | p == 0.5 = m | p < 0 = err | p < 0.5 = m + s / tan( pi * p ) | p < 1 = m - s / tan( pi * (1 - p) ) | otherwise = err where err = error $ "Statistics.Distribution.CauchyLorentz.quantile: p must be in [0,1] range. Got: "++show p instance D.ContGen CauchyDistribution where genContVar = D.genContinuous instance D.Entropy CauchyDistribution where entropy (CD _ s) = log s + log (4*pi) instance D.MaybeEntropy CauchyDistribution where maybeEntropy = Just . D.entropy statistics-0.16.2.1/Statistics/Distribution/ChiSquared.hs0000644000000000000000000001001607346545000021515 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.ChiSquared -- Copyright : (c) 2010 Alexey Khudyakov -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The chi-squared distribution. This is a continuous probability -- distribution of sum of squares of k independent standard normal -- distributions. It's commonly used in statistical tests module Statistics.Distribution.ChiSquared ( ChiSquared , chiSquaredNDF -- * Constructors , chiSquared , chiSquaredE ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.SpecFunctions ( incompleteGamma,invIncompleteGamma,logGamma,digamma) import Numeric.MathFunctions.Constants (m_neg_inf) import qualified System.Random.MWC.Distributions as MWC import qualified Statistics.Distribution as D import Statistics.Internal -- | Chi-squared distribution newtype ChiSquared = ChiSquared { chiSquaredNDF :: Int -- ^ Get number of degrees of freedom } deriving (Eq, Typeable, Data, Generic) instance Show ChiSquared where showsPrec i (ChiSquared n) = defaultShow1 "chiSquared" n i instance Read ChiSquared where readPrec = defaultReadPrecM1 "chiSquared" chiSquaredE instance ToJSON ChiSquared instance FromJSON ChiSquared where parseJSON (Object v) = do n <- v .: "chiSquaredNDF" maybe (fail $ errMsg n) return $ chiSquaredE n parseJSON _ = empty instance Binary ChiSquared where put (ChiSquared x) = put x get = do n <- get maybe (fail $ errMsg n) return $ chiSquaredE n -- | Construct chi-squared distribution. Number of degrees of freedom -- must be positive. chiSquared :: Int -> ChiSquared chiSquared n = maybe (error $ errMsg n) id $ chiSquaredE n -- | Construct chi-squared distribution. Number of degrees of freedom -- must be positive. 
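--
-- For instance (illustrative):
--
-- > chiSquaredE 4   -- Just (chi-squared with 4 degrees of freedom)
-- > chiSquaredE 0   -- Nothing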
chiSquaredE :: Int -> Maybe ChiSquared chiSquaredE n | n <= 0 = Nothing | otherwise = Just (ChiSquared n) errMsg :: Int -> String errMsg n = "Statistics.Distribution.ChiSquared.chiSquared: N.D.F. must be positive. Got " ++ show n instance D.Distribution ChiSquared where cumulative = cumulative instance D.ContDistr ChiSquared where density chi x | x <= 0 = 0 | otherwise = exp $ log x * (ndf2 - 1) - x2 - logGamma ndf2 - log 2 * ndf2 where ndf = fromIntegral $ chiSquaredNDF chi ndf2 = ndf/2 x2 = x/2 logDensity chi x | x <= 0 = m_neg_inf | otherwise = log x * (ndf2 - 1) - x2 - logGamma ndf2 - log 2 * ndf2 where ndf = fromIntegral $ chiSquaredNDF chi ndf2 = ndf/2 x2 = x/2 quantile = quantile instance D.Mean ChiSquared where mean (ChiSquared ndf) = fromIntegral ndf instance D.Variance ChiSquared where variance (ChiSquared ndf) = fromIntegral (2*ndf) instance D.MaybeMean ChiSquared where maybeMean = Just . D.mean instance D.MaybeVariance ChiSquared where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy ChiSquared where entropy (ChiSquared ndf) = let kHalf = 0.5 * fromIntegral ndf in kHalf + log 2 + logGamma kHalf + (1-kHalf) * digamma kHalf instance D.MaybeEntropy ChiSquared where maybeEntropy = Just . D.entropy instance D.ContGen ChiSquared where genContVar (ChiSquared n) = MWC.chiSquare n cumulative :: ChiSquared -> Double -> Double cumulative chi x | x <= 0 = 0 | otherwise = incompleteGamma (ndf/2) (x/2) where ndf = fromIntegral $ chiSquaredNDF chi quantile :: ChiSquared -> Double -> Double quantile (ChiSquared ndf) p | p == 0 = 0 | p == 1 = 1/0 | p > 0 && p < 1 = 2 * invIncompleteGamma (fromIntegral ndf / 2) p | otherwise = error $ "Statistics.Distribution.ChiSquared.quantile: p must be in [0,1] range. Got: "++show p statistics-0.16.2.1/Statistics/Distribution/DiscreteUniform.hs0000644000000000000000000000746107346545000022601 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, DeriveGeneric, OverloadedStrings #-} -- | -- Module : Statistics.Distribution.DiscreteUniform -- Copyright : (c) 2016 André Szabolcs Szelp -- License : BSD3 -- -- Maintainer : a.sz.szelp@gmail.com -- Stability : experimental -- Portability : portable -- -- The discrete uniform distribution. There are two parametrizations of -- this distribution. First is the probability distribution on an -- inclusive interval {1, ..., n}. This is parametrized with n only, -- where p_1, ..., p_n = 1/n. ('discreteUniform'). -- -- The second parametrization is the uniform distribution on {a, ..., b} with -- probabilities p_a, ..., p_b = 1/(a-b+1). This is parametrized with -- /a/ and /b/. ('discreteUniformAB') module Statistics.Distribution.DiscreteUniform ( DiscreteUniform -- * Constructors , discreteUniform , discreteUniformAB -- * Accessors , rangeFrom , rangeTo ) where import Control.Applicative (empty) import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import System.Random.Stateful (uniformRM) import GHC.Generics (Generic) import qualified Statistics.Distribution as D import Statistics.Internal -- | The discrete uniform distribution. 
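--
-- For instance (illustrative; assumes 'probability' from
-- "Statistics.Distribution" is in scope):
--
-- > probability (discreteUniformAB 1 6) 3   -- 1/6, a fair die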
data DiscreteUniform = U { rangeFrom :: {-# UNPACK #-} !Int -- ^ /a/, the lower bound of the support {a, ..., b} , rangeTo :: {-# UNPACK #-} !Int -- ^ /b/, the upper bound of the support {a, ..., b} } deriving (Eq, Typeable, Data, Generic) instance Show DiscreteUniform where showsPrec i (U a b) = defaultShow2 "discreteUniformAB" a b i instance Read DiscreteUniform where readPrec = defaultReadPrecM2 "discreteUniformAB" (\a b -> Just (discreteUniformAB a b)) instance ToJSON DiscreteUniform instance FromJSON DiscreteUniform where parseJSON (Object v) = do a <- v .: "uniformA" b <- v .: "uniformB" return $ discreteUniformAB a b parseJSON _ = empty instance Binary DiscreteUniform where put (U a b) = put a >> put b get = discreteUniformAB <$> get <*> get instance D.Distribution DiscreteUniform where cumulative (U a b) x | x < fromIntegral a = 0 | x > fromIntegral b = 1 | otherwise = fromIntegral (floor x - a + 1) / fromIntegral (b - a + 1) instance D.DiscreteDistr DiscreteUniform where probability (U a b) k | k >= a && k <= b = 1 / fromIntegral (b - a + 1) | otherwise = 0 instance D.Mean DiscreteUniform where mean (U a b) = fromIntegral (a+b)/2 instance D.Variance DiscreteUniform where variance (U a b) = (fromIntegral (b - a + 1)^(2::Int) - 1) / 12 instance D.MaybeMean DiscreteUniform where maybeMean = Just . D.mean instance D.MaybeVariance DiscreteUniform where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy DiscreteUniform where entropy (U a b) = log $ fromIntegral $ b - a + 1 instance D.MaybeEntropy DiscreteUniform where maybeEntropy = Just . D.entropy instance D.ContGen DiscreteUniform where genContVar d = fmap fromIntegral . D.genDiscreteVar d instance D.DiscreteGen DiscreteUniform where genDiscreteVar (U a b) = uniformRM (a,b) -- | Construct discrete uniform distribution on support {1, ..., n}. -- Range /n/ must be >0. discreteUniform :: Int -- ^ Range -> DiscreteUniform discreteUniform n | n < 1 = error $ msg ++ "range must be > 0. Got " ++ show n | otherwise = U 1 n where msg = "Statistics.Distribution.DiscreteUniform.discreteUniform: " -- | Construct discrete uniform distribution on support {a, ..., b}. discreteUniformAB :: Int -- ^ Lower boundary (inclusive) -> Int -- ^ Upper boundary (inclusive) -> DiscreteUniform discreteUniformAB a b | b < a = U b a | otherwise = U a b statistics-0.16.2.1/Statistics/Distribution/Exponential.hs0000644000000000000000000001121007346545000021750 0ustar0000000000000000{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.Exponential -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The exponential distribution. This is the continuous probability -- distribution of the times between events in a Poisson process, in -- which events occur continuously and independently at a constant -- average rate. 
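--
-- A usage sketch (the rate 1.5 is an arbitrary illustrative value):
--
-- > import Statistics.Distribution
-- > import Statistics.Distribution.Exponential
-- >
-- > d = exponential 1.5
-- > p = cumulative d 1.0   -- P(X <= 1)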
module Statistics.Distribution.Exponential ( ExponentialDistribution -- * Constructors , exponential , exponentialE -- * Accessors , edLambda ) where import Control.Applicative import Data.Aeson (FromJSON(..),ToJSON,Value(..),(.:)) import Data.Binary (Binary, put, get) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.SpecFunctions (log1p,expm1) import Numeric.MathFunctions.Constants (m_neg_inf) import qualified System.Random.MWC.Distributions as MWC import qualified Statistics.Distribution as D import qualified Statistics.Sample as S import Statistics.Internal newtype ExponentialDistribution = ED { edLambda :: Double } deriving (Eq, Typeable, Data, Generic) instance Show ExponentialDistribution where showsPrec n (ED l) = defaultShow1 "exponential" l n instance Read ExponentialDistribution where readPrec = defaultReadPrecM1 "exponential" exponentialE instance ToJSON ExponentialDistribution instance FromJSON ExponentialDistribution where parseJSON (Object v) = do l <- v .: "edLambda" maybe (fail $ errMsg l) return $ exponentialE l parseJSON _ = empty instance Binary ExponentialDistribution where put = put . edLambda get = do l <- get maybe (fail $ errMsg l) return $ exponentialE l instance D.Distribution ExponentialDistribution where cumulative = cumulative complCumulative = complCumulative instance D.ContDistr ExponentialDistribution where density (ED l) x | x < 0 = 0 | otherwise = l * exp (-l * x) logDensity (ED l) x | x < 0 = m_neg_inf | otherwise = log l + (-l * x) quantile = quantile complQuantile = complQuantile instance D.Mean ExponentialDistribution where mean (ED l) = 1 / l instance D.Variance ExponentialDistribution where variance (ED l) = 1 / (l * l) instance D.MaybeMean ExponentialDistribution where maybeMean = Just . D.mean instance D.MaybeVariance ExponentialDistribution where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy ExponentialDistribution where entropy (ED l) = 1 - log l instance D.MaybeEntropy ExponentialDistribution where maybeEntropy = Just . D.entropy instance D.ContGen ExponentialDistribution where genContVar = MWC.exponential . edLambda cumulative :: ExponentialDistribution -> Double -> Double cumulative (ED l) x | x <= 0 = 0 | otherwise = - expm1 (-l * x) complCumulative :: ExponentialDistribution -> Double -> Double complCumulative (ED l) x | x <= 0 = 1 | otherwise = exp (-l * x) quantile :: ExponentialDistribution -> Double -> Double quantile (ED l) p | p >= 0 && p <= 1 = - log1p(-p) / l | otherwise = error $ "Statistics.Distribution.Exponential.quantile: p must be in [0,1] range. Got: "++show p complQuantile :: ExponentialDistribution -> Double -> Double complQuantile (ED l) p | p == 0 = 0 | p >= 0 && p < 1 = -log p / l | otherwise = error $ "Statistics.Distribution.Exponential.quantile: p must be in [0,1] range. Got: "++show p -- | Create an exponential distribution. exponential :: Double -- ^ Rate parameter. -> ExponentialDistribution exponential l = maybe (error $ errMsg l) id $ exponentialE l -- | Create an exponential distribution. exponentialE :: Double -- ^ Rate parameter. -> Maybe ExponentialDistribution exponentialE l | l > 0 = Just (ED l) | otherwise = Nothing errMsg :: Double -> String errMsg l = "Statistics.Distribution.Exponential.exponential: scale parameter must be positive. Got " ++ show l -- | Create exponential distribution from sample. Estimates the rate -- with the maximum likelihood estimator, which is biased. 
Returns -- @Nothing@ if the sample mean does not exist or is not positive. instance D.FromSample ExponentialDistribution Double where fromSample xs = let m = S.mean xs in if m > 0 then Just (ED (1/m)) else Nothing statistics-0.16.2.1/Statistics/Distribution/FDistribution.hs0000644000000000000000000001275607346545000022267 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.FDistribution -- Copyright : (c) 2011 Aleksey Khudyakov -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Fisher F distribution module Statistics.Distribution.FDistribution ( FDistribution -- * Constructors , fDistribution , fDistributionE , fDistributionReal , fDistributionRealE -- * Accessors , fDistributionNDF1 , fDistributionNDF2 ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.SpecFunctions ( logBeta, incompleteBeta, invIncompleteBeta, digamma) import Numeric.MathFunctions.Constants (m_neg_inf) import qualified Statistics.Distribution as D import Statistics.Function (square) import Statistics.Internal -- | F distribution data FDistribution = F { fDistributionNDF1 :: {-# UNPACK #-} !Double , fDistributionNDF2 :: {-# UNPACK #-} !Double , _pdfFactor :: {-# UNPACK #-} !Double } deriving (Eq, Typeable, Data, Generic) instance Show FDistribution where showsPrec i (F n m _) = defaultShow2 "fDistributionReal" n m i instance Read FDistribution where readPrec = defaultReadPrecM2 "fDistributionReal" fDistributionRealE instance ToJSON FDistribution instance FromJSON FDistribution where parseJSON (Object v) = do n <- v .: "fDistributionNDF1" m <- v .: "fDistributionNDF2" maybe (fail $ errMsgR n m) return $ fDistributionRealE n m parseJSON _ = empty instance Binary FDistribution where put (F n m _) = put n >> put m get = do n <- get m <- get maybe (fail $ errMsgR n m) return $ fDistributionRealE n m fDistribution :: Int -> Int -> FDistribution fDistribution n m = maybe (error $ errMsg n m) id $ fDistributionE n m fDistributionReal :: Double -> Double -> FDistribution fDistributionReal n m = maybe (error $ errMsgR n m) id $ fDistributionRealE n m fDistributionE :: Int -> Int -> Maybe FDistribution fDistributionE n m | n > 0 && m > 0 = let n' = fromIntegral n m' = fromIntegral m f' = 0.5 * (log m' * m' + log n' * n') - logBeta (0.5*n') (0.5*m') in Just $ F n' m' f' | otherwise = Nothing fDistributionRealE :: Double -> Double -> Maybe FDistribution fDistributionRealE n m | n > 0 && m > 0 = let f' = 0.5 * (log m * m + log n * n) - logBeta (0.5*n) (0.5*m) in Just $ F n m f' | otherwise = Nothing errMsg :: Int -> Int -> String errMsg _ _ = "Statistics.Distribution.FDistribution.fDistribution: non-positive number of degrees of freedom" errMsgR :: Double -> Double -> String errMsgR _ _ = "Statistics.Distribution.FDistribution.fDistribution: non-positive number of degrees of freedom" instance D.Distribution FDistribution where cumulative = cumulative complCumulative = complCumulative instance D.ContDistr FDistribution where density d x | x <= 0 = 0 | otherwise = exp $ logDensity d x logDensity d x | x <= 0 = m_neg_inf | otherwise = logDensity d x quantile = quantile cumulative :: FDistribution -> Double -> Double cumulative (F n m _) x | x <= 0 = 0 -- Only matches +∞ | isInfinite x = 1 -- NOTE: Here we rely on implementation 
detail of incompleteBeta. It -- computes using series expansion for sufficiently small x -- and uses following identity otherwise: -- -- I(x; a, b) = 1 - I(1-x; b, a) -- -- Point is we can compute 1-x as m/(m+y) without loss of -- precision for large x. Sadly this switchover point is -- implementation detail. | n >= (n+m)*bx = incompleteBeta (0.5 * n) (0.5 * m) bx | otherwise = 1 - incompleteBeta (0.5 * m) (0.5 * n) bx1 where y = n * x bx = y / (m + y) bx1 = m / (m + y) complCumulative :: FDistribution -> Double -> Double complCumulative (F n m _) x | x <= 0 = 1 -- Only matches +∞ | isInfinite x = 0 -- See NOTE at cumulative | m >= (n+m)*bx = incompleteBeta (0.5 * m) (0.5 * n) bx | otherwise = 1 - incompleteBeta (0.5 * n) (0.5 * m) bx1 where y = n*x bx = m / (m + y) bx1 = y / (m + y) logDensity :: FDistribution -> Double -> Double logDensity (F n m fac) x = fac + log x * (0.5 * n - 1) - log(m + n*x) * 0.5 * (n + m) quantile :: FDistribution -> Double -> Double quantile (F n m _) p | p >= 0 && p <= 1 = let x = invIncompleteBeta (0.5 * n) (0.5 * m) p in m * x / (n * (1 - x)) | otherwise = error $ "Statistics.Distribution.Uniform.quantile: p must be in [0,1] range. Got: "++show p instance D.MaybeMean FDistribution where maybeMean (F _ m _) | m > 2 = Just $ m / (m - 2) | otherwise = Nothing instance D.MaybeVariance FDistribution where maybeStdDev (F n m _) | m > 4 = Just $ 2 * square m * (m + n - 2) / (n * square (m - 2) * (m - 4)) | otherwise = Nothing instance D.Entropy FDistribution where entropy (F n m _) = let nHalf = 0.5 * n mHalf = 0.5 * m in log (n/m) + logBeta nHalf mHalf + (1 - nHalf) * digamma nHalf - (1 + mHalf) * digamma mHalf + (nHalf + mHalf) * digamma (nHalf + mHalf) instance D.MaybeEntropy FDistribution where maybeEntropy = Just . D.entropy instance D.ContGen FDistribution where genContVar = D.genContinuous statistics-0.16.2.1/Statistics/Distribution/Gamma.hs0000644000000000000000000001357007346545000020517 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.Gamma -- Copyright : (c) 2009, 2011 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The gamma distribution. This is a continuous probability -- distribution with two parameters, /k/ and ϑ. If /k/ is -- integral, the distribution represents the sum of /k/ independent -- exponentially distributed random variables, each of which has a -- mean of ϑ. module Statistics.Distribution.Gamma ( GammaDistribution -- * Constructors , gammaDistr , gammaDistrE , improperGammaDistr , improperGammaDistrE -- * Accessors , gdShape , gdScale ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.MathFunctions.Constants (m_pos_inf, m_NaN, m_neg_inf) import Numeric.SpecFunctions (incompleteGamma, invIncompleteGamma, logGamma, digamma) import qualified System.Random.MWC.Distributions as MWC import qualified Numeric.Sum as Sum import Statistics.Distribution.Poisson.Internal as Poisson import qualified Statistics.Distribution as D import Statistics.Internal -- | The gamma distribution. data GammaDistribution = GD { gdShape :: {-# UNPACK #-} !Double -- ^ Shape parameter, /k/. , gdScale :: {-# UNPACK #-} !Double -- ^ Scale parameter, ϑ. 
} deriving (Eq, Typeable, Data, Generic) instance Show GammaDistribution where showsPrec i (GD k theta) = defaultShow2 "improperGammaDistr" k theta i instance Read GammaDistribution where readPrec = defaultReadPrecM2 "improperGammaDistr" improperGammaDistrE instance ToJSON GammaDistribution instance FromJSON GammaDistribution where parseJSON (Object v) = do k <- v .: "gdShape" theta <- v .: "gdScale" maybe (fail $ errMsgI k theta) return $ improperGammaDistrE k theta parseJSON _ = empty instance Binary GammaDistribution where put (GD x y) = put x >> put y get = do k <- get theta <- get maybe (fail $ errMsgI k theta) return $ improperGammaDistrE k theta -- | Create gamma distribution. Both shape and scale parameters must -- be positive. gammaDistr :: Double -- ^ Shape parameter. /k/ -> Double -- ^ Scale parameter, ϑ. -> GammaDistribution gammaDistr k theta = maybe (error $ errMsg k theta) id $ gammaDistrE k theta errMsg :: Double -> Double -> String errMsg k theta = "Statistics.Distribution.Gamma.gammaDistr: " ++ "k=" ++ show k ++ "theta=" ++ show theta ++ " but must be positive" -- | Create gamma distribution. Both shape and scale parameters must -- be positive. gammaDistrE :: Double -- ^ Shape parameter. /k/ -> Double -- ^ Scale parameter, ϑ. -> Maybe GammaDistribution gammaDistrE k theta | k > 0 && theta > 0 = Just (GD k theta) | otherwise = Nothing -- | Create gamma distribution. Both shape and scale parameters must -- be non-negative. improperGammaDistr :: Double -- ^ Shape parameter. /k/ -> Double -- ^ Scale parameter, ϑ. -> GammaDistribution improperGammaDistr k theta = maybe (error $ errMsgI k theta) id $ improperGammaDistrE k theta errMsgI :: Double -> Double -> String errMsgI k theta = "Statistics.Distribution.Gamma.gammaDistr: " ++ "k=" ++ show k ++ "theta=" ++ show theta ++ " but must be non-negative" -- | Create gamma distribution. Both shape and scale parameters must -- be non-negative. improperGammaDistrE :: Double -- ^ Shape parameter. /k/ -> Double -- ^ Scale parameter, ϑ. -> Maybe GammaDistribution improperGammaDistrE k theta | k >= 0 && theta >= 0 = Just (GD k theta) | otherwise = Nothing instance D.Distribution GammaDistribution where cumulative = cumulative instance D.ContDistr GammaDistribution where density = density logDensity (GD k theta) x | x <= 0 = m_neg_inf | otherwise = Sum.sum Sum.kbn [ log x * (k - 1) , - (x / theta) , - logGamma k , - log theta * k ] quantile = quantile instance D.Variance GammaDistribution where variance (GD a l) = a * l * l instance D.Mean GammaDistribution where mean (GD a l) = a * l instance D.MaybeMean GammaDistribution where maybeMean = Just . D.mean instance D.MaybeVariance GammaDistribution where maybeStdDev = Just . D.stdDev maybeVariance = Just . 
instance D.MaybeEntropy GammaDistribution where
  maybeEntropy (GD a l)
    | a > 0 && l > 0 =
      Just $ a + log l + logGamma a + (1-a) * digamma a
    | otherwise = Nothing

instance D.ContGen GammaDistribution where
    genContVar (GD a l) = MWC.gamma a l

density :: GammaDistribution -> Double -> Double
density (GD a l) x
  | a < 0 || l <= 0 = m_NaN
  | x <= 0          = 0
  | a == 0          = if x == 0 then m_pos_inf else 0
  | x == 0          = if a < 1 then m_pos_inf else if a > 1 then 0 else 1/l
  | a < 1           = Poisson.probability (x/l) a * a / x
  | otherwise       = Poisson.probability (x/l) (a-1) / l

cumulative :: GammaDistribution -> Double -> Double
cumulative (GD k l) x
  | x <= 0    = 0
  | otherwise = incompleteGamma k (x/l)

quantile :: GammaDistribution -> Double -> Double
quantile (GD k l) p
  | p == 0         = 0
  | p == 1         = 1/0
  | p > 0 && p < 1 = l * invIncompleteGamma k p
  | otherwise      =
    error $ "Statistics.Distribution.Gamma.quantile: p must be in [0,1] range. Got: "++show p
statistics-0.16.2.1/Statistics/Distribution/Geometric.hs0000644000000000000000000001600207346545000021404 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-}
-- |
-- Module    : Statistics.Distribution.Geometric
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- The Geometric distribution. There are two variants of this
-- distribution. The first is the probability distribution of the
-- number of Bernoulli trials needed to get one success, supported on
-- the set [1,2..] ('GeometricDistribution'). It is sometimes called
-- the /shifted/ geometric distribution, to distinguish it from the
-- second variant.
--
-- The second variant is the probability distribution of the number
-- of failures before the first success, defined over the set [0,1..]
-- ('GeometricDistribution0').
module Statistics.Distribution.Geometric
    (
      GeometricDistribution
    , GeometricDistribution0
    -- * Constructors
    , geometric
    , geometricE
    , geometric0
    , geometric0E
    -- ** Accessors
    , gdSuccess
    , gdSuccess0
    ) where

import Control.Applicative
import Control.Monad (liftM)
import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:))
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import GHC.Generics (Generic)
import Numeric.MathFunctions.Constants (m_neg_inf)
import Numeric.SpecFunctions (log1p,expm1)
import qualified System.Random.MWC.Distributions as MWC

import qualified Statistics.Distribution as D
import Statistics.Internal

----------------------------------------------------------------
-- | Distribution over [1..]
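--
--   As a sketch of intended usage ('probability' comes from
--   "Statistics.Distribution"; the value follows directly from
--   \(s(1-s)^{n-1} = 0.5 \cdot 0.5^2\)):
--
-- > probability (geometric 0.5) 3   -- 0.125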
newtype GeometricDistribution = GD { gdSuccess :: Double } deriving (Eq, Typeable, Data, Generic) instance Show GeometricDistribution where showsPrec i (GD x) = defaultShow1 "geometric" x i instance Read GeometricDistribution where readPrec = defaultReadPrecM1 "geometric" geometricE instance ToJSON GeometricDistribution instance FromJSON GeometricDistribution where parseJSON (Object v) = do x <- v .: "gdSuccess" maybe (fail $ errMsg x) return $ geometricE x parseJSON _ = empty instance Binary GeometricDistribution where put (GD x) = put x get = do x <- get maybe (fail $ errMsg x) return $ geometricE x instance D.Distribution GeometricDistribution where cumulative = cumulative complCumulative = complCumulative instance D.DiscreteDistr GeometricDistribution where probability (GD s) n | n < 1 = 0 | s >= 0.5 = s * (1 - s)^(n - 1) | otherwise = s * (exp $ log1p (-s) * (fromIntegral n - 1)) logProbability (GD s) n | n < 1 = m_neg_inf | otherwise = log s + log1p (-s) * (fromIntegral n - 1) instance D.Mean GeometricDistribution where mean (GD s) = 1 / s instance D.Variance GeometricDistribution where variance (GD s) = (1 - s) / (s * s) instance D.MaybeMean GeometricDistribution where maybeMean = Just . D.mean instance D.MaybeVariance GeometricDistribution where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy GeometricDistribution where entropy (GD s) | s == 1 = 0 | otherwise = -(s * log s + (1-s) * log1p (-s)) / s instance D.MaybeEntropy GeometricDistribution where maybeEntropy = Just . D.entropy instance D.DiscreteGen GeometricDistribution where genDiscreteVar (GD s) g = MWC.geometric1 s g instance D.ContGen GeometricDistribution where genContVar d g = fromIntegral `liftM` D.genDiscreteVar d g cumulative :: GeometricDistribution -> Double -> Double cumulative (GD s) x | x < 1 = 0 | isInfinite x = 1 | isNaN x = error "Statistics.Distribution.Geometric.cumulative: NaN input" | s >= 0.5 = 1 - (1 - s)^k | otherwise = negate $ expm1 $ fromIntegral k * log1p (-s) where k = floor x :: Int complCumulative :: GeometricDistribution -> Double -> Double complCumulative (GD s) x | x < 1 = 1 | isInfinite x = 0 | isNaN x = error "Statistics.Distribution.Geometric.complCumulative: NaN input" | s >= 0.5 = (1 - s)^k | otherwise = exp $ fromIntegral k * log1p (-s) where k = floor x :: Int -- | Create geometric distribution. geometric :: Double -- ^ Success rate -> GeometricDistribution geometric x = maybe (error $ errMsg x) id $ geometricE x -- | Create geometric distribution. geometricE :: Double -- ^ Success rate -> Maybe GeometricDistribution geometricE x | x > 0 && x <= 1 = Just (GD x) | otherwise = Nothing errMsg :: Double -> String errMsg x = "Statistics.Distribution.Geometric.geometric: probability must be in (0,1] range. Got " ++ show x ---------------------------------------------------------------- -- | Distribution over [0..] 
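--
--   This is the first variant shifted down by one, so as a sketch
--   ('probability' again from "Statistics.Distribution"):
--
-- > probability (geometric0 0.5) 2   -- = probability (geometric 0.5) 3 = 0.125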
newtype GeometricDistribution0 = GD0 { gdSuccess0 :: Double } deriving (Eq, Typeable, Data, Generic) instance Show GeometricDistribution0 where showsPrec i (GD0 x) = defaultShow1 "geometric0" x i instance Read GeometricDistribution0 where readPrec = defaultReadPrecM1 "geometric0" geometric0E instance ToJSON GeometricDistribution0 instance FromJSON GeometricDistribution0 where parseJSON (Object v) = do x <- v .: "gdSuccess0" maybe (fail $ errMsg x) return $ geometric0E x parseJSON _ = empty instance Binary GeometricDistribution0 where put (GD0 x) = put x get = do x <- get maybe (fail $ errMsg x) return $ geometric0E x instance D.Distribution GeometricDistribution0 where cumulative (GD0 s) x = cumulative (GD s) (x + 1) complCumulative (GD0 s) x = complCumulative (GD s) (x + 1) instance D.DiscreteDistr GeometricDistribution0 where probability (GD0 s) n = D.probability (GD s) (n + 1) logProbability (GD0 s) n = D.logProbability (GD s) (n + 1) instance D.Mean GeometricDistribution0 where mean (GD0 s) = 1 / s - 1 instance D.Variance GeometricDistribution0 where variance (GD0 s) = D.variance (GD s) instance D.MaybeMean GeometricDistribution0 where maybeMean = Just . D.mean instance D.MaybeVariance GeometricDistribution0 where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy GeometricDistribution0 where entropy (GD0 s) = D.entropy (GD s) instance D.MaybeEntropy GeometricDistribution0 where maybeEntropy = Just . D.entropy instance D.DiscreteGen GeometricDistribution0 where genDiscreteVar (GD0 s) g = MWC.geometric0 s g instance D.ContGen GeometricDistribution0 where genContVar d g = fromIntegral `liftM` D.genDiscreteVar d g -- | Create geometric distribution. geometric0 :: Double -- ^ Success rate -> GeometricDistribution0 geometric0 x = maybe (error $ errMsg0 x) id $ geometric0E x -- | Create geometric distribution. geometric0E :: Double -- ^ Success rate -> Maybe GeometricDistribution0 geometric0E x | x > 0 && x <= 1 = Just (GD0 x) | otherwise = Nothing errMsg0 :: Double -> String errMsg0 x = "Statistics.Distribution.Geometric.geometric0: probability must be in (0,1] range. Got " ++ show x statistics-0.16.2.1/Statistics/Distribution/Hypergeometric.hs0000644000000000000000000001332607346545000022462 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.Hypergeometric -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The Hypergeometric distribution. This is the discrete probability -- distribution that measures the probability of /k/ successes in /l/ -- trials, without replacement, from a finite population. -- -- The parameters of the distribution describe /k/ elements chosen -- from a population of /l/, with /m/ elements of one type, and -- /l/-/m/ of the other (all are positive integers). 
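--
-- As a brief usage sketch ('mean' comes from "Statistics.Distribution";
-- the value follows from the formula E[X] = k·m\/l): drawing /k/=5
-- elements from a population of /l/=20 containing /m/=7 marked
-- elements, the expected number of marked elements drawn is
--
-- > mean (hypergeometric 7 20 5)   -- 5 * 7 / 20 = 1.75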
module Statistics.Distribution.Hypergeometric ( HypergeometricDistribution -- * Constructors , hypergeometric , hypergeometricE -- ** Accessors , hdM , hdL , hdK ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.MathFunctions.Constants (m_epsilon,m_neg_inf) import Numeric.SpecFunctions (choose,logChoose) import qualified Statistics.Distribution as D import Statistics.Internal data HypergeometricDistribution = HD { hdM :: {-# UNPACK #-} !Int , hdL :: {-# UNPACK #-} !Int , hdK :: {-# UNPACK #-} !Int } deriving (Eq, Typeable, Data, Generic) instance Show HypergeometricDistribution where showsPrec i (HD m l k) = defaultShow3 "hypergeometric" m l k i instance Read HypergeometricDistribution where readPrec = defaultReadPrecM3 "hypergeometric" hypergeometricE instance ToJSON HypergeometricDistribution instance FromJSON HypergeometricDistribution where parseJSON (Object v) = do m <- v .: "hdM" l <- v .: "hdL" k <- v .: "hdK" maybe (fail $ errMsg m l k) return $ hypergeometricE m l k parseJSON _ = empty instance Binary HypergeometricDistribution where put (HD m l k) = put m >> put l >> put k get = do m <- get l <- get k <- get maybe (fail $ errMsg m l k) return $ hypergeometricE m l k instance D.Distribution HypergeometricDistribution where cumulative = cumulative complCumulative = complCumulative instance D.DiscreteDistr HypergeometricDistribution where probability = probability logProbability = logProbability instance D.Mean HypergeometricDistribution where mean = mean instance D.Variance HypergeometricDistribution where variance = variance instance D.MaybeMean HypergeometricDistribution where maybeMean = Just . D.mean instance D.MaybeVariance HypergeometricDistribution where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy HypergeometricDistribution where entropy = directEntropy instance D.MaybeEntropy HypergeometricDistribution where maybeEntropy = Just . D.entropy variance :: HypergeometricDistribution -> Double variance (HD m l k) = (k' * ml) * (1 - ml) * (l' - k') / (l' - 1) where m' = fromIntegral m l' = fromIntegral l k' = fromIntegral k ml = m' / l' mean :: HypergeometricDistribution -> Double mean (HD m l k) = fromIntegral k * fromIntegral m / fromIntegral l directEntropy :: HypergeometricDistribution -> Double directEntropy d@(HD m _ _) = negate . sum $ takeWhile (< negate m_epsilon) $ dropWhile (not . 
  (< negate m_epsilon))
  [ let x = probability d n in x * log x | n <- [0..m]]

-- | Create hypergeometric distribution. Calls 'error' on invalid
--   parameters.
hypergeometric :: Int               -- ^ /m/
               -> Int               -- ^ /l/
               -> Int               -- ^ /k/
               -> HypergeometricDistribution
hypergeometric m l k
  = maybe (error $ errMsg m l k) id $ hypergeometricE m l k

-- | Create hypergeometric distribution. Returns 'Nothing' on invalid
--   parameters.
hypergeometricE :: Int               -- ^ /m/
                -> Int               -- ^ /l/
                -> Int               -- ^ /k/
                -> Maybe HypergeometricDistribution
hypergeometricE m l k
  | not (l > 0)            = Nothing
  | not (m >= 0 && m <= l) = Nothing
  | not (k > 0  && k <= l) = Nothing
  | otherwise              = Just (HD m l k)

errMsg :: Int -> Int -> Int -> String
errMsg m l k
  =  "Statistics.Distribution.Hypergeometric.hypergeometric:"
  ++ " m=" ++ show m
  ++ " l=" ++ show l
  ++ " k=" ++ show k
  ++ " should hold: l>0 & m in [0,l] & k in (0,l]"

-- Naive implementation
probability :: HypergeometricDistribution -> Int -> Double
probability (HD mi li ki) n
  | n < max 0 (mi+ki-li) || n > min mi ki = 0
  -- No overflow
  | li < 1000 = choose mi n * choose (li - mi) (ki - n) / choose li ki
  | otherwise = exp $ logChoose mi n
                    + logChoose (li - mi) (ki - n)
                    - logChoose li ki

logProbability :: HypergeometricDistribution -> Int -> Double
logProbability (HD mi li ki) n
  | n < max 0 (mi+ki-li) || n > min mi ki = m_neg_inf
  | otherwise = logChoose mi n
              + logChoose (li - mi) (ki - n)
              - logChoose li ki

cumulative :: HypergeometricDistribution -> Double -> Double
cumulative d@(HD mi li ki) x
  | isNaN x      = error "Statistics.Distribution.Hypergeometric.cumulative: NaN argument"
  | isInfinite x = if x > 0 then 1 else 0
  | n <  minN    = 0
  | n >= maxN    = 1
  | otherwise    = D.sumProbabilities d minN n
  where
    n    = floor x
    minN = max 0 (mi+ki-li)
    maxN = min mi ki

complCumulative :: HypergeometricDistribution -> Double -> Double
complCumulative d@(HD mi li ki) x
  | isNaN x      = error "Statistics.Distribution.Hypergeometric.complCumulative: NaN argument"
  | isInfinite x = if x > 0 then 0 else 1
  | n <  minN    = 1
  | n >= maxN    = 0
  | otherwise    = D.sumProbabilities d (n + 1) maxN
  where
    n    = floor x
    minN = max 0 (mi+ki-li)
    maxN = min mi ki
statistics-0.16.2.1/Statistics/Distribution/Laplace.hs0000644000000000000000000001216207346545000021032 0ustar0000000000000000{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-}
-- |
-- Module    : Statistics.Distribution.Laplace
-- Copyright : (c) 2015 Mihai Maruseac
-- License   : BSD3
--
-- Maintainer  : mihai.maruseac@maruseac.com
-- Stability   : experimental
-- Portability : portable
--
-- The Laplace distribution.  This is the continuous probability
-- distribution defined as the difference of two iid exponential
-- random variables, or as a Brownian motion evaluated at
-- exponentially distributed times.  It is used in differential
-- privacy (the Laplace mechanism), in speech recognition, and in the
-- least absolute deviations method (Laplace's first law of errors,
-- giving a robust regression method).
--
module Statistics.Distribution.Laplace
    (
      LaplaceDistribution
    -- * Constructors
    , laplace
    , laplaceE
    -- * Accessors
    , ldLocation
    , ldScale
    ) where

import Control.Applicative
import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:))
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import GHC.Generics (Generic)
import qualified Data.Vector.Generic as G
import qualified Statistics.Distribution as D
import qualified Statistics.Quantile as Q
import qualified Statistics.Sample as S
import Statistics.Internal


data LaplaceDistribution = LD {
      ldLocation :: {-# UNPACK #-} !Double -- ^ Location.
    , ldScale    :: {-# UNPACK #-} !Double -- ^ Scale.
    } deriving (Eq, Typeable, Data, Generic)

instance Show LaplaceDistribution where
  showsPrec i (LD l s) = defaultShow2 "laplace" l s i
instance Read LaplaceDistribution where
  readPrec = defaultReadPrecM2 "laplace" laplaceE

instance ToJSON LaplaceDistribution
instance FromJSON LaplaceDistribution where
  parseJSON (Object v) = do
    l <- v .: "ldLocation"
    s <- v .: "ldScale"
    maybe (fail $ errMsg l s) return $ laplaceE l s
  parseJSON _ = empty

instance Binary LaplaceDistribution where
  put (LD l s) = put l >> put s
  get = do
    l <- get
    s <- get
    maybe (fail $ errMsg l s) return $ laplaceE l s

instance D.Distribution LaplaceDistribution where
    cumulative      = cumulative
    complCumulative = complCumulative

instance D.ContDistr LaplaceDistribution where
    density    (LD l s) x = exp (- abs (x - l) / s) / (2 * s)
    logDensity (LD l s) x = - abs (x - l) / s - log 2 - log s
    quantile      = quantile
    complQuantile = complQuantile

instance D.Mean LaplaceDistribution where
    mean (LD l _) = l

instance D.Variance LaplaceDistribution where
    variance (LD _ s) = 2 * s * s

instance D.MaybeMean LaplaceDistribution where
    maybeMean = Just . D.mean

instance D.MaybeVariance LaplaceDistribution where
    maybeStdDev   = Just . D.stdDev
    maybeVariance = Just . D.variance

instance D.Entropy LaplaceDistribution where
  entropy (LD _ s) = 1 + log (2 * s)

instance D.MaybeEntropy LaplaceDistribution where
  maybeEntropy = Just . D.entropy

instance D.ContGen LaplaceDistribution where
    genContVar = D.genContinuous

cumulative :: LaplaceDistribution -> Double -> Double
cumulative (LD l s) x
  | x <= l    = 0.5 * exp ( (x - l) / s)
  | otherwise = 1 - 0.5 * exp ( - (x - l) / s )

complCumulative :: LaplaceDistribution -> Double -> Double
complCumulative (LD l s) x
  | x <= l    = 1 - 0.5 * exp ( (x - l) / s)
  | otherwise = 0.5 * exp ( - (x - l) / s )

quantile :: LaplaceDistribution -> Double -> Double
quantile (LD l s) p
  | p == 0             = -inf
  | p == 1             = inf
  | p == 0.5           = l
  | p > 0   && p < 0.5 = l + s * log (2 * p)
  | p > 0.5 && p < 1   = l - s * log (2 - 2 * p)
  | otherwise          =
    error $ "Statistics.Distribution.Laplace.quantile: p must be in [0,1] range. Got: "++show p
  where
    inf = 1 / 0

complQuantile :: LaplaceDistribution -> Double -> Double
complQuantile (LD l s) p
  | p == 0             = inf
  | p == 1             = -inf
  | p == 0.5           = l
  | p > 0   && p < 0.5 = l - s * log (2 * p)
  | p > 0.5 && p < 1   = l + s * log (2 - 2 * p)
  | otherwise          =
    error $ "Statistics.Distribution.Laplace.complQuantile: p must be in [0,1] range. Got: "++show p
  where
    inf = 1 / 0

-- | Create a Laplace distribution.
laplace :: Double         -- ^ Location
        -> Double         -- ^ Scale
        -> LaplaceDistribution
laplace l s = maybe (error $ errMsg l s) id $ laplaceE l s

-- | Create a Laplace distribution.
laplaceE :: Double         -- ^ Location
         -> Double         -- ^ Scale
         -> Maybe LaplaceDistribution
laplaceE l s
  | s >= 0    = Just (LD l s)
  | otherwise = Nothing

errMsg :: Double -> Double -> String
errMsg _ s = "Statistics.Distribution.Laplace.laplace: scale parameter must be non-negative. Got " ++ show s

-- | Create Laplace distribution from sample. The location is
--   estimated as the median of the sample, and the scale as the mean
--   absolute deviation from the median.
instance D.FromSample LaplaceDistribution Double where
  fromSample xs
    | G.null xs = Nothing
    | otherwise = Just $!
LD s l where s = Q.median Q.medianUnbiased xs l = S.mean $ G.map (\x -> abs $ x - s) xs statistics-0.16.2.1/Statistics/Distribution/Lognormal.hs0000644000000000000000000001220407346545000021420 0ustar0000000000000000{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.Lognormal -- Copyright : (c) 2020 Ximin Luo -- License : BSD3 -- -- Maintainer : infinity0@pwned.gg -- Stability : experimental -- Portability : portable -- -- The log normal distribution. This is a continuous probability -- distribution that describes data whose log is clustered around a -- mean. For example, the multiplicative product of many independent -- positive random variables. module Statistics.Distribution.Lognormal ( LognormalDistribution -- * Constructors , lognormalDistr , lognormalDistrErr , lognormalDistrMeanStddevErr , lognormalStandard ) where import Data.Aeson (FromJSON, ToJSON) import Data.Binary (Binary (..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.MathFunctions.Constants (m_huge, m_sqrt_2_pi) import Numeric.SpecFunctions (expm1, log1p) import qualified Data.Vector.Generic as G import qualified Statistics.Distribution as D import qualified Statistics.Distribution.Normal as N import Statistics.Internal -- | The lognormal distribution. newtype LognormalDistribution = LND N.NormalDistribution deriving (Eq, Typeable, Data, Generic) instance Show LognormalDistribution where showsPrec i (LND d) = defaultShow2 "lognormalDistr" m s i where m = D.mean d s = D.stdDev d instance Read LognormalDistribution where readPrec = defaultReadPrecM2 "lognormalDistr" $ (either (const Nothing) Just .) . lognormalDistrErr instance ToJSON LognormalDistribution instance FromJSON LognormalDistribution instance Binary LognormalDistribution where put (LND d) = put m >> put s where m = D.mean d s = D.stdDev d get = do m <- get sd <- get either fail return $ lognormalDistrErr m sd instance D.Distribution LognormalDistribution where cumulative = cumulative complCumulative = complCumulative instance D.ContDistr LognormalDistribution where logDensity = logDensity quantile = quantile complQuantile = complQuantile instance D.MaybeMean LognormalDistribution where maybeMean = Just . D.mean instance D.Mean LognormalDistribution where mean (LND d) = exp (m + v / 2) where m = D.mean d v = D.variance d instance D.MaybeVariance LognormalDistribution where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Variance LognormalDistribution where variance (LND d) = expm1 v * exp (2 * m + v) where m = D.mean d v = D.variance d instance D.Entropy LognormalDistribution where entropy (LND d) = logBase 2 (s * exp (m + 0.5) * m_sqrt_2_pi) where m = D.mean d s = D.stdDev d instance D.MaybeEntropy LognormalDistribution where maybeEntropy = Just . D.entropy instance D.ContGen LognormalDistribution where genContVar d = D.genContinuous d -- | Standard log normal distribution with mu 0 and sigma 1. -- -- Mean is @sqrt e@ and variance is @(e - 1) * e@. lognormalStandard :: LognormalDistribution lognormalStandard = LND N.standard -- | Create log normal distribution from parameters. lognormalDistr :: Double -- ^ Mu -> Double -- ^ Sigma -> LognormalDistribution lognormalDistr mu sig = either error id $ lognormalDistrErr mu sig -- | Create log normal distribution from parameters. 
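--
--   A sketch of the 'Either'-returning interface (the numeric value
--   is \(e^{\mu+\sigma^2/2}\) for μ=0, σ=1, i.e. @exp 0.5@; 'mean'
--   comes from "Statistics.Distribution"):
--
-- > fmap mean (lognormalDistrErr 0 1)   -- Right 1.6487... (exp 0.5)
-- > lognormalDistrErr 0 (-1)            -- Left <error message>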
lognormalDistrErr
  :: Double            -- ^ Mu
  -> Double            -- ^ Sigma
  -> Either String LognormalDistribution
lognormalDistrErr mu sig
  | sig >= sqrt (log m_huge - 2 * mu) = Left $ errMsg mu sig
  | otherwise                         = LND <$> N.normalDistrErr mu sig

errMsg :: Double -> Double -> String
errMsg mu sig =
  "Statistics.Distribution.Lognormal.lognormalDistr: sigma must be > 0 && < "
    ++ show lim
    ++ ". Got "
    ++ show sig
  where lim = sqrt (log m_huge - 2 * mu)

-- | Create log normal distribution from mean and standard deviation.
lognormalDistrMeanStddevErr
  :: Double            -- ^ Mean
  -> Double            -- ^ Stddev
  -> Either String LognormalDistribution
lognormalDistrMeanStddevErr m sd = LND <$> N.normalDistrErr mu sig
  where r    = sd / m
        sig2 = log1p (r * r)
        sig  = sqrt sig2
        mu   = log m - sig2 / 2

-- | Variance is estimated using maximum likelihood method
--   (biased estimation) over the log of the data.
--
--   Returns @Nothing@ if sample contains fewer than two elements or
--   variance is zero (all elements are equal)
instance D.FromSample LognormalDistribution Double where
  fromSample = fmap LND . D.fromSample . G.map log

logDensity :: LognormalDistribution -> Double -> Double
logDensity (LND d) x
  | x > 0     = let lx = log x in D.logDensity d lx - lx
  | otherwise = -1 / 0  -- zero density outside the support, hence -∞ log-density

cumulative :: LognormalDistribution -> Double -> Double
cumulative (LND d) x
  | x > 0     = D.cumulative d $ log x
  | otherwise = 0

complCumulative :: LognormalDistribution -> Double -> Double
complCumulative (LND d) x
  | x > 0     = D.complCumulative d $ log x
  | otherwise = 1

quantile :: LognormalDistribution -> Double -> Double
quantile (LND d) = exp . D.quantile d

complQuantile :: LognormalDistribution -> Double -> Double
complQuantile (LND d) = exp . D.complQuantile d
statistics-0.16.2.1/Statistics/Distribution/NegativeBinomial.hs0000644000000000000000000001476707346545000022713 0ustar0000000000000000{-# LANGUAGE OverloadedStrings, PatternGuards, DeriveDataTypeable, DeriveGeneric #-}
-- |
-- Module    : Statistics.Distribution.NegativeBinomial
-- Copyright : (c) 2022 Lorenz Minder
-- License   : BSD3
--
-- Maintainer  : lminder@gmx.net
-- Stability   : experimental
-- Portability : portable
--
-- The negative binomial distribution.  This is the discrete probability
-- distribution of the number of failures in a sequence of independent
-- yes\/no experiments before a specified number of successes /r/.  Each
-- Bernoulli trial has success probability /p/ in the range (0, 1].  The
-- parameter /r/ must be positive, but does not have to be integer.
module Statistics.Distribution.NegativeBinomial
    (
      NegativeBinomialDistribution
    -- * Constructors
    , negativeBinomial
    , negativeBinomialE
    -- * Accessors
    , nbdSuccesses
    , nbdProbability
    ) where

import Control.Applicative
import Data.Aeson            (FromJSON(..), ToJSON, Value(..), (.:))
import Data.Binary           (Binary(..))
import Data.Data             (Data, Typeable)
import Data.Foldable         (foldl')
import GHC.Generics          (Generic)
import Numeric.SpecFunctions           (incompleteBeta, log1p)
import Numeric.SpecFunctions.Extra     (logChooseFast)
import Numeric.MathFunctions.Constants (m_epsilon, m_tiny)

import qualified Statistics.Distribution as D
import Statistics.Internal

-- Math helper functions

-- | Generalized binomial coefficients.
--
--   This computes binomial coefficients, with the small generalization
--   that /n/ need not be an integer, but can be real.
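--
--   For instance (a sketch; the second value is
--   \(\binom{2.5}{2} = \frac{2.5 \cdot 1.5}{2}\)):
--
-- > gChoose 5   2   -- 10.0
-- > gChoose 2.5 2   -- 1.875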
gChoose :: Double -> Int -> Double gChoose n k | k < 0 = 0 | k' >= 50 = exp $ logChooseFast n k' | otherwise = foldl' (*) 1 factors where factors = [ (n - k' + j) / j | j <- [1..k'] ] k' = fromIntegral k -- Implementation of Negative Binomial -- | The negative binomial distribution. data NegativeBinomialDistribution = NBD { nbdSuccesses :: {-# UNPACK #-} !Double -- ^ Number of successes until stop , nbdProbability :: {-# UNPACK #-} !Double -- ^ Success probability. } deriving (Eq, Typeable, Data, Generic) instance Show NegativeBinomialDistribution where showsPrec i (NBD r p) = defaultShow2 "negativeBinomial" r p i instance Read NegativeBinomialDistribution where readPrec = defaultReadPrecM2 "negativeBinomial" negativeBinomialE instance ToJSON NegativeBinomialDistribution instance FromJSON NegativeBinomialDistribution where parseJSON (Object v) = do r <- v .: "nbdSuccesses" p <- v .: "nbdProbability" maybe (fail $ errMsg r p) return $ negativeBinomialE r p parseJSON _ = empty instance Binary NegativeBinomialDistribution where put (NBD r p) = put r >> put p get = do r <- get p <- get maybe (fail $ errMsg r p) return $ negativeBinomialE r p instance D.Distribution NegativeBinomialDistribution where cumulative = cumulative complCumulative = complCumulative instance D.DiscreteDistr NegativeBinomialDistribution where probability = probability logProbability = logProbability instance D.Mean NegativeBinomialDistribution where mean = mean instance D.Variance NegativeBinomialDistribution where variance = variance instance D.MaybeMean NegativeBinomialDistribution where maybeMean = Just . D.mean instance D.MaybeVariance NegativeBinomialDistribution where maybeStdDev = Just . D.stdDev maybeVariance = Just . D.variance instance D.Entropy NegativeBinomialDistribution where entropy = directEntropy instance D.MaybeEntropy NegativeBinomialDistribution where maybeEntropy = Just . D.entropy -- This could be slow for big n probability :: NegativeBinomialDistribution -> Int -> Double probability d@(NBD r p) k | k < 0 = 0 -- Switch to log domain for large k + r to avoid overflows. -- -- We also want to avoid underflow when computing (1-p)^k & -- p^r. | k' + r < 1000 , pK >= m_tiny , pR >= m_tiny = gChoose (k' + r - 1) k * pK * pR | otherwise = exp $ logProbability d k where pK = exp $ log1p (-p) * k' pR = p**r k' = fromIntegral k logProbability :: NegativeBinomialDistribution -> Int -> Double logProbability (NBD r p) k | k < 0 = (-1)/0 | otherwise = logChooseFast (k' + r - 1) k' + log1p (-p) * k' + log p * r where k' = fromIntegral k cumulative :: NegativeBinomialDistribution -> Double -> Double cumulative (NBD r p) x | isNaN x = error "Statistics.Distribution.NegativeBinomial.cumulative: NaN input" | isInfinite x = if x > 0 then 1 else 0 | k < 0 = 0 | otherwise = incompleteBeta r (fromIntegral (k+1)) p where k = floor x :: Integer complCumulative :: NegativeBinomialDistribution -> Double -> Double complCumulative (NBD r p) x | isNaN x = error "Statistics.Distribution.NegativeBinomial.complCumulative: NaN input" | isInfinite x = if x > 0 then 0 else 1 | k < 0 = 1 | otherwise = incompleteBeta (fromIntegral (k+1)) r (1 - p) where k = (floor x)::Integer mean :: NegativeBinomialDistribution -> Double mean (NBD r p) = r * (1 - p)/p variance :: NegativeBinomialDistribution -> Double variance (NBD r p) = r * (1 - p)/(p * p) directEntropy :: NegativeBinomialDistribution -> Double directEntropy d = negate . 
  sum
  $ takeWhile (< -m_epsilon)
  $ dropWhile (>= -m_epsilon)
  $ [ let x = probability d k in x * log x | k <- [0..]]

-- | Construct negative binomial distribution. Number of successes /r/
--   must be positive and probability must be in (0,1] range
negativeBinomial :: Double             -- ^ Number of successes.
                 -> Double             -- ^ Success probability.
                 -> NegativeBinomialDistribution
negativeBinomial r p = maybe (error $ errMsg r p) id $ negativeBinomialE r p

-- | Construct negative binomial distribution. Number of successes /r/
--   must be positive and probability must be in (0,1] range
negativeBinomialE :: Double            -- ^ Number of successes.
                  -> Double            -- ^ Success probability.
                  -> Maybe NegativeBinomialDistribution
negativeBinomialE r p
  | r > 0 && 0 < p && p <= 1 = Just (NBD r p)
  | otherwise                = Nothing

errMsg :: Double -> Double -> String
errMsg r p
  =  "Statistics.Distribution.NegativeBinomial.negativeBinomial: r=" ++ show r
  ++ " p=" ++ show p ++ ", but need r>0 and p in (0,1]"
statistics-0.16.2.1/Statistics/Distribution/Normal.hs0000644000000000000000000001412707346545000020724 0ustar0000000000000000{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-}
-- |
-- Module    : Statistics.Distribution.Normal
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- The normal distribution.  This is a continuous probability
-- distribution that describes data that cluster around a mean.
module Statistics.Distribution.Normal
    (
      NormalDistribution
    -- * Constructors
    , normalDistr
    , normalDistrE
    , normalDistrErr
    , standard
    ) where

import Control.Applicative
import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:))
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import GHC.Generics (Generic)
import Numeric.MathFunctions.Constants (m_sqrt_2, m_sqrt_2_pi)
import Numeric.SpecFunctions (erfc, invErfc)
import qualified System.Random.MWC.Distributions as MWC
import qualified Data.Vector.Generic as G

import qualified Statistics.Distribution as D
import qualified Statistics.Sample as S
import Statistics.Internal

-- | The normal distribution.
data NormalDistribution = ND {
      mean       :: {-# UNPACK #-} !Double
    , stdDev     :: {-# UNPACK #-} !Double
    , ndPdfDenom :: {-# UNPACK #-} !Double
    , ndCdfDenom :: {-# UNPACK #-} !Double
    } deriving (Eq, Typeable, Data, Generic)

instance Show NormalDistribution where
  showsPrec i (ND m s _ _) = defaultShow2 "normalDistr" m s i
instance Read NormalDistribution where
  readPrec = defaultReadPrecM2 "normalDistr" normalDistrE

instance ToJSON NormalDistribution
instance FromJSON NormalDistribution where
  parseJSON (Object v) = do
    m  <- v .: "mean"
    sd <- v .: "stdDev"
    either fail return $ normalDistrErr m sd
  parseJSON _ = empty

instance Binary NormalDistribution where
  put (ND m sd _ _) = put m >> put sd
  get = do
    m  <- get
    sd <- get
    either fail return $ normalDistrErr m sd

instance D.Distribution NormalDistribution where
    cumulative      = cumulative
    complCumulative = complCumulative

instance D.ContDistr NormalDistribution where
    logDensity    = logDensity
    quantile      = quantile
    complQuantile = complQuantile

instance D.MaybeMean NormalDistribution where
    maybeMean = Just . D.mean

instance D.Mean NormalDistribution where
    mean = mean

instance D.MaybeVariance NormalDistribution where
    maybeStdDev   = Just . D.stdDev
    maybeVariance = Just . D.variance
instance D.Variance NormalDistribution where
    stdDev = stdDev

instance D.Entropy NormalDistribution where
  entropy d = 0.5 * log (2 * pi * exp 1 * D.variance d)

instance D.MaybeEntropy NormalDistribution where
  maybeEntropy = Just . D.entropy

instance D.ContGen NormalDistribution where
    genContVar d = MWC.normal (mean d) (stdDev d)

-- | Standard normal distribution with mean equal to 0 and variance equal to 1
standard :: NormalDistribution
standard = ND { mean       = 0.0
              , stdDev     = 1.0
              , ndPdfDenom = log m_sqrt_2_pi
              , ndCdfDenom = m_sqrt_2
              }

-- | Create normal distribution from parameters.
--
--   IMPORTANT: prior to 0.10 release second parameter was variance not
--   standard deviation.
normalDistr :: Double            -- ^ Mean of distribution
            -> Double            -- ^ Standard deviation of distribution
            -> NormalDistribution
normalDistr m sd = either error id $ normalDistrErr m sd

-- | Create normal distribution from parameters.
--
--   IMPORTANT: prior to 0.10 release second parameter was variance not
--   standard deviation.
normalDistrE :: Double            -- ^ Mean of distribution
             -> Double            -- ^ Standard deviation of distribution
             -> Maybe NormalDistribution
normalDistrE m sd = either (const Nothing) Just $ normalDistrErr m sd

-- | Create normal distribution from parameters.
--
normalDistrErr :: Double            -- ^ Mean of distribution
               -> Double            -- ^ Standard deviation of distribution
               -> Either String NormalDistribution
normalDistrErr m sd
  | sd > 0    = Right $ ND { mean       = m
                           , stdDev     = sd
                           , ndPdfDenom = log $ m_sqrt_2_pi * sd
                           , ndCdfDenom = m_sqrt_2 * sd
                           }
  | otherwise = Left $ errMsg m sd

errMsg :: Double -> Double -> String
errMsg _ sd = "Statistics.Distribution.Normal.normalDistr: standard deviation must be positive. Got " ++ show sd

-- | Variance is estimated using maximum likelihood method
--   (biased estimation).
--
--   Returns @Nothing@ if sample contains fewer than two elements or
--   variance is zero (all elements are equal)
instance D.FromSample NormalDistribution Double where
  fromSample xs
    | G.length xs <= 1 = Nothing
    | v == 0           = Nothing
    | otherwise        = Just $! normalDistr m (sqrt v)
    where
      (m,v) = S.meanVariance xs

logDensity :: NormalDistribution -> Double -> Double
logDensity d x = (-xm * xm / (2 * sd * sd)) - ndPdfDenom d
    where xm = x - mean d
          sd = stdDev d

cumulative :: NormalDistribution -> Double -> Double
cumulative d x = erfc ((mean d - x) / ndCdfDenom d) / 2

complCumulative :: NormalDistribution -> Double -> Double
complCumulative d x = erfc ((x - mean d) / ndCdfDenom d) / 2

quantile :: NormalDistribution -> Double -> Double
quantile d p
  | p == 0         = -inf
  | p == 1         = inf
  | p == 0.5       = mean d
  | p > 0 && p < 1 = x * ndCdfDenom d + mean d
  | otherwise      =
    error $ "Statistics.Distribution.Normal.quantile: p must be in [0,1] range. Got: "++show p
  where x   = - invErfc (2 * p)
        inf = 1/0

complQuantile :: NormalDistribution -> Double -> Double
complQuantile d p
  | p == 0         = inf
  | p == 1         = -inf
  | p == 0.5       = mean d
  | p > 0 && p < 1 = x * ndCdfDenom d + mean d
  | otherwise      =
    error $ "Statistics.Distribution.Normal.complQuantile: p must be in [0,1] range.
Got: "++show p where x = invErfc (2 * p) inf = 1/0 statistics-0.16.2.1/Statistics/Distribution/Poisson.hs0000644000000000000000000000714407346545000021127 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.Poisson -- Copyright : (c) 2009, 2011 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The Poisson distribution. This is the discrete probability -- distribution of a number of events occurring in a fixed interval if -- these events occur with a known average rate, and occur -- independently from each other within that interval. module Statistics.Distribution.Poisson ( PoissonDistribution -- * Constructors , poisson , poissonE -- * Accessors , poissonLambda -- * References -- $references ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import Numeric.SpecFunctions (incompleteGamma,logFactorial) import Numeric.MathFunctions.Constants (m_neg_inf) import qualified Statistics.Distribution as D import qualified Statistics.Distribution.Poisson.Internal as I import Statistics.Internal newtype PoissonDistribution = PD { poissonLambda :: Double } deriving (Eq, Typeable, Data, Generic) instance Show PoissonDistribution where showsPrec i (PD l) = defaultShow1 "poisson" l i instance Read PoissonDistribution where readPrec = defaultReadPrecM1 "poisson" poissonE instance ToJSON PoissonDistribution instance FromJSON PoissonDistribution where parseJSON (Object v) = do l <- v .: "poissonLambda" maybe (fail $ errMsg l) return $ poissonE l parseJSON _ = empty instance Binary PoissonDistribution where put = put . poissonLambda get = do l <- get maybe (fail $ errMsg l) return $ poissonE l instance D.Distribution PoissonDistribution where cumulative (PD lambda) x | x < 0 = 0 | isInfinite x = 1 | isNaN x = error "Statistics.Distribution.Poisson.cumulative: NaN input" | otherwise = 1 - incompleteGamma (fromIntegral (floor x + 1 :: Int)) lambda instance D.DiscreteDistr PoissonDistribution where probability (PD lambda) x = I.probability lambda (fromIntegral x) logProbability (PD lambda) i | i < 0 = m_neg_inf | otherwise = log lambda * fromIntegral i - logFactorial i - lambda instance D.Variance PoissonDistribution where variance = poissonLambda instance D.Mean PoissonDistribution where mean = poissonLambda instance D.MaybeMean PoissonDistribution where maybeMean = Just . D.mean instance D.MaybeVariance PoissonDistribution where maybeStdDev = Just . D.stdDev instance D.Entropy PoissonDistribution where entropy (PD lambda) = I.poissonEntropy lambda instance D.MaybeEntropy PoissonDistribution where maybeEntropy = Just . D.entropy -- | Create Poisson distribution. poisson :: Double -> PoissonDistribution poisson l = maybe (error $ errMsg l) id $ poissonE l -- | Create Poisson distribution. poissonE :: Double -> Maybe PoissonDistribution poissonE l | l >= 0 = Just (PD l) | otherwise = Nothing errMsg :: Double -> String errMsg l = "Statistics.Distribution.Poisson.poisson: lambda must be non-negative. Got " ++ show l -- $references -- -- * Loader, C. (2000) Fast and Accurate Computation of Binomial -- Probabilities. -- * Adell, J., Lekuona, A., and Yu, Y. 
(2010) Sharp Bounds on the -- Entropy of the Poisson Law and Related Quantities -- statistics-0.16.2.1/Statistics/Distribution/Poisson/0000755000000000000000000000000007346545000020565 5ustar0000000000000000statistics-0.16.2.1/Statistics/Distribution/Poisson/Internal.hs0000644000000000000000000001550207346545000022700 0ustar0000000000000000-- | -- Module : Statistics.Distribution.Poisson.Internal -- Copyright : (c) 2011 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Internal code for the Poisson distribution. module Statistics.Distribution.Poisson.Internal ( probability, poissonEntropy ) where import Data.List (unfoldr) import Numeric.MathFunctions.Constants (m_sqrt_2_pi, m_tiny, m_epsilon) import Numeric.SpecFunctions (logGamma, stirlingError {-, choose, logFactorial -}) import Numeric.SpecFunctions.Extra (bd0) -- | An unchecked, non-integer-valued version of Loader's saddle point -- algorithm. probability :: Double -> Double -> Double probability 0 0 = 1 probability 0 1 = 0 probability lambda x | isInfinite lambda = 0 | x < 0 = 0 | x <= lambda * m_tiny = exp (-lambda) | lambda < x * m_tiny = exp (-lambda + x * log lambda - logGamma (x+1)) | otherwise = exp (-(stirlingError x) - bd0 x lambda) / (m_sqrt_2_pi * sqrt x) -- -- | Compute entropy using Theorem 1 from "Sharp Bounds on the Entropy -- -- of the Poisson Law". This function is unused because 'directEntropy' -- -- is just as accurate and is faster by about a factor of 4. -- alyThm1 :: Double -> Double -- alyThm1 lambda = -- sum (takeWhile (\x -> abs x >= m_epsilon * lll) alySeries) + lll -- where lll = lambda * (1 - log lambda) -- alySeries = -- [ alyc k * exp (fromIntegral k * log lambda - logFactorial k) -- | k <- [2..] ] -- alyc :: Int -> Double -- alyc k = -- sum [ parity j * choose (k-1) j * log (fromIntegral j+1) | j <- [0..k-1] ] -- where parity j -- | even (k-j) = -1 -- | otherwise = 1 -- | Returns [x, x^2, x^3, x^4, ...] powers :: Double -> [Double] powers x = unfoldr (\y -> Just (y*x,y*x)) 1 -- | Returns an upper bound according to theorem 2 of "Sharp Bounds on -- the Entropy of the Poisson Law" alyThm2Upper :: Double -> [Double] -> Double alyThm2Upper lambda coefficients = 1.4189385332046727 + 0.5 * log lambda + zipCoefficients lambda coefficients -- | Returns the average of the upper and lower bounds according to -- theorem 2. 
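--
--   In other words: since the lower bound equals the upper bound plus
--   the lower-series correction, the midpoint is simply
--
-- > upper + 0.5 * correction
--
--   which is exactly what the implementation below computes.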
alyThm2 :: Double -> [Double] -> [Double] -> Double
alyThm2 lambda upper lower =
  alyThm2Upper lambda upper + 0.5 * (zipCoefficients lambda lower)

zipCoefficients :: Double -> [Double] -> Double
zipCoefficients lambda coefficients =
  (sum $ map (uncurry (*)) (zip (powers $ recip lambda) coefficients))

-- Mathematica code deriving the coefficients below:
--
-- poissonMoment[0, s_] := 1
-- poissonMoment[1, s_] := 0
-- poissonMoment[k_, s_] :=
--   Sum[s * Binomial[k - 1, j] * poissonMoment[j, s], {j, 0, k - 2}]
--
-- upperSeries[m_] :=
--   Distribute[Integrate[
--     Sum[(-1)^(j - 1) *
--          poissonMoment[j, \[Lambda]] / (j * (j - 1)* \[Lambda]^j),
--          {j, 3, 2 m - 1}],
--     \[Lambda]]]
--
-- lowerSeries[m_] :=
--   Distribute[Integrate[
--     poissonMoment[
--       2 m + 2, \[Lambda]] / ((2 m +
--            1)*\[Lambda]^(2 m + 2)), \[Lambda]]]
--
-- upperBound[m_] := upperSeries[m] + (Log[2*Pi*\[Lambda]] + 1)/2
--
-- lowerBound[m_] := upperBound[m] + lowerSeries[m]

upperCoefficients4 :: [Double]
upperCoefficients4 = [1/12, 1/24, -103/180, -13/40, -1/210]

lowerCoefficients4 :: [Double]
lowerCoefficients4 = [0,0,0, -105/4, -210, -2275/18, -167/21, -1/72]

upperCoefficients6 :: [Double]
upperCoefficients6 = [1/12, 1/24, 19/360, 9/80, -38827/2520, -74855/1008,
                      -73061/2520, -827/720, -1/990]

lowerCoefficients6 :: [Double]
lowerCoefficients6 = [0,0,0,0,0, -3465/2, -45045, -466235/4, -531916/9,
                      -56287/10, -629/11, -1/156]

upperCoefficients8 :: [Double]
upperCoefficients8 = [1/12, 1/24, 19/360, 9/80, 863/2520, 1375/1008,
                      -3023561/2520, -15174047/720, -231835511/5940,
                      -18927611/1320, -58315591/60060, -23641/3640,
                      -1/2730]

lowerCoefficients8 :: [Double]
lowerCoefficients8 = [0,0,0,0,0,0,0, -2027025/8, -15315300, -105252147,
                      -178127950, -343908565/4, -10929270, -3721149/14,
                      -7709/15, -1/272]

upperCoefficients10 :: [Double]
upperCoefficients10 = [1/12, 1/24, 19/360, 9/80, 863/2520, 1375/1008,
                       33953/5040, 57281/1440, -2271071617/11880,
                       -1483674219/176, -31714406276557/720720,
                       -7531072742237/131040, -1405507544003/65520,
                       -21001919627/10080, -1365808297/36720,
                       -26059/544, -1/5814]

lowerCoefficients10 :: [Double]
lowerCoefficients10 = [0,0,0,0,0,0,0,0,0,-130945815/2, -7638505875,
                       -438256243425/4, -435477637540, -3552526473925/6,
                       -857611717105/3, -545654955967/12, -5794690528/3,
                       -578334559/42, -699043/133, -1/420]

upperCoefficients12 :: [Double]
upperCoefficients12 = [1/12, 1/24, 19/360, 9/80, 863/2520, 1375/1008,
                       33953/5040, 57281/1440, 3250433/11880, 378351/176,
                       -37521922090657/720720, -612415657466657/131040,
                       -3476857538815223/65520, -243882174660761/1440,
                       -34160796727900637/183600, -39453820646687/544,
                       -750984629069237/81396, -2934056300989/9576,
                       -20394527513/12540, -3829559/9240, -1/10626]

lowerCoefficients12 :: [Double]
lowerCoefficients12 = [0,0,0,0,0,0,0,0,0,0,0, -105411381075/4,
                       -5270569053750, -272908057767345/2,
                       -1051953238104769, -24557168490009155/8,
                       -3683261873403112, -5461918738302026/3,
                       -347362037754732, -2205885452434521/100,
                       -12237195698286/35, -16926981721/22,
                       -6710881/155, -1/600]

-- | Compute entropy directly from its definition. This is just as accurate
-- as 'alyThm1' for lambda <= 1 and is faster, but is slow for large lambda,
-- and produces some underestimation due to accumulation of floating point
-- error.
directEntropy :: Double -> Double
directEntropy lambda =
  negate . sum $
  takeWhile (< negate m_epsilon * lambda) $
  dropWhile (not . (< negate m_epsilon * lambda)) $
  [ let x = probability lambda k in x * log x | k <- [0..]]

-- | Compute the entropy of a Poisson distribution using the best available
--   method.
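--
--   Sketch of expected behaviour (values in nats, approximate):
--
-- > poissonEntropy 1     -- ≈ 1.3048, via the direct sum
-- > poissonEntropy 100   -- ≈ 3.72, close to 0.5 * log (2*pi*exp 1*100)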
poissonEntropy :: Double -> Double
poissonEntropy lambda
  | lambda == 0  = 0
  | lambda <= 10 = directEntropy lambda
  | lambda <= 12 = alyThm2 lambda upperCoefficients4 lowerCoefficients4
  | lambda <= 18 = alyThm2 lambda upperCoefficients6 lowerCoefficients6
  | lambda <= 24 = alyThm2 lambda upperCoefficients8 lowerCoefficients8
  | lambda <= 30 = alyThm2 lambda upperCoefficients10 lowerCoefficients10
  | otherwise    = alyThm2 lambda upperCoefficients12 lowerCoefficients12
statistics-0.16.2.1/Statistics/Distribution/StudentT.hs0000644000000000000000000001061507346545000021244 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-}
-- |
-- Module    : Statistics.Distribution.StudentT
-- Copyright : (c) 2011 Aleksey Khudyakov
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Student-T distribution
module Statistics.Distribution.StudentT (
    StudentT
    -- * Constructors
  , studentT
  , studentTE
  , studentTUnstandardized
    -- * Accessors
  , studentTndf
  ) where

import Control.Applicative
import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:))
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import GHC.Generics (Generic)
import Numeric.SpecFunctions (
  logBeta, incompleteBeta, invIncompleteBeta, digamma)

import qualified Statistics.Distribution as D
import Statistics.Distribution.Transform (LinearTransform (..))
import Statistics.Internal


-- | Student-T distribution
newtype StudentT = StudentT { studentTndf :: Double }
                   deriving (Eq, Typeable, Data, Generic)

instance Show StudentT where
  showsPrec i (StudentT ndf) = defaultShow1 "studentT" ndf i
instance Read StudentT where
  readPrec = defaultReadPrecM1 "studentT" studentTE

instance ToJSON StudentT
instance FromJSON StudentT where
  parseJSON (Object v) = do
    ndf <- v .: "studentTndf"
    maybe (fail $ errMsg ndf) return $ studentTE ndf
  parseJSON _ = empty

instance Binary StudentT where
  put = put . studentTndf
  get = do
    ndf <- get
    maybe (fail $ errMsg ndf) return $ studentTE ndf

-- | Create Student-T distribution. The number of degrees of freedom
--   must be positive.
studentT :: Double -> StudentT
studentT ndf = maybe (error $ errMsg ndf) id $ studentTE ndf

-- | Create Student-T distribution. The number of degrees of freedom
--   must be positive.
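--
--   For example (sketch):
--
-- > studentTE 3      -- Just (studentT 3.0)
-- > studentTE (-1)   -- Nothing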
studentTE :: Double -> Maybe StudentT studentTE ndf | ndf > 0 = Just (StudentT ndf) | otherwise = Nothing errMsg :: Double -> String errMsg _ = modErr "studentT" "non-positive number of degrees of freedom" instance D.Distribution StudentT where cumulative = cumulative complCumulative = complCumulative instance D.ContDistr StudentT where density d@(StudentT ndf) x = exp (logDensityUnscaled d x) / sqrt ndf logDensity d@(StudentT ndf) x = logDensityUnscaled d x - log (sqrt ndf) quantile = quantile cumulative :: StudentT -> Double -> Double cumulative (StudentT ndf) x | x > 0 = 1 - 0.5 * ibeta | otherwise = 0.5 * ibeta where ibeta = incompleteBeta (0.5 * ndf) 0.5 (ndf / (ndf + x*x)) complCumulative :: StudentT -> Double -> Double complCumulative (StudentT ndf) x | x > 0 = 0.5 * ibeta | otherwise = 1 - 0.5 * ibeta where ibeta = incompleteBeta (0.5 * ndf) 0.5 (ndf / (ndf + x*x)) logDensityUnscaled :: StudentT -> Double -> Double logDensityUnscaled (StudentT ndf) x = log (ndf / (ndf + x*x)) * (0.5 * (1 + ndf)) - logBeta 0.5 (0.5 * ndf) quantile :: StudentT -> Double -> Double quantile (StudentT ndf) p | p >= 0 && p <= 1 = let x = invIncompleteBeta (0.5 * ndf) 0.5 (2 * min p (1 - p)) in case sqrt $ ndf * (1 - x) / x of r | p < 0.5 -> -r | otherwise -> r | otherwise = modErr "quantile" $ "p must be in [0,1] range. Got: "++show p instance D.MaybeMean StudentT where maybeMean (StudentT ndf) | ndf > 1 = Just 0 | otherwise = Nothing instance D.MaybeVariance StudentT where maybeVariance (StudentT ndf) | ndf > 2 = Just $! ndf / (ndf - 2) | otherwise = Nothing instance D.Entropy StudentT where entropy (StudentT ndf) = 0.5 * (ndf+1) * (digamma ((1+ndf)/2) - digamma(ndf/2)) + log (sqrt ndf) + logBeta (ndf/2) 0.5 instance D.MaybeEntropy StudentT where maybeEntropy = Just . D.entropy instance D.ContGen StudentT where genContVar = D.genContinuous -- | Create an unstandardized Student-t distribution. studentTUnstandardized :: Double -- ^ Number of degrees of freedom -> Double -- ^ Central value (0 for standard Student T distribution) -> Double -- ^ Scale parameter -> LinearTransform StudentT studentTUnstandardized ndf mu sigma | sigma > 0 = LinearTransform mu sigma $ studentT ndf | otherwise = modErr "studentTUnstandardized" $ "sigma must be > 0. Got: " ++ show sigma modErr :: String -> String -> a modErr fun msg = error $ "Statistics.Distribution.StudentT." ++ fun ++ ": " ++ msg statistics-0.16.2.1/Statistics/Distribution/Transform.hs0000644000000000000000000000702107346545000021442 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, DeriveGeneric, FlexibleContexts, FlexibleInstances, UndecidableInstances #-} -- | -- Module : Statistics.Distribution.Transform -- Copyright : (c) 2013 John McDonnell; -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Transformations over distributions module Statistics.Distribution.Transform ( LinearTransform (..) , linTransFixedPoint , scaleAround ) where import Data.Aeson (FromJSON, ToJSON) import Data.Binary (Binary) import Data.Binary (put, get) import Data.Data (Data, Typeable) import GHC.Generics (Generic) import qualified Statistics.Distribution as D -- | Linear transformation applied to distribution. -- -- > LinearTransform μ σ _ -- > x' = μ + σ·x data LinearTransform d = LinearTransform { linTransLocation :: {-# UNPACK #-} !Double -- ^ Location parameter. , linTransScale :: {-# UNPACK #-} !Double -- ^ Scale parameter. , linTransDistr :: d -- ^ Distribution being transformed. 
} deriving (Eq, Show, Read, Typeable, Data, Generic) instance (FromJSON d) => FromJSON (LinearTransform d) instance (ToJSON d) => ToJSON (LinearTransform d) instance (Binary d) => Binary (LinearTransform d) where get = LinearTransform <$> get <*> get <*> get put (LinearTransform x y z) = put x >> put y >> put z -- | Apply linear transformation to distribution. scaleAround :: Double -- ^ Fixed point -> Double -- ^ Scale parameter -> d -- ^ Distribution -> LinearTransform d scaleAround x0 sc = LinearTransform (x0 * (1 - sc)) sc -- | Get fixed point of linear transformation linTransFixedPoint :: LinearTransform d -> Double linTransFixedPoint (LinearTransform loc sc _) = loc / (1 - sc) instance Functor LinearTransform where fmap f (LinearTransform loc sc dist) = LinearTransform loc sc (f dist) instance D.Distribution d => D.Distribution (LinearTransform d) where cumulative (LinearTransform loc sc dist) x = D.cumulative dist $ (x-loc) / sc instance D.ContDistr d => D.ContDistr (LinearTransform d) where density (LinearTransform loc sc dist) x = D.density dist ((x-loc) / sc) / sc logDensity (LinearTransform loc sc dist) x = D.logDensity dist ((x-loc) / sc) - log sc quantile (LinearTransform loc sc dist) p = loc + sc * D.quantile dist p complQuantile (LinearTransform loc sc dist) p = loc + sc * D.complQuantile dist p instance D.MaybeMean d => D.MaybeMean (LinearTransform d) where maybeMean (LinearTransform loc _ dist) = (+loc) <$> D.maybeMean dist instance (D.Mean d) => D.Mean (LinearTransform d) where mean (LinearTransform loc _ dist) = loc + D.mean dist instance D.MaybeVariance d => D.MaybeVariance (LinearTransform d) where maybeVariance (LinearTransform _ sc dist) = (*(sc*sc)) <$> D.maybeVariance dist maybeStdDev (LinearTransform _ sc dist) = (*sc) <$> D.maybeStdDev dist instance (D.Variance d) => D.Variance (LinearTransform d) where variance (LinearTransform _ sc dist) = sc * sc * D.variance dist stdDev (LinearTransform _ sc dist) = sc * D.stdDev dist instance (D.MaybeEntropy d) => D.MaybeEntropy (LinearTransform d) where maybeEntropy (LinearTransform _ _ dist) = D.maybeEntropy dist instance (D.Entropy d) => D.Entropy (LinearTransform d) where entropy (LinearTransform _ _ dist) = D.entropy dist instance D.ContGen d => D.ContGen (LinearTransform d) where genContVar (LinearTransform loc sc d) g = do x <- D.genContVar d g return $! loc + sc * x statistics-0.16.2.1/Statistics/Distribution/Uniform.hs0000644000000000000000000000752007346545000021112 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} -- | -- Module : Statistics.Distribution.Uniform -- Copyright : (c) 2011 Aleksey Khudyakov -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Variate distributed uniformly in the interval. 
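--
-- A short usage sketch ('cumulative' and 'mean' come from
-- "Statistics.Distribution"):
--
-- > let d = uniformDistr 0 10
-- > cumulative d 2.5   -- 0.25
-- > mean d             -- 5.0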
module Statistics.Distribution.Uniform ( UniformDistribution -- * Constructors , uniformDistr , uniformDistrE -- ** Accessors , uniformA , uniformB ) where import Control.Applicative import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import System.Random.Stateful (uniformRM) import GHC.Generics (Generic) import qualified Statistics.Distribution as D import Statistics.Internal -- | Uniform distribution from A to B data UniformDistribution = UniformDistribution { uniformA :: {-# UNPACK #-} !Double -- ^ Low boundary of distribution , uniformB :: {-# UNPACK #-} !Double -- ^ Upper boundary of distribution } deriving (Eq, Typeable, Data, Generic) instance Show UniformDistribution where showsPrec i (UniformDistribution a b) = defaultShow2 "uniformDistr" a b i instance Read UniformDistribution where readPrec = defaultReadPrecM2 "uniformDistr" uniformDistrE instance ToJSON UniformDistribution instance FromJSON UniformDistribution where parseJSON (Object v) = do a <- v .: "uniformA" b <- v .: "uniformB" maybe (fail errMsg) return $ uniformDistrE a b parseJSON _ = empty instance Binary UniformDistribution where put (UniformDistribution x y) = put x >> put y get = do a <- get b <- get maybe (fail errMsg) return $ uniformDistrE a b -- | Create uniform distribution. uniformDistr :: Double -> Double -> UniformDistribution uniformDistr a b = maybe (error errMsg) id $ uniformDistrE a b -- | Create uniform distribution. uniformDistrE :: Double -> Double -> Maybe UniformDistribution uniformDistrE a b | b < a = Just $ UniformDistribution b a | a < b = Just $ UniformDistribution a b | otherwise = Nothing -- NOTE: failure is in default branch to guard against NaNs. errMsg :: String errMsg = "Statistics.Distribution.Uniform.uniform: wrong parameters" instance D.Distribution UniformDistribution where cumulative (UniformDistribution a b) x | x < a = 0 | x > b = 1 | otherwise = (x - a) / (b - a) instance D.ContDistr UniformDistribution where density (UniformDistribution a b) x | x < a = 0 | x > b = 0 | otherwise = 1 / (b - a) quantile (UniformDistribution a b) p | p >= 0 && p <= 1 = a + (b - a) * p | otherwise = error $ "Statistics.Distribution.Uniform.quantile: p must be in [0,1] range. Got: "++show p complQuantile (UniformDistribution a b) p | p >= 0 && p <= 1 = b + (a - b) * p | otherwise = error $ "Statistics.Distribution.Uniform.complQuantile: p must be in [0,1] range. Got: "++show p instance D.Mean UniformDistribution where mean (UniformDistribution a b) = 0.5 * (a + b) instance D.Variance UniformDistribution where -- NOTE: 1/sqrt 12 is not constant folded (#4101) so it's written as -- numerical constant. (Also FIXME!) stdDev (UniformDistribution a b) = 0.2886751345948129 * (b - a) variance (UniformDistribution a b) = d * d / 12 where d = b - a instance D.MaybeMean UniformDistribution where maybeMean = Just . D.mean instance D.MaybeVariance UniformDistribution where maybeStdDev = Just . D.stdDev instance D.Entropy UniformDistribution where entropy (UniformDistribution a b) = log (b - a) instance D.MaybeEntropy UniformDistribution where maybeEntropy = Just . 
      D.entropy

instance D.ContGen UniformDistribution where
  genContVar (UniformDistribution a b) = uniformRM (a,b)
statistics-0.16.2.1/Statistics/Distribution/Weibull.hs0000644000000000000000000001701407346545000021075 0ustar0000000000000000{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-}
-- |
-- Module    : Statistics.Distribution.Weibull
-- Copyright : (c) 2020 Ximin Luo
-- License   : BSD3
--
-- Maintainer  : infinity0@pwned.gg
-- Stability   : experimental
-- Portability : portable
--
-- The Weibull distribution.  This is a continuous probability
-- distribution that describes the occurrence of a single event whose
-- probability changes over time, controlled by the shape parameter.
module Statistics.Distribution.Weibull
    (
      WeibullDistribution
      -- * Constructors
    , weibullDistr
    , weibullDistrErr
    , weibullStandard
    , weibullDistrApproxMeanStddevErr
    ) where

import Control.Applicative
import Data.Aeson (FromJSON(..), ToJSON, Value(..), (.:))
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import GHC.Generics (Generic)
import Numeric.MathFunctions.Constants (m_eulerMascheroni, m_neg_inf)
import Numeric.SpecFunctions (expm1, log1p, logGamma)
import qualified Data.Vector.Generic as G

import qualified Statistics.Distribution as D
import qualified Statistics.Sample as S
import Statistics.Internal

-- | The Weibull distribution.
data WeibullDistribution = WD {
      wdShape  :: {-# UNPACK #-} !Double -- ^ Shape parameter.
    , wdLambda :: {-# UNPACK #-} !Double -- ^ Scale parameter, lambda.
    } deriving (Eq, Typeable, Data, Generic)

instance Show WeibullDistribution where
  showsPrec i (WD k l) = defaultShow2 "weibullDistr" k l i
instance Read WeibullDistribution where
  readPrec = defaultReadPrecM2 "weibullDistr" $
    (either (const Nothing) Just .) . weibullDistrErr

instance ToJSON WeibullDistribution
instance FromJSON WeibullDistribution where
  parseJSON (Object v) = do
    k <- v .: "wdShape"
    l <- v .: "wdLambda"
    either fail return $ weibullDistrErr k l
  parseJSON _ = empty

instance Binary WeibullDistribution where
  put (WD k l) = put k >> put l
  get = do
    k <- get
    l <- get
    either fail return $ weibullDistrErr k l

instance D.Distribution WeibullDistribution where
  cumulative      = cumulative
  complCumulative = complCumulative

instance D.ContDistr WeibullDistribution where
  logDensity    = logDensity
  quantile      = quantile
  complQuantile = complQuantile

instance D.MaybeMean WeibullDistribution where
  maybeMean = Just . D.mean

instance D.Mean WeibullDistribution where
  mean (WD k l) = l * exp (logGamma (1 + 1 / k))

instance D.MaybeVariance WeibullDistribution where
  maybeStdDev   = Just . D.stdDev
  maybeVariance = Just . D.variance

instance D.Variance WeibullDistribution where
  variance (WD k l) = l * l * (exp (logGamma (1 + 2 * invk)) - q * q)
    where
      invk = 1 / k
      q    = exp (logGamma (1 + invk))

instance D.Entropy WeibullDistribution where
  entropy (WD k l) = m_eulerMascheroni * (1 - 1 / k) + log (l / k) + 1

instance D.MaybeEntropy WeibullDistribution where
  maybeEntropy = Just . D.entropy

instance D.ContGen WeibullDistribution where
  genContVar d = D.genContinuous d

-- | Standard Weibull distribution with scale factor (lambda) 1.
weibullStandard :: Double -> WeibullDistribution
weibullStandard k = weibullDistr k 1.0

-- | Create Weibull distribution from parameters.
--
--   If the shape (first) parameter is @1.0@, the distribution is
--   equivalent to a
--   'Statistics.Distribution.Exponential.ExponentialDistribution' with
--   parameter @1 / lambda@, where lambda is the scale (second)
--   parameter.
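--
--   For instance (a sketch): with shape @1.0@ the CDF reduces to the
--   exponential one, \(1 - e^{-x/\lambda}\), so
--
-- > cumulative (weibullDistr 1 2) 2   -- 1 - exp (-1) ≈ 0.6321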
weibullDistr
  :: Double            -- ^ Shape
  -> Double            -- ^ Lambda (scale)
  -> WeibullDistribution
weibullDistr k l = either error id $ weibullDistrErr k l

-- | Create Weibull distribution from parameters.
--
-- If the shape (first) parameter is @1.0@, the distribution is equivalent to a
-- 'Statistics.Distribution.Exponential.ExponentialDistribution' with rate
-- @1 / lambda@, where lambda is the scale (second) parameter.
weibullDistrErr
  :: Double            -- ^ Shape
  -> Double            -- ^ Lambda (scale)
  -> Either String WeibullDistribution
weibullDistrErr k l | k <= 0    = Left $ errMsg k l
                    | l <= 0    = Left $ errMsg k l
                    | otherwise = Right $ WD k l

errMsg :: Double -> Double -> String
errMsg k l =
  "Statistics.Distribution.Weibull.weibullDistr: both shape and lambda must be positive. Got shape "
    ++ show k
    ++ " and lambda "
    ++ show l

-- | Create Weibull distribution from mean and standard deviation.
--
-- The algorithm is from "Methods for Estimating Wind Speed Frequency
-- Distributions", C. G. Justus, W. R. Hargreaves, A. Mikhail, D. Graber, 1977.
-- Given the identity:
--
-- \[
-- (\frac{\sigma}{\mu})^2 = \frac{\Gamma(1+2/k)}{\Gamma(1+1/k)^2} - 1
-- \]
--
-- \(k\) can be approximated by
--
-- \[
-- k \approx (\frac{\sigma}{\mu})^{-1.086}
-- \]
--
-- \(\lambda\) is then calculated straightforwardly via the identity
--
-- \[
-- \lambda = \frac{\mu}{\Gamma(1+1/k)}
-- \]
--
-- Numerically speaking, the approximation for \(k\) is accurate only within a
-- certain range.  We arbitrarily pick the range
-- \(0.033 \le \frac{\sigma}{\mu} \le 1.45\) where it is good to ~6%, and will
-- refuse to create a distribution outside of this range.  The paper does not
-- cover these details but it is straightforward to check them numerically.
weibullDistrApproxMeanStddevErr
  :: Double            -- ^ Mean
  -> Double            -- ^ Stddev
  -> Either String WeibullDistribution
weibullDistrApproxMeanStddevErr m s = if r > 1.45 || r < 0.033
    then Left msg
    else weibullDistrErr k l
  where r = s / m
        k = (s / m) ** (-1.086)
        l = m / exp (logGamma (1 + 1/k))
        msg = "Statistics.Distribution.Weibull.weibullDistr: stddev-mean ratio "
           ++ "outside approximation accuracy range [0.033, 1.45]. Got "
           ++ "stddev " ++ show s ++ " and mean " ++ show m

-- | Uses the mean-and-standard-deviation approximation of
-- 'weibullDistrApproxMeanStddevErr', with both statistics estimated from
-- the sample (the variance is the unbiased estimate).
--
-- Returns @Nothing@ if the sample contains fewer than two elements, if
-- the variance is zero (all elements are equal), or if the estimated
-- mean and standard deviation lie outside the range for which the
-- approximation is accurate.
instance D.FromSample WeibullDistribution Double where
  fromSample xs
    | G.length xs <= 1 = Nothing
    | v == 0           = Nothing
    | otherwise        = either (const Nothing) Just $
                         weibullDistrApproxMeanStddevErr m (sqrt v)
    where (m,v) = S.meanVarianceUnb xs

logDensity :: WeibullDistribution -> Double -> Double
logDensity (WD k l) x
  -- Zero density outside the support, hence negative-infinite log-density.
  | x < 0     = log 0
  | otherwise = log k + (k - 1) * log x - k * log l - (x / l) ** k

cumulative :: WeibullDistribution -> Double -> Double
cumulative (WD k l) x | x < 0     = 0
                      | otherwise = -expm1 (-(x / l) ** k)

complCumulative :: WeibullDistribution -> Double -> Double
complCumulative (WD k l) x | x < 0     = 1
                           | otherwise = exp (-(x / l) ** k)

quantile :: WeibullDistribution -> Double -> Double
quantile (WD k l) p
  | p == 0         = 0
  | p == 1         = inf
  | p > 0 && p < 1 = l * (-log1p (-p)) ** (1 / k)
  | otherwise      =
    error $ "Statistics.Distribution.Weibull.quantile: p must be in [0,1] range. Got: " ++ show p
  where inf = 1 / 0

complQuantile :: WeibullDistribution -> Double -> Double
complQuantile (WD k l) q
  | q == 0         = inf
  | q == 1         = 0
  | q > 0 && q < 1 = l * (-log q) ** (1 / k)
  | otherwise      =
    error $ "Statistics.Distribution.Weibull.complQuantile: q must be in [0,1] range. Got: " ++ show q
  where inf = 1 / 0
statistics-0.16.2.1/Statistics/Function.hs0000644000000000000000000001001407346545000016571 0ustar0000000000000000{-# LANGUAGE BangPatterns, CPP, FlexibleContexts, Rank2Types #-}
{-# OPTIONS_GHC -fsimpl-tick-factor=200 #-}
-- |
-- Module    : Statistics.Function
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Useful functions.

module Statistics.Function
    (
    -- * Scanning
      minMax
    -- * Sorting
    , sort
    , gsort
    , sortBy
    , partialSort
    -- * Indexing
    , indexed
    , indices
    -- * Bit twiddling
    , nextHighestPowerOfTwo
    -- * Comparison
    , within
    -- * Arithmetic
    , square
    -- * Vectors
    , unsafeModify
    -- * Combinators
    , for
    , rfor
    ) where

#include "MachDeps.h"

import Control.Monad.ST (ST)
import Data.Bits ((.|.), shiftR)
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as M
import Numeric.MathFunctions.Comparison (within)

-- | Sort a vector.
sort :: U.Vector Double -> U.Vector Double
sort = G.modify I.sort
{-# NOINLINE sort #-}

-- | Sort a vector.
gsort :: (Ord e, G.Vector v e) => v e -> v e
gsort = G.modify I.sort
{-# INLINE gsort #-}

-- | Sort a vector using a custom ordering.
sortBy :: (G.Vector v e) => I.Comparison e -> v e -> v e
sortBy f = G.modify $ I.sortBy f
{-# INLINE sortBy #-}

-- | Partially sort a vector, such that the least /k/ elements will be
-- at the front.
partialSort :: (G.Vector v e, Ord e) =>
               Int        -- ^ The number /k/ of least elements.
            -> v e
            -> v e
partialSort k = G.modify (`I.partialSort` k)
{-# SPECIALIZE partialSort :: Int -> U.Vector Double -> U.Vector Double #-}

-- | Return the indices of a vector.
indices :: (G.Vector v a, G.Vector v Int) => v a -> v Int
indices a = G.enumFromTo 0 (G.length a - 1)
{-# INLINE indices #-}

-- | Zip a vector with its indices.
indexed :: (G.Vector v e, G.Vector v Int, G.Vector v (Int,e)) => v e -> v (Int,e)
indexed a = G.zip (indices a) a
{-# INLINE indexed #-}

data MM = MM {-# UNPACK #-} !Double {-# UNPACK #-} !Double

-- | Compute the minimum and maximum of a vector in one pass.
minMax :: (G.Vector v Double) => v Double -> (Double, Double)
minMax = fini . G.foldl' go (MM (1/0) (-1/0))
  where
    go (MM lo hi) k = MM (min lo k) (max hi k)
    fini (MM lo hi) = (lo, hi)
{-# INLINE minMax #-}

-- | Efficiently compute the next highest power of two for a
-- non-negative integer.  If the given value is already a power of
-- two, it is returned unchanged.  If negative, zero is returned.
nextHighestPowerOfTwo :: Int -> Int
nextHighestPowerOfTwo n
#if WORD_SIZE_IN_BITS == 64
  = 1 + _i32
#else
  = 1 + i16
#endif
  where
    i0   = n - 1
    i1   = i0 .|. i0 `shiftR` 1
    i2   = i1 .|. i1 `shiftR` 2
    i4   = i2 .|. i2 `shiftR` 4
    i8   = i4 .|. i4 `shiftR` 8
    i16  = i8 .|. i8 `shiftR` 16
    _i32 = i16 .|. i16 `shiftR` 32
-- It could be implemented as
--
-- > nextHighestPowerOfTwo n = 1 + foldl' go (n-1) [1, 2, 4, 8, 16, 32]
-- >   where go m i = m .|. m `shiftR` i
--
-- But GHC does not inline foldl' (probably because it's recursive), and
-- as a result the function walks a list of boxed ints.  The hand-rolled
-- version uses unboxed arithmetic.
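
-- As a quick illustration of the bit twiddling above (an editorial
-- example; values checked by hand against the definition):
--
-- > nextHighestPowerOfTwo 5    ==> 8
-- > nextHighestPowerOfTwo 64   ==> 64
-- > nextHighestPowerOfTwo (-3) ==> 0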
-- | Multiply a number by itself. square :: Double -> Double square x = x * x -- | Simple for loop. Counts from /start/ to /end/-1. for :: Monad m => Int -> Int -> (Int -> m ()) -> m () for n0 !n f = loop n0 where loop i | i == n = return () | otherwise = f i >> loop (i+1) {-# INLINE for #-} -- | Simple reverse-for loop. Counts from /start/-1 to /end/ (which -- must be less than /start/). rfor :: Monad m => Int -> Int -> (Int -> m ()) -> m () rfor n0 !n f = loop n0 where loop i | i == n = return () | otherwise = let i' = i-1 in f i' >> loop i' {-# INLINE rfor #-} unsafeModify :: M.MVector s Double -> Int -> (Double -> Double) -> ST s () unsafeModify v i f = do k <- M.unsafeRead v i M.unsafeWrite v i (f k) {-# INLINE unsafeModify #-} statistics-0.16.2.1/Statistics/Internal.hs0000644000000000000000000000431107346545000016563 0ustar0000000000000000-- | -- Module : Statistics.Internal -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- module Statistics.Internal ( -- * Default definitions for Show defaultShow1 , defaultShow2 , defaultShow3 -- * Default definitions for Read , defaultReadPrecM1 , defaultReadPrecM2 , defaultReadPrecM3 -- * Reexports , Show(..) , Read(..) ) where import Control.Applicative import Control.Monad import Text.Read ---------------------------------------------------------------- -- Default show implementations ---------------------------------------------------------------- defaultShow1 :: (Show a) => String -> a -> Int -> ShowS defaultShow1 con a n = showParen (n >= 11) ( showString con . showChar ' ' . showsPrec 11 a ) defaultShow2 :: (Show a, Show b) => String -> a -> b -> Int -> ShowS defaultShow2 con a b n = showParen (n >= 11) ( showString con . showChar ' ' . showsPrec 11 a . showChar ' ' . showsPrec 11 b ) defaultShow3 :: (Show a, Show b, Show c) => String -> a -> b -> c -> Int -> ShowS defaultShow3 con a b c n = showParen (n >= 11) ( showString con . showChar ' ' . showsPrec 11 a . showChar ' ' . showsPrec 11 b . showChar ' ' . showsPrec 11 c ) ---------------------------------------------------------------- -- Default read implementations ---------------------------------------------------------------- defaultReadPrecM1 :: (Read a) => String -> (a -> Maybe r) -> ReadPrec r defaultReadPrecM1 con f = parens $ prec 10 $ do expect con a <- readPrec maybe empty return $ f a defaultReadPrecM2 :: (Read a, Read b) => String -> (a -> b -> Maybe r) -> ReadPrec r defaultReadPrecM2 con f = parens $ prec 10 $ do expect con a <- readPrec b <- readPrec maybe empty return $ f a b defaultReadPrecM3 :: (Read a, Read b, Read c) => String -> (a -> b -> c -> Maybe r) -> ReadPrec r defaultReadPrecM3 con f = parens $ prec 10 $ do expect con a <- readPrec b <- readPrec c <- readPrec maybe empty return $ f a b c expect :: String -> ReadPrec () expect str = do Ident s <- lexP guard (s == str) statistics-0.16.2.1/Statistics/Quantile.hs0000644000000000000000000003400207346545000016571 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveFoldable #-} {-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ViewPatterns #-} -- | -- Module : Statistics.Quantile -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Functions for approximating quantiles, i.e. 
points taken at regular
-- intervals from the cumulative distribution function of a random
-- variable.
--
-- The number of quantiles is described below by the variable /q/, so
-- with /q/=4, a 4-quantile (also known as a /quartile/) has 4
-- intervals, and contains 5 points.  The parameter /k/ describes the
-- desired point, where 0 ≤ /k/ ≤ /q/.

module Statistics.Quantile
    (
    -- * Quantile estimation functions
    -- $cont_quantiles
      ContParam(..)
    , Default(..)
    , quantile
    , quantiles
    , quantilesVec
    -- ** Parameters for the continuous sample method
    , cadpw
    , hazen
    , spss
    , s
    , medianUnbiased
    , normalUnbiased
    -- * Other algorithms
    , weightedAvg
    -- * Median & other specializations
    , median
    , mad
    , midspread
    -- * Deprecated
    , continuousBy
    -- * References
    -- $references
    ) where

import           Data.Binary            (Binary)
import           Data.Aeson             (ToJSON,FromJSON)
import           Data.Data              (Data,Typeable)
import           Data.Default.Class
import qualified Data.Foldable        as F
import           Data.Vector.Generic ((!))
import qualified Data.Vector          as V
import qualified Data.Vector.Generic  as G
import qualified Data.Vector.Unboxed  as U
import qualified Data.Vector.Storable as S
import           GHC.Generics (Generic)

import Statistics.Function (partialSort)

----------------------------------------------------------------
-- Quantile estimation
----------------------------------------------------------------

-- | O(/n/·log /n/). Estimate the /k/th /q/-quantile of a sample,
-- using the weighted average method.  Up to rounding errors it's the
-- same as @quantile s@.
--
-- The following properties should hold, otherwise an error will be thrown.
--
--   * the length of the input is greater than @0@
--
--   * the input does not contain @NaN@
--
--   * k ≥ 0 and k ≤ q
weightedAvg :: G.Vector v Double =>
               Int        -- ^ /k/, the desired quantile.
            -> Int        -- ^ /q/, the number of quantiles.
            -> v Double   -- ^ /x/, the sample data.
            -> Double
weightedAvg k q x
  | G.any isNaN x   = modErr "weightedAvg" "Sample contains NaNs"
  | n == 0          = modErr "weightedAvg" "Sample is empty"
  | n == 1          = G.head x
  | q < 2           = modErr "weightedAvg" "At least 2 quantiles are needed"
  | k == q          = G.maximum x
  | k >= 0 && k < q = xj + g * (xj1 - xj)
  | otherwise       = modErr "weightedAvg" "Wrong quantile number"
  where
    j   = floor idx
    idx = fromIntegral (n - 1) * fromIntegral k / fromIntegral q
    g   = idx - fromIntegral j
    xj  = sx ! j
    xj1 = sx ! (j+1)
    sx  = partialSort (j+2) x
    n   = G.length x
{-# SPECIALIZE weightedAvg :: Int -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE weightedAvg :: Int -> Int -> V.Vector Double -> Double #-}
{-# SPECIALIZE weightedAvg :: Int -> Int -> S.Vector Double -> Double #-}

----------------------------------------------------------------
-- Quantiles continuous algorithm
----------------------------------------------------------------

-- $cont_quantiles
--
-- Below is a family of functions which use the same algorithm for
-- estimation of sample quantiles.  It approximates the empirical CDF as
-- a continuous piecewise function which interpolates linearly between
-- points \((X_k,p_k)\) where \(X_k\) is the k-th order statistic (k-th
-- smallest element) and \(p_k\) is the probability corresponding to
-- it.  'ContParam' determines how \(p_k\) is chosen.  For a more
-- detailed explanation see [Hyndman1996].
--
-- This is the method used by most statistical software, such as R,
-- Mathematica, SPSS, and S.

-- | Parameters /α/ and /β/ to the 'quantile' function.
-- The exact meaning of the parameters is described in [Hyndman1996] in
-- the section \"Piecewise linear functions\".
data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double
                 deriving (Show,Eq,Ord,Data,Typeable,Generic)

-- | We use 's' as the default value, which is the same as R's default.
instance Default ContParam where
  def = s

instance Binary   ContParam
instance ToJSON   ContParam
instance FromJSON ContParam

-- | O(/n/·log /n/). Estimate the /k/th /q/-quantile of a sample /x/,
-- using the continuous sample method with the given parameters.
--
-- The following properties should hold, otherwise an error will be thrown.
--
--   * the input sample must be nonempty
--
--   * the input does not contain @NaN@
--
--   * 0 ≤ k ≤ q
quantile :: G.Vector v Double
         => ContParam  -- ^ Parameters /α/ and /β/.
         -> Int        -- ^ /k/, the desired quantile.
         -> Int        -- ^ /q/, the number of quantiles.
         -> v Double   -- ^ /x/, the sample data.
         -> Double
quantile param q nQ xs
  | nQ < 2         = modErr "quantile" "At least 2 quantiles are needed"
  | badQ nQ q      = modErr "quantile" "Wrong quantile number"
  | G.any isNaN xs = modErr "quantile" "Sample contains NaNs"
  | otherwise      = estimateQuantile sortedXs pk
  where
    pk       = toPk param n q nQ
    sortedXs = psort xs $ floor pk + 1
    n        = G.length xs
{-# INLINABLE quantile #-}
{-# SPECIALIZE quantile :: ContParam -> Int -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE quantile :: ContParam -> Int -> Int -> V.Vector Double -> Double #-}
{-# SPECIALIZE quantile :: ContParam -> Int -> Int -> S.Vector Double -> Double #-}

-- | O(/k·n/·log /n/). Estimate a set of /k/th /q/-quantiles of a
-- sample /x/, using the continuous sample method with the given
-- parameters.  This is faster than calling 'quantile' repeatedly since
-- the sample needs to be sorted only once.
--
-- The following properties should hold, otherwise an error will be thrown.
--
--   * the input sample must be nonempty
--
--   * the input does not contain @NaN@
--
--   * for every k in the set of quantiles, 0 ≤ k ≤ q
quantiles :: (G.Vector v Double, F.Foldable f, Functor f)
  => ContParam
  -> f Int
  -> Int
  -> v Double
  -> f Double
quantiles param qs nQ xs
  | nQ < 2             = modErr "quantiles" "At least 2 quantiles are needed"
  | F.any (badQ nQ) qs = modErr "quantiles" "Wrong quantile number"
  | G.any isNaN xs     = modErr "quantiles" "Sample contains NaNs"
  -- Doesn't matter what we put into empty container
  | null qs            = 0 <$ qs
  | otherwise          = fmap (estimateQuantile sortedXs) ks'
  where
    ks'      = fmap (\q -> toPk param n q nQ) qs
    sortedXs = psort xs $ floor (F.maximum ks') + 1
    n        = G.length xs
{-# INLINABLE quantiles #-}
{-# SPECIALIZE quantiles
      :: (Functor f, F.Foldable f)
      => ContParam -> f Int -> Int -> V.Vector Double -> f Double #-}
{-# SPECIALIZE quantiles
      :: (Functor f, F.Foldable f)
      => ContParam -> f Int -> Int -> U.Vector Double -> f Double #-}
{-# SPECIALIZE quantiles
      :: (Functor f, F.Foldable f)
      => ContParam -> f Int -> Int -> S.Vector Double -> f Double #-}

-- | O(/k·n/·log /n/). Same as 'quantiles' but uses a 'G.Vector'
-- container instead of a 'Foldable' one.
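--
-- A usage sketch (an editorial example; @def@ picks the 's' parameter
-- set, matching R's default, and @sampleData@ stands for any nonempty
-- @U.Vector Double@):
--
-- > -- estimate the three quartile cut points of a sample
-- > quantilesVec def (U.fromList [1,2,3]) 4 sampleData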
quantilesVec :: (G.Vector v Double, G.Vector v Int)
  => ContParam
  -> v Int
  -> Int
  -> v Double
  -> v Double
quantilesVec param qs nQ xs
  | nQ < 2             = modErr "quantilesVec" "At least 2 quantiles are needed"
  | G.any (badQ nQ) qs = modErr "quantilesVec" "Wrong quantile number"
  | G.any isNaN xs     = modErr "quantilesVec" "Sample contains NaNs"
  | G.null qs          = G.empty
  | otherwise          = G.map (estimateQuantile sortedXs) ks'
  where
    ks'      = G.map (\q -> toPk param n q nQ) qs
    sortedXs = psort xs $ floor (G.maximum ks') + 1
    n        = G.length xs
{-# INLINABLE quantilesVec #-}
{-# SPECIALIZE quantilesVec
      :: ContParam -> V.Vector Int -> Int -> V.Vector Double -> V.Vector Double #-}
{-# SPECIALIZE quantilesVec
      :: ContParam -> U.Vector Int -> Int -> U.Vector Double -> U.Vector Double #-}
{-# SPECIALIZE quantilesVec
      :: ContParam -> S.Vector Int -> Int -> S.Vector Double -> S.Vector Double #-}

-- Returns True if the quantile number is out of range.
badQ :: Int -> Int -> Bool
badQ nQ q = q < 0 || q > nQ

-- Obtain k from the equation for p_k [Hyndman1996] p.363.  Note that
-- the equation defines p_k for integer k, but we calculate it as a real
-- value and use the fractional part for linear interpolation.  This is
-- correct since the equation is linear.
toPk
  :: ContParam
  -> Int        -- ^ /n/ number of elements
  -> Int        -- ^ /k/, the desired quantile.
  -> Int        -- ^ /q/, the number of quantiles.
  -> Double
toPk (ContParam a b) (fromIntegral -> n) q nQ
  = a + p * (n + 1 - a - b)
  where
    p = fromIntegral q / fromIntegral nQ

-- Estimate quantile for given k (including fractional part)
estimateQuantile :: G.Vector v Double => v Double -> Double -> Double
{-# INLINE estimateQuantile #-}
estimateQuantile sortedXs k'
  = (1-g) * item (k-1) + g * item k
  where
    (k,g) = properFraction k'
    item  = (sortedXs !) . clamp
    --
    clamp = max 0 . min (n - 1)
    n     = G.length sortedXs

psort :: G.Vector v Double => v Double -> Int -> v Double
psort xs k = partialSort (max 0 $ min (G.length xs - 1) k) xs
{-# INLINE psort #-}

-- | California Department of Public Works definition, /α/=0, /β/=1.
-- Gives a linear interpolation of the empirical CDF.  This
-- corresponds to method 4 in R and Mathematica.
cadpw :: ContParam
cadpw = ContParam 0 1

-- | Hazen's definition, /α/=0.5, /β/=0.5.  This is claimed to be
-- popular among hydrologists.  This corresponds to method 5 in R and
-- Mathematica.
hazen :: ContParam
hazen = ContParam 0.5 0.5

-- | Definition used by the SPSS statistics application, with /α/=0,
-- /β/=0 (also known as Weibull's definition).  This corresponds to
-- method 6 in R and Mathematica.
spss :: ContParam
spss = ContParam 0 0

-- | Definition used by the S statistics application, with /α/=1,
-- /β/=1.  The interpolation points divide the sample range into @n-1@
-- intervals.  This corresponds to method 7 in R and Mathematica and
-- is default in R.
s :: ContParam
s = ContParam 1 1

-- | Median unbiased definition, /α/=1\/3, /β/=1\/3.  The resulting
-- quantile estimates are approximately median unbiased regardless of
-- the distribution of /x/.  This corresponds to method 8 in R and
-- Mathematica.
medianUnbiased :: ContParam
medianUnbiased = ContParam third third
  where third = 1/3

-- | Normal unbiased definition, /α/=3\/8, /β/=3\/8.  An approximately
-- unbiased estimate if the empirical distribution approximates the
-- normal distribution.  This corresponds to method 9 in R and
-- Mathematica.
normalUnbiased :: ContParam
normalUnbiased = ContParam ta ta
  where ta = 3/8

modErr :: String -> String -> a
modErr f err = error $ "Statistics.Quantile." ++ f ++ ": " ++ err

----------------------------------------------------------------
-- Specializations
----------------------------------------------------------------

-- | O(/n/·log /n/) Estimate the median of a sample.
median :: G.Vector v Double
       => ContParam  -- ^ Parameters /α/ and /β/.
       -> v Double   -- ^ /x/, the sample data.
       -> Double
{-# INLINE median #-}
median p = quantile p 1 2

-- | O(/n/·log /n/). Estimate the range between /q/-quantiles 1 and
-- /q/-1 of a sample /x/, using the continuous sample method with the
-- given parameters.
--
-- For instance, the interquartile range (IQR) can be estimated as
-- follows:
--
-- > midspread medianUnbiased 4 (U.fromList [1,1,2,2,3])
-- > ==> 1.333333
midspread :: G.Vector v Double
          => ContParam  -- ^ Parameters /α/ and /β/.
          -> Int        -- ^ /q/, the number of quantiles.
          -> v Double   -- ^ /x/, the sample data.
          -> Double
midspread param k x
  | G.any isNaN x = modErr "midspread" "Sample contains NaNs"
  | k <= 0        = modErr "midspread" "Nonpositive number of quantiles"
  | otherwise     = let Pair x1 x2 = quantiles param (Pair 1 (k-1)) k x
                    in  x2 - x1
{-# INLINABLE midspread #-}
{-# SPECIALIZE midspread :: ContParam -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE midspread :: ContParam -> Int -> V.Vector Double -> Double #-}
{-# SPECIALIZE midspread :: ContParam -> Int -> S.Vector Double -> Double #-}

data Pair a = Pair !a !a
  deriving (Functor, F.Foldable)

-- | O(/n/·log /n/). Estimate the median absolute deviation (MAD) of a
-- sample /x/ using 'quantile'.  It's a robust estimate of variability
-- in a sample, defined as:
--
-- \[
-- MAD = \operatorname{median}(| X_i - \operatorname{median}(X) |)
-- \]
mad :: G.Vector v Double
    => ContParam  -- ^ Parameters /α/ and /β/.
    -> v Double   -- ^ /x/, the sample data.
    -> Double
mad p xs = median p $ G.map (abs . subtract med) xs
  where
    med = median p xs
{-# INLINABLE mad #-}
{-# SPECIALIZE mad :: ContParam -> U.Vector Double -> Double #-}
{-# SPECIALIZE mad :: ContParam -> V.Vector Double -> Double #-}
{-# SPECIALIZE mad :: ContParam -> S.Vector Double -> Double #-}

----------------------------------------------------------------
-- Deprecated
----------------------------------------------------------------

continuousBy :: G.Vector v Double
             => ContParam  -- ^ Parameters /α/ and /β/.
             -> Int        -- ^ /k/, the desired quantile.
             -> Int        -- ^ /q/, the number of quantiles.
             -> v Double   -- ^ /x/, the sample data.
             -> Double
continuousBy = quantile
{-# DEPRECATED continuousBy "Use quantile instead" #-}

-- $references
--
-- * Weisstein, E.W. Quantile. /MathWorld/.
--
-- * [Hyndman1996] Hyndman, R.J.; Fan, Y. (1996) Sample quantiles in statistical
--   packages. /American Statistician/
--   50(4):361&#8211;365.
statistics-0.16.2.1/Statistics/Regression.hs0000644000000000000000000001461207346545000017134 0ustar0000000000000000-- |
-- Module    : Statistics.Regression
-- Copyright : 2014 Bryan O'Sullivan
-- License   : BSD3
--
-- Functions for regression analysis.
module Statistics.Regression ( olsRegress , ols , rSquare , bootstrapRegress ) where import Control.Concurrent.Async (forConcurrently) import Control.DeepSeq (rnf) import Control.Monad (when) import Data.List (nub) import GHC.Conc (getNumCapabilities) import Prelude hiding (pred, sum) import Statistics.Function as F import Statistics.Matrix hiding (map) import Statistics.Matrix.Algorithms (qr) import Statistics.Resampling (splitGen) import Statistics.Types (Estimate(..),ConfInt,CL,estimateFromInterval,significanceLevel) import Statistics.Sample (mean) import Statistics.Sample.Internal (sum) import System.Random.MWC (GenIO, uniformR) import qualified Data.Vector as V import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Unboxed.Mutable as M -- | Perform an ordinary least-squares regression on a set of -- predictors, and calculate the goodness-of-fit of the regression. -- -- The returned pair consists of: -- -- * A vector of regression coefficients. This vector has /one more/ -- element than the list of predictors; the last element is the -- /y/-intercept value. -- -- * /R²/, the coefficient of determination (see 'rSquare' for -- details). olsRegress :: [Vector] -- ^ Non-empty list of predictor vectors. Must all have -- the same length. These will become the columns of -- the matrix /A/ solved by 'ols'. -> Vector -- ^ Responder vector. Must have the same length as the -- predictor vectors. -> (Vector, Double) olsRegress preds@(_:_) resps | any (/=n) ls = error $ "predictor vector length mismatch " ++ show lss | G.length resps /= n = error $ "responder/predictor length mismatch " ++ show (G.length resps, n) | otherwise = (coeffs, rSquare mxpreds resps coeffs) where coeffs = ols mxpreds resps mxpreds = transpose . fromVector (length lss + 1) n . G.concat $ preds ++ [G.replicate n 1] lss@(n:ls) = map G.length preds olsRegress _ _ = error "no predictors given" -- | Compute the ordinary least-squares solution to /A x = b/. ols :: Matrix -- ^ /A/ has at least as many rows as columns. -> Vector -- ^ /b/ has the same length as columns in /A/. -> Vector ols a b | rs < cs = error $ "fewer rows than columns " ++ show d | otherwise = solve r (transpose q `multiplyV` b) where d@(rs,cs) = dimension a (q,r) = qr a -- | Solve the equation /R x = b/. solve :: Matrix -- ^ /R/ is an upper-triangular square matrix. -> Vector -- ^ /b/ is of the same length as rows\/columns in /R/. -> Vector solve r b | n /= l = error $ "row/vector mismatch " ++ show (n,l) | otherwise = U.create $ do s <- U.thaw b rfor n 0 $ \i -> do si <- (/ unsafeIndex r i i) <$> M.unsafeRead s i M.unsafeWrite s i si F.for 0 i $ \j -> F.unsafeModify s j $ subtract (unsafeIndex r j i * si) return s where n = rows r l = U.length b -- | Compute /R²/, the coefficient of determination that -- indicates goodness-of-fit of a regression. -- -- This value will be 1 if the predictors fit perfectly, dropping to 0 -- if they have no explanatory power. rSquare :: Matrix -- ^ Predictors (regressors). -> Vector -- ^ Responders. -> Vector -- ^ Regression coefficients. -> Double rSquare pred resp coeff = 1 - r / t where r = sum $ flip U.imap resp $ \i x -> square (x - p i) t = sum $ flip U.map resp $ \x -> square (x - mean resp) p i = sum . flip U.imap coeff $ \j -> (* unsafeIndex pred i j) -- | Bootstrap a regression function. Returns both the results of the -- regression and the requested confidence interval values. bootstrapRegress :: GenIO -> Int -- ^ Number of resamples to compute. 
-> CL Double -- ^ Confidence level. -> ([Vector] -> Vector -> (Vector, Double)) -- ^ Regression function. -> [Vector] -- ^ Predictor vectors. -> Vector -- ^ Responder vector. -> IO (V.Vector (Estimate ConfInt Double), Estimate ConfInt Double) bootstrapRegress gen0 numResamples cl rgrss preds0 resp0 | numResamples < 1 = error $ "bootstrapRegress: number of resamples " ++ "must be positive" | otherwise = do -- some error checks so that we do not run into vector index out of bounds. case nub (map U.length preds0) of [] -> error "bootstrapRegress: predictor vectors must not be empty" [plen] -> do let rlen = U.length resp0 when (plen /= rlen) $ error $ "bootstrapRegress: responder vector length [" ++ show rlen ++ "] must be the same as predictor vectors' length [" ++ show plen ++ "]" xs -> error $ "bootstrapRegress: all predictor vectors must be of the same \ \length, lengths provided are: " ++ show xs caps <- getNumCapabilities gens <- splitGen caps gen0 vs <- forConcurrently (zip gens (balance caps numResamples)) $ \(gen,count) -> do v <- V.replicateM count $ do let n = U.length resp0 ixs <- U.replicateM n $ uniformR (0,n-1) gen let resp = U.backpermute resp0 ixs preds = map (flip U.backpermute ixs) preds0 return $ rgrss preds resp rnf v `seq` return v let (coeffsv, r2v) = G.unzip (V.concat vs) let coeffs = flip G.imap (G.convert coeffss) $ \i x -> est x . U.generate numResamples $ \k -> (coeffsv G.! k) G.! i r2 = est r2s (G.convert r2v) (coeffss, r2s) = rgrss preds0 resp0 est s v = estimateFromInterval s (w G.! lo, w G.! hi) cl where w = F.sort v bounded i = min (U.length w - 1) (max 0 i) lo = bounded $ round c hi = bounded $ truncate (n - c) n = fromIntegral numResamples c = n * (significanceLevel cl / 2) return (coeffs, r2) -- | Balance units of work across workers. balance :: Int -> Int -> [Int] balance numSlices numItems = zipWith (+) (replicate numSlices q) (replicate r 1 ++ repeat 0) where (q,r) = numItems `quotRem` numSlices statistics-0.16.2.1/Statistics/Resampling.hs0000644000000000000000000002257707346545000017126 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveFoldable #-} {-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveTraversable #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE TypeFamilies #-} -- | -- Module : Statistics.Resampling -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Resampling statistics. module Statistics.Resampling ( -- * Data types Resample(..) , Bootstrap(..) , Estimator(..) 
, estimate -- * Resampling , resampleST , resample , resampleVector -- * Jackknife , jackknife , jackknifeMean , jackknifeVariance , jackknifeVarianceUnb , jackknifeStdDev -- * Helper functions , splitGen ) where import Data.Aeson (FromJSON, ToJSON) import Control.Concurrent.Async (forConcurrently_) import Control.Monad (forM_, forM, replicateM, liftM2) import Control.Monad.Primitive (PrimMonad(..)) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import Data.Vector.Algorithms.Intro (sort) import Data.Vector.Binary () import Data.Vector.Generic (unsafeFreeze,unsafeThaw) import Data.Word (Word32) import qualified Data.Foldable as T import qualified Data.Traversable as T import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Unboxed.Mutable as MU import GHC.Conc (numCapabilities) import GHC.Generics (Generic) import Numeric.Sum (Summation(..), kbn) import Statistics.Function (indices) import Statistics.Sample (mean, stdDev, variance, varianceUnbiased) import Statistics.Types (Sample) import System.Random.MWC (Gen, GenIO, initialize, uniformR, uniformVector) ---------------------------------------------------------------- -- Data types ---------------------------------------------------------------- -- | A resample drawn randomly, with replacement, from a set of data -- points. Distinct from a normal array to make it harder for your -- humble author's brain to go wrong. newtype Resample = Resample { fromResample :: U.Vector Double } deriving (Eq, Read, Show, Typeable, Data, Generic) instance FromJSON Resample instance ToJSON Resample instance Binary Resample where put = put . fromResample get = fmap Resample get data Bootstrap v a = Bootstrap { fullSample :: !a , resamples :: v a } deriving (Eq, Read, Show , Generic, Functor, T.Foldable, T.Traversable , Typeable, Data ) instance (Binary a, Binary (v a)) => Binary (Bootstrap v a) where get = liftM2 Bootstrap get get put (Bootstrap fs rs) = put fs >> put rs instance (FromJSON a, FromJSON (v a)) => FromJSON (Bootstrap v a) instance (ToJSON a, ToJSON (v a)) => ToJSON (Bootstrap v a) -- | An estimator of a property of a sample, such as its 'mean'. -- -- The use of an algebraic data type here allows functions such as -- 'jackknife' and 'bootstrapBCA' to use more efficient algorithms -- when possible. data Estimator = Mean | Variance | VarianceUnbiased | StdDev | Function (Sample -> Double) -- | Run an 'Estimator' over a sample. estimate :: Estimator -> Sample -> Double estimate Mean = mean estimate Variance = variance estimate VarianceUnbiased = varianceUnbiased estimate StdDev = stdDev estimate (Function est) = est ---------------------------------------------------------------- -- Resampling ---------------------------------------------------------------- -- | Single threaded and deterministic version of resample. resampleST :: PrimMonad m => Gen (PrimState m) -> [Estimator] -- ^ Estimation functions. -> Int -- ^ Number of resamples to compute. -> U.Vector Double -- ^ Original sample. -> m [Bootstrap U.Vector Double] resampleST gen ests numResamples sample = do -- Generate resamples res <- forM ests $ \e -> U.replicateM numResamples $ do v <- resampleVector gen sample return $! estimate e v -- Sort resamples resM <- mapM unsafeThaw res mapM_ sort resM resSorted <- mapM unsafeFreeze resM return $ zipWith Bootstrap [estimate e sample | e <- ests] resSorted -- | /O(e*r*s)/ Resample a data set repeatedly, with replacement, -- computing each estimate over the resampled data. 
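--
-- A usage sketch (an editorial example; 'createSystemRandom' comes from
-- "System.Random.MWC", and the sample values are illustrative):
--
-- > gen <- createSystemRandom
-- > res <- resample gen [Mean, StdDev] 1000 (U.fromList [1,2,3,4,5])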
-- -- This function is expensive; it has to do work proportional to -- /e*r*s/, where /e/ is the number of estimation functions, /r/ is -- the number of resamples to compute, and /s/ is the number of -- original samples. -- -- To improve performance, this function will make use of all -- available CPUs. At least with GHC 7.0, parallel performance seems -- best if the parallel garbage collector is disabled (RTS option -- @-qg@). resample :: GenIO -> [Estimator] -- ^ Estimation functions. -> Int -- ^ Number of resamples to compute. -> U.Vector Double -- ^ Original sample. -> IO [(Estimator, Bootstrap U.Vector Double)] resample gen ests numResamples samples = do let ixs = scanl (+) 0 $ zipWith (+) (replicate numCapabilities q) (replicate r 1 ++ repeat 0) where (q,r) = numResamples `quotRem` numCapabilities results <- mapM (const (MU.new numResamples)) ests gens <- splitGen numCapabilities gen forConcurrently_ (zip3 ixs (tail ixs) gens) $ \ (start,!end,gen') -> do -- on GHCJS it doesn't make sense to do any forking. -- JavaScript runtime has only single capability. let loop k ers | k >= end = return () | otherwise = do re <- resampleVector gen' samples forM_ ers $ \(est,arr) -> MU.write arr k . est $ re loop (k+1) ers loop start (zip ests' results) mapM_ sort results -- Build resamples res <- mapM unsafeFreeze results return $ zip ests $ zipWith Bootstrap [estimate e samples | e <- ests] res where ests' = map estimate ests -- | Create vector using resamples resampleVector :: (PrimMonad m, G.Vector v a) => Gen (PrimState m) -> v a -> m (v a) resampleVector gen v = G.replicateM n $ do i <- uniformR (0,n-1) gen return $! G.unsafeIndex v i where n = G.length v ---------------------------------------------------------------- -- Jackknife ---------------------------------------------------------------- -- | /O(n) or O(n^2)/ Compute a statistical estimate repeatedly over a -- sample, each time omitting a successive element. jackknife :: Estimator -> Sample -> U.Vector Double jackknife Mean sample = jackknifeMean sample jackknife Variance sample = jackknifeVariance sample jackknife VarianceUnbiased sample = jackknifeVarianceUnb sample jackknife StdDev sample = jackknifeStdDev sample jackknife (Function est) sample | G.length sample == 1 = singletonErr "jackknife" | otherwise = U.map f . indices $ sample where f i = est (dropAt i sample) -- | /O(n)/ Compute the jackknife mean of a sample. jackknifeMean :: Sample -> U.Vector Double jackknifeMean samp | len == 1 = singletonErr "jackknifeMean" | otherwise = G.map (/l) $ G.zipWith (+) (pfxSumL samp) (pfxSumR samp) where l = fromIntegral (len - 1) len = G.length samp -- | /O(n)/ Compute the jackknife variance of a sample with a -- correction factor @c@, so we can get either the regular or -- \"unbiased\" variance. jackknifeVariance_ :: Double -> Sample -> U.Vector Double jackknifeVariance_ c samp | len == 1 = singletonErr "jackknifeVariance" | otherwise = G.zipWith4 go als ars bls brs where als = pfxSumL . G.map goa $ samp ars = pfxSumR . G.map goa $ samp goa x = v * v where v = x - m bls = pfxSumL . G.map (subtract m) $ samp brs = pfxSumR . G.map (subtract m) $ samp m = mean samp n = fromIntegral len go al ar bl br = (al + ar - (b * b) / q) / (q - c) where b = bl + br q = n - 1 len = G.length samp -- | /O(n)/ Compute the unbiased jackknife variance of a sample. 
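--
-- For instance (an editorial example, values rounded; each element is
-- the unbiased variance of the sample with the corresponding
-- observation left out):
--
-- > jackknifeVarianceUnb (U.fromList [1,2,3,4])
-- > ==> [1.0, 2.33, 2.33, 1.0]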
jackknifeVarianceUnb :: Sample -> U.Vector Double
jackknifeVarianceUnb samp
  -- The unbiased variance of an (n-1)-element subsample needs at least
  -- two elements, so two-element samples are rejected here; singleton
  -- samples are caught inside 'jackknifeVariance_'.
  | G.length samp == 2 = singletonErr "jackknifeVarianceUnb"
  | otherwise          = jackknifeVariance_ 1 samp

-- | /O(n)/ Compute the jackknife variance of a sample.
jackknifeVariance :: Sample -> U.Vector Double
jackknifeVariance = jackknifeVariance_ 0

-- | /O(n)/ Compute the jackknife standard deviation of a sample.
jackknifeStdDev :: Sample -> U.Vector Double
jackknifeStdDev = G.map sqrt . jackknifeVarianceUnb

pfxSumL :: U.Vector Double -> U.Vector Double
pfxSumL = G.map kbn . G.scanl add zero

pfxSumR :: U.Vector Double -> U.Vector Double
pfxSumR = G.tail . G.map kbn . G.scanr (flip add) zero

-- | Drop the /k/th element of a vector.
dropAt :: U.Unbox e => Int -> U.Vector e -> U.Vector e
dropAt n v = U.slice 0 n v U.++ U.slice (n+1) (U.length v - n - 1) v

singletonErr :: String -> a
singletonErr func = error $
                    "Statistics.Resampling." ++ func ++ ": not enough elements in sample"

-- | Split a generator into several that can run independently.
splitGen :: Int -> GenIO -> IO [GenIO]
splitGen n gen
  | n <= 0    = return []
  | otherwise =
  fmap (gen:) . replicateM (n-1) $
  initialize =<< (uniformVector gen 256 :: IO (U.Vector Word32))
statistics-0.16.2.1/Statistics/Resampling/0000755000000000000000000000000007346545000016555 5ustar0000000000000000statistics-0.16.2.1/Statistics/Resampling/Bootstrap.hs0000644000000000000000000000737707346545000021104 0ustar0000000000000000-- |
-- Module    : Statistics.Resampling.Bootstrap
-- Copyright : (c) 2009, 2011 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- The bootstrap method for statistical inference.

module Statistics.Resampling.Bootstrap
    ( bootstrapBCA
    , basicBootstrap
    -- * References
    -- $references
    ) where

import Data.Vector.Generic ((!))
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Generic as G

import Statistics.Distribution (cumulative, quantile)
import Statistics.Distribution.Normal
import Statistics.Resampling (Bootstrap(..), jackknife)
import Statistics.Sample (mean)
import Statistics.Types (Sample, CL, Estimate, ConfInt,
                         estimateFromInterval, estimateFromErr, CL,
                         significanceLevel)
import Statistics.Function (gsort)

import qualified Statistics.Resampling as R

import Control.Parallel.Strategies (parMap, rdeepseq)

data T = {-# UNPACK #-} !Double :< {-# UNPACK #-} !Double
infixl 2 :<

-- | Bias-corrected accelerated (BCA) bootstrap. This adjusts for both
--   bias and skewness in the resampled distribution.
--
--   BCA algorithm is described in ch. 5 of Davison, Hinkley "Confidence
--   intervals" in section 5.3 "Percentile method"
bootstrapBCA
  :: CL Double   -- ^ Confidence level
  -> Sample      -- ^ Full data sample
  -> [(R.Estimator, Bootstrap U.Vector Double)]
  -- ^ Estimates obtained from resampled data and estimator used for
  --   this.
  -> [Estimate ConfInt Double]
bootstrapBCA confidenceLevel sample resampledData
  = parMap rdeepseq e resampledData
  where
    e (est, Bootstrap pt resample)
      | U.length sample == 1 || isInfinite bias =
          estimateFromErr pt (0, 0) confidenceLevel
      | otherwise =
          estimateFromInterval pt (resample ! lo, resample ! hi) confidenceLevel
      where
        -- Quantile estimates for given CL
        lo = min (max (cumn a1) 0) (ni - 1)
          where a1 = bias + b1 / (1 - accel * b1)
                b1 = bias + z1
        hi = max (min (cumn a2) (ni - 1)) 0
          where a2 = bias + b2 / (1 - accel * b2)
                b2 = bias - z1
        -- Number of resamples
        ni = U.length resample
        n  = fromIntegral ni
        -- Corrections
        z1   = quantile standard (significanceLevel confidenceLevel / 2)
        cumn = round . (*n) . cumulative standard
        bias = quantile standard (probN / n)
          where probN = fromIntegral . U.length . U.filter (<pt) $ resample
        -- Acceleration constant, estimated from the skewness of the
        -- jackknife statistics.
        accel = sumCubes / (6 * (sumSquares ** 1.5))
          where (sumSquares :< sumCubes) = U.foldl' f (0 :< 0) jack
                f (s :< c) j = s + d2 :< c + d2 * d
                  where d  = jackMean - j
                        d2 = d * d
                jackMean = mean jack
        jack = jackknife est sample

-- | Basic bootstrap. This method is quite simple: it evaluates the
--   estimator on the full sample, then uses the empirical quantiles of
--   the resampled estimates as the confidence interval.
basicBootstrap
  :: (G.Vector v a, Ord a, Num a)
  => CL Double     -- ^ Confidence level
  -> Bootstrap v a -- ^ Estimate from full sample and vector of
                   --   estimates obtained from resamples
  -> Estimate ConfInt a
{-# INLINE basicBootstrap #-}
basicBootstrap cl (Bootstrap e ests)
  = estimateFromInterval e (sorted ! lo, sorted ! hi) cl
  where
    sorted = gsort ests
    n = fromIntegral $ G.length ests
    c = n * (significanceLevel cl / 2)
    -- FIXME: can we have better estimates of quantiles in case when p
    --        is not multiple of 1/N
    --
    -- FIXME: we could have undercoverage here
    lo = round c
    hi = truncate (n - c)

-- $references
--
-- * Davison, A.C; Hinkley, D.V. (1997) Bootstrap methods and their
--   application.
statistics-0.16.2.1/Statistics/Sample.hs0000644000000000000000000003645607346545000016247 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-}
-- |
-- Module    : Statistics.Sample
-- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Commonly used sample statistics, also known as descriptive
-- statistics.

module Statistics.Sample
    (
    -- * Types
      Sample
    , WeightedSample
    -- * Descriptive functions
    , range
    -- * Statistics of location
    , mean
    , welfordMean
    , meanWeighted
    , harmonicMean
    , geometricMean
    -- * Statistics of dispersion
    -- $variance
    -- ** Functions over central moments
    , centralMoment
    , centralMoments
    , skewness
    , kurtosis
    -- ** Two-pass functions (numerically robust)
    -- $robust
    , variance
    , varianceUnbiased
    , meanVariance
    , meanVarianceUnb
    , stdDev
    , varianceWeighted
    , stdErrMean
    -- ** Single-pass functions (faster, less safe)
    -- $cancellation
    , fastVariance
    , fastVarianceUnbiased
    , fastStdDev
    -- * Joint distributions
    , covariance
    , correlation
    , pair
    -- * References
    -- $references
    ) where

import Statistics.Function (minMax)
import Statistics.Sample.Internal (robustSumVar, sum)
import Statistics.Types.Internal  (Sample,WeightedSample)
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

-- Operator ^ will be overridden
import Prelude hiding ((^), sum)

-- | /O(n)/ Range. The difference between the largest and smallest
-- elements of a sample.
range :: (G.Vector v Double) => v Double -> Double
range s = hi - lo
  where (lo , hi) = minMax s
{-# INLINE range #-}

-- | /O(n)/ Arithmetic mean.  This uses Kahan-Babuška-Neumaier
-- summation, so is more accurate than 'welfordMean' unless the input
-- values are very large.
mean :: (G.Vector v Double) => v Double -> Double
mean xs = sum xs / fromIntegral (G.length xs)
{-# SPECIALIZE mean :: U.Vector Double -> Double #-}
{-# SPECIALIZE mean :: V.Vector Double -> Double #-}

-- | /O(n)/ Arithmetic mean.  This uses Welford's algorithm to provide
-- numerical stability, using a single pass over the sample data.
--
-- Compared to 'mean', this loses a surprising amount of precision
-- unless the inputs are very large.
welfordMean :: (G.Vector v Double) => v Double -> Double
welfordMean = fini . G.foldl' go (T 0 0)
  where
    fini (T a _) = a
    go (T m n) x = T m' n'
      where m' = m + (x - m) / fromIntegral n'
            n' = n + 1
{-# SPECIALIZE welfordMean :: U.Vector Double -> Double #-}
{-# SPECIALIZE welfordMean :: V.Vector Double -> Double #-}

-- | /O(n)/ Arithmetic mean for weighted sample. It uses a single-pass
-- algorithm analogous to the one used by 'welfordMean'.
meanWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double
meanWeighted = fini . G.foldl' go (V 0 0)
    where
      fini (V a _) = a
      go (V m w) (x,xw) = V m' w'
          where m' | w' == 0   = 0
                   | otherwise = m + xw * (x - m) / w'
                w' = w + xw
{-# INLINE meanWeighted #-}

-- | /O(n)/ Harmonic mean.  This algorithm performs a single pass over
-- the sample.
harmonicMean :: (G.Vector v Double) => v Double -> Double
harmonicMean = fini . G.foldl' go (T 0 0)
  where
    fini (T b a) = fromIntegral a / b
    go (T x y) n = T (x + (1/n)) (y+1)
{-# INLINE harmonicMean #-}

-- | /O(n)/ Geometric mean of a sample containing no negative values.
geometricMean :: (G.Vector v Double) => v Double -> Double
geometricMean = exp . mean . G.map log
{-# INLINE geometricMean #-}

-- | Compute the /k/th central moment of a sample.  The central moment
-- is also known as the moment about the mean.
--
-- This function performs two passes over the sample, so is not subject
-- to stream fusion.
--
-- For samples containing many values very close to the mean, this
-- function is subject to inaccuracy due to catastrophic cancellation.
centralMoment :: (G.Vector v Double) => Int -> v Double -> Double
centralMoment a xs
    | a < 0     = error "Statistics.Sample.centralMoment: negative input"
    | a == 0    = 1
    | a == 1    = 0
    | otherwise = sum (G.map go xs) / fromIntegral (G.length xs)
  where
    go x = (x-m) ^ a
    m    = mean xs
{-# SPECIALIZE centralMoment :: Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE centralMoment :: Int -> V.Vector Double -> Double #-}

-- | Compute the /k/th and /j/th central moments of a sample.
--
-- This function performs two passes over the sample, so is not subject
-- to stream fusion.
--
-- For samples containing many values very close to the mean, this
-- function is subject to inaccuracy due to catastrophic cancellation.
centralMoments :: (G.Vector v Double) => Int -> Int -> v Double -> (Double, Double)
centralMoments a b xs
    | a < 2 || b < 2 = (centralMoment a xs , centralMoment b xs)
    | otherwise      = fini . G.foldl' go (V 0 0) $ xs
  where
    go (V i j) x = V (i + d^a) (j + d^b)
        where d  = x - m
    fini (V i j) = (i / n , j / n)
    m            = mean xs
    n            = fromIntegral (G.length xs)
{-# SPECIALIZE centralMoments :: Int -> Int -> U.Vector Double -> (Double, Double) #-}
{-# SPECIALIZE centralMoments :: Int -> Int -> V.Vector Double -> (Double, Double) #-}

-- | Compute the skewness of a sample. This is a measure of the
-- asymmetry of its distribution.
--
-- A sample with negative skew is said to be /left-skewed/.  Most of
-- its mass is on the right of the distribution, with the tail on the
-- left.
--
-- > skewness $ U.fromList [1,100,101,102,103]
-- > ==> -1.497681449918257
--
-- A sample with positive skew is said to be /right-skewed/.
--
-- > skewness $ U.fromList [1,2,3,4,100]
-- > ==> 1.4975367033335198
--
-- A sample's skewness is not defined if its 'variance' is zero.
--
-- This function performs two passes over the sample, so is not subject
-- to stream fusion.
--
-- For samples containing many values very close to the mean, this
-- function is subject to inaccuracy due to catastrophic cancellation.
skewness :: (G.Vector v Double) => v Double -> Double skewness xs = c3 * c2 ** (-1.5) where (c3 , c2) = centralMoments 3 2 xs {-# SPECIALIZE skewness :: U.Vector Double -> Double #-} {-# SPECIALIZE skewness :: V.Vector Double -> Double #-} -- | Compute the excess kurtosis of a sample. This is a measure of -- the \"peakedness\" of its distribution. A high kurtosis indicates -- that more of the sample's variance is due to infrequent severe -- deviations, rather than more frequent modest deviations. -- -- A sample's excess kurtosis is not defined if its 'variance' is -- zero. -- -- This function performs two passes over the sample, so is not subject -- to stream fusion. -- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. kurtosis :: (G.Vector v Double) => v Double -> Double kurtosis xs = c4 / (c2 * c2) - 3 where (c4 , c2) = centralMoments 4 2 xs {-# SPECIALIZE kurtosis :: U.Vector Double -> Double #-} {-# SPECIALIZE kurtosis :: V.Vector Double -> Double #-} -- $variance -- -- The variance — and hence the standard deviation — of a -- sample of fewer than two elements are both defined to be zero. -- $robust -- -- These functions use the compensated summation algorithm of Chan et -- al. for numerical robustness, but require two passes over the -- sample data as a result. -- -- Because of the need for two passes, these functions are /not/ -- subject to stream fusion. data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double -- | Maximum likelihood estimate of a sample's variance. Also known -- as the population variance, where the denominator is /n/. variance :: (G.Vector v Double) => v Double -> Double variance samp | n > 1 = robustSumVar (mean samp) samp / fromIntegral n | otherwise = 0 where n = G.length samp {-# SPECIALIZE variance :: U.Vector Double -> Double #-} {-# SPECIALIZE variance :: V.Vector Double -> Double #-} -- | Unbiased estimate of a sample's variance. Also known as the -- sample variance, where the denominator is /n/-1. varianceUnbiased :: (G.Vector v Double) => v Double -> Double varianceUnbiased samp | n > 1 = robustSumVar (mean samp) samp / fromIntegral (n-1) | otherwise = 0 where n = G.length samp {-# SPECIALIZE varianceUnbiased :: U.Vector Double -> Double #-} {-# SPECIALIZE varianceUnbiased :: V.Vector Double -> Double #-} -- | Calculate mean and maximum likelihood estimate of variance. This -- function should be used if both mean and variance are required -- since it will calculate mean only once. meanVariance :: (G.Vector v Double) => v Double -> (Double,Double) meanVariance samp | n > 1 = (m, robustSumVar m samp / fromIntegral n) | otherwise = (m, 0) where n = G.length samp m = mean samp {-# SPECIALIZE meanVariance :: U.Vector Double -> (Double,Double) #-} {-# SPECIALIZE meanVariance :: V.Vector Double -> (Double,Double) #-} -- | Calculate mean and unbiased estimate of variance. This -- function should be used if both mean and variance are required -- since it will calculate mean only once. meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double) meanVarianceUnb samp | n > 1 = (m, robustSumVar m samp / fromIntegral (n-1)) | otherwise = (m, 0) where n = G.length samp m = mean samp {-# SPECIALIZE meanVarianceUnb :: U.Vector Double -> (Double,Double) #-} {-# SPECIALIZE meanVarianceUnb :: V.Vector Double -> (Double,Double) #-} -- | Standard deviation. This is simply the square root of the -- unbiased estimate of the variance. 
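--
-- For instance (an editorial example, worked out by hand):
--
-- > stdDev (U.fromList [1,2,3,4])
-- > ==> 1.2909944487358056   -- sqrt (5/3)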
stdDev :: (G.Vector v Double) => v Double -> Double stdDev = sqrt . varianceUnbiased {-# SPECIALIZE stdDev :: U.Vector Double -> Double #-} {-# SPECIALIZE stdDev :: V.Vector Double -> Double #-} -- | Standard error of the mean. This is the standard deviation -- divided by the square root of the sample size. stdErrMean :: (G.Vector v Double) => v Double -> Double stdErrMean samp = stdDev samp / (sqrt . fromIntegral . G.length) samp {-# SPECIALIZE stdErrMean :: U.Vector Double -> Double #-} {-# SPECIALIZE stdErrMean :: V.Vector Double -> Double #-} robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> V robustSumVarWeighted samp = G.foldl' go (V 0 0) samp where go (V s w) (x,xw) = V (s + xw*d*d) (w + xw) where d = x - m m = meanWeighted samp {-# INLINE robustSumVarWeighted #-} -- | Weighted variance. This is biased estimation. varianceWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double varianceWeighted samp | G.length samp > 1 = fini $ robustSumVarWeighted samp | otherwise = 0 where fini (V s w) = s / w {-# SPECIALIZE varianceWeighted :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE varianceWeighted :: V.Vector (Double,Double) -> Double #-} -- $cancellation -- -- The functions prefixed with the name @fast@ below perform a single -- pass over the sample data using Knuth's algorithm. They usually -- work well, but see below for caveats. These functions are subject -- to array fusion. -- -- /Note/: in cases where most sample data is close to the sample's -- mean, Knuth's algorithm gives inaccurate results due to -- catastrophic cancellation. fastVar :: (G.Vector v Double) => v Double -> T1 fastVar = G.foldl' go (T1 0 0 0) where go (T1 n m s) x = T1 n' m' s' where n' = n + 1 m' = m + d / fromIntegral n' s' = s + d * (x - m') d = x - m -- | Maximum likelihood estimate of a sample's variance. fastVariance :: (G.Vector v Double) => v Double -> Double fastVariance = fini . fastVar where fini (T1 n _m s) | n > 1 = s / fromIntegral n | otherwise = 0 {-# INLINE fastVariance #-} -- | Unbiased estimate of a sample's variance. fastVarianceUnbiased :: (G.Vector v Double) => v Double -> Double fastVarianceUnbiased = fini . fastVar where fini (T1 n _m s) | n > 1 = s / fromIntegral (n - 1) | otherwise = 0 {-# INLINE fastVarianceUnbiased #-} -- | Standard deviation. This is simply the square root of the -- maximum likelihood estimate of the variance. fastStdDev :: (G.Vector v Double) => v Double -> Double fastStdDev = sqrt . fastVariance {-# INLINE fastStdDev #-} -- | Covariance of sample of pairs. For empty sample it's set to -- zero covariance :: (G.Vector v (Double,Double), G.Vector v Double) => v (Double,Double) -> Double covariance xy | n == 0 = 0 | otherwise = mean $ G.zipWith (*) (G.map (\x -> x - muX) xs) (G.map (\y -> y - muY) ys) where n = G.length xy (xs,ys) = G.unzip xy muX = mean xs muY = mean ys {-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-} -- | Correlation coefficient for sample of pairs. Also known as -- Pearson's correlation. For empty sample it's set to zero. 
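--
-- For instance, a perfectly linear relationship gives 1 (an editorial
-- example):
--
-- > correlation (U.fromList [(1,2),(2,4),(3,6)])
-- > ==> 1.0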
correlation :: (G.Vector v (Double,Double), G.Vector v Double) => v (Double,Double) -> Double correlation xy | n == 0 = 0 | otherwise = cov / sqrt (varX * varY) where n = G.length xy (xs,ys) = G.unzip xy (muX,varX) = meanVariance xs (muY,varY) = meanVariance ys cov = mean $ G.zipWith (*) (G.map (\x -> x - muX) xs) (G.map (\y -> y - muY) ys) {-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-} -- | Pair two samples. It's like 'G.zip' but requires that both -- samples have equal size. pair :: (G.Vector v a, G.Vector v b, G.Vector v (a,b)) => v a -> v b -> v (a,b) pair va vb | G.length va == G.length vb = G.zip va vb | otherwise = error "Statistics.Sample.pair: vector must have same length" {-# INLINE pair #-} ------------------------------------------------------------------------ -- Helper code. Monomorphic unpacked accumulators. -- (^) operator from Prelude is just slow. (^) :: Double -> Int -> Double x ^ 1 = x x ^ n = x * (x ^ (n-1)) {-# INLINE (^) #-} -- don't support polymorphism, as we can't get unboxed returns if we use it. data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double {- Consider this core: with data T a = T !a !Int $wfold :: Double# -> Int# -> Int# -> (# Double, Int# #) and without, $wfold :: Double# -> Int# -> Int# -> (# Double#, Int# #) yielding to boxed returns and heap checks. -} -- $references -- -- * Chan, T. F.; Golub, G.H.; LeVeque, R.J. (1979) Updating formulae -- and a pairwise algorithm for computing sample -- variances. Technical Report STAN-CS-79-773, Department of -- Computer Science, Stanford -- University. -- -- * Knuth, D.E. (1998) The art of computer programming, volume 2: -- seminumerical algorithms, 3rd ed., p. 232. -- -- * Welford, B.P. (1962) Note on a method for calculating corrected -- sums of squares and products. /Technometrics/ -- 4(3):419–420. -- -- * West, D.H.D. (1979) Updating mean and variance estimates: an -- improved method. /Communications of the ACM/ -- 22(9):532–535. statistics-0.16.2.1/Statistics/Sample/0000755000000000000000000000000007346545000015675 5ustar0000000000000000statistics-0.16.2.1/Statistics/Sample/Histogram.hs0000644000000000000000000001001507346545000020163 0ustar0000000000000000{-# LANGUAGE FlexibleContexts, BangPatterns, ScopedTypeVariables #-} -- | -- Module : Statistics.Sample.Histogram -- Copyright : (c) 2011 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Functions for computing histograms of sample data. module Statistics.Sample.Histogram ( histogram -- * Building blocks , histogram_ , range ) where import Control.Monad.ST import Numeric.MathFunctions.Constants (m_epsilon,m_tiny) import Statistics.Function (minMax) import qualified Data.Vector.Generic as G import qualified Data.Vector.Generic.Mutable as GM -- | /O(n)/ Compute a histogram over a data set. -- -- The result consists of a pair of vectors: -- -- * The lower bound of each interval. -- -- * The number of samples within the interval. -- -- Interval (bin) sizes are uniform, and the upper and lower bounds -- are chosen automatically using the 'range' function. To specify -- these parameters directly, use the 'histogram_' function. histogram :: (G.Vector v0 Double, G.Vector v1 Double, Num b, G.Vector v1 b) => Int -- ^ Number of bins (must be positive). -> v0 Double -- ^ Sample data (cannot be empty). 
-> (v1 Double, v1 b) histogram numBins xs = (G.generate numBins step, histogram_ numBins lo hi xs) where (lo,hi) = range numBins xs step i = lo + d * fromIntegral i d = (hi - lo) / fromIntegral numBins {-# INLINE histogram #-} -- | /O(n)/ Compute a histogram over a data set. -- -- Interval (bin) sizes are uniform, based on the supplied upper -- and lower bounds. histogram_ :: forall b a v0 v1. (Num b, RealFrac a, G.Vector v0 a, G.Vector v1 b) => Int -- ^ Number of bins. This value must be positive. A zero -- or negative value will cause an error. -> a -- ^ Lower bound on interval range. Sample data less than -- this will cause an error. -> a -- ^ Upper bound on interval range. This value must not be -- less than the lower bound. Sample data that falls above -- the upper bound will cause an error. -> v0 a -- ^ Sample data. -> v1 b histogram_ numBins lo hi xs0 = G.create (GM.replicate numBins 0 >>= bin xs0) where bin :: forall s. v0 a -> G.Mutable v1 s b -> ST s (G.Mutable v1 s b) bin xs bins = go 0 where go i | i >= len = return bins | otherwise = do let x = xs `G.unsafeIndex` i b = truncate $ (x - lo) / d write' bins b . (+1) =<< GM.read bins b go (i+1) write' bins' b !e = GM.write bins' b e len = G.length xs d = ((hi - lo) / fromIntegral numBins) * (1 + realToFrac m_epsilon) {-# INLINE histogram_ #-} -- | /O(n)/ Compute decent defaults for the lower and upper bounds of -- a histogram, based on the desired number of bins and the range of -- the sample data. -- -- The upper and lower bounds used are @(lo-d, hi+d)@, where -- -- @d = (maximum sample - minimum sample) / ((bins - 1) * 2)@ -- -- If all elements in the sample are the same and equal to @x@ range -- is set to @(x - |x|/10, x + |x|/10)@. And if @x@ is equal to 0 range -- is set to @(-1,1)@. This is needed to avoid creating histogram with -- zero bin size. range :: (G.Vector v Double) => Int -- ^ Number of bins (must be positive). -> v Double -- ^ Sample data (cannot be empty). -> (Double, Double) range numBins xs | numBins < 1 = error "Statistics.Histogram.range: invalid bin count" | G.null xs = error "Statistics.Histogram.range: empty sample" | lo == hi = case abs lo / 10 of a | a < m_tiny -> (-1,1) | otherwise -> (lo - a, lo + a) | otherwise = (lo-d, hi+d) where d | numBins == 1 = 0 | otherwise = (hi - lo) / ((fromIntegral numBins - 1) * 2) (lo,hi) = minMax xs {-# INLINE range #-} statistics-0.16.2.1/Statistics/Sample/Internal.hs0000644000000000000000000000136007346545000020005 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} -- | -- Module : Statistics.Sample.Internal -- Copyright : (c) 2013 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Internal functions for computing over samples. module Statistics.Sample.Internal ( robustSumVar , sum ) where import Numeric.Sum (kbn, sumVector) import Prelude hiding (sum) import Statistics.Function (square) import qualified Data.Vector.Generic as G robustSumVar :: (G.Vector v Double) => Double -> v Double -> Double robustSumVar m = sum . G.map (square . 
subtract m) {-# INLINE robustSumVar #-} sum :: (G.Vector v Double) => v Double -> Double sum = sumVector kbn {-# INLINE sum #-} statistics-0.16.2.1/Statistics/Sample/KernelDensity.hs0000644000000000000000000001151007346545000021007 0ustar0000000000000000{-# LANGUAGE BangPatterns, FlexibleContexts, UnboxedTuples #-} -- | -- Module : Statistics.Sample.KernelDensity -- Copyright : (c) 2011 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Kernel density estimation. This module provides a fast, robust, -- non-parametric way to estimate the probability density function of -- a sample. -- -- This estimator does not use the commonly employed \"Gaussian rule -- of thumb\". As a result, it outperforms many plug-in methods on -- multimodal samples with widely separated modes. module Statistics.Sample.KernelDensity ( -- * Estimation functions kde , kde_ -- * References -- $references ) where import Data.Default.Class import Numeric.MathFunctions.Constants (m_sqrt_2_pi) import Numeric.RootFinding (fromRoot, ridders, RiddersParam(..), Tolerance(..)) import Prelude hiding (const, min, max, sum) import Statistics.Function (minMax, nextHighestPowerOfTwo) import Statistics.Sample.Histogram (histogram_) import Statistics.Sample.Internal (sum) import Statistics.Transform (CD, dct, idct) import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U import qualified Data.Vector as V -- | Gaussian kernel density estimator for one-dimensional data, using -- the method of Botev et al. -- -- The result is a pair of vectors, containing: -- -- * The coordinates of each mesh point. The mesh interval is chosen -- to be 20% larger than the range of the sample. (To specify the -- mesh interval, use 'kde_'.) -- -- * Density estimates at each mesh point. kde :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => Int -- ^ The number of mesh points to use in the uniform discretization -- of the interval @(min,max)@. If this value is not a power of -- two, then it is rounded up to the next power of two. -> v Double -> (v Double, v Double) kde n0 xs = kde_ n0 (lo - range / 10) (hi + range / 10) xs where (lo,hi) = minMax xs range | G.length xs <= 1 = 1 -- Unreasonable guess | lo == hi = 1 -- All elements are equal | otherwise = hi - lo {-# INLINABLE kde #-} {-# SPECIAlIZE kde :: Int -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-} {-# SPECIAlIZE kde :: Int -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-} -- | Gaussian kernel density estimator for one-dimensional data, using -- the method of Botev et al. -- -- The result is a pair of vectors, containing: -- -- * The coordinates of each mesh point. -- -- * Density estimates at each mesh point. kde_ :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => Int -- ^ The number of mesh points to use in the uniform discretization -- of the interval @(min,max)@. If this value is not a power of -- two, then it is rounded up to the next power of two. -> Double -- ^ Lower bound (@min@) of the mesh range. -> Double -- ^ Upper bound (@max@) of the mesh range. -> v Double -> (v Double, v Double) kde_ n0 min max xs | G.null xs = error "Statistics.KernelDensity.kde: empty sample" | n0 <= 1 = error "Statistics.KernelDensity.kde: invalid number of points" | otherwise = (mesh, density) where mesh = G.generate ni $ \z -> min + (d * fromIntegral z) where d = r / (n-1) density = G.map (/(2 * r)) . 
idct $ G.zipWith f a (G.enumFromTo 0 (n-1)) where f b z = b * exp (sqr z * sqr pi * t_star * (-0.5)) !n = fromIntegral ni !ni = nextHighestPowerOfTwo n0 !r = max - min a = dct . G.map (/ sum h) $ h where h = G.map (/ len) $ histogram_ ni min max xs !len = fromIntegral (G.length xs) !t_star = fromRoot (0.28 * len ** (-0.4)) . ridders def{ riddersTol = AbsTol 1e-14 } (0,0.1) $ \x -> x - (len * (2 * sqrt pi) * go 6 (f 7 x)) ** (-0.4) where f q t = 2 * pi ** (q*2) * sum (G.zipWith g iv a2v) where g i a2 = i ** q * a2 * exp ((-i) * sqr pi * t) a2v = G.map (sqr . (*0.5)) $ G.tail a iv = G.map sqr $ G.enumFromTo 1 (n-1) go s !h | s == 1 = h | otherwise = go (s-1) (f s time) where time = (2 * const * k0 / len / h) ** (2 / (3 + 2 * s)) const = (1 + 0.5 ** (s+0.5)) / 3 k0 = U.product (G.enumFromThenTo 1 3 (2*s-1)) / m_sqrt_2_pi sqr x = x * x {-# INLINABLE kde_ #-} {-# SPECIAlIZE kde_ :: Int -> Double -> Double -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-} {-# SPECIAlIZE kde_ :: Int -> Double -> Double -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-} -- $references -- -- Botev. Z.I., Grotowski J.F., Kroese D.P. (2010). Kernel density -- estimation via diffusion. /Annals of Statistics/ -- 38(5):2916–2957. statistics-0.16.2.1/Statistics/Sample/KernelDensity/0000755000000000000000000000000007346545000020455 5ustar0000000000000000statistics-0.16.2.1/Statistics/Sample/KernelDensity/Simple.hs0000644000000000000000000001602607346545000022247 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, DeriveGeneric, FlexibleContexts #-} -- | -- Module : Statistics.Sample.KernelDensity.Simple -- Copyright : (c) 2009 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Kernel density estimation code, providing non-parametric ways to -- estimate the probability density function of a sample. -- -- The techniques used by functions in this module are relatively -- fast, but they generally give inferior results to the KDE function -- in the main 'Statistics.KernelDensity' module (due to the -- oversmoothing documented for 'bandwidth' below). module Statistics.Sample.KernelDensity.Simple {-# DEPRECATED "Use Statistics.Sample.KernelDensity instead." #-} ( -- * Simple entry points epanechnikovPDF , gaussianPDF -- * Building blocks -- These functions may be useful if you need to construct a kernel -- density function estimator other than the ones provided in this -- module. -- ** Choosing points from a sample , Points(..) , choosePoints -- ** Bandwidth estimation , Bandwidth , bandwidth , epanechnikovBW , gaussianBW -- ** Kernels , Kernel , epanechnikovKernel , gaussianKernel -- ** Low-level estimation , estimatePDF , simplePDF -- * References -- $references ) where import Data.Aeson (FromJSON, ToJSON) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import Data.Vector.Binary () import GHC.Generics (Generic) import Numeric.MathFunctions.Constants (m_1_sqrt_2, m_2_sqrt_pi) import Prelude hiding (sum) import Statistics.Function (minMax) import Statistics.Sample (stdDev) import Statistics.Sample.Internal (sum) import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U -- | Points from the range of a 'Sample'. newtype Points = Points { fromPoints :: U.Vector Double } deriving (Eq, Read, Show, Typeable, Data, Generic) instance FromJSON Points instance ToJSON Points instance Binary Points where get = fmap Points get put = put . 
fromPoints -- | Bandwidth estimator for an Epanechnikov kernel. epanechnikovBW :: Double -> Bandwidth epanechnikovBW n = (80 / (n * m_2_sqrt_pi)) ** 0.2 -- | Bandwidth estimator for a Gaussian kernel. gaussianBW :: Double -> Bandwidth gaussianBW n = (4 / (n * 3)) ** 0.2 -- | The width of the convolution kernel used. type Bandwidth = Double -- | Compute the optimal bandwidth from the observed data for the -- given kernel. -- -- This function uses an estimate based on the standard deviation of a -- sample (due to Deheuvels), which performs reasonably well for -- unimodal distributions but leads to oversmoothing for more complex -- ones. bandwidth :: G.Vector v Double => (Double -> Bandwidth) -> v Double -> Bandwidth bandwidth kern values = stdDev values * kern (fromIntegral $ G.length values) -- | Choose a uniform range of points at which to estimate a sample's -- probability density function. -- -- If you are using a Gaussian kernel, multiply the sample's bandwidth -- by 3 before passing it to this function. -- -- If this function is passed an empty vector, it returns values of -- positive and negative infinity. choosePoints :: G.Vector v Double => Int -- ^ Number of points to select, /n/ -> Double -- ^ Sample bandwidth, /h/ -> v Double -- ^ Input data -> Points choosePoints n h sample = Points . U.map f $ U.enumFromTo 0 n' where lo = a - h hi = z + h (a, z) = minMax sample d = (hi - lo) / fromIntegral n' f i = lo + fromIntegral i * d n' = n - 1 -- | The convolution kernel. Its parameters are as follows: -- -- * Scaling factor, 1\//nh/ -- -- * Bandwidth, /h/ -- -- * A point at which to sample the input, /p/ -- -- * One sample value, /v/ type Kernel = Double -> Double -> Double -> Double -> Double -- | Epanechnikov kernel for probability density function estimation. epanechnikovKernel :: Kernel epanechnikovKernel f h p v | abs u <= 1 = f * (1 - u * u) | otherwise = 0 where u = (v - p) / (h * 0.75) -- | Gaussian kernel for probability density function estimation. gaussianKernel :: Kernel gaussianKernel f h p v = exp (-0.5 * u * u) * g where u = (v - p) / h g = f * 0.5 * m_2_sqrt_pi * m_1_sqrt_2 -- | Kernel density estimator, providing a non-parametric way of -- estimating the PDF of a random variable. estimatePDF :: G.Vector v Double => Kernel -- ^ Kernel function -> Bandwidth -- ^ Bandwidth, /h/ -> v Double -- ^ Sample data -> Points -- ^ Points at which to estimate -> U.Vector Double estimatePDF kernel h sample | n < 2 = errorShort "estimatePDF" | otherwise = U.map k . fromPoints where k p = sum . G.map (kernel f h p) $ sample f = 1 / (h * fromIntegral n) n = G.length sample {-# INLINE estimatePDF #-} -- | A helper for creating a simple kernel density estimation function -- with automatically chosen bandwidth and estimation points. simplePDF :: G.Vector v Double => (Double -> Double) -- ^ Bandwidth function -> Kernel -- ^ Kernel function -> Double -- ^ Bandwidth scaling factor (3 for a Gaussian kernel, 1 for all others) -> Int -- ^ Number of points at which to estimate -> v Double -- ^ sample data -> (Points, U.Vector Double) simplePDF fbw fpdf k numPoints sample = (points, estimatePDF fpdf bw sample points) where points = choosePoints numPoints (bw*k) sample bw = bandwidth fbw sample {-# INLINE simplePDF #-} -- | Simple Epanechnikov kernel density estimator. Returns the -- uniformly spaced points from the sample range at which the density -- function was estimated, and the estimates at those points. 
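-- -- A hypothetical usage sketch (sample values invented for illustration): -- -- > epanechnikovPDF 64 (U.fromList [1.0, 1.5, 2.1, 4.0, 4.2]) -- -- The first component of the result wraps the 64 estimation points in 'Points'; the second holds the density estimate at each point.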
epanechnikovPDF :: G.Vector v Double => Int -- ^ Number of points at which to estimate -> v Double -- ^ Data sample -> (Points, U.Vector Double) epanechnikovPDF = simplePDF epanechnikovBW epanechnikovKernel 1 -- | Simple Gaussian kernel density estimator. Returns the uniformly -- spaced points from the sample range at which the density function -- was estimated, and the estimates at those points. gaussianPDF :: G.Vector v Double => Int -- ^ Number of points at which to estimate -> v Double -- ^ Data sample -> (Points, U.Vector Double) gaussianPDF = simplePDF gaussianBW gaussianKernel 3 errorShort :: String -> a errorShort func = error ("Statistics.KernelDensity." ++ func ++ ": at least two points required") -- $references -- -- * Deheuvels, P. (1977) Estimation non paramétrique de la densité -- par histogrammes -- généralisés. <http://archive.numdam.org/article/RSA_1977__25_3_5_0.pdf> statistics-0.16.2.1/Statistics/Sample/Normalize.hs0000644000000000000000000000252307346545000020173 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} -- | -- Module : Statistics.Sample.Normalize -- Copyright : (c) 2017 Gregory W. Schwartz -- License : BSD3 -- -- Maintainer : gsch@mail.med.upenn.edu -- Stability : experimental -- Portability : portable -- -- Functions for normalizing samples. module Statistics.Sample.Normalize ( standardize ) where import Statistics.Sample import qualified Data.Vector.Generic as G import qualified Data.Vector as V import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Storable as S -- | /O(n)/ Normalize a sample using standard scores: -- -- \[ z = \frac{x - \mu}{\sigma} \] -- -- where μ is the sample mean and σ is the standard deviation computed -- from the unbiased variance estimate. If the sample is too small to -- compute σ, or σ is equal to 0, @Nothing@ is returned. standardize :: (G.Vector v Double) => v Double -> Maybe (v Double) standardize xs | G.length xs < 2 = Nothing | sigma == 0 = Nothing | otherwise = Just $ G.map (\x -> (x - mu) / sigma) xs where mu = mean xs sigma = stdDev xs {-# INLINABLE standardize #-} {-# SPECIALIZE standardize :: V.Vector Double -> Maybe (V.Vector Double) #-} {-# SPECIALIZE standardize :: U.Vector Double -> Maybe (U.Vector Double) #-} {-# SPECIALIZE standardize :: S.Vector Double -> Maybe (S.Vector Double) #-} statistics-0.16.2.1/Statistics/Sample/Powers.hs0000644000000000000000000001575407346545000017514 0ustar0000000000000000{-# LANGUAGE BangPatterns, DeriveDataTypeable, DeriveGeneric, FlexibleContexts #-} -- | -- Module : Statistics.Sample.Powers -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Very fast statistics over simple powers of a sample. These can all -- be computed efficiently in just a single pass over a sample, with -- that pass subject to stream fusion. -- -- The tradeoff is that some of these functions are less numerically -- robust than their counterparts in the 'Statistics.Sample' module. -- Where this is the case, the alternatives are noted.
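-- -- A sketch of typical use (the sample is invented for illustration): -- -- > let p = powers 4 (U.fromList [1..10 :: Double]) -- > in (mean p, variance p, skewness p, kurtosis p)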
module Statistics.Sample.Powers ( -- * Types Powers -- * Constructor , powers -- * Descriptive functions , order , count , sum -- * Statistics of location , mean -- * Statistics of dispersion , variance , stdDev , varianceUnbiased -- * Functions over central moments , centralMoment , skewness , kurtosis -- * References -- $references ) where import Control.Monad.ST import Data.Aeson (FromJSON, ToJSON) import Data.Binary (Binary(..)) import Data.Data (Data, Typeable) import Data.Vector.Binary () import Data.Vector.Unboxed ((!)) import GHC.Generics (Generic) import Numeric.SpecFunctions (choose) import Prelude hiding (sum) import Statistics.Function (indexed) import qualified Data.Vector as V import qualified Data.Vector.Generic as G import qualified Data.Vector.Storable as SV import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Unboxed.Mutable as MU import qualified Statistics.Sample.Internal as S newtype Powers = Powers (U.Vector Double) deriving (Eq, Read, Show, Typeable, Data, Generic) instance FromJSON Powers instance ToJSON Powers instance Binary Powers where put (Powers v) = put v get = fmap Powers get -- | O(/n/) Collect the /n/ simple powers of a sample. -- -- Functions computed over a sample's simple powers require at least a -- certain number (or /order/) of powers to be collected. -- -- * To compute the /k/th 'centralMoment', at least /k/ simple powers -- must be collected. -- -- * For the 'variance', at least 2 simple powers are needed. -- -- * For 'skewness', we need at least 3 simple powers. -- -- * For 'kurtosis', at least 4 simple powers are required. -- -- This function is subject to stream fusion. powers :: G.Vector v Double => Int -- ^ /n/, the number of powers, where /n/ >= 2. -> v Double -> Powers powers k sample | k < 2 = error "Statistics.Sample.powers: too few powers" | otherwise = runST $ do acc <- MU.replicate l 0 G.forM_ sample $ \x -> let loop !i !xk | i == l = return () | otherwise = do MU.write acc i . (+ xk) =<< MU.read acc i loop (i+1) (xk * x) in loop 0 1 fmap Powers $ U.unsafeFreeze acc where l = k + 1 {-# SPECIALIZE powers :: Int -> U.Vector Double -> Powers #-} {-# SPECIALIZE powers :: Int -> V.Vector Double -> Powers #-} {-# SPECIALIZE powers :: Int -> SV.Vector Double -> Powers #-} -- | The order (number) of simple powers collected from a 'sample'. order :: Powers -> Int order (Powers pa) = U.length pa - 1 -- | Compute the /k/th central moment of a sample. The central -- moment is also known as the moment about the mean. centralMoment :: Int -> Powers -> Double centralMoment k p@(Powers pa) | k < 0 || k > order p = error ("Statistics.Sample.Powers.centralMoment: " ++ "invalid argument") | k == 0 = 1 | otherwise = (/n) . S.sum . U.map go . indexed . U.take (k+1) $ pa where go (i , e) = (k `choose` i) * ((-m) ^ (k-i)) * e n = U.head pa m = mean p -- | Maximum likelihood estimate of a sample's variance. Also known -- as the population variance, where the denominator is /n/. This is -- the second central moment of the sample. -- -- This is less numerically robust than the variance function in the -- 'Statistics.Sample' module, but the number is essentially free to -- compute if you have already collected a sample's simple powers. -- -- Requires 'Powers' with 'order' at least 2. variance :: Powers -> Double variance = centralMoment 2 -- | Standard deviation. This is simply the square root of the -- maximum likelihood estimate of the variance. stdDev :: Powers -> Double stdDev = sqrt . variance -- | Unbiased estimate of a sample's variance. 
Also known as the -- sample variance, where the denominator is /n/-1. -- -- Requires 'Powers' with 'order' at least 2. varianceUnbiased :: Powers -> Double varianceUnbiased p@(Powers pa) | n > 1 = variance p * n / (n-1) | otherwise = 0 where n = U.head pa -- | Compute the skewness of a sample. This is a measure of the -- asymmetry of its distribution. -- -- A sample with negative skew is said to be /left-skewed/. Most of -- its mass is on the right of the distribution, with the tail on the -- left. -- -- > skewness . powers 3 $ U.fromList [1,100,101,102,103] -- > ==> -1.497681449918257 -- -- A sample with positive skew is said to be /right-skewed/. -- -- > skewness . powers 3 $ U.fromList [1,2,3,4,100] -- > ==> 1.4975367033335198 -- -- A sample's skewness is not defined if its 'variance' is zero. -- -- Requires 'Powers' with 'order' at least 3. skewness :: Powers -> Double skewness p = centralMoment 3 p * variance p ** (-1.5) -- | Compute the excess kurtosis of a sample. This is a measure of -- the \"peakedness\" of its distribution. A high kurtosis indicates -- that the sample's variance is due more to infrequent severe -- deviations than to frequent modest deviations. -- -- A sample's excess kurtosis is not defined if its 'variance' is -- zero. -- -- Requires 'Powers' with 'order' at least 4. kurtosis :: Powers -> Double kurtosis p = centralMoment 4 p / (v * v) - 3 where v = variance p -- | The number of elements in the original 'Sample'. This is the -- sample's zeroth simple power. count :: Powers -> Int count (Powers pa) = floor $ U.head pa -- | The sum of elements in the original 'Sample'. This is the -- sample's first simple power. sum :: Powers -> Double sum (Powers pa) = pa ! 1 -- | The arithmetic mean of elements in the original 'Sample'. -- -- This is less numerically robust than the mean function in the -- 'Statistics.Sample' module, but the number is essentially free to -- compute if you have already collected a sample's simple powers. mean :: Powers -> Double mean p@(Powers pa) | n == 0 = 0 | otherwise = sum p / n where n = U.head pa -- $references -- -- * Besset, D.H. (2000) Elements of statistics. -- /Object-oriented implementation of numerical methods/ -- ch. 9, pp. 311–331. -- -- -- * Anderson, G. (2009) Compute /k/th central moments in one -- pass. /quantblog/. statistics-0.16.2.1/Statistics/Test/0000755000000000000000000000000007346545000015373 5ustar0000000000000000statistics-0.16.2.1/Statistics/Test/ChiSquared.hs0000644000000000000000000000557007346545000017766 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} -- | Pearson's chi squared test. module Statistics.Test.ChiSquared ( chi2test , chi2testCont , module Statistics.Test.Types ) where import Prelude hiding (sum) import Statistics.Distribution import Statistics.Distribution.ChiSquared import Statistics.Function (square) import Statistics.Sample.Internal (sum) import Statistics.Test.Types import Statistics.Types import qualified Data.Vector as V import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U -- | Generic form of Pearson chi squared tests for binned data. The data -- sample is supplied in the form of tuples (observed quantity, -- expected number of events). Both must be positive. -- -- This test should be used only if all bins have expected values of -- at least 5. chi2test :: (G.Vector v (Int,Double), G.Vector v Double) => Int -- ^ Number of additional degrees of -- freedom. One degree of freedom -- is due to the fact that there are -- N observations in total, and it is -- accounted for automatically.
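-- (A hypothetical example: for observed die-roll counts against a uniform expectation one might call @chi2test 0 (U.fromList [(12,10),(8,10),(11,10),(9,10),(10,10),(10,10)])@.)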
-> v (Int,Double) -- ^ Observation and expectation. -> Maybe (Test ChiSquared) chi2test ndf vec | ndf < 0 = error $ "Statistics.Test.ChiSquare.chi2test: negative NDF " ++ show ndf | n > 0 = Just Test { testSignificance = mkPValue $ complCumulative d chi2 , testStatistics = chi2 , testDistribution = chiSquared n } | otherwise = Nothing where n = G.length vec - ndf - 1 chi2 = sum $ G.map (\(o,e) -> square (fromIntegral o - e) / e) vec d = chiSquared n {-# INLINABLE chi2test #-} {-# SPECIALIZE chi2test :: Int -> U.Vector (Int,Double) -> Maybe (Test ChiSquared) #-} {-# SPECIALIZE chi2test :: Int -> V.Vector (Int,Double) -> Maybe (Test ChiSquared) #-} -- | Chi squared test for data with normal errors. Data is supplied in -- form of pair (observation with error, and expectation). chi2testCont :: (G.Vector v (Estimate NormalErr Double, Double), G.Vector v Double) => Int -- ^ Number of additional -- degrees of freedom. -> v (Estimate NormalErr Double, Double) -- ^ Observation and expectation. -> Maybe (Test ChiSquared) chi2testCont ndf vec | ndf < 0 = error $ "Statistics.Test.ChiSquare.chi2testCont: negative NDF " ++ show ndf | n > 0 = Just Test { testSignificance = mkPValue $ complCumulative d chi2 , testStatistics = chi2 , testDistribution = chiSquared n } | otherwise = Nothing where n = G.length vec - ndf - 1 chi2 = sum $ G.map (\(Estimate o (NormalErr s),e) -> square (o - e) / s) vec d = chiSquared n statistics-0.16.2.1/Statistics/Test/Internal.hs0000644000000000000000000000551607346545000017512 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} module Statistics.Test.Internal ( rank , rankUnsorted , splitByTags ) where import Data.Ord import Data.Vector.Generic ((!)) import qualified Data.Vector.Generic as G import qualified Data.Vector.Generic.Mutable as M import Statistics.Function -- Private data type for unfolding data Rank v a = Rank { rankCnt :: {-# UNPACK #-} !Int -- Number of ranks to return , rankVal :: {-# UNPACK #-} !Double -- Rank to return , rankNum :: {-# UNPACK #-} !Double -- Current rank , rankVec :: v a -- Remaining vector } -- | Calculate rank of every element of sample. In case of ties ranks -- are averaged. Sample should be already sorted in ascending order. -- -- Rank is index of element in the sample, numeration starts from 1. -- In case of ties average of ranks of equal elements is assigned -- to each -- -- >>> rank (==) (fromList [10,20,30::Int]) -- > fromList [1.0,2.0,3.0] -- -- >>> rank (==) (fromList [10,10,10,30::Int]) -- > fromList [2.0,2.0,2.0,4.0] rank :: (G.Vector v a, G.Vector v Double) => (a -> a -> Bool) -- ^ Equivalence relation -> v a -- ^ Vector to rank -> v Double rank eq vec = G.unfoldr go (Rank 0 (-1) 1 vec) where go (Rank 0 _ r v) | G.null v = Nothing | otherwise = case G.length h of 1 -> Just (r, Rank 0 0 (r+1) rest) n -> go Rank { rankCnt = n , rankVal = 0.5 * (r*2 + fromIntegral (n-1)) , rankNum = r + fromIntegral n , rankVec = rest } where (h,rest) = G.span (eq $ G.head v) v go (Rank n val r v) = Just (val, Rank (n-1) val r v) {-# INLINE rank #-} -- | Compute rank of every element of vector. Unlike rank it doesn't -- require sample to be sorted. rankUnsorted :: ( Ord a , G.Vector v a , G.Vector v Int , G.Vector v Double , G.Vector v (Int, a) ) => v a -> v Double rankUnsorted xs = G.create $ do -- Put ranks into their original positions -- NOTE: backpermute will do wrong thing vec <- M.new n for 0 n $ \i -> M.unsafeWrite vec (index ! i) (ranks ! 
i) return vec where n = G.length xs -- Calculate ranks for sorted array ranks = rank (==) sorted -- Sort vector and retain original indices of elements (index, sorted) = G.unzip $ sortBy (comparing snd) $ indexed xs {-# INLINE rankUnsorted #-} -- | Split tagged vector splitByTags :: (G.Vector v a, G.Vector v (Bool,a)) => v (Bool,a) -> (v a, v a) splitByTags vs = (G.map snd a, G.map snd b) where (a,b) = G.unstablePartition fst vs {-# INLINE splitByTags #-} statistics-0.16.2.1/Statistics/Test/KolmogorovSmirnov.hs0000644000000000000000000002470207346545000021450 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} -- | -- Module : Statistics.Test.KolmogorovSmirnov -- Copyright : (c) 2011 Aleksey Khudyakov -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- Kolmogorov-Smirnov tests are non-parametric tests for assessing -- whether a given sample could be described by a distribution or whether -- two samples have the same distribution. They are only applicable to -- continuous distributions. module Statistics.Test.KolmogorovSmirnov ( -- * Kolmogorov-Smirnov test kolmogorovSmirnovTest , kolmogorovSmirnovTestCdf , kolmogorovSmirnovTest2 -- * Evaluate statistics , kolmogorovSmirnovCdfD , kolmogorovSmirnovD , kolmogorovSmirnov2D -- * Probabilities , kolmogorovSmirnovProbability -- * References -- $references , module Statistics.Test.Types ) where import Control.Monad (when) import Prelude hiding (exponent, sum) import Statistics.Distribution (Distribution(..)) import Statistics.Function (gsort, unsafeModify) import Statistics.Matrix (center, for, fromVector) import qualified Statistics.Matrix as Mat import Statistics.Test.Types import Statistics.Types (mkPValue) import qualified Data.Vector as V import qualified Data.Vector.Storable as S import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Generic as G import Data.Vector.Generic ((!)) import qualified Data.Vector.Unboxed.Mutable as M ---------------------------------------------------------------- -- Test ---------------------------------------------------------------- -- | Check that a sample could be described by a distribution. Returns -- @Nothing@ if the sample is empty. -- -- This test uses the Marsaglia-Tsang-Wang exact algorithm for -- calculation of the p-value. kolmogorovSmirnovTest :: (Distribution d, G.Vector v Double) => d -- ^ Distribution -> v Double -- ^ Data sample -> Maybe (Test ()) {-# INLINE kolmogorovSmirnovTest #-} kolmogorovSmirnovTest d = kolmogorovSmirnovTestCdf (cumulative d) -- | Variant of 'kolmogorovSmirnovTest' which takes the CDF in the form -- of a function. kolmogorovSmirnovTestCdf :: (G.Vector v Double) => (Double -> Double) -- ^ CDF of distribution -> v Double -- ^ Data sample -> Maybe (Test ()) {-# INLINE kolmogorovSmirnovTestCdf #-} kolmogorovSmirnovTestCdf cdf sample | G.null sample = Nothing | otherwise = Just Test { testSignificance = mkPValue $ 1 - prob , testStatistics = d , testDistribution = () } where d = kolmogorovSmirnovCdfD cdf sample prob = kolmogorovSmirnovProbability (G.length sample) d -- | Two sample Kolmogorov-Smirnov test. It tests whether two data -- samples could be described by the same distribution without -- making any assumptions about it. If either of the samples is empty, -- returns @Nothing@. -- -- This test uses an approximate formula for computing the p-value.
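-- -- For instance (samples invented for illustration): -- -- > kolmogorovSmirnovTest2 (U.fromList [1.0, 2.0, 3.5, 4.1]) (U.fromList [1.2, 2.6, 3.3, 4.7])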
kolmogorovSmirnovTest2 :: (G.Vector v Double) => v Double -- ^ Sample 1 -> v Double -- ^ Sample 2 -> Maybe (Test ()) kolmogorovSmirnovTest2 xs1 xs2 | G.null xs1 || G.null xs2 = Nothing | otherwise = Just Test { testSignificance = mkPValue $ 1 - prob d , testStatistics = d , testDistribution = () } where d = kolmogorovSmirnov2D xs1 xs2 * (en + 0.12 + 0.11/en) -- Effective number of data points n1 = fromIntegral (G.length xs1) n2 = fromIntegral (G.length xs2) en = sqrt $ n1 * n2 / (n1 + n2) -- prob z | z < 0 = error "kolmogorovSmirnov2D: internal error" | z == 0 = 0 | z < 1.18 = let y = exp( -1.23370055013616983 / (z*z) ) in 2.25675833419102515 * sqrt( -log y ) * (y + y**9 + y**25 + y**49) | otherwise = let x = exp(-2 * z * z) in 1 - 2*(x - x**4 + x**9) {-# INLINABLE kolmogorovSmirnovTest2 #-} {-# SPECIALIZE kolmogorovSmirnovTest2 :: U.Vector Double -> U.Vector Double -> Maybe (Test ()) #-} {-# SPECIALIZE kolmogorovSmirnovTest2 :: V.Vector Double -> V.Vector Double -> Maybe (Test ()) #-} {-# SPECIALIZE kolmogorovSmirnovTest2 :: S.Vector Double -> S.Vector Double -> Maybe (Test ()) #-} -- FIXME: Find source for approximation for D ---------------------------------------------------------------- -- Kolmogorov's statistic ---------------------------------------------------------------- -- | Calculate Kolmogorov's statistic /D/ for given cumulative -- distribution function (CDF) and data sample. If sample is empty -- returns 0. kolmogorovSmirnovCdfD :: G.Vector v Double => (Double -> Double) -- ^ CDF function -> v Double -- ^ Sample -> Double kolmogorovSmirnovCdfD cdf sample | G.null sample = 0 | otherwise = G.maximum $ G.zipWith3 (\p a b -> abs (p-a) `max` abs (p-b)) ps steps (G.tail steps) where xs = gsort sample n = G.length xs -- ps = G.map cdf xs steps = G.map (/ fromIntegral n) $ G.generate (n+1) fromIntegral {-# INLINABLE kolmogorovSmirnovCdfD #-} {-# SPECIALIZE kolmogorovSmirnovCdfD :: (Double -> Double) -> U.Vector Double -> Double #-} {-# SPECIALIZE kolmogorovSmirnovCdfD :: (Double -> Double) -> V.Vector Double -> Double #-} {-# SPECIALIZE kolmogorovSmirnovCdfD :: (Double -> Double) -> S.Vector Double -> Double #-} -- | Calculate Kolmogorov's statistic /D/ for given cumulative -- distribution function (CDF) and data sample. If sample is empty -- returns 0. kolmogorovSmirnovD :: (Distribution d, G.Vector v Double) => d -- ^ Distribution -> v Double -- ^ Sample -> Double kolmogorovSmirnovD d = kolmogorovSmirnovCdfD (cumulative d) {-# INLINE kolmogorovSmirnovD #-} -- | Calculate Kolmogorov's statistic /D/ for two data samples. If -- either of samples is empty returns 0. kolmogorovSmirnov2D :: (G.Vector v Double) => v Double -- ^ First sample -> v Double -- ^ Second sample -> Double kolmogorovSmirnov2D sample1 sample2 | G.null sample1 || G.null sample2 = 0 | otherwise = worker 0 0 0 where xs1 = gsort sample1 xs2 = gsort sample2 n1 = G.length xs1 n2 = G.length xs2 en1 = fromIntegral n1 en2 = fromIntegral n2 -- Find new index skip x i xs = go (i+1) where go n | n >= G.length xs = n | xs ! n == x = go (n+1) | otherwise = n -- Main loop worker d i1 i2 | i1 >= n1 || i2 >= n2 = d | otherwise = worker d' i1' i2' where d1 = xs1 ! i1 d2 = xs2 ! 
i2 i1' | d1 <= d2 = skip d1 i1 xs1 | otherwise = i1 i2' | d2 <= d1 = skip d2 i2 xs2 | otherwise = i2 d' = max d (abs $ fromIntegral i1' / en1 - fromIntegral i2' / en2) {-# INLINABLE kolmogorovSmirnov2D #-} {-# SPECIALIZE kolmogorovSmirnov2D :: U.Vector Double -> U.Vector Double -> Double #-} {-# SPECIALIZE kolmogorovSmirnov2D :: V.Vector Double -> V.Vector Double -> Double #-} {-# SPECIALIZE kolmogorovSmirnov2D :: S.Vector Double -> S.Vector Double -> Double #-} -- | Calculate the cumulative probability function for Kolmogorov's -- distribution with parameter /n/, i.e. the probability of getting a value -- smaller than /d/ with an /n/-element sample. -- -- It uses the algorithm by Marsaglia et al. and provides at least -- 7-digit accuracy. kolmogorovSmirnovProbability :: Int -- ^ Size of the sample -> Double -- ^ D value -> Double kolmogorovSmirnovProbability n d -- Avoid potentially lengthy calculations for large N and D > 0.999 | s > 7.24 || (s > 3.76 && n > 99) = 1 - 2 * exp( -(2.000071 + 0.331 / sqrt n' + 1.409 / n') * s) -- Exact computation | otherwise = fini $ KSMatrix 0 matrix `power` n where s = n' * d * d n' = fromIntegral n size = 2*k - 1 k = floor (n' * d) + 1 h = fromIntegral k - n' * d -- Calculate initial matrix matrix = let m = U.create $ do mat <- M.new (size*size) -- Fill matrix with 0 and 1s for 0 size $ \row -> for 0 size $ \col -> do let val | row + 1 >= col = 1 | otherwise = 0 :: Double M.write mat (row * size + col) val -- Correct left column/bottom row for 0 size $ \i -> do let delta = h ^^ (i + 1) unsafeModify mat (i * size) (subtract delta) unsafeModify mat (size * size - 1 - i) (subtract delta) -- Correct corner element if needed when (2*h > 1) $ do unsafeModify mat ((size - 1) * size) (+ ((2*h - 1) ^ size)) -- Divide diagonals by factorial let divide g num | num == size = return () | otherwise = do for num size $ \i -> unsafeModify mat (i * (size + 1) - num) (/ g) divide (g * fromIntegral (num+2)) (num+1) divide 2 1 return mat in fromVector size size m -- Last calculation fini (KSMatrix e m) = loop 1 (center m) e where loop i ss eQ | i > n = ss * 10 ^^ eQ | ss' < 1e-140 = loop (i+1) (ss' * 1e140) (eQ - 140) | otherwise = loop (i+1) ss' eQ where ss' = ss * fromIntegral i / fromIntegral n data KSMatrix = KSMatrix Int Mat.Matrix multiply :: KSMatrix -> KSMatrix -> KSMatrix multiply (KSMatrix e1 m1) (KSMatrix e2 m2) = KSMatrix (e1+e2) (Mat.multiply m1 m2) power :: KSMatrix -> Int -> KSMatrix power mat 1 = mat power mat n = avoidOverflow res where mat2 = power mat (n `quot` 2) pow = multiply mat2 mat2 res | odd n = multiply pow mat | otherwise = pow avoidOverflow :: KSMatrix -> KSMatrix avoidOverflow ksm@(KSMatrix e m) | center m > 1e140 = KSMatrix (e + 140) (Mat.map (* 1e-140) m) | otherwise = ksm ---------------------------------------------------------------- -- $references -- -- * G. Marsaglia, W. W. Tsang, J. Wang (2003) Evaluating Kolmogorov's -- distribution, Journal of Statistical Software, American -- Statistical Association, vol. 8(i18).
statistics-0.16.2.1/Statistics/Test/KruskalWallis.hs0000644000000000000000000000654007346545000020524 0ustar0000000000000000-- | -- Module : Statistics.Test.KruskalWallis -- Copyright : (c) 2014 Danny Navarro -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- module Statistics.Test.KruskalWallis ( -- * Kruskal-Wallis test kruskalWallisTest -- ** Building blocks , kruskalWallisRank , kruskalWallis , module Statistics.Test.Types ) where import Data.Ord (comparing) import qualified Data.Vector.Unboxed as U import Statistics.Function (sort, sortBy, square) import Statistics.Distribution (complCumulative) import Statistics.Distribution.ChiSquared (chiSquared) import Statistics.Types import Statistics.Test.Types import Statistics.Test.Internal (rank) import Statistics.Sample import qualified Statistics.Sample.Internal as Sample(sum) -- | Kruskal-Wallis ranking. -- -- All values are replaced by the absolute rank in the combined samples. -- -- The samples and values need not be ordered, but the values in the result -- are ordered. Ranks are assigned (ties are given their average rank). kruskalWallisRank :: (U.Unbox a, Ord a) => [U.Vector a] -> [U.Vector Double] kruskalWallisRank samples = groupByTags . sortBy (comparing fst) . U.zip tags $ rank (==) joinSample where (tags,joinSample) = U.unzip . sortBy (comparing snd) $ foldMap (uncurry tagSample) $ zip [(1::Int)..] samples tagSample t = U.map (\x -> (t,x)) groupByTags xs | U.null xs = [] | otherwise = sort (U.map snd ys) : groupByTags zs where (ys,zs) = U.span ((==) (fst $ U.head xs) . fst) xs -- | The Kruskal-Wallis Test. -- -- In textbooks the output value is usually represented by 'K' or 'H'. This -- function already does the ranking. kruskalWallis :: (U.Unbox a, Ord a) => [U.Vector a] -> Double kruskalWallis samples = (nTot - 1) * numerator / denominator where -- Total number of elements in all samples nTot = fromIntegral $ sumWith rsamples U.length -- Average rank of all samples avgRank = (nTot + 1) / 2 -- numerator = sumWith rsamples $ \sample -> let n = fromIntegral $ U.length sample in n * square (mean sample - avgRank) denominator = sumWith rsamples $ \sample -> Sample.sum $ U.map (\r -> square (r - avgRank)) sample rsamples = kruskalWallisRank samples -- | Perform the Kruskal-Wallis test for the given samples. For additional -- information check 'kruskalWallis'. This is just a helper function. -- -- It uses the /Chi-Squared/ distribution for approximation as long as all -- sample sizes are at least 5. Otherwise the test returns 'Nothing'.
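-- -- A hypothetical call with three groups of made-up measurements: -- -- > kruskalWallisTest [U.fromList [68,93,123,83,108,122], U.fromList [119,116,101,103,113,84], U.fromList [70,68,54,73,81,68 :: Double]]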
kruskalWallisTest :: (Ord a, U.Unbox a) => [U.Vector a] -> Maybe (Test ()) kruskalWallisTest [] = Nothing kruskalWallisTest samples -- We use chi-squared approximation here | all (>4) ns = Just Test { testSignificance = mkPValue $ complCumulative d k , testStatistics = k , testDistribution = () } | otherwise = Nothing where k = kruskalWallis samples ns = map U.length samples d = chiSquared (length ns - 1) -- * Helper functions sumWith :: Num a => [Sample] -> (Sample -> a) -> a sumWith samples f = Prelude.sum $ fmap f samples {-# INLINE sumWith #-} statistics-0.16.2.1/Statistics/Test/MannWhitneyU.hs0000644000000000000000000002316207346545000020321 0ustar0000000000000000-- | -- Module : Statistics.Test.MannWhitneyU -- Copyright : (c) 2010 Neil Brown -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The Mann-Whitney U test (also known as the Mann-Whitney-Wilcoxon and -- Wilcoxon rank sum test) is a non-parametric test for assessing -- whether two samples of independent observations have different -- means. module Statistics.Test.MannWhitneyU ( -- * Mann-Whitney U test mannWhitneyUtest , mannWhitneyU , mannWhitneyUCriticalValue , mannWhitneyUSignificant -- ** Wilcoxon rank sum test , wilcoxonRankSums , module Statistics.Test.Types -- * References -- $references ) where import Data.List (findIndex) import Data.Ord (comparing) import Numeric.SpecFunctions (choose) import Prelude hiding (sum) import Statistics.Distribution (quantile) import Statistics.Distribution.Normal (standard) import Statistics.Function (sortBy) import Statistics.Sample.Internal (sum) import Statistics.Test.Internal (rank, splitByTags) import Statistics.Test.Types (TestResult(..), PositionTest(..), significant) import Statistics.Types (PValue,pValue) import qualified Data.Vector.Unboxed as U -- | The Wilcoxon Rank Sums Test. -- -- This test calculates the sum of ranks for the given two samples. -- The samples are ordered, and assigned ranks (ties are given their -- average rank), then these ranks are summed for each sample. -- -- The return value is (W₁, W₂) where W₁ is the sum of ranks of the first sample -- and W₂ is the sum of ranks of the second sample. This test is trivially transformed -- into the Mann-Whitney U test. You will probably want to use 'mannWhitneyU' -- and the related functions for testing significance, but this function is exposed -- for completeness. wilcoxonRankSums :: (Ord a, U.Unbox a) => U.Vector a -> U.Vector a -> (Double, Double) wilcoxonRankSums xs1 xs2 = (sum ranks1, sum ranks2) where -- Ranks for each sample (ranks1,ranks2) = splitByTags $ U.zip tags (rank (==) joinSample) -- Sorted and tagged sample (tags,joinSample) = U.unzip $ sortBy (comparing snd) $ tagSample True xs1 U.++ tagSample False xs2 -- Add tag to a sample tagSample t = U.map (\x -> (t,x)) -- | The Mann-Whitney U Test. -- -- This is sometimes known as the Mann-Whitney-Wilcoxon U test, and -- confusingly many sources state that the Mann-Whitney U test is the same as -- the Wilcoxon's rank sum test (which is provided as 'wilcoxonRankSums'). -- The Mann-Whitney U is a simple transform of Wilcoxon's rank sum test. -- -- Again confusingly, different sources state reversed definitions for U₁ -- and U₂, so it is worth being explicit about what this function returns.
-- Given two samples, the first, xs₁, of size n₁ and the second, xs₂, -- of size n₂, this function returns (U₁, U₂) -- where U₁ = W₁ - (n₁(n₁+1))\/2 -- and U₂ = W₂ - (n₂(n₂+1))\/2, -- where (W₁, W₂) is the return value of @wilcoxonRankSums xs1 xs2@. -- -- Some sources instead state that U₁ and U₂ should be the other way round, often -- expressing this using U₁' = n₁n₂ - U₁ (since U₁ + U₂ = n₁n₂). -- -- All of which you probably don't care about if you just feed this into 'mannWhitneyUSignificant'. mannWhitneyU :: (Ord a, U.Unbox a) => U.Vector a -> U.Vector a -> (Double, Double) mannWhitneyU xs1 xs2 = (fst summedRanks - (n1*(n1 + 1))/2 ,snd summedRanks - (n2*(n2 + 1))/2) where n1 = fromIntegral $ U.length xs1 n2 = fromIntegral $ U.length xs2 summedRanks = wilcoxonRankSums xs1 xs2 -- | Calculates the critical value of Mann-Whitney U for the given sample -- sizes and significance level. -- -- This function returns the exact calculated value of U for all sample sizes; -- it does not use the normal approximation at all. Above sample size 20 it is -- generally recommended to use the normal approximation instead, but this function -- will calculate the higher critical values if you need them. -- -- The algorithm to generate these values is a faster, memoised version of the -- simple unoptimised generating function given in section 2 of \"The Mann Whitney -- Wilcoxon Distribution Using Linked Lists\" mannWhitneyUCriticalValue :: (Int, Int) -- ^ The sample size -> PValue Double -- ^ The p-value (e.g. 0.05) for which you want the critical value. -> Maybe Int -- ^ The critical value (of U). mannWhitneyUCriticalValue (m, n) p | m < 1 || n < 1 = Nothing -- Sample must be nonempty | p' <= 1 = Nothing -- p-value is too small. Null hypothesis couldn't be disproved | otherwise = findIndex (>= p') $ take (m*n) $ tail $ alookup !! (m+n-2) !! (min m n - 1) where mnCn = (m+n) `choose` n p' = mnCn * pValue p {- -- Original function, without memoisation, from Cheung and Klotz: -- Double is needed to avoid integer overflows. a :: Int -> Int -> Int -> Double a u bigN m | u < 0 = 0 | u >= m * n = bigN `choose` m | m == 1 || n == 1 = fromIntegral (u + 1) | otherwise = a u (bigN - 1) m + a (u - n) (bigN - 1) (m-1) where n = bigN - m -} -- Memoised version of the original a function, above. -- -- Doubles are stored to avoid integer overflow. 32-bit Ints begin to -- overflow for bigN as small as 33 (64-bit one at 66) while Double to -- go to infinity till bigN=1029 -- -- -- outer list is indexed by big N - 2 -- inner list by (m-1) (we know m < bigN) -- innermost list by u -- -- So: (alookup !! (bigN - 2) !! (m - 1) ! u) == a u bigN m alookup :: [[[Double]]] alookup = gen 2 [1 : repeat 2] where gen bigN predBigNList = let bigNlist = [ [ amemoed u m | u <- [0 .. m*(bigN-m)] ] ++ repeat (bigN `choose` m) | m <- [1 .. (bigN-1)]] -- has bigN-1 elements in bigNlist : gen (bigN+1) bigNlist where amemoed :: Int -> Int -> Double amemoed u m | m == 1 || n == 1 = fromIntegral (u + 1) | otherwise = mList !! u + if u < n then 0 else predmList !! (u-n) where n = bigN - m (predmList : mList : _) = drop (m-2) predBigNList -- Lists for m-1 and m respectively. i-th list correspond to m=i+1 -- -- We know that predBigNList has bigN - 2 elements -- (and we know that n > 1 therefore bigN > m + 1) -- So bigN - 2 >= m, i.e. predBigNList must have at least m elements -- elements, so dropping (m-2) must leave at least 2 -- | Calculates whether the Mann Whitney U test is significant. 
-- -- If both sample sizes are less than or equal to 20, the exact U critical value -- (as calculated by 'mannWhitneyUCriticalValue') is used. If either sample is -- larger than 20, the normal approximation is used instead. -- -- If you use a one-tailed test, the test indicates whether the first sample is -- significantly larger than the second. If you want the opposite, simply reverse -- the order in both the sample size and the (U₁, U₂) pairs. mannWhitneyUSignificant :: PositionTest -- ^ Perform one-tailed test (see description above). -> (Int, Int) -- ^ The samples' size from which the (U₁,U₂) values were derived. -> PValue Double -- ^ The p-value at which to test (e.g. 0.05) -> (Double, Double) -- ^ The (U₁, U₂) values from 'mannWhitneyU'. -> Maybe TestResult -- ^ Return 'Nothing' if the sample was too -- small to make a decision. mannWhitneyUSignificant test (in1, in2) pVal (u1, u2) -- Use normal approximation | in1 > 20 || in2 > 20 = let mean = n1 * n2 / 2 -- (u1+u2) / 2 sigma = sqrt $ n1*n2*(n1 + n2 + 1) / 12 z = (mean - u1) / sigma in Just $ case test of AGreater -> significant $ z < quantile standard p BGreater -> significant $ (-z) < quantile standard p SamplesDiffer -> significant $ abs z > abs (quantile standard (p/2)) -- Use exact critical value | otherwise = do crit <- fromIntegral <$> mannWhitneyUCriticalValue (in1, in2) pVal return $ case test of AGreater -> significant $ u2 <= crit BGreater -> significant $ u1 <= crit SamplesDiffer -> significant $ min u1 u2 <= crit where n1 = fromIntegral in1 n2 = fromIntegral in2 p = pValue pVal -- | Perform the Mann-Whitney U test for two samples at the required -- significance. For additional information check the documentation of -- 'mannWhitneyU' and 'mannWhitneyUSignificant'. This is just a helper -- function. -- -- The one-tailed test checks whether the first sample is significantly larger -- than the second; the two-tailed test checks whether they are significantly different. mannWhitneyUtest :: (Ord a, U.Unbox a) => PositionTest -- ^ Perform one-tailed test (see description above). -> PValue Double -- ^ The p-value at which to test (e.g. 0.05) -> U.Vector a -- ^ First sample -> U.Vector a -- ^ Second sample -> Maybe TestResult -- ^ Return 'Nothing' if the sample was too small to -- make a decision. mannWhitneyUtest oneTail p smp1 smp2 = mannWhitneyUSignificant oneTail (n1,n2) p $ mannWhitneyU smp1 smp2 where n1 = U.length smp1 n2 = U.length smp2 -- $references -- -- * Cheung, Y.K.; Klotz, J.H. (1997) The Mann Whitney Wilcoxon -- distribution using linked lists. /Statistica Sinica/ -- 7:805–813. -- . statistics-0.16.2.1/Statistics/Test/StudentT.hs0000644000000000000000000001350607346545000017506 0ustar0000000000000000{-# LANGUAGE FlexibleContexts, Rank2Types, ScopedTypeVariables #-} -- | Student's T-test is for assessing whether two samples have -- different means. This module contains several variations of the -- T-test. It's a parametric test and assumes that samples are -- normally distributed. module Statistics.Test.StudentT ( studentTTest , welchTTest , pairedTTest , module Statistics.Test.Types ) where import Statistics.Distribution hiding (mean) import Statistics.Distribution.StudentT import Statistics.Sample (mean, varianceUnbiased) import Statistics.Test.Types import Statistics.Types (mkPValue,PValue) import Statistics.Function (square) import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Storable as S import qualified Data.Vector as V -- | Two-sample Student's t-test.
It assumes that both samples are -- normally distributed and have same variance. Returns @Nothing@ if -- sample sizes are not sufficient. studentTTest :: (G.Vector v Double) => PositionTest -- ^ one- or two-tailed test -> v Double -- ^ Sample A -> v Double -- ^ Sample B -> Maybe (Test StudentT) studentTTest test sample1 sample2 | G.length sample1 < 2 || G.length sample2 < 2 = Nothing | otherwise = Just Test { testSignificance = significance test t ndf , testStatistics = t , testDistribution = studentT ndf } where (t, ndf) = tStatistics True sample1 sample2 {-# INLINABLE studentTTest #-} {-# SPECIALIZE studentTTest :: PositionTest -> U.Vector Double -> U.Vector Double -> Maybe (Test StudentT) #-} {-# SPECIALIZE studentTTest :: PositionTest -> S.Vector Double -> S.Vector Double -> Maybe (Test StudentT) #-} {-# SPECIALIZE studentTTest :: PositionTest -> V.Vector Double -> V.Vector Double -> Maybe (Test StudentT) #-} -- | Two-sample Welch's t-test. It assumes that both samples are -- normally distributed but doesn't assume that they have same -- variance. Returns @Nothing@ if sample sizes are not sufficient. welchTTest :: (G.Vector v Double) => PositionTest -- ^ one- or two-tailed test -> v Double -- ^ Sample A -> v Double -- ^ Sample B -> Maybe (Test StudentT) welchTTest test sample1 sample2 | G.length sample1 < 2 || G.length sample2 < 2 = Nothing | otherwise = Just Test { testSignificance = significance test t ndf , testStatistics = t , testDistribution = studentT ndf } where (t, ndf) = tStatistics False sample1 sample2 {-# INLINABLE welchTTest #-} {-# SPECIALIZE welchTTest :: PositionTest -> U.Vector Double -> U.Vector Double -> Maybe (Test StudentT) #-} {-# SPECIALIZE welchTTest :: PositionTest -> S.Vector Double -> S.Vector Double -> Maybe (Test StudentT) #-} {-# SPECIALIZE welchTTest :: PositionTest -> V.Vector Double -> V.Vector Double -> Maybe (Test StudentT) #-} -- | Paired two-sample t-test. Two samples are paired in a -- within-subject design. Returns @Nothing@ if sample size is not -- sufficient. pairedTTest :: forall v. 
(G.Vector v (Double, Double), G.Vector v Double) => PositionTest -- ^ one- or two-tailed test -> v (Double, Double) -- ^ paired samples -> Maybe (Test StudentT) pairedTTest test sample | G.length sample < 2 = Nothing | otherwise = Just Test { testSignificance = significance test t ndf , testStatistics = t , testDistribution = studentT ndf } where (t, ndf) = tStatisticsPaired sample {-# INLINABLE pairedTTest #-} {-# SPECIALIZE pairedTTest :: PositionTest -> U.Vector (Double,Double) -> Maybe (Test StudentT) #-} {-# SPECIALIZE pairedTTest :: PositionTest -> V.Vector (Double,Double) -> Maybe (Test StudentT) #-} ------------------------------------------------------------------------------- significance :: PositionTest -- ^ one- or two-tailed -> Double -- ^ t statistics -> Double -- ^ degree of freedom -> PValue Double -- ^ p-value significance test t df = case test of -- Here we exploit symmetry of T-distribution and calculate small tail SamplesDiffer -> mkPValue $ 2 * tailArea (negate (abs t)) AGreater -> mkPValue $ tailArea (negate t) BGreater -> mkPValue $ tailArea t where tailArea = cumulative (studentT df) -- Calculate T statistics for two samples tStatistics :: (G.Vector v Double) => Bool -- variance equality -> v Double -> v Double -> (Double, Double) {-# INLINE tStatistics #-} tStatistics varequal sample1 sample2 = (t, ndf) where -- t-statistics t = (m1 - m2) / sqrt ( if varequal then ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2) * (1 / n1 + 1 / n2) else s1 / n1 + s2 / n2) -- degree of freedom ndf | varequal = n1 + n2 - 2 | otherwise = square (s1 / n1 + s2 / n2) / (square s1 / (square n1 * (n1 - 1)) + square s2 / (square n2 * (n2 - 1))) -- statistics of two samples n1 = fromIntegral $ G.length sample1 n2 = fromIntegral $ G.length sample2 m1 = mean sample1 m2 = mean sample2 s1 = varianceUnbiased sample1 s2 = varianceUnbiased sample2 -- Calculate T-statistics for paired sample tStatisticsPaired :: (G.Vector v (Double, Double)) => v (Double, Double) -> (Double, Double) {-# INLINE tStatisticsPaired #-} tStatisticsPaired sample = (t, ndf) where -- t-statistics t = let d = U.map (uncurry (-)) $ G.convert sample sumd = U.sum d in sumd / sqrt ((n * U.sum (U.map square d) - square sumd) / ndf) -- degree of freedom ndf = n - 1 n = fromIntegral $ G.length sample statistics-0.16.2.1/Statistics/Test/Types.hs0000644000000000000000000000553707346545000017045 0ustar0000000000000000{-# LANGUAGE DeriveFunctor, DeriveDataTypeable,DeriveGeneric #-} module Statistics.Test.Types ( Test(..) , isSignificant , TestResult(..) , significant , PositionTest(..) ) where import Control.DeepSeq (NFData(..)) import Control.Monad (liftM3) import Data.Aeson (FromJSON, ToJSON) import Data.Binary (Binary (..)) import Data.Data (Typeable, Data) import GHC.Generics import Statistics.Types (PValue) -- | Result of hypothesis testing data TestResult = Significant -- ^ Null hypothesis should be rejected | NotSignificant -- ^ Data is compatible with hypothesis deriving (Eq,Ord,Show,Typeable,Data,Generic) instance Binary TestResult where get = do sig <- get if sig then return Significant else return NotSignificant put = put . (== Significant) instance FromJSON TestResult instance ToJSON TestResult instance NFData TestResult -- | Result of statistical test. data Test distr = Test { testSignificance :: !(PValue Double) -- ^ Probability of getting value of test statistics at least as -- extreme as measured. , testStatistics :: !Double -- ^ Statistic used for test. 
, testDistribution :: distr -- ^ Distribution of test statistics if null hypothesis is correct. } deriving (Eq,Ord,Show,Typeable,Data,Generic,Functor) instance (Binary d) => Binary (Test d) where get = liftM3 Test get get get put (Test sign stat distr) = put sign >> put stat >> put distr instance (FromJSON d) => FromJSON (Test d) instance (ToJSON d) => ToJSON (Test d) instance (NFData d) => NFData (Test d) where rnf (Test _ _ a) = rnf a -- | Check whether the test is significant for the given p-value. isSignificant :: PValue Double -> Test d -> TestResult isSignificant p t = significant $ p >= testSignificance t -- | Test type for tests which compare positional (mean, median etc.) -- information of samples. data PositionTest = SamplesDiffer -- ^ Test whether samples differ in position. Null hypothesis is -- that samples are not different | AGreater -- ^ Test if the first sample (A) is larger than the second (B). Null -- hypothesis is that the first sample is not larger than the second. | BGreater -- ^ Test if the second sample is larger than the first. deriving (Eq,Ord,Show,Typeable,Data,Generic) instance Binary PositionTest where get = do i <- get case (i :: Int) of 0 -> return SamplesDiffer 1 -> return AGreater 2 -> return BGreater _ -> fail "Invalid PositionTest" put SamplesDiffer = put (0 :: Int) put AGreater = put (1 :: Int) put BGreater = put (2 :: Int) instance FromJSON PositionTest instance ToJSON PositionTest instance NFData PositionTest -- | Significant if the parameter is 'True', not significant otherwise. significant :: Bool -> TestResult significant True = Significant significant False = NotSignificant statistics-0.16.2.1/Statistics/Test/WilcoxonT.hs0000644000000000000000000002452307346545000017663 0ustar0000000000000000{-# LANGUAGE ViewPatterns #-} -- | -- Module : Statistics.Test.WilcoxonT -- Copyright : (c) 2010 Neil Brown -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- The Wilcoxon matched-pairs signed-rank test is a non-parametric test -- which can be used to test whether two related samples have -- different means. module Statistics.Test.WilcoxonT ( -- * Wilcoxon signed-rank matched-pair test -- ** Test wilcoxonMatchedPairTest -- ** Building blocks , wilcoxonMatchedPairSignedRank , wilcoxonMatchedPairSignificant , wilcoxonMatchedPairSignificance , wilcoxonMatchedPairCriticalValue , module Statistics.Test.Types -- * References -- $references ) where -- -- The samples are zipped together: if one is longer than the other, both are truncated -- to the length of the shorter sample. -- -- The value returned is the pair (T+, T-). T+ is the sum of positive ranks (the -- ranks of the differences where the first parameter is higher) whereas T- is -- the sum of negative ranks (the ranks of the differences where the second parameter is higher). -- These values mean little by themselves, and should be combined with the -- 'wilcoxonMatchedPairSignificant' function in this module to get a meaningful result. -- -- Note that: wilcoxonMatchedPairSignedRank == (\(x, y) -> (y, x)) . flip wilcoxonMatchedPairSignedRank
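-- For example (hypothetical paired data): -- -- > wilcoxonMatchedPairSignedRank (U.fromList [(85,88),(70,65),(60,60),(92,89)] :: U.Vector (Double,Double))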
import Data.Function (on) import Data.List (findIndex) import Data.Ord (comparing) import qualified Data.Vector.Unboxed as U import Prelude hiding (sum) import Statistics.Function (sortBy) import Statistics.Sample.Internal (sum) import Statistics.Test.Internal (rank, splitByTags) import Statistics.Test.Types import Statistics.Types -- (CL,pValue,getPValue) import Statistics.Distribution import Statistics.Distribution.Normal -- | Calculate (n,T⁺,T⁻) values for both samples. Where /n/ is reduced -- sample where equal pairs are removed. wilcoxonMatchedPairSignedRank :: (Ord a, Num a, U.Unbox a) => U.Vector (a,a) -> (Int, Double, Double) wilcoxonMatchedPairSignedRank ab = (nRed, sum ranks1, negate (sum ranks2)) where -- Positive and negative ranks (ranks1, ranks2) = splitByTags $ U.zip tags (rank ((==) `on` abs) diffs) -- Sorted list of differences diffsSorted = sortBy (comparing abs) -- Sort the differences by absolute difference $ U.filter (/= 0) -- Remove equal elements $ U.map (uncurry (-)) ab -- Work out differences nRed = U.length diffsSorted -- Sign tags and differences (tags,diffs) = U.unzip $ U.map (\x -> (x>0 , x)) -- Attach tags to distribution elements $ diffsSorted -- | The coefficients for x^0, x^1, x^2, etc, in the expression -- \prod_{r=1}^s (1 + x^r). See the Mitic paper for details. -- -- We can define: -- f(1) = 1 + x -- f(r) = (1 + x^r)*f(r-1) -- = f(r-1) + x^r * f(r-1) -- The effect of multiplying the equation by x^r is to shift -- all the coefficients by r down the list. -- -- This list will be processed lazily from the head. coefficients :: Int -> [Integer] coefficients 1 = [1, 1] -- 1 + x coefficients r = let coeffs = coefficients (r-1) (firstR, rest) = splitAt r coeffs in firstR ++ add rest coeffs where add (x:xs) (y:ys) = x + y : add xs ys add xs [] = xs add [] ys = ys -- This list will be processed lazily from the head. summedCoefficients :: Int -> [Double] summedCoefficients n | n < 1 = error "Statistics.Test.WilcoxonT.summedCoefficients: nonpositive sample size" | n > 1023 = error "Statistics.Test.WilcoxonT.summedCoefficients: sample is too large (see bug #18)" | otherwise = map fromIntegral $ scanl1 (+) $ coefficients n -- | Tests whether a given result from a Wilcoxon signed-rank matched-pairs test -- is significant at the given level. -- -- This function can perform a one-tailed or two-tailed test. If the first -- parameter to this function is 'TwoTailed', the test is performed two-tailed to -- check if the two samples differ significantly. If the first parameter is -- 'OneTailed', the check is performed one-tailed to decide whether the first sample -- (i.e. the first sample you passed to 'wilcoxonMatchedPairSignedRank') is -- greater than the second sample (i.e. the second sample you passed to -- 'wilcoxonMatchedPairSignedRank'). If you wish to perform a one-tailed test -- in the opposite direction, you can either pass the parameters in a different -- order to 'wilcoxonMatchedPairSignedRank', or simply swap the values in the resulting -- pair before passing them to this function. wilcoxonMatchedPairSignificant :: PositionTest -- ^ How to compare two samples -> PValue Double -- ^ The p-value at which to test (e.g. @mkPValue 0.05@) -> (Int, Double, Double) -- ^ The (n,T⁺, T⁻) values from 'wilcoxonMatchedPairSignedRank'. -> Maybe TestResult -- ^ Return 'Nothing' if the sample was too -- small to make a decision. 
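-- (A hypothetical example: @wilcoxonMatchedPairSignificant SamplesDiffer (mkPValue 0.05) (10, 41.5, -13.5)@, where the triple is an invented result of 'wilcoxonMatchedPairSignedRank' for ten pairs.)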
wilcoxonMatchedPairSignificant test pVal (sampleSize, tPlus, tMinus) =
  case test of
    -- According to my nearest book (Understanding Research Methods and Statistics
    -- by Gary W. Heiman, p590), to check that the first sample is bigger you must
    -- use the absolute value of T- for a one-tailed check:
    AGreater      -> do crit <- wilcoxonMatchedPairCriticalValue sampleSize pVal
                        return $ significant $ abs tMinus <= fromIntegral crit
    BGreater      -> do crit <- wilcoxonMatchedPairCriticalValue sampleSize pVal
                        return $ significant $ abs tPlus <= fromIntegral crit
    -- Otherwise you must use the value of T+ and T- with the smallest absolute value:
    --
    -- Note that in the absence of ties the sum of |T+| and |T-| is constant,
    -- so by selecting the minimal one we are performing a two-tailed test and
    -- looking at both tails of the distribution of T.
    SamplesDiffer -> do crit <- wilcoxonMatchedPairCriticalValue sampleSize (mkPValue $ p/2)
                        return $ significant $ t <= fromIntegral crit
  where
    t = min (abs tPlus) (abs tMinus)
    p = pValue pVal

-- | Obtains the critical value of T to compare against, given a sample size
-- and a p-value (significance level).  Your T value must be less than or
-- equal to the return of this function in order for the test to work out
-- significant.  If there is a Nothing return, the sample size is too small to
-- make a decision.
--
-- 'wilcoxonMatchedPairSignificant' tests the return value of
-- 'wilcoxonMatchedPairSignedRank' for you, so you should use
-- 'wilcoxonMatchedPairSignificant' for determining test results.
-- However, this function is useful, for example, for generating lookup tables
-- for Wilcoxon signed rank critical values.
--
-- The return values of this function are generated using the method
-- detailed in Mitic's paper.  According to that paper, the results
-- may differ from other published lookup tables, but (Mitic claims)
-- the values obtained by this function will be the correct ones.
wilcoxonMatchedPairCriticalValue ::
     Int           -- ^ The sample size
  -> PValue Double -- ^ The p-value (e.g. @mkPValue 0.05@) for which you want the critical value.
  -> Maybe Int     -- ^ The critical value (of T), or Nothing if
                   --   the sample is too small to make a decision.
wilcoxonMatchedPairCriticalValue n pVal
  | n < 100 =
      case subtract 1 <$> findIndex (> m) (summedCoefficients n) of
        Just k | k < 0     -> Nothing
               | otherwise -> Just k
        Nothing -> error "Statistics.Test.WilcoxonT.wilcoxonMatchedPairCriticalValue: the impossible happened"
  | otherwise =
      case quantile (normalApprox n) p of
        z | z < 0     -> Nothing
          | otherwise -> Just (round z)
  where
    p = pValue pVal
    m = (2 ** fromIntegral n) * p

-- | Works out the significance level (p-value) of a T value, given a sample
-- size and a T value from the Wilcoxon signed-rank matched-pairs test.
--
-- See the notes on 'wilcoxonMatchedPairCriticalValue' for how this is
-- calculated.
wilcoxonMatchedPairSignificance
  :: Int           -- ^ The sample size
  -> Double        -- ^ The value of T for which you want the significance.
  -> PValue Double -- ^ The significance (p-value).
wilcoxonMatchedPairSignificance n t
  = mkPValue p
  where
    p | n < 100   = (summedCoefficients n !! floor t) / 2 ** fromIntegral n
      | otherwise = cumulative (normalApprox n) t

-- | Normal approximation for Wilcoxon T statistics
normalApprox :: Int -> NormalDistribution
normalApprox ni
  = normalDistr m s
  where
    m = n * (n + 1) / 4
    s = sqrt $ (n * (n + 1) * (2*n + 1)) / 24
    n = fromIntegral ni
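-- A quick numeric check of the approximation above (illustrative
-- arithmetic only): for n = 10 the mean is 10*11/4 = 27.5 and the
-- standard deviation is sqrt (10*11*21/24) ≈ 9.81.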
-- | The Wilcoxon matched-pairs signed-rank test.  The samples are
-- zipped together: if one is longer than the other, both are
-- truncated to the length of the shorter sample.
--
-- For a one-tailed test it tests whether the first sample is
-- significantly greater than the second.  For a two-tailed test it
-- checks whether the samples differ significantly.
--
-- Check 'wilcoxonMatchedPairSignedRank' and
-- 'wilcoxonMatchedPairSignificant' for additional information.
wilcoxonMatchedPairTest
  :: (Ord a, Num a, U.Unbox a)
  => PositionTest    -- ^ How to compare the two samples.
  -> U.Vector (a,a)  -- ^ Sample of pairs
  -> Test ()
wilcoxonMatchedPairTest test pairs =
  Test { testSignificance = pVal
       , testStatistics   = t
       , testDistribution = ()
       }
  where
    (n,tPlus,tMinus) = wilcoxonMatchedPairSignedRank pairs
    (t,pVal) = case test of
      AGreater      -> (abs tMinus, wilcoxonMatchedPairSignificance n (abs tMinus))
      BGreater      -> (abs tPlus,  wilcoxonMatchedPairSignificance n (abs tPlus ))
      -- Since we take the minimum of T+ and T- we can't get more
      -- than p=0.5, so we can multiply it by 2 without risk of
      -- error.
      SamplesDiffer -> let t' = min (abs tMinus) (abs tPlus)
                           p  = wilcoxonMatchedPairSignificance n t'
                       in  (t', mkPValue $ min 1 $ 2 * pValue p)

-- $references
--
-- * \"Critical Values for the Wilcoxon Signed Rank Statistic\", Peter
--   Mitic, The Mathematica Journal, volume 6, issue 3, 1996
--   ()
statistics-0.16.2.1/Statistics/Transform.hs0000644000000000000000000001406507346545000016771 0ustar0000000000000000{-# LANGUAGE BangPatterns, FlexibleContexts #-}
-- |
-- Module    : Statistics.Transform
-- Copyright : (c) 2011 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Fourier-related transformations of mathematical functions.
--
-- These functions are written for simplicity and correctness, not
-- speed.  If you need a fast FFT implementation for your application,
-- you should strongly consider using a library of FFTW bindings
-- instead.
module Statistics.Transform
    (
    -- * Type synonyms
      CD
    -- * Discrete cosine transform
    , dct
    , dct_
    , idct
    , idct_
    -- * Fast Fourier transform
    , fft
    , ifft
    ) where

import Control.Monad (when)
import Control.Monad.ST (ST)
import Data.Bits (shiftL, shiftR)
import Data.Complex (Complex(..), conjugate, realPart)
import Numeric.SpecFunctions (log2)
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as M
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector as V

type CD = Complex Double

-- | Discrete cosine transform (DCT-II).
dct :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => v Double -> v Double
dct = dctWorker . G.map (:+0)
{-# INLINABLE  dct #-}
{-# SPECIALIZE dct :: U.Vector Double -> U.Vector Double #-}
{-# SPECIALIZE dct :: V.Vector Double -> V.Vector Double #-}

-- | Discrete cosine transform (DCT-II).  Only the real part of the
--   vector is transformed; the imaginary part is ignored.
dct_ :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => v CD -> v Double
dct_ = dctWorker . G.map (\(i :+ _) -> i :+ 0)
{-# INLINABLE  dct_ #-}
{-# SPECIALIZE dct_ :: U.Vector CD -> U.Vector Double #-}
{-# SPECIALIZE dct_ :: V.Vector CD -> V.Vector Double #-}

dctWorker :: (G.Vector v CD, G.Vector v Double, G.Vector v Int) => v CD -> v Double
{-# INLINE dctWorker #-}
dctWorker xs
  -- length 1 is special cased because shuffle algorithms fail for it.
  | G.length xs == 1 = G.map ((2*) . realPart) xs
  | vectorOK xs      = G.map realPart $ G.zipWith (*) weights (fft interleaved)
  | otherwise        = error "Statistics.Transform.dct: bad vector length"
  where
    interleaved = G.backpermute xs $ G.enumFromThenTo 0 2 (len-2) G.++
                                     G.enumFromThenTo (len-1) (len-3) 1
    weights = G.cons 2 . G.generate (len-1) $ \x ->
                2 * exp ((0:+(-1))*fi (x+1)*pi/(2*n))
      where n = fi len
    len = G.length xs
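-- An informal illustration of the DCT scaling convention (example
-- vector invented): for @x = U.fromList [1,2,3,4]@, @idct (dct x)@ is
-- expected to equal @U.map (*4) x@ up to rounding, per the scale
-- factor documented for 'idct' below.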
-- | Inverse discrete cosine transform (DCT-III).  It's the inverse of
--   'dct' only up to a scale parameter:
--
-- > (idct . dct) x = (* length x)
idct :: (G.Vector v CD, G.Vector v Double) => v Double -> v Double
idct = idctWorker . G.map (:+0)
{-# INLINABLE  idct #-}
{-# SPECIALIZE idct :: U.Vector Double -> U.Vector Double #-}
{-# SPECIALIZE idct :: V.Vector Double -> V.Vector Double #-}

-- | Inverse discrete cosine transform (DCT-III).  Only the real part
--   of the vector is transformed; the imaginary part is ignored.
idct_ :: (G.Vector v CD, G.Vector v Double) => v CD -> v Double
idct_ = idctWorker . G.map (\(i :+ _) -> i :+ 0)
{-# INLINABLE  idct_ #-}
{-# SPECIALIZE idct_ :: U.Vector CD -> U.Vector Double #-}
{-# SPECIALIZE idct_ :: V.Vector CD -> V.Vector Double #-}

idctWorker :: (G.Vector v CD, G.Vector v Double) => v CD -> v Double
{-# INLINE idctWorker #-}
idctWorker xs
  | vectorOK xs = G.generate len interleave
  | otherwise   = error "Statistics.Transform.idct: bad vector length"
  where
    interleave z | even z    = vals `G.unsafeIndex` halve z
                 | otherwise = vals `G.unsafeIndex` (len - halve z - 1)
    vals = G.map realPart . ifft $ G.zipWith (*) weights xs
    weights
      = G.cons n
      $ G.generate (len - 1) $ \x -> 2 * n * exp ((0:+1) * fi (x+1) * pi/(2*n))
      where n = fi len
    len = G.length xs

-- | Inverse fast Fourier transform.
ifft :: G.Vector v CD => v CD -> v CD
ifft xs
  | vectorOK xs = G.map ((/fi (G.length xs)) . conjugate) . fft . G.map conjugate $ xs
  | otherwise   = error "Statistics.Transform.ifft: bad vector length"
{-# INLINABLE  ifft #-}
{-# SPECIALIZE ifft :: U.Vector CD -> U.Vector CD #-}
{-# SPECIALIZE ifft :: V.Vector CD -> V.Vector CD #-}

-- | Radix-2 decimation-in-time fast Fourier transform.
fft :: G.Vector v CD => v CD -> v CD
fft v | vectorOK v = G.create $ do
          mv <- G.thaw v
          mfft mv
          return mv
      | otherwise  = error "Statistics.Transform.fft: bad vector length"
{-# INLINABLE  fft #-}
{-# SPECIALIZE fft :: U.Vector CD -> U.Vector CD #-}
{-# SPECIALIZE fft :: V.Vector CD -> V.Vector CD #-}
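-- As an informal invariant relating the two transforms (illustrative,
-- not checked anywhere in this module): for any vector @v@ whose
-- length is a power of two, @ifft (fft v)@ recovers @v@ up to
-- floating-point noise, since 'ifft' conjugates and divides by the
-- length.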
-- Vector length must be a power of two.  It's not checked.
mfft :: (M.MVector v CD) => v s CD -> ST s ()
{-# INLINE mfft #-}
mfft vec = bitReverse 0 0
 where
  bitReverse i j | i == len-1 = stage 0 1
                 | otherwise  = do
                     when (i < j) $ M.swap vec i j
                     let inner k l | k <= l    = inner (k `shiftR` 1) (l-k)
                                   | otherwise = bitReverse (i+1) (l+k)
                     inner (len `shiftR` 1) j
  stage l !l1 | l == m    = return ()
              | otherwise = do
                  let !l2 = l1 `shiftL` 1
                      !e  = -6.283185307179586/fromIntegral l2
                      flight j !a | j == l1   = stage (l+1) l2
                                  | otherwise = do
                          let butterfly i | i >= len  = flight (j+1) (a+e)
                                          | otherwise = do
                                  let i1 = i + l1
                                  xi1 :+ yi1 <- M.read vec i1
                                  let !c = cos a
                                      !s = sin a
                                      d  = (c*xi1 - s*yi1) :+ (s*xi1 + c*yi1)
                                  ci <- M.read vec i
                                  M.write vec i1 (ci - d)
                                  M.write vec i  (ci + d)
                                  butterfly (i+l2)
                          butterfly j
                  flight 0 0
  len = M.length vec
  m   = log2 len

----------------------------------------------------------------
-- Helpers
----------------------------------------------------------------

fi :: Int -> CD
fi = fromIntegral

halve :: Int -> Int
halve = (`shiftR` 1)

vectorOK :: G.Vector v a => v a -> Bool
{-# INLINE vectorOK #-}
vectorOK v = (1 `shiftL` log2 n) == n where n = G.length v
statistics-0.16.2.1/Statistics/Types.hs0000644000000000000000000004070407346545000016121 0ustar0000000000000000{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-}
-- |
-- Module    : Statistics.Types
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Data types commonly used in statistics
module Statistics.Types
    ( -- * Confidence level
      CL
      -- ** Accessors
    , confidenceLevel
    , significanceLevel
      -- ** Constructors
    , mkCL
    , mkCLE
    , mkCLFromSignificance
    , mkCLFromSignificanceE
      -- ** Constants and conversion to nσ
    , cl90
    , cl95
    , cl99
      -- *** Normal approximation
    , nSigma
    , nSigma1
    , getNSigma
    , getNSigma1
      -- * p-value
    , PValue
      -- ** Accessors
    , pValue
      -- ** Constructors
    , mkPValue
    , mkPValueE
      -- * Estimates and upper/lower limits
    , Estimate(..)
    , NormalErr(..)
    , ConfInt(..)
    , UpperLimit(..)
    , LowerLimit(..)
      -- ** Constructors
    , estimateNormErr
    , (±)
    , estimateFromInterval
    , estimateFromErr
      -- ** Accessors
    , confidenceInterval
    , asymErrors
    , Scale(..)
      -- * Other
    , Sample
    , WeightedSample
    , Weights
    ) where

import Control.Monad ((<=<), liftM2, liftM3)
import Control.DeepSeq (NFData(..))
import Data.Aeson (FromJSON(..), ToJSON)
import Data.Binary (Binary(..))
import Data.Data (Data,Typeable)
import Data.Maybe (fromMaybe)
import Data.Vector.Unboxed (Unbox)
import Data.Vector.Unboxed.Deriving (derivingUnbox)
import GHC.Generics (Generic)

import Statistics.Internal
import Statistics.Types.Internal
import Statistics.Distribution
import Statistics.Distribution.Normal

----------------------------------------------------------------
-- Data type for confidence level
----------------------------------------------------------------

-- |
-- Confidence level.  In the context of confidence intervals it's the
-- probability that said interval covers the true value of the
-- estimated quantity.  In the context of statistical tests it's
-- @1-α@, where α is the significance level of the test.
--
-- Since confidence levels are usually close to 1 they are stored as
-- @1-CL@ internally.  There are two smart constructors for @CL@:
-- 'mkCL' and 'mkCLFromSignificance' (and corresponding variants
-- returning @Maybe@).  The first creates @CL@ from a confidence level
-- and the second from @1 - CL@, i.e. a significance level.
--
-- >>> cl95
-- mkCLFromSignificance 0.05
--
-- Prior to 0.14 confidence levels were passed to functions as plain
-- @Doubles@.  Use 'mkCL' to convert them to @CL@.
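--
-- Storing @1-CL@ rather than the level itself preserves precision for
-- levels very close to 1.  As an illustrative (not normative)
-- example, a \"five nines\" confidence level of 0.99999 is kept as
-- the significance 1.0e-5, whose low-order digits would otherwise be
-- partially lost to 'Double' rounding near 1.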
newtype CL a = CL a
               deriving (Eq, Typeable, Data, Generic)

instance Show a => Show (CL a) where
  showsPrec n (CL p) = defaultShow1 "mkCLFromSignificance" p n
instance (Num a, Ord a, Read a) => Read (CL a) where
  readPrec = defaultReadPrecM1 "mkCLFromSignificance" mkCLFromSignificanceE

instance (Binary a, Num a, Ord a) => Binary (CL a) where
  put (CL p) = put p
  get = maybe (fail errMkCL) return . mkCLFromSignificanceE =<< get

instance (ToJSON a) => ToJSON (CL a)
instance (FromJSON a, Num a, Ord a) => FromJSON (CL a) where
  parseJSON = maybe (fail errMkCL) return . mkCLFromSignificanceE <=< parseJSON

instance NFData a => NFData (CL a) where
  rnf (CL a) = rnf a

-- |
-- >>> cl95 > cl90
-- True
instance Ord a => Ord (CL a) where
  CL a <  CL b = a > b
  CL a <= CL b = a >= b
  CL a >  CL b = a < b
  CL a >= CL b = a <= b
  max (CL a) (CL b) = CL (min a b)
  min (CL a) (CL b) = CL (max a b)

-- | Create a confidence level from the probability β, i.e. the
--   probability that the confidence interval contains the true value
--   of the estimate.  Will throw an exception if the parameter is out
--   of the [0,1] range.
--
-- >>> mkCL 0.95    -- same as cl95
-- mkCLFromSignificance 0.05
mkCL :: (Ord a, Num a) => a -> CL a
mkCL
  = fromMaybe (error "Statistics.Types.mkCL: probability is out of [0,1] range")
  . mkCLE

-- | Same as 'mkCL' but returns @Nothing@ instead of an error if the
--   parameter is out of the [0,1] range.
--
-- >>> mkCLE 0.95    -- same as cl95
-- Just (mkCLFromSignificance 0.05)
mkCLE :: (Ord a, Num a) => a -> Maybe (CL a)
mkCLE p
  | p >= 0 && p <= 1 = Just $ CL (1 - p)
  | otherwise        = Nothing

-- | Create a confidence level from the probability α, i.e. the
--   probability that the confidence interval does /not/ contain the
--   true value of the estimate.  Will throw an exception if the
--   parameter is out of the [0,1] range.
--
-- >>> mkCLFromSignificance 0.05    -- same as cl95
-- mkCLFromSignificance 0.05
mkCLFromSignificance :: (Ord a, Num a) => a -> CL a
mkCLFromSignificance = fromMaybe (error errMkCL) . mkCLFromSignificanceE

-- | Same as 'mkCLFromSignificance' but returns @Nothing@ instead of an
--   error if the parameter is out of the [0,1] range.
--
-- >>> mkCLFromSignificanceE 0.05    -- same as cl95
-- Just (mkCLFromSignificance 0.05)
mkCLFromSignificanceE :: (Ord a, Num a) => a -> Maybe (CL a)
mkCLFromSignificanceE p
  | p >= 0 && p <= 1 = Just $ CL p
  | otherwise        = Nothing

errMkCL :: String
errMkCL = "Statistics.Types.mkCLFromSignificance: probability is out of [0,1] range"

-- | Get the confidence level.  This function is subject to rounding
--   errors.  If @1 - CL@ is needed, use 'significanceLevel' instead.
confidenceLevel :: (Num a) => CL a -> a
confidenceLevel (CL p) = 1 - p

-- | Get the significance level.
significanceLevel :: CL a -> a
significanceLevel (CL p) = p

-- | 90% confidence level
cl90 :: Fractional a => CL a
cl90 = CL 0.10

-- | 95% confidence level
cl95 :: Fractional a => CL a
cl95 = CL 0.05

-- | 99% confidence level
cl99 :: Fractional a => CL a
cl99 = CL 0.01

----------------------------------------------------------------
-- Data type for p-value
----------------------------------------------------------------

-- | Newtype wrapper for a p-value.
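--
-- For example:
--
-- >>> pValue (mkPValue 0.05)
-- 5.0e-2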
newtype PValue a = PValue a
                   deriving (Eq,Ord, Typeable, Data, Generic)

instance Show a => Show (PValue a) where
  showsPrec n (PValue p) = defaultShow1 "mkPValue" p n
instance (Num a, Ord a, Read a) => Read (PValue a) where
  readPrec = defaultReadPrecM1 "mkPValue" mkPValueE

instance (Binary a, Num a, Ord a) => Binary (PValue a) where
  put (PValue p) = put p
  get = maybe (fail errMkPValue) return . mkPValueE =<< get

instance (ToJSON a) => ToJSON (PValue a)
instance (FromJSON a, Num a, Ord a) => FromJSON (PValue a) where
  parseJSON = maybe (fail errMkPValue) return . mkPValueE <=< parseJSON

instance NFData a => NFData (PValue a) where
  rnf (PValue a) = rnf a

-- | Construct a PValue.  Throws an error if the argument is out of
--   the [0,1] range.
mkPValue :: (Ord a, Num a) => a -> PValue a
mkPValue = fromMaybe (error errMkPValue) . mkPValueE

-- | Construct a PValue.  Returns @Nothing@ if the argument is out of
--   the [0,1] range.
mkPValueE :: (Ord a, Num a) => a -> Maybe (PValue a)
mkPValueE p
  | p >= 0 && p <= 1 = Just $ PValue p
  | otherwise        = Nothing

-- | Get the p-value.
pValue :: PValue a -> a
pValue (PValue p) = p

-- | P-value expressed in sigma.  This is a convention widely used in
--   experimental physics: an N sigma confidence level corresponds to
--   the probability of falling within N sigma of a normal
--   distribution.
--
--   Note that this correspondence holds for the normal distribution
--   only.  Other distributions behave differently, and experimental
--   distributions are usually only approximately normal (especially
--   in the extreme tails).
nSigma :: Double -> PValue Double
nSigma n
  | n > 0     = PValue $ 2 * cumulative standard (-n)
  | otherwise = error "Statistics.Types.nSigma: non-positive number of sigma"

-- | P-value expressed in sigma for a one-tailed hypothesis.  This
--   corresponds to the probability of obtaining a value less than
--   @N·σ@.
nSigma1 :: Double -> PValue Double
nSigma1 n
  | n > 0     = PValue $ cumulative standard (-n)
  | otherwise = error "Statistics.Types.nSigma1: non-positive number of sigma"

-- | Express confidence level in sigmas.
getNSigma :: PValue Double -> Double
getNSigma (PValue p) = negate $ quantile standard (p / 2)

-- | Express confidence level in sigmas for a one-tailed hypothesis.
getNSigma1 :: PValue Double -> Double
getNSigma1 (PValue p) = negate $ quantile standard p

errMkPValue :: String
errMkPValue = "Statistics.Types.mkPValue: probability is out of [0,1] range"

----------------------------------------------------------------
-- Point estimates
----------------------------------------------------------------

-- |
-- A point estimate and its confidence interval.  It's parametrized by
-- both the error type @e@ and the value type @a@.  This module
-- provides two types of error: 'NormalErr' for normally distributed
-- errors and 'ConfInt' for errors specified as a confidence interval.
-- See their documentation for more details.
--
-- For example @144 ± 5@ (assuming normality) could be expressed as
--
-- > Estimate { estPoint = 144
-- >          , estError = NormalErr 5
-- >          }
--
-- Or if we want to express @144 + 6 - 4@ at CL95 we could write:
--
-- > Estimate { estPoint = 144
-- >          , estError = ConfInt
-- >                       { confIntLDX = 4
-- >                       , confIntUDX = 6
-- >                       , confIntCL  = cl95
-- >                       }
--
-- Prior to statistics 0.14 the @Estimate@ data type used the
-- following definition:
--
-- > data Estimate = Estimate {
-- >       estPoint           :: {-# UNPACK #-} !Double
-- >     , estLowerBound      :: {-# UNPACK #-} !Double
-- >     , estUpperBound      :: {-# UNPACK #-} !Double
-- >     , estConfidenceLevel :: {-# UNPACK #-} !Double
-- >     }
--
-- Now the type @Estimate ConfInt Double@ should be used instead.  The
-- function 'estimateFromInterval' allows easy construction of an
-- estimate from the same inputs.
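--
-- For instance (values invented for illustration),
-- @estimateFromInterval 144 (140,150) cl95@ evaluates to
-- @Estimate 144 (ConfInt 4 6 cl95)@, matching the @144 + 6 - 4@
-- example above.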
data Estimate e a = Estimate
    { estPoint :: !a
      -- ^ Point estimate.
    , estError :: !(e a)
      -- ^ Confidence interval for the estimate.
    } deriving (Eq, Read, Show, Generic, Typeable, Data)

instance (Binary (e a), Binary a) => Binary (Estimate e a) where
  get = liftM2 Estimate get get
  put (Estimate ep ee) = put ep >> put ee
instance (FromJSON (e a), FromJSON a) => FromJSON (Estimate e a)
instance (ToJSON (e a), ToJSON a) => ToJSON (Estimate e a)
instance (NFData (e a), NFData a) => NFData (Estimate e a) where
  rnf (Estimate x dx) = rnf x `seq` rnf dx

-- |
-- Normal errors.  They are stored as 1σ errors, which corresponds to
-- a 68.3% CL.  Since we can recalculate them to any confidence level
-- if needed, we don't store the level.
newtype NormalErr a = NormalErr
  { normalError :: a
  }
  deriving (Eq, Read, Show, Typeable, Data, Generic)

instance Binary a => Binary (NormalErr a) where
  get = fmap NormalErr get
  put = put . normalError
instance FromJSON a => FromJSON (NormalErr a)
instance ToJSON a => ToJSON (NormalErr a)
instance NFData a => NFData (NormalErr a) where
  rnf (NormalErr x) = rnf x

-- | Confidence interval.  It is assumed that the confidence region is
--   a single interval, not a set of disjoint intervals.
data ConfInt a = ConfInt
  { confIntLDX :: !a
    -- ^ Lower error estimate, or distance between the point estimate
    --   and the lower bound of the confidence interval.
  , confIntUDX :: !a
    -- ^ Upper error estimate, or distance between the point estimate
    --   and the upper bound of the confidence interval.
  , confIntCL  :: !(CL Double)
    -- ^ Confidence level corresponding to the given confidence
    --   interval.
  }
  deriving (Read,Show,Eq,Typeable,Data,Generic)

instance Binary a => Binary (ConfInt a) where
  get = liftM3 ConfInt get get get
  put (ConfInt l u cl) = put l >> put u >> put cl
instance FromJSON a => FromJSON (ConfInt a)
instance ToJSON a => ToJSON (ConfInt a)
instance NFData a => NFData (ConfInt a) where
  rnf (ConfInt x y _) = rnf x `seq` rnf y

----------------------------------------
-- Constructors

-- | Create an estimate with normal errors.
estimateNormErr :: a            -- ^ Point estimate
                -> a            -- ^ 1σ error
                -> Estimate NormalErr a
estimateNormErr x dx = Estimate x (NormalErr dx)

-- | Synonym for 'estimateNormErr'
(±) :: a                        -- ^ Point estimate
    -> a                        -- ^ 1σ error
    -> Estimate NormalErr a
(±) = estimateNormErr

-- | Create an estimate with asymmetric errors.
estimateFromErr
  :: a          -- ^ Central estimate
  -> (a,a)      -- ^ Lower and upper errors.  Both should be positive
                --   but this is not checked.
  -> CL Double  -- ^ Confidence level for the interval
  -> Estimate ConfInt a
estimateFromErr x (ldx,udx) cl = Estimate x (ConfInt ldx udx cl)

-- | Create an estimate with asymmetric errors.
estimateFromInterval
  :: Num a
  => a          -- ^ Point estimate.  Should lie within the interval
                --   but this is not checked.
  -> (a,a)      -- ^ Lower and upper bounds of the interval
  -> CL Double  -- ^ Confidence level for the interval
  -> Estimate ConfInt a
estimateFromInterval x (lx,ux) cl = Estimate x (ConfInt (x-lx) (ux-x) cl)

----------------------------------------
-- Accessors

-- | Get the confidence interval.
confidenceInterval :: Num a => Estimate ConfInt a -> (a,a)
confidenceInterval (Estimate x (ConfInt ldx udx _))
  = (x - ldx, x + udx)

-- | Get asymmetric errors.
asymErrors :: Estimate ConfInt a -> (a,a)
asymErrors (Estimate _ (ConfInt ldx udx _)) = (ldx,udx)

-- | Data types which can be multiplied by a constant.
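--
-- For example (a small sketch with invented values), scaling a
-- normal-error estimate by -2 negates the centre and keeps the error
-- positive:
--
-- > scale (-2) (estimateNormErr 10 1) == estimateNormErr (-20) 2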
class Scale e where
  scale :: (Ord a, Num a) => a -> e a -> e a

instance Scale NormalErr where
  scale a (NormalErr e) = NormalErr (abs a * e)

instance Scale ConfInt where
  scale a (ConfInt l u cl) | a >= 0    = ConfInt (a*l) (a*u) cl
                           | otherwise = ConfInt (-a*u) (-a*l) cl

instance Scale e => Scale (Estimate e) where
  scale a (Estimate x dx) = Estimate (a*x) (scale a dx)

----------------------------------------------------------------
-- Upper/lower limit
----------------------------------------------------------------

-- | Upper limit.  Upper limits are usually given for small
--   non-negative values when it is not possible to detect a
--   difference from zero.
data UpperLimit a = UpperLimit
  { upperLimit        :: !a
    -- ^ Upper limit
  , ulConfidenceLevel :: !(CL Double)
    -- ^ Confidence level for which the limit was calculated
  }
  deriving (Eq, Read, Show, Typeable, Data, Generic)

instance Binary a => Binary (UpperLimit a) where
  get = liftM2 UpperLimit get get
  put (UpperLimit l cl) = put l >> put cl
instance FromJSON a => FromJSON (UpperLimit a)
instance ToJSON a => ToJSON (UpperLimit a)
instance NFData a => NFData (UpperLimit a) where
  rnf (UpperLimit x cl) = rnf x `seq` rnf cl

-- | Lower limit.  Lower limits are usually given for large quantities
--   when it is not possible to measure them directly; for example,
--   the proton half-life.
data LowerLimit a = LowerLimit
  { lowerLimit        :: !a
    -- ^ Lower limit
  , llConfidenceLevel :: !(CL Double)
    -- ^ Confidence level for which the limit was calculated
  }
  deriving (Eq, Read, Show, Typeable, Data, Generic)

instance Binary a => Binary (LowerLimit a) where
  get = liftM2 LowerLimit get get
  put (LowerLimit l cl) = put l >> put cl
instance FromJSON a => FromJSON (LowerLimit a)
instance ToJSON a => ToJSON (LowerLimit a)
instance NFData a => NFData (LowerLimit a) where
  rnf (LowerLimit x cl) = rnf x `seq` rnf cl

----------------------------------------------------------------
-- Deriving unbox instances
----------------------------------------------------------------

derivingUnbox "CL"
  [t| forall a. Unbox a => CL a -> a |]
  [| \(CL a) -> a |]
  [| CL |]

derivingUnbox "PValue"
  [t| forall a. Unbox a => PValue a -> a |]
  [| \(PValue a) -> a |]
  [| PValue |]

derivingUnbox "Estimate"
  [t| forall a e. (Unbox a, Unbox (e a)) => Estimate e a -> (a, e a) |]
  [| \(Estimate x dx) -> (x,dx) |]
  [| \(x,dx) -> (Estimate x dx) |]

derivingUnbox "NormalErr"
  [t| forall a. Unbox a => NormalErr a -> a |]
  [| \(NormalErr a) -> a |]
  [| NormalErr |]

derivingUnbox "ConfInt"
  [t| forall a. Unbox a => ConfInt a -> (a, a, CL Double) |]
  [| \(ConfInt a b c) -> (a,b,c) |]
  [| \(a,b,c) -> ConfInt a b c |]

derivingUnbox "UpperLimit"
  [t| forall a. Unbox a => UpperLimit a -> (a, CL Double) |]
  [| \(UpperLimit a b) -> (a,b) |]
  [| \(a,b) -> UpperLimit a b |]

derivingUnbox "LowerLimit"
  [t| forall a. Unbox a => LowerLimit a -> (a, CL Double) |]
  [| \(LowerLimit a b) -> (a,b) |]
  [| \(a,b) -> LowerLimit a b |]
statistics-0.16.2.1/Statistics/Types/0000755000000000000000000000000007346545000015560 5ustar0000000000000000statistics-0.16.2.1/Statistics/Types/Internal.hs0000644000000000000000000000114007346545000017666 0ustar0000000000000000-- |
-- Module    : Statistics.Types.Internal
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Types for working with statistics.
module Statistics.Types.Internal where

import qualified Data.Vector.Unboxed as U (Vector)

-- | Sample data.
type Sample = U.Vector Double

-- | Sample with weights.  The first element of each pair is the
--   datum, the second is its weight.
type WeightedSample = U.Vector (Double,Double)

-- | Weights for affecting the importance of elements of a sample.
type Weights = U.Vector Double
statistics-0.16.2.1/benchmark/0000755000000000000000000000000007346545000014254 5ustar0000000000000000statistics-0.16.2.1/benchmark/bench.hs0000644000000000000000000000521607346545000015673 0ustar0000000000000000import Control.Monad.ST (runST)
import Criterion.Main
import Data.Complex
import Statistics.Sample
import Statistics.Transform
import Statistics.Correlation.Pearson
import System.Random.MWC
import qualified Data.Vector.Unboxed as U

-- Test sample
sample :: U.Vector Double
sample = runST $ flip uniformVector 10000 =<< create

-- Weighted test sample
sampleW :: U.Vector (Double,Double)
sampleW = U.zip sample (U.reverse sample)

-- Complex vector for FFT tests
sampleC :: U.Vector (Complex Double)
sampleC = U.zipWith (:+) sample (U.reverse sample)

-- Simple benchmark for functions from Statistics.Sample
main :: IO ()
main = defaultMain
  [ bgroup "sample"
    [ bench "range"            $ nf (\x -> range x)            sample
      -- Mean
    , bench "mean"             $ nf (\x -> mean x)             sample
    , bench "meanWeighted"     $ nf (\x -> meanWeighted x)     sampleW
    , bench "harmonicMean"     $ nf (\x -> harmonicMean x)     sample
    , bench "geometricMean"    $ nf (\x -> geometricMean x)    sample
      -- Variance
    , bench "variance"         $ nf (\x -> variance x)         sample
    , bench "varianceUnbiased" $ nf (\x -> varianceUnbiased x) sample
    , bench "varianceWeighted" $ nf (\x -> varianceWeighted x) sampleW
      -- Correlation
    , bench "pearson"          $ nf (\x -> pearson   (U.reverse sample) x)   sample
    , bench "pearson'"         $ nf (\x -> pearson'  (U.reverse sample) x)   sample
    , bench "pearsonFast"      $ nf (\x -> pearsonFast (U.reverse sample) x) sample
      -- Other
    , bench "stdDev"           $ nf (\x -> stdDev x)           sample
    , bench "skewness"         $ nf (\x -> skewness x)         sample
    , bench "kurtosis"         $ nf (\x -> kurtosis x)         sample
      -- Central moments
    , bench "C.M. 2"           $ nf (\x -> centralMoment 2 x)  sample
    , bench "C.M. 3"           $ nf (\x -> centralMoment 3 x)  sample
    , bench "C.M. 4"           $ nf (\x -> centralMoment 4 x)  sample
    , bench "C.M. 5"           $ nf (\x -> centralMoment 5 x)  sample
    ]
  , bgroup "FFT"
    [ bgroup "fft"
      [ bench (show n) $ whnf fft   (U.take n sampleC) | n <- fftSizes ]
    , bgroup "ifft"
      [ bench (show n) $ whnf ifft  (U.take n sampleC) | n <- fftSizes ]
    , bgroup "dct"
      [ bench (show n) $ whnf dct   (U.take n sample)  | n <- fftSizes ]
    , bgroup "dct_"
      [ bench (show n) $ whnf dct_  (U.take n sampleC) | n <- fftSizes ]
    , bgroup "idct"
      [ bench (show n) $ whnf idct  (U.take n sample)  | n <- fftSizes ]
    , bgroup "idct_"
      [ bench (show n) $ whnf idct_ (U.take n sampleC) | n <- fftSizes ]
    ]
  ]

fftSizes :: [Int]
fftSizes = [32,128,512,2048]
statistics-0.16.2.1/changelog.md0000644000000000000000000002535007346545000014600 0ustar0000000000000000## Changes in 0.16.2.1

 * Unnecessary constraint dropped from `tStatisticsPaired`.

 * Compatibility with QuickCheck-2.14. Test suite doesn't fail every time.

## Changes in 0.16.2.0

 * Improved precision of `complCumulative` for the hypergeometric and
   binomial distributions. Precision improvements for the geometric
   distribution.

 * Negative binomial distribution added.

## Changes in 0.16.1.2

 * Fixed bug in `fromSample` for exponential distribution (#190)

## Changes in 0.16.1.0

 * Dependency on monad-par is dropped. `parMap` from `parallel` is
   used instead.

## Changes in 0.16.0.2

 * Bug in constructor of binomial distribution is fixed (#181). It
   accepted out-of-range probabilities before.
## Changes in 0.16.0.0

 * Random number generation switched to the API introduced in random-1.2

 * Support of GHC<7.10 is dropped

 * Fix for the chi-squared test (#167), which was completely wrong

 * Computation of CDF and quantiles of the Cauchy distribution is now
   numerically stable.

 * Fix loss of precision in computation of the CDF of the gamma
   distribution

 * Log-normal and Weibull distributions added.

 * `DiscreteGen` instance added for `DiscreteUniform`

## Changes in 0.15.2.0

 * Test suite is finally fixed (#42, #123). It took a very, very,
   very long time but it finally happened.

 * Avoid loss of precision when computing CDF for the exponential
   distribution.

 * Avoid loss of precision when computing CDF for the geometric
   distribution. Add complement of CDF.

 * Correctly handle the case of n=0 in poissonCI

## Changes in 0.15.1.1

 * Fix build for GHC8.0 & 7.10

## Changes in 0.15.1.0

 * GHCJS support

 * Concurrent resampling now uses `async` instead of hand-rolled
   primitives

## Changes in 0.15.0.0

 * Modules `Statistics.Matrix.*` are split into the new package
   `dense-linear-algebra` and the exponent field is removed from the
   `Matrix` data type.

 * Module `Statistics.Sample.Normalize`, which contains functions for
   normalization of samples

 * Module `Statistics.Quantile` reworked:

    - `ContParam` given a `Default` instance

    - `quantile` should be used instead of `continuousBy`

    - `median` and `mad` are added

    - `quantiles` and `quantilesVec` functions for computation of
      sets of quantiles added.

 * Modules `Statistics.Function.Comparison` and
   `Statistics.Math.RootFinding` are removed. Corresponding
   functionality can be found in the `math-functions` package.

 * Fix vector index out of bounds in `bootstrapBCA` and
   `bootstrapRegress` (see issue #149)

## Changes in 0.14.0.2

 * Compatibility fixes with older GHC

## Changes in 0.14.0.1

 * Restored compatibility with GHC 7.4 & 7.6

## Changes in 0.14.0.0

Breaking update. It seriously changes parts of the API. It adds new
data types for dealing with estimates, confidence intervals,
confidence levels and p-values. The API for statistical tests is
changed as well.

 * Module `Statistics.Types` now contains new data types for
   estimates, upper/lower bounds, confidence levels, and p-values.

    - `CL` for representing confidence level

    - `PValue` for representing p-values

    - `Estimate` data type moved here from
      `Statistics.Resampling.Bootstrap` and now parametrized by type
      of error.

    - `NormalErr` — represents normal error.

    - `ConfInt` — generic confidence interval

    - `UpperLimit`, `LowerLimit` for upper/lower limits.

 * New API for statistical tests. Instead of simply returning
   significant/not significant, a test returns the p-value, the test
   statistic, and the distribution of the test statistic if it's
   available. Tests also return `Nothing` instead of throwing an
   error if the sample size is not sufficient. Fixes #25.

 * `Statistics.Tests.Types.TestType` data type dropped

 * New smart constructors for distributions are added. They return
   `Nothing` if parameters are outside of the allowed range.

 * Serialization instances (`Show/Read`, `Binary`, `ToJSON/FromJSON`)
   for distributions no longer allow creation of data types with
   invalid parameters. They will fail to parse. Cached values are not
   serialized either, so `Binary` instances changed for the normal
   and F-distributions. Encoding to JSON changed for the normal,
   F-distribution, and χ² distributions. However data created using
   older statistics will be successfully decoded. Fixes #59.

 * Statistics.Resampling.Bootstrap uses the new data types for
   central estimates.
 * Functions for calculation of confidence intervals for the Poisson
   and binomial distributions added in `Statistics.ConfidenceInt`

 * Tests of position now allow asking whether the first sample is on
   average larger than the second, the second larger than the first,
   or whether they differ significantly. Affects the Wilcoxon-T,
   Mann-Whitney-U, and Student-T tests.

 * API for bootstrap changed. New data types added.

 * Bug fixes for #74, #81, #83, #92, #94

 * `complCumulative` added for many distributions.

## Changes in 0.13.3.0

 * Kernel density estimation and FFT use generic versions now.

 * Code for calculation of Spearman and Pearson correlation added.
   Modules `Statistics.Correlation.Spearman` and
   `Statistics.Correlation.Pearson`.

 * Function for calculation of covariance added in
   `Statistics.Sample`.

 * `Statistics.Function.pair` added. It zips vectors and checks that
   their lengths are equal.

 * New functions added to `Statistics.Matrix`

 * Laplace distribution added.

## Changes in 0.13.2.3

 * Vector dependency restored to >=0.10

## Changes in 0.13.2.2

 * Vector dependency lowered to >=0.9

## Changes in 0.13.2.1

 * Vector dependency bumped to >=0.10

## Changes in 0.13.2.0

 * Support for regression bootstrap added

## Changes in 0.13.1.1

 * Fix for out of bound access in bootstrap (see `bos/criterion#52`)

## Changes in 0.13.1.0

 * All types now support JSON encoding and decoding.

## Changes in 0.12.0.0

 * The `Statistics.Math` module has been removed, after being
   deprecated for several years. Use the
   [math-functions](http://hackage.haskell.org/package/math-functions)
   package instead.

 * The `Statistics.Test.NonParametric` module has been removed, after
   being deprecated for several years.

 * Added support for Kendall's tau.

 * Added support for OLS regression.

 * Added basic 2D matrix support.

 * Added the Kruskal-Wallis test.

## Changes in 0.11.0.3

 * Fixed a subtle bug in calculation of the jackknifed unbiased
   variance.

 * The test suite now requires QuickCheck 2.7.

 * We now calculate quantiles for the normal distribution in a more
   numerically stable way (bug #64).

## Changes in 0.10.6.0

 * The Estimator type has become an algebraic data type. This allows
   the jackknife function to potentially use more efficient jackknife
   implementations.

 * jackknifeMean, jackknifeStdDev, jackknifeVariance,
   jackknifeVarianceUnb: new functions. These have O(n) cost instead
   of the O(n^2) cost of the standard jackknife.

 * The mean function has been renamed to welfordMean; a new
   implementation of mean has better numerical accuracy in almost all
   cases.

## Changes in 0.10.5.2

 * histogram correctly chooses the range when all elements in the
   sample are the same (bug #57)

## Changes in 0.10.5.1

 * Bug fix for S.Distributions.Normal.standard introduced in 0.10.5.0
   (Bug #56)

## Changes in 0.10.5.0

 * Entropy type class for distributions is added.

 * Probability and probability density of distributions are given in
   the log domain too.

## Changes in 0.10.4.0

 * Support for versions of GHC older than 7.2 is discontinued.

 * All datatypes now support 'Data.Binary' and 'GHC.Generics'.

## Changes in 0.10.3.0

 * Bug fixes

## Changes in 0.10.2.0

 * Bugs in DCT and IDCT are fixed.

 * Accessors for the uniform distribution are added.

 * ContGen instances for all continuous distributions are added.

 * Beta distribution is added.

 * Constructor for improper gamma distribution is added.

 * Binomial distribution allows zero trials.

 * Poisson distribution now accepts a zero parameter.

 * Integer overflow in calculation of the Wilcoxon-T test is fixed.

 * Bug in 'ContGen' instance for the normal distribution is fixed.
## Changes in 0.10.1.0

 * Kolmogorov-Smirnov nonparametric test added.

 * Pearson's chi-squared test added.

 * Type class for generating random variates for a given distribution
   is added.

 * Modules 'Statistics.Math' and 'Statistics.Constants' are moved to
   the `math-functions` package. They are still available but marked
   as deprecated.

## Changes in 0.10.0.1

 * `dct` and `idct` now have type `Vector Double -> Vector Double`

## Changes in 0.10.0.0

 * The type classes Mean and Variance are split in two. This is
   required for distributions which do not have finite variance or
   mean.

 * The S.Sample.KernelDensity module has been renamed, and completely
   rewritten to be much more robust. The older module oversmoothed
   multi-modal data. (The older module is still available under the
   name S.Sample.KernelDensity.Simple).

 * Histogram computation is added, in S.Sample.Histogram.

 * Discrete Fourier transform is added, in S.Transform

 * Root finding is added, in S.Math.RootFinding.

 * The complCumulative function is added to the Distribution class in
   order to accurately assess probabilities P(X>x) which are used in
   one-tailed tests.

 * A stdDev function is added to the Variance class for distributions.

 * The constructor S.Distribution.normalDistr now takes standard
   deviation instead of variance as its parameter.

 * A bug in S.Quantile.weightedAvg is fixed. It produced a wrong
   answer if a sample contained only one element.

 * Bugs in quantile estimations for chi-square and gamma distributions
   are fixed.

 * Integer overflow in mannWhitneyUCriticalValue is fixed. It produced
   incorrect critical values for moderately large samples. Something
   around 20 for 32-bit machines and 40 for 64-bit ones.

 * A bug in mannWhitneyUSignificant is fixed. If either sample was
   larger than 20, it produced a completely incorrect answer.

 * One- and two-tailed tests in S.Tests.NonParametric are selected
   with sum types instead of Bool.

 * Test results are returned as an enumeration instead of `Bool`.

 * Performance improvements for Mann-Whitney U and Wilcoxon tests.

 * Module `S.Tests.NonParametric` is split into `S.Tests.MannWhitneyU`
   and `S.Tests.WilcoxonT`

 * sortBy is added to S.Function.

 * Mean and variance for the gamma distribution are fixed.

 * Much faster cumulative probability functions for the Poisson and
   hypergeometric distributions.

 * Better density functions for the gamma and Poisson distributions.

 * Student-T, Fisher-Snedecor F-distributions and Cauchy-Lorentz
   distribution are added.

 * The function S.Function.create is removed. Use generateM from the
   vector package instead.
* Function to perform approximate comparison of doubles is added to S.Function.Comparison * Regularized incomplete beta function and its inverse are added to S.Function statistics-0.16.2.1/examples/kde/0000755000000000000000000000000007346545000014703 5ustar0000000000000000statistics-0.16.2.1/examples/kde/KDE.hs0000644000000000000000000000166407346545000015651 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} import Control.Applicative ((<$>)) import Statistics.Sample.KernelDensity (kde) import Text.Hastache (MuType(..), defaultConfig, hastacheFile) import Text.Hastache.Context (mkStrContext) import qualified Data.Attoparsec.ByteString as B import qualified Data.Attoparsec.ByteString.Char8 as A import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.Vector.Unboxed as U import qualified Data.Text.Lazy.IO as TL csv = do B.takeTill A.isEndOfLine (A.double `A.sepBy` A.char ',') `A.sepBy` A.endOfLine main = do waits <- (either error (U.fromList . map last . filter (not.null)) . A.parseOnly csv) <$> B.readFile "data/faithful.csv" let xs = map (\(a,b) -> [a,b]) . U.toList . uncurry U.zip . kde 64 $ waits context "data" = MuVariable . show $ xs s <- hastacheFile defaultConfig "kde.tpl" (mkStrContext context) TL.writeFile "kde.html" s statistics-0.16.2.1/examples/kde/data/0000755000000000000000000000000007346545000015614 5ustar0000000000000000statistics-0.16.2.1/examples/kde/data/faithful.csv0000644000000000000000000000433307346545000020136 0ustar0000000000000000eruption,wait 3.6,79 1.8,54 3.333,74 2.283,62 4.533,85 2.883,55 4.7,88 3.6,85 1.95,51 4.35,85 1.833,54 3.917,84 4.2,78 1.75,47 4.7,83 2.167,52 1.75,62 4.8,84 1.6,52 4.25,79 1.8,51 1.75,47 3.45,78 3.067,69 4.533,74 3.6,83 1.967,55 4.083,76 3.85,78 4.433,79 4.3,73 4.467,77 3.367,66 4.033,80 3.833,74 2.017,52 1.867,48 4.833,80 1.833,59 4.783,90 4.35,80 1.883,58 4.567,84 1.75,58 4.533,73 3.317,83 3.833,64 2.1,53 4.633,82 2,59 4.8,75 4.716,90 1.833,54 4.833,80 1.733,54 4.883,83 3.717,71 1.667,64 4.567,77 4.317,81 2.233,59 4.5,84 1.75,48 4.8,82 1.817,60 4.4,92 4.167,78 4.7,78 2.067,65 4.7,73 4.033,82 1.967,56 4.5,79 4,71 1.983,62 5.067,76 2.017,60 4.567,78 3.883,76 3.6,83 4.133,75 4.333,82 4.1,70 2.633,65 4.067,73 4.933,88 3.95,76 4.517,80 2.167,48 4,86 2.2,60 4.333,90 1.867,50 4.817,78 1.833,63 4.3,72 4.667,84 3.75,75 1.867,51 4.9,82 2.483,62 4.367,88 2.1,49 4.5,83 4.05,81 1.867,47 4.7,84 1.783,52 4.85,86 3.683,81 4.733,75 2.3,59 4.9,89 4.417,79 1.7,59 4.633,81 2.317,50 4.6,85 1.817,59 4.417,87 2.617,53 4.067,69 4.25,77 1.967,56 4.6,88 3.767,81 1.917,45 4.5,82 2.267,55 4.65,90 1.867,45 4.167,83 2.8,56 4.333,89 1.833,46 4.383,82 1.883,51 4.933,86 2.033,53 3.733,79 4.233,81 2.233,60 4.533,82 4.817,77 4.333,76 1.983,59 4.633,80 2.017,49 5.1,96 1.8,53 5.033,77 4,77 2.4,65 4.6,81 3.567,71 4,70 4.5,81 4.083,93 1.8,53 3.967,89 2.2,45 4.15,86 2,58 3.833,78 3.5,66 4.583,76 2.367,63 5,88 1.933,52 4.617,93 1.917,49 2.083,57 4.583,77 3.333,68 4.167,81 4.333,81 4.5,73 2.417,50 4,85 4.167,74 1.883,55 4.583,77 4.25,83 3.767,83 2.033,51 4.433,78 4.083,84 1.833,46 4.417,83 2.183,55 4.8,81 1.833,57 4.8,76 4.1,84 3.966,77 4.233,81 3.5,87 4.366,77 2.25,51 4.667,78 2.1,60 4.35,82 4.133,91 1.867,53 4.6,78 1.783,46 4.367,77 3.85,84 1.933,49 4.5,83 2.383,71 4.7,80 1.867,49 3.833,75 3.417,64 4.233,76 2.4,53 4.8,94 2,55 4.15,76 1.867,50 4.267,82 1.75,54 4.483,75 4,78 4.117,79 4.083,78 4.267,78 3.917,70 4.55,79 4.083,70 2.417,54 4.183,86 2.217,50 4.45,90 1.883,54 1.85,54 4.283,77 3.95,79 2.333,64 4.15,75 2.35,47 
4.933,86 2.9,63 4.583,85 3.833,82 2.083,57 4.367,82 2.133,67 4.35,74 2.2,54 4.45,83 3.567,73 4.5,73 4.15,88 3.817,80 3.917,71 4.45,83 2,56 4.283,79 4.767,78 4.533,84 1.85,58 4.25,83 1.983,43 2.25,60 4.75,75 4.117,81 2.15,46 4.417,90 1.817,46 4.467,74 statistics-0.16.2.1/examples/kde/kde.html0000644000000000000000000000727107346545000016343 0ustar0000000000000000 Kernel density

Kernel density

This is a 64-point kernel density estimate of wait times between eruptions of the Old Faithful geyser.

statistics-0.16.2.1/examples/kde/kde.tpl0000644000000000000000000000221107346545000016163 0ustar0000000000000000 Kernel density

Kernel density

This is a 64-point kernel density estimate of wait times between eruptions of the Old Faithful geyser.

statistics-0.16.2.1/statistics.cabal0000644000000000000000000001307407346545000015505 0ustar0000000000000000name: statistics version: 0.16.2.1 synopsis: A library of statistical types, data, and functions description: This library provides a number of common functions and types useful in statistics. We focus on high performance, numerical robustness, and use of good algorithms. Where possible, we provide references to the statistical literature. . The library's facilities can be divided into four broad categories: . * Working with widely used discrete and continuous probability distributions. (There are dozens of exotic distributions in use; we focus on the most common.) . * Computing with sample data: quantile estimation, kernel density estimation, histograms, bootstrap methods, significance testing, and regression and autocorrelation analysis. . * Random variate generation under several different distributions. . * Common statistical tests for significant differences between samples. license: BSD2 license-file: LICENSE homepage: https://github.com/haskell/statistics bug-reports: https://github.com/haskell/statistics/issues author: Bryan O'Sullivan , Alexey Khudaykov maintainer: Alexey Khudaykov copyright: 2009-2014 Bryan O'Sullivan category: Math, Statistics build-type: Simple cabal-version: >= 1.10 extra-source-files: README.markdown benchmark/bench.hs changelog.md examples/kde/KDE.hs examples/kde/data/faithful.csv examples/kde/kde.html examples/kde/kde.tpl tests/Tests/Math/Tables.hs tests/Tests/Math/gen.py tests/utils/Makefile tests/utils/fftw.c tested-with: GHC ==8.4.4 GHC ==8.6.5 GHC ==8.8.4 GHC ==8.10.7 GHC ==9.0.2 GHC ==9.2.8 GHC ==9.4.6 GHC ==9.6.2 library default-language: Haskell2010 exposed-modules: Statistics.Autocorrelation Statistics.ConfidenceInt Statistics.Correlation Statistics.Correlation.Kendall Statistics.Distribution Statistics.Distribution.Beta Statistics.Distribution.Binomial Statistics.Distribution.CauchyLorentz Statistics.Distribution.ChiSquared Statistics.Distribution.DiscreteUniform Statistics.Distribution.Exponential Statistics.Distribution.FDistribution Statistics.Distribution.Gamma Statistics.Distribution.Geometric Statistics.Distribution.Hypergeometric Statistics.Distribution.Laplace Statistics.Distribution.Lognormal Statistics.Distribution.NegativeBinomial Statistics.Distribution.Normal Statistics.Distribution.Poisson Statistics.Distribution.StudentT Statistics.Distribution.Transform Statistics.Distribution.Uniform Statistics.Distribution.Weibull Statistics.Function Statistics.Quantile Statistics.Regression Statistics.Resampling Statistics.Resampling.Bootstrap Statistics.Sample Statistics.Sample.Internal Statistics.Sample.Histogram Statistics.Sample.KernelDensity Statistics.Sample.KernelDensity.Simple Statistics.Sample.Normalize Statistics.Sample.Powers Statistics.Test.ChiSquared Statistics.Test.KolmogorovSmirnov Statistics.Test.KruskalWallis Statistics.Test.MannWhitneyU -- Statistics.Test.Runs Statistics.Test.StudentT Statistics.Test.Types Statistics.Test.WilcoxonT Statistics.Transform Statistics.Types other-modules: Statistics.Distribution.Poisson.Internal Statistics.Internal Statistics.Test.Internal Statistics.Types.Internal build-depends: base >= 4.9 && < 5 -- , math-functions >= 0.3.4.1 , mwc-random >= 0.15.0.0 , random >= 1.2 -- , aeson >= 0.6.0.0 , async >= 2.2.2 && <2.3 , deepseq >= 1.1.0.2 , binary >= 0.5.1.0 , primitive >= 0.3 , dense-linear-algebra >= 0.1 && <0.2 , parallel >= 3.2.2.0 && <3.3 , vector >= 0.10 , vector-algorithms >= 0.4 , 
vector-th-unbox , vector-binary-instances >= 0.2.1 , data-default-class >= 0.1.2 -- Older GHC if impl(ghc < 7.6) build-depends: ghc-prim ghc-options: -O2 -Wall -fwarn-tabs -funbox-strict-fields test-suite statistics-tests default-language: Haskell2010 type: exitcode-stdio-1.0 hs-source-dirs: tests main-is: tests.hs other-modules: Tests.ApproxEq Tests.Correlation Tests.Distribution Tests.ExactDistribution Tests.Function Tests.Helpers Tests.KDE Tests.Matrix Tests.Matrix.Types Tests.NonParametric Tests.NonParametric.Table Tests.Orphanage Tests.Parametric Tests.Serialization Tests.Transform Tests.Quantile ghc-options: -Wall -threaded -rtsopts -fsimpl-tick-factor=500 build-depends: base , statistics , dense-linear-algebra , QuickCheck >= 2.7.5 , binary , erf , aeson , ieee754 >= 0.7.3 , math-functions , primitive , tasty , tasty-hunit , tasty-quickcheck , tasty-expected-failure , vector , vector-algorithms source-repository head type: git location: https://github.com/haskell/statistics statistics-0.16.2.1/tests/Tests/0000755000000000000000000000000007346545000014566 5ustar0000000000000000statistics-0.16.2.1/tests/Tests/ApproxEq.hs0000644000000000000000000000652607346545000016672 0ustar0000000000000000{-# LANGUAGE FlexibleContexts, FlexibleInstances, TypeFamilies #-} module Tests.ApproxEq ( ApproxEq(..) ) where import Data.Complex (Complex(..), realPart) import Data.List (intersperse) import Data.Maybe (catMaybes) import Numeric.MathFunctions.Constants (m_epsilon) import Statistics.Matrix hiding (map, toList) import Test.QuickCheck import qualified Data.Vector as V import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U import qualified Statistics.Matrix as M class (Eq a, Show a) => ApproxEq a where type Bounds a eq :: Bounds a -> a -> a -> Bool eql :: Bounds a -> a -> a -> Property eql eps a b = counterexample (show a ++ " /=~ " ++ show b) (eq eps a b) (=~) :: a -> a -> Bool (==~) :: a -> a -> Property a ==~ b = counterexample (show a ++ " /=~ " ++ show b) (a =~ b) instance ApproxEq Double where type Bounds Double = Double eq eps a b | a == 0 && b == 0 = True | otherwise = abs (a - b) <= eps * max (abs a) (abs b) (=~) = eq m_epsilon instance ApproxEq (Complex Double) where type Bounds (Complex Double) = Double eq eps a@(ar :+ ai) b@(br :+ bi) | a == 0 && b == 0 = True | otherwise = abs (ar - br) <= eps * d && abs (ai - bi) <= eps * d where d = max (realPart $ abs a) (realPart $ abs b) (=~) = eq m_epsilon instance ApproxEq [Double] where type Bounds [Double] = Double eq eps (x:xs) (y:ys) = eq eps x y && eq eps xs ys eq _ [] [] = True eq _ _ _ = False eql = eqll length id id (=~) = eq m_epsilon (==~) = eql m_epsilon instance ApproxEq (U.Vector Double) where type Bounds (U.Vector Double) = Double eq = eqv (=~) = eq m_epsilon eql = eqlv (==~) = eqlv m_epsilon instance ApproxEq (V.Vector Double) where type Bounds (V.Vector Double) = Double eq = eqv (=~) = eq m_epsilon eql = eqlv (==~) = eqlv m_epsilon instance ApproxEq Matrix where type Bounds Matrix = Double eq eps (Matrix r1 c1 v1) (Matrix r2 c2 v2) = (r1,c1) == (r2,c2) && eq eps v1 v2 (=~) = eq m_epsilon eql eps a b = eqll dimension M.toList (`quotRem` cols a) eps a b (==~) = eql m_epsilon eqv :: (ApproxEq a, G.Vector v Bool, G.Vector v a) => Bounds a -> v a -> v a -> Bool eqv eps a b = G.length a == G.length b && G.and (G.zipWith (eq eps) a b) eqlv :: (ApproxEq [a], G.Vector v a) => Bounds [a] -> v a -> v a -> Property eqlv eps a b = eql eps (G.toList a) (G.toList b) eqll :: (ApproxEq l, ApproxEq a, Show c, Show d, Eq 
d, Bounds l ~ Bounds a) =>
        (l -> d) -> (l -> [a]) -> (Int -> c) -> Bounds l -> l -> l -> Property
eqll dim toList coord eps a b = counterexample fancy $ eq eps a b
  where
    fancy | la /= lb = "size mismatch: " ++ show la ++ " /= " ++ show lb
          | length summary < length full = summary
          | otherwise = full
    summary = concat . intersperse ", " . catMaybes $
              zipWith3 whee (map coord [(0::Int)..]) xs ys
    full | '\n' `elem` sa = sa ++ " /=~\n" ++ sb
         | otherwise      = sa ++ " /=~" ++ sb
    (sa, sb) = (show a, show b)
    (xs, ys) = (toList a, toList b)
    (la, lb) = (dim a, dim b)
    whee i x y | eq eps x y = Nothing
               | otherwise  = Just $ show i ++ ": " ++ show x ++ " /=~ " ++ show y
statistics-0.16.2.1/tests/Tests/Correlation.hs0000644000000000000000000001303107346545000017401 0ustar0000000000000000{-#LANGUAGE BangPatterns #-}

module Tests.Correlation
    ( tests ) where

import Control.Arrow (Arrow(..))
import qualified Data.Vector as V
import Data.Maybe
import Statistics.Correlation
import Statistics.Correlation.Kendall
import Test.Tasty
import Test.Tasty.QuickCheck hiding (sample)
import Test.Tasty.HUnit
import Tests.ApproxEq

----------------------------------------------------------------
-- Tests list
----------------------------------------------------------------

tests :: TestTree
tests = testGroup "Correlation"
    [ testProperty "Pearson correlation"                     testPearson
    , testProperty "Spearman correlation is scale invariant" testSpearmanScale
    , testProperty "Spearman correlation, nonlinear"         testSpearmanNonlinear
    , testProperty "Kendall test -- general"                 testKendall
    , testCase     "Kendall test -- special cases"           testKendallSpecial
    ]

----------------------------------------------------------------
-- Pearson's correlation
----------------------------------------------------------------

testPearson :: [(Double,Double)] -> Property
testPearson sample
  = (length sample > 1 && isJust exact) ==>
    (case exact of
       Just e  -> e ~= fast
       Nothing -> property False
    )
  where
    (~=) = eql 1e-12
    exact = exactPearson $ map (realToFrac *** realToFrac) sample
    fast  = pearson $ V.fromList sample

exactPearson :: [(Rational,Rational)] -> Maybe Double
exactPearson sample
  | varX == 0 || varY == 0 = Nothing
  | otherwise              = Just $ realToFrac cov / sqrt (realToFrac (varX * varY))
  where
    (xs,ys) = unzip sample
    n       = fromIntegral $ length sample
    -- Mean
    muX  = sum xs / n
    muY  = sum ys / n
    -- Mean of squares
    muX2 = sum (map (\x->x*x) xs) / n
    muY2 = sum (map (\x->x*x) ys) / n
    -- Covariance
    cov  = sum (zipWith (*) [x - muX | x<-xs] [y - muY | y<-ys]) / n
    varX = muX2 - muX*muX
    varY = muY2 - muY*muY

----------------------------------------------------------------
-- Spearman's correlation
----------------------------------------------------------------

-- Test that Spearman correlation is scale invariant
testSpearmanScale :: [(Double,Double)] -> Double -> Property
testSpearmanScale xs a
  = and [ length xs > 1   -- Enough to calculate underflow
        , a /= 0
        , not (isNaN c1)
        , not (isNaN c2)
        , not (isNaN c3)
        , not (isNaN c4)
        ]
  ==> ( counterexample (show xs2)
      $ counterexample (show xs3)
      $ counterexample (show xs4)
      $ counterexample (show (c1,c2,c3,c4))
      $ and [ c1 == c4
            , c1 == signum a * c2
            , c1 == signum a * c3
            ]
      )
  where
    xs2 = map ((*a) *** id  ) xs
    xs3 = map (id   *** (*a)) xs
    xs4 = map ((*a) *** (*a)) xs
    c1 = spearman $ V.fromList xs
    c2 = spearman $ V.fromList xs2
    c3 = spearman $ V.fromList xs3
    c4 = spearman $ V.fromList xs4

-- Test that Spearman correlation is invariant under monotonic
-- transformations of the sample
testSpearmanNonlinear :: [(Double,Double)] -> Property
testSpearmanNonlinear sample0
  = and [ length 
sample0 > 1 , not (isNaN c1) , not (isNaN c2) , not (isNaN c3) , not (isNaN c4) ] ==> ( counterexample (show sample0) $ counterexample (show sample1) $ counterexample (show sample2) $ counterexample (show sample3) $ counterexample (show sample4) $ counterexample (show (c1,c2,c3,c4)) $ and [ c1 == c2 , c1 == c3 , c1 == c4 ] ) where -- We need to stretch sample into [-10 .. 10] range to avoid -- problems with under/overflows etc. stretch xs | a == b = xs | otherwise = [ (x - a - 10) * 20 / (a - b) | x <- xs ] where a = minimum xs b = maximum xs sample1 = uncurry zip $ (stretch *** stretch) $ unzip sample0 sample2 = map (exp *** id ) sample1 sample3 = map (id *** exp) sample1 sample4 = map (exp *** exp) sample1 c1 = spearman $ V.fromList sample1 c2 = spearman $ V.fromList sample2 c3 = spearman $ V.fromList sample3 c4 = spearman $ V.fromList sample4 ---------------------------------------------------------------- -- Kendall's correlation ---------------------------------------------------------------- testKendall :: [(Double, Double)] -> Bool testKendall xy | isNaN r1 = isNaN r2 | otherwise = r1 == r2 where r1 = kendallBruteForce xy r2 = kendall $ V.fromList xy testKendallSpecial :: Assertion testKendallSpecial = vs @=? map (\(xs, ys) -> kendall $ V.fromList $ zip xs ys) d where (d, vs) = unzip testData testData :: [(([Double], [Double]), Double)] testData = [ (([1, 2, 3, 1, 2], [1, 2, 1, 5, 2]), -0.375) , (([1, 1, 1, 3, 3], [3, 3, 3, 2, 5]), 0) ] kendallBruteForce :: [(Double, Double)] -> Double kendallBruteForce xy = (n_c - n_d) / sqrt ((n_0 - n_1) * (n_0 - n_2)) where allPairs = f xy (n_c, n_d, n_1, n_2) = foldl g (0,0,0,0) allPairs n_0 = fromIntegral.length $ allPairs g (!nc, !nd, !n1, !n2) ((x1, y1), (x2, y2)) | (x2 - x1) * (y2 - y1) > 0 = (nc+1, nd, n1, n2) | (x2 - x1) * (y2 - y1) < 0 = (nc, nd+1, n1, n2) | otherwise = if x1 == x2 then if y1 == y2 then (nc, nd, n1+1, n2+1) else (nc, nd, n1+1, n2) else (nc, nd, n1, n2+1) f (x:xs) = zip (repeat x) xs ++ f xs f _ = [] statistics-0.16.2.1/tests/Tests/Distribution.hs0000644000000000000000000004273307346545000017612 0ustar0000000000000000{-# LANGUAGE FlexibleInstances, ScopedTypeVariables, ViewPatterns #-} module Tests.Distribution (tests) where import qualified Control.Exception as E import Data.List (find) import Data.Typeable (Typeable) import Data.Word import Numeric.MathFunctions.Constants (m_tiny,m_huge,m_epsilon) import Numeric.MathFunctions.Comparison import Statistics.Distribution import Statistics.Distribution.Beta (BetaDistribution) import Statistics.Distribution.Binomial (BinomialDistribution) import Statistics.Distribution.CauchyLorentz import Statistics.Distribution.ChiSquared (ChiSquared) import Statistics.Distribution.Exponential (ExponentialDistribution) import Statistics.Distribution.FDistribution (FDistribution,fDistribution) import Statistics.Distribution.Gamma (GammaDistribution,gammaDistr) import Statistics.Distribution.Geometric import Statistics.Distribution.Hypergeometric import Statistics.Distribution.Laplace (LaplaceDistribution) import Statistics.Distribution.Lognormal (LognormalDistribution) import Statistics.Distribution.NegativeBinomial (NegativeBinomialDistribution) import Statistics.Distribution.Normal (NormalDistribution) import Statistics.Distribution.Poisson (PoissonDistribution) import Statistics.Distribution.StudentT import Statistics.Distribution.Transform (LinearTransform) import Statistics.Distribution.Uniform (UniformDistribution) import Statistics.Distribution.Weibull (WeibullDistribution) import 
Statistics.Distribution.DiscreteUniform (DiscreteUniform) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) import Test.Tasty.ExpectedFailure (ignoreTest) import Test.QuickCheck as QC import Test.QuickCheck.Monadic as QC import Text.Printf (printf) import Tests.ApproxEq (ApproxEq(..)) import Tests.ExactDistribution (exactDistributionTests) import Tests.Helpers (T(..), Double01(..), testAssertion, typeName) import Tests.Helpers (monotonicallyIncreasesIEEE,isDenorm) import Tests.Orphanage () -- | Tests for all distributions tests :: TestTree tests = testGroup "Tests for all distributions" [ contDistrTests (T :: T BetaDistribution ) , contDistrTests (T :: T CauchyDistribution ) , contDistrTests (T :: T ChiSquared ) , contDistrTests (T :: T ExponentialDistribution ) , contDistrTests (T :: T GammaDistribution ) , contDistrTests (T :: T LaplaceDistribution ) , contDistrTests (T :: T LognormalDistribution ) , contDistrTests (T :: T NormalDistribution ) , contDistrTests (T :: T UniformDistribution ) , contDistrTests (T :: T WeibullDistribution ) , contDistrTests (T :: T StudentT ) , contDistrTests (T :: T (LinearTransform NormalDistribution)) , contDistrTests (T :: T FDistribution ) , discreteDistrTests (T :: T BinomialDistribution ) , discreteDistrTests (T :: T GeometricDistribution ) , discreteDistrTests (T :: T GeometricDistribution0 ) , discreteDistrTests (T :: T HypergeometricDistribution ) , discreteDistrTests (T :: T NegativeBinomialDistribution ) , discreteDistrTests (T :: T PoissonDistribution ) , discreteDistrTests (T :: T DiscreteUniform ) , exactDistributionTests , unitTests ] ---------------------------------------------------------------- -- Tests ---------------------------------------------------------------- -- Tests for continuous distribution contDistrTests :: (Param d, ContDistr d, QC.Arbitrary d, Typeable d, Show d) => T d -> TestTree contDistrTests t = testGroup ("Tests for: " ++ typeName t) $ cdfTests t ++ [ testProperty "PDF sanity" $ pdfSanityCheck t , (if quantileIsInvCDF_enabled t then id else ignoreTest) $ testProperty "Quantile is CDF inverse" $ quantileIsInvCDF t , testProperty "quantile fails p<0||p>1" $ quantileShouldFail t , testProperty "log density check" $ logDensityCheck t , testProperty "complQuantile" $ complQuantileCheck t ] -- Tests for discrete distribution discreteDistrTests :: (Param d, DiscreteDistr d, QC.Arbitrary d, Typeable d, Show d) => T d -> TestTree discreteDistrTests t = testGroup ("Tests for: " ++ typeName t) $ cdfTests t ++ [ testProperty "Prob. sanity" $ probSanityCheck t , testProperty "CDF is sum of prob." $ discreteCDFcorrect t , testProperty "Discrete CDF is OK" $ cdfDiscreteIsCorrect t , testProperty "log probability check" $ logProbabilityCheck t ] -- Tests for distributions which have CDF cdfTests :: (Param d, Distribution d, QC.Arbitrary d, Show d) => T d -> [TestTree] cdfTests t = [ testProperty "C.D.F. 
sanity" $ cdfSanityCheck t , testProperty "CDF limit at +inf" $ cdfLimitAtPosInfinity t , (if cdfLimitAtNegInfinity_enabled t then id else ignoreTest) $ testProperty "CDF limit at -inf" $ cdfLimitAtNegInfinity t , testProperty "CDF at +inf = 1" $ cdfAtPosInfinity t , testProperty "CDF at -inf = 1" $ cdfAtNegInfinity t , testProperty "CDF is nondecreasing" $ cdfIsNondecreasing t , testProperty "1-CDF is correct" $ cdfComplementIsCorrect t ] ---------------------------------------------------------------- -- CDF is in [0,1] range cdfSanityCheck :: (Distribution d) => T d -> d -> Double -> Bool cdfSanityCheck _ d x = c >= 0 && c <= 1 where c = cumulative d x -- CDF never decreases cdfIsNondecreasing :: (Distribution d) => T d -> d -> Double -> Double -> Bool cdfIsNondecreasing _ d = monotonicallyIncreasesIEEE $ cumulative d -- cumulative d +∞ = 1 cdfAtPosInfinity :: (Distribution d) => T d -> d -> Bool cdfAtPosInfinity _ d = cumulative d (1/0) == 1 -- cumulative d - ∞ = 0 cdfAtNegInfinity :: (Distribution d) => T d -> d -> Bool cdfAtNegInfinity _ d = cumulative d (-1/0) == 0 -- CDF limit at +∞ is 1 cdfLimitAtPosInfinity :: (Param d, Distribution d) => T d -> d -> Bool cdfLimitAtPosInfinity _ d = Just 1.0 == find (>=1) probs where probs = map (cumulative d) $ takeWhile (< (m_huge/2)) $ iterate (*1.4) 1 -- CDF limit at -∞ is 0 cdfLimitAtNegInfinity :: (Param d, Distribution d) => T d -> d -> Bool cdfLimitAtNegInfinity _ d = Just 0 == find (<=0) probs where probs = map (cumulative d) $ takeWhile (> (-m_huge/2)) $ iterate (*1.4) (-1) -- CDF's complement is implemented correctly cdfComplementIsCorrect :: (Distribution d, Param d) => T d -> d -> Double -> Property cdfComplementIsCorrect _ d x = counterexample ("err. tolerance = " ++ show tol) $ counterexample ("difference = " ++ show delta) $ delta <= tol where tol = prec_complementCDF d delta = 1 - (cumulative d x + complCumulative d x) -- CDF for discrete distribution uses <= for comparison cdfDiscreteIsCorrect :: (Param d, DiscreteDistr d) => T d -> d -> Property cdfDiscreteIsCorrect _ d = counterexample (unlines badN) $ null badN where -- We are checking that: -- -- > CDF(i) - CDF(i-e) = P(i) -- -- Approximate equality is tricky here. Scale is set by maximum -- value of CDF and probability. Case when all probabilities are -- zero should be treated specially. badN = [ printf "N=%3i p[i]=%g\tp[i+1]=%g\tdP=%g\trelerr=%g" i p p1 dp ((p1-p-dp) / max p1 dp) | i <- [0 .. 
100] , let p = cumulative d $ fromIntegral i - 1e-6 p1 = cumulative d $ fromIntegral i dp = probability d i relerr = ((p1 - p) - dp) / max p1 dp , p > m_tiny || p == 0 , p1 > m_tiny , dp > m_tiny , relerr > tol ] tol = prec_discreteCDF d logDensityCheck :: (Param d, ContDistr d) => T d -> d -> Double -> Property logDensityCheck _ d x = not (isDenorm x) ==> ( counterexample (printf "density = %g" p) $ counterexample (printf "logDensity = %g" logP) $ counterexample (printf "log p = %g" (log p)) $ counterexample (printf "ulps[log] = %i" ulpsLog) $ counterexample (printf "ulps[lin] = %i" ulpsLin) $ or [ p == 0 && logP == (-1/0) , p <= m_tiny && logP < log m_tiny -- To avoid problems with roundtripping error in case -- when density is computed as exponent of logDensity we -- accept either inequality , (ulpsLog <= n) || (ulpsLin <= n) ]) where p = density d x logP = logDensity d x n = prec_logDensity d ulpsLog = ulpDistance (log p) logP ulpsLin = ulpDistance p (exp logP) -- PDF is positive pdfSanityCheck :: (ContDistr d) => T d -> d -> Double -> Bool pdfSanityCheck _ d x = p >= 0 where p = density d x complQuantileCheck :: (ContDistr d) => T d -> d -> Double01 -> Property complQuantileCheck _ d (Double01 p) = counterexample (printf "x0 = %g" x0) $ counterexample (printf "x1 = %g" x1) $ counterexample (printf "abs err = %g" $ abs (x1 - x0)) $ counterexample (printf "rel err = %g" $ relativeError x1 x0) -- We avoid extreme tails of distributions -- -- FIXME: all parameters are arbitrary at the moment $ and [ p > 0.01 , p < 0.99 , not $ isInfinite x0 , not $ isInfinite x1 ] ==> (if x0 < 1e6 then abs (x1 - x0) < 1e-6 else relativeError x1 x0 < 1e-12) where x0 = quantile d (1 - p) x1 = complQuantile d p -- Quantile is inverse of CDF quantileIsInvCDF :: (Param d, ContDistr d) => T d -> d -> Double01 -> Property quantileIsInvCDF _ d (Double01 p) = and [ p > m_tiny , p < 1 , x > m_tiny , dens > 0 ] ==> ( counterexample (printf "Quantile = %g" x ) $ counterexample (printf "Probability = %g" p ) $ counterexample (printf "Probability' = %g" p') $ counterexample (printf "Rel. error = %g" (relativeError p p')) $ counterexample (printf "Abs. error = %e" (abs $ p - p')) $ counterexample (printf "Expected err. = %g" err) $ counterexample (printf "Distance = %i" (ulpDistance p p')) $ counterexample (printf "Err/est = %g" (fromIntegral (ulpDistance p p') / err)) $ ulpDistance p p' <= round err ) where -- Algorithm for error estimation is taken from here -- -- http://sepulcarium.org/posts/2012-07-19-rounding_effect_on_inverse.html dens = density d x err = eps + eps' * abs (x / p) * dens -- x = quantile d p p' = cumulative d x (eps,eps') = prec_quantile_CDF d -- Test that quantile fails if p<0 or p>1 quantileShouldFail :: (ContDistr d) => T d -> d -> Double -> Property quantileShouldFail _ d p = p < 0 || p > 1 ==> QC.monadicIO $ do r <- QC.run $ E.catch (False <$ (return $! quantile d p)) (\(_ :: E.SomeException) -> return True) QC.assert r -- Probability is in [0,1] range probSanityCheck :: (DiscreteDistr d) => T d -> d -> Int -> Bool probSanityCheck _ d x = p >= 0 && p <= 1 where p = probability d x -- Check that discrete CDF is correct discreteCDFcorrect :: (DiscreteDistr d) => T d -> d -> Int -> Int -> Property discreteCDFcorrect _ d a b = counterexample (printf "CDF = %g" p1) $ counterexample (printf "Sum = %g" p2) $ counterexample (printf "Delta = %g" (abs (p1 - p2))) $ abs (p1 - p2) < 3e-10 -- Avoid too large differences. 
Otherwise there is too much to sum -- -- Absolute difference is used to guard against precision loss when -- close values of CDF are subtracted where n = min a b m = n + (abs (a - b) `mod` 100) p1 = cumulative d (fromIntegral m + 0.5) - cumulative d (fromIntegral n - 0.5) p2 = sum $ map (probability d) [n .. m] logProbabilityCheck :: (Param d, DiscreteDistr d) => T d -> d -> Int -> Property logProbabilityCheck _ d x = counterexample (printf "probability = %g" p) $ counterexample (printf "logProbability = %g" logP) $ counterexample (printf "log p = %g" (log p)) $ counterexample (printf "ulps[log] = %i" ulpsLog) $ counterexample (printf "ulps[lin] = %i" ulpsLin) $ or [ p == 0 && logP == (-1/0) , p < 1e-308 && logP < -609 -- To avoid problems with roundtripping error in case -- when density is computed as exponent of logDensity we -- accept either inequality , (ulpsLog <= n) || (ulpsLin <= n) ] where p = probability d x logP = logProbability d x n = prec_logDensity d ulpsLog = ulpDistance (log p) logP ulpsLin = ulpDistance p (exp logP) -- | Parameters for distribution testing. Some distributions require -- relaxing parameters a bit class Param a where -- | Whether quantileIsInvCDF is enabled quantileIsInvCDF_enabled :: T a -> Bool quantileIsInvCDF_enabled _ = True -- | Whether cdfLimitAtNegInfinity is enabled cdfLimitAtNegInfinity_enabled :: T a -> Bool cdfLimitAtNegInfinity_enabled _ = True -- | Precision for 'quantileIsInvCDF' test prec_quantile_CDF :: a -> (Double,Double) prec_quantile_CDF _ = (16,16) -- | Precision for 'cdfDiscreteIsCorrect' test prec_discreteCDF :: a -> Double prec_discreteCDF _ = 32 * m_epsilon -- | Precision of CDF's complement prec_complementCDF :: a -> Double prec_complementCDF _ = 1e-14 -- | Precision for logDensity check prec_logDensity :: a -> Word64 prec_logDensity _ = 32 instance Param StudentT where -- FIXME: disabled unless incompleteBeta troubles are sorted out quantileIsInvCDF_enabled _ = False instance Param BetaDistribution where -- FIXME: See https://github.com/bos/statistics/issues/161 for details quantileIsInvCDF_enabled _ = False instance Param FDistribution where -- FIXME: disabled unless incompleteBeta troubles are sorted out quantileIsInvCDF_enabled _ = False -- We compute CDF and complement using the same method so precision -- should be very good here. prec_complementCDF _ = 64 * m_epsilon instance Param ChiSquared where prec_quantile_CDF _ = (32,32) instance Param BinomialDistribution where prec_discreteCDF _ = 1e-12 prec_logDensity _ = 48 instance Param CauchyDistribution where -- Distribution is long-tailed enough that we may never get to zero cdfLimitAtNegInfinity_enabled _ = False instance Param DiscreteUniform instance Param ExponentialDistribution instance Param GammaDistribution where -- We lose precision near `incompleteGamma 10` because of error -- introduced by exp . logGamma.
This could only be fixed in -- math-function by implementing gamma prec_quantile_CDF _ = (24,24) prec_logDensity _ = 512 instance Param GeometricDistribution instance Param GeometricDistribution0 instance Param HypergeometricDistribution instance Param LaplaceDistribution instance Param LognormalDistribution where prec_quantile_CDF _ = (64,64) instance Param NegativeBinomialDistribution where prec_discreteCDF _ = 1e-12 prec_logDensity _ = 48 instance Param NormalDistribution instance Param PoissonDistribution instance Param UniformDistribution instance Param WeibullDistribution instance Param a => Param (LinearTransform a) ---------------------------------------------------------------- -- Unit tests ---------------------------------------------------------------- unitTests :: TestTree unitTests = testGroup "Unit tests" [ testAssertion "density (gammaDistr 150 1/150) 1 == 4.883311" $ 4.883311418525483 =~ density (gammaDistr 150 (1/150)) 1 -- Student-T , testStudentPDF 0.3 1.34 0.0648215 -- PDF , testStudentPDF 1 0.42 0.27058 , testStudentPDF 4.4 0.33 0.352994 , testStudentCDF 0.3 3.34 0.757146 -- CDF , testStudentCDF 1 0.42 0.626569 , testStudentCDF 4.4 0.33 0.621739 -- Student-T General , testStudentUnstandardizedPDF 0.3 1.2 4 0.45 0.0533456 -- PDF , testStudentUnstandardizedPDF 4.3 (-2.4) 3.22 (-0.6) 0.0971141 , testStudentUnstandardizedPDF 3.8 0.22 7.62 0.14 0.0490523 , testStudentUnstandardizedCDF 0.3 1.2 4 0.45 0.458035 -- CDF , testStudentUnstandardizedCDF 4.3 (-2.4) 3.22 (-0.6) 0.698001 , testStudentUnstandardizedCDF 3.8 0.22 7.62 0.14 0.496076 -- F-distribution , testFdistrPDF 1 3 3 (1/(6 * pi)) -- PDF , testFdistrPDF 2 2 1.2 0.206612 , testFdistrPDF 10 12 8 0.000385613179281892790166 , testFdistrCDF 1 3 3 0.81830988618379067153 -- CDF , testFdistrCDF 2 2 1.2 0.545455 , testFdistrCDF 10 12 8 0.99935509863451408041 ] where -- Student-T testStudentPDF ndf x exact = testAssertion (printf "density (studentT %f) %f ~ %f" ndf x exact) $ eq 1e-5 exact (density (studentT ndf) x) testStudentCDF ndf x exact = testAssertion (printf "cumulative (studentT %f) %f ~ %f" ndf x exact) $ eq 1e-5 exact (cumulative (studentT ndf) x) -- Student-T General testStudentUnstandardizedPDF ndf mu sigma x exact = testAssertion (printf "density (studentTUnstandardized %f %f %f) %f ~ %f" ndf mu sigma x exact) $ eq 1e-5 exact (density (studentTUnstandardized ndf mu sigma) x) testStudentUnstandardizedCDF ndf mu sigma x exact = testAssertion (printf "cumulative (studentTUnstandardized %f %f %f) %f ~ %f" ndf mu sigma x exact) $ eq 1e-5 exact (cumulative (studentTUnstandardized ndf mu sigma) x) -- F-distribution testFdistrPDF n m x exact = testAssertion (printf "density (fDistribution %i %i) %f ~ %f [got %f]" n m x exact d) $ eq 1e-5 exact d where d = density (fDistribution n m) x testFdistrCDF n m x exact = testAssertion (printf "cumulative (fDistribution %i %i) %f ~ %f [got %f]" n m x exact d) $ eq 1e-5 exact d where d = cumulative (fDistribution n m) x statistics-0.16.2.1/tests/Tests/ExactDistribution.hs0000644000000000000000000003421407346545000020572 0ustar0000000000000000{-# LANGUAGE BangPatterns, FlexibleInstances, FlexibleContexts, ScopedTypeVariables #-} -- | -- Module : Tests.ExactDistribution -- Copyright : (c) 2022 Lorenz Minder -- License : BSD3 -- -- Maintainer : lminder@gmx.net -- Stability : experimental -- Portability : portable -- -- Tests comparing distributions to exact versions. 
-- -- This module provides exact versions of some distributions, and tests -- to compare them to the production implementations in -- Statistics.Distribution.*. It also contains the functionality to -- test the production distributions against the exact versions. Errors -- are flagged if data points are discovered where the probability mass -- function, the cumulative probability function, or its complement -- deviates too far (more than a prescribed tolerance) from the exact -- calculation. -- -- The distributions here are implemented with rational integer -- arithmetic, using pretty much the textbook formulas. -- Numerical problems like overflow or rounding errors cannot occur with -- this approach, making them easy to write, read and verify. They -- are, of course, substantially slower than the production -- distributions in Statistics.Distribution.*. This makes them -- unsuitable for most uses other than testing and debugging. (Also, -- only a handful of distributions can be implemented exactly with -- rational arithmetic.) -- -- This module has the following sub-components: -- -- * Exact (rational) definitions of some distribution functions, -- including both the probability mass as well as the CDF. -- -- * QC.Arbitrary implementations to sample test cases (i.e., -- distribution parameters and evaluation points). -- -- * "Linkage": a mechanism to construct a production distribution -- corresponding to a test case for an exact distribution. -- -- * A set of tests for the distributions derived using all of the above -- components. -- -- This module exports a number of symbols which can be useful for -- debugging and experimentation. For use in a test suite, only the -- `exactDistributionTests` function is needed. module Tests.ExactDistribution ( -- * Exact math functions exactChoose -- * Exact distributions , ExactDiscreteDistr(..) , ExactBinomialDistr(..) , ExactDiscreteUniformDistr(..) , ExactGeometricDistr(..) , ExactHypergeomDistr(..) -- * Linking to production distributions , ProductionProbFuncs(..) , productionProbFuncs , ProductionLinkage -- * Individual test routines , pmfMatch , cdfMatch , complCdfMatch -- * Test groups , Tag(..) , distTests , exactDistributionTests ) where ---------------------------------------------------------------- import Data.Foldable import Data.Ratio import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) import Test.QuickCheck as QC import Numeric.MathFunctions.Comparison (relativeError) import Statistics.Distribution import Statistics.Distribution.Binomial import Statistics.Distribution.DiscreteUniform import Statistics.Distribution.Geometric import Statistics.Distribution.Hypergeometric ---------------------------------------------------------------- -- -- Math functions. -- -- Used for implementing the distributions below. -- ---------------------------------------------------------------- -- | Exactly compute binomial coefficient. -- -- /n/ need not be an integer; it can be fractional. exactChoose :: Ratio Integer -> Integer -> Ratio Integer exactChoose n k | k < 0 = 0 | otherwise = foldl' (*) 1 factors where factors = [ (n - k' + j) / j | j <- [1..k'] ] k' = fromInteger k :: Ratio Integer ---------------------------------------------------------------- -- -- Exact distributions. -- ---------------------------------------------------------------- -- | Exact discrete distribution. class ExactDiscreteDistr a where -- | Probability mass function.
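-- -- A default method is provided (it is exactly the definition shown right below): the probability mass at @x@ is recovered as the difference of the CDF at @x@ and at @x - 1@, so instances only need to define 'exactCumulative'.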
exactProb :: a -> Integer -> Ratio Integer exactProb d x = exactCumulative d x - exactCumulative d (x - 1) -- | Cumulative distribution function. exactCumulative :: a -> Integer -> Ratio Integer -- | Exact Binomial distribution. data ExactBinomialDistr = ExactBD Integer (Ratio Integer) deriving(Show) instance ExactDiscreteDistr ExactBinomialDistr where -- Probability mass, computed with textbook formula. exactProb (ExactBD n p) k | k < 0 || k > n = 0 | otherwise = exactChoose n' k * p^k * (1-p)^(n-k) where n' = fromIntegral n -- CDF -- -- Computed iteratively by summing up all the probabilities -- <= /k/. Rather than computing everything from scratch for each -- probability, we reuse previous results. The meanings of the -- variables in the "update" function are: -- -- bc is the binomial coefficient (n choose j), -- pj is the term p^j, -- pnj is the term (1 - p)^(n - j) -- r is the (partial) sum of the probabilities -- exactCumulative (ExactBD n p) k | k < 0 = 0 | k >= n = 1 -- Special case for p = 1, since in the below fold we -- divide by (1 - p). | p == 1 = if k == n then 1 else 0 | otherwise = result $ foldl' update (1, 1, (1 - p)^n, (1 - p)^n) [1..k] where update (!bc, !pj, !pnj, !r) !j = let bc' = bc * (n - j + 1) `div` j pj' = pj * p pnj' = pnj / (1 - p) r' = r + (fromIntegral bc') * pj' * pnj' in (bc', pj', pnj', r') result (_, _, _, r) = r -- | Exact Discrete Uniform distribution. data ExactDiscreteUniformDistr = ExactDU Integer Integer deriving(Show) instance ExactDiscreteDistr ExactDiscreteUniformDistr where exactProb (ExactDU lower upper) k | k < lower || k > upper = 0 | otherwise = 1 % (upper - lower + 1) exactCumulative (ExactDU lower upper) k | k < lower = 0 | k > upper = 1 | otherwise = let d = (k - lower + 1) in d % (upper - lower + 1) -- | Geometric distribution. data ExactGeometricDistr = ExactGeom (Ratio Integer) deriving(Show) instance ExactDiscreteDistr ExactGeometricDistr where exactProb (ExactGeom p) k | k < 1 = 0 | otherwise = (1 - p)^(k - 1) * p exactCumulative (ExactGeom p) k = 1 - (1 - p)^k -- | Hypergeometric distribution. -- -- Parameters are /K/, /N/ and /n/, where: -- - /N/ is the total sample space size. -- - /K/ is number of "good" objects among /N/. -- - /n/ is the number of draws without replacement. data ExactHypergeomDistr = ExactHG Integer Integer Integer deriving(Show) instance ExactDiscreteDistr ExactHypergeomDistr where exactProb (ExactHG nK nN n) k | k < 0 = 0 | k > n || k > nN = 0 | otherwise = exactChoose nK' k * exactChoose (nN' - nK') (n - k) / exactChoose nN' n where nN' = fromIntegral nN nK' = fromIntegral nK exactCumulative d k = sum [ exactProb d i | i <- [0..k] ] ---------------------------------------------------------------- -- -- TestCase construction. -- -- Contains the TestCase data type which encapsulates an instance of an -- exact distribution together with an evaluation point. -- -- Then in contains the QC.Arbitrary implementations for TestCases of -- the different exact distributions. As a general rule, we try the -- sampling to be relatively efficient, i.e., we only want to sample -- valid distribution parameters. The evaluation points are sampled -- such that most points are within the support of the distribution. -- ---------------------------------------------------------------- -- Divisor to compute a rational number from an integer. -- -- We want input parameters to be exactly representable as -- Double values. 
This is so that the production distribution does not -- mismatch the exact one simply because the input values don't exactly -- match. (This can happen if the derivative of the distribution -- function is large.) For this reason, the gd value needs to be a -- power of 2, and <= 2^53, since the mantissa of a Double is 53 bits. -- -- A value of 2^53 gives the most accurate and diverse tests, but the -- cost is increased running times, as the computed numerators and -- denominators will become quite large. gd :: Integer gd = 2^(16 :: Int) -- TestCase -- -- Combination of an exact distribution together with an evaluation point. data TestCase a = TestCase a Integer deriving (Show) instance QC.Arbitrary (TestCase ExactBinomialDistr) where arbitrary = do -- This somewhat odd sampling of /n/ is done so that lower -- values (<1000) are more often represented as the larger ones. n <- (*) <$> chooseInteger (1,1000) <*> chooseInteger(1,2) p <- (% gd) <$> chooseInteger (0, gd) k <- chooseInteger (-1, n + 1) return $ TestCase (ExactBD n p) k shrink _ = [] instance QC.Arbitrary (TestCase ExactDiscreteUniformDistr) where arbitrary = do a <- chooseInteger (-1000, 1000) sz <- chooseInteger (1, 1000) let b = a + sz k <- chooseInteger (a - 10, b + 10) return $ TestCase (ExactDU a b) k shrink _ = [] instance QC.Arbitrary (TestCase ExactGeometricDistr) where arbitrary = do p <- (% gd) <$> chooseInteger (1, gd) let lim = (floor $ 100 / p) :: Integer k <- chooseInteger (0, lim) return $ TestCase (ExactGeom p) k shrink _ = [] instance QC.Arbitrary (TestCase ExactHypergeomDistr) where arbitrary = do nN <- chooseInteger (1, 100) -- XXX lower bound should be 0 nK <- chooseInteger (0, nN) n <- chooseInteger (1, nN) -- XXX lower bound should be 0 k <- chooseInteger (0, min n nK) return $ TestCase (ExactHG nK nN n) k shrink _ = [] ---------------------------------------------------------------- -- -- Linking to the production distributions -- -- This section contains the ProductionLinkage typeclass and -- implementation, that allows to obtain a functions for evaluating -- the production distribution functions for a corresponding exact -- distribution. -- ---------------------------------------------------------------- -- | Distribution evaluation functions. 
-- -- This is used to store the probability functions of the production -- distribution that corresponds to an exact distribution. data ProductionProbFuncs = ProductionProbFuncs { prodProb :: Int -> Double , prodCumulative :: Double -> Double , prodComplCumulative :: Double -> Double } productionProbFuncs :: (DiscreteDistr a) => a -> ProductionProbFuncs productionProbFuncs d = ProductionProbFuncs { prodProb = probability d , prodCumulative = cumulative d , prodComplCumulative = complCumulative d } class (ExactDiscreteDistr a) => ProductionLinkage a where productionLinkage :: a -> ProductionProbFuncs instance ProductionLinkage ExactBinomialDistr where productionLinkage (ExactBD n p) = let d = binomial (fromIntegral n) (fromRational p) in productionProbFuncs d instance ProductionLinkage ExactDiscreteUniformDistr where productionLinkage (ExactDU lower upper) = let d = discreteUniformAB (fromIntegral lower) (fromIntegral upper) in productionProbFuncs d instance ProductionLinkage ExactGeometricDistr where productionLinkage (ExactGeom p) = let d = geometric $ fromRational p in productionProbFuncs d instance ProductionLinkage ExactHypergeomDistr where productionLinkage (ExactHG nK nN n) = let d = hypergeometric (fromIntegral nK) (fromIntegral nN) (fromIntegral n) in productionProbFuncs d ---------------------------------------------------------------- -- Tests ---------------------------------------------------------------- -- Check production probability mass function accuracy. -- -- Inputs: tolerance (max relative error) and test case. pmfMatch :: (Show a, ProductionLinkage a) => Double -> TestCase a -> Bool pmfMatch tol (TestCase dExact k) = let dProd = productionLinkage dExact pe = fromRational $ exactProb dExact k pa = prodProb dProd k' k' = fromIntegral k in relativeError pe pa < tol -- Check production cumulative probability function accuracy. -- -- Inputs: tolerance (max relative error) and test case. cdfMatch :: (Show a, ProductionLinkage a) => Double -> TestCase a -> Bool cdfMatch tol (TestCase dExact k) = let dProd = productionLinkage dExact pe = fromRational $ exactCumulative dExact k pa = prodCumulative dProd k' k' = fromIntegral k in relativeError pe pa < tol -- Check production complement cumulative function accuracy. -- -- Inputs: tolerance (max relative error) and test case. complCdfMatch :: (Show a, ProductionLinkage a) => Double -> TestCase a -> Bool complCdfMatch tol (TestCase dExact k) = let dProd = productionLinkage dExact pe = fromRational $ 1 - exactCumulative dExact k pa = prodComplCumulative dProd k' k' = fromIntegral k in relativeError pe pa < tol -- Phantom type to encode an exact distribution.
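-- -- As a usage sketch (this mirrors the calls made in 'exactDistributionTests' at the end of this module; no new names are assumed): -- -- > distTests (Tag :: Tag ExactBinomialDistr) "Binomial" 1.0e-12 -- -- The type annotation on the 'Tag' picks the exact distribution and, via 'ProductionLinkage', the production distribution it is checked against.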
data Tag a = Tag distTests :: (Show a, ProductionLinkage a, Arbitrary (TestCase a)) => Tag a -> String -> Double -> TestTree distTests (Tag :: Tag a) name tol = testGroup ("Exact tests for " ++ name) [ testProperty "PMF match" $ ((pmfMatch tol) :: TestCase a -> Bool) , testProperty "CDF match" $ ((cdfMatch tol) :: TestCase a -> Bool) , testProperty "1 - CDF match" $ ((complCdfMatch tol) :: TestCase a -> Bool) ] -- Test driver ------------------------------------------------- exactDistributionTests :: TestTree exactDistributionTests = testGroup "Test distributions against exact" [ distTests (Tag :: Tag ExactBinomialDistr) "Binomial" 1.0e-12 , distTests (Tag :: Tag ExactDiscreteUniformDistr) "DiscreteUniform" 1.0e-12 , distTests (Tag :: Tag ExactGeometricDistr) "Geometric" 1.0e-13 , distTests (Tag :: Tag ExactHypergeomDistr) "Hypergeometric" 1.0e-12 ] statistics-0.16.2.1/tests/Tests/Function.hs0000644000000000000000000000137607346545000016716 0ustar0000000000000000module Tests.Function ( tests ) where import Statistics.Function import Test.Tasty import Test.Tasty.QuickCheck import Test.QuickCheck import Tests.Helpers import qualified Data.Vector.Unboxed as U tests :: TestTree tests = testGroup "S.Function" [ testProperty "Sort is sort" p_sort , testAssertion "nextHighestPowerOfTwo is OK" p_nextHighestPowerOfTwo ] p_sort :: [Double] -> Property p_sort xs = not (null xs) ==> U.all (uncurry (<=)) (U.zip v $ U.tail v) where v = sort $ U.fromList xs p_nextHighestPowerOfTwo :: Bool p_nextHighestPowerOfTwo = all (\(good, is) -> all ((==good) . nextHighestPowerOfTwo) is) lists where pows = [1 .. 17 :: Int] lists = [ (2^m, [2^n+1 .. 2^m]) | (n,m) <- pows `zip` tail pows ] statistics-0.16.2.1/tests/Tests/Helpers.hs0000644000000000000000000000552007346545000016526 0ustar0000000000000000{-# LANGUAGE ScopedTypeVariables #-} -- | Helpers for testing module Tests.Helpers ( -- * helpers T(..) , typeName , Double01(..) -- * IEEE 754 , isDenorm -- * Generic QC tests , monotonicallyIncreases , monotonicallyIncreasesIEEE -- * HUnit helpers , testAssertion , testEquality -- * QC helpers , small , unsquare , shrinkFixedList ) where import Data.Typeable import Numeric.MathFunctions.Constants (m_tiny) import Test.Tasty import Test.Tasty.HUnit import Test.QuickCheck import qualified Numeric.IEEE as IEEE import qualified Test.Tasty.HUnit as HU -- | Phantom typed value used to select right instance in QC tests data T a = T -- | String representation of type name typeName :: Typeable a => T a -> String typeName = show . typeOf . typeParam where typeParam :: T a -> a typeParam _ = undefined -- | Check if Double denormalized isDenorm :: Double -> Bool isDenorm x = let ax = abs x in ax > 0 && ax < m_tiny -- | Generates Doubles in range [0,1] newtype Double01 = Double01 Double deriving (Show) instance Arbitrary Double01 where arbitrary = do (_::Int, x) <- fmap properFraction arbitrary return $ Double01 x ---------------------------------------------------------------- -- Generic QC ---------------------------------------------------------------- -- Check that function is nondecreasing monotonicallyIncreases :: (Ord a, Ord b) => (a -> b) -> a -> a -> Bool monotonicallyIncreases f x1 x2 = f (min x1 x2) <= f (max x1 x2) -- Check that function is nondecreasing taking rounding errors into -- account. -- -- In fact function is allowed to decrease less than one ulp in order -- to guard against problems with excess precision. 
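-- (Spelled out against the implementation below: for x1 <= x2 the check accepts either f x1 <= f x2 holding exactly, or f x1 exceeding f x2 by less than f x2 * IEEE.epsilon.)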
On x86 the FPU works -- with 80-bit numbers but doubles are 64-bit, so rounding happens -- whenever values are moved from registers to memory. monotonicallyIncreasesIEEE :: (Ord a, IEEE.IEEE b) => (a -> b) -> a -> a -> Bool monotonicallyIncreasesIEEE f x1 x2 = y1 <= y2 || (y1 - y2) < y2 * IEEE.epsilon where y1 = f (min x1 x2) y2 = f (max x1 x2) ---------------------------------------------------------------- -- HUnit helpers ---------------------------------------------------------------- testAssertion :: String -> Bool -> TestTree testAssertion str cont = testCase str $ HU.assertBool str cont testEquality :: (Show a, Eq a) => String -> a -> a -> TestTree testEquality msg a b = testCase msg $ HU.assertEqual msg a b unsquare :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property unsquare = forAll (small arbitrary) small :: Gen a -> Gen a small act = sized $ \n -> resize (smallish n) act where smallish = round . (sqrt :: Double -> Double) . fromIntegral . abs shrinkFixedList :: (a -> [a]) -> [a] -> [[a]] shrinkFixedList shr (x:xs) = map (:xs) (shr x) ++ map (x:) (shrinkFixedList shr xs) shrinkFixedList _ [] = [] statistics-0.16.2.1/tests/Tests/KDE.hs0000644000000000000000000000226507346545000015532 0ustar0000000000000000-- | Tests for Kernel density estimates. module Tests.KDE ( tests ) where import Data.Vector.Unboxed ((!)) import Numeric.Sum (kbn, sumVector) import Statistics.Sample.KernelDensity import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) import Test.QuickCheck (Property, (==>), counterexample) import Text.Printf (printf) import qualified Data.Vector.Unboxed as U tests :: TestTree tests = testGroup "KDE" [ testProperty "integral(PDF) == 1" t_densityIsPDF ] t_densityIsPDF :: [Double] -> Property t_densityIsPDF vec = not (null vec) ==> test where (xs,ys) = kde 4096 (U.fromList vec) step = (xs ! 1) - (xs !
0) integral = integratePDF step ys -- test = counterexample (printf "Integral %f" integral) $ abs (1 - integral) <= 1e-3 integratePDF :: Double -> U.Vector Double -> Double integratePDF step vec = step * sumVector kbn (U.zipWith (*) vec weights) where n = U.length vec weights = U.generate n go where go i | i == 0 = 0.5 | i == n-1 = 0.5 | otherwise = 1 statistics-0.16.2.1/tests/Tests/Math/0000755000000000000000000000000007346545000015457 5ustar0000000000000000statistics-0.16.2.1/tests/Tests/Math/Tables.hs0000644000000000000000000000454507346545000017235 0ustar0000000000000000module Tests.Math.Tables where tableLogGamma :: [(Double,Double)] tableLogGamma = [(0.000001250000000, 13.592366285131769033) , (0.000068200000000, 9.5930266308318756785) , (0.000246000000000, 8.3100370767447966358) , (0.000880000000000, 7.03508133735248542) , (0.003120000000000, 5.768129358365567505) , (0.026700000000000, 3.6082588918892977148) , (0.077700000000000, 2.5148371858768232556) , (0.234000000000000, 1.3579557559432759994) , (0.860000000000000, 0.098146578027685615897) , (1.340000000000000, -0.11404757557207759189) , (1.890000000000000, -0.0425116422978701336) , (2.450000000000000, 0.25014296569217625565) , (3.650000000000000, 1.3701041997380685178) , (4.560000000000000, 2.5375143317949580002) , (6.660000000000000, 5.9515377269550207018) , (8.250000000000000, 9.0331869196051233217) , (11.300000000000001, 15.814180681373947834) , (25.600000000000001, 56.711261598328121636) , (50.399999999999999, 146.12815158702164808) , (123.299999999999997, 468.85500075897556371) , (487.399999999999977, 2526.9846647543727158) , (853.399999999999977, 4903.9359135978220365) , (2923.300000000000182, 20402.93198938705973) , (8764.299999999999272, 70798.268343590112636) , (12630.000000000000000, 106641.77264982508495) , (34500.000000000000000, 325976.34838781820145) , (82340.000000000000000, 849629.79603036714252) , (234800.000000000000000, 2668846.4390507959761) , (834300.000000000000000, 10540830.912557534873) , (1230000.000000000000000, 16017699.322315014899) ] tableIncompleteBeta :: [(Double,Double,Double,Double)] tableIncompleteBeta = [(2.000000000000000, 3.000000000000000, 0.030000000000000, 0.0051864299999999996862) , (2.000000000000000, 3.000000000000000, 0.230000000000000, 0.22845923000000001313) , (2.000000000000000, 3.000000000000000, 0.760000000000000, 0.95465728000000005249) , (4.000000000000000, 2.300000000000000, 0.890000000000000, 0.93829812158347802864) , (1.000000000000000, 1.000000000000000, 0.550000000000000, 0.55000000000000004441) , (0.300000000000000, 12.199999999999999, 0.110000000000000, 0.95063000053947077639) , (13.100000000000000, 9.800000000000001, 0.120000000000000, 1.3483109941962659385e-07) , (13.100000000000000, 9.800000000000001, 0.420000000000000, 0.071321857831804780226) , (13.100000000000000, 9.800000000000001, 0.920000000000000, 0.99999578339197081611) ] statistics-0.16.2.1/tests/Tests/Math/gen.py0000644000000000000000000000264407346545000016610 0ustar0000000000000000#!/usr/bin/python """ """ from mpmath import * def printListLiteral(lines) : print " [" + "\n , ".join(lines) + "\n ]" ################################################################ # Generate header print "module Tests.Math.Tables where" print ################################################################ ## Generate table for logGamma print "tableLogGamma :: [(Double,Double)]" print "tableLogGamma =" gammaArg = [ 1.25e-6, 6.82e-5, 2.46e-4, 8.8e-4, 3.12e-3, 2.67e-2, 7.77e-2, 0.234, 0.86, 1.34, 1.89, 2.45, 3.65, 4.56, 6.66, 
8.25, 11.3, 25.6, 50.4, 123.3, 487.4, 853.4, 2923.3, 8764.3, 1.263e4, 3.45e4, 8.234e4, 2.348e5, 8.343e5, 1.23e6, ] printListLiteral( [ '(%.15f, %.20g)' % (x, log(gamma(x))) for x in gammaArg ] ) ################################################################ ## Generate table for incompleteBeta print "tableIncompleteBeta :: [(Double,Double,Double,Double)]" print "tableIncompleteBeta =" incompleteBetaArg = [ (2, 3, 0.03), (2, 3, 0.23), (2, 3, 0.76), (4, 2.3, 0.89), (1, 1, 0.55), (0.3, 12.2, 0.11), (13.1, 9.8, 0.12), (13.1, 9.8, 0.42), (13.1, 9.8, 0.92), ] printListLiteral( [ '(%.15f, %.15f, %.15f, %.20g)' % (p,q,x, betainc(p,q,0,x, regularized=True)) for (p,q,x) in incompleteBetaArg ]) statistics-0.16.2.1/tests/Tests/Matrix.hs0000644000000000000000000000305407346545000016370 0ustar0000000000000000module Tests.Matrix (tests) where import Statistics.Matrix hiding (map) import Statistics.Matrix.Algorithms import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) import Test.QuickCheck import Tests.Matrix.Types import qualified Data.Vector.Unboxed as U t_row :: Mat Double -> Gen Property t_row ms@(Mat r _ xs) = do i <- choose (0,r-1) return $ row (fromMat ms) i === U.fromList (xs !! i) t_column :: Mat Double -> Gen Property t_column ms@(Mat _ c xs) = do i <- choose (0,c-1) return $ column (fromMat ms) i === U.fromList (map (!! i) xs) t_center :: Mat Double -> Property t_center ms@(Mat r c xs) = (xs !! (r `quot` 2)) !! (c `quot` 2) === center (fromMat ms) t_transpose :: Matrix -> Property t_transpose m = U.concat (map (column n) [0..rows m-1]) === toVector m where n = transpose m t_qr :: Property t_qr = property $ do a <- do (r,c) <- arbitrary fromMat <$> arbMatWith r c (fromIntegral <$> choose (-10, 10::Int)) let (q,r) = qr a a' = multiply q r pure $ counterexample ("A = \n"++show a) $ counterexample ("A' = \n"++show a') $ counterexample ("Q = \n"++show q) $ counterexample ("R = \n"++show r) $ dimension a == dimension a' && ( hasNaN a' || and (zipWith (\x y -> abs (x - y) < 1e-12) (toList a) (toList a')) ) tests :: TestTree tests = testGroup "Matrix" [ testProperty "t_row" t_row , testProperty "t_column" t_column , testProperty "t_center" t_center , testProperty "t_transpose" t_transpose , testProperty "t_qr" t_qr ] statistics-0.16.2.1/tests/Tests/Matrix/0000755000000000000000000000000007346545000016032 5ustar0000000000000000statistics-0.16.2.1/tests/Tests/Matrix/Types.hs0000644000000000000000000000273307346545000017477 0ustar0000000000000000{-# LANGUAGE DeriveFunctor #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Tests.Matrix.Types ( Mat(..) , fromMat , toMat , arbMat , arbMatWith ) where import Control.Monad (join) import Control.Applicative ((<$>), (<*>)) import Statistics.Matrix (Matrix(..), fromList) import Test.QuickCheck import Tests.Helpers (shrinkFixedList, small) import qualified Data.Vector.Unboxed as U data Mat a = Mat { mrows :: Int , mcols :: Int , asList :: [[a]] } deriving (Eq, Ord, Show, Functor) fromMat :: Mat Double -> Matrix fromMat (Mat r c xs) = fromList r c (concat xs) toMat :: Matrix -> Mat Double toMat (Matrix r c v) = Mat r c . split . 
U.toList $ v where split xs@(_:_) = let (h,t) = splitAt c xs in h : split t split [] = [] instance (Arbitrary a) => Arbitrary (Mat a) where arbitrary = small $ join (arbMat <$> arbitrary <*> arbitrary) shrink (Mat r c xs) = Mat r c <$> shrinkFixedList (shrinkFixedList shrink) xs arbMat :: (Arbitrary a) => Positive (Small Int) -> Positive (Small Int) -> Gen (Mat a) arbMat r c = arbMatWith r c arbitrary arbMatWith :: (Arbitrary a) => Positive (Small Int) -> Positive (Small Int) -> Gen a -> Gen (Mat a) arbMatWith (Positive (Small r)) (Positive (Small c)) genA = Mat r c <$> vectorOf r (vectorOf c genA) instance Arbitrary Matrix where arbitrary = fromMat <$> arbitrary -- shrink = map fromMat . shrink . toMat statistics-0.16.2.1/tests/Tests/NonParametric.hs0000644000000000000000000003061607346545000017672 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ViewPatterns #-} -- Tests for Statistics.Test.NonParametric module Tests.NonParametric (tests) where import Statistics.Distribution.Normal (standard) import Statistics.Test.KolmogorovSmirnov import Statistics.Test.MannWhitneyU import Statistics.Test.KruskalWallis import Statistics.Test.WilcoxonT import Statistics.Types (PValue,pValue,mkPValue) import Test.Tasty (testGroup) import Test.Tasty.HUnit import Tests.ApproxEq (eq) import Tests.Helpers (testAssertion, testEquality) import Tests.NonParametric.Table (tableKSD, tableKS2D) import qualified Test.Tasty as Tst import qualified Data.Vector.Unboxed as U tests :: Tst.TestTree tests = testGroup "Nonparametric tests" $ concat [ mannWhitneyTests , wilcoxonSumTests , wilcoxonPairTests , kruskalWallisRankTests , kruskalWallisTests , kolmogorovSmirnovDTest ] ---------------------------------------------------------------- mannWhitneyTests :: [Tst.TestTree] mannWhitneyTests = zipWith test [(0::Int)..] testData ++ [ testEquality "Mann-Whitney U Critical Values, m=1" (replicate (20*3) Nothing) [mannWhitneyUCriticalValue (1,x) (mkPValue p) | x <- [1..20], p <- [0.005,0.01,0.025]] , testEquality "Mann-Whitney U Critical Values, m=2, p=0.025" (replicate 7 Nothing ++ map Just [0,0,0,0,1,1,1,1,1,2,2,2,2]) [mannWhitneyUCriticalValue (2,x) (mkPValue 0.025) | x <- [1..20]] , testEquality "Mann-Whitney U Critical Values, m=6, p=0.05" (replicate 1 Nothing ++ map Just [0, 2,3,5,7,8,10,12,14,16,17,19,21,23,25,26,28,30,32]) [mannWhitneyUCriticalValue (6,x) (mkPValue 0.05) | x <- [1..20]] , testEquality "Mann-Whitney U Critical Values, m=20, p=0.025" (replicate 1 Nothing ++ map Just [2,8,14,20,27,34,41,48,55,62,69,76,83,90,98,105,112,119,127]) [mannWhitneyUCriticalValue (20,x) (mkPValue 0.025) | x <- [1..20]] ] where test n (a, b, c, d) = testCase "Mann-Whitney" $ do assertEqual ("Mann-Whitney U " ++ show n) c us assertEqual ("Mann-Whitney U Sig " ++ show n) d ss where us = mannWhitneyU (U.fromList a) (U.fromList b) ss = mannWhitneyUSignificant SamplesDiffer (length a, length b) p005 us -- List of (Sample A, Sample B, (Positive Rank, Negative Rank)) testData :: [([Double], [Double], (Double, Double), Maybe TestResult)] testData = [ ( [3,4,2,6,2,5] , [9,7,5,10,6,8] , (2, 34) , Just Significant ) , ( [540,480,600,590,605] , [760,890,1105,595,940] , (2, 23) , Just Significant ) , ( [19,22,16,29,24] , [20,11,17,12] , (17, 3) , Just NotSignificant ) , ( [126,148,85,61, 179,93, 45,189,85,93] , [194,128,69,135,171,149,89,248,79,137] , (35,65) , Just NotSignificant ) , ( [1..30] , [1..30] , (450,450) , Just NotSignificant ) , ( [1 .. 30] , [11.5 .. 
40 ] , (190.0,710.0) , Just Significant ) ] wilcoxonSumTests :: [Tst.TestTree] wilcoxonSumTests = zipWith test [(0::Int)..] testData where test n (a, b, c) = testCase "Wilcoxon Sum" $ assertEqual ("Wilcoxon Sum " ++ show n) c (wilcoxonRankSums (U.fromList a) (U.fromList b)) -- List of (Sample A, Sample B, (Positive Rank, Negative Rank)) testData :: [([Double], [Double], (Double, Double))] testData = [ ( [8.50,9.48,8.65,8.16,8.83,7.76,8.63] , [8.27,8.20,8.25,8.14,9.00,8.10,7.20,8.32,7.70] , (75, 61) ) , ( [0.45,0.50,0.61,0.63,0.75,0.85,0.93] , [0.44,0.45,0.52,0.53,0.56,0.58,0.58,0.65,0.79] , (71.5, 64.5) ) ] wilcoxonPairTests :: [Tst.TestTree] wilcoxonPairTests = zipWith test [(0::Int)..] testData ++ -- Taken from the Mitic paper: [ testAssertion "Sig 16, 35" (to4dp 0.0467 $ wilcoxonMatchedPairSignificance 16 35) , testAssertion "Sig 16, 36" (to4dp 0.0523 $ wilcoxonMatchedPairSignificance 16 36) , testEquality "Wilcoxon critical values, p=0.05" (replicate 4 Nothing ++ map Just [0,2,3,5,8,10,13,17,21,25,30,35,41,47,53,60,67,75,83,91,100,110,119]) [wilcoxonMatchedPairCriticalValue x (mkPValue 0.05) | x <- [1..27]] , testEquality "Wilcoxon critical values, p=0.025" (replicate 5 Nothing ++ map Just [0,2,3,5,8,10,13,17,21,25,29,34,40,46,52,58,65,73,81,89,98,107]) [wilcoxonMatchedPairCriticalValue x (mkPValue 0.025) | x <- [1..27]] , testEquality "Wilcoxon critical values, p=0.01" (replicate 6 Nothing ++ map Just [0,1,3,5,7,9,12,15,19,23,27,32,37,43,49,55,62,69,76,84,92]) [wilcoxonMatchedPairCriticalValue x (mkPValue 0.01) | x <- [1..27]] , testEquality "Wilcoxon critical values, p=0.005" (replicate 7 Nothing ++ map Just [0,1,3,5,7,9,12,15,19,23,27,32,37,42,48,54,61,68,75,83]) [wilcoxonMatchedPairCriticalValue x (mkPValue 0.005) | x <- [1..27]] ] where test n (a, b, c) = testEquality ("Wilcoxon Paired " ++ show n) c res where res = wilcoxonMatchedPairSignedRank (U.zip (U.fromList a) (U.fromList b)) -- List of (Sample A, Sample B, (Positive Rank, Negative Rank)) testData :: [([Double], [Double], (Int,Double, Double))] testData = [ ([1..10], [1..10], (0, 0, 0 )) , ([1..5], [6..10], (5, 0, 5*(-3))) -- Worked example from the Internet: , ( [125,115,130,140,140,115,140,125,140,135] , [110,122,125,120,140,124,123,137,135,145] , ( 9 , sum $ filter (> 0) [7,-3,1.5,9,0,-4,8,-6,1.5,-5] , sum $ filter (< 0) [7,-3,1.5,9,0,-4,8,-6,1.5,-5] ) ) -- Worked examples from books/papers: , ( [2.4,1.9,2.3,1.9,2.4,2.5] , [2.0,2.1,2.0,2.0,1.8,2.0] , (6, 18, -3) ) , ( [130,170,125,170,130,130,145,160] , [120,163,120,135,143,136,144,120] , (8, 27, -9) ) , ( [540,580,600,680,430,740,600,690,605,520] , [760,710,1105,880,500,990,1050,640,595,520] , (9, 3, -42) ) ] to4dp tgt (pValue -> x) = x >= tgt - 0.00005 && x < tgt + 0.00005 ---------------------------------------------------------------- kruskalWallisRankTests :: [Tst.TestTree] kruskalWallisRankTests = zipWith test [(0::Int)..] testData where test n (a, b) = testCase "Kruskal-Wallis Ranking" $ assertEqual ("Kruskal-Wallis " ++ show n) (map U.fromList b) (kruskalWallisRank $ map U.fromList a) testData :: [([[Int]],[[Double]])] testData = [ ( [ [68,93,123,83,108,122] , [119,116,101,103,113,84] , [70,68,54,73,81,68] , [61,54,59,67,59,70] ] , [ [8.0,14.0,16.0,19.0,23.0,24.0] , [15.0,17.0,18.0,20.0,21.0,22.0] , [1.5,8.0,8.0,10.5,12.0,13.0] , [1.5,3.5,3.5,5.0,6.0,10.5] ] ) ] kruskalWallisTests :: [Tst.TestTree] kruskalWallisTests = zipWith test [(0::Int)..] 
testData where test n (a, b, c) = testCase "Kruskal-Wallis" $ do assertEqual ("Kruskal-Wallis " ++ show n) (round100 b) (round100 kw) assertEqual ("Kruskal-Wallis Sig " ++ show n) c kwt where kw = kruskalWallis $ map U.fromList a kwt = isSignificant p005 `fmap` kruskalWallisTest (map U.fromList a) round100 :: Double -> Integer round100 = round . (*100) testData :: [([[Double]], Double, Maybe TestResult)] testData = [ ( [ [68,93,123,83,108,122] , [119,116,101,103,113,84] , [70,68,54,73,81,68] , [61,54,59,67,59,70] ] , 16.03 , Just Significant ) , ( [ [5,5,3,5,5,5,5] , [5,5,5,5,7,5,5] , [5,5,6,5,5,5,5] , [4,5,5,5,6,5,5] ] , 2.24 , Just NotSignificant ) , ( [ [36,48,5,67,53] , [49,33,60,2,55] , [71,31,140,59,42] ] , 1.22 , Just NotSignificant ) , ( [ [6,38,3,17,11,30,15,16,25,5] , [34,28,42,13,40,31,9,32,39,27] , [13,35,19,4,29,0,7,33,18,24] ] , 6.10 , Just Significant ) ] ---------------------------------------------------------------- -- K-S test ---------------------------------------------------------------- kolmogorovSmirnovDTest :: [Tst.TestTree] kolmogorovSmirnovDTest = [ testAssertion "K-S D statistics" $ and [ eq 1e-6 (kolmogorovSmirnovD standard (toU sample)) reference | (reference,sample) <- tableKSD ] , testAssertion "K-S 2-sample statistics" $ and [ eq 1e-6 (kolmogorovSmirnov2D (toU xs) (toU ys)) reference | (reference,xs,ys) <- tableKS2D ] , testAssertion "K-S probability" $ and [ eq 1e-14 (kolmogorovSmirnovProbability n d) p | (d,n,p) <- testData ] ] where toU = U.fromList -- Test data for the calculation of cumulative probability -- P(D[n] < d). -- -- Test data is: -- (D[n], n, p) -- Table is generated using sample program from paper testData :: [(Double,Int,Double)] testData = [ (0.09 , 3, 0 ) , (0.2 , 3, 0.00177777777777778 ) , (0.301 , 3, 0.116357025777778 ) , (0.392 , 3, 0.383127210666667 ) , (0.5003 , 3, 0.667366306558667 ) , (0.604 , 3, 0.861569877333333 ) , (0.699 , 3, 0.945458198 ) , (0.802 , 3, 0.984475216 ) , (0.9 , 3, 0.998 ) , (0.09 , 5, 0 ) , (0.2 , 5, 0.0384 ) , (0.301 , 5, 0.33993786080016 ) , (0.392 , 5, 0.66931908083712 ) , (0.5003 , 5, 0.888397260183794 ) , (0.604 , 5, 0.971609957879808 ) , (0.699 , 5, 0.994331075994008 ) , (0.802 , 5, 0.999391366368064 ) , (0.9 , 5, 0.99998 ) , (0.09 , 8, 3.37615237575e-06 ) , (0.2 , 8, 0.151622071801758 ) , (0.301 , 8, 0.613891042670582 ) , (0.392 , 8, 0.871491561427005 ) , (0.5003 , 8, 0.977534089199071 ) , (0.604 , 8, 0.997473116268255 ) , (0.699 , 8, 0.999806082005123 ) , (0.802 , 8, 0.999995133786947 ) , (0.9 , 8, 0.99999998 ) , (0.09 , 10, 3.89639433093119e-05) , (0.2 , 10, 0.25128096 ) , (0.301 , 10, 0.732913126355935 ) , (0.392 , 10, 0.932185254518767 ) , (0.5003 , 10, 0.992276179340446 ) , (0.604 , 10, 0.999495533516769 ) , (0.699 , 10, 0.999979691783985 ) , (0.802 , 10, 0.999999801409237 ) , (0.09 , 20, 0.00794502217168886 ) , (0.2 , 20, 0.647279826376584 ) , (0.301 , 20, 0.958017466965765 ) , (0.392 , 20, 0.997206424843499 ) , (0.5003 , 20, 0.999962641414228 ) , (0.09 , 30, 0.0498147538075168 ) , (0.2 , 30, 0.842030838984526 ) , (0.301 , 30, 0.993403560017612 ) , (0.392 , 30, 0.99988478803318 ) , (0.09 , 100, 0.629367974413669 ) ] p005 :: PValue Double p005 = mkPValue 0.05 statistics-0.16.2.1/tests/Tests/NonParametric/0000755000000000000000000000000007346545000017330 5ustar0000000000000000statistics-0.16.2.1/tests/Tests/NonParametric/Table.hs0000644000000000000000000001332707346545000020721 0ustar0000000000000000module Tests.NonParametric.Table ( tableKSD , tableKS2D ) where -- Table for Kolmogorov-Smirnov 
statistics for standard normal -- distribution. Generated using R. -- -- First element of tuple is D second is sample for which it was -- calculated. tableKSD :: [(Double,[Double])] tableKSD = [ (0.2012078,[1.360645,-0.3151904,-1.245443,0.1741977,-0.1421206,-1.798246,1.171594,-1.335844,-5.050093e-2,1.030063,-1.849005,0.6491455,-0.7028004]) , (0.2569956,[0.3884734,-1.227821,-0.4166262,0.429118,-0.9280124,0.8025867,-0.6703089,-0.2124872,0.1224496,0.1087734,-4.285284e-2,-1.039936,-0.7071956]) , (0.1960356,[-1.814745,-0.6327167,0.7082493,0.6264716,1.02061,-0.4094635,0.821026,-0.4255047,-0.4820728,-0.2239833,0.648517,1.114283,0.3610216]) , (0.2095746,[0.187011,0.1805498,0.4448389,0.6065506,0.2308673,0.5292549,-1.489902,-1.455191,0.5449396,-0.1436403,-0.7977073,-0.2693545,0.8260888,-1.474473,-2.158696e-2,-0.1455387]) , (0.1922603,[0.5772317,-1.255561,1.605823,0.4923361,0.2470848,1.176101,-0.3767689,-0.6896885,0.4509345,-0.5048447,0.9436534,1.025599,0.2998393,-3.415219e-2,1.264315,-1.44433,-1.646449e-2]) , (0.2173401,[1.812807,-0.8687497,-0.5710508,1.003647,1.142621,0.6546577,-6.083323e-3,1.628574e-2,1.067499,-1.953143,-0.6060077,1.90859,-0.7480553,0.6715162,-0.928759,1.862,1.604621,-0.2171044,-0.1835918]) , (0.2510541,[-0.4769572,1.062319,0.9952284,1.198086,1.015589,-0.4154523,-0.6711762,1.202902,0.2217098,5.381759e-2,0.6679715,0.2551287,-0.1371492]) , (0.1996022,[1.158607,-0.7354863,1.526559,-0.7855418,-2.82999,-0.6045106,-0.1830228,0.3306812,-0.819657,-1.223715,0.2536423,-0.4155781,1.447042]) , (0.2284761,[1.239965,0.8187093,0.5199788,1.172072,0.748259,1.869376e-2,0.1625921,-1.712065,0.7043582,-1.702702,-0.4792806,-0.1023351,0.1187189]) , (0.2337866,[0.9417261,-0.1024297,-0.7354359,1.099991,0.801984,-0.3745397,-1.749564,1.795771,1.099963,-0.605557,-2.035897,1.893603,-0.3468928,-0.2593938,2.100988,0.9665698,0.8757091,0.7696328,0.8730729,-0.3990352,2.04361,-0.4617864,-0.155021,2.15774,0.2687795,-0.9853512,-0.3264898,1.260026,4.267695,-0.5571145,0.6307067,-0.1691405,-1.730686]) , (0.3389167,[2.025542,-1.542641,-1.090238,3.99027,9.949129e-2,-0.8974433,-2.508418,6.390346,-2.675515,1.154459,1.688072,2.220727,-0.4743102]) , (0.4920231,[0.5192906,-3.260813,-1.245185,1.693084,3.561318,4.058924,2.27063,0.9446943,4.794159,-3.423733,0.8240817,0.644059,0.900175,1.932513,1.024586,2.82823,2.072192,-0.353231,-0.4319673,1.505952,1.0199,4.555054,2.364929,5.531467,3.279415,3.19821,2.726925,1.680027,-0.9041334,-0.8246765,-1.343979,8.454955,1.354581]) , 
(0.6727408,[-6.705672,-3.193988,-4.612611,-3.207994,-5.070172,-6.141169,-0.397149,-4.093359,-1.204801,-3.986585,-2.724662,0.9868107,-6.295266,-5.95839,-6.35114,-1.679555,-2.635889,-4.050329,1.557428,-2.548465,-0.9073924,-1.502018,-4.535688,-4.158818,-8.833434,-5.944697,-1.569672,-4.70399,-7.832059,-4.093708,-8.393417,-2.085432,-7.06495,-0.4230419,-3.046822,-3.23895,-0.9265873,-9.227822,3.293713,-5.593577,-5.942398,-4.358421,2.660044,-4.301572,-1.258879,0.1499903,3.572833,-3.19844,0.8652432,-0.3025793,-1.576673,-7.666265,-6.751463,-1.398944,-2.690656,-1.429654,7.508364e-2,0.7998344,-3.562074,-1.021431,1.342968,2.110244,-7.561497,-2.372083,-3.649193,-5.7723,-1.068083,0.7537809,-4.569546,-1.198005,-5.638384,-1.227226,-1.195852,-1.118175,-9.130527,0.9675821,-2.497391,0.5988562,-1.965783,-4.25741,-6.547006,-1.459294,-2.380556,-3.977307,-7.809006,-4.276819,-4.028746,-9.055546,-3.599239,-1.470512,-8.253329,-1.351687,-4.269324,-6.140353,-6.30808,-1.834091,-3.135146,-9.391791,3.117815,-5.554733,-2.556769,-3.287376,-2.064013,-5.741995,-5.047918,-4.808841,-1.488526,-0.2351115,-5.760833,-2.722929,-7.012353,2.281171,-3.890514,-1.516824,-1.41011,-1.828457,-5.561244,-3.472142,-10.16919,-0.4369042,-5.698953,-4.587462,-4.897086]) ] -- Table for 2-sample Kolmogorov-Smirnov statistics. Generated using R -- -- First element is D, second and third are samples tableKS2D :: [(Double,[Double],[Double])] tableKS2D = [ (0.2820513,[-0.4212928,2.146532,0.7585263,-0.5086105,-0.7725486,6.235548e-2,-0.1849861,0.861972,-0.1958534,-3.379697e-2,-1.316854,0.6701269],[0.4957582,0.4241167,0.9822869,0.4504248,-0.1749617,1.178098,-1.117222,-0.859273,0.3073736,0.4344583,-0.4761338,-1.332374,1.487291]) , (0.2820513,[-0.712252,0.7990333,-0.7968473,1.443609,1.163096,-1.349071,-0.1553941,-2.003104,-0.3400618,-0.7019282,0.183293,-0.2352167],[-0.4622455,-0.8132221,0.1161614,-1.472115e-2,1.001454,-6.557789e-2,-0.2531216,-1.032432,0.4105478,1.749614,0.9722899,5.850942e-2,-0.3352746]) , (0.2564103,[0.3509882,-0.2982833,1.314731,1.264223,-0.8156374,0.3734029,-3.288915e-2,0.6766016,0.9786335,0.1079949,-0.4211722,1.58656],[0.8024675,7.464538e-2,0.2739861,-2.334255e-2,0.5611802,0.6683374,0.4358206,0.349843,1.207834,1.402578,-0.4049183,0.4286042,1.665129]) , (0.1833333,[1.376196,9.926384e-2,2.199292,-2.04993,0.5585353,-0.4812132,0.1041527,2.084774,0.71194,-1.398245,-4.458574e-2,1.484945,-1.473182,1.020076,-0.7019646,0.2182066,-1.702963,-0.3522622,-0.8129267,-0.6338972],[-1.020371,0.3323861,1.513288,0.1958708,-1.0723,5.323446e-2,-0.9993713,-0.7046356,-0.6781067,-0.4471603,1.512042,-0.2650665,-4.765228e-2,-1.501205,1.228664,0.5332935,-0.2960315,-0.1509683]) , (0.5666667,[0.7145305,0.1255674,2.001531,0.1419216],[2.113474,-0.3352839,-0.4962429,-1.386079,0.6404667,-0.7145304,0.1084008,-0.9821421,-2.270472,-1.003846,-0.5644588,2.699695,-1.296494,-0.1538839,1.319094,-1.127544,0.3568889,0.2004726,-1.313291,0.3581084,0.3313498,0.9336278,0.9850203,-1.309506,1.170459,-0.7517466,-1.771269,0.7156381,-1.129691,0.877729]) , (0.5,[0.6950626,0.1643805,-0.3102472,0.4810762,0.1844602,1.338836,-0.8083386,-0.5482141,0.9532421,-0.2644837],[7.527945,-1.95654,1.513725,-1.318431,2.453895,0.2078194,0.7371092,2.834245,-2.134794,3.938259]) ] statistics-0.16.2.1/tests/Tests/Orphanage.hs0000644000000000000000000001246107346545000017032 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | -- Orphan instances for common data types module Tests.Orphanage where import Control.Applicative import 
Statistics.Distribution.Beta (BetaDistribution, betaDistr) import Statistics.Distribution.Binomial (BinomialDistribution, binomial) import Statistics.Distribution.CauchyLorentz import Statistics.Distribution.ChiSquared (ChiSquared, chiSquared) import Statistics.Distribution.Exponential (ExponentialDistribution, exponential) import Statistics.Distribution.FDistribution (FDistribution, fDistribution) import Statistics.Distribution.Gamma (GammaDistribution, gammaDistr) import Statistics.Distribution.Geometric import Statistics.Distribution.Hypergeometric import Statistics.Distribution.Laplace (LaplaceDistribution, laplace) import Statistics.Distribution.Lognormal (LognormalDistribution, lognormalDistr) import Statistics.Distribution.NegativeBinomial (NegativeBinomialDistribution, negativeBinomial) import Statistics.Distribution.Normal (NormalDistribution, normalDistr) import Statistics.Distribution.Poisson (PoissonDistribution, poisson) import Statistics.Distribution.StudentT import Statistics.Distribution.Transform (LinearTransform, scaleAround) import Statistics.Distribution.Uniform (UniformDistribution, uniformDistr) import Statistics.Distribution.Weibull (WeibullDistribution, weibullDistr) import Statistics.Distribution.DiscreteUniform (DiscreteUniform, discreteUniformAB) import Statistics.Types import Test.QuickCheck as QC ---------------------------------------------------------------- -- Arbitrary instances for distributions ---------------------------------------------------------------- instance QC.Arbitrary BinomialDistribution where arbitrary = binomial <$> QC.choose (1,100) <*> QC.choose (0,1) instance QC.Arbitrary ExponentialDistribution where arbitrary = exponential <$> QC.choose (0,100) instance QC.Arbitrary LaplaceDistribution where arbitrary = laplace <$> QC.choose (-10,10) <*> QC.choose (0, 2) instance QC.Arbitrary GammaDistribution where arbitrary = gammaDistr <$> QC.choose (0.1,100) <*> QC.choose (0.1,100) instance QC.Arbitrary BetaDistribution where arbitrary = betaDistr <$> QC.choose (1e-3,10) <*> QC.choose (1e-3,10) instance QC.Arbitrary GeometricDistribution where arbitrary = geometric <$> QC.choose (1e-10,1) instance QC.Arbitrary GeometricDistribution0 where arbitrary = geometric0 <$> QC.choose (1e-10,1) instance QC.Arbitrary HypergeometricDistribution where arbitrary = do l <- QC.choose (1,20) m <- QC.choose (0,l) k <- QC.choose (1,l) return $ hypergeometric m l k instance QC.Arbitrary LognormalDistribution where -- can't choose sigma too big, otherwise goes outside of double-float limit arbitrary = lognormalDistr <$> QC.choose (-100,100) <*> QC.choose (1e-10, 20) instance QC.Arbitrary NegativeBinomialDistribution where arbitrary = negativeBinomial <$> QC.choose (1,100) <*> QC.choose (1e-10,1) instance QC.Arbitrary NormalDistribution where arbitrary = normalDistr <$> QC.choose (-100,100) <*> QC.choose (1e-3, 1e3) instance QC.Arbitrary PoissonDistribution where arbitrary = poisson <$> QC.choose (0,1) instance QC.Arbitrary ChiSquared where arbitrary = chiSquared <$> QC.choose (1,100) instance QC.Arbitrary UniformDistribution where arbitrary = do a <- QC.arbitrary b <- QC.arbitrary `suchThat` (/= a) return $ uniformDistr a b instance QC.Arbitrary WeibullDistribution where arbitrary = weibullDistr <$> QC.choose (1e-3,1e3) <*> QC.choose (1e-3, 1e3) instance QC.Arbitrary CauchyDistribution where arbitrary = cauchyDistribution <$> arbitrary <*> ((abs <$> arbitrary) `suchThat` (> 0)) instance QC.Arbitrary StudentT where arbitrary = studentT <$> ((abs <$> arbitrary) 
`suchThat` (>0)) instance QC.Arbitrary d => QC.Arbitrary (LinearTransform d) where arbitrary = do m <- QC.choose (-10,10) s <- QC.choose (1e-1,1e1) d <- arbitrary return $ scaleAround m s d instance QC.Arbitrary FDistribution where arbitrary = fDistribution <$> ((abs <$> arbitrary) `suchThat` (>0)) <*> ((abs <$> arbitrary) `suchThat` (>0)) instance (Arbitrary a, Ord a, RealFrac a) => Arbitrary (PValue a) where arbitrary = do (_::Int,x) <- properFraction <$> arbitrary return $ mkPValue $ abs x instance (Arbitrary a, Ord a, RealFrac a) => Arbitrary (CL a) where arbitrary = do (_::Int,x) <- properFraction <$> arbitrary return $ mkCLFromSignificance $ abs x instance Arbitrary a => Arbitrary (NormalErr a) where arbitrary = NormalErr <$> arbitrary instance Arbitrary a => Arbitrary (ConfInt a) where arbitrary = liftA3 ConfInt arbitrary arbitrary arbitrary instance (Arbitrary (e a), Arbitrary a) => Arbitrary (Estimate e a) where arbitrary = liftA2 Estimate arbitrary arbitrary instance (Arbitrary a) => Arbitrary (UpperLimit a) where arbitrary = liftA2 UpperLimit arbitrary arbitrary instance (Arbitrary a) => Arbitrary (LowerLimit a) where arbitrary = liftA2 LowerLimit arbitrary arbitrary instance QC.Arbitrary DiscreteUniform where arbitrary = discreteUniformAB <$> QC.choose (1,1000) <*> QC.choose(1,1000) statistics-0.16.2.1/tests/Tests/Parametric.hs0000644000000000000000000000650307346545000017215 0ustar0000000000000000module Tests.Parametric (tests) where import Data.Maybe (fromJust) import Statistics.Test.StudentT import Statistics.Types import qualified Data.Vector.Unboxed as U import Test.Tasty (testGroup) import Tests.Helpers (testEquality) import qualified Test.Tasty as Tst tests :: Tst.TestTree tests = testGroup "Parametric tests" studentTTests -- 2 samples x 20 obs data -- -- Both samples are samples from normal distributions with the same variance (= 1.0), -- but their means are different (0.0 and 0.5, respectively). 
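--
-- A small usage sketch of the API exercised below (the 0.05 threshold is
-- illustrative and not part of the test suite; the expected result follows
-- from the p-value 0.03410 checked in 'studentTTests'):
--
-- >>> isSignificant (mkPValue 0.05) . fromJust $ studentTTest SamplesDiffer sample1 sample2
-- Significant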
--
-- You can reproduce the data with R (3.1.0) as follows:
--   set.seed(0)
--   sample1 = rnorm(20)
--   sample2 = rnorm(20, 0.5)
--   student = t.test(sample1, sample2, var.equal=T)
--   welch = t.test(sample1, sample2)
--   paired = t.test(sample1, sample2, paired=T)
sample1, sample2 :: U.Vector Double
sample1 = U.fromList
  [  1.262954284880793e+00, -3.262333607056494e-01,  1.329799262922501e+00
  ,  1.272429321429405e+00,  4.146414344564082e-01, -1.539950041903710e+00
  , -9.285670347135381e-01, -2.947204467905602e-01, -5.767172747536955e-03
  ,  2.404653388857951e+00,  7.635934611404596e-01, -7.990092489893682e-01
  , -1.147657009236351e+00, -2.894615736882233e-01, -2.992151178973161e-01
  , -4.115108327950670e-01,  2.522234481561323e-01, -8.919211272845686e-01
  ,  4.356832993557186e-01, -1.237538421929958e+00
  ]
sample2 = U.fromList
  [  2.757321147216907e-01,  8.773956459817011e-01,  6.333363608148415e-01
  ,  1.304189509744908e+00,  4.428932256161913e-01,  1.003607972233726e+00
  ,  1.585769362145687e+00, -1.909538396968303e-01, -7.845993538721883e-01
  ,  5.467261721883520e-01,  2.642934435604988e-01, -4.288825501025439e-02
  ,  6.668968254321778e-02, -1.494716467962331e-01,  1.226750747385451e+00
  ,  1.651911754087200e+00,  1.492160365445798e+00,  7.048689050811874e-02
  ,  1.738304100853380e+00,  2.206537181457307e-01
  ]

testTTest :: String -> PValue Double -> Test d -> [Tst.TestTree]
testTTest name pVal test =
  [ testEquality name (isSignificant pVal test) NotSignificant
  , testEquality name (isSignificant (mkPValue $ pValue pVal + 1e-5) test)
      Significant
  ]

studentTTests :: [Tst.TestTree]
studentTTests = concat
  [ -- R: t.test(sample1, sample2, alt="two.sided", var.equal=T)
    testTTest "two-sample t-test SamplesDiffer Student" (mkPValue 0.03410)
      (fromJust $ studentTTest SamplesDiffer sample1 sample2)
    -- R: t.test(sample1, sample2, alt="two.sided", var.equal=F)
  , testTTest "two-sample t-test SamplesDiffer Welch" (mkPValue 0.03483)
      (fromJust $ welchTTest SamplesDiffer sample1 sample2)
    -- R: t.test(sample1, sample2, alt="two.sided", paired=T)
  , testTTest "two-sample t-test SamplesDiffer Paired" (mkPValue 0.03411)
      (fromJust $ pairedTTest SamplesDiffer sample12)
    -- R: t.test(sample1, sample2, alt="less", var.equal=T)
  , testTTest "two-sample t-test BGreater Student" (mkPValue 0.01705)
      (fromJust $ studentTTest BGreater sample1 sample2)
    -- R: t.test(sample1, sample2, alt="less", var.equal=F)
  , testTTest "two-sample t-test BGreater Welch" (mkPValue 0.01741)
      (fromJust $ welchTTest BGreater sample1 sample2)
    -- R: t.test(sample1, sample2, alt="less", paired=T)
  , testTTest "two-sample t-test BGreater Paired" (mkPValue 0.01705)
      (fromJust $ pairedTTest BGreater sample12)
  ]
  where
    sample12 = U.zip sample1 sample2
statistics-0.16.2.1/tests/Tests/Quantile.hs0000644000000000000000000000731607346545000016713 0ustar0000000000000000{-# LANGUAGE ViewPatterns #-}
-- |
-- Tests for quantile
module Tests.Quantile (tests) where

import Control.Exception
import qualified Data.Vector.Unboxed as U
import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck hiding (sample)
import Numeric.MathFunctions.Comparison (ulpDelta,ulpDistance)
import Statistics.Quantile

tests :: TestTree
tests = testGroup "Quantiles"
  [ testCase "R alg. 4" $ compareWithR cadpw (0.00, 0.50, 2.50, 8.25, 10.00)
  , testCase "R alg. 5" $ compareWithR hazen (0.00, 1.00, 5.00, 9.00, 10.00)
  , testCase "R alg. 6" $ compareWithR spss (0.00, 0.75, 5.00, 9.25, 10.00)
  , testCase "R alg. 7" $ compareWithR s (0.000, 1.375, 5.000, 8.625,10.00)
  , testCase "R alg.
8" $ compareWithR medianUnbiased (0.0, 0.9166666666666667, 5.000000000000003, 9.083333333333334, 10.0) , testCase "R alg. 9" $ compareWithR normalUnbiased (0.0000, 0.9375, 5.0000, 9.0625, 10.0000) , testProperty "alg 7." propWeigtedAverage -- Test failures , testCase "weightedAvg should throw errors" $ do let xs = U.fromList [1,2,3] xs0 = U.fromList [] shouldError "Empty sample" $ weightedAvg 1 4 xs0 shouldError "N=0" $ weightedAvg 1 0 xs shouldError "N=1" $ weightedAvg 1 1 xs shouldError "k<0" $ weightedAvg (-1) 4 xs shouldError "k>N" $ weightedAvg 5 4 xs , testCase "quantile should throw errors" $ do let xs = U.fromList [1,2,3] xs0 = U.fromList [] shouldError "Empty xs" $ quantile s 1 4 xs0 shouldError "N=0" $ quantile s 1 0 xs shouldError "N=1" $ quantile s 1 1 xs shouldError "k<0" $ quantile s (-1) 4 xs shouldError "k>N" $ quantile s 5 4 xs -- , testProperty "quantiles are OK" propQuantiles , testProperty "quantilesVec are OK" propQuantilesVec ] sample :: U.Vector Double sample = U.fromList [0, 1, 2.5, 7.5, 9, 10] -- Compare quantiles implementation with reference R implementation compareWithR :: ContParam -> (Double,Double,Double,Double,Double) -> Assertion compareWithR p (q0,q1,q2,q3,q4) = do assertEqual "Q 0" q0 $ quantile p 0 4 sample assertEqual "Q 1" q1 $ quantile p 1 4 sample assertEqual "Q 2" q2 $ quantile p 2 4 sample assertEqual "Q 3" q3 $ quantile p 3 4 sample assertEqual "Q 4" q4 $ quantile p 4 4 sample propWeigtedAverage :: Positive Int -> Positive Int -> Property propWeigtedAverage (Positive k) (Positive q) = (q >= 2 && k <= q) ==> let q1 = weightedAvg k q sample q2 = quantile s k q sample in counterexample ("weightedAvg = " ++ show q1) $ counterexample ("quantile = " ++ show q2) $ counterexample ("delta in ulps = " ++ show (ulpDelta q1 q2)) $ ulpDistance q1 q2 <= 16 propQuantiles :: Positive Int -> Int -> Int -> NonEmptyList Double -> Property propQuantiles (Positive n) ((`mod` n) -> k1) ((`mod` n) -> k2) (NonEmpty xs) = n >= 2 ==> [x1,x2] == quantiles s [k1,k2] n rndXs where rndXs = U.fromList xs x1 = quantile s k1 n rndXs x2 = quantile s k2 n rndXs propQuantilesVec :: Positive Int -> Int -> Int -> NonEmptyList Double -> Property propQuantilesVec (Positive n) ((`mod` n) -> k1) ((`mod` n) -> k2) (NonEmpty xs) = n >= 2 ==> U.fromList [x1,x2] == quantilesVec s (U.fromList [k1,k2]) n rndXs where rndXs = U.fromList xs x1 = quantile s k1 n rndXs x2 = quantile s k2 n rndXs shouldError :: String -> a -> Assertion shouldError nm x = do r <- try (evaluate x) case r of Left (ErrorCall{}) -> return () Right _ -> assertFailure ("Should call error: " ++ nm) statistics-0.16.2.1/tests/Tests/Serialization.hs0000644000000000000000000001012607346545000017737 0ustar0000000000000000-- | -- Tests for data serialization instances module Tests.Serialization where import Data.Binary (Binary,decode,encode) import Data.Aeson (FromJSON,ToJSON,Result(..),toJSON,fromJSON) import Data.Typeable import Statistics.Distribution.Beta (BetaDistribution) import Statistics.Distribution.Binomial (BinomialDistribution) import Statistics.Distribution.CauchyLorentz import Statistics.Distribution.ChiSquared (ChiSquared) import Statistics.Distribution.Exponential (ExponentialDistribution) import Statistics.Distribution.FDistribution (FDistribution) import Statistics.Distribution.Gamma (GammaDistribution) import Statistics.Distribution.Geometric import Statistics.Distribution.Hypergeometric import Statistics.Distribution.Laplace (LaplaceDistribution) import Statistics.Distribution.Lognormal (LognormalDistribution) 
import Statistics.Distribution.NegativeBinomial (NegativeBinomialDistribution) import Statistics.Distribution.Normal (NormalDistribution) import Statistics.Distribution.Poisson (PoissonDistribution) import Statistics.Distribution.StudentT import Statistics.Distribution.Transform (LinearTransform) import Statistics.Distribution.Uniform (UniformDistribution) import Statistics.Distribution.Weibull (WeibullDistribution) import Statistics.Types import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) import Test.QuickCheck as QC import Tests.Helpers import Tests.Orphanage () tests :: TestTree tests = testGroup "Test for data serialization" [ serializationTests (T :: T (CL Float)) , serializationTests (T :: T (CL Double)) , serializationTests (T :: T (PValue Float)) , serializationTests (T :: T (PValue Double)) , serializationTests (T :: T (NormalErr Double)) , serializationTests (T :: T (ConfInt Double)) , serializationTests' "T (Estimate NormalErr Double)" (T :: T (Estimate NormalErr Double)) , serializationTests' "T (Estimate ConfInt Double)" (T :: T (Estimate ConfInt Double)) , serializationTests (T :: T (LowerLimit Double)) , serializationTests (T :: T (UpperLimit Double)) -- Distributions , serializationTests (T :: T BetaDistribution ) , serializationTests (T :: T CauchyDistribution ) , serializationTests (T :: T ChiSquared ) , serializationTests (T :: T ExponentialDistribution ) , serializationTests (T :: T GammaDistribution ) , serializationTests (T :: T LaplaceDistribution ) , serializationTests (T :: T LognormalDistribution ) , serializationTests (T :: T NegativeBinomialDistribution ) , serializationTests (T :: T NormalDistribution ) , serializationTests (T :: T UniformDistribution ) , serializationTests (T :: T WeibullDistribution ) , serializationTests (T :: T StudentT ) , serializationTests (T :: T (LinearTransform NormalDistribution)) , serializationTests (T :: T FDistribution ) , serializationTests (T :: T BinomialDistribution ) , serializationTests (T :: T GeometricDistribution ) , serializationTests (T :: T GeometricDistribution0 ) , serializationTests (T :: T HypergeometricDistribution ) , serializationTests (T :: T PoissonDistribution ) ] serializationTests :: (Eq a, Typeable a, Binary a, Show a, Read a, ToJSON a, FromJSON a, Arbitrary a) => T a -> TestTree serializationTests t = serializationTests' (typeName t) t -- Not all types are Typeable, unfortunately serializationTests' :: (Eq a, Binary a, Show a, Read a, ToJSON a, FromJSON a, Arbitrary a) => String -> T a -> TestTree serializationTests' name t = testGroup ("Tests for: " ++ name) [ testProperty "show/read" (p_showRead t) , testProperty "binary" (p_binary t) , testProperty "aeson" (p_aeson t) ] p_binary :: (Eq a, Binary a) => T a -> a -> Bool p_binary _ a = a == (decode . encode) a p_showRead :: (Eq a, Read a, Show a) => T a -> a -> Bool p_showRead _ a = a == (read . show) a p_aeson :: (Eq a, ToJSON a, FromJSON a) => T a -> a -> Bool p_aeson _ a = Data.Aeson.Success a == (fromJSON . 
toJSON) a

statistics-0.16.2.1/tests/Tests/Transform.hs0000644000000000000000000001340307346545000017076 0ustar0000000000000000{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ViewPatterns #-}
module Tests.Transform ( tests ) where

import Data.Bits ((.&.), shiftL)
import Data.Complex (Complex((:+)))
import Numeric.Sum (kbn, sumVector)
import Statistics.Function (within)
import Statistics.Transform (CD, dct, fft, idct, ifft)
import Test.Tasty (TestTree, testGroup)
import Test.Tasty.QuickCheck (testProperty)
import Test.QuickCheck ( Positive(..), Arbitrary(..), Blind(..), (==>), Gen
                       , choose, vectorOf, counterexample, forAll)
import Test.QuickCheck.Property (Property(..))
import Tests.Helpers (testAssertion)
import Text.Printf (printf)
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

tests :: TestTree
tests = testGroup "fft"
  [ testProperty "t_impulse" t_impulse
  , testProperty "t_impulse_offset" t_impulse_offset
  , testProperty "ifft . fft = id" (t_fftInverse $ ifft . fft)
  , testProperty "fft . ifft = id" (t_fftInverse $ fft . ifft)
  , testProperty "idct . dct = id [up to scale]"
      (t_fftInverse (\v -> U.map (/ (2 * fromIntegral (U.length v))) $ idct $ dct v))
  , testProperty "dct . idct = id [up to scale]"
      (t_fftInverse (\v -> U.map (/ (2 * fromIntegral (U.length v))) $ dct $ idct v))
    -- Exact small size DCT
    -- 1
  , testDCT [1] $ [2]
    -- 2
  , testDCT [1,0] $ map (*2) [1, cos (pi/4) ]
  , testDCT [0,1] $ map (*2) [1, cos (3*pi/4) ]
    -- 4
  , testDCT [1,0,0,0] $ map (*2) [1, cos( pi/8), cos( 2*pi/8), cos( 3*pi/8)]
  , testDCT [0,1,0,0] $ map (*2) [1, cos(3*pi/8), cos( 6*pi/8), cos( 9*pi/8)]
  , testDCT [0,0,1,0] $ map (*2) [1, cos(5*pi/8), cos(10*pi/8), cos(15*pi/8)]
  , testDCT [0,0,0,1] $ map (*2) [1, cos(7*pi/8), cos(14*pi/8), cos(21*pi/8)]
    -- Exact small size IDCT
    -- 1
  , testIDCT [1] [1]
    -- 2
  , testIDCT [1,0] [1, 1 ]
  , testIDCT [0,1] $ map (*2) [cos(pi/4), cos(3*pi/4)]
    -- 4
  , testIDCT [1,0,0,0] [1, 1, 1, 1 ]
  , testIDCT [0,1,0,0] $ map (*2) [cos( pi/8), cos( 3*pi/8), cos( 5*pi/8), cos( 7*pi/8) ]
  , testIDCT [0,0,1,0] $ map (*2) [cos( 2*pi/8), cos( 6*pi/8), cos(10*pi/8), cos(14*pi/8) ]
  , testIDCT [0,0,0,1] $ map (*2) [cos( 3*pi/8), cos( 9*pi/8), cos(15*pi/8), cos(21*pi/8) ]
  ]

-- A single real-valued impulse at the beginning of an otherwise zero
-- vector should be replicated in every real component of the result,
-- and all the imaginary components should be zero.
t_impulse :: Double -> Positive Int -> Bool
t_impulse k (Positive m) = U.all (c_near i) (fft v)
  where
    v = i `G.cons` G.replicate (n-1) 0
    i = k :+ 0
    n = 1 `shiftL` (m .&. 6)

-- If a real-valued impulse is offset from the beginning of an
-- otherwise zero vector, the sum-of-squares of each component of the
-- result should equal the square of the impulse.
t_impulse_offset :: Double -> Positive Int -> Positive Int -> Property
t_impulse_offset k (Positive x) (Positive m)
  -- For numbers smaller than 1e-162 their square underflows and the
  -- test fails spuriously
  = abs k >= 1e-100 ==> U.all ok (fft v)
  where
    v = G.concat [G.replicate xn 0, G.singleton i, G.replicate (n-xn-1) 0]
    ok (re :+ im) = within ulps (re*re + im*im) (k*k)
    i = k :+ 0
    xn = x `rem` n
    n = 1 `shiftL` (m .&. 6)

-- Test that (ifft . fft ≈ id)
--
-- Approximate equality here is tricky. Small components of the vector
-- tend to have a large relative error, so rather than comparing
-- componentwise we test that the vectors are approximately equal as a
-- whole.
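--
-- Concretely, writing d = x - roundtrip x, the property below requires
-- vectorNorm d <= 3e-14 * vectorNorm x, i.e. the relative L2 error of
-- the vector as a whole must stay below 3e-14.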
t_fftInverse :: (HasNorm (U.Vector a), U.Unbox a, Num a, Show a, Arbitrary a)
             => (U.Vector a -> U.Vector a) -> Property
t_fftInverse roundtrip =
  forAll (Blind <$> genFftVector) $ \(Blind x) ->
    let n  = G.length x
        x' = roundtrip x
        d  = G.zipWith (-) x x'
        nd = vectorNorm d
        nx = vectorNorm x
    in counterexample "Original vector" $
       counterexample (show x ) $
       counterexample "Transformed one" $
       counterexample (show x') $
       counterexample (printf "Length = %i" n) $
       counterexample (printf "|x - x'| / |x| = %.6g" (nd / nx)) $
       nd <= 3e-14 * nx

-- Test discrete cosine transform
testDCT :: [Double] -> [Double] -> TestTree
testDCT (U.fromList -> vec) (U.fromList -> res)
  = testAssertion ("DCT test for " ++ show vec)
  $ vecEqual 3e-14 (dct vec) res

-- Test inverse discrete cosine transform
testIDCT :: [Double] -> [Double] -> TestTree
testIDCT (U.fromList -> vec) (U.fromList -> res)
  = testAssertion ("IDCT test for " ++ show vec)
  $ vecEqual 3e-14 (idct vec) res

----------------------------------------------------------------

-- With an error tolerance of 8 ULPs, a million QuickCheck tests are
-- likely to all succeed. With a tolerance of 7, we fail around the
-- half million mark.
ulps :: Int
ulps = 8

c_near :: CD -> CD -> Bool
c_near (a :+ b) (c :+ d) = within ulps a c && within ulps b d

-- Arbitrary vector for FFT or DCT
genFftVector :: (U.Unbox a, Arbitrary a) => Gen (U.Vector a)
genFftVector = do
  n <- (2^) <$> choose (1,9::Int)     -- Size of vector
  G.fromList <$> vectorOf n arbitrary -- Vector to transform

-- Ad-hoc type class for calculation of vector norm
class HasNorm a where
  vectorNorm :: a -> Double

instance HasNorm (U.Vector Double) where
  vectorNorm = sqrt . sumVector kbn . U.map (\x -> x*x)

instance HasNorm (U.Vector CD) where
  vectorNorm = sqrt . sumVector kbn . U.map (\(x :+ y) -> x*x + y*y)

-- Approximate equality for vectors
vecEqual :: Double -> U.Vector Double -> U.Vector Double -> Bool
vecEqual ε v u = vectorNorm (U.zipWith (-) v u) < ε * vectorNorm v
statistics-0.16.2.1/tests/0000755000000000000000000000000007346545000013464 5ustar0000000000000000statistics-0.16.2.1/tests/tests.hs0000644000000000000000000000127607346545000015170 0ustar0000000000000000import Test.Tasty (defaultMain,testGroup)

import qualified Tests.Distribution
import qualified Tests.Function
import qualified Tests.KDE
import qualified Tests.Matrix
import qualified Tests.NonParametric
import qualified Tests.Parametric
import qualified Tests.Transform
import qualified Tests.Correlation
import qualified Tests.Serialization
import qualified Tests.Quantile

main :: IO ()
main = defaultMain $ testGroup "statistics"
  [ Tests.Distribution.tests
  , Tests.Function.tests
  , Tests.KDE.tests
  , Tests.Matrix.tests
  , Tests.NonParametric.tests
  , Tests.Parametric.tests
  , Tests.Transform.tests
  , Tests.Correlation.tests
  , Tests.Serialization.tests
  , Tests.Quantile.tests
  ]
statistics-0.16.2.1/tests/utils/0000755000000000000000000000000007346545000014624 5ustar0000000000000000statistics-0.16.2.1/tests/utils/Makefile0000644000000000000000000000017007346545000016262 0ustar0000000000000000CC      = gcc
CFLAGS  = -W -Wall -O2 -std=c99
LDFLAGS = -lfftw3

.PHONY: all clean

all : fftw

clean :
	rm -rf fftw *.o
statistics-0.16.2.1/tests/utils/fftw.c0000644000000000000000000000200607346545000015734 0ustar0000000000000000/* Generate some test cases using fftw3 */
#include <stdio.h>
#include <stdlib.h>
#include <fftw3.h>

void dump_vector(int n, double* vec) {
    for(int i = 0; i < n; i++)
        printf("%20.15f ", vec[i]);
    printf("\n");
}

void dct(int flag, int n) {
    double* in = malloc( n * sizeof(double));
    double* out = malloc( n * sizeof(double));
    //
    fftw_plan plan = fftw_plan_r2r_1d(n, in, out, flag, FFTW_ESTIMATE);
    for( int k = 0; k < n; k++) {
        // Init input vector
        for( int i = 0; i < n; i++)
            in[i] = 0;
        in[k] = 1;
        // Perform DFT
        fftw_execute(plan);
        // Print results
        dump_vector(n, in );
        dump_vector(n, out);
        printf("\n");
    }
    //
    free(in);
    free(out);
    fftw_destroy_plan(plan);
}

int main(void) {
    printf("DCT II (the DCT)\n");
    dct( FFTW_REDFT10, 2);
    dct( FFTW_REDFT10, 4);
    printf("DCT III (Inverse DCT)\n");
    dct( FFTW_REDFT01, 2);
    dct( FFTW_REDFT01, 4);
    return 0;
}
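
/* Usage sketch (assuming fftw3 and its headers are installed): building
 * with the Makefile above (`make`) and running `./fftw` prints, for each
 * unit impulse e_k, the input row followed by its DCT-II / DCT-III
 * transform.  These rows correspond to the exact small-size DCT/IDCT
 * expectations hard-coded in tests/Tests/Transform.hs. */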