PDL-Stats-0.84/0000755000175000017500000000000014625061425013113 5ustar osboxesosboxesPDL-Stats-0.84/Basic/0000755000175000017500000000000014625061425014134 5ustar osboxesosboxesPDL-Stats-0.84/Basic/Makefile.PL0000644000175000017500000000032714126063750016107 0ustar osboxesosboxesuse ExtUtils::MakeMaker; use PDL::Core::Dev; $package = ["stats_basic.pd",Basic,PDL::Stats::Basic,undef,1]; %hash = pdlpp_stdargs($package); WriteMakefile( %hash ); sub MY::postamble {pdlpp_postamble($package)}; PDL-Stats-0.84/Basic/stats_basic.pd0000644000175000017500000010346214544645567017005 0ustar osboxesosboxespp_add_exported('', 'binomial_test', 'rtable', 'which_id', ); pp_addpm({At=>'Top'}, <<'EOD'); use PDL::LiteF; use PDL::NiceSlice; use Carp; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; =head1 NAME PDL::Stats::Basic -- basic statistics and related utilities such as standard deviation, Pearson correlation, and t-tests. =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. Does not have mean or median function here. see SEE ALSO. =head1 SYNOPSIS use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::Basic; my $stdv = $data->stdv; or my $stdv = stdv( $data ); =cut EOD pp_addhdr(' #include ' ); pp_def('stdv', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} $b() = sqrt( a2 / N - pow(sa/N,2) ); ', BadCode => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { $b() = sqrt( a2 / N - pow(sa/N,2) ); } else { $SETBAD(b()); } ', Doc => ' =for ref Sample standard deviation. 
=cut ', ); pp_def('stdv_unbiased', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} $b() = pow( a2/(N-1) - pow(sa/N,2) * N/(N-1), .5 ); ', BadCode => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N-1) { $b() = pow( a2/(N-1) - pow(sa/N,2) * N/(N-1), .5 ); } else { $SETBAD(b()); } ', Doc => ' =for ref Unbiased estimate of population standard deviation. =cut ', ); pp_def('var', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) a2, sa; a2 = 0; sa = 0; PDL_Indx N = $SIZE(n); loop (n) %{ a2 += pow($a(), 2); sa += $a(); %} $b() = a2 / N - pow(sa/N, 2); ', BadCode => ' $GENERIC(b) a2, sa; a2 = 0; sa = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { a2 += pow($a(), 2); sa += $a(); N ++; } %} if (N) { $b() = a2 / N - pow(sa/N, 2); } else { $SETBAD(b()); } ', Doc => ' =for ref Sample variance. =cut ', ); pp_def('var_unbiased', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) a2, sa; a2 = 0; sa = 0; PDL_Indx N = $SIZE(n); loop (n) %{ a2 += pow($a(), 2); sa += $a(); %} $b() = (a2 - pow(sa/N, 2) * N) / (N-1); ', BadCode => ' $GENERIC(b) a2, sa; a2 = 0; sa = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { a2 += pow($a(), 2); sa += $a(); N ++; } %} if (N-1) { $b() = (a2 - pow(sa/N, 2) * N) / (N-1); } else { $SETBAD(b()); } ', Doc => ' =for ref Unbiased estimate of population variance. 
=cut ', ); pp_def('se', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} $b() = sqrt( (a2/(N-1) - pow(sa/N,2) * N/(N-1)) / N ); ', BadCode => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N-1) { $b() = sqrt( (a2/(N-1) - pow(sa/N,2) * N/(N-1)) / N ); } else { $SETBAD(b()); } ', Doc => ' =for ref Standard error of the mean. Useful for calculating confidence intervals. =for usage # 95% confidence interval for samples with large N $ci_95_upper = $data->average + 1.96 * $data->se; $ci_95_lower = $data->average - 1.96 * $data->se; =cut ', ); pp_def('ss', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} $b() = a2 - N * pow(sa/N,2); ', BadCode => ' $GENERIC(b) sa, a2; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { $b() = a2 - N * pow(sa/N,2); } else { $SETBAD(b()); } ', Doc => ' =for ref Sum of squared deviations from the mean. =cut ', ); pp_def('skew', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, m, d, d2, d3; sa = 0; d2 = 0; d3 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} m = sa / N; loop (n) %{ d = $a() - m; d2 += pow(d, 2); d3 += pow(d, 3); %} $b() = d3/N / pow(d2/N, 1.5); ', BadCode => ' $GENERIC(b) sa, m, d, d2, d3; sa = 0; m = 0; d=0; d2 = 0; d3 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); N ++; } %} if (N) { m = sa / N; loop (n) %{ if ( $ISGOOD($a()) ) { d = $a() - m; d2 += pow(d, 2); d3 += pow(d, 3); } %} $b() = d3/N / pow(d2/N, 1.5); } else { $SETBAD(b()); } ', Doc => ' =for ref Sample skewness, measure of asymmetry in data. 
skewness == 0 for normal distribution. =cut ', ); pp_def('skew_unbiased', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, m, d, d2, d3; sa = 0; d2 = 0; d3 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} m = sa / N; loop (n) %{ d = $a() - m; d2 += pow(d, 2); d3 += pow(d, 3); %} $b() = sqrt(N*(N-1)) / (N-2) * d3/N / pow(d2/N, 1.5); ', BadCode => ' $GENERIC(b) sa, m, d, d2, d3; sa = 0; m = 0; d=0; d2 = 0; d3 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); N ++; } %} if (N-2) { m = sa / N; loop (n) %{ if ( $ISGOOD($a()) ) { d = $a() - m; d2 += pow(d, 2); d3 += pow(d, 3); } %} $b() = sqrt(N*(N-1)) / (N-2) * d3/N / pow(d2/N, 1.5); } else { $SETBAD(b()); } ', Doc => ' =for ref Unbiased estimate of population skewness. This is the number in GNumeric Descriptive Statistics. =cut ', ); pp_def('kurt', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, m, d, d2, d4; sa = 0; d2 = 0; d4 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} m = sa / N; loop (n) %{ d = $a() - m; d2 += pow(d, 2); d4 += pow(d, 4); %} $b() = N * d4 / pow(d2,2) - 3; ', BadCode => ' $GENERIC(b) sa, m, d, d2, d4; sa = 0; m = 0; d=0; d2 = 0; d4 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); N ++; } %} if (N) { m = sa / N; loop (n) %{ if ( $ISGOOD($a()) ) { d = $a() - m; d2 += pow(d, 2); d4 += pow(d, 4); } %} $b() = N * d4 / pow(d2,2) - 3; } else { $SETBAD(b()); } ', Doc => ' =for ref Sample kurtosis, measure of "peakedness" of data. kurtosis == 0 for normal distribution. 
=cut ', ); pp_def('kurt_unbiased', Pars => 'a(n); float+ [o]b()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, m, d, d2, d4; sa = 0; d2 = 0; d4 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} m = sa / N; loop (n) %{ d = $a() - m; d2 += pow(d, 2); d4 += pow(d, 4); %} $b() = ((N-1)*N*(N+1) * d4 / pow(d2,2) - 3 * pow(N-1,2)) / ((N-2)*(N-3)); ', BadCode => ' $GENERIC(b) sa, m, d, d2, d4; sa = 0; m = 0; d=0; d2 = 0; d4 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); N ++; } %} if (N-3) { m = sa / N; loop (n) %{ if ( $ISGOOD($a()) ) { d = $a() - m; d2 += pow(d, 2); d4 += pow(d, 4); } %} $b() = ((N-1)*N*(N+1) * d4 / pow(d2,2) - 3 * pow(N-1,2)) / ((N-2)*(N-3)); } else { $SETBAD(b()); } ', Doc => ' =for ref Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics. =cut ', ); pp_def('cov', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) ab, sa, sb; ab = 0; sa = 0; sb = 0; PDL_Indx N = $SIZE(n); loop (n) %{ ab += $a() * $b(); sa += $a(); sb += $b(); %} $c() = ab / N - (sa/N) * (sb/N); ', BadCode => ' $GENERIC(c) ab, sa, sb; ab = 0; sa = 0; sb = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { ab += $a() * $b(); sa += $a(); sb += $b(); N ++; } %} if (N) { $c() = ab / N - (sa/N) * (sb/N); } else { $SETBAD(c()); } ', Doc => ' =for ref Sample covariance. 
see B for ways to call =cut ', ); pp_def('cov_table', Pars => 'a(n,m); float+ [o]c(m,m)', HandleBad => 1, Code => ' PDL_Indx N, M; N = $SIZE(n); M = $SIZE(m); $GENERIC(a) a_, b_; $GENERIC(c) ab, sa, sb, cov; if (N > 1 ) { PDL_Indx i, j; for (i=0; in,m=>i); b_ = $a(n=>n,m=>j); ab += a_ * b_; sa += a_; sb += b_; %} cov = ab - (sa * sb) / N; $c(m0=>i, m1=>j) = $c(m0=>j, m1=>i) = cov / N; } } } else { barf( "too few N" ); } ', BadCode => ' if ($SIZE(n) >= 2 ) { $GENERIC(a) a_, b_; $GENERIC(c) ab, sa, sb, cov; PDL_Indx N, M, i, j; M = $SIZE(m); for (i=0; in, m=>i)) || $ISBAD($a(n=>n, m=>j))) { } else { a_ = $a(n=>n,m=>i); b_ = $a(n=>n,m=>j); ab += a_ * b_; sa += a_; sb += b_; N ++; } %} if (N > 1) { cov = ab - (sa * sb) / N; $c(m0=>i, m1=>j) = $c(m0=>j, m1=>i) = cov / N; } else { $SETBAD($c(m0=>i, m1=>j)); $SETBAD($c(m0=>j, m1=>i)); } } } } else { barf( "too few N" ); } ', Doc => ' =for ref Square covariance table. Gives the same result as threading using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. 
=for usage Usage: # 5 obs x 3 var, 2 such data tables perldl> $a = random 5, 3, 2 perldl> p $cov = $a->cov_table [ [ [ 8.9636438 -1.8624472 -1.2416588] [-1.8624472 14.341514 -1.4245366] [-1.2416588 -1.4245366 9.8690655] ] [ [ 10.32644 -0.31311789 -0.95643674] [-0.31311789 15.051779 -7.2759577] [-0.95643674 -7.2759577 5.4465141] ] ] # diagonal elements of the cov table are the variances perldl> p $a->var [ [ 8.9636438 14.341514 9.8690655] [ 10.32644 15.051779 5.4465141] ] for the same cov matrix table using B, perldl> p $a->dummy(2)->cov($a->dummy(1)) =cut ', ); pp_def('corr', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb; ab=0; sa=0; sb=0; a2=0; b2=0; cov=0; va=0; vb=0; PDL_Indx N = $SIZE(n); if (N > 1 ) { loop (n) %{ ab += $a() * $b(); sa += $a(); sb += $b(); a2 += pow($a(), 2); b2 += pow($b(), 2); %} /* in fact cov * N, va * N, and vb * N */ cov = ab - (sa * sb) / N; va = a2 - pow(sa,2) / N; vb = b2 - pow(sb,2) / N; $c() = cov / sqrt( va * vb ); } else { barf( "too few N" ); } ', BadCode => ' $GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb; ab=0; sa=0; sb=0; a2=0; b2=0; PDL_Indx N = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { ab += $a() * $b(); sa += $a(); sb += $b(); a2 += pow($a(), 2); b2 += pow($b(), 2); N ++; } %} if ( N > 1 ) { cov = ab - (sa * sb) / N; va = a2 - pow(sa,2) / N; vb = b2 - pow(sb,2) / N; $c() = cov / sqrt( va * vb ); } else { $SETBAD(c()); } ', Doc => ' =for ref Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)). =for usage Usage: perldl> $a = random 5, 3 perldl> $b = sequence 5,3 perldl> p $a->corr($b) [0.20934208 0.30949881 0.26713007] for square corr table perldl> p $a->corr($a->dummy(1)) [ [ 1 -0.41995259 -0.029301192] [ -0.41995259 1 -0.61927619] [-0.029301192 -0.61927619 1] ] but it is easier and faster to use B. 
=cut ', ); pp_def('corr_table', Pars => 'a(n,m); float+ [o]c(m,m)', HandleBad => 1, Code => ' PDL_Indx N, M; N = $SIZE(n); M = $SIZE(m); $GENERIC(a) a_, b_; $GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb, r; if (N > 1 ) { PDL_Indx i, j; for (i=0; in,m=>i); b_ = $a(n=>n,m=>j); ab += a_ * b_; sa += a_; sb += b_; a2 += pow(a_, 2); b2 += pow(b_, 2); %} cov = ab - (sa * sb) / N; va = a2 - pow(sa,2) / N; vb = b2 - pow(sb,2) / N; r = cov / sqrt( va * vb ); $c(m0=>i, m1=>j) = $c(m0=>j, m1=>i) = r; } $c(m0=>i, m1=>i) = 1.0; } } else { barf( "too few N" ); } ', BadCode => ' if ($SIZE(n) >= 2 ) { $GENERIC(a) a_, b_; $GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb, r; PDL_Indx N, M, i, j; M = $SIZE(m); for (i=0; in, m=>i)) || $ISBAD($a(n=>n, m=>j))) { } else { a_ = $a(n=>n,m=>i); b_ = $a(n=>n,m=>j); ab += a_ * b_; sa += a_; sb += b_; a2 += pow(a_, 2); b2 += pow(b_, 2); N ++; } %} if (N > 1) { cov = ab - (sa * sb) / N; va = a2 - pow(sa,2) / N; vb = b2 - pow(sb,2) / N; r = cov / sqrt( va * vb ); $c(m0=>i, m1=>j) = $c(m0=>j, m1=>i) = r; } else { $SETBAD($c(m0=>i, m1=>j)); $SETBAD($c(m0=>j, m1=>i)); } } N=0; loop (n) %{ if ($ISGOOD($a(n=>n,m=>i))) N ++; if (N > 1) break; %} if (N > 1) { $c(m0=>i, m1=>i) = 1.0; } else { $SETBAD($c(m0=>i, m1=>i)); } } } else { barf( "too few N" ); } ', Doc => ' =for ref Square Pearson correlation table. Gives the same result as threading using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. =for usage Usage: # 5 obs x 3 var, 2 such data tables perldl> $a = random 5, 3, 2 perldl> p $a->corr_table [ [ [ 1 -0.69835951 -0.18549048] [-0.69835951 1 0.72481605] [-0.18549048 0.72481605 1] ] [ [ 1 0.82722569 -0.71779883] [ 0.82722569 1 -0.63938828] [-0.71779883 -0.63938828 1] ] ] for the same result using B, perldl> p $a->dummy(2)->corr($a->dummy(1)) This is also how to use B and B with such a table. 
=cut ', ); pp_def('t_corr', Pars => 'r(); n(); [o]t()', GenericTypes => [F, D], HandleBad => 1, Code => ' $t() = $r() / pow( (1 - pow($r(), 2)) / ($n() - 2) , .5); ', BadCode => ' if ($ISBAD(r()) || $ISBAD(n()) ) { $SETBAD( $t() ); } else { if ($n() > 2) { $t() = $r() / pow( (1 - pow($r(), 2)) / ($n() - 2) , .5); } else { $SETBAD(t()); } } ', Doc => ' =for usage $corr = $data->corr( $data->dummy(1) ); $n = $data->n_pair( $data->dummy(1) ); $t_corr = $corr->t_corr( $n ); use PDL::GSL::CDF; $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t_corr->abs, $n-2 )); =for ref t significance test for Pearson correlations. =cut ', ); pp_def('n_pair', Pars => 'a(n); b(n); indx [o]c()', GenericTypes => [qw/L Q/], HandleBad => 1, Code => ' $c() = $SIZE(n); ', BadCode => ' PDL_Indx N = 0; loop(n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { N ++; } %} $c() = N; ', Doc => ' =for ref Returns the number of good pairs between 2 lists. Useful with B (esp. when bad values are involved) =cut ', ); pp_def('corr_dev', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) ab, a2, b2, cov, va, vb; ab = 0; a2 = 0; b2 = 0; PDL_Indx N = $SIZE(n); if (N > 1) { loop (n) %{ ab += $a() * $b(); a2 += pow($a(), 2); b2 += pow($b(), 2); %} cov = ab / N; va = a2 / N; vb = b2 / N; $c() = cov / sqrt( va * vb ); } else { barf( "too few N" ); } ', BadCode => ' $GENERIC(c) ab, a2, b2, cov, va, vb; ab = 0; a2 = 0; b2 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { ab += $a() * $b(); a2 += pow($a(), 2); b2 += pow($b(), 2); N ++; } %} if (N > 1) { cov = ab / N; va = a2 / N; vb = b2 / N; $c() = cov / sqrt( va * vb ); } else { $SETBAD(c()); } ', Doc => ' =for usage $corr = $a->dev_m->corr_dev($b->dev_m); =for ref Calculates correlations from B vals. 
Seems faster than doing B from original vals when data pdl is big =cut ', ); pp_def('t_test', Pars => 'a(n); b(m); float+ [o]t(); [o]d()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(t) N, M, sa, sb, a2, b2, va, vb, sdiff; sa = 0; sb = 0; a2 = 0; b2 = 0; N = $SIZE(n); M = $SIZE(m); if (N < 2 || M < 2) { barf( "too few N" ); } else { loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} loop (m) %{ sb += $b(); b2 += pow($b(), 2); %} $d() = N + M - 2; va = (a2 - pow(sa/N, 2) * N) / (N-1); vb = (b2 - pow(sb/M, 2) * M) / (M-1); sdiff = sqrt( (1/N + 1/M) * ((N-1)*va + (M-1)*vb) / $d() ); $t() = (sa/N - sb/M) / sdiff; } ', BadCode => ' $GENERIC(t) N, M, sa, sb, a2, b2, va, vb, sdiff; sa = 0; sb = 0; a2 = 0; b2 = 0; N = 0; M = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); a2 += pow($a(), 2); N ++; } %} loop (m) %{ if ( $ISGOOD($b()) ) { sb += $b(); b2 += pow($b(), 2); M ++; } %} if (N < 2 || M < 2) { $SETBAD($t()); $SETBAD($d()); } else { $d() = N + M - 2; va = (a2 - pow(sa/N, 2) * N) / (N-1); vb = (b2 - pow(sb/M, 2) * M) / (M-1); sdiff = sqrt( (1/N + 1/M) * ((N-1)*va + (M-1)*vb) / $d() ); $t() = (sa/N - sb/M) / sdiff; } ', Doc => ' =for usage my ($t, $df) = t_test( $pdl1, $pdl2 ); use PDL::GSL::CDF; my $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t->abs, $df )); =for ref Independent sample t-test, assuming equal var. 
=cut ', ); pp_def('t_test_nev', Pars => 'a(n); b(m); float+ [o]t(); [o]d()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(t) N, M, sa, sb, a2, b2, se_a_2, se_b_2, sdiff; sa = 0; sb = 0; a2 = 0; b2 = 0; N = $SIZE(n); M = $SIZE(m); if (N < 2 || M < 2) { barf( "too few N" ); } else { loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} loop (m) %{ sb += $b(); b2 += pow($b(), 2); %} se_a_2 = (a2 - pow(sa/N,2)*N) / (N*(N-1)); se_b_2 = (b2 - pow(sb/M,2)*M) / (M*(M-1)); sdiff = sqrt( se_a_2 + se_b_2 ); $t() = (sa/N - sb/M) / sdiff; $d() = pow(se_a_2 + se_b_2, 2) / ( pow(se_a_2,2) / (N-1) + pow(se_b_2,2) / (M-1) ) ; } ', BadCode => ' $GENERIC(t) N, M, sa, sb, a2, b2, se_a_2, se_b_2, sdiff; sa = 0; sb = 0; a2 = 0; b2 = 0; N = 0; M = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); a2 += pow($a(), 2); N ++; } %} loop (m) %{ if ( $ISGOOD($b()) ) { sb += $b(); b2 += pow($b(), 2); M ++; } %} if (N < 2 || M < 2) { $SETBAD($t()); $SETBAD($d()); } else { se_a_2 = (a2 - pow(sa/N,2)*N) / (N*(N-1)); se_b_2 = (b2 - pow(sb/M,2)*M) / (M*(M-1)); sdiff = sqrt( se_a_2 + se_b_2 ); $t() = (sa/N - sb/M) / sdiff; $d() = pow(se_a_2 + se_b_2, 2) / ( pow(se_a_2,2) / (N-1) + pow(se_b_2,2) / (M-1) ) ; } ', Doc => ' =for ref Independent sample t-test, NOT assuming equal var. ie Welch two sample t test. Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R. 
=for usage my ($t, $df) = $pdl1->t_test( $pdl2 ); =cut ', ); pp_def('t_test_paired', Pars => 'a(n); b(n); float+ [o]t(); [o]d()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(t) N, diff, s_dif, diff2; s_dif = 0; diff2 = 0; N = $SIZE(n); if (N > 1) { loop (n) %{ diff = $a() - $b(); s_dif += diff; diff2 += pow(diff, 2); %} $d() = N - 1; $t() = s_dif / sqrt( N * ( diff2 - pow(s_dif/N,2)*N ) / (N-1) ); } else { barf( "too few N" ); } ', BadCode => ' $GENERIC(t) N, diff, s_dif, diff2; s_dif = 0; diff2 = 0; N = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { diff = $a() - $b(); s_dif += diff; diff2 += pow(diff, 2); N ++; } %} if (N > 1) { $d() = N - 1; $t() = s_dif / sqrt( N * ( diff2 - pow(s_dif/N,2)*N ) / (N-1) ); } else { $SETBAD($t()); $SETBAD($d()); } ', Doc => ' =for ref Paired sample t-test. =cut ', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 binomial_test =for Sig Signature: (x(); n(); p_expected(); [o]p()) =for ref Binomial test. One-tailed significance test for two-outcome distribution. Given the number of successes, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes. This function does NOT currently support bad value in the number of successes. =for usage Usage: # assume a fair coin, ie. 0.5 probablity of getting heads # test whether getting 8 heads out of 10 coin flips is unusual my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual. =cut *binomial_test = \&PDL::binomial_test; sub PDL::binomial_test { my ($x, $n, $P) = @_; carp 'Please install PDL::GSL::CDF.' unless $CDF; carp 'This function does NOT currently support bad value in the number of successes.' 
if $x->badflag(); my $pdlx = pdl($x); $pdlx->badflag(1); $pdlx = $pdlx->setvaltobad(0); my $p = 1 - PDL::GSL::CDF::gsl_cdf_binomial_P( $pdlx - 1, $P, $n ); $p = $p->setbadtoval(1); $p->badflag(0); return $p; } =head1 METHODS =head2 rtable =for ref Reads either file or file handle*. Returns observation x variable pdl and var and obs ids if specified. Ids in perl @ ref to allow for non-numeric ids. Other non-numeric entries are treated as missing, which are filled with $opt{MISSN} then set to BAD*. Can specify num of data rows to read from top but not arbitrary range. *If passed handle, it will not be closed here. =for options Default options (case insensitive): V => 1, # verbose. prints simple status TYPE => double, C_ID => 1, # boolean. file has col id. R_ID => 1, # boolean. file has row id. R_VAR => 0, # boolean. set to 1 if var in rows SEP => "\t", # can take regex qr// MISSN => -999, # this value treated as missing and set to BAD NROW => '', # set to read specified num of data rows =for usage Usage: Sample file diet.txt: uid height weight diet akw 72 320 1 bcm 68 268 1 clq 67 180 2 dwm 70 200 2 ($data, $idv, $ido) = rtable 'diet.txt'; # By default prints out data info and @$idv index and element reading diet.txt for data and id... OK. data table as PDL dim o x v: PDL: Double D [4,3] 0 height 1 weight 2 diet Another way of using it, $data = rtable( \*STDIN, {TYPE=>long} ); =cut sub rtable { # returns obs x var data matrix and var and obs ids my ($src, $opt) = @_; my $fh_in; if ($src =~ /STDIN/ or ref $src eq 'GLOB') { $fh_in = $src } else { open $fh_in, $src or croak "$!" } my %opt = ( V => 1, TYPE => double, C_ID => 1, R_ID => 1, R_VAR => 0, SEP => "\t", MISSN => -999, NROW => '', ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $opt{V} and print STDERR "reading $src for data and id... 
"; local $PDL::undefval = $opt{MISSN}; my $id_c = []; # match declaration of $id_r for return purpose if ($opt{C_ID}) { chomp( $id_c = <$fh_in> ); my @entries = split $opt{SEP}, $id_c; $opt{R_ID} and shift @entries; $id_c = \@entries; } my ($c_row, $id_r, $data, @data) = (0, [], PDL->null, ); while (<$fh_in>) { chomp; my @entries = split /$opt{SEP}/, $_, -1; $opt{R_ID} and push @$id_r, shift @entries; # rudimentary check for numeric entry for (@entries) { $_ = $opt{MISSN} unless defined $_ and m/\d\b/ } push @data, pdl( $opt{TYPE}, \@entries ); $c_row ++; last if $opt{NROW} and $c_row == $opt{NROW}; } # not explicitly closing $fh_in here in case it's passed from outside # $fh_in will close by going out of scope if opened here. $data = pdl $opt{TYPE}, @data; @data = (); # rid of last col unless there is data there $data = $data(0:$data->getdim(0)-2, )->sever unless ( nelem $data(-1, )->where($data(-1, ) != $opt{MISSN}) ); my ($idv, $ido) = ($id_r, $id_c); # var in columns instead of rows $opt{R_VAR} == 0 and ($data, $idv, $ido) = ($data->inplace->transpose, $id_c, $id_r); if ($opt{V}) { print STDERR "OK.\ndata table as PDL dim o x v: " . $data->info . "\n"; $idv and print STDERR "$_\t$$idv[$_]\n" for (0..$#$idv); } $data = $data->setvaltobad( $opt{MISSN} ); $data->check_badflag; return wantarray? (@$idv? ($data, $idv, $ido) : ($data, $ido)) : $data; } =head2 group_by Returns pdl reshaped according to the specified factor variable. Most useful when used in conjunction with other threading calculations such as average, stdv, etc. When the factor variable contains unequal number of cases in each level, the returned pdl is padded with bad values to fit the level with the most number of cases. This allows the subsequent calculation (average, stdv, etc) to return the correct results for each level. 
Usage: # simple case with 1d pdl and equal number of n in each level of the factor pdl> p $a = sequence 10 [0 1 2 3 4 5 6 7 8 9] pdl> p $factor = $a > 4 [0 0 0 0 0 1 1 1 1 1] pdl> p $a->group_by( $factor )->average [2 7] # more complex case with threading and unequal number of n across levels in the factor pdl> p $a = sequence 10,2 [ [ 0 1 2 3 4 5 6 7 8 9] [10 11 12 13 14 15 16 17 18 19] ] pdl> p $factor = qsort $a( ,0) % 3 [ [0 0 0 0 1 1 1 2 2 2] ] pdl> p $a->group_by( $factor ) [ [ [ 0 1 2 3] [10 11 12 13] ] [ [ 4 5 6 BAD] [ 14 15 16 BAD] ] [ [ 7 8 9 BAD] [ 17 18 19 BAD] ] ] ARRAY(0xa2a4e40) # group_by supports perl factors, multiple factors # returns factor labels in addition to pdl in array context pdl> p $a = sequence 12 [0 1 2 3 4 5 6 7 8 9 10 11] pdl> $odd_even = [qw( e o e o e o e o e o e o )] pdl> $magnitude = [qw( l l l l l l h h h h h h )] pdl> ($a_grouped, $label) = $a->group_by( $odd_even, $magnitude ) pdl> p $a_grouped [ [ [0 2 4] [1 3 5] ] [ [ 6 8 10] [ 7 9 11] ] ] pdl> p Dumper $label $VAR1 = [ [ 'e_l', 'o_l' ], [ 'e_h', 'o_h' ] ]; =cut *group_by = \&PDL::group_by; sub PDL::group_by { my $p = shift; my @factors = @_; if ( @factors == 1 ) { my $factor = $factors[0]; my $label; if (ref $factor eq 'ARRAY') { $label = _ordered_uniq($factor); $factor = _array_to_pdl($factor); } else { my $perl_factor = [$factor->list]; $label = _ordered_uniq($perl_factor); } my $p_reshaped = _group_by_single_factor( $p, $factor ); return wantarray? ($p_reshaped, $label) : $p_reshaped; } # make sure all are arrays instead of pdls @factors = map { ref($_) eq 'PDL'? [$_->list] : $_ } @factors; my (@cells); for my $ele (0 .. $#{$factors[0]}) { my $c = join '_', map { $_->[$ele] } @factors; push @cells, $c; } # get uniq cell labels (ref List::MoreUtils::uniq) my %seen; my @uniq_cells = grep {! 
$seen{$_}++ } @cells; my $flat_factor = _array_to_pdl( \@cells ); my $p_reshaped = _group_by_single_factor( $p, $flat_factor ); # get levels of each factor and reshape accordingly my @levels; for (@factors) { my %uniq; @uniq{ @$_ } = (); push @levels, scalar keys %uniq; } $p_reshaped = $p_reshaped->reshape( $p_reshaped->dim(0), @levels )->sever; # make labels for the returned data structure matching pdl structure my @labels; if (wantarray) { for my $ifactor (0 .. $#levels) { my @factor_label; for my $ilevel (0 .. $levels[$ifactor]-1) { my $i = $ifactor * $levels[$ifactor] + $ilevel; push @factor_label, $uniq_cells[$i]; } push @labels, \@factor_label; } } return wantarray? ($p_reshaped, \@labels) : $p_reshaped; } # get uniq cell labels (ref List::MoreUtils::uniq) sub _ordered_uniq { my $arr = shift; my %seen; my @uniq = grep { ! $seen{$_}++ } @$arr; return \@uniq; } sub _group_by_single_factor { my $p = shift; my $factor = shift; $factor = $factor->squeeze; die "Currently support only 1d factor pdl." if $factor->ndims > 1; die "Data pdl and factor pdl do not match!" unless $factor->dim(0) == $p->dim(0); # get active dim that will be split according to factor and dims to thread over my @p_threaddims = $p->dims; my $p_dim0 = shift @p_threaddims; my $uniq = $factor->uniq; my @uniq_ns; for ($uniq->list) { push @uniq_ns, which( $factor == $_ )->nelem; } # get number of n's in each group, find the biggest, fit output pdl to this my $uniq_ns = pdl \@uniq_ns; my $max = pdl(\@uniq_ns)->max->sclr; my $badvalue = int($p->max + 1); my $p_tmp = ones($max, @p_threaddims, $uniq->nelem) * $badvalue; for (0 .. $#uniq_ns) { my $i = which $factor == $uniq($_); $p_tmp->dice_axis(-1,$_)->squeeze->(0:$uniq_ns[$_]-1, ) .= $p($i, ); } $p_tmp->badflag(1); return $p_tmp->setvaltobad($badvalue); } =head2 which_id =for ref Lookup specified var (obs) ids in $idv ($ido) (see B) and return indices in $idv ($ido) as pdl if found. The indices are ordered by the specified subset. 
Useful for selecting data by var (obs) id. =for usage my $ind = which_id $ido, ['smith', 'summers', 'tesla']; my $data_subset = $data( $ind, ); # take advantage of perl pattern matching # e.g. use data from people whose last name starts with s my $i = which_id $ido, [ grep { /^s/ } @$ido ]; my $data_s = $data($i, ); =cut sub which_id { my ($id, $id_s) = @_; my %ind; @ind{ @$id } = ( 0 .. $#$id ); my @ind_select; for (@$id_s) { defined( $ind{$_} ) and push @ind_select, $ind{$_}; } return pdl @ind_select; } sub _array_to_pdl { my ($var_ref) = @_; $var_ref = [ $var_ref->list ] if UNIVERSAL::isa($var_ref, 'PDL'); my (%level, $l); $l = 0; for (@$var_ref) { if (defined($_) and $_ ne '' and $_ ne 'BAD') { $level{$_} = $l ++ if !exists $level{$_}; } } my $pdl = pdl( map { (defined($_) and $_ ne '' and $_ ne 'BAD')? $level{$_} : -1 } @$var_ref ); $pdl = $pdl->setvaltobad(-1); $pdl->check_badflag; return wantarray? ($pdl, \%level) : $pdl; } =head1 SEE ALSO PDL::Basic (hist for frequency counts) PDL::Ufunc (sum, avg, median, min, max, etc.) PDL::GSL::CDF (various cumulative distribution functions) =head1 REFERENCES Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, TX: Harcourt Brace College Publishers. =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.84/META.json0000644000175000017500000000300714625061425014534 0ustar osboxesosboxes{ "abstract" : "a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people.", "author" : [ "Maggie J. 
Xiong " ], "dynamic_config" : 1, "generated_by" : "ExtUtils::MakeMaker version 7.44, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "PDL-Stats", "no_index" : { "directory" : [ "t", "inc" ] }, "prereqs" : { "build" : { "requires" : { "PDL::Core" : "2.008" } }, "configure" : { "requires" : { "PDL::Core" : "2.008" } }, "develop" : { "requires" : { "CPAN::Changes" : "0" } }, "runtime" : { "recommends" : { "PDL::Graphics::PGPLOT" : "0", "PDL::Slatec" : "0" }, "requires" : { "PDL" : "2.057" } }, "test" : { "requires" : { "PDL::Core" : "2.008", "Test::More" : "0.88" } } }, "release_status" : "stable", "resources" : { "repository" : { "type" : "git", "url" : "git://github.com/PDLPorters/PDL-Stats", "web" : "https://github.com/PDLPorters/PDL-Stats" } }, "version" : "0.84", "x_serialization_backend" : "JSON::PP version 4.04" } PDL-Stats-0.84/Kmeans/0000755000175000017500000000000014625061425014331 5ustar osboxesosboxesPDL-Stats-0.84/Kmeans/Makefile.PL0000644000175000017500000000035614126063750016306 0ustar osboxesosboxesuse PDL::Core::Dev; use ExtUtils::MakeMaker; PDL::Core::Dev->import(); $package = ["kmeans.pd",Kmeans,PDL::Stats::Kmeans,undef,1]; %hash = pdlpp_stdargs($package); WriteMakefile( %hash ); sub MY::postamble {pdlpp_postamble($package)}; PDL-Stats-0.84/Kmeans/kmeans.pd0000644000175000017500000005502114544645567016156 0ustar osboxesosboxespp_add_exported('', 'random_cluster', 'iv_cluster'); pp_addpm({At=>'Top'}, <<'EOD'); use Carp; use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::Basic; =head1 NAME PDL::Stats::Kmeans -- classic k-means cluster analysis =head1 DESCRIPTION Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. 
The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster". The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are non-threadable, respectively. =head1 SYNOPSIS Implement a basic k-means procedure, use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats; my ($data, $idv, $ido) = rtable( $file ); # or generate random data: $data = grandom(200, 2); # two vars as below my ($cluster, $centroid, $ss_centroid, $cluster_last); # start out with 8 random clusters $cluster = random_cluster( $data->dim(0), 8 ); # iterate to minimize total ss # stop when no more changes in cluster membership do { $cluster_last = $cluster; ($centroid, $ss_centroid) = $data->centroid( $cluster ); $cluster = $data->assign( $centroid ); } while sum(abs($cluster - $cluster_last)) > 0; or, use the B function provided here, my %k = $data->kmeans( \%opt ); print "$_\t$k{$_}\n" for (sort keys %k); plot the clusters if there are only 2 vars in $data, use PDL::Graphics::PGPLOT::Window; my ($win, $c); $win = pgwin 'xs'; $win->env($data( ,0)->minmax, $data( ,1)->minmax); $win->points( $data->dice_axis(0,which($k{cluster}->(,$_)))->dog, {COLOR=>++$c} ) for (0 .. $k{cluster}->dim(1)-1); =cut EOD pp_addhdr(' #include #include #include ' ); pp_addpm pp_line_numbers(__LINE__, <<'EOD' ); # my tmp var for PDL 2.007 slice upate my $_tmp; =head2 random_cluster =for sig Signature: (short [o]cluster(o,c); int obs=>o; int clu=>c) =for ref Creates masks for random mutually exclusive clusters. Accepts two parameters, num_obs and num_cluster. Extra parameter turns into extra dim in mask. May loop a long time if num_cluster approaches num_obs because empty cluster is not allowed. 
=for usage my $cluster = random_cluster( $num_obs, $num_cluster ); =cut # can't be called on pdl sub random_cluster { my ($obs, $clu) = @_; # extra param in @_ made into extra dim my $cluster = zeroes @_; do { $cluster->inplace->_random_cluster(); } while (PDL::any $cluster->sumover == 0 ); return $cluster; } EOD pp_def('_random_cluster', Pars => 'short a(o,c); short [o]b(o,c)', Inplace => 1, GenericTypes => [U], Code => ' if ($SIZE(c) > $SIZE(o)) barf("more cluster than obs!"); /* threading w time only srand produces identical clusters */ long r; srand( time( NULL ) + r++); PDL_Indx nc = $SIZE(c); loop (o) %{ PDL_Indx cl = rand() % nc; /* XXX-FIXME rand() is not 64bit friendly here! */ loop (c) %{ $b() = (c == cl)? 1 : 0; %} %} ', Doc => undef, ); pp_def('which_cluster', Pars => 'short a(o,c); indx [o]b(o)', GenericTypes => [U,L], HandleBad => 1, Code => ' PDL_Indx cl; loop(o) %{ cl=-1; loop(c) %{ if ($a()) { cl = c; break; } %} $b() = cl; %} ', BadCode => ' PDL_Indx cl; loop(o) %{ cl=-1; loop(c) %{ if ($ISBAD(a()) || !$a()) { } else { cl = c; break; } %} if (cl==-1) { $SETBAD(b()); } else { $b() = cl; } %} ', Doc => 'Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong. Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set. 
Usage: # create a cluster mask dim [obs x clu] perldl> p $c_mask = iv_cluster [qw(a a b b c c)] [ [1 1 0 0 0 0] [0 0 1 1 0 0] [0 0 0 0 1 1] ] # get cluster membership list dim [obs] perldl> p $ic = $c_mask->which_cluster [0 0 1 1 2 2] =cut ', ); pp_def('assign', Pars => 'data(o,v); centroid(c,v); short [o]cluster(o,c)', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(centroid) ssc, ssmin; PDL_Indx cl = 0; loop (o) %{ ssmin = -1; loop (c) %{ ssc = 0; loop (v) %{ ssc += pow($data() - $centroid(), 2); %} /* notice that if multiple ssc == ssmin the 1st is taken as cluster */ if (ssmin < 0 || ssmin > ssc) { cl = c; ssmin = ssc; } %} loop (c) %{ $cluster() = (c == cl)? 1 : 0; %} %} ', BadCode => ' $GENERIC(centroid) ssc, ssmin; PDL_Indx cl, nvc; cl = 0; loop (o) %{ ssmin = -1; loop (c) %{ ssc = 0; nvc = 0; loop (v) %{ if ($ISBAD( $data() ) || $ISBAD( $centroid() )) { } else { ssc += pow($data() - $centroid(), 2); nvc ++; } %} if (nvc) { ssc /= nvc; } else { /* taking advantage of the fact that 1st valid ssmin takes precedence */ /* so ssc has no effect if there is already ssmin. or it is -1 */ ssc = ssmin; } /* notice that if multiple ssc == ssmin the 1st is taken as cluster */ if (ssmin < 0 || ssmin > ssc) { cl = c; ssmin = ssc; } %} loop (c) %{ if (ssmin >= 0) { $cluster() = (c == cl)? 1 : 0; } else { $SETBAD($cluster()); } %} %} ', Doc => ' =for ref Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables. 
=for usage perldl> $centroid = ones 2, 3 perldl> $centroid(0,) .= 0 perldl> p $centroid [ [0 1] [0 1] [0 1] ] perldl> $b = qsort( random 4, 3 ) perldl> p $b [ [0.022774068 0.032513883 0.13890034 0.30942479] [ 0.16943853 0.50262636 0.56251531 0.7152271] [ 0.23964483 0.59932745 0.60967495 0.78452117] ] # notice that 1st 3 obs in $b are on average closer to 0 # and last obs closer to 1 perldl> p $b->assign( $centroid ) [ [1 1 1 0] # cluster 0 membership [0 0 0 1] # cluster 1 membership ] =cut ', ); pp_def('centroid', Pars => 'data(o,v); cluster(o,c); float+ [o]m(c,v); float+ [o]ss(c,v)', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(m) s[ $SIZE(c) ][ $SIZE(v) ], s2[ $SIZE(c) ][ $SIZE(v) ]; PDL_Indx n[ $SIZE(c) ]; loop (c) %{ loop (v) %{ s[c][v] = 0.0; s2[c][v] = 0.0; %} n[c] = 0; loop (o) %{ if ($cluster()) { n[c] ++; loop (v) %{ s[c][v] += $data(); s2[c][v] += pow($data(), 2); } %} %} if (n[c]) { loop (v) %{ $m() = s[c][v] / n[c]; $ss() = s2[c][v] - pow(s[c][v] / n[c], 2) * n[c]; %} } else { loop (v) %{ $m() = 0; $ss() = 0; %} /* barf("please make sure there is no empty cluster!"); */ } %} ', BadCode => ' $GENERIC(m) s[ $SIZE(c) ][ $SIZE(v) ], s2[ $SIZE(c) ][ $SIZE(v) ]; PDL_Indx n[ $SIZE(c) ][ $SIZE(v) ]; loop (c) %{ loop (v) %{ s[c][v] = 0.0; s2[c][v] = 0.0; n[c][v] = 0; %} loop (o) %{ if ($ISBAD($cluster()) || !$cluster()) { } else { loop (v) %{ if ($ISGOOD( $data() )) { s[c][v] += $data(); s2[c][v] += pow($data(), 2); n[c][v] ++; } %} } %} loop (v) %{ if (n[c][v]) { $m() = s[c][v] / n[c][v]; $ss() = s2[c][v] / n[c][v] - pow(s[c][v] / n[c][v], 2); } else { $m() = 0; $ss() = 0; } %} %} ', Doc => ' =for ref Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster. 
=for usage # data is 10 obs x 3 var perldl> p $d = sequence 10, 3 [ [ 0 1 2 3 4 5 6 7 8 9] [10 11 12 13 14 15 16 17 18 19] [20 21 22 23 24 25 26 27 28 29] ] # create two clusters by value on 1st var perldl> p $a = $d( ,(0)) <= 5 [1 1 1 1 1 1 0 0 0 0] perldl> p $b = $d( ,(0)) > 5 [0 0 0 0 0 0 1 1 1 1] perldl> p $c = cat $a, $b [ [1 1 1 1 1 1 0 0 0 0] [0 0 0 0 0 0 1 1 1 1] ] perldl> p $d->centroid($c) # mean for 2 cluster x 3 var [ [ 2.5 7.5] [12.5 17.5] [22.5 27.5] ] # ss for 2 cluster x 3 var [ [17.5 5] [17.5 5] [17.5 5] ] =cut ', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); sub _scree_ind { # use as scree cutoff the point with max distance to the line formed # by the 1st and last points in $self # it's a heuristic--whether we can get "good" results depends on # the number of components in $self. my ($self) = @_; $self = $self->squeeze; $self->ndims > 1 and croak "1D pdl only please"; my $a = zeroes 2, $self->nelem; ($_tmp = $a->slice('(0)')) .= sequence $self->nelem; ($_tmp = $a->slice('(1)')) .= $self; my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') ); return $d->maximum_ind; } sub _d_point2line { my ($self, $p1, $p2) = @_; for ($self, $p1, $p2) { $_->dim(0) != 2 and carp "point pdl dim(0) != 2"; } return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog ); } EOD pp_def('_d_p2l', Pars => 'xc(); yc(); xa(); ya(); xb(); yb(); float+ [o]d()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(d) xba, yba; xba = $xb() - $xa(); yba = $yb() - $ya(); $d() = fabs( xba * ($ya() - $yc()) - ($xa() - $xc()) * yba ) / sqrt( pow(xba,2) + pow(yba,2) ); ', BadCode => ' if ($ISBAD(xc()) || $ISBAD(yc()) || $ISBAD(xa()) || $ISBAD(ya()) || $ISBAD(xb()) || $ISBAD(yb()) ) { $SETBAD(d()); } else { $GENERIC(d) xba, yba; xba = $xb() - $xa(); yba = $yb() - $ya(); $d() = fabs( xba * ($ya() - $yc()) - ($xa() - $xc()) * yba ) / sqrt( pow(xba,2) + pow(yba,2) ); } ', Doc => undef, ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 kmeans 
=for ref Implements classic k-means cluster analysis. Given a number of observations with values on a set of variables, kmeans puts the observations into clusters that maximizes within-cluster similarity with respect to the variables. Tries several different random seeding and clustering in parallel. Stops when cluster assignment of the observations no longer changes. Returns the best result in terms of R2 from the random-seeding trials. Instead of random seeding, kmeans also accepts manual seeding. This is done by providing a centroid to the function, in which case clustering will proceed from the centroid and there is no multiple tries. There are two distinct advantages from seeding with a centroid compared to seeding with predefined cluster membership of a subset of the observations ie "seeds", (1) a centroid could come from a previous study with a different set of observations; (2) a centroid could even be "fictional", or in more proper parlance, an idealized prototype with respect to the actual data. For example, if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings pdl of dim [10 obs x 4 var], providing a centroid like [ [5 0 0 0] [0 5 0 0] [0 0 5 0] [0 0 0 5] ] will produce 4 clusters of people with each cluster favoring a different one of the 4 movies. Clusters from an idealized centroid may not give the best result in terms of R2, but they sure are a lot more interpretable. If clustering has to be done from predefined clusters of seeds, simply calculate the centroid using the B function and feed it to kmeans, my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster ); my %k = $rating->kmeans( { CNTRD=>$centroid } ); kmeans supports bad value*. =for options Default options (case insensitive): V => 1, # prints simple status FULL => 0, # returns results for all seeding trials CNTRD => PDL->null, # optional. pdl [clu x var]. 
disables next 3 opts NTRY => 5, # num of random seeding trials NSEED => 1000, # num of initial seeds, use NSEED up to max obs NCLUS => 3, # num of clusters =for usage Usage: # suppose we have 4 person's ratings on 5 movies perldl> p $rating = ceil( random(4, 5) * 5 ) [ [3 2 2 3] [2 4 5 4] [5 3 2 3] [3 3 1 5] [4 3 3 2] ] # we want to put the 4 persons into 2 groups perldl> %k = $rating->kmeans( {NCLUS=>2} ) # by default prints back options used # as well as info for all tries and iterations CNTRD => Null FULL => 0 NCLUS => 2 NSEED => 4 NTRY => 5 V => 1 ss total: 20.5 iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748] iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748] perldl> p "$_\t$k{$_}\n" for (sort keys %k) R2 0.479674796747968 centroid # mean ratings for 2 group x 5 movies [ [ 3 2.3333333] [ 2 4.3333333] [ 5 2.6666667] [ 3 3] [ 4 2.6666667] ] cluster # 4 persons' membership in two groups [ [1 0 0 0] [0 1 1 1] ] n [1 3] # cluster size ss [ [ 0 0.66666667] [ 0 0.66666667] [ 0 0.66666667] [ 0 8] [ 0 0.66666667] ] Now, for the valiant, kmeans is threadable. Say you gathered 10 persons' ratings on 5 movies from 2 countries, so the data is dim [10,5,2], and you want to put the 10 persons from each country into 3 clusters, just specify NCLUS => [3,1], and there you have it. The key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1] turns into a dummy dim, so the 3-cluster operation is repeated on both countries. Similarly, when seeding, CNTRD needs to have ndims that at least match the data ndims. Extra dims in CNTRD will lead to threading (convenient if you want to try out different centroid locations, for example, but you will have to hand pick the best result). See stats_kmeans.t for examples w 3D and 4D data. *With bad value, R2 is based on average of variances instead of sum squared error. 
=cut *kmeans = \&PDL::kmeans; sub PDL::kmeans { my ($self, $opt) = @_; my %opt = ( V => 1, # prints simple status FULL => 0, # returns results for all seeding trials CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts NTRY => 5, # num of random seeding trials NSEED => 1000, # num of initial seeds, use NSEED up to max obs NCLUS => 3, # num of clusters ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) { $opt{NTRY} = 1; $opt{NSEED} = $self->dim(0); $opt{NCLUS} = $opt{CNTRD}->dim(0); } else { $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr; } $opt{V} and print STDERR "$_\t=> $opt{$_}\n" for (sort keys %opt); my $ss_ms = $self->badflag? 'ms' : 'ss'; my $ss_total = $self->badflag? $self->var->average : $self->ss->sumover; $opt{V} and print STDERR "overall $ss_ms:\t$ss_total\n"; my ($centroid, $ss_cv, $R2, $clus_this, $clus_last); # NTRY made into extra dim in $cluster for threading my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS}); $clus_this = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ? $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} ) ; ($centroid, $ss_cv) = $self(0:$opt{NSEED} - 1, )->centroid( $clus_this ); # now obs in $clus_this matches $self $clus_this = $self->assign( $centroid ); ($centroid, $ss_cv) = $self->centroid( $clus_this ); my $iter = 0; do { $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total : 1 - $ss_cv->sumover->sumover / $ss_total ; $opt{V} and print STDERR join(' ',('iter', $iter++, 'R2', $R2)) . 
"\n"; $clus_last = $clus_this; $clus_this = $self->assign( $centroid ); ($centroid, $ss_cv) = $self->centroid( $clus_this ); } while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 ); $opt{FULL} and return ( centroid => PDL::squeeze( $centroid ), cluster => PDL::squeeze( $clus_this ), n => PDL::squeeze( $clus_this )->sumover, R2 => PDL::squeeze( $R2 ), $ss_ms => PDL::squeeze( $ss_cv ), ); # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose my $i_best = $R2->mv(-1,0)->maximum_ind; $R2->getndims == 1 and return ( centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze, cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze, n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover, R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze, $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze, ); # now for threading beyond 2D data # can't believe i'm using a perl loop :P $i_best = $i_best->flat->sever; my @i_best = map { $opt{NTRY} * $_ + $i_best(($_)) } 0 .. $i_best->nelem - 1; my @shapes; for ($centroid, $clus_this, $R2) { my @dims = $_->dims; pop @dims; push @shapes, \@dims; } $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever, return ( centroid => $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever, cluster => $clus_this, n => $clus_this->sumover, R2 => $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever, $ss_ms => $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever, ); } =head1 METHODS =head2 iv_cluster =for ref Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context. This is the method used for mean and var in anova. 
The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)]. =for usage Usage: perldl> @bake = qw( y y y n n n ) # accepts @ ref or 1d pdl perldl> p $bake = iv_cluster( \@bake ) [ [1 1 1 0 0 0] [0 0 0 1 1 1] ] perldl> p $rating = sequence 6 [0 1 2 3 4 5] perldl> p $rating->centroid( $bake ) # mean for each iv level [ [1 4] ] # ss [ [2 2] ] =cut *iv_cluster = \&PDL::iv_cluster; sub PDL::iv_cluster { my ($var_ref) = @_; my ($var, $map_ref) = PDL::Stats::Basic::_array_to_pdl( $var_ref ); my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var; $var_a = $var_a->setbadif( $var->isbad ) if $var->badflag; return wantarray? ($var_a, $map_ref) : $var_a; } =head2 pca_cluster Assign variables to components ie clusters based on pca loadings or scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004 for other ways of using pca with kmeans). Variables are assigned to their most associated component. Note that some components may not have any variable that is most associated with them, so the returned number of clusters may be smaller than NCOMP. Default options (case insensitive): V => 1, ABS => 1, # high pos and neg loadings on a comp in same cluster NCOMP => undef, # max number of components to consider. determined by # scree plot black magic if not specified PLOT => 0, # pca scree plot with cutoff at NCOMP Usage: # say we need to cluster a group of documents # $data is pdl dim [word x doc] ($data, $idd, $idw) = get_data 'doc_word_info.txt'; perldl> %p = $data->pca; # $cluster is pdl mask dim [doc x ncomp] perldl> $cluster = $p{loading}->pca_cluster; # pca clusters var while kmeans clusters obs. 
hence transpose perldl> ($m, $ss) = $data->transpose->centroid( $cluster ); perldl> %k = $data->transpose->kmeans( { cntrd=>$m } ); # take a look at cluster 0 doc ids perldl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]); =cut *pca_cluster = \&PDL::pca_cluster; sub PDL::pca_cluster { my ($self, $opt) = @_; my %opt = ( V => 1, ABS => 1, # high pos and neg loadings on a comp in same cluster NCOMP => undef, # max number of components to consider. determined by # scree plot black magic if not specified PLOT => 0, # pca scree plot with cutoff at NCOMP ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $var = sumover($self ** 2) / $self->dim(0); if (!$opt{NCOMP}) { # here's the black magic part my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1) : pdl($self->dim(1), 30)->min ; $var = $var(0:$comps-1)->sever; $opt{NCOMP} = _scree_ind( $var ); } $opt{PLOT} and do { require PDL::Stats::GLM; $var->plot_scree( {NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}} ); }; my $c = $self->( ,0:$opt{NCOMP}-1)->transpose->abs->maximum_ind; if ($opt{ABS}) { $c = $c->iv_cluster; } else { my @c = map { ($self->($_,$c($_)) >= 0)? $c($_)*2 : $c($_)*2 + 1 } ( 0 .. $c->dim(0)-1 ); $c = iv_cluster( \@c ); } $opt{V} and print STDERR "cluster membership mask as " . $c->info . "\n"; return $c; } =head1 REFERENCES Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29. Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786. Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press. Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. 
You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.84/t/0000755000175000017500000000000014625061425013356 5ustar osboxesosboxesPDL-Stats-0.84/t/stats_kmeans.t0000644000175000017500000001637514544645567016272 0ustar osboxesosboxesuse strict; use warnings; use Test::More; BEGIN { use_ok( 'PDL::Stats::Basic' ); use_ok( 'PDL::Stats::Kmeans' ); } use PDL::LiteF; use PDL::NiceSlice; sub tapprox { my($a,$b, $eps) = @_; $eps ||= 1e-6; my $diff = abs($a-$b); # use max to make it perl scalar ref $diff eq 'PDL' and $diff = $diff->max; return $diff < $eps; } is(tapprox( t_iv_cluster(), 0 ), 1, "independent variable cluster"); sub t_iv_cluster { my @a = qw( a a b b ); my $a = iv_cluster( \@a ); return abs($a - pdl(byte, [1,1,0,0], [0,0,1,1]))->sum; } is(tapprox( t_iv_cluster_bad(), 0 ), 1, "independent variable cluster with bad data"); sub t_iv_cluster_bad { my @a = qw( a a BAD b b ); my $a = iv_cluster( \@a ); is(sum(abs(which($a->isbad) - pdl(2,7))), 0, 'iv_cluster has bad value'); return abs($a - pdl(byte, [1,1,-9,0,0], [0,0,-9,1,1]))->sum; } is(tapprox( t_assign(), 0 ), 1, "assign"); sub t_assign { my $centroid = pdl( [0,1], [0,1], [0,1] ); my $a = sequence 4, 3; $a %= 2; my $c = $a->assign($centroid); my $cluster = pdl(byte, [1,0,1,0], [0,1,0,1]); return abs($c - $cluster)->sum; } is(tapprox( t_centroid(), 0 ), 1, "centroid"); sub t_centroid { my $a = sequence 4, 3; my $cluster = pdl(byte, [1,0,1,0], [0,1,0,1]); my ($m, $ss) = $a->centroid($cluster); my $m_a = pdl([1,2], [5,6], [9,10]); my $ss_a = ones(2,3) * 2; return sum( $m - $m_a + ( $ss - $ss_a ) ); } is(tapprox( t_assign_bad(), 0 ), 1, "assign with bad data"); sub t_assign_bad { my $centroid = pdl( [0,1], [0,1], [0,1] ); my $a = sequence 5, 3; $a->setbadat(4,0); $a->setbadat(4,2); $a %= 2; my $c = $a->assign($centroid); my $cluster = pdl(byte, [1,0,1,0,0], [0,1,0,1,1]); return ($c - $cluster)->sum; } is(tapprox( 
t_centroid_bad(), 0 ), 1, "centroid with bad data"); sub t_centroid_bad { my $a = sequence 5, 3; $a->setbadat(4,0); $a->setbadat(4,2); my $cluster = pdl(byte, [1,0,1,0,0], [0,1,0,1,1]); my ($m, $ss) = $a->centroid($cluster); my $m_a = pdl([1,2], [6,7.6666667], [11,12]); my $ss_a = pdl([1,1], [1,1.5555556], [1,1]); return sum( $m - $m_a + ( $ss - $ss_a ) ); } # kmeans is undeterministic. retry to for optimal results ok(t_kmeans_with_retry(), 't_kmeans'); sub t_kmeans_with_retry { for my $retry (1..3) { return 1 if (tapprox(t_kmeans(), 0)) } } sub t_kmeans { my $data = sequence 7, 3; my $ind = $data(1, )->flat; # only works because $data is sequence $data = lvalue_assign_detour($data, $ind, 0); my %m = $data->kmeans({NCLUS=>2, NSEED=>6, NTRY=>10, V=>0}); return sum( $m{centroid}->sumover - pdl qw(3.3333333 10.333333 17.333333) ); } t_kmeans_4d(); sub t_kmeans_4d { my $data = sequence 7, 3, 2, 2; # construct ind from sequence, then call lvalue_assign_detour my $ind = sequence($data->dims)->(1, )->flat; $data = lvalue_assign_detour($data, $ind, 0); $ind = sequence($data->dims)->(0,1,0, )->flat; $data = lvalue_assign_detour($data, $ind, 0); $data = lvalue_assign_detour($data, which($data == 42), 0); my %m = $data->kmeans( {nclus=>[2,1,1], ntry=>20, v=>0} ); # print "$_\t$m{$_}\n" for (sort keys %m); my %a = ( 'R2' => pdl ( [ qw(0.74223245 0.97386667) ], [ qw(0.84172845 0.99499377) ], ), 'ss_sum' => pdl ( [ [ qw( 10 10 108 )], [ qw( 23.333333 23.333333 23.333333 )], ], [ [ qw( 10 10 1578 )], [ qw( 23.333333 23.333333 23.333333 )], ] ), ); is(tapprox( sum( $m{R2} - $a{R2} ), 0 ), 1, "kmeans R2 result as expected"); is(tapprox( sum( $m{ss}->sumover - $a{ss_sum} ), 0, 1e-3 ), 1, "kmeans ss result as expected"); } t_kmeans_4d_seed(); sub t_kmeans_4d_seed { my $data = sequence 7, 3, 2, 2; # construct ind from sequence, then call lvalue_assign_detour my $ind = sequence($data->dims)->(1, )->flat; $data = lvalue_assign_detour($data, $ind, 0); $ind = 
sequence($data->dims)->(0,1,0, ); $data = lvalue_assign_detour($data, $ind, 0); $data = lvalue_assign_detour($data, which($data == 42), 0); # centroid intentially has one less dim than data my $centroid = pdl( [ [qw( 10 0 )], [qw( 10 0 )], [qw( 10 0 )], ], [ [qw( 20 0 )], [qw( 30 0 )], [qw( 30 0 )], ], ); # use dummy to match centroid dims to data dims my %m = $data->kmeans( {cntrd=>$centroid->dummy(-1), v=>0} ); # print "$_\t$m{$_}\n" for (sort keys %m); my %a = ( 'R2' => pdl ( [ qw(0.74223245 0.97386667) ], [ qw(0.84172845 0.99499377) ], ), 'ss_sum' => pdl ( [ [ qw( 10 10 108 )], [ qw( 23.333333 23.333333 23.333333 )], ], [ [ qw( 10 10 1578 )], [ qw( 23.333333 23.333333 23.333333 )], ] ), ); is(tapprox( sum( $m{R2} - $a{R2} ), 0 ), 1, "kmeans R2 with manually seeded centroid"); is(tapprox( sum( $m{ss}->sumover - $a{ss_sum} ), 0, 1e-3 ), 1, "kmeans ss with manually seeded centroid"); } TODO: { local $TODO = 'kmeans is undeterministic. retry to for optimal results'; is(t_kmeans_bad_with_retry(), 1, 't_kmeans_bad'); } sub t_kmeans_bad_with_retry { for my $retry (1..3) { return 1 if (tapprox(t_kmeans_bad(), 0)) } } sub t_kmeans_bad { my $data = sequence 7, 3; $data = $data->setbadat(4,0); my %m = $data->kmeans({NCLUS=>2, NTRY=>10, V=>0}); print "$_\t$m{$_}\n" for (sort keys %m); return sum( $m{ms}->sumover - pdl qw( 1.5 1.9166667 1.9166667 ) ); } t_kmeans_3d_bad(); sub t_kmeans_3d_bad { my $data = sequence 7, 3, 2; my $ind = sequence($data->dims)->(0:1, ,0)->flat; $data = lvalue_assign_detour($data, $ind, 0); $ind = sequence($data->dims)->(4:6, ,1)->flat; $data = lvalue_assign_detour($data, $ind, 1); $data->setbadat(3,0,0); my %m = $data->kmeans( {nclus=>[2,1], ntry=>20, v=>0} ); # print "$_\t$m{$_}\n" for (sort keys %m); my %a = ( 'R2' => pdl( [ qw( 0.96879592 0.99698988 ) ] ), 'ms' => pdl( [ [2.1875, 0], [ 2, 0], [ 2, 0], ], [ [ 0,1.25], [ 0,1.25], [ 0,1.25], ] ), ); is(tapprox( sum( $m{R2} - $a{R2} ), 0 ), 1, "3d kmeans with bad data R2 is as expected"); 
is(tapprox( sum( $m{ms} - $a{ms} ), 0, 1e-3 ), 1, "3d kmeans with bad data ss is as expected"); } is(tapprox( t_pca_cluster(), 0 ), 1, "principal component analysis clustering"); sub t_pca_cluster { my $l = pdl( [qw( -0.798603 -0.61624 -0.906765 0.103116)], [qw( 0.283269 -0.41041 0.131113 0.894118)], [qw( -0.419717 0.649522 -0.0223668 0.434389)], [qw( 0.325314 0.173015 -0.400108 0.0350236)], ); my $c = $l->pca_cluster({v=>0,ncomp=>4,plot=>0}); return ( $c - pdl(byte, [1,0,1,0], [0,1,0,0], [0,0,0,1]) )->sum; } { my $a = pdl( [[3,1], [2,4]] ); my $b = pdl( [2,4], [3,1] ); my $c = pdl( 5,15 ); my $d = PDL::Stats::Kmeans::_d_point2line( $a, $b, $c ); is( tapprox(sum($d - pdl(1.754116, 1.4142136)), 0), 1, '_d_point2line'); } { my $c0 = pdl(byte, [1,0,1,0], [0,1,0,1]); my $c1 = pdl(byte, [0,0,0,1], [0,1,1,0]); my $c = cat $c0, $c1; my $ans = pdl( [0,1,0,1], [-1,1,1,0] ); is( abs($c->which_cluster - $ans)->sum, 0, 'which_cluster'); } done_testing(); sub lvalue_assign_detour { my ($pdl, $index, $new_value) = @_; my @arr = list $pdl; my @ind = ref($index)? 
list($index) : $index; $arr[$_] = $new_value for (@ind); return pdl(\@arr)->reshape($pdl->dims)->sever; } PDL-Stats-0.84/t/stats_ts.t0000644000175000017500000001023114544645567015423 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::TS; sub tapprox { my($a,$b, $eps) = @_; $eps ||= 1e-6; my $diff = abs($a-$b); # use max to make it perl scalar ref $diff eq 'PDL' and $diff = $diff->max; return $diff < $eps; } { my $a = sequence 10; is(tapprox( sum($a->acvf(4) - pdl qw(82.5 57.75 34 12.25 -6.5) ), 0 ), 1, "autocovariance on $a"); is(tapprox( sum($a->acf(4) - pdl qw(1 0.7 0.41212121 0.14848485 -0.078787879) ), 0 ), 1, "autocorrelation on $a"); is(tapprox( sum($a->filter_ma(2) - pdl qw( 0.6 1.2 2 3 4 5 6 7 7.8 8.4 ) ), 0 ), 1, "filter moving average on $a"); is(tapprox( sum($a->filter_exp(.8) - pdl qw( 0 0.8 1.76 2.752 3.7504 4.75008 5.750016 6.7500032 7.7500006 8.7500001 ) ), 0 ), 1, "filter with exponential smoothing on $a"); is(tapprox( $a->acf(5)->portmanteau($a->nelem), 11.1753902662994 ), 1, "portmanteau significance test on $a"); my $b = sequence(10) + 1; $b = lvalue_assign_detour( $b, 7, 9 ); is( tapprox( $b->mape($a), 0.302619047619048 ), 1, "mean absolute percent error between $a and $b"); is( tapprox( $b->mae($a), 1.1 ), 1, "mean absolute error between $a and $b"); $b = $b->setbadat(3); is( tapprox( $b->mape($a), 0.308465608465608 ), 1, "mean absolute percent error with bad data between $a and $b"); is( tapprox( $b->mae($a), 1.11111111111111 ), 1, "mean absolute error with bad data between $a and $b"); } { my $a = sequence(5)->dummy(1,2)->flat->sever; is(tapprox( sum($a->dseason(5) - pdl qw( 0.6 1.2 2 2 2 2 2 2 2.8 3.4 )), 0 ), 1, "deseasonalize data on $a with period 5"); is(tapprox( sum($a->dseason(4) - pdl qw( 0.5 1.125 2 2.375 2.125 1.875 1.625 2 2.875 3.5 )), 0 ), 1, "deseasonalize data on $a with period 4"); $a = $a->setbadat(4); is(tapprox( sum($a->dseason(5) - pdl qw( 0.6 1.2 1.5 1.5 1.5 
1.5 1.5 2 2.8 3.4 )), 0 ), 1, "deseasonalize data with bad data on $a with period 5"); is(tapprox( sum($a->dseason(4) - pdl qw( 0.5 1.125 2 1.8333333 1.5 1.1666667 1.5 2 2.875 3.5 )), 0 ), 1, "deseasonalized data with bad data on $a with period 4"); } { my $a = sequence 4, 2; $a = $a->setbadat(2,0); $a = $a->setbadat(2,1); my $a_ans = pdl( [qw( 0 1 1.75 3)], [qw( 4 5 5.75 7 )], ); is( tapprox( sum($a->fill_ma(2) - $a_ans ), 0 ), 1, "fill missing data with moving average"); } { my $a = sequence 5; is( tapprox( sum( $a->diff - pdl(0, 1, 1, 1, 1) ), 0 ), 1, "difference data on $a - DX(t) = X(t) - X(t-1)"); is( tapprox( sum( $a->diff->inte - $a ), 0 ), 1, "add data on $a - IX(t) = X(t) + X(t-1)"); } { my $x = sequence 2; my $b = pdl(.8, -.2, .3); my $xp = $x->pred_ar($b, 7); is( tapprox(sum($xp - pdl(qw[0 1 1.1 0.74 0.492 0.3656 0.31408])),0), 1, "predict autoregressive series"); my $xp2 = $x->pred_ar($b(0:1), 7, {const=>0}); $xp2($b->dim(0)-1 : -1) += .3; is( tapprox(sum($xp - $xp2),0), 1, "predict autoregressive series with no constant last value"); } { my $a = sequence 10; my $b = pdl( qw(0 1 1 1 3 6 7 7 9 10) ); is( tapprox($a->wmape($b) - 0.177777777777778, 0), 1, "weighted mean absolute percent error between $a and $b"); $a = $a->setbadat(4); is( tapprox($a->wmape($b) - 0.170731707317073, 0), 1, "weighted mean absolute percent error with bad data between $a and $b"); } { my $a = sequence(5)->dummy(1,3)->flat->sever; $a = lvalue_assign_detour( $a, 1, 3); $a = $a->dummy(1,2)->sever; my $ind = sequence($a->dims)->(4,1)->flat; $a = lvalue_assign_detour($a, $ind, 0); my $ans_m = pdl( [ 4, 0, 1.6666667, 2, 3], [ 2.6666667, 0, 1.6666667, 2, 3], ); my $ans_ms = pdl( [ 0, 0,0.88888889, 0, 0], [ 3.5555556, 0,0.88888889, 0, 0], ); my ($m, $ms) = $a->season_m( 5, {start_position=>1, plot=>0} ); is( tapprox(sum(abs($m - $ans_m)), 0), 1, 'season_m m' ); is( tapprox(sum(abs($ms - $ans_ms)), 0), 1, 'season_m ms' ); } done_testing; sub lvalue_assign_detour { my ($pdl, $index, 
$new_value) = @_; my @arr = list $pdl; my @ind = ref($index)? list($index) : $index; $arr[$_] = $new_value for (@ind); return pdl(\@arr)->reshape($pdl->dims)->sever; } PDL-Stats-0.84/t/stats_ols_rptd.t0000644000175000017500000000500514544645567016626 0ustar osboxesosboxesuse strict; use warnings; use PDL::LiteF; use PDL::Stats; use PDL::NiceSlice; use Test::More; sub tapprox { my($a,$b, $eps) = @_; $eps ||= 1e-6; my $diff = abs($a-$b); # use max to make it perl scalar ref $diff eq 'PDL' and $diff = $diff->max; return $diff < $eps; } # This is the example from Lorch and Myers (1990), # a study on how characteristics of sentences affected reading time # Three within-subject IVs: # SP -- serial position of sentence # WORDS -- number of words in sentence # NEW -- number of new arguments in sentence my ($data, $idv, $ido) = rtable \*DATA, {V=>0}; my %r = $data( ,4)->ols_rptd( $data( ,3), $data( ,(0)), $data( ,1), $data( ,2) ); print "\n"; print "$_\t$r{$_}\n" for (sort keys %r); is( tapprox( $r{'(ss_total)'}, 405.188241771429 ) , 1, 'ss_total' ); is( tapprox( $r{'(ss_residual)'}, 58.3754646504336 ) , 1, 'ss_residual' ); is( tapprox( $r{'(ss_subject)'}, 51.8590337714289 ) , 1, 'ss_subject' ); is( tapprox( sumover($r{ss} - pdl(18.450705, 73.813294, 0.57026483)), 0 ) , 1, 'ss' ); is( tapprox( sumover($r{ss_err} - pdl(23.036272, 10.827623, 5.0104731)), 0 ) , 1, 'ss_err' ); is( tapprox( sumover($r{coeff} - pdl(0.33337285, 0.45858933, 0.15162986)), 0 ) , 1, 'coeff' ); is( tapprox( sumover($r{F} - pdl(7.208473, 61.354153, 1.0243311)), 0 ) , 1, 'F' ); done_testing(); # Lorch and Myers (1990) data __DATA__ Snt Sp Wrds New subj DV 1 1 13 1 1 3.429 2 2 16 3 1 6.482 3 3 9 2 1 1.714 4 4 9 2 1 3.679 5 5 10 3 1 4.000 6 6 18 4 1 6.973 7 7 6 1 1 2.634 1 1 13 1 2 2.795 2 2 16 3 2 5.411 3 3 9 2 2 2.339 4 4 9 2 2 3.714 5 5 10 3 2 2.902 6 6 18 4 2 8.018 7 7 6 1 2 1.750 1 1 13 1 3 4.161 2 2 16 3 3 4.491 3 3 9 2 3 3.018 4 4 9 2 3 2.866 5 5 10 3 3 2.991 6 6 18 4 3 6.625 7 7 6 1 3 2.268 1 1 13 
1 4 3.071 2 2 16 3 4 5.063 3 3 9 2 4 2.464 4 4 9 2 4 2.732 5 5 10 3 4 2.670 6 6 18 4 4 7.571 7 7 6 1 4 2.884 1 1 13 1 5 3.625 2 2 16 3 5 9.295 3 3 9 2 5 6.045 4 4 9 2 5 4.205 5 5 10 3 5 3.884 6 6 18 4 5 8.795 7 7 6 1 5 3.491 1 1 13 1 6 3.161 2 2 16 3 6 5.643 3 3 9 2 6 2.455 4 4 9 2 6 6.241 5 5 10 3 6 3.223 6 6 18 4 6 13.188 7 7 6 1 6 3.688 1 1 13 1 7 3.232 2 2 16 3 7 8.357 3 3 9 2 7 4.920 4 4 9 2 7 3.723 5 5 10 3 7 3.143 6 6 18 4 7 11.170 7 7 6 1 7 2.054 1 1 13 1 8 7.161 2 2 16 3 8 4.313 3 3 9 2 8 3.366 4 4 9 2 8 6.330 5 5 10 3 8 6.143 6 6 18 4 8 6.071 7 7 6 1 8 1.696 1 1 13 1 9 1.536 2 2 16 3 9 2.946 3 3 9 2 9 1.375 4 4 9 2 9 1.152 5 5 10 3 9 2.759 6 6 18 4 9 7.964 7 7 6 1 9 1.455 1 1 13 1 10 4.063 2 2 16 3 10 6.652 3 3 9 2 10 2.179 4 4 9 2 10 3.661 5 5 10 3 10 3.330 6 6 18 4 10 7.866 7 7 6 1 10 3.705 PDL-Stats-0.84/t/stats_glm.t0000644000175000017500000005535114603563626015557 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::Stats::Basic; use PDL::Stats::GLM; use PDL::LiteF; use PDL::NiceSlice; sub tapprox { my($a,$b, $eps) = @_; $eps ||= 1e-6; my $diff = abs($a-$b); # use max to make it perl scalar ref $diff eq 'PDL' and $diff = $diff->max; return $diff < $eps; } my $a = sequence 5; my $b = pdl(0, 0, 0, 1, 1); is( t_fill_m(), 1, "fill_m replaces bad values with sample mean"); sub t_fill_m { my $aa = sequence 5; $aa = $aa->setvaltobad(0); tapprox( $aa->fill_m->sum, 12.5 ); } is( t_fill_rand(), 1, "fill_rand replaces bad values with random sample of good values from same variable"); sub t_fill_rand { my $aa = sequence 5; $aa = $aa->setvaltobad(0); my $stdv = $aa->fill_rand->stdv; tapprox( $stdv, 1.01980390271856 ) || tapprox( $stdv, 1.16619037896906 ); } ok tapprox( $a->dev_m->avg, 0 ), "dev_m replaces values with deviations from the mean on $a"; ok tapprox( $a->stddz->avg, 0 ), "stddz standardizes data on $a"; ok tapprox( $a->sse($b), 18), "sse gives sum of squared errors between actual and predicted values between $a and $b"; ok tapprox( 
$a->mse($b), 3.6), "mse gives mean of squared errors between actual and predicted values between $a and $b"; ok tapprox( $a->rmse($b), 1.89736659610103 ), "rmse gives root mean squared error, ie. stdv around predicted value between $a and $b"; ok tapprox( $b->glue(1,ones(5))->pred_logistic(pdl(1,2))->sum, 4.54753948757851 ), "pred_logistic calculates predicted probability value for logistic regression"; my $y = pdl(0, 1, 0, 1, 0); ok tapprox( $y->d0(), 6.73011667009256 ), 'd0'; ok tapprox( $y->dm( ones(5) * .5 ), 6.93147180559945 ), 'dm'; ok tapprox( sum($y->dvrs(ones(5) * .5) ** 2), 6.93147180559945 ), 'dvrs'; { my $a = pdl(ushort, [0,0,1,0,1], [0,0,0,1,1] ); my $b = cat sequence(5), sequence(5)**2; $b = cat $b, $b * 2; my %m = $a->ols_t($b->dummy(2)); my $rsq = pdl( [ [ 0.33333333, 0.80952381 ], [ 0.33333333, 0.80952381 ], ], ); my $coeff = pdl( [ [qw( 0.2 -3.3306691e-16 -1.110223e-16)], [qw( 0.014285714 0.071428571 -0.057142857)], ], [ [qw( 0.1 -1.6653345e-16 -1.110223e-16)], [qw( 0.0071428571 0.035714286 -0.057142857)], ], ); ok tapprox( sum( abs($m{R2} - $rsq) ), 0 ), 'ols_t R2'; ok tapprox( sum( abs($m{b} - $coeff) ), 0 ), 'ols_t b'; my %m0 = $a->ols_t(sequence(5), {CONST=>0}); my $b0 = pdl ([ 0.2 ], [ 0.23333333 ]); ok tapprox( sum( abs($m0{b} - $b0) ), 0 ), 'ols_t, const=>0'; } ok tapprox( t_ols(), 0 ), 'ols'; sub t_ols { my $a = sequence 5; my $b = pdl(0,0,0,1,1); my %m = $a->ols($b, {plot=>0}); my %a = ( F => 9, F_df => pdl(1,3), R2 => .75, b => pdl(2.5, 1), b_se => pdl(0.83333333, 0.52704628), b_t => pdl(3, 1.8973666), ss_total => 10, ss_model => 7.5, ); test_stats_cmp(\%m, \%a); } ok tapprox( t_ols_bad(), 0 ), 'ols with bad value'; sub t_ols_bad { my $a = sequence 6; my $b = pdl(0,0,0,1,1,1); $a->setbadat(5); my %m = $a->ols($b, {plot=>0}); is( $b->sumover, 3, "ols with bad value didn't change caller value" ); ok $a->check_badflag, "ols with bad value didn't remove caller bad flag"; my %a = ( F => 9, F_df => pdl(1,3), R2 => .75, b => pdl(2.5, 1), b_se 
=> pdl(0.83333333, 0.52704628), b_t => pdl(3, 1.8973666), ss_total => 10, ss_model => 7.5, ); test_stats_cmp(\%m, \%a); } ok tapprox( t_r2_change(), 0 ), 'r2_change'; sub t_r2_change { my $a = sequence 5, 2; my $b = pdl(0,0,0,1,1); my $c = pdl(0,0,2,2,2); my %m = $a->r2_change( $b, cat $b, $c ); my %a = ( F_change => pdl(3, 3), F_df => pdl(1, 2), R2_change => pdl(.15, .15), ); test_stats_cmp(\%m, \%a); } { # pca my $a = pdl ( [qw(1 3 6 6 8)], [qw(1 4 6 8 9)], [qw(0 2 2 4 9)], ); my %p = $a->pca({CORR=>1, PLOT=>0}); my %a = ( eigenvalue => pdl( qw( 2.786684 0.18473727 0.028578689) ), # loadings in R eigenvector => [pdl( # v1 v2 v3 [qw( 0.58518141 0.58668657 0.55978709)], # comp1 [qw( -0.41537629 -0.37601061 0.82829859)], # comp2 [qw( -0.69643754 0.71722722 -0.023661276)], # comp3 ), \&PDL::abs], loadings => [pdl( [qw( 0.97686463 0.97937725 0.93447296)], [qw( -0.17853319 -0.1616134 0.35601163)], [qw( -0.11773439 0.12124893 -0.0039999937)], ), \&PDL::abs], pct_var => pdl( qw(0.92889468 0.06157909 0.0095262297) ), ); test_stats_cmp(\%p, \%a, 1e-5); %p = $a->pca({CORR=>0, PLOT=>0}); %a = ( eigenvalue => [pdl(qw[ 22.0561695 1.581758022 0.202065959 ]), \&PDL::abs], eigenvector => [pdl( [qw(-0.511688 -0.595281 -0.619528)], [qw( 0.413568 0.461388 -0.78491)], [qw( 0.753085 -0.657846 0.0101023)], ), \&PDL::abs], loadings => [pdl( [qw(-0.96823408 -0.9739215 -0.94697802)], [qw( 0.20956865 0.20214966 -0.32129495)], [qw( 0.13639532 -0.10301693 0.001478041)], ), \&PDL::abs], pct_var => pdl( qw[0.925175 0.0663489 0.00847592] ), ); test_stats_cmp(\%p, \%a, 1e-4); } ok tapprox( t_pca_sorti(), 0 ), "pca_sorti - principal component analysis output sorted to find which vars a component is best represented"; sub t_pca_sorti { my $a = sequence 10, 5; $a = lvalue_assign_detour( $a, which($a % 7 == 0), 0 ); my %m = $a->pca({PLOT=>0}); my ($iv, $ic) = $m{loadings}->pca_sorti; return sum($iv - pdl(qw(4 1 0 2 3))) + sum($ic - pdl(qw( 0 1 2 ))); } SKIP: { eval { require PDL::Fit::LM; }; skip 
'no PDL::Fit::LM', 1 if $@; ok tapprox( t_logistic(), 0 ), 'logistic'; my $y = pdl( 0, 0, 0, 1, 1 ); my $x = pdl(2, 3, 5, 5, 5); my %m = $y->logistic( $x, {COV=>1} ); isnt $m{cov}, undef, 'get cov from logistic if ask'; }; sub t_logistic { my $y = pdl( 0, 0, 0, 1, 1 ); my $x = pdl(2, 3, 5, 5, 5); my %m = $y->logistic( $x ); my $y_pred = $x->glue(1, ones(5))->pred_logistic( $m{b} ); my $y_pred_ans = pdl qw(7.2364053e-07 0.00010154254 0.66666667 0.66666667 0.66666667); return sum( $y_pred - $y_pred_ans, $m{Dm_chisq} - 2.91082711764867 ); } my $a_bad = sequence 6; $a_bad->setbadat(-1); my $b_bad = pdl(0, 0, 0, 0, 1, 1); $b_bad->setbadat(0); ok tapprox( $a_bad->dev_m->avg, 0 ), "dev_m with bad values $a_bad"; ok tapprox( $a_bad->stddz->avg, 0 ), "stdz with bad values $a_bad"; ok tapprox( $a_bad->sse($b_bad), 23), "sse with bad values between $a_bad and $b_bad"; ok tapprox( $a_bad->mse($b_bad), 5.75), "mse with badvalues between $a_bad and $b_bad"; ok tapprox( $a_bad->rmse($b_bad), 2.39791576165636 ), "rmse with bad values between $a_bad and $b_bad"; ok tapprox( $b_bad->glue(1,ones(6))->pred_logistic(pdl(1,2))->sum, 4.54753948757851 ), "pred_logistic with bad values"; ok tapprox( $b_bad->d0(), 6.73011667009256 ), "null deviance with bad values on $b_bad"; ok tapprox( $b_bad->dm( ones(6) * .5 ), 6.93147180559945 ), "model deviance with bad values on $b_bad"; ok tapprox( sum($b_bad->dvrs(ones(6) * .5) ** 2), 6.93147180559945 ), "deviance residual with bad values on $b_bad"; { eval { effect_code(['a']) }; isnt $@, '', 'effect_code with only one value dies'; my @a = qw( a a a b b b b c c BAD ); my $a = effect_code(\@a); my $ans = pdl [ [qw( 1 1 1 0 0 0 0 -1 -1 -99 )], [qw( 0 0 0 1 1 1 1 -1 -1 -99 )] ]; $ans = $ans->setvaltobad(-99); is( sum(abs(which($a->isbad) - pdl(9,19))), 0, 'effect_code got bad value' ); ok tapprox( sum(abs($a - $ans)), 0 ), 'effect_code coded with bad value'; } ok tapprox( t_effect_code_w(), 0 ), 'effect_code_w'; sub t_effect_code_w { eval { 
effect_code_w(['a']) }; isnt $@, '', 'effect_code_w with only one value dies'; my @a = qw( a a a b b b b c c c ); my $a = effect_code_w(\@a); return sum($a->sumover - pdl byte, (0, 0)); } ok tapprox( t_anova(), 0 ), 'anova_3w'; sub t_anova { my $d = sequence 60; my @a = map {$a = $_; map { $a } 0..14 } qw(a b c d); my $b = $d % 3; my $c = $d % 2; $d = lvalue_assign_detour( $d, 20, 10 ); my %m = $d->anova(\@a, $b, $c, {IVNM=>[qw(A B C)], plot=>0}); $m{'# A ~ B ~ C # m'} = $m{'# A ~ B ~ C # m'}->(,2,)->squeeze; test_stats_cmp(\%m, { '| A | F' => 165.252100840336, '| A ~ B ~ C | F' => 0.0756302521008415, '# A ~ B ~ C # m' => pdl([[qw(8 18 38 53)], [qw(8 23 38 53)]]), }); } ok tapprox( t_anova_1way(), 0 ), 'anova_1w'; sub t_anova_1way { my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 ); my $a = qsort sequence(15) % 3; my %m = $d->anova($a, {plot=>0}); $m{$_} = $m{$_}->squeeze for '# IV_0 # m'; test_stats_cmp(\%m, { F => 0.160919540229886, ms_model => 0.466666666666669, '# IV_0 # m' => pdl(qw( 2.6 2.8 3.2 )), }); } ok tapprox( t_anova_bad_dv(), 0 ), 'anova_3w bad dv'; sub t_anova_bad_dv { my $d = sequence 60; $d = lvalue_assign_detour( $d, 20, 10 ); $d->setbadat(1); $d->setbadat(10); my @a = map {$a = $_; map { $a } 0..14 } qw(a b c d); my $b = sequence(60) % 3; my $c = sequence(60) % 2; my %m = $d->anova(\@a, $b, $c, {IVNM=>[qw(A B C)], plot=>0, v=>0}); $m{$_} = $m{$_}->(,1,)->squeeze for '# A ~ B ~ C # m', '# A ~ B ~ C # se'; test_stats_cmp(\%m, { '| A | F' => 150.00306433446, '| A ~ B ~ C | F' => 0.17534855325553, '# A ~ B ~ C # m' => pdl([qw( 4 22 37 52 )], [qw( 10 22 37 52 )]), '# A ~ B ~ C # se' => pdl([qw( 0 6 1.7320508 3.4641016 )], [qw( 3 3 3.4641016 1.7320508 )]), }); } ok tapprox( t_anova_bad_dv_iv(), 0 ), 'anova_3w bad dv iv'; sub t_anova_bad_dv_iv { my $d = sequence 63; my @a = map {$a = $_; map { $a } 0..14 } qw(a b c d); push @a, undef, qw( b c ); my $b = $d % 3; my $c = $d % 2; $d = lvalue_assign_detour( $d, 20, 10 ); $d->setbadat(62); $b->setbadat(61); my 
%m = $d->anova(\@a, $b, $c, {IVNM=>[qw(A B C)], plot=>0}); $m{$_} = $m{$_}->(,2,)->squeeze for '# A ~ B ~ C # m'; test_stats_cmp(\%m, { '| A | F' => 165.252100840336, '| A ~ B ~ C | F' => 0.0756302521008415, '# A ~ B ~ C # m' => pdl([qw(8 18 38 53)], [qw(8 23 38 53)]), }); } { my $a = pdl([0,1,2,3,4], [0,0,0,0,0]); $a = $a->setvaltobad(0); is( $a->fill_m->setvaltobad(0)->nbad, 5, 'fill_m nan to bad'); } { my $a = pdl([1,1,1], [2,2,2]); is( which($a->stddz == 0)->nelem, 6, 'stddz nan vs bad'); } ok tapprox( t_anova_rptd_basic(), 0 ), 'anova_rptd_basic'; sub t_anova_rptd_basic { # data from https://www.youtube.com/watch?v=Fh73dAOMm9M # Person,Before,After 2 weeks,After 4 weeks # P1,102,97,95 # P2,79,77,75 # P3,83,77,75 # P4,92,93,87 # in Octave, statistics package 1.4.2: # [p, table] = repanova([102 97 95; 79 77 75; 83 77 75; 92 93 87], 3, 'string') # p = 7.3048e-03 # table = # Source SS df MS F Prob > F # Subject 916.667 3 305.556 # Measure 72 2 36 12.4615 0.00730475 # Error 17.3333 6 2.88889 # turned into format for anova_rptd, then ($data, $idv, $subj) = rtable 'diet.txt', {SEP=>','} # Person,Week,Weight # P1,0,102 # P1,2,97 # P1,4,95 # P2,0,79 # P2,2,77 # P2,4,75 # P3,0,83 # P3,2,77 # P3,4,75 # P4,0,92 # P4,2,93 # P4,4,87 my ($data, $ivnm, $subj) = ( pdl( q[ [ 0 2 4 0 2 4 0 2 4 0 2 4] [102 97 95 79 77 75 83 77 75 92 93 87] ] ), [ qw(Week) ], [ qw(P1 P1 P1 P2 P2 P2 P3 P3 P3 P4 P4 P4) ], ); my ($w, $dv) = $data->dog; my %m = $dv->anova_rptd($subj, $w, {ivnm=>$ivnm}); test_stats_cmp(\%m, { '| Week | F' => 12.4615384615385, '| Week | df' => 2, '| Week | ms' => 36, '| Week | ss' => 72, ss_subject => 916.666666, }); } ok tapprox( t_anova_rptd_1way(), 0 ), 'anova_rptd_1w'; sub t_anova_rptd_1way { my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 ); my $s = sequence(5)->dummy(1,3)->flat; my $a = qsort sequence(15) % 3; my %m = $d->anova_rptd($s, $a, {plot=>0}); $m{$_} = $m{$_}->squeeze for '# IV_0 # m'; test_stats_cmp(\%m, { '| IV_0 | F' => 0.145077720207254, '| IV_0 | ms' 
=> 0.466666666666667, '# IV_0 # m' => pdl(qw( 2.6 2.8 3.2 )), }); } ok tapprox( t_anova_rptd_2way_bad_dv(), 0 ), 'anova_rptd_2w bad dv'; my %anova_bad_a = ( '| a | F' => 0.351351351351351, '| a | ms' => 0.722222222222222, '| a ~ b | F' => 5.25, '# a ~ b # m' => pdl(qw( 3 1.3333333 3.3333333 3.3333333 3.6666667 2.6666667 ))->reshape(3,2), ); sub t_anova_rptd_2way_bad_dv { my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2); $d = $d->setbadat(5); my $s = sequence(4)->dummy(1,6)->flat; # [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3] my $a = qsort sequence(24) % 3; # [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2] my $b = (sequence(8) > 3)->dummy(1,3)->flat; # [0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1] my %m = $d->anova_rptd($s, $a, $b, {ivnm=>['a','b'],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_bad_a); } ok tapprox( t_anova_rptd_2way_bad_iv(), 0 ), 'anova_rptd_2w bad iv'; sub t_anova_rptd_2way_bad_iv { my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2); my $s = sequence(4)->dummy(1,6)->flat; # [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3] my $a = qsort sequence(24) % 3; $a = $a->setbadat(5); # [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2] my $b = (sequence(8) > 3)->dummy(1,3)->flat; # [0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1] my %m = $d->anova_rptd($s, $a, $b, {ivnm=>['a','b'],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_bad_a); } ok tapprox( t_anova_rptd_3way(), 0 ), 'anova_rptd_3w'; sub t_anova_rptd_3way { my $d = pdl( qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2 ), qw( 5 5 1 1 4 4 1 4 4 2 3 3 5 1 1 2 4 4 4 5 5 1 1 2 ) ); my $s = sequence(4)->dummy(0,12)->flat; my $a = sequence(2)->dummy(0,6)->flat->dummy(1,4)->flat; my $b = sequence(2)->dummy(0,3)->flat->dummy(1,8)->flat; my $c = sequence(3)->dummy(1,16)->flat; my %m = $d->anova_rptd($s, $a, $b, $c, {ivnm=>['a','b', 'c'],plot=>0}); test_stats_cmp(\%m, { '| a | F' => 0.572519083969459, '| a | ms' => 0.520833333333327, '| a ~ c | F' => 3.64615384615385, 
'| b ~ c || err ms' => 2.63194444444445, '| a ~ b ~ c | F' => 1.71299093655589, '# a ~ b ~ c # m' => pdl(qw( 4 2.75 2.75 2.5 3.25 4.25 3.5 1.75 2 3.5 2.75 2.25 ))->reshape(2,2,3), '# a ~ b # se' => ones(2, 2) * 0.55014729, }); } ok tapprox( t_anova_rptd_mixed(), 0 ), 'anova_rptd mixed'; sub t_anova_rptd_mixed { my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2); my $s = sequence(4)->dummy(1,6)->flat; # [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3] my $a = qsort sequence(24) % 3; # [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2] my $b = (sequence(8) > 3)->dummy(1,3)->flat; # [0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1] my %m = $d->anova_rptd($s, $a, $b, {ivnm=>['a','b'],btwn=>[1],plot=>0, v=>0}); test_stats_cmp(\%m, { '| a | F' => 0.0775862068965517, '| a | ms' => 0.125, '| a ~ b | F' => 1.88793103448276, '| b | F' => 0.585657370517928, '| b || err ms' => 3.48611111111111, '# a ~ b # se' => ones(3,2) * 0.63464776, }); } # Tests for mixed anova thanks to Erich Greene ok tapprox( t_anova_rptd_mixed_l2ord2(), 0, ), 'anova_rptd mixed with 2 btwn-subj var levels, data grouped by subject'; SKIP: { skip "yet to be fixed", 3; ok tapprox( t_anova_rptd_mixed_l2ord1(), 0, ), 'anova_rptd mixed with 2 btwn-subj var levels, data grouped by within var'; ok tapprox( t_anova_rptd_mixed_l3ord1(), 0, .001 ), 'anova_rptd mixed with 3 btwn-subj var levels, data grouped by within var'; ok tapprox( t_anova_rptd_mixed_l3ord2(), 0, .001 ), 'anova_rptd mixed with 3 btwn-subj var levels, data grouped by subject'; } sub test_stats_cmp { local $Test::Builder::Level = $Test::Builder::Level + 1; my ($m, $ans, $eps) = @_; $eps ||= 1e-6; my $error = pdl 0; foreach (sort keys %$ans) { my $got = PDL->topdl($m->{$_}); my $exp = $ans->{$_}; if (ref $exp eq 'ARRAY') { ($exp, my $func) = @$exp; ($got, $exp) = map &$func($_), $got, $exp; } $exp = PDL->topdl($exp); $error = $error + (my $this_diff = $got - $exp); fail($_), diag "got $m->{$_}\nexpected $exp" if any($this_diff->abs > 
$eps); } return $error; } sub t_anova_rptd_mixed_backend { my ($d,$s,$w,$b,$ans) = @_; my %m = $d->anova_rptd($s,$w,$b,{ivnm=>['within','between'],btwn=>[1],plot=>0, v=>0}); test_stats_cmp(\%m, $ans); } sub t_anova_rptd_mixed_l2_common { my ($d,$s,$w,$b) = @_; my %ans = ( '| within | df' => 2, '| within || err df' => 12, '| within | ss' => .25, '| within | ms' => .125, '| within || err ss' => 23.666667, '| within || err ms' => 1.9722222, '| within | F' => 0.063380282, '| between | df' => 1, '| between || err df' => 6, '| between | ss' => 2.0416667, '| between | ms' => 2.0416667, '| between || err ss' => 16.583333, '| between || err ms' => 2.7638889, '| between | F' => 0.73869347, '| within ~ between | df' => 2, '| within ~ between | ss' => 6.0833333, '| within ~ between | ms' => 3.0416667, '| within ~ between | F' => 1.5422535, ); $ans{"| within ~ between || err $_"} = $ans{"| within || err $_"} foreach qw/df ss ms/; return t_anova_rptd_mixed_backend($d,$s,$w,$b,\%ans); } sub t_anova_rptd_mixed_l3_common { my ($d,$s,$w,$b) = @_; my %ans = ( '| within | df' => 2, '| within || err df' => 12, '| within | ss' => .963, '| within | ms' => .481, '| within || err ss' => 20.889, '| within || err ms' => 1.741, '| within | F' => .277, '| between | df' => 2, '| between || err df' => 6, '| between | ss' => 1.185, '| between | ms' => .593, '| between || err ss' => 13.111, '| between || err ms' => 2.185, '| between | F' => .271, '| within ~ between | df' => 4, '| within ~ between | ss' => 4.148, '| within ~ between | ms' => 1.037, '| within ~ between | F' => .596, ); $ans{"| within ~ between || err $_"} = $ans{"| within || err $_"} foreach qw/df ss ms/; return t_anova_rptd_mixed_backend($d,$s,$w,$b,\%ans); } sub t_anova_rptd_mixed_l2ord1 { my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2); my $s = sequence(8)->dummy(1,3)->flat; # [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] my $w = qsort sequence(24) % 3; # [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2] my 
$b = (sequence(8) % 2)->qsort->dummy(1,3)->flat; # [0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1] return t_anova_rptd_mixed_l2_common($d,$s,$w,$b); } sub t_anova_rptd_mixed_l2ord2 { my $d = pdl qw( 3 1 4 2 4 2 1 1 1 5 2 5 2 3 4 1 5 3 5 5 2 3 3 2); my $s = qsort sequence(24) % 8; # [0 0 0 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7] my $w = sequence(3)->dummy(1,8)->flat; # [0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2] my $b = qsort sequence(24) % 2; # [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1] return t_anova_rptd_mixed_l2_common($d,$s,$w,$b); } sub t_anova_rptd_mixed_l3ord1 { my $d = pdl qw( 5 2 2 5 4 1 5 3 5 4 4 3 4 3 4 3 5 1 4 3 3 4 5 4 5 5 2 ); my $s = sequence(9)->dummy(1,3)->flat; # [0 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 8] my $w = qsort sequence(27) % 3; # [0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2] my $b = (sequence(9) % 3)->qsort->dummy(1,3)->flat; # [0 0 0 1 1 1 2 2 2 0 0 0 1 1 1 2 2 2 0 0 0 1 1 1 2 2 2] return t_anova_rptd_mixed_l3_common($d,$s,$w,$b); } sub t_anova_rptd_mixed_l3ord2 { my $d = pdl qw( 5 4 4 2 4 3 2 3 3 5 4 4 4 3 5 1 4 4 5 3 5 3 5 5 5 1 2 ); my $s = qsort sequence(27) % 9; # [0 0 0 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8] my $w = sequence(3)->dummy(1,9)->flat; # [0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2] my $b = qsort sequence(27) % 3; # [0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2] return t_anova_rptd_mixed_l3_common($d,$s,$w,$b); } ok tapprox( t_anova_rptd_mixed_bad(), 0 ), 'anova_rptd mixed bad'; sub t_anova_rptd_mixed_bad { my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2 1 1 1 1 ); my $s = sequence(4)->dummy(1,6)->flat; # [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3] # add subj 4 at the end $s = $s->append(ones(4) * 4); my $a = qsort sequence(24) % 3; # [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2] $a = $a->append(zeroes(4)); my $b = (sequence(8) > 3)->dummy(1,3)->flat; # [0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1] $b = 
$b->append(zeroes(4)); # any missing value causes all data from the subject (4) to be dropped $b->setbadat(-1); my %m = $d->anova_rptd($s, $a, $b, {ivnm=>['a','b'],btwn=>[1],plot=>0, v=>0}); test_stats_cmp(\%m, { '| a | F' => 0.0775862068965517, '| a | ms' => 0.125, '| a ~ b | F' => 1.88793103448276, '| b | F' => 0.585657370517928, '| b || err ms' => 3.48611111111111, '# a ~ b # se' => ones(3,2) * 0.63464776, }); } ok tapprox( t_anova_rptd_mixed_4w(), 0 ), 'anova_rptd_mixed_4w'; sub t_anova_rptd_mixed_4w { my ($data, $idv, $subj) = rtable \*DATA, {v=>0}; my ($age, $aa, $beer, $wings, $dv) = $data->dog; my %m = $dv->anova_rptd( $subj, $age, $aa, $beer, $wings, { ivnm=>[qw(age aa beer wings)], btwn=>[0,1], v=>0, plot=>0 } ); test_stats_cmp(\%m, { '| aa | F' => 0.0829493087557666, '| age ~ aa | F' => 2.3594470046083, '| beer | F' => 0.00943396226415362, '| aa ~ beer | F' => 0.235849056603778, '| age ~ beer ~ wings | F' => 0.0303030303030338, '| beer ~ wings | F' => 2.73484848484849, '| age ~ aa ~ beer ~ wings | F' => 3.03030303030303, }); } { my $a = effect_code( sequence(12) > 5 ); my $b = effect_code([ map {(0, 1)} (1..6) ]); my $c = effect_code([ map {(0,0,1,1,2,2)} (1..2) ]); my $ans = pdl [ [qw( 1 -1 0 -0 -1 1 -1 1 -0 0 1 -1 )], [qw( 0 -0 1 -1 -1 1 -0 0 -1 1 1 -1 )] ]; my $inter = interaction_code( $a, $b, $c); is(sum(abs($inter - $ans)), 0, 'interaction_code'); } done_testing(); sub lvalue_assign_detour { my ($pdl, $index, $new_value) = @_; my @arr = list $pdl; my @ind = ref($index)? 
list($index) : $index; $arr[$_] = $new_value for (@ind); return pdl(\@arr)->reshape($pdl->dims)->sever; } __DATA__ subj age Apple-android beer wings recall 1 0 0 0 0 5 1 0 0 0 1 4 1 0 0 1 0 8 1 0 0 1 1 3 2 0 0 0 0 3 2 0 0 0 1 7 2 0 0 1 0 9 2 0 0 1 1 3 3 0 0 0 0 2 3 0 0 0 1 9 3 0 0 1 0 1 3 0 0 1 1 0 1 0 1 0 0 4 1 0 1 0 1 6 1 0 1 1 0 9 1 0 1 1 1 6 2 0 1 0 0 9 2 0 1 0 1 7 2 0 1 1 0 5 2 0 1 1 1 8 3 0 1 0 0 6 3 0 1 0 1 6 3 0 1 1 0 3 3 0 1 1 1 4 1 1 0 0 0 8 1 1 0 0 1 8 1 1 0 1 0 10 1 1 0 1 1 7 2 1 0 0 0 10 2 1 0 0 1 1 2 1 0 1 0 8 2 1 0 1 1 11 3 1 0 0 0 4 3 1 0 0 1 10 3 1 0 1 0 5 3 1 0 1 1 2 1 1 1 0 0 10 1 1 1 0 1 6 1 1 1 1 0 10 1 1 1 1 1 6 2 1 1 0 0 2 2 1 1 0 1 5 2 1 1 1 0 9 2 1 1 1 1 4 3 1 1 0 0 3 3 1 1 0 1 5 3 1 1 1 0 9 3 1 1 1 1 2 PDL-Stats-0.84/t/stats_basic.t0000644000175000017500000001761614544645567016074 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::Basic; sub tapprox { my($a,$b, $eps) = @_; $eps ||= 1e-6; my $diff = abs($a-$b); # use max to make it perl scalar ref $diff eq 'PDL' and $diff = $diff->max; return $diff < $eps; } my $a = sequence 5; is( tapprox( $a->stdv, 1.4142135623731 ), 1, "standard deviation of $a"); is( tapprox( $a->stdv_unbiased, 1.58113883008419 ), 1, "unbiased standard deviation of $a"); is( tapprox( $a->var, 2 ), 1, "variance of $a"); is( tapprox( $a->var_unbiased, 2.5 ), 1, "unbiased variance of $a"); is( tapprox( $a->se, 0.707106781186548 ), 1, "standard error of $a"); is( tapprox( $a->ss, 10 ), 1, "sum of squared deviations from the mean of $a"); is( tapprox( $a->skew, 0 ), 1, "sample skewness of $a"); is( tapprox( $a->skew_unbiased, 0 ), 1, "unbiased sample skewness of $a"); is( tapprox( $a->kurt, -1.3 ), 1, "sample kurtosis of $a"); is( tapprox( $a->kurt_unbiased, -1.2 ), 1, "unbiased sample kurtosis of $a"); { ok(tapprox($_->ss, (($_ - $_->avg)**2)->sum), "ss for $_") for pdl('[1 1 1 1 2 3 4 4 4 4 4 4]'), pdl('[1 2 2 2 3 3 3 3 4 4 5 5]'), pdl('[1 1 1 2 2 3 3 4 4 5 5 
5]'); } my $a_bad = sequence 6; $a_bad->setbadat(-1); is( tapprox( $a_bad->stdv, 1.4142135623731 ), 1, "standard deviation of $a_bad"); is( tapprox( $a_bad->stdv_unbiased, 1.58113883008419 ), 1, "unbiased standard deviation of $a_bad"); is( tapprox( $a_bad->var, 2 ), 1, "variance of $a_bad"); is( tapprox( $a_bad->var_unbiased, 2.5 ), 1, "unbiased variance of $a_bad"); is( tapprox( $a_bad->se, 0.707106781186548 ), 1, "standard error of $a_bad"); is( tapprox( $a_bad->ss, 10 ), 1, "sum of squared deviations from the mean of $a_bad"); is( tapprox( $a_bad->skew, 0 ), 1, "sample skewness of $a_bad"); is( tapprox( $a_bad->skew_unbiased, 0 ), 1, "unbiased sample skewness of $a_bad"); is( tapprox( $a_bad->kurt, -1.3 ), 1, "sample kurtosis of $a_bad"); is( tapprox( $a_bad->kurt_unbiased, -1.2 ), 1, "unbiased sample kurtosis of $a_bad"); my $b = sequence 5; $b %= 2; $b = qsort $b; is( tapprox( $a->cov($b), 0.6 ), 1, "sample covariance of $a and $b" ); is( tapprox( $a->corr($b), 0.866025403784439 ), 1, "Pearson correlation coefficient of $a and $b"); is( tapprox( $a->n_pair($b), 5 ), 1, "Number of good pairs between $a and $b"); is( tapprox( $a->corr($b)->t_corr( 5 ), 3 ), 1, "t significance test of Pearson correlation coefficient of $a and $b"); is( tapprox( $a->corr_dev($b), 0.903696114115064 ), 1, "correlation calculated from dev_m values of $a and $b"); my $b_bad = sequence 6; $b_bad = qsort( $b_bad % 2 ); $b_bad->setbadat(0); is( tapprox( $a_bad->cov($b_bad), 0.5 ), 1, "sample covariance with bad data of $a_bad and $b_bad"); is( tapprox( $a_bad->corr($b_bad), 0.894427190999916 ), 1, "Pearson correlation coefficient with bad data of $a_bad and $b_bad"); is( tapprox( $a_bad->n_pair($b_bad), 4 ), 1, "Number of good pairs between $a_bad and $b_bad with bad values taken into account"); is( tapprox( $a_bad->corr($b_bad)->t_corr( 4 ), 2.82842712474619 ), 1, "t signifiance test of Pearson correlation coefficient with bad data of $a_bad and $b_bad"); is( tapprox( 
$a_bad->corr_dev($b_bad), 0.903696114115064 ), 1, "correlation calculated from dev_m values with bad data of $a_bad and $b_bad"); my ($t, $df) = $a->t_test($b); is( tapprox( $t, 2.1380899352994 ), 1, "t-test between $a and $b - 't' output"); is( tapprox( $df, 8 ), 1, "t-test between $a and $b - 'df' output"); ($t, $df) = $a->t_test_nev($b); is( tapprox( $t, 2.1380899352994 ), 1, "t-test with non-equal variance between $a and $b - 't' output"); is( tapprox( $df, 4.94637223974763 ), 1, "t-test with non-equal variance between $a and $b - 'df' output"); ($t, $df) = $a->t_test_paired($b); is( tapprox( $t, 3.13785816221094 ), 1, "paired sample t-test between $a and $b - 't' output"); is( tapprox( $df, 4 ), 1, "paired sample t-test between $a and $b - 'df' output"); ($t, $df) = $a_bad->t_test($b_bad); is( tapprox( $t, 1.87082869338697 ), 1, "t-test with bad values between $a_bad and $b_bad - 't' output"); is( tapprox( $df, 8 ), 1, "t-test with bad values between $a_bad and $b_bad - 'd' output"); ($t, $df) = $a_bad->t_test_nev($b_bad); is( tapprox( $t, 1.87082869338697 ), 1, "t-test with non-equal variance with bad values between $a_bad and $b_bad - 't' output"); is( tapprox( $df, 4.94637223974763 ), 1, "t-test with non-equal variance with bad values between $a_bad and $b_bad - 'df' output"); ($t, $df) = $a_bad->t_test_paired($b_bad); is( tapprox( $t, 4.89897948556636 ), 1, "paired sample t-test with bad values between $a_bad and $b_bad - 't' output"); is( tapprox( $df, 3 ), 1, "paired sample t-test with bad values between $a_bad and $b_bad - 'df' output"); { my ($data, $idv, $ido) = rtable(\*DATA, {V=>0}); is( tapprox( sum(pdl($data->dims) - pdl(14, 5)), 0 ), 1, 'rtable data dim' ); is( tapprox( $data->sum / $data->nbad, 1.70731707317073 ), 1, 'rtable bad elem' ); } { my $a = random 10, 3; is( tapprox( sum($a->cov_table - $a->cov($a->dummy(1))), 0 ), 1, 'cov_table' ); $a->setbadat(4,0); is( tapprox( sum($a->cov_table - $a->cov($a->dummy(1))), 0 ), 1, 'cov_table bad val' 
); } { my $a = random 10, 3; is( tapprox( sum(abs($a->corr_table - $a->corr($a->dummy(1)))), 0 ), 1, "Square Pearson correlation table"); $a->setbadat(4,0); is( tapprox( sum(abs($a->corr_table - $a->corr($a->dummy(1)))), 0 ), 1, "Square Pearson correlation table with bad data"); } { my $a = pdl([0,1,2,3,4], [0,0,0,0,0]); $a = $a->setvaltobad(0); is( $a->stdv->nbad, 1, "Bad value input to stdv makes the stdv itself bad"); } SKIP: { eval { require PDL::Core; require PDL::GSL::CDF; }; skip 'no PDL::GSL::CDF', 1 if $@; my $x = pdl(1, 2); my $n = pdl(2, 10); my $p = .5; my $a = pdl qw[ 0.75 0.9892578125 ]; is (tapprox( sum(abs(binomial_test( $x,$n,$p ) - $a)) ,0), 1, 'binomial_test'); } { my $a = sequence 10, 2; my $factor = sequence(10) > 4; my $ans = pdl( [[0..4], [10..14]], [[5..9], [15..19]] ); my ($a_, $l) = $a->group_by($factor); is( tapprox( sum(abs($a_ - $ans)), 0 ), 1, 'group_by single factor equal n' ); is_deeply( $l, [0, 1], 'group_by single factor label'); $a = sequence 10,2; $factor = qsort sequence(10) % 3; $ans = pdl( [1.5, 11.5], [5, 15], [8, 18] ); is( tapprox( sum(abs($a->group_by($factor)->average - $ans)), 0 ), 1, 'group_by single factor unequal n' ); $a = sequence 10; my @factors = ( [qw( a a a a b b b b b b )], [qw(0 1 0 1 0 1 0 1 0 1)] ); $ans = pdl( [ [0,2,-1], [1,3,-1], ], [ [4,6,8], [5,7,9], ] ); $ans->badflag(1); $ans = $ans->setvaltobad(-1); ($a_, $l) = $a->group_by( @factors ); is(tapprox(sum(abs($a_ - $ans)), 0), 1, 'group_by multiple factors') or diag($a_, $ans); is_deeply($l, [[qw(a_0 a_1)], [qw( b_0 b_1 )]], 'group_by multiple factors label'); } { my @a = qw(a a b b c c); my $a = PDL::Stats::Basic::_array_to_pdl( \@a ); my $ans = pdl( 0,0,1,1,2,2 ); is( tapprox( sum(abs($a - $ans)), 0 ), 1, '_array_to_pdl' ); $a[-1] = undef; my $a_bad = PDL::Stats::Basic::_array_to_pdl( \@a ); my $ans_bad = pdl( 0,0,1,1,2,2 ); $ans_bad = $ans_bad->setbadat(-1); like( $a_bad(-1)->isbad(), qr/1/, '_array_to_pdl with missing value undef' ); is( tapprox( 
sum(abs($a_bad - $ans_bad)), 0 ), 1, '_array_to_pdl with missing value undef correctly coded' ); $a[-1] = 'BAD'; $a_bad = PDL::Stats::Basic::_array_to_pdl( \@a ); like( $a_bad(-1)->isbad(), qr/1/, '_array_to_pdl with missing value BAD' ); is( tapprox( sum(abs($a_bad - $ans_bad)), 0 ), 1, '_array_to_pdl with missing value BAD correctly coded' ); } done_testing(); __DATA__ 999 90 91 92 93 94 70 5 7 -999 -999 -999 711 trying 71 -999 3 -999 -999 0 72 2 7 -999 -999 -999 73 -999 0 -999 -999 2 74 5 -999 1 0 -999 75 -999 0 -999 -999 0 76 9 8 1 5 -999 77 4 -999 -999 -999 -999 78 -999 0 -999 -999 0 79 -999 3 -999 -999 0 80 -999 0 -999 -999 2 81 5 -999 1 0 -999 82 -999 0 -999 -999 0 PDL-Stats-0.84/t/00-report-prereqs.t0000644000175000017500000001347614544645567017004 0ustar osboxesosboxesuse strict; use warnings; # This test was generated by Dist::Zilla::Plugin::Test::ReportPrereqs 0.020 # THEN modified with more info by Ed J for PDL project use Test::More tests => 1; use ExtUtils::MakeMaker; use File::Spec; # from $version::LAX my $lax_version_re = qr/(?: undef | (?: (?:[0-9]+) (?: \. | (?:\.[0-9]+) (?:_[0-9]+)? )? | (?:\.[0-9]+) (?:_[0-9]+)? ) | (?: v (?:[0-9]+) (?: (?:\.[0-9]+)+ (?:_[0-9]+)? )? | (?:[0-9]+)? (?:\.[0-9]+){2,} (?:_[0-9]+)? ) )/x; # hide optional CPAN::Meta modules from prereq scanner # and check if they are available my $cpan_meta = "CPAN::Meta"; my $cpan_meta_pre = "CPAN::Meta::Prereqs"; my $HAS_CPAN_META = eval "require $cpan_meta; $cpan_meta->VERSION('2.120900')" && eval "require $cpan_meta_pre"; ## no critic # Verify requirements? my $DO_VERIFY_PREREQS = 1; sub _max { my $max = shift; $max = ( $_ > $max ) ? 
$_ : $max for @_; return $max; } sub _merge_prereqs { my ($collector, $prereqs) = @_; # CPAN::Meta::Prereqs object if (ref $collector eq $cpan_meta_pre) { return $collector->with_merged_prereqs( CPAN::Meta::Prereqs->new( $prereqs ) ); } # Raw hashrefs for my $phase ( keys %$prereqs ) { for my $type ( keys %{ $prereqs->{$phase} } ) { for my $module ( keys %{ $prereqs->{$phase}{$type} } ) { $collector->{$phase}{$type}{$module} = $prereqs->{$phase}{$type}{$module}; } } } return $collector; } my @include = qw( ); my @exclude = qw( ); # Add static prereqs to the included modules list my $static_prereqs = do 't/00-report-prereqs.dd'; # Merge all prereqs (either with ::Prereqs or a hashref) my $full_prereqs = _merge_prereqs( ( $HAS_CPAN_META ? $cpan_meta_pre->new : {} ), $static_prereqs ); # Add dynamic prereqs to the included modules list (if we can) my ($source) = grep { -f } 'MYMETA.json', 'MYMETA.yml'; if ( $source && $HAS_CPAN_META ) { if ( my $meta = eval { CPAN::Meta->load_file($source) } ) { $full_prereqs = _merge_prereqs($full_prereqs, $meta->prereqs); } } else { $source = 'static metadata'; } my @full_reports; my @dep_errors; my $req_hash = $HAS_CPAN_META ? $full_prereqs->as_string_hash : $full_prereqs; # Add static includes into a fake section for my $mod (@include) { $req_hash->{other}{modules}{$mod} = 0; } for my $phase ( qw(configure build test runtime develop other) ) { next unless $req_hash->{$phase}; next if ($phase eq 'develop' and not $ENV{AUTHOR_TESTING}); for my $type ( qw(requires recommends suggests conflicts modules) ) { next unless $req_hash->{$phase}{$type}; my $title = ucfirst($phase).' 
'.ucfirst($type); my @reports = [qw/Module Want Have Where Howbig/]; for my $mod ( sort keys %{ $req_hash->{$phase}{$type} } ) { next if $mod eq 'perl'; next if grep { $_ eq $mod } @exclude; my $file = $mod; $file =~ s{::}{/}g; $file .= ".pm"; my ($prefix) = grep { -e File::Spec->catfile($_, $file) } @INC; my $want = $req_hash->{$phase}{$type}{$mod}; $want = "undef" unless defined $want; $want = "any" if !$want && $want == 0; my $req_string = $want eq 'any' ? 'any version required' : "version '$want' required"; if ($prefix) { my $filename = File::Spec->catfile($prefix, $file); my $have = MM->parse_version( $filename ); $have = "undef" unless defined $have; push @reports, [$mod, $want, $have, $prefix, (-s $filename)]; if ( $DO_VERIFY_PREREQS && $HAS_CPAN_META && $type eq 'requires' ) { if ( $have !~ /\A$lax_version_re\z/ ) { push @dep_errors, "$mod version '$have' cannot be parsed ($req_string)"; } elsif ( ! $full_prereqs->requirements_for( $phase, $type )->accepts_module( $mod => $have ) ) { push @dep_errors, "$mod version '$have' is not in required range '$want'"; } } } else { push @reports, [$mod, $want, "missing", '', 0]; if ( $DO_VERIFY_PREREQS && $type eq 'requires' ) { push @dep_errors, "$mod is not installed ($req_string)"; } } } if ( @reports ) { push @full_reports, "=== $title ===\n\n"; my $ml = _max( map { length $_->[0] } @reports ); my $wl = _max( map { length $_->[1] } @reports ); my $hl = _max( map { length $_->[2] } @reports ); my $ll = _max( map { length $_->[3] } @reports ); # location my $sl = _max( map { length $_->[4] } @reports ); # size if ($type eq 'modules') { splice @reports, 1, 0, ["-" x $ml, "", "-" x $hl, "-" x $ll, "-" x $sl]; push @full_reports, map { sprintf(" %*s %*s\n", -$ml, $_->[0], $hl, $_->[2]) } @reports; } else { splice @reports, 1, 0, ["-" x $ml, "-" x $wl, "-" x $hl, "-" x $ll, "-" x $sl]; push @full_reports, map { sprintf(" %*s %*s %*s %*s %*s\n", -$ml, $_->[0], $wl, $_->[1], $hl, $_->[2], -$ll, $_->[3], $sl, $_->[4]) } 
@reports; } push @full_reports, "\n"; } } } if ( @full_reports ) { diag "\nVersions for all modules listed in $source (including optional ones):\n\n", @full_reports; } if ( @dep_errors ) { diag join("\n", "\n*** WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING ***\n", "The following REQUIRED prerequisites were not satisfied:\n", @dep_errors, "\n" ); } pass; # vim: ts=4 sts=4 sw=4 et: PDL-Stats-0.84/README.md0000644000175000017500000000765214111727665014411 0ustar osboxesosboxes# PDL-Stats | Build status | | ------------- | | ![Build Status](https://github.com/PDLPorters/PDL-Stats/workflows/perl/badge.svg?branch=master) | [![Coverage Status](https://coveralls.io/repos/PDLPorters/PDL-Stats/badge.svg?branch=master&service=github)](https://coveralls.io/github/PDLPorters/PDL-Stats?branch=master) [![CPAN version](https://badge.fury.io/pl/PDL-Stats.svg)](https://metacpan.org/pod/PDL::Stats) This is a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people. They make perldl--the simple shell for PDL--work like a teenie weenie R, but with PDL threading--"the fast (and automagic) vectorised iteration of 'elementary operations' over arbitrary slices of multidimensional data"--on procedures including t-test, ordinary least squares regression, and kmeans. Of course, they also work in perl scripts. ## DEPENDENCIES - PDL Perl Data Language. Preferably installed with a Fortran compiler. A few methods (logistic regression and all plotting methods) will only work with a Fortran compiler and some methods (ordinary least squares regression and pca) work much faster with a Fortran compiler. The required PDL version is 2.057. - GSL (Optional) GNU Scientific Library. This is required by PDL::Stats::Distr and PDL::GSL::CDF, the latter of which provides p-values for PDL::Stats::GLM. GSL is otherwise NOT required for the core PDL::Stats modules to work, ie Basic, Kmeans, and GLM. 
- PGPLOT (Optional) PDL-Stats currently uses PGPLOT for plotting. There are three pgplot/PGPLOT modules, which cause much confusion upon installation. First there is the pgplot Fortran library. Then there is the perl PGPLOT module, which is the perl interface to pgplot. Finally there is PDL::Graphics::PGPLOT, which depends on pgplot and PGPLOT, that PDL-Stats uses for plotting. ## INSTALLATION ### \*nix For standard perl module installation in \*nix environment form source, to install all included modules, extract the files from the archive by entering this at a shell, tar xvf PDL-Stats-xxx.tar.gz then change to the PDL-Stats directory, cd PDL-Stats-xxx and run the following commands: perl Makefile.PL make make test sudo make install If you don't have permission to run sudo, you can specify an alternative path, perl Makefile.PL PREFIX=/home/user/my_perl_lib make make test make install then add `/home/user/my_perl_lib` to your PERL5LIB environment variable. If you have trouble installing PDL, you can look for help at the PDL wiki or PDL mailing list. ### Windows Thanks to Sisyphus, Windows users can download and install the ppm version of PDL-Stats and all dependencies using the PPM utility included in ActiveState perl or Strawberry perl. You can also get the PPM utility from CPAN. ppm install http://www.sisyphusion.tk/ppm/PGPLOT.ppd ppm install http://www.sisyphusion.tk/ppm/PDL.ppd ppm install http://www.sisyphusion.tk/ppm/PDL-Stats.ppd ## SUPPORT AND DOCUMENTATION After installing, you can find documentation for the modules with the perldoc command. perldoc PDL::Stats perldoc PDL::Stats::Basic etc. 
You can also look for information at: Home https://github.com/PDLPorters/PDL-Stats Search CPAN https://metacpan.org/dist/PDL-Stats Mailing list (low traffic, open a GitHub issue instead) https://lists.sourceforge.net/lists/listinfo/pdl-stats-help If you notice a bug or have a request, please submit a report at [https://github.com/PDLPorters/PDL-Stats/issues](https://github.com/PDLPorters/PDL-Stats/issues) If you would like to help develop or maintain the package, please email me at the address below. ## COPYRIGHT AND LICENCE Copyright (C) 2009-2012 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. PDL-Stats-0.84/TS/0000755000175000017500000000000014625061425013441 5ustar osboxesosboxesPDL-Stats-0.84/TS/ts.pd0000644000175000017500000004465214111727665014434 0ustar osboxesosboxespp_addpm({At=>'Top'}, <<'EOD'); =encoding utf8 =head1 NAME PDL::Stats::TS -- basic time series functions =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. Plots require PDL::Graphics::PGPLOT. ***EXPERIMENTAL!*** In particular, bad value support is spotty and may be shaky. USE WITH DISCRETION! =head1 SYNOPSIS use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::TS; my $r = $data->acf(5); =cut use Carp; use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::Basic; use PDL::Stats::Kmeans; my $DEV = ($^O =~ /win/i)? 
'/png' : '/xs'; EOD pp_addhdr(' #include #define Z10 1.64485362695147 #define Z05 1.95996398454005 #define Z01 2.5758293035489 #define Z001 3.29052673149193 ' ); pp_def('_acf', Pars => 'x(t); [o]r(h)', OtherPars => 'IV lag=>h', GenericTypes => [F,D], Code => ' $GENERIC(x) s, s2, m, cov0, covh; s=0; s2=0; m=0; cov0=0; covh=0; PDL_Indx T, i; T = $SIZE(t); loop(t) %{ s += $x(); s2 += pow($x(), 2); %} m = s/T; cov0 = s2 - T * pow(m, 2); loop (h) %{ if (h) { covh = 0; for (i=0; ii) - m) * ($x(t=>i+h) - m); } $r() = covh / cov0; } else { $r() = 1; } %} ', Doc => undef, ); pp_def('_acvf', Pars => 'x(t); [o]v(h)', OtherPars => 'IV lag=>h;', GenericTypes => [F,D], Code => ' $GENERIC(x) s, s2, m, covh; s=0; s2=0; m=0; covh=0; long T, i; T = $SIZE(t); loop(t) %{ s += $x(); s2 += pow($x(), 2); %} m = s/T; loop (h) %{ if (h) { covh = 0; for (i=0; ii) - m) * ($x(t=>i+h) - m); } $v() = covh; } else { $v() = s2 - T * pow(m, 2); } %} ', Doc => undef, ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 acf =for sig Signature: (x(t); int h(); [o]r(h+1)) =for ref Autocorrelation function for up to lag h. If h is not specified it's set to t-1 by default. acf does not process bad values. =for usage usage: perldl> $a = sequence 10 # lags 0 .. 5 perldl> p $a->acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *acf = \&PDL::acf; sub PDL::acf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; return $self->_acf($h+1); } =head2 acvf =for sig Signature: (x(t); int h(); [o]v(h+1)) =for ref Autocovariance function for up to lag h. If h is not specified it's set to t-1 by default. acvf does not process bad values. =for usage usage: perldl> $a = sequence 10 # lags 0 .. 
5 perldl> p $a->acvf(5) [82.5 57.75 34 12.25 -6.5 -21.25] # autocorrelation perldl> p $a->acvf(5) / $a->acvf(0) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *acvf = \&PDL::acvf; sub PDL::acvf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; return $self->_acvf($h+1); } EOD pp_def('diff', Pars => 'x(t); [o]dx(t)', Inplace => 1, GenericTypes => [U,L,Q,F,D], Code => ' long tr; /* do it in reverse so inplace works */ for (tr = $SIZE(t) - 1; tr >= 0; tr --) { if (tr) { $dx(t=>tr) = $x(t=>tr) - $x(t=>tr-1); } else { $dx(t=>tr) = $x(t=>tr); } } ', Doc => ' =for ref Differencing. DX(t) = X(t) - X(t-1), DX(0) = X(0). Can be done inplace. =cut ', ); pp_def('inte', Pars => 'x(n); [o]ix(n)', Inplace => 1, GenericTypes => [L,Q,F,D], Code => ' $GENERIC(x) tmp; tmp = 0; loop(n) %{ tmp += $x(); $ix() = tmp; %} ', Doc => ' =for ref Integration. Opposite of differencing. IX(t) = X(t) + X(t-1), IX(0) = X(0). Can be done inplace. =cut ', ); pp_def('dseason', Pars => 'x(t); indx d(); [o]xd(t)', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(x) xc, sum; PDL_Indx i, q, max; q = ($d() % 2)? ($d() - 1) / 2 : $d() / 2; max = $SIZE(t) - 1; if ($d() % 2) { loop(t) %{ sum = 0; for (i=-q; i<=q; i++) { sum += (t+i < 0)? $x(t=>0) : (t+i > max)? $x(t=>max) : $x(t=>t+i) ; } $xd() = sum / $d(); %} } else { loop(t) %{ sum = 0; for (i=-q; i<=q; i++) { xc = (t+i < 0)? $x(t=>0) : (t+i > max)? $x(t=>max) : $x(t=>t+i) ; sum += (i==-q || i==q)? .5 * xc : xc; } $xd() = sum / $d(); %} } ', BadCode => ' $GENERIC(x) sum; PDL_Indx i, q, min, max, ti, dd; min = -1; max = -1; q = ($d() % 2)? ($d() - 1) / 2 : $d() / 2; /*find good min and max ind*/ loop (t) %{ if ( $ISGOOD(x()) ) { if (min < 0) { min = t; } max = t; } %} if ($d() % 2) { loop(t) %{ if (t < min || t > max) { $SETBAD(xd()); } else { sum = 0; dd = 0; for (i=-q; i<=q; i++) { ti = (t+i < min)? min : (t+i > max)? 
max : t+i ; if ( $ISGOOD($x(t=>ti)) ) { sum += $x(t=>ti); dd ++; } } if (dd) { $xd() = sum / dd; } else { $SETBAD(xd()); } } %} } else { loop(t) %{ if (t < min || t > max) { $SETBAD(xd()); } else { sum = 0; dd = 0; for (i=-q; i<=q; i++) { ti = (t+i < min)? min : (t+i > max)? max : t+i ; if ( $ISGOOD($x(t=>ti)) ) { sum += (i == q || i == -q)? .5 * $x(t=>ti) : $x(t=>ti); dd ++; } } if (dd) { dd --; if ( ($ISBAD(x(t=>t-q)) && $ISGOOD(x(t=>t+q)) ) || ($ISBAD(x(t=>t+q)) && $ISGOOD(x(t=>t-q)) ) ) dd += .5; $xd() = sum / dd; } else { $SETBAD(xd()); } } %} } ', Doc => ' =for ref Deseasonalize data using moving average filter the size of period d. =cut ', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 fill_ma =for sig Signature: (x(t); int q(); [o]xf(t)) =for ref Fill missing value with moving average. xf(t) = sum(x(t-q .. t-1, t+1 .. t+q)) / 2q. fill_ma does handle bad values. Output pdl bad flag is cleared unless the specified window size q is too small and there are still bad values. =for usage my $x_filled = $x->fill_ma( $q ); =cut *fill_ma = \&PDL::fill_ma; sub PDL::fill_ma { my ($x, $q) = @_; my $x_filled = $x->_fill_ma($q); $x_filled->check_badflag; # carp "ma window too small, still has bad value" # if $x_filled->badflag; return $x_filled; } EOD pp_def('_fill_ma', Pars => 'x(t); indx q(); [o]xf(t)', GenericTypes => [F,D], HandleBad => 1, Code => ' loop(t) %{ $xf() = $x(); %} ', BadCode => ' $GENERIC(x) sum, xx; PDL_Indx i, n, max; max = $SIZE(t) - 1; loop(t) %{ if ($ISBAD(x())) { n=0; sum=0; for (i=-$q(); i<=$q(); i++) { xx = (t+i < 0)? $x(t=>0) : (t+i > max)? 
$x(t=>max) : $x(t=>t+i) ; if ($ISGOODVAR(xx,x)) { sum += xx; n ++; } } if (n) { $xf() = sum / n; } else { $SETBAD(xf()); } } else { $xf() = $x(); } %} ', Doc => undef, ); pp_def('filter_exp', Pars => 'x(t); a(); [o]xf(t)', GenericTypes => [F,D], Code => ' $GENERIC(x) b, m; b = 1 - $a(); loop(t) %{ if (t) { m = $a() * $x() + b * m; } else { m = $x(); } $xf() = m; %} ', Doc => ' =for ref Filter, exponential smoothing. xf(t) = a * x(t) + (1-a) * xf(t-1) =for usage =cut ', ); pp_def('filter_ma', Pars => 'x(t); indx q(); [o]xf(t)', GenericTypes => [F,D], Code => ' $GENERIC(x) sum; PDL_Indx i, n, max; n = 2 * $q() + 1; max = $SIZE(t) - 1; loop(t) %{ sum = 0; for (i=-$q(); i<=$q(); i++) { sum += (t+i < 0)? $x(t=>0) : (t+i > max)? $x(t=>max) : $x(t=>t+i) ; } $xf() = sum / n; %} ', Doc => ' =for ref Filter, moving average. xf(t) = sum(x(t-q .. t+q)) / (2q + 1) =cut ', ); pp_def('mae', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(c) sum; sum = 0; PDL_Indx N = $SIZE(n); loop(n) %{ sum += fabs( $a() - $b() ); %} $c() = sum / N; ', BadCode => ' $GENERIC(c) sum; sum = 0; PDL_Indx N = 0; loop(n) %{ if ($ISBAD(a()) || $ISBAD(b())) { } else { sum += fabs( $a() - $b() ); N ++; } %} if (N) { $c() = sum / N; } else { $SETBAD(c()); } ', Doc => ' =for ref Mean absolute error. MAE = 1/n * sum( abs(y - y_pred) ) =for usage Usage: $mae = $y->mae( $y_pred ); =cut ', ); pp_def('mape', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(c) sum; sum = 0; PDL_Indx N = $SIZE(n); loop(n) %{ sum += fabs( ($a() - $b()) / $a() ); %} $c() = sum / N; ', BadCode => ' $GENERIC(c) sum; sum = 0; PDL_Indx N = 0; loop(n) %{ if ($ISBAD(a()) || $ISBAD(b())) { } else { sum += fabs( ($a() - $b()) / $a() ); N ++; } %} if (N) { $c() = sum / N; } else { $SETBAD(c()); } ', Doc => ' =for ref Mean absolute percent error. 
MAPE = 1/n * sum(abs((y - y_pred) / y)) =for usage Usage: $mape = $y->mape( $y_pred ); =cut ', ); pp_def('wmape', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(c) sum_e, sum; sum_e=0; sum=0; loop(n) %{ sum_e += fabs( $a() - $b() ); sum += fabs( $a() ); %} $c() = sum_e / sum; ', BadCode => ' $GENERIC(c) sum_e, sum; sum_e=0; sum=0; loop(n) %{ if ($ISBAD(a()) || $ISBAD(b())) { } else { sum_e += fabs( $a() - $b() ); sum += fabs( $a() ); } %} if (sum) { $c() = sum_e / sum; } else { $SETBAD(c()); } ', Doc => ' =for ref Weighted mean absolute percent error. avg(abs(error)) / avg(abs(data)). Much more robust compared to mape with division by zero error (cf. Schütz, W., & Kolassa, 2006). =for usage Usage: $wmape = $y->wmape( $y_pred ); =cut ', ); pp_def('portmanteau', Pars => 'r(h); longlong t(); [o]Q()', GenericTypes => [F,D], Code => ' $GENERIC(r) sum; sum = 0; loop(h) %{ if (h) sum += pow($r(), 2) / ($t() - h); %} $Q() = $t() * ($t()+2) * sum; ', Doc => ' =for ref Portmanteau significance test (Ljung-Box) for autocorrelations. =for usage Usage: perldl> $a = sequence 10 # acf for lags 0-5 # lag 0 excluded from portmanteau perldl> p $chisq = $a->acf(5)->portmanteau( $a->nelem ) 11.1753902662994 # get p-value from chisq distr perldl> use PDL::GSL::CDF perldl> p 1 - gsl_cdf_chisq_P( $chisq, 5 ) 0.0480112934306748 =cut ', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 pred_ar =for sig Signature: (x(d); b(p|p+1); int t(); [o]pred(t)) =for ref Calculates predicted values up to period t (extend current series up to period t) for autoregressive series, with or without constant. If there is constant, it is the last element in b, as would be returned by ols or ols_t. pred_ar does not process bad values. 
=for options CONST => 1, =for usage Usage: perldl> $x = sequence 2 # last element is constant perldl> $b = pdl(.8, -.2, .3) perldl> p $x->pred_ar($b, 7) [0 1 1.1 0.74 0.492 0.3656 0.31408] # no constant perldl> p $x->pred_ar($b(0:1), 7, {const=>0}) [0 1 0.8 0.44 0.192 0.0656 0.01408] =cut sub PDL::pred_ar { my ($x, $b, $t, $opt) = @_; my %opt = ( CONST => 1 ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $b = pdl $b unless ref $b eq 'PDL'; # allows passing simple number my $ext; if ($opt{CONST}) { my $t_ = $t - ( $x->dim(0) - $b->dim(0) + 1 ); $ext = $x(-$b->dim(0)+1:-1, )->_pred_ar($b(0:-2), $t_); $ext($b->dim(0)-1:-1) += $b(-1); return $x->append( $ext( $b->dim(0)-1 : -1 ) ); } else { my $t_ = $t - ( $x->dim(0) - $b->dim(0) ); $ext = $x(-$b->dim(0):-1, )->_pred_ar($b, $t_); return $x->append($ext($b->dim(0) : -1)); } } EOD pp_def('_pred_ar', Pars => 'x(p); b(p); [o]pred(t)', OtherPars => 'IV end=>t;', GenericTypes => [F,D], Code => ' PDL_Indx ord = $SIZE(p); $GENERIC(x) xt, xp[ord]; loop (t) %{ if (t < ord) { xp[t] = $x(p=>t); $pred() = xp[t]; } else { xt = 0; loop(p) %{ xt += xp[p] * $b(p=>ord-p-1); xp[p] = (p < ord - 1)? xp[p+1] : xt; %} $pred() = xt; } %} ', Doc => undef , ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 season_m Given length of season, returns seasonal mean and var for each period (returns seasonal mean only in scalar context). =for options Default options (case insensitive): START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, # boolean # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pass pgwin object for more plotting control DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows COLOR => 1, See PDL::Graphics::PGPLOT for detailed graphing options. 
=for usage my ($m, $ms) = $data->season_m( 24, { START_POSITION=>2 } ); =cut *season_m = \&PDL::season_m; sub PDL::season_m { my ($self, $d, $opt) = @_; my %opt = ( START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, WIN => undef, # pass pgwin object for more plotting control DEV => $DEV, # see PDL::Graphics::PGPLOT for more info COLOR => 1, ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); if ($opt{PLOT}) { require PDL::Graphics::PGPLOT::Window; } my $n_season = ($self->dim(0) + $opt{START_POSITION}) / $d; $n_season = pdl($n_season)->ceil->sum->sclr; my @dims = $self->dims; $dims[0] = $n_season * $d; my $data = zeroes( @dims ) + $opt{MISSING}; $data($opt{START_POSITION} : $opt{START_POSITION} + $self->dim(0)-1, ) .= $self; $data->badflag(1); $data->inplace->setvaltobad( $opt{MISSING} ); my $s = sequence $d; $s = $s->dummy(1, $n_season)->flat; $s = $s->iv_cluster(); my ($m, $ms) = $data->centroid( $s ); if ($opt{PLOT}) { my $w = $opt{WIN}; if (!$w) { $w = PDL::Graphics::PGPLOT::Window::pgwin( Dev=>$opt{DEV} ); $w->env( 0, $d-1, $m->minmax, {XTitle=>'period', YTitle=>'mean'} ); } $w->points( sequence($d), $m, {COLOR=>$opt{COLOR}, PLOTLINE=>1} ); if ($m->squeeze->ndims < 2) { $w->errb( sequence($d), $m, sqrt( $ms / $s->sumover ), {COLOR=>$opt{COLOR}} ); } $w->close unless $opt{WIN}; } return wantarray? ($m, $ms) : $m; } =head2 plot_dseason =for ref Plots deseasonalized data and original data points. Opens and closes default window for plotting unless a pgwin object is passed in options. Returns deseasonalized data. =for options Default options (case insensitive): WIN => undef, DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows COLOR => 1, # data point color See PDL::Graphics::PGPLOT for detailed graphing options. 
=cut *plot_dseason = \&PDL::plot_dseason; sub PDL::plot_dseason { require PDL::Graphics::PGPLOT::Window; my ($self, $d, $opt) = @_; !defined($d) and croak "please set season period length"; $self = $self->squeeze; my $dsea; my %opt = ( WIN => undef, DEV => $DEV, COLOR => 1, # data point color ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $dsea = $self->dseason($d); my $w = $opt{WIN}; if (!$opt{WIN}) { $w = PDL::Graphics::PGPLOT::Window::pgwin( $opt{DEV} ); $w->env( 0, $self->dim(0)-1, $self->minmax, {XTitle=>'T', YTitle=>'DV'} ); } my $missn = ushort $self->max->sclr + 1; # ushort in case precision issue $w->line( sequence($self->dim(0)), $dsea->setbadtoval( $missn ), {COLOR=>$opt{COLOR}+1, MISSING=>$missn} ); $w->points( sequence($self->dim(0)), $self, {COLOR=>$opt{COLOR}} ); $w->close unless $opt{WIN}; return $dsea; } *filt_exp = \&PDL::filt_exp; sub PDL::filt_exp { print STDERR "filt_exp() deprecated since version 0.5.0. Please use filter_exp() instead\n"; return filter_exp( @_ ); } *filt_ma = \&PDL::filt_ma; sub PDL::filt_ma { print STDERR "filt_ma() deprecated since version 0.5.0. Please use filter_ma() instead\n"; return filter_ma( @_ ); } =head1 METHODS =head2 plot_acf =for ref Plots and returns autocorrelations for a time series. 
=for options Default options (case insensitive): SIG => 0.05, # can specify .10, .05, .01, or .001 DEV => '/xs', # open and close dev for plotting # defaults to '/png' in Windows =for usage Usage: perldl> $a = sequence 10 perldl> p $r = $a->plot_acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *plot_acf = \&PDL::plot_acf; sub PDL::plot_acf { require PDL::Graphics::PGPLOT::Window; my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($self, $h) = @_; my $r = $self->acf($h); my %opt = ( SIG => 0.05, DEV => $DEV, ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $w = PDL::Graphics::PGPLOT::Window::pgwin( Dev=>$opt{DEV} ); $w->env(-1, $h+1, -1.05, 1.05, {XTitle=>'lag', YTitle=>'acf'}); $w->line(pdl(-1,$h+1), zeroes(2)); # x axis my $y_sig = ($opt{SIG} == 0.10)? 1.64485362695147 : ($opt{SIG} == 0.05)? 1.95996398454005 : ($opt{SIG} == 0.01)? 2.5758293035489 : ($opt{SIG} == 0.001)? 3.29052673149193 : 0 ; unless ($y_sig) { carp "SIG outside of recognized value. default to 0.05"; $y_sig = 1.95996398454005; } $w->line( pdl(-1,$h+1), ones(2) * $y_sig / sqrt($self->dim(0)), { LINESTYLE=>"Dashed" } ); $w->line( pdl(-1,$h+1), ones(2) * $y_sig / sqrt($self->dim(0)) * -1, { LINESTYLE=>"Dashed" } ); for my $lag (0..$h) { $w->line( ones(2)*$lag, pdl(0, $r($lag)) ); } $w->close; return $r; } =head1 REFERENCES Brockwell, P.J., & Davis, R.A. (2002). Introcution to Time Series and Forecasting (2nd ed.). New York, NY: Springer. Schütz, W., & Kolassa, S. (2006). Foresight: advantages of the MAD/Mean ratio over the MAPE. Retrieved Jan 28, 2010, from http://www.saf-ag.com/226+M5965d28cd19.html =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. 
=cut EOD pp_done(); PDL-Stats-0.84/TS/Makefile.PL0000644000175000017500000000034214126063750015411 0ustar osboxesosboxesuse PDL::Core::Dev; use ExtUtils::MakeMaker; PDL::Core::Dev->import(); $package = ["ts.pd",TS,PDL::Stats::TS,undef,1]; %hash = pdlpp_stdargs($package); WriteMakefile( %hash ); sub MY::postamble {pdlpp_postamble($package)}; PDL-Stats-0.84/GENERATED/0000755000175000017500000000000014625061425014411 5ustar osboxesosboxesPDL-Stats-0.84/GENERATED/PDL/0000755000175000017500000000000014625061425015030 5ustar osboxesosboxesPDL-Stats-0.84/GENERATED/PDL/Stats/0000755000175000017500000000000014625061426016127 5ustar osboxesosboxesPDL-Stats-0.84/GENERATED/PDL/Stats/Basic.pm0000644000175000017500000005304314625061425017512 0ustar osboxesosboxes# # GENERATED WITH PDL::PP! Don't modify! # package PDL::Stats::Basic; our @EXPORT_OK = qw(binomial_test rtable which_id stdv stdv_unbiased var var_unbiased se ss skew skew_unbiased kurt kurt_unbiased cov cov_table corr corr_table t_corr n_pair corr_dev t_test t_test_nev t_test_paired ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::Basic ; #line 4 "stats_basic.pd" use PDL::LiteF; use PDL::NiceSlice; use Carp; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; =head1 NAME PDL::Stats::Basic -- basic statistics and related utilities such as standard deviation, Pearson correlation, and t-tests. =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. Does not have mean or median function here. see SEE ALSO. 
=head1 SYNOPSIS use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::Basic; my $stdv = $data->stdv; or my $stdv = stdv( $data ); =cut #line 58 "Basic.pm" =head1 FUNCTIONS =cut =head2 stdv =for sig Signature: (a(n); float+ [o]b()) =for ref Sample standard deviation. =cut =for bad stdv processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *stdv = \&PDL::stdv; =head2 stdv_unbiased =for sig Signature: (a(n); float+ [o]b()) =for ref Unbiased estimate of population standard deviation. =cut =for bad stdv_unbiased processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *stdv_unbiased = \&PDL::stdv_unbiased; =head2 var =for sig Signature: (a(n); float+ [o]b()) =for ref Sample variance. =cut =for bad var processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *var = \&PDL::var; =head2 var_unbiased =for sig Signature: (a(n); float+ [o]b()) =for ref Unbiased estimate of population variance. =cut =for bad var_unbiased processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *var_unbiased = \&PDL::var_unbiased; =head2 se =for sig Signature: (a(n); float+ [o]b()) =for ref Standard error of the mean. Useful for calculating confidence intervals. =for usage # 95% confidence interval for samples with large N $ci_95_upper = $data->average + 1.96 * $data->se; $ci_95_lower = $data->average - 1.96 * $data->se; =for bad se processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *se = \&PDL::se; =head2 ss =for sig Signature: (a(n); float+ [o]b()) =for ref Sum of squared deviations from the mean. =cut =for bad ss processes bad values. 
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *ss = \&PDL::ss; =head2 skew =for sig Signature: (a(n); float+ [o]b()) =for ref Sample skewness, measure of asymmetry in data. skewness == 0 for normal distribution. =cut =for bad skew processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *skew = \&PDL::skew; =head2 skew_unbiased =for sig Signature: (a(n); float+ [o]b()) =for ref Unbiased estimate of population skewness. This is the number in GNumeric Descriptive Statistics. =cut =for bad skew_unbiased processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *skew_unbiased = \&PDL::skew_unbiased; =head2 kurt =for sig Signature: (a(n); float+ [o]b()) =for ref Sample kurtosis, measure of "peakedness" of data. kurtosis == 0 for normal distribution. =cut =for bad kurt processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *kurt = \&PDL::kurt; =head2 kurt_unbiased =for sig Signature: (a(n); float+ [o]b()) =for ref Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics. =cut =for bad kurt_unbiased processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *kurt_unbiased = \&PDL::kurt_unbiased; =head2 cov =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Sample covariance. see B for ways to call =cut =for bad cov processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *cov = \&PDL::cov; =head2 cov_table =for sig Signature: (a(n,m); float+ [o]c(m,m)) =for ref Square covariance table. Gives the same result as threading using B but it calculates only half the square, hence much faster. 
And it is easier to use with higher dimension pdls. =for usage Usage: # 5 obs x 3 var, 2 such data tables perldl> $a = random 5, 3, 2 perldl> p $cov = $a->cov_table [ [ [ 8.9636438 -1.8624472 -1.2416588] [-1.8624472 14.341514 -1.4245366] [-1.2416588 -1.4245366 9.8690655] ] [ [ 10.32644 -0.31311789 -0.95643674] [-0.31311789 15.051779 -7.2759577] [-0.95643674 -7.2759577 5.4465141] ] ] # diagonal elements of the cov table are the variances perldl> p $a->var [ [ 8.9636438 14.341514 9.8690655] [ 10.32644 15.051779 5.4465141] ] for the same cov matrix table using B, perldl> p $a->dummy(2)->cov($a->dummy(1)) =for bad cov_table processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *cov_table = \&PDL::cov_table; =head2 corr =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)). =for usage Usage: perldl> $a = random 5, 3 perldl> $b = sequence 5,3 perldl> p $a->corr($b) [0.20934208 0.30949881 0.26713007] for square corr table perldl> p $a->corr($a->dummy(1)) [ [ 1 -0.41995259 -0.029301192] [ -0.41995259 1 -0.61927619] [-0.029301192 -0.61927619 1] ] but it is easier and faster to use B. =cut =for bad corr processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *corr = \&PDL::corr; =head2 corr_table =for sig Signature: (a(n,m); float+ [o]c(m,m)) =for ref Square Pearson correlation table. Gives the same result as threading using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. 
=for usage Usage: # 5 obs x 3 var, 2 such data tables perldl> $a = random 5, 3, 2 perldl> p $a->corr_table [ [ [ 1 -0.69835951 -0.18549048] [-0.69835951 1 0.72481605] [-0.18549048 0.72481605 1] ] [ [ 1 0.82722569 -0.71779883] [ 0.82722569 1 -0.63938828] [-0.71779883 -0.63938828 1] ] ] for the same result using B, perldl> p $a->dummy(2)->corr($a->dummy(1)) This is also how to use B and B with such a table. =for bad corr_table processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *corr_table = \&PDL::corr_table; =head2 t_corr =for sig Signature: (r(); n(); [o]t()) =for usage $corr = $data->corr( $data->dummy(1) ); $n = $data->n_pair( $data->dummy(1) ); $t_corr = $corr->t_corr( $n ); use PDL::GSL::CDF; $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t_corr->abs, $n-2 )); =for ref t significance test for Pearson correlations. =cut =for bad t_corr processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *t_corr = \&PDL::t_corr; =head2 n_pair =for sig Signature: (a(n); b(n); indx [o]c()) =for ref Returns the number of good pairs between 2 lists. Useful with B (esp. when bad values are involved) =cut =for bad n_pair processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *n_pair = \&PDL::n_pair; =head2 corr_dev =for sig Signature: (a(n); b(n); float+ [o]c()) =for usage $corr = $a->dev_m->corr_dev($b->dev_m); =for ref Calculates correlations from B vals. Seems faster than doing B from original vals when data pdl is big =cut =for bad corr_dev processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *corr_dev = \&PDL::corr_dev; =head2 t_test =for sig Signature: (a(n); b(m); float+ [o]t(); [o]d()) =for usage my ($t, $df) = t_test( $pdl1, $pdl2 ); use PDL::GSL::CDF; my $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t->abs, $df )); =for ref Independent sample t-test, assuming equal var. =cut =for bad t_test processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *t_test = \&PDL::t_test; =head2 t_test_nev =for sig Signature: (a(n); b(m); float+ [o]t(); [o]d()) =for ref Independent sample t-test, NOT assuming equal var. ie Welch two sample t test. Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R. =for usage my ($t, $df) = $pdl1->t_test( $pdl2 ); =cut =for bad t_test_nev processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *t_test_nev = \&PDL::t_test_nev; =head2 t_test_paired =for sig Signature: (a(n); b(n); float+ [o]t(); [o]d()) =for ref Paired sample t-test. =cut =for bad t_test_paired processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *t_test_paired = \&PDL::t_test_paired; #line 1251 "stats_basic.pd" #line 1252 "stats_basic.pd" =head2 binomial_test =for Sig Signature: (x(); n(); p_expected(); [o]p()) =for ref Binomial test. One-tailed significance test for two-outcome distribution. Given the number of successes, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes. This function does NOT currently support bad value in the number of successes. =for usage Usage: # assume a fair coin, ie. 0.5 probablity of getting heads # test whether getting 8 heads out of 10 coin flips is unusual my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual. 
=cut

*binomial_test = \&PDL::binomial_test;

# One-tailed binomial test: P(X >= x) for x successes in n trials with
# expected success probability $P. Requires PDL::GSL::CDF; croaks up front
# with a clear message instead of dying later with an opaque
# "Undefined subroutine" error when the module is missing.
sub PDL::binomial_test {
  my ($x, $n, $P) = @_;
  croak 'Please install PDL::GSL::CDF.' unless $CDF;
  carp 'This function does NOT currently support bad value in the number of successes.'
    if $x->badflag();

  # x == 0 needs no CDF call: P(X >= 0) is always 1. Mark zeros bad so the
  # CDF is evaluated only for x > 0, then restore those entries to 1 below.
  my $pdlx = pdl($x);
  $pdlx->badflag(1);
  $pdlx = $pdlx->setvaltobad(0);

  # Upper tail via complement: P(X >= x) = 1 - P(X <= x-1).
  my $p = 1 - PDL::GSL::CDF::gsl_cdf_binomial_P( $pdlx - 1, $P, $n );
  $p = $p->setbadtoval(1);
  $p->badflag(0);
  return $p;
}

=head1 METHODS

=head2 rtable

=for ref

Reads either file or file handle*. Returns observation x variable pdl and
var and obs ids if specified. Ids in perl @ ref to allow for non-numeric
ids. Other non-numeric entries are treated as missing, which are filled
with $opt{MISSN} then set to BAD*. Can specify num of data rows to read
from top but not arbitrary range.

*If passed handle, it will not be closed here.

=for options

Default options (case insensitive):

    V       => 1,       # verbose. prints simple status
    TYPE    => double,
    C_ID    => 1,       # boolean. file has col id.
    R_ID    => 1,       # boolean. file has row id.
    R_VAR   => 0,       # boolean. set to 1 if var in rows
    SEP     => "\t",    # can take regex qr//
    MISSN   => -999,    # this value treated as missing and set to BAD
    NROW    => '',      # set to read specified num of data rows

=for usage

Usage:

    Sample file diet.txt:

    uid	height	weight	diet
    akw	72	320	1
    bcm	68	268	1
    clq	67	180	2
    dwm	70	200	2

    ($data, $idv, $ido) = rtable 'diet.txt';

    # By default prints out data info and @$idv index and element

    reading diet.txt for data and id... OK.
    data table as PDL dim o x v: PDL: Double D [4,3]
    0	height
    1	weight
    2	diet

Another way of using it,

    $data = rtable( \*STDIN, {TYPE=>long} );

=cut

# Returns obs x var data matrix, plus var and obs id array refs when the
# file declares them. Accepts a filename or an already-open handle; a
# handle passed in from outside is NOT closed here.
sub rtable {
  my ($src, $opt) = @_;

  my $fh_in;
  if ($src =~ /STDIN/ or ref $src eq 'GLOB') {
    $fh_in = $src;
  }
  else {
    # 3-arg open: the filename can never be misread as an open mode.
    open $fh_in, '<', $src or croak "$!";
  }

  my %opt = (
    V     => 1,
    TYPE  => double,
    C_ID  => 1,
    R_ID  => 1,
    R_VAR => 0,
    SEP   => "\t",
    MISSN => -999,
    NROW  => '',
  );
  # user options are case-insensitive
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
  $opt{V} and print STDERR "reading $src for data and id... ";

  # pdl() fills undef list entries with $PDL::undefval; route them to MISSN
  # so they become BAD below.
  local $PDL::undefval = $opt{MISSN};

  my $id_c = [];      # match declaration of $id_r for return purpose
  if ($opt{C_ID}) {
    chomp( $id_c = <$fh_in> );
    my @entries = split $opt{SEP}, $id_c;
    $opt{R_ID} and shift @entries;
    $id_c = \@entries;
  }

  my ($c_row, $id_r, $data, @data) = (0, [], PDL->null, );
  while (<$fh_in>) {
    chomp;
    # -1 keeps trailing empty fields so short rows stay aligned
    my @entries = split /$opt{SEP}/, $_, -1;
    $opt{R_ID} and push @$id_r, shift @entries;
    # rudimentary check for numeric entry
    for (@entries) { $_ = $opt{MISSN} unless defined $_ and m/\d\b/ }
    push @data, pdl( $opt{TYPE}, \@entries );
    $c_row ++;
    last if $opt{NROW} and $c_row == $opt{NROW};
  }
  # not explicitly closing $fh_in here in case it's passed from outside
  # $fh_in will close by going out of scope if opened here.
  $data = pdl $opt{TYPE}, @data;
  @data = ();

  # rid of last col unless there is data there
  $data = $data(0:$data->getdim(0)-2, )->sever
    unless ( nelem $data(-1, )->where($data(-1, ) != $opt{MISSN}) );

  my ($idv, $ido) = ($id_r, $id_c);
  # var in columns instead of rows
  $opt{R_VAR} == 0
    and ($data, $idv, $ido) = ($data->inplace->transpose, $id_c, $id_r);

  if ($opt{V}) {
    print STDERR "OK.\ndata table as PDL dim o x v: " . $data->info . "\n";
    $idv and print STDERR "$_\t$$idv[$_]\n" for (0..$#$idv);
  }

  $data = $data->setvaltobad( $opt{MISSN} );
  $data->check_badflag;

  return wantarray? (@$idv? ($data, $idv, $ido) : ($data, $ido)) : $data;
}

=head2 group_by

Returns pdl reshaped according to the specified factor variable. Most
useful when used in conjunction with other threading calculations such as
average, stdv, etc. When the factor variable contains unequal number of
cases in each level, the returned pdl is padded with bad values to fit the
level with the most number of cases.
This allows the subsequent calculation (average, stdv, etc) to return the
correct results for each level.

Usage:

    # simple case with 1d pdl and equal number of n in each level of the factor

    pdl> p $a = sequence 10
    [0 1 2 3 4 5 6 7 8 9]

    pdl> p $factor = $a > 4
    [0 0 0 0 0 1 1 1 1 1]

    pdl> p $a->group_by( $factor )->average
    [2 7]

    # more complex case with threading and unequal number of n across levels in the factor

    pdl> p $a = sequence 10,2
    [
     [ 0  1  2  3  4  5  6  7  8  9]
     [10 11 12 13 14 15 16 17 18 19]
    ]

    pdl> p $factor = qsort $a( ,0) % 3
    [
     [0 0 0 0 1 1 1 2 2 2]
    ]

    pdl> p $a->group_by( $factor )
    [
     [
      [ 0  1  2  3]
      [10 11 12 13]
     ]
     [
      [  4   5   6 BAD]
      [ 14  15  16 BAD]
     ]
     [
      [  7   8   9 BAD]
      [ 17  18  19 BAD]
     ]
    ]
    ARRAY(0xa2a4e40)

    # group_by supports perl factors, multiple factors
    # returns factor labels in addition to pdl in array context

    pdl> p $a = sequence 12
    [0 1 2 3 4 5 6 7 8 9 10 11]

    pdl> $odd_even = [qw( e o e o e o e o e o e o )]

    pdl> $magnitude = [qw( l l l l l l h h h h h h )]

    pdl> ($a_grouped, $label) = $a->group_by( $odd_even, $magnitude )

    pdl> p $a_grouped
    [
     [
      [0 2 4]
      [1 3 5]
     ]
     [
      [ 6  8 10]
      [ 7  9 11]
     ]
    ]

    pdl> p Dumper $label
    $VAR1 = [
              [
                'e_l',
                'o_l'
              ],
              [
                'e_h',
                'o_h'
              ]
            ];

=cut

*group_by = \&PDL::group_by;

# Reshape $p by one or more factor variables (perl array refs or pdls).
# In list context also returns the factor level labels matching the
# returned pdl's structure.
sub PDL::group_by {
    my $p = shift;
    my @factors = @_;

    if ( @factors == 1 ) {
        my $factor = $factors[0];
        my $label;
        if (ref $factor eq 'ARRAY') {
            $label  = _ordered_uniq($factor);
            $factor = _array_to_pdl($factor);
        } else {
            # NOTE(review): labels here follow order of first appearance,
            # while _group_by_single_factor groups by $factor->uniq (sorted);
            # these agree only when levels first appear in ascending order —
            # confirm against the .pd source before relying on label order.
            my $perl_factor = [$factor->list];
            $label = _ordered_uniq($perl_factor);
        }
        my $p_reshaped = _group_by_single_factor( $p, $factor );
        return wantarray? ($p_reshaped, $label) : $p_reshaped;
    }

    # make sure all are arrays instead of pdls
    @factors = map { ref($_) eq 'PDL'? [$_->list] : $_ } @factors;

    # flatten the factor combination of each case into a single cell label
    my @cells;
    for my $ele (0 .. $#{$factors[0]}) {
        my $c = join '_', map { $_->[$ele] } @factors;
        push @cells, $c;
    }
    # get uniq cell labels (ref List::MoreUtils::uniq)
    my %seen;
    my @uniq_cells = grep { ! $seen{$_}++ } @cells;

    my $flat_factor = _array_to_pdl( \@cells );

    my $p_reshaped = _group_by_single_factor( $p, $flat_factor );

    # get levels of each factor and reshape accordingly
    my @levels;
    for (@factors) {
        my %uniq;
        @uniq{ @$_ } = ();
        push @levels, scalar keys %uniq;
    }
    $p_reshaped = $p_reshaped->reshape( $p_reshaped->dim(0), @levels )->sever;

    # make labels for the returned data structure matching pdl structure
    my @labels;
    if (wantarray) {
        for my $ifactor (0 .. $#levels) {
            my @factor_label;
            for my $ilevel (0 .. $levels[$ifactor]-1) {
                my $i = $ifactor * $levels[$ifactor] + $ilevel;
                push @factor_label, $uniq_cells[$i];
            }
            push @labels, \@factor_label;
        }
    }

    return wantarray? ($p_reshaped, \@labels) : $p_reshaped;
}

# get uniq cell labels, preserving first-appearance order
# (ref List::MoreUtils::uniq)
sub _ordered_uniq {
    my $arr = shift;
    my %seen;
    my @uniq = grep { ! $seen{$_}++ } @$arr;
    return \@uniq;
}

# Split dim 0 of $p into groups given by a 1d $factor pdl, padding shorter
# groups with BAD so every group has the length of the largest one.
sub _group_by_single_factor {
    my $p = shift;
    my $factor = shift;
    $factor = $factor->squeeze;
    die "Currently support only 1d factor pdl." if $factor->ndims > 1;
    die "Data pdl and factor pdl do not match!"
        unless $factor->dim(0) == $p->dim(0);

    # get active dim that will be split according to factor and dims to thread over
    my @p_threaddims = $p->dims;
    my $p_dim0 = shift @p_threaddims;

    my $uniq = $factor->uniq;
    my @uniq_ns;
    for ($uniq->list) {
        push @uniq_ns, which( $factor == $_ )->nelem;
    }
    # get number of n's in each group, find the biggest, fit output pdl to this
    my $uniq_ns = pdl \@uniq_ns;
    # reuse the pdl built above instead of constructing it a second time
    my $max = $uniq_ns->max->sclr;

    # pad value guaranteed to be outside the data range; turned BAD below
    my $badvalue = int($p->max + 1);
    my $p_tmp = ones($max, @p_threaddims, $uniq->nelem) * $badvalue;
    for (0 .. $#uniq_ns) {
        my $i = which $factor == $uniq($_);
        $p_tmp->dice_axis(-1,$_)->squeeze->(0:$uniq_ns[$_]-1, ) .= $p($i, );
    }
    $p_tmp->badflag(1);
    return $p_tmp->setvaltobad($badvalue);
}

=head2 which_id

=for ref

Lookup specified var (obs) ids in $idv ($ido) (see B<rtable>) and return
indices in $idv ($ido) as pdl if found. The indices are ordered by the
specified subset.
Useful for selecting data by var (obs) id. =for usage my $ind = which_id $ido, ['smith', 'summers', 'tesla']; my $data_subset = $data( $ind, ); # take advantage of perl pattern matching # e.g. use data from people whose last name starts with s my $i = which_id $ido, [ grep { /^s/ } @$ido ]; my $data_s = $data($i, ); =cut sub which_id { my ($id, $id_s) = @_; my %ind; @ind{ @$id } = ( 0 .. $#$id ); my @ind_select; for (@$id_s) { defined( $ind{$_} ) and push @ind_select, $ind{$_}; } return pdl @ind_select; } sub _array_to_pdl { my ($var_ref) = @_; $var_ref = [ $var_ref->list ] if UNIVERSAL::isa($var_ref, 'PDL'); my (%level, $l); $l = 0; for (@$var_ref) { if (defined($_) and $_ ne '' and $_ ne 'BAD') { $level{$_} = $l ++ if !exists $level{$_}; } } my $pdl = pdl( map { (defined($_) and $_ ne '' and $_ ne 'BAD')? $level{$_} : -1 } @$var_ref ); $pdl = $pdl->setvaltobad(-1); $pdl->check_badflag; return wantarray? ($pdl, \%level) : $pdl; } =head1 SEE ALSO PDL::Basic (hist for frequency counts) PDL::Ufunc (sum, avg, median, min, max, etc.) PDL::GSL::CDF (various cumulative distribution functions) =head1 REFERENCES Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, TX: Harcourt Brace College Publishers. =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut #line 1214 "Basic.pm" # Exit with OK status 1; PDL-Stats-0.84/GENERATED/PDL/Stats/Distr.pm0000644000175000017500000003713714625061426017565 0ustar osboxesosboxes# # GENERATED WITH PDL::PP! Don't modify! 
# package PDL::Stats::Distr; our @EXPORT_OK = qw(mme_beta pdf_beta mme_binomial pmf_binomial mle_exp pdf_exp mme_gamma pdf_gamma mle_gaussian pdf_gaussian mle_geo pmf_geo mle_geosh pmf_geosh mle_lognormal mme_lognormal pdf_lognormal mme_nbd pmf_nbd mme_pareto pdf_pareto mle_poisson pmf_poisson pmf_poisson_stirling _pmf_poisson_factorial ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::Distr ; #line 3 "distr.pd" use strict; use warnings; use Carp; use PDL::LiteF; my $DEV = ($^O =~ /win/i)? '/png' : '/xs'; =head1 NAME PDL::Stats::Distr -- parameter estimations and probability density functions for distributions. =head1 DESCRIPTION Parameter estimate is maximum likelihood estimate when there is closed form estimate, otherwise it is method of moments estimate. =head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::Distr; # do a frequency (probability) plot with fitted normal curve my $data = grandom(100)->abs; my ($xvals, $hist) = $data->hist; # turn frequency into probability $hist /= $data->nelem; # get maximum likelihood estimates of normal curve parameters my ($m, $v) = $data->mle_gaussian(); # fitted normal curve probabilities my $p = $xvals->pdf_gaussian($m, $v); use PDL::Graphics::PGPLOT::Window; my $win = pgwin( Dev=>"/xs" ); $win->bin( $hist ); $win->hold; $win->line( $p, {COLOR=>2} ); $win->close; Or, play with different distributions with B :) $data->plot_distr( 'gaussian', 'lognormal' ); =cut #line 76 "Distr.pm" =head1 FUNCTIONS =cut =head2 mme_beta =for sig Signature: (a(n); float+ [o]alpha(); float+ [o]beta()) =for usage my ($a, $b) = $data->mme_beta(); =for ref beta distribution. pdf: f(x; a,b) = 1/B(a,b) x^(a-1) (1-x)^(b-1) =for bad mme_beta processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *mme_beta = \&PDL::mme_beta; =head2 pdf_beta =for sig Signature: (x(); a(); b(); float+ [o]p()) =for ref probability density function for beta distribution. x defined on [0,1]. =for bad pdf_beta processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pdf_beta = \&PDL::pdf_beta; =head2 mme_binomial =for sig Signature: (a(n); int [o]n_(); float+ [o]p()) =for usage my ($n, $p) = $data->mme_binomial; =for ref binomial distribution. pmf: f(k; n,p) = (n k) p^k (1-p)^(n-k) for k = 0,1,2..n =for bad mme_binomial processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mme_binomial = \&PDL::mme_binomial; =head2 pmf_binomial =for sig Signature: (ushort x(); ushort n(); p(); float+ [o]out()) =for ref probability mass function for binomial distribution. =for bad pmf_binomial processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pmf_binomial = \&PDL::pmf_binomial; =head2 mle_exp =for sig Signature: (a(n); float+ [o]l()) =for usage my $lamda = $data->mle_exp; =for ref exponential distribution. mle same as method of moments estimate. =for bad mle_exp processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mle_exp = \&PDL::mle_exp; =head2 pdf_exp =for sig Signature: (x(); l(); float+ [o]p()) =for ref probability density function for exponential distribution. =for bad pdf_exp processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pdf_exp = \&PDL::pdf_exp; =head2 mme_gamma =for sig Signature: (a(n); float+ [o]shape(); float+ [o]scale()) =for usage my ($shape, $scale) = $data->mme_gamma(); =for ref two-parameter gamma distribution =for bad mme_gamma processes bad values. 
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mme_gamma = \&PDL::mme_gamma; =head2 pdf_gamma =for sig Signature: (x(); a(); t(); float+ [o]p()) =for ref probability density function for two-parameter gamma distribution. =for bad pdf_gamma processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pdf_gamma = \&PDL::pdf_gamma; =head2 mle_gaussian =for sig Signature: (a(n); float+ [o]m(); float+ [o]v()) =for usage my ($m, $v) = $data->mle_gaussian(); =for ref gaussian aka normal distribution. same results as $data->average and $data->var. mle same as method of moments estimate. =for bad mle_gaussian processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mle_gaussian = \&PDL::mle_gaussian; =head2 pdf_gaussian =for sig Signature: (x(); m(); v(); float+ [o]p()) =for ref probability density function for gaussian distribution. =for bad pdf_gaussian processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pdf_gaussian = \&PDL::pdf_gaussian; =head2 mle_geo =for sig Signature: (a(n); float+ [o]p()) =for ref geometric distribution. mle same as method of moments estimate. =for bad mle_geo processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mle_geo = \&PDL::mle_geo; =head2 pmf_geo =for sig Signature: (ushort x(); p(); float+ [o]out()) =for ref probability mass function for geometric distribution. x >= 0. =for bad pmf_geo processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pmf_geo = \&PDL::pmf_geo; =head2 mle_geosh =for sig Signature: (a(n); float+ [o]p()) =for ref shifted geometric distribution. mle same as method of moments estimate. 
=for bad mle_geosh processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mle_geosh = \&PDL::mle_geosh; =head2 pmf_geosh =for sig Signature: (ushort x(); p(); float+ [o]out()) =for ref probability mass function for shifted geometric distribution. x >= 1. =for bad pmf_geosh processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pmf_geosh = \&PDL::pmf_geosh; =head2 mle_lognormal =for sig Signature: (a(n); float+ [o]m(); float+ [o]v()) =for usage my ($m, $v) = $data->mle_lognormal(); =for ref lognormal distribution. maximum likelihood estimation. =for bad mle_lognormal processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mle_lognormal = \&PDL::mle_lognormal; =head2 mme_lognormal =for sig Signature: (a(n); float+ [o]m(); float+ [o]v()) =for usage my ($m, $v) = $data->mme_lognormal(); =for ref lognormal distribution. method of moments estimation. =for bad mme_lognormal processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mme_lognormal = \&PDL::mme_lognormal; =head2 pdf_lognormal =for sig Signature: (x(); m(); v(); float+ [o]p()) =for ref probability density function for lognormal distribution. x > 0. v > 0. =for bad pdf_lognormal processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pdf_lognormal = \&PDL::pdf_lognormal; =head2 mme_nbd =for sig Signature: (a(n); float+ [o]r(); float+ [o]p()) =for usage my ($r, $p) = $data->mme_nbd(); =for ref negative binomial distribution. pmf: f(x; r,p) = (x+r-1 r-1) p^r (1-p)^x for x=0,1,2... =for bad mme_nbd processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *mme_nbd = \&PDL::mme_nbd; =head2 pmf_nbd =for sig Signature: (ushort x(); r(); p(); float+ [o]out()) =for ref probability mass function for negative binomial distribution. =for bad pmf_nbd processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pmf_nbd = \&PDL::pmf_nbd; =head2 mme_pareto =for sig Signature: (a(n); float+ [o]k(); float+ [o]xm()) =for usage my ($k, $xm) = $data->mme_pareto(); =for ref pareto distribution. pdf: f(x; k,xm) = k xm^k / x^(k+1) for x >= xm > 0. =for bad mme_pareto processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mme_pareto = \&PDL::mme_pareto; =head2 pdf_pareto =for sig Signature: (x(); k(); xm(); float+ [o]p()) =for ref probability density function for pareto distribution. x >= xm > 0. =for bad pdf_pareto processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pdf_pareto = \&PDL::pdf_pareto; =head2 mle_poisson =for sig Signature: (a(n); float+ [o]l()) =for usage my $lamda = $data->mle_poisson(); =for ref poisson distribution. pmf: f(x;l) = e^(-l) * l^x / x! =for bad mle_poisson processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mle_poisson = \&PDL::mle_poisson; =head2 pmf_poisson =for sig Signature: (x(); l(); float+ [o]p()) =for ref Probability mass function for poisson distribution. Uses Stirling's formula for x > 85. =for bad pmf_poisson processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pmf_poisson = \&PDL::pmf_poisson; =head2 pmf_poisson_stirling =for sig Signature: (x(); l(); [o]p()) =for ref Probability mass function for poisson distribution. Uses Stirling's formula for all values of the input. 
See http://en.wikipedia.org/wiki/Stirling's_approximation for more info. =for bad pmf_poisson_stirling processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pmf_poisson_stirling = \&PDL::pmf_poisson_stirling; #line 1139 "distr.pd" #line 1140 "distr.pd" =head2 pmf_poisson_factorial =for sig Signature: ushort x(); l(); float+ [o]p() =for ref Probability mass function for poisson distribution. Input is limited to x < 170 to avoid gsl_sf_fact() overflow. =cut *pmf_poisson_factorial = \&PDL::pmf_poisson_factorial; sub PDL::pmf_poisson_factorial { my ($x, $l) = @_; my $pdlx = pdl($x); if (any( $pdlx >= 170 )) { croak "Does not support input greater than 170. Please use pmf_poisson or pmf_poisson_stirling instead."; } else { return _pmf_poisson_factorial(@_); } } #line 850 "Distr.pm" *_pmf_poisson_factorial = \&PDL::_pmf_poisson_factorial; #line 1201 "distr.pd" #line 1202 "distr.pd" =head2 plot_distr =for ref Plots data distribution. When given specific distribution(s) to fit, returns % ref to sum log likelihood and parameter values under fitted distribution(s). See FUNCTIONS above for available distributions. =for options Default options (case insensitive): MAXBN => 20, # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. 
not closed here if passed # allows comparing multiple distr in same plot # set env before passing WIN DEV => '/xs' , # open and close dev for plotting if no WIN # defaults to '/png' in Windows COLOR => 1, # color for data distr =for usage Usage: # yes it threads :) my $data = grandom( 500, 3 )->abs; # ll on plot is sum across 3 data curves my ($ll, $pars) = $data->plot_distr( 'gaussian', 'lognormal', {DEV=>'/png'} ); # pars are from normalized data (ie data / bin_size) print "$_\t@{$pars->{$_}}\n" for (sort keys %$pars); print "$_\t$ll->{$_}\n" for (sort keys %$ll); =cut *plot_distr = \&PDL::plot_distr; sub PDL::plot_distr { require PDL::Graphics::PGPLOT::Window; my ($self, @distr) = @_; my %opt = ( MAXBN => 20, WIN => undef, # pgwin object. not closed here if passed DEV => $DEV, # open and close default win if no WIN COLOR => 1, # color for data distr ); my $opt = pop @distr if ref $distr[-1] eq 'HASH'; $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $self = $self->squeeze; # use int range, step etc for int xvals--pmf compatible my $INT = 1 if grep { /(?:binomial)|(?:geo)|(?:nbd)|(?:poisson)/ } @distr; my ($range, $step, $step_int); $range = $self->max->sclr - $self->min->sclr; $step = $range / $opt{MAXBN}; $step_int = ($range <= $opt{MAXBN})? 
1 : PDL::ceil( $range / $opt{MAXBN} ) ; $opt{MAXBN} = PDL::ceil( $range / $step )->min->sclr; my $hist = $self->double->histogram($step, $self->min->sclr, $opt{MAXBN}); # turn fre into prob $hist /= $self->dim(0); my $xvals = $self->min->sclr + sequence( $opt{MAXBN} ) * $step; my $xvals_int = PDL::ceil($self->min->sclr) + sequence( $opt{MAXBN} ) * $step_int; $xvals_int = $xvals_int->where( $xvals_int <= $xvals->max )->sever; my $win = $opt{WIN}; if (!$win) { $win = PDL::Graphics::PGPLOT::Window::pgwin( Dev=>$opt{DEV} ); $win->env($xvals->minmax,0,1, {XTitle=>'xvals', YTitle=>'probability'}); } $win->line( $xvals, $hist, { COLOR=>$opt{COLOR} } ); if (!@distr) { $win->close unless defined $opt{WIN}; return; } my (%ll, %pars, @text, $c); $c = $opt{COLOR}; # fitted lines start from ++$c for my $distr ( @distr ) { # find mle_ or mme_$distr; my @funcs = grep { /_$distr$/ } (keys %PDL::Stats::Distr::); if (!@funcs) { carp "Do not recognize $distr distribution!"; next; } # might have mle and mme for a distr. sort so mle comes first @funcs = sort @funcs; my ($f_para, $f_prob) = @funcs[0, -1]; my $nrmd = $self / $step; eval { my @paras = $nrmd->$f_para(); $pars{$distr} = \@paras; @paras = map { $_->dummy(0) } @paras; $ll{$distr} = $nrmd->$f_prob( @paras )->log->sumover; push @text, sprintf "$distr LL = %.2f", $ll{$distr}->sum; if ($f_prob =~ /^pdf/) { $win->line( $xvals, ($xvals/$step)->$f_prob(@paras), {COLOR=>++$c} ); } else { $win->points( $xvals_int, ($xvals_int/$step_int)->$f_prob(@paras), {COLOR=>++$c} ); } }; carp $@ if $@; } $win->legend(\@text, ($xvals->min->sclr + $xvals->max->sclr)/2, .95, {COLOR=>[$opt{COLOR}+1 .. $c], TextFraction=>.75} ); $win->close unless defined $opt{WIN}; return (\%ll, \%pars); } =head1 DEPENDENCIES GSL - GNU Scientific Library =head1 SEE ALSO PDL::Graphics::PGPLOT PDL::GSL::CDF =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong , David Mertens All rights reserved. There is no warranty. 
You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut #line 1006 "Distr.pm" # Exit with OK status 1; PDL-Stats-0.84/GENERATED/PDL/Stats/GLM.pm0000644000175000017500000020463614625061426017117 0ustar osboxesosboxes# # GENERATED WITH PDL::PP! Don't modify! # package PDL::Stats::GLM; our @EXPORT_OK = qw(ols_t anova anova_rptd dummy_code effect_code effect_code_w interaction_code ols ols_rptd r2_change logistic pca pca_sorti plot_means plot_residuals plot_screes fill_m fill_rand dev_m stddz sse mse rmse pred_logistic d0 dm dvrs ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::GLM ; #line 3 "glm.pd" use strict; use warnings; use Carp; use PDL::LiteF; use PDL::MatrixOps; use PDL::NiceSlice; use PDL::Stats::Basic; use PDL::Stats::Kmeans; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; eval { require PDL::Slatec; }; my $SLATEC = 1 if !$@; my $MATINV = $SLATEC ? \&PDL::Slatec::matinv : \&inv; my $DEV = ($^O =~ /win/i)? '/png' : '/xs'; =head1 NAME PDL::Stats::GLM -- general and generalized linear modeling methods such as ANOVA, linear regression, PCA, and logistic regression. =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. FUNCTIONS except B support bad value. B strongly recommended for most METHODS, and it is required for B. P-values, where appropriate, are provided if PDL::GSL::CDF is installed. 
=head1 SYNOPSIS use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::GLM; # do a multiple linear regression and plot the residuals my $y = pdl( 8, 7, 7, 0, 2, 5, 0 ); my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component [ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component my %m = $y->ols( $x, {plot=>1} ); print "$_\t$m{$_}\n" for (sort keys %m); =cut #line 75 "GLM.pm" =head1 FUNCTIONS =cut =head2 fill_m =for sig Signature: (a(n); float+ [o]b(n)) =for ref Replaces bad values with sample mean. Mean is set to 0 if all obs are bad. Can be done inplace. =for usage perldl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] perldl> p $data->fill_m [ [ 5 3.5 2 3.5] [ 7 3 7 5.66667] ] =for bad The output pdl badflag is cleared. =cut *fill_m = \&PDL::fill_m; =head2 fill_rand =for sig Signature: (a(n); [o]b(n)) =for ref Replaces bad values with random sample (with replacement) of good observations from the same variable. Can be done inplace. =for usage perldl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] perldl> p $data->fill_rand [ [5 2 2 5] [7 3 7 7] ] =for bad The output pdl badflag is cleared. =cut *fill_rand = \&PDL::fill_rand; =head2 dev_m =for sig Signature: (a(n); float+ [o]b(n)) =for ref Replaces values with deviations from the mean. Can be done inplace. =for bad dev_m processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *dev_m = \&PDL::dev_m; =head2 stddz =for sig Signature: (a(n); float+ [o]b(n)) =for ref Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0). Can be done inplace. =for bad stddz processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *stddz = \&PDL::stddz; =head2 sse =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Sum of squared errors between actual and predicted values. =for bad sse processes bad values. 
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *sse = \&PDL::sse; =head2 mse =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Mean of squared errors between actual and predicted values, ie variance around predicted value. =for bad mse processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mse = \&PDL::mse; =head2 rmse =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Root mean squared error, ie stdv around predicted value. =for bad rmse processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *rmse = \&PDL::rmse; =head2 pred_logistic =for sig Signature: (a(n,m); b(m); float+ [o]c(n)) =for ref Calculates predicted prob value for logistic regression. =for usage # glue constant then apply coeff returned by the logistic method $pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} ); =for bad pred_logistic processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pred_logistic = \&PDL::pred_logistic; =head2 d0 =for sig Signature: (a(n); float+ [o]c()) =for usage my $d0 = $y->d0(); =for ref Null deviance for logistic regression. =for bad d0 processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *d0 = \&PDL::d0; =head2 dm =for sig Signature: (a(n); b(n); float+ [o]c()) =for usage my $dm = $y->dm( $y_pred ); # null deviance my $d0 = $y->dm( ones($y->nelem) * $y->avg ); =for ref Model deviance for logistic regression. =for bad dm processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *dm = \&PDL::dm; =head2 dvrs =for sig Signature: (a(); b(); float+ [o]c()) =for ref Deviance residual for logistic regression. 
=for bad dvrs processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *dvrs = \&PDL::dvrs; #line 593 "glm.pd" #line 594 "glm.pd" # my tmp var for PDL 2.007 slice upate my $_tmp; =head2 ols_t =for ref Threaded version of ordinary least squares regression (B). The price of threading was losing significance tests for coefficients (but see B). The fitting function was shamelessly copied then modified from PDL::Fit::Linfit. Uses PDL::Slatec when possible but otherwise uses PDL::MatrixOps. Intercept is LAST of coeff if CONST => 1. ols_t does not handle bad values. consider B or B if there are bad values. =for options Default options (case insensitive): CONST => 1, =for usage Usage: # DV, 2 person's ratings for top-10 box office movies # ascending sorted by box office numbers perldl> p $y = qsort ceil( random(10, 2)*5 ) [ [1 1 2 4 4 4 4 5 5 5] [1 2 2 2 3 3 3 3 5 5] ] # model with 2 IVs, a linear and a quadratic trend component perldl> $x = cat sequence(10), sequence(10)**2 # suppose our novice modeler thinks this creates 3 different models # for predicting movie ratings perldl> p $x = cat $x, $x * 2, $x * 3 [ [ [ 0 1 2 3 4 5 6 7 8 9] [ 0 1 4 9 16 25 36 49 64 81] ] [ [ 0 2 4 6 8 10 12 14 16 18] [ 0 2 8 18 32 50 72 98 128 162] ] [ [ 0 3 6 9 12 15 18 21 24 27] [ 0 3 12 27 48 75 108 147 192 243] ] ] perldl> p $x->info PDL: Double D [10,2,3] # insert a dummy dim between IV and the dim (model) to be threaded perldl> %m = $y->ols_t( $x->dummy(2) ) perldl> p "$_\t$m{$_}\n" for (sort keys %m) # 2 persons' ratings, eached fitted with 3 "different" models F [ [ 38.314159 25.087209] [ 38.314159 25.087209] [ 38.314159 25.087209] ] # df is the same across dv and iv models F_df [2 7] F_p [ [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] ] R2 [ [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] ] b [ # linear quadratic constant [ [ 0.99015152 
-0.056818182 0.66363636] # person 1 [ 0.18939394 0.022727273 1.4] # person 2 ] [ [ 0.49507576 -0.028409091 0.66363636] [ 0.09469697 0.011363636 1.4] ] [ [ 0.33005051 -0.018939394 0.66363636] [ 0.063131313 0.0075757576 1.4] ] ] # our novice modeler realizes at this point that # the 3 models only differ in the scaling of the IV coefficients ss_model [ [ 20.616667 13.075758] [ 20.616667 13.075758] [ 20.616667 13.075758] ] ss_residual [ [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] ] ss_total [22.5 14.9] y_pred [ [ [0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273] ... =cut *ols_t = \&PDL::ols_t; sub PDL::ols_t { _ols_common(1, @_); } sub _ols_common { my $threaded = shift; my $opt = pop @_ if ref $_[-1] eq 'HASH'; # y [n], ivs [n x attr] pdl my ($y, $ivs) = @_; my %opt = ( CONST => 1, PLOT => 0, WIN => undef, # for plotting ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); if (!$threaded) { $y = $y->squeeze; $y->getndims > 1 and croak "use ols_t for threaded version"; } $ivs = $ivs->dummy(1) if $ivs->getndims == 1; ($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$threaded; # set up ivs and const as ivs $opt{CONST} and $ivs = $ivs->glue( 1, ones($ivs->dim(0)) ); # Internally normalise data # (double) it or ushort y and sequence iv won't work right my $ymean = $y->abs->avgover->double; ($_tmp = $ymean->where( $ymean==0 )) .= 1; my $y2 = $y / ($threaded ? $ymean->dummy(0) : $ymean); # Do the fit my $Y = $ivs x $y2->dummy(0); my $C = &$MATINV( $ivs x $ivs->xchg(0,1) ); # avoid niceslice # Fitted coefficients vector my $coeff = PDL::squeeze( $C x $Y ); $coeff = $coeff->dummy(0) if $threaded and $coeff->getndims == 1 and $y->getndims > 1; $coeff *= ($threaded ? $ymean->dummy(0) : $ymean); # Un-normalise my %ret; # ***$coeff x $ivs looks nice but produces nan on successive tries*** $ret{y_pred} = sumover( ($threaded ? 
$coeff->dummy(1) : $coeff) * $ivs->transpose ); $opt{PLOT} and $y->plot_residuals( $ret{y_pred}, \%opt ); return $coeff unless wantarray; $ret{ss_total} = $opt{CONST}? $y->ss : sumover( $y ** 2 ); $ret{ss_residual} = $y->sse( $ret{y_pred} ); $ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; $ret{R2} = $ret{ss_model} / $ret{ss_total}; my $n_var = $opt{CONST}? $ivs->dim(1) - 1 : $ivs->dim(1); $ret{F_df} = pdl( $n_var, $y->dim(0) - $ivs->dim(1) ); $ret{F} = $ret{ss_model} / $ret{F_df}->(0) / ($ret{ss_residual} / $ret{F_df}->(1)); $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog ) if $CDF; if (!$threaded) { my $se_b = ones( $coeff->dims? $coeff->dims : 1 ); $opt{CONST} and ($_tmp = $se_b(-1)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) * $C(-1,-1) ); # get the se for bs by successivly regressing each iv by the rest ivs if ($ivs->dim(1) > 1) { for my $k (0 .. $n_var-1) { my @G = grep { $_ != $k } (0 .. $n_var-1); my $G = $ivs->dice_axis(1, \@G); $opt{CONST} and $G = $G->glue( 1, ones($ivs->dim(0)) ); my $b_G = $ivs( ,$k)->ols( $G, {CONST=>0,PLOT=>0} ); my $ss_res_k = $ivs( ,$k)->squeeze->sse( sumover($b_G * $G->transpose) ); ($_tmp = $se_b($k)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / $ss_res_k ); } } else { ($_tmp = $se_b(0)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / sum( $ivs( ,0)**2 ) ); } $ret{b_se} = $se_b; $ret{b_t} = $coeff / $ret{b_se}; $ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $ret{F_df}->(1) ) ) if $CDF; } for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze }; $ret{b} = $coeff; return %ret; } =head2 r2_change =for ref Significance test for the incremental change in R2 when new variable(s) are added to an ols regression model. Returns the change stats as well as stats for both models. Based on B. (One way to make up for the lack of significance tests for coeffs in ols_t). 
=for options

Default options (case insensitive):

    CONST   => 1,

=for usage

Usage:

    # suppose these are two persons' ratings for top 10 box office movies
    # ascending sorted by box office
    perldl> p $y = qsort ceil(random(10, 2) * 5)
    [
     [1 1 2 2 2 3 4 4 4 4]
     [1 2 2 3 3 3 4 4 5 5]
    ]

    # first IV is a simple linear trend
    perldl> p $x1 = sequence 10
    [0 1 2 3 4 5 6 7 8 9]

    # the modeler wonders if adding a quadratic trend improves the fit
    perldl> p $x2 = sequence(10) ** 2
    [0 1 4 9 16 25 36 49 64 81]

    # two different models are given in two pdls
    # each as would be passed on to ols_t
    # the 1st model includes only linear trend
    # the 2nd model includes linear and quadratic trends
    # when necessary use dummy dim so both models have the same ndims
    perldl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )

    perldl> p "$_\t$c{$_}\n" for (sort keys %c)
    #             person 1    person 2
    F_change   [0.72164948 0.071283096]
    # df same for both persons
    F_df       [1 7]
    F_p        [0.42370145 0.79717232]
    R2_change  [0.0085966043 0.00048562549]
    model0     HASH(0x8c10828)
    model1     HASH(0x8c135c8)
    # the answer here is no.

=cut

*r2_change = \&PDL::r2_change;
sub PDL::r2_change {
  # $ivs0 = reduced model IVs, $ivs1 = full model IVs; both are fit
  # with the threaded ols_t so a multi-row DV works.
  my ($self, $ivs0, $ivs1, $opt) = @_;
  # promote a 1-D IV pdl to 2-D so dim(1) consistently counts predictors
  $ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);
  my %ret;
  $ret{model0} = { $self->ols_t( $ivs0, $opt ) };
  $ret{model1} = { $self->ols_t( $ivs1, $opt ) };
  $ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
  # numerator df = number of predictors added; denominator df taken
  # from the full model's error df
  $ret{F_df}
    = pdl($ivs1->dim(1) - $ivs0->dim(1), $ret{model1}->{F_df}->((1)) );
  $ret{F_change}
    = $ret{R2_change} * $ret{F_df}->((1))
    / ( (1-$ret{model1}->{R2}) * $ret{F_df}->((0)) );
  # p-value only when PDL::GSL::CDF was loadable at module init
  $ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $ret{F_df}->dog )
    if $CDF;
  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  return %ret;
}

=head1 METHODS

=head2 anova

=for ref

Analysis of variance. Uses type III sum of squares for unbalanced data.

Dependent variable should be a 1D pdl. Independent variables can be
passed as 1D perl array ref or 1D pdl.
Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. Supports bad value (by ignoring missing or BAD values in dependent and independent variables list-wise). =for options Default options (case insensitive): V => 1, # carps if bad value in variables IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] PLOT => 0, # plots highest order effect # can set plot_means options here WIN => undef, # for plotting =for usage Usage: # suppose this is ratings for 12 apples perldl> p $y = qsort ceil( random(12)*5 ) [1 1 2 2 2 3 3 4 4 4 5 5] # IV for types of apple perldl> p $a = sequence(12) % 3 + 1 [1 2 3 1 2 3 1 2 3 1 2 3] # IV for whether we baked the apple perldl> @b = qw( y y y y y y n n n n n n ) perldl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } ) perldl> p "$_\t$m{$_}\n" for (sort keys %m) # apple # m [ [2.5 3 3.5] ] # apple # se [ [0.64549722 0.91287093 0.64549722] ] # apple ~ bake # m [ [1.5 1.5 2.5] [3.5 4.5 4.5] ] # apple ~ bake # se [ [0.5 0.5 0.5] [0.5 0.5 0.5] ] # bake # m [ [ 1.8333333 4.1666667] ] # bake # se [ [0.30731815 0.30731815] ] F 7.6 F_df [5 6] F_p 0.0141586545851857 ms_model 3.8 ms_residual 0.5 ss_model 19 ss_residual 3 ss_total 22 | apple | F 2 | apple | F_df [2 6] | apple | F_p 0.216 | apple | ms 1 | apple | ss 2 | apple ~ bake | F 0.666666666666667 | apple ~ bake | F_df [2 6] | apple ~ bake | F_p 0.54770848985725 | apple ~ bake | ms 0.333333333333334 | apple ~ bake | ss 0.666666666666667 | bake | F 32.6666666666667 | bake | F_df [1 6] | bake | F_p 0.00124263849516693 | bake | ms 16.3333333333333 | bake | ss 16.3333333333333 =cut *anova = \&PDL::anova; sub PDL::anova { my ($y, @ivs_raw) = @_; anova_rptd($y, undef, @ivs_raw); } sub _effect_code_ivs { my $ivs = shift; my (@i_iv, @i_cmo); for (@$ivs) { my ($e, $map) = effect_code($_->squeeze); my $var = ($e->getndims == 1)? 
$e->dummy(1) : $e; push @i_iv, $var; my @indices = sort { $a<=>$b } values %$map; push @i_cmo, pdl @indices; } return \@i_iv, \@i_cmo; } sub _add_interactions { my ($var_ref, $i_cmo_ref, $idv, $raw_ref) = @_; # append info re inter to main effects my (@inter, @idv_inter, @inter_cm, @inter_cmo); for my $nway ( 2 .. @$var_ref ) { my $iter_idv = _combinations( $nway, [0..$#$var_ref] ); while ( my @v = &$iter_idv() ) { my $i = ones( $var_ref->[0]->dim(0), 1 ); for (@v) { $i = $i * $var_ref->[$_]->dummy(1); $i = $i->clump(1,2); } push @inter, $i; my $e = join( ' ~ ', @$idv[@v] ); push @idv_inter, $e; # now prepare for cell mean my @i_cm = (); for my $o ( 0 .. $raw_ref->[0]->dim(0) - 1 ) { my @cell = map { $_($o)->squeeze } @$raw_ref[@v]; push @i_cm, join('', @cell); } my ($inter, $map) = effect_code( \@i_cm ); push @inter_cm, $inter; # get the order to put means in correct multi dim pdl pos # this is order in var_e dim(1) my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map; # this is order needed for cell mean my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) } 0 .. 
$#levels; push @inter_cmo, pdl @i_cmo; } } # append info re inter to main effects return ([@$var_ref, @inter], [@$i_cmo_ref, @inter_cmo], [@$idv, @idv_inter], [@$var_ref, @inter_cm] ); } sub _cell_means { my ($data, $ivs_ref, $i_cmo_ref, $ids, $raw_ref) = @_; my %ind_id; @ind_id{ @$ids } = 0..$#$ids; my %cm; my $i = 0; for (@$ivs_ref) { confess "_cell_means passed empty ivs_ref ndarray at pos $i" if $_->isempty; my $last = zeroes $_->dim(0); my $i_neg = which $_( ,0) == -1; ($_tmp = $last($i_neg)) .= 1; ($_tmp = $_->where($_ == -1)) .= 0; $_ = $_->glue(1, $last); my @v = split ' ~ ', $ids->[$i]; my @shape = map { $raw_ref->[$_]->uniq->nelem } @ind_id{@v}; my ($m, $ss) = $data->centroid( $_ ); $m = $m($i_cmo_ref->[$i])->sever; $ss = $ss($i_cmo_ref->[$i])->sever; $m = $m->reshape(@shape); $m->getndims == 1 and $m = $m->dummy(1); my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape); $se->getndims == 1 and $se = $se->dummy(1); $cm{ "# $ids->[$i] # m" } = $m; $cm{ "# $ids->[$i] # se" } = $se; $i++; } return \%cm; } # http://www.perlmonks.org/?node_id=371228 sub _combinations { my ($num, $arr) = @_; return sub { return } if $num == 0 or $num > @$arr; my @pick; return sub { return @$arr[ @pick = ( 0 .. $num - 1 ) ] unless @pick; my $i = $#pick; $i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i; return if $i < 0; @pick[$i .. $#pick] = $pick[$i] .. $#$arr; return @$arr[@pick]; }; } =head2 anova_rptd Repeated measures and mixed model anova. Uses type III sum of squares. The standard error (se) for the means are based on the relevant mean squared error from the anova, ie it is pooled across levels of the effect. Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. anova_rptd supports bad value in the dependent and independent variables. It automatically removes bad data listwise, ie remove a subject's data if there is any cell missing for the subject. 
Default options (case insensitive): V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] BTWN => [], # indices of between-subject IVs (matches IVNM indices) PLOT => 0, # plots highest order effect # see plot_means() for more options WIN => undef, # for plotting Usage: Some fictional data: recall_w_beer_and_wings.txt Subject Beer Wings Recall Alex 1 1 8 Alex 1 2 9 Alex 1 3 12 Alex 2 1 7 Alex 2 2 9 Alex 2 3 12 Brian 1 1 12 Brian 1 2 13 Brian 1 3 14 Brian 2 1 9 Brian 2 2 8 Brian 2 3 14 ... # rtable allows text only in 1st row and col my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt'; my ($b, $w, $dv) = $data->dog; # subj and IVs can be 1d pdl or @ ref # subj must be the first argument my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} ); print "$_\t$m{$_}\n" for (sort keys %m); # Beer # m [ [ 10.916667 8.9166667] ] # Beer # se [ [ 0.4614791 0.4614791] ] # Beer ~ Wings # m [ [ 10 7] [ 10.5 9.25] [12.25 10.5] ] # Beer ~ Wings # se [ [0.89170561 0.89170561] [0.89170561 0.89170561] [0.89170561 0.89170561] ] # Wings # m [ [ 8.5 9.875 11.375] ] # Wings # se [ [0.67571978 0.67571978 0.67571978] ] ss_residual 19.0833333333333 ss_subject 24.8333333333333 ss_total 133.833333333333 | Beer | F 9.39130434782609 | Beer | F_p 0.0547977008378944 | Beer | df 1 | Beer | ms 24 | Beer | ss 24 | Beer || err df 3 | Beer || err ms 2.55555555555556 | Beer || err ss 7.66666666666667 | Beer ~ Wings | F 0.510917030567687 | Beer ~ Wings | F_p 0.623881438624431 | Beer ~ Wings | df 2 | Beer ~ Wings | ms 1.625 | Beer ~ Wings | ss 3.25000000000001 | Beer ~ Wings || err df 6 | Beer ~ Wings || err ms 3.18055555555555 | Beer ~ Wings || err ss 19.0833333333333 | Wings | F 4.52851711026616 | Wings | F_p 0.0632754786153548 | Wings | df 2 | Wings | ms 16.5416666666667 | Wings | ss 33.0833333333333 | Wings || err df 6 | Wings || err ms 3.65277777777778 | Wings || err ss 21.9166666666667 For mixed model anova, ie when there are between-subject IVs 
involved, feed the IVs as above, but specify in BTWN which IVs are between-subject. For example, if we had added age as a between-subject IV in the above example, we would do my %m = $dv->anova_rptd( $subj, $age, $b, $w, { ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] }); =cut *anova_rptd = \&PDL::anova_rptd; sub PDL::anova_rptd { my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($y, $subj, @ivs_raw) = @_; confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1; croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?" if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem); for (@ivs_raw) { croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1 } my %opt = ( V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] ( !defined($subj) ? () : ( BTWN => [], # indices of between-subject IVs (matches IVNM indices) )), PLOT => 0, # plots highest order effect WIN => undef, # for plotting ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ] if !$opt{IVNM} or !@{ $opt{IVNM} }; my @idv = @{ $opt{IVNM} }; my %ret; $y = $y->squeeze; my @pdl_ivs_raw = map scalar PDL::Stats::Basic::_array_to_pdl($_), @ivs_raw; my $pdl_ivs_raw = pdl \@pdl_ivs_raw; # explicit set badflag because pdl() removes badflag $pdl_ivs_raw->badflag( scalar grep { $_->badflag } @pdl_ivs_raw ); my $sj; if (defined($subj)) { # delete bad data listwise ie remove subj if any cell missing $sj = PDL::Stats::Basic::_array_to_pdl($subj); my $ibad = which( $y->isbad | nbadover($pdl_ivs_raw->transpose) ); my $sj_bad = $sj($ibad)->uniq; if ($sj_bad->nelem) { print STDERR $sj_bad->nelem . 
" subjects with missing data removed\n" if $opt{V}; $sj = $sj->setvaltobad($_) for (list $sj_bad); my $igood = which $sj->isgood; for ($y, $sj, @pdl_ivs_raw) { $_ = $_( $igood )->sever; $_->badflag(0); } } } else { ($y, $pdl_ivs_raw) = _rm_bad_value( $y, $pdl_ivs_raw ); if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) { printf STDERR "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem; } # dog preserves data flow @pdl_ivs_raw = map {$_->copy} $pdl_ivs_raw->dog; } # code for ivs and cell mean in diff @s: effect_code vs iv_cluster my ($ivs_ref, $i_cmo_ref) = _effect_code_ivs( \@pdl_ivs_raw ); ($ivs_ref, $i_cmo_ref, my( $idv, $ivs_cm_ref)) = _add_interactions( $ivs_ref, $i_cmo_ref, \@idv, \@pdl_ivs_raw ); # matches $ivs_ref, with an extra last pdl for subj effect my $err_ref = defined($subj) ? _add_errors( $sj, $ivs_ref, $idv, \@pdl_ivs_raw, \%opt ) : []; # stitch together my $ivs = PDL->null->glue( 1, @$ivs_ref, grep defined($_) && ref($_), @$err_ref); $ivs = $ivs->glue(1, ones $ivs->dim(0)); my $b_full = $y->ols_t( $ivs, {CONST=>0} ); $ret{ss_total} = $y->ss; $ret{ss_residual} = $y->sse( sumover( $b_full * $ivs->xchg(0,1) ) ); if (defined $subj) { my @full = (@$ivs_ref, @$err_ref); EFFECT: for my $k (0 .. $#full) { my $e = ($k > $#$ivs_ref)? '| err' : ''; my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k; if (!defined $full[$k]) { # ss_residual as error $ret{ "| $idv->[$i] |$e ss" } = $ret{ss_residual}; # highest ord inter for purely within design, (p-1)*(q-1)*(n-1) $ret{ "| $idv->[$i] |$e df" } = pdl(map { $_->dim(1) } @full[0 .. $#ivs_raw])->prodover; $ret{ "| $idv->[$i] |$e df" } *= ref($full[-1])? $full[-1]->dim(1) : $err_ref->[$err_ref->[-1]]->dim(1) ; $ret{ "| $idv->[$i] |$e ms" } = $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; } elsif (ref $full[$k]) { # unique error term my (@G, $G, $b_G); @G = grep { $_ != $k and defined $full[$_] } (0 .. 
$#full); next EFFECT unless @G; $G = PDL->null->glue( 1, grep { ref $_ } @full[@G] ); $G = $G->glue(1, ones $G->dim(0)); $b_G = $y->ols_t( $G, {CONST=>0} ); if ($k == $#full) { $ret{ss_subject} = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual}; } else { $ret{ "| $idv->[$i] |$e ss" } = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual}; $ret{ "| $idv->[$i] |$e df" } = $full[$k]->dim(1); $ret{ "| $idv->[$i] |$e ms" } = $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; } } else { # repeating error term if ($k == $#full) { $ret{ss_subject} = $ret{"| $idv->[$full[$k]] |$e ss"}; } else { $ret{ "| $idv->[$i] |$e ss" } = $ret{"| $idv->[$full[$k]] |$e ss"}; $ret{ "| $idv->[$i] |$e df" } = $ret{"| $idv->[$full[$k]] |$e df"}; $ret{ "| $idv->[$i] |$e ms" } = $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; } } } # have all iv, inter, and error effects. get F and F_p for (0 .. $#$ivs_ref) { $ret{ "| $idv->[$_] | F" } = $ret{ "| $idv->[$_] | ms" } / $ret{ "| $idv->[$_] || err ms" }; $ret{ "| $idv->[$_] | F_p" } = 1 - $ret{ "| $idv->[$_] | F" }->gsl_cdf_fdist_P( $ret{ "| $idv->[$_] | df" }, $ret{ "| $idv->[$_] || err df" } ) if $CDF and $ret{ "| $idv->[$_] || err df" } > 0; } } else { $ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; $ret{F_df} = pdl($ivs->dim(1) - 1, $y->nelem - ($ivs->dim(1) - 1) -1); $ret{ms_model} = $ret{ss_model} / $ret{F_df}->(0); $ret{ms_residual} = $ret{ss_residual} / $ret{F_df}->(1); $ret{F} = $ret{ms_model} / $ret{ms_residual}; $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog ) if $CDF and $ret{F_df}->(1) > 0; # get IV ss from $ivs_ref instead of $ivs pdl for my $k (0 .. $#$ivs_ref) { my ($G); my @G = grep $_ != $k, 0 .. 
$#$ivs_ref; if (@G) { $G = PDL->null->glue( 1, @$ivs_ref[@G] ); $G = $G->glue(1, ones $G->dim(0)); } else { $G = ones( $y->dim(0) ); } my $b_G = $y->ols_t( $G, {CONST=>0} ); $ret{ "| $idv->[$k] | ss" } = $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual}; $ret{ "| $idv->[$k] | F_df" } = pdl( $ivs_ref->[$k]->dim(1), $ret{F_df}->(1)->copy )->squeeze; $ret{ "| $idv->[$k] | ms" } = $ret{ "| $idv->[$k] | ss" } / $ret{ "| $idv->[$k] | F_df" }->(0); $ret{ "| $idv->[$k] | F" } = $ret{ "| $idv->[$k] | ms" } / $ret{ms_residual}; $ret{ "| $idv->[$k] | F_p" } = 1 - $ret{ "| $idv->[$k] | F" }->gsl_cdf_fdist_P( $ret{ "| $idv->[$k] | F_df" }->dog ) if $CDF and $ret{ "| $idv->[$k] | F_df" }->(1) > 0; } } for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze}; my $cm_ref = _cell_means( $y, $ivs_cm_ref, $i_cmo_ref, $idv, \@pdl_ivs_raw ); if (defined $subj) { my @ls = map { $_->uniq->nelem } @pdl_ivs_raw; $cm_ref = _fix_rptd_se( $cm_ref, \%ret, $opt{'IVNM'}, \@ls, $sj->uniq->nelem ); } # integrate mean and se into %ret @ret{ keys %$cm_ref } = values %$cm_ref; my $highest = join(' ~ ', @{ $opt{IVNM} }); $cm_ref->{"# $highest # m"}->plot_means( $cm_ref->{"# $highest # se"}, { %opt, IVNM=>$idv } ) if $opt{PLOT}; return %ret; } sub _add_errors { my ($subj, $ivs_ref, $idv, $raw_ivs, $opt) = @_; # code (btwn group) subjects. Rutherford (2001) pp 101-102 my (@grp, %grp_s); for my $n (0 .. $subj->nelem - 1) { # construct string to code group membership # something not treated as BAD by _array_to_pdl to start off marking group membership # if no $opt->{BTWN}, everyone ends up in the same grp my $s = '_'; $s .= $_->($n) for (@$raw_ivs[@{ $opt->{BTWN} }]); push @grp, $s; # group membership $s .= $subj($n); # keep track of total uniq subj $grp_s{$s} = 1; } my $grp = PDL::Stats::Kmeans::iv_cluster \@grp; my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1); my ($d0, $d1) = (0, 0); for my $g (0 .. 
$grp->dim(1)-1) { my $gsub = $subj( which $grp( ,$g) )->effect_code; my ($nobs, $nsub) = $gsub->dims; ($_tmp = $spdl($d0:$d0+$nobs-1, $d1:$d1+$nsub-1)) .= $gsub; $d0 += $nobs; $d1 += $nsub; } # if btwn factor involved, or highest order inter for within factors # elem is undef, so that # @errors ind matches @$ivs_ref, with an extra elem at the end for subj # mark btwn factors for error terms # same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p98) my @qr = map { "(?:$idv->[$_])" } @{ $opt->{BTWN} }; my $qr = join('|', @qr); my $ie_subj; my @errors = map { my @fs = split ' ~ ', $idv->[$_]; # separate bw and wn factors # if only bw, error is bw x subj # if only wn or wn and bw, error is wn x subj my (@wn, @bw); if ($qr) { for (@fs) { /$qr/? push @bw, $_ : push @wn, $_; } } else { @wn = @fs; } $ie_subj = defined($ie_subj)? $ie_subj : $_ if !@wn; my $err = @wn? join(' ~ ', @wn) : join(' ~ ', @bw); my $ie; # mark repeating error term for my $i (0 .. $#$ivs_ref) { if ($idv->[$i] eq $err) { $ie = $i; last; } } # highest order inter of within factors, use ss_residual as error if ( @wn == @$raw_ivs - @{$opt->{BTWN}} ) { undef } # repeating btwn factors use ss_subject as error elsif (!@wn and $_ > $ie_subj) { $ie_subj } # repeating error term elsif ($_ > $ie) { $ie } else { PDL::clump($ivs_ref->[$_] * $spdl->dummy(1), 1,2) } } 0 .. $#$ivs_ref; @{$opt->{BTWN}}? push @errors, $ie_subj : push @errors, $spdl; return \@errors; } sub _fix_rptd_se { # if ivnm lvls_ref for within ss only this can work for mixed design my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_; my @se = grep /se$/, keys %$cm_ref; @se = map { /^# (.+?) # se$/; $1; } @se; my @n_obs = map { my @ivs = split / ~ /, $_; my $i_ivs = which_id $ivnm, \@ivs; my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs; my $collapsed = $icollapsed->nelem? pdl( @$lvls_ref[(list $icollapsed)] )->prodover : 1 ; $n * $collapsed; } @se; for my $i (0 .. 
$#se) {
    # overwrite the naive per-cell se with the pooled se based on the
    # relevant error mean square from the anova table
    ($_tmp = $cm_ref->{"# $se[$i] # se"})
      .= sqrt( $ret->{"| $se[$i] || err ms"} / $n_obs[$i] );
  }
  return $cm_ref;
}

=head2 dummy_code

=for ref

Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in
regression.

Supports BAD value (missing or 'BAD' values result in the corresponding
pdl elements being marked as BAD).

=for usage

    perldl> @a = qw(a a a b b b c c c)

    perldl> p $a = dummy_code(\@a)
    [
     [1 1 1 0 0 0 0 0 0]
     [0 0 0 1 1 1 0 0 0]
    ]

=cut

*dummy_code = \&PDL::dummy_code;
sub PDL::dummy_code {
  my ($var_ref) = @_;
  # dummy coding is effect coding with the reference level's -1 rows
  # zeroed out
  my $var_e = effect_code( $var_ref );
  ($_tmp = $var_e->where( $var_e == -1 )) .= 0;
  return $var_e;
}

=head2 effect_code

=for ref

Unweighted effect coding of nominal variable (perl @ ref or 1d pdl)
for use in regression. returns in @ context coded pdl and % ref to
level - pdl->dim(1) index.

Supports BAD value (missing or 'BAD' values result in the corresponding
pdl elements being marked as BAD).

=for usage

    my @var = qw( a a a b b b c c c );
    my ($var_e, $map) = effect_code( \@var );
    print $var_e . $var_e->info . "\n";
    [
     [ 1  1  1  0  0  0 -1 -1 -1]
     [ 0  0  0  1  1  1 -1 -1 -1]
    ]
    PDL: Double D [9,2]
    print "$_\t$map->{$_}\n" for (sort keys %$map)
    a       0
    b       1
    c       2

=cut

*effect_code = \&PDL::effect_code;
sub PDL::effect_code {
  my ($var_ref) = @_;
  # map levels to integers 0 .. k-1; $map_ref records level => index
  my ($var, $map_ref) = PDL::Stats::Basic::_array_to_pdl( $var_ref );
  my $var_max = $var->max;
  confess "effect_code called with only one unique value" if $var_max < 1;
  # k-1 indicator columns: column j is 1 where level == j
  my $var_e = yvals( float, $var->nelem, $var_max ) == $var;
  # observations at the last (reference) level get -1 in every column
  ($_tmp = $var_e(which( $var == $var_max ), )) .= -1;
  # propagate BAD from the raw variable into the coded pdl
  $var_e = $var_e->setbadif( $var->isbad ) if $var->badflag;
  return wantarray? ($var_e, $map_ref) : $var_e;
}

=head2 effect_code_w

=for ref

Weighted effect code for nominal variable. returns in @ context coded
pdl and % ref to level - pdl->dim(1) index.

Supports BAD value (missing or 'BAD' values result in the corresponding
pdl elements being marked as BAD).
=for usage

    perldl> @a = qw( a a b b b c c )

    perldl> p $a = effect_code_w(\@a)
    [
     [   1    1    0    0    0   -1   -1]
     [   0    0    1    1    1 -1.5 -1.5]
    ]

=cut

*effect_code_w = \&PDL::effect_code_w;
sub PDL::effect_code_w {
  my ($var_ref) = @_;
  # start from unweighted effect codes, then reweight the -1 entries
  my ($var_e, $map_ref) = effect_code( $var_ref );
  # balanced design: every column already sums to zero, nothing to do
  return wantarray ? ($var_e, $map_ref) : $var_e
    if $var_e->sum == 0;
  my $pos = $var_e == 1;
  my $neg = $var_e == -1;
  # per-column weight = n(coded 1) / n(coded -1), so that after
  # scaling each column again sums to zero
  my $w = $pos->sumover / $neg->sumover;
  my $neg_ind = $neg->whichND;
  ($_tmp = $var_e->indexND($neg_ind)) *= $w($neg_ind((1)));
  return wantarray ? ($var_e, $map_ref) : $var_e;
}

=head2 interaction_code

Returns the coded interaction term for effect-coded variables.

Supports BAD value (missing or 'BAD' values result in the corresponding
pdl elements being marked as BAD).

=for usage

    perldl> $a = sequence(6) > 2
    perldl> p $a = $a->effect_code
    [
     [ 1  1  1 -1 -1 -1]
    ]

    perldl> $b = pdl( qw( 0 1 2 0 1 2 ) )
    perldl> p $b = $b->effect_code
    [
     [ 1  0 -1  1  0 -1]
     [ 0  1 -1  0  1 -1]
    ]

    perldl> p $ab = interaction_code( $a, $b )
    [
     [ 1  0 -1 -1 -0  1]
     [ 0  1 -1 -0 -1  1]
    ]

=cut

*interaction_code = \&PDL::interaction_code;
sub PDL::interaction_code {
  my @vars = @_;
  # running element-wise product over every pairing of columns:
  # outer product along dim 1, then collapse the two column dims
  my $i = ones( $vars[0]->dim(0), 1 );
  for (@vars) {
    $i = $i * $_->dummy(1);
    $i = $i->clump(1,2);
  }
  return $i;
}

=head2 ols

=for ref

Ordinary least squares regression, aka linear regression.

Unlike B<ols_t>, ols is not threadable, but it can handle bad value (by
ignoring observations with bad value in dependent or independent
variables list-wise) and returns the full model in list context with
various stats.

IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not
supply the constant vector in $x. Intercept is automatically added and
returned as LAST of the coeffs if CONST=>1. Returns full model in list
context and coeff in scalar context.
=for options

Default options (case insensitive):

    CONST => 1,
    PLOT  => 0,      # see plot_residuals() for plot options
    WIN   => undef,  # for plotting

=for usage

Usage:

    # suppose this is a person's ratings for top 10 box office movies
    # ascending sorted by box office
    perldl> p $y = qsort ceil( random(10) * 5 )
    [1 1 2 2 2 2 4 4 5 5]

    # construct IV with linear and quadratic component
    perldl> p $x = cat sequence(10), sequence(10)**2
    [
     [ 0  1  2  3  4  5  6  7  8  9]
     [ 0  1  4  9 16 25 36 49 64 81]
    ]

    perldl> %m = $y->ols( $x )
    perldl> p "$_\t$m{$_}\n" for (sort keys %m)
    F       40.4225352112676
    F_df    [2 7]
    F_p     0.000142834216344756
    R2      0.920314253647587

    # coeff  linear     quadratic  constant
    b       [0.21212121 0.03030303 0.98181818]
    b_p     [0.32800118 0.20303404 0.039910509]
    b_se    [0.20174693 0.021579989 0.38987581]
    b_t     [ 1.0514223   1.404219  2.5182844]
    ss_model        19.8787878787879
    ss_residual     1.72121212121212
    ss_total        21.6
    y_pred  [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545]

=cut

*ols = \&PDL::ols;
sub PDL::ols {
  # non-threaded front end; all real work lives in _ols_common
  _ols_common(0, @_);
}

sub _rm_bad_value {
  # List-wise deletion: drop every observation that has a BAD value in
  # either the DV or any IV column.  Returns cleaned ($y, $ivs) plus
  # the index pdl of kept observations ($idx stays undef when neither
  # input carried bad values).
  my ($y, $ivs) = @_;
  my $idx;
  if ($y->check_badflag or $ivs->check_badflag) {
    # keep rows good in $y AND with zero bad cells across the IVs
    $idx = which(($y->isbad==0) & (nbadover ($ivs->transpose)==0));
    $y = $y($idx)->sever;
    $ivs = $ivs($idx,)->sever;
    # data is clean now; clear badflag so downstream ops skip bad checks
    $ivs->badflag(0);
    $y->badflag(0);
  }
  return $y, $ivs, $idx;
}

=head2 ols_rptd

=for ref

Repeated measures linear regression (Lorch & Myers, 1990; Van den
Noortgate & Onghena, 2006). Handles purely within-subject design for
now. See t/stats_ols_rptd.t for an example using the Lorch and Myers
data.
=for usage Usage: # This is the example from Lorch and Myers (1990), # a study on how characteristics of sentences affected reading time # Three within-subject IVs: # SP -- serial position of sentence # WORDS -- number of words in sentence # NEW -- number of new arguments in sentence # $subj can be 1D pdl or @ ref and must be the first argument # IV can be 1D @ ref or pdl # 1D @ ref is effect coded internally into pdl # pdl is left as is my %r = $rt->ols_rptd( $subj, $sp, $words, $new ); print "$_\t$r{$_}\n" for (sort keys %r); (ss_residual) 58.3754646504336 (ss_subject) 51.8590337714286 (ss_total) 405.188241771429 # SP WORDS NEW F [ 7.208473 61.354153 1.0243311] F_p [0.025006181 2.619081e-05 0.33792837] coeff [0.33337285 0.45858933 0.15162986] df [1 1 1] df_err [9 9 9] ms [ 18.450705 73.813294 0.57026483] ms_err [ 2.5595857 1.2030692 0.55671923] ss [ 18.450705 73.813294 0.57026483] ss_err [ 23.036272 10.827623 5.0104731] =cut *ols_rptd = \&PDL::ols_rptd; sub PDL::ols_rptd { my ($y, $subj, @ivs_raw) = @_; $y = $y->squeeze; $y->getndims > 1 and croak "ols_rptd does not support threading"; my @ivs = map { (ref $_ eq 'PDL' and $_->ndims > 1)? $_ : ref $_ eq 'PDL' ? $_->dummy(1) : scalar effect_code($_) ; } @ivs_raw; my %r; $r{'(ss_total)'} = $y->ss; # STEP 1: subj my $s = effect_code $subj; # gives same results as dummy_code my $b_s = $y->ols_t($s); my $pred = sumover($b_s(0:-2) * $s->transpose) + $b_s(-1); $r{'(ss_subject)'} = $r{'(ss_total)'} - $y->sse( $pred ); # STEP 2: add predictor variables my $iv_p = $s->glue(1, @ivs); my $b_p = $y->ols_t($iv_p); # only care about coeff for predictor vars. no subj or const coeff $r{coeff} = $b_p(-(@ivs+1) : -2)->sever; # get total sse for this step $pred = sumover($b_p(0:-2) * $iv_p->transpose) + $b_p(-1); my $ss_pe = $y->sse( $pred ); # get predictor ss by successively reducing the model $r{ss} = zeroes scalar(@ivs); for my $i (0 .. $#ivs) { my @i_rest = grep { $_ != $i } 0 .. 
$#ivs; my $iv = $s->glue(1, @ivs[ @i_rest ]); my $b = $y->ols_t($iv); $pred = sumover($b(0:-2) * $iv->transpose) + $b(-1); ($_tmp = $r{ss}->($i)) .= $y->sse($pred) - $ss_pe; } # STEP 3: get precitor x subj interaction as error term my $iv_e = PDL::glue 1, map { interaction_code( $s, $_ ) } @ivs; # get total sse for this step. full model now. my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) ); $pred = sumover($b_f(0:-2) * $iv_p->glue(1,$iv_e)->transpose) + $b_f(-1); $r{'(ss_residual)'} = $y->sse( $pred ); # get predictor x subj ss by successively reducing the error term $r{ss_err} = zeroes scalar(@ivs); for my $i (0 .. $#ivs) { my @i_rest = grep { $_ != $i } 0 .. $#ivs; my $e_rest = PDL::glue 1, map { interaction_code( $s, $_ ) } @ivs[@i_rest]; my $iv = $iv_p->glue(1, $e_rest); my $b = $y->ols_t($iv); my $pred = sumover($b(0:-2) * $iv->transpose) + $b(-1); ($_tmp = $r{ss_err}->($i)) .= $y->sse($pred) - $r{'(ss_residual)'}; } # Finally, get MS, F, etc $r{df} = pdl( map { $_->squeeze->ndims } @ivs ); $r{ms} = $r{ss} / $r{df}; $r{df_err} = $s->dim(1) * $r{df}; $r{ms_err} = $r{ss_err} / $r{df_err}; $r{F} = $r{ms} / $r{ms_err}; $r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} ) if $CDF; return %r; } =head2 logistic =for ref Logistic regression with maximum likelihood estimation using PDL::Fit::LM (requires PDL::Slatec. Hence loaded with "require" in the sub instead of "use" at the beginning). IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply the constant vector in $x. It is included in the model and returned as LAST of coeff. Returns full model in list context and coeff in scalar context. The significance tests are likelihood ratio tests (-2LL deviance) tests. IV significance is tested by comparing deviances between the reduced model (ie with the IV in question removed) and the full model. 
***NOTE: the results here are qualitatively similar to but not
identical with results from R, because different algorithms are used
for the nonlinear parameter fit. Use with discretion***

=for options

Default options (case insensitive):

    INITP => zeroes( $x->dim(1) + 1 ),    # n_iv + 1
    MAXIT => 1000,
    EPS   => 1e-7,

=for usage

Usage:

    # suppose this is whether a person had rented 10 movies

    perldl> p $y = ushort( random(10)*2 )
    [0 0 0 1 1 0 0 1 1 1]

    # IV 1 is box office ranking

    perldl> p $x1 = sequence(10)
    [0 1 2 3 4 5 6 7 8 9]

    # IV 2 is whether the movie is action- or chick-flick

    perldl> p $x2 = sequence(10) % 2
    [0 1 0 1 0 1 0 1 0 1]

    # concatenate the IVs together

    perldl> p $x = cat $x1, $x2
    [
     [0 1 2 3 4 5 6 7 8 9]
     [0 1 0 1 0 1 0 1 0 1]
    ]

    perldl> %m = $y->logistic( $x )

    perldl> p "$_\t$m{$_}\n" for (sort keys %m)

    D0      13.8629436111989
    Dm      9.8627829791575
    Dm_chisq        4.00016063204141
    Dm_df   2
    Dm_p    0.135324414081692

    #         ranking    genre      constant
    b       [0.41127706 0.53876358 -2.1201285]
    b_chisq [ 3.5974504 0.16835559  2.8577151]
    b_p     [0.057868258  0.6815774 0.090936587]
    iter    12
    y_pred  [0.10715577 0.23683909 ... 0.76316091 0.89284423]

    # to get the covariance out, supply a true value for the COV option:
    perldl> %m = $y->logistic( $x, {COV=>1} )
    perldl> p $m{cov};

=cut

*logistic = \&PDL::logistic;
sub PDL::logistic {
  require PDL::Fit::LM;        # uses PDL::Slatec
  my ( $self, $ivs, $opt ) = @_;
  $self = $self->squeeze;
  # make compatible w multiple var cases
  $ivs->getndims == 1 and $ivs = $ivs->dummy(1);
  $self->dim(0) != $ivs->dim(0)
    and carp "mismatched n btwn DV and IV!";
  my %opt = (
    INITP => zeroes( $ivs->dim(1) + 1 ),    # n_ivs + 1
    MAXIT => 1000,
    EPS   => 1e-7,
  );
  # FIX: iterate over the hashref's KEYS only.  The original read
  # "for (%$opt)", which in list context flattens the hash to keys AND
  # values, so each option VALUE also became an uppercased key of %opt
  # with an undef value (eg {COV=>1} polluted %opt with $opt{1}).
  # Every other option-merging site in this file (ols, anova_rptd,
  # pca) uses "for (keys %$opt)".
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
  # not using it atm
  $opt{WT} = 1;
  # Use lmfit.
Fourth input argument is reference to user-defined # copy INITP so we have the original value when needed my ($yfit,$coeff,$cov,$iter) = PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy, { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } ); # apparently at least coeff is child of some pdl # which is changed in later lmfit calls $yfit = $yfit->copy; $coeff = $coeff->copy; return $coeff unless wantarray; my %ret; my $n0 = $self->where($self == 0)->nelem; my $n1 = $self->nelem - $n0; $ret{cov} = $cov if $opt{COV}; $ret{D0} = -2*($n0 * log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem)); $ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 ); $ret{Dm_chisq} = $ret{D0} - $ret{Dm}; $ret{Dm_df} = $ivs->dim(1); $ret{Dm_p} = 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} ) if $CDF; my $coeff_chisq = zeroes $opt{INITP}->nelem; if ( $ivs->dim(1) > 1 ) { for my $k (0 .. $ivs->dim(1)-1) { my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1); my $G = $ivs->dice_axis(1, \@G); my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy; my $y_G = PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init, { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } ); ($_tmp = $coeff_chisq($k)) .= $self->dm( $y_G ) - $ret{Dm}; } } else { # d0 is, by definition, the deviance with only intercept ($_tmp = $coeff_chisq(0)) .= $ret{D0} - $ret{Dm}; } my $y_c = PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept, $opt{INITP}->(0:-2)->copy, { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } ); ($_tmp = $coeff_chisq(-1)) .= $self->dm( $y_c ) - $ret{Dm}; $ret{b} = $coeff; $ret{b_chisq} = $coeff_chisq; $ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 ) if $CDF; $ret{y_pred} = $yfit; $ret{iter} = $iter; for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze }; return %ret; } sub _logistic { my ($x,$par,$ym,$dyda) = @_; # $b and $c are fit parameters slope and intercept my $b = $par(0 : $x->dim(1) - 1)->sever; my $c = $par(-1)->sever; # Write function with 
dependent variable $ym, # independent variable $x, and fit parameters as specified above. # Use the .= (dot equals) assignment operator to express the equality # (not just a plain equals) ($_tmp = $ym) .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) ); my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1); # Partial derivative of the function with respect to each slope # fit parameter ($b in this case). Again, note .= assignment # operator (not just "equals") ($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym) for (0 .. $b->dim(0)-1); # Partial derivative of the function re intercept par ($_tmp = $dy[-1]) .= $ym * (1 - $ym); } sub _logistic_no_intercept { my ($x,$par,$ym,$dyda) = @_; my $b = $par(0 : $x->dim(1) - 1)->sever; # Write function with dependent variable $ym, # independent variable $x, and fit parameters as specified above. # Use the .= (dot equals) assignment operator to express the equality # (not just a plain equals) ($_tmp = $ym) .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) ); my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1); # Partial derivative of the function with respect to each slope # fit parameter ($b in this case). Again, note .= assignment # operator (not just "equals") ($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym) for (0 .. $b->dim(0)-1); } =head2 pca =for ref Principal component analysis. Based on corr instead of cov (bad values are ignored pair-wise. OK when bad values are few but otherwise probably should fill_m etc before pca). Use PDL::Slatec::eigsys() if installed, otherwise use PDL::MatrixOps::eigens_sym(). =for options Default options (case insensitive): CORR => 1, # boolean. 
use correlation or covariance PLOT => 0, # calls plot_screes by default # can set plot_screes options here WIN => undef, # for plotting =for usage Usage: my $d = qsort random 10, 5; # 10 obs on 5 variables my %r = $d->pca( \%opt ); print "$_\t$r{$_}\n" for (keys %r); eigenvalue # variance accounted for by each component [4.70192 0.199604 0.0471421 0.0372981 0.0140346] eigenvector # dim var x comp. weights for mapping variables to component [ [ -0.451251 -0.440696 -0.457628 -0.451491 -0.434618] [ -0.274551 0.582455 0.131494 0.255261 -0.709168] [ 0.43282 0.500662 -0.139209 -0.735144 -0.0467834] [ 0.693634 -0.428171 0.125114 0.128145 -0.550879] [ 0.229202 0.180393 -0.859217 0.4173 0.0503155] ] loadings # dim var x comp. correlation between variable and component [ [ -0.978489 -0.955601 -0.992316 -0.97901 -0.942421] [ -0.122661 0.260224 0.0587476 0.114043 -0.316836] [ 0.0939749 0.108705 -0.0302253 -0.159616 -0.0101577] [ 0.13396 -0.0826915 0.0241629 0.0247483 -0.10639] [ 0.027153 0.0213708 -0.101789 0.0494365 0.00596076] ] pct_var # percent variance accounted for by each component [0.940384 0.0399209 0.00942842 0.00745963 0.00280691] Plot scores along the first two components, $d->plot_scores( $r{eigenvector} ); =cut *pca = \&PDL::pca; sub PDL::pca { my ($self, $opt) = @_; my %opt = ( CORR => 1, PLOT => 0, WIN => undef, # for plotting ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $var_var = $opt{CORR}? 
$self->corr_table : $self->cov_table; # value is axis pdl and score is var x axis my ($eigval, $eigvec); if ( $SLATEC ) { ($eigval, $eigvec) = $var_var->PDL::Slatec::eigsys; } else { ($eigvec, $eigval) = $var_var->eigens_sym; # compatibility with PDL::Slatec::eigsys $eigvec = $eigvec->inplace->transpose->sever; } # ind is sticky point for threading my $ind_sorted = $eigval->qsorti->(-1:0); $eigvec = $eigvec( ,$ind_sorted)->sever; $eigval = $eigval($ind_sorted)->sever; # var x axis my $var = $eigval / $eigval->sum->sclr; my $loadings; if ($opt{CORR}) { $loadings = $eigvec * sqrt( $eigval->transpose ); } else { my $scores = $eigvec x $self->dev_m; $loadings = $self->corr( $scores->dummy(1) ); } $var->plot_screes(\%opt) if $opt{PLOT}; return ( eigenvalue=>$eigval, eigenvector=>$eigvec, pct_var=>$var, loadings=>$loadings ); } =head2 pca_sorti Determine by which vars a component is best represented. Descending sort vars by size of association with that component. Returns sorted var and relevant component indices. =for options Default options (case insensitive): NCOMP => 10, # maximum number of components to consider =for usage Usage: # let's see if we replicated the Osgood et al. (1957) study perldl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0} # select a subset of var to do pca perldl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )] perldl> $data = $data( ,$ind)->sever perldl> @$idv = @$idv[list $ind] perldl> %m = $data->pca perldl> ($iv, $ic) = $m{loadings}->pca_sorti() perldl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . 
"\n" for (list $iv) # COMP0 COMP1 COMP2 COMP3 HAPPY [0.860191 0.364911 0.174372 -0.10484] GOOD [0.848694 0.303652 0.198378 -0.115177] CALM [0.821177 -0.130542 0.396215 -0.125368] BRIGHT [0.78303 0.232808 -0.0534081 -0.0528796] HEAVY [-0.623036 0.454826 0.50447 0.073007] HARD [-0.679179 0.0505568 0.384467 0.165608] ACTIVE [-0.161098 0.760778 -0.44893 -0.0888592] FAST [-0.196042 0.71479 -0.471355 0.00460276] LARGE [-0.241994 0.594644 0.634703 -0.00618055] BASS [-0.621213 -0.124918 0.0605367 -0.765184] =cut *pca_sorti = \&PDL::pca_sorti; sub PDL::pca_sorti { # $self is pdl (var x component) my ($self, $opt) = @_; my %opt = ( NCOMP => 10, # maximum number of components to consider ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min; $self = $self->dice_axis( 1, pdl(0..$ncomp-1) ); my $icomp = $self->transpose->abs->maximum_ind; # sort between comp my $ivar_sort = $icomp->qsorti; $self = $self($ivar_sort, )->sever; # sort within comp my $ic = $icomp($ivar_sort)->iv_cluster; for my $comp (0 .. $ic->dim(1)-1) { my $i = $self(which($ic( ,$comp)), ($comp))->qsorti->(-1:0); ($_tmp = $ivar_sort(which $ic( ,$comp))) .= $ivar_sort(which $ic( ,$comp))->($i)->sever; } return wantarray? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort; } =head2 plot_means Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl). =for options Default options (case insensitive): IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'], DVNM => 'DV', AUTO => 1, # auto set dims to be on x-axis, line, panel # if set 0, dim 0 goes on x-axis, dim 1 as lines # dim 2+ as panels # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. 
not closed here if passed # allows comparing multiple lines in same plot # set env before passing WIN DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows SIZE => 640, # individual square panel size in pixels SYMBL => [0, 4, 7, 11], =for usage Usage: # see anova for mean / se pdl structure $mean->plot_means( $se, {IVNM=>['apple', 'bake']} ); Or like this: $m{'# apple ~ bake # m'}->plot_means; =cut *plot_means = \&PDL::plot_means; sub PDL::plot_means { require PDL::Graphics::PGPLOT::Window; my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($self, $se) = @_; $self = $self->squeeze; if ($self->ndims > 4) { carp "Data is > 4D. No plot here."; return; } my %opt = ( IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'], DVNM => 'DV', AUTO => 1, # auto set vars to be on X axis, line, panel WIN => undef, # PDL::Graphics::PGPLOT::Window object DEV => $DEV, SIZE => 640, # individual square panel size in pixels SYMBL => [0, 4, 7, 11], # ref PDL::Graphics::PGPLOT::Window ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); # decide which vars to plot as x axis, lines, panels # put var w most levels on x axis # put var w least levels on diff panels my @iD = 0..3; my @dims = (1, 1, 1, 1); # splice ARRAY,OFFSET,LENGTH,LIST splice @dims, 0, $self->ndims, $self->dims; $self = $self->reshape(@dims)->sever; $se = $se->reshape(@dims)->sever if defined $se; @iD = reverse list qsorti pdl @dims if $opt{AUTO}; # $iD[0] on x axis # $iD[1] as separate lines my $nx = $self->dim($iD[2]); # n xpanels my $ny = $self->dim($iD[3]); # n ypanels my $w = $opt{WIN}; if (!defined $w) { $w = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV}, NX=>$nx, NY=>$ny, SIZE=>[$opt{SIZE}*$nx, $opt{SIZE}*$ny], UNIT=>3); } my ($min, $max) = defined $se? pdl($self + $se, $self - $se)->minmax : $self->minmax ; my $range = $max - $min; my $p = 0; # panel for my $y (0..$self->dim($iD[3])-1) { for my $x (0..$self->dim($iD[2])-1) { $p ++; my $tl = ''; $tl = $opt{IVNM}->[$iD[2]] . 
" $x" if $self->dim($iD[2]) > 1; $tl .= ' ' . $opt{IVNM}->[$iD[3]] . " $y" if $self->dim($iD[3]) > 1; $w->env( 0, $self->dim($iD[0])-1, $min - 2*$range/5, $max + $range/5, { XTitle=>$opt{IVNM}->[$iD[0]], YTitle=>$opt{DVNM}, Title=>$tl, PANEL=>$p, AXIS=>['BCNT', 'BCNST'], Border=>1, } ) unless $opt{WIN}; my (@legend, @color); for (0 .. $self->dim($iD[1]) - 1) { push @legend, $opt{IVNM}->[$iD[1]] . " $_" if ($self->dim($iD[1]) > 1); push @color, $_ + 2; # start from red $w->points( sequence($self->dim($iD[0])), $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_), $opt{SYMBL}->[$_], { PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2, PLOTLINE=>1, } ); $w->errb( sequence($self->dim($iD[0])), $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_), $se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_), { PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2 } ) if defined $se; } if ($self->dim($iD[1]) > 1) { $w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10, { COLOR=>\@color } ); $w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10, { COLOR=>\@color, SYMBOL=>[ @{$opt{SYMBL}}[0..$#color] ] } ); } } } $w->close unless $opt{WIN}; return; } =head2 plot_residuals Plots residuals against predicted values. =for usage Usage: $y->plot_residuals( $y_pred, { dev=>'/png' } ); =for options Default options (case insensitive): # see PDL::Graphics::PGPLOT::Window for more info WIN => undef, # pgwin object. not closed here if passed # allows comparing multiple lines in same plot # set env before passing WIN DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows SIZE => 640, # plot size in pixels COLOR => 1, =cut *plot_residuals = \&PDL::plot_residuals; sub PDL::plot_residuals { require PDL::Graphics::PGPLOT::Window; my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($y, $y_pred) = @_; my %opt = ( # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. 
not closed here if passed # allows comparing multiple lines in same plot # set env before passing WIN DEV => $DEV , # open and close dev for plotting if no WIN SIZE => 640, # plot size in pixels COLOR => 1, ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $residuals = $y - $y_pred; my $win = $opt{WIN}; if (!$win) { $win = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3); $win->env( $y_pred->minmax, $residuals->minmax, {XTITLE=>'predicted value', YTITLE=>'residuals', AXIS=>['BCNT', 'BCNST'], Border=>1,} ); } $win->points($y_pred, $residuals, { COLOR=>$opt{COLOR} }); # add 0-line $win->line(pdl($y_pred->minmax), pdl(0,0), { COLOR=>$opt{COLOR} } ); $win->close unless $opt{WIN}; return; } =head2 plot_scores Plots standardized original and PCA transformed scores against two components. (Thank you, Bob MacCallum, for the documentation suggestion that led to this function.) =for options Default options (case insensitive): CORR => 1, # boolean. PCA was based on correlation or covariance COMP => [0,1], # indices to components to plot # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. not closed here if passed # allows comparing multiple lines in same plot # set env before passing WIN DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows SIZE => 640, # plot size in pixels COLOR => [1,2], # color for original and rotated scores =for usage Usage: my %p = $data->pca(); $data->plot_scores( $p{eigenvector}, \%opt ); =cut *plot_scores = \&PDL::plot_scores; sub PDL::plot_scores { require PDL::Graphics::PGPLOT::Window; my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($self, $eigvec) = @_; my %opt = ( CORR => 1, # boolean. PCA was based on correlation or covariance COMP => [0,1], # indices to components to plot # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. 
not closed here if passed # allows comparing multiple lines in same plot # set env before passing WIN DEV => $DEV , # open and close dev for plotting if no WIN SIZE => 640, # plot size in pixels COLOR => [1,2], # color for original and transformed scoress ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $i = pdl $opt{COMP}; my $z = $opt{CORR}? $self->stddz : $self->dev_m; # transformed normed values my $scores = sumover($eigvec( ,$i) * $z->transpose->dummy(1))->transpose; $z = $z( ,$i)->sever; my $win = $opt{WIN}; my $max = pdl($z, $scores)->abs->ceil->max->sclr; if (!$win) { $win = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3); $win->env(-$max, $max, -$max, $max, {XTitle=>"Component $opt{COMP}->[0]", YTitle=>"Component $opt{COMP}->[1]", AXIS=>['ABCNST', 'ABCNST'], Border=>1, }); } $win->points( $z( ,0;-), $z( ,1;-), { COLOR=>$opt{COLOR}->[0] } ); $win->points( $scores( ,0;-), $scores( ,1;-), { COLOR=>$opt{COLOR}->[1] } ); $win->legend( ['original', 'transformed'], .2*$max, .8*$max, {color=>[1,2],symbol=>[1,1]} ); $win->close unless $opt{WIN}; return; } =head2 plot_screes Scree plot. Plots proportion of variance accounted for by PCA components. =for options Default options (case insensitive): NCOMP => 20, # max number of components to plot CUT => 0, # set to plot cutoff line after this many components # undef to plot suggested cutoff line for NCOMP comps # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. 
not closed here if passed # allows comparing multiple lines in same plot # set env before passing WIN DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows SIZE => 640, # plot size in pixels COLOR => 1, =for usage Usage: # variance should be in descending order $pca{var}->plot_screes( {ncomp=>16} ); Or, because NCOMP is used so often, it is allowed a shortcut, $pca{var}->plot_screes( 16 ); =cut *plot_scree = \&PDL::plot_screes; # here for now for compatibility *plot_screes = \&PDL::plot_screes; sub PDL::plot_screes { require PDL::Graphics::PGPLOT::Window; my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($self, $ncomp) = @_; my %opt = ( NCOMP => 20, # max number of components to plot CUT => 0, # set to plot cutoff line after this many components # undef to plot suggested cutoff line for NCOMP comps # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. not closed here if passed # allows comparing multiple lines in same plot # set env before passing WIN DEV => $DEV , # open and close dev for plotting if no WIN SIZE => 640, # plot size in pixels COLOR => 1, ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $opt{NCOMP} = $ncomp if $ncomp; # re-use $ncomp below $ncomp = ($self->dim(0) < $opt{NCOMP})? 
$self->dim(0) : $opt{NCOMP}; $opt{CUT} = PDL::Stats::Kmeans::_scree_ind $self(0:$ncomp-1) if !defined $opt{CUT}; my $win = $opt{WIN}; if (!$win) { $win = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3); $win->env(0, $ncomp-1, 0, 1, {XTitle=>'Component', YTitle=>'Proportion of Variance Accounted for', AXIS=>['BCNT', 'BCNST'], Border=>1, }); } $win->points(sequence($ncomp), $self(0:$ncomp-1, ), {CHARSIZE=>2, COLOR=>$opt{COLOR}, PLOTLINE=>1} ); $win->line( pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05), {COLOR=>15} ) if $opt{CUT}; $win->close unless $opt{WIN}; return; } =head1 SEE ALSO PDL::Fit::Linfit PDL::Fit::LM =head1 REFERENCES Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied Multiple Regression/correlation Analysis for the Behavioral Sciences (3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers. Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd ed.). New York, NY: Wiley-Interscience. Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated measures data in cognitive research. Journal of Experimental Psychology: Learning, Memory, & Cognition, 16, 149-157. Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of Meaning. Champaign, IL: University of Illinois Press. Rutherford, A. (2001). Introducing Anova and Ancova: A GLM Approach (1st ed.). Thousand Oaks, CA: Sage Publications. Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved April 10, 2011 from http://citeseerx.ist.psu.edu/ The GLM procedure: unbalanced ANOVA for two-way design with interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18, 2009 from http://support.sas.com/ Van den Noortgatea, W., & Onghenaa, P. (2006). Analysing repeated measures data in cognitive research: A comment on regression coefficient analyses. European Journal of Cognitive Psychology, 18, 937-952. =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. 
There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut #line 2514 "GLM.pm" # Exit with OK status 1; PDL-Stats-0.84/GENERATED/PDL/Stats/TS.pm0000644000175000017500000003417614625061426017026 0ustar osboxesosboxes# # GENERATED WITH PDL::PP! Don't modify! # package PDL::Stats::TS; our @EXPORT_OK = qw(_acf _acvf diff inte dseason _fill_ma filter_exp filter_ma mae mape wmape portmanteau _pred_ar ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::TS ; #line 1 "ts.pd" =encoding utf8 =head1 NAME PDL::Stats::TS -- basic time series functions =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. Plots require PDL::Graphics::PGPLOT. ***EXPERIMENTAL!*** In particular, bad value support is spotty and may be shaky. USE WITH DISCRETION! =head1 SYNOPSIS use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::TS; my $r = $data->acf(5); =cut use Carp; use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::Basic; use PDL::Stats::Kmeans; my $DEV = ($^O =~ /win/i)? '/png' : '/xs'; #line 57 "TS.pm" =head1 FUNCTIONS =cut *_acf = \&PDL::_acf; *_acvf = \&PDL::_acvf; #line 112 "ts.pd" #line 113 "ts.pd" =head2 acf =for sig Signature: (x(t); int h(); [o]r(h+1)) =for ref Autocorrelation function for up to lag h. If h is not specified it's set to t-1 by default. acf does not process bad values. =for usage usage: perldl> $a = sequence 10 # lags 0 .. 5 perldl> p $a->acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *acf = \&PDL::acf; sub PDL::acf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; return $self->_acf($h+1); } =head2 acvf =for sig Signature: (x(t); int h(); [o]v(h+1)) =for ref Autocovariance function for up to lag h. 
If h is not specified it's set to t-1 by default. acvf does not process bad values. =for usage usage: perldl> $a = sequence 10 # lags 0 .. 5 perldl> p $a->acvf(5) [82.5 57.75 34 12.25 -6.5 -21.25] # autocorrelation perldl> p $a->acvf(5) / $a->acvf(0) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *acvf = \&PDL::acvf; sub PDL::acvf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; return $self->_acvf($h+1); } #line 150 "TS.pm" =head2 diff =for sig Signature: (x(t); [o]dx(t)) =for ref Differencing. DX(t) = X(t) - X(t-1), DX(0) = X(0). Can be done inplace. =for bad diff does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *diff = \&PDL::diff; =head2 inte =for sig Signature: (x(n); [o]ix(n)) =for ref Integration. Opposite of differencing. IX(t) = X(t) + X(t-1), IX(0) = X(0). Can be done inplace. =for bad inte does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *inte = \&PDL::inte; =head2 dseason =for sig Signature: (x(t); indx d(); [o]xd(t)) =for ref Deseasonalize data using moving average filter the size of period d. =for bad dseason processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *dseason = \&PDL::dseason; #line 362 "ts.pd" #line 363 "ts.pd" =head2 fill_ma =for sig Signature: (x(t); int q(); [o]xf(t)) =for ref Fill missing value with moving average. xf(t) = sum(x(t-q .. t-1, t+1 .. t+q)) / 2q. fill_ma does handle bad values. Output pdl bad flag is cleared unless the specified window size q is too small and there are still bad values. 
=for usage my $x_filled = $x->fill_ma( $q ); =cut *fill_ma = \&PDL::fill_ma; sub PDL::fill_ma { my ($x, $q) = @_; my $x_filled = $x->_fill_ma($q); $x_filled->check_badflag; # carp "ma window too small, still has bad value" # if $x_filled->badflag; return $x_filled; } #line 269 "TS.pm" *_fill_ma = \&PDL::_fill_ma; =head2 filter_exp =for sig Signature: (x(t); a(); [o]xf(t)) =for ref Filter, exponential smoothing. xf(t) = a * x(t) + (1-a) * xf(t-1) =for usage =for bad filter_exp does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *filter_exp = \&PDL::filter_exp; =head2 filter_ma =for sig Signature: (x(t); indx q(); [o]xf(t)) =for ref Filter, moving average. xf(t) = sum(x(t-q .. t+q)) / (2q + 1) =for bad filter_ma does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *filter_ma = \&PDL::filter_ma; =head2 mae =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Mean absolute error. MAE = 1/n * sum( abs(y - y_pred) ) =for usage Usage: $mae = $y->mae( $y_pred ); =for bad mae processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mae = \&PDL::mae; =head2 mape =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Mean absolute percent error. MAPE = 1/n * sum(abs((y - y_pred) / y)) =for usage Usage: $mape = $y->mape( $y_pred ); =for bad mape processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mape = \&PDL::mape; =head2 wmape =for sig Signature: (a(n); b(n); float+ [o]c()) =for ref Weighted mean absolute percent error. avg(abs(error)) / avg(abs(data)). Much more robust compared to mape with division by zero error (cf. Schütz, W., & Kolassa, 2006). =for usage Usage: $wmape = $y->wmape( $y_pred ); =for bad wmape processes bad values. 
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *wmape = \&PDL::wmape; =head2 portmanteau =for sig Signature: (r(h); longlong t(); [o]Q()) =for ref Portmanteau significance test (Ljung-Box) for autocorrelations. =for usage Usage: perldl> $a = sequence 10 # acf for lags 0-5 # lag 0 excluded from portmanteau perldl> p $chisq = $a->acf(5)->portmanteau( $a->nelem ) 11.1753902662994 # get p-value from chisq distr perldl> use PDL::GSL::CDF perldl> p 1 - gsl_cdf_chisq_P( $chisq, 5 ) 0.0480112934306748 =for bad portmanteau does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *portmanteau = \&PDL::portmanteau; #line 700 "ts.pd" #line 701 "ts.pd" =head2 pred_ar =for sig Signature: (x(d); b(p|p+1); int t(); [o]pred(t)) =for ref Calculates predicted values up to period t (extend current series up to period t) for autoregressive series, with or without constant. If there is constant, it is the last element in b, as would be returned by ols or ols_t. pred_ar does not process bad values. 
=for options CONST => 1, =for usage Usage: perldl> $x = sequence 2 # last element is constant perldl> $b = pdl(.8, -.2, .3) perldl> p $x->pred_ar($b, 7) [0 1 1.1 0.74 0.492 0.3656 0.31408] # no constant perldl> p $x->pred_ar($b(0:1), 7, {const=>0}) [0 1 0.8 0.44 0.192 0.0656 0.01408] =cut sub PDL::pred_ar { my ($x, $b, $t, $opt) = @_; my %opt = ( CONST => 1 ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $b = pdl $b unless ref $b eq 'PDL'; # allows passing simple number my $ext; if ($opt{CONST}) { my $t_ = $t - ( $x->dim(0) - $b->dim(0) + 1 ); $ext = $x(-$b->dim(0)+1:-1, )->_pred_ar($b(0:-2), $t_); $ext($b->dim(0)-1:-1) += $b(-1); return $x->append( $ext( $b->dim(0)-1 : -1 ) ); } else { my $t_ = $t - ( $x->dim(0) - $b->dim(0) ); $ext = $x(-$b->dim(0):-1, )->_pred_ar($b, $t_); return $x->append($ext($b->dim(0) : -1)); } } #line 542 "TS.pm" *_pred_ar = \&PDL::_pred_ar; #line 790 "ts.pd" #line 791 "ts.pd" =head2 season_m Given length of season, returns seasonal mean and var for each period (returns seasonal mean only in scalar context). =for options Default options (case insensitive): START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, # boolean # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pass pgwin object for more plotting control DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows COLOR => 1, See PDL::Graphics::PGPLOT for detailed graphing options. 
=for usage my ($m, $ms) = $data->season_m( 24, { START_POSITION=>2 } ); =cut *season_m = \&PDL::season_m; sub PDL::season_m { my ($self, $d, $opt) = @_; my %opt = ( START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, WIN => undef, # pass pgwin object for more plotting control DEV => $DEV, # see PDL::Graphics::PGPLOT for more info COLOR => 1, ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); if ($opt{PLOT}) { require PDL::Graphics::PGPLOT::Window; } my $n_season = ($self->dim(0) + $opt{START_POSITION}) / $d; $n_season = pdl($n_season)->ceil->sum->sclr; my @dims = $self->dims; $dims[0] = $n_season * $d; my $data = zeroes( @dims ) + $opt{MISSING}; $data($opt{START_POSITION} : $opt{START_POSITION} + $self->dim(0)-1, ) .= $self; $data->badflag(1); $data->inplace->setvaltobad( $opt{MISSING} ); my $s = sequence $d; $s = $s->dummy(1, $n_season)->flat; $s = $s->iv_cluster(); my ($m, $ms) = $data->centroid( $s ); if ($opt{PLOT}) { my $w = $opt{WIN}; if (!$w) { $w = PDL::Graphics::PGPLOT::Window::pgwin( Dev=>$opt{DEV} ); $w->env( 0, $d-1, $m->minmax, {XTitle=>'period', YTitle=>'mean'} ); } $w->points( sequence($d), $m, {COLOR=>$opt{COLOR}, PLOTLINE=>1} ); if ($m->squeeze->ndims < 2) { $w->errb( sequence($d), $m, sqrt( $ms / $s->sumover ), {COLOR=>$opt{COLOR}} ); } $w->close unless $opt{WIN}; } return wantarray? ($m, $ms) : $m; } =head2 plot_dseason =for ref Plots deseasonalized data and original data points. Opens and closes default window for plotting unless a pgwin object is passed in options. Returns deseasonalized data. =for options Default options (case insensitive): WIN => undef, DEV => '/xs', # open and close dev for plotting if no WIN # defaults to '/png' in Windows COLOR => 1, # data point color See PDL::Graphics::PGPLOT for detailed graphing options. 
=cut *plot_dseason = \&PDL::plot_dseason; sub PDL::plot_dseason { require PDL::Graphics::PGPLOT::Window; my ($self, $d, $opt) = @_; !defined($d) and croak "please set season period length"; $self = $self->squeeze; my $dsea; my %opt = ( WIN => undef, DEV => $DEV, COLOR => 1, # data point color ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $dsea = $self->dseason($d); my $w = $opt{WIN}; if (!$opt{WIN}) { $w = PDL::Graphics::PGPLOT::Window::pgwin( $opt{DEV} ); $w->env( 0, $self->dim(0)-1, $self->minmax, {XTitle=>'T', YTitle=>'DV'} ); } my $missn = ushort $self->max->sclr + 1; # ushort in case precision issue $w->line( sequence($self->dim(0)), $dsea->setbadtoval( $missn ), {COLOR=>$opt{COLOR}+1, MISSING=>$missn} ); $w->points( sequence($self->dim(0)), $self, {COLOR=>$opt{COLOR}} ); $w->close unless $opt{WIN}; return $dsea; } *filt_exp = \&PDL::filt_exp; sub PDL::filt_exp { print STDERR "filt_exp() deprecated since version 0.5.0. Please use filter_exp() instead\n"; return filter_exp( @_ ); } *filt_ma = \&PDL::filt_ma; sub PDL::filt_ma { print STDERR "filt_ma() deprecated since version 0.5.0. Please use filter_ma() instead\n"; return filter_ma( @_ ); } =head1 METHODS =head2 plot_acf =for ref Plots and returns autocorrelations for a time series. 
=for options Default options (case insensitive): SIG => 0.05, # can specify .10, .05, .01, or .001 DEV => '/xs', # open and close dev for plotting # defaults to '/png' in Windows =for usage Usage: perldl> $a = sequence 10 perldl> p $r = $a->plot_acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *plot_acf = \&PDL::plot_acf; sub PDL::plot_acf { require PDL::Graphics::PGPLOT::Window; my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($self, $h) = @_; my $r = $self->acf($h); my %opt = ( SIG => 0.05, DEV => $DEV, ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $w = PDL::Graphics::PGPLOT::Window::pgwin( Dev=>$opt{DEV} ); $w->env(-1, $h+1, -1.05, 1.05, {XTitle=>'lag', YTitle=>'acf'}); $w->line(pdl(-1,$h+1), zeroes(2)); # x axis my $y_sig = ($opt{SIG} == 0.10)? 1.64485362695147 : ($opt{SIG} == 0.05)? 1.95996398454005 : ($opt{SIG} == 0.01)? 2.5758293035489 : ($opt{SIG} == 0.001)? 3.29052673149193 : 0 ; unless ($y_sig) { carp "SIG outside of recognized value. default to 0.05"; $y_sig = 1.95996398454005; } $w->line( pdl(-1,$h+1), ones(2) * $y_sig / sqrt($self->dim(0)), { LINESTYLE=>"Dashed" } ); $w->line( pdl(-1,$h+1), ones(2) * $y_sig / sqrt($self->dim(0)) * -1, { LINESTYLE=>"Dashed" } ); for my $lag (0..$h) { $w->line( ones(2)*$lag, pdl(0, $r($lag)) ); } $w->close; return $r; } =head1 REFERENCES Brockwell, P.J., & Davis, R.A. (2002). Introcution to Time Series and Forecasting (2nd ed.). New York, NY: Springer. Schütz, W., & Kolassa, S. (2006). Foresight: advantages of the MAD/Mean ratio over the MAPE. Retrieved Jan 28, 2010, from http://www.saf-ag.com/226+M5965d28cd19.html =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. 
=cut #line 778 "TS.pm" # Exit with OK status 1; PDL-Stats-0.84/GENERATED/PDL/Stats/Kmeans.pm0000644000175000017500000004661714625061426017721 0ustar osboxesosboxes# # GENERATED WITH PDL::PP! Don't modify! # package PDL::Stats::Kmeans; our @EXPORT_OK = qw(random_cluster iv_cluster _random_cluster which_cluster assign centroid _d_p2l ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::Kmeans ; #line 3 "kmeans.pd" use Carp; use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::Basic; =head1 NAME PDL::Stats::Kmeans -- classic k-means cluster analysis =head1 DESCRIPTION Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster". The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are non-threadable, respectively. 
=head1 SYNOPSIS Implement a basic k-means procedure, use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats; my ($data, $idv, $ido) = rtable( $file ); # or generate random data: $data = grandom(200, 2); # two vars as below my ($cluster, $centroid, $ss_centroid, $cluster_last); # start out with 8 random clusters $cluster = random_cluster( $data->dim(0), 8 ); # iterate to minimize total ss # stop when no more changes in cluster membership do { $cluster_last = $cluster; ($centroid, $ss_centroid) = $data->centroid( $cluster ); $cluster = $data->assign( $centroid ); } while sum(abs($cluster - $cluster_last)) > 0; or, use the B function provided here, my %k = $data->kmeans( \%opt ); print "$_\t$k{$_}\n" for (sort keys %k); plot the clusters if there are only 2 vars in $data, use PDL::Graphics::PGPLOT::Window; my ($win, $c); $win = pgwin 'xs'; $win->env($data( ,0)->minmax, $data( ,1)->minmax); $win->points( $data->dice_axis(0,which($k{cluster}->(,$_)))->dog, {COLOR=>++$c} ) for (0 .. $k{cluster}->dim(1)-1); =cut #line 85 "Kmeans.pm" =head1 FUNCTIONS =cut #line 74 "kmeans.pd" #line 75 "kmeans.pd" # my tmp var for PDL 2.007 slice upate my $_tmp; =head2 random_cluster =for sig Signature: (short [o]cluster(o,c); int obs=>o; int clu=>c) =for ref Creates masks for random mutually exclusive clusters. Accepts two parameters, num_obs and num_cluster. Extra parameter turns into extra dim in mask. May loop a long time if num_cluster approaches num_obs because empty cluster is not allowed. 
=for usage my $cluster = random_cluster( $num_obs, $num_cluster ); =cut # can't be called on pdl sub random_cluster { my ($obs, $clu) = @_; # extra param in @_ made into extra dim my $cluster = zeroes @_; do { $cluster->inplace->_random_cluster(); } while (PDL::any $cluster->sumover == 0 ); return $cluster; } #line 129 "Kmeans.pm" *_random_cluster = \&PDL::_random_cluster; =head2 which_cluster =for sig Signature: (short a(o,c); indx [o]b(o)) Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong. Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set. Usage: # create a cluster mask dim [obs x clu] perldl> p $c_mask = iv_cluster [qw(a a b b c c)] [ [1 1 0 0 0 0] [0 0 1 1 0 0] [0 0 0 0 1 1] ] # get cluster membership list dim [obs] perldl> p $ic = $c_mask->which_cluster [0 0 1 1 2 2] =for bad which_cluster processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *which_cluster = \&PDL::which_cluster; =head2 assign =for sig Signature: (data(o,v); centroid(c,v); short [o]cluster(o,c)) =for ref Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables. 
=for usage perldl> $centroid = ones 2, 3 perldl> $centroid(0,) .= 0 perldl> p $centroid [ [0 1] [0 1] [0 1] ] perldl> $b = qsort( random 4, 3 ) perldl> p $b [ [0.022774068 0.032513883 0.13890034 0.30942479] [ 0.16943853 0.50262636 0.56251531 0.7152271] [ 0.23964483 0.59932745 0.60967495 0.78452117] ] # notice that 1st 3 obs in $b are on average closer to 0 # and last obs closer to 1 perldl> p $b->assign( $centroid ) [ [1 1 1 0] # cluster 0 membership [0 0 0 1] # cluster 1 membership ] =for bad assign processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *assign = \&PDL::assign; =head2 centroid =for sig Signature: (data(o,v); cluster(o,c); float+ [o]m(c,v); float+ [o]ss(c,v)) =for ref Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster. =for usage # data is 10 obs x 3 var perldl> p $d = sequence 10, 3 [ [ 0 1 2 3 4 5 6 7 8 9] [10 11 12 13 14 15 16 17 18 19] [20 21 22 23 24 25 26 27 28 29] ] # create two clusters by value on 1st var perldl> p $a = $d( ,(0)) <= 5 [1 1 1 1 1 1 0 0 0 0] perldl> p $b = $d( ,(0)) > 5 [0 0 0 0 0 0 1 1 1 1] perldl> p $c = cat $a, $b [ [1 1 1 1 1 1 0 0 0 0] [0 0 0 0 0 0 1 1 1 1] ] perldl> p $d->centroid($c) # mean for 2 cluster x 3 var [ [ 2.5 7.5] [12.5 17.5] [22.5 27.5] ] # ss for 2 cluster x 3 var [ [17.5 5] [17.5 5] [17.5 5] ] =for bad centroid processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *centroid = \&PDL::centroid; #line 431 "kmeans.pd" #line 432 "kmeans.pd" sub _scree_ind { # use as scree cutoff the point with max distance to the line formed # by the 1st and last points in $self # it's a heuristic--whether we can get "good" results depends on # the number of components in $self. my ($self) = @_; $self = $self->squeeze; $self->ndims > 1 and croak "1D pdl only please"; my $a = zeroes 2, $self->nelem; ($_tmp = $a->slice('(0)')) .= sequence $self->nelem; ($_tmp = $a->slice('(1)')) .= $self; my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') ); return $d->maximum_ind; } sub _d_point2line { my ($self, $p1, $p2) = @_; for ($self, $p1, $p2) { $_->dim(0) != 2 and carp "point pdl dim(0) != 2"; } return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog ); } #line 332 "Kmeans.pm" *_d_p2l = \&PDL::_d_p2l; #line 494 "kmeans.pd" #line 495 "kmeans.pd" =head2 kmeans =for ref Implements classic k-means cluster analysis. Given a number of observations with values on a set of variables, kmeans puts the observations into clusters that maximizes within-cluster similarity with respect to the variables. Tries several different random seeding and clustering in parallel. Stops when cluster assignment of the observations no longer changes. Returns the best result in terms of R2 from the random-seeding trials. Instead of random seeding, kmeans also accepts manual seeding. This is done by providing a centroid to the function, in which case clustering will proceed from the centroid and there is no multiple tries. There are two distinct advantages from seeding with a centroid compared to seeding with predefined cluster membership of a subset of the observations ie "seeds", (1) a centroid could come from a previous study with a different set of observations; (2) a centroid could even be "fictional", or in more proper parlance, an idealized prototype with respect to the actual data. 
For example, if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings pdl of dim [10 obs x 4 var], providing a centroid like [ [5 0 0 0] [0 5 0 0] [0 0 5 0] [0 0 0 5] ] will produce 4 clusters of people with each cluster favoring a different one of the 4 movies. Clusters from an idealized centroid may not give the best result in terms of R2, but they sure are a lot more interpretable. If clustering has to be done from predefined clusters of seeds, simply calculate the centroid using the B function and feed it to kmeans, my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster ); my %k = $rating->kmeans( { CNTRD=>$centroid } ); kmeans supports bad value*. =for options Default options (case insensitive): V => 1, # prints simple status FULL => 0, # returns results for all seeding trials CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts NTRY => 5, # num of random seeding trials NSEED => 1000, # num of initial seeds, use NSEED up to max obs NCLUS => 3, # num of clusters =for usage Usage: # suppose we have 4 person's ratings on 5 movies perldl> p $rating = ceil( random(4, 5) * 5 ) [ [3 2 2 3] [2 4 5 4] [5 3 2 3] [3 3 1 5] [4 3 3 2] ] # we want to put the 4 persons into 2 groups perldl> %k = $rating->kmeans( {NCLUS=>2} ) # by default prints back options used # as well as info for all tries and iterations CNTRD => Null FULL => 0 NCLUS => 2 NSEED => 4 NTRY => 5 V => 1 ss total: 20.5 iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748] iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748] perldl> p "$_\t$k{$_}\n" for (sort keys %k) R2 0.479674796747968 centroid # mean ratings for 2 group x 5 movies [ [ 3 2.3333333] [ 2 4.3333333] [ 5 2.6666667] [ 3 3] [ 4 2.6666667] ] cluster # 4 persons' membership in two groups [ [1 0 0 0] [0 1 1 1] ] n [1 3] # cluster size ss [ [ 0 0.66666667] [ 0 0.66666667] [ 0 0.66666667] [ 0 8] [ 0 0.66666667] ] Now, for the valiant, kmeans is threadable. 
Say you gathered 10 persons' ratings on 5 movies from 2 countries, so the data is dim [10,5,2], and you want to put the 10 persons from each country into 3 clusters, just specify NCLUS => [3,1], and there you have it. The key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1] turns into a dummy dim, so the 3-cluster operation is repeated on both countries. Similarly, when seeding, CNTRD needs to have ndims that at least match the data ndims. Extra dims in CNTRD will lead to threading (convenient if you want to try out different centroid locations, for example, but you will have to hand pick the best result). See stats_kmeans.t for examples w 3D and 4D data. *With bad value, R2 is based on average of variances instead of sum squared error. =cut *kmeans = \&PDL::kmeans; sub PDL::kmeans { my ($self, $opt) = @_; my %opt = ( V => 1, # prints simple status FULL => 0, # returns results for all seeding trials CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts NTRY => 5, # num of random seeding trials NSEED => 1000, # num of initial seeds, use NSEED up to max obs NCLUS => 3, # num of clusters ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) { $opt{NTRY} = 1; $opt{NSEED} = $self->dim(0); $opt{NCLUS} = $opt{CNTRD}->dim(0); } else { $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr; } $opt{V} and print STDERR "$_\t=> $opt{$_}\n" for (sort keys %opt); my $ss_ms = $self->badflag? 'ms' : 'ss'; my $ss_total = $self->badflag? $self->var->average : $self->ss->sumover; $opt{V} and print STDERR "overall $ss_ms:\t$ss_total\n"; my ($centroid, $ss_cv, $R2, $clus_this, $clus_last); # NTRY made into extra dim in $cluster for threading my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS}); $clus_this = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ? 
$self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} ) ; ($centroid, $ss_cv) = $self(0:$opt{NSEED} - 1, )->centroid( $clus_this ); # now obs in $clus_this matches $self $clus_this = $self->assign( $centroid ); ($centroid, $ss_cv) = $self->centroid( $clus_this ); my $iter = 0; do { $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total : 1 - $ss_cv->sumover->sumover / $ss_total ; $opt{V} and print STDERR join(' ',('iter', $iter++, 'R2', $R2)) . "\n"; $clus_last = $clus_this; $clus_this = $self->assign( $centroid ); ($centroid, $ss_cv) = $self->centroid( $clus_this ); } while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 ); $opt{FULL} and return ( centroid => PDL::squeeze( $centroid ), cluster => PDL::squeeze( $clus_this ), n => PDL::squeeze( $clus_this )->sumover, R2 => PDL::squeeze( $R2 ), $ss_ms => PDL::squeeze( $ss_cv ), ); # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose my $i_best = $R2->mv(-1,0)->maximum_ind; $R2->getndims == 1 and return ( centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze, cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze, n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover, R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze, $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze, ); # now for threading beyond 2D data # can't believe i'm using a perl loop :P $i_best = $i_best->flat->sever; my @i_best = map { $opt{NTRY} * $_ + $i_best(($_)) } 0 .. 
$i_best->nelem - 1; my @shapes; for ($centroid, $clus_this, $R2) { my @dims = $_->dims; pop @dims; push @shapes, \@dims; } $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever, return ( centroid => $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever, cluster => $clus_this, n => $clus_this->sumover, R2 => $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever, $ss_ms => $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever, ); } =head1 METHODS =head2 iv_cluster =for ref Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context. This is the method used for mean and var in anova. The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)]. =for usage Usage: perldl> @bake = qw( y y y n n n ) # accepts @ ref or 1d pdl perldl> p $bake = iv_cluster( \@bake ) [ [1 1 1 0 0 0] [0 0 0 1 1 1] ] perldl> p $rating = sequence 6 [0 1 2 3 4 5] perldl> p $rating->centroid( $bake ) # mean for each iv level [ [1 4] ] # ss [ [2 2] ] =cut *iv_cluster = \&PDL::iv_cluster; sub PDL::iv_cluster { my ($var_ref) = @_; my ($var, $map_ref) = PDL::Stats::Basic::_array_to_pdl( $var_ref ); my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var; $var_a = $var_a->setbadif( $var->isbad ) if $var->badflag; return wantarray? ($var_a, $map_ref) : $var_a; } =head2 pca_cluster Assign variables to components ie clusters based on pca loadings or scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004 for other ways of using pca with kmeans). Variables are assigned to their most associated component. 
Note that some components may not have any variable that is most associated with them, so the returned number of clusters may be smaller than NCOMP. Default options (case insensitive): V => 1, ABS => 1, # high pos and neg loadings on a comp in same cluster NCOMP => undef, # max number of components to consider. determined by # scree plot black magic if not specified PLOT => 0, # pca scree plot with cutoff at NCOMP Usage: # say we need to cluster a group of documents # $data is pdl dim [word x doc] ($data, $idd, $idw) = get_data 'doc_word_info.txt'; perldl> %p = $data->pca; # $cluster is pdl mask dim [doc x ncomp] perldl> $cluster = $p{loading}->pca_cluster; # pca clusters var while kmeans clusters obs. hence transpose perldl> ($m, $ss) = $data->transpose->centroid( $cluster ); perldl> %k = $data->transpose->kmeans( { cntrd=>$m } ); # take a look at cluster 0 doc ids perldl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]); =cut *pca_cluster = \&PDL::pca_cluster; sub PDL::pca_cluster { my ($self, $opt) = @_; my %opt = ( V => 1, ABS => 1, # high pos and neg loadings on a comp in same cluster NCOMP => undef, # max number of components to consider. determined by # scree plot black magic if not specified PLOT => 0, # pca scree plot with cutoff at NCOMP ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); my $var = sumover($self ** 2) / $self->dim(0); if (!$opt{NCOMP}) { # here's the black magic part my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1) : pdl($self->dim(1), 30)->min ; $var = $var(0:$comps-1)->sever; $opt{NCOMP} = _scree_ind( $var ); } $opt{PLOT} and do { require PDL::Stats::GLM; $var->plot_scree( {NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}} ); }; my $c = $self->( ,0:$opt{NCOMP}-1)->transpose->abs->maximum_ind; if ($opt{ABS}) { $c = $c->iv_cluster; } else { my @c = map { ($self->($_,$c($_)) >= 0)? $c($_)*2 : $c($_)*2 + 1 } ( 0 .. $c->dim(0)-1 ); $c = iv_cluster( \@c ); } $opt{V} and print STDERR "cluster membership mask as " . $c->info . 
"\n"; return $c; } =head1 REFERENCES Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29. Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786. Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press. Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut #line 706 "Kmeans.pm" # Exit with OK status 1; PDL-Stats-0.84/Stats.pm0000644000175000017500000001700514625061164014552 0ustar osboxesosboxespackage PDL::Stats; use strict; use warnings; our $VERSION = '0.84'; sub import { my $pkg = (caller())[0]; eval { require PDL::Core; require PDL::GSL::CDF; }; my $cdf = !$@; my $use = <<"EOD"; package $pkg; use PDL::Stats::Basic; use PDL::Stats::GLM; use PDL::Stats::Kmeans; use PDL::Stats::TS; @{[ $cdf ? 'use PDL::Stats::Distr;' : '' ]} @{[ $cdf ? 'use PDL::GSL::CDF;' : '' ]} EOD eval $use; die $@ if $@; } =head1 NAME PDL::Stats - a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people. =head1 DESCRIPTION Loads modules named below, making the functions available in the current namespace. 
Properly formatted documentations online at http://pdl-stats.sf.net =head1 SYNOPSIS use PDL::LiteF; # loads less modules use PDL::NiceSlice; # preprocessor for easier pdl indexing syntax use PDL::Stats; # Is equivalent to the following: use PDL::Stats::Basic; use PDL::Stats::GLM; use PDL::Stats::Kmeans; use PDL::Stats::TS; # and the following if installed; use PDL::Stats::Distr; use PDL::GSL::CDF; =head1 QUICK-START FOR NON-PDL PEOPLE Enjoy PDL::Stats without having to dive into PDL, just wet your feet a little. Three key words two concepts and an icing on the cake, you should be well on your way there. =head2 pdl The magic word that puts PDL::Stats at your disposal. pdl creates a PDL numeric data object (a pdl, or ndarray) from perl array or array ref. All PDL::Stats methods, unless meant for regular perl array, can then be called from the data object. my @y = 0..5; my $y = pdl @y; # a simple function my $stdv = $y->stdv; # you can skip the intermediate $y my $stdv = stdv( pdl @y ); # a more complex method, skipping intermediate $y my @x1 = qw( y y y n n n ); my @x2 = qw( 1 0 1 0 1 0 ) # do a two-way analysis of variance with y as DV and x1 x2 as IVs my %result = pdl(@y)->anova( \@x1, \@x2 ); print "$_\t$result{$_}\n" for (sort keys %result); If you have a list of list, ie array of array refs, pdl will create a multi-dimensional data object. my @a = ( [1,2,3,4], [0,1,2,3], [4,5,6,7] ); my $a = pdl @a; print $a . $a->info; # here's what you will get [ [1 2 3 4] [0 1 2 3] [4 5 6 7] ] PDL: Double D [4,3] PDL::Stats puts observations in the first dimension and variables in the second dimension, ie pdl [obs, var]. In PDL::Stats the above example represents 4 observations on 3 variables. # you can do all kinds of fancy stuff on such a 2D pdl. my %result = $a->kmeans( {NCLUS=>2} ); print "$_\t$result{$_}\n" for (sort keys %result); Make sure the array of array refs is rectangular. 
If the array refs are of unequal sizes, pdl will pad it out with 0s to match the longest list. =head2 info Tells you the data type (yes pdls are typed, but you shouldn't have to worry about it here*) and dimensionality of the pdl, as seen in the above example. I find it a big help for my sanity to keep track of the dimensionality of a pdl. As mentioned above, PDL::Stats uses 2D pdl with observation x variable dimensionality. *pdl uses double precision by default. If you are working with things like epoch time, then you should probably use pdl(long, @epoch) to maintain the precision. =head2 list Come back to the perl reality from the PDL wonder land. list turns a pdl data object into a regular perl list. Caveat: list produces a flat list. The dimensionality of the data object is lost. =head2 Signature This is not a function, but a concept. You will see something like this frequently in the pod: stdv Signature: (a(n); float+ [o]b()) The signature tells you what the function expects as input and what kind of output it produces. a(n) means it expects a 1D pdl with n elements; [o] is for output, b() means its a scalar. So stdv will take your 1D list and give back a scalar. float+ you can ignore; but if you insist, it means the output is at float or double precision. The name a or b or c is not important. What's important is the thing in the parenthesis. corr Signature: (a(n); b(n); float+ [o]c()) Here the function corr takes two inputs, two 1D pdl with the same numbers of elements, and gives back a scalar. t_test Signature: (a(n); b(m); float+ [o]t(); [o]d()) Here the function t_test can take two 1D pdls of unequal size (n==m is certainly fine), and give back two scalars, t-value and degrees of freedom. Yes we accommodate t-tests with unequal sample sizes. assign Signature: (data(o,v); centroid(c,v); byte [o]cluster(o,c)) Here is one of the most complicated signatures in the package. This is a function from Kmeans. 
assign takes data of observation x variable dimensions, and a centroid of cluster x variable dimensions, and returns an observation x cluster membership pdl (indicated by 1s and 0s). Got the idea? Then we can see how PDL does its magic :) =head2 Threading Another concept. The first thing to know is that, threading is optional. PDL threading means automatically repeating the operation on extra elements or dimensions fed to a function. For a function with a signature like this gsl_cdf_tdist_P Signature: (double x(); double nu(); [o]out()) the signatures says that it takes two scalars as input, and returns a scalar as output. If you need to look up the p-values for a list of t's, with the same degrees of freedom 19, my @t = ( 1.65, 1.96, 2.56 ); my $p = gsl_cdf_tdist_P( pdl(@t), 19 ); print $p . "\n" . $p->info; # here's what you will get [0.94231136 0.96758551 0.99042586] PDL: Double D [3] The same function is repeated on each element in the list you provided. If you had different degrees of freedoms for the t's, my @df = (199, 39, 19); my $p = gsl_cdf_tdist_P( pdl(@t), pdl(@df) ); print $p . "\n" . $p->info; # here's what you will get [0.94973979 0.97141553 0.99042586] PDL: Double D [3] The df's are automatically matched with the t's to give you the results. An example of threading thru extra dimension(s): stdv Signature: (a(n); float+ [o]b()) if the input is of 2D, say you want to compute the stdv for each of the 3 variables, my @a = ( [1,1,3,4], [0,1,2,3], [4,5,6,7] ); # pdl @a is pdl dim [4,3] my $sd = stdv( pdl @a ); print $sd . "\n" . $sd->info; # this is what you will get [ 1.2990381 1.118034 1.118034] PDL: Double D [3] Here the function was given an input with an extra dimension of size 3, so it repeats the stdv operation on the extra dimension 3 times, and gives back a 1D pdl of size 3. Threading works for arbitrary number of dimensions, but it's best to refrain from higher dim pdls unless you have already decided to become a PDL wiz / witch. 
Not all PDL::Stats methods thread. As a rule of thumb, if a function has a signature attached to it, it threads. =head2 perldl Essentially a perl shell with "use PDL;" at start up. Comes with the PDL installation. Very handy to try out pdl operations, or just plain perl. print is shortened to p to avoid injury from excessive typing. C goes out of scope at the end of (multi)line input, so mostly you will have to drop the good practice of my here. =head2 For more info L =head1 AUTHOR ~~~~~~~~~~~~ ~~~~~ ~~~~~~~~ ~~~~~ ~~~ `` ><((("> Copyright (C) 2009-2015 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut 1; PDL-Stats-0.84/META.yml0000644000175000017500000000140614625061425014365 0ustar osboxesosboxes--- abstract: 'a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people.' author: - 'Maggie J. Xiong ' build_requires: PDL::Core: '2.008' Test::More: '0.88' configure_requires: PDL::Core: '2.008' dynamic_config: 1 generated_by: 'ExtUtils::MakeMaker version 7.44, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: PDL-Stats no_index: directory: - t - inc recommends: PDL::Graphics::PGPLOT: '0' PDL::Slatec: '0' requires: PDL: '2.057' resources: repository: git://github.com/PDLPorters/PDL-Stats version: '0.84' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' PDL-Stats-0.84/Distr/0000755000175000017500000000000014625061425014200 5ustar osboxesosboxesPDL-Stats-0.84/Distr/t/0000755000175000017500000000000014625061425014443 5ustar osboxesosboxesPDL-Stats-0.84/Distr/t/stats_distr.t0000644000175000017500000001011414544645567017207 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::Stats::Distr; use PDL::LiteF; sub tapprox { my($a,$b) = @_; my $diff = abs($a-$b); # use max to make it 
perl scalar ref $diff eq 'PDL' and $diff = $diff->max; return $diff < 1.0e-6; } # 2-11 { my $a = sequence 5; $a /= 10; is( tapprox( sum(pdl($a->mme_beta) - pdl(1.4, 5.6)), 0 ), 1 ); is( tapprox( $a->pdf_beta(1, 3)->sum, 9.9 ), 1 ); } { my $a = sequence 5; $a %= 2; is( tapprox( sum(pdl($a->mme_binomial) - pdl(1, .4)), 0 ), 1 ); is( tapprox( $a->pmf_binomial(2,.4)->sum, 2.04 ), 1 ); } { my $a = sequence 5; is( tapprox( $a->mle_exp, .5 ), 1 ); is( tapprox( $a->pdf_exp(2.5)->sum, 2.72355357480724 ), 1 ); } { my $a = sequence 5; is( tapprox( sum(pdl($a->mle_gaussian) - pdl(2,2)), 0 ), 1 ); is( tapprox( $a->pdf_gaussian(1,2)->sum, 0.854995527902657 ), 1 ); } { my $a = sequence 5; is( tapprox( $a->mle_geo, 0.333333333333333 ), 1 ); is( tapprox( $a->pmf_geo(.5)->sum, 0.96875 ), 1 ); } # 12-22 { my $a = sequence 5; $a += 1; is( tapprox( $a->mle_geosh, 0.333333333333333 ), 1 ); is( tapprox( $a->pmf_geosh(.5)->sum, 0.96875 ), 1 ); } { my $a = sequence(5) + 1; is( tapprox( sum(pdl($a->mle_lognormal) - pdl(0.957498348556409, 0.323097797388514)), 0 ), 1 ); is( tapprox( sum(pdl($a->mme_lognormal) - pdl(2.19722457733622, 0.200670695462151)), 0 ), 1 ); is( tapprox( $a->pdf_lognormal(1,2)->sum, 0.570622216518612 ), 1 ); } { my $a = sequence 5; $a *= $a; is( tapprox( sum(pdl($a->mme_nbd) - pdl(1.25, 0.172413793103448)), 0 ), 1 ); is( tapprox( $a->pmf_nbd(2, .4)->sum, 0.472571655494828 ), 1 ); } { my $a = sequence 5; $a += 1; is( tapprox( sum(pdl($a->mme_pareto) - pdl(1.4, 0.857142857142857)), 0 ), 1 ); is( tapprox( $a->pdf_pareto(2, .4)->sum, 0.379411851851852 ), 1 ); } { my $a = sequence 5; $a %= 2; is( tapprox( $a->mle_poisson, .4 ), 1 ); is( tapprox( $a->pmf_poisson(.4)->sum, 2.54721617493543 ), 1 ); } # 23-32 { my $a = sequence 6; $a->setbadat(-1); $a /= 10; is( tapprox( sum(pdl($a->mme_beta) - pdl(1.4, 5.6)), 0 ), 1 ); is( tapprox( $a->pdf_beta(1, 3)->sum, 9.9 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); $a %= 2; is( tapprox( sum(pdl($a->mme_binomial) - pdl(1, .4)), 0 ), 1 
); is( tapprox( $a->pmf_binomial(2,.4)->sum, 2.04 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); is( tapprox( $a->mle_exp, .5 ), 1 ); is( tapprox( $a->pdf_exp(2.5)->sum, 2.72355357480724 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); is( tapprox( sum(pdl($a->mle_gaussian) - pdl(2,2)), 0 ), 1 ); is( tapprox( $a->pdf_gaussian(1,2)->sum, 0.854995527902657 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); is( tapprox( $a->mle_geo, 0.333333333333333 ), 1 ); is( tapprox( $a->pmf_geo(.5)->sum, 0.96875 ), 1 ); } # 33-43 { my $a = sequence 6; $a->setbadat(-1); $a += 1; is( tapprox( $a->mle_geosh, 0.333333333333333 ), 1 ); is( tapprox( $a->pmf_geosh(.5)->sum, 0.96875 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); $a += 1; is( tapprox( sum(pdl($a->mle_lognormal) - pdl(0.957498348556409, 0.323097797388514)), 0 ), 1 ); is( tapprox( sum(pdl($a->mme_lognormal) - pdl(2.19722457733622, 0.200670695462151)), 0 ), 1 ); is( tapprox( $a->pdf_lognormal(1,2)->sum, 0.570622216518612 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); $a *= $a; is( tapprox( sum(pdl($a->mme_nbd) - pdl(1.25, 0.172413793103448)), 0 ), 1 ); is( tapprox( $a->pmf_nbd(2, .4)->sum, 0.472571655494828 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); $a += 1; is( tapprox( sum(pdl($a->mme_pareto) - pdl(1.4, 0.857142857142857)), 0 ), 1 ); is( tapprox( $a->pdf_pareto(2, .4)->sum, 0.379411851851852 ), 1 ); } { my $a = sequence 6; $a->setbadat(-1); $a %= 2; is( tapprox( $a->mle_poisson, .4 ), 1 ); is( tapprox( $a->pmf_poisson(.4)->sum, 2.54721617493543 ), 1 ); is( tapprox( $a->pmf_poisson_factorial(.4)->sum, 2.54721617493543 ), 1 ); is( tapprox( $a->pmf_poisson_stirling(.4)->sum, 2.5470618950599 ), 1 ); $a += 171; ok( $a->pmf_poisson_stirling(10)->sum ); # the result is so close to 0 it's pointless to test with tapprox } done_testing; PDL-Stats-0.84/Distr/distr.pd0000644000175000017500000006142614625061113015655 0ustar osboxesosboxespp_add_exported( ); pp_addpm({At=>'Top'}, <<'EOD'); use strict; use warnings; use 
Carp; use PDL::LiteF; my $DEV = ($^O =~ /win/i)? '/png' : '/xs'; =head1 NAME PDL::Stats::Distr -- parameter estimations and probability density functions for distributions. =head1 DESCRIPTION Parameter estimate is maximum likelihood estimate when there is closed form estimate, otherwise it is method of moments estimate. =head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::Distr; # do a frequency (probability) plot with fitted normal curve my $data = grandom(100)->abs; my ($xvals, $hist) = $data->hist; # turn frequency into probability $hist /= $data->nelem; # get maximum likelihood estimates of normal curve parameters my ($m, $v) = $data->mle_gaussian(); # fitted normal curve probabilities my $p = $xvals->pdf_gaussian($m, $v); use PDL::Graphics::PGPLOT::Window; my $win = pgwin( Dev=>"/xs" ); $win->bin( $hist ); $win->hold; $win->line( $p, {COLOR=>2} ); $win->close; Or, play with different distributions with B :) $data->plot_distr( 'gaussian', 'lognormal' ); =cut EOD pp_addhdr(' #include #include #include '); pp_def('mme_beta', Pars => 'a(n); float+ [o]alpha(); float+ [o]beta()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(alpha) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} m = sa / N; v = a2 / N - pow(m, 2); $alpha() = m * ( m * (1 - m) / v - 1 ); $beta() = (1 - m) * ( m * (1 - m) / v - 1 ); ', BadCode => ' $GENERIC(alpha) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { m = sa / N; v = a2 / N - pow(m, 2); $alpha() = m * ( m * (1 - m) / v - 1 ); $beta() = (1 - m) * ( m * (1 - m) / v - 1 ); } else { $SETBAD(alpha()); $SETBAD(beta()); } ', Doc => ' =for usage my ($a, $b) = $data->mme_beta(); =for ref beta distribution. 
pdf: f(x; a,b) = 1/B(a,b) x^(a-1) (1-x)^(b-1) =cut ', ); pp_def('pdf_beta', Pars => 'x(); a(); b(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' if ($x()>=0 && $x()<=1) { double B_1 = 1 / gsl_sf_beta( $a(), $b() ); $p() = B_1 * pow($x(), $a()-1) * pow(1-$x(), $b()-1); } else { barf("x out of range [0,1]"); } ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($a()) || $ISBAD($b()) ) { $SETBAD( $p() ); } else { if ($x()>=0 && $x()<=1) { double B_1 = 1 / gsl_sf_beta( $a(), $b() ); $p() = B_1 * pow($x(), $a()-1) * pow(1-$x(), $b()-1); } else { barf("x out of range [0,1]"); } } ', Doc => ' =for ref probability density function for beta distribution. x defined on [0,1]. =cut ', ); pp_def('mme_binomial', Pars => 'a(n); int [o]n_(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(p) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} m = sa / N; v = a2 / N - pow(m, 2); $p() = 1 - v/m; $n_() = m / $p() >= 0? (int) (m / $p() + .5) : (int) (m / $p() - .5); $p() = m / $n_(); ', BadCode => ' $GENERIC(p) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { m = sa / N; v = a2 / N - pow(m, 2); $p() = 1 - v/m; $n_() = m / $p() >= 0? (int) (m / $p() + .5) : (int) (m / $p() - .5); $p() = m / $n_(); } else { $SETBAD(n_()); $SETBAD(p()); } ', Doc => ' =for usage my ($n, $p) = $data->mme_binomial; =for ref binomial distribution. 
pmf: f(k; n,p) = (n k) p^k (1-p)^(n-k) for k = 0,1,2..n =cut ', ); pp_def('pmf_binomial', Pars => 'ushort x(); ushort n(); p(); float+ [o]out()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(out) bc = gsl_sf_choose($n(), $x()); $out() = bc * pow($p(), $x()) * pow(1-$p(), $n() - $x()); ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($n()) || $ISBAD($p()) ) { $SETBAD( $out() ); } else { $GENERIC(out) bc = gsl_sf_choose($n(), $x()); $out() = bc * pow($p(), $x()) * pow(1-$p(), $n() - $x()); } ', Doc => ' =for ref probability mass function for binomial distribution. =cut ', ); pp_def('mle_exp', Pars => 'a(n); float+ [o]l()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(l) sa = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} $l() = N / sa; ', BadCode => ' $GENERIC(l) sa = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); N ++; } %} if (sa > 0) { $l() = N / sa; } else { $SETBAD(l()); } ', Doc => ' =for usage my $lamda = $data->mle_exp; =for ref exponential distribution. mle same as method of moments estimate. =cut ', ); pp_def('pdf_exp', Pars => 'x(); l(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' $p() = $l() * exp( -1 * $l() * $x() ); ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($l()) ) { $SETBAD( $p() ); } else { $p() = $l() * exp( -1 * $l() * $x() ); } ', Doc => ' =for ref probability density function for exponential distribution. 
=cut ', ); pp_def('mme_gamma', Pars => 'a(n); float+ [o]shape(); float+ [o]scale()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(shape) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} m = sa / N; v = a2 / N - pow(m, 2); $shape() = pow(m, 2) / v; $scale() = v / m; ', BadCode => ' $GENERIC(shape) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { m = sa / N; v = a2 / N - pow(m, 2); $shape() = pow(m, 2) / v; $scale() = v / m; } else { $SETBAD(shape()); $SETBAD(scale()); } ', Doc => ' =for usage my ($shape, $scale) = $data->mme_gamma(); =for ref two-parameter gamma distribution =cut ', ); pp_def('pdf_gamma', Pars => 'x(); a(); t(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' double g = gsl_sf_gamma( $a() ); $p() = pow($x(), $a()-1) * exp(-1*$x() / $t()) / (pow($t(), $a()) * g); ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($a()) || $ISBAD($t()) ) { $SETBAD( $p() ); } else { double g = gsl_sf_gamma( $a() ); $p() = pow($x(), $a()-1) * exp(-1*$x() / $t()) / (pow($t(), $a()) * g); } ', Doc => ' =for ref probability density function for two-parameter gamma distribution. =cut ', ); pp_def('mle_gaussian', Pars => 'a(n); float+ [o]m(); float+ [o]v()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(m) sa, a2; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} $m() = sa / N; $v() = a2 / N - pow($m(),2); ', BadCode => ' $GENERIC(m) sa, a2; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { $m() = sa / N; $v() = a2 / N - pow($m(),2); } else { $SETBAD(m()); $SETBAD(v()); } ', Doc => ' =for usage my ($m, $v) = $data->mle_gaussian(); =for ref gaussian aka normal distribution. same results as $data->average and $data->var. mle same as method of moments estimate. 
=cut ', ); pp_def('pdf_gaussian', Pars => 'x(); m(); v(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' $p() = 1 / sqrt($v() * 2 * M_PI) * exp( -1 * pow($x() - $m(), 2) / (2*$v()) ); ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($m()) || $ISBAD($v()) ) { $SETBAD( $p() ); } else { $p() = 1 / sqrt($v() * 2 * M_PI) * exp( -1 * pow($x() - $m(), 2) / (2*$v()) ); } ', Doc => ' =for ref probability density function for gaussian distribution. =cut ', ); pp_def('mle_geo', Pars => 'a(n); float+ [o]p();', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(p) sa = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} $p() = 1 / (1 + sa/N); ', BadCode => ' $GENERIC(p) sa = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); N ++; } %} if (N) { $p() = 1 / (1 + sa/N); } else { $SETBAD(p()); } ', Doc => ' =for ref geometric distribution. mle same as method of moments estimate. =cut ', ); pp_def('pmf_geo', Pars => 'ushort x(); p(); float+ [o]out()', GenericTypes => [F,D], HandleBad => 1, Code => ' $out() = pow(1-$p(), $x()) * $p(); ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($p()) ) { $SETBAD( $out() ); } else { $out() = pow(1-$p(), $x()) * $p(); } ', Doc => ' =for ref probability mass function for geometric distribution. x >= 0. =cut ', ); pp_def('mle_geosh', Pars => 'a(n); float+ [o]p();', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(p) sa = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} $p() = N / sa; ', BadCode => ' $GENERIC(p) sa = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); N ++; } %} if (sa > 0) { $p() = N / sa; } else { $SETBAD(p()); } ', Doc => ' =for ref shifted geometric distribution. mle same as method of moments estimate. 
=cut ', ); pp_def('pmf_geosh', Pars => 'ushort x(); p(); float+ [o]out()', GenericTypes => [F,D], HandleBad => 1, Code => ' if ( $x() >= 1 ) { $out() = pow(1-$p(), $x()-1) * $p(); } else { barf( "x >= 1 please" ); } ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($p()) ) { $SETBAD( $out() ); } else { if ( $x() >= 1 ) { $out() = pow(1-$p(), $x()-1) * $p(); } else { barf( "x >= 1 please" ); } } ', Doc => ' =for ref probability mass function for shifted geometric distribution. x >= 1. =cut ', ); pp_def('mle_lognormal', Pars => 'a(n); float+ [o]m(); float+ [o]v()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(m) sa, a2; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += log($a()); %} $m() = sa / N; loop (n) %{ a2 += pow(log($a()) - $m(), 2); %} $v() = a2 / N; ', BadCode => ' $GENERIC(m) sa, a2; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += log($a()); N ++; } %} if (N) { $m() = sa / N; loop (n) %{ if ($ISGOOD( $a() )) { a2 += pow(log($a()) - $m(), 2); } %} $v() = a2 / N; } else { $SETBAD(m()); $SETBAD(v()); } ', Doc => ' =for usage my ($m, $v) = $data->mle_lognormal(); =for ref lognormal distribution. maximum likelihood estimation. =cut ', ); pp_def('mme_lognormal', Pars => 'a(n); float+ [o]m(); float+ [o]v()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(m) sa, a2; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} $m() = 2 * log(sa / N) - 1/2 * log( a2 / N ); $v() = log( a2 / N ) - 2 * log( sa / N ); ', BadCode => ' $GENERIC(m) sa, a2; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { $m() = 2 * log(sa / N) - 1/2 * log( a2 / N ); $v() = log( a2 / N ) - 2 * log( sa / N ); } else { $SETBAD(m()); $SETBAD(v()); } ', Doc => ' =for usage my ($m, $v) = $data->mme_lognormal(); =for ref lognormal distribution. method of moments estimation. 
=cut ', ); pp_def('pdf_lognormal', Pars => 'x(); m(); v(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' if ( $x() > 0 && $v() > 0 ) { $p() = 1 / ($x() * sqrt($v() * 2 * M_PI)) * exp( -1 * pow(log($x()) - $m(), 2) / (2*$v()) ); } else { barf( "x and v > 0 please" ); } ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($m()) || $ISBAD($v()) ) { $SETBAD( $p() ); } else { if ( $x() > 0 && $v() > 0 ) { $p() = 1 / ($x() * sqrt($v() * 2 * M_PI)) * exp( -1 * pow(log($x()) - $m(), 2) / (2*$v()) ); } else { barf( "x and v > 0 please" ); } } ', Doc => ' =for ref probability density function for lognormal distribution. x > 0. v > 0. =cut ', ); pp_def('mme_nbd', Pars => 'a(n); float+ [o]r(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(p) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(), 2); %} m = sa / N; v = a2 / N - pow(m, 2); $r() = pow(m, 2) / (v - m); $p() = m / v; ', BadCode => ' $GENERIC(p) sa, a2, m, v; sa = 0; a2 = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { sa += $a(); a2 += pow($a(), 2); N ++; } %} if (N) { m = sa / N; v = a2 / N - pow(m, 2); $r() = pow(m, 2) / (v - m); $p() = m / v; } else { $SETBAD(r()); $SETBAD(p()); } ', Doc => ' =for usage my ($r, $p) = $data->mme_nbd(); =for ref negative binomial distribution. pmf: f(x; r,p) = (x+r-1 r-1) p^r (1-p)^x for x=0,1,2... =cut ', ); pp_def('pmf_nbd', Pars => 'ushort x(); r(); p(); float+ [o]out()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(out) nbc = gsl_sf_gamma($x()+$r()) / (gsl_sf_fact($x()) * gsl_sf_gamma($r())); $out() = nbc * pow($p(),$r()) * pow(1-$p(), $x()); ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($r()) || $ISBAD($p()) ) { $SETBAD( $out() ); } else { $GENERIC(out) nbc = gsl_sf_gamma($x()+$r()) / (gsl_sf_fact($x()) * gsl_sf_gamma($r())); $out() = nbc * pow($p(),$r()) * pow(1-$p(), $x()); } ', Doc => ' =for ref probability mass function for negative binomial distribution. 
=cut ', ); pp_def('mme_pareto', Pars => 'a(n); float+ [o]k(); float+ [o]xm()', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(xm) sa, min; sa = 0; min = $a(n=>0); PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); if (min > $a()) min = $a(); %} if (min > 0) { $k() = (sa - min) / ( N*( sa/N - min ) ); $xm() = (N * $k() - 1) * min / ( N * $k() ); } else { barf("min <= 0!"); } ', BadCode => ' $GENERIC(xm) sa, min; sa = 0; min = $a(n=>0); PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); if (min > $a()) min = $a(); N ++; } %} if (min > 0) { $k() = (sa - min) / ( N*( sa/N - min ) ); $xm() = (N * $k() - 1) * min / ( N * $k() ); } else { barf("min <= 0!"); } ', Doc => ' =for usage my ($k, $xm) = $data->mme_pareto(); =for ref pareto distribution. pdf: f(x; k,xm) = k xm^k / x^(k+1) for x >= xm > 0. =cut ', ); pp_def('pdf_pareto', Pars => 'x(); k(); xm(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => ' if ( $xm() > 0 && $x() >= $xm() ) { $p() = $k() * pow($xm(),$k()) / pow($x(), $k()+1); } else { barf("x >= xm > 0 please"); } ', BadCode => ' if ( $ISBAD($x()) || $ISBAD($k()) || $ISBAD($xm()) ) { $SETBAD( $p() ); } else { if ( $xm() > 0 && $x() >= $xm() ) { $p() = $k() * pow($xm(),$k()) / pow($x(), $k()+1); } else { barf("x >= xm > 0 please"); } } ', Doc => ' =for ref probability density function for pareto distribution. x >= xm > 0. =cut ', ); pp_def('mle_poisson', Pars => 'a(n); float+ [o]l();', GenericTypes => [F,D], HandleBad => 1, Code => ' $GENERIC(l) sa; sa = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} $l() = sa / N; ', BadCode => ' $GENERIC(l) sa; sa = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); N ++; } %} if (N) { $l() = sa / N; } else { $SETBAD(l()); } ', Doc => ' =for usage my $lamda = $data->mle_poisson(); =for ref poisson distribution. pmf: f(x;l) = e^(-l) * l^x / x! 
=cut ', ); pp_def('pmf_poisson', Pars => 'x(); l(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => q{ if ($x() < 0) { $p() = 0; } else if ($x() < GSL_SF_FACT_NMAX / 2) { /* Exact formula */ $p() = exp( -1 * $l()) * pow($l(),$x()) / gsl_sf_fact( (unsigned int) $x() ); } else { /* Use Stirling's approximation. See * http://en.wikipedia.org/wiki/Stirling%27s_approximation */ double log_p = $x() - $l() + $x() * log($l() / $x()) - 0.5 * log(2*M_PI * $x()) - 1. / 12. / $x() + 1 / 360. / $x()/$x()/$x() - 1. / 1260. / $x()/$x()/$x()/$x()/$x(); $p() = exp(log_p); } }, BadCode => q{ if ( $ISBAD($x()) || $ISBAD($l()) ) { $SETBAD( $p() ); } else { if ($x() < 0) { $p() = 0; } else if ($x() < GSL_SF_FACT_NMAX / 2) { /* Exact formula */ $p() = exp( -1 * $l()) * pow($l(),$x()) / gsl_sf_fact( (unsigned int) $x() ); } else { /* Use Stirling's approximation. See * http://en.wikipedia.org/wiki/Stirling%27s_approximation */ double log_p = $x() - $l() + $x() * log($l() / $x()) - 0.5 * log(2*M_PI * $x()) - 1. / 12. / $x() + 1 / 360. / $x()/$x()/$x() - 1. / 1260. / $x()/$x()/$x()/$x()/$x(); $p() = exp(log_p); } } }, Doc => q{ =for ref Probability mass function for poisson distribution. Uses Stirling's formula for x > 85. =cut }, ); pp_def('pmf_poisson_stirling', Pars => 'x(); l(); [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => q{ if ($x() < 0) { $p() = 0; } else if ($x() == 0) { $p() = exp(-$l()); } else { /* Use Stirling's approximation. See * http://en.wikipedia.org/wiki/Stirling%27s_approximation */ double log_p = $x() - $l() + $x() * log($l() / $x()) - 0.5 * log(2*M_PI * $x()) - 1. / 12. / $x() + 1 / 360. / $x()/$x()/$x() - 1. / 1260. / $x()/$x()/$x()/$x()/$x(); $p() = exp(log_p); } }, BadCode => q{ if ( $ISBAD($x()) || $ISBAD($l()) ) { $SETBAD( $p() ); } else if ($x() < 0) { $p() = 0; } else if ($x() == 0) { $p() = exp(-$l()); } else { /* Use Stirling's approximation. 
See * http://en.wikipedia.org/wiki/Stirling%27s_approximation */ double log_p = $x() - $l() + $x() * log($l() / $x()) - 0.5 * log(2*M_PI * $x()) - 1. / 12. / $x() + 1 / 360. / $x()/$x()/$x() - 1. / 1260. / $x()/$x()/$x()/$x()/$x(); $p() = exp(log_p); } }, Doc => q{ =for ref Probability mass function for poisson distribution. Uses Stirling's formula for all values of the input. See http://en.wikipedia.org/wiki/Stirling's_approximation for more info. =cut }, ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 pmf_poisson_factorial =for sig Signature: ushort x(); l(); float+ [o]p() =for ref Probability mass function for poisson distribution. Input is limited to x < 170 to avoid gsl_sf_fact() overflow. =cut *pmf_poisson_factorial = \&PDL::pmf_poisson_factorial; sub PDL::pmf_poisson_factorial { my ($x, $l) = @_; my $pdlx = pdl($x); if (any( $pdlx >= 170 )) { croak "Does not support input greater than 170. Please use pmf_poisson or pmf_poisson_stirling instead."; } else { return _pmf_poisson_factorial(@_); } } EOD pp_def('_pmf_poisson_factorial', Pars => 'ushort x(); l(); float+ [o]p()', GenericTypes => [F,D], HandleBad => 1, Code => q{ if ($x() < GSL_SF_FACT_NMAX) { $p() = exp( -1 * $l()) * pow($l(),$x()) / gsl_sf_fact( $x() ); } else { /* bail out */ $p() = 0; } }, BadCode => q{ if ( $ISBAD($x()) || $ISBAD($l()) ) { $SETBAD( $p() ); } else { if ($x() < GSL_SF_FACT_NMAX) { $p() = exp( -1 * $l()) * pow($l(),$x()) / gsl_sf_fact( $x() ); } else { $p() = 0; } } }, Doc => undef, ); pp_addpm {At=>'Bot'}, pp_line_numbers(__LINE__, <<'EOD'); =head2 plot_distr =for ref Plots data distribution. When given specific distribution(s) to fit, returns % ref to sum log likelihood and parameter values under fitted distribution(s). See FUNCTIONS above for available distributions. =for options Default options (case insensitive): MAXBN => 20, # see PDL::Graphics::PGPLOT::Window for next options WIN => undef, # pgwin object. 
not closed here if passed # allows comparing multiple distr in same plot # set env before passing WIN DEV => '/xs' , # open and close dev for plotting if no WIN # defaults to '/png' in Windows COLOR => 1, # color for data distr =for usage Usage: # yes it threads :) my $data = grandom( 500, 3 )->abs; # ll on plot is sum across 3 data curves my ($ll, $pars) = $data->plot_distr( 'gaussian', 'lognormal', {DEV=>'/png'} ); # pars are from normalized data (ie data / bin_size) print "$_\t@{$pars->{$_}}\n" for (sort keys %$pars); print "$_\t$ll->{$_}\n" for (sort keys %$ll); =cut *plot_distr = \&PDL::plot_distr; sub PDL::plot_distr { require PDL::Graphics::PGPLOT::Window; my ($self, @distr) = @_; my %opt = ( MAXBN => 20, WIN => undef, # pgwin object. not closed here if passed DEV => $DEV, # open and close default win if no WIN COLOR => 1, # color for data distr ); my $opt = pop @distr if ref $distr[-1] eq 'HASH'; $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $self = $self->squeeze; # use int range, step etc for int xvals--pmf compatible my $INT = 1 if grep { /(?:binomial)|(?:geo)|(?:nbd)|(?:poisson)/ } @distr; my ($range, $step, $step_int); $range = $self->max->sclr - $self->min->sclr; $step = $range / $opt{MAXBN}; $step_int = ($range <= $opt{MAXBN})? 
1 : PDL::ceil( $range / $opt{MAXBN} ) ; $opt{MAXBN} = PDL::ceil( $range / $step )->min->sclr; my $hist = $self->double->histogram($step, $self->min->sclr, $opt{MAXBN}); # turn fre into prob $hist /= $self->dim(0); my $xvals = $self->min->sclr + sequence( $opt{MAXBN} ) * $step; my $xvals_int = PDL::ceil($self->min->sclr) + sequence( $opt{MAXBN} ) * $step_int; $xvals_int = $xvals_int->where( $xvals_int <= $xvals->max )->sever; my $win = $opt{WIN}; if (!$win) { $win = PDL::Graphics::PGPLOT::Window::pgwin( Dev=>$opt{DEV} ); $win->env($xvals->minmax,0,1, {XTitle=>'xvals', YTitle=>'probability'}); } $win->line( $xvals, $hist, { COLOR=>$opt{COLOR} } ); if (!@distr) { $win->close unless defined $opt{WIN}; return; } my (%ll, %pars, @text, $c); $c = $opt{COLOR}; # fitted lines start from ++$c for my $distr ( @distr ) { # find mle_ or mme_$distr; my @funcs = grep { /_$distr$/ } (keys %PDL::Stats::Distr::); if (!@funcs) { carp "Do not recognize $distr distribution!"; next; } # might have mle and mme for a distr. sort so mle comes first @funcs = sort @funcs; my ($f_para, $f_prob) = @funcs[0, -1]; my $nrmd = $self / $step; eval { my @paras = $nrmd->$f_para(); $pars{$distr} = \@paras; @paras = map { $_->dummy(0) } @paras; $ll{$distr} = $nrmd->$f_prob( @paras )->log->sumover; push @text, sprintf "$distr LL = %.2f", $ll{$distr}->sum; if ($f_prob =~ /^pdf/) { $win->line( $xvals, ($xvals/$step)->$f_prob(@paras), {COLOR=>++$c} ); } else { $win->points( $xvals_int, ($xvals_int/$step_int)->$f_prob(@paras), {COLOR=>++$c} ); } }; carp $@ if $@; } $win->legend(\@text, ($xvals->min->sclr + $xvals->max->sclr)/2, .95, {COLOR=>[$opt{COLOR}+1 .. $c], TextFraction=>.75} ); $win->close unless defined $opt{WIN}; return (\%ll, \%pars); } =head1 DEPENDENCIES GSL - GNU Scientific Library =head1 SEE ALSO PDL::Graphics::PGPLOT PDL::GSL::CDF =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong , David Mertens All rights reserved. There is no warranty. 
You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.84/Distr/Makefile.PL0000644000175000017500000000100514126063750016145 0ustar osboxesosboxesuse PDL::Core::Dev; use ExtUtils::MakeMaker; PDL::Core::Dev->import(); $package = ["distr.pd",Distr,PDL::Stats::Distr,undef,1]; %hash = pdlpp_stdargs($package); chomp( my $libgsl = `gsl-config --libs` ); chomp( my $cflags = `gsl-config --cflags` ); if ($libgsl) { unshift @{ $hash{'LIBS'} }, $libgsl; $hash{'INC'} .= " $cflags"; WriteMakefile( %hash ); sub MY::postamble {pdlpp_postamble($package)}; } else { my $msg = "no GSL lib, skip $package->[-1]"; warn $msg . "\n"; write_dummy_make( $msg ); } PDL-Stats-0.84/Changes0000644000175000017500000001777114625061255014424 0ustar osboxesosboxes0.84 2024-05-27 - effect_code exception if only one unique value - fix for Distr so M_PI found - thanks @shawnlaffan 0.83 2023-04-01 - install pdldoc using PDL::Doc 0.82 2022-05-23 - dodge false-negative on NiceSlice 0.81 2022-01-11 - fix Kmeans NiceSlice for PDL 2.066 0.80 2021-10-02 - add COV option to GLM::logistic - thanks David Myers for suggestion (in 2011) - make all PLOT options default to off; only load PGPLOT when try to use - remove plot_season and dsea deprecated since 0.5.0 - effect_code and iv_cluster now don't croak on multi-dim inputs - opt in to upcoming PDL multi-C feature 0.79 2021-08-17 - update now that PDL's ->sum etc return ndarrays, not Perl scalars 0.78 2021-04-24 - update build to avoid false positives for working CDF library 0.77 2021-04-20 - moved the GSL CDF functions over to main PDL - thanks @maggiexyz! 
0.76 2020-12-19 - spelling fixes - thanks @sebastic 0.75 2016-05-30 - no change from 0.74_1 0.74_1 2016-05-28 - build updates 0.74 Fri Dec 4 22:09:47 2015 -0500 - no change from 0.73_1 0.73_1 Sun Nov 29 10:15:44 2015 -0500 - Improved support for 64bit indexing (kmx) - PDL::GSL::CDF facelift (kmx) - kmeans test workaround (kmx) - messages for various tests (kaoru) - doc-building improvements 0.72 Mon Aug 31 18:54:08 BST 2015 - changelog test in CI - remove erroneous dep on PDL::GSL::CDF 0.71 Sat Aug 29 18:54:08 BST 2015 - Beef up CI with optional requirements spelled out and apt installed - kmeans and kmeans_bad tests retry, marked TODO as fails on clang - add badges to README and rename to Markdown file - Print out kmeans_bad test for debugging. - Metadata fix - Add manifest test, remove META.yml from main MANIFEST 0.70 18/08/2015 - Updated to work with PDL-2.008. More tests for anova. 0.6.5 11/03/2013 - Updated to work with PDL-2.007. Real 64bit support yet to come. 0.6.4 09/05/2013 - Make 'cpan PDL::Stats' work. 0.6.3 05/05/2013 - binomial_test() bug fix -- [rt.cpan.org #82725] - pmf_poisson() updated to handle large input values thanks to David Mertens. - ols() bad value support update thanks to Ingo Schmid. - anova(), anova_rptd() bad value support update. 0.6.2 04/29/2012 - Windows pdldoc database install fix. 0.6.1 04/24/2012 - Bails out of installation if running under perl 5.15.0 to 5.15.8 but allows --force to attempt a build regardless. 0.6.0 04/08/2012 - Added group_by() method in Basic.pm. - Updates pdl doc database. - Bails out of installation if running under perl 5.14.0 to 5.15.8. - Removed old deprecated method get_data() from Basic.pm - Moved no slatec warning from module to test. Removed lvalue assignments from tests. 0.5.5 04/22/2011 - GLM Windows 32-bit test bug fix; GLM quits installation with PDL-2.4.7 warning. Documentation update. 0.5.4 04/19/2011 - GLM pca() and plot_scores() can do cov version. 
- [rt.cpan.org #67557] Undo PREREQ_FATAL => 1 in Makefile.PL. 0.5.3 04/18/2011 - stats_glm.t pca() test SLATEC vs. MatrixOps precision fix. 0.5.2 04/17/2011 - stats_glm.t pca() test precision fix. 0.5.1 04/16/2011 - ***GLM pca() return value names updated*** to be more compatible with the literature. - GLM added plot_scores(); plot_scree() name changed to plot_screes() for consistency across plotting functions. - Updated plotting functions to default to png in Windows because GW doesn't reopen after close. 0.5.0 04/02/2011 - Updated tests to use __DATA__ instead of separate test data files. - GLM added interaction_code(); bug fix for dummy_code(). - GLM added ols_rptd()! - GLM added plot_residual(). - GLM eval wrapped optional modules; replaced perl reverse sort with slice (-1:0). - Basic added binomial_test(). - TS updated plot_season() to season_m(); deprecated a few old function names, filt_exp(), filt_ma(), and dsea(). 0.4.3 04/08/2010 - Distr::plot_distr() bug fix. - Kmeans::kmeans() pod update. 0.4.2 03/27/2010 - rtable() added to PDL::Basic. get_data() deprecated. Preparing for transitioning to using rcols() for IO. - perldl online help support! (ie "? stdv" in perldl works) 0.4.1 02/02/2010 - GLM anova anova_rptd IVNM default val matched to []. - Makefile.PL exit 0 without PDL::Lite. - TS added wmape - pod fix for CPAN 0.4.0 01/24/2010 - GLM anova_rptd supports between subject factors! We are now beta! 0.3.2 01/08/2010 - Kmeans kmeans CNTRD checks undef besides PDL->null. opt{V} printout update - Kmeans added which_cluster - Kmeans pca_cluster does scree plot with cutoff line - GLM logistic pod update - GLM anova anova_rptd pca plot_scree plotting options update 0.3.1 12/16/2009 - Removed pms--relying on pdl-stats.sourceforge.net for web documentation instead of CPAN. Minor pod update. - GLM::ols_t removed $y->squeeze, so that edge case like ols_t on 1 obs threads properly (y[1,10]->ols_t(x[1,10]->dummy(1), {const=>0}) gives b[1,10]). 
- GLM::anova_rptd se fix. - GLM::anova and anova_rptd plot se fix. - PDL::Stats::TS included. ***alpha alpha*** code! Included because I'm tired of maintaining it separately from PDL::Stats. 0.3.0 12/03/2009 - GLM::anova_rptd! works for purely within designs - GLM::anova bug fix--not messing up caller @ if passed @ ref in IV instead of pdl - Change prereq from PDL to PDL::Lite - Stats.pm quick-start guide update - t/stats_glm logistic test modified to use more proper numbers in x 0.2.8 11/04/2009 - Fixed minor bug (mismatched nseed and self cluster pdls. oops) in switching to use cluster assignment as stop criteria. btw initial R2 now based on self instead of seeds. 0.2.7 11/04/2009 - GLM::plot_scree allows directly passing NCOMP. plots cutoff line for screes if requested - kmeans stops when cluster assignment no longer changes instead of using R2CRT. - If not specified, Kmeans::pca_cluster studies scree plot to determine NCOMP Basic::get_data STDERR message fix: data pdl o x v, NOT v x o 0.2.6 10/12/2009 - anova bad value se bug fix - centroid bad value empty cluser bug fix - basic, glm, distr rid of nan and use bad value instead 0.2.5 09/17/2009 - PREREQ_FATAL => 1 in Makefile.PL - added pca_cluster in Kmeans. - anova one-way bug fix. - plot_means one-way bug fix. 0.2.4 09/02/2009 - added corr_table in PDL::Stats::Basic. - PDL::Stats::GLM::pca uses corr_table. - kmeans no longer quits with empty cluster. kmeans 3d threading bug fix. - use short-circuit ISBAD(a) || ISBAD(b) instead of ISGOOD(a) && ISGOOD(b) in bad value processing. 0.2.3 08/24/2009 - ***anova interface change*** IV names now passed in \%opt instead of @ ref. I appologize for the interface change, but it matches the new method plot_means, and makes it easier to pass IVs themselves. I promise that such interface changes will be extremely rare, if at all. - anova now supports bad value in DV. - Added plot_means in PDL::Stats::GLM. anova plots highest order interaction by default. 
- Added plot_scree in PDL::Stats::GLM. pca does scree plot by default. - Added pca_sorti in PDL::Stats::GLM. - plot_distr in PDL::Stats::Distr accepts WIN option - Set prerequisite PDL => 2.4.4, which has bad value support by default 0.2.2 07/23/2009 - stats_distr.t moved form t/ to Distr/t, ie skip the test if no GSL. - more kmeans fix (4d, seeding). added seeded kmeans tests. 0.2.1 07/22/2009 - kmeans 4d fix. added pms dir for cpan pod parse 0.2.0 07/21/2009 - kmeans threads! ss/ms bug fix. more tests on kmeans added. 0.1.3 07/17/2009 - quick-start for non-PDL people in Stats.pm pod - ols_t uses double internally. GLM pod update - r2_change threads (switched to use ols_t instead of ols) - get_data which_id moved to Basic from GLM - tests grouped in PDL-Stats/t except for PDL::GSL::CDF 0.1.2 07/15/2009 - Cleaned up some versioning stuff in Stats.pm - GLM::get_data TYPE default to double. MISSN handling update; set to BAD by default. - Case insensitive option passing. 0.1.1 07/14/2009 - Fixed PDL::Stats::GLM::ols b_p value for b_t < 0. 0.1.0 07/13/2009 - First version, released on an unsuspecting world. PDL-Stats-0.84/GLM/0000755000175000017500000000000014625061425013532 5ustar osboxesosboxesPDL-Stats-0.84/GLM/glm.pd0000755000175000017500000021550514603563235014653 0ustar osboxesosboxespp_add_exported('', 'ols_t', 'anova', 'anova_rptd', 'dummy_code', 'effect_code', 'effect_code_w', 'interaction_code', 'ols', 'ols_rptd', 'r2_change', 'logistic', 'pca', 'pca_sorti', 'plot_means', 'plot_residuals', 'plot_screes'); pp_addpm({At=>'Top'}, <<'EOD'); use strict; use warnings; use Carp; use PDL::LiteF; use PDL::MatrixOps; use PDL::NiceSlice; use PDL::Stats::Basic; use PDL::Stats::Kmeans; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; eval { require PDL::Slatec; }; my $SLATEC = 1 if !$@; my $MATINV = $SLATEC ? \&PDL::Slatec::matinv : \&inv; my $DEV = ($^O =~ /win/i)? 
'/png' : '/xs'; =head1 NAME PDL::Stats::GLM -- general and generalized linear modeling methods such as ANOVA, linear regression, PCA, and logistic regression. =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. FUNCTIONS except B support bad value. B strongly recommended for most METHODS, and it is required for B. P-values, where appropriate, are provided if PDL::GSL::CDF is installed. =head1 SYNOPSIS use PDL::LiteF; use PDL::NiceSlice; use PDL::Stats::GLM; # do a multiple linear regression and plot the residuals my $y = pdl( 8, 7, 7, 0, 2, 5, 0 ); my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component [ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component my %m = $y->ols( $x, {plot=>1} ); print "$_\t$m{$_}\n" for (sort keys %m); =cut EOD pp_addhdr(' #include #include #include ' ); pp_def('fill_m', Pars => 'a(n); float+ [o]b(n)', Inplace => 1, GenericTypes => [F, D], HandleBad => 1, Code => ' loop (n) %{ $b() = $a(); %} ', BadCode => ' $GENERIC(b) sa, m; sa = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); N ++; } %} m = N? sa / N : 0; loop (n) %{ if ( $ISGOOD($a()) ) { $b() = $a(); } else { $b() = m; } %} ', CopyBadStatusCode => ' /* propagate badflag if inplace AND it has changed */ if ( a == b && $ISPDLSTATEBAD(a) ) PDL->propagate_badflag( b, 0 ); /* always make sure the output is "good" */ $SETPDLSTATEGOOD(b); ', Doc => ' =for ref Replaces bad values with sample mean. Mean is set to 0 if all obs are bad. Can be done inplace. =for usage perldl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] perldl> p $data->fill_m [ [ 5 3.5 2 3.5] [ 7 3 7 5.66667] ] =cut ', BadDoc => ' The output pdl badflag is cleared. 
', ); pp_def('fill_rand', Pars => 'a(n); [o]b(n)', Inplace => 1, HandleBad => 1, Code => ' loop (n) %{ $b() = $a(); %} ', BadCode => ' $GENERIC(a) *g[ $SIZE(n) ]; PDL_Indx i, j; i = 0; srand( time( NULL ) ); loop (n) %{ if ( $ISGOOD($a()) ) { g[i++] = &$a(); } %} loop (n) %{ if ( $ISGOOD($a()) ) { $b() = $a(); } else { /* XXX-FIXME works on 64bit, but rand() is quite limited */ j = (PDL_Indx) ((i-1) * (double)(rand()) / (double)(RAND_MAX) + .5); $b() = *g[j]; } %} ', CopyBadStatusCode => ' /* propagate badflag if inplace AND it has changed */ if ( a == b && $ISPDLSTATEBAD(a) ) PDL->propagate_badflag( b, 0 ); /* always make sure the output is "good" */ $SETPDLSTATEGOOD(b); ', Doc => ' =for ref Replaces bad values with random sample (with replacement) of good observations from the same variable. Can be done inplace. =for usage perldl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] perldl> p $data->fill_rand [ [5 2 2 5] [7 3 7 7] ] =cut ', BadDoc => ' The output pdl badflag is cleared. ', ); pp_def('dev_m', Pars => 'a(n); float+ [o]b(n)', Inplace => 1, GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, m; sa = 0; m = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); %} m = sa / N; loop (n) %{ $b() = $a() - m; %} ', BadCode => ' $GENERIC(b) sa, m; sa = 0; m = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); N ++; } %} m = sa / N; loop (n) %{ if ( $ISGOOD($a()) ) { $b() = $a() - m; } else { $SETBAD($b()); } %} ', Doc => ' =for ref Replaces values with deviations from the mean. Can be done inplace. =cut ', ); pp_def('stddz', Pars => 'a(n); float+ [o]b(n)', Inplace => 1, GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(b) sa, a2, m, sd; sa = 0; a2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ sa += $a(); a2 += pow($a(),2); %} m = sa / N; sd = pow( a2/N - pow(m,2), .5 ); loop (n) %{ $b() = (sd>0)? 
(($a() - m) / sd) : 0; %} ', BadCode => ' $GENERIC(b) sa, a2, m, sd; sa = 0; a2 = 0; m = 0; sd = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISGOOD($a()) ) { sa += $a(); a2 += pow($a(),2); N ++; } %} if (N) { m = sa / N; sd = pow( a2/N - pow(m,2), .5 ); loop (n) %{ if ( $ISGOOD(a()) ) { /* sd? does not work, presumably due to floating point */ $b() = (sd>0)? (($a() - m) / sd) : 0; } else { $SETBAD(b()); } %} } else { loop (n) %{ $SETBAD(b()); %} } ', Doc => ' =for ref Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0). Can be done inplace. =cut ', ); pp_def('sse', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) ss = 0; loop (n) %{ ss += pow($a() - $b(), 2); %} $c() = ss; ', BadCode => ' $GENERIC(c) ss = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { ss += pow($a() - $b(), 2); } %} $c() = ss; ', Doc => ' =for ref Sum of squared errors between actual and predicted values. =cut ', ); pp_def('mse', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) ss = 0; loop (n) %{ ss += pow($a() - $b(), 2); %} $c() = ss / $SIZE(n); ', BadCode => ' $GENERIC(c) ss = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { ss += pow($a() - $b(), 2); N ++; } %} if (N) { $c() = ss/N; } else { $SETBAD(c()); } ', Doc => ' =for ref Mean of squared errors between actual and predicted values, ie variance around predicted value. 
=cut ', ); pp_def('rmse', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) d2; d2 = 0; PDL_Indx N = $SIZE(n); loop (n) %{ d2 += pow($a() - $b(), 2); %} $c() = sqrt( d2 / N ); ', BadCode => ' $GENERIC(c) d2; d2 = 0; PDL_Indx N = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { d2 += pow($a() - $b(), 2); N ++; } %} if (N) { $c() = sqrt( d2 / N ); } else { $SETBAD(c()); } ', Doc => ' =for ref Root mean squared error, ie stdv around predicted value. =cut ', ); pp_def('pred_logistic', Pars => 'a(n,m); b(m); float+ [o]c(n)', GenericTypes => [F, D], HandleBad => 1, Code => ' loop (n) %{ $GENERIC(c) l = 0; loop (m) %{ l += $a() * $b(); %} $c() = 1 / ( 1 + exp(-l) ); %} ', BadCode => ' loop (n) %{ $GENERIC(c) l = 0; PDL_Indx bad = 0; loop (m) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { bad = 1; } else { l += $a() * $b(); } %} if (bad) { $SETBAD( $c() ); } else { $c() = 1 / ( 1 + exp(-l) ); } %} ', Doc => ' =for ref Calculates predicted prob value for logistic regression. =for usage # glue constant then apply coeff returned by the logistic method $pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} ); =cut ', ); pp_def('d0', Pars => 'a(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) p, ll; p = 0; ll = 0; PDL_Indx N = $SIZE(n); loop (n) %{ p += $a(); %} p /= N; loop (n) %{ ll += $a()? log( p ) : log( 1 - p ); %} $c() = -2 * ll; ', BadCode => ' $GENERIC(c) p, ll; p = 0; ll = 0; PDL_Indx N = 0; loop (n) %{ if ($ISGOOD( $a() )) { p += $a(); N ++; } %} if (N) { p /= N; loop (n) %{ if ($ISGOOD( $a() )) ll += $a()? log( p ) : log( 1 - p ); %} $c() = -2 * ll; } else { $SETBAD(c()); } ', Doc => ' =for usage my $d0 = $y->d0(); =for ref Null deviance for logistic regression. =cut ', ); pp_def('dm', Pars => 'a(n); b(n); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $GENERIC(c) ll; ll = 0; loop (n) %{ ll += $a()? 
log( $b() ) : log( 1 - $b() ); %} $c() = -2 * ll; ', BadCode => ' $GENERIC(c) ll; ll = 0; loop (n) %{ if ( $ISBAD($a()) || $ISBAD($b()) ) { } else { ll += $a()? log( $b() ) : log( 1 - $b() ); } %} $c() = -2 * ll; ', Doc => ' =for usage my $dm = $y->dm( $y_pred ); # null deviance my $d0 = $y->dm( ones($y->nelem) * $y->avg ); =for ref Model deviance for logistic regression. =cut ', ); pp_def('dvrs', Pars => 'a(); b(); float+ [o]c()', GenericTypes => [F, D], HandleBad => 1, Code => ' $c() = $a()? sqrt( -2 * log($b()) ) : -1 * sqrt( -2 * log(1-$b()) ) ; ', BadCode => ' if ( $ISBAD($a()) || $ISBAD($b()) ) { $SETBAD( $c() ); } else { $c() = $a()? sqrt( -2 * log($b()) ) : -1 * sqrt( -2 * log(1-$b()) ) ; } ', Doc => ' =for ref Deviance residual for logistic regression. =cut ', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); # my tmp var for PDL 2.007 slice upate my $_tmp; =head2 ols_t =for ref Threaded version of ordinary least squares regression (B). The price of threading was losing significance tests for coefficients (but see B). The fitting function was shamelessly copied then modified from PDL::Fit::Linfit. Uses PDL::Slatec when possible but otherwise uses PDL::MatrixOps. Intercept is LAST of coeff if CONST => 1. ols_t does not handle bad values. consider B or B if there are bad values. 
=for options Default options (case insensitive): CONST => 1, =for usage Usage: # DV, 2 person's ratings for top-10 box office movies # ascending sorted by box office numbers perldl> p $y = qsort ceil( random(10, 2)*5 ) [ [1 1 2 4 4 4 4 5 5 5] [1 2 2 2 3 3 3 3 5 5] ] # model with 2 IVs, a linear and a quadratic trend component perldl> $x = cat sequence(10), sequence(10)**2 # suppose our novice modeler thinks this creates 3 different models # for predicting movie ratings perldl> p $x = cat $x, $x * 2, $x * 3 [ [ [ 0 1 2 3 4 5 6 7 8 9] [ 0 1 4 9 16 25 36 49 64 81] ] [ [ 0 2 4 6 8 10 12 14 16 18] [ 0 2 8 18 32 50 72 98 128 162] ] [ [ 0 3 6 9 12 15 18 21 24 27] [ 0 3 12 27 48 75 108 147 192 243] ] ] perldl> p $x->info PDL: Double D [10,2,3] # insert a dummy dim between IV and the dim (model) to be threaded perldl> %m = $y->ols_t( $x->dummy(2) ) perldl> p "$_\t$m{$_}\n" for (sort keys %m) # 2 persons' ratings, eached fitted with 3 "different" models F [ [ 38.314159 25.087209] [ 38.314159 25.087209] [ 38.314159 25.087209] ] # df is the same across dv and iv models F_df [2 7] F_p [ [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] ] R2 [ [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] ] b [ # linear quadratic constant [ [ 0.99015152 -0.056818182 0.66363636] # person 1 [ 0.18939394 0.022727273 1.4] # person 2 ] [ [ 0.49507576 -0.028409091 0.66363636] [ 0.09469697 0.011363636 1.4] ] [ [ 0.33005051 -0.018939394 0.66363636] [ 0.063131313 0.0075757576 1.4] ] ] # our novice modeler realizes at this point that # the 3 models only differ in the scaling of the IV coefficients ss_model [ [ 20.616667 13.075758] [ 20.616667 13.075758] [ 20.616667 13.075758] ] ss_residual [ [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] ] ss_total [22.5 14.9] y_pred [ [ [0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273] ... 
=cut

*ols_t = \&PDL::ols_t;
sub PDL::ols_t { _ols_common(1, @_); }

# Shared engine behind ols() and ols_t().  First argument selects threaded
# mode ($threaded true: no bad-value handling, no per-coeff t-tests) vs the
# non-threaded ols() path.  Remaining args: $y (DV pdl), $ivs (IV pdl),
# optional trailing hash ref of options (CONST, PLOT, WIN).  Returns the
# coeff pdl in scalar context, or the full stats hash (b, b_se, b_t, b_p,
# R2, F, F_df, F_p, ss_*, y_pred) in list context.
sub _ols_common {
  my $threaded = shift;
  my $opt = pop @_ if ref $_[-1] eq 'HASH';
  # y [n], ivs [n x attr] pdl
  my ($y, $ivs) = @_;
  my %opt = (
    CONST => 1,
    PLOT  => 0,
    WIN   => undef,    # for plotting
  );
  # fold user options in, case-insensitively
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
  if (!$threaded) {
    $y = $y->squeeze;
    $y->getndims > 1 and croak "use ols_t for threaded version";
  }
  $ivs = $ivs->dummy(1) if $ivs->getndims == 1;
  # bad values are only removed (list-wise) in the non-threaded case
  ($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$threaded;
  # set up ivs and const as ivs
  $opt{CONST} and $ivs = $ivs->glue( 1, ones($ivs->dim(0)) );
  # Internally normalise data
  # (double) it or ushort y and sequence iv won't work right
  my $ymean = $y->abs->avgover->double;
  ($_tmp = $ymean->where( $ymean==0 )) .= 1;
  my $y2 = $y / ($threaded ? $ymean->dummy(0) : $ymean);
  # Do the fit
  my $Y = $ivs x $y2->dummy(0);
  my $C = &$MATINV( $ivs x $ivs->xchg(0,1) );    # avoid niceslice
  # Fitted coefficients vector
  my $coeff = PDL::squeeze( $C x $Y );
  $coeff = $coeff->dummy(0)
    if $threaded and $coeff->getndims == 1 and $y->getndims > 1;
  $coeff *= ($threaded ? $ymean->dummy(0) : $ymean);    # Un-normalise
  my %ret;
  # ***$coeff x $ivs looks nice but produces nan on successive tries***
  $ret{y_pred} = sumover( ($threaded ? $coeff->dummy(1) : $coeff) * $ivs->transpose );
  $opt{PLOT} and $y->plot_residuals( $ret{y_pred}, \%opt );
  return $coeff unless wantarray;
  $ret{ss_total} = $opt{CONST}? $y->ss : sumover( $y ** 2 );
  $ret{ss_residual} = $y->sse( $ret{y_pred} );
  $ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
  $ret{R2} = $ret{ss_model} / $ret{ss_total};
  my $n_var = $opt{CONST}? $ivs->dim(1) - 1 : $ivs->dim(1);
  $ret{F_df} = pdl( $n_var, $y->dim(0) - $ivs->dim(1) );
  $ret{F} = $ret{ss_model} / $ret{F_df}->(0) / ($ret{ss_residual} / $ret{F_df}->(1));
  # F_p only available when PDL::GSL::CDF loaded (see $CDF at top of module)
  $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog ) if $CDF;
  # per-coefficient standard errors / t-tests only in the non-threaded case
  if (!$threaded) {
    my $se_b = ones( $coeff->dims? $coeff->dims : 1 );
    $opt{CONST} and
      ($_tmp = $se_b(-1)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) * $C(-1,-1) );
    # get the se for bs by successivly regressing each iv by the rest ivs
    if ($ivs->dim(1) > 1) {
      for my $k (0 .. $n_var-1) {
        my @G = grep { $_ != $k } (0 .. $n_var-1);
        my $G = $ivs->dice_axis(1, \@G);
        $opt{CONST} and $G = $G->glue( 1, ones($ivs->dim(0)) );
        my $b_G = $ivs( ,$k)->ols( $G, {CONST=>0,PLOT=>0} );
        my $ss_res_k = $ivs( ,$k)->squeeze->sse( sumover($b_G * $G->transpose) );
        ($_tmp = $se_b($k)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / $ss_res_k );
      }
    }
    else {
      ($_tmp = $se_b(0)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / sum( $ivs( ,0)**2 ) );
    }
    $ret{b_se} = $se_b;
    $ret{b_t} = $coeff / $ret{b_se};
    $ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $ret{F_df}->(1) ) ) if $CDF;
  }
  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  $ret{b} = $coeff;
  return %ret;
}

=head2 r2_change

=for ref

Significance test for the incremental change in R2 when new variable(s) are
added to an ols regression model.

Returns the change stats as well as stats for both models.  Based on
B<ols_t>.  (One way to make up for the lack of significance tests for
coeffs in ols_t).
=for options

Default options (case insensitive):

    CONST   => 1,

=for usage

Usage:

    # suppose these are two persons' ratings for top 10 box office movies
    # ascending sorted by box office
    perldl> p $y = qsort ceil(random(10, 2) * 5)
    [
     [1 1 2 2 2 3 4 4 4 4]
     [1 2 2 3 3 3 4 4 5 5]
    ]

    # first IV is a simple linear trend
    perldl> p $x1 = sequence 10
    [0 1 2 3 4 5 6 7 8 9]

    # the modeler wonders if adding a quadratic trend improves the fit
    perldl> p $x2 = sequence(10) ** 2
    [0 1 4 9 16 25 36 49 64 81]

    # two different models are given in two pdls
    # each as would be passed to ols_t
    # the 1st model includes only linear trend
    # the 2nd model includes linear and quadratic trends
    # when necessary use dummy dim so both models have the same ndims
    perldl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )
    perldl> p "$_\t$c{$_}\n" for (sort keys %c)

    #            person 1     person 2
    F_change     [0.72164948 0.071283096]
    # df same for both persons
    F_df         [1 7]
    F_p          [0.42370145 0.79717232]
    R2_change    [0.0085966043 0.00048562549]
    model0       HASH(0x8c10828)
    model1       HASH(0x8c135c8)
    # the answer here is no.

=cut

*r2_change = \&PDL::r2_change;

# Fits the reduced model $ivs0 and the full model $ivs1 to $self via ols_t,
# then F-tests the R2 increment.  Returns a hash with R2_change, F_change,
# F_df, F_p (when PDL::GSL::CDF is available) plus the two complete model
# hashes in model0 / model1.
sub PDL::r2_change {
  my ($self, $ivs0, $ivs1, $opt) = @_;
  $ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);
  my %ret;
  $ret{model0} = { $self->ols_t( $ivs0, $opt ) };
  $ret{model1} = { $self->ols_t( $ivs1, $opt ) };
  $ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
  # df numerator: number of added predictors; denominator: from full model
  $ret{F_df} = pdl($ivs1->dim(1) - $ivs0->dim(1), $ret{model1}->{F_df}->((1)) );
  $ret{F_change}
    = $ret{R2_change} * $ret{F_df}->((1))
    / ( (1-$ret{model1}->{R2}) * $ret{F_df}->((0)) );
  $ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $ret{F_df}->dog ) if $CDF;
  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  return %ret;
}

=head1 METHODS

=head2 anova

=for ref

Analysis of variance.  Uses type III sum of squares for unbalanced data.

Dependent variable should be a 1D pdl.  Independent variables can be passed
as 1D perl array ref or 1D pdl.
Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. Supports bad value (by ignoring missing or BAD values in dependent and independent variables list-wise). =for options Default options (case insensitive): V => 1, # carps if bad value in variables IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] PLOT => 0, # plots highest order effect # can set plot_means options here WIN => undef, # for plotting =for usage Usage: # suppose this is ratings for 12 apples perldl> p $y = qsort ceil( random(12)*5 ) [1 1 2 2 2 3 3 4 4 4 5 5] # IV for types of apple perldl> p $a = sequence(12) % 3 + 1 [1 2 3 1 2 3 1 2 3 1 2 3] # IV for whether we baked the apple perldl> @b = qw( y y y y y y n n n n n n ) perldl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } ) perldl> p "$_\t$m{$_}\n" for (sort keys %m) # apple # m [ [2.5 3 3.5] ] # apple # se [ [0.64549722 0.91287093 0.64549722] ] # apple ~ bake # m [ [1.5 1.5 2.5] [3.5 4.5 4.5] ] # apple ~ bake # se [ [0.5 0.5 0.5] [0.5 0.5 0.5] ] # bake # m [ [ 1.8333333 4.1666667] ] # bake # se [ [0.30731815 0.30731815] ] F 7.6 F_df [5 6] F_p 0.0141586545851857 ms_model 3.8 ms_residual 0.5 ss_model 19 ss_residual 3 ss_total 22 | apple | F 2 | apple | F_df [2 6] | apple | F_p 0.216 | apple | ms 1 | apple | ss 2 | apple ~ bake | F 0.666666666666667 | apple ~ bake | F_df [2 6] | apple ~ bake | F_p 0.54770848985725 | apple ~ bake | ms 0.333333333333334 | apple ~ bake | ss 0.666666666666667 | bake | F 32.6666666666667 | bake | F_df [1 6] | bake | F_p 0.00124263849516693 | bake | ms 16.3333333333333 | bake | ss 16.3333333333333 =cut *anova = \&PDL::anova; sub PDL::anova { my ($y, @ivs_raw) = @_; anova_rptd($y, undef, @ivs_raw); } sub _effect_code_ivs { my $ivs = shift; my (@i_iv, @i_cmo); for (@$ivs) { my ($e, $map) = effect_code($_->squeeze); my $var = ($e->getndims == 1)? 
$e->dummy(1) : $e; push @i_iv, $var; my @indices = sort { $a<=>$b } values %$map; push @i_cmo, pdl @indices; } return \@i_iv, \@i_cmo; } sub _add_interactions { my ($var_ref, $i_cmo_ref, $idv, $raw_ref) = @_; # append info re inter to main effects my (@inter, @idv_inter, @inter_cm, @inter_cmo); for my $nway ( 2 .. @$var_ref ) { my $iter_idv = _combinations( $nway, [0..$#$var_ref] ); while ( my @v = &$iter_idv() ) { my $i = ones( $var_ref->[0]->dim(0), 1 ); for (@v) { $i = $i * $var_ref->[$_]->dummy(1); $i = $i->clump(1,2); } push @inter, $i; my $e = join( ' ~ ', @$idv[@v] ); push @idv_inter, $e; # now prepare for cell mean my @i_cm = (); for my $o ( 0 .. $raw_ref->[0]->dim(0) - 1 ) { my @cell = map { $_($o)->squeeze } @$raw_ref[@v]; push @i_cm, join('', @cell); } my ($inter, $map) = effect_code( \@i_cm ); push @inter_cm, $inter; # get the order to put means in correct multi dim pdl pos # this is order in var_e dim(1) my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map; # this is order needed for cell mean my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) } 0 .. 
$#levels; push @inter_cmo, pdl @i_cmo; } } # append info re inter to main effects return ([@$var_ref, @inter], [@$i_cmo_ref, @inter_cmo], [@$idv, @idv_inter], [@$var_ref, @inter_cm] ); } sub _cell_means { my ($data, $ivs_ref, $i_cmo_ref, $ids, $raw_ref) = @_; my %ind_id; @ind_id{ @$ids } = 0..$#$ids; my %cm; my $i = 0; for (@$ivs_ref) { confess "_cell_means passed empty ivs_ref ndarray at pos $i" if $_->isempty; my $last = zeroes $_->dim(0); my $i_neg = which $_( ,0) == -1; ($_tmp = $last($i_neg)) .= 1; ($_tmp = $_->where($_ == -1)) .= 0; $_ = $_->glue(1, $last); my @v = split ' ~ ', $ids->[$i]; my @shape = map { $raw_ref->[$_]->uniq->nelem } @ind_id{@v}; my ($m, $ss) = $data->centroid( $_ ); $m = $m($i_cmo_ref->[$i])->sever; $ss = $ss($i_cmo_ref->[$i])->sever; $m = $m->reshape(@shape); $m->getndims == 1 and $m = $m->dummy(1); my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape); $se->getndims == 1 and $se = $se->dummy(1); $cm{ "# $ids->[$i] # m" } = $m; $cm{ "# $ids->[$i] # se" } = $se; $i++; } return \%cm; } # http://www.perlmonks.org/?node_id=371228 sub _combinations { my ($num, $arr) = @_; return sub { return } if $num == 0 or $num > @$arr; my @pick; return sub { return @$arr[ @pick = ( 0 .. $num - 1 ) ] unless @pick; my $i = $#pick; $i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i; return if $i < 0; @pick[$i .. $#pick] = $pick[$i] .. $#$arr; return @$arr[@pick]; }; } =head2 anova_rptd Repeated measures and mixed model anova. Uses type III sum of squares. The standard error (se) for the means are based on the relevant mean squared error from the anova, ie it is pooled across levels of the effect. Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. anova_rptd supports bad value in the dependent and independent variables. It automatically removes bad data listwise, ie remove a subject's data if there is any cell missing for the subject. 
Default options (case insensitive): V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] BTWN => [], # indices of between-subject IVs (matches IVNM indices) PLOT => 0, # plots highest order effect # see plot_means() for more options WIN => undef, # for plotting Usage: Some fictional data: recall_w_beer_and_wings.txt Subject Beer Wings Recall Alex 1 1 8 Alex 1 2 9 Alex 1 3 12 Alex 2 1 7 Alex 2 2 9 Alex 2 3 12 Brian 1 1 12 Brian 1 2 13 Brian 1 3 14 Brian 2 1 9 Brian 2 2 8 Brian 2 3 14 ... # rtable allows text only in 1st row and col my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt'; my ($b, $w, $dv) = $data->dog; # subj and IVs can be 1d pdl or @ ref # subj must be the first argument my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} ); print "$_\t$m{$_}\n" for (sort keys %m); # Beer # m [ [ 10.916667 8.9166667] ] # Beer # se [ [ 0.4614791 0.4614791] ] # Beer ~ Wings # m [ [ 10 7] [ 10.5 9.25] [12.25 10.5] ] # Beer ~ Wings # se [ [0.89170561 0.89170561] [0.89170561 0.89170561] [0.89170561 0.89170561] ] # Wings # m [ [ 8.5 9.875 11.375] ] # Wings # se [ [0.67571978 0.67571978 0.67571978] ] ss_residual 19.0833333333333 ss_subject 24.8333333333333 ss_total 133.833333333333 | Beer | F 9.39130434782609 | Beer | F_p 0.0547977008378944 | Beer | df 1 | Beer | ms 24 | Beer | ss 24 | Beer || err df 3 | Beer || err ms 2.55555555555556 | Beer || err ss 7.66666666666667 | Beer ~ Wings | F 0.510917030567687 | Beer ~ Wings | F_p 0.623881438624431 | Beer ~ Wings | df 2 | Beer ~ Wings | ms 1.625 | Beer ~ Wings | ss 3.25000000000001 | Beer ~ Wings || err df 6 | Beer ~ Wings || err ms 3.18055555555555 | Beer ~ Wings || err ss 19.0833333333333 | Wings | F 4.52851711026616 | Wings | F_p 0.0632754786153548 | Wings | df 2 | Wings | ms 16.5416666666667 | Wings | ss 33.0833333333333 | Wings || err df 6 | Wings || err ms 3.65277777777778 | Wings || err ss 21.9166666666667 For mixed model anova, ie when there are between-subject IVs 
involved, feed the IVs as above, but specify in BTWN which IVs are between-subject. For example, if we had added age as a between-subject IV in the above example, we would do my %m = $dv->anova_rptd( $subj, $age, $b, $w, { ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] }); =cut *anova_rptd = \&PDL::anova_rptd; sub PDL::anova_rptd { my $opt = pop @_ if ref $_[-1] eq 'HASH'; my ($y, $subj, @ivs_raw) = @_; confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1; croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?" if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem); for (@ivs_raw) { croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1 } my %opt = ( V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] ( !defined($subj) ? () : ( BTWN => [], # indices of between-subject IVs (matches IVNM indices) )), PLOT => 0, # plots highest order effect WIN => undef, # for plotting ); $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ] if !$opt{IVNM} or !@{ $opt{IVNM} }; my @idv = @{ $opt{IVNM} }; my %ret; $y = $y->squeeze; my @pdl_ivs_raw = map scalar PDL::Stats::Basic::_array_to_pdl($_), @ivs_raw; my $pdl_ivs_raw = pdl \@pdl_ivs_raw; # explicit set badflag because pdl() removes badflag $pdl_ivs_raw->badflag( scalar grep { $_->badflag } @pdl_ivs_raw ); my $sj; if (defined($subj)) { # delete bad data listwise ie remove subj if any cell missing $sj = PDL::Stats::Basic::_array_to_pdl($subj); my $ibad = which( $y->isbad | nbadover($pdl_ivs_raw->transpose) ); my $sj_bad = $sj($ibad)->uniq; if ($sj_bad->nelem) { print STDERR $sj_bad->nelem . 
" subjects with missing data removed\n" if $opt{V}; $sj = $sj->setvaltobad($_) for (list $sj_bad); my $igood = which $sj->isgood; for ($y, $sj, @pdl_ivs_raw) { $_ = $_( $igood )->sever; $_->badflag(0); } } } else { ($y, $pdl_ivs_raw) = _rm_bad_value( $y, $pdl_ivs_raw ); if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) { printf STDERR "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem; } # dog preserves data flow @pdl_ivs_raw = map {$_->copy} $pdl_ivs_raw->dog; } # code for ivs and cell mean in diff @s: effect_code vs iv_cluster my ($ivs_ref, $i_cmo_ref) = _effect_code_ivs( \@pdl_ivs_raw ); ($ivs_ref, $i_cmo_ref, my( $idv, $ivs_cm_ref)) = _add_interactions( $ivs_ref, $i_cmo_ref, \@idv, \@pdl_ivs_raw ); # matches $ivs_ref, with an extra last pdl for subj effect my $err_ref = defined($subj) ? _add_errors( $sj, $ivs_ref, $idv, \@pdl_ivs_raw, \%opt ) : []; # stitch together my $ivs = PDL->null->glue( 1, @$ivs_ref, grep defined($_) && ref($_), @$err_ref); $ivs = $ivs->glue(1, ones $ivs->dim(0)); my $b_full = $y->ols_t( $ivs, {CONST=>0} ); $ret{ss_total} = $y->ss; $ret{ss_residual} = $y->sse( sumover( $b_full * $ivs->xchg(0,1) ) ); if (defined $subj) { my @full = (@$ivs_ref, @$err_ref); EFFECT: for my $k (0 .. $#full) { my $e = ($k > $#$ivs_ref)? '| err' : ''; my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k; if (!defined $full[$k]) { # ss_residual as error $ret{ "| $idv->[$i] |$e ss" } = $ret{ss_residual}; # highest ord inter for purely within design, (p-1)*(q-1)*(n-1) $ret{ "| $idv->[$i] |$e df" } = pdl(map { $_->dim(1) } @full[0 .. $#ivs_raw])->prodover; $ret{ "| $idv->[$i] |$e df" } *= ref($full[-1])? $full[-1]->dim(1) : $err_ref->[$err_ref->[-1]]->dim(1) ; $ret{ "| $idv->[$i] |$e ms" } = $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; } elsif (ref $full[$k]) { # unique error term my (@G, $G, $b_G); @G = grep { $_ != $k and defined $full[$_] } (0 .. 
$#full); next EFFECT unless @G; $G = PDL->null->glue( 1, grep { ref $_ } @full[@G] ); $G = $G->glue(1, ones $G->dim(0)); $b_G = $y->ols_t( $G, {CONST=>0} ); if ($k == $#full) { $ret{ss_subject} = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual}; } else { $ret{ "| $idv->[$i] |$e ss" } = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual}; $ret{ "| $idv->[$i] |$e df" } = $full[$k]->dim(1); $ret{ "| $idv->[$i] |$e ms" } = $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; } } else { # repeating error term if ($k == $#full) { $ret{ss_subject} = $ret{"| $idv->[$full[$k]] |$e ss"}; } else { $ret{ "| $idv->[$i] |$e ss" } = $ret{"| $idv->[$full[$k]] |$e ss"}; $ret{ "| $idv->[$i] |$e df" } = $ret{"| $idv->[$full[$k]] |$e df"}; $ret{ "| $idv->[$i] |$e ms" } = $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; } } } # have all iv, inter, and error effects. get F and F_p for (0 .. $#$ivs_ref) { $ret{ "| $idv->[$_] | F" } = $ret{ "| $idv->[$_] | ms" } / $ret{ "| $idv->[$_] || err ms" }; $ret{ "| $idv->[$_] | F_p" } = 1 - $ret{ "| $idv->[$_] | F" }->gsl_cdf_fdist_P( $ret{ "| $idv->[$_] | df" }, $ret{ "| $idv->[$_] || err df" } ) if $CDF and $ret{ "| $idv->[$_] || err df" } > 0; } } else { $ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; $ret{F_df} = pdl($ivs->dim(1) - 1, $y->nelem - ($ivs->dim(1) - 1) -1); $ret{ms_model} = $ret{ss_model} / $ret{F_df}->(0); $ret{ms_residual} = $ret{ss_residual} / $ret{F_df}->(1); $ret{F} = $ret{ms_model} / $ret{ms_residual}; $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog ) if $CDF and $ret{F_df}->(1) > 0; # get IV ss from $ivs_ref instead of $ivs pdl for my $k (0 .. $#$ivs_ref) { my ($G); my @G = grep $_ != $k, 0 .. 
$#$ivs_ref; if (@G) { $G = PDL->null->glue( 1, @$ivs_ref[@G] ); $G = $G->glue(1, ones $G->dim(0)); } else { $G = ones( $y->dim(0) ); } my $b_G = $y->ols_t( $G, {CONST=>0} ); $ret{ "| $idv->[$k] | ss" } = $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual}; $ret{ "| $idv->[$k] | F_df" } = pdl( $ivs_ref->[$k]->dim(1), $ret{F_df}->(1)->copy )->squeeze; $ret{ "| $idv->[$k] | ms" } = $ret{ "| $idv->[$k] | ss" } / $ret{ "| $idv->[$k] | F_df" }->(0); $ret{ "| $idv->[$k] | F" } = $ret{ "| $idv->[$k] | ms" } / $ret{ms_residual}; $ret{ "| $idv->[$k] | F_p" } = 1 - $ret{ "| $idv->[$k] | F" }->gsl_cdf_fdist_P( $ret{ "| $idv->[$k] | F_df" }->dog ) if $CDF and $ret{ "| $idv->[$k] | F_df" }->(1) > 0; } } for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze}; my $cm_ref = _cell_means( $y, $ivs_cm_ref, $i_cmo_ref, $idv, \@pdl_ivs_raw ); if (defined $subj) { my @ls = map { $_->uniq->nelem } @pdl_ivs_raw; $cm_ref = _fix_rptd_se( $cm_ref, \%ret, $opt{'IVNM'}, \@ls, $sj->uniq->nelem ); } # integrate mean and se into %ret @ret{ keys %$cm_ref } = values %$cm_ref; my $highest = join(' ~ ', @{ $opt{IVNM} }); $cm_ref->{"# $highest # m"}->plot_means( $cm_ref->{"# $highest # se"}, { %opt, IVNM=>$idv } ) if $opt{PLOT}; return %ret; } sub _add_errors { my ($subj, $ivs_ref, $idv, $raw_ivs, $opt) = @_; # code (btwn group) subjects. Rutherford (2001) pp 101-102 my (@grp, %grp_s); for my $n (0 .. $subj->nelem - 1) { # construct string to code group membership # something not treated as BAD by _array_to_pdl to start off marking group membership # if no $opt->{BTWN}, everyone ends up in the same grp my $s = '_'; $s .= $_->($n) for (@$raw_ivs[@{ $opt->{BTWN} }]); push @grp, $s; # group membership $s .= $subj($n); # keep track of total uniq subj $grp_s{$s} = 1; } my $grp = PDL::Stats::Kmeans::iv_cluster \@grp; my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1); my ($d0, $d1) = (0, 0); for my $g (0 .. 
$grp->dim(1)-1) { my $gsub = $subj( which $grp( ,$g) )->effect_code; my ($nobs, $nsub) = $gsub->dims; ($_tmp = $spdl($d0:$d0+$nobs-1, $d1:$d1+$nsub-1)) .= $gsub; $d0 += $nobs; $d1 += $nsub; } # if btwn factor involved, or highest order inter for within factors # elem is undef, so that # @errors ind matches @$ivs_ref, with an extra elem at the end for subj # mark btwn factors for error terms # same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p98) my @qr = map { "(?:$idv->[$_])" } @{ $opt->{BTWN} }; my $qr = join('|', @qr); my $ie_subj; my @errors = map { my @fs = split ' ~ ', $idv->[$_]; # separate bw and wn factors # if only bw, error is bw x subj # if only wn or wn and bw, error is wn x subj my (@wn, @bw); if ($qr) { for (@fs) { /$qr/? push @bw, $_ : push @wn, $_; } } else { @wn = @fs; } $ie_subj = defined($ie_subj)? $ie_subj : $_ if !@wn; my $err = @wn? join(' ~ ', @wn) : join(' ~ ', @bw); my $ie; # mark repeating error term for my $i (0 .. $#$ivs_ref) { if ($idv->[$i] eq $err) { $ie = $i; last; } } # highest order inter of within factors, use ss_residual as error if ( @wn == @$raw_ivs - @{$opt->{BTWN}} ) { undef } # repeating btwn factors use ss_subject as error elsif (!@wn and $_ > $ie_subj) { $ie_subj } # repeating error term elsif ($_ > $ie) { $ie } else { PDL::clump($ivs_ref->[$_] * $spdl->dummy(1), 1,2) } } 0 .. $#$ivs_ref; @{$opt->{BTWN}}? push @errors, $ie_subj : push @errors, $spdl; return \@errors; } sub _fix_rptd_se { # if ivnm lvls_ref for within ss only this can work for mixed design my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_; my @se = grep /se$/, keys %$cm_ref; @se = map { /^# (.+?) # se$/; $1; } @se; my @n_obs = map { my @ivs = split / ~ /, $_; my $i_ivs = which_id $ivnm, \@ivs; my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs; my $collapsed = $icollapsed->nelem? pdl( @$lvls_ref[(list $icollapsed)] )->prodover : 1 ; $n * $collapsed; } @se; for my $i (0 .. 
$#se) { ($_tmp = $cm_ref->{"# $se[$i] # se"}) .= sqrt( $ret->{"| $se[$i] || err ms"} / $n_obs[$i] ); } return $cm_ref; } =head2 dummy_code =for ref Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression. Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). =for usage perldl> @a = qw(a a a b b b c c c) perldl> p $a = dummy_code(\@a) [ [1 1 1 0 0 0 0 0 0] [0 0 0 1 1 1 0 0 0] ] =cut *dummy_code = \&PDL::dummy_code; sub PDL::dummy_code { my ($var_ref) = @_; my $var_e = effect_code( $var_ref ); ($_tmp = $var_e->where( $var_e == -1 )) .= 0; return $var_e; } =head2 effect_code =for ref Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for use in regression. returns in @ context coded pdl and % ref to level - pdl->dim(1) index. Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). =for usage my @var = qw( a a a b b b c c c ); my ($var_e, $map) = effect_code( \@var ); print $var_e . $var_e->info . "\n"; [ [ 1 1 1 0 0 0 -1 -1 -1] [ 0 0 0 1 1 1 -1 -1 -1] ] PDL: Double D [9,2] print "$_\t$map->{$_}\n" for (sort keys %$map) a 0 b 1 c 2 =cut *effect_code = \&PDL::effect_code; sub PDL::effect_code { my ($var_ref) = @_; my ($var, $map_ref) = PDL::Stats::Basic::_array_to_pdl( $var_ref ); my $var_max = $var->max; confess "effect_code called with only one unique value" if $var_max < 1; my $var_e = yvals( float, $var->nelem, $var_max ) == $var; ($_tmp = $var_e(which( $var == $var_max ), )) .= -1; $var_e = $var_e->setbadif( $var->isbad ) if $var->badflag; return wantarray? ($var_e, $map_ref) : $var_e; } =head2 effect_code_w =for ref Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index. Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). 
=for usage

    perldl> @a = qw( a a b b b c c )
    perldl> p $a = effect_code_w(\@a)
    [
     [   1    1    0    0    0   -1   -1]
     [   0    0    1    1    1 -1.5 -1.5]
    ]

=cut

*effect_code_w = \&PDL::effect_code_w;

# Weighted effect coding: like effect_code(), but the reference level's -1
# entries are scaled by (n coded level / n reference level) so each column
# sums to zero with unequal cell sizes.  Already-balanced codes (columns
# summing to zero) are returned unchanged.
sub PDL::effect_code_w {
  my ($var_ref) = @_;
  my ($var_e, $map_ref) = effect_code( $var_ref );
  # balanced design -- nothing to reweight
  return wantarray ? ($var_e, $map_ref) : $var_e if $var_e->sum == 0;
  my $pos = $var_e == 1;
  my $neg = $var_e == -1;
  my $w = $pos->sumover / $neg->sumover;    # per-column weight
  my $neg_ind = $neg->whichND;
  ($_tmp = $var_e->indexND($neg_ind)) *= $w($neg_ind((1)));
  return wantarray ? ($var_e, $map_ref) : $var_e;
}

=head2 interaction_code

Returns the coded interaction term for effect-coded variables. Supports BAD
value (missing or 'BAD' values result in the corresponding pdl elements
being marked as BAD).

=for usage

    perldl> $a = sequence(6) > 2
    perldl> p $a = $a->effect_code
    [
     [ 1  1  1 -1 -1 -1]
    ]

    perldl> $b = pdl( qw( 0 1 2 0 1 2 ) )
    perldl> p $b = $b->effect_code
    [
     [ 1  0 -1  1  0 -1]
     [ 0  1 -1  0  1 -1]
    ]

    perldl> p $ab = interaction_code( $a, $b )
    [
     [ 1  0 -1 -1 -0  1]
     [ 0  1 -1 -0 -1  1]
    ]

=cut

*interaction_code = \&PDL::interaction_code;

# Takes a list of effect-coded pdls and returns the column-wise products of
# every combination of their columns, ie the interaction contrast codes.
sub PDL::interaction_code {
  my @vars = @_;
  my $i = ones( $vars[0]->dim(0), 1 );
  for (@vars) {
    # outer product across the two coding dims, then collapse back to 2D
    $i = $i * $_->dummy(1);
    $i = $i->clump(1,2);
  }
  return $i;
}

=head2 ols

=for ref

Ordinary least squares regression, aka linear regression.

Unlike B<ols_t>, ols is not threadable, but it can handle bad value (by
ignoring observations with bad value in dependent or independent variables
list-wise) and returns the full model in list context with various stats.

IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
the constant vector in $x. Intercept is automatically added and returned as
LAST of the coeffs if CONST=>1. Returns full model in list context and
coeff in scalar context.
=for options

Default options (case insensitive):

    CONST => 1,
    PLOT  => 0,       # see plot_residuals() for plot options
    WIN   => undef,   # for plotting

=for usage

Usage:

    # suppose this is a person's ratings for top 10 box office movies
    # ascending sorted by box office
    perldl> p $y = qsort ceil( random(10) * 5 )
    [1 1 2 2 2 2 4 4 5 5]

    # construct IV with linear and quadratic component
    perldl> p $x = cat sequence(10), sequence(10)**2
    [
     [ 0  1  2  3  4  5  6  7  8  9]
     [ 0  1  4  9 16 25 36 49 64 81]
    ]

    perldl> %m = $y->ols( $x )
    perldl> p "$_\t$m{$_}\n" for (sort keys %m)
    F       40.4225352112676
    F_df    [2 7]
    F_p     0.000142834216344756
    R2      0.920314253647587

    # coeff   linear     quadratic  constant
    b        [0.21212121 0.03030303 0.98181818]
    b_p      [0.32800118 0.20303404 0.039910509]
    b_se     [0.20174693 0.021579989 0.38987581]
    b_t      [ 1.0514223   1.404219  2.5182844]
    ss_model        19.8787878787879
    ss_residual     1.72121212121212
    ss_total        21.6
    y_pred  [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545]

=cut

*ols = \&PDL::ols;

# Non-threaded front end; the actual fit lives in _ols_common.
sub PDL::ols { _ols_common(0, @_); }

# List-wise removal of bad values from DV $y and IV matrix $ivs.  Returns
# the cleaned ($y, $ivs) plus the index pdl of retained observations ($idx
# stays undef when neither input has its badflag set).
sub _rm_bad_value {
  my ($y, $ivs) = @_;
  my $idx;
  if ($y->check_badflag or $ivs->check_badflag) {
    # keep rows where the DV is good and no IV element is bad
    $idx = which(($y->isbad==0) & (nbadover ($ivs->transpose)==0));
    $y = $y($idx)->sever;
    $ivs = $ivs($idx,)->sever;
    $ivs->badflag(0);
    $y->badflag(0);
  }
  return $y, $ivs, $idx;
}

=head2 ols_rptd

=for ref

Repeated measures linear regression (Lorch & Myers, 1990; Van den Noortgate
& Onghena, 2006). Handles purely within-subject design for now.

See t/stats_ols_rptd.t for an example using the Lorch and Myers data.
=for usage Usage: # This is the example from Lorch and Myers (1990), # a study on how characteristics of sentences affected reading time # Three within-subject IVs: # SP -- serial position of sentence # WORDS -- number of words in sentence # NEW -- number of new arguments in sentence # $subj can be 1D pdl or @ ref and must be the first argument # IV can be 1D @ ref or pdl # 1D @ ref is effect coded internally into pdl # pdl is left as is my %r = $rt->ols_rptd( $subj, $sp, $words, $new ); print "$_\t$r{$_}\n" for (sort keys %r); (ss_residual) 58.3754646504336 (ss_subject) 51.8590337714286 (ss_total) 405.188241771429 # SP WORDS NEW F [ 7.208473 61.354153 1.0243311] F_p [0.025006181 2.619081e-05 0.33792837] coeff [0.33337285 0.45858933 0.15162986] df [1 1 1] df_err [9 9 9] ms [ 18.450705 73.813294 0.57026483] ms_err [ 2.5595857 1.2030692 0.55671923] ss [ 18.450705 73.813294 0.57026483] ss_err [ 23.036272 10.827623 5.0104731] =cut *ols_rptd = \&PDL::ols_rptd; sub PDL::ols_rptd { my ($y, $subj, @ivs_raw) = @_; $y = $y->squeeze; $y->getndims > 1 and croak "ols_rptd does not support threading"; my @ivs = map { (ref $_ eq 'PDL' and $_->ndims > 1)? $_ : ref $_ eq 'PDL' ? $_->dummy(1) : scalar effect_code($_) ; } @ivs_raw; my %r; $r{'(ss_total)'} = $y->ss; # STEP 1: subj my $s = effect_code $subj; # gives same results as dummy_code my $b_s = $y->ols_t($s); my $pred = sumover($b_s(0:-2) * $s->transpose) + $b_s(-1); $r{'(ss_subject)'} = $r{'(ss_total)'} - $y->sse( $pred ); # STEP 2: add predictor variables my $iv_p = $s->glue(1, @ivs); my $b_p = $y->ols_t($iv_p); # only care about coeff for predictor vars. no subj or const coeff $r{coeff} = $b_p(-(@ivs+1) : -2)->sever; # get total sse for this step $pred = sumover($b_p(0:-2) * $iv_p->transpose) + $b_p(-1); my $ss_pe = $y->sse( $pred ); # get predictor ss by successively reducing the model $r{ss} = zeroes scalar(@ivs); for my $i (0 .. $#ivs) { my @i_rest = grep { $_ != $i } 0 .. 
$#ivs; my $iv = $s->glue(1, @ivs[ @i_rest ]); my $b = $y->ols_t($iv); $pred = sumover($b(0:-2) * $iv->transpose) + $b(-1); ($_tmp = $r{ss}->($i)) .= $y->sse($pred) - $ss_pe; } # STEP 3: get precitor x subj interaction as error term my $iv_e = PDL::glue 1, map { interaction_code( $s, $_ ) } @ivs; # get total sse for this step. full model now. my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) ); $pred = sumover($b_f(0:-2) * $iv_p->glue(1,$iv_e)->transpose) + $b_f(-1); $r{'(ss_residual)'} = $y->sse( $pred ); # get predictor x subj ss by successively reducing the error term $r{ss_err} = zeroes scalar(@ivs); for my $i (0 .. $#ivs) { my @i_rest = grep { $_ != $i } 0 .. $#ivs; my $e_rest = PDL::glue 1, map { interaction_code( $s, $_ ) } @ivs[@i_rest]; my $iv = $iv_p->glue(1, $e_rest); my $b = $y->ols_t($iv); my $pred = sumover($b(0:-2) * $iv->transpose) + $b(-1); ($_tmp = $r{ss_err}->($i)) .= $y->sse($pred) - $r{'(ss_residual)'}; } # Finally, get MS, F, etc $r{df} = pdl( map { $_->squeeze->ndims } @ivs ); $r{ms} = $r{ss} / $r{df}; $r{df_err} = $s->dim(1) * $r{df}; $r{ms_err} = $r{ss_err} / $r{df_err}; $r{F} = $r{ms} / $r{ms_err}; $r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} ) if $CDF; return %r; } =head2 logistic =for ref Logistic regression with maximum likelihood estimation using PDL::Fit::LM (requires PDL::Slatec. Hence loaded with "require" in the sub instead of "use" at the beginning). IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply the constant vector in $x. It is included in the model and returned as LAST of coeff. Returns full model in list context and coeff in scalar context. The significance tests are likelihood ratio tests (-2LL deviance) tests. IV significance is tested by comparing deviances between the reduced model (ie with the IV in question removed) and the full model. 
***NOTE: the results here are qualitatively similar to but not identical
with results from R, because different algorithms are used for the
nonlinear parameter fit. Use with discretion***

=for options

Default options (case insensitive):

    INITP => zeroes( $x->dim(1) + 1 ),    # n_iv + 1
    MAXIT => 1000,
    EPS   => 1e-7,

=for usage

Usage:

    # suppose this is whether a person had rented 10 movies

    perldl> p $y = ushort( random(10)*2 )
    [0 0 0 1 1 0 0 1 1 1]

    # IV 1 is box office ranking

    perldl> p $x1 = sequence(10)
    [0 1 2 3 4 5 6 7 8 9]

    # IV 2 is whether the movie is action- or chick-flick

    perldl> p $x2 = sequence(10) % 2
    [0 1 0 1 0 1 0 1 0 1]

    # concatenate the IVs together

    perldl> p $x = cat $x1, $x2
    [
     [0 1 2 3 4 5 6 7 8 9]
     [0 1 0 1 0 1 0 1 0 1]
    ]

    perldl> %m = $y->logistic( $x )

    perldl> p "$_\t$m{$_}\n" for (sort keys %m)

    D0	13.8629436111989
    Dm	9.8627829791575
    Dm_chisq	4.00016063204141
    Dm_df	2
    Dm_p	0.135324414081692
    #     ranking    genre      constant
    b	[0.41127706 0.53876358 -2.1201285]
    b_chisq	[ 3.5974504 0.16835559  2.8577151]
    b_p	[0.057868258  0.6815774 0.090936587]
    iter	12
    y_pred	[0.10715577 0.23683909 ... 0.76316091 0.89284423]

    # to get the covariance out, supply a true value for the COV option:
    perldl> %m = $y->logistic( $x, {COV=>1} )
    perldl> p $m{cov};

=cut

*logistic = \&PDL::logistic;

# Maximum-likelihood logistic regression via Levenberg-Marquardt fit of the
# inverse-logit function. Returns coeff in scalar context, full model
# (deviances, per-coefficient LR chi-squares, predictions, etc) in list
# context.
sub PDL::logistic {
  require PDL::Fit::LM;        # uses PDL::Slatec

  my ( $self, $ivs, $opt ) = @_;

  $self = $self->squeeze;
  # make compatible w multiple var cases
  $ivs->getndims == 1 and $ivs = $ivs->dummy(1);
  $self->dim(0) != $ivs->dim(0) and
    carp "mismatched n btwn DV and IV!";

  my %opt = (
    INITP => zeroes( $ivs->dim(1) + 1 ),    # n_ivs + 1
    MAXIT => 1000,
    EPS   => 1e-7,
  );
  # FIX: iterate over keys only. The original "for (%$opt)" walked the hash
  # in list context (key, value, key, value ...), polluting %opt with junk
  # keys made from uc'ed VALUES. Every sibling sub here uses "keys %$opt".
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);

  # not using it atm
  $opt{WT} = 1;

  # Use lmfit. Fourth input argument is reference to user-defined
  # copy INITP so we have the original value when needed
  my ($yfit,$coeff,$cov,$iter)
    = PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy,
      { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );

  # apparently at least coeff is child of some pdl
  # which is changed in later lmfit calls
  $yfit  = $yfit->copy;
  $coeff = $coeff->copy;

  return $coeff unless wantarray;

  my %ret;

  my $n0 = $self->where($self == 0)->nelem;
  my $n1 = $self->nelem - $n0;

  $ret{cov} = $cov if $opt{COV};

  # null deviance; NOTE(review): log(0) if the DV is constant (all 0s or
  # all 1s) -- presumably callers never do that, but worth confirming.
  $ret{D0} = -2*($n0 * log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
  $ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
  $ret{Dm_chisq} = $ret{D0} - $ret{Dm};
  $ret{Dm_df} = $ivs->dim(1);
  $ret{Dm_p}
    = 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
    if $CDF;

  # per-coefficient likelihood-ratio tests: refit with each IV (and then the
  # intercept) dropped, and compare deviances against the full model
  my $coeff_chisq = zeroes $opt{INITP}->nelem;

  if ( $ivs->dim(1) > 1 ) {
    for my $k (0 .. $ivs->dim(1)-1) {
      my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
      my $G = $ivs->dice_axis(1, \@G);
      my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
      my $y_G
        = PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
        { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
      ($_tmp = $coeff_chisq($k)) .= $self->dm( $y_G ) - $ret{Dm};
    }
  }
  else {
      # d0 is, by definition, the deviance with only intercept
    ($_tmp = $coeff_chisq(0)) .= $ret{D0} - $ret{Dm};
  }

  my $y_c
    = PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept,
      $opt{INITP}->(0:-2)->copy,
      { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
  ($_tmp = $coeff_chisq(-1)) .= $self->dm( $y_c ) - $ret{Dm};

  $ret{b} = $coeff;
  $ret{b_chisq} = $coeff_chisq;
  $ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 )
    if $CDF;
  $ret{y_pred} = $yfit;
  $ret{iter} = $iter;

  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };

  return %ret;
}

# Model function for lmfit: inverse logit with intercept. Fills $ym with
# predicted probabilities and $dyda with partial derivatives per parameter.
sub _logistic {
  my ($x,$par,$ym,$dyda) = @_;

  # $b and $c are fit parameters slope and intercept
  my $b = $par(0 : $x->dim(1) - 1)->sever;
  my $c = $par(-1)->sever;

  # Write function with dependent variable $ym,
  # independent variable $x, and fit parameters as specified above.
  # Use the .= (dot equals) assignment operator to express the equality
  # (not just a plain equals)
  ($_tmp = $ym) .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );

  my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);

  # Partial derivative of the function with respect to each slope
  # fit parameter ($b in this case). Again, note .= assignment
  # operator (not just "equals")
  ($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym)
    for (0 .. $b->dim(0)-1);

  # Partial derivative of the function re intercept par
  ($_tmp = $dy[-1]) .= $ym * (1 - $ym);
}

# Model function for lmfit: inverse logit without the intercept term. Used
# to compute the likelihood-ratio test for the constant.
sub _logistic_no_intercept {
  my ($x,$par,$ym,$dyda) = @_;

  my $b = $par(0 : $x->dim(1) - 1)->sever;

  # Write function with dependent variable $ym,
  # independent variable $x, and fit parameters as specified above.
  # Use the .= (dot equals) assignment operator to express the equality
  # (not just a plain equals)
  ($_tmp = $ym) .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );

  my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);

  # Partial derivative of the function with respect to each slope
  # fit parameter ($b in this case). Again, note .= assignment
  # operator (not just "equals")
  ($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym)
    for (0 .. $b->dim(0)-1);
}

=head2 pca

=for ref

Principal component analysis. Based on corr instead of cov (bad values
are ignored pair-wise. OK when bad values are few but otherwise probably
should fill_m etc before pca). Use PDL::Slatec::eigsys() if installed,
otherwise use PDL::MatrixOps::eigens_sym().

=for options

Default options (case insensitive):

    CORR  => 1,     # boolean.
use correlation or covariance
    PLOT  => 0,     # calls plot_screes by default
                    # can set plot_screes options here
    WIN   => undef, # for plotting

=for usage

Usage:

    my $d = qsort random 10, 5;      # 10 obs on 5 variables
    my %r = $d->pca( \%opt );
    print "$_\t$r{$_}\n" for (keys %r);

    eigenvalue    # variance accounted for by each component
    [4.70192 0.199604 0.0471421 0.0372981 0.0140346]

    eigenvector   # dim var x comp. weights for mapping variables to component
    [
     [ -0.451251  -0.440696  -0.457628  -0.451491  -0.434618]
     [ -0.274551   0.582455   0.131494   0.255261  -0.709168]
     [   0.43282   0.500662  -0.139209  -0.735144 -0.0467834]
     [  0.693634  -0.428171   0.125114   0.128145  -0.550879]
     [  0.229202   0.180393  -0.859217     0.4173  0.0503155]
    ]

    loadings      # dim var x comp. correlation between variable and component
    [
     [  -0.978489   -0.955601   -0.992316    -0.97901   -0.942421]
     [  -0.122661    0.260224   0.0587476    0.114043   -0.316836]
     [  0.0939749    0.108705  -0.0302253   -0.159616  -0.0101577]
     [    0.13396  -0.0826915   0.0241629   0.0247483    -0.10639]
     [   0.027153   0.0213708   -0.101789   0.0494365  0.00596076]
    ]

    pct_var       # percent variance accounted for by each component
    [0.940384 0.0399209 0.00942842 0.00745963 0.00280691]

Plot scores along the first two components,

    $d->plot_scores( $r{eigenvector} );

=cut

*pca = \&PDL::pca;

# PCA of obs x var data: eigen-decompose the correlation (or covariance)
# table, sort components by descending eigenvalue, and derive loadings.
sub PDL::pca {
  my ($self, $opt) = @_;

  my %opt = (
    CORR  => 1,
    PLOT  => 0,
    WIN   => undef,     # for plotting
  );
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);

  my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;

  # value is axis pdl and score is var x axis
  my ($eigval, $eigvec);
  # $SLATEC is set elsewhere in this file when PDL::Slatec is available
  if ( $SLATEC ) {
    ($eigval, $eigvec) = $var_var->PDL::Slatec::eigsys;
  }
  else {
    ($eigvec, $eigval) = $var_var->eigens_sym;
    # compatibility with PDL::Slatec::eigsys
    $eigvec = $eigvec->inplace->transpose->sever;
  }

  # ind is sticky point for threading
  my $ind_sorted = $eigval->qsorti->(-1:0);
  $eigvec = $eigvec( ,$ind_sorted)->sever;
  $eigval = $eigval($ind_sorted)->sever;

  # var x axis
  my $var = $eigval / $eigval->sum->sclr;

  my $loadings;
  if ($opt{CORR}) {
    # loadings follow directly from the eigen decomposition of corr
    $loadings = $eigvec * sqrt( $eigval->transpose );
  }
  else {
    # with cov, compute scores and correlate them back with the data
    my $scores = $eigvec x $self->dev_m;
    $loadings = $self->corr( $scores->dummy(1) );
  }

  $var->plot_screes(\%opt)
    if $opt{PLOT};

  return ( eigenvalue=>$eigval, eigenvector=>$eigvec,
           pct_var=>$var, loadings=>$loadings );
}

=head2 pca_sorti

Determine by which vars a component is best represented. Descending sort
vars by size of association with that component. Returns sorted var and
relevant component indices.

=for options

Default options (case insensitive):

    NCOMP => 10,     # maximum number of components to consider

=for usage

Usage:

    # let's see if we replicated the Osgood et al. (1957) study
    perldl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}

    # select a subset of var to do pca
    perldl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
    perldl> $data = $data( ,$ind)->sever
    perldl> @$idv = @$idv[list $ind]

    perldl> %m = $data->pca

    perldl> ($iv, $ic) = $m{loadings}->pca_sorti()

    perldl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)

    #       COMP0     COMP1    COMP2    COMP3
    HAPPY	[0.860191 0.364911 0.174372 -0.10484]
    GOOD	[0.848694 0.303652 0.198378 -0.115177]
    CALM	[0.821177 -0.130542 0.396215 -0.125368]
    BRIGHT	[0.78303 0.232808 -0.0534081 -0.0528796]
    HEAVY	[-0.623036 0.454826 0.50447 0.073007]
    HARD	[-0.679179 0.0505568 0.384467 0.165608]
    ACTIVE	[-0.161098 0.760778 -0.44893 -0.0888592]
    FAST	[-0.196042 0.71479 -0.471355 0.00460276]
    LARGE	[-0.241994 0.594644 0.634703 -0.00618055]
    BASS	[-0.621213 -0.124918 0.0605367 -0.765184]

=cut

*pca_sorti = \&PDL::pca_sorti;

# Sort variables by the component they load most heavily on, then by loading
# size within each component.
sub PDL::pca_sorti {
  # $self is pdl (var x component)
  my ($self, $opt) = @_;

  my %opt = (
    NCOMP => 10,     # maximum number of components to consider
  );
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);

  my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
  $self = $self->dice_axis( 1, pdl(0..$ncomp-1) );

  # for each var, index of the component with the largest absolute loading
  my $icomp = $self->transpose->abs->maximum_ind;

  # sort between comp
  my $ivar_sort = $icomp->qsorti;
  $self = $self($ivar_sort, )->sever;

  # sort within comp
  my $ic = $icomp($ivar_sort)->iv_cluster;
  for my $comp (0 .. $ic->dim(1)-1) {
    my $i = $self(which($ic( ,$comp)), ($comp))->qsorti->(-1:0);
    ($_tmp = $ivar_sort(which $ic( ,$comp)))
      .= $ivar_sort(which $ic( ,$comp))->($i)->sever;
  }
  return wantarray? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
}

=head2 plot_means

Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).

=for options

Default options (case insensitive):

    IVNM  => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM  => 'DV',
    AUTO  => 1,       # auto set dims to be on x-axis, line, panel
                      # if set 0, dim 0 goes on x-axis, dim 1 as lines
                      # dim 2+ as panels
    # see PDL::Graphics::PGPLOT::Window for next options
    WIN   => undef,   # pgwin object.
not closed here if passed
                      # allows comparing multiple lines in same plot
                      # set env before passing WIN
    DEV   => '/xs',   # open and close dev for plotting if no WIN
                      # defaults to '/png' in Windows
    SIZE  => 640,     # individual square panel size in pixels
    SYMBL => [0, 4, 7, 11],

=for usage

Usage:

    # see anova for mean / se pdl structure
    $mean->plot_means( $se, {IVNM=>['apple', 'bake']} );

Or like this:

    $m{'# apple ~ bake # m'}->plot_means;

=cut

*plot_means = \&PDL::plot_means;

# Plot cell means (with optional error bars) for up to 4 factors: the factor
# with most levels goes on the x-axis, the next as lines, the rest as panels.
sub PDL::plot_means {
  require PDL::Graphics::PGPLOT::Window;
  my $opt = pop @_
    if ref $_[-1] eq 'HASH';
  my ($self, $se) = @_;
  $self = $self->squeeze;
  if ($self->ndims > 4) {
    carp "Data is > 4D. No plot here.";
    return;
  }
  my %opt = (
    IVNM  => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM  => 'DV',
    AUTO  => 1,       # auto set vars to be on X axis, line, panel
    WIN   => undef,   # PDL::Graphics::PGPLOT::Window object
    DEV   => $DEV,
    SIZE  => 640,     # individual square panel size in pixels
    SYMBL => [0, 4, 7, 11],  # ref PDL::Graphics::PGPLOT::Window
  );
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);

  # decide which vars to plot as x axis, lines, panels
  # put var w most levels on x axis
  # put var w least levels on diff panels
  my @iD = 0..3;
  my @dims = (1, 1, 1, 1);
    # splice ARRAY,OFFSET,LENGTH,LIST
  splice @dims, 0, $self->ndims, $self->dims;
  $self = $self->reshape(@dims)->sever;
  $se = $se->reshape(@dims)->sever
    if defined $se;
  @iD = reverse list qsorti pdl @dims
    if $opt{AUTO};

  # $iD[0] on x axis
  # $iD[1] as separate lines
  my $nx = $self->dim($iD[2]);    # n xpanels
  my $ny = $self->dim($iD[3]);    # n ypanels

  my $w = $opt{WIN};
  if (!defined $w) {
    $w = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV},
         NX=>$nx, NY=>$ny, SIZE=>[$opt{SIZE}*$nx, $opt{SIZE}*$ny], UNIT=>3);
  }

  # y range padded so error bars and the legend fit inside the frame
  my ($min, $max) = defined $se? pdl($self + $se, $self - $se)->minmax
                  :              $self->minmax
                  ;
  my $range = $max - $min;
  my $p = 0;   # panel

  for my $y (0..$self->dim($iD[3])-1) {
    for my $x (0..$self->dim($iD[2])-1) {
      $p ++;
      my $tl = '';
      $tl = $opt{IVNM}->[$iD[2]] . " $x"  if $self->dim($iD[2]) > 1;
      $tl.= ' ' . $opt{IVNM}->[$iD[3]] . " $y"  if $self->dim($iD[3]) > 1;
      $w->env( 0, $self->dim($iD[0])-1, $min - 2*$range/5, $max + $range/5,
           { XTitle=>$opt{IVNM}->[$iD[0]], YTitle=>$opt{DVNM}, Title=>$tl,
             PANEL=>$p, AXIS=>['BCNT', 'BCNST'], Border=>1, } )
        unless $opt{WIN};

      my (@legend, @color);
      for (0 .. $self->dim($iD[1]) - 1) {
        push @legend, $opt{IVNM}->[$iD[1]] . " $_"
          if ($self->dim($iD[1]) > 1);
        push @color, $_ + 2;   # start from red
        $w->points( sequence($self->dim($iD[0])),
          $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_),
          $opt{SYMBL}->[$_],
          { PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2, PLOTLINE=>1, } );
        $w->errb( sequence($self->dim($iD[0])),
          $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_),
          $se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_),
          { PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2 } )
          if defined $se;
      }
      if ($self->dim($iD[1]) > 1) {
        $w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10,
                  { COLOR=>\@color } );
        $w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10,
                  { COLOR=>\@color, SYMBOL=>[ @{$opt{SYMBL}}[0..$#color] ] } );
      }
    }
  }
  $w->close
    unless $opt{WIN};

  return;
}

=head2 plot_residuals

Plots residuals against predicted values.

=for usage

Usage:

    $y->plot_residuals( $y_pred, { dev=>'/png' } );

=for options

Default options (case insensitive):

     # see PDL::Graphics::PGPLOT::Window for more info
    WIN   => undef,  # pgwin object. not closed here if passed
                     # allows comparing multiple lines in same plot
                     # set env before passing WIN
    DEV   => '/xs',  # open and close dev for plotting if no WIN
                     # defaults to '/png' in Windows
    SIZE  => 640,    # plot size in pixels
    COLOR => 1,

=cut

*plot_residuals = \&PDL::plot_residuals;

# Scatter residuals ($y - $y_pred) against predicted values, with a zero line.
sub PDL::plot_residuals {
  require PDL::Graphics::PGPLOT::Window;
  my $opt = pop @_
    if ref $_[-1] eq 'HASH';
  my ($y, $y_pred) = @_;
  my %opt = (
     # see PDL::Graphics::PGPLOT::Window for next options
    WIN   => undef,  # pgwin object. not closed here if passed
                     # allows comparing multiple lines in same plot
                     # set env before passing WIN
    DEV   => $DEV ,  # open and close dev for plotting if no WIN
    SIZE  => 640,    # plot size in pixels
    COLOR => 1,
  );
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);

  my $residuals = $y - $y_pred;

  my $win = $opt{WIN};

  if (!$win) {
   $win = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3);
   $win->env( $y_pred->minmax, $residuals->minmax,
     {XTITLE=>'predicted value', YTITLE=>'residuals',
      AXIS=>['BCNT', 'BCNST'], Border=>1,} );
  }

  $win->points($y_pred, $residuals, { COLOR=>$opt{COLOR} });
  # add 0-line
  $win->line(pdl($y_pred->minmax), pdl(0,0), { COLOR=>$opt{COLOR} } );

  $win->close
    unless $opt{WIN};

  return;
}

=head2 plot_scores

Plots standardized original and PCA transformed scores against two
components. (Thank you, Bob MacCallum, for the documentation suggestion
that led to this function.)

=for options

Default options (case insensitive):

  CORR  => 1,      # boolean. PCA was based on correlation or covariance
  COMP  => [0,1],  # indices to components to plot
  # see PDL::Graphics::PGPLOT::Window for next options
  WIN   => undef,  # pgwin object. not closed here if passed
                   # allows comparing multiple lines in same plot
                   # set env before passing WIN
  DEV   => '/xs',  # open and close dev for plotting if no WIN
                   # defaults to '/png' in Windows
  SIZE  => 640,    # plot size in pixels
  COLOR => [1,2],  # color for original and rotated scores

=for usage

Usage:

  my %p = $data->pca();
  $data->plot_scores( $p{eigenvector}, \%opt );

=cut

*plot_scores = \&PDL::plot_scores;

sub PDL::plot_scores {
  require PDL::Graphics::PGPLOT::Window;
  my $opt = pop @_
    if ref $_[-1] eq 'HASH';
  my ($self, $eigvec) = @_;
  my %opt = (
    CORR  => 1,      # boolean. PCA was based on correlation or covariance
    COMP  => [0,1],  # indices to components to plot
    # see PDL::Graphics::PGPLOT::Window for next options
    WIN   => undef,  # pgwin object.
not closed here if passed
                     # allows comparing multiple lines in same plot
                     # set env before passing WIN
    DEV   => $DEV ,  # open and close dev for plotting if no WIN
    SIZE  => 640,    # plot size in pixels
    COLOR => [1,2],  # color for original and transformed scores
  );
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);

  my $i = pdl $opt{COMP};
  # standardize (corr-based PCA) or just center (cov-based PCA)
  my $z = $opt{CORR}? $self->stddz : $self->dev_m;

  # transformed normed values
  my $scores = sumover($eigvec( ,$i) * $z->transpose->dummy(1))->transpose;
  $z = $z( ,$i)->sever;

  my $win = $opt{WIN};
  my $max = pdl($z, $scores)->abs->ceil->max->sclr;
  if (!$win) {
   $win = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3);
   $win->env(-$max, $max, -$max, $max,
     {XTitle=>"Component $opt{COMP}->[0]", YTitle=>"Component $opt{COMP}->[1]",
      AXIS=>['ABCNST', 'ABCNST'], Border=>1, });
  }

  $win->points( $z( ,0;-), $z( ,1;-), { COLOR=>$opt{COLOR}->[0] } );
  $win->points( $scores( ,0;-), $scores( ,1;-), { COLOR=>$opt{COLOR}->[1] } );
  $win->legend( ['original', 'transformed'], .2*$max, .8*$max, {color=>[1,2],symbol=>[1,1]} );
  $win->close
    unless $opt{WIN};
  return;
}

=head2 plot_screes

Scree plot. Plots proportion of variance accounted for by PCA components.

=for options

Default options (case insensitive):

  NCOMP => 20,     # max number of components to plot
  CUT   => 0,      # set to plot cutoff line after this many components
                   # undef to plot suggested cutoff line for NCOMP comps
  # see PDL::Graphics::PGPLOT::Window for next options
  WIN   => undef,  # pgwin object. not closed here if passed
                   # allows comparing multiple lines in same plot
                   # set env before passing WIN
  DEV   => '/xs',  # open and close dev for plotting if no WIN
                   # defaults to '/png' in Windows
  SIZE  => 640,    # plot size in pixels
  COLOR => 1,

=for usage

Usage:

  # variance should be in descending order

  $pca{var}->plot_screes( {ncomp=>16} );

Or, because NCOMP is used so often, it is allowed a shortcut,

  $pca{var}->plot_screes( 16 );

=cut

*plot_scree = \&PDL::plot_screes;      # here for now for compatibility

*plot_screes = \&PDL::plot_screes;

sub PDL::plot_screes {
  require PDL::Graphics::PGPLOT::Window;
  my $opt = pop @_
    if ref $_[-1] eq 'HASH';
  my ($self, $ncomp) = @_;
  my %opt = (
    NCOMP => 20,     # max number of components to plot
    CUT   => 0,      # set to plot cutoff line after this many components
                     # undef to plot suggested cutoff line for NCOMP comps
    # see PDL::Graphics::PGPLOT::Window for next options
    WIN   => undef,  # pgwin object. not closed here if passed
                     # allows comparing multiple lines in same plot
                     # set env before passing WIN
    DEV   => $DEV ,  # open and close dev for plotting if no WIN
    SIZE  => 640,    # plot size in pixels
    COLOR => 1,
  );
  $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
  $opt{NCOMP} = $ncomp
    if $ncomp;

  # re-use $ncomp below
  $ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP};
  # CUT => undef asks for the suggested elbow from the kmeans module
  $opt{CUT} = PDL::Stats::Kmeans::_scree_ind $self(0:$ncomp-1)
    if !defined $opt{CUT};

  my $win = $opt{WIN};

  if (!$win) {
   $win = PDL::Graphics::PGPLOT::Window::pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3);
   $win->env(0, $ncomp-1, 0, 1,
     {XTitle=>'Component', YTitle=>'Proportion of Variance Accounted for',
      AXIS=>['BCNT', 'BCNST'], Border=>1, });
  }

  $win->points(sequence($ncomp), $self(0:$ncomp-1, ),
        {CHARSIZE=>2, COLOR=>$opt{COLOR}, PLOTLINE=>1} );
  $win->line( pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05),
        {COLOR=>15} )
    if $opt{CUT};
  $win->close
    unless $opt{WIN};
  return;
}

=head1 SEE ALSO

PDL::Fit::Linfit

PDL::Fit::LM

=head1 REFERENCES

Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied Multiple
Regression/correlation Analysis for the Behavioral Sciences (3rd ed.).
Mahwah, NJ: Lawrence Erlbaum Associates Publishers.

Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd
ed.). New York, NY: Wiley-Interscience.

Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated
measures data in cognitive research. Journal of Experimental Psychology:
Learning, Memory, & Cognition, 16, 149-157.

Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of
Meaning. Champaign, IL: University of Illinois Press.

Rutherford, A. (2001). Introducing Anova and Ancova: A GLM Approach (1st
ed.). Thousand Oaks, CA: Sage Publications.

Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved
April 10, 2011 from http://citeseerx.ist.psu.edu/

The GLM procedure: unbalanced ANOVA for two-way design with interaction.
(2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18, 2009 from
http://support.sas.com/

Van den Noortgate, W., & Onghena, P. (2006). Analysing repeated measures
data in cognitive research: A comment on regression coefficient analyses.
European Journal of Cognitive Psychology, 18, 937-952.

=head1 AUTHOR

Copyright (C) 2009 Maggie J. Xiong 

All rights reserved.
There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.84/GLM/Makefile.PL0000644000175000017500000000034614126063750015506 0ustar osboxesosboxesuse PDL::Core::Dev; use ExtUtils::MakeMaker; PDL::Core::Dev->import(); $package = ["glm.pd",GLM,PDL::Stats::GLM,undef,1]; %hash = pdlpp_stdargs($package); WriteMakefile( %hash ); sub MY::postamble {pdlpp_postamble($package)}; PDL-Stats-0.84/MANIFEST0000644000175000017500000000201114625061426014237 0ustar osboxesosboxesBasic/Makefile.PL Basic/stats_basic.pd Changes Distr/distr.pd Distr/Makefile.PL Distr/t/stats_distr.t GLM/glm.pd GLM/Makefile.PL Kmeans/kmeans.pd Kmeans/Makefile.PL Makefile.PL MANIFEST This list of files README.md Stats.pm t/00-report-prereqs.t t/stats_basic.t t/stats_glm.t t/stats_kmeans.t t/stats_ols_rptd.t t/stats_ts.t TS/Makefile.PL TS/ts.pd META.yml Module YAML meta-data (added by MakeMaker) META.json Module JSON meta-data (added by MakeMaker) GENERATED/PDL/Stats/Basic.pm mod=PDL::Stats::Basic pd=Basic/stats_basic.pd (added by pdlpp_mkgen) GENERATED/PDL/Stats/Distr.pm mod=PDL::Stats::Distr pd=Distr/distr.pd (added by pdlpp_mkgen) GENERATED/PDL/Stats/GLM.pm mod=PDL::Stats::GLM pd=GLM/glm.pd (added by pdlpp_mkgen) GENERATED/PDL/Stats/Kmeans.pm mod=PDL::Stats::Kmeans pd=Kmeans/kmeans.pd (added by pdlpp_mkgen) GENERATED/PDL/Stats/TS.pm mod=PDL::Stats::TS pd=TS/ts.pd (added by pdlpp_mkgen) PDL-Stats-0.84/Makefile.PL0000644000175000017500000000426714544645567015115 0ustar osboxesosboxesuse strict; use warnings; use ExtUtils::MakeMaker; use PDL::Core::Dev; use Getopt::Long; my $force; my $result = GetOptions("force"=>\$force); if ($] >= 5.015000 and $] < 5.015009 and !$force) { warn "Perl version bewteen 5.15.0 and 5.15.8 detected. Sheepishly bailing out of installation because of perl and PDL :lvalue subroutine issue. Use --force with 'perl Makefile.PL' to attemp a build anyways. 
For more information, please see perl bug ticket #107366 https://rt.perl.org/rt3//Public/Bug/Display.html?id=107366\n"; exit; } my $got_PDL = eval { require PDL::Core::Dev }; WriteMakefile( NAME => 'PDL::Stats', AUTHOR => 'Maggie J. Xiong ', VERSION_FROM => 'Stats.pm', ABSTRACT_FROM => 'Stats.pm', ($ExtUtils::MakeMaker::VERSION >= 6.3002 ? ('LICENSE'=> 'perl') : ()), PREREQ_PM => { 'PDL' => '2.057', # ->sum etc return ndarray not Perl scalar }, CONFIGURE_REQUIRES => { 'PDL::Core' => 2.008, }, BUILD_REQUIRES => { 'PDL::Core' => 2.008, }, TEST_REQUIRES => { 'PDL::Core' => 2.008, 'Test::More' => '0.88', # done_testing }, $got_PDL ? () : (DIR => []), # just write MYMETA if no PDL dist => { PREOP => 'gsl-config --version && $(PERL) -MPDL::Core::Dev -e pdlpp_mkgen $(DISTVNAME)' }, clean => { FILES => ['PDL-Stats-*'] }, META_MERGE => { "meta-spec" => { version => 2 }, prereqs => { develop => { requires => { 'CPAN::Changes' => 0, }, }, runtime => { recommends => { 'PDL::Graphics::PGPLOT' => 0, 'PDL::Slatec' => 0, }, }, }, resources => { repository => { type => 'git', url => 'git://github.com/PDLPorters/PDL-Stats', web => 'https://github.com/PDLPorters/PDL-Stats', }, }, }, ); sub MY::postamble { my $oneliner = PDL::Core::Dev::_oneliner(qq{exit if \$ENV{DESTDIR}; use PDL::Doc; eval { PDL::Doc::add_module(shift); }}); qq|\ninstall :: pure_install\n\t$oneliner \$(NAME)\n|; }