---- patsy-0.4.1+git34-ga5b54c2/.coveragerc ----
[run]
branch=True
source=patsy
[report]
exclude_lines =
pragma: no cover
^def test_
precision = 1
---- patsy-0.4.1+git34-ga5b54c2/.gitignore ----
# Project specific files #
##########################
.coverage
htmlcov/
.tox
# Generated by doc build
doc/_static/basis-*.png
# Cribbed from numpy's .gitignore:
# Editor temporary/working/backup files #
#########################################
.#*
[#]*#
*~
*$
*.bak
*.diff
*.org
.project
*.rej
.settings/
.*.sw[nop]
.sw[nop]
*.tmp
# Compiled source #
###################
*.a
*.com
*.class
*.dll
*.exe
*.o
*.py[ocd]
*.so
# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.bz2
*.bzip2
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.tbz2
*.tgz
*.zip
# Python files #
################
# setup.py working directory
build
# sphinx build directory
_build
# setup.py dist directory
dist
doc/build
doc/cdoc/build
# Egg metadata
*.egg-info
# The shelf plugin uses this dir
./.shelf
# Logs and databases #
######################
*.log
*.sql
*.sqlite
# OS generated files #
######################
.gdb_history
.DS_Store?
ehthumbs.db
Icon?
Thumbs.db
---- patsy-0.4.1+git34-ga5b54c2/.travis.yml ----
language: python
python:
  - 2.7
  - 3.4
  - 3.5
matrix:
  include:
    # 0.14.0 is the last version with the old categorical system
    # libgfortran=1.0 is needed to work around a bug in anaconda
    # (https://github.com/pydata/patsy/pull/83#issuecomment-206895923)
    - python: 3.4
      env: PANDAS_VERSION_STR="=0.14.0 libgfortran=1.0"
    - python: 2.7
      env: PANDAS_VERSION_STR="=0.14.0 libgfortran=1.0"
    # 0.18.0 has is_categorical_dtype in a different place than 0.19.0+
    - python: 3.4
      env: PANDAS_VERSION_STR="=0.18.0"
    - python: 2.7
      env: PANDAS_VERSION_STR="=0.18.0"
    # make sure it works without pandas
    - python: 3.5
      env: PANDAS_VERSION_STR="NONE"
    - python: 2.7
      env: PANDAS_VERSION_STR="NONE"
# This disables sudo, but makes builds start much faster
# See http://blog.travis-ci.com/2014-12-17-faster-builds-with-container-based-infrastructure/
sudo: false
before_install:
  # Work around pathological behaviour in OpenBLAS multithreading, which causes
  # execution time to blow up from 3 minutes to 18 minutes, apparently in SVD
  # on smallish matrices
  - export OMP_NUM_THREADS=1
  # Escape Travis virtualenv
  - deactivate
  # See: http://conda.pydata.org/docs/travis.html
  - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
  - bash miniconda.sh -b -p $HOME/miniconda
  - export PATH="$HOME/miniconda/bin:$PATH"
  - hash -r
  - conda config --set always_yes yes --set changeps1 no
  - conda update -q conda
  - conda info -a
  - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION numpy scipy coverage nose pip
  - source activate testenv
  - if [ "$PANDAS_VERSION_STR" != "NONE" ]; then conda install pandas${PANDAS_VERSION_STR}; fi
install:
  - python setup.py sdist
  - pip install dist/*
script:
  # We change directories to make sure that python won't find the copy
  # of patsy in the source directory.
  - mkdir empty
  - cd empty
  - INSTALLDIR=$(python -c "import os; import patsy; print(os.path.dirname(patsy.__file__))")
  - export PYTHONWARNINGS=default PATSY_FORCE_NO_WARNINGS=1
  # The --exe is because python sometimes marks all installed modules
  # as executable, so without --exe nosetests will just ignore
  # everything. Baffling, but so it goes.
  - coverage run --source=$INSTALLDIR --rcfile=../.coveragerc $(which nosetests) -vvv --nocapture --exe --failure-detail --all-modules $INSTALLDIR
  - coverage report --rcfile=../.coveragerc --show-missing
  - python ../tools/check-API-refs.py
notifications:
  email:
    - njs@pobox.com
after_success:
  #- pip install coveralls && coveralls
  - pip install codecov && codecov
---- patsy-0.4.1+git34-ga5b54c2/CODE_OF_CONDUCT.md ----
# Contributor Code of Conduct
As contributors and maintainers of this project, and in the interest of
fostering an open and welcoming community, we pledge to respect all people who
contribute through reporting issues, posting feature requests, updating
documentation, submitting pull requests or patches, and other activities.
We are committed to making participation in this project a harassment-free
experience for everyone, regardless of level of experience, gender, gender
identity and expression, sexual orientation, disability, personal appearance,
body size, race, ethnicity, age, religion, or nationality.
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery
* Personal attacks
* Trolling or insulting/derogatory comments
* Public or private harassment
* Publishing other's private information, such as physical or electronic
addresses, without explicit permission
* Other unethical or unprofessional conduct
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
By adopting this Code of Conduct, project maintainers commit themselves to
fairly and consistently applying these principles to every aspect of managing
this project. Project maintainers who do not follow or enforce the Code of
Conduct may be permanently removed from the project team.
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community.
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting a project maintainer at njs@pobox.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. Maintainers are
obligated to maintain confidentiality with regard to the reporter of an
incident.
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 1.3.0, available at
[http://contributor-covenant.org/version/1/3/0/][version]
[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/3/0/
---- patsy-0.4.1+git34-ga5b54c2/LICENSE.txt ----
The bulk of Patsy is distributed under a simple 2-clause BSD license:
Copyright (C) 2011-2012, Patsy Developers. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The module patsy.compat contains code derived from the Python
standard library, and is covered by the following license:
PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
--------------------------------------------
1. This LICENSE AGREEMENT is between the Python Software Foundation
("PSF"), and the Individual or Organization ("Licensee") accessing and
otherwise using this software ("Python") in source or binary form and
its associated documentation.
2. Subject to the terms and conditions of this License Agreement, PSF hereby
grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
analyze, test, perform and/or display publicly, prepare derivative works,
distribute, and otherwise use Python alone or in any derivative version,
provided, however, that PSF's License Agreement and PSF's notice of copyright,
i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
2011, 2012 Python Software Foundation; All Rights Reserved" are retained in Python
alone or in any derivative version prepared by Licensee.
3. In the event Licensee prepares a derivative work that is based on
or incorporates Python or any part thereof, and wants to make
the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to Python.
4. PSF is making Python available to Licensee on an "AS IS"
basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
INFRINGE ANY THIRD PARTY RIGHTS.
5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between PSF and
Licensee. This License Agreement does not grant permission to use PSF
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.
8. By copying, installing or otherwise using Python, Licensee
agrees to be bound by the terms and conditions of this License
Agreement.
As per item (3), we are required to provide a brief summary of
changes. For this, see comments in patsy/compat.py.
---- patsy-0.4.1+git34-ga5b54c2/MANIFEST.in ----
include setup.cfg .coveragerc tox.ini
include TODO LICENSE.txt README.rst CODE_OF_CONDUCT.md
recursive-include tools *.py *.R
recursive-include doc *
prune doc/_build
---- patsy-0.4.1+git34-ga5b54c2/README.rst ----
Patsy is a Python library for describing statistical models
(especially linear models, or models that have a linear component) and
building design matrices. Patsy brings the convenience of `R
<http://www.r-project.org/>`_ "formulas" to Python.
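For example, a small sketch of the kind of thing Patsy does (the data
and column names here are made up)::

    from patsy import dmatrices
    # Any dict-like object or pandas.DataFrame works as the data source
    data = {"y": [1, 2, 3, 4], "x": [5, 6, 7, 8], "a": ["a1", "a2", "a1", "a2"]}
    # Build the response and predictor design matrices; the categorical
    # variable "a" is automatically dummy-coded
    y, X = dmatrices("y ~ x + a", data)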
.. image:: https://travis-ci.org/pydata/patsy.png?branch=master
:target: https://travis-ci.org/pydata/patsy
.. image:: https://coveralls.io/repos/pydata/patsy/badge.png?branch=master
:target: https://coveralls.io/r/pydata/patsy?branch=master
Documentation:
https://patsy.readthedocs.io/
Downloads:
http://pypi.python.org/pypi/patsy/
Dependencies:
* Python (2.6, 2.7, or 3.3+)
* six
* numpy
Optional dependencies:
* nose: needed to run tests
* scipy: needed for spline-related functions like ``bs``
Install:
``pip install patsy`` (or, for traditionalists: ``python setup.py install``)
Code and bug tracker:
https://github.com/pydata/patsy
Mailing list:
* pydata@googlegroups.com
* http://groups.google.com/group/pydata
* http://news.gmane.org/gmane.comp.python.pydata
License:
2-clause BSD, see LICENSE.txt for details.
---- patsy-0.4.1+git34-ga5b54c2/TODO ----
* Add missing data handling to the just-pass-in-a-matrix bit of the high-level API
* Add parallel array handling to build_design_matrices
* Add parallel array handling of some sort to high-level API...
* Refactor build so that there are two stages
- first stage takes a set of factor evaluators, and returns a set of
evaluated columns
- second stage handles interactions and categorical coding and assembles
these together into design matrices
use case: any model where you actually want to get categorical data
out (like multinomial regression with a factor on the LHS, or CART
with factors on the right-hand side)
** first stage should also handle other "parallel" data, like weights, which need to participate in the missingness calculations
** possibly also support a "subset=" argument at this stage
** and for parallel vectors and subset=, allow a string as a value, and if seen then evaluate it as python code in the same context as formula data (like R's subset=(MyCol > 10))
** And do NaN/mask/missing data handling at this stage
*** Imputation?
*** numpy.ma
* Better NaN/masks/missing data handling in transforms. I think the
current ones will just blow up if there are any NaNs. (The previous
entry is about handling the term "x" where x has NAs; this entry is
about handling "center(x)" where x has NAs.) R's solution to this is
that scale(x) simply unconditionally ignores NAs when computing the
mean, regardless of the overall setting of na.action. That seems
reasonable...
* Advocacy
Potential users?
- statsmodels
- PyMC has a (closed) ticket requesting such features:
http://code.google.com/p/pymc/issues/detail?id=162
- nipy, though they have their own thing...
- sklearn, which has regression (and might find it useful otherwise!)
* Do something smarter with mismatched pandas indexes
Right now we're conservative -- if you do ~ x + y and x and y don't
have identical indexes, then that's an error. It's possible we should
do something cleverer, though. Perhaps we should merge them somehow
(pandas.concat(..., join="outer")?). (This of course would require
that *all* items have indexes, though; right now you can mix plain
ndarrays and pandas objects.)
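Rough sketch of what that outer-join alignment could look like
(hypothetical code, not how patsy currently behaves):
    import pandas as pd
    x = pd.Series([1.0, 2.0, 3.0], index=[0, 1, 2])
    y = pd.Series([4.0, 5.0, 6.0], index=[1, 2, 3])
    # Align on the union of the two indexes; rows 0 and 3 pick up NaNs,
    # which the NA-handling machinery would then have to deal with.
    combined = pd.concat({"x": x, "y": y}, axis=1, join="outer")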
* Improve EvalFactor's stateful transform handling to follow . lookups
right now it can only detect stateful transforms when they are called
directly like
scale(x)
but not if referenced through some module like
mylib.scale(x)
In general we don't even want to try handling every possible function
lookup syntax (see next item for a safety check for that), but we
should allow for people to distribute non-builtin stateful
transforms.
* As a safety check for non-stateful transforms, we should always
evaluate each formula on just the first row of data alone, and make
sure the result matches what we got when evaluating it vectorized
(i.e., confirm f(x[0]) == f(x)[0], where f is our transform). However,
this is kind of tricky given that x might be pulled out of the
environment, the 'data' dict might have arbitrary objects,
etc. Hmm. Maybe intercept variable lookups and just munge those? This
is easy to do if someone's passing in a structured array or dataframe
and pulling all their data from it, or even if they use a dict with
well-behaved columns. But the problem is when people do things like:
In [1]: logx = np.log(data["x"])
# refers to data["y"] and logx together
In [2]: lm("y ~ logx", data)
* More contrast tools
- Some sort of symbolic tools for user-defined contrasts -- take the
comparisons that people want to compute in terms of linear
combinations of level names, convert that to a matrix and do the
pinv dance? We have the linear_contrast code already, but that's for
describing constraints in terms of the coefficients you have -- it
seems like people want to be able to describe constraints in terms
of... I'm not sure what. Group means? The coefficients they could
have had if they'd fit some other model? (Presumably the
all-full-rank-dummy-coding-all-the-time model.) If I can ever figure
out what this is (it has something to do with "estimable contrasts")
then I'll implement it.
- Short-hands for Type II, Type III, and "remove this term and
everything marginal to it" contrast tests?
Might need to figure out the trick that car::Anova uses to do
efficient Type II tests with two contrast matrices.
- Understand how coding matters for Type-III ANOVA. The tabs I had
open last time I was looking at this:
http://goanna.cs.rmit.edu.au/~fscholer/anova.php
http://www.mail-archive.com/r-help@stat.math.ethz.ch/msg69781.html
https://stat.ethz.ch/pipermail/r-help/2007-October/143047.html
http://www.uni-kiel.de/psychologie/dwoll/r/ssTypes.php
* A good way to support magic functions like mgcv's s().
statsmodels wants this for things like
y ~ arima(2, 3)
y ~ garch(1, 1)
the cheap trick way of doing it is:
class ArimaModelType(object):
__patsy_magic__ = True
...
def arima(n, m):
return ArimaModelType(n, m)
and then in the factor type sniffing code detect these things and
separate them out from "real" factors.
* make sure that pickling works
- And make sure that if we allow it at all, then it's sustainable!
i.e. we'll be able to guarantee that if people pickle a ModelDesc or
Design or whatever now, then they'll be able to get it back later.
* Should EvalEnvironment.capture make a copy of the scope dictionaries?
- The effect would be to prevent later changes in the enclosing scope
from affecting predictions. Of course, we probably don't want to
make a *deep* copy of the scope, so there's still no guarantees --
changes to mutable objects within that scope would still be
visible. Perhaps we *could* get away with making a deep copy of all
mutable objects that are accessed during the initial build,
though... I think we'd need to special-case and ignore any READONLY
ndarrays, as a safety valve for people who have a giant data-set
they're referring to. of course, even a deep copy isn't enough --
they could call an immutable function which accesses mutable state.
- Josef points out that in long-running REPLs people often need to del
local variables to let memory be released, and if all their formulas
are going and making shallow copies of the environment then this
will be impossible. So making a shallow copy is probably out.
- The other approach would be to extend the state dependency checking
that we already want to do (to catch undeclared stateful
transforms), and have it not only check that building an isolated
row of data gives the same result as building the full list, but
also that re-building that same row later at prediction time gives
the same result as it did in the first place.
* Export information on which terms are marginal to which other ones
Marginality only makes sense within a numeric-interaction "bucket", so
this has to be computed in patsy.build and exported as part of
DesignMatrixColumnInfo. Then it can be used for Type II tests.
* Some way to specify the default contrast
* Support for R's magic "." term
- The "y ~ everything else" form
- The "what I had in this other ModelDesc" form (e.g., "y ~ . - a"
to drop the 'a' predictor from an old model)
- This will require that the formula->ModelDesc have access to the
data or previous formula...
* More stateful transforms:
- Splines
- 'cut': numeric->factor by quantile dichotomization
- Orthogonal polynomials
- 'code': takes a Categorical (or coerces to one), and optionally
a contrast, and does the standard contrast-coding. And
possibly this should replace _CatFactorEvaluator...
* Support for building sparse model matrices directly. (This should
be pretty straightforward when it comes to exploiting the intrinsic
sparsity of categorical factors; numeric factors that evaluate to a
sparse matrix directly might be slightly more complicated.)
* Real testing/support for formula syntax extensions
The tricky part here is making sure we produce something useful.
Use cases:
- multinomial log-linear modelling
- see below
Prior art:
- R package "lmer" interprets formulas like
y ~ x1 + x2 + (1 | foo) + (1 + x | bar)
- The R [[http://cran.r-project.org/web/packages/Formula/vignettes/Formula.pdf][Formula]] package, which has two features:
- you can write multivariate responses, like y1 + y2 ~ ... (in stock
R, this is interpreted as addition (!)).
- you can write multiple "parts" on each side, separated
by |. Basically these are treated as a list of design matrix
specifications, and there are ways to pull out the first, second
etc. on each side.
- R package "plm" uses Formula to allow formulas like:
y ~ x1 + x2
y ~ x1 + x2 | x3
y ~ x1 + x2 | . + x3
where the second part specifies "instrumental variables". I can't
tell if the second part has an implicit intercept.
- R package "frontier" uses Formula in a similar way, allowing
formulas like
y ~ x1 + x2
y ~ x1 + x2 | x3
where the first form computes an "error components frontier" and the
latter computes an "efficiency effects frontier" (where the part
after the | are regressors "used to explain the efficiency levels (Z
variables)"). The part after the bar does have an implicit
intercept.
- package AER uses this in its "ivreg" command, which seems similar
to plm. An example makes clear that "y ~ . | x1 + x2" works, and
presumably the "." means the same thing as it would in "y ~ ." for
lm.
- package betareg does "beta regression", and a formula like "y ~
x1 | x2" states that "x1" should be used for the "mean submodel"
and "x2" should be used for the "precision submodel". Its betatree
function extends this further to "y ~ x1 | x2 | c1 + c2" where
"c1", "c2" are "partitioning variables". AFAICT this means that it
basically does a CART-style tree division of the data based
on c1, c2, and then fits beta regression models x1 | x2 on each
subset.
- package "fdaMixed" uses formulas like
Y | id ~ fixed | random
where Y is a response variable, id is "a factor separating the
samples", and fixed and random are linear models for the fixed
and random effects. The 'id' part seems to be used to match
multiple samples from the same random effects group?
- package "growcurves" allows "y ~ fixed | random". If there is
no |, then there is a second argument (random.only) which is
consulted to determine whether the sole RHS argument is fixed or
random. (Maybe 'y ~ x1 + x2 + random(x3 + x3)' would be a better
syntax?)
- package "games" uses a syntax like "y ~ x1 + x2 | 0 | x3 |
z". There is another version with 8 entries instead of 4.
- package "metafor" does effect-size calculations using the syntax
"outcome ~ group | study" where each entry has to be a 2-level
factor. (And the 'weights' argument gives the actual numbers.)
- package "mhurdle" seems to describe a kind of multi-step process
via three-part formulas
y ~ x1 | x2 | x3
where "the first part describes the selection process if any, the
second part the regression equation, and the third part the
purchase infrequency process". You can fill in 0 if you want to
assume that some process doesn't actually apply (or leave out the
last one altogether).
- package "mlogit" uses three-part RHS formulas to specify different
parts of a multinomial logit model. "the first one contains the
alternative specific variables with generic coefficient, i.e. a
unique coefficient for all the alternatives; the second one
contains the individual specific variables for which one
coefficient is estimated for all the alternatives except one of
them ; the third one contains the alternative specific variables
with alternative specific coefficients...If a standard formula is
writen, it is assumed that there are only alternative specific
variables with generic coefficients."
The second RHS termlist has an intercept by default; for the other
two termlists any intercept is ignored in any case.
- package "polywog" does some clever polynomial basis function
fitting thing, and uses formulas like
y ~ x1 + x2 | z1 + z2
to mean basically the equivalent of
y ~ x1*x2 + z1 + z2
i.e., the first termlist gets a super-rich non-linear interaction
between all its entries, and the second is just entered linearly.
* Currently we don't distinguish between ordered and unordered categorical data.
Should that change?
* how should redundancy elimination and explicit factor matrices interact?
Example: If you do 1 + C(a, mat):C(b, mat), then currently it will
expand that to 1 + C(a, mat) + C(a, mat):C(b, mat), which is going to
be weird. Probably we should notice that the .contrast attribute in
these cases does not give us the option of full- versus reduced-rank
coding, and in redundancy.py we should note that such factors cannot
be "expanded".
* Profiling/optimization. There are lots of places where I use lazy
quadratic algorithms (or even exponential, in the case of the
non-redundant coding stuff). Perhaps worse is the heavy
multiplication used unconditionally to load data into the model
matrix. I'm pretty sure that at least most of the quadratic stuff
doesn't matter because it's n^2 where n is something like the
number of factors in an interaction term (and who has hundreds of
factors interacting in one term?), but it wouldn't hurt to run some
profiles to check. I think really what I mean is just, run timeit
on a 10-variable interaction to make sure it isn't completely
annoying.
* Possible optimization: let a stateful transform's memorize_chunk
function raise Stateless to indicate that actually, ha-ha, it turns
out that it doesn't need to memorize anything after all (b/c the
relevant data turns out to be specified explicitly in *args,
**kwargs).
Actually, this would be really useful for things like splines,
which need to do expensive quantile estimation, but not if knots
are specified.
Another use case: C(something_that's_already_categorical,
contrast=...). Note that this can't be detected until we do the first
round of evaluation.
A better interface would be memorize_needed(self, *args, **kwargs).
I guess we could even have memorize_passes_needed, but eh...
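Very rough sketch of the proposed (not yet existing) interface, using a
hypothetical spline transform:
    class BS(object):
        def memorize_needed(self, x, df=None, knots=None, **kwargs):
            # If the knots are given explicitly there is no expensive
            # quantile-estimation pass to do, so skip memorization.
            return knots is None
        # ... memorize_chunk / memorize_finish / transform as usual ...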
* Wacky idea: make factors into an actual stateful transform (one
that takes a dict-like object and returns a matrix or Categorical)
This would require:
- adding memorize_passes support to stateful transforms
- moving the factor memorization state inside an object (so it
wouldn't be factors that would be stateful transforms, factors would
be factories for stateful transforms)
---- patsy-0.4.1+git34-ga5b54c2/doc/API-reference.rst ----
``patsy`` API reference
==========================
This is a complete reference for everything you get when you `import
patsy`.
.. module:: patsy
.. ipython:: python
:suppress:
from patsy import *
Basic API
---------
.. autofunction:: dmatrix
.. autofunction:: dmatrices
.. autofunction:: incr_dbuilders
.. autofunction:: incr_dbuilder
.. autoexception:: PatsyError
:members:
Convenience utilities
---------------------
.. autofunction:: balanced
.. autofunction:: demo_data
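As a quick illustration, both helpers return plain dicts of columns that
can be passed as the ``data`` argument elsewhere in this reference (small
made-up calls):

.. ipython:: python

   balanced(a=2, b=2)
   demo_data("a", "b", "x1", "x2")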
Design metadata
---------------
.. autoclass:: DesignInfo
Here's an example of the most common way to get a :class:`DesignInfo`:
.. ipython:: python
mat = dmatrix("a + x", demo_data("a", "x", nlevels=3))
di = mat.design_info
.. attribute:: column_names
The names of each column, represented as a list of strings in
the proper order. Guaranteed to exist.
.. ipython:: python
di.column_names
.. attribute:: column_name_indexes
An :class:`~collections.OrderedDict` mapping column names (as
strings) to column indexes (as integers). Guaranteed to exist
and to be sorted from low to high.
.. ipython:: python
di.column_name_indexes
.. attribute:: term_names
The names of each term, represented as a list of strings in
the proper order. Guaranteed to exist. There is a one-to-many
relationship between terms and columns -- each term generates
one or more columns.
.. ipython:: python
di.term_names
.. attribute:: term_name_slices
An :class:`~collections.OrderedDict` mapping term names (as
strings) to Python :func:`slice` objects indicating which
columns correspond to each term. Guaranteed to exist. The slices
are guaranteed to be sorted from left to right and to cover the
whole range of columns with no overlaps or gaps.
.. ipython:: python
di.term_name_slices
.. attribute:: terms
A list of :class:`Term` objects representing each term. May be
None, for example if a user passed in a plain preassembled
design matrix rather than using the Patsy machinery.
.. ipython:: python
di.terms
[term.name() for term in di.terms]
.. attribute:: term_slices
An :class:`~collections.OrderedDict` mapping :class:`Term`
objects to Python :func:`slice` objects indicating which columns
correspond to which terms. Like :attr:`terms`, this may be None.
.. ipython:: python
di.term_slices
.. attribute:: factor_infos
A dict mapping factor objects to :class:`FactorInfo` objects
providing information about each factor. Like :attr:`terms`,
this may be None.
.. ipython:: python
di.factor_infos
.. attribute:: term_codings
An :class:`~collections.OrderedDict` mapping each :class:`Term`
object to a list of :class:`SubtermInfo` objects which together
describe how this term is encoded in the final design
matrix. Like :attr:`terms`, this may be None.
.. ipython:: python
di.term_codings
.. attribute:: builder
In versions of patsy before 0.4.0, this returned a
``DesignMatrixBuilder`` object which could be passed to
:func:`build_design_matrices`. Starting in 0.4.0,
:func:`build_design_matrices` now accepts :class:`DesignInfo`
objects directly, and writing ``f(design_info.builder)`` is now a
deprecated alias for simply writing ``f(design_info)``.
A number of convenience methods are also provided that take
advantage of the above metadata:
.. automethod:: describe
.. automethod:: linear_constraint
.. automethod:: slice
.. automethod:: subset
.. automethod:: from_array
.. autoclass:: FactorInfo
.. autoclass:: SubtermInfo
.. autoclass:: DesignMatrix
.. automethod:: __new__
.. _stateful-transforms-list:
Stateful transforms
-------------------
Patsy comes with a number of :ref:`stateful transforms
<stateful-transforms>` built in:
.. autofunction:: center
.. autofunction:: standardize
.. function:: scale(x, center=True, rescale=True, ddof=0)
An alias for :func:`standardize`, for R compatibility.
Finally, this is not itself a stateful transform, but it's useful if
you want to define your own:
.. autofunction:: stateful_transform
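As a rough sketch of what defining your own might look like, here is a toy
transform that centers a column, loosely mimicking :func:`center` (a sketch
only; see the stateful transforms chapter for the full protocol)::

    import numpy as np

    class MyCenter(object):
        def __init__(self):
            self._total = 0.0
            self._count = 0
        def memorize_chunk(self, x):
            # Called once per chunk of data during the memorization pass
            x = np.asarray(x, dtype=float)
            self._total += x.sum()
            self._count += x.shape[0]
        def memorize_finish(self):
            self._mean = self._total / self._count
        def transform(self, x):
            # Called to build the actual column(s), possibly on new data
            return np.asarray(x, dtype=float) - self._mean

    my_center = stateful_transform(MyCenter)
    # my_center(x) can now be used inside formulas, e.g. "y ~ my_center(x)"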
.. _categorical-coding-ref:
Handling categorical data
-------------------------
.. autoclass:: Treatment
.. autoclass:: Diff
.. autoclass:: Poly
.. autoclass:: Sum
.. autoclass:: Helmert
.. autoclass:: ContrastMatrix
Spline regression
-----------------
.. autofunction:: bs
.. autofunction:: cr
.. autofunction:: cc
.. autofunction:: te
Working with formulas programmatically
--------------------------------------
.. autoclass:: Term
.. data:: INTERCEPT
This is a pre-instantiated zero-factors :class:`Term` object
representing the intercept, useful for making your code clearer. Do
remember though that this is not a singleton object, i.e., you
should compare against it using ``==``, not ``is``.
.. autoclass:: LookupFactor
.. autoclass:: EvalFactor
.. autoclass:: ModelDesc
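For example, a formula like ``y ~ x1 + x1:x2`` can be assembled by hand
(a small sketch; the column names here are made up):

.. ipython:: python

   desc = ModelDesc([Term([LookupFactor("y")])],
                    [INTERCEPT,
                     Term([LookupFactor("x1")]),
                     Term([LookupFactor("x1"), LookupFactor("x2")])])
   desc.describe()

The same object can also be obtained from the formula string itself via
:meth:`ModelDesc.from_formula`.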
Working with the Python execution environment
---------------------------------------------
.. autoclass:: EvalEnvironment
:members:
Building design matrices
------------------------
.. autofunction:: design_matrix_builders
.. autofunction:: build_design_matrices
Missing values
--------------
.. autoclass:: NAAction
:members:
Linear constraints
------------------
.. autoclass:: LinearConstraint
Origin tracking
---------------
.. autoclass:: Origin
:members:
---- patsy-0.4.1+git34-ga5b54c2/doc/Makefile ----
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d _build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
-rm -rf _build/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) _build/html
@echo
@echo "Build finished. The HTML pages are in _build/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) _build/dirhtml
@echo
@echo "Build finished. The HTML pages are in _build/dirhtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) _build/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) _build/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) _build/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in _build/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) _build/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in _build/qthelp, like this:"
@echo "# qcollectiongenerator _build/qthelp/scikitssparse.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile _build/qthelp/scikitssparse.qhc"
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) _build/latex
@echo
@echo "Build finished; the LaTeX files are in _build/latex."
@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
"run these through (pdf)latex."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) _build/changes
@echo
@echo "The overview file is in _build/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) _build/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in _build/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) _build/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in _build/doctest/output.txt."
---- patsy-0.4.1+git34-ga5b54c2/doc/R-comparison.rst ----
.. _R-comparison:
Differences between R and Patsy formulas
===========================================
.. currentmodule:: patsy
Patsy has a very high degree of compatibility with R. Almost any
formula you would use in R will also work in Patsy -- with a few
caveats.
.. note:: All R quirks described herein were last verified with R
2.15.0.
Differences from R:
- Most obviously, we both support using arbitrary code to perform
variable transformations, but in Patsy this code is written in
Python, not R.
- Patsy has no ``%in%``. In R, ``a %in% b`` is identical to
``b:a``. Patsy only supports the ``b:a`` version of this syntax.
- In Patsy, only ``**`` can be used for exponentiation. In R, both
``^`` and ``**`` can be used for exponentiation, i.e., you can write
either ``(a + b)^2`` or ``(a + b)**2``. In Patsy (as in Python
generally), only ``**`` indicates exponentiation; ``^`` is ignored
by the parser (and if present, will be interpreted as a call to the
Python binary XOR operator).
- In Patsy, the left-hand side of a formula uses the same
evaluation rules as the right-hand side. In R, the left hand side is
treated as R code, so a formula like ``y1 + y2 ~ x1 + x2`` actually
regresses the *sum* of ``y1`` and ``y2`` onto the *set of
predictors* ``x1`` and ``x2``. In Patsy, the only difference
between the left-hand side and the right-hand side is that there is
no automatic intercept added to the left-hand side. (In this regard
Patsy is similar to the R enhanced formula package `Formula
<http://cran.r-project.org/web/packages/Formula/>`_.)
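For example (a sketch -- here ``y1``, ``y2``, ``x1``, ``x2`` stand for
columns in some ``data``)::

    # Python: the left-hand side matrix gets two columns, "y1" and "y2"
    lhs, rhs = dmatrices("y1 + y2 ~ x1 + x2", data)
    lhs.design_info.column_names   # ['y1', 'y2']

In R, the same formula instead produces a single response column holding
the elementwise sum ``y1 + y2``.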
- Patsy produces a different column ordering for formulas involving
numeric predictors. In R, there are two rules for term ordering:
first, lower-order interactions are sorted before higher-order
interactions, and second, interactions of the same order are listed
in whatever order they appeared in the formula. In Patsy, we add
another rule: terms are first grouped together based on which
numeric factors they include. Then within each group, we use the
same ordering as R.
- Patsy has more rigorous handling of the presence or absence of
the intercept term. In R, the rules for when deciding whether to
include an intercept are somewhat idiosyncratic and can ignore
things like parentheses. To understand the difference, first
consider the formula ``a + (b - a)``. In both Patsy and R, we
first evaluate the ``(b - a)`` part; since there is no ``a`` term to
remove, this simplifies to just ``b``. We then evaluate ``a + b``:
the end result is a model which contains an ``a`` term in it.
Now consider the formula ``1 + (b - 1)``. In Patsy, this is
analogous to the case above: first ``(b - 1)`` is reduced to just ``b``,
and then ``1 + b`` produces a model with intercept included. In R, the
parentheses are ignored, and ``1 + (b - 1)`` gives a model that does
*not* include the intercept.
This can be slightly more confusing when it comes to the implicit
intercept term. In Patsy, this is handled exactly as if the
right-hand side of each formula has an invisible ``"1 +"`` inserted at
the beginning. Therefore in Patsy, these formulas are different::
# Python:
dmatrices("y ~ b - 1") # equivalent to 1 + b - 1: no intercept
dmatrices("y ~ (b - 1)") # equivalent to 1 + (b - 1): has intercept
In R, these two formulas are equivalent.
- Patsy has a more accurate algorithm for deciding whether to use a
full- or reduced-rank coding scheme for categorical factors. There
are two situations in which R's coding algorithm for categorical
variables can become confused and produce over- or under-specified
model matrices. Patsy, so far as we are aware, produces correctly
specified matrices in all cases. It's unlikely that you'll run into
these in actual usage, but they're worth mentioning. To illustrate,
let's define ``a`` and ``b`` as categorical predictors, each with 2
levels:
.. code-block:: rconsole
# R:
> a <- factor(c("a1", "a1", "a2", "a2"))
> b <- factor(c("b1", "b2", "b1", "b2"))
.. ipython:: python
:suppress:
a = ["a1", "a1", "a2", "a2"]
b = ["b1", "b2", "b1", "b2"]
from patsy import dmatrix
The first problem occurs for formulas like ``1 + a:b``. This produces
a model matrix with rank 4, just like many other formulas that
include ``a:b``, such as ``0 + a:b``, ``1 + a + a:b``, and ``a*b``:
.. code-block:: rconsole
# R:
> qr(model.matrix(~ 1 + a:b))$rank
[1] 4
However, the matrix produced for this formula has 5 columns, meaning
that it contains redundant overspecification:
.. code-block:: rconsole
# R:
> mat <- model.matrix(~ 1 + a:b)
> ncol(mat)
[1] 5
The underlying problem is that R's algorithm does not pay attention
to 'non-local' redundancies -- it will adjust its coding to avoid a
redundancy between two terms of degree-n, or a term of degree-n and
one of degree-(n+1), but it is blind to a redundancy between a term
of degree-n and one of degree-(n+2), as we have here.
Patsy's algorithm has no such limitation:
.. ipython:: python
# Python:
a = ["a1", "a1", "a2", "a2"]
b = ["b1", "b2", "b1", "b2"]
mat = dmatrix("1 + a:b")
mat.shape[1]
To produce this result, it codes ``a:b`` using the same columns that
would be used to code ``b + a:b`` in the formula ``"1 + b + a:b"``.
The second problem occurs for formulas involving numeric
predictors. Effectively, when determining coding schemes, R assumes
that all factors are categorical. So for the formula ``0 + a:c +
a:b``, R will notice that if it used a full-rank coding for the ``c``
and ``b`` factors, then both terms would be collinear with ``a``, and
thus each other. Therefore, it encodes ``c`` with a full-rank
encoding, and uses a reduced-rank encoding for ``b``. (And the ``0 +``
lets it avoid the previous bug.) So far, so good.
But now consider the formula ``0 + a:x + a:b``, where ``x`` is
numeric. Here, ``a:x`` and ``a:b`` will not be collinear, even if we do
use a full-rank encoding for ``b``. Therefore, we *should* use a
full-rank encoding for ``b``, and produce a model matrix with 6
columns. But in fact, R gives us only 4:
.. code-block:: rconsole
# R:
> x <- c(1, 2, 3, 4)
> mat <- model.matrix(~ 0 + a:x + a:b)
> ncol(mat)
[1] 4
The problem is that it cannot tell the difference between ``0 + a:x +
a:b`` and ``0 + a:c + a:b``: it uses the same coding for both, whether
it's appropriate or not.
(The alert reader might wonder whether this bug could be triggered
by a simpler formula, like ``0 + x + b``. It turns out that R's
``do_modelmatrix`` function has a special case where, for first-order
interactions only, it *will* peek at the type of the data before
deciding on a coding scheme.)
Patsy always checks whether each factor is categorical or numeric
before it makes coding decisions, and thus handles this case
correctly:
.. ipython:: python
# Python:
x = [1, 2, 3, 4]
mat = dmatrix("0 + a:x + a:b")
mat.shape[1]
---- patsy-0.4.1+git34-ga5b54c2/doc/_examples/add_predictors.py ----
from patsy import ModelDesc, Term, LookupFactor

def add_predictors(base_formula, extra_predictors):
    desc = ModelDesc.from_formula(base_formula)
    # Using LookupFactor here ensures that everything will work correctly even
    # if one of the column names in extra_predictors is named like "weight.in.kg"
    # or "sys.exit()" or "LittleBobbyTables()".
    desc.rhs_termlist += [Term([LookupFactor(p)]) for p in extra_predictors]
    return desc
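
# A hypothetical usage sketch (column names and data below are made up):
#
#   from patsy import dmatrices
#   desc = add_predictors("y ~ x1 + x2", ["extra1", "extra2"])
#   desc.describe()               # -> "y ~ x1 + x2 + extra1 + extra2"
#   y, X = dmatrices(desc, data)  # 'data' must supply all of these columns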
---- patsy-0.4.1+git34-ga5b54c2/doc/_examples/example_lm.py ----
import numpy as np
from patsy import dmatrices, build_design_matrices

class LM(object):
    """An example ordinary least squares linear model class, analogous to R's
    lm() function. Don't use this in real life, it isn't properly tested."""
    def __init__(self, formula_like, data={}):
        y, x = dmatrices(formula_like, data, 1)
        self.nobs = x.shape[0]
        self.betas, self.rss, _, _ = np.linalg.lstsq(x, y)
        self._y_design_info = y.design_info
        self._x_design_info = x.design_info

    def __repr__(self):
        summary = ("Ordinary least-squares regression\n"
                   "  Model: %s ~ %s\n"
                   "  Regression (beta) coefficients:\n"
                   % (self._y_design_info.describe(),
                      self._x_design_info.describe()))
        for name, value in zip(self._x_design_info.column_names, self.betas):
            summary += "    %s:  %0.3g\n" % (name, value[0])
        return summary

    def predict(self, new_data):
        (new_x,) = build_design_matrices([self._x_design_info],
                                         new_data)
        return np.dot(new_x, self.betas)

    def loglik(self, new_data):
        (new_y, new_x) = build_design_matrices([self._y_design_info,
                                                self._x_design_info],
                                               new_data)
        new_pred = np.dot(new_x, self.betas)
        sigma2 = self.rss / self.nobs
        # It'd be more elegant to use scipy.stats.norm.logpdf here, but adding
        # a dependency on scipy makes the docs build more complicated:
        Z = -0.5 * np.log(2 * np.pi * sigma2)
        return Z + -0.5 * (new_y - new_pred) ** 2 / sigma2
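
# A hypothetical usage sketch (the data dict below is made up):
#
#   m = LM("y ~ x", {"x": [1, 2, 3, 4], "y": [1.9, 4.1, 5.8, 8.2]})
#   print(m)                  # shows the fitted coefficients
#   m.predict({"x": [5, 6]})  # predictions for new rows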
---- patsy-0.4.1+git34-ga5b54c2/doc/_examples/example_treatment.py ----
import numpy as np
from patsy import ContrastMatrix

class MyTreat(object):
    def __init__(self, reference=0):
        self.reference = reference

    def code_with_intercept(self, levels):
        return ContrastMatrix(np.eye(len(levels)),
                              ["[My.%s]" % (level,) for level in levels])

    def code_without_intercept(self, levels):
        eye = np.eye(len(levels) - 1)
        contrasts = np.vstack((eye[:self.reference, :],
                               np.zeros((1, len(levels) - 1)),
                               eye[self.reference:, :]))
        suffixes = ["[MyT.%s]" % (level,) for level in
                    levels[:self.reference] + levels[self.reference + 1:]]
        return ContrastMatrix(contrasts, suffixes)
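
# A hypothetical usage sketch (the data below is made up):
#
#   from patsy import dmatrix
#   data = {"a": ["a1", "a2", "a3", "a1"]}
#   dmatrix("C(a, MyTreat)", data)               # reference level 0
#   dmatrix("C(a, MyTreat(reference=1))", data)  # reference level 1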
---- patsy-0.4.1+git34-ga5b54c2/doc/_static/closelabel.png ----
(binary PNG image data omitted)