pax_global_header00006660000000000000000000000064132527720050014515gustar00rootroot0000000000000052 comment=1c69998ee07f7c00b5f2bacc18c7d798f3ee0d3c datalad-0.9.3/000077500000000000000000000000001325277200500131205ustar00rootroot00000000000000datalad-0.9.3/.asv/000077500000000000000000000000001325277200500137675ustar00rootroot00000000000000datalad-0.9.3/.coveragerc000066400000000000000000000000551325277200500152410ustar00rootroot00000000000000[paths] source = datalad/ */datalad/ datalad-0.9.3/.github/000077500000000000000000000000001325277200500144605ustar00rootroot00000000000000datalad-0.9.3/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000002431325277200500202600ustar00rootroot00000000000000This pull request fixes # This pull request proposes ### Changes - [x] This change is complete - [ ] This is in progress Please have a look @datalad/developers datalad-0.9.3/.github/issue_template.md000066400000000000000000000006561325277200500200340ustar00rootroot00000000000000#### What is the problem? #### What steps will reproduce the problem? #### What version of DataLad are you using (run `datalad --version`)? On what operating system (consider running `datalad plugin wtf`)? #### Is there anything else that would be useful to know in this context? #### Have you had any success using DataLad before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders) datalad-0.9.3/.gitignore000066400000000000000000000003371325277200500151130ustar00rootroot00000000000000.pybuild/ datalad.build/ /.idea .coverage /venv* /cfgs /dist /fixtures /diagrams /build /docs/build /docs/source/generated /.tox /repos tmp .noseids *.egg-info *.py[coe] *.bak .#* .orig .rej .*.swp .travis.yml.evil-bd .asv datalad-0.9.3/.gitmodules000066400000000000000000000001131325277200500152700ustar00rootroot00000000000000[submodule ".asv"] path = .asv url = https://github.com/datalad/.asv.git datalad-0.9.3/.mailmap000066400000000000000000000010051325277200500145350ustar00rootroot00000000000000Alejandro de la Vega Benjamin Poldrack Benjamin Poldrack Christian Olaf Häusler chris Debanjum Singh Solanky Debanjum debanjum Jason Gors Michael Hanke mih Yaroslav Halchenko Yaroslav Halchenko datalad-0.9.3/.travis.yml000066400000000000000000000244521325277200500152400ustar00rootroot00000000000000# vim ft=yaml # travis-ci.org definition for DataLad build language: python python: - 2.7 - 3.4 cache: - apt env: global: # will be used in the matrix, where neither other variable is used - BOTO_CONFIG=/tmp/nowhere - DATALAD_TESTS_SSH=1 - DATALAD_LOG_CMD_ENV=GIT_SSH_COMMAND - TESTS_TO_PERFORM= - NOSE_OPTS=-s - NOSE_SELECTION_OP="not " # so it would be "not (integration or usecase)" # Special settings/helper for combined coverage from special remotes execution - COVERAGE=coverage - DATALAD_DATASETS_TOPURL=http://datasets-tests.datalad.org matrix: - DATALAD_REPO_DIRECT=yes - DATALAD_REPO_VERSION=6 - DATALAD_REPO_VERSION=5 # Disabled since old folks don't want to change workflows of submitting through central authority # - secure: "k2rHdTBjUU3pUUASqfRr7kHaaSmNKLLAR2f66A0fFSulih4CXxwLrR3g8/HP9m+jMve8mAYEiPSI7dT7cCm3WMA/piyLh2wKCGgzDD9oLjtvPAioR8dgLpzbgjxV/Vq6fwwPMlvbqqa+MmAImnAoSufEmI7zVQHCq11Hd5nd6Es=" # - secure: "Az7Pzo5pSdAFTTX6XXzE4VUS3wnlIe1vi/+bfHBzDjxstDvZVkPjPzaIs6v/BLod43AYBl1v9dyJR4qnBnaVrUDLB3tC0emLhJ2qnw+8GKHSSImCwIOeZzg9QpXeVQovZUqQVQ3fg3KIWCIzhmJ59EbMQcI4krNDxk4WcXmyVfk=" matrix: include: # Additional custom ones - python: 2.7 # By default no logs will be output. 
This one is to test with log output at INFO level env: - _DATALAD_UPSTREAM_GITPYTHON=1 - _DATALAD_UPSTREAM_GITANNEX=1 - _DATALAD_NONPR_ONLY=1 # Just so we test if we did not screw up running under nose without -s as well - NOSE_OPTS= - python: 2.7 # Run slow etc tests under a single tricky scenario env: - NOSE_SELECTION_OP="" - TMPDIR="/var/tmp/sym link" # And the leading - in filenames for the most challenge - DATALAD_TESTS_OBSCURE_PREFIX=- - python: 3.4 # Run slow etc tests under a single tricky scenario env: - NOSE_SELECTION_OP="" - TMPDIR="/var/tmp/sym link" # And the leading - in filenames for the most challenge - DATALAD_TESTS_OBSCURE_PREFIX=- - python: 2.7 # By default no logs will be output. This one is to test with log output at INFO level env: - DATALAD_LOG_LEVEL=INFO - DATALAD_LOG_TRACEBACK=1 # just a smoke test for now - _DATALAD_NONPR_ONLY=1 - python: 2.7 # By default no logs will be output. This one is to test with low level but dumped to /dev/null env: - DATALAD_LOG_LEVEL=2 - DATALAD_LOG_TARGET=/dev/null - DATALAD_LOG_TRACEBACK=collide # just a smoke test for now - DATALAD_API_ALWAYSRENDER=1 - DATALAD_TESTS_PROTOCOLREMOTE=1 - DATALAD_TESTS_DATALADREMOTE=1 - DATALAD_LOG_CMD_CWD=1 - DATALAD_LOG_CMD_OUTPUTS=1 - DATALAD_LOG_CMD_ENV=1 - DATALAD_LOG_CMD_STDIN=1 - DATALAD_TESTS_UI_BACKEND=console - DATALAD_TESTS_OBSCURE_PREFIX=- - DATALAD_SEED=1 - GIT_AUTHOR_DATE="Thu, 07 Apr 2005 22:13:13 +0200" - GIT_AUTHOR_NAME=blah - GIT_AUTHOR_EMAIL=committer@example.com - GIT_COMMITTER_DATE="Thu, 07 Apr 2005 22:13:13 +0200" - GIT_COMMITTER_NAME=blah - GIT_COMMITTER_EMAIL=committer@example.com - python: 2.7 env: # to test operation under root since also would consider FS "crippled" due to # ability to rewrite R/O files - NOSE_WRAPPER="sudo -E" # no key authentication for root: - DATALAD_TESTS_SSH=0 - _DATALAD_NONPR_ONLY=1 - python: 2.7 env: - DATALAD_TESTS_NONETWORK=1 # must operate nicely with those env variables set - http_proxy= - https_proxy= - _DATALAD_NONPR_ONLY=1 - python: 2.7 env: - PYTHONPATH=$PWD/tools/testing/bad_internals/_scrapy/ - _DATALAD_NONPR_ONLY=1 - python: 2.7 # To make sure that operates correctly whenever dataset directories have symlinks # in their path. env: # Eventually we will get there, but atm causes a good number of failures # - TMPDIR="/tmp/sym ссылка" - TMPDIR="/tmp/sym link" - _DATALAD_NONPR_ONLY=1 - python: 2.7 # apparently moving symlink outside has different effects on abspath # see https://github.com/datalad/datalad/issues/878 env: - TMPDIR="/var/tmp/sym link" - python: 2.7 # To make sure that operates correctly whenever dataset directories have symlinks # in their path. 
env: # To make orthogonal test where it is not a symlink but a dir with spaces - TMPDIR="/tmp/d i r" - _DATALAD_NONPR_ONLY=1 - python: 2.7 # Test under NFS mount (only selected sub-set) env: - TMPDIR="/tmp/nfsmount" - TESTS_TO_PERFORM="datalad/tests datalad/support" - python: 2.7 # Test under NFS mount (full, only in master) env: - TMPDIR="/tmp/nfsmount" - _DATALAD_NONPR_ONLY=1 - python: 2.7 # test whether known v6 failures still fail env: - DATALAD_TESTS_SSH=1 - DATALAD_REPO_VERSION=6 - DATALAD_TESTS_KNOWNFAILURES_SKIP=no - DATALAD_TESTS_KNOWNFAILURES_PROBE=yes - python: 2.7 # test whether known direct mode failures still fail env: - DATALAD_TESTS_SSH=1 - DATALAD_REPO_DIRECT=yes - DATALAD_TESTS_KNOWNFAILURES_SKIP=no - DATALAD_TESTS_KNOWNFAILURES_PROBE=yes # run if git-annex version in neurodebian -devel differs - python: 2.7 env: - _DATALAD_DEVEL_ANNEX=1 allow_failures: # Test under NFS mount (full, only in master) - python: 2.7 env: - TMPDIR="/tmp/nfsmount" - _DATALAD_NONPR_ONLY=1 # test whether known v6 failures still fail - env: - DATALAD_TESTS_SSH=1 - DATALAD_REPO_VERSION=6 - DATALAD_TESTS_KNOWNFAILURES_SKIP=no - DATALAD_TESTS_KNOWNFAILURES_PROBE=yes # test whether known direct mode failures still fail - env: - DATALAD_TESTS_SSH=1 - DATALAD_REPO_DIRECT=yes - DATALAD_TESTS_KNOWNFAILURES_SKIP=no - DATALAD_TESTS_KNOWNFAILURES_PROBE=yes # run if git-annex version in neurodebian -devel differs - python: 2.7 env: - _DATALAD_DEVEL_ANNEX=1 # Causes complete laptop or travis instance crash atm, but survives in a docker # need to figure it out (looks like some PID explosion) # - python: 3.5 # # we would need to migrate to boto3 to test it fully, but SSH should work # env: # - DATALAD_TESTS_SSH=1 # - UNSET_S3_SECRETS=1 before_install: - if [ ! "${TRAVIS_PULL_REQUEST:-false}" = "false" ] && [ ! -z "${_DATALAD_NONPR_ONLY:-}" ]; then echo "Exiting early since these tests should run only in master branch"; exit 0; fi # Just in case we need to check if nfs is there etc - sudo lsmod # The ultimate one-liner setup for NeuroDebian repository - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - travis_retry sudo apt-get update -qq - travis_retry sudo apt-get install eatmydata # to speedup some installations - sudo eatmydata tools/ci/prep-travis-forssh-sudo.sh - tools/ci/prep-travis-forssh.sh # Install grunt-cli - eatmydata npm install grunt-cli # Install optionally upstream current development so we are sure that they break nothing important for us - if [ ! -z "${_DATALAD_UPSTREAM_GITPYTHON:-}" ]; then pip install https://github.com/gitpython-developers/GitPython/archive/master.zip; fi - if [ ! -z "${_DATALAD_UPSTREAM_GITANNEX:-}" ]; then sudo tools/ci/install-annex-snapshot.sh; sudo ln -s `find /usr/local/lib/git-annex.linux -maxdepth 1 -type f -perm /+x` /usr/local/bin/; else sudo eatmydata apt-get install git-annex-standalone ; fi # Install optionally -devel version of annex, and if goes wrong (we have most recent), exit right away - if [ ! 
-z "${_DATALAD_DEVEL_ANNEX:-}" ]; then tools/ci/prep-travis-devel-annex.sh || exit 0; fi install: # Install standalone build of git-annex for the recent enough version - travis_retry sudo eatmydata apt-get install zip pandoc - travis_retry sudo eatmydata apt-get install shunit2 - git config --global user.email "test@travis.land" - git config --global user.name "Travis Almighty" - cd ..; pip install -q codecov; cd - - pip install -r requirements-devel.txt - pip install 'sphinx>=1.6.2' # So we could test under sudo -E with PATH pointing to installed location - sudo sed -i -e 's/^Defaults.*secure_path.*$//' /etc/sudoers # TMPDIRs - if [[ "${TMPDIR:-}" =~ .*/sym\ link ]]; then echo "Symlinking $TMPDIR"; ln -s /tmp "$TMPDIR"; fi - if [[ "${TMPDIR:-}" =~ .*/d\ i\ r ]]; then echo "mkdir $TMPDIR"; mkdir -p "$TMPDIR"; fi - if [[ "${TMPDIR:-}" =~ .*/nfsmount ]]; then echo "mkdir $TMPDIR"; mkdir -p "$TMPDIR" "${TMPDIR}_"; echo "/tmp/nfsmount_ localhost(rw)" | sudo bash -c 'cat - > /etc/exports'; sudo apt-get install -y nfs-kernel-server; sudo exportfs -a; sudo mount -t nfs localhost:/tmp/nfsmount_ /tmp/nfsmount; fi # S3 - if [ ! -z "$UNSET_S3_SECRETS" ]; then echo "usetting"; unset DATALAD_datalad_test_s3_key_id DATALAD_datalad_test_s3_secret_id; fi # Install grunt to test run javascript frontend tests - npm install grunt - npm install grunt-contrib-qunit script: # Verify that setup.py build doesn't puke - python setup.py build # Run tests - WRAPT_DISABLE_EXTENSIONS=1 PATH=$PWD/tools/coverage-bin:$PATH $NOSE_WRAPPER `which nosetests` $NOSE_OPTS -v -A "$NOSE_SELECTION_OP(integration or usecase or slow)" --with-doctest --doctest-tests --with-cov --cover-package datalad --logging-level=INFO $TESTS_TO_PERFORM # Generate documentation and run doctests # but do only when we do not have obnoxious logging turned on -- something screws up sphinx on travis - if [ ! "${DATALAD_LOG_LEVEL:-}" = 2 ]; then PYTHONPATH=$PWD $NOSE_WRAPPER make -C docs html doctest; fi # Run javascript tests - grunt test --verbose # Run doc examples if no spaces in the TMPDIR and SSH is allowed - if [ $DATALAD_TESTS_SSH != 1 ] || echo "${TMPDIR:-}" | grep -q ' '; then echo "skipping due spaces in $TMPDIR"; else $NOSE_WRAPPER tools/testing/run_doc_examples; fi # Test installation system-wide - sudo pip install . # Report WTF information using system wide installed version - datalad plugin wtf after_success: - coverage combine -a /tmp/.coverage-entrypoints-* - codecov # makes it only more difficult to comprehend the failing output. Enable only when necessary # for a particular debugging #after_failure: # - if [ ! -z "$DATALAD_TESTS_NONETWORK" ]; then sudo route add -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi # - DATALAD_LOG_LEVEL=DEBUG $NOSE_WRAPPER `which nosetests` -s -v --with-doctest --doctest-tests --with-cov --cover-package datalad --logging-level=DEBUG # - if [ ! -z "$DATALAD_TESTS_NONETWORK" ]; then sudo route del -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi datalad-0.9.3/CHANGELOG.md000066400000000000000000000610201325277200500147300ustar00rootroot00000000000000 ____ _ _ _ | _ \ __ _ | |_ __ _ | | __ _ __| | | | | | / _` | | __| / _` | | | / _` | / _` | | |_| | | (_| | | |_ | (_| | | |___ | (_| | | (_| | |____/ \__,_| \__| \__,_| |_____| \__,_| \__,_| Change Log This is a high level and scarce summary of the changes between releases. We would recommend to consult log of the [DataLad git repository](http://github.com/datalad/datalad) for more details. 
## 0.9.3 (Mar 16, 2018) -- pi+0.02 release Some important bug fixes which should improve usability ### Fixes - `datalad-archives` special remote now will lock on acquiring or extracting an archive - this allows for it to be used with -J flag for parallel operation - relax introduced in 0.9.2 demand on git being configured for datalad operation - now we will just issue a warning - `datalad ls` should now list "authored date" and work also for datasets in detached HEAD mode - `datalad save` will now save original file as well, if file was "git mv"ed, so you can now `datalad run git mv old new` and have changes recorded ### Enhancements and new features - `--jobs` argument now could take `auto` value which would decide on # of jobs depending on the # of available CPUs. `git-annex` > 6.20180314 is recommended to avoid regression with -J. - memoize calls to `RI` meta-constructor -- should speed up operation a bit - `DATALAD_SEED` environment variable could be used to seed Python RNG and provide reproducible UUIDs etc (useful for testing and demos) ## 0.9.2 (Mar 04, 2017) -- it is (again) better than ever Largely a bugfix release with a few enhancements. ### Fixes - Execution of external commands (git) should not get stuck when lots of both stdout and stderr output, and should not loose remaining output in some cases - Config overrides provided in the command line (-c) should now be handled correctly - Consider more remotes (not just tracking one, which might be none) while installing subdatasets - Compatibility with git 2.16 with some changed behaviors/annotations for submodules - Fail `remove` if `annex drop` failed - Do not fail operating on files which start with dash (-) - URL unquote paths within S3, URLs and DataLad RIs (///) - In non-interactive mode fail if authentication/access fails - Web UI: - refactored a little to fix incorrect listing of submodules in subdirectories - now auto-focuses on search edit box upon entering the page - Assure that extracted from tarballs directories have executable bit set ### Enhancements and new features - A log message and progress bar will now inform if a tarball to be downloaded while getting specific files (requires git-annex > 6.20180206) - A dedicated `datalad rerun` command capable of rerunning entire sequences of previously `run` commands. **Reproducibility through VCS. Use `run` even if not interested in `rerun`** - Alert the user if `git` is not yet configured but git operations are requested - Delay collection of previous ssh connections until it is actually needed. 
Also do not require ':' while specifying ssh host - AutomagicIO: Added proxying of isfile, lzma.LZMAFile and io.open - Testing: - added DATALAD_DATASETS_TOPURL=http://datasets-tests.datalad.org to run tests against another website to not obscure access stats - tests run against temporary HOME to avoid side-effects - better unit-testing of interactions with special remotes - CONTRIBUTING.md describes how to setup and use `git-hub` tool to "attach" commits to an issue making it into a PR - DATALAD_USE_DEFAULT_GIT env variable could be used to cause DataLad to use default (not the one possibly bundled with git-annex) git - Be more robust while handling not supported requests by annex in special remotes - Use of `swallow_logs` in the code was refactored away -- less mysteries now, just increase logging level - `wtf` plugin will report more information about environment, externals and the system # 0.9.1 (Oct 01, 2017) -- "DATALAD!"(JBTM) Minor bugfix release ### Fixes - Should work correctly with subdatasets named as numbers of bool values (requires also GitPython >= 2.1.6) - Custom special remotes should work without crashing with git-annex >= 6.20170924 ## 0.9.0 (Sep 19, 2017) -- isn't it a lucky day even though not a Friday? ### Major refactoring and deprecations - the `files` argument of [save] has been renamed to `path` to be uniform with any other command - all major commands now implement more uniform API semantics and result reporting. Functionality for modification detection of dataset content has been completely replaced with a more efficient implementation - [publish] now features a `--transfer-data` switch that allows for a disambiguous specification of whether to publish data -- independent of the selection which datasets to publish (which is done via their paths). Moreover, [publish] now transfers data before repository content is pushed. ### Fixes - [drop] no longer errors when some subdatasets are not installed - [install] will no longer report nothing when a Dataset instance was given as a source argument, but rather perform as expected - [remove] doesn't remove when some files of a dataset could not be dropped - [publish] - no longer hides error during a repository push - publish behaves "correctly" for `--since=` in considering only the differences the last "pushed" state - data transfer handling while publishing with dependencies, to github - improved robustness with broken Git configuration - [search] should search for unicode strings correctly and not crash - robustify git-annex special remotes protocol handling to allow for spaces in the last argument - UI credentials interface should now allow to Ctrl-C the entry - should not fail while operating on submodules named with numerics only or by bool (true/false) names - [crawl] templates should not now override settings for `largefiles` if specified in `.gitattributes` ### Enhancements and new features - **Exciting new feature** [run] command to protocol execution of an external command and rerun computation if desired. 
See [screencast](http://datalad.org/features.html#reproducible-science)
- [save] now uses Git for detecting which subdatasets need to be inspected for
  potential changes, instead of performing a complete traversal of a dataset tree
- [add] looks for changes relative to the last committed state of a dataset
  to discover files to add more efficiently
- [diff] can now report untracked files in addition to modified files
- [uninstall] will check itself whether a subdataset is properly registered in
  a superdataset, even when no superdataset is given in a call
- [subdatasets] can now configure subdatasets for exclusion from recursive
  installation (`datalad-recursiveinstall` submodule configuration property)
- precrafted pipelines of [crawl] now will not override `annex.largefiles`
  setting if any was set within `.gitattributes` (e.g. by `datalad create --text-no-annex`)
- framework for screencasts: `tools/cast*` tools and sample cast scripts under
  `doc/casts` which are published at
  [datalad.org/features.html](http://datalad.org/features.html)
- new [project YouTube channel](https://www.youtube.com/channel/UCB8-Zf7D0DSzAsREoIt0Bvw)
- tests failing in direct and/or v6 modes marked explicitly

## 0.8.1 (Aug 13, 2017) -- the best birthday gift

Bugfixes

### Fixes

- Do not attempt to [update] a not installed sub-dataset
- In case of too many files to be specified for [get] or [copy_to], we will
  make multiple invocations of underlying git-annex command to not overfill
  command line
- More robust handling of unicode output in terminals which might not support it

### Enhancements and new features

- Ship a copy of numpy.testing to facilitate [test] without requiring numpy
  as dependency. Also allow to pass to command which test(s) to run
- In [get] and [copy_to] provide actual original requested paths, not the
  ones we deduced need to be transferred, solely for knowing the total

## 0.8.0 (Jul 31, 2017) -- it is better than ever

A variety of fixes and enhancements

### Fixes

- [publish] would now push merged `git-annex` branch even if no other changes
  were done
- [publish] should be able to publish using relative path within SSH URI
  (git hook would use relative paths)
- [publish] should better tolerate publishing to pure git and `git-annex`
  special remotes

### Enhancements and new features

- [plugin] mechanism came to replace [export]. See [export_tarball] for the
  replacement of [export]. Now it should be easy to extend datalad's interface
  with custom functionality to be invoked along with other commands.
- Minimalistic coloring of the results rendering
- [publish]/`copy_to` got progress bar report now and support of `--jobs`
- minor fixes and enhancements to crawler (e.g. support of recursive removes)

## 0.7.0 (Jun 25, 2017) -- when it works - it is quite awesome!

New features, refactorings, and bug fixes.

### Major refactoring and deprecations

- [add-sibling] has been fully replaced by the [siblings] command
- [create-sibling] and [unlock] have been re-written to support the same
  common API as most other commands

### Enhancements and new features

- [siblings] can now be used to query and configure a local repository by
  using the sibling name ``here``
- [siblings] can now query and set annex preferred content configuration.
This includes ``wanted`` (as previously supported in other commands), and now also ``required`` - New [metadata] command to interface with datasets/files [meta-data] - Documentation for all commands is now built in a uniform fashion - Significant parts of the documentation of been updated - Instantiate GitPython's Repo instances lazily ### Fixes - API documentation is now rendered properly as HTML, and is easier to browse by having more compact pages - Closed files left open on various occasions (Popen PIPEs, etc) - Restored basic (consumer mode of operation) compatibility with Windows OS ## 0.6.0 (Jun 14, 2017) -- German perfectionism This release includes a **huge** refactoring to make code base and functionality more robust and flexible - outputs from API commands could now be highly customized. See `--output-format`, `--report-status`, `--report-type`, and `--report-type` options for [datalad] command. - effort was made to refactor code base so that underlying functions behave as generators where possible - input paths/arguments analysis was redone for majority of the commands to provide unified behavior ### Major refactoring and deprecations - `add-sibling` and `rewrite-urls` were refactored in favor of new [siblings] command which should be used for siblings manipulations - 'datalad.api.alwaysrender' config setting/support is removed in favor of new outputs processing ### Fixes - Do not flush manually git index in pre-commit to avoid "Death by the Lock" issue - Deployed by [publish] `post-update` hook script now should be more robust (tolerate directory names with spaces, etc.) - A variety of fixes, see [list of pull requests and issues closed](https://github.com/datalad/datalad/milestone/41?closed=1) for more information ### Enhancements and new features - new [annotate-paths] plumbing command to inspect and annotate provided paths. Use `--modified` to summarize changes between different points in the history - new [clone] plumbing command to provide a subset (install a single dataset from a URL) functionality of [install] - new [diff] plumbing command - new [siblings] command to list or manipulate siblings - new [subdatasets] command to list subdatasets and their properties - [drop] and [remove] commands were refactored - `benchmarks/` collection of [Airspeed velocity](https://github.com/spacetelescope/asv/) benchmarks initiated. See reports at http://datalad.github.io/datalad/ - crawler would try to download a new url multiple times increasing delay between attempts. Helps to resolve problems with extended crawls of Amazon S3 - [CRCNS] crawler pipeline now also fetches and aggregates meta-data for the datasets from datacite - overall optimisations to benefit from the aforementioned refactoring and improve user-experience - a few stub and not (yet) implemented commands (e.g. `move`) were removed from the interface - Web frontend got proper coloring for the breadcrumbs and some additional caching to speed up interactions. See http://datasets.datalad.org - Small improvements to the online documentation. See e.g. [summary of differences between git/git-annex/datalad](http://docs.datalad.org/en/latest/related.html#git-git-annex-datalad) ## 0.5.1 (Mar 25, 2017) -- cannot stop the progress A bugfix release ### Fixes - [add] was forcing addition of files to annex regardless of settings in `.gitattributes`. 
Now that decision is left to annex by default - `tools/testing/run_doc_examples` used to run doc examples as tests, fixed up to provide status per each example and not fail at once - `doc/examples` - [3rdparty_analysis_workflow.sh](http://docs.datalad.org/en/latest/generated/examples/3rdparty_analysis_workflow.html) was fixed up to reflect changes in the API of 0.5.0. - progress bars - should no longer crash **datalad** and report correct sizes and speeds - should provide progress reports while using Python 3.x ### Enhancements and new features - `doc/examples` - [nipype_workshop_dataset.sh](http://docs.datalad.org/en/latest/generated/examples/nipype_workshop_dataset.html) new example to demonstrate how new super- and sub- datasets were established as a part of our datasets collection ## 0.5.0 (Mar 20, 2017) -- it's huge This release includes an avalanche of bug fixes, enhancements, and additions which at large should stay consistent with previous behavior but provide better functioning. Lots of code was refactored to provide more consistent code-base, and some API breakage has happened. Further work is ongoing to standardize output and results reporting (see [PR 1350]) ### Most notable changes - requires [git-annex] >= 6.20161210 (or better even >= 6.20161210 for improved functionality) - commands should now operate on paths specified (if any), without causing side-effects on other dirty/staged files - [save] - `-a` is deprecated in favor of `-u` or `--all-updates` so only changes known components get saved, and no new files automagically added - `-S` does no longer store the originating dataset in its commit message - [add] - can specify commit/save message with `-m` - [add-sibling] and [create-sibling] - now take the name of the sibling (remote) as a `-s` (`--name`) option, not a positional argument - `--publish-depends` to setup publishing data and code to multiple repositories (e.g. github + webserve) should now be functional see [this comment](https://github.com/datalad/datalad/issues/335#issuecomment-277240733) - got `--publish-by-default` to specify what refs should be published by default - got `--annex-wanted`, `--annex-groupwanted` and `--annex-group` settings which would be used to instruct annex about preferred content. [publish] then will publish data using those settings if `wanted` is set. - got `--inherit` option to automagically figure out url/wanted and other git/annex settings for new remote sub-dataset to be constructed - [publish] - got `--skip-failing` refactored into `--missing` option which could use new feature of [create-sibling] `--inherit` ### Fixes - More consistent interaction through ssh - all ssh connections go through [sshrun] shim for a "single point of authentication", etc. 
- More robust [ls] operation outside of the datasets - A number of fixes for direct and v6 mode of annex ### Enhancements and new features - New [drop] and [remove] commands - [clean] - got `--what` to specify explicitly what cleaning steps to perform and now could be invoked with `-r` - `datalad` and `git-annex-remote*` scripts now do not use setuptools entry points mechanism and rely on simple import to shorten start up time - [Dataset] is also now using [Flyweight pattern], so the same instance is reused for the same dataset - progressbars should not add more empty lines ### Internal refactoring - Majority of the commands now go through `_prep` for arguments validation and pre-processing to avoid recursive invocations ## 0.4.1 (Nov 10, 2016) -- CA release Requires now GitPython >= 2.1.0 ### Fixes - [save] - to not save staged files if explicit paths were provided - improved (but not yet complete) support for direct mode - [update] to not crash if some sub-datasets are not installed - do not log calls to `git config` to avoid leakage of possibly sensitive settings to the logs ### Enhancements and new features - New [rfc822-compliant metadata] format - [save] - -S to save the change also within all super-datasets - [add] now has progress-bar reporting - [create-sibling-github] to create a :term:`sibling` of a dataset on github - [OpenfMRI] crawler and datasets were enriched with URLs to separate files where also available from openfmri s3 bucket (if upgrading your datalad datasets, you might need to run `git annex enableremote datalad` to make them available) - various enhancements to log messages - web interface - populates "install" box first thus making UX better over slower connections ## 0.4 (Oct 22, 2016) -- Paris is waiting Primarily it is a bugfix release but because of significant refactoring of the [install] and [get] implementation, it gets a new minor release. ### Fixes - be able to [get] or [install] while providing paths while being outside of a dataset - remote annex datasets get properly initialized - robust detection of outdated [git-annex] ### Enhancements and new features - interface changes - [get] `--recursion-limit=existing` to not recurse into not-installed subdatasets - [get] `-n` to possibly install sub-datasets without getting any data - [install] `--jobs|-J` to specify number of parallel jobs for annex [get] call could use (ATM would not work when data comes from archives) - more (unit-)testing - documentation: see http://docs.datalad.org/en/latest/basics.html for basic principles and useful shortcuts in referring to datasets - various webface improvements: breadcrumb paths, instructions how to install dataset, show version from the tags, etc. ## 0.3.1 (Oct 1, 2016) -- what a wonderful week Primarily bugfixes but also a number of enhancements and core refactorings ### Fixes - do not build manpages and examples during installation to avoid problems with possibly previously outdated dependencies - [install] can be called on already installed dataset (with `-r` or `-g`) ### Enhancements and new features - complete overhaul of datalad configuration settings handling (see [Configuration documentation]), so majority of the environment. 
Now uses git format and stores persistent configuration settings under `.datalad/config` and local within `.git/config` variables we have used were renamed to match configuration names - [create-sibling] does not now by default upload web front-end - [export] command with a plug-in interface and `tarball` plugin to export datasets - in Python, `.api` functions with rendering of results in command line got a _-suffixed sibling, which would render results as well in Python as well (e.g., using `search_` instead of `search` would also render results, not only output them back as Python objects) - [get] - `--jobs` option (passed to `annex get`) for parallel downloads - total and per-download (with git-annex >= 6.20160923) progress bars (note that if content is to be obtained from an archive, no progress will be reported yet) - [install] `--reckless` mode option - [search] - highlights locations and fieldmaps for better readability - supports `-d^` or `-d///` to point to top-most or centrally installed meta-datasets - "complete" paths to the datasets are reported now - `-s` option to specify which fields (only) to search - various enhancements and small fixes to [meta-data] handling, [ls], custom remotes, code-base formatting, downloaders, etc - completely switched to `tqdm` library (`progressbar` is no longer used/supported) ## 0.3 (Sep 23, 2016) -- winter is coming Lots of everything, including but not limited to - enhanced index viewer, as the one on http://datasets.datalad.org - initial new data providers support: [Kaggle], [BALSA], [NDA], [NITRC] - initial [meta-data support and management] - new and/or improved crawler pipelines for [BALSA], [CRCNS], [OpenfMRI] - refactored [install] command, now with separate [get] - some other commands renaming/refactoring (e.g., [create-sibling]) - datalad [search] would give you an option to install datalad's super-dataset under ~/datalad if ran outside of a dataset ### 0.2.3 (Jun 28, 2016) -- busy OHBM New features and bugfix release - support of /// urls to point to http://datasets.datalad.org - variety of fixes and enhancements throughout ### 0.2.2 (Jun 20, 2016) -- OHBM we are coming! 
New feature and bugfix release - greately improved documentation - publish command API RFing allows for custom options to annex, and uses --to REMOTE for consistent with annex invocation - variety of fixes and enhancements throughout ### 0.2.1 (Jun 10, 2016) - variety of fixes and enhancements throughout ## 0.2 (May 20, 2016) Major RFing to switch from relying on rdf to git native submodules etc ## 0.1 (Oct 14, 2015) Release primarily focusing on interface functionality including initial publishing [git-annex]: http://git-annex.branchable.com/ [Kaggle]: https://www.kaggle.com [BALSA]: http://balsa.wustl.edu [NDA]: http://data-archive.nimh.nih.gov [NITRC]: https://www.nitrc.org [CRCNS]: http://crcns.org [FCON1000]: http://fcon_1000.projects.nitrc.org [OpenfMRI]: http://openfmri.org [Configuration documentation]: http://docs.datalad.org/config.html [Dataset]: http://docs.datalad.org/en/latest/generated/datalad.api.Dataset.html [Sibling]: http://docs.datalad.org/en/latest/glossary.html [rfc822-compliant metadata]: http://docs.datalad.org/en/latest/metadata.html#rfc822-compliant-meta-data [meta-data support and management]: http://docs.datalad.org/en/latest/cmdline.html#meta-data-handling [meta-data]: http://docs.datalad.org/en/latest/cmdline.html#meta-data-handling [add-sibling]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-add-sibling.html [add]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-add.html [annotate-paths]: http://docs.datalad.org/en/latest/generated/man/datalad-annotate-paths.html [clean]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-clean.html [clone]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-clone.html [copy_to]: http://docs.datalad.org/en/latest/_modules/datalad/support/annexrepo.html?highlight=%22copy_to%22 [create-sibling-github]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-create-sibling-github.html [create-sibling]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-create-sibling.html [datalad]: http://docs.datalad.org/en/latest/generated/man/datalad.html [drop]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-drop.html [export]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-export.html [export_tarball]: http://docs.datalad.org/en/latest/generated/datalad.plugin.export_tarball.html [get]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-get.html [install]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-install.html [ls]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-ls.html [metadata]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-metadata.html [plugin]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-plugin.html [publish]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-publish.html [remove]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-remove.html [run]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-run.html [save]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-save.html [search]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-search.html [siblings]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-siblings.html [sshrun]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-sshrun.html [update]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-update.html [Flyweight pattern]: https://en.wikipedia.org/wiki/Flyweight_pattern [PR 1350]: 
https://github.com/datalad/datalad/pull/1350 datalad-0.9.3/CODE_OF_CONDUCT.md000066400000000000000000000004611325277200500157200ustar00rootroot00000000000000# Code of Conduct * Be nice -- motivated people are more creative and more productive, the nature of any interaction should be such that it leads to more motivation, not less * Be constructive -- getting a fix for a problem, or identifying a concrete path forward should be the focus of any discussion datalad-0.9.3/CONTRIBUTING.md000066400000000000000000000447071325277200500153650ustar00rootroot00000000000000Contributing to DataLad ======================= [gh-datalad]: http://github.com/datalad/datalad Files organization ------------------ - `datalad/` is the main Python module where major development is happening, with major submodules being: - `cmdline/` - helpers for accessing `interface/` functionality from command line - `crawler/` - functionality for crawling (online) resources and creating or updating datasets and collections based on the scraped/downloaded data - `nodes/` processing elements which are used in the pipeline - `pipelines/` pipelines generators, to produce pipelines to be ran - `pipeline.py` pipeline runner - `customremotes/` - custom special remotes for annex provided by datalad - `downloaders/` - support for accessing data from various sources (e.g. http, S3, XNAT) via a unified interface. - `configs/` - specifications for known data providers and associated credentials - `interface/` - high level interface functions which get exposed via command line (`cmdline/`) or Python (`datalad.api`). - `tests/` - some unit- and regression- tests (more could be found under `tests/` of corresponding submodules) - `utils.py` provides convenience helpers used by unit-tests such as `@with_tree`, `@serve_path_via_http` and other decorators - `ui/` - user-level interactions, such as messages about errors, warnings, progress reports, AND when supported by available frontend -- interactive dialogs - `support/` - various support modules, e.g. for git/git-annex interfaces, constraints for the `interface/`, etc - `docs/` - yet to be heavily populated documentation - `bash-completions` - bash and zsh completion setup for datalad (just `source` it) - `fixtures/` currently not under git, contains generated by vcr fixtures - `tools/` contains helper utilities used during development, testing, and benchmarking of DataLad. Implemented in any most appropriate language (Python, bash, etc.) How to contribute ----------------- The preferred way to contribute to the DataLad code base is to fork the [main repository][gh-datalad] on GitHub. Here we outline the workflow used by the developers: 0. Have a clone of our main [project repository][gh-datalad] as `origin` remote in your git: git clone git://github.com/datalad/datalad 1. Fork the [project repository][gh-datalad]: click on the 'Fork' button near the top of the page. This creates a copy of the code base under your account on the GitHub server. 2. Add your forked clone as a remote to the local clone you already have on your local disk: git remote add gh-YourLogin git@github.com:YourLogin/datalad.git git fetch gh-YourLogin To ease addition of other github repositories as remotes, here is a little bash function/script to add to your `~/.bashrc`: ghremote () { url="$1" proj=${url##*/} url_=${url%/*} login=${url_##*/} git remote add gh-$login $url git fetch gh-$login } thus you could simply run: ghremote git@github.com:YourLogin/datalad.git to add the above `gh-YourLogin` remote. 
Additional handy aliases such as `ghpr` (to fetch existing pr from someone's remote) and `ghsendpr` could be found at [yarikoptic's bash config file](http://git.onerussian.com/?p=etc/bash.git;a=blob;f=.bash/bashrc/30_aliases_sh;hb=HEAD#l865) 3. Create a branch (generally off the `origin/master`) to hold your changes: git checkout -b nf-my-feature and start making changes. Ideally, use a prefix signaling the purpose of the branch - `nf-` for new features - `bf-` for bug fixes - `rf-` for refactoring - `doc-` for documentation contributions (including in the code docstrings). - `bm-` for changes to benchmarks We recommend to not work in the ``master`` branch! 4. Work on this copy on your computer using Git to do the version control. When you're done editing, do: git add modified_files git commit to record your changes in Git. Ideally, prefix your commit messages with the `NF`, `BF`, `RF`, `DOC`, `BM` similar to the branch name prefixes, but you could also use `TST` for commits concerned solely with tests, and `BK` to signal that the commit causes a breakage (e.g. of tests) at that point. Multiple entries could be listed joined with a `+` (e.g. `rf+doc-`). See `git log` for examples. If a commit closes an existing DataLad issue, then add to the end of the message `(Closes #ISSUE_NUMER)` 5. Push to GitHub with: git push -u gh-YourLogin nf-my-feature Finally, go to the web page of your fork of the DataLad repo, and click 'Pull request' (PR) to send your changes to the maintainers for review. This will send an email to the committers. You can commit new changes to this branch and keep pushing to your remote -- github automagically adds them to your previously opened PR. (If any of the above seems like magic to you, then look up the [Git documentation](http://git-scm.com/documentation) on the web.) Development environment ----------------------- Although we now support Python 3 (>= 3.3), primarily we still use Python 2.7 and thus instructions below are for python 2.7 deployments. Replace `python-{` with `python{,3}-{` to also install dependencies for Python 3 (e.g., if you would like to develop and test through tox). See [README.md:Dependencies](README.md#Dependencies) for basic information about installation of datalad itself. On Debian-based systems we recommend to enable [NeuroDebian](http://neuro.debian.net) since we use it to provide backports of recent fixed external modules we depend upon: ```sh apt-get install -y -q git git-annex-standalone apt-get install -y -q patool python-scrapy python-{appdirs,argcomplete,git,humanize,keyring,lxml,msgpack,mock,progressbar,requests,setuptools,six} ``` and additionally, for development we suggest to use tox and new versions of dependencies from pypy: ```sh apt-get install -y -q python-{dev,httpretty,nose,pip,vcr,virtualenv} python-tox # Some libraries which might be needed for installing via pip apt-get install -y -q lib{ffi,ssl,curl4-openssl,xml2,xslt1}-dev ``` some of which you could also install from PyPi using pip (prior installation of those libraries listed above might be necessary) ```sh pip install -r requirements-devel.txt ``` and you will need to install recent git-annex using appropriate for your OS means (for Debian/Ubuntu, once again, just use NeuroDebian). Documentation ------------- ### Docstrings We use [NumPy standard] for the description of parameters docstrings. If you are using PyCharm, set your project settings (`Tools` -> `Python integrated tools` -> `Docstring format`). 
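For illustration, here is a minimal sketch of such a NumPy-style docstring; the
function name and parameters are made up purely to show the layout:

```python
def count_subdatasets(path, recursive=False):
    """Count subdatasets under a dataset (hypothetical example).

    Parameters
    ----------
    path : str
      Path to the dataset to be inspected.
    recursive : bool, optional
      Whether to also descend into subdatasets of subdatasets.

    Returns
    -------
    int
      Number of subdatasets found.
    """
```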
[NumPy standard]: https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard In addition, we follow the guidelines of [Restructured Text] with the additional features and treatments provided by [Sphinx]. [Restructured Text]: http://docutils.sourceforge.net/docs/user/rst/quickstart.html [Sphinx]: http://www.sphinx-doc.org/en/stable/ Additional Hints ---------------- ### Merge commits For merge commits to have more informative description, add to your `.git/config` or `~/.gitconfig` following section: [merge] summary = true log = true and if conflicts occur, provide short summary on how they were resolved in "Conflicts" listing within the merge commit (see [example](https://github.com/datalad/datalad/commit/eb062a8009d160ae51929998771964738636dcc2)). ### Issue -> PR [git-hub](https://github.com/sociomantic/git-hub) utility allows to attach commits to an issue, thus effectively converting it into a pull request. This allows to avoid necessity to have 2 items (both issue and PR) which often would duplicate information and discussion. To use `git-hub` you first would need to configure it using `git hub setup` command which would create a section within your `.git/config` such as [hub] username= oauthtoken= upstream=datalad/datalad forkremote=YourLogin/datalad pullbase=master Then, if you are in a branch with the commits that you want to attach to an issue, thus making it into a pull request, you can use: git hub pull attach If you would like to use `git hub clone -t` to fork other projects, but would like to maintain our above convention (official repository as `origin`, not `upstream`, and your fork as `gh-yourlogin`, not `fork`), set following git configuration options globally: hub.upstreamremote = origin hub.forkremote = gh-YourLogin Quality Assurance ----------------- It is recommended to check that your contribution complies with the following rules before submitting a pull request: - All public methods should have informative docstrings with sample usage presented as doctests when appropriate. - All other tests pass when everything is rebuilt from scratch. - New code should be accompanied by tests. ### Tests `datalad/tests` contains tests for the core portion of the project, and more tests are provided under corresponding submodules in `tests/` subdirectories to simplify re-running the tests concerning that portion of the codebase. To execute many tests, the codebase first needs to be "installed" in order to generate scripts for the entry points. For that, the recommended course of action is to use `virtualenv`, e.g. ```sh virtualenv --system-site-packages venv-tests source venv-tests/bin/activate pip install -r requirements.txt python setup.py develop ``` and then use that virtual environment to run the tests, via ```sh python -m nose -s -v datalad ``` or similarly, ```sh nosetests -s -v datalad ``` then to later deactivate the virtualenv just simply enter ```sh deactivate ``` Alternatively, or complimentary to that, you can use `tox` -- there is a `tox.ini` file which sets up a few virtual environments for testing locally, which you can later reuse like any other regular virtualenv for troubleshooting. Additionally, [tools/testing/test_README_in_docker](tools/testing/test_README_in_docker) script can be used to establish a clean docker environment (based on any NeuroDebian-supported release of Debian or Ubuntu) with all dependencies listed in README.md pre-installed. 
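To complement the setup instructions above, here is a rough sketch of what a new
unit test could look like, using one of the helper decorators from
`datalad/tests/utils.py` mentioned in the files organization section. The test
name and assertion are made up for illustration, and decorator behavior may
differ between versions:

```python
import os.path as op

# helper decorators live in datalad/tests/utils.py
from datalad.tests.utils import with_tempfile


@with_tempfile(mkdir=True)
def test_something(path):
    # 'path' points to a temporary directory created (and cleaned up) for this test
    assert op.isdir(path)
```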
### CI setup We are using Travis-CI and have [buildbot setup](https://github.com/datalad/buildbot) which also exercises our tests battery for every PR and on the master. Note that buildbot runs tests only submitted by datalad developers, or if a PR acquires 'buildbot' label. In case if you want to enter buildbot's environment 1. Login to our development server (`smaug`) 2. Find container ID associated with the environment you are interested in, e.g. docker ps | grep nd16.04 3. Enter that docker container environment using docker exec -it /bin/bash 4. Become buildbot user su - buildbot 5. Activate corresponding virtualenv using source e.g. `source /home/buildbot/datalad-pr-docker-dl-nd15_04/build/venv-ci/bin/activate` And now you should be in the same environment as the very last tested PR. Note that the same path/venv is reused for all the PRs, so you might want first to check using `git show` under the `build/` directory if it corresponds to the commit you are interested to troubleshoot. ### Coverage You can also check for common programming errors with the following tools: - Code with good unittest coverage (at least 80%), check with: pip install nose coverage nosetests --with-coverage path/to/tests_for_package - We rely on https://codecov.io to provide convenient view of code coverage. Installation of the codecov extension for Firefox/Iceweasel or Chromium is strongly advised, since it provides coverage annotation of pull requests. ### Linting We are not (yet) fully PEP8 compliant, so please use these tools as guidelines for your contributions, but not to PEP8 entire code base. [beyond-pep8]: https://www.youtube.com/watch?v=wf-BqAjZb8M *Sidenote*: watch [Raymond Hettinger - Beyond PEP 8][beyond-pep8] - No pyflakes warnings, check with: pip install pyflakes pyflakes path/to/module.py - No PEP8 warnings, check with: pip install pep8 pep8 path/to/module.py - AutoPEP8 can help you fix some of the easy redundant errors: pip install autopep8 autopep8 path/to/pep8.py Also, some team developers use [PyCharm community edition](https://www.jetbrains.com/pycharm) which provides built-in PEP8 checker and handy tools such as smart splits/joins making it easier to maintain code following the PEP8 recommendations. NeuroDebian provides `pycharm-community-sloppy` package to ease pycharm installation even further. Easy Issues ----------- A great way to start contributing to DataLad is to pick an item from the list of [Easy issues](https://github.com/datalad/datalad/labels/easy) in the issue tracker. Resolving these issues allows you to start contributing to the project without much prior knowledge. Your assistance in this area will be greatly appreciated by the more experienced developers as it helps free up their time to concentrate on other issues. Various hints for developers ---------------------------- ### Useful tools - While performing IO/net heavy operations use [dstat](http://dag.wieers.com/home-made/dstat) for quick logging of various health stats in a separate terminal window: dstat -c --top-cpu -d --top-bio --top-latency --net - To monitor speed of any data pipelining [pv](http://www.ivarch.com/programs/pv.shtml) is really handy, just plug it in the middle of your pipe. 
- For remote debugging epdb could be used (avail in pip) by using
  `import epdb; epdb.serve()` in Python code and then connecting to it with
  `python -c "import epdb; epdb.connect()"`.
- We are using codecov which has extensions for the popular browsers
  (Firefox, Chrome) which annotate pull requests on github regarding changed coverage.

### Useful Environment Variables

Refer to datalad/config.py for information on how to add these environment
variables to the config file and their naming convention

- *DATALAD_DATASETS_TOPURL*: Used to point to an alternative location for the
  `///` dataset. If running tests, preferred to be set to
  http://datasets-tests.datalad.org
- *DATALAD_LOG_LEVEL*: Used to control the verbosity of logs printed to stdout
  while running datalad commands/debugging
- *DATALAD_LOG_CMD_OUTPUTS*: Used to control whether both stdout and stderr of
  external command execution are logged in detail (at DEBUG level)
- *DATALAD_LOG_CMD_ENV*: If contains a digit (e.g. 1), would log entire
  environment passed into the Runner.run's popen call. Otherwise could be a
  comma separated list of environment variables to log
- *DATALAD_LOG_CMD_STDIN*: Whether to log stdin for the command
- *DATALAD_LOG_CMD_CWD*: Whether to log the cwd where the command is to be executed
- *DATALAD_LOG_PID*: To instruct datalad to log PID of the process
- *DATALAD_LOG_TARGET*: Where to log: `stderr` (default), `stdout`, or another filename
- *DATALAD_LOG_TIMESTAMP*: Used to add a timestamp to datalad logs
- *DATALAD_LOG_TRACEBACK*: Runs TraceBack function with collide set to True,
  if this flag is set to 'collide'. This replaces any common prefix between
  current traceback log and previous invocation with "..."
- *DATALAD_EXC_STR_TBLIMIT*: This flag is used by the datalad extract_tb
  function which extracts and formats stack-traces. It caps the number of lines
  to DATALAD_EXC_STR_TBLIMIT of pre-processed entries from traceback.
- *DATALAD_SEED*: To seed Python's `random` RNG, which will also be used for
  generation of dataset UUIDs to make those random values reproducible.
  You might want also to set all the relevant git config variables like we do
  in one of the travis runs
- *DATALAD_TESTS_TEMP_KEEP*: Function rmtemp will not remove temporary
  file/directory created for testing if this flag is set
- *DATALAD_TESTS_TEMP_DIR*: Create a temporary directory at location specified
  by this flag. It is used by tests to create a temporary git directory while
  testing git annex archives etc
- *DATALAD_TESTS_NONETWORK*: Skips network tests completely if this flag is set.
  Examples include tests for s3, git_repositories, openfmri etc
- *DATALAD_TESTS_SSH*: Skips SSH tests if this flag is **not** set
- *DATALAD_TESTS_NOTEARDOWN*: Does not execute teardown_package which cleans up
  temp files and directories created by tests if this flag is set
- *DATALAD_TESTS_USECASSETTE*: Specifies the location of the file to record
  network transactions by the VCR module.
Currently used by when testing custom special remotes - *DATALAD_TESTS_OBSCURE_PREFIX*: A string to prefix the most obscure (but supported by the filesystem test filename - *DATALAD_TESTS_PROTOCOLREMOTE*: Binary flag to specify whether to test protocol interactions of custom remote with annex - *DATALAD_TESTS_RUNCMDLINE*: Binary flag to specify if shell testing using shunit2 to be carried out - *DATALAD_TESTS_TEMP_FS*: Specify the temporary file system to use as loop device for testing DATALAD_TESTS_TEMP_DIR creation - *DATALAD_TESTS_TEMP_FSSIZE*: Specify the size of temporary file system to use as loop device for testing DATALAD_TESTS_TEMP_DIR creation - *DATALAD_TESTS_NONLO*: Specifies network interfaces to bring down/up for testing. Currently used by travis. - *DATALAD_API_ALWAYSRENDER*: Would make api functions always use a version with cmdline output renderer (i.e. the one with `_` suffix) - *DATALAD_CMD_PROTOCOL*: Specifies the protocol number used by the Runner to note shell command or python function call times and allows for dry runs. 'externals-time' for ExecutionTimeExternalsProtocol, 'time' for ExecutionTimeProtocol and 'null' for NullProtocol. Any new DATALAD_CMD_PROTOCOL has to implement datalad.support.protocol.ProtocolInterface - *DATALAD_CMD_PROTOCOL_PREFIX*: Sets a prefix to add before the command call times are noted by DATALAD_CMD_PROTOCOL. - *DATALAD_USE_DEFAULT_GIT*: Instructs to use `git` as available in current environment, and not the one which possibly comes with git-annex (default behavior). # Changelog section For the upcoming release use this template ## 0.9.4 (??? ??, 2018) -- will be better than ever bet we will fix some bugs and make a world even a better place. ### Major refactoring and deprecations - hopefully none ### Fixes ? ### Enhancements and new features ? datalad-0.9.3/CONTRIBUTORS000066400000000000000000000003521325277200500150000ustar00rootroot00000000000000The following people have contributed to DataLad: Alejandro de la Vega Alex Waite Benjamin Poldrack Christian Olaf Häusler Debanjum Singh Solanky Gergana Alteva Horea Christian Jason Gors Kyle Meyer Michael Hanke Yaroslav Halchenko datalad-0.9.3/COPYING000066400000000000000000000041021325277200500141500ustar00rootroot00000000000000# Main Copyright/License DataLad, including all examples, code snippets and attached documentation is covered by the MIT license. The MIT License Copyright (c) 2013- Yaroslav Halchenko 2015- DataLad Team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. See CONTRIBUTORS file for a full list of contributors. 
# 3rd Party Code Some code distributed within DataLad was not developed by the DataLad team, hence you should adhere to the copyright and license terms of respective authors if you are to use corresponding parts. ## `datalad/resources/website` - [JQuery](https://code.jquery.com) - MIT License - Copyright (c) 2016- - [Datatables](http://datatables.net) - MIT License - Copyright (c) 2016- - [BlueImp-MD5](https://github.com/blueimp/JavaScript-MD5) - MIT License - Copyright (c) 2016- - [QUnit](https://qunitjs.com/) - MIT License - Copyright (c) - [Sinon-QUnit Plugin](http://sinonjs.org/qunit/) - BSD License - Copyright (c) 2010-2011 - [NumPy Testing](http://numpy.org) - BSD license - Copyright (c) 2005- NumPy Developers datalad-0.9.3/Gruntfile.js000066400000000000000000000004471325277200500154220ustar00rootroot00000000000000module.exports = function(grunt) { // Project configuration. grunt.initConfig({ qunit: { files: ['datalad/resources/website/tests/test.html'] } }); // Load plugin grunt.loadNpmTasks('grunt-contrib-qunit'); // Task to run tests grunt.registerTask('test', 'qunit'); }; datalad-0.9.3/Makefile000066400000000000000000000017601325277200500145640ustar00rootroot00000000000000# simple makefile to simplify repetetive build env management tasks under posix # Ideas borrowed from scikit-learn's and PyMVPA Makefiles -- thanks! PYTHON ?= python NOSETESTS ?= nosetests MODULE ?= datalad all: clean test clean: $(PYTHON) setup.py clean rm -rf dist build bin bin: mkdir -p $@ PYTHONPATH=bin:$(PYTHONPATH) python setup.py develop --install-dir $@ test-code: bin PATH=bin:$(PATH) PYTHONPATH=bin:$(PYTHONPATH) $(NOSETESTS) -s -v $(MODULE) test-coverage: rm -rf coverage .coverage $(NOSETESTS) -s -v --with-coverage $(MODULE) test: test-code trailing-spaces: find $(MODULE) -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; code-analysis: flake8 $(MODULE) | grep -v __init__ | grep -v external pylint -E -i y $(MODULE)/ # -d E1103,E0611,E1101 update-changelog: @echo ".. This file is auto-converted from CHANGELOG.md (make update-changelog) -- do not edit\n\nChange log\n**********" > docs/source/changelog.rst pandoc -t rst CHANGELOG.md >> docs/source/changelog.rst datalad-0.9.3/README.md000066400000000000000000000157561325277200500144150ustar00rootroot00000000000000 ____ _ _ _ | _ \ __ _ | |_ __ _ | | __ _ __| | | | | | / _` | | __| / _` | | | / _` | / _` | | |_| | | (_| | | |_ | (_| | | |___ | (_| | | (_| | |____/ \__,_| \__| \__,_| |_____| \__,_| \__,_| Read me [![Travis tests status](https://secure.travis-ci.org/datalad/datalad.png?branch=master)](https://travis-ci.org/datalad/datalad) [![codecov.io](https://codecov.io/github/datalad/datalad/coverage.svg?branch=master)](https://codecov.io/github/datalad/datalad?branch=master) [![Documentation](https://readthedocs.org/projects/datalad/badge/?version=latest)](http://datalad.rtfd.org) The full documentation is available at: http://docs.datalad.org # 10000ft overview DataLad makes data management and data distribution more accessible. To do that it stands on the shoulders of [Git] and [Git-annex] to deliver a decentralized system for data exchange. This includes automated ingestion of data from online portals, and exposing it in readily usable form as Git(-annex) repositories, so-called datasets. The actual data storage and permission management, however, remains with the original data providers. # Status DataLad is under rapid development. 
While the code base is still growing, the focus is increasingly shifting towards robust and safe operation with a sensible API. Organization and configuration are still subject of considerable reorganization and standardization. However, DataLad is, in fact, usable today and user feedback is always welcome. # Support [Neurostars](https://neurostars.org) is the preferred venue for DataLad support. Forum login is possible with your existing Google, Twitter, or GitHub account. Before posting a [new topic](https://neurostars.org/new-topic?tags=datalad), please check the [previous posts](https://neurostars.org/search?q=tags%3Adatalad) tagged with `#datalad`. To get help on a datalad-related issue, please consider to follow this [message template](https://neurostars.org/new-topic?body=-%20Please%20describe%20the%20problem.%0A-%20What%20steps%20will%20reproduce%20the%20problem%3F%0A-%20What%20version%20of%20DataLad%20are%20you%20using%20%28run%20%60datalad%20--version%60%29%3F%20On%20what%20operating%20system%20%28consider%20running%20%60datalad%20plugin%20wtf%60%29%3F%0A-%20Please%20provide%20any%20additional%20information%20below.%0A-%20Have%20you%20had%20any%20luck%20using%20DataLad%20before%3F%20%28Sometimes%20we%20get%20tired%20of%20reading%20bug%20reports%20all%20day%20and%20a%20lil'%20positive%20end%20note%20does%20wonders%29&tags=datalad). # DataLad 101 A growing number of datasets is made available from http://datasets.datalad.org . Those datasets are just regular git/git-annex repositories organized into a hierarchy using git submodules mechanism. So you can use regular git/git-annex commands to work with them, but might need `datalad` to be installed to provide additional functionality (e.g., fetching from portals requiring authentication such as CRCNS, HCP; or accessing data originally distributed in tarballs). But datalad aims to provide higher level interface on top of git/git-annex to simplify consumption and sharing of new or derived datasets. To that end, you can install **all** of those datasets using datalad install -r /// which will `git clone` all of those datasets under `datasets.datalad.org` sub-directory. This command will not fetch any large data files, but will merely recreate full hierarchy of all of those datasets locally, which also takes a good chunk of your filesystem meta-data storage. Instead of fetching all datasets at once you could either specify specific dataset to be installed, e.g. datalad install ///openfmri/ds000113 or install top level dataset by omitting `-r` option and then calling `datalad install` for specific sub-datasets you want to have installed, possibly with `-r` to install their sub-datasets as well, e.g. datalad install /// cd datasets.datalad.org datalad install -r openfmri/ds000001 indi/fcon1000 You can navigate datasets you have installed in your terminal or browser, while fetching necessary files or installing new sub-datasets using the `datalad get [FILE|DIR]` command. DataLad will take care about downloading, extracting, and possibly authenticating (would ask you for credentials) in a uniform fashion regardless of the original data location or distribution serialization (e.g., a tarball). Since it is using git and git-annex underneath, you can be assured that you are getting **exact** correct version of the data. Use-cases DataLad covers are not limited to "consumption" of data. DataLad aims also to help publishing original or derived data, thus facilitating more efficient data management when collaborating or simply sharing your data. 
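For example, a minimal sketch of such a publishing workflow (illustrative only: the
dataset, file, and sibling names below are made up, and a publication target would
first need to be configured, e.g. with `datalad create-sibling`) could look like

    datalad create mydataset
    cd mydataset
    datalad add myfile.dat
    datalad publish --to mysibling
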
You can find more documentation at http://docs.datalad.org . # Contributing See [CONTRIBUTING.md](CONTRIBUTING.md) if you are interested in internals or contributing to the project. # Installation ## Debian-based systems On Debian-based systems we recommend to enable [NeuroDebian] from which we provide recent releases of DataLad. datalad package recommends some relatively heavy packages (e.g. scrapy) which are useful only if you are interested in using `crawl` functionality. If you need just the base functionality of the datalad, install without recommended packages (e.g., `apt-get install --no-install-recommends datalad`) ## Other Linux'es, OSX (Windows yet TODO) via pip By default, installation via pip installs core functionality of datalad allowing for managing datasets etc. Additional installation schemes are available, so you could provide enhanced installation via `pip install datalad[SCHEME]` where `SCHEME` could be - `crawl` to also install `scrapy` which is used in some crawling constructs - `tests` to also install dependencies used by unit-tests battery of the datalad - `full` to install all dependencies. For installation through `pip` you would need some external dependencies not shipped from it (e.g. `git-annex`, etc.) for which please refer to the next section. ## Dependencies Our [setup.py] and accompanying packaging describe all necessary dependencies. On Debian-based systems we recommend to enable [NeuroDebian] since we use it to provide backports of recent fixed external modules we depend upon, and up-to-date [Git-annex] is necessary for proper operation of DataLad packaged (install `git-annex-standalone` from NeuroDebian repository). Additionally, if you would like to develop and run our tests battery see [CONTRIBUTING.md](CONTRIBUTING.md) regarding additional dependencies. Later we will provide bundled installations of DataLad across popular platforms. # License MIT/Expat # Disclaimer It is in a alpha stage -- **nothing** is set in stone yet -- but already usable in a limited scope. [Git]: https://git-scm.com [Git-annex]: http://git-annex.branchable.com [setup.py]: https://github.com/datalad/datalad/blob/master/setup.py [NeuroDebian]: http://neuro.debian.net datalad-0.9.3/asv.conf.json000066400000000000000000000136621325277200500155400ustar00rootroot00000000000000{ // The version of the config file format. Do not change, unless // you know what you are doing. "version": 1, // The name of the project being benchmarked "project": "DataLad", // The project's homepage "project_url": "http://datalad.org", // The URL or local path of the source code repository for the // project being benchmarked "repo": ".", // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). "branches": ["master", "0.5.x", "0.9.x"], // for git // "branches": ["default"], // for mercurial // The DVCS being used. If not set, it will be automatically // determined from "repo" by looking at the protocol in the URL // (if remote), or by looking for special directories, such as // ".git" (if local). // "dvcs": "git", // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. 
"environment_type": "virtualenv", // timeout in seconds for installing any dependencies in environment // defaults to 10 min //"install_timeout": 600, // the base URL to show a commit for the project. "show_commit_url": "http://github.com/datalad/datalad/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. // Some hosts might not have python3.5 yet, so 3.4 was added "pythons": ["2.7", "3.4"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty // list or empty string indicates to just test against the default // (latest) version. null indicates that the package is to not be // installed. If the package to be tested is only available from // PyPi, and the 'environment_type' is conda, then you can preface // the package name by 'pip+', and the package will be installed via // pip (with all the conda available packages installed first, // followed by the pip installed packages). // // "matrix": { // "numpy": ["1.6", "1.7"], // "six": ["", null], // test with and without six installed // "pip+emcee": [""], // emcee is only available for install with pip. // }, // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. // // An exclude entry excludes entries where all values match. The // values are regexps that should match the whole string. // // An include entry adds an environment. Only the packages listed // are installed. The 'python' key is required. The exclude rules // do not apply to includes. // // In addition to package names, the following keys are available: // // - python // Python version, as in the *pythons* variable above. // - environment_type // Environment type, as above. // - sys_platform // Platform, as in sys.platform. Possible values for the common // cases: 'linux2', 'win32', 'cygwin', 'darwin'. // // "exclude": [ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows // {"environment_type": "conda", "six": null}, // don't run without six on conda // ], // // "include": [ // // additional env for python2.7 // {"python": "2.7", "numpy": "1.8"}, // // additional env if run on windows+conda // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, // ], // The directory (relative to the current directory) that benchmarks are // stored in. If not provided, defaults to "benchmarks" "benchmark_dir": "benchmarks", // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" "env_dir": ".asv/env", // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". "results_dir": ".asv/results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". "html_dir": ".asv/html", // The number of characters to retain in the commit hashes. // "hash_length": 8, // `asv` will cache wheels of the recent builds in each // environment, making them faster to install next time. This is // number of builds to keep, per environment. // "wheel_cache_size": 0 // The commits after which the regression search in `asv publish` // should start looking for regressions. 
Dictionary whose keys are // regexps matching to benchmark names, and values corresponding to // the commit (exclusive) after which to start looking for // regressions. The default is to start from the first commit // with results. If the commit is `null`, regression detection is // skipped for the matching benchmark. // // "regressions_first_commits": { // "some_benchmark": "352cdf", // Consider regressions only after this commit // "another_benchmark": null, // Skip regression detection altogether // } // The thresholds for relative change in results, after which `asv // publish` starts reporting regressions. Dictionary of the same // form as in ``regressions_first_commits``, with values // indicating the thresholds. If multiple entries match, the // maximum is taken. If no entry matches, the default is 5%. // // "regressions_thresholds": { // "some_benchmark": 0.01, // Threshold of 1% // "another_benchmark": 0.5, // Threshold of 50% // } } datalad-0.9.3/benchmarks/000077500000000000000000000000001325277200500152355ustar00rootroot00000000000000datalad-0.9.3/benchmarks/__init__.py000066400000000000000000000000011325277200500173350ustar00rootroot00000000000000 datalad-0.9.3/benchmarks/api.py000066400000000000000000000076541325277200500163740ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Benchmarks for DataLad""" import os import sys import os.path as osp from os.path import join as opj import tarfile import timeit from subprocess import call from datalad.api import add from datalad.api import create from datalad.api import create_test_dataset from datalad.api import Dataset from datalad.api import install from datalad.api import ls from datalad.api import remove from datalad.api import uninstall from datalad.utils import rmtree from datalad.utils import getpwd # Some tracking example -- may be we should track # of datasets.datalad.org #import gc #def track_num_objects(): # return len(gc.get_objects()) #track_num_objects.unit = "objects" from .common import SuprocBenchmarks class testds(SuprocBenchmarks): """ Benchmarks to test on create_test_dataset how fast we could generate datasets """ def time_create_test_dataset1(self): create_test_dataset(spec='1', seed=0) def time_create_test_dataset2x2(self): create_test_dataset(spec='2/2', seed=0) class supers(SuprocBenchmarks): """ Benchmarks on common operations on collections of datasets using datalad API """ timeout = 3600 # need to assure that we are working in a different repository now # see https://github.com/datalad/datalad/issues/1512 # might not be sufficient due to side effects between tests and # thus getting into the same situation ds_count = 0 def setup_cache(self): # creating in CWD so things get removed when ASV is done ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0] # Will store into a tarfile since otherwise install -r is way too slow # to be invoked for every benchmark tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar') with tarfile.open(tarfile_path, "w") as tar: # F.CK -- Python tarfile can't later extract those because key dirs are # read-only. 
For now just a workaround - make it all writeable from datalad.utils import rotree rotree('testds1', ro=False, chmod_files=False) tar.add('testds1', recursive=True) rmtree('testds1') return tarfile_path def setup(self, tarfile_path): import tarfile tempdir = osp.dirname(tarfile_path) with tarfile.open(tarfile_path) as tar: tar.extractall(tempdir) # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed epath = opj(tempdir, 'testds1') epath_unique = epath + str(self.__class__.ds_count) os.rename(epath, epath_unique) self.__class__.ds_count += 1 self.ds = Dataset(epath_unique) print("Finished setup for %s" % tempdir) def teardown(self, tarfile_path): for path in [self.ds.path + '_', self.ds.path]: print("Cleaning up %s" % path) if osp.exists(path): rmtree(path) def time_installr(self, tarfile_path): # somewhat duplicating setup but lazy to do different one for now assert install(self.ds.path + '_', source=self.ds.path, recursive=True) def time_createadd(self, tarfile_path): assert self.ds.create('newsubds') def time_createadd_to_dataset(self, tarfile_path): subds = create(opj(self.ds.path, 'newsubds')) self.ds.add(subds.path) def time_ls(self, tarfile_path): ls(self.ds.path) # TODO: since doesn't really allow to uninstall top level ds... bleh ;) #def time_uninstall(self, tarfile_path): # uninstall(self.ds.path, recursive=True) def time_remove(self, tarfile_path): remove(self.ds.path, recursive=True) datalad-0.9.3/benchmarks/common.py000066400000000000000000000012701325277200500170770ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Helpers for benchmarks of DataLad""" import timeit class SuprocBenchmarks(object): # manually set a number since otherwise takes way too long! # see https://github.com/spacetelescope/asv/issues/497 #number = 3 # although seems to work ok with a timer which accounts for subprocesses # custom timer so we account for subprocess times timer = timeit.default_timer datalad-0.9.3/benchmarks/core.py000066400000000000000000000042501325277200500165400ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. 
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Benchmarks for DataLad""" import os import sys import os.path as osp from os.path import join as opj import tarfile import timeit from subprocess import call from datalad.api import add from datalad.api import create from datalad.api import create_test_dataset from datalad.api import Dataset from datalad.api import install from datalad.api import ls from datalad.api import remove from datalad.api import uninstall from datalad.utils import rmtree from datalad.utils import getpwd # Some tracking example -- may be we should track # of datasets.datalad.org #import gc #def track_num_objects(): # return len(gc.get_objects()) #track_num_objects.unit = "objects" from .common import SuprocBenchmarks class startup(SuprocBenchmarks): """ Benchmarks for datalad commands startup """ def setup(self): # we need to prepare/adjust PATH to point to installed datalad # We will base it on taking sys.executable python_path = osp.dirname(sys.executable) self.env = os.environ.copy() self.env['PATH'] = '%s:%s' % (python_path, self.env.get('PATH', '')) def time_help_np(self): call(["datalad", "--help-np"], env=self.env) def time_import(self): call([sys.executable, "-c", "import datalad"]) def time_import_api(self): call([sys.executable, "-c", "import datalad.api"]) class runner(SuprocBenchmarks): """Some rudimentary tests to see if there is no major slowdowns from Runner """ def setup(self): from datalad.cmd import Runner self.runner = Runner() # older versions might not have it try: from datalad.cmd import GitRunner self.git_runner = GitRunner() except ImportError: pass def time_echo(self): self.runner.run("echo") def time_echo_gitrunner(self): self.git_runner.run("echo") datalad-0.9.3/cfgs/000077500000000000000000000000001325277200500140425ustar00rootroot00000000000000datalad-0.9.3/cfgs/allen-human.cfg000066400000000000000000000014521325277200500167260ustar00rootroot00000000000000[DEFAULT] # Now organization specific settings organization = allen # Now project specific -- that is the level where things would be placed # according to the default scheme of things project = human description = "Allen institute human MRI/genetic data" [genetic] url = file:///home/yoh/proj/datalad/sample_htmls/microarray.html # First fetch archived ones and place/tag/commit accordingly include_href = well_known_file_download/177 # and only then enable fresh ones #include_href = well_known_file_download/178 # regardless of the extract filename_e = "%%(href_a)s.zip" %% locals() # how do we figure out either it changed or not [docs] url = file:///home/yoh/proj/datalad/sample_htmls/Documentation #url = http://help.brain-map.org/display/humanbrain/Documentation include_href = download/attachments datalad-0.9.3/cfgs/arjlover-fast.cfg000066400000000000000000000017231325277200500173050ustar00rootroot00000000000000[DEFAULT] # Now organization specific settings incoming = repos/arjlover-fast public = %(incoming)s description = "film.arjlover.net: Collection of Soviet-era cartoons/films/audiobooks for kids" # in general we are after videos include_href = \.(avi|mpeg|mpg|mp3|mp4|ogg)$ # TODO: --relaxed mode = fast [multiki] url = http://multiki.arjlover.net/multiki/ #url = file:///home/yoh/proj/datalad/sample_htmls/multiki.html #include_href = szopie.avi$ # "INTERESTING" one -- just plain hierarchy where we need to get inside [audiobooks] url = http://audio.arjlover.net/audio/ include_href=.* recurse=/$ [filmy] url = http://film.arjlover.net/film/ #include_href 
= 01\.99\.avi$ #url = file:///home/yoh/proj/datalad/sample_htmls/films_short.html ## [filmiki] url = http://filmiki.arjlover.net/filmiki/ #include_href = 01\.99\.avi$ [radioteatr] url = http://radioteatr.arjlover.net #url = file:///home/yoh/proj/datalad/sample_htmls/radio-theater_short.html #include_href = \.mp3$ datalad-0.9.3/cfgs/arjlover.cfg000066400000000000000000000001321325277200500163430ustar00rootroot00000000000000[INCLUDES] before = arjlover-fast.cfg [DEFAULT] incoming = repos/arjlover mode = relaxed datalad-0.9.3/cfgs/common.cfg000066400000000000000000000037371325277200500160250ustar00rootroot00000000000000# There should be a config file per repository # identifying all pieces to be downloaded and placed in annexes: # # incoming -- annex where we "download things" before either # linking or extracting into a public annex # public -- annex where we extract and possibly complement # with manual additions/downloads. But whatever gets # added automatically should be maintained in aggrement # with .meta_info.json stored in incoming # By default incoming == public, but then we would track that things # are not extracted?? TODO # TODO: should we allow hierarchies? i.e. page pointing to files under # different subdirectories and we would like to preserve that # structure... [DEFAULT] # modes of operation. # download # fast -- git annex addurl --fast, where no download will be # carried out, in case of encountering archives -- crash. # Intended for quickly creating git-annex repositories # for website directories such as arjlover. In fast mode # it would not even query for remote filenames mode = download # BIG defaults which might need to be shared among ALL # organizations and projects keep_orig = True # store "timestamp" information per each target file meta_info = True directory = %(__name__)s # default public = %(incoming)s incoming_top = repos/incoming public_top = repos/public # Generic arrangement incoming = %(incoming_top)s/%(organization)s/%(project)s public = %(public_top)s/%(organization)s/%(project)s # These are just common among sections exclude_href = # based on the link text include_href_a = exclude_href_a = # specify which files should be extracted. How to extract would be # decided internally # By defaulit would just extract all known (internally) archives types #archives_re = (\.(tar\.gz|tar\.bz))$ # It could also be used to add a suffix (e.g. 
%(filename)s.zip) corresponding # to the archive whenever url/filename doesn't carry any filename = %(filename)s datalad-0.9.3/cfgs/example-feat.cfg000066400000000000000000000007031325277200500170730ustar00rootroot00000000000000[DEFAULT] # Now organization specific settings incoming = repos/1r/classes public = %(incoming)s description = "TEST TWO" # in general we are after videos #[cope7] #url = http://www.onerussian.com/Sci/experiments/YHOB/feat-III-2nd-S1-19+22-30_z2.3_t0.05_gen_scan_sg.gfeat/cope7.feat/report.html url = http://www.onerussian.com/classes/cis780/ [cis780/docs] include_href = \.pdf$ [cis780/scripts] include_href = \.tgz$ #filename = %(filename)s__/ datalad-0.9.3/cfgs/nih-videos.cfg000066400000000000000000000021761325277200500165760ustar00rootroot00000000000000[DEFAULT] # Now organization specific settings incoming = repos/nih/videocast public = %(incoming)s description = "NIH VideoCasting and Podcasting archived videos" # in general we are after videos but can't afford to download them exec = from nihhelpers import get_video_filename mode = fast [recent] url = http://videocast.nih.gov/pastevents.asp include_href = .*File=\d*&download=1$ filename_e = get_video_filename(link, filename) # NOT IMPLEMENTED! recurse_a = "Next Page" # Initial run -- all the pages recurse = .*pastevents\.asp\?c=0&s=\d+ #recurse = .*pastevents\.asp\?c=0&s=11$ # for subsequent runs not really needed to check the whole history # and first few pages should be enough #recurse = .*pastevents\.asp\?c=0&s=[123]$ # TODO: interesting use-case -- may be those should be added as alternative urls #exclude_href_a = mirror # TODO... They should appear on http://videocast.nih.gov/PastEvents.asp # but seems to be not yet there # [videos] # heavy -- so not for the laptop # [materials] # include_href = \.(zip|tar\.gz) # # TODO: interesting use-case -- may be those should be added as alternative urls # exclude_href_a = mirror datalad-0.9.3/cfgs/nihhelpers.py000066400000000000000000000045511325277200500165620ustar00rootroot00000000000000#emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- #ex: set sts=4 ts=4 sw=4 noet: """ COPYRIGHT: Yaroslav Halchenko 2013 LICENSE: MIT Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ __author__ = 'Yaroslav Halchenko' __copyright__ = 'Copyright (c) 2013 Yaroslav Halchenko' __license__ = 'MIT' import re import os import bs4 import time def slugify(value): """Normalizes the string: removes non-alpha characters. 
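
    For illustration (a sketch added for clarity, not an original doctest):
    punctuation is dropped, while word characters, whitespace and hyphens are
    kept::

        slugify(u'Hello, World!')   # -> u'Hello World'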
""" import unicodedata value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore') value = unicode(re.sub('[^\w\s-]', '', value).strip()) return value def get_video_filename(link, filename): video_entry = list(link.parents)[3] #
assert(isinstance(video_entry, bs4.element.Tag)) bold_entries = video_entry.find_all('b') assert(len(bold_entries) == 2) # that is what we know atm title = bold_entries[0].text date_ = bold_entries[1].text # Parse/convert the date date = time.strptime(date_, '%A, %B %d, %Y') date_str = time.strftime('%Y/%m-%d', date) # try to find extension in the filename ext = os.path.splitext(filename)[1] if not ext or len(ext) > 5: # For now just hope that it is a video extension... # TODO -- check with mime ... etc? ext = '.avi' return "%s - %s%s" % (date_str, slugify(title), ext) datalad-0.9.3/cfgs/openfmri-ds000001.cfg000066400000000000000000000022331325277200500174070ustar00rootroot00000000000000[INCLUDES] before = openfmri.cfg [DEFAULT] dataset = ds000001 #url = https://openfmri.org/dataset/ds000001 #url = file:///home/yoh/proj/datalad/sample_htmls/openfmri/ds00001/ds000001 experiment = Balloon Analog Risk-taking Task [data] include_href = .*_raw_.*\.tgz filename = raw/ check_url_limit = 1 # TODO? and how? #checksums_url = https://openfmri.org/system/files/ds001_raw_checksums.txt checksums_url = .*_raw_checksums\.txt #changelog_url = https://openfmri.org/system/files/release_history_6.txt license_url = http://opendatacommons.org/licenses/pddl/1.0/ #TODO: check where they 'extract' now into? may be we are doomed now #to introduce tracking within the archives if they extract under the #same directory [data/processed] include_href = .*_models.tgz # TODO -- we would need to strip the .tgz! filename = "models/" check_url_limit = 1 [addurl:aws-http] # url to check versions etc: http://openfmri.aws.amazon.com.s3.amazonaws.com/?versions # url to download -- the same? backend = s3 s3_prefix = ds001/ s3_versioning = True # either all files found on the remote should be present locally absent_locally = .*\.bak # or the other way around absent_remotely = .* datalad-0.9.3/cfgs/openfmri.cfg000066400000000000000000000006711325277200500163460ustar00rootroot00000000000000[DEFAULT] organization = openfmri project = %(dataset)s incoming = repos/incoming/%(organization)s/%(project)s public = repos/public/%(organization)s/%(project)s url = https://openfmri.org/dataset/%(dataset)s description = "OpenFMRI: %(experiment)s" archives_re = (\.(tar\.gz|tar\.bz|tgz|zip))$ [changelogs] directory = include_href = .*release_history_.*\.txt filename = "changelog.txt" check_url_limit = 1 # do not annex anything annex = datalad-0.9.3/datalad/000077500000000000000000000000001325277200500145125ustar00rootroot00000000000000datalad-0.9.3/datalad/__init__.py000066400000000000000000000156461325277200500166370ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """DataLad aims to expose (scientific) data available online as a unified data distribution with the convenience of git-annex repositories as a backend.""" # For reproducible demos/tests import os _seed = os.environ.get('DATALAD_SEED', None) if _seed: import random random.seed(_seed) # Other imports are interspersed with lgr.debug to ease troubleshooting startup # delays etc. 
# If there is a bundled git, make sure GitPython uses it too: from datalad.cmd import GitRunner GitRunner._check_git_path() if GitRunner._GIT_PATH: import os os.environ['GIT_PYTHON_GIT_EXECUTABLE'] = \ os.path.join(GitRunner._GIT_PATH, 'git') from .config import ConfigManager cfg = ConfigManager() from .log import lgr import atexit from datalad.utils import on_windows if not on_windows: lgr.log(5, "Instantiating ssh manager") from .support.sshconnector import SSHManager ssh_manager = SSHManager() atexit.register(ssh_manager.close, allow_fail=False) else: ssh_manager = None try: # this will fix the rendering of ANSI escape sequences # for colored terminal output on windows # it will do nothing on any other platform, hence it # is safe to call unconditionally import colorama colorama.init() atexit.register(colorama.deinit) except ImportError as e: if on_windows: from datalad.dochelpers import exc_str lgr.warning( "'colorama' Python module missing, terminal output may look garbled [%s]", exc_str(e)) pass atexit.register(lgr.log, 5, "Exiting") from .version import __version__ def test(module='datalad', verbose=False, nocapture=False, pdb=False, stop=False): """A helper to run datalad's tests. Requires nose """ argv = [] #module] # could make it 'smarter' but decided to be explicit so later we could # easily migrate to another runner without changing any API here if verbose: argv.append('-v') if nocapture: argv.append('-s') if pdb: argv.append('--pdb') if stop: argv.append('--stop') from datalad.support.third.nosetester import NoseTester tester = NoseTester(module) tester.package_name = module.split('.', 1)[0] tester.test(extra_argv=argv) test.__test__ = False # Following fixtures are necessary at the top level __init__ for fixtures which # would cover all **/tests and not just datalad/tests/ # To store settings which setup_package changes and teardown_package should return _test_states = { 'loglevel': None, 'DATALAD_LOG_LEVEL': None, 'HOME': None, } def setup_package(): import os from datalad import consts _test_states['HOME'] = os.environ.get('HOME', None) _test_states['DATASETS_TOPURL_ENV'] = os.environ.get('DATALAD_DATASETS_TOPURL', None) _test_states['DATASETS_TOPURL'] = consts.DATASETS_TOPURL os.environ['DATALAD_DATASETS_TOPURL'] = consts.DATASETS_TOPURL = 'http://datasets-tests.datalad.org/' # To overcome pybuild overriding HOME but us possibly wanting our # own HOME where we pre-setup git for testing (name, email) if 'GIT_HOME' in os.environ: os.environ['HOME'] = os.environ['GIT_HOME'] else: # we setup our own new HOME, the BEST and HUGE one from datalad.utils import make_tempfile from datalad.tests import _TEMP_PATHS_GENERATED # TODO: split into a function + context manager with make_tempfile(mkdir=True) as new_home: os.environ['HOME'] = new_home if not os.path.exists(new_home): os.makedirs(new_home) with open(os.path.join(new_home, '.gitconfig'), 'w') as f: f.write("""\ [user] name = DataLad Tester email = test@example.com """) _TEMP_PATHS_GENERATED.append(new_home) # For now we will just verify that it is ready to run the tests from datalad.support.gitrepo import check_git_configured check_git_configured() # To overcome pybuild by default defining http{,s}_proxy we would need # to define them to e.g. empty value so it wouldn't bother touching them. 
# But then haskell libraries do not digest empty value nicely, so we just # pop them out from the environment for ev in ('http_proxy', 'https_proxy'): if ev in os.environ and not (os.environ[ev]): lgr.debug("Removing %s from the environment since it is empty", ev) os.environ.pop(ev) DATALAD_LOG_LEVEL = os.environ.get('DATALAD_LOG_LEVEL', None) if DATALAD_LOG_LEVEL is None: # very very silent. Tests introspecting logs should use # swallow_logs(new_level=...) _test_states['loglevel'] = lgr.getEffectiveLevel() lgr.setLevel(100) # And we should also set it within environ so underlying commands also stay silent _test_states['DATALAD_LOG_LEVEL'] = DATALAD_LOG_LEVEL os.environ['DATALAD_LOG_LEVEL'] = '100' else: # We are not overriding them, since explicitly were asked to have some log level _test_states['loglevel'] = None # Set to non-interactive UI from datalad.ui import ui _test_states['ui_backend'] = ui.backend # obtain() since that one consults for the default value ui.set_backend(cfg.obtain('datalad.tests.ui.backend')) def teardown_package(): import os if os.environ.get('DATALAD_TESTS_NOTEARDOWN'): return from datalad.ui import ui from datalad import consts ui.set_backend(_test_states['ui_backend']) if _test_states['loglevel'] is not None: lgr.setLevel(_test_states['loglevel']) if _test_states['DATALAD_LOG_LEVEL'] is None: os.environ.pop('DATALAD_LOG_LEVEL') else: os.environ['DATALAD_LOG_LEVEL'] = _test_states['DATALAD_LOG_LEVEL'] from datalad.tests import _TEMP_PATHS_GENERATED from datalad.tests.utils import rmtemp if len(_TEMP_PATHS_GENERATED): msg = "Removing %d dirs/files: %s" % (len(_TEMP_PATHS_GENERATED), ', '.join(_TEMP_PATHS_GENERATED)) else: msg = "Nothing to remove" lgr.debug("Teardown tests. " + msg) for path in _TEMP_PATHS_GENERATED: rmtemp(path, ignore_errors=True) if _test_states['HOME'] is not None: os.environ['HOME'] = _test_states['HOME'] if _test_states['DATASETS_TOPURL_ENV']: os.environ['DATALAD_DATASETS_TOPURL'] = _test_states['DATASETS_TOPURL_ENV'] consts.DATASETS_TOPURL = _test_states['DATASETS_TOPURL'] lgr.debug("Printing versioning information collected so far") from datalad.support.external_versions import external_versions as ev print(ev.dumps(query=True)) lgr.log(5, "Done importing main __init__") datalad-0.9.3/datalad/__main__.py000066400000000000000000000060741325277200500166130ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Helper to use datalad as a "runnable" module with -m datalad""" import sys from . import __version__ from .auto import AutomagicIO from .log import lgr def usage(outfile, executable=sys.argv[0]): if '__main__.py' in executable: # That was -m datalad way to launch executable = "%s -m datalad" % sys.executable outfile.write("""Usage: %s [OPTIONS] [ARGS] Purpose: To provide FUSE-like operation whenever necessary files (as accessed by open, h5py.File) are requested, they get fetched. Meta-options: --help Display this help then exit. --version Output version information then exit. 
""" % executable) def runctx(cmd, globals=None, locals=None): if globals is None: globals = {} if locals is None: locals = {} try: exec(cmd, globals, locals) finally: # good opportunity to avoid atexit I guess. pass for now pass def main(argv=None): import os import getopt if argv is None: argv = sys.argv try: opts, prog_argv = getopt.getopt(argv[1:], "", ["help", "version"]) # TODO: support options for whatever we would support ;) # probably needs to hook in somehow into commands/options available # under cmdline/ except getopt.error as msg: sys.stderr.write("%s: %s\n" % (sys.argv[0], msg)) sys.stderr.write("Try `%s --help' for more information\n" % sys.argv[0]) sys.exit(1) # and now we need to execute target script "manually" # Borrowing up on from trace.py for opt, val in opts: if opt == "--help": usage(sys.stdout, executable=argv[0]) sys.exit(0) if opt == "--version": sys.stdout.write("datalad %s\n" % __version__) sys.exit(0) sys.argv = prog_argv progname = prog_argv[0] sys.path[0] = os.path.split(progname)[0] try: with open(progname) as fp: code = compile(fp.read(), progname, 'exec') # try to emulate __main__ namespace as much as possible globs = { '__file__': progname, '__name__': '__main__', '__package__': None, '__cached__': None, } # Since used explicitly -- activate the beast aio = AutomagicIO(activate=True) lgr.info("Running code of %s", progname) runctx(code, globs, globs) # TODO: see if we could hide our presence from the final tracebacks if execution fails except IOError as err: lgr.error("Cannot run file %r because: %s" % (sys.argv[0], err)) sys.exit(1) except SystemExit: pass if __name__ == '__main__': main() datalad-0.9.3/datalad/api.py000066400000000000000000000057751325277200500156530ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Python DataLad API exposing user-oriented commands (also available via CLI)""" # Should have no spurious imports/definitions at the module level from .distribution.dataset import Dataset def _generate_func_api(): """Auto detect all available interfaces and generate a function-based API from them """ from importlib import import_module from inspect import isgenerator from collections import namedtuple from functools import wraps from .interface.base import get_interface_groups from .interface.base import get_api_name from .interface.base import get_allargs_as_kwargs def _kwargs_to_namespace(call, args, kwargs): """ Given a __call__, args and kwargs passed, prepare a cmdlineargs-like thing """ kwargs_ = get_allargs_as_kwargs(call, args, kwargs) # Get all arguments removing those possible ones used internally and # which shouldn't be exposed outside anyways [kwargs_.pop(k) for k in kwargs_ if k.startswith('_')] namespace = namedtuple("smth", kwargs_.keys())(**kwargs_) return namespace def call_gen(call, renderer): """Helper to generate a call_ for call, to use provided renderer""" @wraps(call) def call_(*args, **kwargs): ret1 = ret = call(*args, **kwargs) if isgenerator(ret): # At first I thought we might just rerun it for output # at the end, but that wouldn't work if command actually # has a side-effect, i.e. 
actually doing something # so we actually need to memoize all generated output and output # it instead from datalad.utils import saved_generator ret, ret1 = saved_generator(ret) renderer(ret, _kwargs_to_namespace(call, args, kwargs)) return ret1 # TODO: see if we could proxy the "signature" of function # call from the original one call_.__doc__ += \ "\nNote\n----\n\n" \ "This version of a function uses cmdline results renderer before " \ "returning the result" return call_ for grp_name, grp_descr, interfaces in get_interface_groups(): for intfspec in interfaces: # turn the interface spec into an instance mod = import_module(intfspec[0], package='datalad') intf = getattr(mod, intfspec[1]) api_name = get_api_name(intfspec) globals()[api_name] = intf.__call__ # Invoke above helper _generate_func_api() # Be nice and clean up the namespace properly del _generate_func_api datalad-0.9.3/datalad/auto.py000066400000000000000000000245221325277200500160410ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Proxy basic file operations (e.g. open) to auto-obtain files upon I/O """ import sys # OPT delay import for expensive mock until used #from mock import patch from six import PY2 import six.moves.builtins as __builtin__ builtins_name = '__builtin__' if PY2 else 'builtins' import logging import io import os from os.path import dirname, lexists, realpath from os.path import exists from os.path import isabs from os.path import join as opj from git.exc import InvalidGitRepositoryError from .utils import getpwd from .dochelpers import exc_str from .support.annexrepo import AnnexRepo from .cmdline.helpers import get_repo_instance lgr = logging.getLogger("datalad.auto") h5py = None try: import h5py except ImportError: pass except Exception as exc: # could happen due to misbehaving handlers provided by git module # see https://github.com/gitpython-developers/GitPython/issues/600 # we could overload the handler by providing a blank one, but I do not # think it is worthwhile at this point. 
So let's just issue a warning lgr.warning( "Failed to import h5py, so no automagic handling for it atm: %s", exc_str(exc) ) lzma = None try: import lzma except ImportError: pass except Exception as exc: lgr.warning( "Failed to import lzma, so no automagic handling for it atm: %s", exc_str(exc) ) # TODO: RF to reduce code duplication among cases, also RF tests for the same reason class _EarlyExit(Exception): """Helper to early escape try/except logic in wrappde open""" pass class AutomagicIO(object): """Class to proxy commonly used API for accessing files so they get automatically fetched Currently supports builtin open() and h5py.File when those are read """ def __init__(self, autoget=True, activate=False): self._active = False self._builtin_open = __builtin__.open self._io_open = io.open self._builtin_exists = os.path.exists self._builtin_isfile = os.path.isfile if h5py: self._h5py_File = h5py.File else: self._h5py_File = None if lzma: self._lzma_LZMAFile = lzma.LZMAFile else: self._lzma_LZMAFile = None self._autoget = autoget self._in_open = False self._log_online = True from mock import patch self._patch = patch if activate: self.activate() def __enter__(self): self.activate() return self def __exit__(self, exc_type, exc_value, traceback): self.deactivate() @property def autoget(self): return self._autoget @property def active(self): return self._active def _proxy_open_name_mode(self, origname, origfunc, *args, **kwargs): """Proxy for various "open" which have first argument name and 2nd - mode """ # wrap it all for resilience to errors -- proxying must do no harm! try: if self._in_open: raise _EarlyExit self._in_open = True # just in case someone kept alias/assignment # return stock open for the duration of handling so that # logging etc could workout correctly with self._patch(origname, origfunc): lgr.log(2, "Proxying open with %r %r", args, kwargs) # had to go with *args since in PY2 it is name, in PY3 file # deduce arguments if len(args) > 0: # name/file was provided file = args[0] else: filearg = "name" if PY2 else "file" if filearg not in kwargs: # so the name was missing etc, just proxy into original open call and let it puke lgr.debug("No name/file was given, avoiding proxying") raise _EarlyExit file = kwargs.get(filearg) if isinstance(file, int): lgr.debug( "Skipping operation on %i, already a file descriptor", file) raise _EarlyExit mode = 'r' if len(args) > 1: mode = args[1] elif 'mode' in kwargs: mode = kwargs['mode'] if 'r' in mode: self._dataset_auto_get(file) else: lgr.debug("Skipping operation on %s since mode=%r", file, mode) except _EarlyExit: pass except Exception as e: # If anything goes wrong -- we should complain and proceed with self._patch(origname, origfunc): lgr.warning("Failed proxying open with %r, %r: %s", args, kwargs, exc_str(e)) finally: self._in_open = False # finally give it back to stock open return origfunc(*args, **kwargs) def _proxy_open(self, *args, **kwargs): return self._proxy_open_name_mode(builtins_name + '.open', self._builtin_open, *args, **kwargs) def _proxy_io_open(self, *args, **kwargs): return self._proxy_open_name_mode('io.open', self._io_open, *args, **kwargs) def _proxy_h5py_File(self, *args, **kwargs): return self._proxy_open_name_mode('h5py.File', self._h5py_File, *args, **kwargs) def _proxy_lzma_LZMAFile(self, *args, **kwargs): return self._proxy_open_name_mode('lzma.LZMAFile', self._lzma_LZMAFile, *args, **kwargs) def _proxy_exists(self, path): # TODO: decide either it should may be retrieved right away. 
# For now, as long as it is a symlink pointing to under .git/annex if exists(path): return True return lexists(path) and 'annex/objects' in str(realpath(path)) def _proxy_isfile(self, path): return self._proxy_open_name_mode( 'os.path.isfile', self._builtin_isfile, path ) def _dataset_auto_get(self, filepath): """Verify that filepath is under annex, and if so and not present - get it""" if not self._autoget: return # if filepath is not there at all (program just "checked" if it could access it if not lexists(filepath): lgr.log(2, "Not testing/getting file %s since it is not there", filepath) return # deduce directory for filepath filedir = dirname(filepath) try: # TODO: verify logic for create -- we shouldn't 'annexify' non-annexified # see https://github.com/datalad/datalad/issues/204 annex = get_repo_instance(filedir) except (RuntimeError, InvalidGitRepositoryError) as e: # must be not under annex etc return if not isinstance(annex, AnnexRepo): # not an annex -- can do nothing return # since Git/AnnexRepo functionality treats relative paths relative to the # top of the repository and might be outside, get a full path if not isabs(filepath): filepath = opj(getpwd(), filepath) # "quick" check first if under annex at all try: # might fail. TODO: troubleshoot when it does e.g. # datalad/tests/test_auto.py:test_proxying_open_testrepobased under_annex = annex.is_under_annex(filepath, batch=True) except: # MIH: really? what if MemoryError under_annex = None # either it has content if (under_annex or under_annex is None) and not annex.file_has_content(filepath): lgr.info("AutomagicIO: retrieving file content of %s", filepath) annex.get(filepath) def activate(self): # we should stay below info for this message. With PR #1630 we # start to use this functionality internally, and this will show # up frequently even in cases where it does nothing at all lgr.debug("Activating DataLad's AutoMagicIO") # Some beasts (e.g. tornado used by IPython) override outputs, and # provide fileno which throws exception. In such cases we should not log online self._log_online = hasattr(sys.stdout, 'fileno') and hasattr(sys.stderr, 'fileno') try: if self._log_online: sys.stdout.fileno() sys.stderr.fileno() except: # MIH: IOError? self._log_online = False if self.active: # this is not a warning, because there is nothing going # wrong or being undesired. Nested invokation could happen # caused by independent pieces of code, e.g. user code # that invokes our own metadata handling. lgr.debug("%s already active. No action taken" % self) return # overloads __builtin__.open = self._proxy_open io.open = self._proxy_io_open os.path.exists = self._proxy_exists os.path.isfile = self._proxy_isfile if h5py: h5py.File = self._proxy_h5py_File if lzma: lzma.LZMAFile = self._proxy_lzma_LZMAFile self._active = True def deactivate(self): # just debug level -- see activate() lgr.debug("Deactivating DataLad's AutoMagicIO") if not self.active: lgr.warning("%s is not active, can't deactivate" % self) return __builtin__.open = self._builtin_open io.open = self._io_open if h5py: h5py.File = self._h5py_File if lzma: lzma.LZMAFile = self._lzma_LZMAFile os.path.exists = self._builtin_exists os.path.isfile = self._builtin_isfile self._active = False def __del__(self): try: if self._active: self.deactivate() except: # MIH: IOError? 
pass try: super(self.__class__, self).__del__() except: pass datalad-0.9.3/datalad/cmd.py000066400000000000000000000637501325277200500156420ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ Wrapper for command and function calls, allowing for dry runs and output handling """ import subprocess import sys import logging import os import shutil import shlex import atexit import functools import tempfile from collections import OrderedDict from six import PY3, PY2 from six import string_types, binary_type, text_type from os.path import abspath, isabs, pathsep, exists from .consts import GIT_SSH_COMMAND from .dochelpers import exc_str from .support.exceptions import CommandError from .support.protocol import NullProtocol, DryRunProtocol, \ ExecutionTimeProtocol, ExecutionTimeExternalsProtocol from .utils import on_windows, get_tempfile_kwargs, assure_unicode from .dochelpers import borrowdoc lgr = logging.getLogger('datalad.cmd') # In python3 to split byte stream on newline, it must be bytes linesep_bytes = os.linesep.encode() _TEMP_std = sys.stdout, sys.stderr # To be used in the temp file name to distinguish the ones we create # in Runner so we take care about their removal, in contrast to those # which might be created outside and passed into Runner _MAGICAL_OUTPUT_MARKER = "_runneroutput_" if PY2: # TODO apparently there is a recommended substitution for Python2 # which is a backported implementation of python3 subprocess # https://pypi.python.org/pypi/subprocess32/ file_class = file else: from io import IOBase as file_class def _decide_to_log(v): """Hacky workaround for now so we could specify per each which to log online and which to the log""" if isinstance(v, bool) or callable(v): return v elif v in {'online'}: return True elif v in {'offline'}: return False else: raise ValueError("can be bool, callable, 'online' or 'offline'") def _get_output_stream(log_std, false_value): """Helper to prepare output stream for Popen and use file for 'offline' Necessary to avoid lockdowns when both stdout and stderr are pipes """ if log_std: if log_std == 'offline': # we will open a temporary file tf = tempfile.mktemp( **get_tempfile_kwargs({}, prefix=_MAGICAL_OUTPUT_MARKER) ) return open(tf, 'w') # XXX PY3 should be 'b' may be? else: return subprocess.PIPE else: return false_value def _cleanup_output(stream, std): if isinstance(stream, file_class) and _MAGICAL_OUTPUT_MARKER in stream.name: if not stream.closed: stream.close() if exists(stream.name): os.unlink(stream.name) elif stream == subprocess.PIPE: std.close() class Runner(object): """Provides a wrapper for calling functions and commands. An object of this class provides a methods that calls shell commands or python functions, allowing for protocolling the calls and output handling. Outputs (stdout and stderr) can be either logged or streamed to system's stdout/stderr during execution. This can be enabled or disabled for both of them independently. Additionally, a protocol object can be a used with the Runner. Such a protocol has to implement datalad.support.protocol.ProtocolInterface, is able to record calls and allows for dry runs. 
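
    A minimal usage sketch (purely illustrative; it assumes a system where the
    ``echo`` command is available)::

        runner = Runner()
        # run a command given as a list (no shell involved) and collect output
        stdout, stderr = runner.run(['echo', 'hello'])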
""" __slots__ = ['commands', 'dry', 'cwd', 'env', 'protocol', '_log_opts'] def __init__(self, cwd=None, env=None, protocol=None, log_outputs=None): """ Parameters ---------- cwd: string, optional Base current working directory for commands. Could be overridden per run call via cwd option env: dict, optional Custom environment to use for calls. Could be overridden per run call via env option protocol: ProtocolInterface Protocol object to write to. log_outputs : bool, optional Switch to instruct either outputs should be logged or not. If not set (default), config 'datalad.log.outputs' would be consulted """ self.cwd = cwd self.env = env if protocol is None: # TODO: config cmd.protocol = null protocol_str = os.environ.get('DATALAD_CMD_PROTOCOL', 'null') protocol = { 'externals-time': ExecutionTimeExternalsProtocol, 'time': ExecutionTimeProtocol, 'null': NullProtocol }[protocol_str]() if protocol_str != 'null': # we need to dump it into a file at the end # TODO: config cmd.protocol_prefix = protocol filename = '%s-%s.log' % ( os.environ.get('DATALAD_CMD_PROTOCOL_PREFIX', 'protocol'), id(self) ) atexit.register(functools.partial(protocol.write_to_file, filename)) self.protocol = protocol # Various options for logging self._log_opts = {} # we don't know yet either we need ot log every output or not if log_outputs is not None: self._log_opts['outputs'] = log_outputs def __call__(self, cmd, *args, **kwargs): """Convenience method This will call run() or call() depending on the kind of `cmd`. If `cmd` is a string it is interpreted as the to be executed command. Otherwise it is expected to be a callable. Any other argument is passed to the respective method. Parameters ---------- cmd: str or callable command string to be executed via shell or callable to be called. `*args`: `**kwargs`: see Runner.run() and Runner.call() for available arguments. Raises ------ TypeError if cmd is neither a string nor a callable. """ if isinstance(cmd, string_types) or isinstance(cmd, list): return self.run(cmd, *args, **kwargs) elif callable(cmd): return self.call(cmd, *args, **kwargs) else: raise TypeError("Argument 'command' is neither a string, " "nor a list nor a callable.") def _opt_env_adapter(v): """If value is a string, split by ,""" if v: if v.isdigit(): log_env = bool(int(v)) else: log_env = v.split(',') return log_env else: return False _LOG_OPTS_ADAPTERS = OrderedDict([ ('outputs', None), ('cwd', None), ('env', _opt_env_adapter), ('stdin', None), ]) def _get_log_setting(self, opt, default=False): try: return self._log_opts[opt] except KeyError: try: from . 
import cfg except ImportError: return default adapter = self._LOG_OPTS_ADAPTERS.get(opt, None) self._log_opts[opt] = \ (cfg.getbool if not adapter else cfg.get_value)( 'datalad.log.cmd', opt, default=default) if adapter: self._log_opts[opt] = adapter(self._log_opts[opt]) return self._log_opts[opt] @property def log_outputs(self): return self._get_log_setting('outputs') @property def log_cwd(self): return self._get_log_setting('cwd') @property def log_stdin(self): return self._get_log_setting('stdin') @property def log_env(self): return self._get_log_setting('env') # Two helpers to encapsulate formatting/output def _log_out(self, line): if line and self.log_outputs: self.log("stdout| " + line.rstrip('\n')) def _log_err(self, line, expected=False): if line and self.log_outputs: self.log("stderr| " + line.rstrip('\n'), level={True: 9, False: 11}[expected]) def _get_output_online(self, proc, log_stdout, log_stderr, outputstream, errstream, expect_stderr=False, expect_fail=False): """ If log_stdout or log_stderr are callables, they will be given a read line to be processed, and return processed result. So if they need to 'swallow' the line from being logged, should just return None Parameters ---------- proc log_stdout: bool or callable or 'online' or 'offline' log_stderr: : bool or callable or 'online' or 'offline' If any of those 'offline', we would call proc.communicate at the end to grab possibly outstanding output from it expect_stderr expect_fail Returns ------- """ stdout, stderr = binary_type(), binary_type() log_stdout_ = _decide_to_log(log_stdout) log_stderr_ = _decide_to_log(log_stderr) log_stdout_is_callable = callable(log_stdout_) log_stderr_is_callable = callable(log_stderr_) # arguments to be passed into _process_one_line stdout_args = ( 'stdout', proc, log_stdout_, log_stdout_is_callable ) stderr_args = ( 'stderr', proc, log_stderr_, log_stderr_is_callable, expect_stderr or expect_fail ) while proc.poll() is None: # see for a possibly useful approach to processing output # in another thread http://codereview.stackexchange.com/a/17959 # current problem is that if there is no output on stderr # it stalls if log_stdout_: stdout += self._process_one_line(*stdout_args) if log_stderr_: stderr += self._process_one_line(*stderr_args) # Handle possible remaining output stdout_, stderr_ = proc.communicate() # ??? should we condition it on log_stdout in {'offline'} ??? 
stdout += self._process_remaining_output(outputstream, stdout_, *stdout_args) stderr += self._process_remaining_output(errstream, stderr_, *stderr_args) return stdout, stderr def _process_remaining_output(self, stream, out_, *pargs): """Helper to process output which might have been obtained from popen or should be loaded from file""" out = binary_type() if isinstance(stream, file_class) and _MAGICAL_OUTPUT_MARKER in stream.name: assert out_ is None, "should have gone into a file" if not stream.closed: stream.close() with open(stream.name, 'rb') as f: for line in f: out += self._process_one_line(*pargs, line=line) else: if out_: # resolving a once in a while failing test #2185 if isinstance(out_, text_type): out_ = out_.encode('utf-8') for line in out_.split(linesep_bytes): out += self._process_one_line(*pargs, line=line) return out def _process_one_line(self, out_type, proc, log_, log_is_callable, expected=False, line=None): if line is None: lgr.log(3, "Reading line from %s", out_type) line = {'stdout': proc.stdout, 'stderr': proc.stderr}[out_type].readline() else: lgr.log(3, "Processing provided line") if line and log_is_callable: # Let it be processed line = log_(assure_unicode(line)) if line is not None: # we are working with binary type here line = line.encode() if line: if out_type == 'stdout': self._log_out(assure_unicode(line)) elif out_type == 'stderr': self._log_err(line.decode() if PY3 else line, expected) else: # pragma: no cover raise RuntimeError("must not get here") return line # it was output already directly but for code to work, return "" return binary_type() def run(self, cmd, log_stdout=True, log_stderr=True, log_online=False, expect_stderr=False, expect_fail=False, cwd=None, env=None, shell=None, stdin=None): """Runs the command `cmd` using shell. In case of dry-mode `cmd` is just added to `commands` and it is actually executed otherwise. Allows for separately logging stdout and stderr or streaming it to system's stdout or stderr respectively. Note: Using a string as `cmd` and shell=True allows for piping, multiple commands, etc., but that implies shlex.split() is not used. This is considered to be a security hazard. So be careful with input. Parameters ---------- cmd : str, list String (or list) defining the command call. No shell is used if cmd is specified as a list log_stdout: bool, optional If True, stdout is logged. Goes to sys.stdout otherwise. log_stderr: bool, optional If True, stderr is logged. Goes to sys.stderr otherwise. log_online: bool, optional Either to log as output comes in. Setting to True is preferable for running user-invoked actions to provide timely output expect_stderr: bool, optional Normally, having stderr output is a signal of a problem and thus it gets logged at level 11. But some utilities, e.g. wget, use stderr for their progress output. Whenever such output is expected, set it to True and output will be logged at level 9 unless exit status is non-0 (in non-online mode only, in online -- would log at 9) expect_fail: bool, optional Normally, if command exits with non-0 status, it is considered an error and logged at level 11 (above DEBUG). But if the call intended for checking routine, such messages are usually not needed, thus it will be logged at level 9. cwd : string, optional Directory under which run the command (passed to Popen) env : string, optional Custom environment to pass shell: bool, optional Run command in a shell. 
If not specified, then it runs in a shell only if command is specified as a string (not a list) stdin: file descriptor input stream to connect to stdin of the process. Returns ------- (stdout, stderr) Raises ------ CommandError if command's exitcode wasn't 0 or None. exitcode is passed to CommandError's `code`-field. Command's stdout and stderr are stored in CommandError's `stdout` and `stderr` fields respectively. """ outputstream = _get_output_stream(log_stdout, sys.stdout) errstream = _get_output_stream(log_stderr, sys.stderr) popen_env = env or self.env # TODO: if outputstream is sys.stdout and that one is set to StringIO # we have to "shim" it with something providing fileno(). # This happens when we do not swallow outputs, while allowing nosetest's # StringIO to be provided as stdout, crashing the Popen requiring # fileno(). In our swallow_outputs, we just use temporary files # to overcome this problem. # For now necessary test code should be wrapped into swallow_outputs cm # to avoid the problem log_msgs = ["Running: %s"] log_args = [cmd] if self.log_cwd: log_msgs += ['cwd=%r'] log_args += [cwd or self.cwd] if self.log_stdin: log_msgs += ['stdin=%r'] log_args += [stdin] log_env = self.log_env if log_env and popen_env: log_msgs += ["env=%r"] log_args.append( popen_env if log_env is True else {k: popen_env[k] for k in log_env if k in popen_env} ) log_msg = '\n'.join(log_msgs) self.log(log_msg, *log_args) if self.protocol.do_execute_ext_commands: if shell is None: shell = isinstance(cmd, string_types) if self.protocol.records_ext_commands: prot_exc = None prot_id = self.protocol.start_section( shlex.split(cmd, posix=not on_windows) if isinstance(cmd, string_types) else cmd) try: proc = subprocess.Popen(cmd, stdout=outputstream, stderr=errstream, shell=shell, cwd=cwd or self.cwd, env=popen_env, stdin=stdin) except Exception as e: prot_exc = e lgr.log(11, "Failed to start %r%r: %s" % (cmd, " under %r" % cwd if cwd else '', exc_str(e))) raise finally: if self.protocol.records_ext_commands: self.protocol.end_section(prot_id, prot_exc) try: if log_online: out = self._get_output_online(proc, log_stdout, log_stderr, outputstream, errstream, expect_stderr=expect_stderr, expect_fail=expect_fail) else: out = proc.communicate() if PY3: # Decoding was delayed to this point def decode_if_not_None(x): return "" if x is None else binary_type.decode(x) # TODO: check if we can avoid PY3 specific here out = tuple(map(decode_if_not_None, out)) status = proc.poll() # needs to be done after we know status if not log_online: self._log_out(out[0]) if status not in [0, None]: self._log_err(out[1], expected=expect_fail) else: # as directed self._log_err(out[1], expected=expect_stderr) if status not in [0, None]: msg = "Failed to run %r%s. Exit code=%d. out=%s err=%s" \ % (cmd, " under %r" % (cwd or self.cwd), status, out[0], out[1]) lgr.log(9 if expect_fail else 11, msg) raise CommandError(str(cmd), msg, status, out[0], out[1]) else: self.log("Finished running %r with status %s" % (cmd, status), level=8) finally: # Those streams are for us to close if we asked for a PIPE # TODO -- assure closing the files _cleanup_output(outputstream, proc.stdout) _cleanup_output(errstream, proc.stderr) else: if self.protocol.records_ext_commands: self.protocol.add_section(shlex.split(cmd, posix=not on_windows) if isinstance(cmd, string_types) else cmd, None) out = ("DRY", "DRY") return out def call(self, f, *args, **kwargs): """Helper to unify collection of logging all "dry" actions.
Calls `f` if `Runner`-object is not in dry-mode. Adds `f` along with its arguments to `commands` otherwise. Parameters ---------- f: callable """ if self.protocol.do_execute_callables: if self.protocol.records_callables: prot_exc = None prot_id = self.protocol.start_section( [str(f), "args=%s" % str(args), "kwargs=%s" % str(kwargs)]) try: return f(*args, **kwargs) except Exception as e: prot_exc = e raise finally: if self.protocol.records_callables: self.protocol.end_section(prot_id, prot_exc) else: if self.protocol.records_callables: self.protocol.add_section( [str(f), "args=%s" % str(args), "kwargs=%s" % str(kwargs)], None) def log(self, msg, *args, **kwargs): """log helper Logs at level 9 by default and adds "Protocol:"-prefix in order to log the used protocol. """ level = kwargs.pop('level', 9) if isinstance(self.protocol, NullProtocol): lgr.log(level, msg, *args, **kwargs) else: if args: msg = msg % args lgr.log(level, "{%s} %s" % ( self.protocol.__class__.__name__, msg) ) class GitRunner(Runner): """ Runner to be used to run git and git annex commands Overloads the runner class to check & update GIT_DIR and GIT_WORK_TREE environment variables set to the absolute path if is defined and is relative path """ _GIT_PATH = None @borrowdoc(Runner) def __init__(self, *args, **kwargs): super(GitRunner, self).__init__(*args, **kwargs) self._check_git_path() @staticmethod def _check_git_path(): """If using bundled git-annex, we would like to use bundled with it git Thus we will store _GIT_PATH a path to git in the same directory as annex if found. If it is empty (but not None), we do nothing """ if GitRunner._GIT_PATH is None: from distutils.spawn import find_executable # with all the nesting of config and this runner, cannot use our # cfg here, so will resort to dark magic of environment options if (os.environ.get('DATALAD_USE_DEFAULT_GIT', '0').lower() in ('1', 'on', 'true', 'yes')): git_fpath = find_executable("git") if git_fpath: GitRunner._GIT_PATH = '' lgr.log(9, "Will use default git %s", git_fpath) return # we are done - there is a default git avail. # if not -- we will look for a bundled one annex_fpath = find_executable("git-annex") if not annex_fpath: # not sure how to live further anyways! ;) alongside = False else: annex_path = os.path.dirname(os.path.realpath(annex_fpath)) if on_windows: # just bundled installations so git should be taken from annex alongside = True else: alongside = os.path.lexists(os.path.join(annex_path, 'git')) GitRunner._GIT_PATH = annex_path if alongside else '' lgr.log(9, "Will use git under %r (no adjustments to PATH if empty " "string)", GitRunner._GIT_PATH) assert(GitRunner._GIT_PATH is not None) # we made the decision! 
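# A hedged usage sketch for the runners defined above (the git command, the
# cwd handling and the error handling below are illustrative assumptions, not
# prescribed datalad usage): GitRunner.run() returns a (stdout, stderr) pair
# and raises CommandError with `code`, `stdout` and `stderr` fields on a
# non-zero exit, as documented in Runner.run() above.
def _example_gitrunner_usage(path='.'):
    """Illustrative sketch only: run `git status` via GitRunner."""
    from datalad.support.exceptions import CommandError
    runner = GitRunner(cwd=path)
    try:
        out, err = runner.run(['git', 'status', '--porcelain'],
                              expect_stderr=True)
    except CommandError as e:
        # the failed call's details are carried on the exception
        return None, e.stderr
    return out, err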
@staticmethod def get_git_environ_adjusted(env=None): """ Replaces GIT_DIR and GIT_WORK_TREE with absolute paths if relative path and defined """ # if env set copy else get os environment git_env = env.copy() if env else os.environ.copy() if GitRunner._GIT_PATH: git_env['PATH'] = pathsep.join([GitRunner._GIT_PATH, git_env['PATH']]) \ if 'PATH' in git_env \ else GitRunner._GIT_PATH for varstring in ['GIT_DIR', 'GIT_WORK_TREE']: var = git_env.get(varstring) if var: # if env variable set if not isabs(var): # and it's a relative path git_env[varstring] = abspath(var) # to absolute path lgr.log(9, "Updated %s to %s", varstring, git_env[varstring]) if 'GIT_SSH_COMMAND' not in git_env: git_env['GIT_SSH_COMMAND'] = GIT_SSH_COMMAND return git_env def run(self, cmd, env=None, *args, **kwargs): return super(GitRunner, self).run( cmd, env=self.get_git_environ_adjusted(env), *args, **kwargs) # #### # Preserve from previous version # TODO: document intention # #### # this one might get under Runner for better output/control def link_file_load(src, dst, dry_run=False): """Just a little helper to hardlink files's load """ dst_dir = os.path.dirname(dst) if not os.path.exists(dst_dir): os.makedirs(dst_dir) if os.path.lexists(dst): lgr.log(9, "Destination file %(dst)s exists. Removing it first", locals()) # TODO: how would it interact with git/git-annex os.unlink(dst) lgr.log(9, "Hardlinking %(src)s under %(dst)s", locals()) src_realpath = os.path.realpath(src) try: os.link(src_realpath, dst) except AttributeError as e: lgr.warn("Linking of %s failed (%s), copying file" % (src, e)) shutil.copyfile(src_realpath, dst) shutil.copystat(src_realpath, dst) else: lgr.log(2, "Hardlinking finished") def get_runner(*args, **kwargs): # needs local import, because the ConfigManager itself needs the runner from . import cfg # TODO: this is all crawl specific -- should be moved away if cfg.obtain('datalad.crawl.dryrun', default=False): kwargs = kwargs.copy() kwargs['protocol'] = DryRunProtocol() return Runner(*args, **kwargs) datalad-0.9.3/datalad/cmdline/000077500000000000000000000000001325277200500161255ustar00rootroot00000000000000datalad-0.9.3/datalad/cmdline/__init__.py000066400000000000000000000006611325277200500202410ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ __docformat__ = 'restructuredtext' datalad-0.9.3/datalad/cmdline/common_args.py000066400000000000000000000043021325277200500210020ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ __docformat__ = 'restructuredtext' # argument spec template # = ( # , # {} #) from ..cmdline.helpers import HelpAction, LogLevelAction help = ( 'help', ('-h', '--help', '--help-np'), dict(nargs=0, action=HelpAction, help="""show this help message. 
--help-np forcefully disables the use of a pager for displaying the help message""") ) version = ( 'version', ('--version',), dict(action='version', help="show the program's version and license information") ) _log_level_names = ['critical', 'error', 'warning', 'info', 'debug'] log_level = ( 'log-level', ('-l', '--log-level'), dict(action=LogLevelAction, choices=_log_level_names + [str(x) for x in range(1, 10)], metavar="LEVEL", default='warning', help="""set logging verbosity level. Choose among %s. Also you can specify an integer <10 to provide even more debugging information""" % ', '.join(_log_level_names)) ) pbs_runner = ( 'pbs-runner', ('--pbs-runner',), dict(choices=['condor'], default=None, help="""execute command by scheduling it via available PBS. For settings, config file will be consulted""") ) change_path = ( 'change-path', ('-C',), dict(action='append', dest='change_path', metavar='PATH', help="""run as if datalad was started in instead of the current working directory. When multiple -C options are given, each subsequent non-absolute -C is interpreted relative to the preceding -C . This option affects the interpretations of the path names in that they are made relative to the working directory caused by the -C option""") ) datalad-0.9.3/datalad/cmdline/helpers.py000066400000000000000000000216721325277200500201510ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ __docformat__ = 'restructuredtext' import argparse import os import re import sys import gzip from tempfile import NamedTemporaryFile from ..cmd import Runner from ..log import is_interactive from ..utils import getpwd from ..version import __version__ from ..dochelpers import exc_str from logging import getLogger lgr = getLogger('datalad.cmdline') class HelpAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): if is_interactive() and option_string == '--help': # lets use the manpage on mature systems ... try: import subprocess # get the datalad manpage to use manfile = os.environ.get('MANPATH', '/usr/share/man') \ + '/man1/{0}.1.gz'.format(parser.prog.replace(' ', '-')) # extract version field from the manpage if not os.path.exists(manfile): raise IOError("manfile is not found") with gzip.open(manfile) as f: man_th = [line for line in f if line.startswith(b".TH")][0] man_version = man_th.split(b' ')[5].strip(b" '\"\t\n").decode('utf-8') # don't show manpage if man_version not equal to current datalad_version if __version__ != man_version: raise ValueError subprocess.check_call( 'man %s 2> /dev/null' % manfile, shell=True) sys.exit(0) except (subprocess.CalledProcessError, IOError, OSError, IndexError, ValueError) as e: lgr.debug("Did not use manpage since %s", exc_str(e)) if option_string == '-h': helpstr = "%s\n%s" % ( parser.format_usage(), "Use '--help' to get more comprehensive information.") else: helpstr = parser.format_help() # better for help2man # for main command -- should be different sections. And since we are in # heavy output massaging mode... 
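# The block above pulls a version string out of the ".TH" header line of the
# gzip-compressed manpage and falls back to the argparse-generated help when it
# does not match the running datalad version.  A self-contained sketch of just
# that parsing step (the manpage path is an illustrative assumption):
def _example_manpage_version(manfile='/usr/share/man/man1/datalad.1.gz'):
    """Illustrative sketch only: read the version field from a manpage .TH line."""
    import gzip
    with gzip.open(manfile) as f:
        th_lines = [line for line in f if line.startswith(b".TH")]
    if not th_lines:
        return None
    # same whitespace-split field (index 5) that HelpAction above relies on
    return th_lines[0].split(b' ')[5].strip(b" '\"\t\n").decode('utf-8')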
if "commands for dataset operations" in helpstr.lower(): opt_args_str = '*Global options*' pos_args_str = '*Commands*' # tune up usage -- default one is way too heavy helpstr = re.sub('^[uU]sage: .*?\n\s*\n', 'Usage: datalad [global-opts] command [command-opts]\n\n', helpstr, flags=re.MULTILINE | re.DOTALL) # and altogether remove sections with long list of commands helpstr = re.sub(r'positional arguments:\s*\n\s*{.*}\n', '', helpstr) else: opt_args_str = "*Options*" pos_args_str = "*Arguments*" helpstr = re.sub(r'optional arguments:', opt_args_str, helpstr) helpstr = re.sub(r'positional arguments:', pos_args_str, helpstr) # convert all headings to have the first character uppercase headpat = re.compile(r'^([a-z])(.*):$', re.MULTILINE) helpstr = re.subn( headpat, lambda match: r'{0}{1}:'.format(match.group(1).upper(), match.group(2)), helpstr)[0] # usage is on the same line helpstr = re.sub(r'^usage:', 'Usage:', helpstr) print(helpstr) sys.exit(0) class LogLevelAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): from ..log import LoggerHelper LoggerHelper().set_level(level=values) # MIH: Disabled. Non-functional, untested. #class PBSAction(argparse.Action): # """Action to schedule actual command execution via PBS (e.g. Condor)""" # def __call__(self, parser, namespace, values, option_string=None): # pbs = values[0] # import pdb; pdb.set_trace() # i = 1 def parser_add_common_opt(parser, opt, names=None, **kwargs): from . import common_args opt_tmpl = getattr(common_args, opt) opt_kwargs = opt_tmpl[2].copy() opt_kwargs.update(kwargs) if names is None: parser.add_argument(*opt_tmpl[1], **opt_kwargs) else: parser.add_argument(*names, **opt_kwargs) def strip_arg_from_argv(args, value, opt_names): """Strip an originally listed option (with its value) from the list cmdline args """ # Yarik doesn't know better if args is None: args = sys.argv # remove present pbs-runner option args_clean = [] skip = 0 for i, arg in enumerate(args): if skip: # we skip only one as instructed skip -= 1 continue if not (arg in opt_names and i < len(args) - 1 and args[i + 1] == value): args_clean.append(arg) else: # we need to skip this one and next one skip = 1 return args_clean def run_via_pbs(args, pbs): assert(pbs in ('condor',)) # for now # TODO: RF to support multiple backends, parameters, etc, for now -- just condor, no options f = NamedTemporaryFile('w', prefix='datalad-%s-' % pbs, suffix='.submit', delete=False) try: pwd = getpwd() logs = f.name.replace('.submit', '.log') exe = args[0] # TODO: we might need better way to join them, escaping spaces etc. There must be a stock helper #exe_args = ' '.join(map(repr, args[1:])) if len(args) > 1 else '' exe_args = ' '.join(args[1:]) if len(args) > 1 else '' f.write("""\ Executable = %(exe)s Initialdir = %(pwd)s Output = %(logs)s Error = %(logs)s getenv = True arguments = %(exe_args)s queue """ % locals()) f.close() Runner().run(['condor_submit', f.name]) lgr.info("Scheduled execution via %s. 
Logs will be stored under %s" % (pbs, logs)) finally: os.unlink(f.name) class RegexpType(object): """Factory for creating regular expression types for argparse DEPRECATED AFAIK -- now things are in the config file, but we might provide a mode where we operate solely from cmdline """ def __call__(self, string): if string: return re.compile(string) else: return None # TODO: useful also outside of cmdline, move to support/ from os import curdir def get_repo_instance(path=curdir, class_=None): """Returns an instance of appropriate datalad repository for path. Check whether a certain path is inside a known type of repository and returns an instance representing it. May also check for a certain type instead of detecting the type of repository. Parameters ---------- path: str path to check; default: current working directory class_: class if given, check whether path is inside a repository, that can be represented as an instance of the passed class. Raises ------ RuntimeError, in case cwd is not inside a known repository. """ from os.path import ismount, exists, normpath, isabs from git.exc import InvalidGitRepositoryError from ..utils import expandpath from ..support.gitrepo import GitRepo from ..support.annexrepo import AnnexRepo dir_ = expandpath(path) abspath_ = path if isabs(path) else dir_ if class_ is not None: if class_ == AnnexRepo: type_ = "annex" elif class_ == GitRepo: type_ = "git" else: raise RuntimeError("Unknown class %s." % str(class_)) while not ismount(dir_): # TODO: always correct termination? if exists(opj(dir_, '.git')): # found git dir if class_ is None: # detect repo type: try: return AnnexRepo(dir_, create=False) except RuntimeError as e: pass try: return GitRepo(dir_, create=False) except InvalidGitRepositoryError as e: raise RuntimeError("No datalad repository found in %s" % abspath_) else: try: return class_(dir_, create=False) except (RuntimeError, InvalidGitRepositoryError) as e: raise RuntimeError("No %s repository found in %s." % (type_, abspath_)) else: dir_ = normpath(opj(dir_, "..")) if class_ is not None: raise RuntimeError("No %s repository found in %s" % (type_, abspath_)) else: raise RuntimeError("No datalad repository found in %s" % abspath_) from appdirs import AppDirs from os.path import join as opj dirs = AppDirs("datalad", "datalad.org") datalad-0.9.3/datalad/cmdline/main.py000066400000000000000000000427741325277200500174410ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. 
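# A hedged usage sketch for get_repo_instance() above (the path handling and
# the annex/git distinction below are illustrative, not prescribed datalad
# usage): the helper walks up from `path` until it finds a .git directory or
# hits a mount point, and raises RuntimeError when nothing is found.
def _example_get_repo_instance(path='.'):
    """Illustrative sketch only: detect the repository flavour containing `path`."""
    from datalad.cmdline.helpers import get_repo_instance
    from datalad.support.annexrepo import AnnexRepo
    try:
        repo = get_repo_instance(path)
    except RuntimeError:
        return None
    # get_repo_instance() tries AnnexRepo before GitRepo, so an annex
    # repository comes back as an AnnexRepo instance
    return 'annex' if isinstance(repo, AnnexRepo) else 'git'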
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """""" __docformat__ = 'restructuredtext' import logging lgr = logging.getLogger('datalad.cmdline') lgr.log(5, "Importing cmdline.main") import argparse import sys import textwrap import shutil from importlib import import_module import os from six import text_type import datalad from datalad.cmdline import helpers from datalad.support.exceptions import InsufficientArgumentsError from datalad.support.exceptions import IncompleteResultsError from datalad.support.exceptions import CommandError from .helpers import strip_arg_from_argv from ..utils import setup_exceptionhook, chpwd from ..dochelpers import exc_str def _license_info(): return """\ Copyright (c) 2013-2018 DataLad developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ # TODO: OPT look into making setup_parser smarter to become faster # Now it seems to take up to 200ms to do all the parser setup # even though it might not be necessary to know about all the commands etc. # I wondered if it could somehow decide on what commands to worry about etc # by going through sys.args first def setup_parser( formatter_class=argparse.RawDescriptionHelpFormatter, return_subparsers=False): lgr.log(5, "Starting to setup_parser") # delay since it can be a heavy import from ..interface.base import dedent_docstring, get_interface_groups, \ get_cmdline_command_name, alter_interface_docs_for_cmdline # setup cmdline args parser parts = {} # main parser parser = argparse.ArgumentParser( # cannot use '@' because we need to input JSON-LD properties (which might come wit @ prefix) # MH: question, do we need this at all? fromfile_prefix_chars=':', # usage="%(prog)s ...", description=dedent_docstring("""\ DataLad provides a unified data distribution with the convenience of git-annex repositories as a backend. DataLad command line tools allow to manipulate (obtain, create, update, publish, etc.) 
datasets and their collections."""), epilog='"Control Your Data"', formatter_class=formatter_class, add_help=False) # common options helpers.parser_add_common_opt(parser, 'help') helpers.parser_add_common_opt(parser, 'log_level') helpers.parser_add_common_opt(parser, 'pbs_runner') helpers.parser_add_common_opt(parser, 'change_path') helpers.parser_add_common_opt( parser, 'version', version='datalad %s\n\n%s' % (datalad.__version__, _license_info())) if __debug__: parser.add_argument( '--dbg', action='store_true', dest='common_debug', help="enter Python debugger when uncaught exception happens") parser.add_argument( '--idbg', action='store_true', dest='common_idebug', help="enter IPython debugger when uncaught exception happens") parser.add_argument( '-c', action='append', dest='cfg_overrides', metavar='KEY=VALUE', help="""configuration variable setting. Overrides any configuration read from a file, but is potentially overridden itself by configuration variables in the process environment.""") parser.add_argument( '--output-format', dest='common_output_format', default='default', metavar="{default,json,json_pp,tailored,'
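# The global options above follow standard argparse semantics.  A tiny
# stand-alone illustration (plain argparse, not datalad's actual parser) of the
# two patterns used here -- the repeatable -C option and -c KEY=VALUE overrides
# collected with action='append'; the option names mirror the specs above, the
# values are made up:
def _example_global_options():
    """Illustrative sketch only: mimic the -C / -c option handling."""
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument('-C', action='append', dest='change_path', metavar='PATH')
    p.add_argument('-c', action='append', dest='cfg_overrides', metavar='KEY=VALUE')
    ns = p.parse_args(['-C', '/tmp', '-C', 'data', '-c', 'user.name=me'])
    # each occurrence is appended in order: ['/tmp', 'data'] and ['user.name=me']
    return ns.change_path, ns.cfg_overrides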