datalad-0.12.4 source archive (git commit 96aa9ae582a1beede7c195dc13c958a84a1717f6)

datalad-0.12.4/.coveragerc

[paths]
source =
    datalad/
    */datalad/

datalad-0.12.4/.github/PULL_REQUEST_TEMPLATE.md

### Instructions

Please go through the following checklist before submitting the PR:

- [ ] Provide an overview of the changes you're making and explain why you're proposing them.
- [ ] Include `Fixes #NNN` or `Closes #NNN` somewhere in the description if this PR addresses an existing issue.
- [ ] If this PR is not complete, select "Create Draft Pull Request" in the pull request button's menu.
  Consider using a task list (e.g., `- [ ] add tests ...`) to indicate remaining to-do items.
- [ ] If you would like to list yourself as a DataLad contributor and your name is not mentioned please modify .zenodo.json file.
- [ ] **Delete these instructions**. :-)

Thanks for contributing!

datalad-0.12.4/.github/issue_template.md

#### What is the problem?

#### What steps will reproduce the problem?

#### What version of DataLad are you using (run `datalad --version`)? On what operating system (consider running `datalad wtf`)?

#### Is there anything else that would be useful to know in this context?

#### Have you had any success using DataLad before? (to assess your expertise/prior luck.
We would welcome your testimonial additions to https://github.com/datalad/datalad/wiki/Testimonials as well)

datalad-0.12.4/.github/workflows/benchmarks.yml

name: Benchmarks

on: [pull_request]

jobs:
  vs-master:
    runs-on: ubuntu-latest
    steps:
      - name: Set up system
        shell: bash
        run: |
          bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh)
          sudo apt-get update -qq
          sudo apt-get install eatmydata
          sudo eatmydata apt-get install git-annex-standalone
      - name: Set up environment
        run: |
          git config --global user.email "test@github.land"
          git config --global user.name "GitHub Almighty"
      - uses: actions/checkout@v1
      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: 3.7
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install ".[devel-docs]"
      - name: Run benchmarks
        env:
          # fake environment to be able to reuse script for travis
          TRAVIS_PULL_REQUEST: true
        run: |
          tools/ci/benchmark-travis-pr.sh

datalad-0.12.4/.github/workflows/docbuild.yml

name: Docs

on: [pull_request]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Set up environment
        run: |
          git config --global user.email "test@github.land"
          git config --global user.name "GitHub Almighty"
      - uses: actions/checkout@v1
      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: 3.7
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install ".[devel-docs]"
          sudo apt-get install p7zip
      - name: Build docs
        run: |
          make -C docs html doctest;

datalad-0.12.4/.github/workflows/test_extensions.yml

name: Extensions

on: [pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        extension: [
          datalad_neuroimaging,
          datalad_container,
          datalad_metalad,
          datalad_crawler,
        ]
    steps:
      - name: Set up system
        shell: bash
        run: |
          bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh)
          sudo apt-get update -qq
          sudo apt-get install eatmydata
          sudo eatmydata apt-get install git-annex-standalone
      - name: Set up environment
        run: |
          git config --global user.email "test@github.land"
          git config --global user.name "GitHub Almighty"
      - uses: actions/checkout@v1
      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: 3.7
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Install ${{ matrix.extension }} extension
        run: |
          pip install ${{ matrix.extension }}
        if: matrix.extension != 'datalad_crawler'
      - name: Install ${{ matrix.extension }} extension
        run: |
          pip install ${{ matrix.extension }}[devel]
        if: matrix.extension == 'datalad_crawler'
      - name: Install Singularity
        run: sudo eatmydata apt-get install singularity-container
        if: matrix.extension == 'datalad_container'
      - name: WTF!?
run: | datalad wtf - name: ${{ matrix.extension }} tests run: | mkdir -p __testhome__ cd __testhome__ python -m nose -s -v --with-cov --cover-package datalad ${{ matrix.extension }} datalad-0.12.4/.github/workflows/test_win2019_disabled000066400000000000000000000035571363461734600225360ustar00rootroot00000000000000name: Win2019 on: [pull_request] jobs: test: runs-on: windows-2019 strategy: fail-fast: false matrix: # group modules to get a job runtime of ~15min module: [ datalad.core, datalad.local datalad.distributed datalad.support, datalad.customremotes datalad.downloaders datalad.plugin, datalad.distribution, datalad.interface, datalad.metadata datalad.tests datalad.ui datalad.cmdline ] steps: - name: Set up environment run: | git config --global user.email "test@github.land" git config --global user.name "GitHub Almighty" - uses: actions/checkout@v1 - name: Set up Python 3.7 uses: actions/setup-python@v1 with: python-version: 3.7 - name: Set up git-annex run: | python -c "import urllib.request as r; r.urlretrieve('https://downloads.kitenet.net/git-annex/windows/current/git-annex-installer.exe', 'C:\\git-annex-installer.exe')" 7z x -o"C:\\Program Files\Git" C:\\git-annex-installer.exe - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt pip install colorama pip install ".[tests]" pip install ".[devel-utils]" - name: WTF!? run: | datalad wtf dir - name: ${{ matrix.module }} tests run: | mkdir -p __testhome__ cd __testhome__ python -m nose -s -v --with-cov --cover-package datalad ${{ matrix.module }} # coverage report is not functional because codecov refuses to accept the # report #- name: Coverage report # run: | # cd __testhome__ # python -m coverage xml # powershell.exe Invoke-WebRequest -Uri "https://codecov.io/bash" -OutFile codecov.sh # bash codecov.sh -f coverage.xml datalad-0.12.4/.gitignore000066400000000000000000000004001363461734600151670ustar00rootroot00000000000000.pybuild/ datalad.build/ /.idea .coverage /venv* /cfgs /dist /fixtures /diagrams /build /docs/build /docs/source/generated /docs/source/_extras/schema.json /.tox /repos tmp .noseids *.egg-info *.py[coe] *.bak .#* .orig .rej .*.swp .travis.yml.evil-bd .asv datalad-0.12.4/.gitmodules000066400000000000000000000001131363461734600153550ustar00rootroot00000000000000[submodule ".asv"] path = .asv url = https://github.com/datalad/.asv.git datalad-0.12.4/.mailmap000066400000000000000000000013441363461734600146300ustar00rootroot00000000000000Alejandro de la Vega Anisha Keshavan Benjamin Poldrack Benjamin Poldrack Christian Olaf Häusler chris Debanjum Singh Solanky Debanjum debanjum Jason Gors Michael Hanke mih Torsten Stoeter Yaroslav Halchenko Yaroslav Halchenko Neuroimaging Community Neuroimaging Community Neuroimaging Community datalad-0.12.4/.noannex000066400000000000000000000000001363461734600146420ustar00rootroot00000000000000datalad-0.12.4/.travis.yml000066400000000000000000000253571363461734600153320ustar00rootroot00000000000000# vim ft=yaml # travis-ci.org definition for DataLad build language: python python: - 3.5 ##1 - 3.6 cache: - apt env: global: # will be used in the matrix, where neither other variable is used - BOTO_CONFIG=/tmp/nowhere - DATALAD_TESTS_SSH=1 - DATALAD_LOG_CMD_ENV=GIT_SSH_COMMAND - TESTS_TO_PERFORM=datalad - NOSE_OPTS= - NOSE_SELECTION="integration or usecase or slow" - NOSE_SELECTION_OP="not " # so it would be "not (integration or usecase)" # Special settings/helper for combined coverage from special remotes execution - COVERAGE=coverage - 
DATALAD_DATASETS_TOPURL=http://datasets-tests.datalad.org matrix: ##1 - DATALAD_REPO_VERSION=6 - DATALAD_REPO_VERSION=5 # Disabled since old folks don't want to change workflows of submitting through central authority # - secure: "k2rHdTBjUU3pUUASqfRr7kHaaSmNKLLAR2f66A0fFSulih4CXxwLrR3g8/HP9m+jMve8mAYEiPSI7dT7cCm3WMA/piyLh2wKCGgzDD9oLjtvPAioR8dgLpzbgjxV/Vq6fwwPMlvbqqa+MmAImnAoSufEmI7zVQHCq11Hd5nd6Es=" # - secure: "Az7Pzo5pSdAFTTX6XXzE4VUS3wnlIe1vi/+bfHBzDjxstDvZVkPjPzaIs6v/BLod43AYBl1v9dyJR4qnBnaVrUDLB3tC0emLhJ2qnw+8GKHSSImCwIOeZzg9QpXeVQovZUqQVQ3fg3KIWCIzhmJ59EbMQcI4krNDxk4WcXmyVfk=" matrix: include: # Additional custom ones - python: 3.7 # Single run for Python 3.7 env: # Run all tests in a single whoop here # We cannot have empty -A selector, so the one which always will be fulfilled - NOSE_SELECTION= - NOSE_SELECTION_OP=not - python: 3.6 # Single run for Python 3.6 env: # Run all tests in a single whoop here # We cannot have empty -A selector, so the one which always will be fulfilled - NOSE_SELECTION= - NOSE_SELECTION_OP=not - python: 3.5 # Split runs for v6 since a single one is too long now env: - DATALAD_REPO_VERSION=6 - NOSE_SELECTION_OP=not - python: 3.5 env: - DATALAD_REPO_VERSION=6 - NOSE_SELECTION_OP="" - python: 3.5 # Run slow etc tests under a single tricky scenario env: - TMPDIR="/var/tmp/sym link" - NOSE_SELECTION_OP="" # And the leading - in filenames for the most challenge - DATALAD_TESTS_OBSCURE_PREFIX=- - DATALAD_LOG_TRACEBACK=collide # just a smoke test for now - python: 3.5 # A run loaded with various customizations to smoke test those functionalities # apparently moving symlink outside has different effects on abspath # see https://github.com/datalad/datalad/issues/878 env: # eventually: - TMPDIR="/var/tmp/sym ссылка" - TMPDIR="/var/tmp/sym link" # and obscure the names more a bit - DATALAD_TESTS_OBSCURE_PREFIX=- # By default no logs will be output. This one is to test with log output at INFO level - DATALAD_LOG_LEVEL=INFO - DATALAD_LOG_TRACEBACK=1 # just a smoke test for now - DATALAD_LOG_VMEM=1 - python: 3.5 # By default no logs will be output. This one is to test with low level but dumped to /dev/null env: - DATALAD_LOG_LEVEL=2 - DATALAD_LOG_TARGET=/dev/null - DATALAD_TESTS_PROTOCOLREMOTE=1 - DATALAD_TESTS_DATALADREMOTE=1 - DATALAD_LOG_CMD_CWD=1 - DATALAD_LOG_CMD_OUTPUTS=1 - DATALAD_LOG_CMD_ENV=1 - DATALAD_LOG_CMD_STDIN=1 - DATALAD_TESTS_UI_BACKEND=console - DATALAD_TESTS_OBSCURE_PREFIX=- - DATALAD_SEED=1 - GIT_AUTHOR_DATE="Thu, 07 Apr 2005 22:13:13 +0200" - GIT_AUTHOR_NAME=blah - GIT_AUTHOR_EMAIL=committer@example.com - GIT_COMMITTER_DATE="Thu, 07 Apr 2005 22:13:13 +0200" - GIT_COMMITTER_NAME=blah - GIT_COMMITTER_EMAIL=committer@example.com - python: 3.5 # Test some under NFS mount (only selected sub-set) env: - TMPDIR="/tmp/nfsmount" - TESTS_TO_PERFORM="datalad.tests datalad.support" # # The ones to run only on weekends against master. # They will not contribute to coverage etc, but might lead to failed status # - python: 3.5 # By default no logs will be output. This one is to test with log output at INFO level env: - _DL_UPSTREAM_GITPYTHON=1 # It is not usable without setting locales env vars ATM # see https://github.com/datalad/datalad/issues/3159 # - _DL_UPSTREAM_GITANNEX=1 # Just so we test if we did not screw up running under nose without -s as well - NOSE_OPTS= - _DL_CRON=1 # run if git-annex version in neurodebian -devel differs - python: 3.5 env: - _DL_DEVEL_ANNEX=1 - _DL_CRON=1 # Run with latest git rather than the bundled one. 
- python: 3.5 addons: apt: sources: - sourceline: 'ppa:git-core/ppa' env: - DATALAD_USE_DEFAULT_GIT=1 - _DL_UPSTREAM_GIT=1 - _DL_CRON=1 - python: 3.5 env: # to test operation under root since also would consider FS "crippled" due to # ability to rewrite R/O files - NOSE_WRAPPER="sudo -E" # no key authentication for root: - DATALAD_TESTS_SSH=0 - _DL_CRON=1 - python: 3.5 env: - DATALAD_TESTS_NONETWORK=1 # must operate nicely with those env variables set - http_proxy= - https_proxy= - _DL_CRON=1 - python: 3.5 env: - PYTHONPATH=$PWD/tools/testing/bad_internals/_scrapy/ - _DL_CRON=1 - python: 3.5 # Test under NFS mount (full, only in master) env: - TMPDIR="/tmp/nfsmount" - _DL_CRON=1 # - python: 3.5 # # test whether known v6 failures still fail # env: # - DATALAD_REPO_VERSION=6 # - DATALAD_TESTS_KNOWNFAILURES_SKIP=no # - DATALAD_TESTS_KNOWNFAILURES_PROBE=yes # - DATALAD_TESTS_SSH=1 # - _DL_CRON=1 allow_failures: # Test under NFS mount (full, only in master) - python: 3.5 env: - TMPDIR="/tmp/nfsmount" - _DL_CRON=1 # # test whether known v6 failures still fail # - env: # - DATALAD_REPO_VERSION=6 # - DATALAD_TESTS_KNOWNFAILURES_SKIP=no # - DATALAD_TESTS_KNOWNFAILURES_PROBE=yes # - DATALAD_TESTS_SSH=1 # - _DL_CRON=1 # Causes complete laptop or travis instance crash atm, but survives in a docker # need to figure it out (looks like some PID explosion) # - python: 3.5 # # we would need to migrate to boto3 to test it fully, but SSH should work # env: # - DATALAD_TESTS_SSH=1 # - UNSET_S3_SECRETS=1 before_install: - if [ ! -z "${_DL_CRON:-}" ]; then if [ ! ${TRAVIS_EVENT_TYPE} = "cron" ]; then echo "Exiting since should only be ran on a weekly basis by cron!"; exit 0; fi fi # Life was so fast that we needed a slow down # - sleep 60 # TEMP -- to spot diff # Just in case we need to check if nfs is there etc - sudo lsmod # The ultimate one-liner setup for NeuroDebian repository - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - travis_retry sudo apt-get update -qq - travis_retry sudo apt-get install eatmydata # to speedup some installations - sudo eatmydata tools/ci/prep-travis-forssh-sudo.sh - tools/ci/prep-travis-forssh.sh # Install grunt-cli - eatmydata npm install grunt-cli # Install optionally upstream current development so we are sure that they break nothing important for us - if [ ! -z "${_DL_UPSTREAM_GITPYTHON:-}" ]; then pip install https://github.com/gitpython-developers/GitPython/archive/master.zip; fi - if [ ! -z "${_DL_UPSTREAM_GITANNEX:-}" ]; then sudo tools/ci/install-annex-snapshot.sh; sudo ln -s `find /usr/local/lib/git-annex.linux -maxdepth 1 -type f -perm /+x` /usr/local/bin/; else sudo eatmydata apt-get install git-annex-standalone ; fi # Install optionally -devel version of annex, and if goes wrong (we have most recent), exit right away - if [ ! -z "${_DL_DEVEL_ANNEX:-}" ]; then tools/ci/prep-travis-devel-annex.sh || { ex="$?"; if [ "$ex" -eq 99 ]; then exit 0; else exit "$ex"; fi; }; fi # Optionally install the latest Git. Exit code 100 indicates that bundled is same as the latest. - if [ ! -z "${_DL_UPSTREAM_GIT:-}" ]; then sudo tools/ci/install-latest-git.sh || { [ $? 
-eq 100 ] && exit 0; } || exit 1; fi install: # Install standalone build of git-annex for the recent enough version - travis_retry sudo eatmydata apt-get install zip pandoc p7zip-full # needed for tests of patool compression fall-back solution - travis_retry sudo eatmydata apt-get install xz-utils - travis_retry sudo eatmydata apt-get install shunit2 # for metadata support - travis_retry sudo eatmydata apt-get install exempi - git config --global user.email "test@travis.land" - git config --global user.name "Travis Almighty" - cd ..; pip install -q codecov; cd - - pip install -r requirements-devel.txt # So we could test under sudo -E with PATH pointing to installed location - sudo sed -i -e 's/^Defaults.*secure_path.*$//' /etc/sudoers # TMPDIRs - if [[ "${TMPDIR:-}" =~ .*/sym\ link ]]; then echo "Symlinking $TMPDIR"; ln -s /tmp "$TMPDIR"; fi - if [[ "${TMPDIR:-}" =~ .*/d\ i\ r ]]; then echo "mkdir $TMPDIR"; mkdir -p "$TMPDIR"; fi - if [[ "${TMPDIR:-}" =~ .*/nfsmount ]]; then echo "mkdir $TMPDIR"; mkdir -p "$TMPDIR" "${TMPDIR}_"; echo "/tmp/nfsmount_ localhost(rw)" | sudo bash -c 'cat - > /etc/exports'; sudo apt-get install -y nfs-kernel-server; sudo exportfs -a; sudo mount -t nfs localhost:/tmp/nfsmount_ /tmp/nfsmount; fi # S3 - if [ ! -z "$UNSET_S3_SECRETS" ]; then echo "usetting"; unset DATALAD_datalad_test_s3_key_id DATALAD_datalad_test_s3_secret_id; fi # Install grunt to test run javascript frontend tests - npm install grunt - npm install grunt-contrib-qunit@>=3.0.1 script: # Test installation system-wide - sudo pip install . - mkdir -p __testhome__ - cd __testhome__ # Run tests - http_proxy= PATH=$PWD/../tools/coverage-bin:$PATH $NOSE_WRAPPER `which nosetests` $NOSE_OPTS -A "$NOSE_SELECTION_OP($NOSE_SELECTION)" --with-doctest --with-cov --cover-package datalad --logging-level=INFO $TESTS_TO_PERFORM # Run doc examples if no spaces in the TMPDIR and SSH is allowed - if [ ${DATALAD_TESTS_SSH:-0} != 1 ] || echo "${TMPDIR:-}" | grep -q ' '; then echo "Testing examples is disabled by configuration"; else $NOSE_WRAPPER ../tools/testing/run_doc_examples; fi - cd .. # Run javascript tests - grunt test --verbose # Report WTF information using system wide installed version - datalad wtf after_success: # submit only what we have covered in the PRs - if [ -z "${_DL_CRON:-}" ]; then cp __testhome__/.coverage .; coverage combine -a /tmp/.coverage-entrypoints-*; codecov; fi # makes it only more difficult to comprehend the failing output. Enable only when necessary # for a particular debugging #after_failure: # - if [ ! -z "$DATALAD_TESTS_NONETWORK" ]; then sudo route add -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi # - DATALAD_LOG_LEVEL=DEBUG $NOSE_WRAPPER `which nosetests` -s -v --with-doctest --with-cov --cover-package datalad --logging-level=DEBUG # - if [ ! 
-z "$DATALAD_TESTS_NONETWORK" ]; then sudo route del -net 0.0.0.0 netmask 0.0.0.0 dev lo; fi datalad-0.12.4/.zenodo.json000066400000000000000000000055431363461734600154630ustar00rootroot00000000000000{ "creators": [ { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany and Institute of Systems Neuroscience, Medical Faculty, Heinrich Heine University Düsseldorf, Düsseldorf, Germany", "name": "Hanke, Michael", "orcid": "0000-0001-6398-6370" }, { "affiliation": "Dartmouth College, Hanover, NH, United States", "name": "Halchenko, Yaroslav O.", "orcid": "0000-0003-3456-2493" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany", "name": "Poldrack, Benjamin", "orcid": "0000-0001-7628-0801" }, { "affiliation": "Dartmouth College, Hanover, NH, United States", "name": "Meyer, Kyle" }, { "affiliation": "Dartmouth College, Hanover, NH, United States", "name": "Solanky, Debanjum Singh" }, { "name": "Alteva, Gergana" }, { "affiliation": "Dartmouth College, Hanover, NH, United States", "name": "Gors, Jason" }, { "name": "MacFarlane, Dave" }, { "name": "Olaf Häusler, Christian" }, { "name": "Olson, Taylor" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany", "name": "Waite, Alex" }, { "affiliation": "University of Texas at Austin", "name": "De La Vega, Alejandro", "orcid": "0000-0001-9062-3778" }, { "name": "Sochat, Vanessa" }, { "affiliation": "UC Berkeley - UCSF Graduate Program in Bioengineering", "name": "Keshavan, Anisha", "orcid": "0000-0003-3554-043X" }, { "affiliation": "Dartmouth College, Hanover, NH, United States", "name": "Ma, Feilong" }, { "name": "Christian, Horea" }, { "name": "Poelen, Jorrit" }, { "name": "Skytén, Kusti" }, { "affiliation": "UC Berkeley", "name": "Visconti di Oleggio Castello, Matteo", "orcid": "0000-0001-7931-5272" }, { "name": "Hardcastle, Nell" }, { "name": "Stoeter, Torsten" }, { "name": "C Lau, Vicky" }, { "affiliation": "Stanford University, Stanford, CA, United States", "name": "Markiewicz, Christopher J.", "orcid": "0000-0002-6533-164X" }, { "affiliation": "Psychoinformatics Lab, INM-7, Research Centre Juelich", "name": "Wagner, Adina S.", "orcid": "0000-0003-2917-3450" } ], "grants": [ {"id": "10.13039/100000001::1429999"} ], "keywords": [ "data management", "data distribution", "execution provenance tracking", "version control" ], "access_right": "open", "license": "MIT", "upload_type": "software" } datalad-0.12.4/CHANGELOG.md000066400000000000000000003302271363461734600150250ustar00rootroot00000000000000 ____ _ _ _ | _ \ __ _ | |_ __ _ | | __ _ __| | | | | | / _` | | __| / _` | | | / _` | / _` | | |_| | | (_| | | |_ | (_| | | |___ | (_| | | (_| | |____/ \__,_| \__| \__,_| |_____| \__,_| \__,_| Change Log This is a high level and scarce summary of the changes between releases. We would recommend to consult log of the [DataLad git repository](http://github.com/datalad/datalad) for more details. ## 0.12.4 (Mar 19, 2020) -- Windows?!  The main purpose of this release is to have one on PyPi that has no associated wheel to enable a working installation on Windows ([#4315][]). ### Fixes - Adjust the behavior of the `log.outputs` config switch to make outputs visible. Its description was adjusted accordingly. ## 0.12.3 (March 16, 2020) -- . 
Updates for compatibility with the latest git-annex, along with a few miscellaneous fixes ### Major refactoring and deprecations - All spots that raised a `NoDatasetArgumentFound` exception now raise a `NoDatasetFound` exception to better reflect the situation: it is the _dataset_ rather than the _argument_ that is not found. For compatibility, the latter inherits from the former, but new code should prefer the latter. ([#4285][]) ### Fixes - Updates for compatibility with git-annex version 8.20200226. ([#4214][]) - `datalad export-to-figshare` failed to export if the generated title was fewer than three characters. It now queries the caller for the title and guards against titles that are too short. ([#4140][]) - Authentication was requested multiple times when git-annex launched parallel downloads from the `datalad` special remote. ([#4308][]) - At verbose logging levels, DataLad requests that git-annex display debugging information too. Work around a bug in git-annex that prevented that from happening. ([#4212][]) - The internal command runner looked in the wrong place for some configuration variables, including `datalad.log.outputs`, resulting in the default value always being used. ([#4194][]) - [publish][] failed when trying to publish to a git-lfs special remote for the first time. ([#4200][]) - `AnnexRepo.set_remote_url` is supposed to establish shared SSH connections but failed to do so. ([#4262][]) ### Enhancements and new features - The message provided when a command cannot determine what dataset to operate on has been improved. ([#4285][]) - The "aws-s3" authentication type now allows specifying the host through "aws-s3_host", which was needed to work around an authorization error due to a longstanding upstream bug. ([#4239][]) - The xmp metadata extractor now recognizes ".wav" files. ## 0.12.2 (Jan 28, 2020) -- Smoothen the ride Mostly a bugfix release with various robustifications, but also makes the first step towards versioned dataset installation requests. ### Major refactoring and deprecations - The minimum required version for GitPython is now 2.1.12. ([#4070][]) ### Fixes - The class for handling configuration values, `ConfigManager`, inappropriately considered the current working directory's dataset, if any, for both reading and writing when instantiated with `dataset=None`. This misbehavior is fairly inaccessible through typical use of DataLad. It affects `datalad.cfg`, the top-level configuration instance that should not consider repository-specific values. It also affects Python users that call `Dataset` with a path that does not yet exist and persists until that dataset is created. ([#4078][]) - [update][] saved the dataset when called with `--merge`, which is unnecessary and risks committing unrelated changes. ([#3996][]) - Confusing and irrelevant information about Python defaults have been dropped from the command-line help. ([#4002][]) - The logic for automatically propagating the 'origin' remote when cloning a local source didn't properly account for relative paths. ([#4045][]) - Various fixes to file name handling and quoting on Windows. ([#4049][]) ([#4050][]) - When cloning failed, error lines were not bubbled up to the user in some scenarios. ([#4060][]) ### Enhancements and new features - [clone][] (and thus [install][]) - now propagates the `reckless` mode from the superdataset when cloning a dataset into it. ([#4037][]) - gained support for `ria+://` URLs that point to [RIA][handbook-scalable-datastore] stores. 
([#4022][]) - learned to read "@version" from `ria+` URLs and install that version of a dataset ([#4036][]) and to apply URL rewrites configured through Git's `url.*.insteadOf` mechanism ([#4064][]). - now copies `datalad.get.subdataset-source-candidate-` options configured within the superdataset into the subdataset. This is particularly useful for RIA data stores. ([#4073][]) - Archives are now (optionally) handled with 7-Zip instead of `patool`. 7-Zip will be used by default, but `patool` will be used on non-Windows systems if the `datalad.runtime.use-patool` option is set or the `7z` executable is not found. ([#4041][]) ## 0.12.1 (Jan 15, 2020) -- Small bump after big bang Fix some fallout after major release. ### Fixes - Revert incorrect relative path adjustment to URLs in [clone][]. ([#3538][]) - Various small fixes to internal helpers and test to run on Windows ([#2566][]) ([#2534][]) ## 0.12.0 (Jan 11, 2020) -- Krakatoa This release is the result of more than a year of development that includes fixes for a large number of issues, yielding more robust behavior across a wider range of use cases, and introduces major changes in API and behavior. It is the first release for which extensive user documentation is available in a dedicated [DataLad Handbook][handbook]. Python 3 (3.5 and later) is now the only supported Python flavor. ### Major changes 0.12 vs 0.11 - [save][] fully replaces [add][] (which is obsolete now, and will be removed in a future release). - A new Git-annex aware [status][] command enables detailed inspection of dataset hierarchies. The previously available [diff][] command has been adjusted to match [status][] in argument semantics and behavior. - The ability to configure dataset procedures prior and after the execution of particular commands has been replaced by a flexible "hook" mechanism that is able to run arbitrary DataLad commands whenever command results are detected that match a specification. - Support of the Windows platform has been improved substantially. While performance and feature coverage on Windows still falls behind Unix-like systems, typical data consumer use cases, and standard dataset operations, such as [create][] and [save][], are now working. Basic support for data provenance capture via [run][] is also functional. - Support for Git-annex direct mode repositories has been removed, following the end of support in Git-annex itself. - The semantics of relative paths in command line arguments have changed. Previously, a call `datalad save --dataset /tmp/myds some/relpath` would have been interpreted as saving a file at `/tmp/myds/some/relpath` into dataset `/tmp/myds`. This has changed to saving `$PWD/some/relpath` into dataset `/tmp/myds`. More generally, relative paths are now always treated as relative to the current working directory, except for path arguments of [Dataset][] class instance methods of the Python API. The resulting partial duplication of path specifications between path and dataset arguments is mitigated by the introduction of two special symbols that can be given as dataset argument: `^` and `^.`, which identify the topmost superdataset and the closest dataset that contains the working directory, respectively. - The concept of a "core API" has been introduced. Commands situated in the module `datalad.core` (such as [create][], [save][], [run][], [status][], [diff][]) receive additional scrutiny regarding API and implementation, and are meant to provide longer-term stability. 
Application developers are encouraged to preferentially build on these commands. ### Major refactoring and deprecations since 0.12.0rc6 - [clone][] has been incorporated into the growing core API. The public `--alternative-source` parameter has been removed, and a `clone_dataset` function with multi-source capabilities is provided instead. The `--reckless` parameter can now take literal mode labels instead of just beeing a binary flag, but backwards compatibility is maintained. - The `get_file_content` method of `GitRepo` was no longer used internally or in any known DataLad extensions and has been removed. ([#3812][]) - The function `get_dataset_root` has been replaced by `rev_get_dataset_root`. `rev_get_dataset_root` remains as a compatibility alias and will be removed in a later release. ([#3815][]) - The `add_sibling` module, marked obsolete in v0.6.0, has been removed. ([#3871][]) - `mock` is no longer declared as an external dependency because we can rely on it being in the standard library now that our minimum required Python version is 3.5. ([#3860][]) - [download-url][] now requires that directories be indicated with a trailing slash rather than interpreting a path as directory when it doesn't exist. This avoids confusion that can result from typos and makes it possible to support directory targets that do not exist. ([#3854][]) - The `dataset_only` argument of the `ConfigManager` class is deprecated. Use `source="dataset"` instead. ([#3907][]) - The `--proc-pre` and `--proc-post` options have been removed, and configuration values for `datalad.COMMAND.proc-pre` and `datalad.COMMAND.proc-post` are no longer honored. The new result hook mechanism provides an alternative for `proc-post` procedures. ([#3963][]) ### Fixes since 0.12.0rc6 - [publish][] crashed when called with a detached HEAD. It now aborts with an informative message. ([#3804][]) - Since 0.12.0rc6 the call to [update][] in [siblings][] resulted in a spurious warning. ([#3877][]) - [siblings][] crashed if it encountered an annex repository that was marked as dead. ([#3892][]) - The update of [rerun][] in v0.12.0rc3 for the rewritten [diff][] command didn't account for a change in the output of `diff`, leading to `rerun --report` unintentionally including unchanged files in its diff values. ([#3873][]) - In 0.12.0rc5 [download-url][] was updated to follow the new path handling logic, but its calls to AnnexRepo weren't properly adjusted, resulting in incorrect path handling when the called from a dataset subdirectory. ([#3850][]) - [download-url][] called `git annex addurl` in a way that failed to register a URL when its header didn't report the content size. ([#3911][]) - With Git v2.24.0, saving new subdatasets failed due to a bug in that Git release. ([#3904][]) - With DataLad configured to stop on failure (e.g., specifying `--on-failure=stop` from the command line), a failing result record was not rendered. ([#3863][]) - Installing a subdataset yielded an "ok" status in cases where the repository was not yet in its final state, making it ineffective for a caller to operate on the repository in response to the result. ([#3906][]) - The internal helper for converting git-annex's JSON output did not relay information from the "error-messages" field. ([#3931][]) - [run-procedure][] reported relative paths that were confusingly not relative to the current directory in some cases. It now always reports absolute paths. 
([#3959][]) - [diff][] inappropriately reported files as deleted in some cases when `to` was a value other than `None`. ([#3999][]) - An assortment of fixes for Windows compatibility. ([#3971][]) ([#3974][]) ([#3975][]) ([#3976][]) ([#3979][]) - Subdatasets installed from a source given by relative path will now have this relative path used as 'url' in their .gitmodules record, instead of an absolute path generated by Git. ([#3538][]) - [clone][] will now correctly interpret '~/...' paths as absolute path specifications. ([#3958][]) - [run-procedure][] mistakenly reported a directory as a procedure. ([#3793][]) - The cleanup for batched git-annex processes has been improved. ([#3794][]) ([#3851][]) - The function for adding a version ID to an AWS S3 URL doesn't support URLs with an "s3://" scheme and raises a `NotImplementedError` exception when it encounters one. The function learned to return a URL untouched if an "s3://" URL comes in with a version ID. ([#3842][]) - A few spots needed to be adjusted for compatibility with git-annex's new `--sameas` [feature][gx-sameas], which allows special remotes to share a data store. ([#3856][]) - The `swallow_logs` utility failed to capture some log messages due to an incompatibility with Python 3.7. ([#3935][]) - [siblings][] - crashed if `--inherit` was passed but the parent dataset did not have a remote with a matching name. ([#3954][]) - configured the wrong pushurl and annexurl values in some cases. ([#3955][]) ### Enhancements and new features since 0.12.0rc6 - By default, datasets cloned from local source paths will now get a configured remote for any recursively discoverable 'origin' sibling that is also available from a local path in order to maximize automatic file availability across local annexes. ([#3926][]) - The new [result hooks mechanism][hooks] allows callers to specify, via local Git configuration values, DataLad command calls that will be triggered in response to matching result records (i.e., what you see when you call a command with `-f json_pp`). ([#3903][]) - The command interface classes learned to use a new `_examples_` attribute to render documentation examples for both the Python and command-line API. ([#3821][]) - Candidate URLs for cloning a submodule can now be generated based on configured templates that have access to various properties of the submodule, including its dataset ID. ([#3828][]) - DataLad's check that the user's Git identity is configured has been sped up and now considers the appropriate environment variables as well. ([#3807][]) - The `tag` method of `GitRepo` can now tag revisions other than `HEAD` and accepts a list of arbitrary `git tag` options. ([#3787][]) - When `get` clones a subdataset and the subdataset's HEAD differs from the commit that is registered in the parent, the active branch of the subdataset is moved to the registered commit if the registered commit is an ancestor of the subdataset's HEAD commit. This handling has been moved to a more central location within `GitRepo`, and now applies to any `update_submodule(..., init=True)` call. ([#3831][]) - The output of `datalad -h` has been reformatted to improve readability. ([#3862][]) - [unlock][] has been sped up. ([#3880][]) - [run-procedure][] learned to provide and render more information about discovered procedures, including whether the procedure is overridden by another procedure with the same base name. ([#3960][]) - [save][] now ([#3817][]) - records the active branch in the superdataset when registering a new subdataset. 
  - calls `git annex sync` when saving a dataset on an adjusted branch so that
    the changes are brought into the mainline branch.
- [subdatasets][] now aborts when its `dataset` argument points to a
  non-existent dataset. ([#3940][])
- [wtf][] now
  - reports the dataset ID if the current working directory is visiting a
    dataset. ([#3888][])
  - outputs entries deterministically. ([#3927][])
- The `ConfigManager` class
  - learned to exclude ``.datalad/config`` as a source of configuration
    values, restricting the sources to standard Git configuration files, when
    called with `source="local"`. ([#3907][])
  - accepts a value of "override" for its `where` argument to allow Python
    callers to more conveniently override configuration. ([#3970][])
- Commands now accept a `dataset` value of "^." as shorthand for "the dataset
  to which the current directory belongs". ([#3242][])

## 0.12.0rc6 (Oct 19, 2019) -- some releases are better than the others

But we will fix some bugs and make the world an even better place.

### Major refactoring and deprecations

- DataLad no longer supports Python 2. The minimum supported version of Python
  is now 3.5. ([#3629][])
- Much of the user-focused content at http://docs.datalad.org has been removed
  in favor of more up to date and complete material available in the
  [DataLad Handbook][handbook]. Going forward, the plan is to restrict
  http://docs.datalad.org to technical documentation geared at developers.
  ([#3678][])
- [update][] used to allow the caller to specify which dataset(s) to update as
  a `PATH` argument or via the `--dataset` option; now only the latter is
  supported. Path arguments only serve to restrict which subdatasets are
  updated when operating recursively. ([#3700][])
- Result records from a [get][] call no longer have a "state" key.
  ([#3746][])
- [update][] and [get][] no longer support operating on independent
  hierarchies of datasets. ([#3700][]) ([#3746][])
- The [run][] update in 0.12.0rc4 for the new path resolution logic broke the
  handling of inputs and outputs for calls from a subdirectory. ([#3747][])
- The `is_submodule_modified` method of `GitRepo` as well as two helper
  functions in gitrepo.py, `kwargs_to_options` and `split_remote_branch`, were
  no longer used internally or in any known DataLad extensions and have been
  removed. ([#3702][]) ([#3704][])
- The `only_remote` option of `GitRepo.is_with_annex` was not used internally
  or in any known extensions and has been dropped. ([#3768][])
- The `get_tags` method of `GitRepo` used to sort tags by committer date. It
  now sorts them by the tagger date for annotated tags and the committer date
  for lightweight tags. ([#3715][])
- The `rev_resolve_path` helper has replaced `resolve_path`. ([#3797][])

### Fixes

- Correctly handle relative paths in [publish][]. ([#3799][]) ([#3102][])
- Do not erroneously discover a directory as a procedure. ([#3793][])
- Correctly extract version from manpage to trigger use of manpages for
  `--help`. ([#3798][])
- The `cfg_yoda` procedure saved all modifications in the repository rather
  than saving only the files it modified. ([#3680][])
- Some spots in the documentation that were supposed to appear as two hyphens
  were incorrectly rendered in the HTML output as en-dashes. ([#3692][])
- [create][], [install][], and [clone][] treated paths as relative to the
  dataset even when the string form was given, violating the new path handling
  rules. ([#3749][]) ([#3777][]) ([#3780][])
- Providing the "^" shortcut to `--dataset` didn't work properly when called
  from a subdirectory of a subdataset.
([#3772][]) - We failed to propagate some errors from git-annex when working with its JSON output. ([#3751][]) - With the Python API, callers are allowed to pass a string or list of strings as the `cfg_proc` argument to [create][], but the string form was mishandled. ([#3761][]) - Incorrect command quoting for SSH calls on Windows that rendered basic SSH-related functionality (e.g., [sshrun][]) on Windows unusable. ([#3688][]) - Annex JSON result handling assumed platform-specific paths on Windows instead of the POSIX-style that is happening across all platforms. ([#3719][]) - `path_is_under()` was incapable of comparing Windows paths with different drive letters. ([#3728][]) ### Enhancements and new features - Provide a collection of "public" `call_git*` helpers within GitRepo and replace use of "private" and less specific `_git_custom_command` calls. ([#3791][]) - [status][] gained a `--report-filetype`. Setting it to "raw" can give a performance boost for the price of no longer distinguishing symlinks that point to annexed content from other symlinks. ([#3701][]) - [save][] disables file type reporting by [status][] to improve performance. ([#3712][]) - [subdatasets][] ([#3743][]) - now extends its result records with a `contains` field that lists which `contains` arguments matched a given subdataset. - yields an 'impossible' result record when a `contains` argument wasn't matched to any of the reported subdatasets. - [install][] now shows more readable output when cloning fails. ([#3775][]) - `SSHConnection` now displays a more informative error message when it cannot start the `ControlMaster` process. ([#3776][]) - If the new configuration option `datalad.log.result-level` is set to a single level, all result records will be logged at that level. If you've been bothered by DataLad's double reporting of failures, consider setting this to "debug". ([#3754][]) - Configuration values from `datalad -c OPTION=VALUE ...` are now validated to provide better errors. ([#3695][]) - [rerun][] learned how to handle history with merges. As was already the case when cherry picking non-run commits, re-creating merges may results in conflicts, and `rerun` does not yet provide an interface to let the user handle these. ([#2754][]) - The `fsck` method of `AnnexRepo` has been enhanced to expose more features of the underlying `git fsck` command. ([#3693][]) - `GitRepo` now has a `for_each_ref_` method that wraps `git for-each-ref`, which is used in various spots that used to rely on GitPython functionality. ([#3705][]) - Do not pretend to be able to work in optimized (`python -O`) mode, crash early with an informative message. ([#3803][]) ## 0.12.0rc5 (September 04, 2019) -- . Various fixes and enhancements that bring the 0.12.0 release closer. ### Major refactoring and deprecations - The two modules below have a new home. The old locations still exist as compatibility shims and will be removed in a future release. - `datalad.distribution.subdatasets` has been moved to `datalad.local.subdatasets` ([#3429][]) - `datalad.interface.run` has been moved to `datalad.core.local.run` ([#3444][]) - The `lock` method of `AnnexRepo` and the `options` parameter of `AnnexRepo.unlock` were unused internally and have been removed. ([#3459][]) - The `get_submodules` method of `GitRepo` has been rewritten without GitPython. When the new `compat` flag is true (the current default), the method returns a value that is compatible with the old return value. 
This backwards-compatible return value and the `compat` flag will be removed in a future release. ([#3508][]) - The logic for resolving relative paths given to a command has changed ([#3435][]). The new rule is that relative paths are taken as relative to the dataset only if a dataset _instance_ is passed by the caller. In all other scenarios they're considered relative to the current directory. The main user-visible difference from the command line is that using the `--dataset` argument does _not_ result in relative paths being taken as relative to the specified dataset. (The undocumented distinction between "rel/path" and "./rel/path" no longer exists.) All commands under `datalad.core` and `datalad.local`, as well as `unlock` and `addurls`, follow the new logic. The goal is for all commands to eventually do so. ### Fixes - The function for loading JSON streams wasn't clever enough to handle content that included a Unicode line separator like U2028. ([#3524][]) - When [unlock][] was called without an explicit target (i.e., a directory or no paths at all), the call failed if any of the files did not have content present. ([#3459][]) - `AnnexRepo.get_content_info` failed in the rare case of a key without size information. ([#3534][]) - [save][] ignored `--on-failure` in its underlying call to [status][]. ([#3470][]) - Calling [remove][] with a subdirectory displayed spurious warnings about the subdirectory files not existing. ([#3586][]) - Our processing of `git-annex --json` output mishandled info messages from special remotes. ([#3546][]) - [create][] - didn't bypass the "existing subdataset" check when called with `--force` as of 0.12.0rc3 ([#3552][]) - failed to register the up-to-date revision of a subdataset when `--cfg-proc` was used with `--dataset` ([#3591][]) - The base downloader had some error handling that wasn't compatible with Python 3. ([#3622][]) - Fixed a number of Unicode py2-compatibility issues. ([#3602][]) - `AnnexRepo.get_content_annexinfo` did not properly chunk file arguments to avoid exceeding the command-line character limit. ([#3587][]) ### Enhancements and new features - New command `create-sibling-gitlab` provides an interface for creating a publication target on a GitLab instance. ([#3447][]) - [subdatasets][] ([#3429][]) - now supports path-constrained queries in the same manner as commands like `save` and `status` - gained a `--contains=PATH` option that can be used to restrict the output to datasets that include a specific path. - now narrows the listed subdatasets to those underneath the current directory when called with no arguments - [status][] learned to accept a plain `--annex` (no value) as shorthand for `--annex basic`. ([#3534][]) - The `.dirty` property of `GitRepo` and `AnnexRepo` has been sped up. ([#3460][]) - The `get_content_info` method of `GitRepo`, used by `status` and commands that depend on `status`, now restricts its git calls to a subset of files, if possible, for a performance gain in repositories with many files. ([#3508][]) - Extensions that do not provide a command, such as those that provide only metadata extractors, are now supported. ([#3531][]) - When calling git-annex with `--json`, we log standard error at the debug level rather than the warning level if a non-zero exit is expected behavior. ([#3518][]) - [create][] no longer refuses to create a new dataset in the odd scenario of an empty .git/ directory upstairs. 
([#3475][])
- As of v2.22.0 Git treats a sub-repository on an unborn branch as a
  repository rather than as a directory. Our documentation and tests have been
  updated appropriately. ([#3476][])
- [addurls][] learned to accept a `--cfg-proc` value and pass it to its
  `create` calls. ([#3562][])

## 0.12.0rc4 (May 15, 2019) -- the revolution is over

With the replacement of the `save` command implementation with `rev-save` the
revolution effort is now over, and the set of key commands for local dataset
operations (`create`, `run`, `save`, `status`, `diff`) is now complete. This
new core API is available from `datalad.core.local` (and also via
`datalad.api`, as any other command).

### Major refactoring and deprecations

- The `add` command is now deprecated. It will be removed in a future release.

### Fixes

- Remove hard-coded dependencies on POSIX path conventions in SSH support code
  ([#3400][])
- Emit an `add` result when adding a new subdataset during [save][]
  ([#3398][])
- SSH file transfer now actually opens a shared connection, if none exists yet
  ([#3403][])

### Enhancements and new features

- `SSHConnection` now offers methods for file upload and download (`get()`,
  `put()`). The previous `copy()` method only supported upload and was
  discontinued ([#3401][])

## 0.12.0rc3 (May 07, 2019) -- the revolution continues

Continues API consolidation and replaces the `create` and `diff` commands with
more performant implementations.

### Major refactoring and deprecations

- The previous `diff` command has been replaced by the diff variant from the
  [datalad-revolution][] extension. ([#3366][])
- `rev-create` has been renamed to `create`, and the previous `create` has
  been removed. ([#3383][])
- The procedure `setup_yoda_dataset` has been renamed to `cfg_yoda`
  ([#3353][]).
- The `--nosave` of `addurls` now affects only added content, not newly
  created subdatasets ([#3259][]).
- `Dataset.get_subdatasets` (deprecated since v0.9.0) has been removed.
  ([#3336][])
- The `.is_dirty` method of `GitRepo` and `AnnexRepo` has been replaced by
  `.status` or, for a subset of cases, the `.dirty` property. ([#3330][])
- `AnnexRepo.get_status` has been replaced by `AnnexRepo.status`. ([#3330][])

### Fixes

- [status][]
  - reported on directories that contained only ignored files ([#3238][])
  - gave a confusing failure when called from a subdataset with an explicitly
    specified dataset argument and "." as a path ([#3325][])
  - misleadingly claimed that the locally present content size was zero when
    `--annex basic` was specified ([#3378][])
- An informative error wasn't given when a download provider was invalid.
  ([#3258][])
- Calling `rev-save PATH` saved unspecified untracked subdatasets.
  ([#3288][])
- The available choices for command-line options that take values are now
  displayed more consistently in the help output. ([#3326][])
- The new pathlib-based code had various encoding issues on Python 2.
  ([#3332][])

### Enhancements and new features

- [wtf][] now includes information about the Python version. ([#3255][])
- When operating in an annex repository, checking whether git-annex is
  available is now delayed until a call to git-annex is actually needed,
  allowing systems without git-annex to operate on annex repositories in a
  restricted fashion. ([#3274][])
- The `load_stream` helper now supports auto-detection of compressed files.
([#3289][]) - `create` (formerly `rev-create`) - learned to be speedier by passing a path to `status` ([#3294][]) - gained a `--cfg-proc` (or `-c`) convenience option for running configuration procedures (or more accurately any procedure that begins with "cfg_") in the newly created dataset ([#3353][]) - `AnnexRepo.set_metadata` now returns a list while `AnnexRepo.set_metadata_` returns a generator, a behavior which is consistent with the `add` and `add_` method pair. ([#3298][]) - `AnnexRepo.get_metadata` now supports batch querying of known annex files. Note, however, that callers should carefully validate the input paths because the batch call will silently hang if given non-annex files. ([#3364][]) - [status][] - now reports a "bytesize" field for files tracked by Git ([#3299][]) - gained a new option `eval_subdataset_state` that controls how the subdataset state is evaluated. Depending on the information you need, you can select a less expensive mode to make `status` faster. ([#3324][]) - colors deleted files "red" ([#3334][]) - Querying repository content is faster due to batching of `git cat-file` calls. ([#3301][]) - The dataset ID of a subdataset is now recorded in the superdataset. ([#3304][]) - `GitRepo.diffstatus` - now avoids subdataset recursion when the comparison is not with the working tree, which substantially improves performance when diffing large dataset hierarchies ([#3314][]) - got smarter and faster about labeling a subdataset as "modified" ([#3343][]) - `GitRepo.get_content_info` now supports disabling the file type evaluation, which gives a performance boost in cases where this information isn't needed. ([#3362][]) - The XMP metadata extractor now filters based on file name to improve its performance. ([#3329][]) ## 0.12.0rc2 (Mar 18, 2019) -- revolution! ### Fixes - `GitRepo.dirty` does not report on nested empty directories ([#3196][]). - `GitRepo.save()` reports results on deleted files. ### Enhancements and new features - Absorb a new set of core commands from the datalad-revolution extension: - `rev-status`: like `git status`, but simpler and working with dataset hierarchies - `rev-save`: a 2-in-1 replacement for save and add - `rev-create`: a ~30% faster create - JSON support tools can now read and write compressed files. ## 0.12.0rc1 (Mar 03, 2019) -- to boldly go ... ### Major refactoring and deprecations - Discontinued support for git-annex direct-mode (also no longer supported upstream). ### Enhancements and new features - Dataset and Repo object instances are now hashable, and can be created based on pathlib Path object instances - Imported various additional methods for the Repo classes to query information and save changes. ## 0.11.8 (Oct 11, 2019) -- annex-we-are-catching-up ### Fixes - Our internal command runner failed to capture output in some cases. ([#3656][]) - Workaround in the tests around python in cPython >= 3.7.5 ';' in the filename confusing mimetypes ([#3769][]) ([#3770][]) ### Enhancements and new features - Prepared for upstream changes in git-annex, including support for the latest git-annex - 7.20190912 auto-upgrades v5 repositories to v7. ([#3648][]) ([#3682][]) - 7.20191009 fixed treatment of (larger/smaller)than in .gitattributes ([#3765][]) - The `cfg_text2git` procedure, as well the `--text-no-annex` option of [create][], now configure .gitattributes so that empty files are stored in git rather than annex. ([#3667][]) ## 0.11.7 (Sep 06, 2019) -- python2-we-still-love-you-but-... 
Primarily bugfixes with some optimizations and refactorings. ### Fixes - [addurls][] - now provides better handling when the URL file isn't in the expected format. ([#3579][]) - always considered a relative file for the URL file argument as relative to the current working directory, which goes against the convention used by other commands of taking relative paths as relative to the dataset argument. ([#3582][]) - [run-procedure][] - hard coded "python" when formatting the command for non-executable procedures ending with ".py". `sys.executable` is now used. ([#3624][]) - failed if arguments needed more complicated quoting than simply surrounding the value with double quotes. This has been resolved for systems that support `shlex.quote`, but note that on Windows values are left unquoted. ([#3626][]) - [siblings][] now displays an informative error message if a local path is given to `--url` but `--name` isn't specified. ([#3555][]) - [sshrun][], the command DataLad uses for `GIT_SSH_COMMAND`, didn't support all the parameters that Git expects it to. ([#3616][]) - Fixed a number of Unicode py2-compatibility issues. ([#3597][]) - [download-url][] now will create leading directories of the output path if they do not exist ([#3646][]) ### Enhancements and new features - The [annotate-paths][] helper now caches subdatasets it has seen to avoid unnecessary calls. ([#3570][]) - A repeated configuration query has been dropped from the handling of `--proc-pre` and `--proc-post`. ([#3576][]) - Calls to `git annex find` now use `--in=.` instead of the alias `--in=here` to take advantage of an optimization that git-annex (as of the current release, 7.20190730) applies only to the former. ([#3574][]) - [addurls][] now suggests close matches when the URL or file format contains an unknown field. ([#3594][]) - Shared logic used in the setup.py files of Datalad and its extensions has been moved to modules in the _datalad_build_support/ directory. ([#3600][]) - Get ready for upcoming git-annex dropping support for direct mode ([#3631][]) ## 0.11.6 (Jul 30, 2019) -- am I the last of 0.11.x? Primarily bug fixes to achieve more robust performance ### Fixes - Our tests needed various adjustments to keep up with upstream changes in Travis and Git. ([#3479][]) ([#3492][]) ([#3493][]) - `AnnexRepo.is_special_annex_remote` was too selective in what it considered to be a special remote. ([#3499][]) - We now provide information about unexpected output when git-annex is called with `--json`. ([#3516][]) - Exception logging in the `__del__` method of `GitRepo` and `AnnexRepo` no longer fails if the names it needs are no longer bound. ([#3527][]) - [addurls][] botched the construction of subdataset paths that were more than two levels deep and failed to create datasets in a reliable, breadth-first order. ([#3561][]) - Cloning a `type=git` special remote showed a spurious warning about the remote not being enabled. ([#3547][]) ### Enhancements and new features - For calls to git and git-annex, we disable automatic garbage collection due to past issues with GitPython's state becoming stale, but doing so results in a larger .git/objects/ directory that isn't cleaned up until garbage collection is triggered outside of DataLad. Tests with the latest GitPython didn't reveal any state issues, so we've re-enabled automatic garbage collection. ([#3458][]) - [rerun][] learned an `--explicit` flag, which it relays to its calls to [run][[]]. This makes it possible to call `rerun` in a dirty working tree ([#3498][]). 
- The [metadata][] command aborts earlier if a metadata extractor is unavailable. ([#3525][]) ## 0.11.5 (May 23, 2019) -- stability is not overrated Should be faster and less buggy, with a few enhancements. ### Fixes - [create-sibling][] ([#3318][]) - Siblings are no longer configured with a post-update hook unless a web interface is requested with `--ui`. - `git submodule update --init` is no longer called from the post-update hook. - If `--inherit` is given for a dataset without a superdataset, a warning is now given instead of raising an error. - The internal command runner failed on Python 2 when its `env` argument had unicode values. ([#3332][]) - The safeguard that prevents creating a dataset in a subdirectory that already contains tracked files for another repository failed on Git versions before 2.14. For older Git versions, we now warn the caller that the safeguard is not active. ([#3347][]) - A regression introduced in v0.11.1 prevented [save][] from committing changes under a subdirectory when the subdirectory was specified as a path argument. ([#3106][]) - A workaround introduced in v0.11.1 made it possible for [save][] to do a partial commit with an annex file that has gone below the `annex.largefiles` threshold. The logic of this workaround was faulty, leading to files being displayed as typechanged in the index following the commit. ([#3365][]) - The resolve_path() helper confused paths that had a semicolon for SSH RIs. ([#3425][]) - The detection of SSH RIs has been improved. ([#3425][]) ### Enhancements and new features - The internal command runner was too aggressive in its decision to sleep. ([#3322][]) - The "INFO" label in log messages now retains the default text color for the terminal rather than using white, which only worked well for terminals with dark backgrounds. ([#3334][]) - A short flag `-R` is now available for the `--recursion-limit` flag, a flag shared by several subcommands. ([#3340][]) - The authentication logic for [create-sibling-github][] has been revamped and now supports 2FA. ([#3180][]) - New configuration option `datalad.ui.progressbar` can be used to configure the default backend for progress reporting ("none", for example, results in no progress bars being shown). ([#3396][]) - A new progress backend, available by setting datalad.ui.progressbar to "log", replaces progress bars with a log message upon completion of an action. ([#3396][]) - DataLad learned to consult the [NO_COLOR][] environment variable and the new `datalad.ui.color` configuration option when deciding to color output. The default value, "auto", retains the current behavior of coloring output if attached to a TTY ([#3407][]). - [clean][] now removes annex transfer directories, which is useful for cleaning up failed downloads. ([#3374][]) - [clone][] no longer refuses to clone into a local path that looks like a URL, making its behavior consistent with `git clone`. ([#3425][]) - [wtf][] - Learned to fall back to the `dist` package if `platform.dist`, which has been removed in the yet-to-be-release Python 3.8, does not exist. ([#3439][]) - Gained a `--section` option for limiting the output to specific sections and a `--decor` option, which currently knows how to format the output as GitHub's `
## 0.11.4 (Mar 18, 2019) -- get-ready Largely a bug fix release with a few enhancements ### Important - 0.11.x series will be the last one with support for direct mode of [git-annex][] which is used on crippled (no symlinks and no locking) filesystems. v7 repositories should be used instead. ### Fixes - Extraction of .gz files is broken without p7zip installed. We now abort with an informative error in this situation. ([#3176][]) - Committing failed in some cases because we didn't ensure that the path passed to `git read-tree --index-output=...` resided on the same filesystem as the repository. ([#3181][]) - Some pointless warnings during metadata aggregation have been eliminated. ([#3186][]) - With Python 3 the LORIS token authenticator did not properly decode a response ([#3205][]). - With Python 3 downloaders unnecessarily decoded the response when getting the status, leading to an encoding error. ([#3210][]) - In some cases, our internal command Runner did not adjust the environment's `PWD` to match the current working directory specified with the `cwd` parameter. ([#3215][]) - The specification of the pyliblzma dependency was broken. ([#3220][]) - [search] displayed an uninformative blank log message in some cases. ([#3222][]) - The logic for finding the location of the aggregate metadata DB anchored the search path incorrectly, leading to a spurious warning. ([#3241][]) - Some progress bars were still displayed when stdout and stderr were not attached to a tty. ([#3281][]) - Check for stdin/out/err to not be closed before checking for `.isatty`. ([#3268][]) ### Enhancements and new features - Creating a new repository now aborts if any of the files in the directory are tracked by a repository in a parent directory. ([#3211][]) - [run] learned to replace the `{tmpdir}` placeholder in commands with a temporary directory. ([#3223][]) - [duecredit][] support has been added for citing DataLad itself as well as datasets that an analysis uses. ([#3184][]) - The `eval_results` interface helper unintentionally modified one of its arguments. ([#3249][]) - A few DataLad constants have been added, changed, or renamed ([#3250][]): - `HANDLE_META_DIR` is now `DATALAD_DOTDIR`. The old name should be considered deprecated. - `METADATA_DIR` now refers to `DATALAD_DOTDIR/metadata` rather than `DATALAD_DOTDIR/meta` (which is still available as `OLDMETADATA_DIR`). - The new `DATASET_METADATA_FILE` refers to `METADATA_DIR/dataset.json`. - The new `DATASET_CONFIG_FILE` refers to `DATALAD_DOTDIR/config`. - `METADATA_FILENAME` has been renamed to `OLDMETADATA_FILENAME`. ## 0.11.3 (Feb 19, 2019) -- read-me-gently Just a few important fixes and minor enhancements. ### Fixes - The logic for setting the maximum command line length now works around Python 3.4 returning an unreasonably high value for `SC_ARG_MAX` on Debian systems. ([#3165][]) - DataLad commands that are conceptually "read-only", such as `datalad ls -L`, can fail when the caller lacks write permissions because git-annex tries merging remote git-annex branches to update information about availability. DataLad now disables `annex.merge-annex-branches` in some common "read-only" scenarios to avoid these failures. ([#3164][]) ### Enhancements and new features - Accessing an "unbound" dataset method now automatically imports the necessary module rather than requiring an explicit import from the Python caller.
For example, calling `Dataset.add` no longer needs to be preceded by `from datalad.distribution.add import Add` or an import of `datalad.api`. ([#3156][]) - Configuring the new variable `datalad.ssh.identityfile` instructs DataLad to pass a value to the `-i` option of `ssh`. ([#3149][]) ([#3168][]) ## 0.11.2 (Feb 07, 2019) -- live-long-and-prosper A variety of bugfixes and enhancements ### Major refactoring and deprecations - All extracted metadata is now placed under git-annex by default. Previously files smaller than 20 kb were stored in git. ([#3109][]) - The function `datalad.cmd.get_runner` has been removed. ([#3104][]) ### Fixes - Improved handling of long commands: - The code that inspected `SC_ARG_MAX` didn't check that the reported value was a sensible, positive number. ([#3025][]) - More commands that invoke `git` and `git-annex` with file arguments learned to split up the command calls when it is likely that the command would fail due to exceeding the maximum supported length. ([#3138][]) - The `setup_yoda_dataset` procedure created a malformed .gitattributes line. ([#3057][]) - [download-url][] unnecessarily tried to infer the dataset when `--no-save` was given. ([#3029][]) - [rerun][] aborted too late and with a confusing message when a ref specified via `--onto` didn't exist. ([#3019][]) - [run][]: - `run` didn't preserve the current directory prefix ("./") on inputs and outputs, which is problematic if the caller relies on this representation when formatting the command. ([#3037][]) - Fixed a number of unicode py2-compatibility issues. ([#3035][]) ([#3046][]) - To proceed with a failed command, the user was confusingly instructed to use `save` instead of `add` even though `run` uses `add` underneath. ([#3080][]) - Fixed a case where the helper class for checking external modules incorrectly reported a module as unknown. ([#3051][]) - [add-archive-content][] mishandled the archive path when the leading path contained a symlink. ([#3058][]) - Following denied access, the credential code failed to consider a scenario, leading to a type error rather than an appropriate error message. ([#3091][]) - Some tests failed when executed from a `git worktree` checkout of the source repository. ([#3129][]) - During metadata extraction, batched annex processes weren't properly terminated, leading to issues on Windows. ([#3137][]) - [add][] incorrectly handled an "invalid repository" exception when trying to add a submodule. ([#3141][]) - Pass `GIT_SSH_VARIANT=ssh` to git processes to be able to specify alternative ports in SSH urls ### Enhancements and new features - [search][] learned to suggest closely matching keys if there are no hits. ([#3089][]) - [create-sibling][] - gained a `--group` option so that the caller can specify the file system group for the repository. ([#3098][]) - now understands SSH URLs that have a port in them (i.e. the "ssh://[user@]host.xz[:port]/path/to/repo.git/" syntax mentioned in `man git-fetch`). ([#3146][]) - Interface classes can now override the default renderer for summarizing results. ([#3061][]) - [run][]: - `--input` and `--output` can now be shortened to `-i` and `-o`. ([#3066][]) - Placeholders such as "{inputs}" are now expanded in the command that is shown in the commit message subject. ([#3065][]) - `interface.run.run_command` gained an `extra_inputs` argument so that wrappers like [datalad-container][] can specify additional inputs that aren't considered when formatting the command string. 
([#3038][]) - "--" can now be used to separate options for `run` and those for the command in ambiguous cases. ([#3119][]) - The utilities `create_tree` and `ok_file_has_content` now support ".gz" files. ([#3049][]) - The Singularity container for 0.11.1 now uses [nd_freeze][] to make its builds reproducible. - A [publications][] page has been added to the documentation. ([#3099][]) - `GitRepo.set_gitattributes` now accepts a `mode` argument that controls whether the .gitattributes file is appended to (default) or overwritten. ([#3115][]) - `datalad --help` now avoids using `man` so that the list of subcommands is shown. ([#3124][]) ## 0.11.1 (Nov 26, 2018) -- v7-better-than-v6 Rushed out bugfix release to stay fully compatible with recent [git-annex][] which introduced v7 to replace v6. ### Fixes - [install][]: be able to install recursively into a dataset ([#2982][]) - [save][]: be able to commit/save changes whenever files potentially could have swapped their storage between git and annex ([#1651][]) ([#2752][]) ([#3009][]) - [aggregate-metadata][]: - the dataset itself is now not "aggregated" if specific paths are provided for aggregation ([#3002][]). That resolves the issue of `-r` invocation aggregating all subdatasets of the specified dataset as well - also compare/verify the actual content checksum of aggregated metadata while considering subdataset metadata for re-aggregation ([#3007][]) - `annex` commands are now chunked assuming 50% "safety margin" on the maximal command line length. Should resolve crashes while operating on too many files at once ([#3001][]) - `run` sidecar config processing ([#2991][]) - no double trailing period in docs ([#2984][]) - correct identification of the repository with symlinks in the paths in the tests ([#2972][]) - re-evaluation of dataset properties in case of dataset changes ([#2946][]) - [text2git][] procedure to use `ds.repo.set_gitattributes` ([#2974][]) ([#2954][]) - Switch to use plain `os.getcwd()` if inconsistency with env var `$PWD` is detected ([#2914][]) - Make sure that a credential defined in an env var takes precedence ([#2960][]) ([#2950][]) ### Enhancements and new features - [shub://datalad/datalad:git-annex-dev](https://singularity-hub.org/containers/5663/view) provides a Debian buster Singularity image with build environment for [git-annex][]. `tools/bisect-git-annex` provides a helper for running `git bisect` on git-annex using that Singularity container ([#2995][]) - Added `.zenodo.json` for better integration with Zenodo for citation - [run-procedure][] now provides names and help messages with a custom renderer ([#2993][]) - Documentation: point to [datalad-revolution][] extension (prototype of the greater DataLad future) - [run][] - support injecting of a detached command ([#2937][]) - `annex` metadata extractor now extracts `annex.key` metadata record. Should now allow identifying uses of specific files etc ([#2952][]) - Test that we can install from http://datasets.datalad.org - Proper rendering of `CommandError` (e.g. in case of "out of space" error) ([#2958][]) ## 0.11.0 (Oct 23, 2018) -- Soon-to-be-perfect [git-annex][] 6.20180913 (or later) is now required - provides a number of fixes for v6 mode operations etc.
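Before upgrading to releases that raise the minimum git-annex version, it can help to check what is currently installed; a quick sketch:

```sh
# First line of the output shows the installed git-annex version
git annex version | head -n 1

# DataLad's environment report lists its own version and those of its dependencies
datalad wtf
```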
### Major refactoring and deprecations - `datalad.consts.LOCAL_CENTRAL_PATH` constant was deprecated in favor of `datalad.locations.default-dataset` [configuration][] variable ([#2835][]) ### Minor refactoring - `"notneeded"` messages are no longer reported by default results renderer - [run][] no longer shows commit instructions upon command failure when `explicit` is true and no outputs are specified ([#2922][]) - `get_git_dir` moved into GitRepo ([#2886][]) - `_gitpy_custom_call` removed from GitRepo ([#2894][]) - `GitRepo.get_merge_base` argument is now called `commitishes` instead of `treeishes` ([#2903][]) ### Fixes - [update][] should not leave the dataset in non-clean state ([#2858][]) and some other enhancements ([#2859][]) - Fixed chunking of the long command lines to account for decorators and other arguments ([#2864][]) - Progress bar should not crash the process on some missing progress information ([#2891][]) - Default value for `jobs` set to be `"auto"` (not `None`) to take advantage of possible parallel get if in `-g` mode ([#2861][]) - [wtf][] must not crash if `git-annex` is not installed etc ([#2865][]), ([#2865][]), ([#2918][]), ([#2917][]) - Fixed paths (with spaces etc) handling while reporting annex error output ([#2892][]), ([#2893][]) - `__del__` should not access `.repo` but `._repo` to avoid attempts for reinstantiation etc ([#2901][]) - Fix up submodule `.git` right in `GitRepo.add_submodule` to avoid added submodules being non git-annex friendly ([#2909][]), ([#2904][]) - [run-procedure][] ([#2905][]) - now will provide dataset into the procedure if called within dataset - will not crash if procedure is an executable without `.py` or `.sh` suffixes - Use centralized `.gitattributes` handling while setting annex backend ([#2912][]) - `GlobbedPaths.expand(..., full=True)` incorrectly returned relative paths when called more than once ([#2921][]) ### Enhancements and new features - Report progress on [clone][] when installing from "smart" git servers ([#2876][]) - Stale/unused `sth_like_file_has_content` was removed ([#2860][]) - Enhancements to [search][] to operate on "improved" metadata layouts ([#2878][]) - Output of `git annex init` operation is now logged ([#2881][]) - New - `GitRepo.cherry_pick` ([#2900][]) - `GitRepo.format_commit` ([#2902][]) - [run-procedure][] ([#2905][]) - procedures can now recursively be discovered in subdatasets as well. The uppermost has highest priority - Procedures in user and system locations now take precedence over those in datasets. ## 0.10.3.1 (Sep 13, 2018) -- Nothing-is-perfect Emergency bugfix to address forgotten boost of version in `datalad/version.py`. ## 0.10.3 (Sep 13, 2018) -- Almost-perfect This is largely a bugfix release which addressed many (but not yet all) issues of working with git-annex direct and version 6 modes, and operation on Windows in general. Among enhancements you will see the support of public S3 buckets (even with periods in their names), ability to configure new providers interactively, and improved `egrep` search backend. Although we do not require with this release, it is recommended to make sure that you are using a recent `git-annex` since it also had a variety of fixes and enhancements in the past months. ### Fixes - Parsing of combined short options has been broken since DataLad v0.10.0. ([#2710][]) - The `datalad save` instructions shown by `datalad run` for a command with a non-zero exit were incorrectly formatted. 
([#2692][]) - Decompression of zip files (e.g., through `datalad add-archive-content`) failed on Python 3. ([#2702][]) - Windows: - colored log output was not being processed by colorama. ([#2707][]) - more codepaths now try multiple times when removing a file to deal with latency and locking issues on Windows. ([#2795][]) - Internal git fetch calls have been updated to work around a GitPython `BadName` issue. ([#2712][]), ([#2794][]) - The progess bar for annex file transferring was unable to handle an empty file. ([#2717][]) - `datalad add-readme` halted when no aggregated metadata was found rather than displaying a warning. ([#2731][]) - `datalad rerun` failed if `--onto` was specified and the history contained no run commits. ([#2761][]) - Processing of a command's results failed on a result record with a missing value (e.g., absent field or subfield in metadata). Now the missing value is rendered as "N/A". ([#2725][]). - A couple of documentation links in the "Delineation from related solutions" were misformatted. ([#2773][]) - With the latest git-annex, several known V6 failures are no longer an issue. ([#2777][]) - In direct mode, commit changes would often commit annexed content as regular Git files. A new approach fixes this and resolves a good number of known failures. ([#2770][]) - The reporting of command results failed if the current working directory was removed (e.g., after an unsuccessful `install`). ([#2788][]) - When installing into an existing empty directory, `datalad install` removed the directory after a failed clone. ([#2788][]) - `datalad run` incorrectly handled inputs and outputs for paths with spaces and other characters that require shell escaping. ([#2798][]) - Globbing inputs and outputs for `datalad run` didn't work correctly if a subdataset wasn't installed. ([#2796][]) - Minor (in)compatibility with git 2.19 - (no) trailing period in an error message now. ([#2815][]) ### Enhancements and new features - Anonymous access is now supported for S3 and other downloaders. ([#2708][]) - A new interface is available to ease setting up new providers. ([#2708][]) - Metadata: changes to egrep mode search ([#2735][]) - Queries in egrep mode are now case-sensitive when the query contains any uppercase letters and are case-insensitive otherwise. The new mode egrepcs can be used to perform a case-sensitive query with all lower-case letters. - Search can now be limited to a specific key. - Multiple queries (list of expressions) are evaluated using AND to determine whether something is a hit. - A single multi-field query (e.g., `pa*:findme`) is a hit, when any matching field matches the query. - All matching key/value combinations across all (multi-field) queries are reported in the query_matched result field. - egrep mode now shows all hits rather than limiting the results to the top 20 hits. - The documentation on how to format commands for `datalad run` has been improved. ([#2703][]) - The method for determining the current working directory on Windows has been improved. ([#2707][]) - `datalad --version` now simply shows the version without the license. ([#2733][]) - `datalad export-archive` learned to export under an existing directory via its `--filename` option. ([#2723][]) - `datalad export-to-figshare` now generates the zip archive in the root of the dataset unless `--filename` is specified. ([#2723][]) - After importing `datalad.api`, `help(datalad.api)` (or `datalad.api?` in IPython) now shows a summary of the available DataLad commands. 
([#2728][]) - Support for using `datalad` from IPython has been improved. ([#2722][]) - `datalad wtf` now returns structured data and reports the version of each extension. ([#2741][]) - The internal handling of gitattributes information has been improved. A user-visible consequence is that `datalad create --force` no longer duplicates existing attributes. ([#2744][]) - The "annex" metadata extractor can now be used even when no content is present. ([#2724][]) - The `add_url_to_file` method (called by commands like `datalad download-url` and `datalad add-archive-content`) learned how to display a progress bar. ([#2738][]) ## 0.10.2 (Jul 09, 2018) -- Thesecuriestever Primarily a bugfix release to accommodate recent git-annex release forbidding file:// and http://localhost/ URLs which might lead to revealing private files if annex is publicly shared. ### Fixes - fixed testing to be compatible with recent git-annex (6.20180626) - [download-url][] will now download to current directory instead of the top of the dataset ### Enhancements and new features - do not quote ~ in URLs to be consistent with quote implementation in Python 3.7 which now follows RFC 3986 - [run][] support for user-configured placeholder values - documentation on native git-annex metadata support - handle 401 errors from LORIS tokens - `yoda` procedure will instantiate `README.md` - `--discover` option added to [run-procedure][] to list available procedures ## 0.10.1 (Jun 17, 2018) -- OHBM polish This is a minor bugfix release. ### Fixes - Be able to use backports.lzma as a drop-in replacement for pyliblzma. - Give help when not specifying a procedure name in `run-procedure`. - Abort early when a downloader received no filename. - Avoid `rerun` error when trying to unlock non-available files. ## 0.10.0 (Jun 09, 2018) -- The Release This release is a major leap forward in metadata support. ### Major refactoring and deprecations - Metadata - Prior metadata provided by datasets under `.datalad/meta` is no longer used or supported. Metadata must be reaggregated using 0.10 version - Metadata extractor types are no longer auto-guessed and must be explicitly specified in `datalad.metadata.nativetype` config (could contain multiple values) - Metadata aggregation of a dataset hierarchy no longer updates all datasets in the tree with new metadata. Instead, only the target dataset is updated. This behavior can be changed via the --update-mode switch. The new default prevents needless modification of (3rd-party) subdatasets. - Neuroimaging metadata support has been moved into a dedicated extension: https://github.com/datalad/datalad-neuroimaging - Crawler - moved into a dedicated extension: https://github.com/datalad/datalad-crawler - `export_tarball` plugin has been generalized to `export_archive` and can now also generate ZIP archives. - By default a dataset X is now only considered to be a super-dataset of another dataset Y, if Y is also a registered subdataset of X. ### Fixes A number of fixes did not make it into the 0.9.x series: - Dynamic configuration overrides via the `-c` option were not in effect. - `save` is now more robust with respect to invocation in subdirectories of a dataset. - `unlock` now reports correct paths when running in a dataset subdirectory. - `get` is more robust to paths that contain symbolic links. - symlinks to subdatasets of a dataset are now correctly treated as a symlink, and not as a subdataset - `add` now correctly saves staged subdataset additions.
- Running `datalad save` in a dataset no longer adds untracked content to the dataset. In order to add content a path has to be given, e.g. `datalad save .` - `wtf` now works reliably with a DataLad that wasn't installed from Git (but, e.g., via pip) - More robust URL handling in `simple_with_archives` crawler pipeline. ### Enhancements and new features - Support for DataLad extensions that can contribute API components from 3rd-party sources, incl. commands, metadata extractors, and test case implementations. See https://github.com/datalad/datalad-extension-template for a demo extension. - Metadata (everything has changed!) - Metadata extraction and aggregation is now supported for datasets and individual files. - Metadata query via `search` can now discover individual files. - Extracted metadata can now be stored in XZ compressed files, is optionally annexed (when exceeding a configurable size threshold), and obtained on demand (new configuration option `datalad.metadata.create-aggregate-annex-limit`). - Status and availability of aggregated metadata can now be reported via `metadata --get-aggregates` - New configuration option `datalad.metadata.maxfieldsize` to exclude too large metadata fields from aggregation. - The type of metadata is no longer guessed during metadata extraction. A new configuration option `datalad.metadata.nativetype` was introduced to enable one or more particular metadata extractors for a dataset. - New configuration option `datalad.metadata.store-aggregate-content` to enable the storage of aggregated metadata for dataset content (i.e. file-based metadata) in contrast to just metadata describing a dataset as a whole. - `search` was completely reimplemented. It offers three different modes now: - 'egrep' (default): expression matching in a plain string version of metadata - 'textblob': search a text version of all metadata using a fully featured query language (fast indexing, good for keyword search) - 'autofield': search an auto-generated index that preserves individual fields of metadata that can be represented in a tabular structure (substantial indexing cost, enables the most detailed queries of all modes) - New extensions: - [addurls][], an extension for creating a dataset (and possibly subdatasets) from a list of URLs. - export_to_figshare - extract_metadata - add_readme makes use of available metadata - By default the wtf extension now hides sensitive information, which can be included in the output by passing `--sensitive=some` or `--sensitive=all`. - Reduced startup latency by only importing commands necessary for a particular command line call. - [create][]: - `-d --nosave` now registers subdatasets, when possible. - `--fake-dates` configures dataset to use fake-dates - [run][] now provides a way for the caller to save the result when a command has a non-zero exit status. - `datalad rerun` now has a `--script` option that can be used to extract previous commands into a file. - A DataLad Singularity container is now available on [Singularity Hub](https://singularity-hub.org/collections/667). - More casts have been embedded in the [use case section of the documentation](http://docs.datalad.org/en/docs/usecases/index.html). - `datalad --report-status` has a new value 'all' that can be used to temporarily re-enable reporting that was disabled by configuration settings.
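To tie the metadata-related options above together, a rough sketch of enabling an extractor and querying aggregated metadata (the extractor name and the query are placeholders):

```sh
# Enable one or more metadata extractors for the dataset (name is an example)
git config -f .datalad/config --add datalad.metadata.nativetype datalad_core

# Aggregate metadata for the dataset and, recursively, its subdatasets
datalad aggregate-metadata -r

# Query the aggregated metadata; egrep is the default search mode
datalad search --mode egrep 'author:.*doe'
```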
## 0.9.3 (Mar 16, 2018) -- pi+0.02 release Some important bug fixes which should improve usability ### Fixes - `datalad-archives` special remote now will lock on acquiring or extracting an archive - this allows for it to be used with -J flag for parallel operation - relax the demand introduced in 0.9.2 that git be configured for datalad operation - now we will just issue a warning - `datalad ls` should now list "authored date" and work also for datasets in detached HEAD mode - `datalad save` will now save original file as well, if file was "git mv"ed, so you can now `datalad run git mv old new` and have changes recorded ### Enhancements and new features - `--jobs` argument now could take `auto` value which would decide on # of jobs depending on the # of available CPUs. `git-annex` > 6.20180314 is recommended to avoid regression with -J. - memoize calls to `RI` meta-constructor -- should speed up operation a bit - `DATALAD_SEED` environment variable could be used to seed Python RNG and provide reproducible UUIDs etc (useful for testing and demos) ## 0.9.2 (Mar 04, 2018) -- it is (again) better than ever Largely a bugfix release with a few enhancements. ### Fixes - Execution of external commands (git) should not get stuck when there is a lot of both stdout and stderr output, and should not lose remaining output in some cases - Config overrides provided in the command line (-c) should now be handled correctly - Consider more remotes (not just tracking one, which might be none) while installing subdatasets - Compatibility with git 2.16 with some changed behaviors/annotations for submodules - Fail `remove` if `annex drop` failed - Do not fail operating on files which start with dash (-) - URL unquote paths within S3, URLs and DataLad RIs (///) - In non-interactive mode fail if authentication/access fails - Web UI: - refactored a little to fix incorrect listing of submodules in subdirectories - now auto-focuses on search edit box upon entering the page - Assure that directories extracted from tarballs have the executable bit set ### Enhancements and new features - A log message and progress bar will now inform if a tarball is to be downloaded while getting specific files (requires git-annex > 6.20180206) - A dedicated `datalad rerun` command capable of rerunning entire sequences of previously `run` commands. **Reproducibility through VCS. Use `run` even if not interested in `rerun`** - Alert the user if `git` is not yet configured but git operations are requested - Delay collection of previous ssh connections until it is actually needed.
Also do not require ':' while specifying ssh host - AutomagicIO: Added proxying of isfile, lzma.LZMAFile and io.open - Testing: - added DATALAD_DATASETS_TOPURL=http://datasets-tests.datalad.org to run tests against another website to not obscure access stats - tests run against temporary HOME to avoid side-effects - better unit-testing of interactions with special remotes - CONTRIBUTING.md describes how to set up and use `git-hub` tool to "attach" commits to an issue making it into a PR - DATALAD_USE_DEFAULT_GIT env variable could be used to cause DataLad to use default (not the one possibly bundled with git-annex) git - Be more robust while handling unsupported requests by annex in special remotes - Use of `swallow_logs` in the code was refactored away -- fewer mysteries now, just increase logging level - `wtf` plugin will report more information about environment, externals and the system ## 0.9.1 (Oct 01, 2017) -- "DATALAD!"(JBTM) Minor bugfix release ### Fixes - Should work correctly with subdatasets named as numbers or bool values (requires also GitPython >= 2.1.6) - Custom special remotes should work without crashing with git-annex >= 6.20170924 ## 0.9.0 (Sep 19, 2017) -- isn't it a lucky day even though not a Friday? ### Major refactoring and deprecations - the `files` argument of [save][] has been renamed to `path` to be uniform with any other command - all major commands now implement more uniform API semantics and result reporting. Functionality for modification detection of dataset content has been completely replaced with a more efficient implementation - [publish][] now features a `--transfer-data` switch that allows for an unambiguous specification of whether to publish data -- independent of the selection which datasets to publish (which is done via their paths). Moreover, [publish][] now transfers data before repository content is pushed. ### Fixes - [drop][] no longer errors when some subdatasets are not installed - [install][] will no longer report nothing when a Dataset instance was given as a source argument, but rather perform as expected - [remove][] doesn't remove when some files of a dataset could not be dropped - [publish][] - no longer hides error during a repository push - publish behaves "correctly" for `--since=` in considering only the differences since the last "pushed" state - data transfer handling while publishing with dependencies, to github - improved robustness with broken Git configuration - [search][] should search for unicode strings correctly and not crash - robustify git-annex special remotes protocol handling to allow for spaces in the last argument - UI credentials interface should now allow to Ctrl-C the entry - should not fail while operating on submodules named with numerics only or by bool (true/false) names - crawl templates should not now override settings for `largefiles` if specified in `.gitattributes`
See [screencast](http://datalad.org/features.html#reproducible-science) - [save][] now uses Git for detecting which subdatasets need to be inspected for potential changes, instead of performing a complete traversal of a dataset tree - [add][] looks for changes relative to the last committed state of a dataset to discover files to add more efficiently - [diff][] can now report untracked files in addition to modified files - [uninstall][] will check itself whether a subdataset is properly registered in a superdataset, even when no superdataset is given in a call - [subdatasets][] can now configure subdatasets for exclusion from recursive installation (`datalad-recursiveinstall` submodule configuration property) - precrafted pipelines of [crawl][] now will not override `annex.largefiles` setting if any was set within `.gitattributes` (e.g. by `datalad create --text-no-annex`) - framework for screencasts: `tools/cast*` tools and sample cast scripts under `doc/casts` which are published at [datalad.org/features.html](http://datalad.org/features.html) - new [project YouTube channel](https://www.youtube.com/channel/UCB8-Zf7D0DSzAsREoIt0Bvw) - tests failing in direct and/or v6 modes marked explicitly ## 0.8.1 (Aug 13, 2017) -- the best birthday gift Bugfixes ### Fixes - Do not attempt to [update][] a not installed sub-dataset - In case of too many files to be specified for [get][] or [copy_to][], we will make multiple invocations of underlying git-annex command to not overfill command line - More robust handling of unicode output in terminals which might not support it ### Enhancements and new features - Ship a copy of numpy.testing to facilitate [test][] without requiring numpy as a dependency. Also allow passing to the command which test(s) to run - In [get][] and [copy_to][] provide actual original requested paths, not the ones we deduced need to be transferred, solely for knowing the total ## 0.8.0 (Jul 31, 2017) -- it is better than ever A variety of fixes and enhancements ### Fixes - [publish][] would now push merged `git-annex` branch even if no other changes were done - [publish][] should be able to publish using relative path within SSH URI (git hook would use relative paths) - [publish][] should better tolerate publishing to pure git and `git-annex` special remotes ### Enhancements and new features - [plugin][] mechanism came to replace [export][]. See [export_tarball][] for the replacement of [export][]. Now it should be easy to extend datalad's interface with custom functionality to be invoked along with other commands. - Minimalistic coloring of the results rendering - [publish][]/`copy_to` got progress bar report now and support of `--jobs` - minor fixes and enhancements to crawler (e.g. support of recursive removes) ## 0.7.0 (Jun 25, 2017) -- when it works - it is quite awesome! New features, refactorings, and bug fixes. ### Major refactoring and deprecations - [add-sibling][] has been fully replaced by the [siblings][] command - [create-sibling][], and [unlock][] have been re-written to support the same common API as most other commands ### Enhancements and new features - [siblings][] can now be used to query and configure a local repository by using the sibling name ``here`` - [siblings][] can now query and set annex preferred content configuration.
This includes ``wanted`` (as previously supported in other commands), and now also ``required`` - New [metadata][] command to interface with datasets/files [meta-data][] - Documentation for all commands is now built in a uniform fashion - Significant parts of the documentation have been updated - Instantiate GitPython's Repo instances lazily ### Fixes - API documentation is now rendered properly as HTML, and is easier to browse by having more compact pages - Closed files left open on various occasions (Popen PIPEs, etc) - Restored basic (consumer mode of operation) compatibility with Windows OS ## 0.6.0 (Jun 14, 2017) -- German perfectionism This release includes a **huge** refactoring to make code base and functionality more robust and flexible - outputs from API commands could now be highly customized. See `--output-format`, `--report-status`, and `--report-type` options for the [datalad][] command. - effort was made to refactor code base so that underlying functions behave as generators where possible - input paths/arguments analysis was redone for majority of the commands to provide unified behavior ### Major refactoring and deprecations - `add-sibling` and `rewrite-urls` were refactored in favor of new [siblings][] command which should be used for siblings manipulations - 'datalad.api.alwaysrender' config setting/support is removed in favor of new outputs processing ### Fixes - Do not manually flush the git index in pre-commit to avoid "Death by the Lock" issue - The `post-update` hook script deployed by [publish][] should now be more robust (tolerates directory names with spaces, etc.) - A variety of fixes, see [list of pull requests and issues closed](https://github.com/datalad/datalad/milestone/41?closed=1) for more information ### Enhancements and new features - new [annotate-paths][] plumbing command to inspect and annotate provided paths. Use `--modified` to summarize changes between different points in the history - new [clone][] plumbing command to provide a subset (install a single dataset from a URL) functionality of [install][] - new [diff][] plumbing command - new [siblings][] command to list or manipulate siblings - new [subdatasets][] command to list subdatasets and their properties - [drop][] and [remove][] commands were refactored - `benchmarks/` collection of [Airspeed velocity](https://github.com/spacetelescope/asv/) benchmarks initiated. See reports at http://datalad.github.io/datalad/ - crawler would try to download a new url multiple times increasing delay between attempts. Helps to resolve problems with extended crawls of Amazon S3 - [CRCNS][] crawler pipeline now also fetches and aggregates meta-data for the datasets from datacite - overall optimisations to benefit from the aforementioned refactoring and improve user-experience - a few stub and not (yet) implemented commands (e.g. `move`) were removed from the interface - Web frontend got proper coloring for the breadcrumbs and some additional caching to speed up interactions. See http://datasets.datalad.org - Small improvements to the online documentation. See e.g. [summary of differences between git/git-annex/datalad](http://docs.datalad.org/en/latest/related.html#git-git-annex-datalad)
Now that decision is left to annex by default - `tools/testing/run_doc_examples` used to run doc examples as tests, fixed up to provide status per each example and not fail at once - `doc/examples` - [3rdparty_analysis_workflow.sh](http://docs.datalad.org/en/latest/generated/examples/3rdparty_analysis_workflow.html) was fixed up to reflect changes in the API of 0.5.0. - progress bars - should no longer crash **datalad** and report correct sizes and speeds - should provide progress reports while using Python 3.x ### Enhancements and new features - `doc/examples` - [nipype_workshop_dataset.sh](http://docs.datalad.org/en/latest/generated/examples/nipype_workshop_dataset.html) new example to demonstrate how new super- and sub- datasets were established as a part of our datasets collection ## 0.5.0 (Mar 20, 2017) -- it's huge This release includes an avalanche of bug fixes, enhancements, and additions which at large should stay consistent with previous behavior but provide better functioning. Lots of code was refactored to provide more consistent code-base, and some API breakage has happened. Further work is ongoing to standardize output and results reporting ([#1350][]) ### Most notable changes - requires [git-annex][] >= 6.20161210 (or better even >= 6.20161210 for improved functionality) - commands should now operate on paths specified (if any), without causing side-effects on other dirty/staged files - [save][] - `-a` is deprecated in favor of `-u` or `--all-updates` so only changes to known components get saved, and no new files automagically added - `-S` no longer stores the originating dataset in its commit message - [add][] - can specify commit/save message with `-m` - [add-sibling][] and [create-sibling][] - now take the name of the sibling (remote) as a `-s` (`--name`) option, not a positional argument - `--publish-depends` to setup publishing data and code to multiple repositories (e.g. github + webserver) should now be functional, see [this comment](https://github.com/datalad/datalad/issues/335#issuecomment-277240733) - got `--publish-by-default` to specify what refs should be published by default - got `--annex-wanted`, `--annex-groupwanted` and `--annex-group` settings which would be used to instruct annex about preferred content. [publish][] then will publish data using those settings if `wanted` is set. - got `--inherit` option to automagically figure out url/wanted and other git/annex settings for new remote sub-dataset to be constructed - [publish][] - got `--skip-failing` refactored into `--missing` option which could use new feature of [create-sibling][] `--inherit` ### Fixes - More consistent interaction through ssh - all ssh connections go through [sshrun][] shim for a "single point of authentication", etc.
- More robust [ls][] operation outside of the datasets - A number of fixes for direct and v6 mode of annex ### Enhancements and new features - New [drop][] and [remove][] commands - [clean][] - got `--what` to specify explicitly what cleaning steps to perform and now could be invoked with `-r` - `datalad` and `git-annex-remote*` scripts now do not use setuptools entry points mechanism and rely on simple import to shorten start up time - [Dataset][] is also now using [Flyweight pattern][], so the same instance is reused for the same dataset - progressbars should not add more empty lines ### Internal refactoring - Majority of the commands now go through `_prep` for arguments validation and pre-processing to avoid recursive invocations ## 0.4.1 (Nov 10, 2016) -- CA release Requires now GitPython >= 2.1.0 ### Fixes - [save][] - to not save staged files if explicit paths were provided - improved (but not yet complete) support for direct mode - [update][] to not crash if some sub-datasets are not installed - do not log calls to `git config` to avoid leakage of possibly sensitive settings to the logs ### Enhancements and new features - New [rfc822-compliant metadata][] format - [save][] - -S to save the change also within all super-datasets - [add][] now has progress-bar reporting - [create-sibling-github][] to create a :term:`sibling` of a dataset on github - [OpenfMRI][] crawler and datasets were enriched with URLs to separate files where also available from openfmri s3 bucket (if upgrading your datalad datasets, you might need to run `git annex enableremote datalad` to make them available) - various enhancements to log messages - web interface - populates "install" box first thus making UX better over slower connections ## 0.4 (Oct 22, 2016) -- Paris is waiting Primarily it is a bugfix release but because of significant refactoring of the [install][] and [get][] implementation, it gets a new minor release. ### Fixes - be able to [get][] or [install][] while providing paths while being outside of a dataset - remote annex datasets get properly initialized - robust detection of outdated [git-annex][] ### Enhancements and new features - interface changes - [get][] `--recursion-limit=existing` to not recurse into not-installed subdatasets - [get][] `-n` to possibly install sub-datasets without getting any data - [install][] `--jobs|-J` to specify number of parallel jobs for annex [get][] call could use (ATM would not work when data comes from archives) - more (unit-)testing - documentation: see http://docs.datalad.org/en/latest/basics.html for basic principles and useful shortcuts in referring to datasets - various webface improvements: breadcrumb paths, instructions how to install dataset, show version from the tags, etc. ## 0.3.1 (Oct 1, 2016) -- what a wonderful week Primarily bugfixes but also a number of enhancements and core refactorings ### Fixes - do not build manpages and examples during installation to avoid problems with possibly previously outdated dependencies - [install][] can be called on already installed dataset (with `-r` or `-g`) ### Enhancements and new features - complete overhaul of datalad configuration settings handling (see [Configuration documentation][]), so majority of the environment. 
Now uses git format and stores persistent configuration settings under `.datalad/config` and local within `.git/config` variables we have used were renamed to match configuration names - [create-sibling][] does not now by default upload web front-end - [export][] command with a plug-in interface and `tarball` plugin to export datasets - in Python, `.api` functions with rendering of results in command line got a _-suffixed sibling, which would render results as well in Python as well (e.g., using `search_` instead of `search` would also render results, not only output them back as Python objects) - [get][] - `--jobs` option (passed to `annex get`) for parallel downloads - total and per-download (with git-annex >= 6.20160923) progress bars (note that if content is to be obtained from an archive, no progress will be reported yet) - [install][] `--reckless` mode option - [search][] - highlights locations and fieldmaps for better readability - supports `-d^` or `-d///` to point to top-most or centrally installed meta-datasets - "complete" paths to the datasets are reported now - `-s` option to specify which fields (only) to search - various enhancements and small fixes to [meta-data][] handling, [ls][], custom remotes, code-base formatting, downloaders, etc - completely switched to `tqdm` library (`progressbar` is no longer used/supported) ## 0.3 (Sep 23, 2016) -- winter is coming Lots of everything, including but not limited to - enhanced index viewer, as the one on http://datasets.datalad.org - initial new data providers support: [Kaggle][], [BALSA][], [NDA][], [NITRC][] - initial [meta-data support and management][] - new and/or improved crawler pipelines for [BALSA][], [CRCNS][], [OpenfMRI][] - refactored [install][] command, now with separate [get][] - some other commands renaming/refactoring (e.g., [create-sibling][]) - datalad [search][] would give you an option to install datalad's super-dataset under ~/datalad if ran outside of a dataset ### 0.2.3 (Jun 28, 2016) -- busy OHBM New features and bugfix release - support of /// urls to point to http://datasets.datalad.org - variety of fixes and enhancements throughout ### 0.2.2 (Jun 20, 2016) -- OHBM we are coming! 
New feature and bugfix release - greately improved documentation - publish command API RFing allows for custom options to annex, and uses --to REMOTE for consistent with annex invocation - variety of fixes and enhancements throughout ### 0.2.1 (Jun 10, 2016) - variety of fixes and enhancements throughout ## 0.2 (May 20, 2016) Major RFing to switch from relying on rdf to git native submodules etc ## 0.1 (Oct 14, 2015) Release primarily focusing on interface functionality including initial publishing [git-annex]: http://git-annex.branchable.com/ [gx-sameas]: https://git-annex.branchable.com/tips/multiple_remotes_accessing_the_same_data_store/ [duecredit]: https://github.com/duecredit/duecredit [Kaggle]: https://www.kaggle.com [BALSA]: http://balsa.wustl.edu [NDA]: http://data-archive.nimh.nih.gov [NITRC]: https://www.nitrc.org [CRCNS]: http://crcns.org [FCON1000]: http://fcon_1000.projects.nitrc.org [OpenfMRI]: http://openfmri.org [Configuration documentation]: http://docs.datalad.org/config.html [Dataset]: http://docs.datalad.org/en/latest/generated/datalad.api.Dataset.html [Sibling]: http://docs.datalad.org/en/latest/glossary.html [rfc822-compliant metadata]: http://docs.datalad.org/en/latest/metadata.html#rfc822-compliant-meta-data [meta-data support and management]: http://docs.datalad.org/en/latest/cmdline.html#meta-data-handling [meta-data]: http://docs.datalad.org/en/latest/cmdline.html#meta-data-handling [add-archive-content]: https://datalad.readthedocs.io/en/latest/generated/man/datalad-add-archive-content.html [add-sibling]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-add-sibling.html [add]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-add.html [addurls]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-addurls.html [annotate-paths]: http://docs.datalad.org/en/latest/generated/man/datalad-annotate-paths.html [clean]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-clean.html [clone]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-clone.html [configuration]: http://docs.datalad.org/en/latest/config.html [copy_to]: http://docs.datalad.org/en/latest/_modules/datalad/support/annexrepo.html?highlight=%22copy_to%22 [create]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-create.html [create-sibling-github]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-create-sibling-github.html [create-sibling]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-create-sibling.html [datalad]: http://docs.datalad.org/en/latest/generated/man/datalad.html [datalad-container]: https://github.com/datalad/datalad-container [datalad-revolution]: http://github.com/datalad/datalad-revolution [download-url]: https://datalad.readthedocs.io/en/latest/generated/man/datalad-download-url.html [diff]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-diff.html [drop]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-drop.html [export]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-export.html [export_tarball]: http://docs.datalad.org/en/latest/generated/datalad.plugin.export_tarball.html [get]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-get.html [install]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-install.html [ls]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-ls.html [metadata]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-metadata.html [nd_freeze]: 
https://github.com/neurodebian/neurodebian/blob/master/tools/nd_freeze [plugin]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-plugin.html [publications]: https://datalad.readthedocs.io/en/latest/publications.html [publish]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-publish.html [remove]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-remove.html [rerun]: https://datalad.readthedocs.io/en/latest/generated/man/datalad-rerun.html [run]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-run.html [run-procedure]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-run-procedure.html [save]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-save.html [search]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-search.html [siblings]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-siblings.html [sshrun]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-sshrun.html [status]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-status.html [subdatasets]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-subdatasets.html [unlock]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-unlock.html [update]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-update.html [wtf]: http://datalad.readthedocs.io/en/latest/generated/man/datalad-wtf.html [handbook]: http://handbook.datalad.org [handbook-scalable-datastore]: http://handbook.datalad.org/en/latest/usecases/datastorage_for_institutions.html [hooks]: http://handbook.datalad.org/en/latest/basics/101-145-hooks.html [Flyweight pattern]: https://en.wikipedia.org/wiki/Flyweight_pattern [NO_COLOR]: https://no-color.org/ [#1350]: https://github.com/datalad/datalad/issues/1350 [#1651]: https://github.com/datalad/datalad/issues/1651 [#2534]: https://github.com/datalad/datalad/issues/2534 [#2566]: https://github.com/datalad/datalad/issues/2566 [#2692]: https://github.com/datalad/datalad/issues/2692 [#2702]: https://github.com/datalad/datalad/issues/2702 [#2703]: https://github.com/datalad/datalad/issues/2703 [#2707]: https://github.com/datalad/datalad/issues/2707 [#2708]: https://github.com/datalad/datalad/issues/2708 [#2710]: https://github.com/datalad/datalad/issues/2710 [#2712]: https://github.com/datalad/datalad/issues/2712 [#2717]: https://github.com/datalad/datalad/issues/2717 [#2722]: https://github.com/datalad/datalad/issues/2722 [#2723]: https://github.com/datalad/datalad/issues/2723 [#2724]: https://github.com/datalad/datalad/issues/2724 [#2725]: https://github.com/datalad/datalad/issues/2725 [#2728]: https://github.com/datalad/datalad/issues/2728 [#2731]: https://github.com/datalad/datalad/issues/2731 [#2733]: https://github.com/datalad/datalad/issues/2733 [#2735]: https://github.com/datalad/datalad/issues/2735 [#2738]: https://github.com/datalad/datalad/issues/2738 [#2741]: https://github.com/datalad/datalad/issues/2741 [#2744]: https://github.com/datalad/datalad/issues/2744 [#2752]: https://github.com/datalad/datalad/issues/2752 [#2754]: https://github.com/datalad/datalad/issues/2754 [#2761]: https://github.com/datalad/datalad/issues/2761 [#2770]: https://github.com/datalad/datalad/issues/2770 [#2773]: https://github.com/datalad/datalad/issues/2773 [#2777]: https://github.com/datalad/datalad/issues/2777 [#2788]: https://github.com/datalad/datalad/issues/2788 [#2794]: https://github.com/datalad/datalad/issues/2794 [#2795]: https://github.com/datalad/datalad/issues/2795 [#2796]: 
https://github.com/datalad/datalad/issues/2796 [#2798]: https://github.com/datalad/datalad/issues/2798 [#2815]: https://github.com/datalad/datalad/issues/2815 [#2835]: https://github.com/datalad/datalad/issues/2835 [#2858]: https://github.com/datalad/datalad/issues/2858 [#2859]: https://github.com/datalad/datalad/issues/2859 [#2860]: https://github.com/datalad/datalad/issues/2860 [#2861]: https://github.com/datalad/datalad/issues/2861 [#2864]: https://github.com/datalad/datalad/issues/2864 [#2865]: https://github.com/datalad/datalad/issues/2865 [#2876]: https://github.com/datalad/datalad/issues/2876 [#2878]: https://github.com/datalad/datalad/issues/2878 [#2881]: https://github.com/datalad/datalad/issues/2881 [#2886]: https://github.com/datalad/datalad/issues/2886 [#2891]: https://github.com/datalad/datalad/issues/2891 [#2892]: https://github.com/datalad/datalad/issues/2892 [#2893]: https://github.com/datalad/datalad/issues/2893 [#2894]: https://github.com/datalad/datalad/issues/2894 [#2900]: https://github.com/datalad/datalad/issues/2900 [#2901]: https://github.com/datalad/datalad/issues/2901 [#2902]: https://github.com/datalad/datalad/issues/2902 [#2903]: https://github.com/datalad/datalad/issues/2903 [#2904]: https://github.com/datalad/datalad/issues/2904 [#2905]: https://github.com/datalad/datalad/issues/2905 [#2909]: https://github.com/datalad/datalad/issues/2909 [#2912]: https://github.com/datalad/datalad/issues/2912 [#2914]: https://github.com/datalad/datalad/issues/2914 [#2917]: https://github.com/datalad/datalad/issues/2917 [#2918]: https://github.com/datalad/datalad/issues/2918 [#2921]: https://github.com/datalad/datalad/issues/2921 [#2922]: https://github.com/datalad/datalad/issues/2922 [#2937]: https://github.com/datalad/datalad/issues/2937 [#2946]: https://github.com/datalad/datalad/issues/2946 [#2950]: https://github.com/datalad/datalad/issues/2950 [#2952]: https://github.com/datalad/datalad/issues/2952 [#2954]: https://github.com/datalad/datalad/issues/2954 [#2958]: https://github.com/datalad/datalad/issues/2958 [#2960]: https://github.com/datalad/datalad/issues/2960 [#2972]: https://github.com/datalad/datalad/issues/2972 [#2974]: https://github.com/datalad/datalad/issues/2974 [#2982]: https://github.com/datalad/datalad/issues/2982 [#2984]: https://github.com/datalad/datalad/issues/2984 [#2991]: https://github.com/datalad/datalad/issues/2991 [#2993]: https://github.com/datalad/datalad/issues/2993 [#2995]: https://github.com/datalad/datalad/issues/2995 [#3001]: https://github.com/datalad/datalad/issues/3001 [#3002]: https://github.com/datalad/datalad/issues/3002 [#3007]: https://github.com/datalad/datalad/issues/3007 [#3009]: https://github.com/datalad/datalad/issues/3009 [#3019]: https://github.com/datalad/datalad/issues/3019 [#3025]: https://github.com/datalad/datalad/issues/3025 [#3029]: https://github.com/datalad/datalad/issues/3029 [#3035]: https://github.com/datalad/datalad/issues/3035 [#3037]: https://github.com/datalad/datalad/issues/3037 [#3038]: https://github.com/datalad/datalad/issues/3038 [#3046]: https://github.com/datalad/datalad/issues/3046 [#3049]: https://github.com/datalad/datalad/issues/3049 [#3051]: https://github.com/datalad/datalad/issues/3051 [#3057]: https://github.com/datalad/datalad/issues/3057 [#3058]: https://github.com/datalad/datalad/issues/3058 [#3061]: https://github.com/datalad/datalad/issues/3061 [#3065]: https://github.com/datalad/datalad/issues/3065 [#3066]: https://github.com/datalad/datalad/issues/3066 [#3080]: 
https://github.com/datalad/datalad/issues/3080 [#3089]: https://github.com/datalad/datalad/issues/3089 [#3091]: https://github.com/datalad/datalad/issues/3091 [#3098]: https://github.com/datalad/datalad/issues/3098 [#3099]: https://github.com/datalad/datalad/issues/3099 [#3102]: https://github.com/datalad/datalad/issues/3102 [#3104]: https://github.com/datalad/datalad/issues/3104 [#3106]: https://github.com/datalad/datalad/issues/3106 [#3109]: https://github.com/datalad/datalad/issues/3109 [#3115]: https://github.com/datalad/datalad/issues/3115 [#3119]: https://github.com/datalad/datalad/issues/3119 [#3124]: https://github.com/datalad/datalad/issues/3124 [#3129]: https://github.com/datalad/datalad/issues/3129 [#3137]: https://github.com/datalad/datalad/issues/3137 [#3138]: https://github.com/datalad/datalad/issues/3138 [#3141]: https://github.com/datalad/datalad/issues/3141 [#3146]: https://github.com/datalad/datalad/issues/3146 [#3149]: https://github.com/datalad/datalad/issues/3149 [#3156]: https://github.com/datalad/datalad/issues/3156 [#3164]: https://github.com/datalad/datalad/issues/3164 [#3165]: https://github.com/datalad/datalad/issues/3165 [#3168]: https://github.com/datalad/datalad/issues/3168 [#3176]: https://github.com/datalad/datalad/issues/3176 [#3180]: https://github.com/datalad/datalad/issues/3180 [#3181]: https://github.com/datalad/datalad/issues/3181 [#3184]: https://github.com/datalad/datalad/issues/3184 [#3186]: https://github.com/datalad/datalad/issues/3186 [#3196]: https://github.com/datalad/datalad/issues/3196 [#3205]: https://github.com/datalad/datalad/issues/3205 [#3210]: https://github.com/datalad/datalad/issues/3210 [#3211]: https://github.com/datalad/datalad/issues/3211 [#3215]: https://github.com/datalad/datalad/issues/3215 [#3220]: https://github.com/datalad/datalad/issues/3220 [#3222]: https://github.com/datalad/datalad/issues/3222 [#3223]: https://github.com/datalad/datalad/issues/3223 [#3238]: https://github.com/datalad/datalad/issues/3238 [#3241]: https://github.com/datalad/datalad/issues/3241 [#3242]: https://github.com/datalad/datalad/issues/3242 [#3249]: https://github.com/datalad/datalad/issues/3249 [#3250]: https://github.com/datalad/datalad/issues/3250 [#3255]: https://github.com/datalad/datalad/issues/3255 [#3258]: https://github.com/datalad/datalad/issues/3258 [#3259]: https://github.com/datalad/datalad/issues/3259 [#3268]: https://github.com/datalad/datalad/issues/3268 [#3274]: https://github.com/datalad/datalad/issues/3274 [#3281]: https://github.com/datalad/datalad/issues/3281 [#3288]: https://github.com/datalad/datalad/issues/3288 [#3289]: https://github.com/datalad/datalad/issues/3289 [#3294]: https://github.com/datalad/datalad/issues/3294 [#3298]: https://github.com/datalad/datalad/issues/3298 [#3299]: https://github.com/datalad/datalad/issues/3299 [#3301]: https://github.com/datalad/datalad/issues/3301 [#3304]: https://github.com/datalad/datalad/issues/3304 [#3314]: https://github.com/datalad/datalad/issues/3314 [#3318]: https://github.com/datalad/datalad/issues/3318 [#3322]: https://github.com/datalad/datalad/issues/3322 [#3324]: https://github.com/datalad/datalad/issues/3324 [#3325]: https://github.com/datalad/datalad/issues/3325 [#3326]: https://github.com/datalad/datalad/issues/3326 [#3329]: https://github.com/datalad/datalad/issues/3329 [#3330]: https://github.com/datalad/datalad/issues/3330 [#3332]: https://github.com/datalad/datalad/issues/3332 [#3334]: https://github.com/datalad/datalad/issues/3334 [#3336]: 
https://github.com/datalad/datalad/issues/3336 [#3340]: https://github.com/datalad/datalad/issues/3340 [#3343]: https://github.com/datalad/datalad/issues/3343 [#3347]: https://github.com/datalad/datalad/issues/3347 [#3353]: https://github.com/datalad/datalad/issues/3353 [#3362]: https://github.com/datalad/datalad/issues/3362 [#3364]: https://github.com/datalad/datalad/issues/3364 [#3365]: https://github.com/datalad/datalad/issues/3365 [#3366]: https://github.com/datalad/datalad/issues/3366 [#3374]: https://github.com/datalad/datalad/issues/3374 [#3378]: https://github.com/datalad/datalad/issues/3378 [#3383]: https://github.com/datalad/datalad/issues/3383 [#3396]: https://github.com/datalad/datalad/issues/3396 [#3398]: https://github.com/datalad/datalad/issues/3398 [#3400]: https://github.com/datalad/datalad/issues/3400 [#3401]: https://github.com/datalad/datalad/issues/3401 [#3403]: https://github.com/datalad/datalad/issues/3403 [#3407]: https://github.com/datalad/datalad/issues/3407 [#3425]: https://github.com/datalad/datalad/issues/3425 [#3429]: https://github.com/datalad/datalad/issues/3429 [#3435]: https://github.com/datalad/datalad/issues/3435 [#3439]: https://github.com/datalad/datalad/issues/3439 [#3440]: https://github.com/datalad/datalad/issues/3440 [#3444]: https://github.com/datalad/datalad/issues/3444 [#3447]: https://github.com/datalad/datalad/issues/3447 [#3458]: https://github.com/datalad/datalad/issues/3458 [#3459]: https://github.com/datalad/datalad/issues/3459 [#3460]: https://github.com/datalad/datalad/issues/3460 [#3470]: https://github.com/datalad/datalad/issues/3470 [#3475]: https://github.com/datalad/datalad/issues/3475 [#3476]: https://github.com/datalad/datalad/issues/3476 [#3479]: https://github.com/datalad/datalad/issues/3479 [#3492]: https://github.com/datalad/datalad/issues/3492 [#3493]: https://github.com/datalad/datalad/issues/3493 [#3498]: https://github.com/datalad/datalad/issues/3498 [#3499]: https://github.com/datalad/datalad/issues/3499 [#3508]: https://github.com/datalad/datalad/issues/3508 [#3516]: https://github.com/datalad/datalad/issues/3516 [#3518]: https://github.com/datalad/datalad/issues/3518 [#3524]: https://github.com/datalad/datalad/issues/3524 [#3525]: https://github.com/datalad/datalad/issues/3525 [#3527]: https://github.com/datalad/datalad/issues/3527 [#3531]: https://github.com/datalad/datalad/issues/3531 [#3534]: https://github.com/datalad/datalad/issues/3534 [#3538]: https://github.com/datalad/datalad/issues/3538 [#3546]: https://github.com/datalad/datalad/issues/3546 [#3547]: https://github.com/datalad/datalad/issues/3547 [#3552]: https://github.com/datalad/datalad/issues/3552 [#3555]: https://github.com/datalad/datalad/issues/3555 [#3561]: https://github.com/datalad/datalad/issues/3561 [#3562]: https://github.com/datalad/datalad/issues/3562 [#3570]: https://github.com/datalad/datalad/issues/3570 [#3574]: https://github.com/datalad/datalad/issues/3574 [#3576]: https://github.com/datalad/datalad/issues/3576 [#3579]: https://github.com/datalad/datalad/issues/3579 [#3582]: https://github.com/datalad/datalad/issues/3582 [#3586]: https://github.com/datalad/datalad/issues/3586 [#3587]: https://github.com/datalad/datalad/issues/3587 [#3591]: https://github.com/datalad/datalad/issues/3591 [#3594]: https://github.com/datalad/datalad/issues/3594 [#3597]: https://github.com/datalad/datalad/issues/3597 [#3600]: https://github.com/datalad/datalad/issues/3600 [#3602]: https://github.com/datalad/datalad/issues/3602 [#3616]: 
https://github.com/datalad/datalad/issues/3616 [#3622]: https://github.com/datalad/datalad/issues/3622 [#3624]: https://github.com/datalad/datalad/issues/3624 [#3626]: https://github.com/datalad/datalad/issues/3626 [#3629]: https://github.com/datalad/datalad/issues/3629 [#3631]: https://github.com/datalad/datalad/issues/3631 [#3646]: https://github.com/datalad/datalad/issues/3646 [#3648]: https://github.com/datalad/datalad/issues/3648 [#3656]: https://github.com/datalad/datalad/issues/3656 [#3667]: https://github.com/datalad/datalad/issues/3667 [#3678]: https://github.com/datalad/datalad/issues/3678 [#3680]: https://github.com/datalad/datalad/issues/3680 [#3682]: https://github.com/datalad/datalad/issues/3682 [#3688]: https://github.com/datalad/datalad/issues/3688 [#3692]: https://github.com/datalad/datalad/issues/3692 [#3693]: https://github.com/datalad/datalad/issues/3693 [#3695]: https://github.com/datalad/datalad/issues/3695 [#3700]: https://github.com/datalad/datalad/issues/3700 [#3701]: https://github.com/datalad/datalad/issues/3701 [#3702]: https://github.com/datalad/datalad/issues/3702 [#3704]: https://github.com/datalad/datalad/issues/3704 [#3705]: https://github.com/datalad/datalad/issues/3705 [#3712]: https://github.com/datalad/datalad/issues/3712 [#3715]: https://github.com/datalad/datalad/issues/3715 [#3719]: https://github.com/datalad/datalad/issues/3719 [#3728]: https://github.com/datalad/datalad/issues/3728 [#3743]: https://github.com/datalad/datalad/issues/3743 [#3746]: https://github.com/datalad/datalad/issues/3746 [#3747]: https://github.com/datalad/datalad/issues/3747 [#3749]: https://github.com/datalad/datalad/issues/3749 [#3751]: https://github.com/datalad/datalad/issues/3751 [#3754]: https://github.com/datalad/datalad/issues/3754 [#3761]: https://github.com/datalad/datalad/issues/3761 [#3765]: https://github.com/datalad/datalad/issues/3765 [#3768]: https://github.com/datalad/datalad/issues/3768 [#3769]: https://github.com/datalad/datalad/issues/3769 [#3770]: https://github.com/datalad/datalad/issues/3770 [#3772]: https://github.com/datalad/datalad/issues/3772 [#3775]: https://github.com/datalad/datalad/issues/3775 [#3776]: https://github.com/datalad/datalad/issues/3776 [#3777]: https://github.com/datalad/datalad/issues/3777 [#3780]: https://github.com/datalad/datalad/issues/3780 [#3787]: https://github.com/datalad/datalad/issues/3787 [#3791]: https://github.com/datalad/datalad/issues/3791 [#3793]: https://github.com/datalad/datalad/issues/3793 [#3794]: https://github.com/datalad/datalad/issues/3794 [#3797]: https://github.com/datalad/datalad/issues/3797 [#3798]: https://github.com/datalad/datalad/issues/3798 [#3799]: https://github.com/datalad/datalad/issues/3799 [#3803]: https://github.com/datalad/datalad/issues/3803 [#3804]: https://github.com/datalad/datalad/issues/3804 [#3807]: https://github.com/datalad/datalad/issues/3807 [#3812]: https://github.com/datalad/datalad/issues/3812 [#3815]: https://github.com/datalad/datalad/issues/3815 [#3817]: https://github.com/datalad/datalad/issues/3817 [#3821]: https://github.com/datalad/datalad/issues/3821 [#3828]: https://github.com/datalad/datalad/issues/3828 [#3831]: https://github.com/datalad/datalad/issues/3831 [#3842]: https://github.com/datalad/datalad/issues/3842 [#3850]: https://github.com/datalad/datalad/issues/3850 [#3851]: https://github.com/datalad/datalad/issues/3851 [#3854]: https://github.com/datalad/datalad/issues/3854 [#3856]: https://github.com/datalad/datalad/issues/3856 [#3860]: 
https://github.com/datalad/datalad/issues/3860 [#3862]: https://github.com/datalad/datalad/issues/3862 [#3863]: https://github.com/datalad/datalad/issues/3863 [#3871]: https://github.com/datalad/datalad/issues/3871 [#3873]: https://github.com/datalad/datalad/issues/3873 [#3877]: https://github.com/datalad/datalad/issues/3877 [#3880]: https://github.com/datalad/datalad/issues/3880 [#3888]: https://github.com/datalad/datalad/issues/3888 [#3892]: https://github.com/datalad/datalad/issues/3892 [#3903]: https://github.com/datalad/datalad/issues/3903 [#3904]: https://github.com/datalad/datalad/issues/3904 [#3906]: https://github.com/datalad/datalad/issues/3906 [#3907]: https://github.com/datalad/datalad/issues/3907 [#3911]: https://github.com/datalad/datalad/issues/3911 [#3926]: https://github.com/datalad/datalad/issues/3926 [#3927]: https://github.com/datalad/datalad/issues/3927 [#3931]: https://github.com/datalad/datalad/issues/3931 [#3935]: https://github.com/datalad/datalad/issues/3935 [#3940]: https://github.com/datalad/datalad/issues/3940 [#3954]: https://github.com/datalad/datalad/issues/3954 [#3955]: https://github.com/datalad/datalad/issues/3955 [#3958]: https://github.com/datalad/datalad/issues/3958 [#3959]: https://github.com/datalad/datalad/issues/3959 [#3960]: https://github.com/datalad/datalad/issues/3960 [#3963]: https://github.com/datalad/datalad/issues/3963 [#3970]: https://github.com/datalad/datalad/issues/3970 [#3971]: https://github.com/datalad/datalad/issues/3971 [#3974]: https://github.com/datalad/datalad/issues/3974 [#3975]: https://github.com/datalad/datalad/issues/3975 [#3976]: https://github.com/datalad/datalad/issues/3976 [#3979]: https://github.com/datalad/datalad/issues/3979 [#3996]: https://github.com/datalad/datalad/issues/3996 [#3999]: https://github.com/datalad/datalad/issues/3999 [#4002]: https://github.com/datalad/datalad/issues/4002 [#4022]: https://github.com/datalad/datalad/issues/4022 [#4036]: https://github.com/datalad/datalad/issues/4036 [#4037]: https://github.com/datalad/datalad/issues/4037 [#4041]: https://github.com/datalad/datalad/issues/4041 [#4045]: https://github.com/datalad/datalad/issues/4045 [#4049]: https://github.com/datalad/datalad/issues/4049 [#4050]: https://github.com/datalad/datalad/issues/4050 [#4060]: https://github.com/datalad/datalad/issues/4060 [#4064]: https://github.com/datalad/datalad/issues/4064 [#4070]: https://github.com/datalad/datalad/issues/4070 [#4073]: https://github.com/datalad/datalad/issues/4073 [#4078]: https://github.com/datalad/datalad/issues/4078 [#4140]: https://github.com/datalad/datalad/issues/4140 [#4194]: https://github.com/datalad/datalad/issues/4194 [#4200]: https://github.com/datalad/datalad/issues/4200 [#4212]: https://github.com/datalad/datalad/issues/4212 [#4214]: https://github.com/datalad/datalad/issues/4214 [#4239]: https://github.com/datalad/datalad/issues/4239 [#4262]: https://github.com/datalad/datalad/issues/4262 [#4285]: https://github.com/datalad/datalad/issues/4285 [#4308]: https://github.com/datalad/datalad/issues/4308 [#4315]: https://github.com/datalad/datalad/issues/4315 datalad-0.12.4/CODE_OF_CONDUCT.md000066400000000000000000000004611363461734600160050ustar00rootroot00000000000000# Code of Conduct * Be nice -- motivated people are more creative and more productive, the nature of any interaction should be such that it leads to more motivation, not less * Be constructive -- getting a fix for a problem, or identifying a concrete path forward should be the focus of any discussion 
datalad-0.12.4/CONTRIBUTING.md000066400000000000000000000535651363461734600154540ustar00rootroot00000000000000Contributing to DataLad ======================= [gh-datalad]: http://github.com/datalad/datalad Files organization ------------------ - [datalad/](./datalad) is the main Python module where major development is happening, with major submodules being: - `cmdline/` - helpers for accessing `interface/` functionality from command line - `customremotes/` - custom special remotes for annex provided by datalad - `downloaders/` - support for accessing data from various sources (e.g. http, S3, XNAT) via a unified interface. - `configs/` - specifications for known data providers and associated credentials - `interface/` - high level interface functions which get exposed via command line (`cmdline/`) or Python (`datalad.api`). - `tests/` - some unit- and regression- tests (more could be found under `tests/` of corresponding submodules. See [Tests](#tests)) - [utils.py](./datalad/tests/utils.py) provides convenience helpers used by unit-tests such as `@with_tree`, `@serve_path_via_http` and other decorators - `ui/` - user-level interactions, such as messages about errors, warnings, progress reports, AND when supported by available frontend -- interactive dialogs - `support/` - various support modules, e.g. for git/git-annex interfaces, constraints for the `interface/`, etc - [benchmarks/](./benchmarks) - [asv] benchmarks suite (see [Benchmarking](#benchmarking)) - [docs/](./docs) - yet to be heavily populated documentation - `bash-completions` - bash and zsh completion setup for datalad (just `source` it) - [fixtures/](./fixtures) currently not under git, contains generated by vcr fixtures - [tools/](./tools) contains helper utilities used during development, testing, and benchmarking of DataLad. Implemented in any most appropriate language (Python, bash, etc.) How to contribute ----------------- The preferred way to contribute to the DataLad code base is to fork the [main repository][gh-datalad] on GitHub. Here we outline the workflow used by the developers: 0. Have a clone of our main [project repository][gh-datalad] as `origin` remote in your git: git clone git://github.com/datalad/datalad 1. Fork the [project repository][gh-datalad]: click on the 'Fork' button near the top of the page. This creates a copy of the code base under your account on the GitHub server. 2. Add your forked clone as a remote to the local clone you already have on your local disk: git remote add gh-YourLogin git@github.com:YourLogin/datalad.git git fetch gh-YourLogin To ease addition of other github repositories as remotes, here is a little bash function/script to add to your `~/.bashrc`: ghremote () { url="$1" proj=${url##*/} url_=${url%/*} login=${url_##*/} git remote add gh-$login $url git fetch gh-$login } thus you could simply run: ghremote git@github.com:YourLogin/datalad.git to add the above `gh-YourLogin` remote. Additional handy aliases such as `ghpr` (to fetch existing pr from someone's remote) and `ghsendpr` could be found at [yarikoptic's bash config file](http://git.onerussian.com/?p=etc/bash.git;a=blob;f=.bash/bashrc/30_aliases_sh;hb=HEAD#l865) 3. Create a branch (generally off the `origin/master`) to hold your changes: git checkout -b nf-my-feature and start making changes. Ideally, use a prefix signaling the purpose of the branch - `nf-` for new features - `bf-` for bug fixes - `rf-` for refactoring - `doc-` for documentation contributions (including in the code docstrings). 
- `bm-` for changes to benchmarks We recommend not working in the ``master`` branch! 4. Work on this copy on your computer using Git to do the version control. When you're done editing, do: git add modified_files git commit to record your changes in Git. Ideally, prefix your commit messages with `NF`, `BF`, `RF`, `DOC`, or `BM`, similar to the branch name prefixes, but you could also use `TST` for commits concerned solely with tests, and `BK` to signal that the commit causes a breakage (e.g. of tests) at that point. Multiple entries could be listed joined with a `+` (e.g. `rf+doc-`). See `git log` for examples. If a commit closes an existing DataLad issue, then add to the end of the message `(Closes #ISSUE_NUMBER)` 5. Push to GitHub with: git push -u gh-YourLogin nf-my-feature Finally, go to the web page of your fork of the DataLad repo, and click 'Pull request' (PR) to send your changes to the maintainers for review. This will send an email to the committers. You can commit new changes to this branch and keep pushing to your remote -- github automagically adds them to your previously opened PR. (If any of the above seems like magic to you, then look up the [Git documentation](http://git-scm.com/documentation) on the web.) Development environment ----------------------- We support Python 3 only (>= 3.5). See [README.md:Dependencies](README.md#Dependencies) for basic information about installation of datalad itself. On Debian-based systems we recommend to enable [NeuroDebian](http://neuro.debian.net) since we use it to provide backports of recently fixed external modules we depend upon: ```sh apt-get install -y -q git git-annex-standalone apt-get install -y -q patool python3-scrapy python3-{appdirs,argcomplete,git,humanize,keyring,lxml,msgpack,progressbar,requests,setuptools} ``` and additionally, for development we suggest using tox and new versions of dependencies from PyPI: ```sh apt-get install -y -q python3-{dev,httpretty,nose,pip,vcr,virtualenv} python3-tox # Some libraries which might be needed for installing via pip apt-get install -y -q lib{ffi,ssl,curl4-openssl,xml2,xslt1}-dev ``` some of which you could also install from PyPI using pip (prior installation of those libraries listed above might be necessary) ```sh pip install -r requirements-devel.txt ``` and you will need to install recent git-annex using means appropriate for your OS (for Debian/Ubuntu, once again, just use NeuroDebian). Documentation ------------- ### Docstrings We use the [NumPy standard] for the description of parameters in docstrings. If you are using PyCharm, set your project settings (`Tools` -> `Python integrated tools` -> `Docstring format`). [NumPy standard]: https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard In addition, we follow the guidelines of [Restructured Text] with the additional features and treatments provided by [Sphinx]. [Restructured Text]: http://docutils.sourceforge.net/docs/user/rst/quickstart.html [Sphinx]: http://www.sphinx-doc.org/en/stable/ Additional Hints ---------------- ### Merge commits For merge commits to have a more informative description, add the following section to your `.git/config` or `~/.gitconfig`: [merge] log = true and if conflicts occur, provide a short summary on how they were resolved in the "Conflicts" listing within the merge commit (see [example](https://github.com/datalad/datalad/commit/eb062a8009d160ae51929998771964738636dcc2)).
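For illustration only (the branch name, file, and resolution note below are invented, not taken from an actual merge), such a merge commit message could look like:

    Merge branch 'bf-download-retry'

    Conflicts:
        datalad/downloaders/http.py - kept the new retry logic and dropped the duplicated logging call
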
Quality Assurance ----------------- It is recommended to check that your contribution complies with the following rules before submitting a pull request: - All public methods should have informative docstrings with sample usage presented as doctests when appropriate. - All other tests pass when everything is rebuilt from scratch. - New code should be accompanied by tests. ### Tests `datalad/tests` contains tests for the core portion of the project, and more tests are provided under corresponding submodules in `tests/` subdirectories to simplify re-running the tests concerning that portion of the codebase. To execute many tests, the codebase first needs to be "installed" in order to generate scripts for the entry points. For that, the recommended course of action is to use `virtualenv`, e.g. ```sh virtualenv --system-site-packages venv-tests source venv-tests/bin/activate pip install -r requirements.txt python setup.py develop ``` and then use that virtual environment to run the tests, via ```sh python -m nose -s -v datalad ``` or similarly, ```sh nosetests -s -v datalad ``` then, to later deactivate the virtualenv, simply enter ```sh deactivate ``` Alternatively, or complementary to that, you can use `tox` -- there is a `tox.ini` file which sets up a few virtual environments for testing locally, which you can later reuse like any other regular virtualenv for troubleshooting. Additionally, the [tools/testing/test_README_in_docker](tools/testing/test_README_in_docker) script can be used to establish a clean docker environment (based on any NeuroDebian-supported release of Debian or Ubuntu) with all dependencies listed in README.md pre-installed. ### CI setup We are using Travis-CI and have a [buildbot setup](https://github.com/datalad/buildbot) which also exercises our test battery for every PR and on master. Note that buildbot runs tests only for PRs submitted by datalad developers, or if a PR acquires the 'buildbot' label. In case you want to enter buildbot's environment: 1. Login to our development server (`smaug`) 2. Find the container ID associated with the environment you are interested in, e.g. docker ps | grep nd16.04 3. Enter that docker container environment using docker exec -it CONTAINER_ID /bin/bash 4. Become the buildbot user su - buildbot 5. Activate the corresponding virtualenv using source, e.g. `source /home/buildbot/datalad-pr-docker-dl-nd15_04/build/venv-ci/bin/activate` And now you should be in the same environment as the very last tested PR. Note that the same path/venv is reused for all the PRs, so you might first want to check, using `git show` under the `build/` directory, whether it corresponds to the commit you are interested in troubleshooting. For developing on Windows you can use free [Windows VMs](https://developer.microsoft.com/en-us/microsoft-edge/tools/vms/). ### Coverage You can also check for common programming errors with the following tools: - Code with good unittest coverage (at least 80%), check with: pip install nose coverage nosetests --with-coverage path/to/tests_for_package - We rely on https://codecov.io to provide a convenient view of code coverage. Installation of the codecov extension for Firefox/Iceweasel or Chromium is strongly advised, since it provides coverage annotation of pull requests. ### Linting We are not (yet) fully PEP8 compliant, so please use these tools as guidelines for your contributions, but not to PEP8 the entire code base.
[beyond-pep8]: https://www.youtube.com/watch?v=wf-BqAjZb8M *Sidenote*: watch [Raymond Hettinger - Beyond PEP 8][beyond-pep8] - No pyflakes warnings, check with: pip install pyflakes pyflakes path/to/module.py - No PEP8 warnings, check with: pip install pep8 pep8 path/to/module.py - AutoPEP8 can help you fix some of the easy redundant errors: pip install autopep8 autopep8 path/to/pep8.py Also, some team developers use [PyCharm community edition](https://www.jetbrains.com/pycharm) which provides built-in PEP8 checker and handy tools such as smart splits/joins making it easier to maintain code following the PEP8 recommendations. NeuroDebian provides `pycharm-community-sloppy` package to ease pycharm installation even further. ### Benchmarking We use [asv] to benchmark some core DataLad functionality. The benchmarks suite is located under [benchmarks/](./benchmarks), and periodically we publish results of running benchmarks on a dedicated host to http://datalad.github.io/datalad/ . Those results are collected and available under the `.asv/` submodule of this repository, so to get started - `git submodule update --init .asv` - `pip install .[devel]` or just `pip install asv` - `asv machine` - to configure asv for your host if you want to run benchmarks locally And then you could use [asv] in multiple ways. #### Quickly benchmark the working tree - `asv run -E existing` - benchmark using the existing python environment and just print out results (not stored anywhere). You can add `-q` to run each benchmark just once (thus less reliable estimates) - `asv run -b api.supers.time_createadd_to_dataset -E existing` would run that specific benchmark using the existing python environment Note: `--python=same` (`-E existing`) seems to have restricted applicability, e.g. can't be used for a range of commits, so it can't be used with `continuous`. #### Compare results for two commits from recorded runs Use [asv compare] to compare results from different runs, which should be available under `.asv/results/`. (Note that the example below passes ref names instead of commit IDs, which requires asv v0.3 or later.) 
```shell > asv compare -m hopa 0.9.x master All benchmarks: before after ratio [b619eca4] [7635f467] - 1.87s 1.54s 0.82 api.supers.time_createadd - 1.85s 1.56s 0.84 api.supers.time_createadd_to_dataset - 5.57s 4.40s 0.79 api.supers.time_installr 145±6ms 145±6ms 1.00 api.supers.time_ls - 4.59s 2.17s 0.47 api.supers.time_remove 427±1ms 434±8ms 1.02 api.testds.time_create_test_dataset1 - 4.10s 3.37s 0.82 api.testds.time_create_test_dataset2x2 1.81±0.07ms 1.73±0.04ms 0.96 core.runner.time_echo 2.30±0.2ms 2.04±0.03ms ~0.89 core.runner.time_echo_gitrunner + 420±10ms 535±3ms 1.27 core.startup.time_help_np 111±6ms 107±3ms 0.96 core.startup.time_import + 334±6ms 466±4ms 1.39 core.startup.time_import_api ``` #### Run and compare results for two commits [asv continuous] could be used to first run benchmarks for the to-be-tested commits and then provide stats: - `asv continuous 0.9.x master` - would run and compare 0.9.x and master branches - `asv continuous HEAD` - would compare HEAD against HEAD^ - `asv continuous master HEAD` - would compare HEAD against state of master - [TODO: contineous -E existing](https://github.com/airspeed-velocity/asv/issues/338#issuecomment-380520022) Notes: - only significant changes will be reported - raw results from benchmarks are not stored (use `--record-samples` if desired) #### Run and record benchmarks results (for later comparison etc) - `asv run` would run all configured branches (see [asv.conf.json](./asv.conf.json)) #### Profile a benchmark and produce a nice graph visualization Example (replace with the benchmark of interest) asv profile -v -o profile.gprof usecases.study_forrest.time_make_studyforrest_mockup gprof2dot -f pstats profile.gprof | dot -Tpng -o profile.png \ && xdg-open profile.png #### Common options - `-E` to restrict to specific environment, e.g. `-E virtualenv:2.7` - `-b` could be used to specify specific benchmark(s) - `-q` to run benchmark just once for a quick assessment (results are not stored since too unreliable) [asv compare]: http://asv.readthedocs.io/en/latest/commands.html#asv-compare [asv continuous]: http://asv.readthedocs.io/en/latest/commands.html#asv-continuous [asv]: http://asv.readthedocs.io Easy Issues ----------- A great way to start contributing to DataLad is to pick an item from the list of [Easy issues](https://github.com/datalad/datalad/labels/easy) in the issue tracker. Resolving these issues allows you to start contributing to the project without much prior knowledge. Your assistance in this area will be greatly appreciated by the more experienced developers as it helps free up their time to concentrate on other issues. Recognizing contributions ------------------------- We welcome and recognize all contributions from documentation to testing to code development. You can see a list of current contributors in our [zenodo file][link_zenodo]. If you are new to the project, don't forget to add your name and affiliation there! Thank you! ---------- You're awesome. :wave::smiley: Various hints for developers ---------------------------- ### Useful tools - While performing IO/net heavy operations use [dstat](http://dag.wieers.com/home-made/dstat) for quick logging of various health stats in a separate terminal window: dstat -c --top-cpu -d --top-bio --top-latency --net - To monitor speed of any data pipelining [pv](http://www.ivarch.com/programs/pv.shtml) is really handy, just plug it in the middle of your pipe. 
- For remote debugging, epdb could be used (available via pip) by using `import epdb; epdb.serve()` in Python code and then connecting to it with `python -c "import epdb; epdb.connect()"`. - We are using codecov, which has extensions for the popular browsers (Firefox, Chrome) that annotate pull requests on github regarding changed coverage. ### Useful Environment Variables Refer to datalad/config.py for information on how to add these environment variables to the config file and their naming convention - *DATALAD_DATASETS_TOPURL*: Used to point to an alternative location for the `///` dataset. If running tests, it is preferred to be set to http://datasets-tests.datalad.org - *DATALAD_LOG_LEVEL*: Used to control the verbosity of logs printed to stdout while running datalad commands/debugging - *DATALAD_LOG_CMD_OUTPUTS*: Used to control whether both stdout and stderr of external command executions are logged in detail (at DEBUG level) - *DATALAD_LOG_CMD_ENV*: If it contains a digit (e.g. 1), would log the entire environment passed into the Runner.run's popen call. Otherwise it could be a comma separated list of environment variables to log - *DATALAD_LOG_CMD_STDIN*: Whether to log stdin for the command - *DATALAD_LOG_CMD_CWD*: Whether to log the cwd where the command is to be executed - *DATALAD_LOG_PID*: To instruct datalad to log the PID of the process - *DATALAD_LOG_TARGET*: Where to log: `stderr` (default), `stdout`, or another filename - *DATALAD_LOG_TIMESTAMP*: Used to add a timestamp to datalad logs - *DATALAD_LOG_TRACEBACK*: Runs the TraceBack function with collide set to True, if this flag is set to 'collide'. This replaces any common prefix between the current traceback log and the previous invocation with "..." - *DATALAD_LOG_VMEM*: Reports memory utilization (resident/virtual) at every log line, needs the `psutil` module - *DATALAD_EXC_STR_TBLIMIT*: This flag is used by the datalad extract_tb function which extracts and formats stack-traces. It caps the number of pre-processed entries from the traceback to DATALAD_EXC_STR_TBLIMIT lines. - *DATALAD_SEED*: To seed Python's `random` RNG, which will also be used for generation of dataset UUIDs to make those random values reproducible. You might also want to set all the relevant git config variables like we do in one of the travis runs - *DATALAD_TESTS_TEMP_KEEP*: The rmtemp function will not remove the temporary file/directory created for testing if this flag is set - *DATALAD_TESTS_TEMP_DIR*: Create a temporary directory at the location specified by this flag. It is used by tests to create a temporary git directory while testing git annex archives etc - *DATALAD_TESTS_NONETWORK*: Skips network tests completely if this flag is set. Examples include tests for s3, git_repositories, openfmri, etc. - *DATALAD_TESTS_SSH*: Skips SSH tests if this flag is **not** set - *DATALAD_TESTS_NOTEARDOWN*: Does not execute teardown_package, which cleans up temp files and directories created by tests, if this flag is set - *DATALAD_TESTS_USECASSETTE*: Specifies the location of the file to record network transactions by the VCR module.
Currently used when testing custom special remotes - *DATALAD_TESTS_OBSCURE_PREFIX*: A string to prefix the most obscure (but supported by the filesystem) test filename - *DATALAD_TESTS_PROTOCOLREMOTE*: Binary flag to specify whether to test protocol interactions of the custom remote with annex - *DATALAD_TESTS_RUNCMDLINE*: Binary flag to specify whether shell testing using shunit2 is to be carried out - *DATALAD_TESTS_TEMP_FS*: Specify the temporary file system to use as a loop device for testing DATALAD_TESTS_TEMP_DIR creation - *DATALAD_TESTS_TEMP_FSSIZE*: Specify the size of the temporary file system to use as a loop device for testing DATALAD_TESTS_TEMP_DIR creation - *DATALAD_TESTS_NONLO*: Specifies network interfaces to bring down/up for testing. Currently used by travis. - *DATALAD_CMD_PROTOCOL*: Specifies the protocol number used by the Runner to note shell command or python function call times and allows for dry runs. 'externals-time' for ExecutionTimeExternalsProtocol, 'time' for ExecutionTimeProtocol and 'null' for NullProtocol. Any new DATALAD_CMD_PROTOCOL has to implement datalad.support.protocol.ProtocolInterface - *DATALAD_CMD_PROTOCOL_PREFIX*: Sets a prefix to add before the command call times are noted by DATALAD_CMD_PROTOCOL. - *DATALAD_USE_DEFAULT_GIT*: Instructs to use `git` as available in the current environment, and not the one which possibly comes with git-annex (default behavior). - *DATALAD_ASSERT_NO_OPEN_FILES*: Instructs test helpers to check for open files at the end of a test. If set, remaining open files are logged at ERROR level. Alternative modes are: "assert" (raise AssertionError if any open file is found), "pdb"/"epdb" (drop into debugger when open files are found, info on files is provided in a "files" dictionary, mapping filenames to psutil process objects). - *DATALAD_ALLOW_FAIL*: Instructs the `@never_fail` decorator to allow failing, e.g. to ease debugging. # Changelog section For the upcoming release use this template ## 0.12.5 (??? ??, 2020) -- will be better than ever bet we will fix some bugs and make a world even a better place. ### Major refactoring and deprecations - hopefully none ### Fixes ? ### Enhancements and new features ? [link_zenodo]: https://github.com/datalad/datalad/blob/master/.zenodo.json datalad-0.12.4/CONTRIBUTORS000066400000000000000000000006151363461734600150670ustar00rootroot00000000000000The following people have contributed to DataLad: Alejandro de la Vega Alex Waite Anisha Keshavan Benjamin Poldrack Christian Olaf Häusler Dave MacFarlane Debanjum Singh Solanky Feilong Ma Gergana Alteva Horea Christian Jason Gors Jorrit Poelen Kusti Skytén Kyle Meyer Matteo Visconti dOC Michael Hanke Nell Hardcastle Taylor Olson Torsten Stoeter Vanessa Sochat Vicky C Lau Yaroslav Halchenko datalad-0.12.4/COPYING000066400000000000000000000041021363461734600142350ustar00rootroot00000000000000# Main Copyright/License DataLad, including all examples, code snippets and attached documentation is covered by the MIT license.
The MIT License Copyright (c) 2013- Yaroslav Halchenko 2015- DataLad Team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. See CONTRIBUTORS file for a full list of contributors. # 3rd Party Code Some code distributed within DataLad was not developed by the DataLad team, hence you should adhere to the copyright and license terms of respective authors if you are to use corresponding parts. ## `datalad/resources/website` - [JQuery](https://code.jquery.com) - MIT License - Copyright (c) 2016- - [Datatables](http://datatables.net) - MIT License - Copyright (c) 2016- - [BlueImp-MD5](https://github.com/blueimp/JavaScript-MD5) - MIT License - Copyright (c) 2016- - [QUnit](https://qunitjs.com/) - MIT License - Copyright (c) - [Sinon-QUnit Plugin](http://sinonjs.org/qunit/) - BSD License - Copyright (c) 2010-2011 - [NumPy Testing](http://numpy.org) - BSD license - Copyright (c) 2005- NumPy Developers datalad-0.12.4/Gruntfile.js000066400000000000000000000004471363461734600155070ustar00rootroot00000000000000module.exports = function(grunt) { // Project configuration. grunt.initConfig({ qunit: { files: ['datalad/resources/website/tests/test.html'] } }); // Load plugin grunt.loadNpmTasks('grunt-contrib-qunit'); // Task to run tests grunt.registerTask('test', 'qunit'); }; datalad-0.12.4/Makefile000066400000000000000000000032271363461734600146510ustar00rootroot00000000000000# simple makefile to simplify repetetive build env management tasks under posix # Ideas borrowed from scikit-learn's and PyMVPA Makefiles -- thanks! PYTHON ?= python NOSETESTS ?= $(PYTHON) -m nose MODULE ?= datalad all: clean test clean: $(PYTHON) setup.py clean rm -rf dist build bin -find . -name '*.pyc' -delete -find . -name '__pycache__' -type d -delete bin: mkdir -p $@ PYTHONPATH=bin:$(PYTHONPATH) $(PYTHON) setup.py develop --install-dir $@ test-code: bin PATH=bin:$(PATH) PYTHONPATH=bin:$(PYTHONPATH) $(NOSETESTS) -s -v $(MODULE) test-coverage: rm -rf coverage .coverage $(NOSETESTS) -s -v --with-coverage $(MODULE) test: test-code trailing-spaces: find $(MODULE) -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; code-analysis: flake8 $(MODULE) | grep -v __init__ | grep -v external pylint -E -i y $(MODULE)/ # -d E1103,E0611,E1101 linkissues-changelog: tools/link_issues_CHANGELOG update-changelog: linkissues-changelog @echo ".. This file is auto-converted from CHANGELOG.md (make update-changelog) -- do not edit\n\nChange log\n**********" > docs/source/changelog.rst pandoc -t rst CHANGELOG.md >> docs/source/changelog.rst release-pypi: update-changelog # better safe than sorry test ! 
-e dist $(PYTHON) setup.py sdist # the wheels we would produce are broken on windows, because they # install an incompatible entrypoint script # https://github.com/datalad/datalad/issues/4315 #$(PYTHON) setup.py bdist_wheel twine upload dist/* docs/source/basics_cmdline.rst.in: build/casts/cmdline_basic_usage.json tools/cast2rst $^ > $@ docs/source/basics_nesteddatasets.rst.in: build/casts/seamless_nested_repos.json tools/cast2rst $^ > $@ datalad-0.12.4/README.md000066400000000000000000000164661363461734600145010ustar00rootroot00000000000000 ____ _ _ _ | _ \ __ _ | |_ __ _ | | __ _ __| | | | | | / _` | | __| / _` | | | / _` | / _` | | |_| | | (_| | | |_ | (_| | | |___ | (_| | | (_| | |____/ \__,_| \__| \__,_| |_____| \__,_| \__,_| Read me [![Travis tests status](https://secure.travis-ci.org/datalad/datalad.png?branch=master)](https://travis-ci.org/datalad/datalad) [![Build status](https://ci.appveyor.com/api/projects/status/github/datalad/datalad?branch=master&svg=true)](https://ci.appveyor.com/project/mih/datalad/branch/master) [![codecov.io](https://codecov.io/github/datalad/datalad/coverage.svg?branch=master)](https://codecov.io/github/datalad/datalad?branch=master) [![Documentation](https://readthedocs.org/projects/datalad/badge/?version=latest)](http://datalad.rtfd.org) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![GitHub release](https://img.shields.io/github/release/datalad/datalad.svg)](https://GitHub.com/datalad/datalad/releases/) [![PyPI version fury.io](https://badge.fury.io/py/datalad.svg)](https://pypi.python.org/pypi/datalad/) [![Testimonials 4](https://img.shields.io/badge/testimonials-4-brightgreen.svg)](https://github.com/datalad/datalad/wiki/Testimonials) [![https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg](https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg)](https://singularity-hub.org/collections/667) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3233911.svg)](https://doi.org/10.5281/zenodo.3233911) # 10000ft overview DataLad makes data management and data distribution more accessible. To do that, it stands on the shoulders of [Git] and [Git-annex] to deliver a decentralized system for data exchange. This includes automated ingestion of data from online portals and exposing it in readily usable form as Git(-annex) repositories, so-called datasets. The actual data storage and permission management, however, remains with the original data providers. The full documentation is available at http://docs.datalad.org and http://handbook.datalad.org provides a hands-on crash-course on DataLad. # Extensions A number of extensions are available that provide additional functionality for DataLad. Extensions are separate packages that are to be installed in addition to DataLad. In order to install DataLad customized for a particular domain, one can simply install an extension directly, and DataLad itself will be automatically installed with it. 
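For example, to get DataLad with support for containerized environments one could install the corresponding extension from PyPI, which pulls in DataLad itself as a dependency (shown here as a sketch; pick whichever extension matches your use case):

    pip install datalad-container
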
Here is a list of known extensions: - [crawler](https://github.com/datalad/datalad-crawler) -- tracking web resources and automated data distributions [![crawler release](https://img.shields.io/github/release/datalad/datalad-crawler.svg)](https://GitHub.com/datalad/datalad-crawler/releases/) - [neuroimaging](https://github.com/datalad/datalad-neuroimaging) -- neuroimaging research data and workflows [![neuroimaging release](https://img.shields.io/github/release/datalad/datalad-neuroimaging.svg)](https://GitHub.com/datalad/datalad-neuroimaging/releases/) - [container](https://github.com/datalad/datalad-container) -- support for containerized computational environments [![container release](https://img.shields.io/github/release/datalad/datalad-container.svg)](https://GitHub.com/datalad/datalad-container/releases/) - [webapp](https://github.com/datalad/datalad-webapp) -- support for exposing selected DataLad API as REST API webapp [tech demo] # Support The documentation of this project is found here: http://docs.datalad.org All bugs, concerns and enhancement requests for this software can be submitted here: https://github.com/datalad/datalad/issues If you have a problem or would like to ask a question about how to use DataLad, please [submit a question to NeuroStars.org](https://neurostars.org/new-topic?body=-%20Please%20describe%20the%20problem.%0A-%20What%20steps%20will%20reproduce%20the%20problem%3F%0A-%20What%20version%20of%20DataLad%20are%20you%20using%20%28run%20%60datalad%20--version%60%29%3F%20On%20what%20operating%20system%20%28consider%20running%20%60datalad%20plugin%20wtf%60%29%3F%0A-%20Please%20provide%20any%20additional%20information%20below.%0A-%20Have%20you%20had%20any%20luck%20using%20DataLad%20before%3F%20%28Sometimes%20we%20get%20tired%20of%20reading%20bug%20reports%20all%20day%20and%20a%20lil'%20positive%20end%20note%20does%20wonders%29&tags=datalad) with a ``datalad`` tag. NeuroStars.org is a platform similar to StackOverflow but dedicated to neuroinformatics. All previous DataLad questions are available here: http://neurostars.org/tags/datalad/ # Installation ## Debian-based systems On Debian-based systems, we recommend to enable [NeuroDebian] from which we provide recent releases of DataLad. Once enabled, just do: apt-get install datalad ## Other Linux'es via conda conda install -c conda-forge datalad will install released released version, and release candidates are available via conda install -c conda-forge/label/rc datalad ## Other Linux'es, OSX via pip Before you install this package, please make sure that you [install a recent version of git-annex](https://git-annex.branchable.com/install). Afterwards, install the latest version of `datalad` from [PyPi](https://pypi.org/project/datalad). It is recommended to use a dedicated [virtualenv](https://virtualenv.pypa.io): # create and enter a new virtual environment (optional) virtualenv --python=python3 ~/env/datalad . ~/env/datalad/bin/activate # install from PyPi pip install datalad By default, installation via pip installs core functionality of datalad allowing for managing datasets etc. Additional installation schemes are available, so you could provide enhanced installation via `pip install datalad[SCHEME]` where `SCHEME` could be - `tests` to also install dependencies used by unit-tests battery of the datalad - `full` to install all dependencies. There is also a [Singularity container](http://singularity.lbl.gov) available. 
The latest release version can be obtained by running: singularity pull shub://datalad/datalad More details on installation and initial configuration could be found in the [DataLad Handbook: Installation]. # License MIT/Expat # Contributing See [CONTRIBUTING.md](CONTRIBUTING.md) if you are interested in internals or contributing to the project. ## Acknowledgements DataLad development is supported by a US-German collaboration in computational neuroscience (CRCNS) project "DataGit: converging catalogues, warehouses, and deployment logistics into a federated 'data distribution'" (Halchenko/Hanke), co-funded by the US National Science Foundation (NSF 1429999) and the German Federal Ministry of Education and Research (BMBF 01GQ1411). Additional support is provided by the German federal state of Saxony-Anhalt and the European Regional Development Fund (ERDF), Project: Center for Behavioral Brain Sciences, Imaging Platform. This work is further facilitated by the ReproNim project (NIH 1P41EB019936-01A1). [Git]: https://git-scm.com [Git-annex]: http://git-annex.branchable.com [setup.py]: https://github.com/datalad/datalad/blob/master/setup.py [NeuroDebian]: http://neuro.debian.net [DataLad Handbook: Installation]: http://handbook.datalad.org/en/latest/intro/installation.htmldatalad-0.12.4/_datalad_build_support/000077500000000000000000000000001363461734600177115ustar00rootroot00000000000000datalad-0.12.4/_datalad_build_support/__init__.py000066400000000000000000000010211363461734600220140ustar00rootroot00000000000000# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the DataLad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Python package for functionality needed at package 'build' time by DataLad and its extensions __init__ here should be really minimalistic, not import submodules by default and submodules should also not require heavy dependencies. """ __version__ = '0.1' datalad-0.12.4/_datalad_build_support/formatters.py000066400000000000000000000246671363461734600224700ustar00rootroot00000000000000# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the DataLad package for the # copyright and license terms. 
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## import argparse import datetime import re class ManPageFormatter(argparse.HelpFormatter): # This code was originally distributed # under the same License of Python # Copyright (c) 2014 Oz Nahum Tiram def __init__(self, prog, indent_increment=2, max_help_position=4, width=1000000, section=1, ext_sections=None, authors=None, version=None ): super(ManPageFormatter, self).__init__( prog, indent_increment=indent_increment, max_help_position=max_help_position, width=width) self._prog = prog self._section = 1 self._today = datetime.date.today().strftime('%Y\\-%m\\-%d') self._ext_sections = ext_sections self._version = version def _get_formatter(self, **kwargs): return self.formatter_class(prog=self.prog, **kwargs) def _markup(self, txt): return txt.replace('-', '\\-') def _underline(self, string): return "\\fI\\s-1" + string + "\\s0\\fR" def _bold(self, string): if not string.strip().startswith('\\fB'): string = '\\fB' + string if not string.strip().endswith('\\fR'): string = string + '\\fR' return string def _mk_synopsis(self, parser): self.add_usage(parser.usage, parser._actions, parser._mutually_exclusive_groups, prefix='') usage = self._format_usage(None, parser._actions, parser._mutually_exclusive_groups, '') # replace too long list of commands with a single placeholder usage = re.sub(r'{[^]]*?create,.*?}', ' COMMAND ', usage, flags=re.MULTILINE) # take care of proper wrapping usage = re.sub(r'\[([-a-zA-Z0-9]*)\s([a-zA-Z0-9{}|_]*)\]', r'[\1\~\2]', usage) usage = usage.replace('%s ' % self._prog, '') usage = '.SH SYNOPSIS\n.nh\n.HP\n\\fB%s\\fR %s\n.hy\n' % (self._markup(self._prog), usage) return usage def _mk_title(self, prog): name_version = "{0} {1}".format(prog, self._version) return '.TH "{0}" "{1}" "{2}" "{3}"\n'.format( prog, self._section, self._today, name_version) def _mk_name(self, prog, desc): """ this method is in consitent with others ... 
it relies on distribution """ desc = desc.splitlines()[0] if desc else 'it is in the name' # ensure starting lower case desc = desc[0].lower() + desc[1:] return '.SH NAME\n%s \\- %s\n' % (self._bold(prog), desc) def _mk_description(self, parser): desc = parser.description desc = '\n'.join(desc.splitlines()[1:]) if not desc: return '' desc = desc.replace('\n\n', '\n.PP\n') # sub-section headings desc = re.sub(r'^\*(.*)\*$', r'.SS \1', desc, flags=re.MULTILINE) # italic commands desc = re.sub(r'^ ([-a-z]*)$', r'.TP\n\\fI\1\\fR', desc, flags=re.MULTILINE) # deindent body text, leave to troff viewer desc = re.sub(r'^ (\S.*)\n', '\\1\n', desc, flags=re.MULTILINE) # format NOTEs as indented paragraphs desc = re.sub(r'^NOTE\n', '.TP\nNOTE\n', desc, flags=re.MULTILINE) # deindent indented paragraphs after heading setup desc = re.sub(r'^ (.*)$', '\\1', desc, flags=re.MULTILINE) return '.SH DESCRIPTION\n%s\n' % self._markup(desc) def _mk_footer(self, sections): if not hasattr(sections, '__iter__'): return '' footer = [] for section, value in sections.items(): part = ".SH {}\n {}".format(section.upper(), value) footer.append(part) return '\n'.join(footer) def format_man_page(self, parser): page = [] page.append(self._mk_title(self._prog)) page.append(self._mk_name(self._prog, parser.description)) page.append(self._mk_synopsis(parser)) page.append(self._mk_description(parser)) page.append(self._mk_options(parser)) page.append(self._mk_footer(self._ext_sections)) return ''.join(page) def _mk_options(self, parser): formatter = parser._get_formatter() # positionals, optionals and user-defined groups for action_group in parser._action_groups: formatter.start_section(None) formatter.add_text(None) formatter.add_arguments(action_group._group_actions) formatter.end_section() # epilog formatter.add_text(parser.epilog) # determine help from format above help = formatter.format_help() # add spaces after comma delimiters for easier reformatting help = re.sub(r'([a-z]),([a-z])', '\\1, \\2', help) # get proper indentation for argument items help = re.sub(r'^ (\S.*)\n', '.TP\n\\1\n', help, flags=re.MULTILINE) # deindent body text, leave to troff viewer help = re.sub(r'^ (\S.*)\n', '\\1\n', help, flags=re.MULTILINE) return '.SH OPTIONS\n' + help def _format_action_invocation(self, action): if not action.option_strings: metavar, = self._metavar_formatter(action, action.dest)(1) return metavar else: parts = [] # if the Optional doesn't take a value, format is: # -s, --long if action.nargs == 0: parts.extend([self._bold(action_str) for action_str in action.option_strings]) # if the Optional takes a value, format is: # -s ARGS, --long ARGS else: default = self._underline(action.dest.upper()) args_string = self._format_args(action, default) for option_string in action.option_strings: parts.append('%s %s' % (self._bold(option_string), args_string)) return ', '.join(p.replace('--', '-\\\\-') for p in parts) class RSTManPageFormatter(ManPageFormatter): def _get_formatter(self, **kwargs): return self.formatter_class(prog=self.prog, **kwargs) def _markup(self, txt): # put general tune-ups here return txt def _underline(self, string): return "*{0}*".format(string) def _bold(self, string): return "**{0}**".format(string) def _mk_synopsis(self, parser): self.add_usage(parser.usage, parser._actions, parser._mutually_exclusive_groups, prefix='') usage = self._format_usage(None, parser._actions, parser._mutually_exclusive_groups, '') usage = usage.replace('%s ' % self._prog, '') usage = 'Synopsis\n--------\n::\n\n %s %s\n' \ % 
(self._markup(self._prog), usage) return usage def _mk_title(self, prog): # and an easy to use reference point title = ".. _man_%s:\n\n" % prog.replace(' ', '-') title += "{0}".format(prog) title += '\n{0}\n\n'.format('=' * len(prog)) return title def _mk_name(self, prog, desc): return '' def _mk_description(self, parser): desc = parser.description if not desc: return '' return 'Description\n-----------\n%s\n' % self._markup(desc) def _mk_footer(self, sections): if not hasattr(sections, '__iter__'): return '' footer = [] for section, value in sections.items(): part = "\n{0}\n{1}\n{2}\n".format( section, '-' * len(section), value) footer.append(part) return '\n'.join(footer) def _mk_options(self, parser): # this non-obvious maneuver is really necessary! formatter = self.__class__(self._prog) # positionals, optionals and user-defined groups for action_group in parser._action_groups: formatter.start_section(None) formatter.add_text(None) formatter.add_arguments(action_group._group_actions) formatter.end_section() # epilog formatter.add_text(parser.epilog) # determine help from format above option_sec = formatter.format_help() return '\n\nOptions\n-------\n{0}'.format(option_sec) def _format_action(self, action): # determine the required width and the entry label action_header = self._format_action_invocation(action) if action.help: help_text = self._expand_help(action) help_lines = self._split_lines(help_text, 80) help = ' '.join(help_lines) else: help = '' # return a single string return '{0}\n{1}\n{2}\n\n'.format( action_header, '~' * len(action_header), help) def cmdline_example_to_rst(src, out=None, ref=None): if out is None: from io import StringIO out = StringIO() # place header out.write('.. AUTO-GENERATED FILE -- DO NOT EDIT!\n\n') if ref: # place cross-ref target out.write('.. {0}:\n\n'.format(ref)) # parser status vars inexample = False incodeblock = False for line in src: if line.startswith('#% EXAMPLE START'): inexample = True incodeblock = False continue if not inexample: continue if line.startswith('#% EXAMPLE END'): break if not inexample: continue if line.startswith('#%'): incodeblock = not incodeblock if incodeblock: out.write('\n.. code-block:: sh\n\n') continue if not incodeblock and line.startswith('#'): out.write(line[(min(2, len(line) - 1)):]) continue if incodeblock: if not line.rstrip().endswith('#% SKIP'): out.write(' %s' % line) continue if not len(line.strip()): continue else: raise RuntimeError("this should not happen") return out datalad-0.12.4/_datalad_build_support/setup.py000066400000000000000000000447721363461734600214410ustar00rootroot00000000000000# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the DataLad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## import datetime import os import platform import setuptools import sys from distutils.core import Command from distutils.errors import DistutilsOptionError from distutils.version import LooseVersion from genericpath import exists from os import linesep, makedirs from os.path import dirname, join as opj, sep as pathsep, splitext from setuptools import findall, find_packages, setup from . 
import formatters as fmt def _path_rel2file(*p): # dirname instead of joining with pardir so it works if # datalad_build_support/ is just symlinked into some extension # while developing return opj(dirname(dirname(__file__)), *p) def get_version(name): """Load version from version.py without entailing any imports Parameters ---------- name: str Name of the folder (package) where from to read version.py """ # This might entail lots of imports which might not yet be available # so let's do ad-hoc parsing of the version.py with open(_path_rel2file(name, 'version.py')) as f: version_lines = list(filter(lambda x: x.startswith('__version__'), f)) assert (len(version_lines) == 1) return version_lines[0].split('=')[1].strip(" '\"\t\n") class BuildManPage(Command): # The BuildManPage code was originally distributed # under the same License of Python # Copyright (c) 2014 Oz Nahum Tiram description = 'Generate man page from an ArgumentParser instance.' user_options = [ ('manpath=', None, 'output path for manpages'), ('rstpath=', None, 'output path for RST files'), ('parser=', None, 'module path to an ArgumentParser instance' '(e.g. mymod:func, where func is a method or function which return' 'a dict with one or more arparse.ArgumentParser instances.'), ] def initialize_options(self): self.manpath = opj('build', 'man') self.rstpath = opj('docs', 'source', 'generated', 'man') self.parser = 'datalad.cmdline.main:setup_parser' def finalize_options(self): if self.manpath is None: raise DistutilsOptionError('\'manpath\' option is required') if self.rstpath is None: raise DistutilsOptionError('\'rstpath\' option is required') if self.parser is None: raise DistutilsOptionError('\'parser\' option is required') self.manpath = _path_rel2file(self.manpath) self.rstpath = _path_rel2file(self.rstpath) mod_name, func_name = self.parser.split(':') fromlist = mod_name.split('.') try: mod = __import__(mod_name, fromlist=fromlist) self._parser = getattr(mod, func_name)( ['datalad'], formatter_class=fmt.ManPageFormatter, return_subparsers=True, help_ignore_extensions=True) except ImportError as err: raise err self.announce('Writing man page(s) to %s' % self.manpath) self._today = datetime.date.today() @classmethod def handle_module(cls, mod_name, **kwargs): """Module specific handling. This particular one does 1. Memorize (at class level) the module name of interest here 2. Check if 'datalad.extensions' are specified for the module, and then analyzes them to obtain command names it provides If cmdline commands are found, its entries are to be used instead of the ones in datalad's _parser. 
Parameters ---------- **kwargs: all the kwargs which might be provided to setuptools.setup """ cls.mod_name = mod_name exts = kwargs.get('entry_points', {}).get('datalad.extensions', []) for ext in exts: assert '=' in ext # should be label=module:obj ext_label, mod_obj = ext.split('=', 1) assert ':' in mod_obj # should be module:obj mod, obj = mod_obj.split(':', 1) assert mod_name == mod # AFAIK should be identical mod = __import__(mod_name) if hasattr(mod, obj): command_suite = getattr(mod, obj) assert len(command_suite) == 2 # as far as I see it if not hasattr(cls, 'cmdline_names'): cls.cmdline_names = [] cls.cmdline_names += [ cmd for _, _, cmd, _ in command_suite[1] ] def run(self): dist = self.distribution #homepage = dist.get_url() #appname = self._parser.prog appname = 'datalad' sections = { 'Authors': """{0} is developed by {1} <{2}>.""".format( appname, dist.get_author(), dist.get_author_email()), } for cls, opath, ext in ((fmt.ManPageFormatter, self.manpath, '1'), (fmt.RSTManPageFormatter, self.rstpath, 'rst')): if not os.path.exists(opath): os.makedirs(opath) for cmdname in getattr(self, 'cmdline_names', list(self._parser)): p = self._parser[cmdname] cmdname = "{0}{1}".format( 'datalad ' if cmdname != 'datalad' else '', cmdname) format = cls( cmdname, ext_sections=sections, version=get_version(getattr(self, 'mod_name', appname))) formatted = format.format_man_page(p) with open(opj(opath, '{0}.{1}'.format( cmdname.replace(' ', '-'), ext)), 'w') as f: f.write(formatted) class BuildRSTExamplesFromScripts(Command): description = 'Generate RST variants of example shell scripts.' user_options = [ ('expath=', None, 'path to look for example scripts'), ('rstpath=', None, 'output path for RST files'), ] def initialize_options(self): self.expath = opj('docs', 'examples') self.rstpath = opj('docs', 'source', 'generated', 'examples') def finalize_options(self): if self.expath is None: raise DistutilsOptionError('\'expath\' option is required') if self.rstpath is None: raise DistutilsOptionError('\'rstpath\' option is required') self.expath = _path_rel2file(self.expath) self.rstpath = _path_rel2file(self.rstpath) self.announce('Converting example scripts') def run(self): opath = self.rstpath if not os.path.exists(opath): os.makedirs(opath) from glob import glob for example in glob(opj(self.expath, '*.sh')): exname = os.path.basename(example)[:-3] with open(opj(opath, '{0}.rst'.format(exname)), 'w') as out: fmt.cmdline_example_to_rst( open(example), out=out, ref='_example_{0}'.format(exname)) class BuildConfigInfo(Command): description = 'Generate RST documentation for all config items.' user_options = [ ('rstpath=', None, 'output path for RST file'), ] def initialize_options(self): self.rstpath = opj('docs', 'source', 'generated', 'cfginfo') def finalize_options(self): if self.rstpath is None: raise DistutilsOptionError('\'rstpath\' option is required') self.rstpath = _path_rel2file(self.rstpath) self.announce('Generating configuration documentation') def run(self): opath = self.rstpath if not os.path.exists(opath): os.makedirs(opath) from datalad.interface.common_cfg import definitions as cfgdefs from datalad.dochelpers import _indent categories = { 'global': {}, 'local': {}, 'dataset': {}, 'misc': {} } for term, v in cfgdefs.items(): categories[v.get('destination', 'misc')][term] = v for cat in categories: with open(opj(opath, '{}.rst'.format(cat)), 'w') as rst: rst.write('.. 
glossary::\n') for term, v in sorted(categories[cat].items(), key=lambda x: x[0]): rst.write(_indent(term, '\n ')) qtype, docs = v.get('ui', (None, {})) desc_tmpl = '\n' if 'title' in docs: desc_tmpl += '{title}:\n' if 'text' in docs: desc_tmpl += '{text}\n' if 'default' in v: default = v['default'] if hasattr(default, 'replace'): # protect against leaking specific home dirs v['default'] = default.replace(os.path.expanduser('~'), '~') desc_tmpl += 'Default: {default}\n' if 'type' in v: type_ = v['type'] if hasattr(type_, 'long_description'): type_ = type_.long_description() else: type_ = type_.__name__ desc_tmpl += '\n[{type}]\n' v['type'] = type_ if desc_tmpl == '\n': # we need something to avoid joining terms desc_tmpl += 'undocumented\n' v.update(docs) rst.write(_indent(desc_tmpl.format(**v), ' ')) class BuildSchema(Command): description = 'Generate DataLad JSON-LD schema.' user_options = [ ('path=', None, 'output path for schema file'), ] def initialize_options(self): self.path = opj('docs', 'source', '_extras') def finalize_options(self): if self.path is None: raise DistutilsOptionError('\'path\' option is required') self.path = _path_rel2file(self.path) self.announce('Generating JSON-LD schema file') def run(self): from datalad.metadata.definitions import common_defs from datalad.metadata.definitions import version as schema_version import json import shutil def _mk_fname(label, version): return '{}{}{}.json'.format( label, '_v' if version else '', version) def _defs2context(defs, context_label, vocab_version, main_version=schema_version): opath = opj( self.path, _mk_fname(context_label, vocab_version)) odir = dirname(opath) if not os.path.exists(odir): os.makedirs(odir) # to become DataLad's own JSON-LD context context = {} schema = {"@context": context} if context_label != 'schema': schema['@vocab'] = 'http://docs.datalad.org/{}'.format( _mk_fname('schema', main_version)) for key, val in defs.items(): # git-annex doesn't allow ':', but in JSON-LD we need it for # namespace separation -- let's make '.' in git-annex mean # ':' in JSON-LD key = key.replace('.', ':') definition = val['def'] if definition.startswith('http://') or definition.startswith('https://'): # this is not a URL, hence an @id definitions that points # to another schema context[key] = definition continue # the rest are compound definitions props = {'@id': definition} if 'unit' in val: props['unit'] = val['unit'] if 'descr' in val: props['description'] = val['descr'] context[key] = props with open(opath, 'w') as fp: json.dump( schema, fp, ensure_ascii=True, indent=1, separators=(', ', ': '), sort_keys=True) print('schema written to {}'.format(opath)) # core vocabulary _defs2context(common_defs, 'schema', schema_version) # present the same/latest version also as the default shutil.copy( opj(self.path, _mk_fname('schema', schema_version)), opj(self.path, 'schema.json')) def setup_entry_points(entry_points): """Sneaky monkey patching could be fixed only via even sneakier monkey patching It will never break, I promise! """ def get_script_content(script_name, shebang="#!/usr/bin/env python"): return linesep.join([ shebang, "#", "# Custom simplistic runner for DataLad. Assumes datalad module", "# being available. 
Generated by monkey patching monkey patched", "# setuptools.", "#", "from %s import main" % entry_points[script_name], "main()", ""]).encode() def patch_write_script(mod): """Patches write_script of the module with our shim to provide lightweight invocation script """ orig_meth = getattr(mod, 'write_script') def _provide_lean_script_contents( self, script_name, contents, mode="t", *ignored): # could be a script from another module -- let it be as is if script_name in entry_points: # keep shebang contents = get_script_content( script_name, contents.splitlines()[0].decode()) return orig_meth(self, script_name, contents, mode=mode) setattr(mod, 'write_script', _provide_lean_script_contents) # We still need this one so that setuptools known about the scripts # So we generate some bogus ones, and provide a list of them ;) # pre-generate paths so we could give them to setuptools scripts_build_dir = opj('build', 'scripts_generated') scripts = [opj(scripts_build_dir, x) for x in entry_points] if 'clean' not in sys.argv: if not exists(scripts_build_dir): makedirs(scripts_build_dir) for s, mod in entry_points.items(): with open(opj(scripts_build_dir, s), 'wb') as f: f.write(get_script_content(s)) platform_system = platform.system().lower() setup_kwargs = {} if platform_system == 'windows': # TODO: investigate https://github.com/matthew-brett/myscripter, # nibabel/nixext approach to support similar setup on Windows setup_kwargs['entry_points'] = { 'console_scripts': ['%s=%s:main' % i for i in entry_points.items()] } else: # Damn you sharktopus! from setuptools.command.install_scripts import \ install_scripts as stinstall_scripts from setuptools.command.easy_install import easy_install patch_write_script(stinstall_scripts) patch_write_script(easy_install) setup_kwargs['scripts'] = scripts return setup_kwargs def get_long_description_from_README(): """Read README.md, convert to .rst using pypandoc If pypandoc is not available or fails - just output original .md. Returns ------- dict with keys long_description and possibly long_description_content_type for newer setuptools which support uploading of markdown as is. """ # PyPI used to not render markdown. Workaround for a sane appearance # https://github.com/pypa/pypi-legacy/issues/148#issuecomment-227757822 # is still in place for older setuptools README = opj(_path_rel2file('README.md')) ret = {} if LooseVersion(setuptools.__version__) >= '38.6.0': # check than this ret['long_description'] = open(README).read() ret['long_description_content_type'] = 'text/markdown' return ret # Convert or fall-back try: import pypandoc return {'long_description': pypandoc.convert(README, 'rst')} except (ImportError, OSError) as exc: # attempting to install pandoc via brew on OSX currently hangs and # pypandoc imports but throws OSError demanding pandoc print( "WARNING: pypandoc failed to import or thrown an error while " "converting" " README.md to RST: %r .md version will be used as is" % exc ) return {'long_description': open(README).read()} def findsome(subdir, extensions): """Find files under subdir having specified extensions Leading directory (datalad) gets stripped """ return [ f.split(pathsep, 1)[1] for f in findall(opj('datalad', subdir)) if splitext(f)[-1].lstrip('.') in extensions ] def datalad_setup(name, **kwargs): """A helper for a typical invocation of setuptools.setup. 
If not provided in kwargs, following fields will be autoset to the defaults or obtained from the present on the file system files: - author - author_email - packages -- all found packages which start with `name` - long_description -- converted to .rst using pypandoc README.md - version -- parsed `__version__` within `name/version.py` Parameters ---------- name: str Name of the Python package **kwargs: The rest of the keyword arguments passed to setuptools.setup as is """ # Simple defaults for k, v in { 'author': "The DataLad Team and Contributors", 'author_email': "team@datalad.org" }.items(): if kwargs.get(k) is None: kwargs[k] = v # More complex, requiring some function call # Only recentish versions of find_packages support include # packages = find_packages('.', include=['datalad*']) # so we will filter manually for maximal compatibility if kwargs.get('packages') is None: kwargs['packages'] = [pkg for pkg in find_packages('.') if pkg.startswith(name)] if kwargs.get('long_description') is None: kwargs.update(get_long_description_from_README()) if kwargs.get('version') is None: kwargs['version'] = get_version(name) cmdclass = kwargs.get('cmdclass', {}) # Check if command needs some module specific handling for v in cmdclass.values(): if hasattr(v, 'handle_module'): getattr(v, 'handle_module')(name, **kwargs) return setup(name=name, **kwargs)datalad-0.12.4/appveyor.yml000066400000000000000000000171421363461734600156020ustar00rootroot00000000000000build: false environment: matrix: - PYTHON: "C:\\Python35" PYTHON_VERSION: "3.5.1" PYTHON_ARCH: "32" MINICONDA: C:\Miniconda35 DATALAD_TESTS_SSH: 1 - PYTHON: "C:\\Python35" PYTHON_VERSION: "3.5.1" PYTHON_ARCH: "32" MINICONDA: C:\Miniconda35 DATALAD_TESTS_SSH: 1 DATALAD_REPO_VERSION: 6 cache: # cache the pip cache - C:\Users\appveyor\AppData\Local\pip\Cache -> appveyor.yml init: # alter machine PATH setting to have git-core tools and SSH installation # accessible even when SSHing into localhost (see gh-3683) - ps: '[System.Environment]::SetEnvironmentVariable("PATH", "$env:Path;C:\Program Files\Git\mingw64\libexec\git-core;C:\projects\datalad\resources\OpenSSH-Win32", [System.EnvironmentVariableTarget]::Machine)' - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH% %MINICONDA%" # this will display login RDP info for the build VM, but if the build VM should block # see on_finish below instead #- ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) install: - "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" - conda config --set always_yes yes --set changeps1 no - conda update -q conda - conda info -a #- "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy matplotlib pytest pandas" - "conda create -q -n test-environment python=%PYTHON_VERSION%" - activate test-environment - mkdir resources # define test host alias - echo. 
>> %SYSTEMROOT%\System32\drivers\etc\hosts - echo.127.0.0.1 datalad-test >> %SYSTEMROOT%\System32\drivers\etc\hosts # OpenSSH server setup - appveyor DownloadFile https://github.com/PowerShell/Win32-OpenSSH/releases/download/v7.6.1.0p1-Beta/OpenSSH-Win32.zip -FileName resources\openssh.zip - 7z x -o"resources" resources\openssh.zip # install - powershell.exe -ExecutionPolicy Bypass -File resources\OpenSSH-Win32\install-sshd.ps1 # configure service - powershell.exe New-NetFirewallRule -Name sshd -DisplayName 'OpenSSH Server (sshd)' -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22 # keys in default place - ssh-keygen -f C:\Users\appveyor\.ssh\id_rsa -N "" # authorize access with these keys - copy C:\Users\appveyor\.ssh\id_rsa.pub c:\Users\appveyor\.ssh\authorized_keys - copy tools\ci\appveyor_ssh_config c:\Users\appveyor\.ssh\config # fire up service - net start sshd # test login - ssh -v localhost exit - ssh datalad-test exit # git annex setup - appveyor DownloadFile https://downloads.kitenet.net/git-annex/windows/current/git-annex-installer.exe -FileName resources\git-annex-installer.exe # extract git annex into the system Git installation path - 7z x -o"C:\\Program Files\Git" resources\git-annex-installer.exe # info on how python is ticking - python -c "import sys; print(sys.path)" # cannot do full, e.g. because libxmp is N/A, causes unguarded ERRORs #- pip install ".[full]" - pip install ".[tests]" - pip install ".[devel-utils]" # fixup # ATM datalad does not pull in colorama, which is needed for color output # on windows - pip install colorama - git config --global user.email "test@appveyor.land" - git config --global user.name "Appveyor Almighty" test_script: # establish baseline, if annex doesn't work, we are not even trying #- git annex test # run tests on installed module, not source tree files - mkdir __testhome__ - cd __testhome__ # report basic info - git version - git annex version # first sign of life - datalad wtf # and now this... [keep appending tests that should work!!] 
# one call per datalad component for now -- to better see what is being tested # everything in core/ must work - "python -m nose -s -v --with-cov --cover-package datalad datalad.core" # cmdline - "python -m nose -s -v --with-cov --cover-package datalad datalad.cmdline" # remaining fails: test_archives.test_basic_scenario test_datalad.test_basic_scenario_local_url #- python -m nose -s -v datalad.customremotes # remaining fails: test_add test_create_test_dataset test_uninstall test_utils - "python -m nose -s -v --with-cov --cover-package datalad datalad.distribution.tests.test_get datalad.distribution.tests.test_create_github datalad.distribution.tests.test_dataset_binding datalad.distribution.tests.test_siblings datalad.distribution.tests.test_update datalad.distribution.tests.test_dataset datalad.distribution.tests.test_publish datalad.local.tests.test_subdataset datalad.distribution.tests.test_create_sibling" # remaining fails: test_http - "python -m nose -s -v --with-cov --cover-package datalad datalad.downloaders.tests.test_credentials datalad.downloaders.tests.test_providers datalad.downloaders.tests.test_s3" # remaining fails: test_add_archive_content test_annotate_paths test_diff test_ls_webui test_run test_save test_utils - "python -m nose -s -v --with-cov --cover-package datalad datalad.interface.tests.test_base datalad.interface.tests.test_clean datalad.interface.tests.test_docs datalad.interface.tests.test_ls datalad.interface.tests.test_unlock datalad.interface.tests.test_rerun datalad.interface.tests.test_run_procedure" # remaining fails: extractors.tests.test_base test_aggregation test_base datalad.metadata.extractors.tests.test_datacite_xml - "python -m nose -s -v --with-cov --cover-package datalad datalad.metadata.tests.test_search datalad.metadata.tests.test_extract_metadata datalad.metadata.extractors.tests.test_frictionless_datapackage datalad.metadata.extractors.tests.test_rfc822" # remaining fails: test_addurls test_export_archive test_plugins" # additional tests need module `dateutil`!! - "python -m nose -s -v --with-cov --cover-package datalad datalad.plugin.tests.test_check_dates" # remaining fails: test_annexrepo test_digests test_locking test_repodates test_sshrun - "python -m nose -s -v --with-cov --cover-package datalad datalad.support.tests.test_cache datalad.support.tests.test_stats datalad.support.tests.test_status datalad.support.tests.test_versions datalad.support.tests.test_network datalad.support.tests.test_external_versions datalad.support.tests.test_sshconnector datalad.support.tests.test_json_py datalad.support.tests.test_vcr_ datalad.support.tests.test_gitrepo" # remaining fails: test__main__ test_cmd test_log test_protocols test_test_utils test_auto - "python -m nose -s -v --with-cov --cover-package datalad datalad.tests.test_utils datalad.tests.test_api datalad.tests.test_base datalad.tests.test_config datalad.tests.test_constraints datalad.tests.test_dochelpers datalad.tests.test_installed datalad.tests.test_interface datalad.tests.test_misc datalad.tests.test_s3 datalad.tests.test_testrepos datalad.tests.test_utils_testrepos datalad.tests.test_archives" - "python -m nose -s -v --with-cov --cover-package datalad datalad.ui" - "python -m nose -s -v --with-cov --cover-package datalad datalad.distribution.tests.test_install" # prepare coverage.xml in a separate invocation. 
If invoked directly with nose - do not include test_ files themselves - "python -m coverage xml" after_test: - ps: | $env:PATH = 'C:\msys64\usr\bin;' + $env:PATH Invoke-WebRequest -Uri 'https://codecov.io/bash' -OutFile codecov.sh bash codecov.sh -f "coverage.xml" on_finish: # enable the next to let the build VM block for up to 60min to log in via RDP and debug #- ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) datalad-0.12.4/asv.conf.json000066400000000000000000000136501363461734600156220ustar00rootroot00000000000000{ // The version of the config file format. Do not change, unless // you know what you are doing. "version": 1, // The name of the project being benchmarked "project": "DataLad", // The project's homepage "project_url": "http://datalad.org", // The URL or local path of the source code repository for the // project being benchmarked "repo": ".", // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). "branches": ["master", "0.11.x"], // for git // "branches": ["default"], // for mercurial // The DVCS being used. If not set, it will be automatically // determined from "repo" by looking at the protocol in the URL // (if remote), or by looking for special directories, such as // ".git" (if local). // "dvcs": "git", // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. "environment_type": "virtualenv", // timeout in seconds for installing any dependencies in environment // defaults to 10 min //"install_timeout": 600, // the base URL to show a commit for the project. "show_commit_url": "http://github.com/datalad/datalad/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. // We are looking into the future now, so benchmarking 3.7 "pythons": ["2.7", "3.7"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty // list or empty string indicates to just test against the default // (latest) version. null indicates that the package is to not be // installed. If the package to be tested is only available from // PyPi, and the 'environment_type' is conda, then you can preface // the package name by 'pip+', and the package will be installed via // pip (with all the conda available packages installed first, // followed by the pip installed packages). // // "matrix": { // "numpy": ["1.6", "1.7"], // "six": ["", null], // test with and without six installed // "pip+emcee": [""], // emcee is only available for install with pip. // }, // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. // // An exclude entry excludes entries where all values match. The // values are regexps that should match the whole string. // // An include entry adds an environment. Only the packages listed // are installed. The 'python' key is required. The exclude rules // do not apply to includes. // // In addition to package names, the following keys are available: // // - python // Python version, as in the *pythons* variable above. 
// - environment_type // Environment type, as above. // - sys_platform // Platform, as in sys.platform. Possible values for the common // cases: 'linux2', 'win32', 'cygwin', 'darwin'. // // "exclude": [ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows // {"environment_type": "conda", "six": null}, // don't run without six on conda // ], // // "include": [ // // additional env for python2.7 // {"python": "2.7", "numpy": "1.8"}, // // additional env if run on windows+conda // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, // ], // The directory (relative to the current directory) that benchmarks are // stored in. If not provided, defaults to "benchmarks" "benchmark_dir": "benchmarks", // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" "env_dir": ".asv/env", // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". "results_dir": ".asv/results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". "html_dir": ".asv/html", // The number of characters to retain in the commit hashes. // "hash_length": 8, // `asv` will cache wheels of the recent builds in each // environment, making them faster to install next time. This is // number of builds to keep, per environment. // "wheel_cache_size": 0 // The commits after which the regression search in `asv publish` // should start looking for regressions. Dictionary whose keys are // regexps matching to benchmark names, and values corresponding to // the commit (exclusive) after which to start looking for // regressions. The default is to start from the first commit // with results. If the commit is `null`, regression detection is // skipped for the matching benchmark. // // "regressions_first_commits": { // "some_benchmark": "352cdf", // Consider regressions only after this commit // "another_benchmark": null, // Skip regression detection altogether // } // The thresholds for relative change in results, after which `asv // publish` starts reporting regressions. Dictionary of the same // form as in ``regressions_first_commits``, with values // indicating the thresholds. If multiple entries match, the // maximum is taken. If no entry matches, the default is 5%. // // "regressions_thresholds": { // "some_benchmark": 0.01, // Threshold of 1% // "another_benchmark": 0.5, // Threshold of 50% // } } datalad-0.12.4/benchmarks/000077500000000000000000000000001363461734600153225ustar00rootroot00000000000000datalad-0.12.4/benchmarks/__init__.py000066400000000000000000000000011363461734600174220ustar00rootroot00000000000000 datalad-0.12.4/benchmarks/api.py000066400000000000000000000066421363461734600164550ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. 
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Benchmarks of the datalad.api functionality""" from os.path import join as opj try: from datalad.api import rev_save from datalad.api import rev_create except ImportError: # If it is a version without revolution - those will not be benchmarked pass from datalad.api import create from datalad.api import create_test_dataset from datalad.api import install from datalad.api import ls from datalad.api import remove from datalad.api import uninstall # # Following ones could be absent in older versions # try: from datalad.api import diff except ImportError: diff = None try: from datalad.api import status except ImportError: status = None # Some tracking example -- may be we should track # of datasets.datalad.org #import gc #def track_num_objects(): # return len(gc.get_objects()) #track_num_objects.unit = "objects" from .common import ( SampleSuperDatasetBenchmarks, SuprocBenchmarks, ) class testds(SuprocBenchmarks): """ Benchmarks to test on create_test_dataset how fast we could generate datasets """ def time_create_test_dataset1(self): self.remove_paths.extend( create_test_dataset(spec='1', seed=0) ) def time_create_test_dataset2x2(self): self.remove_paths.extend( create_test_dataset(spec='2/2', seed=0) ) class supers(SampleSuperDatasetBenchmarks): """ Benchmarks on common operations on collections of datasets using datalad API """ def time_installr(self): # somewhat duplicating setup but lazy to do different one for now assert install(self.ds.path + '_', source=self.ds.path, recursive=True) def time_createadd(self): assert self.ds.create('newsubds') def time_rev_createadd(self): assert self.ds.rev_create('newsubds') def time_rev_createadd_to_dataset(self): subds = rev_create(opj(self.ds.path, 'newsubds')) self.ds.rev_save(subds.path) def time_createadd_to_dataset(self): subds = create(opj(self.ds.path, 'newsubds')) self.ds.add(subds.path) def time_ls(self): ls(self.ds.path) def time_ls_recursive(self): ls(self.ds.path, recursive=True) def time_ls_recursive_long_all(self): ls(self.ds.path, recursive=True, long_=True, all_=True) def time_subdatasets(self): self.ds.subdatasets() def time_subdatasets_recursive(self): self.ds.subdatasets(recursive=True) def time_subdatasets_recursive_first(self): next(self.ds.subdatasets(recursive=True, return_type='generator')) def time_uninstall(self): for subm in self.ds.repo.get_submodules(): self.ds.uninstall(subm.path, recursive=True, check=False) def time_remove(self): remove(self.ds.path, recursive=True) def time_diff(self): diff(self.ds.path, revision="HEAD^") def time_diff_recursive(self): diff(self.ds.path, revision="HEAD^", recursive=True) # Status must be called with the dataset, unlike diff def time_status(self): self.ds.status() def time_status_recursive(self): self.ds.status(recursive=True)datalad-0.12.4/benchmarks/common.py000066400000000000000000000123561363461734600171730ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. 
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Helpers for benchmarks of DataLad""" import os import sys import tarfile import tempfile import timeit import os.path as op from glob import glob from datalad.utils import ( getpwd, get_tempfile_kwargs, rmtree, ) from datalad.api import ( Dataset, create_test_dataset, ) ############ # Monkey patches # Robust is_interactive. Should be not needed since 0.11.4 # https://github.com/datalad/datalad/pull/3268 def _is_stream_tty(stream): try: # TODO: check on windows if hasattr check would work correctly and # add value: return stream.isatty() except ValueError as exc: # Who knows why it is a ValueError, but let's try to be specific # If there is a problem with I/O - non-interactive, otherwise reraise if "I/O" in str(exc): return False raise def is_interactive(): """Return True if all in/outs are tty""" return all(_is_stream_tty(s) for s in (sys.stdin, sys.stdout, sys.stderr)) class SuprocBenchmarks(object): # manually set a number since otherwise takes way too long! # see https://github.com/spacetelescope/asv/issues/497 #number = 3 # although seems to work ok with a timer which accounts for subprocesses # custom timer so we account for subprocess times timer = timeit.default_timer _monkey_patched = False def __init__(self): if not self._monkey_patched: # monkey patch things if needed # ASV started to close one of the std streams since some point # which caused our is_interactive to fail. We need to provide # more robust version from datalad.support.external_versions import external_versions # comparing to 0.12.1 since the returned version is "loose" # so fails correctly identify rc as pre .0 if external_versions['datalad'] < '0.12.1': from datalad import utils from datalad.interface import ls utils.is_interactive = is_interactive ls.is_interactive = is_interactive SuprocBenchmarks._monkey_patched = True self.remove_paths = [] def _cleanup(self): if not self.remove_paths: return # Nothing TODO self.log("Cleaning up %d paths", len(self.remove_paths)) while self.remove_paths: path = self.remove_paths.pop() if op.lexists(path): rmtree(path) def teardown(self): self._cleanup() def __del__(self): # We will at least try try: self._cleanup() except: pass def log(self, msg, *args): """Consistent benchmarks logging""" print("BM: "+ str(msg % tuple(args))) class SampleSuperDatasetBenchmarks(SuprocBenchmarks): """ Setup a sample hierarchy of datasets to be used """ timeout = 3600 # need to assure that we are working in a different repository now # see https://github.com/datalad/datalad/issues/1512 # might not be sufficient due to side effects between tests and # thus getting into the same situation ds_count = 0 # Creating in CWD so things get removed when ASV is done # https://asv.readthedocs.io/en/stable/writing_benchmarks.html # that is where it would be run and cleaned up after dsname = 'testds1' tarfile = 'testds1.tar' def setup_cache(self): ds_path = create_test_dataset( self.dsname , spec='2/-2/-2' , seed=0 )[0] self.log("Setup cache ds path %s. CWD: %s", ds_path, getpwd()) # Will store into a tarfile since otherwise install -r is way too slow # to be invoked for every benchmark # Store full path since apparently setup is not ran in that directory self.tarfile = op.realpath(SampleSuperDatasetBenchmarks.tarfile) with tarfile.open(self.tarfile, "w") as tar: # F.CK -- Python tarfile can't later extract those because key dirs are # read-only. 
For now just a workaround - make it all writeable from datalad.utils import rotree rotree(self.dsname, ro=False, chmod_files=False) tar.add(self.dsname, recursive=True) rmtree(self.dsname) def setup(self): self.log("Setup ran in %s, existing paths: %s", getpwd(), glob('*')) tempdir = tempfile.mkdtemp( **get_tempfile_kwargs({}, prefix="bm") ) self.remove_paths.append(tempdir) with tarfile.open(self.tarfile) as tar: tar.extractall(tempdir) # TODO -- remove this abomination after https://github.com/datalad/datalad/issues/1512 is fixed epath = op.join(tempdir, 'testds1') epath_unique = epath + str(self.__class__.ds_count) os.rename(epath, epath_unique) self.__class__.ds_count += 1 self.ds = Dataset(epath_unique) self.repo = self.ds.repo self.log("Finished setup for %s", tempdir) datalad-0.12.4/benchmarks/core.py000066400000000000000000000107761363461734600166370ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Benchmarks for DataLad""" import os import sys import os.path as osp from os.path import join as opj import tarfile import timeit from time import time from subprocess import call from datalad.cmd import Runner from datalad.api import add from datalad.api import create from datalad.api import create_test_dataset from datalad.api import Dataset from datalad.api import install from datalad.api import ls from datalad.api import remove from datalad.api import uninstall from datalad.utils import rmtree from datalad.utils import getpwd # Some tracking example -- may be we should track # of datasets.datalad.org #import gc #def track_num_objects(): # return len(gc.get_objects()) #track_num_objects.unit = "objects" from .common import SuprocBenchmarks scripts_dir = osp.join(osp.dirname(__file__), 'scripts') heavyout_cmd = "{} 1000".format(osp.join(scripts_dir, 'heavyout')) class startup(SuprocBenchmarks): """ Benchmarks for datalad commands startup """ def setup(self): # we need to prepare/adjust PATH to point to installed datalad # We will base it on taking sys.executable python_path = osp.dirname(sys.executable) self.env = os.environ.copy() self.env['PATH'] = '%s:%s' % (python_path, self.env.get('PATH', '')) def time_help_np(self): call(["datalad", "--help-np"], env=self.env) def time_import(self): call([sys.executable, "-c", "import datalad"]) def time_import_api(self): call([sys.executable, "-c", "import datalad.api"]) class runner(SuprocBenchmarks): """Some rudimentary tests to see if there is no major slowdowns from Runner """ def setup(self): self.runner = Runner() # older versions might not have it try: from datalad.cmd import GitRunner self.git_runner = GitRunner() except ImportError: pass def time_echo(self): self.runner.run("echo") def time_echo_gitrunner(self): self.git_runner.run("echo") # Following "track" measures computing overhead comparing to the simplest # os.system call on the same command without carrying for in/out unit = "% overhead" def _get_overhead(self, cmd, nrepeats=3, **run_kwargs): """Estimate overhead over running command via the simplest os.system and to not care about any output """ # asv does not repeat tracking ones I think, so nrepeats overheads = [] for _ in range(nrepeats): t0 = time() os.system(cmd + " >/dev/null 2>&1") t1 = time() self.runner.run(cmd, **run_kwargs) t2 = time() 
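            # Overhead is the relative extra wall-clock time of Runner.run()
            # compared to the bare os.system() call timed just above, expressed
            # in percent (e.g. 25.0 means Runner.run() took 1.25x as long).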
overhead = 100 * ((t2 - t1) / (t1 - t0) - 1.0) # print("O :", t1 - t0, t2 - t0, overhead) overheads.append(overhead) overhead = round(sum(overheads) / len(overheads), 2) #overhead = round(min(overheads), 2) return overhead def track_overhead_echo(self): return self._get_overhead("echo") # 100ms chosen below as providing some sensible stability for me. # at 10ms -- too much variability def track_overhead_100ms(self): return self._get_overhead("sleep 0.1") def track_overhead_heavyout(self): # run busyloop for 100ms outputing as much as it could return self._get_overhead(heavyout_cmd) def track_overhead_heavyout_online_through(self): return self._get_overhead(heavyout_cmd, log_stderr='offline', # needed to would get stuck log_online=True) def track_overhead_heavyout_online_process(self): return self._get_overhead(heavyout_cmd, log_stdout=lambda s: '', log_stderr='offline', # needed to would get stuck log_online=True) # # Probably not really interesting, and good lord wobbles around 0 # def track_overhead_heavyout_offline(self): # return self._get_overhead(heavyout_cmd, # log_stdout='offline', # log_stderr='offline') # TODO: track the one with in/out, i.e. for those BatchedProcessesdatalad-0.12.4/benchmarks/repo.py000066400000000000000000000015161363461734600166440ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Benchmarks of the basic repos (Git/Annex) functionality""" from .common import ( SampleSuperDatasetBenchmarks, SuprocBenchmarks, ) # TODO: probably SampleSuperDatasetBenchmarks is not the best for these benchmarks # but we are yet to make it parametric so we could sweep through a set # of typical scenarios class gitrepo(SampleSuperDatasetBenchmarks): def time_get_content_info(self): info = self.repo.get_content_info() assert isinstance(info, dict) # just so we do not end up with a generator datalad-0.12.4/benchmarks/scripts/000077500000000000000000000000001363461734600170115ustar00rootroot00000000000000datalad-0.12.4/benchmarks/scripts/heavyout000077500000000000000000000003371363461734600206060ustar00rootroot00000000000000#!/usr/bin/env python from time import time import sys niter = int(sys.argv[1]) load = {i: "x" * i for i in range(10)} for i in range(niter): print("I am looping already for {} iterations. Load: {}".format(i, load)) datalad-0.12.4/benchmarks/support/000077500000000000000000000000001363461734600170365ustar00rootroot00000000000000datalad-0.12.4/benchmarks/support/__init__.py000066400000000000000000000000001363461734600211350ustar00rootroot00000000000000datalad-0.12.4/benchmarks/support/path.py000066400000000000000000000032631363461734600203500ustar00rootroot00000000000000# Import functions to be tested with _ suffix and name the suite after the # original function so we could easily benchmark it e.g. 
by # asv run --python=same -b get_parent_paths # without need to discover what benchmark to use etc from datalad.support.path import get_parent_paths as get_parent_paths_ from ..common import SuprocBenchmarks class get_parent_paths(SuprocBenchmarks): def setup(self): # prepare some more or less realistic with a good number of paths # and some hierarchy of submodules self.nfiles = 40 # per each construct self.nsubmod = 30 # at two levels self.toplevel_submods = ['submod%d' % i for i in range(self.nsubmod)] self.posixpaths = \ ['file%d' % i for i in range(self.nfiles)] + \ ['subdir/anotherfile%d' % i for i in range(self.nfiles)] for submod in range(self.nsubmod): self.posixpaths += \ ['submod%d/file%d' % (submod, i) for i in range(self.nfiles)] + \ ['subdir/submod%d/file%d' % (submod, i) for i in range(self.nfiles)] + \ ['submod/sub%d/file%d' % (submod, i) for i in range(self.nfiles)] def time_no_submods(self): assert get_parent_paths_(self.posixpaths, [], True) == [] def time_one_submod_toplevel(self): get_parent_paths_(self.posixpaths, ['submod9'], True) def time_one_submod_subdir(self): get_parent_paths_(self.posixpaths, ['subdir/submod9'], True) def time_allsubmods_toplevel_only(self): get_parent_paths_(self.posixpaths, self.toplevel_submods, True) def time_allsubmods_toplevel(self): get_parent_paths_(self.posixpaths, self.toplevel_submods) datalad-0.12.4/benchmarks/usecases.py000066400000000000000000000111301363461734600175030ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Benchmarks for some use cases, typically at datalad.api level""" import sys import tempfile from datalad.utils import get_tempfile_kwargs import os.path as osp from os.path import join as opj from datalad.api import create from datalad.utils import ( create_tree, rmtree, ) from .common import SuprocBenchmarks class study_forrest(SuprocBenchmarks): """ Benchmarks for Study Forrest use cases """ timeout = 180 # especially with profiling might take longer than default 60s def setup(self): self.path = tempfile.mkdtemp(**get_tempfile_kwargs({}, prefix='bm_forrest')) def teardown(self): if osp.exists(self.path): rmtree(self.path) def time_make_studyforrest_mockup(self): path = self.path # Carries a copy of the # datalad.tests.utils_testdatasets.py:make_studyforrest_mockup # as of 0.12.0rc2-76-g6ba6d53b # A copy is made so we do not reflect in the benchmark results changes # to that helper's code. 
This copy only tests on 2 not 3 analyses # subds public = create(opj(path, 'public'), description="umbrella dataset") # the following tries to capture the evolution of the project phase1 = public.create('phase1', description='old-style, no connection to RAW') structural = public.create('structural', description='anatomy') tnt = public.create('tnt', description='image templates') tnt.clone(source=phase1.path, path=opj('src', 'phase1'), reckless=True) tnt.clone(source=structural.path, path=opj('src', 'structural'), reckless=True) aligned = public.create('aligned', description='aligned image data') aligned.clone(source=phase1.path, path=opj('src', 'phase1'), reckless=True) aligned.clone(source=tnt.path, path=opj('src', 'tnt'), reckless=True) # new acquisition labet = create(opj(path, 'private', 'labet'), description="raw data ET") phase2_dicoms = create(opj(path, 'private', 'p2dicoms'), description="raw data P2MRI") phase2 = public.create('phase2', description='new-style, RAW connection') phase2.clone(source=labet.path, path=opj('src', 'labet'), reckless=True) phase2.clone(source=phase2_dicoms.path, path=opj('src', 'dicoms'), reckless=True) # add to derivatives tnt.clone(source=phase2.path, path=opj('src', 'phase2'), reckless=True) aligned.clone(source=phase2.path, path=opj('src', 'phase2'), reckless=True) # never to be published media files media = create(opj(path, 'private', 'media'), description="raw data ET") # assuming all annotations are in one dataset (in reality this is also # a superdatasets with about 10 subdatasets annot = public.create('annotations', description='stimulus annotation') annot.clone(source=media.path, path=opj('src', 'media'), reckless=True) # a few typical analysis datasets # (just doing 2, actual status quo is just shy of 10) # and also the real goal -> meta analysis metaanalysis = public.create('metaanalysis', description="analysis of analyses") for i in range(1, 2): ana = public.create('analysis{}'.format(i), description='analysis{}'.format(i)) ana.clone(source=annot.path, path=opj('src', 'annot'), reckless=True) ana.clone(source=aligned.path, path=opj('src', 'aligned'), reckless=True) ana.clone(source=tnt.path, path=opj('src', 'tnt'), reckless=True) # link to metaanalysis metaanalysis.clone(source=ana.path, path=opj('src', 'ana{}'.format(i)), reckless=True) # simulate change in an input (but not raw) dataset create_tree( aligned.path, {'modification{}.txt'.format(i): 'unique{}'.format(i)}) aligned.add('.') # finally aggregate data aggregate = public.create('aggregate', description='aggregate data') aggregate.clone(source=aligned.path, path=opj('src', 'aligned'), reckless=True) # the toplevel dataset is intentionally left dirty, to reflect the # most likely condition for the joint dataset to be in at any given # point in time datalad-0.12.4/datalad/000077500000000000000000000000001363461734600145775ustar00rootroot00000000000000datalad-0.12.4/datalad/__init__.py000066400000000000000000000210131363461734600167050ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. 
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """DataLad aims to expose (scientific) data available online as a unified data distribution with the convenience of git-annex repositories as a backend. Commands are exposed through both a command-line interface and a Python API. On the command line, run 'datalad --help' for a summary of the available commands. From an interactive Python session, import `datalad.api` and inspect its documentation with `help`. """ if not __debug__: raise RuntimeError('DataLad cannot run in "optimized" mode, i.e. python -O') # For reproducible demos/tests import os _seed = os.environ.get('DATALAD_SEED', None) if _seed: import random random.seed(_seed) import atexit # Colorama (for Windows terminal colors) must be imported before we use/bind # any sys.stdout try: # this will fix the rendering of ANSI escape sequences # for colored terminal output on windows # it will do nothing on any other platform, hence it # is safe to call unconditionally import colorama colorama.init() atexit.register(colorama.deinit) except ImportError as e: pass # Other imports are interspersed with lgr.debug to ease troubleshooting startup # delays etc. # If there is a bundled git, make sure GitPython uses it too: from datalad.cmd import GitRunner GitRunner._check_git_path() if GitRunner._GIT_PATH: import os os.environ['GIT_PYTHON_GIT_EXECUTABLE'] = \ os.path.join(GitRunner._GIT_PATH, 'git') from .config import ConfigManager cfg = ConfigManager() from .log import lgr from datalad.utils import get_encoding_info, get_envvars_info, getpwd # To analyze/initiate our decision making on what current directory to return getpwd() lgr.log(5, "Instantiating ssh manager") from .support.sshconnector import SSHManager ssh_manager = SSHManager() atexit.register(ssh_manager.close, allow_fail=False) atexit.register(lgr.log, 5, "Exiting") from .version import __version__ def test(module='datalad', verbose=False, nocapture=False, pdb=False, stop=False): """A helper to run datalad's tests. 
Requires nose """ argv = [] #module] # could make it 'smarter' but decided to be explicit so later we could # easily migrate to another runner without changing any API here if verbose: argv.append('-v') if nocapture: argv.append('-s') if pdb: argv.append('--pdb') if stop: argv.append('--stop') from datalad.support.third.nosetester import NoseTester tester = NoseTester(module) tester.package_name = module.split('.', 1)[0] tester.test(extra_argv=argv) test.__test__ = False # Following fixtures are necessary at the top level __init__ for fixtures which # would cover all **/tests and not just datalad/tests/ # To store settings which setup_package changes and teardown_package should return _test_states = { 'loglevel': None, 'DATALAD_LOG_LEVEL': None, 'HOME': None, } def setup_package(): import os from datalad import consts _test_states['HOME'] = os.environ.get('HOME', None) _test_states['DATASETS_TOPURL_ENV'] = os.environ.get('DATALAD_DATASETS_TOPURL', None) _test_states['DATASETS_TOPURL'] = consts.DATASETS_TOPURL os.environ['DATALAD_DATASETS_TOPURL'] = consts.DATASETS_TOPURL = 'http://datasets-tests.datalad.org/' # To overcome pybuild overriding HOME but us possibly wanting our # own HOME where we pre-setup git for testing (name, email) if 'GIT_HOME' in os.environ: os.environ['HOME'] = os.environ['GIT_HOME'] else: # we setup our own new HOME, the BEST and HUGE one from datalad.utils import make_tempfile from datalad.tests import _TEMP_PATHS_GENERATED # TODO: split into a function + context manager with make_tempfile(mkdir=True) as new_home: os.environ['HOME'] = new_home if not os.path.exists(new_home): os.makedirs(new_home) with open(os.path.join(new_home, '.gitconfig'), 'w') as f: f.write("""\ [user] name = DataLad Tester email = test@example.com """) _TEMP_PATHS_GENERATED.append(new_home) # To overcome pybuild by default defining http{,s}_proxy we would need # to define them to e.g. empty value so it wouldn't bother touching them. # But then haskell libraries do not digest empty value nicely, so we just # pop them out from the environment for ev in ('http_proxy', 'https_proxy'): if ev in os.environ and not (os.environ[ev]): lgr.debug("Removing %s from the environment since it is empty", ev) os.environ.pop(ev) # During tests we allow for "insecure" access to local file:// and # http://localhost URLs since all of them either generated as tests # fixtures or cloned from trusted sources from datalad.support.annexrepo import AnnexRepo AnnexRepo._ALLOW_LOCAL_URLS = True DATALAD_LOG_LEVEL = os.environ.get('DATALAD_LOG_LEVEL', None) if DATALAD_LOG_LEVEL is None: # very very silent. Tests introspecting logs should use # swallow_logs(new_level=...) _test_states['loglevel'] = lgr.getEffectiveLevel() lgr.setLevel(100) # And we should also set it within environ so underlying commands also stay silent _test_states['DATALAD_LOG_LEVEL'] = DATALAD_LOG_LEVEL os.environ['DATALAD_LOG_LEVEL'] = '100' else: # We are not overriding them, since explicitly were asked to have some log level _test_states['loglevel'] = None # Set to non-interactive UI from datalad.ui import ui _test_states['ui_backend'] = ui.backend # obtain() since that one consults for the default value ui.set_backend(cfg.obtain('datalad.tests.ui.backend')) # Monkey patch nose so it does not ERROR out whenever code asks for fileno # of the output. 
See https://github.com/nose-devs/nose/issues/6 from io import StringIO as OrigStringIO class StringIO(OrigStringIO): fileno = lambda self: 1 encoding = None from nose.ext import dtcompat from nose.plugins import capture, multiprocess, plugintest dtcompat.StringIO = StringIO capture.StringIO = StringIO multiprocess.StringIO = StringIO plugintest.StringIO = StringIO def teardown_package(): import os from datalad.tests.utils import rmtemp, OBSCURE_FILENAME lgr.debug("Printing versioning information collected so far") from datalad.support.external_versions import external_versions as ev print(ev.dumps(query=True)) try: print("Obscure filename: str=%s repr=%r" % (OBSCURE_FILENAME.encode('utf-8'), OBSCURE_FILENAME)) except UnicodeEncodeError as exc: from .dochelpers import exc_str print("Obscure filename failed to print: %s" % exc_str(exc)) def print_dict(d): return " ".join("%s=%r" % v for v in d.items()) print("Encodings: %s" % print_dict(get_encoding_info())) print("Environment: %s" % print_dict(get_envvars_info())) if os.environ.get('DATALAD_TESTS_NOTEARDOWN'): return from datalad.ui import ui from datalad import consts ui.set_backend(_test_states['ui_backend']) if _test_states['loglevel'] is not None: lgr.setLevel(_test_states['loglevel']) if _test_states['DATALAD_LOG_LEVEL'] is None: os.environ.pop('DATALAD_LOG_LEVEL') else: os.environ['DATALAD_LOG_LEVEL'] = _test_states['DATALAD_LOG_LEVEL'] from datalad.tests import _TEMP_PATHS_GENERATED if len(_TEMP_PATHS_GENERATED): msg = "Removing %d dirs/files: %s" % (len(_TEMP_PATHS_GENERATED), ', '.join(_TEMP_PATHS_GENERATED)) else: msg = "Nothing to remove" lgr.debug("Teardown tests. " + msg) for path in _TEMP_PATHS_GENERATED: rmtemp(path, ignore_errors=True) if _test_states['HOME'] is not None: os.environ['HOME'] = _test_states['HOME'] if _test_states['DATASETS_TOPURL_ENV']: os.environ['DATALAD_DATASETS_TOPURL'] = _test_states['DATASETS_TOPURL_ENV'] consts.DATASETS_TOPURL = _test_states['DATASETS_TOPURL'] from datalad.support.cookies import cookies_db cookies_db.close() from datalad.support.annexrepo import AnnexRepo AnnexRepo._ALLOW_LOCAL_URLS = False # stay safe! lgr.log(5, "Done importing main __init__") datalad-0.12.4/datalad/__main__.py000066400000000000000000000060741363461734600167000ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Helper to use datalad as a "runnable" module with -m datalad""" import sys from . import __version__ from .auto import AutomagicIO from .log import lgr def usage(outfile, executable=sys.argv[0]): if '__main__.py' in executable: # That was -m datalad way to launch executable = "%s -m datalad" % sys.executable outfile.write("""Usage: %s [OPTIONS] [ARGS] Purpose: To provide FUSE-like operation whenever necessary files (as accessed by open, h5py.File) are requested, they get fetched. Meta-options: --help Display this help then exit. --version Output version information then exit. """ % executable) def runctx(cmd, globals=None, locals=None): if globals is None: globals = {} if locals is None: locals = {} try: exec(cmd, globals, locals) finally: # good opportunity to avoid atexit I guess. 
pass for now pass def main(argv=None): import os import getopt if argv is None: argv = sys.argv try: opts, prog_argv = getopt.getopt(argv[1:], "", ["help", "version"]) # TODO: support options for whatever we would support ;) # probably needs to hook in somehow into commands/options available # under cmdline/ except getopt.error as msg: sys.stderr.write("%s: %s\n" % (sys.argv[0], msg)) sys.stderr.write("Try `%s --help' for more information\n" % sys.argv[0]) sys.exit(1) # and now we need to execute target script "manually" # Borrowing up on from trace.py for opt, val in opts: if opt == "--help": usage(sys.stdout, executable=argv[0]) sys.exit(0) if opt == "--version": sys.stdout.write("datalad %s\n" % __version__) sys.exit(0) sys.argv = prog_argv progname = prog_argv[0] sys.path[0] = os.path.split(progname)[0] try: with open(progname) as fp: code = compile(fp.read(), progname, 'exec') # try to emulate __main__ namespace as much as possible globs = { '__file__': progname, '__name__': '__main__', '__package__': None, '__cached__': None, } # Since used explicitly -- activate the beast aio = AutomagicIO(activate=True) lgr.info("Running code of %s", progname) runctx(code, globs, globs) # TODO: see if we could hide our presence from the final tracebacks if execution fails except IOError as err: lgr.error("Cannot run file %r because: %s" % (sys.argv[0], err)) sys.exit(1) except SystemExit: pass if __name__ == '__main__': main() datalad-0.12.4/datalad/api.py000066400000000000000000000067521363461734600157340ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Python DataLad API exposing user-oriented commands (also available via CLI)""" from datalad.coreapi import * def _command_summary(): # Import here to avoid polluting the datalad.api namespace. 
from collections import defaultdict from datalad.interface.base import alter_interface_docs_for_api from datalad.interface.base import get_api_name from datalad.interface.base import get_cmd_doc from datalad.interface.base import get_cmd_summaries from datalad.interface.base import get_interface_groups from datalad.interface.base import load_interface groups = get_interface_groups(include_plugins=True) grp_short_descriptions = defaultdict(list) for group, _, specs in sorted(groups, key=lambda x: x[1]): for spec in specs: intf = load_interface(spec) if intf is None: continue sdescr = getattr(intf, "short_description", None) or \ alter_interface_docs_for_api(get_cmd_doc(intf)).split("\n")[0] grp_short_descriptions[group].append( (get_api_name(spec), sdescr)) return "\n".join(get_cmd_summaries(grp_short_descriptions, groups)) __doc__ += "\n\n{}".format(_command_summary()) def _load_plugins(): from datalad.plugin import _get_plugins from datalad.plugin import _load_plugin import re camel = re.compile(r'([a-z])([A-Z])') for pname, props in _get_plugins(): pi = _load_plugin(props['file'], fail=False) if pi is None: continue globals()[camel.sub('\\1_\\2', pi.__name__).lower()] = pi.__call__ def _generate_extension_api(): """Auto detect all available extensions and generate an API from them """ from importlib import import_module from pkg_resources import iter_entry_points from .interface.base import get_api_name from datalad.dochelpers import exc_str import logging lgr = logging.getLogger('datalad.api') for entry_point in iter_entry_points('datalad.extensions'): try: lgr.debug( 'Loading entrypoint %s from datalad.extensions for API building', entry_point.name) grp_descr, interfaces = entry_point.load() lgr.debug( 'Loaded entrypoint %s from datalad.extensions', entry_point.name) except Exception as e: lgr.warning('Failed to load entrypoint %s: %s', entry_point.name, exc_str(e)) continue for intfspec in interfaces: # turn the interface spec into an instance mod = import_module(intfspec[0]) intf = getattr(mod, intfspec[1]) api_name = get_api_name(intfspec) if api_name in globals(): lgr.debug( 'Command %s from extension %s is replacing a previously loaded implementation', api_name, entry_point.name) globals()[api_name] = intf.__call__ _generate_extension_api() _load_plugins() # Be nice and clean up the namespace properly del _load_plugins del _generate_extension_api del _command_summary datalad-0.12.4/datalad/auto.py000066400000000000000000000313611363461734600161250ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Proxy basic file operations (e.g. 
open) to auto-obtain files upon I/O """ import sys # OPT delay import for expensive mock until used #from unittest.mock import patch import builtins import lzma import logging import io import os from os.path import dirname, lexists, realpath, sep as pathsep from os.path import exists from os.path import isabs from os.path import join as opj from git.exc import InvalidGitRepositoryError from .utils import getpwd from .dochelpers import exc_str from .support.annexrepo import AnnexRepo from .cmdline.helpers import get_repo_instance from .consts import DATALAD_DOTDIR # To be used for a quick detection of path being under .git/ _DOT_GIT_DIR = pathsep + '.git' + pathsep lgr = logging.getLogger("datalad.auto") h5py = None try: import h5py except ImportError: pass except Exception as exc: # could happen due to misbehaving handlers provided by git module # see https://github.com/gitpython-developers/GitPython/issues/600 # we could overload the handler by providing a blank one, but I do not # think it is worthwhile at this point. So let's just issue a warning lgr.warning( "Failed to import h5py, so no automagic handling for it atm: %s", exc_str(exc) ) # TODO: RF to reduce code duplication among cases, also RF tests for the same reason class _EarlyExit(Exception): """Helper to early escape try/except logic in wrapped open""" def __init__(self, msg, *args): self.msg = msg self.args = args class AutomagicIO(object): """Class to proxy commonly used API for accessing files so they get automatically fetched Currently supports builtin open() and h5py.File when those are read """ def __init__(self, autoget=True, activate=False, check_once=False): """ Parameters ---------- autoget activate check_once: bool, optional To speed things up and avoid unnecessary repeated checks, if True, paths considered for proxying and corresponding repositories are remembered, and are not subject to datalad checks on subsequent calls. This option is to be used if you do not expect new git repositories to not be created and files not to get dropped while operating under AutomagicIO supervision. """ self._active = False self._builtin_open = builtins.open self._io_open = io.open self._os_stat = os.stat self._builtin_exists = os.path.exists self._builtin_isfile = os.path.isfile if h5py: self._h5py_File = h5py.File else: self._h5py_File = None self._lzma_LZMAFile = lzma.LZMAFile self._autoget = autoget self._in_open = False self._log_online = True from unittest.mock import patch self._patch = patch self._paths_cache = set() if check_once else None self._repos_cache = {} if check_once else None if activate: self.activate() def __enter__(self): self.activate() return self def __exit__(self, exc_type, exc_value, traceback): self.deactivate() @property def autoget(self): return self._autoget @property def active(self): return self._active def _proxy_open_name_mode(self, origname, origfunc, *args, **kwargs): """Proxy for various "open" which have first argument name and 2nd - mode """ # wrap it all for resilience to errors -- proxying must do no harm! 
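        # Any reason to skip proxying is signalled via _EarlyExit, which is
        # caught below so that the stock open/stat call is still performed by
        # the final return statement; unexpected errors are only logged.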
try: if self._in_open: raise _EarlyExit("within open already") self._in_open = True # just in case someone kept alias/assignment # return stock open for the duration of handling so that # logging etc could workout correctly with self._patch(origname, origfunc): lgr.log(3, "Proxying open with %r %r", args, kwargs) # had to go with *args since in PY2 it is name, in PY3 file # deduce arguments if len(args) > 0: # name/file was provided file = args[0] else: filearg = "file" if filearg not in kwargs: # so the name was missing etc, just proxy into original open call and let it puke raise _EarlyExit("no name/file was given") file = kwargs.get(filearg) if isinstance(file, int): raise _EarlyExit("already a file descriptor") if self._paths_cache is not None: filefull = file if isabs(file) else os.path.abspath(file) if filefull in self._paths_cache: raise _EarlyExit("considered before") else: self._paths_cache.add(filefull) if _DOT_GIT_DIR in file: raise _EarlyExit("we ignore paths under .git/") mode = 'r' if len(args) > 1: mode = args[1] elif 'mode' in kwargs: mode = kwargs['mode'] if 'r' in mode: self._dataset_auto_get(file) else: raise _EarlyExit("mode=%r", mode) except _EarlyExit as e: lgr.log(2, " skipping since " + e.msg, *e.args, extra={'notraceback': True}) except Exception as e: # If anything goes wrong -- we should complain and proceed with self._patch(origname, origfunc): lgr.warning("Failed proxying open with %r, %r: %s", args, kwargs, exc_str(e)) finally: self._in_open = False # finally give it back to stock open return origfunc(*args, **kwargs) def _proxy_open(self, *args, **kwargs): return self._proxy_open_name_mode('builtins.open', self._builtin_open, *args, **kwargs) def _proxy_io_open(self, *args, **kwargs): return self._proxy_open_name_mode('io.open', self._io_open, *args, **kwargs) def _proxy_os_stat(self, *args, **kwargs): return self._proxy_open_name_mode('os.stat', self._os_stat, *args, **kwargs) def _proxy_h5py_File(self, *args, **kwargs): return self._proxy_open_name_mode('h5py.File', self._h5py_File, *args, **kwargs) def _proxy_lzma_LZMAFile(self, *args, **kwargs): return self._proxy_open_name_mode('lzma.LZMAFile', self._lzma_LZMAFile, *args, **kwargs) def _proxy_exists(self, path): # TODO: decide either it should may be retrieved right away. 
# For now, as long as it is a symlink pointing to under .git/annex if exists(path): return True return lexists(path) and 'annex/objects' in str(realpath(path)) def _proxy_isfile(self, path): return self._proxy_open_name_mode( 'os.path.isfile', self._builtin_isfile, path ) def _dataset_auto_get(self, filepath): """Verify that filepath is under annex, and if so and not present - get it""" if not self._autoget: return # if filepath is not there at all (program just "checked" if it could access it if not lexists(filepath): lgr.log(2, " skipping %s since it is not there", filepath) return # deduce directory for filepath filedir = dirname(filepath) annex = None if self._repos_cache is not None: filedir_parts = filedir.split(pathsep) # ATM we do not expect subdatasets under .datalad, so we could take the top # level dataset for that try: filedir = pathsep.join( filedir_parts[:filedir_parts.index(DATALAD_DOTDIR)] ) except ValueError: # would happen if no .datalad pass try: annex = self._repos_cache[filedir] except KeyError: pass if annex is None: try: # TODO: verify logic for create -- we shouldn't 'annexify' non-annexified # see https://github.com/datalad/datalad/issues/204 annex = get_repo_instance(filedir) lgr.log(2, "Got the repository %s id:%s containing %s", annex, id(annex), filedir) except (RuntimeError, InvalidGitRepositoryError) as e: # must be not under annex etc return if self._repos_cache is not None: self._repos_cache[filedir] = annex if not isinstance(annex, AnnexRepo): # not an annex -- can do nothing lgr.log(2, " skipping %s since the repo is not annex", filepath) return # since Git/AnnexRepo functionality treats relative paths relative to the # top of the repository and might be outside, get a full path if not isabs(filepath): filepath = opj(getpwd(), filepath) # "quick" check first if under annex at all try: # might fail. TODO: troubleshoot when it does e.g. # datalad/tests/test_auto.py:test_proxying_open_testrepobased under_annex = annex.is_under_annex(filepath, batch=True) except Exception as exc: # MIH: really? what if MemoryError lgr.log(5, " cannot determine if %s under annex: %s", filepath, exc_str(exc)) under_annex = None # either it has content if (under_annex or under_annex is None) and not annex.file_has_content(filepath): lgr.info("AutomagicIO: retrieving file content of %s", filepath) out = annex.get(filepath) if out and not out.get('success', False): # to assure that it is present and without trailing/leading new lines out['note'] = out.get('note', '').strip() lgr.error("Failed to retrieve %(file)s: %(note)s", out) def activate(self): # we should stay below info for this message. With PR #1630 we # start to use this functionality internally, and this will show # up frequently even in cases where it does nothing at all lgr.debug("Activating DataLad's AutoMagicIO") # Some beasts (e.g. tornado used by IPython) override outputs, and # provide fileno which throws exception. In such cases we should not log online self._log_online = hasattr(sys.stdout, 'fileno') and hasattr(sys.stderr, 'fileno') try: if self._log_online: sys.stdout.fileno() sys.stderr.fileno() except: # MIH: IOError? self._log_online = False if self.active: # this is not a warning, because there is nothing going # wrong or being undesired. Nested invokation could happen # caused by independent pieces of code, e.g. user code # that invokes our own metadata handling. lgr.debug("%s already active. 
No action taken" % self) return # overloads builtins.open = self._proxy_open io.open = self._proxy_io_open os.stat = self._proxy_os_stat os.path.exists = self._proxy_exists os.path.isfile = self._proxy_isfile if h5py: h5py.File = self._proxy_h5py_File lzma.LZMAFile = self._proxy_lzma_LZMAFile self._active = True def deactivate(self): # just debug level -- see activate() lgr.debug("Deactivating DataLad's AutoMagicIO") if not self.active: lgr.warning("%s is not active, can't deactivate" % self) return builtins.open = self._builtin_open io.open = self._io_open os.stat = self._os_stat if h5py: h5py.File = self._h5py_File lzma.LZMAFile = self._lzma_LZMAFile os.path.exists = self._builtin_exists os.path.isfile = self._builtin_isfile self._active = False def __del__(self): try: if self._active: self.deactivate() except: # MIH: IOError? pass try: super(self.__class__, self).__del__() except: pass datalad-0.12.4/datalad/cmd.py000066400000000000000000001010451363461734600157150ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ Wrapper for command and function calls, allowing for dry runs and output handling """ import time import subprocess import sys import logging import os import atexit import functools import tempfile from collections import OrderedDict from .support import path as op from .consts import GIT_SSH_COMMAND from .dochelpers import exc_str from .support.exceptions import CommandError from .support.protocol import ( NullProtocol, ExecutionTimeProtocol, ExecutionTimeExternalsProtocol, ) from .utils import ( on_windows, get_tempfile_kwargs, assure_unicode, assure_bytes, unlink, auto_repr, split_cmdline, ) from .dochelpers import borrowdoc lgr = logging.getLogger('datalad.cmd') # In python3 to split byte stream on newline, it must be bytes linesep_bytes = os.linesep.encode() _TEMP_std = sys.stdout, sys.stderr # To be used in the temp file name to distinguish the ones we create # in Runner so we take care about their removal, in contrast to those # which might be created outside and passed into Runner _MAGICAL_OUTPUT_MARKER = "_runneroutput_" from io import IOBase as file_class def _decide_to_log(v): """Hacky workaround for now so we could specify per each which to log online and which to the log""" if isinstance(v, bool) or callable(v): return v elif v in {'online'}: return True elif v in {'offline'}: return False else: raise ValueError("can be bool, callable, 'online' or 'offline'") def _get_output_stream(log_std, false_value): """Helper to prepare output stream for Popen and use file for 'offline' Necessary to avoid lockdowns when both stdout and stderr are pipes """ if log_std: if log_std == 'offline': # we will open a temporary file tf = tempfile.mktemp( **get_tempfile_kwargs({}, prefix=_MAGICAL_OUTPUT_MARKER) ) return open(tf, 'w') # XXX PY3 should be 'b' may be? 
else: return subprocess.PIPE else: return false_value def _cleanup_output(stream, std): if isinstance(stream, file_class) and \ _MAGICAL_OUTPUT_MARKER in getattr(stream, 'name', ''): if not stream.closed: stream.close() if op.exists(stream.name): unlink(stream.name) elif stream == subprocess.PIPE: std.close() class Runner(object): """Provides a wrapper for calling functions and commands. An object of this class provides a methods that calls shell commands or python functions, allowing for protocolling the calls and output handling. Outputs (stdout and stderr) can be either logged or streamed to system's stdout/stderr during execution. This can be enabled or disabled for both of them independently. Additionally, a protocol object can be a used with the Runner. Such a protocol has to implement datalad.support.protocol.ProtocolInterface, is able to record calls and allows for dry runs. """ __slots__ = ['commands', 'dry', 'cwd', 'env', 'protocol', '_log_opts'] def __init__(self, cwd=None, env=None, protocol=None, log_outputs=None): """ Parameters ---------- cwd: string, optional Base current working directory for commands. Could be overridden per run call via cwd option env: dict, optional Custom environment to use for calls. Could be overridden per run call via env option protocol: ProtocolInterface Protocol object to write to. log_outputs : bool, optional Switch to instruct whether outputs should be logged or not. If not set (default), config 'datalad.log.outputs' would be consulted """ self.cwd = cwd self.env = env if protocol is None: # TODO: config cmd.protocol = null protocol_str = os.environ.get('DATALAD_CMD_PROTOCOL', 'null') protocol = { 'externals-time': ExecutionTimeExternalsProtocol, 'time': ExecutionTimeProtocol, 'null': NullProtocol }[protocol_str]() if protocol_str != 'null': # we need to dump it into a file at the end # TODO: config cmd.protocol_prefix = protocol filename = '%s-%s.log' % ( os.environ.get('DATALAD_CMD_PROTOCOL_PREFIX', 'protocol'), id(self) ) atexit.register(functools.partial(protocol.write_to_file, filename)) self.protocol = protocol # Various options for logging self._log_opts = {} # we don't know yet whether we need to log every output or not if log_outputs is not None: self._log_opts['outputs'] = log_outputs def __call__(self, cmd, *args, **kwargs): """Convenience method This will call run() or call() depending on the kind of `cmd`. If `cmd` is a string it is interpreted as the to be executed command. Otherwise it is expected to be a callable. Any other argument is passed to the respective method. Parameters ---------- cmd: str or callable command string to be executed via shell or callable to be called. `*args`: `**kwargs`: see Runner.run() and Runner.call() for available arguments. Raises ------ TypeError if cmd is neither a string nor a callable. """ if isinstance(cmd, str) or isinstance(cmd, list): return self.run(cmd, *args, **kwargs) elif callable(cmd): return self.call(cmd, *args, **kwargs) else: raise TypeError("Argument 'command' is neither a string, " "nor a list nor a callable.") def _opt_env_adapter(v): """If value is a string, split by ,""" if v: if v.isdigit(): log_env = bool(int(v)) else: log_env = v.split(',') return log_env else: return False _LOG_OPTS_ADAPTERS = OrderedDict([ ('outputs', None), ('cwd', None), ('env', _opt_env_adapter), ('stdin', None), ]) def _get_log_setting(self, opt, default=False): try: return self._log_opts[opt] except KeyError: try: from . 
import cfg except ImportError: return default adapter = self._LOG_OPTS_ADAPTERS.get(opt, None) self._log_opts[opt] = \ (cfg.getbool if not adapter else cfg.get_value)( 'datalad.log', opt, default=default) if adapter: self._log_opts[opt] = adapter(self._log_opts[opt]) return self._log_opts[opt] @property def log_outputs(self): return self._get_log_setting('outputs') @property def log_cwd(self): return self._get_log_setting('cwd') @property def log_stdin(self): return self._get_log_setting('stdin') @property def log_env(self): return self._get_log_setting('env') # Two helpers to encapsulate formatting/output def _log_out(self, line): if line and self.log_outputs: self.log("stdout| " + line.rstrip('\n')) def _log_err(self, line, expected=False): if line and self.log_outputs: self.log("stderr| " + line.rstrip('\n'), level={True: 9, False: 11}[expected]) def _get_output_online(self, proc, log_stdout, log_stderr, outputstream, errstream, expect_stderr=False, expect_fail=False): """ If log_stdout or log_stderr are callables, they will be given a read line to be processed, and return processed result. So if they need to 'swallow' the line from being logged, should just return None Parameters ---------- proc log_stdout: bool or callable or 'online' or 'offline' log_stderr: : bool or callable or 'online' or 'offline' If any of those 'offline', we would call proc.communicate at the end to grab possibly outstanding output from it expect_stderr expect_fail Returns ------- """ stdout, stderr = bytes(), bytes() log_stdout_ = _decide_to_log(log_stdout) log_stderr_ = _decide_to_log(log_stderr) log_stdout_is_callable = callable(log_stdout_) log_stderr_is_callable = callable(log_stderr_) # arguments to be passed into _process_one_line stdout_args = ( 'stdout', proc, log_stdout_, log_stdout_is_callable ) stderr_args = ( 'stderr', proc, log_stderr_, log_stderr_is_callable, expect_stderr or expect_fail ) while proc.poll() is None: # see for a possibly useful approach to processing output # in another thread http://codereview.stackexchange.com/a/17959 # current problem is that if there is no output on stderr # it stalls # Monitor if anything was output and if nothing, sleep a bit stdout_, stderr_ = None, None if log_stdout_: stdout_ = self._process_one_line(*stdout_args) stdout += stdout_ if log_stderr_: stderr_ = self._process_one_line(*stderr_args) stderr += stderr_ if stdout_ is None and stderr_ is None: # no output was really produced, so sleep a tiny bit time.sleep(0.001) # Handle possible remaining output if log_stdout_ and log_stderr_: # If Popen was called with more than two pipes, calling # communicate() after we partially read the stream will return # empty output. stdout += self._process_remaining_output( outputstream, proc.stdout.read(), *stdout_args) stderr += self._process_remaining_output( errstream, proc.stderr.read(), *stderr_args) stdout_, stderr_ = proc.communicate() # ??? should we condition it on log_stdout in {'offline'} ??? 
stdout += self._process_remaining_output(outputstream, stdout_, *stdout_args) stderr += self._process_remaining_output(errstream, stderr_, *stderr_args) return stdout, stderr def _process_remaining_output(self, stream, out_, *pargs): """Helper to process output which might have been obtained from popen or should be loaded from file""" out = bytes() if isinstance(stream, file_class) and \ _MAGICAL_OUTPUT_MARKER in getattr(stream, 'name', ''): assert out_ is None, "should have gone into a file" if not stream.closed: stream.close() with open(stream.name, 'rb') as f: for line in f: out += self._process_one_line(*pargs, line=line) else: if out_: # resolving a once in a while failing test #2185 if isinstance(out_, str): out_ = out_.encode('utf-8') for line in out_.split(linesep_bytes): out += self._process_one_line( *pargs, line=line, suf=linesep_bytes) return out def _process_one_line(self, out_type, proc, log_, log_is_callable, expected=False, line=None, suf=None): if line is None: lgr.log(3, "Reading line from %s", out_type) line = {'stdout': proc.stdout, 'stderr': proc.stderr}[out_type].readline() else: lgr.log(3, "Processing provided line") if line and log_is_callable: # Let it be processed line = log_(assure_unicode(line)) if line is not None: # we are working with binary type here line = assure_bytes(line) if line: if out_type == 'stdout': self._log_out(assure_unicode(line)) elif out_type == 'stderr': self._log_err(line.decode('utf-8'), expected) else: # pragma: no cover raise RuntimeError("must not get here") return (line + suf) if suf else line # it was output already directly but for code to work, return "" return bytes() def run(self, cmd, log_stdout=True, log_stderr=True, log_online=False, expect_stderr=False, expect_fail=False, cwd=None, env=None, shell=None, stdin=None): """Runs the command `cmd` using shell. In case of dry-mode `cmd` is just added to `commands` and it is actually executed otherwise. Allows for separately logging stdout and stderr or streaming it to system's stdout or stderr respectively. Note: Using a string as `cmd` and shell=True allows for piping, multiple commands, etc., but that implies split_cmdline() is not used. This is considered to be a security hazard. So be careful with input. Parameters ---------- cmd : str, list String (or list) defining the command call. No shell is used if cmd is specified as a list log_stdout: bool, optional If True, stdout is logged. Goes to sys.stdout otherwise. log_stderr: bool, optional If True, stderr is logged. Goes to sys.stderr otherwise. log_online: bool, optional Whether to log as output comes in. Setting to True is preferable for running user-invoked actions to provide timely output expect_stderr: bool, optional Normally, having stderr output is a signal of a problem and thus it gets logged at level 11. But some utilities, e.g. wget, use stderr for their progress output. Whenever such output is expected, set it to True and output will be logged at level 9 unless exit status is non-0 (in non-online mode only, in online -- would log at 9) expect_fail: bool, optional Normally, if command exits with non-0 status, it is considered an error and logged at level 11 (above DEBUG). But if the call intended for checking routine, such messages are usually not needed, thus it will be logged at level 9. cwd : string, optional Directory under which run the command (passed to Popen) env : string, optional Custom environment to pass shell: bool, optional Run command in a shell. 
If not specified, then it runs in a shell only if command is specified as a string (not a list) stdin: file descriptor input stream to connect to stdin of the process. Returns ------- (stdout, stderr) - bytes! Raises ------ CommandError if command's exitcode wasn't 0 or None. exitcode is passed to CommandError's `code`-field. Command's stdout and stderr are stored in CommandError's `stdout` and `stderr` fields respectively. """ outputstream = _get_output_stream(log_stdout, sys.stdout) errstream = _get_output_stream(log_stderr, sys.stderr) popen_env = env or self.env popen_cwd = cwd or self.cwd if popen_cwd and popen_env and 'PWD' in popen_env: # we must have inherited PWD, but cwd was provided, so we must # adjust it popen_env = popen_env.copy() # to avoid side-effects popen_env['PWD'] = popen_cwd # TODO: if outputstream is sys.stdout and that one is set to StringIO # we have to "shim" it with something providing fileno(). # This happens when we do not swallow outputs, while allowing nosetest's # StringIO to be provided as stdout, crashing the Popen requiring # fileno(). In out swallow_outputs, we just use temporary files # to overcome this problem. # For now necessary test code should be wrapped into swallow_outputs cm # to avoid the problem log_msgs = ["Running: %s"] log_args = [cmd] if self.log_cwd: log_msgs += ['cwd=%r'] log_args += [popen_cwd] if self.log_stdin: log_msgs += ['stdin=%r'] log_args += [stdin] log_env = self.log_env if log_env and popen_env: log_msgs += ["env=%r"] log_args.append( popen_env if log_env is True else {k: popen_env[k] for k in log_env if k in popen_env} ) log_msg = '\n'.join(log_msgs) self.log(log_msg, *log_args) if self.protocol.do_execute_ext_commands: if shell is None: shell = isinstance(cmd, str) if self.protocol.records_ext_commands: prot_exc = None prot_id = self.protocol.start_section( split_cmdline(cmd) if isinstance(cmd, str) else cmd) try: proc = subprocess.Popen(cmd, stdout=outputstream, stderr=errstream, shell=shell, cwd=popen_cwd, env=popen_env, stdin=stdin) except Exception as e: prot_exc = e lgr.log(11, "Failed to start %r%r: %s" % (cmd, " under %r" % cwd if cwd else '', exc_str(e))) raise finally: if self.protocol.records_ext_commands: self.protocol.end_section(prot_id, prot_exc) try: if log_online: out = self._get_output_online(proc, log_stdout, log_stderr, outputstream, errstream, expect_stderr=expect_stderr, expect_fail=expect_fail) else: out = proc.communicate() # Decoding was delayed to this point def decode_if_not_None(x): return "" if x is None else bytes.decode(x) out = tuple(map(decode_if_not_None, out)) status = proc.poll() # needs to be done after we know status if not log_online: self._log_out(out[0]) if status not in [0, None]: self._log_err(out[1], expected=expect_fail) else: # as directed self._log_err(out[1], expected=expect_stderr) if status not in [0, None]: msg = "Failed to run %r%s. Exit code=%d.%s%s" \ % (cmd, " under %r" % (popen_cwd), status, "" if log_online else " out=%s" % out[0], "" if log_online else " err=%s" % out[1]) lgr.log(9 if expect_fail else 11, msg) raise CommandError(str(cmd), msg, status, out[0], out[1]) else: self.log("Finished running %r with status %s" % (cmd, status), level=8) except CommandError: # do not bother with reacting to "regular" CommandError # exceptions. 
Somehow if we also terminate here for them # some processes elsewhere might stall: # see https://github.com/datalad/datalad/pull/3794 raise except BaseException as exc: exc_info = sys.exc_info() # KeyboardInterrupt is subclass of BaseException lgr.debug("Terminating process for %s upon exception: %s", cmd, exc_str(exc)) try: # there are still possible (although unlikely) cases when # we fail to interrupt but we # should not crash if we fail to terminate the process proc.terminate() except BaseException as exc2: lgr.warning("Failed to terminate process for %s: %s", cmd, exc_str(exc2)) raise exc_info[1] finally: # Those streams are for us to close if we asked for a PIPE # TODO -- assure closing the files _cleanup_output(outputstream, proc.stdout) _cleanup_output(errstream, proc.stderr) else: if self.protocol.records_ext_commands: self.protocol.add_section(split_cmdline(cmd) if isinstance(cmd, str) else cmd, None) out = ("DRY", "DRY") return out def call(self, f, *args, **kwargs): """Helper to unify collection of logging all "dry" actions. Calls `f` if `Runner`-object is not in dry-mode. Adds `f` along with its arguments to `commands` otherwise. Parameters ---------- f: callable """ if self.protocol.do_execute_callables: if self.protocol.records_callables: prot_exc = None prot_id = self.protocol.start_section( [str(f), "args=%s" % str(args), "kwargs=%s" % str(kwargs)]) try: return f(*args, **kwargs) except Exception as e: prot_exc = e raise finally: if self.protocol.records_callables: self.protocol.end_section(prot_id, prot_exc) else: if self.protocol.records_callables: self.protocol.add_section( [str(f), "args=%s" % str(args), "kwargs=%s" % str(kwargs)], None) def log(self, msg, *args, **kwargs): """log helper Logs at level 9 by default and adds "Protocol:"-prefix in order to log the used protocol. """ level = kwargs.pop('level', 9) if isinstance(self.protocol, NullProtocol): lgr.log(level, msg, *args, **kwargs) else: if args: msg = msg % args lgr.log(level, "{%s} %s" % ( self.protocol.__class__.__name__, msg) ) class GitRunner(Runner): """ Runner to be used to run git and git annex commands Overloads the runner class to check & update GIT_DIR and GIT_WORK_TREE environment variables set to the absolute path if is defined and is relative path """ _GIT_PATH = None @borrowdoc(Runner) def __init__(self, *args, **kwargs): super(GitRunner, self).__init__(*args, **kwargs) self._check_git_path() @staticmethod def _check_git_path(): """If using bundled git-annex, we would like to use bundled with it git Thus we will store _GIT_PATH a path to git in the same directory as annex if found. If it is empty (but not None), we do nothing """ if GitRunner._GIT_PATH is None: from distutils.spawn import find_executable # with all the nesting of config and this runner, cannot use our # cfg here, so will resort to dark magic of environment options if (os.environ.get('DATALAD_USE_DEFAULT_GIT', '0').lower() in ('1', 'on', 'true', 'yes')): git_fpath = find_executable("git") if git_fpath: GitRunner._GIT_PATH = '' lgr.log(9, "Will use default git %s", git_fpath) return # we are done - there is a default git avail. # if not -- we will look for a bundled one GitRunner._GIT_PATH = GitRunner._get_bundled_path() lgr.log(9, "Will use git under %r (no adjustments to PATH if empty " "string)", GitRunner._GIT_PATH) assert(GitRunner._GIT_PATH is not None) # we made the decision! 
@staticmethod def _get_bundled_path(): from distutils.spawn import find_executable annex_fpath = find_executable("git-annex") if not annex_fpath: # not sure how to live further anyways! ;) alongside = False else: annex_path = op.dirname(op.realpath(annex_fpath)) alongside = op.lexists(op.join(annex_path, 'git')) return annex_path if alongside else '' @staticmethod def get_git_environ_adjusted(env=None): """ Replaces GIT_DIR and GIT_WORK_TREE with absolute paths if relative path and defined """ # if env set copy else get os environment git_env = env.copy() if env else os.environ.copy() if GitRunner._GIT_PATH: git_env['PATH'] = op.pathsep.join([GitRunner._GIT_PATH, git_env['PATH']]) \ if 'PATH' in git_env \ else GitRunner._GIT_PATH for varstring in ['GIT_DIR', 'GIT_WORK_TREE']: var = git_env.get(varstring) if var: # if env variable set if not op.isabs(var): # and it's a relative path git_env[varstring] = op.abspath(var) # to absolute path lgr.log(9, "Updated %s to %s", varstring, git_env[varstring]) if 'GIT_SSH_COMMAND' not in git_env: git_env['GIT_SSH_COMMAND'] = GIT_SSH_COMMAND git_env['GIT_SSH_VARIANT'] = 'ssh' return git_env def run(self, cmd, env=None, *args, **kwargs): out, err = super(GitRunner, self).run( cmd, env=self.get_git_environ_adjusted(env), *args, **kwargs) # All communication here will be returned as unicode # TODO: do that instead within the super's run! return assure_unicode(out), assure_unicode(err) def readline_rstripped(stdout): #return iter(stdout.readline, b'').next().rstrip() return stdout.readline().rstrip() class SafeDelCloseMixin(object): """A helper class to use where __del__ would call .close() which might fail if "too late in GC game" """ def __del__(self): try: self.close() except TypeError: if os.fdopen is None or lgr.debug is None: # if we are late in the game and things already gc'ed in py3, # it is Ok return raise @auto_repr class BatchedCommand(SafeDelCloseMixin): """Container for a process which would allow for persistent communication """ def __init__(self, cmd, path=None, output_proc=None): if not isinstance(cmd, list): cmd = [cmd] self.cmd = cmd self.path = path self.output_proc = output_proc if output_proc else readline_rstripped self._process = None self._stderr_out = None self._stderr_out_fname = None def _initialize(self): lgr.debug("Initiating a new process for %s" % repr(self)) lgr.log(5, "Command: %s" % self.cmd) # according to the internet wisdom there is no easy way with subprocess # while avoid deadlocks etc. We would need to start a thread/subprocess # to timeout etc # kwargs = dict(bufsize=1, universal_newlines=True) if PY3 else {} self._stderr_out, self._stderr_out_fname = tempfile.mkstemp() self._process = subprocess.Popen( self.cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=self._stderr_out, env=GitRunner.get_git_environ_adjusted(), cwd=self.path, bufsize=1, universal_newlines=True # **kwargs ) def _check_process(self, restart=False): """Check if the process was terminated and restart if restart Returns ------- bool True if process was alive. 
str stderr if any recorded if was terminated """ process = self._process ret = True ret_stderr = None if process and process.poll(): lgr.warning("Process %s was terminated with returncode %s" % (process, process.returncode)) ret_stderr = self.close(return_stderr=True) ret = False if self._process is None and restart: lgr.warning("Restarting the process due to previous failure") self._initialize() return ret, ret_stderr def __call__(self, cmds): """ Parameters ---------- cmds : str or tuple or list of (str or tuple) Returns ------- str or list Output received from process. list in case if cmds was a list """ input_multiple = isinstance(cmds, list) if not input_multiple: cmds = [cmds] output = [o for o in self.yield_(cmds)] return output if input_multiple else output[0] def yield_(self, cmds): """Same as __call__, but requires `cmds` to be an iterable and yields results for each item.""" for entry in cmds: if not isinstance(entry, str): entry = ' '.join(entry) yield self.proc1(entry) def proc1(self, arg): """Same as __call__, but only takes a single command argument and returns a single result. """ # TODO: add checks -- may be process died off and needs to be reinitiated if not self._process: self._initialize() entry = arg + '\n' lgr.log(5, "Sending %r to batched command %s" % (entry, self)) # apparently communicate is just a one time show # stdout, stderr = self._process.communicate(entry) # according to the internet wisdom there is no easy way with subprocess self._check_process(restart=True) process = self._process # _check_process might have restarted it process.stdin.write(entry) process.stdin.flush() lgr.log(5, "Done sending.") still_alive, stderr = self._check_process(restart=False) # TODO: we might want to handle still_alive, e.g. to allow for # a number of restarts/resends, but it should be per command # since for some we cannot just resend the same query. But if # it is just a "get"er - we could resend it few times # The default output_proc expects a single line output. # TODO: timeouts etc stdout = assure_unicode(self.output_proc(process.stdout)) \ if not process.stdout.closed else None if stderr: lgr.warning("Received output in stderr: %r", stderr) lgr.log(5, "Received output: %r" % stdout) return stdout def close(self, return_stderr=False): """Close communication and wait for process to terminate Returns ------- str stderr output if return_stderr and stderr file was there. 
None otherwise """ ret = None if self._stderr_out: # close possibly still open fd os.fdopen(self._stderr_out).close() self._stderr_out = None if self._process: process = self._process lgr.debug( "Closing stdin of %s and waiting process to finish", process) process.stdin.close() process.stdout.close() process.wait() self._process = None lgr.debug("Process %s has finished", process) if self._stderr_out_fname and os.path.exists(self._stderr_out_fname): if return_stderr: with open(self._stderr_out_fname, 'r') as f: ret = f.read() # remove the file where we kept dumping stderr unlink(self._stderr_out_fname) self._stderr_out_fname = None return ret datalad-0.12.4/datalad/cmdline/000077500000000000000000000000001363461734600162125ustar00rootroot00000000000000datalad-0.12.4/datalad/cmdline/__init__.py000066400000000000000000000006611363461734600203260ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ __docformat__ = 'restructuredtext' datalad-0.12.4/datalad/cmdline/common_args.py000066400000000000000000000042521363461734600210730ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ __docformat__ = 'restructuredtext' # argument spec template # = ( # , # {} #) from ..cmdline.helpers import HelpAction, LogLevelAction help = ( 'help', ('-h', '--help', '--help-np'), dict(nargs=0, action=HelpAction, help="""show this help message. --help-np forcefully disables the use of a pager for displaying the help message""") ) version = ( 'version', ('--version',), dict(action='version', help="show the program's version") ) _log_level_names = ['critical', 'error', 'warning', 'info', 'debug'] log_level = ( 'log-level', ('-l', '--log-level'), dict(action=LogLevelAction, choices=_log_level_names + [str(x) for x in range(1, 10)], metavar="LEVEL", default='warning', help="""set logging verbosity level. Choose among %s. Also you can specify an integer <10 to provide even more debugging information""" % ', '.join(_log_level_names)) ) pbs_runner = ( 'pbs-runner', ('--pbs-runner',), dict(choices=['condor'], default=None, help="""execute command by scheduling it via available PBS. For settings, config file will be consulted""") ) change_path = ( 'change-path', ('-C',), dict(action='append', dest='change_path', metavar='PATH', help="""run as if datalad was started in instead of the current working directory. When multiple -C options are given, each subsequent non-absolute -C is interpreted relative to the preceding -C . 
This option affects the interpretations of the path names in that they are made relative to the working directory caused by the -C option""") ) datalad-0.12.4/datalad/cmdline/helpers.py000066400000000000000000000233221363461734600202300ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ __docformat__ = 'restructuredtext' import argparse import os import re import sys import gzip from tempfile import NamedTemporaryFile from textwrap import wrap from ..cmd import Runner from ..log import is_interactive from ..utils import ( getpwd, unlink, ) from ..version import __version__ from ..dochelpers import exc_str from logging import getLogger lgr = getLogger('datalad.cmdline') class HelpAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): # Lets use the manpage on mature systems but only for subcommands -- # --help should behave similar to how git does it: # regular --help for "git" but man pages for specific commands. # It is important since we do discover all subcommands from entry # points at run time and thus any static manpage would like be out of # date if is_interactive() \ and option_string == '--help' \ and ' ' in parser.prog: # subcommand try: import subprocess # get the datalad manpage to use manfile = os.environ.get('MANPATH', '/usr/share/man') \ + '/man1/{0}.1.gz'.format(parser.prog.replace(' ', '-')) # extract version field from the manpage if not os.path.exists(manfile): raise IOError("manfile is not found") with gzip.open(manfile) as f: man_th = [line for line in f if line.startswith(b".TH")][0] man_version = man_th.split(b' ')[-1].strip(b" '\"\t\n").decode('utf-8') # don't show manpage if man_version not equal to current datalad_version if __version__ != man_version: raise ValueError subprocess.check_call( 'man %s 2> /dev/null' % manfile, shell=True) sys.exit(0) except (subprocess.CalledProcessError, IOError, OSError, IndexError, ValueError) as e: lgr.debug("Did not use manpage since %s", exc_str(e)) if option_string == '-h': usage = parser.format_usage() ucomps = re.match( r'(?P
<pre>.*){(?P<cmds>.*)}(?P<post>....*)',
                usage,
                re.DOTALL)
            if ucomps:
                ucomps = ucomps.groupdict()
                indent_level = len(ucomps['post']) - len(ucomps['post'].lstrip())
                usage = '{pre}{{{cmds}}}{post}'.format(
                    pre=ucomps['pre'],
                    cmds='\n'.join(wrap(
                        ', '.join(sorted(c.strip() for c in ucomps['cmds'].split(','))),
                        break_on_hyphens=False,
                        subsequent_indent=' ' * indent_level)),
                    post=ucomps['post'],
                )
            helpstr = "%s\n%s" % (
                usage,
                "Use '--help' to get more comprehensive information.")
        else:
            helpstr = parser.format_help()
        # better for help2man
        # for main command -- should be different sections. And since we are in
        # heavy output massaging mode...
        if "commands for dataset operations" in helpstr.lower():
            opt_args_str = '*Global options*'
            pos_args_str = '*Commands*'
            # tune up usage -- default one is way too heavy
            helpstr = re.sub(r'^[uU]sage: .*?\n\s*\n',
                             'Usage: datalad [global-opts] command [command-opts]\n\n',
                             helpstr,
                             flags=re.MULTILINE | re.DOTALL)
            # and altogether remove sections with long list of commands
            helpstr = re.sub(r'positional arguments:\s*\n\s*{.*}\n', '', helpstr)
        else:
            opt_args_str = "*Options*"
            pos_args_str = "*Arguments*"
        helpstr = re.sub(r'optional arguments:', opt_args_str, helpstr)
        helpstr = re.sub(r'positional arguments:', pos_args_str, helpstr)
        # convert all headings to have the first character uppercase
        headpat = re.compile(r'^([a-z])(.*):$',  re.MULTILINE)
        helpstr = re.subn(
            headpat,
            lambda match: r'{0}{1}:'.format(match.group(1).upper(),
                                            match.group(2)),
            helpstr)[0]
        # usage is on the same line
        helpstr = re.sub(r'^usage:', 'Usage:', helpstr)

        print(helpstr)
        sys.exit(0)
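# Illustrative sketch (not part of the original module): for a subcommand,
# HelpAction first looks for a matching manpage before falling back to
# argparse's own help rendering.  With parser.prog == 'datalad get' and no
# MANPATH override, the lookup above resolves to
#
#   /usr/share/man/man1/datalad-get.1.gz
#
# and the page is only shown if the version in its .TH line matches the
# installed datalad __version__; otherwise the (potentially stale) manpage is
# skipped and the generated help text is printed instead.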


class LogLevelAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        from ..log import LoggerHelper
        LoggerHelper().set_level(level=values)
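# Usage sketch (hypothetical invocations): the action fires as soon as argparse
# encounters the option, so the new verbosity applies to everything that
# follows on the command line, e.g.
#
#   datalad -l debug <command>      # symbolic level name
#   datalad --log-level 5           # numeric level below 10 for extra detail
#
# Both forms end up calling LoggerHelper().set_level(level=values).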


# MIH: Disabled. Non-functional, untested.
#class PBSAction(argparse.Action):
#    """Action to schedule actual command execution via PBS (e.g. Condor)"""
#    def __call__(self, parser, namespace, values, option_string=None):
#        pbs = values[0]
#        import pdb; pdb.set_trace()
#        i = 1


def parser_add_common_opt(parser, opt, names=None, **kwargs):
    from . import common_args
    opt_tmpl = getattr(common_args, opt)
    opt_kwargs = opt_tmpl[2].copy()
    opt_kwargs.update(kwargs)
    if names is None:
        parser.add_argument(*opt_tmpl[1], **opt_kwargs)
    else:
        parser.add_argument(*names, **opt_kwargs)
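# Illustrative sketch of how the option templates in cmdline/common_args.py are
# consumed (this mirrors the calls made in cmdline/main.py's setup_parser):
#
#   import argparse
#   parser = argparse.ArgumentParser(add_help=False)
#   parser_add_common_opt(parser, 'log_level')     # adds -l/--log-level
#   parser_add_common_opt(parser, 'version',
#                         version='datalad %s\n' % __version__)
#
# Keyword arguments given here are merged into (and override) the template's
# argparse keyword arguments before parser.add_argument() is called.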


def strip_arg_from_argv(args, value, opt_names):
    """Strip an originally listed option (with its value) from the list cmdline args
    """
    # Yarik doesn't know better
    if args is None:
        args = sys.argv
    # remove present pbs-runner option
    args_clean = []
    skip = 0
    for i, arg in enumerate(args):
        if skip:
            # we skip only one as instructed
            skip -= 1
            continue
        if not (arg in opt_names and i < len(args) - 1 and args[i + 1] == value):
            args_clean.append(arg)
        else:
            # we need to skip this one and next one
            skip = 1
    return args_clean
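# Usage sketch (argv values are hypothetical): dropping the --pbs-runner option
# together with its value before the remaining arguments are executed:
#
#   strip_arg_from_argv(
#       ['datalad', '--pbs-runner', 'condor', 'get', 'data.dat'],
#       'condor', ('--pbs-runner',))
#   # -> ['datalad', 'get', 'data.dat']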


def run_via_pbs(args, pbs):
    assert(pbs in ('condor',))  # for now

    # TODO: RF to support multiple backends, parameters, etc, for now -- just condor, no options
    f = NamedTemporaryFile('w', prefix='datalad-%s-' % pbs, suffix='.submit', delete=False)
    try:
        pwd = getpwd()
        logs = f.name.replace('.submit', '.log')
        exe = args[0]
        # TODO: we might need better way to join them, escaping spaces etc.  There must be a stock helper
        #exe_args = ' '.join(map(repr, args[1:])) if len(args) > 1 else ''
        exe_args = ' '.join(args[1:]) if len(args) > 1 else ''
        f.write("""\
Executable = %(exe)s
Initialdir = %(pwd)s
Output = %(logs)s
Error = %(logs)s
getenv = True

arguments = %(exe_args)s
queue
""" % locals())
        f.close()
        Runner().run(['condor_submit', f.name])
        lgr.info("Scheduled execution via %s.  Logs will be stored under %s" % (pbs, logs))
    finally:
        unlink(f.name)
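# Sketch of the submit file this helper writes for a call such as
# run_via_pbs(['datalad', 'get', 'data.dat'], 'condor') -- the concrete paths
# below are hypothetical, only the template above is authoritative:
#
#   Executable = datalad
#   Initialdir = /home/me/project
#   Output = /tmp/datalad-condor-XXXXXX.log
#   Error = /tmp/datalad-condor-XXXXXX.log
#   getenv = True
#
#   arguments = get data.dat
#   queue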


# TODO: useful also outside of cmdline, move to support/
from os import curdir


def get_repo_instance(path=curdir, class_=None):
    """Returns an instance of appropriate datalad repository for path.
    Check whether a certain path is inside a known type of repository and
    returns an instance representing it. May also check for a certain type
    instead of detecting the type of repository.

    Parameters
    ----------
    path: str
      path to check; default: current working directory
    class_: class
      if given, check whether path is inside a repository, that can be
      represented as an instance of the passed class.

    Raises
    ------
    RuntimeError, in case cwd is not inside a known repository.
    """

    from os.path import ismount, exists, normpath, isabs
    from git.exc import InvalidGitRepositoryError
    from ..utils import expandpath
    from ..support.gitrepo import GitRepo
    from ..support.annexrepo import AnnexRepo

    dir_ = expandpath(path)
    abspath_ = path if isabs(path) else dir_
    if class_ is not None:
        if class_ == AnnexRepo:
            type_ = "annex"
        elif class_ == GitRepo:
            type_ = "git"
        else:
            raise RuntimeError("Unknown class %s." % str(class_))

    while not ismount(dir_):  # TODO: always correct termination?
        if exists(opj(dir_, '.git')):
            # found git dir
            if class_ is None:
                # detect repo type:
                try:
                    return AnnexRepo(dir_, create=False)
                except RuntimeError as e:
                    pass
                try:
                    return GitRepo(dir_, create=False)
                except InvalidGitRepositoryError as e:
                    raise RuntimeError("No datalad repository found in %s" %
                                       abspath_)
            else:
                try:
                    return class_(dir_, create=False)
                except (RuntimeError, InvalidGitRepositoryError) as e:
                    raise RuntimeError("No %s repository found in %s." %
                                       (type_, abspath_))
        else:
            dir_ = normpath(opj(dir_, ".."))

    if class_ is not None:
        raise RuntimeError("No %s repository found in %s" % (type_, abspath_))
    else:
        raise RuntimeError("No datalad repository found in %s" % abspath_)


from appdirs import AppDirs
from os.path import join as opj

dirs = AppDirs("datalad", "datalad.org")
datalad-0.12.4/datalad/cmdline/main.py000066400000000000000000000607041363461734600175170ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
""""""
from datalad.ui.utils import get_console_width

__docformat__ = 'restructuredtext'

import logging
lgr = logging.getLogger('datalad.cmdline')

lgr.log(5, "Importing cmdline.main")

import argparse
from collections import defaultdict
import sys
import textwrap
import os


import datalad

from datalad.cmdline import helpers
from datalad.support.exceptions import InsufficientArgumentsError
from datalad.support.exceptions import IncompleteResultsError
from datalad.support.exceptions import CommandError
from .helpers import strip_arg_from_argv
from ..utils import (
    assure_unicode,
    chpwd,
    get_suggestions_msg,
    on_msys_tainted_paths,
    setup_exceptionhook,
)
from ..dochelpers import exc_str


def _license_info():
    return """\
Copyright (c) 2013-2019 DataLad developers

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""


class ArgumentParserDisableAbbrev(argparse.ArgumentParser):
    # Don't accept abbreviations for long options. With py3.5 and above, we
    # could just use allow_abbrev=False.
    #
    # Modified from the solution posted at
    # https://bugs.python.org/issue14910#msg204678
    def _get_option_tuples(self, option_string):
        chars = self.prefix_chars
        if option_string[0] in chars and option_string[1] in chars:
            # option_string is a long flag. Disable abbreviation.
            return []
        return super(ArgumentParserDisableAbbrev, self)._get_option_tuples(
            option_string)
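# Behavioural sketch (assumed example options): with a stock ArgumentParser an
# abbreviated long flag such as
#
#   datalad --output-form json ...
#
# would still be matched to --output-format.  Since _get_option_tuples() above
# returns no candidates for long flags, such abbreviations are rejected as
# unrecognized arguments and only exact long option names are accepted.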


# TODO:  OPT look into making setup_parser smarter to become faster
# Now it seems to take up to 200ms to do all the parser setup
# even though it might not be necessary to know about all the commands etc.
# I wondered if it could somehow decide on what commands to worry about etc
# by going through sys.args first
def setup_parser(
        cmdlineargs,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        return_subparsers=False,
        # prevent loading of extension entrypoints when --help is requested
        # this is enabled when building docs to avoid pollution of generated
        # manpages with extensions commands (that should appear in their own
        # docs, but not in the core datalad package docs)
        help_ignore_extensions=False):
    lgr.log(5, "Starting to setup_parser")
    # delay since it can be a heavy import
    from ..interface.base import dedent_docstring, get_interface_groups, \
        get_cmdline_command_name, alter_interface_docs_for_cmdline, \
        load_interface, get_cmd_doc, get_cmd_ex
    # setup cmdline args parser
    parts = {}
    # main parser
    parser = ArgumentParserDisableAbbrev(
        fromfile_prefix_chars=None,
        # usage="%(prog)s ...",
        description=dedent_docstring("""\
            Comprehensive data management solution

            DataLad provides a unified data distribution system built on Git
            and Git-annex. DataLad command line tools allow one to manipulate
            (obtain, create, update, publish, etc.) datasets and provide a
            comprehensive toolbox for the joint management of data and code.
            Compared to Git/annex it primarily extends their functionality to
            work transparently and simultaneously with multiple inter-related
            repositories."""),
        epilog='"Be happy!"',
        formatter_class=formatter_class,
        add_help=False)
    # common options
    helpers.parser_add_common_opt(parser, 'log_level')
    helpers.parser_add_common_opt(parser, 'pbs_runner')
    helpers.parser_add_common_opt(parser, 'change_path')
    helpers.parser_add_common_opt(
        parser,
        'version',
        version='datalad %s\n' % datalad.__version__)
    if __debug__:
        parser.add_argument(
            '--dbg', action='store_true', dest='common_debug',
            help="enter Python debugger when uncaught exception happens")
        parser.add_argument(
            '--idbg', action='store_true', dest='common_idebug',
            help="enter IPython debugger when uncaught exception happens")
    parser.add_argument(
        '-c', action='append', dest='cfg_overrides', metavar='KEY=VALUE',
        help="""configuration variable setting. Overrides any configuration
        read from a file, but is potentially overridden itself by configuration
        variables in the process environment.""")
    parser.add_argument(
        '-f', '--output-format', dest='common_output_format',
        default='default',
        type=assure_unicode,
        metavar="{default,json,json_pp,tailored,'