pax_global_header00006660000000000000000000000064146232162460014520gustar00rootroot0000000000000052 comment=9bbe486cec5ae83d2eae2c343a984df321ee3647 datalad-next-1.4.1/000077500000000000000000000000001462321624600140715ustar00rootroot00000000000000datalad-next-1.4.1/.all-contributorsrc000066400000000000000000000113111462321624600177170ustar00rootroot00000000000000{ "projectName": "datalad-next", "projectOwner": "datalad", "repoType": "github", "commitConvention": "angular", "contributors": [ { "login": "mih", "name": "Michael Hanke", "avatar_url": "https://avatars.githubusercontent.com/u/136479?v=4", "profile": "http://psychoinformatics.de/", "contributions": [ "bug", "code", "content", "design", "doc", "financial", "fundingFinding", "ideas", "infra", "maintenance", "mentoring", "platform", "projectManagement", "review", "talk", "test", "tool", "userTesting" ] }, { "login": "catetrai", "name": "catetrai", "avatar_url": "https://avatars.githubusercontent.com/u/18424941?v=4", "profile": "https://github.com/catetrai", "contributions": [ "code", "design", "doc", "ideas", "test" ] }, { "login": "effigies", "name": "Chris Markiewicz", "avatar_url": "https://avatars.githubusercontent.com/u/83442?v=4", "profile": "https://github.com/effigies", "contributions": [ "maintenance", "code" ] }, { "login": "mslw", "name": "Michał Szczepanik", "avatar_url": "https://avatars.githubusercontent.com/u/11985212?v=4", "profile": "https://github.com/mslw", "contributions": [ "bug", "code", "content", "doc", "example", "ideas", "infra", "maintenance", "review", "talk", "test", "tutorial", "userTesting" ] }, { "login": "jsheunis", "name": "Stephan Heunis", "avatar_url": "https://avatars.githubusercontent.com/u/10141237?v=4", "profile": "https://jsheunis.github.io/", "contributions": [ "bug", "code", "doc", "ideas", "maintenance", "talk", "userTesting" ] }, { "login": "bpoldrack", "name": "Benjamin Poldrack", "avatar_url": "https://avatars.githubusercontent.com/u/10498301?v=4", "profile": "https://github.com/bpoldrack", "contributions": [ "bug", "code" ] }, { "login": "yarikoptic", "name": "Yaroslav Halchenko", "avatar_url": "https://avatars.githubusercontent.com/u/39889?v=4", "profile": "https://github.com/yarikoptic", "contributions": [ "bug", "code", "infra", "maintenance", "tool" ] }, { "login": "christian-monch", "name": "Christian Mönch", "avatar_url": "https://avatars.githubusercontent.com/u/17925232?v=4", "profile": "https://github.com/christian-monch", "contributions": [ "code", "design", "doc", "ideas", "review", "test", "userTesting" ] }, { "login": "adswa", "name": "Adina Wagner", "avatar_url": "https://avatars.githubusercontent.com/u/29738718?v=4", "profile": "https://github.com/adswa", "contributions": [ "a11y", "bug", "code", "doc", "example", "maintenance", "projectManagement", "review", "talk", "test", "tutorial", "userTesting" ] }, { "login": "jwodder", "name": "John T. Wodder II", "avatar_url": "https://avatars.githubusercontent.com/u/98207?v=4", "profile": "https://github.com/jwodder", "contributions": [ "code", "infra", "test" ] } ] }datalad-next-1.4.1/.appveyor.yml000066400000000000000000000377151462321624600165540ustar00rootroot00000000000000# This CI setup provides a largely homogeneous configuration across all # major platforms (Windows, MacOS, and Linux). The aim of this test setup is # to create a "native" platform experience, using as few cross-platform # helper tools as possible. # # On Linux/Mac a virtualenv is used for testing. The effective virtual env # is available under ~/VENV. 
# # All workers support remote login. Login details are shown at the top of each # CI run log. # # - Linux/Mac workers (via SSH): # # - A permitted SSH key must be defined in an APPVEYOR_SSH_KEY environment # variable (via the appveyor project settings) # # - SSH login info is given in the form of: 'appveyor@67.225.164.xx -p 22xxx' # # - Login with: # # ssh -o StrictHostKeyChecking=no # # - to prevent the CI run from exiting, `touch` a file named `BLOCK` in the # user HOME directory (current directory directly after login). The session # will run until the file is removed (or 60 min have passed) # # - Windows workers (via RDP): # # - An RDP password should be defined in an APPVEYOR_RDP_PASSWORD environment # variable (via the appveyor project settings), or a random password is used # every time # # - RDP login info is given in the form of IP:PORT # # - Login with: # # xfreerdp /cert:ignore /dynamic-resolution /u:appveyor /p: /v: # # - to prevent the CI run from exiting, create a textfile named `BLOCK` on the # Desktop (a required .txt extension will be added automatically). The session # will run until the file is removed (or 60 min have passed) # # - in a terminal execute, for example, `C:\datalad_debug.bat 39` to set up the # environment to debug in a Python 3.8 session (should generally match the # respective CI run configuration). # do not make repository clone cheap: interfers with versioneer shallow_clone: false # turn of support for MS project build support (not needed) build: off environment: # unless indicated otherwise, we test datalad_next DTS: datalad_next # SSH testing is done via a side-loaded container that provides a POSIX/SSHable # server environment DATALAD_TESTS_DOCKER_SSHD_SECKEY_DOWNLOADURL: https://ci.appveyor.com/api/projects/mih/datalad-ci-docker-containers/artifacts/recipes/sshd/id_rsa?job=sshd DATALAD_TESTS_DOCKER_SSHD_DOWNLOADURL: https://ci.appveyor.com/api/projects/mih/datalad-ci-docker-containers/artifacts/sshd.dockerimg.gz?job=sshd DATALAD_TESTS_DOCKER_SSHD_CONTAINER_NAME: test-sshd # std SSH container runtime setup DATALAD_TESTS_SERVER_SSH_HOST: datalad-test-sshd DATALAD_TESTS_SERVER_SSH_PORT: 2222 DATALAD_TESTS_SERVER_SSH_LOGIN: sshuser DATALAD_TESTS_SERVER_SSH_SECKEY: /home/appveyor/.ssh/datalad_tests_id_rsa DATALAD_TESTS_SERVER_SSH_PATH: /usr/local/apache2/htdocs DATALAD_TESTS_SERVER_LOCALPATH: /home/appveyor/DLTMP/sshdroot # Do not use `image` as a matrix dimension, to have fine-grained control over # what tests run on which platform # The ID variable had no impact, but sorts first in the CI run overview # an intelligible name can help to locate a specific test run matrix: # List a CI run for each platform first, to have immediate access when there # is a need for debugging # Ubuntu core tests - job_name: test-linux APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 PY: 3.10 # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov # 'test_publish_target_url' relies on a strict `localhost` target # setup, we don't have that KEYWORDS: not test_publish_target_url DEPLOY_HTTPBIN_IMAGE: yes INSTALL_SYSPKGS: DATALAD_TESTS_SSH: 1 # same as 'test-linux', but TMPDIR is on a crippled filesystem, causing # most, if not all test datasets to be created on that filesystem - job_name: test-linux-crippled APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 PY: 3.10 # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot 
CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov DEPLOY_HTTPBIN_IMAGE: yes INSTALL_SYSPKGS: DATALAD_TESTS_SSH: 1 # Windows core tests - job_name: test-win # ~35 min APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 # Python version specification is non-standard on windows PY: 39-x64 INSTALL_GITANNEX: git-annex -m datalad/packages DATALAD_TESTS_SSH: 1 DATALAD_TESTS_SERVER_SSH_SECKEY: C:\DLTMP\datalad_tests_id_rsa DATALAD_TESTS_SERVER_LOCALPATH: C:\DLTMP\sshdroot # MacOS core tests - job_name: test-mac APPVEYOR_BUILD_WORKER_IMAGE: macos-monterey PY: 3.8 INSTALL_GITANNEX: git-annex DATALAD_LOCATIONS_SOCKETS: /Users/appveyor/DLTMP/sockets CODECOV_BINARY: https://uploader.codecov.io/latest/macos/codecov DATALAD_TESTS_SSH: 1 # no docker on Mac, we log into self # 'test_publish_target_url' relies git-annex being installed on the # target, but we only have that in a personal env KEYWORDS: not test_publish_target_url DATALAD_TESTS_SERVER_SSH_HOST: localhost DATALAD_TESTS_SERVER_SSH_PORT: 22 DATALAD_TESTS_SERVER_SSH_LOGIN: appveyor DATALAD_TESTS_SERVER_SSH_SECKEY: /Users/appveyor/.ssh/datalad_tests_id_rsa DATALAD_TESTS_SERVER_SSH_PATH: /Users/appveyor/DLTMP/riaroot DATALAD_TESTS_SERVER_LOCALPATH: /Users/appveyor/DLTMP/riaroot # run a subset of the core tests on the oldest supported Python version - job_name: datalad-core-1 DTS: > datalad.cli datalad.core # do not run tests that ensure behavior we intentionally changed # - test_gh1811: is included in next in an alternative implementation # - test_librarymode: assumes that CLI config overrides end up in the # session `datalad.cfg.overrides`, but -next changes that behavior # to have `.overrides` be uniformly limited to instance overrides KEYWORDS: not test_gh1811 and not test_librarymode APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 PY: 3.8 INSTALL_SYSPKGS: # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov - job_name: datalad-core-2 DTS: > datalad.customremotes datalad.dataset datalad.distributed datalad.downloaders datalad.interface # do not run tests that ensure behavior we intentionally changed # - test_gh1811: is included in next in an alternative implementation # - test_fake_gitlab: we have an updated variant in next # - test_dryrun: we have an updated variant in next; what is disabled is # the one in test_create_sibling_gitlab.py. 
However, there is one with # identical name in test_create_sibling_ghlike.py, now also disabled # because MIH does not know better KEYWORDS: not test_gh1811 and not test_fake_gitlab and not test_dryrun APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 PY: 3.8 INSTALL_SYSPKGS: # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov - job_name: datalad-core-3 DTS: > datalad.distribution KEYWORDS: not test_invalid_args APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 PY: 3.8 INSTALL_SYSPKGS: # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov - job_name: datalad-core-4 DTS: > datalad.local APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 PY: 3.8 INSTALL_SYSPKGS: # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov - job_name: datalad-core-5 DTS: > datalad.runner datalad.support datalad.tests datalad.ui APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 PY: 3.8 INSTALL_SYSPKGS: # datalad-annex git remote needs something after git-annex_8.20211x INSTALL_GITANNEX: git-annex -m snapshot CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov # only run the CI if there are code or tooling changes only_commits: files: - datalad_next/ - tools/ # tests need specific hostnames to be available # note, this is insufficient on MacOS, and needs to be reflected # in the SSH config too hosts: datalad-test-sshd: 127.0.0.1 # same, but for datalad-core implementations datalad-test: 127.0.0.1 # job-specific configurations for: # # POSIX TEST RUNS # - matrix: only: - job_name: test-linux - job_name: test-linux-crippled - job_name: test-mac - job_name: datalad-core-1 - job_name: datalad-core-2 - job_name: datalad-core-3 - job_name: datalad-core-4 - job_name: datalad-core-5 cache: # pip cache - /home/appveyor/.cache/pip -> .appveyor.yml # cache the docker image for httpbin. in 2023 it has not changed in # 4 years, not worth pulling each time # given the low change frequency we also do not invalidate the cache # but would do manually, if needed - /home/appveyor/cache/httpbin.dockerimg # init cannot use any components from the repo, because it runs prior to # cloning it init: # enable external SSH access to CI worker # needs APPVEYOR_SSH_KEY defined in project settings (or environment) - curl -sflL 'https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-ssh.sh' | bash -e - # Scratch space # we place the "unix" one into the user's HOME to avoid git-annex issues on MacOSX # gh-5291 - mkdir ~/DLTMP && export TMPDIR=~/DLTMP # establish a root directory for SSH tests - "mkdir ${DATALAD_TESTS_SERVER_LOCALPATH}" install: # verify that a PY variable is declared that identifies the desired Python version # for this run - "[ \"x$PY\" != x ]" # Missing system software - tools/appveyor/install-syspkgs $INSTALL_SYSPKGS # If a particular Python version is requested, use env setup (using the # appveyor provided environments/installation). Note, these are broken # on the ubuntu images # https://help.appveyor.com/discussions/problems/28217-appveyor-ubunu-image-with-python3-lzma-module # Otherwise create a virtualenv using the default Python 3, to enable uniform # use of python/pip executables below - "[ \"x$PY\" != x ] && . ${HOME}/venv${PY}/bin/activate || virtualenv -p 3 ${HOME}/dlvenv && . 
${HOME}/dlvenv/bin/activate; ln -s \"$VIRTUAL_ENV\" \"${HOME}/VENV\"" - tools/appveyor/install-git-annex ${INSTALL_GITANNEX} # enable the git-annex provisioned by the installer - "[ -f ${HOME}/dlinstaller_env.sh ] && . ${HOME}/dlinstaller_env.sh || true" # HTTPBIN - "[ -n \"$DEPLOY_HTTPBIN_IMAGE\" ] && tools/appveyor/docker-load-httpbin || true" before_test: - "[ \"$DATALAD_TESTS_SSH\" = 1 ] && tools/appveyor/setup-sshd || true" - "[ \"$DATALAD_TESTS_SSH\" = 1 ] && tools/appveyor/verify-ssh-access || true" test_script: # store original TMPDIR setting to limit modification to test execution - export PREV_TMPDIR=$TMPDIR # make TMPDIR a "crippled filesystem" to test wrong assumptions of POSIX-ness # on POSIX OSes. The test fixtures will create all test datasets under TMPDIR - | set -e if [ "$APPVEYOR_JOB_NAME" = "test-linux-crippled" ]; then # 750 MB VFAT FS in a box sudo dd if=/dev/zero of=/crippledfs.img count=750 bs=1M sudo mkfs.vfat /crippledfs.img sudo mkdir /crippledfs sudo mount -o "uid=$(id -u),gid=$(id -g)" /crippledfs.img /crippledfs echo "== mount >>" mount | grep crippled echo "<< mount ==" export TMPDIR=/crippledfs fi - echo TMPDIR=$TMPDIR # run tests on installed module, not source tree files - mkdir __testhome__ # run tests on installed module, not source tree files - | cd __testhome__ python -m pytest -s -v --durations 20 -m "not (turtle)" -k "$KEYWORDS" --cov=datalad_next --cov datalad --cov-config=../.coveragerc --pyargs ${DTS} after_test: - python -m coverage xml - "curl -Os $CODECOV_BINARY" - chmod +x codecov - ./codecov on_finish: # conditionally block the exit of a CI run for direct debugging - while [ -f ~/BLOCK ]; do sleep 5; done # # WINDOWS TEST RUNS # - matrix: only: - job_name: test-win cache: # pip cache - C:\Users\appveyor\AppData\Local\pip\Cache -> .appveyor.yml # init cannot use any components from the repo, because it runs prior to # cloning it init: # remove windows 260-char limit on path names - ps: Set-Itemproperty -path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name LongPathsEnabled -value 1 # enable developer mode on windows # this should enable mklink without admin privileges, but it doesn't seem to work #- ps: tools\ci\appveyor_enable_windevmode.ps1 # enable RDP access on windows (RDP password is in appveyor project config) # this is relatively expensive (1-2min), but very convenient to jump into any build at any time - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) # Scratch space - cmd: md C:\DLTMP # and use that scratch space to get short paths in test repos # (avoiding length-limits as much as possible) - cmd: "set TMP=C:\\DLTMP" - cmd: "set TEMP=C:\\DLTMP" # establish a root directory for SSH tests - cmd: "md %DATALAD_TESTS_SERVER_LOCALPATH%" install: # place a debug setup helper at a convenient location - cmd: copy tools\appveyor\env_setup.bat C:\\datalad_debug.bat - cmd: "set PATH=C:\\Python%PY%;C:\\Python%PY%\\Scripts;%PATH%" # deploy the datalad installer, override version via DATALAD_INSTALLER_VERSION - cmd: IF DEFINED DATALAD_INSTALLER_VERSION ( python -m pip install "datalad-installer%DATALAD_INSTALLER_VERSION%" ) ELSE ( python -m pip install datalad-installer ) # Install git-annex on windows, otherwise INSTALL_SYSPKGS can be used # deploy git-annex, if desired - cmd: IF DEFINED INSTALL_GITANNEX datalad-installer --sudo ok %INSTALL_GITANNEX% before_test: - tools\appveyor\setup-sshd - tools\appveyor\verify-ssh-access test_script: # run tests on 
installed module, not source tree files - cmd: md __testhome__ - cmd: cd __testhome__ # run test selecion (--traverse-namespace needed from Python 3.8 onwards) - cmd: python -m pytest -s -v --durations 20 -m "not (turtle)" -k "%KEYWORDS%" --cov=datalad_next --cov-config=..\.coveragerc --pyargs %DTS% after_test: - cmd: python -m coverage xml - cmd: curl -fsSL -o codecov.exe "https://uploader.codecov.io/latest/windows/codecov.exe" - cmd: .\codecov.exe -f "coverage.xml" on_finish: # conditionally block the exit of a CI run for direct debugging - ps: while ((Test-Path "C:\Users\\appveyor\\Desktop\\BLOCK.txt")) { Start-Sleep 5 } # # ALL TEST RUNS # build_script: - python -m pip install -r requirements-devel.txt - python -m pip install . after_build: # Identity setup - git config --global user.email "test@appveyor.land" - git config --global user.name "Appveyor Almighty" # enable "next" extension for patching datalad core - git config --global datalad.extensions.load next # simple call to see if datalad and git-annex are installed properly - datalad wtf datalad-next-1.4.1/.changelog.md.j2000066400000000000000000000007511462321624600167350ustar00rootroot00000000000000{% for entry in tree %} ## {{ entry.version }}{% if entry.date %} ({{ entry.date }}){% endif %} {% for change_key, changes in entry.changes.items() %} {% if change_key %} ### {{ change_key }} {% endif %} {% for change in changes %} {% if change.scope %} - {{ change.scope }}: {{ change.message }} [{{ change.sha1 | truncate(8, true, '') }}] {% elif change.message %} - {{ change.message }} [{{ change.sha1 | truncate(8, true, '') }}] {% endif %} {% endfor %} {% endfor %} {% endfor %} datalad-next-1.4.1/.codeclimate.yml000066400000000000000000000004111462321624600171370ustar00rootroot00000000000000version: "2" checks: file-lines: config: threshold: 500 plugins: bandit: enabled: true checks: assert_used: enabled: false exclude_patterns: - "_datalad_buildsupport/" - "versioneer.py" - "*/_version.py" - "tools/" - "**/tests/" datalad-next-1.4.1/.codespell-exclude000066400000000000000000000000561462321624600174740ustar00rootroot00000000000000 froms=ds.repo.get_revisions()[1], datalad-next-1.4.1/.codespellrc000066400000000000000000000001171462321624600163700ustar00rootroot00000000000000[codespell] skip = .git,*.pdf,*.svg,venvs,versioneer.py # ignore-words-list = datalad-next-1.4.1/.coveragerc000066400000000000000000000004361462321624600162150ustar00rootroot00000000000000[run] parallel = True branch = True data_file = ${COVERAGE_ROOT-.}/.coverage omit = # versioneer */_version.py [paths] source = datalad_next/ */datalad_next/ [report] # show lines missing coverage in output show_missing = True exclude_also = raise NotImplementedError datalad-next-1.4.1/.gitattributes000066400000000000000000000000461462321624600167640ustar00rootroot00000000000000datalad_next/_version.py export-subst datalad-next-1.4.1/.github/000077500000000000000000000000001462321624600154315ustar00rootroot00000000000000datalad-next-1.4.1/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000012211462321624600212260ustar00rootroot00000000000000### PR checklist - [ ] If this PR is not complete, select "Create Draft Pull Request" in the pull request button's menu. Consider using a task list (e.g., `- [ ] add tests ...`) to indicate remaining to-do items. - [ ] Provide an overview of the changes you're making and explain why you're proposing them. 
Ideally, include them as a new file in `changelog.d/` - [ ] Include `Fixes #NNN` somewhere in the description if this PR addresses an existing issue. - [ ] If you would like to list yourself as a DataLad contributor and your name is not mentioned please modify .zenodo.json file. - [ ] **Delete these instructions**. :-) Thanks for contributing! datalad-next-1.4.1/.github/workflows/000077500000000000000000000000001462321624600174665ustar00rootroot00000000000000datalad-next-1.4.1/.github/workflows/codespell.yml000066400000000000000000000005711462321624600221660ustar00rootroot00000000000000--- name: Codespell on: push: branches: [main] pull_request: branches: [main] jobs: codespell: name: Check for spelling errors runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 - name: Codespell uses: codespell-project/actions-codespell@v1 with: exclude_file: .codespell-exclude datalad-next-1.4.1/.github/workflows/conventional-commits.yml000066400000000000000000000014341462321624600243630ustar00rootroot00000000000000name: Conventional commits on: pull_request jobs: check-messages: runs-on: ubuntu-latest steps: - name: Setup Python uses: actions/setup-python@v5 with: python-version: 3.11 architecture: x64 - name: Checkout uses: actions/checkout@v4 with: # we need all the history to be able to resolve revision ranges properly fetch-depth: 0 - name: Install commitizen run: python -m pip install commitizen - name: Run commit message checks run: | echo cz check --rev-range ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} cz check --rev-range ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} datalad-next-1.4.1/.github/workflows/docbuild.yml000066400000000000000000000011451462321624600217770ustar00rootroot00000000000000name: docs on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - name: Set up environment run: | git config --global user.email "test@github.land" git config --global user.name "GitHub Almighty" - uses: actions/checkout@v4 - name: Set up Python 3.10 uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements-devel.txt pip install . - name: Build docs run: | make -C docs html datalad-next-1.4.1/.github/workflows/mypy.yml000066400000000000000000000027631462321624600212170ustar00rootroot00000000000000name: Type annotation on: pull_request: paths: - 'datalad_next/**.py' - '!**/tests/**.py' jobs: static-type-check: runs-on: ubuntu-latest steps: - name: Setup Python uses: actions/setup-python@v5 with: python-version: 3.11 architecture: x64 - name: Checkout uses: actions/checkout@v4 - name: Install mypy run: python -m pip install mypy # you can pin your preferred version - name: Get Python changed files id: changed-py-files uses: tj-actions/changed-files@v44 with: files: | *.py **/*.py - name: Type check changed files if: steps.changed-py-files.outputs.any_changed == 'true' run: | # get any type stubs that mypy thinks it needs mypy --install-types --non-interactive --follow-imports skip --ignore-missing-imports ${{ steps.changed-py-files.outputs.all_changed_files }} # run mypy on the modified files only, and do not even follow imports. 
# this results is a fairly superficial test, but given the overall # state of annotations, we strive to become more correct incrementally # with focused error reports, rather than barfing a huge complaint # that is unrelated to the changeset someone has been working on mypy --follow-imports skip --ignore-missing-imports --pretty --show-error-context ${{ steps.changed-py-files.outputs.all_changed_files }} datalad-next-1.4.1/.github/workflows/update-contributors.yml000066400000000000000000000054161462321624600242340ustar00rootroot00000000000000name: allcontributors-auto-detect on: push: branches: - main jobs: Update: name: Generate runs-on: ubuntu-latest if: contains(github.repository, 'datalad/datalad-next') steps: - name: Checkout Repository uses: actions/checkout@v4 - name: Tributors Update uses: con/tributors@0.1.1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: # Single text list (space separated) of parsers, leave unset to auto-detect parsers: unset # Update lookup with GitHub metadata update_lookup: github # Skip these users (example) skip_users: # INFO, DEBUG, ERROR, WARNING, etc. log_level: DEBUG # If files already exist and an init is done, force overwrite force: true # the minimum number of contributions required to add a user threshold: 1 - name: Checkout New Branch env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} BRANCH_AGAINST: "main" run: | printf "GitHub Actor: ${GITHUB_ACTOR}\n" export BRANCH_FROM="contributors/update-$(date '+%Y-%m-%d')" git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" BRANCH_EXISTS=$(git ls-remote --heads origin ${BRANCH_FROM}) if [[ -z ${BRANCH_EXISTS} ]]; then printf "Branch does not exist in remote.\n" else printf "Branch already exists in remote.\n" exit 1 fi git branch git checkout -b "${BRANCH_FROM}" || git checkout "${BRANCH_FROM}" git branch git config --global user.name "github-actions" git config --global user.email "github-actions@users.noreply.github.com" git status if git diff-index --quiet HEAD --; then export OPEN_PULL_REQUEST=0 printf "No changes\n" else export OPEN_PULL_REQUEST=1 printf "Changes\n" git commit -a -m "Automated deployment to update contributors $(date '+%Y-%m-%d') [skip ci]" git push origin "${BRANCH_FROM}" fi echo "OPEN_PULL_REQUEST=${OPEN_PULL_REQUEST}" >> $GITHUB_ENV echo "PULL_REQUEST_FROM_BRANCH=${BRANCH_FROM}" >> $GITHUB_ENV echo "PULL_REQUEST_TITLE=[tributors] ${BRANCH_FROM}" >> $GITHUB_ENV echo "PULL_REQUEST_BODY='Tributors update automated pull request.\n\n[skip ci]'" >> $GITHUB_ENV - name: Open Pull Request uses: vsoch/pull-request-action@1.1.1 if: ${{ env.OPEN_PULL_REQUEST == '1' }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PULL_REQUEST_BRANCH: "main" datalad-next-1.4.1/.gitignore000066400000000000000000000001731462321624600160620ustar00rootroot00000000000000.pybuild/ .coverage /.tox *.egg-info *.py[coe] .#* .*.swp pip-wheel-metadata docs/build docs/source/generated build/ dist/ datalad-next-1.4.1/.noannex000066400000000000000000000000001462321624600155260ustar00rootroot00000000000000datalad-next-1.4.1/.zenodo.json000066400000000000000000000045321462321624600163440ustar00rootroot00000000000000{ "title": "DataLad-next extension", "creators": [ { "affiliation": "Dartmouth College, Hanover, NH, United States", "name": "Halchenko, Yaroslav O.", "orcid": "0000-0003-3456-2493" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany and Institute of Systems Neuroscience, Medical Faculty, 
Heinrich Heine University Düsseldorf, Düsseldorf, Germany", "name": "Hanke, Michael", "orcid": "0000-0001-6398-6370" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany", "name": "Heunis, Stephan", "orcid": "0000-0003-3503-9872" }, { "affiliation": "Stanford University, Stanford, CA, United States", "name": "Markiewicz, Christopher J.", "orcid": "0000-0002-6533-164X" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany", "name": "Mönch, Christian", "orcid": "0000-0002-3092-0612" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany", "name": "Poldrack, Benjamin", "orcid": "0000-0001-7628-0801" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany", "name": "Szczepanik, Michał", "orcid": "0000-0002-4028-2087" }, { "affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany", "name": "Wagner, Adina S.", "orcid": "0000-0003-2917-3450" }, { "name": "Wodder II, John T.", "orcid": "0000-0001-7106-2661" }, { "name": "Trainito, Caterina", "orcid": "0000-0002-1713-8343" } ], "keywords": [ "data management", "data distribution" ], "access_right": "open", "license": "MIT", "upload_type": "software" }datalad-next-1.4.1/CHANGELOG.md000066400000000000000000001635771462321624600157250ustar00rootroot00000000000000## 1.4.1 (2024-05-22) ### 🐛 Bug Fixes - dependencies: limit test patch import to test runs [905b99bd] ### 📝 Documentation - add note of Git >= v2.31 requirement for next-status [093575d8] - state conventional-commits requirement [a9180fc0] ### 🛡 Tests - fixture: add missing import (for non-WebDAV fallback) [ddd66799] # 1.4.0 (2024-05-17) ## 🐛 Bug Fixes - RIA over SSH access from Mac clients to Linux server was broken due to an inappropriate platform check that assumed that local and remote platform are identical. Fixes https://github.com/datalad/datalad/issues/7536 via https://github.com/datalad/datalad-next/pull/653 (by @mih) - `next-status` has received a number of fixes: - It no longer issues undesirable modification reports that are based on `mtime` changes alone (i.e., no content change). Fixes https://github.com/datalad/datalad-next/issues/639 via https://github.com/datalad/datalad-next/pull/650 (by @mih) - It now detects staged changes in repositories with no commit. Fixes https://github.com/datalad/datalad-next/issues/680 via https://github.com/datalad/datalad-next/pull/681 (by @mih) - `next-status -r mono` now reports on new commits in submodules. Previously this was ignored, leading to the impression of clean datasets despite unsaved changes. Fixes https://github.com/datalad/datalad-next/issues/645 via https://github.com/datalad/datalad-next/pull/679 (by @mih) - `iter_annexworktree()` can now also be used on plain Git repos, and would behave exactly as if reporting on non-annexed files in a git-annex repo. Previously, a cryptic `iterable did not yield matching item for route-in item, cardinality mismatch?` error was issued in this case. Fixes https://github.com/datalad/datalad-next/issues/670 via https://github.com/datalad/datalad-next/pull/673 (by @mih) ## 💫 Enhancements and new features - `datalad_next.shell` provides a context manager for (long-running) shell or interpreter subprocesses. 
  Within the context any number of commands can be executed in such a
  shell, and each command can process input (iterables), and yield
  output (iterables). This feature is suitable for running and
  controlling "remote shells" like a login shell on a server via SSH.
  A range of utilities is provided to employ this functionality for
  special-purpose implementations (e.g., accepting fixed-length or
  variable-length process output). A suite of operations, like
  downloading or uploading a file to a remote shell, is provided for
  POSIX-compliant shells in `datalad_next.shell.operations.posix`.
  https://github.com/datalad/datalad-next/pull/596 (by @christian-monch)

- A rewrite of `SSHRemoteIO`, the RIA SSH-operations implementation from
  datalad-core, is provided as a patch. It is based on the new `shell`
  feature, and provides more robust operations. Its I/O performance is at
  the same level as `scp`-based downloads and uploads. In contrast to the
  original implementation, it supports fine-grained progress reporting
  for uploads and downloads.
  Via https://github.com/datalad/datalad-next/pull/655 (by @mih)

- The `SpecialRemote` base class in datalad-core is patched to support
  a standard `close()` method for implementing resource release and
  cleanup operations. The main special remote entry point has been
  altered to run implementations within a `closing()` context manager
  to guarantee execution of such handlers.
  Via https://github.com/datalad/datalad-next/pull/655 (by @mih)

- A new `has_initialized_annex()` helper function is provided to test
  for a locally initialized annex in a repo.
  Via https://github.com/datalad/datalad-next/pull/673 (by @mih)

- `iter_annexworktree()` can now also be used on plain Git
  repositories, and it yields the same output and behavior as running
  on a git-annex repository with no annex'ed content (just tracked with
  Git).
  Fixes https://github.com/datalad/datalad-next/issues/670 via
  https://github.com/datalad/datalad-next/pull/673 (by @mih)

- `next-status` and `iter_gitstatus()` have been improved to report on
  further modifications made after a file addition was originally staged.
  Fixes https://github.com/datalad/datalad-next/issues/637 via
  https://github.com/datalad/datalad-next/pull/679 (by @mih)

- `next-status` result rendering has been updated to be more markedly
  different from git-status's. Coloring is now exclusively determined by
  the nature of a change, rather than being partially similar to
  git-status's index-updated annotation. This reduces the chance of
  misinterpretation, and does not create an undesirable focus on the
  Git index (which is largely ignored by DataLad).
  Fixes https://github.com/datalad/datalad-next/issues/640 via
  https://github.com/datalad/datalad-next/pull/679 (by @mih)

- A large 3k-line patch set replaces almost the entire RIA
  implementation, including the ORA special remote, and the
  `create-sibling-ria` command. The new implementation brings uniform
  support for Windows clients, progress reporting for uploads and
  downloads via SSH, and a faster and more robust behavior for SSH-based
  operations (based on the new remote shell feature).
  Fixes https://github.com/datalad/datalad-next/issues/654 via
  https://github.com/datalad/datalad-next/pull/669 (by @christian-monch)

## 📝 Documentation

- Git-related subprocess execution helpers are now accessible in the
  rendered documentation, and all supported file collections are now
  mentioned in the `ls-file-collection` command help.
  Fixes https://github.com/datalad/datalad-next/issues/668 via
  https://github.com/datalad/datalad-next/pull/671 (by @mih)

## 🛡 Tests

- Test setup has been improved to support a uniform, datalad-next
  enabled environment for subprocesses too. This extends the scope of
  testing to special remote implementations and other code that is
  executed in subprocesses, and relies on runtime patches.
  See https://github.com/datalad/datalad-next/pull/i665 (by @mih)

# 1.3.0 (2024-03-19)

## 💫 Enhancements and new features

- Code organization is adjusted to clearly indicate what is part of the
  package's public Python API. Anything that can be imported directly
  from the top-level of any sub-package is part of the public API.
  As an example: `from datalad_next.runners import iter_git_subproc`
  imports a part of the public API, but
  `from datalad_next.runners.git import iter_git_subproc` does not.
  See `README.md` for more information.
  Fixes https://github.com/datalad/datalad-next/issues/613 via
  https://github.com/datalad/datalad-next/pull/615 (by @mih)
  https://github.com/datalad/datalad-next/pull/617 (by @mih)
  https://github.com/datalad/datalad-next/pull/618 (by @mih)
  https://github.com/datalad/datalad-next/pull/619 (by @mih)
  https://github.com/datalad/datalad-next/pull/620 (by @mih)
  https://github.com/datalad/datalad-next/pull/621 (by @mih)
  https://github.com/datalad/datalad-next/pull/622 (by @mih)
  https://github.com/datalad/datalad-next/pull/623 (by @mih)

- New `patched_env` context manager for patching a process'
  environment. This avoids the need for importing `unittest` outside
  test implementations.
  Via https://github.com/datalad/datalad-next/pull/633 (by @mih)

- `call_git...()` functions received a new `force_c_locale` parameter.
  This can be set whenever Git output needs to be parsed, to force
  running the command with `LC_ALL=C`. Such an environment manipulation
  is off by default and not done unconditionally to let localized
  messaging through in a user's normal locale. A minimal sketch of this
  pattern is included at the end of this release's notes.

## 🐛 Bug Fixes

- `datalad-annex::` Git remote helper now tests for a repository
  deposit, and distinguishes an absent remote repository deposit vs
  cloning from an empty repository deposit. This rectifies confusing
  behavior (successful clones of empty repositories from broken URLs),
  but also fixes subdataset clone candidate handling in `get` (which
  failed to skip inaccessible `datalad-annex::` URLs for the same
  reason).
  Fixes https://github.com/datalad/datalad-next/issues/636 via
  https://github.com/datalad/datalad-next/pull/638 (by @mih)

## 📝 Documentation

- API docs have been updated to include all top-level symbols of any
  sub-package, or in other words: the public API.
  See https://github.com/datalad/datalad-next/pull/627 (by @mih)

## 🏠 Internal

- The `tree` command no longer uses the `subdatasets` command for
  queries, but employs the recently introduced `iter_submodules()` for
  leaner operations.
  See https://github.com/datalad/datalad-next/pull/628 (by @mih)

- `call_git...()` functions are established as the only used
  abstraction to interface with Git and git-annex commands outside the
  use in DataLad's `Repo` classes. Any usage of DataLad's traditional
  `Runner` functionality is discontinued.
  Fixes https://github.com/datalad/datalad-next/issues/541 via
  https://github.com/datalad/datalad-next/pull/632 (by @mih)

- Type annotations have been added to the implementation of the
  `uncurl` git-annex remote. A number of unhandled conditions have been
  discovered and were rectified.
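As referenced above, the following is a minimal, standard-library-only
sketch of the locale-forcing pattern behind the `force_c_locale`
parameter. The function shown here is an illustrative stand-in, not the
actual `datalad_next.runners` implementation.

```python
import os
import subprocess


def call_git_lines(args, *, force_c_locale=False):
    """Run a git command and return its stdout as a list of lines.

    When the output is meant to be parsed, ``force_c_locale=True`` runs
    the command with ``LC_ALL=C`` so localized messages cannot break the
    parsing. Otherwise the user's locale is left untouched.
    """
    env = None
    if force_c_locale:
        # copy the inherited environment and pin the message locale
        env = dict(os.environ, LC_ALL='C')
    res = subprocess.run(
        ['git', *args],
        capture_output=True, text=True, check=True, env=env,
    )
    return res.stdout.splitlines()


# example: locale-independent parsing of the porcelain status format
changes = call_git_lines(['status', '--porcelain'], force_c_locale=True)
```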
# 1.2.0 (2024-02-02) ## 🐛 Bug Fixes - Fix an invalid escape sequence in a regex that caused a syntax warning. Fixes https://github.com/datalad/datalad-next/issues/602 via https://github.com/datalad/datalad-next/pull/603 (by @mih) ## 💫 Enhancements and new features - Speed up of status reports for repositories with many submodules. An early presence check for submodules skips unnecessary evaluation steps. Fixes https://github.com/datalad/datalad-next/issues/606 via https://github.com/datalad/datalad-next/pull/607 (by @mih) ## 🏠 Internal - Fix implementation error in `ParamDictator` class that caused a test failure. The class itself is unused and has been scheduled for removal. See https://github.com/datalad/datalad-next/issues/611 and https://github.com/datalad/datalad-next/pull/610 (by @christian-monch) ## 🛡 Tests - Promote a previously internal fixture to provide a standard `modified_dataset` fixture. This fixture is sessions-scope, and yields a dataset with many facets of modification, suitable for testing change reporting. The fixture verifies that no modifications have been applied to the testbed. (by @mih) - `iterable_subprocess` tests have been robustified to better handle the observed diversity of execution environments. This addresseses, for example, https://bugs.debian.org/1061739. https://github.com/datalad/datalad-next/pull/614 (by @christian-monch) # 1.1.0 (2024-01-21) -- Iterate! ## 💫 Enhancements and new features - A new paradigm for subprocess execution is introduced. The main workhorse is `datalad_next.runners.iter_subproc`. This is a context manager that feeds input to subprocesses via iterables, and also exposes their output as an iterable. The implementation is based on https://github.com/uktrade/iterable-subprocess, and a copy of it is now included in the sources. It has been modified to work homogeneously on the Windows platform too. This new implementation is leaner and more performant. Benchmarks suggest that the execution of multi-step pipe connections of Git and git-annex commands is within 5% of the runtime of their direct shell-execution equivalent (outside Python). See https://github.com/datalad/datalad-next/pull/538 (by @mih), https://github.com/datalad/datalad-next/pull/547 (by @mih). With this change a number of additional features have been added, and internal improvements have been made. For example, any use of `ThreadedRunner` has been discontinued. See https://github.com/datalad/datalad-next/pull/539 (by @christian-monch), https://github.com/datalad/datalad-next/pull/545 (by @christian-monch), https://github.com/datalad/datalad-next/pull/550 (by @christian-monch), https://github.com/datalad/datalad-next/pull/573 (by @christian-monch) - A new `itertools` module was added. It provides implementations of iterators that can be used in conjunction with `iter_subproc` for standard tasks. This includes the itemization of output (e.g., line-by-line) across chunks of bytes read from a process (`itemize`), output decoding (`decode_bytes`), JSON-loading (`json_load`), and helpers to construct more complex data flows (`route_out`, `route_in`). - The `more_itertools` package has been added as a new dependency. It is used for `datalad-next` iterator implementations, but is also ideal for client code that employed this new functionality. - A new `iter_annexworktree()` provides the analog of `iter_gitworktree()` for git-annex repositories. - `iter_gitworktree()` has been reimplemented around `iter_subproc`. The performance is substantially improved. 
- `iter_gitworktree()` now also provides file pointers to symlinked content. Fixes https://github.com/datalad/datalad-next/issues/553 via https://github.com/datalad/datalad-next/pull/555 (by @mih) - `iter_gitworktree()` and `iter_annexworktree()` now support single directory (i.e., non-recursive) reporting too. See https://github.com/datalad/datalad-next/pull/552 - A new `iter_gittree()` that wraps `git ls-tree` for iterating over the content of a Git tree-ish. https://github.com/datalad/datalad-next/pull/580 (by @mih). - A new `iter_gitdiff()` wraps `git diff-tree|files` and provides a flexible basis for iteration over changesets. - `PathBasedItem`, a dataclass that is the bases for many item types yielded by iterators now more strictly separates `name` property from path semantics. The name is a plain string, and an additional, explicit `path` property provides it in the form of a `Path`. This simplifies code (the `_ZipFileDirPath` utility class became obsolete and was removed), and improve performance. Fixes https://github.com/datalad/datalad-next/issues/554 and https://github.com/datalad/datalad-next/issues/581 via https://github.com/datalad/datalad-next/pull/583 (by @mih) - A collection of helpers for running Git command has been added at `datalad_next.runners.git`. Direct uses of datalad-core runners, or `subprocess.run()` for this purpose have been replaced with call to these utilities. https://github.com/datalad/datalad-next/pull/585 (by @mih) - The performance of `iter_gitworktree()` has been improved by about 10%. Fixes https://github.com/datalad/datalad-next/issues/540 via https://github.com/datalad/datalad-next/pull/544 (by @mih). - New `EnsureHashAlgorithm` constraint to automatically expose and verify algorithm labels from `hashlib.algorithms_guaranteed` Fixes https://github.com/datalad/datalad-next/issues/346 via https://github.com/datalad/datalad-next/pull/492 (by @mslw @adswa) - The `archivist` remote now supports archive type detection from `*E`-type annex keys for `.tgz` archives too. Fixes https://github.com/datalad/datalad-next/issues/517 via https://github.com/datalad/datalad-next/pull/518 (by @mih) - `iter_zip()` uses a dedicated, internal `PurePath` variant to report on directories (`_ZipFileDirPath`). This enables more straightforward `item.name in zip_archive` tests, which require a trailing `/` for directory-type archive members. https://github.com/datalad/datalad-next/pull/430 (by @christian-monch) - A new `ZipArchiveOperations` class added support for ZIP files, and enables their use together with the `archivist` git-annex special remote. https://github.com/datalad/datalad-next/pull/578 (by @christian-monch) - `datalad ls-file-collection` has learned additional collections types: - The new `zipfile` collection type that enables uniform reporting on the additional archive type. - The new `annexworktree` collection that enhances the `gitworktree` collection by also reporting on annexed content, using the new `iter_annexworktree()` implementation. It is about 15% faster than a `datalad --annex basic --untracked no -e no -t eval`. - The new `gittree` collection for listing any Git tree-ish. - A new `iter_gitstatus()` can replace the functionality of `GitRepo.diffstatus()` with a substantially faster implementation. It also provides a novel `mono` recursion mode that completely hides the notion of submodules and presents deeply nested hierarchies of datasets as a single "monorepo". 
  https://github.com/datalad/datalad-next/pull/592 (by @mih)

- A new `next-status` command provides a substantially faster
  alternative to the datalad-core `status` command. It is closely
  aligned to `git status` semantics, only reports changes (not
  repository listings), and supports type change detection. Moreover,
  it exposes the "monorepo" recursion mode, and single-directory
  reporting options of `iter_gitstatus()`. It is the first command to
  use `dataclass` instances as result types, rather than the
  traditional dictionaries. Git v2.31 or later is required.

- `SshUrlOperations` now supports non-standard SSH ports, non-default
  user names, and custom identity file specifications.
  Fixed https://github.com/datalad/datalad-next/issues/571 via
  https://github.com/datalad/datalad-next/pull/570 (by @mih)

- A new `EnsureRemoteName` constraint improves the parameter validation
  of `create-sibling-webdav`. Moreover, the command has been uplifted
  to support uniform parameter validation also for the Python API.
  Missing required remotes, or naming conflicts are now detected and
  reported immediately before the actual command implementation runs.
  Fixes https://github.com/datalad/datalad-next/issues/193 via
  https://github.com/datalad/datalad-next/pull/577 (by @mih)

- `datalad_next.repo_utils` provides a collection of implementations
  for common operations on Git repositories. Unlike the datalad-core
  `Repo` classes, these implementations do not require a specific data
  structure or object type beyond a `Path`.

## 🐛 Bug Fixes

- Add patch to fix `update`'s target detection for adjusted mode
  datasets that can crash under some circumstances. See
  https://github.com/datalad/datalad/issues/7507, fixed via
  https://github.com/datalad/datalad-next/pull/509 (by @mih)

- Comparison with `is` and a literal was replaced with a proper
  construct. While having no functional impact, it removes an ugly
  `SyntaxWarning`.
  Fixed https://github.com/datalad/datalad-next/issues/526 via
  https://github.com/datalad/datalad-next/pull/527 (by @mih)

## 📝 Documentation

- The API documentation has been substantially extended. More already
  documented API components are now actually rendered, and more
  documentation has been written.

## 🏠 Internal

- Type annotations have been extended. The development workflows now
  inform about type annotation issues for each proposed change.

- Constants have been migrated to `datalad_next.consts`.
  https://github.com/datalad/datalad-next/pull/575 (by @mih)

## 🛡 Tests

- A new test verifies compatibility with HTTP servers that do not
  report download progress.
  https://github.com/datalad/datalad-next/pull/369 (by @christian-monch)

- The overall noise-level in the test battery output has been reduced
  substantially. INFO log messages are no longer shown, and command
  result rendering is largely suppressed. New test fixtures make it
  easier to maintain tidier output: `reduce_logging`,
  `no_result_rendering`. The contribution guide has been adjusted to
  encourage their use.

- Tests that require an unprivileged system account to run are now
  skipped when executed as root. This fixes an issue of the Debian
  package.
  https://github.com/datalad/datalad-next/pull/593 (by @adswa)

# 1.0.2 (2023-10-23) -- Debianize!

## 🏠 Internal

- The `www-authenticate` dependency is dropped. The functionality is
  replaced by a `requests`-based implementation of an alternative
  parser. This trims the dependency footprint and facilitates Debian
  packaging. The previous test cases are kept and further extended.
Fixes https://github.com/datalad/datalad-next/issues/493 via https://github.com/datalad/datalad-next/pull/495 (by @mih) ## 🛡 Tests - The test battery now honors the `DATALAD_TESTS_NONETWORK` environment variable and downgrades by skipping any tests that require external network access. (by @mih) # 1.0.1 (2023-10-18) ## 🐛 Bug Fixes - Fix f-string syntax in error message of the `uncurl` remote. https://github.com/datalad/datalad-next/pull/455 (by @christian-monch) - `FileSystemItem.from_path()` now honors its `link_target` parameter, and resolves a target for any symlink item conditional on this setting. Previously, a symlink target was always resolved. Fixes https://github.com/datalad/datalad-next/issues/462 via https://github.com/datalad/datalad-next/pull/464 (by @mih) - Update the vendor installation of versioneer to v0.29. This resolves an installation failure with Python 3.12 due to the removal of an ancient class. Fixes https://github.com/datalad/datalad-next/issues/475 via https://github.com/datalad/datalad-next/pull/483 (by @mih) - Bump dependency on Python to 3.8. This is presently the oldest version still supported upstream. However, some functionality already used 3.8 features, so this is also a bug fix. Fixes https://github.com/datalad/datalad-next/issues/481 via https://github.com/datalad/datalad-next/pull/486 (by @mih) ## 💫 Enhancements and new features - Patch datalad-core's `run` command to honor configuration defaults for substitutions. This enables placeholders like `{python}` that point to `sys.executable` by default, and need not be explicitly defined in system/user/dataset configuration. Fixes https://github.com/datalad/datalad-next/issues/478 via https://github.com/datalad/datalad-next/pull/485 (by @mih) ## 📝 Documentation - Include `gitworktree` among the available file collection types listed in `ls-file-collection`'s docstring. Fixes https://github.com/datalad/datalad-next/issues/470 via https://github.com/datalad/datalad-next/pull/471 (by @mslw) - The renderer API documentation now includes an entrypoint for the runner-related functionality and documentation at https://docs.datalad.org/projects/next/en/latest/generated/datalad_next.runners.html Fixes https://github.com/datalad/datalad-next/issues/466 via https://github.com/datalad/datalad-next/pull/467 (by @mih) ## 🛡 Tests - Simplified setup for subprocess test-coverage reporting. Standard pytest-cov features are not employed, rather than the previous approach that was adopted from datalad-core, which originated in a time when testing was performed via nose. Fixes https://github.com/datalad/datalad-next/issues/453 via https://github.com/datalad/datalad-next/pull/457 (by @mih) # 1.0.0 (2023-09-25) This release represents a milestone in the development of the extension. The package is reorganized to be a collection of more self-contained mini-packages, each with its own set of tests. Developer documentation and guidelines have been added to aid further development. One particular goal is to establish datalad-next as a proxy for importing datalad-core functionality for other extensions. Direct imports from datalad-core can be minimized in favor of imports from datalad-next. This helps identifying functionality needed outside the core package, and guides efforts for future improvements. The 1.0 release marks the switch to a more standard approach to semantic versioning. 
However, although substantial improvements have been made, the 1.0
version in no way indicates a slowdown of development or a change in the
likelihood of (breaking) changes. They will merely become more easily
discoverable from the version label alone.

Notable high-level features introduced by this major release are:

- The new `UrlOperations` framework provides a set of basic operations
  like `download`, `upload`, `stat` for different protocols. This
  framework can be thought of as a replacement for the "downloaders"
  functionality in datalad-core -- although the feature list is not 100%
  overlapping. This new framework is more easily extensible by 3rd-party
  code.

- The `Constraints` framework elevates parameter/input validation to the
  next level. In contrast to datalad-core, declarative input validation
  is no longer limited to the CLI. Instead, command parameters can now
  be validated regardless of the entrypoint through which a command is
  used. They can be validated individually, but also sets of parameters
  can be validated jointly to implement particular interaction checks.
  All parameter validations can now be performed exhaustively, to
  present a user with a complete list of validation errors, rather than
  the fail-on-first-error method implemented exclusively in
  datalad-core. Validation errors are now reported using a dedicated
  structured data type to aid their communication via non-console
  interfaces.

- The `Credentials` system has been further refined with more
  homogenized workflows and deeper integration into other subsystems.
  This release merely represents a snapshot of continued development
  towards a standardization of credential handling workflows.

- The annex remotes `uncurl` and `archivist` are replacements for the
  datalad-core implementations `datalad` and `datalad-archive`. They
  offer substantially improved configurability and leaner operation --
  built on the `UrlOperations` framework.

- A growing collection of iterators (see `iter_collections`) aims to
  provide fast (and more Pythonic) operations on common data structures
  (Git worktrees, directories, archives). They can be used as an
  alternative to the traditional `Repo` classes (`GitRepo`, `AnnexRepo`)
  from datalad-core.

- Analogous to `UrlOperations`, the `ArchiveOperations` framework aims
  to provide an abstraction for operations on different archive types
  (e.g., TAR). They represent an alternative to the traditional
  implementations of `ExtractedArchive` and `ArchivesCache` from
  datalad-core, and aim at leaner resource footprints.

- The collection of runtime patches for datalad-core has been further
  expanded. All patches are now individually documented, and applied
  using a set of standard helpers (see
  http://docs.datalad.org/projects/next/en/latest/patches.html).

For details, please see the changelogs of the 1.0.0 beta releases below.

## 💫 Enhancements and new features

- `TarArchiveOperations` is the first implementation of the
  `ArchiveOperations` abstraction, providing archive handlers with a set
  of standard operations:

  - `open` to get a file object for a particular archive member
  - `__contains__` to check for the presence of a particular archive
    member
  - `__iter__` to get an iterator for processing all archive members

  https://github.com/datalad/datalad-next/pull/415 (by @mih)

## 🐛 Bug Fixes

- Make `TarfileItem.name` be of type `PurePosixPath` to reflect the
  fact that a TAR archive can contain members with names that cannot be
  represented unmodified on a non-POSIX file system.
https://github.com/datalad/datalad-next/pull/422 (by @mih) An analog change is done for `ZipfileItem.name`. https://github.com/datalad/datalad-next/pull/409 (by @christian-monch) - Fix `git ls-file` parsing in `iter_gitworktree()` to be compatible with file names that start with a `tab` character. https://github.com/datalad/datalad-next/pull/421 (by @christian-monch) ## 📝 Documentation - Expanded guidelines on test implementations. - Add missing and fix wrong docstrings for HTTP/WebDAV server related fixtures. https://github.com/datalad/datalad-next/pull/445 (by @adswa) ## 🏠 Internal - Deduplicate configuration handling code in annex remotes. https://github.com/datalad/datalad-next/pull/440 (by @adswa) ## 🛡 Tests - New test fixtures have been introduced to replace traditional test helpers from datalad-core: - `datalad_interactive_ui` and `datalad_noninteractive_ui` for testing user interactions. They replace `with_testsui`. https://github.com/datalad/datalad-next/pull/427 (by @mih) - Expand test coverage for `create_sibling_webdav` to include recursive operation. https://github.com/datalad/datalad-next/pull/434 (by @adswa) # 1.0.0b3 (2023-06-09) ## 🐛 Bug Fixes - Patch `CommandError`, the standard exception raised for any non-zero exit command execution to now reports which command failed with `repr()` too. Previously, only `str()` would produce an informative message about a failure, while `repr()` would report `CommandError('')`, unless a dedicated message was provided. (by @mih) - Some error messages (in particular from within git-annex special remotes) exhibited uninformative error messages like `CommandError('')`. This is now fixed by letting `CommandError` produce the same error rendering in `__str__` and `__repr__`. Previously, `RuntimeError.__repr__` was used, which was unaware of command execution details also available in the exception. https://github.com/datalad/datalad-next/pull/386 (by @mih) - The `datalad-annex` Git remote helper can now handle the case where a to-be-clone repository has a configured HEAD ref that does not match the local configured default (e.g., `master` vs `main` default branch). Fixes https://github.com/datalad/datalad-next/issues/412 via https://github.com/datalad/datalad-next/pull/411 (by @mih) - Patch `create_sibling_gitlab` to work with present day GitLab deployments. This required adjusting the naming scheme for the `flat` and `collection` layouts. Moreover, the `hierarchy` layout is removed. it has never been fully implemented, and conceptually suffers from various corner-cases that cannot be (easily) addressed. Consequently, the `collection` layout is the new default. It's behavior matches that of `hierarchy` as far as this was functional, hence there should be no breakage for active users. https://github.com/datalad/datalad-next/pull/413 ## 💫 Enhancements and new features - Patch the process entrypoint of DataLad's git-annex special remote implementations to funnel internal progress reporting to git-annex via standard `PROGRESS` protocol messages. This makes it obsolete (in many cases) to implement custom progress reporting, and the use of the standard `log_progress()` helper (either directly or indirectly) is sufficient to let both a parent DataLad process or git-annex see progress reports from special remotes. Fixes https://github.com/datalad/datalad-next/issues/328 via https://github.com/datalad/datalad-next/pull/329 (by @mih) - The `HttpUrlOperations` handler now supports custom HTTP headers. 
  This makes it possible to define custom handlers in configuration that include such header customization, for example to send custom secret or session IDs.
  Fixes https://github.com/datalad/datalad-next/issues/336 (by @mih)
- `Constraint` implementations now raise `ConstraintError` consistently on a violation. This now makes it possible to distinguish properly handled violations from improper implementation of such checks. Moreover, `raise_for()` is now used consistently, providing uniform, structured information on such violations. `ConstraintError` is derived from `ValueError` (the exception that was previously (mostly) raised). Therefore, client code should continue to work without modification, unless a specific wording of an exception message is relied upon. In a few cases, an implicit `TypeError` (e.g., `EnsureIterableOf`) has been replaced by an explicit `ConstraintError`, and client code needs to be adjusted. The underlying exception continues to be available via `ConstraintError.caused_by`. (by @mih)
- New `MultiHash` helper to compute multiple hashes in one go.
  Fixes https://github.com/datalad/datalad-next/issues/345 (by @mih)
- As a companion of `LeanGitRepo`, a `LeanAnnexRepo` has been added. This class is primarily used to signal that particular code does not require the full `AnnexRepo` API, but works with a much reduced API, as defined by that class. The API definition is not final and will grow in future releases to accommodate all standard use cases.
  https://github.com/datalad/datalad-next/pull/387 (by @mih)
- Dedicated dataclasses for common types, such as git-annex keys (`AnnexKey`) and `dl+archives:` URLs (`ArchivistLocator`), have been added. They support parsing and rendering their respective plain-text representations. These new types are now also available for more precise type annotation and argument validation. (by @mih)
- `datalad_next.archive_operations` has been added, and follows the pattern established by the `UrlOperations` framework, to provide uniform handling of different archive types. Two main (read) operations are supported: iteration over archive members, and access to individual member content via a file-like. (by @mih)
- New `archivist` git-annex special remote, as a replacement for the `datalad-archives` remote. It is implemented as a drop-in replacement with the ability to also fall back on the previous implementation. In comparison to its predecessor, it reduces the storage overhead from 200% to 100% by doing partial extraction from fully downloaded archives. It is designed to be extended with support for partial access to remote archives (thereby reducing storage overhead to zero), but this is not yet implemented.
- New `datalad_next.iter_collections` module providing iterators for items in particular collections, such as TAR or ZIP archive members, the content of a file system directory, or the worktree of a Git repository. Iterators yield items of defined types that typically carry information on the properties of collection items, and (in the case of files) access to their content.
- The new command `ls_file_collection()` provides access to a select set of collection iterators via a DataLad command. In addition to the plain iterators, it provides uniform content hashing across all supported collection types (see the sketch below).
- The `datalad-annex` Git remote helper can now recognize and handle legacy repository deposits made by its predecessor from `datalad-osf`.
  https://github.com/datalad/datalad-next/pull/411 (by @mih)
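A sketch of using `ls_file_collection()` via the Python API follows. The collection type identifier `'tarfile'`, the `hash` parameter, and the example archive name are assumptions based on the description above, not a verified signature.

```python
from datalad.api import ls_file_collection

# enumerate the members of a (hypothetical) TAR archive and request
# uniform content hashing for each member
for res in ls_file_collection(
        'tarfile',
        'archive.tar',
        hash=['md5', 'sha1'],
        result_renderer='disabled',
        return_type='generator'):
    # each result record describes one collection member
    print(res)
```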
## 🏠 Internal

- Remove the DataLad runner performance patch, and all patches to clone functionality. They are included in datalad-0.18.1; the dependency has been adjusted accordingly.
- New `deprecated` decorator for standardized deprecation handling of commands, functions, and also individual keyword arguments of callables, and even particular values for such arguments. Inspired by https://github.com/datalad/datalad/issues/6998. Contributed by @adswa
- Use the correct type annotation for the `cfg` parameter of `datalad_next.utils.requests_auth.DataladAuth.__init__()`
  https://github.com/datalad/datalad-next/pull/385 (by @christian-monch)
- The patch registry has been moved to `datalad_next.patches.enabled`, and the `apply_patch()` helper is now located in `datalad_next.patches` directly to avoid issues with circular dependencies when patching core components like the `ConfigManager`. The documentation on patching has been adjusted accordingly.
  https://github.com/datalad/datalad-next/pull/391 (by @mih)
- The `main()` entrypoint of the `datalad-annex` Git remote helper has been generalized to be more reusable by other (derived) remote helper implementations.
  https://github.com/datalad/datalad-next/pull/411 (by @mih)

# 1.0.0b2 (2023-03-17)

## 💫 Enhancements and new features

- `CredentialManager`
  - The Credential Manager gained a new helper, ``obtain()``, that supports credential selection by name/ID, falls back to querying with a set of properties, and finally resorts to an interactive credential query from the user.
    ([#216](https://github.com/datalad/datalad-next/pull/216) by @mih)
  - All optional arguments of the CredentialManager are now keyword-argument-only
    ([#230](https://github.com/datalad/datalad-next/pull/230) by @mih)
  - Users no longer need to provide type hints for legacy credentials in "provider" configurations
    ([#247](https://github.com/datalad/datalad-next/pull/247) by @mih)
  - Credential reporting supports a ``cred_type`` annotation
    ([#257](https://github.com/datalad/datalad-next/pull/257) by @mih)
  - Credential errors for GitHub-like remotes were improved to hint users how to update or set new credentials
    ([#235](https://github.com/datalad/datalad-next/pull/235) by @mih)
- `UrlOperations`
  - The URL handler can now load configurations from config files
    ([#222](https://github.com/datalad/datalad-next/pull/222) by @mih)
  - Improved messaging within `URLOperationsRemoteError`
    ([#308](https://github.com/datalad/datalad-next/pull/308) by @mih)
- `Parameter validation`
  - A new `validate_defaults` parameter of ``EnsureCommandParameterization`` allows opt-in parameter validation, which causes processing of any specified parameter's default.
    ([#227](https://github.com/datalad/datalad-next/pull/227) by @mih)
  - A new base class ``ConstraintError`` can communicate parameter validation errors and can associate constraint violations with a particular context. ``CommandParametrizationError`` uses it to communicate violations for a full command parameterization at once and is used in an improved `EnsureCommandParametrization` constraint. Callers can now also decide whether to perform an exhaustive parameter validation, or fail on first error.
    ([#234](https://github.com/datalad/datalad-next/pull/234) by @mih)
  - A new ``ConstraintWithPassthrough`` constraint exposes `EnsureParameterConstraint`'s pass-through feature
    ([#244](https://github.com/datalad/datalad-next/pull/244) by @mih)
  - `EnsureCommandParameterization` learned a `tailor_for_dataset()` parameter that can be used to identify which parameters' constraints should be tailored for which dataset. This allows tailoring constraints for particular datasets
    ([#260](https://github.com/datalad/datalad-next/pull/260) by @mih)
  - ``EnsurePath`` can be tailored to dataset instances to resolve paths against a given Dataset
    ([#271](https://github.com/datalad/datalad-next/pull/271) by @mih)
  - The ``EnsureDataset`` constraint learned an optional check for a valid dataset ID
    ([#279](https://github.com/datalad/datalad-next/pull/279) by @adswa)
  - A ``WithDescription`` meta constraint paves the way for custom docs for parameters: if given, it replaces the original parameter documentation, and can be used to tailor descriptions for specific use cases.
    ([#294](https://github.com/datalad/datalad-next/pull/294) by @mih)
  - Parameter violations gained structured error reporting and customized rendering of parameter violations
    ([#306](https://github.com/datalad/datalad-next/pull/306) by @mih)
  - ``EnsureGeneratorFromFileLike`` became more suitable for batch mode use by learning to yield instead of raise internal exceptions, if configured by the caller
    ([#278](https://github.com/datalad/datalad-next/pull/278) by @mih)
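A small sketch of composing and applying constraints from this framework is shown below. The import location is an assumption, and the shown classes serve only as generic examples of the validation/coercion behavior described above.

```python
from datalad_next.constraints import (
    EnsureInt,
    EnsureRange,
)

# AND-composition of two constraints; calling the compound performs
# type coercion and validation in a single step
positive_int = EnsureInt() & EnsureRange(min=1)

assert positive_int('42') == 42
# positive_int(0) would raise a validation error (range violation)
```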
## 🐛 Bug Fixes

- Previously, the last used credential matching a ``realm`` was used unconditionally. Now, credentials without secrets are excluded.
  ([#248](https://github.com/datalad/datalad-next/pull/248) by @mih)
- ``AND`` and ``OR`` compounds for Constraints do not modify Constraints in place anymore, but return a new instance.
  ([#292](https://github.com/datalad/datalad-next/pull/292) by @mih)
- Even though the ``EnsureDataset`` constraint returns ``DatasetParameter`` objects, ``_execute_command`` that would patch up DataLad commands wasn't able to work with them
  ([#269](https://github.com/datalad/datalad-next/pull/269) by @adswa)

## 🪓 Deprecations and removals

- The URL operation ``sniff`` was renamed to ``stat``.
  ([#231](https://github.com/datalad/datalad-next/pull/231) by @adswa)
- `serve_path_via_webdav()` that came with 0.2 was deprecated in favor of the `webdav_server` fixture
  ([#301](https://github.com/datalad/datalad-next/pull/301) by @mih)

## 📝 Documentation

- A dedicated Developer Guide section of the docs was introduced
  ([#304](https://github.com/datalad/datalad-next/pull/304) by @adswa)
- The README mentions the `uncurl` special remote, and the documentation now provides installation information
- ``CONTRIBUTING.md`` was updated on patching
  ([#262](https://github.com/datalad/datalad-next/pull/262/) by @mih)

## 🏠 Internal

- Package dependencies were made explicit
  ([#212](https://github.com/datalad/datalad-next/pull/212) by @mih)
- Misc. code reorganization:
  - The CredentialManager was elevated to a top-level module
    ([#229](https://github.com/datalad/datalad-next/pull/220) by @mih)
  - Dataset-lookup behavior of the ``credentials`` command became identical to ``download``
    ([#256](https://github.com/datalad/datalad-next/pull/256) by @mih)
- The DataLad runner performance patch and all patches to clone functionality were removed as they are included in datalad-0.18.1; the dependency was adjusted accordingly.
  ([#218](https://github.com/datalad/datalad-next/pull/218) by @mih)
- Compound constraints got a comprehensive ``__repr__`` to improve debugging
  ([#276](https://github.com/datalad/datalad-next/pull/276) by @mih)
- Discontinue legacy code
  ([#300](https://github.com/datalad/datalad-next/pull/300/) by @mih)

## 🛡 Tests

- Automatic CI builds were disabled for changes constrained to the following files and directories: `.github/`, `CHANGELOG.md`, `CITATION.cff`, `CONTRIBUTORS`, `LICENSE`, `Makefile`, `README.md`, `readthedocs.yml`
- Coverage reports for the uncurl special remote
  ([#220](https://github.com/datalad/datalad-next/pull/220) by @mih)
- Tests will not fail if coverage uploads fail
  ([#241](https://github.com/datalad/datalad-next/pull/241/files) by @mih)
- GitHub actions use the `datalad-installer` to install git-annex
  ([#239](https://github.com/datalad/datalad-next/pull/239/files) by @mih)
- A bug in DataLad's test setup causes configuration managers to leak across datasets (https://github.com/datalad/datalad/issues/7297). `datalad-next` implemented test isolation for keyring and config as a fix
  ([#263](https://github.com/datalad/datalad-next/pull/263) by @mih)
- A number of new pytest fixtures were introduced:
  - `memory_keyring` ([#254](https://github.com/datalad/datalad-next/pull/254) by @mih), which was then replaced by ``tmp_keyring`` ([#264](https://github.com/datalad/datalad-next/pull/264))
  - `dataset` and `existing_dataset` ([#296](https://github.com/datalad/datalad-next/pull/296) by @mih)
  - `webdav_server` ([#297](https://github.com/datalad/datalad-next/pull/297/) by @mih)
  - `httpbin` ([#313](https://github.com/datalad/datalad-next/pull/313) by @jwodder)
- 100% coverage for constraints
  ([#259](https://github.com/datalad/datalad-next/pull/259/))

# 1.0.0b1 (2022-12-23)

## 💫 Enhancements and new features

- Improved composition of importable functionality. Key components for `commands`, `annexremotes`, `datasets` (etc.) are collected in topical top-level modules that provide "all" necessary pieces in a single place.
- Add patch to `ThreadedRunner` to use a more optimal buffer size for its read thread. This was previously fixed to 1024 bytes, and now uses the value of `shutil.COPY_BUFSIZE` as a platform-tailored default. This can boost the throughput from several tens to several hundreds of MB/s.
- A new `download` command aims to replace any download-related functionality in DataLad. It supports single-pass checksumming and progress reporting for any supported URL scheme. Currently supported schemes are `http(s)://`, `file://`, and `ssh://`. The new command integrates with the `datalad-next` credential system, and supports auto-discovery, interactive-prompt-on-demand, and (optional) save-on-success of credentials. Additional URL scheme handlers can be provided by extension packages. Unlike the datalad-core downloaders framework, they can be fully standalone, as long as they implement a lean adaptor class (see `datalad_next.url_operations`). `AnyUrlOperations` is provided to enable generic usage in client code where an underlying handler is auto-selected based on the URL scheme (see the sketch after this list). `datalad_next.url_operations.any._urlscheme_handler` contains a (patch-able) mapping of scheme identifiers to handler classes. The `uncurl` special remote makes this URL operations framework accessible via `git-annex`. It provides flexible means to compose and rewrite URLs (e.g., to compensate for storage infrastructure changes) without having to modify individual URLs recorded in datasets.
  It enables seamless transitions between any services and protocols supported by the framework.
- A `python-requests` compatible authentication handler (`DataladAuth`) that interfaces DataLad's credential system has been added.
- A substantially more comprehensive replacement for DataLad's `constraints` system for type conversion and parameter validation has been developed and is included in this release. This includes all types of the predecessor in the DataLad core package, and a large number of additions, including
  - `EnsureMapping` (aids handling of key-value specifications)
  - `EnsureGeneratorFromFileLike` (aids reading inputs from, e.g., STDIN; see the `download` command for how reading JSON-lines input can be supported in addition with virtually no changes to the actual command implementation)
  - `EnsurePath` (existing or not, particular formats, etc.)
  - `EnsureJSON` (automatic validation and loading)
  - `Ensure(Parsed)URL` (pattern matching, requiring/forbidding components)
  - `EnsureGitRefName` (check for compliance with Git's naming rules)
- Commands can now opt in to receive fully validated parameters. This can substantially simplify the implementation complexity of a command at the expense of a more elaborate specification of the structural and semantic properties of the parameters. This specification is achieved by declaring an `EnsureCommandParameterization`, in a `_validator_` member of a command's `ValidatedInterface` class. This feature is introduced as a patch to the command execution in datalad-core. With this patch, commands are now exclusively called with keyword-style parameters only. This feature is in an early stage of development (although all included commands have already been ported to use it) that will likely undergo substantial changes in the coming releases.
- A new `EnsureDataset` constraint is provided that returns a `DatasetParameter` on successful validation. This return value contains the original input specification, and the `Dataset` class instance. The `resolve_path()` utility is adjusted to support this parameter type, thereby making the use of the `require_dataset()` utility obsolete.
- As a companion for the `http(s)://` URL handling for the new `download` command, a `requests`-compatible authentication handler has been implemented that integrates with the `datalad-next` credential system.
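A sketch of the scheme-agnostic `AnyUrlOperations` handler described in the list above; the import path and method signatures are assumptions based on this changelog, not a verified reference.

```python
from datalad_next.url_operations.any import AnyUrlOperations

ops = AnyUrlOperations()

# 'stat' (the later name of the 'sniff' operation) retrieves properties
# of a remote resource without downloading it
props = ops.stat('https://example.com/file.txt')

# download to a local path; the handler is auto-selected from the URL scheme
ops.download('https://example.com/file.txt', 'file.txt')
```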
## 📝 Documentation

- All runtime patches are now documented and included in the readthedocs-hosted documentation.

## 🏠 Internal

- No code uses `Constraint` implementations from the DataLad core package anymore.
- Further expand type annotations of the code base.

# 0.6.3 (2022-10-26) -- Tests only

## 🐛 Bug Fixes

- Small change in the tests of the `tree` command for more robust behavior across Python and pytest versions.
  https://github.com/datalad/datalad-next/pull/117 (by @bpoldrack)

# 0.6.2 (2022-10-14) -- Hidden secrets

## 🐛 Bug Fixes

- `CredentialManager` no longer splits a credential input prompt into a prompt message (`ui.message()`) and the actual input (`ui.question()`); this enables DataLad Gooey to properly render this jointly as an input dialog with a description.
  https://github.com/datalad/datalad-next/pull/113 (by @bpoldrack)

## 💫 Enhancements and new features

- `CredentialManager.get()` and the `credentials` command now also report credential fragments for which there is no secret on record. This enables the discovery of DataLad's legacy credentials, and setting a secret for them for use with the next credential system. Moreover, it reports half-configured credentials, and facilitates their clean-up or completion, for example with DataLad Gooey's credential management GUI.

# 0.6.1 (2022-09-27)

## 💫 Enhancements and new features

- A new patch set breaks up the implementation of `clone_dataset()` into its procedural components and makes it more accessible for extension patches. There are no behavior changes associated with this internal reorganization.

# 0.6.0 (2022-08-25)

## 🐛 Bug Fixes

- Fixed datalad-push always reporting success when pushing to an export remote.
  Fixes https://github.com/datalad/datalad-next/issues/88 via https://github.com/datalad/datalad-next/pull/93 (by @bpoldrack)
- Token secrets entered for GitHub-like sibling creation are now stored by default under a name matching the API endpoint hostname (e.g. 'api.github.com'), rather than a confusing and conflict-prone 'None'. Using the `--credential` option, an alternative name can be given, as before.
  Fixes https://github.com/datalad/datalad-next/issues/97 via https://github.com/datalad/datalad-next/pull/98 (by @mih)

## 💫 Enhancements and new features

- The `configuration` command now indicates the absence of a particular configuration setting queried via `get` with a `status='impossible'` result. This change enables the distinction of an unset configuration item from an item set to an empty string with the default CLI result renderer.
  Fixes https://github.com/datalad/datalad/issues/6851 via https://github.com/datalad/datalad-next/pull/87 by @mih
- The default of the configuration item `datalad.annex.retry` (in effect when not explicitly configured otherwise) is changed from `3` to `1`. This prevents a set of performance and user experience issues resulting from, e.g., repeated download attempts, even when no change in outcome can be expected (e.g., a wrong or no credential supplied). This change can cause a loss of robustness in download behavior for services that indeed experience spurious failures. It is recommended to specifically parametrize such command calls (e.g., downloads in CI runs) with an appropriate configuration override.
  Fixes https://github.com/datalad/datalad/issues/6969 and https://github.com/datalad/datalad/issues/6509 (by @mih)
- New `tree` command for traversing a directory hierarchy. Like the UNIX equivalent, it can visualize a directory tree. Additionally, it annotates the output with DataLad-related information, like the location of datasets and their nesting depth. Besides visualization, `tree` also reports structured data in the form of result records that enable other applications to use `tree` for gathering data from the file system.
  Fixes https://github.com/datalad/datalad-next/issues/78 via https://github.com/datalad/datalad-next/pull/92 (by @catetrai)
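A sketch of calling the new `tree` command from Python follows. The `depth` parameter name is an assumption mirroring the UNIX `tree -L` option, not a verified signature.

```python
from datalad.api import tree

# visualize/report two levels of the hierarchy; structured result records
# are returned in addition to the rendered tree
for res in tree(
        '.',
        depth=2,
        result_renderer='disabled',
        return_type='generator'):
    print(res)
```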
## 📝 Documentation

- Add an example of adding a `user_password`-type credential, with a given `user` property, to the examples in the `credentials` command.
  https://github.com/datalad/datalad-next/pull/103 (by @mslw)

# 0.5.0 (2022-07-19)

## 💫 Enhancements and new features

- The `configuration` command no longer requires a dataset to be present for a `get` operation to retrieve a configuration item from scope `global`.
  Fixes [#6864](https://github.com/datalad/datalad/issues/6854) via [#86](https://github.com/datalad/datalad-next/pull/86) (by @mih)

# 0.4.1 (2022-07-14)

## 🐛 Bug Fixes

- Fix a missing import in the credential retrieval for GitHub-like sibling creation, which made it impossible to discover credentials without providing an explicit credential name.

# 0.4.0 (2022-07-08) -- datalad-annex:: for all

#### 💫 Enhancements and new features

- `datalad-annex::` Git remote helper now uses `git annex transferkey` instead of `fsck` to "probe" for `XDLRA` repository keys. This avoids problems due to a behavior change in git-annex 10.20220525, and can also speed up operation for slow special remotes, by avoiding a dedicated probe-request.
  [#76](https://github.com/datalad/datalad-next/pull/76) (by @mih)
- `datalad-annex::` Git remote helper is now fully compatible with the Windows platform, by working around [a git-annex issue](https://git-annex.branchable.com/bugs/Fails_to_drop_key_on_windows___40__Access_denied__41__)
  [#77](https://github.com/datalad/datalad-next/pull/77) (by @mih)

#### 🐛 Bug Fixes

- Prevent docstring duplication in patched `push` command
  [#71](https://github.com/datalad/datalad-next/pull/71) (by @mih)

#### 📝 Documentation

- Bibliographic information on authorship was added
  [#80](https://github.com/datalad/datalad-next/pull/80) (by @mslw)

#### 🛡 Tests

- The test battery is now using `pytest`. This change required bumping the dependency on DataLad to version 0.17.0.
  [#73](https://github.com/datalad/datalad-next/pull/73) (by @mih)

#### 🏠 Internal

- Reduced code duplication by consolidating on a common helper for sibling identification, now available from DataLad 0.17.0
  [#82](https://github.com/datalad/datalad-next/pull/82) (by @adswa)

#### Authors: 3

- Michael Hanke (@mih)
- Michał Szczepanik (@mslw)
- Adina Wagner (@adswa)

# 0.3.0 (2022-05-25) -- Optimized push

#### 💫 Enhancements and new features

- Make push avoid refspec handling for special remote push targets. See PR https://github.com/datalad/datalad-next/pull/64 for details on the associated behavior changes that are introduced with this new patch.

# 0.2.2 (2022-04-29) -- More docs!

#### 📝 Documentation

- The adjusted documentation of patched datalad-core commands now also shows properly in Python sessions.
- Extended the documentation on collaborative workflows with ``datalad-annex::`` Git remotes and WebDAV siblings.

# 0.2.1 (2022-04-28) -- User experience

#### 💫 Enhancements and new features

- Disable auto-enabling of webdav storage remotes on clone. DataLad does not yet support the needed inspection to determine the necessary credentials automatically. Instead, an explicit `datalad siblings enable` call is required. This is now also added to the documentation.
- Make sure that `create-sibling-webdav` does not ask users to input the internal `realm` property, when prompting for credentials.
- `CredentialManager` now displays more appropriate labels when prompting for a secret, e.g. `password` instead of `user_password`.

# 0.2.0 (2022-04-28) -- WebDAV

This release primarily brings the ability to store DataLad datasets on a WebDAV server. This is done in a way that allows for cloning such datasets with `datalad clone` from such a WebDAV server too. This feature enables full-featured DataLad-based collaborative workflows on widely available cloud storage systems, such as ownCloud/Nextcloud -- which are also the basis for several institutional services like the European Open Science Cloud's (EOSC) B2DROP service.

#### 💫 Enhancements and new features

- A `create-sibling-webdav` command for hosting datasets on a WebDAV server via a sibling tandem for Git history and file storage.
  A full annex setup for storing complete datasets with historical file content versions, and an additional mode for depositing single-version dataset snapshots, are supported. The latter enables convenient collaboration with audiences that are not using DataLad, because all files are browsable via a WebDAV server's point-and-click user interface.
- Enhance `datalad-push` to automatically export files to git-annex special remotes configured with `exporttree=yes`.
- Enhance `datalad-siblings enable` (`AnnexRepo.enable_remote()`) to automatically deploy credentials for git-annex special remotes that require them.
- `git-remote-datalad-annex` is a Git remote helper to push/fetch to any location accessible by any git-annex special remote.
- `git-annex-backend-XDLRA` (originally available from the `mihextras` extension) is a custom external git-annex backend used by git-remote-datalad-annex. A base class to facilitate development of external backends in Python is also provided.
- `serve_path_via_webdav` test decorator that automatically deploys a local WebDAV server.
- `with_credential` test decorator that temporarily deploys a credential to the local credential system.
- Utilities for HTTP handling
  - `probe_url()` to discover redirects and authentication requirements for an HTTP URL
  - `get_auth_realm()` to return a label for an authentication realm that can be used to query for matching credentials
- Utilities for special remote credential management:
  - `get_specialremote_credential_properties()` inspects a special remote and returns properties for querying a credential store for matching credentials
  - `update_specialremote_credential()` updates a credential in a store after successful use
  - `get_specialremote_credential_envpatch()` returns a suitable environment "patch" from a credential for a particular special remote type

# 0.1.0 (2022-03-31) -- Credentials, please!

#### 💫 Enhancements and new features

- A new credential management system is introduced that enables storage and query of credentials with any number of properties associated with a secret. These properties are stored as regular configuration items, following the scheme `datalad.credential.<name>.<property>`. The special property `secret` lives in a keystore, but can be overridden using the normal configuration mechanisms. The new system continues to support the previous credential storage setup.
  Fixes [#6519](https://github.com/datalad/datalad/issues/6519) ([@mih](https://github.com/mih))
- A new `credentials` command enables query, modification and storage of credentials. Legacy credentials are also supported, but may require the specification of a `type` (such as `token` or `user_password`) to be discoverable.
  Fixes [#396](https://github.com/datalad/datalad/issues/396) ([@mih](https://github.com/mih))
- Two new configuration settings enable controlling how the interactive entry of credential secrets is conducted for the new credential manager: `datalad.credentials.repeat-secret-entry` can be used to turn off the default double-entry of secrets, and `datalad.credentials.hidden-secret-entry` can turn off the default hidden entry of secrets.
  Fixes [#2721](https://github.com/datalad/datalad/issues/2721) ([@mih](https://github.com/mih))

#### Authors: 1

- Michael Hanke ([@mih](https://github.com/mih))

---
datalad-next-1.4.1/CITATION.cff000066400000000000000000000046231462321624600157700ustar00rootroot00000000000000# This CITATION.cff file was generated with cffinit.
# Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0 title: DataLad-next extension message: >- DataLad extension for additional, broadly applicable functionality type: software authors: - given-names: Yaroslav O. family-names: Halchenko affiliation: 'Dartmouth College, Hanover, NH, United States' orcid: 'https://orcid.org/0000-0003-3456-2493' - given-names: Michael family-names: Hanke affiliation: >- Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany and Institute of Systems Neuroscience, Medical Faculty, Heinrich Heine University Düsseldorf, Düsseldorf, Germany orcid: 'https://orcid.org/0000-0001-6398-6370' - given-names: Stephan family-names: Heunis affiliation: >- Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany orcid: 'https://orcid.org/0000-0003-3503-9872' - given-names: Christopher J. family-names: Markiewicz affiliation: >- Stanford University, Stanford, CA, United States orcid: 'https://orcid.org/0000-0002-6533-164X' - given-names: Christian family-names: Mönch affiliation: >- Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany orcid: 'https://orcid.org/0000-0002-3092-0612' - given-names: Benjamin family-names: Poldrack affiliation: >- Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany orcid: 'https://orcid.org/0000-0001-7628-0801' - given-names: Michał family-names: Szczepanik affiliation: >- Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany orcid: 'https://orcid.org/0000-0002-4028-2087' - given-names: Adina S. family-names: Wagner affiliation: >- Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany orcid: 'https://orcid.org/0000-0003-2917-3450' - given-names: John T. family-names: Wodder name-suffix: II keywords: - data management - data distribution repository-code: 'https://github.com/datalad/datalad-next' license: MIT datalad-next-1.4.1/CONTRIBUTING.md000066400000000000000000000202101462321624600163150ustar00rootroot00000000000000# Contributing to `datalad-next` - [What contributions are most suitable for `datalad-next`](#when-should-i-consider-a-contribution-to-datalad-next) - [Style guide](#contribution-style-guide) - [Code organization](#code-organization) - [How to implement runtime patches](#runtime-patches) - [How to implement imports](#imports) - [Prohibited DataLad functionality](#prohibited-datalad-core-features) ## When should I consider a contribution to `datalad-next`? In short: whenever a contribution to the DataLad core package would make sense, it should also be suitable for `datalad-next`. ### What contributions should be directed elsewhere? Special interest, highly domain-specific functionality is likely better suited for a topical DataLad extension package. Functionality that requires complex additional dependencies, or is highly platform-specific might also be better kept in a dedicated extension package. If in doubt, it is advisable to file an issue and ask for feedback before preparing a contribution. ### When is a contribution to `datalad-next` preferable over one to the DataLad core package? New feature releases of `datalad-next` are happening more frequently. Typically, every 4-6 weeks. New features depending on other `datalad-next` features are, by necessity, better directed at `datalad-next`. 
## Contribution style guide A contribution must be complete with code, tests, and documentation. `datalad-next` is a staging area for features, hence any code is expected to move and morph. Therefore, tests are essential. A high test-coverage is desirable. Contributors should aim for 95% coverage or better. Tests must be dedicated for the code of a particular contribution. It is not sufficient, if other code happens to also exercise a new feature. New code should be type-annotated. At minimum, a type annotation of the main API (e.g., function signatures) is needed. A dedicated CI run is testing type annotations. Docstrings should be complete with information on parameters, return values, and exception behavior. Documentation should be added to and rendered with the sphinx-based documentation. Commits and commit messages must be [Conventional Commits](https://www.conventionalcommits.org). Their compliance is checked for each pull request. ### Code organization In `datalad-next`, all code is organized in shallow sub-packages. Each sub-package is located in a directory within the `datalad_next` package. Consequently, there are no top-level source files other than a few exceptions for technical reasons (`__init__.py`, `conftest.py`, `_version.py`). A sub-package contains any number of code files, and a `tests` directory with all test implementations for that particular sub-package, and only for that sub-package. Other, deeper directory hierarchies are not to be expected. There is no limit to the number of files. Contributors should strive for files with less than 500 lines of code. Within a sub-package, code should generally use relative imports. The corresponding tests should also import the tested code via relative imports. Code users should be able to import the most relevant functionality from the sub-package's `__init__.py`. Only items importable from the sub-package's top-level are considered to be part of its "public" API. If a sub-module is imported in the sub-package's `__init__.py`, consider adding `__all__` to the sub-module to restrict wildcard imports from the sub-module, and to document what is considered to be part of the "public" API. Sub-packages should be as self-contained as possible. Individual components in `datalad-next` should strive to be easily migratable to the DataLad core package. This means that any organization principles like *all-exceptions-go-into-a-single-location-in-datalad-next* do not apply. For example, each sub-package should define its exceptions separately from others. When functionality is shared between sub-packages, absolute imports should be made. There is one special sub-package in `datalad-next`: `patches`. All runtime patches to be applied to the DataLad core package must be placed here. ### Runtime patches The `patches` sub-package contains all runtime patches that are applied by `datalad-next`. Patches are applied on-import of `datalad-next`, and may modify arbitrary aspects of the runtime environment. A patch is enabled by adding a corresponding `import` statement to `datalad_next/patches/enabled.py`. The order of imports in this file is significant. New patches should consider behavior changes caused by other patches, and should be considerate of changes imposed on other patches. `datalad-next` is imported (and thereby its patches applied) whenever used directly (e.g., when running commands provided by `datalad-next`, or by an extension that uses `datalad-next`). 
In addition, it is imported by the DataLad core package itself when the configuration item `datalad.extensions.load=next` is set.

Patches modify an external implementation that is itself subject to change. To improve the validity and longevity of patches, it is helpful to consider a few guidelines:

- Patches should use `datalad_next.patches.apply_patch()` to perform the patching, in order to yield uniform (logging) behavior.

- Patches should be as self-contained as possible. The aim is for patches to be merged upstream (at the patched entity) as quickly as possible. Self-contained patches facilitate this process.

- Patches should maximally limit their imports from sources that are not the patch target. This helps to detect when changes to the patch target (or its environment) are made, and also helps to isolate the patch from changes in the general environment of the patched software package that are unrelated to the specific patched code.

### Imports

#### Import centralization per sub-package

If possible, sub-packages should have a "central" place for imports of functionality from outside `datalad-next` and the Python standard library. Other sub-package code should then import from this place via relative imports. This aims to make external dependencies more obvious, and import-error handling and mitigation for missing dependencies simpler and cleaner. Such a location could be the sub-package's `__init__.py`, or possibly a dedicated `dependencies.py`.

#### No "direct" imports from `datalad`

This is a specialization of the "Import centralization" rule above. All sub-package code should import from `datalad` into a *single* dedicated location inside the sub-package. All other sub-package code imports from this location. The aim is to clearly see what of the huge DataLad API is actually relevant for a particular feature. For some generic helpers it may be best to import them to `datalad_next.utils` or `datalad_next.tests.utils`.

### Prohibited DataLad core features

The following components of the `datalad` package must not be used (directly) in contributions to `datalad-next`, because they have been replaced by a different solution with the aim to phase them out.

#### `require_dataset()`

Commands must use `datalad_next.constraints.EnsureDataset` instead.

#### nose-style decorators in test implementations

The use of decorators like `with_tempfile` is not allowed. `pytest` fixtures have to be used instead. A *temporary* exception *may* be the helpers that are imported in `datalad_next.tests.utils`. However, these will be reduced and removed over time, and additional usage only adds to the necessary refactoring effort. Therefore new usage is highly discouraged.

#### nose-style assertion helpers in test implementations

The use of helpers like `assert_equal` is not allowed. `pytest` constructs have to be used instead -- this typically means plain `assert` statements. A *temporary* exception *may* be the helpers that are imported in `datalad_next.tests.utils`. However, these will be reduced and removed over time, and additional usage only adds to the necessary refactoring effort. Therefore new usage is highly discouraged.

### Test output

Tests should be silent on stdout/stderr as much as possible. In particular, result renderings of DataLad commands must not be produced, unless necessary for testing a particular feature. A `no_result_rendering` fixture can be used to turn it off, without adding complexity to test implementations.
datalad-next-1.4.1/CONTRIBUTORS000066400000000000000000000000671462321624600157540ustar00rootroot00000000000000See README.md for a comprehensive list of contributors datalad-next-1.4.1/LICENSE000066400000000000000000000031111462321624600150720ustar00rootroot00000000000000# Main Copyright/License DataLad, including all examples, code snippets and attached documentation is covered by the MIT license. The MIT License Copyright (c) 2018- DataLad Team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. See CONTRIBUTORS file for a full list of contributors. # 3rd-party code A copy of https://github.com/uktrade/iterable-subprocess is included at `datalad_next/iterable_subprocess`. It was written by Michal Charemza for UK Department of Business and Trade, and was made available under the terms of the MIT license. See `datalad_next/iterable_subprocess/LICENSE`. datalad-next-1.4.1/MANIFEST.in000066400000000000000000000001741462321624600156310ustar00rootroot00000000000000include CONTRIBUTORS LICENSE versioneer.py graft _datalad_buildsupport graft docs prune docs/build global-exclude *.py[cod] datalad-next-1.4.1/Makefile000066400000000000000000000007721462321624600155370ustar00rootroot00000000000000PYTHON ?= python clean: $(PYTHON) setup.py clean rm -rf dist build bin docs/build docs/source/generated *.egg-info -find . -name '*.pyc' -delete -find . -name '__pycache__' -type d -delete release-pypi: # avoid upload of stale builds test ! 
-e dist $(PYTHON) setup.py sdist bdist_wheel twine upload dist/* update-buildsupport: git subtree pull \ -m "Update DataLad build helper" \ --squash \ --prefix _datalad_buildsupport \ https://github.com/datalad/datalad-buildsupport.git \ main datalad-next-1.4.1/README.md000066400000000000000000000460311462321624600153540ustar00rootroot00000000000000# DataLad NEXT extension [![All Contributors](https://img.shields.io/github/all-contributors/datalad/datalad-next?color=ee8449&style=flat-square)](#contributors) [![Build status](https://ci.appveyor.com/api/projects/status/dxomp8wysjb7x2os/branch/main?svg=true)](https://ci.appveyor.com/project/mih/datalad-next/branch/main) [![codecov](https://codecov.io/gh/datalad/datalad-next/branch/main/graph/badge.svg?token=2P8rak7lSX)](https://codecov.io/gh/datalad/datalad-next) [![docs](https://github.com/datalad/datalad-next/workflows/docs/badge.svg)](https://github.com/datalad/datalad-next/actions?query=workflow%3Adocs) [![Documentation Status](https://readthedocs.org/projects/datalad-next/badge/?version=latest)](http://docs.datalad.org/projects/next/en/latest/?badge=latest) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![GitHub release](https://img.shields.io/github/release/datalad/datalad-next.svg)](https://GitHub.com/datalad/datalad-next/releases/) [![PyPI version fury.io](https://badge.fury.io/py/datalad-next.svg)](https://pypi.python.org/pypi/datalad-next/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6833099.svg)](https://doi.org/10.5281/zenodo.6833099) This DataLad extension can be thought of as a staging area for additional functionality, or for improved performance and user experience. Unlike other topical or more experimental extensions, the focus here is on functionality with broad applicability. This extension is a suitable dependency for other software packages that intend to build on this improved set of functionality. ## Installation ``` # create and enter a new virtual environment (optional) $ virtualenv --python=python3 ~/env/dl-next $ . ~/env/dl-next/bin/activate # install from PyPi $ python -m pip install datalad-next ``` ## How to use Additional commands provided by this extension are immediately available after installation. However, in order to fully benefit from all improvements, the extension has to be enabled for auto-loading by executing: git config --global --add datalad.extensions.load next Doing so will enable the extension to also alter the behavior the core DataLad package and its commands. ## Summary of functionality provided by this extension - A replacement sub-system for credential handling that is able to handle arbitrary properties for annotating a secret, and facilitates determining suitable credentials while minimizing avoidable user interaction, without compromising configurability. A convenience method is provided that implements a standard workflow for obtaining a credential. - A user-facing `credentials` command to set, remove, and query credentials. - The `create-sibling-...` commands for the platforms GitHub, GIN, GOGS, Gitea are equipped with improved credential handling that, for example, only stores entered credentials after they were confirmed to work, or auto-selects the most recently used, matching credentials, when none are specified. - A `create-sibling-webdav` command for hosting datasets on a WebDAV server via a sibling tandem for Git history and file storage. 
  Datasets hosted on WebDAV in this fashion are cloneable with `datalad-clone`. A full annex setup for storing complete datasets with historical file content versions, and an additional mode for depositing single-version dataset snapshots, are supported. The latter enables convenient collaboration with audiences that are not using DataLad, because all files are browsable via a WebDAV server's point-and-click user interface.
- Enhance `datalad-push` to automatically export files to git-annex special remotes configured with `exporttree=yes`.
- Speed-up `datalad-push` when processing non-git special remotes. This particularly benefits less efficient hosting scenarios like WebDAV.
- Enhance `datalad-siblings enable` (`AnnexRepo.enable_remote()`) to automatically deploy credentials for git-annex special remotes that require them.
- `git-remote-datalad-annex` is a Git remote helper to push/fetch to any location accessible by any git-annex special remote.
- `git-annex-backend-XDLRA` (originally available from the `mihextras` extension) is a custom external git-annex backend used by `git-remote-datalad-annex`. A base class to facilitate development of external backends in Python is also provided.
- Enhance `datalad-configuration` to support getting configuration from "global" scope without a dataset being present.
- New modular framework for URL operations. This framework directly supports operation on `http(s)`, `ssh`, and `file` URLs, and can be extended with custom functionality for additional protocols or even interaction with specific individual servers. The basic operations `download`, `upload`, `delete`, and `stat` are recognized, and can be implemented. The framework offers uniform progress reporting and simultaneous content hash computation. This framework is meant to replace and extend the downloader/provider framework in the DataLad core package. In contrast to its predecessor, it is integrated with the new credential framework, and supports operations beyond downloading.
- `git-annex-remote-uncurl` is a special remote that exposes the new URL operations framework via git-annex. It provides flexible means to compose and rewrite URLs (e.g., to compensate for storage infrastructure changes) without having to modify individual URLs recorded in datasets. It enables seamless transitions between any services and protocols supported by the framework. This special remote can replace the `datalad` special remote provided by the DataLad core package.
- A `download` command is provided as a front-end for the new modular URL operations framework.
- A `python-requests` compatible authentication handler (`DataladAuth`) that interfaces DataLad's credential system.
- Boosted throughput of DataLad's `runner` component for command execution.
- Substantially more comprehensive replacement for DataLad's `constraints` system for type conversion and parameter validation.
- Windows and Mac client support for RIA store access.
- A `next-status` command that is A LOT faster than `status`, and offers a `mono` recursion mode that shows modifications of nested dataset hierarchies relative to the state of the root dataset. Requires Git v2.31 (or later). A usage sketch follows below.
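The following is a minimal sketch of calling `next-status` from Python. The `recursive='mono'` parameter name/value pair is an assumption based on the description above, not a verified signature.

```python
# requires datalad-next to be installed; `next_status` is then available
# via the standard DataLad Python API
from datalad.api import next_status

# report modifications across a nested dataset hierarchy relative to the
# state of the root dataset ('mono' recursion mode, assumed parameter name)
for res in next_status(
        dataset='.',
        recursive='mono',
        result_renderer='disabled',
        return_type='generator'):
    print(res)
```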
## Summary of additional features for DataLad extension development

- Framework for uniform command parameter validation. Regardless of the used API (Python, CLI, or GUI), command parameters are uniformly validated. This facilitates a stricter separation of parameter specification (and validation) from the actual implementation of a command. The latter can now focus on a command's logic only, while the former enables more uniform and more comprehensive validation and error reporting. Beyond per-parameter validation and type conversion, inter-parameter dependency validation and value transformations are also supported.
- Improved composition of importable functionality. Key components for `commands`, `annexremotes`, `datasets` (etc.) are collected in topical top-level modules that provide "all" necessary pieces in a single place.
- `webdav_server` fixture that automatically deploys a local WebDAV server.
- Utilities for HTTP handling
  - `probe_url()` discovers redirects and authentication requirements for an HTTP URL
  - `get_auth_realm()` returns a label for an authentication realm that can be used to query for matching credentials
- Utilities for special remote credential management:
  - `get_specialremote_credential_properties()` inspects a special remote and returns properties for querying a credential store for matching credentials
  - `update_specialremote_credential()` updates a credential in a store after successful use
  - `get_specialremote_credential_envpatch()` returns a suitable environment "patch" from a credential for a particular special remote type
- Helper for runtime-patching other datalad code (`datalad_next.utils.patch`)
- Base class for implementing custom `git-annex` backends.
- A set of `pytest` fixtures to:
  - check that no global configuration side-effects are left behind by a test
  - check that no secrets are left behind by a test
  - provide a temporary configuration that is isolated from a user environment and from other tests
  - provide a temporary secret store that is isolated from a user environment and from other tests
  - provide a temporary credential manager to perform credential deployment and manipulation isolated from a user environment and from other tests
- An `iter_subproc()` helper that enables communication with subprocesses via input/output iterables (see the sketch after this list).
- A `shell` context manager that enables interaction with (remote) shells, including support for input/output iterables for each shell-command execution within the context.
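A minimal sketch of the `iter_subproc()` helper mentioned in the list above; the import location follows this package's public-API convention, but the exact signature is an assumption.

```python
from datalad_next.runners import iter_subproc

# stream the output of a subprocess as an iterable of byte chunks while
# the process is running; the context manager handles cleanup on exit
with iter_subproc(['ls', '-l']) as output:
    for chunk in output:
        print(chunk.decode(), end='')
```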
## Patching the DataLad core package

Some of the features described above rely on a modification of the DataLad core package itself, rather than coming in the form of additional commands. Loading this extension causes a range of patches to be applied to the `datalad` package to enable them. A comprehensive description of the current set of patches is available at http://docs.datalad.org/projects/next/en/latest/#datalad-patches

## Developing with DataLad NEXT

This extension package moves fast in comparison to the core package. Nevertheless, attention is paid to API stability, adequate semantic versioning, and informative changelogs.

### Public vs internal API

Anything that can be imported directly from any of the sub-packages in `datalad_next` is considered to be part of the public API. Changes to this API determine the versioning, and development is done with the aim to keep this API as stable as possible. This includes signatures and return value behavior. As an example: `from datalad_next.runners import iter_git_subproc` imports a part of the public API, but `from datalad_next.runners.git import iter_git_subproc` does not.

### Use of the internal API

Developers can obviously use parts of the non-public API. However, this should only be done with the understanding that these components may change from one release to another, with no guarantee of transition periods, deprecation warnings, etc. Developers are advised to never reuse any components with names starting with `_` (underscore). Their use should be limited to their individual subpackage.

## Acknowledgements

This DataLad extension was developed with funding from the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under grant SFB 1451 ([431549029](https://gepris.dfg.de/gepris/projekt/431549029), INF project).

## Contributors
- Michael Hanke: 🐛 💻 🖋 🎨 📖 💵 🔍 🤔 🚇 🚧 🧑‍🏫 📦 📆 👀 📢 ⚠️ 🔧 📓
- catetrai: 💻 🎨 📖 🤔 ⚠️
- Chris Markiewicz: 🚧 💻
- Michał Szczepanik: 🐛 💻 🖋 📖 💡 🤔 🚇 🚧 👀 📢 ⚠️ 📓
- Stephan Heunis: 🐛 💻 📖 🤔 🚧 📢 📓
- Benjamin Poldrack: 🐛 💻
- Yaroslav Halchenko: 🐛 💻 🚇 🚧 🔧
- Christian Mönch: 💻 🎨 📖 🤔 👀 ⚠️ 📓
- Adina Wagner: ♿️ 🐛 💻 📖 💡 🚧 📆 👀 📢 ⚠️ 📓
- John T. Wodder II: 💻 🚇 ⚠️
datalad-next-1.4.1/_datalad_buildsupport/000077500000000000000000000000001462321624600204365ustar00rootroot00000000000000datalad-next-1.4.1/_datalad_buildsupport/__init__.py000066400000000000000000000010211462321624600225410ustar00rootroot00000000000000# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the DataLad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Python package for functionality needed at package 'build' time by DataLad and its extensions __init__ here should be really minimalistic, not import submodules by default and submodules should also not require heavy dependencies. """ __version__ = '0.1' datalad-next-1.4.1/_datalad_buildsupport/formatters.py000066400000000000000000000247131462321624600232050ustar00rootroot00000000000000# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the DataLad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## import argparse import datetime import re class ManPageFormatter(argparse.HelpFormatter): # This code was originally distributed # under the same License of Python # Copyright (c) 2014 Oz Nahum Tiram def __init__(self, prog, indent_increment=2, max_help_position=4, width=1000000, section=1, ext_sections=None, authors=None, version=None ): super(ManPageFormatter, self).__init__( prog, indent_increment=indent_increment, max_help_position=max_help_position, width=width) self._prog = prog self._section = 1 self._today = datetime.date.today().strftime('%Y\\-%m\\-%d') self._ext_sections = ext_sections self._version = version def _get_formatter(self, **kwargs): return self.formatter_class(prog=self.prog, **kwargs) def _markup(self, txt): return txt.replace('-', '\\-') def _underline(self, string): return "\\fI\\s-1" + string + "\\s0\\fR" def _bold(self, string): if not string.strip().startswith('\\fB'): string = '\\fB' + string if not string.strip().endswith('\\fR'): string = string + '\\fR' return string def _mk_synopsis(self, parser): self.add_usage(parser.usage, parser._actions, parser._mutually_exclusive_groups, prefix='') usage = self._format_usage(None, parser._actions, parser._mutually_exclusive_groups, '') # replace too long list of commands with a single placeholder usage = re.sub(r'{[^]]*?create,.*?}', ' COMMAND ', usage, flags=re.MULTILINE) # take care of proper wrapping usage = re.sub(r'\[([-a-zA-Z0-9]*)\s([a-zA-Z0-9{}|_]*)\]', r'[\1\~\2]', usage) usage = usage.replace('%s ' % self._prog, '') usage = '.SH SYNOPSIS\n.nh\n.HP\n\\fB%s\\fR %s\n.hy\n' % (self._markup(self._prog), usage) return usage def _mk_title(self, prog): name_version = "{0} {1}".format(prog, self._version) return '.TH "{0}" "{1}" "{2}" "{3}"\n'.format( prog, self._section, self._today, name_version) def _mk_name(self, prog, desc): """ this method is in consistent with others ... 
it relies on distribution """ desc = desc.splitlines()[0] if desc else 'it is in the name' # ensure starting lower case desc = desc[0].lower() + desc[1:] return '.SH NAME\n%s \\- %s\n' % (self._bold(prog), desc) def _mk_description(self, parser): desc = parser.description desc = '\n'.join(desc.splitlines()[1:]) if not desc: return '' desc = desc.replace('\n\n', '\n.PP\n') # sub-section headings desc = re.sub(r'^\*(.*)\*$', r'.SS \1', desc, flags=re.MULTILINE) # italic commands desc = re.sub(r'^ ([-a-z]*)$', r'.TP\n\\fI\1\\fR', desc, flags=re.MULTILINE) # deindent body text, leave to troff viewer desc = re.sub(r'^ (\S.*)\n', '\\1\n', desc, flags=re.MULTILINE) # format NOTEs as indented paragraphs desc = re.sub(r'^NOTE\n', '.TP\nNOTE\n', desc, flags=re.MULTILINE) # deindent indented paragraphs after heading setup desc = re.sub(r'^ (.*)$', '\\1', desc, flags=re.MULTILINE) return '.SH DESCRIPTION\n%s\n' % self._markup(desc) def _mk_footer(self, sections): if not hasattr(sections, '__iter__'): return '' footer = [] for section, value in sections.items(): part = ".SH {}\n {}".format(section.upper(), value) footer.append(part) return '\n'.join(footer) def format_man_page(self, parser): page = [] page.append(self._mk_title(self._prog)) page.append(self._mk_name(self._prog, parser.description)) page.append(self._mk_synopsis(parser)) page.append(self._mk_description(parser)) page.append(self._mk_options(parser)) page.append(self._mk_footer(self._ext_sections)) return ''.join(page) def _mk_options(self, parser): formatter = parser._get_formatter() # positionals, optionals and user-defined groups for action_group in parser._action_groups: formatter.start_section(None) formatter.add_text(None) formatter.add_arguments(action_group._group_actions) formatter.end_section() # epilog formatter.add_text(parser.epilog) # determine help from format above help = formatter.format_help() # add spaces after comma delimiters for easier reformatting help = re.sub(r'([a-z]),([a-z])', '\\1, \\2', help) # get proper indentation for argument items help = re.sub(r'^ (\S.*)\n', '.TP\n\\1\n', help, flags=re.MULTILINE) # deindent body text, leave to troff viewer help = re.sub(r'^ (\S.*)\n', '\\1\n', help, flags=re.MULTILINE) return '.SH OPTIONS\n' + help def _format_action_invocation(self, action, doubledash='--'): if not action.option_strings: metavar, = self._metavar_formatter(action, action.dest)(1) return metavar else: parts = [] # if the Optional doesn't take a value, format is: # -s, --long if action.nargs == 0: parts.extend([self._bold(action_str) for action_str in action.option_strings]) # if the Optional takes a value, format is: # -s ARGS, --long ARGS else: default = self._underline(action.dest.upper()) args_string = self._format_args(action, default) for option_string in action.option_strings: parts.append('%s %s' % (self._bold(option_string), args_string)) return ', '.join(p.replace('--', doubledash) for p in parts) class RSTManPageFormatter(ManPageFormatter): def _get_formatter(self, **kwargs): return self.formatter_class(prog=self.prog, **kwargs) def _markup(self, txt): # put general tune-ups here return txt def _underline(self, string): return "*{0}*".format(string) def _bold(self, string): return "**{0}**".format(string) def _mk_synopsis(self, parser): self.add_usage(parser.usage, parser._actions, parser._mutually_exclusive_groups, prefix='') usage = self._format_usage(None, parser._actions, parser._mutually_exclusive_groups, '') usage = usage.replace('%s ' % self._prog, '') usage = 
'Synopsis\n--------\n::\n\n %s %s\n' \ % (self._markup(self._prog), usage) return usage def _mk_title(self, prog): # and an easy to use reference point title = ".. _man_%s:\n\n" % prog.replace(' ', '-') title += "{0}".format(prog) title += '\n{0}\n\n'.format('=' * len(prog)) return title def _mk_name(self, prog, desc): return '' def _mk_description(self, parser): desc = parser.description if not desc: return '' return 'Description\n-----------\n%s\n' % self._markup(desc) def _mk_footer(self, sections): if not hasattr(sections, '__iter__'): return '' footer = [] for section, value in sections.items(): part = "\n{0}\n{1}\n{2}\n".format( section, '-' * len(section), value) footer.append(part) return '\n'.join(footer) def _mk_options(self, parser): # this non-obvious maneuver is really necessary! formatter = self.__class__(self._prog) # positionals, optionals and user-defined groups for action_group in parser._action_groups: formatter.start_section(None) formatter.add_text(None) formatter.add_arguments(action_group._group_actions) formatter.end_section() # epilog formatter.add_text(parser.epilog) # determine help from format above option_sec = formatter.format_help() return '\n\nOptions\n-------\n{0}'.format(option_sec) def _format_action(self, action): # determine the required width and the entry label action_header = self._format_action_invocation(action) if action.help: help_text = self._expand_help(action) help_lines = self._split_lines(help_text, 80) help = ' '.join(help_lines) else: help = '' # return a single string return '{0}\n{1}\n{2}\n\n'.format( action_header, '~' * len(action_header), help) def cmdline_example_to_rst(src, out=None, ref=None): if out is None: from io import StringIO out = StringIO() # place header out.write('.. AUTO-GENERATED FILE -- DO NOT EDIT!\n\n') if ref: # place cross-ref target out.write('.. {0}:\n\n'.format(ref)) # parser status vars inexample = False incodeblock = False for line in src: if line.startswith('#% EXAMPLE START'): inexample = True incodeblock = False continue if not inexample: continue if line.startswith('#% EXAMPLE END'): break if not inexample: continue if line.startswith('#%'): incodeblock = not incodeblock if incodeblock: out.write('\n.. code-block:: sh\n\n') continue if not incodeblock and line.startswith('#'): out.write(line[(min(2, len(line) - 1)):]) continue if incodeblock: if not line.rstrip().endswith('#% SKIP'): out.write(' %s' % line) continue if not len(line.strip()): continue else: raise RuntimeError("this should not happen") return out datalad-next-1.4.1/_datalad_buildsupport/setup.py000066400000000000000000000205101462321624600221460ustar00rootroot00000000000000# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the DataLad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## import datetime import os from os.path import ( dirname, join as opj, ) from setuptools import Command, DistutilsOptionError from setuptools.config import read_configuration import versioneer from . import formatters as fmt class BuildManPage(Command): # The BuildManPage code was originally distributed # under the same License of Python # Copyright (c) 2014 Oz Nahum Tiram description = 'Generate man page from an ArgumentParser instance.' 
user_options = [ ('manpath=', None, 'output path for manpages (relative paths are relative to the ' 'datalad package)'), ('rstpath=', None, 'output path for RST files (relative paths are relative to the ' 'datalad package)'), ('parser=', None, 'module path to an ArgumentParser instance' '(e.g. mymod:func, where func is a method or function which return' 'a dict with one or more arparse.ArgumentParser instances.'), ('cmdsuite=', None, 'module path to an extension command suite ' '(e.g. mymod:command_suite) to limit the build to the contained ' 'commands.'), ] def initialize_options(self): self.manpath = opj('build', 'man') self.rstpath = opj('docs', 'source', 'generated', 'man') self.parser = 'datalad.cmdline.main:setup_parser' self.cmdsuite = None def finalize_options(self): if self.manpath is None: raise DistutilsOptionError('\'manpath\' option is required') if self.rstpath is None: raise DistutilsOptionError('\'rstpath\' option is required') if self.parser is None: raise DistutilsOptionError('\'parser\' option is required') mod_name, func_name = self.parser.split(':') fromlist = mod_name.split('.') try: mod = __import__(mod_name, fromlist=fromlist) self._parser = getattr(mod, func_name)( ['datalad'], formatter_class=fmt.ManPageFormatter, return_subparsers=True, # ignore extensions only for the main package to avoid pollution # with all extension commands that happen to be installed help_ignore_extensions=self.distribution.get_name() == 'datalad') except ImportError as err: raise err if self.cmdsuite: mod_name, suite_name = self.cmdsuite.split(':') mod = __import__(mod_name, fromlist=mod_name.split('.')) suite = getattr(mod, suite_name) self.cmdlist = [c[2] if len(c) > 2 else c[1].replace('_', '-').lower() for c in suite[1]] self.announce('Writing man page(s) to %s' % self.manpath) self._today = datetime.date.today() @classmethod def handle_module(cls, mod_name, **kwargs): """Module specific handling. This particular one does 1. Memorize (at class level) the module name of interest here 2. Check if 'datalad.extensions' are specified for the module, and then analyzes them to obtain command names it provides If cmdline commands are found, its entries are to be used instead of the ones in datalad's _parser. 
Parameters ---------- **kwargs: all the kwargs which might be provided to setuptools.setup """ cls.mod_name = mod_name exts = kwargs.get('entry_points', {}).get('datalad.extensions', []) for ext in exts: assert '=' in ext # should be label=module:obj ext_label, mod_obj = ext.split('=', 1) assert ':' in mod_obj # should be module:obj mod, obj = mod_obj.split(':', 1) assert mod_name == mod # AFAIK should be identical mod = __import__(mod_name) if hasattr(mod, obj): command_suite = getattr(mod, obj) assert len(command_suite) == 2 # as far as I see it if not hasattr(cls, 'cmdline_names'): cls.cmdline_names = [] cls.cmdline_names += [ cmd for _, _, cmd, _ in command_suite[1] ] def run(self): dist = self.distribution #homepage = dist.get_url() #appname = self._parser.prog appname = 'datalad' cfg = read_configuration( opj(dirname(dirname(__file__)), 'setup.cfg'))['metadata'] sections = { 'Authors': """{0} is developed by {1} <{2}>.""".format( appname, cfg['author'], cfg['author_email']), } for cls, opath, ext in ((fmt.ManPageFormatter, self.manpath, '1'), (fmt.RSTManPageFormatter, self.rstpath, 'rst')): if not os.path.exists(opath): os.makedirs(opath) for cmdname in getattr(self, 'cmdline_names', list(self._parser)): if hasattr(self, 'cmdlist') and cmdname not in self.cmdlist: continue p = self._parser[cmdname] cmdname = "{0}{1}".format( 'datalad ' if cmdname != 'datalad' else '', cmdname) format = cls( cmdname, ext_sections=sections, version=versioneer.get_version()) formatted = format.format_man_page(p) with open(opj(opath, '{0}.{1}'.format( cmdname.replace(' ', '-'), ext)), 'w') as f: f.write(formatted) class BuildConfigInfo(Command): description = 'Generate RST documentation for all config items.' user_options = [ ('rstpath=', None, 'output path for RST file'), ] def initialize_options(self): self.rstpath = opj('docs', 'source', 'generated', 'cfginfo') def finalize_options(self): if self.rstpath is None: raise DistutilsOptionError('\'rstpath\' option is required') self.announce('Generating configuration documentation') def run(self): opath = self.rstpath if not os.path.exists(opath): os.makedirs(opath) from datalad.interface.common_cfg import definitions as cfgdefs from datalad.dochelpers import _indent categories = { 'global': {}, 'local': {}, 'dataset': {}, 'misc': {} } for term, v in cfgdefs.items(): categories[v.get('destination', 'misc')][term] = v for cat in categories: with open(opj(opath, '{}.rst.in'.format(cat)), 'w') as rst: rst.write('.. 
glossary::\n') for term, v in sorted(categories[cat].items(), key=lambda x: x[0]): rst.write(_indent(term, '\n ')) qtype, docs = v.get('ui', (None, {})) desc_tmpl = '\n' if 'title' in docs: desc_tmpl += '{title}:\n' if 'text' in docs: desc_tmpl += '{text}\n' if 'default' in v: default = v['default'] if hasattr(default, 'replace'): # protect against leaking specific home dirs v['default'] = default.replace(os.path.expanduser('~'), '~') desc_tmpl += 'Default: {default}\n' if 'type' in v: type_ = v['type'] if hasattr(type_, 'long_description'): type_ = type_.long_description() else: type_ = type_.__name__ desc_tmpl += '\n[{type}]\n' v['type'] = type_ if desc_tmpl == '\n': # we need something to avoid joining terms desc_tmpl += 'undocumented\n' v.update(docs) rst.write(_indent(desc_tmpl.format(**v), ' ')) datalad-next-1.4.1/changelog.d/000077500000000000000000000000001462321624600162425ustar00rootroot00000000000000datalad-next-1.4.1/changelog.d/README000066400000000000000000000003521462321624600171220ustar00rootroot00000000000000This directory contains changelog items for the next release. The easiest way to add new change log items is to run `scriv create`, and edit and commit the generated template. `scriv` is available at https://pypi.org/project/scriv/ datalad-next-1.4.1/changelog.d/scriv.ini000066400000000000000000000004701462321624600200720ustar00rootroot00000000000000[scriv] fragment_directory = changelog.d entry_title_template = file: templates/entry_title.md.j2 new_fragment_template = file: templates/new_fragment.md.j2 format = md categories = 🐛 Bug Fixes, 💫 Enhancements and new features, 🪓 Deprecations and removals, 📝 Documentation, 🏠 Internal, 🛡 Tests datalad-next-1.4.1/changelog.d/templates/000077500000000000000000000000001462321624600202405ustar00rootroot00000000000000datalad-next-1.4.1/changelog.d/templates/entry_title.md.j2000066400000000000000000000001121462321624600234300ustar00rootroot00000000000000{{ version if version else "VERSION" }} ({{ date.strftime('%Y-%m-%d') }}) datalad-next-1.4.1/changelog.d/templates/new_fragment.md.j2000066400000000000000000000005351462321624600235530ustar00rootroot00000000000000 {% for cat in config.categories -%} {% endfor -%} datalad-next-1.4.1/datalad_next/000077500000000000000000000000001462321624600165215ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/__init__.py000066400000000000000000000076741462321624600206500ustar00rootroot00000000000000"""DataLad NEXT extension""" __docformat__ = 'restructuredtext' import logging lgr = logging.getLogger('datalad.next') # Defines a datalad command suite. 
# This variable must be bound as a setuptools entrypoint # to be found by datalad command_suite = ( # description of the command suite, displayed in cmdline help "What is next in DataLad", [ # specification of a command, any number of commands can be defined ( # importable module that contains the command implementation 'datalad_next.commands.credentials', # name of the command class implementation in above module 'Credentials', ), ( # importable module that contains the command implementation 'datalad_next.commands.create_sibling_webdav', # name of the command class implementation in above module 'CreateSiblingWebDAV', # we gotta make this explicit, or the build_support code will # not pick it up, due to the dashes in the name 'create-sibling-webdav', ), ( # importable module that contains the command implementation 'datalad_next.commands.tree', # name of the command class implementation in above module 'TreeCommand', # command name (differs from lowercase command class name) 'tree' ), ( 'datalad_next.commands.download', 'Download', 'download', ), ( 'datalad_next.commands.ls_file_collection', 'LsFileCollection', 'ls-file-collection', ), ( 'datalad_next.commands.status', 'Status', 'next-status', 'next_status', ), ] ) # patch datalad-core import datalad_next.patches.enabled # register additional configuration items in datalad-core from datalad.support.extensions import register_config from datalad_next.constraints import ( EnsureBool, EnsureChoice, ) register_config( 'datalad.credentials.repeat-secret-entry', 'Require entering secrets twice for interactive specification?', type=EnsureBool(), default=True, dialog='yesno') register_config( 'datalad.credentials.hidden-secret-entry', 'Hide secret in interactive entry?', type=EnsureBool(), default=True, dialog='yesno') register_config( 'datalad.clone.url-substitute.webdav', 'webdav(s):// clone URL substitution', description="Convenience conversion of custom WebDAV URLs to " "git-cloneable 'datalad-annex::'-type URLs. The 'webdav://' " "prefix implies a remote sibling in 'filetree' or 'export' mode " "See https://docs.datalad.org/design/url_substitution.html for details", dialog='question', scope='global', default=( r',^webdav([s]*)://([^?]+)$,datalad-annex::http\1://\2?type=webdav&encryption=none&exporttree=yes&url={noquery}', ), ) register_config( 'datalad.runtime.parameter-violation', 'Perform exhaustive command parameter validation, or fail on first error?', type=EnsureChoice('raise-early', 'raise-at-end'), default='raise-early', dialog='question', ) register_config( 'datalad.archivist.legacy-mode', 'Fall back on legacy ``datalad-archives`` special remote implementation?', description='If enabled, all `archivist` special remote operations ' 'fall back onto the legacy ``datalad-archives`` special remote ' 'implementation. This mode is only provided for backward-compatibility. ' 'This legacy implementation unconditionally downloads archive files ' 'completely, and keeps an internal cache of the full extracted archive ' 'around. The implied 200% storage cost overhead for obtaining a complete ' 'dataset can be prohibitive for datasets tracking large amount of data ' '(in archive files).', type=EnsureBool(), default=False, dialog='yesno', ) from . 
import _version __version__ = _version.get_versions()['version'] datalad-next-1.4.1/datalad_next/_version.py000066400000000000000000000577551462321624600207420ustar00rootroot00000000000000 # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. # Generated by versioneer-0.29 # https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" import errno import os import re import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Tuple import functools def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). git_refnames = " (grafted, HEAD, tag: 1.4.1)" git_full = "9bbe486cec5ae83d2eae2c343a984df321ee3647" git_date = "2024-05-22 00:13:58 +0200" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" VCS: str style: str tag_prefix: str parentdir_prefix: str versionfile_source: str verbose: bool def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "pep440" cfg.tag_prefix = "" cfg.parentdir_prefix = "" cfg.versionfile_source = "datalad_next/_version.py" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command( commands: List[str], args: List[str], cwd: Optional[str] = None, verbose: bool = False, hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs: Dict[str, Any] = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs) break except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = 
process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, process.returncode return stdout, process.returncode def versions_from_parentdir( parentdir_prefix: str, root: str, verbose: bool, ) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords( keywords: Dict[str, str], tag_prefix: str, verbose: bool, ) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. 
The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r'\d', r): continue if verbose: print("picking %s" % r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner(GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*" ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. 
branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. Good or bad. branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%s'" % describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards (a feature branch will appear "older" than the master branch). Exceptions: 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the post-release version number (or -1 if no post-release segment is present). """ vc = str.split(ver, ".post") return vc[0], int(vc[1] or 0) if len(vc) == 2 else None def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 0.post0.devDISTANCE """ if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: # no commits, use the tag as the version rendered = pieces["closest-tag"] else: # exception #1 rendered = "0.post0.dev%d" % pieces["distance"] return rendered def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%s" % pieces["short"] else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%s" % pieces["short"] return rendered def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. Exceptions: 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%s" % pieces["short"] if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += "+g%s" % pieces["short"] if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. 
Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-branch": rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-post-branch": rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%s'" % style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
for _ in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} datalad-next-1.4.1/datalad_next/annexbackends/000077500000000000000000000000001462321624600213255ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/annexbackends/__init__.py000066400000000000000000000000001462321624600234240ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/annexbackends/base.py000066400000000000000000000242071462321624600226160ustar00rootroot00000000000000# Helper module to develop git-annex backends # # https://git-annex.branchable.com/design/external_backend_protocol/ # # Derived from AnnexRemote Copyright (C) 2017 Silvio Ankermann (GPL-3) """Interface and essential utilities to implement external git-annex backends """ import logging from abc import ( ABCMeta, abstractmethod, ) import sys import traceback class Backend(metaclass=ABCMeta): """Metaclass for backends. It implements the communication with git-annex via the external backend protocol. More information on the protocol is available at https://git-annex.branchable.com/design/external_backend_protocol/ External backends can be built by implementing the abstract methods defined in this class. Attributes ---------- annex : Master The Master object to which this backend is linked. Master acts as an abstraction layer for git-annex. """ def __init__(self, annex): self.annex = annex @abstractmethod def can_verify(self): """Returns whether the backend can verify the content of files match a key it generated. The verification does not need to be cryptographically secure, but should catch data corruption. Returns ------- bool """ @abstractmethod def is_stable(self): """Returns whether a key it has generated will always have the same content. The answer to this is almost always yes; URL keys are an example of a type of key that may have different content at different times. Returns ------- bool """ @abstractmethod def is_cryptographically_secure(self): """ Returns whether keys it generates are verified using a cryptographically secure hash. Note that sha1 is not a cryptographically secure hash any longer. A program can change its answer to this question as the state of the art advances, and should aim to stay ahead of the state of the art by a reasonable amount of time. Returns ------- bool """ @abstractmethod def gen_key(self, local_file): """Examine the content of `local_file` and from it generate a key. While it is doing this, it can send any number of PROGRESS messages indication the position in the file that it's gotten to. Parameters ---------- local_file: str Path for which to generate a key. Note that in some cases, local_file may contain whitespace. Returns ------- str The generated key. Raises ------ BackendError If the file could not be received from the backend. 
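        Example
        -------
        A minimal sketch of a hypothetical implementation (the backend and
        key names are illustrative only) that derives a key from the file
        size alone; a real backend would typically hash the content and
        report progress via PROGRESS messages while reading it::

            import os

            def gen_key(self, local_file):
                # keys follow a <BACKENDNAME>--<name> pattern, analog to
                # the 'XDLRA--refs' key used by the XDLRA backend below
                return 'MYBACKEND--%d' % os.path.getsize(local_file)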
""" @abstractmethod def verify_content(self, key, content_file): """Examine a file and verify it has the content expected given a key While it is doing this, it can send any number of PROGRESS messages indicating the position in the file that it's gotten to. If `can_verify() == False`, git-annex not ask to do this. Returns ------- bool """ def error(self, error_msg): """Communicate a generic error. Can be sent at any time if things get too messed up to continue. If the program receives an error() from git-annex, it can exit with its own error(). Eg.: self.annex.error("Error received. Exiting.") raise SystemExit Parameters ---------- error_msg : str The error message received from git-annex """ self.annex.error("Error received. Exiting.") raise SystemExit # Exceptions class AnnexError(Exception): """ Common base class for all annexbackend exceptions. """ class ProtocolError(AnnexError): """ Base class for protocol errors """ class UnsupportedRequest(ProtocolError): """ Must be raised when an optional request is not supported by the backend. """ class UnexpectedMessage(ProtocolError): """ Raised when git-annex sends a message which is not expected at the moment """ class BackendError(AnnexError): """ Must be raised by the backend when a request did not succeed. """ class NotLinkedError(AnnexError): """ Will be raised when a Master instance is accessed without being linked to a Backend instance """ class Protocol(object): """ Helper class handling the receiving part of the protocol (git-annex to backend) It parses the requests coming from git-annex and calls the respective method of the backend object. """ def __init__(self, backend): self.backend = backend self.version = "VERSION 1" def command(self, line): line = line.strip() if not line: raise ProtocolError("Got empty line") parts = line.split(" ", 1) method = self.lookupMethod(parts[0]) if method is None: raise UnsupportedRequest(f'Unknown request {line!r}') try: if len(parts) == 1: reply = method() else: reply = method(parts[1]) except TypeError as e: raise SyntaxError(e) else: return reply def lookupMethod(self, command): return getattr(self, 'do_' + command.upper(), None) def do_GETVERSION(self): return self.version def do_CANVERIFY(self): return 'CANVERIFY-YES' if self.backend.can_verify() else 'CANVERIFY-NO' def do_ISSTABLE(self): return 'ISSTABLE-YES' if self.backend.is_stable() else 'ISSTABLE-NO' def do_ISCRYPTOGRAPHICALLYSECURE(self): return 'ISCRYPTOGRAPHICALLYSECURE-YES' \ if self.backend.is_cryptographically_secure() \ else 'ISCRYPTOGRAPHICALLYSECURE-NO' def do_GENKEY(self, *arg): try: key = self.backend.gen_key(arg[0]) return f'GENKEY-SUCCESS {key}' except BackendError as e: return f'GENKEY-FAILURE {str(e)}' def do_VERIFYKEYCONTENT(self, *arg): try: success = self.backend.verify_content(*arg[0].split(" ", 1)) except BackendError: success = False return 'VERIFYKEYCONTENT-SUCCESS' if success \ else 'VERIFYKEYCONTENT-FAILURE' def do_ERROR(self, message): self.backend.error(message) class Master(object): """ Metaclass for backends. Attributes ---------- input : io.TextIOBase Where to listen for git-annex request messages. Default: sys.stdin output : io.TextIOBase Where to send replies and backend messages Default: sys.stdout backend : Backend A class implementing the Backend interface to which this master is linked. """ def __init__(self, output=sys.stdout): """ Initialize the Master with an output. 
Parameters ---------- output : io.TextIOBase Where to send replies and backend messages Default: sys.stdout """ self.output = output def LinkBackend(self, backend): """ Link the Master to a backend. This must be done before calling Listen() Parameters ---------- backend : Backend A class implementing Backend interface to which this master will be linked. """ self.backend = backend self.protocol = Protocol(backend) def Listen(self, input=sys.stdin): """ Listen on `input` for messages from git annex. Parameters ---------- input : io.TextIOBase Where to listen for git-annex request messages. Default: sys.stdin Raises ---------- NotLinkedError If there is no backend linked to this master. """ if not (hasattr(self, 'backend') and hasattr(self, 'protocol')): raise NotLinkedError("Please execute LinkBackend(backend) first.") self.input = input while True: # due to a bug in python 2 we can't use an iterator here: https://bugs.python.org/issue1633941 line = self.input.readline() if not line: break line = line.rstrip() try: reply = self.protocol.command(line) if reply: self._send(reply) except UnsupportedRequest as e: self.debug(str(e)) self._send("UNSUPPORTED-REQUEST") except Exception as e: for line in traceback.format_exc().splitlines(): self.debug(line) self.error(e) raise SystemExit def debug(self, *args): """ Tells git-annex to display the message if --debug is enabled. Parameters ---------- message : str The message to be displayed to the user """ self._send("DEBUG", *args) def error(self, *args): """ Generic error. Can be sent at any time if things get too messed up to continue. When possible, raise a BackendError inside the respective functions. The backend program should exit after sending this, as git-annex will not talk to it any further. Parameters ---------- error_msg : str The error message to be sent to git-annex """ self._send("ERROR", *args) def progress(self, progress): """ Indicates the current progress of the transfer (in bytes). May be repeated any number of times during the transfer process, but it's wasteful to update the progress until at least another 1% of the file has been sent. This is highly recommended for ``*_store()``. (It is optional but good for ``*_retrieve()``.) Parameters ---------- progress : int The current progress of the transfer in bytes. 
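        Example
        -------
        A hypothetical sketch of a backend reporting progress while reading
        a file (``local_file`` stands for the path handed to the backend,
        and the 1 MiB chunk size is an arbitrary choice; ``self.annex`` is
        the linked ``Master`` instance)::

            done = 0
            with open(local_file, 'rb') as f:
                for chunk in iter(lambda: f.read(1024 * 1024), b''):
                    done += len(chunk)
                    self.annex.progress(done)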
""" self._send("PROGRESS {progress}".format(progress=int(progress))) def _send(self, *args, **kwargs): print(*args, file=self.output, **kwargs) self.output.flush() datalad-next-1.4.1/datalad_next/annexbackends/tests/000077500000000000000000000000001462321624600224675ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/annexbackends/tests/__init__.py000066400000000000000000000000001462321624600245660ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/annexbackends/tests/test_base.py000066400000000000000000000036721462321624600250220ustar00rootroot00000000000000import logging import io from datalad_next.tests import ( assert_raises, eq_, ) from datalad_next.utils import swallow_outputs from ..base import ( BackendError, Master, NotLinkedError, Protocol, ProtocolError, UnsupportedRequest, ) class FakeBackend(object): def error(self, message): raise ValueError(message) def gen_key(self, val): raise BackendError('not worky-worky') def verify_content(self, f, key): raise BackendError('not worky-worky') def can_verify(self): raise RuntimeError('intentional blow') def test_protocol(): """Test essential protocol (error) behavior""" p = Protocol(FakeBackend()) # no empty lines assert_raises(ProtocolError, p.command, '') # version check eq_(p.command('GETVERSION'), 'VERSION 1') # unknown command assert_raises(UnsupportedRequest, p.command, 'GOTNOCLUE') # pass arg to command method with assert_raises(ValueError) as ve: p.command('ERROR my message') eq_(str(ve.exception), 'my message') # backend failure is communicated for key generation eq_(p.command('GENKEY for-some-file'), 'GENKEY-FAILURE not worky-worky') # and for key verification eq_(p.command('VERIFYKEYCONTENT for-some-file mykey'), 'VERIFYKEYCONTENT-FAILURE') def test_master(): master = Master() assert_raises(NotLinkedError, master.Listen) master.LinkBackend(FakeBackend()) with swallow_outputs() as cmo: master.Listen(io.StringIO('GETVERSION')) eq_(cmo.out, 'VERSION 1\n') with swallow_outputs() as cmo: master.Listen(io.StringIO('FUNKY')) # comes with a DEBUG message showing exactly what went wrong eq_(cmo.out, "DEBUG Unknown request 'FUNKY'\nUNSUPPORTED-REQUEST\n") assert_raises(SystemExit, master.Listen, io.StringIO('CANVERIFY')) with swallow_outputs() as cmo: master.progress(15) eq_(cmo.out, 'PROGRESS 15\n') datalad-next-1.4.1/datalad_next/annexbackends/xdlra.py000066400000000000000000000045751462321624600230240ustar00rootroot00000000000000"""git-annex external backend XDLRA for git-remote-datalad-annex""" from pathlib import Path import zipfile from .base import ( Backend, BackendError, Master, ) class DataladRepoAnnexBackend(Backend): """Implementation of an external git-annex backend This backend is tightly coupled to the `git-remote-datalad-annex` and hardly of any general utility. It is essentially aiming to be the leanest possible implementation to get git-annex to transport the content of two distinct files to and from a special remote. This backend is unlike most backends, because there is no fixed association of a particular file content to a particular key. In other words, the key content is expected to change without any change in the key name. Only two keys are supported: - ``XDLRA--refs`` - ``XDLRA--repo-export`` ``XDLRA--refs`` contains a "refs" list of a Git repository, similar to the output of ``git for-each-ref``. ``XDLRA--repo-export`` hold a ZIP archive of a bare Git repository. 
""" def can_verify(self): # we can verify that a key matches the type of content # this is basically no more than a sanity check that a # download yielded something that we can work with for # downstream clone processing return True def is_stable(self): # the content behind a key is not always the same # in fact, it is typically different each time return False def is_cryptographically_secure(self): # we are not using any hashes return False def gen_key(self, local_file): localfile = Path(local_file) if _is_component_repoexport(localfile): return "XDLRA--repo-export" elif _is_component_refs(localfile): return "XDLRA--refs" else: # local_file is a TMP location, no use in reporting it raise BackendError('Unrecognized repository clone component') def verify_content(self, key, content_file): return self.gen_key(content_file) == key def _is_component_refs(path): return path.read_text().endswith(' HEAD\n') def _is_component_repoexport(path): return zipfile.is_zipfile(path) def main(): """Entry point for the backend utility""" master = Master() backend = DataladRepoAnnexBackend(master) master.LinkBackend(backend) master.Listen() datalad-next-1.4.1/datalad_next/annexremotes/000077500000000000000000000000001462321624600212315ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/annexremotes/__init__.py000066400000000000000000000070671462321624600233540ustar00rootroot00000000000000from __future__ import annotations # import all the pieces one would need for an implementation # in a single place from annexremote import UnsupportedRequest from typing import Any from datalad.customremotes import ( # this is an enhanced RemoteError that self-documents its cause RemoteError, SpecialRemote as _SpecialRemote, ) from datalad.customremotes.main import main as super_main from datalad_next.datasets import LeanAnnexRepo class SpecialRemote(_SpecialRemote): """Base class of all datalad-next git-annex special remotes""" def __init__(self, annex): super(SpecialRemote, self).__init__(annex=annex) self._repo = None self._remotename = None @property def repo(self) -> LeanAnnexRepo: """Returns a representation of the underlying git-annex repository An instance of :class:`~datalad_next.datasets.LeanAnnexRepo` is returned, which intentionally provides a restricted API only. In order to limit further proliferation of the ``AnnexRepo`` API. """ if self._repo is None: self._repo = LeanAnnexRepo(self.annex.getgitdir()) return self._repo @property def remotename(self) -> str: """Name of the (git) remote the special remote is operating under""" if self._remotename is None: self._remotename = self.annex.getgitremotename() return self._remotename def get_remote_gitcfg( self, remotetypename: str, name: str, default: Any | None = None, **kwargs ): """Get a particular Git configuration item for the special remote This target configuration here is *not* the git-annex native special remote configuration that is provided or altered with ``initremote`` and ``enableremote``, and is committed to the ``git-annex`` branch. Instead this is a clone and remote specific configuration, declared in Git's configuration system. The configuration items queried have the naming scheme:: remote..- datalad.. where ```` is the name of the Git remote, the special remote is operating under, ```` is the name of the special remote implementation (e.g., ``uncurl``), and ```` is the name of a particular configuration flavor. Parameters ---------- remotetypename: str Name of the special remote implementation configuration is requested for. 
name: str The name of the "naked" configuration item, without any sub/sections. Must be a valid git-config variable name, i.e., case-insensitive, only alphanumeric characters and -, and must start with an alphabetic character. default: A default value to be returned if there is no configuration. **kwargs: Passed on to :func:`datalad_next.config.ConfigManager.get()` Returns ------- Any If a remote-specific configuration exists, it is reported. Otherwise a remote-type specific configuration is reported, or the default provided with the method call, if no configuration is found at all. """ cfgget = self.repo.config.get return cfgget( f'remote.{self.remotename}.{remotetypename}-{name}', default=cfgget( f'datalad.{remotetypename}.{name}', default=default, ) ) datalad-next-1.4.1/datalad_next/annexremotes/archivist.py000066400000000000000000000553651462321624600236150ustar00rootroot00000000000000"""git-annex special remote *archivist* for obtaining files from archives """ from __future__ import annotations # General TODO for future improvements # # `datalad.archivist.archive-cache-mode=` # Choice of archive (access) caching behavior. ```` can be any of # # ``persistent-whole`` # This causes an archive to be downloaded completely on first access to any # archive member. A regular ``annex get`` is performed and an archive is # placed at its standard location in the local annex. Any archive member # will be extracted from this local copy. # # Some ideas on optional additional cache modes related to dropping as much as # possible after the special remote is done. However, these modes also come # with potential issues re parallel access (what if another remote process # is still using a particular archive... Think about that when there is a # real need # # ``keep-downloads`` # No caching will be performed per se. However, when archive member access # happens to require a full archive download, a downloaded archive will # not be removed after member extraction. In such cases, this mode will # behave like ``persistent-whole``. # # ``none`` # This is behaving like ``keep-downloads``, but any downloaded archive # will be dropped again after extraction is complete. from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path from shutil import copyfileobj from typing import ( Dict, Generator, List, Tuple, ) from datalad_next.archive_operations import ArchiveOperations # we intentionally limit ourselves to the most basic interface # and even that we only need to get a `ConfigManager` instance. # If that class would support a plain path argument, we could # avoid it entirely from datalad_next.datasets import LegacyAnnexRepo from datalad_next.exceptions import CommandError from datalad_next.types import ( AnnexKey, ArchivistLocator, ArchiveType, ) from . import ( RemoteError, SpecialRemote, UnsupportedRequest, super_main ) class ArchivistRemote(SpecialRemote): """git-annex special remote *archivist* for obtaining files from archives Successor of the `datalad-archive` special remote. It claims and acts on particular archive locator "URLs", registered for individual annex keys (see :class:`datalad_next.types.archivist.ArchivistLocator`). These locators identify another annex key that represents an archive (e.g., a tarball or a zip files) that contains the respective annex key as a member. This special remote trigger the extraction of such members from any candidate archive when retrieval of a key is requested. This special remote cannot store or remove content. 
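    As an illustration, such a locator has roughly the following shape (the
    archive key and member path are made up; see
    :class:`~datalad_next.types.archivist.ArchivistLocator` for the
    authoritative syntax)::

        dl+archive:MD5E-s12345--0123456789abcdef0123456789abcdef.tar.gz#path=dir/file.dat&size=234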
The desired usage is to register a locator "URL" for any relevant key via ``git annex addurl|registerurl`` or ``datalad addurls``. Configuration ------------- The behavior of this special remote can be tuned via a number of configuration settings. `datalad.archivist.legacy-mode=yes|[no]` If enabled, all special remote operations fall back onto the legacy ``datalad-archives`` special remote implementation. This mode is only provided for backward-compatibility. This legacy implementation unconditionally downloads archive files completely, and keeps an internal cache of the full extracted archive around. The implied 200% (or more) storage cost overhead for obtaining a complete dataset can be prohibitive for datasets tracking large amount of data (in archive files). Implementation details ---------------------- *CHECKPRESENT* When performing a non-download test for the (continued) presence of an annex key (as triggered via ``git annex fsck --fast`` or ``git annex checkpresentkey``), the underlying archive containing a key will NOT be inspected. Instead, only the continued availability of the annex key for the containing archive will be tested. In other words: this implementation trust the archive member annotation to be correct/valid, and it also trusts the archive content to be unchanged. The latter will be generally the case, but may no with URL-style keys. Not implementing such a trust-approach *would* have a number of consequences. Depending on where the archive is located (local/remote) and what format it is (fsspec-inspectable or not), we would need to download it completely in order to verify a matching archive member. Moreover, an archive might also reference another archive as a source, leading to a multiplication of transfer demands. """ def __init__(self, annex): super().__init__(annex) # central archive handler cache, initialized on-prepare self._ahandlers = None # a potential instance of the legacy datalad-archives implementation self._legacy_special_remote = None def __getattribute__(self, name: str): """Redirect top-level API calls to legacy implementation, if needed""" lsr = SpecialRemote.__getattribute__(self, '_legacy_special_remote') if lsr is None or name not in ( 'initremote', 'prepare', 'claimurl', 'checkurl', 'checkpresent', 'remove', 'whereis', 'transfer_retrieve', 'stop', ): # we are not in legacy mode or this is no top-level API call return SpecialRemote.__getattribute__(self, name) return getattr(lsr, name) def initremote(self): """This method does nothing, because the special remote requires no particular setup. """ pass def prepare(self): """Prepare the special remote for requests by git-annex If the special remote is instructed to run in "legacy mode", all subsequent operations will be processed by the ``datalad-archives`` special remote implementation! """ # we have to do this here, because the base class `.repo` will only give # us a `LeanAnnexRepo`. # TODO it is unclear to MIH what is actually needed API-wise of the legacy # interface. Needs research. self._repo = LegacyAnnexRepo(self.annex.getgitdir()) # are we in legacy mode? # let remote-specific setting take priority (there could be # multiple archivist-type remotes configured), and use unspecific switch # as a default, with a general default of NO if self.get_remote_gitcfg( 'archivist', 'legacy-mode', default='no').lower() == 'yes': # ATTENTION DEBUGGERS! # If we get here, we will bypass all of the archivist # implementation! Check __getattribute__() -- pretty much no # other code in this file will run!!! 
# __getattribute__ will relay all top-level operations # to an instance of the legacy implementation from datalad.customremotes.archives import ArchiveAnnexCustomRemote lsr = ArchiveAnnexCustomRemote(self.annex) lsr.prepare() # we can skip everything else, it won't be triggered anymore self._legacy_special_remote = lsr return # central archive key handler coordination self._ahandlers = _ArchiveHandlers( self.repo, # TODO #cache_mode=self._getcfg( # 'archive-cache-mode', # default='').lower(), ) def claimurl(self, url: str) -> bool: """Returns True for :class:`~datalad_next.types.archivist.ArchivistLocator`-style URLs Only a lexical check is performed. Any other URL will result in ``False`` to be returned. """ try: ArchivistLocator.from_str(url) return True except Exception: return False def checkurl(self, url: str) -> bool: """Parses :class:`~datalad_next.types.archivist.ArchivistLocator`-style URLs Returns ``True`` for any syntactically correct URL with all required properties. The implementation is identical to ``claimurl()``. """ try: ArchivistLocator.from_str(url) except Exception as e: self.message(f'Invalid URL {url!r}: {e}', type='debug') return False # we should be able to work with this. # do not actually test whether the archive is around or whether # the path actually points to a member in the archive, # leave to transfer_retrieve # Do not give detailed info to git-annex for now # https://github.com/Lykos153/AnnexRemote/issues/60 #if member_props.get('size'): # return dict( # filename=member_props['path'].name, # size=member_props['size'], # ) #else: # return dict(filename=member_props['path'].name) return True def checkpresent(self, key: str) -> bool: """Verifies continued availability of the archive referenced by the key No content verification of the archive, or of the particular archive member is performed. See "Implementation details" of this class for a rational. Returns ------- bool True if the referenced archive key is present on any remote. False if not. """ # the idea here is that: as long as the archive declared to contain # the key is still accessible, we declare CHECKPRESENT. # In other words: we trust the archive member annotation to be # correct/valid. # not trusting it would have sever consequences. depending on # where the archive is located (local/remote) and what format it # is (fsspec-inspectable), we might need to download it completely # in order to verify a matching archive member. Moreover, an archive # might also reference another archive as a source, leading to a # multiplication of transfer demands # get all associated archive keys, turn into set because any key might # map to multiple archive keys, and we only need to check them once akeys = set( str(ArchivistLocator.from_str(url).akey) for url in self._get_key_dlarchive_urls(key) ) # As with transfer_retrieve blindly checking akeys in arbitrary # order is stupid. We should again sort by (local) availability. # if we have an archive locally we can check faster, we could check # more precisely (actually look into it). # We only need to find one archive with a hit, if we search clever # we can exit earlier. 
# So let's do a two-pass approach, first check local availability # for any archive key, and only if that does not find us an archive # go for the remotes if any(_get_key_contentpath(self.repo, akey) for akey in akeys): # any one is good enough # TODO here we could actually look into the archive and # verify member presence without relatively little cost return True for akey in akeys: # we leave all checking logic to git-annex try: # if it exits clean, the key is still present at at least one # remote self.repo.call_annex(['checkpresentkey', akey]) return True except CommandError: self.message( f'Archive key candidate {akey} for key {key} ' 'not present in any known remote or here', type='debug') # when we end up here, we have tried all known archives keys and # found none to be present in any known location return False def transfer_retrieve(self, key: str, localfilename: str): """Retrieve an archive member from a (remote) archive All registered locators for a requested key will be sorted by availability and size of the references archives. For each archive the most suitable handler will be initialized, and extraction of the identified member will be attempted. If that fails, the next handler is tried until all candidate handlers are exhausted. Depending on the archive availability and type, archives may need to be retrieved from remote sources. """ # rely on from_locators() to bring the candidate archives # in some intelligent order to try one after the other. # break ASAP to prevent unnecessary processing msgs = [] try: for handler, locs in self._ahandlers.from_locators([ ArchivistLocator.from_str(url) for url in self._get_key_dlarchive_urls(key)]): with Path(localfilename).open('wb') as dst_fp: for loc in locs: try: with handler.open(loc.member) as fp: # TODO progress reporting # but what progress? the extraction # may just be one part, there could also # be file retrieval copyfileobj(fp, dst_fp) return except Exception as e: msg = f'Failed to extract {key!r} from ' \ f'{handler} ({loc.member}): {e}' self.message(msg, type='debug') msgs.append(msg) except Exception as e: raise RemoteError(f'Could not obtain {key!r}') from e raise RemoteError(f'Could not obtain {key!r} from any archive') def transfer_store(self, key: str, filename: str): """Raises ``UnsupportedRequest``. This operation is not supported.""" raise UnsupportedRequest('This remote cannot store content') def remove(self, key: str): """Raises ``UnsupportedRequest``. This operation is not supported.""" raise UnsupportedRequest('This remote cannot remove content') # # Helpers # def _get_key_dlarchive_urls(self, key): return self.annex.geturls(key, prefix='dl+archive:') def main(): """CLI entry point installed as ``git-annex-remote-archivist``""" super_main( cls=ArchivistRemote, remote_name='archivist', description=\ "access to annex keys stored within other archive-type annex keys ", ) # # Internal helpers # @dataclass class _ArchiveInfo: """Representation of an archive used internally by ``_ArchiveHandlers``""" local_path: Path | None handler: ArchiveOperations | None = None type: ArchiveType | None = None class _ArchiveHandlers: """Wraps annex repo to provide access to keys given by ArchivistLocator(s) The main functionality is provided by ``from_locators()``. 
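    A minimal usage sketch (for illustration only; ``repo`` is assumed to be
    an already initialized ``LegacyAnnexRepo``, and ``locators`` a list of
    previously parsed ``ArchivistLocator`` instances that all refer to the
    same annex key)::

        handlers = _ArchiveHandlers(repo)
        for handler, locs in handlers.from_locators(locators):
            for loc in locs:
                with handler.open(loc.member) as fp:
                    ...  # read the requested archive member content from fp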
""" # TODO make archive access caching behavior configurable from the outside def __init__(self, repo): # mapping of archive keys to an info dict self._db: Dict[AnnexKey, _ArchiveInfo] = {} # for running git-annex queries against the repo self._repo = repo def from_locators( self, locs: List[ArchivistLocator] ) -> Generator[Tuple[ArchiveOperations, Iterable[ArchivistLocator]], None, None]: """Produce archive handlers for the given locators Yield them one-by-one in a maximally intelligent order for efficient retrieval (i.e., handlers for archives that are already available locally first. Each handlers is yielded fully prepared, i.e. if necessary an archive is retrieved before the handler is yielded. Therefore a consumer should not fully consume the returned generator when an operation can be completed before all handlers are exhausted. Parameters ---------- locs: List[ArchivistLocator] Any number of locators that must all refer to the same annex key (key, not archive annex key!). Yields ------ ArchiveOperations, Iterable[ArchivistLocator] The referenced archive annex keys are de-duplicated and sorted by (local) availability and size. For each archive key a suitable ``ArchiveOperations`` handler is yielded together with the locators matching the respective archive. """ # determine all candidate source archive keys akeys = set(loc.akey for loc in locs) # determine which of the known handlers point to a local archive, # yield those for akey, kh in { akey: self._db[akey] for akey in akeys if akey in self._db and self._db[akey].handler }.items(): # local_path will be None now, if not around if kh.local_path: # we found one with a local archive. # yield handler and all matching locators yield kh.handler, [loc for loc in locs if loc.akey == akey] # if we get here, this did not work, do not try again akeys.remove(akey) # of the handlers we do not yet know, which ones have local data, # yield those for akey in [k for k in akeys if k not in self._db]: ainfo = self._get_archive_info(akey, locs) # cache for later self._db[akey] = ainfo if not ainfo.local_path: # do not try a local handler, but keep the akey itself in the # race, we might need to try "remote" access later on continue handler = self._get_local_handler(ainfo) # store for later ainfo.handler = handler # yield handler and all matching locators yield handler, [loc for loc in locs if loc.akey == akey] # if we get here, this did not work, do not try again akeys.remove(akey) # of the handlers we do know, but do not have local data, # possibly obtain the archive, yield those # # this is the same as the first loop, but this time all local # paths are checked, and some akeys might already have been # removed for akey, kh in { akey: self._db[akey] for akey in akeys if akey in self._db and self._db[akey].handler }.items(): yield handler, [loc for loc in locs if loc.akey == akey] # if we get here, this did not work, do not try again akeys.remove(akey) # all that is left is to create "remote" handlers and yield them. 
# collect any exceptions to report them at the end, if needed exc = [] # but this time sort the keys to start with the smallest ones # (just in case a download is involved) for akey in sorted(akeys, key=lambda x: x.size): # at this point we must have an existing _ArchiveInfo record # for this akey ainfo = self._db[akey] # but we do not have a handler yet assert ainfo.handler is None try: handler = self._get_remote_handler(akey, ainfo) except Exception as e: exc.append(e) continue # if this worked, store the handler for later ainfo.handler = handler yield handler, [loc for loc in locs if loc.akey == akey] # if we get here we can stop -- everything was tried. If there were # exceptions, make sure to report them if exc: # TODO better error e = RuntimeError( 'Exhausted all candidate archive handlers ' f'(previous failures {exc})') e.errors = exc raise e def _get_archive_info( self, akey: AnnexKey, locs: Iterable[ArchivistLocator], ) -> _ArchiveInfo: # figure out if the archive is local first local_path = _get_key_contentpath(self._repo, str(akey)) # get all reported archive types akey_atypes = set( loc.atype for loc in locs if loc.akey == akey and loc.atype ) # if we have (consistent) information, pick the type, if not # set to None/ignore and wait for type detection by handler akey_atype = None if len(akey_atypes) != 1 else akey_atypes.pop() ainfo = _ArchiveInfo( local_path=local_path, type=akey_atype, ) # cache for later self._db[akey] = ainfo return ainfo def _get_local_handler(self, ainfo: _ArchiveInfo) -> ArchiveOperations: if not ainfo.type: # TODO we could still do mime-type detection. We have the # archive file present locally. # check datalad-core how it is done in archive support raise NotImplementedError if ainfo.type == ArchiveType.tar: from datalad_next.archive_operations import TarArchiveOperations return TarArchiveOperations( ainfo.local_path, cfg=self._repo.config, ) else: raise NotImplementedError def _get_remote_handler( self, akey: AnnexKey, ainfo: _ArchiveInfo, ) -> ArchiveOperations: # right now we have no remote handlers available # TODO: use akey to ask the repo for URLs from which the key # would be available and select a remote handler to work # with that URL # instead we retrieve the archive res = self._repo.get(str(akey), key=True) # if the akey was already around, `res` could be an empty list. # however, under these circumstances we should not have ended # up here. assert to alert on logic error in that case assert isinstance(res, dict) if res.pop('success', None) is not True: # TODO better error raise RuntimeError(f'Failed to download archive key: {res!r}') # now we have the akey locally ainfo.local_path = _get_key_contentpath(self._repo, str(akey)) return self._get_local_handler(ainfo) def _get_key_contentpath(repo: LegacyAnnexRepo, key: str): """Return ``Path`` to a locally present annex key, or ``None`` ``None`` is return when there is not such key present locally. """ try: # if it exits clean, there will be a content location # and the content can be found at the location loc = next(repo.call_annex_items_(['contentlocation', key])) # convert to path. git-annex will report a path relative to the # dotgit-dir # TODO platform-native? 
loc = repo.dot_git / Path(loc) except CommandError: loc = None return loc datalad-next-1.4.1/datalad_next/annexremotes/tests/000077500000000000000000000000001462321624600223735ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/annexremotes/tests/__init__.py000066400000000000000000000000001462321624600244720ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/annexremotes/tests/test_archivist.py000066400000000000000000000225501462321624600260040ustar00rootroot00000000000000from pathlib import ( Path, PurePosixPath, ) import pytest from .. import UnsupportedRequest from ..archivist import ArchivistRemote from datalad_next.datasets import Dataset from datalad_next.runners import CommandError from datalad_next.tests import assert_result_count @pytest.fixture(autouse=False, scope="function") def archivist_dataset(tmp_path_factory, no_result_rendering): wpath = tmp_path_factory.mktemp("archivistds") # now create a second dataset that can pull all its content from # archives ads = Dataset(wpath / 'archiveds').create() # configure the archivist special remote for all dl_archive URL # handling ads.repo.call_annex([ 'initremote', 'archivist', 'type=external', 'externaltype=archivist', 'encryption=none', 'autoenable=true', ]) return ads @pytest.fixture(autouse=False, scope="function") def populated_archivist_dataset( archivist_dataset, tmp_path_factory, no_result_rendering): """Returns a path to generated dataset This dataset references an annex archive with no other annexed files. The datalad special remote 'archivist' is enabled in the dataset and also set to autoenable. Returns ------- Dataset, str, str, tuple(tuples) 1. generated dataset instance 2. the annex key for the included archive 3. the leading directory of all files in the archive 4. iterable with POSIX-path:content pairs for archive members. The path is relative to the leading archive directory, and can also be interpreted relative to the dataset root. 
""" wpath = tmp_path_factory.mktemp("archivistds") ads = archivist_dataset dscontent = ( ('azip/file1.txt', 'zipfile1'), ('azip/file2.csv', 'zipfile2_muchcontent'), ('atar/file1.txt', 'tarfile1'), ('atar/file2.csv', 'tarfile2_muchcontent'), ) srcds = Dataset(wpath / 'srcds').create() for fpath, fcontent in dscontent: fpath = srcds.pathobj / (PurePosixPath(fpath)) fpath.parent.mkdir(parents=True, exist_ok=True) fpath.write_text(fcontent) srcds.save() archive_root = wpath / 'myarchive' #archivetype = 'zip' akeys = {} # no ZIP just yet # for archivetype, ext in (('zip', ''), ('tar', '.gz')): for archivetype, ext in (('tar', '.gz'), ): archive_path = Path(f"{archive_root}.{archivetype}{ext}") archive_path_inds = ads.pathobj / '.archives' / archive_path.name # create an archive, the easy way, by simply exporting the # entire dataset worktree srcds.export_archive(archive_root, archivetype=archivetype) assert archive_path.exists() # add the archive (in a hidden dir) to be able to reference # it via a key aurl = archive_path.as_uri() ads.repo.call_annex([ 'addurl', '--file', str(archive_path_inds), aurl]) ads.save() # get the key of the archive akeys[archivetype] = ads.status( archive_path_inds, annex='basic', return_type='item-or-list', )['key'] return ads, akeys, archive_root, dscontent def _check_archivist_addurl(atypes, ads, akeys, archive_root, dscontent): # run addurl on dl+archive URLs: this exercises CLAIMURL, CHECKURL, # TRANSFER_RETRIEVE for archivetype in atypes: for fpath, fcontent in dscontent: # take individual files from archives of different types if not fpath.startswith(f'a{archivetype}'): continue ads.repo.call_annex([ 'addurl', '--file', str(PurePosixPath(fpath)), f'dl+archive:{akeys[archivetype]}' f'#path={archive_root.name}/{fpath}&size={len(fcontent)}', ]) # check that we reached the desired state whereis = ads.repo.whereis( # this dance is needed, because the normalize_paths decorator # of whereis() required platform paths str(Path('atar', 'file1.txt')), output='full', ) # the file is known to exactly one remote (besides "here") assert len(whereis) == 2 # and one remote is the archivist remote, and importantly not the # 'web' remote -- which would indicate a failure of claimurl/checkurl assert any(wi['description'] == '[archivist]' for wi in whereis.values()) def _check_archivist_retrieval(archivist_dataset): ads, akeys, archive_root, dscontent = archivist_dataset # step 1: addurl _check_archivist_addurl( # check all archive types supported # no ZIP just yet #('zip', 'tar'), ('tar',), ads, akeys, archive_root, dscontent, ) # make a clean dataset ads.save() # step 2: drop keys with dl+archive: URLs # now drop all archive member content. this should work, # because for each file there is a URL on record # -- hence always another copy # this requires archivist's CHECKPRESENT to function properly # no ZIP just yet #res = ads.drop(['azip', 'atar']) res = ads.drop(['atar']) assert_result_count( res, # all files, plus the two directories we gave as arguments # no ZIP just yet #len(dscontent) + 2, 3, action='drop', status='ok', ) # step 3: retrieve keys with dl+archive: URLs from locally present archives # no ZIP just yet #res = ads.get(['azip', 'atar']) res = ads.get(['atar']) assert_result_count( res, # no ZIP just yet # len(dscontent), 2, action='get', status='ok', type='file', ) for fpath, fcontent in dscontent: # no ZIP just yet if 'zip' in fpath: continue assert (ads.pathobj / fpath).read_text() == fcontent # step 4: now drop ALL keys (incl. 
archives) # this will present a challenge for CHECKPRESENT: # without the archives no longer being around, it would requires remote # access or download to actually verify continued presence. # force this condition by dropping the archive keys first res = ads.drop('.archives') assert_result_count( res, # a tar and a zip # no ZIP just yet #2, 1, action='drop', type='file', status='ok', ) # and 4a now drop the keys that have their content from archives # no ZIP just yet #res = ads.drop(['azip', 'atar']) res = ads.drop(['atar']) assert_result_count( res, # no ZIP just yet #len(dscontent), 2, action='drop', status='ok', type='file', ) # and now get again, this time with no archives around locally # no ZIP just yet #res = ads.get(['azip', 'atar']) res = ads.get(['atar']) assert_result_count( res, # no ZIP just yet # len(dscontent), 2, action='get', status='ok', type='file', ) for fpath, fcontent in dscontent: # no ZIP just yet if 'zip' in fpath: continue assert (ads.pathobj / fpath).read_text() == fcontent # and drop everything again to leave the dataset empty res = ads.drop(['.']) def test_archivist_retrieval(populated_archivist_dataset, no_result_rendering): _check_archivist_retrieval(populated_archivist_dataset) # the following is either not possible or not identical between archivist # and the old datalad-archives special remotes ads, akeys, archive_root, dscontent = populated_archivist_dataset # step 5: remove the only remaining source of the archives, and check # how it get/fsck fails for archive in archive_root.parent.glob('*.*z*'): archive.unlink() with pytest.raises(CommandError) as e: # no ZIP just yet #ads.repo.call_annex(['get', 'atar', 'azip']) ads.repo.call_annex(['get', 'atar']) # make sure the "reason" is communicated outwards assert 'does not exist' in e.value.stderr with pytest.raises(CommandError): # no ZIP just yet #ads.repo.call_annex(['fsck', '-f', 'archivist', 'atar', 'azip']) ads.repo.call_annex(['fsck', '-f', 'archivist', 'atar']) def test_archivist_retrieval_legacy( populated_archivist_dataset, monkeypatch, no_result_rendering): """Same as test_archivist_retrieval(), but performs everything via the datalad-core provided datalad-archives special remote code """ with monkeypatch.context() as m: m.setenv("DATALAD_ARCHIVIST_LEGACY__MODE", "yes") _check_archivist_retrieval(populated_archivist_dataset) def test_claimcheck_url(): class DummyAnnex: def debug(*args, **kwargs): pass def info(*args, **kwargs): pass def error(*args, **kwargs): pass ar = ArchivistRemote(DummyAnnex()) valid_url = \ 'dl+archive:MD5E-s1--e9f624eb778e6f945771c543b6e9c7b2.zip#path=f.txt' invalid_url = \ 'dl+BROKENarchive:MD5E-s1--e9f624eb778e6f945771c543b6e9c7b2.zip#path=f.txt' assert ar.claimurl(valid_url) is True assert ar.claimurl(invalid_url) is False assert ar.checkurl(valid_url) is True assert ar.checkurl(invalid_url) is False def test_archivist_unsupported(): ar = ArchivistRemote(None) with pytest.raises(UnsupportedRequest): ar.transfer_store('mykey', 'myfile') with pytest.raises(UnsupportedRequest): ar.remove('mykey') datalad-next-1.4.1/datalad_next/annexremotes/tests/test_uncurl.py000066400000000000000000000410121462321624600253120ustar00rootroot00000000000000from pathlib import Path import pytest import re from datalad_next.consts import on_windows from datalad_next.tests import ( create_tree, skip_if_on_windows, skip_if_root, ) from datalad_next.constraints import EnsureDataset from datalad_next.exceptions import ( CommandError, UrlOperationsRemoteError, IncompleteResultsError, ) from 
datalad_next.url_operations import AnyUrlOperations from ..uncurl import ( RemoteError, UncurlRemote, ) # for some tests below it is important that this base config contains no # url= or match= declaration (or any other tailoring to a specific use case) std_initargs = [ 'type=external', 'externaltype=uncurl', 'encryption=none', ] class NoOpAnnex: def __init__(self, gitdir: Path): self._gitdir = gitdir def error(*args, **kwargs): pass def info(*args, **kwargs): pass def debug(*args, **kwargs): pass def getgitdir(self): return str(self._gitdir) def getgitremotename(self): return 'origin' def getconfig(self, key): return None def getuuid(self): return '2d0660ae-e5bf-11ee-b6f2-37ba632b6dd3' def test_uncurl_remove_no_tmpl(existing_dataset): # without a template configured we refuse to remove anything # for the simple reason that it may not be clear what is being # removed at all. We could only iterate over all recorded URLs # and wipe out the key from "the internet". This is, however, # a rather unexpected thing to do from a user perspective -- # who would expect a single key instance "at the uncurl remote" # to be removed. The best proxy we have for this expectation # is a URL tmpl being configured, pointing to such a single # location r = UncurlRemote(NoOpAnnex(existing_dataset.pathobj)) r.prepare() with pytest.raises(RemoteError): r.remove(None) def test_uncurl_transfer_store_no_tmpl(existing_dataset): r = UncurlRemote(NoOpAnnex(existing_dataset.pathobj)) r.url_handler = AnyUrlOperations() # whenever there is not template configured with pytest.raises(RemoteError): r.transfer_store(None, '') def test_uncurl_checkretrieve(existing_dataset): def handler(url): raise UrlOperationsRemoteError(url) def get_urls(key): return 'some' r = UncurlRemote(NoOpAnnex(existing_dataset.pathobj)) r.prepare() r.get_key_urls = get_urls # we raise the correct RemoteError and not the underlying # UrlOperationsRemoteError with pytest.raises(RemoteError): r._check_retrieve('somekey', handler, ('blow', 'here')) def test_uncurl_claimurl(existing_dataset): r = UncurlRemote(NoOpAnnex(existing_dataset.pathobj)) r.prepare() # if we have a match expression defined, this determines claim or noclaim r.match = [re.compile('bongo.*')] assert r.claimurl('bongo://joe') assert not r.claimurl('http://example.com') r.match = None # without a match expression, the url handler decides r.url_handler = AnyUrlOperations() for url in ('http://example.com', 'https://example.com', 'ssh://example.com', 'file:///home/me'): assert r.claimurl(url) assert not r.claimurl('bongo://joe') def test_uncurl_checkurl(httpbin, tmp_path, existing_dataset): # this is the URL against which the httpbin calls will be made hbsurl = httpbin['standard'] exists_path = tmp_path / 'testfile' exists_path.write_text('123') exists_url = exists_path.as_uri() no_exists_url = (tmp_path / 'notestfile').as_uri() # checkurl is basically an 'exists?' test against a URL. # the catch is that this test is not performed against the # incoming URL, but against the mangled URL that is the result # of instantiating the URL template based on all properties # extractable from the URL alone (via any configured match # expressions) r = UncurlRemote(NoOpAnnex(existing_dataset.pathobj)) r.prepare() # no match and no template defined assert not r.checkurl(no_exists_url) assert r.checkurl(exists_url) # # now with rewriting # # MIH cannot think of a usecase where declaring a URL template # would serve any purpose without also declaring a match expression # here. 
    # outside the context of checkurl() this is different: It would
    # make sense to declare a template that only uses standard key properties
    # in order to define/declare upload targets for existing keys.
    # consequently, checkurl() is ignoring a template-based rewriting
    # when no match is defined, or when the matchers cannot extract all
    # necessary identifiers from the incoming (single) URL in order to
    # instantiate the URL template. In such cases, the original URL is
    # used for checking
    r.url_tmpl = '{absurd}'
    assert not r.checkurl(no_exists_url)
    assert r.checkurl(exists_url)

    # now add a matcher to make use of URL rewriting even for 'addurl'-type
    # use case, such as: we want to pass a "fixed" URL verbatim to some kind
    # of external redirector service
    r.url_tmpl = f'{hbsurl}/redirect-to?url={{origurl}}'
    r.match = [
        re.compile('.*(?P<origurl>https?://.*)$'),
    ]
    assert not r.checkurl(f'garbled{hbsurl}/status/404')
    assert r.checkurl(f'garbled{hbsurl}/bytes/24')


# sibling of `test_uncurl_checkurl()`, but more high-level
def test_uncurl_addurl_unredirected(
        existing_dataset, httpbin, no_result_rendering):
    # this is the URL against which the httpbin calls will be made
    hbsurl = httpbin['standard']
    ds = existing_dataset
    dsca = ds.repo.call_annex
    # same setup as in `test_uncurl_checkurl()`
    dsca(['initremote', 'myuncurl'] + std_initargs + [
        'match=.*(?P<origurl>https?://.*)$',
        f'url={hbsurl}/redirect-to?url={{origurl}}',
    ])
    # feed it a broken URL, which must be getting fixed by the rewriting
    # (pulls 24 bytes)
    testurl = f'garbled{hbsurl}/bytes/24'
    dsca(['addurl', '--file=dummy', testurl])
    # we got what we expected
    assert ds.status(
        'dummy', annex='basic', return_type='item-or-list',
    )['bytesize'] == 24
    # make sure git-annex recorded an unmodified URL
    assert any(testurl in r.get('urls', [])
               for r in ds.repo.whereis('dummy', output='full').values())


def test_uncurl(existing_dataset, tmp_path):
    archive_path = tmp_path
    create_tree(archive_path, {'lvlA1': {'lvlB2_flavor1.tar': 'data_A1B2F1'}})
    ds = existing_dataset
    archive_path = Path(archive_path)
    dsca = ds.repo.call_annex
    dsca(['initremote', 'myuncurl'] + std_initargs + [
        'match=bingofile://(?P<basepath>.*)/(?P<lvlA>[^/]+)/(?P<lvlB>[^/]+)_(?P<flavor>.*)$ someothermatch',
        'url=file://{basepath}/{lvlA}/{lvlB}_{flavor}',
    ])
    data_url = (archive_path / 'lvlA1' / 'lvlB2_flavor1.tar').as_uri()
    # prefix the URL so git-annex has no idea how to handle it
    # (same as migrating from an obsolete system with no support anymore)
    data_url = f'bingo{data_url}'
    dsca(['addurl', '--file', 'data_A1B2F1.dat', data_url])
    assert (ds.pathobj / 'data_A1B2F1.dat').read_text() == 'data_A1B2F1'
    # file is known to be here and at a (uncurl) remote
    assert len(ds.repo.whereis('data_A1B2F1.dat')) == 2
    # must survive an fsck (CHECKPRESENT)
    dsca(['fsck', '-q', '-f', 'myuncurl'])


# RIA tooling is not working for this test on windows
# https://github.com/datalad/datalad/issues/7212
@skip_if_on_windows
def test_uncurl_ria_access(tmp_path, no_result_rendering):
    """
    - create dataset with test file and push into RIA store
    - create INDEPENDENT dataset and 'addurl' test file directly from RIA
    - test that addurls work without any config, just initremote with
      no custom settings
    - now move RIA and hence break URL
    - fix1: only using a URL template, point to dataset dir in RIA store
      plus some always available key-properties
    - alternative fix2: simpler template, plus match expression to
      "understand" some structural aspects of RIA and reuse them
    """
    # we create a dataset to bootstrap the test setup, with one file
    # of known content
    srcds = EnsureDataset()(tmp_path / 'srcds').ds.create()
    testfile_content = 'mikewashere!'
    (srcds.pathobj / 'testfile.txt').write_text(testfile_content)
    srcds.save()
    # pull out some essential properties for the underlying key for later
    # use in this test
    testkey_props = srcds.status(
        'testfile.txt', annex='basic', return_type='item-or-list')
    testkey_props = {
        k: v for k, v in testkey_props.items()
        if k in ('key', 'hashdirmixed', 'hashdirlower')
    }

    # establish a RIA sibling and push
    baseurl = (tmp_path / "ria").as_uri()
    srcds.create_sibling_ria(
        # use a ria+file:// URL for simplicity
        f'ria+{baseurl}',
        name='ria',
        new_store_ok=True,
    )
    srcds.push(to='ria')
    # setup is done

    # start of the actual test

    # create a fresh dataset
    ds = EnsureDataset()(tmp_path / 'testds').ds.create()
    dsca = ds.repo.call_annex
    # we add uncurl WITH NO config whatsoever.
    # this must be enough to be able to use the built-in downloaders
    target_fname = 'mydownload.txt'
    dsca(['initremote', 'myuncurl'] + std_initargs)
    dsca(['addurl', '--file', target_fname,
          # we download from the verbatim, hand-crafted URL
          f'{baseurl}/{srcds.id[:3]}/{srcds.id[3:]}/annex/objects/'
          f'{testkey_props["hashdirmixed"]}'
          f'{testkey_props["key"]}/{testkey_props["key"]}'
    ])
    assert (ds.pathobj / target_fname).read_text() == testfile_content
    # make sure the re-downloaded key ends up having the same keyname in
    # the new dataset
    assert ds.status(
        target_fname, annex='basic', return_type='item-or-list',
    )['key'] == testkey_props['key']

    # now we drop the key...
    ds.drop(target_fname)
    assert not ds.status(
        target_fname, annex='availability', return_type='item-or-list',
    )['has_content']

    # ...and we move the RIA store to break the recorded
    # URL (simulating an infrastructure change)
    (tmp_path / 'ria').rename(tmp_path / 'ria_moved')
    # verify that no residual magic makes data access possible
    with pytest.raises(IncompleteResultsError):
        ds.get(target_fname)

    # fix it via an access URL config,
    # point directly via a hard-coded dataset ID
    # NOTE: last line is no f-string!
    url_tmpl = (
        tmp_path / "ria_moved" / srcds.id[:3] / srcds.id[3:]
    ).as_uri() + '/annex/objects/{annex_dirhash}/{annex_key}/{annex_key}'
    ds.configuration(
        'set', f'remote.myuncurl.uncurl-url={url_tmpl}')
    # confirm checkpresent acknowledges this
    dsca(['fsck', '-q', '-f', 'myuncurl'])
    # confirm transfer_retrieve acknowledges this
    ds.get(target_fname)

    # but we can also do without hard-coding anything, so let's drop again
    ds.drop(target_fname)
    assert not ds.status(
        target_fname, annex='availability', return_type='item-or-list',
    )['has_content']
    # for that we need to add a match expression that can "understand"
    # the original URL. All we need is to distinguish the old base path
    # from the structured components in the RIA store (for the
    # latter we can simply account for 4 levels of subdirs)
    ds.configuration(
        'set',
        'remote.myuncurl.uncurl-match='
        'file://(?P<basepath>.*)/(?P<dsdir>[^/]+/[^/]+)/annex/objects/.*$',
        scope='local')
    # NOTE: last line is no f-string!
url_tmpl = (tmp_path / "ria_moved").as_uri() \ + '/{dsdir}/annex/objects/{annex_dirhash}/{annex_key}/{annex_key}' ds.configuration( 'set', f'remote.myuncurl.uncurl-url={url_tmpl}', scope='local') # confirm checkpresent acknowledges this dsca(['fsck', '-q', '-f', 'myuncurl']) # confirm transfer_retrieve acknowledges this ds.get(target_fname) assert (ds.pathobj / target_fname).read_text() == testfile_content @skip_if_root # see https://github.com/datalad/datalad-next/issues/525 def test_uncurl_store(tmp_path, existing_dataset, no_result_rendering): ds = existing_dataset testfile = ds.pathobj / 'testfile1.txt' testfile_content = 'uppytyup!' testfile.write_text(testfile_content) ds.save() dsca = ds.repo.call_annex # init the remote with a template that places keys in the same structure # as annex/objects within a bare remote repo dsca(['initremote', 'myuncurl'] + std_initargs + [ # intentional double-braces at the end to get templates into the template f'url={(tmp_path).as_uri()}/{{annex_dirhash_lower}}{{annex_key}}/{{annex_key}}', ]) # store file at remote dsca(['copy', '-t', 'myuncurl', str(testfile)]) # let remote verify presence dsca(['fsck', '-q', '-f', 'myuncurl']) # doublecheck testfile_props = ds.status(testfile, annex='basic', return_type='item-or-list') assert (tmp_path / testfile_props['hashdirlower'] / testfile_props['key'] / testfile_props['key'] ).read_text() == testfile_content # we have no URLs recorded assert all(not v['urls'] for v in ds.repo.whereis(str(testfile), output='full').values()) # yet we can retrieve via uncurl, because local key properties are enough # to fill the template ds.drop(testfile) assert not ds.status( testfile, annex='availability', return_type='item-or-list', )['has_content'] dsca(['copy', '-f', 'myuncurl', str(testfile)]) assert testfile.read_text() == testfile_content if on_windows: # remaining bits assume POSIX FS return ds.config.set( 'remote.myuncurl.uncurl-url', # same as above, but root with no write-permissions 'file:///youshallnotpass/{annex_dirhash_lower}{annex_key}/{annex_key}', scope='local', ) with pytest.raises(CommandError): dsca(['fsck', '-q', '-f', 'myuncurl']) with pytest.raises(CommandError): dsca(['copy', '-t', 'myuncurl', str(testfile)]) def test_uncurl_store_via_ssh( sshserver, existing_dataset, no_result_rendering): ds = existing_dataset testfile = ds.pathobj / 'testfile1.txt' testfile_content = 'uppytyup!' testfile.write_text(testfile_content) ds.save() dsca = ds.repo.call_annex # init the remote with a template that places keys in the same structure # as annex/objects within a bare remote repo dsca(['initremote', 'myuncurl'] + std_initargs + [ # intentional double-braces at the end to get templates into the template f'url={sshserver[0]}/{{annex_key}}', ]) # store file at remote dsca(['copy', '-t', 'myuncurl', str(testfile)]) # let remote verify presence dsca(['fsck', '-q', '-f', 'myuncurl']) def test_uncurl_remove(existing_dataset, tmp_path, no_result_rendering): testfile = tmp_path / 'testdeposit' / 'testfile1.txt' testfile_content = 'uppytyup!' 
    testfile.parent.mkdir()
    testfile.write_text(testfile_content)
    ds = existing_dataset
    dsca = ds.repo.call_annex
    # init without URL template
    dsca(['initremote', 'myuncurl'] + std_initargs)
    # add the testdeposit by URL
    target_fname = ds.pathobj / 'target1.txt'
    dsca(['addurl', '--file', str(target_fname), testfile.as_uri()])

    # it will not drop without a URL tmpl
    # see test_uncurl_remove_no_tmpl() for the rationale
    with pytest.raises(CommandError):
        dsca(['drop', '-f', 'myuncurl', str(target_fname)])
    assert testfile.read_text() == testfile_content

    # now make it possible
    # use the simplest possible match expression
    ds.configuration(
        'set', 'remote.myuncurl.uncurl-match=file://(?P<allofit>.*)$',
        scope='local')
    # and the presence of a tmpl enables deletion
    ds.configuration(
        'set', 'remote.myuncurl.uncurl-url=file://{allofit}',
        scope='local')
    dsca(['drop', '-f', 'myuncurl', str(target_fname)])
    assert not testfile.exists()


# >30s
def test_uncurl_testremote(tmp_path, existing_dataset):
    "Point git-annex's testremote at uncurl"
    ds = existing_dataset
    dsca = ds.repo.call_annex
    dsca(['initremote', 'myuncurl'] + std_initargs
         # file:///key
         + [f'url=file://{tmp_path}/{{annex_key}}'])
    # Temporarily disable this until
    # https://github.com/datalad/datalad-dataverse/issues/127
    # is sorted out. Possibly via
    # https://git-annex.branchable.com/bugs/testremote_is_not_honoring_--backend
    if not on_windows:
        # not running with --fast to also cover key chunking
        dsca(['testremote', '--quiet', 'myuncurl'])
datalad-next-1.4.1/datalad_next/annexremotes/uncurl.py000066400000000000000000000546571462321624600231300ustar00rootroot00000000000000"""
*uncurl* git-annex external special remote
==========================================

This implementation is a git-annex accessible interface to datalad-next's
URL operations framework. It serves two main purposes:

1. Combine git-annex's capabilities of registering and accessing file content
   via URLs with DataLad's access credential management and (additional or
   alternative) transport protocol implementations.

2. Minimize the maintenance effort for datasets (primarily) composed from
   content that is remotely accessible via URLs from systems other than
   DataLad or git-annex in the event of an infrastructure transition
   (e.g. moving to a different technical system or a different data
   organization on a storage system).

Requirements
------------

This special remote implementation requires git-annex version 8.20210127
(or later) to be available.

Download helper
---------------

The simplest way to use this remote is to initialize it without any
particular configuration::

    $ git annex initremote uncurl type=external externaltype=uncurl encryption=none
    initremote uncurl ok
    (recording state in git...)

Once initialized, or later enabled in a clone, ``git-annex addurl`` will
check with the *uncurl* remote whether it can handle a particular URL, and
will let the remote perform the download in case of a positive response.

By default, the remote will claim any URLs with a scheme that the local
datalad-next installation supports. This always includes ``file://``,
``http://``, and ``https://``, but is extensible, and a particular
installation may also support ``ssh://`` (by default when openssh is
installed), or other schemes. This additional URL support is also available
for other commands.
Here is an example of how ``datalad addurls`` can be given any
uncurl-supported URLs (here an SSH-URL) directly, provided that the
``uncurl`` remote was initialized for a dataset (as shown above)::

    $ echo '[{"url":"ssh://my.server.org/home/me/file", "file":"dummy"}]' \\
      | datalad addurls - '{url}' '{file}'

This makes legacy commands (e.g., ``datalad download-url``) unnecessary, and
facilitates the use of more advanced ``datalad addurls`` features (e.g.,
automatic creation of subdatasets) that are not provided by lower-level
commands like ``git annex addurl``.

Download helper with credential management support
--------------------------------------------------

With this setup, download requests now also use DataLad's credential system
for authentication. DataLad will automatically look up matching credentials,
prompt for manual entry if none are found, and offer to store them securely
for later use after having used them successfully::

    $ git annex addurl http://httpbin.org/basic-auth/myuser/mypassword
    Credential needed for access to http://httpbin.org/basic-auth/myuser/mypassword
    user: myuser
    password:
    password (repeat):
    Enter a name to save the credential
    (for accessing http://httpbin.org/basic-auth/myuser/mypassword) securely
    for future reuse, or 'skip' to not save the credential
    name: httpbin-dummy
    addurl http://httpbin.org/basic-auth/myuser/mypassword (from uncurl) (to ...) ok
    (recording state in git...)

By adding files via downloads from URLs in this fashion, datasets can be
built that track information across a range of locations/services, using a
possibly heterogeneous set of access methods.

This feature is very similar to the ``datalad`` special remote implementation
included in the core DataLad package. The difference here is that alternative
implementations of downloaders are employed and the ``datalad-next``
credential system is used instead of the "providers" mechanism from DataLad's
core package.

Transforming recorded URLs
--------------------------

The main benefit of using *uncurl* is, however, only revealed when the
original snapshot of where data used to be accessible becomes invalid, maybe
because data were moved to a different storage system, or simply a different
host. This would typically require an update of each, now broken, access URL.
For datasets with thousands or even millions of files this can be an
expensive operation. For data portal operators providing a large number of
datasets it is even more tedious.

*uncurl* enables programmatic, on-access URL rewriting. This is similar, in
spirit, to Git's ``url.<base>.insteadOf`` URL modification feature. However,
modification possibilities reach substantially beyond replacing a base URL.

This feature is based on two customizable settings: 1) a *URL template*; and
2) a *set of match expressions* that extract additional identifiers from any
recorded access URL for an annex key.

Here is an example: Let's say a file in a dataset has a recorded access URL
of::

    https://data.example.org/c542/s7612_figure1.pdf

We can let *uncurl* know that ``c542`` is actually an identifier for a
particular collection of items in this data store. Likewise ``s7612`` is an
identifier of a particular item in that collection, and ``figure1.pdf`` is
the name of a component in that collection item.
The following Python regular expression can be used to "decompose" the above
URL into these semantic components::

    (?P<site>https://[^/]+)/(?P<collection>c[^/]+)/(?P<item>s[^/]+)_(?P<component>.*)$

This expression is not the most readable, but it basically chunks the URL
into segments of ``(?P<name>...)``, so-called named groups (see a
`live demo of this expression `__). This expression, and additional ones like
it, can be set as a configuration parameter of an *uncurl* remote setup.
Extending the configuration established by the ``initremote`` call above::

    $ git annex enableremote uncurl \\
      'match=(?P<site>https://[^/]+)/(?P<collection>c[^/]+)/(?P<item>s[^/]+)_(?P<component>.*)$'

The last argument is quoted to prevent it from being processed by the shell.

With the match expression configured, URL rewriting can be enabled by
declaring a URL template as another configuration item. The URL template uses
the `Python Format String Syntax `__. If the new URL for the file above is
now ``http://newsite.net/ex-archive/c542_s7612_figure1.pdf``, we can declare
the following URL template to have *uncurl* go to the new site::

    http://newsite.net/ex-archive/{collection}_{item}_{component}

This template references the identifiers of the named groups we defined in
the match expression. Again, the URL template can be set via
``git annex enableremote``::

    $ git annex enableremote uncurl \\
      'url=http://newsite.net/ex-archive/{collection}_{item}_{component}'

There is no need to separate the ``enableremote`` calls. Both configuration
items can be given at the same time. In fact, they can also be given to
``initremote`` immediately.

The four identifiers ``site``, ``collection``, ``item``, and ``component``
are actually a custom addition to a standard set of identifiers that are
available for composing URLs via a template.

- ``datalad_dsid`` - the DataLad dataset ID (UUID)
- ``annex_dirhash`` - "mixed" variant of the two-level hash for a particular
  key (uses POSIX directory separators, and includes a trailing separator)
- ``annex_dirhash_lower`` - "lower case" variant of the two-level hash for a
  particular key (uses POSIX directory separators, and includes a trailing
  separator)
- ``annex_key`` - git-annex key name for a request
- ``annex_remoteuuid`` - UUID of the special remote (location) used by
  git-annex
- ``git_remotename`` - Name of the Git remote for the uncurl special remote

.. note::

   The URL template must "resolve" to a complete and valid URL. This cannot
   be verified at configuration time, because even the URL scheme could be a
   dynamic setting.

Uploading content
-----------------

The *uncurl* special remote can upload file content or store annex keys via
supported URL schemes whenever a URL template is defined. At minimum, storing
at ``file://`` and ``ssh://`` URLs is supported. But other URL scheme
handlers with upload support may be available at a local DataLad
installation.

Deleting content
----------------

As for uploading, deleting content is only permitted with a configured URL
template. Moreover, it also depends on the delete operation being supported
for a particular URL scheme.

Configuration overrides
-----------------------

Both match expressions and the URL template can also be configured in a
dataset's configuration (committed branch configuration), or in any Git
configuration scope (local, global, system), using the following
configuration item names:

- ``remote.<name>.uncurl-url``
- ``remote.<name>.uncurl-match``

where ``<name>`` is the name of the special remote in the dataset.
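For example, the URL template from above could be overridden locally in a
clone of the dataset (a sketch only; ``uncurl`` is the remote name used in
the examples above, and the URL template is purely illustrative)::

    $ git config --local remote.uncurl.uncurl-url \\
      'http://newsite.net/ex-archive/{collection}_{item}_{component}'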
A URL template provided via configuration *overrides* one defined in the
special remote setup via ``init/enableremote``.

Match expressions defined as configuration items *extend* the set of match
expressions that may be included in the special remote setup via
``init/enableremote``. The ``remote.<name>.uncurl-match`` configuration item
can be set as often as necessary (with one match expression each).

Tips
----

When multiple match expressions are defined, it is recommended to use unique
names for each match-group to avoid collisions.
"""
from __future__ import annotations

from annexremote import Master
from functools import partial
from pathlib import Path
import re
from typing import (
    Callable,
    Pattern,
)

from datalad_next.exceptions import (
    CapturedException,
    UrlOperationsRemoteError,
    UrlOperationsResourceUnknown,
)
from datalad_next.url_operations import AnyUrlOperations
from datalad_next.utils import ensure_list

from . import (
    RemoteError,
    SpecialRemote,
    super_main
)


class UncurlRemote(SpecialRemote):
    """
    """
    def __init__(self, annex: Master):
        super().__init__(annex)
        self.configs.update(
            url='Python format language template composing an access URL',
            match='(whitespace-separated list of) regular expression(s) to match particular components in supported URL via named groups',
        )
        self.url_tmpl: str | None = None
        self.match: list[Pattern[str]] | None = None
        self.url_handler: AnyUrlOperations | None = None
        # cache of properties that do not vary within a session
        # or across annex keys
        self.persistent_tmpl_props: dict[str, str] = {}

    def initremote(self) -> None:
        # at present there is nothing that needs to be done on init/enable.
        # the remote is designed to work without any specific setup too
        pass

    def prepare(self) -> None:
        # we need the git remote name to be able to look up config about
        # that remote
        # check the config for a URL template setting
        self.url_tmpl = self.get_remote_gitcfg('uncurl', 'url', '')
        # only if we have no local, overriding, configuration ask git-annex
        # for the committed special remote config on the URL template
        if not self.url_tmpl:
            # ask for the commit config, could still be empty
            self.url_tmpl = self.annex.getconfig('url')
            # TODO test the case of a fully absent URL template
            # that would be fine and only verbatim recorded URLs could
            # be sent to a downloader

        # unconditionally ask git-annex for a match-url setting; any local
        # config amends it, and does not override it
        match_spec: str = self.annex.getconfig('match')
        if match_spec:
            # TODO implement sanity checks, but running it through re.compile()
            # might just be enough
            matches = [re.compile(m) for m in match_spec.split()]
        else:
            matches = []
        # extend with additional matchers from local config
        self.match = matches + [
            re.compile(m)
            for m in ensure_list(self.get_remote_gitcfg(
                'uncurl', 'match', default=[], get_all=True))
        ]

        self.message(
            f'URL rewriting template: {self.url_tmpl!r}', type='debug')
        self.message(
            f'Active URL match expressions: {[e.pattern for e in self.match]!r}',
            type='debug')

        # let the URL handler use the repo's config
        self.url_handler = AnyUrlOperations(cfg=self.repo.cfg)

        # cache template properties
        # using function arg name syntax, we need the identifiers to be valid
        # Python symbols to work in `format()`
        self.persistent_tmpl_props.update(
            datalad_dsid=self.repo.cfg.get('datalad.dataset.id', ''),
            git_remotename=self.remotename,
            annex_remoteuuid=self.annex.getuuid(),
        )

    def claimurl(self, url: str) -> bool:
        """Needs to check whether we want to handle a given URL

        If match expressions are configured, matches the URL against
all known URL expressions, and returns `True` if there is any match, or `False` otherwise. If no match expressions are configured, return `True` of the URL scheme is supported, or `False` otherwise. """ assert self.url_handler if self.match: return self.is_recognized_url(url) else: return self.url_handler.is_supported_url(url) def checkurl(self, url: str) -> bool: """ When running `git-annex addurl`, this is called after CLAIMURL indicated that we could handle a URL. It can return information on the URL target (e.g., size of the download, a target filename, or a sequence thereof with additional URLs pointing to individual components that would jointly make up the full download from the given URL. However, all of that is optional, and a simple `True` returned is sufficient to make git-annex call `TRANSFER RETRIEVE`. """ assert self.url_handler url_tmpl = self.url_tmpl or '' # try-except, because a URL template might need something # that cannot be extracted from this very URL. # we cannot consult a key that may already point to the same # content as the URL, and may have other information -- # we simply dont have it at this point try: mangled_url = self.get_mangled_url( url, url_tmpl, self.extract_tmpl_props( tmpl=url_tmpl, urls=[url], ), ) except KeyError as e: self.message( 'URL rewriting template requires unavailable component ' f'{e}, continuing with original URL', type='debug', ) # otherwise go ahead with the original URL. the template might # just be here to aid structured uploads mangled_url = url try: self.url_handler.stat(mangled_url) return True except UrlOperationsRemoteError as e: # leave a trace in the logs CapturedException(e) return False # we could return a URL/size/filename triple instead of a bool. # this would make git annex download from a URL different from the input, # and to the given filename. # it could be nice to report the (mangled) url, even if the handler reports # a potentially deviating URL (redirects, etc.). Keeping external # resolvers in the loop can be intentional, and users could provide # resolved URL if they consider that desirable. # however, going with the original URL kinda does that already, rewriting # is happening anyways. And not reporting triplets avoids the issue # of git-annex insisting to write into a dedicated directory for this # download. 
def transfer_retrieve(self, key: str, filename: str) -> None: assert self.url_handler self._check_retrieve( key, partial(self.url_handler.download, to_path=Path(filename)), ('download', 'from'), ) def checkpresent(self, key: str) -> bool: assert self.url_handler return self._check_retrieve( key, self.url_handler.stat, ('find', 'at'), ) def transfer_store(self, key: str, filename: str) -> None: assert self.url_handler self._store_delete( key, partial(self.url_handler.upload, from_path=Path(filename)), 'cannot store', ) def remove(self, key: str) -> None: def _delete(to_url: str) -> None: # we have to map parameter names to be able to use a common # helper with transfer_store(), because UrlOperations.upload() # needs to get the URL as a second argument, hence we need # to pass parameters as keyword-args assert self.url_handler self.url_handler.delete(url=to_url) return try: self._store_delete( key, _delete, 'refuses to delete', ) except UrlOperationsResourceUnknown: self.message( f'{key!r} not found at the remote, skipping', type='debug') # # helpers # def is_recognized_url(self, url: str) -> bool: return any(m.match(url) for m in self.match or []) def get_key_urls(self, key: str) -> list[str]: # ask git-annex for the URLs it has on record for the key. # this will also work within checkurl() for a temporary key # generated by git-annex after claimurl() urls = self.annex.geturls(key, prefix='') self.message(f'Known urls for {key!r}: {urls}', type='debug') if self.url_tmpl: # we have a rewriting template. extract all properties # from all known URLs and instantiate the template # to get the ONE desired URL props = self.extract_tmpl_props( tmpl=self.url_tmpl, urls=urls, key=key, ) url = self.get_mangled_url( fallback_url=None, tmpl=self.url_tmpl, tmpl_props=props, ) return [url] if url else [] # we have no rewriting template, and must return all URLs we know # to let the caller sort it out return urls def get_mangled_url( self, fallback_url: str | None, tmpl: str, tmpl_props: dict[str, str], ) -> str | None: if not tmpl: # the best thing we can do without a URL template is to # return the URL itself return fallback_url url = tmpl.format(**tmpl_props) return url def extract_tmpl_props( self, tmpl: str, *, urls: list[str] | None = None, key: str | None = None, ) -> dict[str, str]: # look up all the standard allprops = dict(self.persistent_tmpl_props) if key: allprops['annex_key'] = key # if we are working on a specific key, check the template if it # needs more key-specific properties. The conditionals below # are intentionally imprecise to avoid false-negatives given the # flexibility of the format-string-syntax if 'annex_dirhash' in tmpl: allprops['annex_dirhash'] = self.annex.dirhash(key) if 'annex_dirhash_lower' in tmpl: allprops['annex_dirhash_lower'] = self.annex.dirhash_lower(key) # try all URLs against all matchers for url in ensure_list(urls): for matcher in (self.match or []): match = matcher.match(url) if not match: # ignore any non-match continue # we only support named groups in expressions so this is sufficient props = match.groupdict() if any(p in allprops and allprops[p] != props[p] for p in props): self.message( 'Partial URL property shadowing detected. 
' 'Avoid by using unique expression match group names.', type='debug' ) allprops.update(props) return allprops def _check_retrieve( self, key: str, handler: Callable, action: tuple[str, str], ) -> bool: urls = self.get_key_urls(key) # depending on the configuration (rewriting template or not) # we could have one or more URLs to try for url in urls: try: handler(url) # we succeeded, no need to try again return True except UrlOperationsResourceUnknown: # general system access worked, but at the key location is nothing # to be found return False except UrlOperationsRemoteError as e: # return False only if we could be sure that the remote # system works properly and just the key is not around CapturedException(e) self.message( f'Failed to {action[0]} key {key!r} {action[1]} {url!r}', type='debug') raise RemoteError( f'Failed to {action[0]} {key!r} {action[1]} any of {urls!r}') def _store_delete( self, key: str, handler: Callable, action: str, ) -> None: if not self.url_tmpl: raise RemoteError( f'Remote {action} content without a configured URL template') urls = self.get_key_urls(key) # we have a rewriting template, so we expect exactly one URL assert len(urls) == 1 url = urls[0] try: handler(to_url=url) except UrlOperationsResourceUnknown: # pass-through, would happen when removing a non-existing key, # which git-annex wants to be a OK thing to happen. # handler in callers raise except Exception as e: # we need to raise RemoteError whenever we could not perform raise RemoteError from e _stat2checkurl_map = { 'content-length': 'size', } """Translate property names returned by AnyUrlOperations.stat() to those expected from checkurl()""" def main(): """cmdline entry point""" super_main( cls=UncurlRemote, remote_name='uncurl', description=\ "flexible access data (in archive systems) " "via a variety of identification schemes", ) datalad-next-1.4.1/datalad_next/archive_operations/000077500000000000000000000000001462321624600224055ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/archive_operations/__init__.py000066400000000000000000000015461462321624600245240ustar00rootroot00000000000000"""Handler for operations on various archive types All handlers implement the API defined by :class:`ArchiveOperations`. Available handlers: .. currentmodule:: datalad_next.archive_operations .. 
autosummary:: :toctree: generated TarArchiveOperations ZipArchiveOperations """ from .tarfile import TarArchiveOperations from .zipfile import ZipArchiveOperations # TODO REMOVE EVERYTHING BELOW FOR V2.0 import logging from abc import ( ABC, abstractmethod, ) from contextlib import contextmanager from typing import ( Any, Generator, IO, ) # this API is not cooked enough yet to promote it for 3rd-part extensions from .base import ArchiveOperations import datalad from datalad_next.config import ConfigManager from datalad_next.iter_collections.utils import FileSystemItem lgr = logging.getLogger('datalad.ext.next.archive_operations') datalad-next-1.4.1/datalad_next/archive_operations/base.py000066400000000000000000000061551462321624600237000ustar00rootroot00000000000000# allow for |-type UnionType declarations from __future__ import annotations from abc import ( ABC, abstractmethod, ) from contextlib import contextmanager from typing import ( Any, Generator, IO, ) import datalad from datalad_next.config import ConfigManager from datalad_next.iter_collections import FileSystemItem class ArchiveOperations(ABC): """Base class of all archives handlers Any handler can be used as a context manager to adequately acquire and release any resources necessary to access an archive. Alternatively, the :func:`~ArchiveOperations.close()` method can be called, when archive access is no longer needed. In addition to the :func:`~ArchiveOperations.open()` method for accessing archive item content, each handler implements the standard ``__contains__()``, and ``__iter__()`` methods. ``__contains__() -> bool`` reports whether the archive contains an items of a given identifier. ``__iter__()`` provides an iterator that yields :class:`~datalad_next.iter_collections.FileSystemItem` instances with information on each archive item. """ def __init__(self, location: Any, *, cfg: ConfigManager | None = None): """ Parameters ---------- location: Archive location identifier (path, URL, etc.) understood by a particular archive handler. cfg: ConfigManager, optional A config manager instance that implementations will consult for any configuration items they may support. """ self._cfg = cfg self._location = location def __str__(self) -> str: return f'{self.__class__.__name__}({self._location})' def __repr__(self) -> str: return \ f'{self.__class__.__name__}({self._location}, cfg={self._cfg!r})' @property def cfg(self) -> ConfigManager: """ConfigManager given to the constructor, or the session default""" if self._cfg is None: self._cfg = datalad.cfg return self._cfg def __enter__(self): """Default implementation that does nothing in particular""" return self def __exit__(self, exc_type, exc_value, traceback): """Default implementation that only calls ``.close()``""" self.close() # we have no desire to suppress exception, indicate standard # handling by not returning True return @contextmanager @abstractmethod def open(self, item: Any) -> Generator[IO | None, None, None]: """Get a file-like for an archive item Parameters ---------- item: Any identifier for an archive item supported by a particular handler """ raise NotImplementedError def close(self) -> None: """Default implementation for closing a archive handler This default implementation does nothing. 
""" pass @abstractmethod def __contains__(self, item: Any) -> bool: raise NotImplementedError @abstractmethod def __iter__(self) -> Generator[FileSystemItem, None, None]: raise NotImplementedError datalad-next-1.4.1/datalad_next/archive_operations/tarfile.py000066400000000000000000000073641462321624600244170ustar00rootroot00000000000000"""TAR archive operation handler""" # allow for |-type UnionType declarations from __future__ import annotations import logging import tarfile from contextlib import contextmanager from pathlib import ( Path, PurePosixPath, ) from typing import ( Generator, IO, ) from datalad_next.config import ConfigManager # TODO we might just want to do it in reverse: # move the code of `iter_tar` in here and have it call # `TarArchiveOperations(path).__iter__()` instead. # However, the flexibility to have `iter_tar()` behave # differently depending on parameters (fp=True/False) # is nice, and `__iter__()` only has `self`, such that # any customization would need to be infused in the whole # class. Potentially cumbersome. from datalad_next.iter_collections import ( TarfileItem, iter_tar, ) from .base import ArchiveOperations lgr = logging.getLogger('datalad.ext.next.archive_operations.tarfile') class TarArchiveOperations(ArchiveOperations): """Handler for a TAR archive on a local file system Any methods that take an archive item/member name as an argument accept a POSIX path string, or any `PurePath` instance. """ def __init__(self, location: Path, *, cfg: ConfigManager | None = None): """ Parameters ---------- location: Path TAR archive location cfg: ConfigManager, optional A config manager instance that is consulted for any supported configuration items """ # TODO expose `mode` other kwargs of `tarfile.TarFile` super().__init__(location, cfg=cfg) # Consider supporting file-like for `location`, # see tarfile.open(fileobj=) self._tarfile_path = location self._tarfile = None @property def tarfile(self) -> tarfile.TarFile: """Returns `TarFile` instance, after creating it on-demand The instance is cached, and needs to be released by calling ``.close()`` if called outside a context manager. """ if self._tarfile is None: self._tarfile = tarfile.open(self._tarfile_path, 'r') return self._tarfile def close(self) -> None: """Closes any opened TAR file handler""" if self._tarfile: self._tarfile.close() self._tarfile = None @contextmanager def open(self, item: str | PurePosixPath) -> Generator[IO | None]: """Get a file-like for a TAR archive item The file-like object allows to read from the archive-item specified by `item`. Parameters ---------- item: str | PurePath The identifier must be a POSIX path string, or a `PurePath` instance. Returns ------- IO | None A file-like object to read bytes from the item, if the item is a regular file, else `None`. (This is returned by the context manager that is created via the decorator `@contextmanager`.) 
Raises ------ KeyError If no item with the name `item` can be found in the tar-archive """ with self.tarfile.extractfile(_anyid2membername(item)) as fp: yield fp def __contains__(self, item: str | PurePosixPath) -> bool: try: self.tarfile.getmember(_anyid2membername(item)) return True except KeyError: return False def __iter__(self) -> Generator[TarfileItem, None, None]: # if fp=True is needed, either `iter_tar()` can be used # directly, or `TarArchiveOperations.open` yield from iter_tar(self._tarfile_path, fp=False) def _anyid2membername(item_id: str | PurePosixPath) -> str: if isinstance(item_id, PurePosixPath): return item_id.as_posix() else: return item_id datalad-next-1.4.1/datalad_next/archive_operations/tests/000077500000000000000000000000001462321624600235475ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/archive_operations/tests/__init__.py000066400000000000000000000000001462321624600256460ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/archive_operations/tests/test_tarfile.py000066400000000000000000000051651462321624600266150ustar00rootroot00000000000000from __future__ import annotations from dataclasses import dataclass from pathlib import ( Path, PurePosixPath, ) from typing import Generator import pytest from datalad_next.iter_collections import FileSystemItemType from datalad_next.tests.marker import skipif_no_network from ..tarfile import TarArchiveOperations @dataclass class _TestArchive: path: Path item_count: int content: bytes target_hash: dict[str, str] @pytest.fixture(scope='session') def structured_sample_tar_xz( sample_tar_xz ) -> Generator[_TestArchive, None, None]: yield _TestArchive( path=sample_tar_xz, item_count=6, content=b'123\n', target_hash={ 'SHA1': 'b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4', 'md5': 'd700214df5487801e8ee23d31e60382a', } ) @skipif_no_network def test_tararchive_basics(structured_sample_tar_xz: _TestArchive): spec = structured_sample_tar_xz # this is intentionally a hard-coded POSIX relpath member_name = 'test-archive/onetwothree.txt' with TarArchiveOperations(spec.path) as archive_ops: with archive_ops.open(member_name) as member: assert member.read() == spec.content with archive_ops.open(PurePosixPath(member_name)) as member: assert member.read() == spec.content @skipif_no_network def test_tararchive_contain(structured_sample_tar_xz: _TestArchive): # this is intentionally a hard-coded POSIX relpath member_name = 'test-archive/onetwothree.txt' archive_ops = TarArchiveOperations(structured_sample_tar_xz.path) # POSIX path str assert member_name in archive_ops # POSIX path as obj assert PurePosixPath(member_name) in archive_ops assert 'bogus' not in archive_ops @skipif_no_network def test_tararchive_iterator(structured_sample_tar_xz: _TestArchive): spec = structured_sample_tar_xz with TarArchiveOperations(spec.path) as archive_ops: items = list(archive_ops) assert len(items) == spec.item_count for item in items: assert item.name in archive_ops @skipif_no_network def test_open(structured_sample_tar_xz: _TestArchive): spec = structured_sample_tar_xz file_pointer = set() with TarArchiveOperations(spec.path) as tf: for item in tf: if item.type == FileSystemItemType.file: with tf.open(str(PurePosixPath(item.name))) as fp: file_pointer.add(fp) assert fp.read(len(spec.content)) == spec.content # check the fp before we close the archive handler for fp in file_pointer: assert fp.closed is True 
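

# The following is not part of the original test module: a minimal sketch of
# exercising the handler without a context manager, mirroring
# `test_ziparchive_noncontext` in the zipfile tests. It reuses the same
# fixture and, like that test, inspects the private `_tarfile` attribute.
@skipif_no_network
def test_tararchive_noncontext(structured_sample_tar_xz: _TestArchive):
    archive_ops = TarArchiveOperations(structured_sample_tar_xz.path)
    # accessing `.tarfile` opens the archive on demand
    assert archive_ops.tarfile is not None
    archive_ops.close()
    assert archive_ops._tarfile is None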
datalad-next-1.4.1/datalad_next/archive_operations/tests/test_zipfile.py000066400000000000000000000060601462321624600266240ustar00rootroot00000000000000from __future__ import annotations from dataclasses import dataclass from pathlib import ( Path, PurePosixPath, ) from typing import Generator import pytest from datalad_next.iter_collections import FileSystemItemType from ..zipfile import ZipArchiveOperations @dataclass class _TestArchive: path: Path item_count: int content: bytes target_hash: dict[str, str] @pytest.fixture(scope='session') def structured_sample_zip(sample_zip) -> Generator[_TestArchive, None, None]: yield _TestArchive( path=sample_zip, item_count=4, content=b'zip-123\n', target_hash={ 'SHA1': 'b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4', 'md5': 'd700214df5487801e8ee23d31e60382a', } ) def test_ziparchive_basics(structured_sample_zip: _TestArchive): spec = structured_sample_zip # this is intentionally a hard-coded POSIX relpath member_name = 'test-archive/onetwothree.txt' with ZipArchiveOperations(spec.path) as archive_ops: with archive_ops.open(member_name) as member: assert member.read() == spec.content with archive_ops.open(PurePosixPath(member_name)) as member: assert member.read() == spec.content def test_ziparchive_contain(structured_sample_zip: _TestArchive): # this is intentionally a hard-coded POSIX relpath member_name = 'test-archive/onetwothree.txt' with ZipArchiveOperations(structured_sample_zip.path) as archive_ops: assert member_name in archive_ops assert PurePosixPath(member_name) in archive_ops assert 'bogus' not in archive_ops def test_ziparchive_iterator(structured_sample_zip: _TestArchive): spec = structured_sample_zip with ZipArchiveOperations(spec.path) as archive_ops: items = list(archive_ops) assert len(items) == spec.item_count for item in items: assert item.name in archive_ops def test_open(structured_sample_zip: _TestArchive): spec = structured_sample_zip file_pointer = set() with ZipArchiveOperations(spec.path) as zf: for item in zf: if item.type != FileSystemItemType.file: continue with zf.open(item.name) as fp: file_pointer.add(fp) assert fp.read(len(spec.content)) == spec.content for fp in file_pointer: assert fp.closed is True def test_open_zipinfo(structured_sample_zip: _TestArchive): spec = structured_sample_zip with ZipArchiveOperations(spec.path) as zf: # get zipfile-native ZipInfo items for item in zf.zipfile.infolist(): if item.filename.endswith('/'): # crude approach to skippping non-files continue with zf.open(item) as fp: assert fp.read(len(spec.content)) == spec.content def test_ziparchive_noncontext(structured_sample_zip: _TestArchive): spec = structured_sample_zip zip = ZipArchiveOperations(spec.path) assert zip.zipfile.filename == str(spec.path) zip.close() assert zip._zipfile is None datalad-next-1.4.1/datalad_next/archive_operations/zipfile.py000066400000000000000000000103331462321624600244210ustar00rootroot00000000000000"""ZIP archive operation handler""" from __future__ import annotations import logging import zipfile from contextlib import contextmanager from pathlib import ( Path, PurePosixPath, ) from typing import ( Generator, IO, ) from zipfile import ZipInfo from datalad_next.config import ConfigManager # TODO we might just want to do it in reverse: # move the code of `iter_zip` in here and have it call # `ZipArchiveOperations(path).__iter__()` instead. 
# However, the flexibility to have `iter_zip()` behave # differently depending on parameters (fp=True/False) # is nice, and `__iter__()` only has `self`, such that # any customization would need to be infused in the whole # class. Potentially cumbersome. from datalad_next.iter_collections import ( ZipfileItem, iter_zip, ) from .base import ArchiveOperations lgr = logging.getLogger('datalad.ext.next.archive_operations.zipfile') class ZipArchiveOperations(ArchiveOperations): """Handler for a ZIP archive on a local file system """ def __init__(self, location: Path, *, cfg: ConfigManager | None = None, **kwargs): """ Parameters ---------- location: Path ZIP archive location cfg: ConfigManager, optional A config manager instance that is consulted for any supported configuration items **kwargs: dict Keyword arguments that are passed to zipfile.ZipFile-constructor """ super().__init__(location, cfg=cfg) self.zipfile_kwargs = kwargs # Consider supporting file-like for `location`, # see zipfile.ZipFile(file_like_object) self._zipfile_path = location self._zipfile: zipfile.ZipFile | None = None @property def zipfile(self) -> zipfile.ZipFile: """Access to the wrapped ZIP archive as a ``zipfile.ZipFile``""" if self._zipfile is None: self._zipfile = zipfile.ZipFile( self._zipfile_path, **self.zipfile_kwargs ) return self._zipfile def close(self) -> None: """Calls `.close()` on the underlying ``zipfile.ZipFile`` instance""" if self._zipfile: self._zipfile.close() self._zipfile = None @contextmanager def open( self, item: str | PurePosixPath | ZipInfo, **kwargs, ) -> Generator[IO | None, None, None]: """Context manager, returning an open file for a member of the archive. The file-like object will be closed when the context-handler exits. This method can be used in conjunction with ``__iter__`` to read any file from an archive:: with ZipArchiveOperations(archive_path) as zf: for item in zf: if item.type != FileSystemItemType.file: continue with zf.open(item.name) as fp: ... Parameters ---------- item: str | PurePosixPath | zipfile.ZipInfo Name, path, or ZipInfo-instance that identifies an item in the zipfile kwargs: dict Keyword arguments that will be used for ZipFile.open() Returns ------- IO A file-like object to read bytes from the item or to write bytes to the item. """ with self.zipfile.open(_anyzipid2membername(item), **kwargs) as fp: yield fp def __contains__(self, item: str | PurePosixPath | ZipInfo) -> bool: try: self.zipfile.getinfo(_anyzipid2membername(item)) return True except KeyError: return False def __iter__(self) -> Generator[ZipfileItem, None, None]: # if fp=True is needed, either `iter_zip()` can be used # directly, or `ZipArchiveOperations.open` yield from iter_zip(self._zipfile_path, fp=False) def _anyzipid2membername(item: str | PurePosixPath | ZipInfo) -> str: """Convert any supported archive member ID for ``zipfile.open|getinfo()`` """ if isinstance(item, ZipInfo): return item.filename elif isinstance(item, PurePosixPath): return item.as_posix() return item datalad-next-1.4.1/datalad_next/commands/000077500000000000000000000000001462321624600203225ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/commands/__init__.py000066400000000000000000000071661462321624600224450ustar00rootroot00000000000000"""Essential tooling for implementing DataLad commands This module provides the advanced command base class :class:`ValidatedInterface`, for implementing commands with uniform argument validation and structured error reporting. 
Beyond that, any further components necessary to implement commands are
imported in this module to offer a one-stop-shop experience. This includes
``build_doc``, ``datasetmethod``, and ``eval_results``, among others.

.. currentmodule:: datalad_next.commands
.. autosummary::
   :toctree: generated

   CommandResult
   CommandResultStatus
   status.StatusResult
"""
from __future__ import annotations

from typing import Dict

from datalad.interface.base import (
    Interface,
    build_doc,
)
from datalad.interface.results import get_status_dict
from datalad.interface.utils import generic_result_renderer
from datalad.interface.base import eval_results
from datalad.support.param import Parameter

from datalad_next.constraints import (
    EnsureCommandParameterization,
    ParameterConstraintContext,
)
from datalad_next.datasets import datasetmethod

from .results import (
    CommandResult,
    CommandResultStatus,
)


class ValidatedInterface(Interface):
    """Alternative base class for commands with uniform parameter validation

    .. note::
       This interface is a draft. Usage is encouraged, but future changes
       are to be expected.

    Commands derived from the traditional ``Interface`` class have no
    built-in input parameter validation beyond CLI input validation of
    individual parameters. Consequently, each command must perform custom
    parameter validation, which often leads to complex boilerplate code that
    is largely unrelated to the purpose of a particular command.

    This class is part of a framework for uniform parameter validation,
    regardless of the target API (Python, CLI, GUI). The implementation of a
    command's ``__call__`` method can focus on the core purpose of the
    command, while validation and error handling can be delegated elsewhere.
    A validator for all individual parameters and the joint-set of all
    parameters can be provided through the :meth:`get_parameter_validator`
    method.

    To transition a command from ``Interface`` to ``ValidatedInterface``,
    replace the base class declaration and declare a ``_validator_`` class
    member. Any ``constraints=`` declaration for ``Parameter`` instances
    should either be removed, or moved to the corresponding entry in
    ``_validator_``.
    """
    _validator_: EnsureCommandParameterization | None = None

    @classmethod
    def get_parameter_validator(cls) -> EnsureCommandParameterization | None:
        """Returns a validator for the entire parameter set of a command

        If parameter validation shall be performed, this method must return
        an instance of
        :class:`~datalad_next.constraints.EnsureCommandParameterization`.
        All parameters will be passed through this validator, and only its
        output will be passed to the underlying command's ``__call__``
        method. Consequently, the core implementation of a command only
        needs to support the output values of the validators declared by
        itself.

        Factoring out input validation, normalization, type coercion etc.
        into a dedicated component also makes it accessible for upfront
        validation and improved error reporting via the different DataLad
        APIs.

        If a command does not implement parameter validation in this
        fashion, this method must return ``None``.

        The default implementation returns the ``_validator_`` class member.
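
        A minimal sketch of such a declaration on a derived command (the
        parameter names and constraints are purely illustrative)::

            class MyCommand(ValidatedInterface):
                _validator_ = EnsureCommandParameterization(dict(
                    path=EnsurePath(),
                    mode=EnsureChoice('plain', 'detailed'),
                ))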
""" return cls._validator_ datalad-next-1.4.1/datalad_next/commands/create_sibling_webdav.py000066400000000000000000000633751462321624600252140ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 et: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """High-level interface for creating a combi-target on a WebDAV capable server """ import logging from urllib.parse import ( quote as urlquote, urlunparse, ) from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, Parameter, build_doc, datasetmethod, eval_results, generic_result_renderer, get_status_dict, ) from datalad.interface.common_opts import ( recursion_flag, recursion_limit ) from datalad_next.constraints import ( ConstraintError, DatasetParameter, EnsureBool, EnsureChoice, EnsureDataset, EnsureInt, EnsureParsedURL, EnsureRange, EnsureRemoteName, EnsureStr, ParameterConstraintContext, ) from datalad_next.utils import CredentialManager from datalad_next.utils import ( get_specialremote_credential_properties, patched_env, update_specialremote_credential, _yield_ds_w_matching_siblings, ) __docformat__ = "restructuredtext" lgr = logging.getLogger('datalad.distributed.create_sibling_webdav') class CreateSiblingWebDAVParamValidator(EnsureCommandParameterization): def __init__(self): super().__init__( param_constraints=dict( url=EnsureParsedURL( required=['scheme', 'netloc'], forbidden=['query', 'fragment'], match='^(http|https)://', ), dataset=EnsureDataset( installed=True, purpose='create WebDAV sibling(s)'), name=EnsureRemoteName(), storage_name=EnsureRemoteName(), mode=EnsureChoice( 'annex', 'filetree', 'annex-only', 'filetree-only', 'git-only', ), # TODO https://github.com/datalad/datalad-next/issues/131 credential=EnsureStr(), existing=EnsureChoice('skip', 'error', 'reconfigure'), recursive=EnsureBool(), recursion_limit=EnsureInt() & EnsureRange(min=0), ), validate_defaults=('dataset',), joint_constraints={ ParameterConstraintContext(('url',), 'url'): self._validate_param_url, ParameterConstraintContext( ('url', 'name'), 'default name'): self._validate_default_name, ParameterConstraintContext( ('mode', 'name', 'storage_name'), 'default storage name'): self._validate_default_storage_name, ParameterConstraintContext( ('mode', 'name', 'storage_name'), 'default storage name'): self._validate_default_storage_name, ParameterConstraintContext( ('existing', 'recursive', 'name', 'storage_name', 'dataset', 'mode')): self._validate_existing_names, }, ) def _validate_param_url(self, url): if url.scheme == "http": lgr.warning( f"Using 'http:' ({url.geturl()!r}) means that WebDAV " "credentials are sent unencrypted over network links. 
" "Consider using 'https:'.") def _validate_default_name(self, url, name): if not name: # not using .netloc to avoid ports to show up in the name return {'name': url.hostname} def _validate_default_storage_name(self, mode, name, storage_name): if mode in ('annex-only', 'filetree-only') and storage_name: lgr.warning( "Sibling name will be used for storage sibling in " "storage-sibling-only mode, but a storage sibling name " "was provided" ) if mode == 'git-only' and storage_name: lgr.warning( "Storage sibling setup disabled, but a storage sibling name " "was provided" ) if mode != 'git-only' and not storage_name: storage_name = f"{name}-storage" if mode != 'git-only' and name == storage_name: # leads to unresolvable, circular dependency with publish-depends self.raise_for( dict(mode=mode, name=name, storage_name=storage_name), "sibling names must not be equal", ) return dict(mode=mode, name=name, storage_name=storage_name) def _validate_existing_names( self, existing, recursive, name, storage_name, dataset, mode): if recursive: # we don't do additional validation for recursive processing, # this has to be done when things are running, because an # upfront validation would require an expensive traversal return if existing != 'error': # nothing to check here return if not isinstance(dataset, DatasetParameter): # we did not get a proper dataset parameter, # hence cannot tailor to a dataset to check a remote # name against return validator = EnsureRemoteName(known=False, dsarg=dataset) try: if mode != 'annex-only': validator(name) if mode != 'git-only': validator(storage_name) except ConstraintError as e: self.raise_for( dict(existing=existing, recursive=recursive, name=name, storage_name=storage_name, dataset=dataset, mode=mode), e.msg, ) return @build_doc class CreateSiblingWebDAV(ValidatedInterface): """Create a sibling(-tandem) on a WebDAV server WebDAV is a standard HTTP protocol extension for placing files on a server that is supported by a number of commercial storage services (e.g. 4shared.com, box.com), but also instances of cloud-storage solutions like Nextcloud or ownCloud. These software packages are also the basis for some institutional or public cloud storage solutions, such as EUDAT B2DROP. For basic usage, only the URL with the desired dataset location on a WebDAV server needs to be specified for creating a sibling. However, the sibling setup can be flexibly customized (no storage sibling, or only a storage sibling, multi-version storage, or human-browsable single-version storage). This command does not check for conflicting content on the WebDAV server! When creating siblings recursively for a dataset hierarchy, subdataset exports are placed at their corresponding relative paths underneath the root location on the WebDAV server. Collaboration on WebDAV siblings The primary use case for WebDAV siblings is dataset deposition, where only one site is uploading dataset and file content updates. For collaborative workflows with multiple contributors, please make sure to consult the documentation on the underlying ``datalad-annex::`` Git remote helper for advice on appropriate setups: http://docs.datalad.org/projects/next/ Git-annex implementation details Storage siblings are presently configured to NOT be enabled automatically on cloning a dataset. Due to a limitation of git-annex, this would initially fail (missing credentials). Instead, an explicit ``datalad siblings enable --name `` command must be executed after cloning. If necessary, it will prompt for credentials. 
This command does not (and likely will not) support embedding credentials in the repository (see ``embedcreds`` option of the git-annex ``webdav`` special remote; https://git-annex.branchable.com/special_remotes/webdav), because such credential copies would need to be updated, whenever they change or expire. Instead, credentials are retrieved from DataLad's credential system. In many cases, credentials are determined automatically, based on the HTTP authentication realm identified by a WebDAV server. This command does not support setting up encrypted remotes (yet). Neither for the storage sibling, nor for the regular Git-remote. However, adding support for it is primarily a matter of extending the API of this command, and passing the respective options on to the underlying git-annex setup. This command does not support setting up chunking for webdav storage siblings (https://git-annex.branchable.com/chunking). """ _examples_ = [ dict(text="Create a WebDAV sibling tandem for storage of a dataset's " "file content and revision history. A user will be prompted " "for any required credentials, if they are not yet known.", code_py="create_sibling_webdav(url='https://webdav.example.com/myds')", code_cmd='datalad create-sibling-webdav "https://webdav.example.com/myds"'), dict(text="Such a dataset can be cloned by DataLad via a specially " "crafted URL. Again, credentials are automatically " "determined, or a user is prompted to enter them", code_py="clone('datalad-annex::?type=webdav&encryption=none&url=https://webdav.example.com/myds')", code_cmd='datalad clone "datalad-annex::?type=webdav&encryption=none&url=https://webdav.example.com/myds"'), dict( text="A sibling can also be created with a human-readable file " "tree, suitable for data exchange with non-DataLad users, " "but only able to host a single version of each file", code_py="create_sibling_webdav(url='https://example.com/browseable', mode='filetree')", code_cmd='datalad create-sibling-webdav --mode filetree "https://example.com/browsable"'), dict(text="Cloning such dataset siblings is possible via a convenience " "URL", code_py="clone('webdavs://example.com/browseable')", code_cmd='datalad clone "webdavs://example.com/browsable"'), dict(text="In all cases, the storage sibling needs to explicitly " "enabled prior to file content retrieval", code_py="siblings('enable', name='example.com-storage')", code_cmd='datalad siblings enable --name example.com-storage'), ] _params_ = dict( url=Parameter( args=("url",), metavar='URL', doc="URL identifying the sibling root on the target WebDAV server"), dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to process. If no dataset is given, an attempt is made to identify the dataset based on the current working directory"""), name=Parameter( args=('-s', '--name',), metavar='NAME', doc="""name of the sibling. If none is given, the hostname-part of the WebDAV URL will be used. With `recursive`, the same name will be used to label all the subdatasets' siblings."""), storage_name=Parameter( args=("--storage-name",), metavar="NAME", doc="""name of the storage sibling (git-annex special remote). Must not be identical to the sibling name. If not specified, defaults to the sibling name plus '-storage' suffix. If only a storage sibling is created, this setting is ignored, and the primary sibling name is used."""), credential=Parameter( args=("--credential",), metavar='NAME', doc="""name of the credential providing a user/password credential to be used for authorization. 
The credential can be supplied via configuration setting 'datalad.credential..user|secret', or environment variable DATALAD_CREDENTIAL__USER|SECRET, or will be queried from the active credential store using the provided name. If none is provided, the last-used credential for the authentication realm associated with the WebDAV URL will be used. Only if a credential name was given, it will be encoded in the URL of the created WebDAV Git remote, credential auto-discovery will be performed on each remote access.""", ), existing=Parameter( args=("--existing",), doc="""action to perform, if a (storage) sibling is already configured under the given name. In this case, sibling creation can be skipped ('skip') or the sibling (re-)configured ('reconfigure') in the dataset, or the command be instructed to fail ('error').""", ), recursive=recursion_flag, recursion_limit=recursion_limit, mode=Parameter( args=("--mode",), doc="""Siblings can be created in various modes: full-featured sibling tandem, one for a dataset's Git history and one storage sibling to host any number of file versions ('annex'). A single sibling for the Git history only ('git-only'). A single annex sibling for multi-version file storage only ('annex-only'). As an alternative to the standard (annex) storage sibling setup that is capable of storing any number of historical file versions using a content hash layout ('annex'|'annex-only'), the 'filetree' mode can used. This mode offers a human-readable data organization on the WebDAV remote that matches the file tree of a dataset (branch). However, it can, consequently, only store a single version of each file in the file tree. This mode is useful for depositing a single dataset snapshot for consumption without DataLad. The 'filetree' mode nevertheless allows for cloning such a single-version dataset, because the full dataset history can still be pushed to the WebDAV server. Git history hosting can also be turned off for this setup ('filetree-only'). When both a storage sibling and a regular sibling are created together, a publication dependency on the storage sibling is configured for the regular sibling in the local dataset clone. """), ) _validator_ = CreateSiblingWebDAVParamValidator() @staticmethod @datasetmethod(name='create_sibling_webdav') @eval_results def __call__( url, *, dataset=None, name=None, storage_name=None, mode='annex', credential=None, existing='error', recursive=False, recursion_limit=None): ds = dataset.ds res_kwargs = dict( action='create_sibling_webdav', logger=lgr, refds=ds.path, ) # Query existing siblings upfront in order to fail early on # existing=='error', since misconfiguration (particularly of special # remotes) only to fail in a subdataset later on with that config, can # be quite painful. 
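        # (that is: a failure that only surfaces half-way through a
        # recursive run could leave partially configured (special)
        # remotes behind, which are cumbersome to clean up)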
if existing == 'error': # even if we have to fail, let's report all conflicting siblings # in subdatasets, an outside controller can stop the generator # if desired failed = False for dpath, sname in _yield_ds_w_matching_siblings( ds, (name, storage_name), recursive=recursive, recursion_limit=recursion_limit): res = get_status_dict( status='error', message=( "a sibling %r is already configured in dataset %r", sname, dpath), type='sibling', name=sname, ds=ds, **res_kwargs, ) failed = True yield res if failed: return # determine the credential upfront # can be done once at the start, all siblings will live on the same # server # if all goes well, we'll store a credential (update) at the very end credman = CredentialManager(ds.config) credname, credprops = credman.obtain( credential, prompt='User name and password are required for WebDAV access ' f'at {url.geturl()}', query_props=get_specialremote_credential_properties( dict(type='webdav', url=url.geturl())), type_hint='user_password', # make it raise ValueError when the critical components are missing expected_props=['user', 'secret'], ) cred_user = credprops['user'] cred_password = credprops['secret'] def _dummy(ds, refds, **kwargs): """Small helper to prepare the actual call to _create_sibling_webdav() for a particular (sub)dataset. We only have kwargs to catch whatever it throws at us. """ relpath = ds.pathobj.relative_to(refds.pathobj) if not ds == refds else None if relpath: dsurl = f"{urlunparse(url)}/{relpath}" else: dsurl = url.geturl() return _create_sibling_webdav( ds, dsurl, # we pass the given, not the discovered, credential name! # given a name means "take this particular one", not giving a # name means "take what is best". Only if we pass this # information on, we achieve maintaining this behavior credential_name=credential, credential=(cred_user, cred_password), mode=mode, name=name, storage_name=storage_name, existing=existing, ) # Generate a sibling for dataset "ds", and for sub-datasets if recursive # is True. for res in ds.foreach_dataset( _dummy, return_type='generator', result_renderer='disabled', recursive=recursive, # recursive False is not enough to disable recursion # https://github.com/datalad/datalad/issues/6659 recursion_limit=0 if not recursive else recursion_limit, ): # unwind result generator for partial_result in res.get('result', []): yield dict(res_kwargs, **partial_result) # this went well, update the credential update_specialremote_credential( 'webdav', credman, credname, credprops, credtype_hint='user_password', duplicate_hint= 'Specify a credential name via the `credential` parameter ' ' and/or configure a credential with the datalad-credentials ' 'command{}'.format( f' with a `realm={credprops["realm"]}` property' if 'realm' in credprops else ''), ) @staticmethod def custom_result_renderer(res, **kwargs): from datalad_next.uis import ( ansi_colors as ac, ui_switcher as ui, ) from os.path import relpath if res['status'] != 'ok' or 'sibling_webdav' not in res['action'] or \ res['type'] != 'sibling': # It's either 'notneeded' (not rendered), an `error`/`impossible` or # something unspecific to this command. No special rendering # needed. 
generic_result_renderer(res) return ui.message('{action}({status}): {path} [{name}{url}]'.format( action=ac.color_word(res['action'], ac.BOLD), path=relpath(res['path'], res['refds']) if 'refds' in res else res['path'], name=ac.color_word(res.get('name', ''), ac.MAGENTA), url=f": {res['url']}" if 'url' in res else '', status=ac.color_status(res['status']), )) def _create_sibling_webdav( ds, url, *, credential_name, credential, mode='git-only', name=None, storage_name=None, existing='error'): """ Parameters ---------- ds: Dataset url: str credential_name: str credential: tuple mode: str, optional name: str, optional storage_name: str, optional existing: str, optional """ # simplify downstream logic, export yes or no export_storage = 'filetree' in mode existing_siblings = [ r[1] for r in _yield_ds_w_matching_siblings( ds, (name, storage_name), recursive=False) ] if mode != 'git-only': yield from _create_storage_sibling( ds, url, storage_name, credential, export=export_storage, existing=existing, known=storage_name in existing_siblings, ) if mode not in ('annex-only', 'filetree-only'): yield from _create_git_sibling( ds, url, name, credential_name, credential, export=export_storage, existing=existing, known=name in existing_siblings, publish_depends=storage_name if mode != 'git-only' else None ) def _get_skip_sibling_result(name, ds, type_): return get_status_dict( action='create_sibling_webdav{}'.format( '.storage' if type_ == 'storage' else ''), ds=ds, status='notneeded', message=("skipped creating %r sibling %r, already exists", type_, name), name=name, type='sibling', ) def _create_git_sibling( ds, url, name, credential_name, credential, export, existing, known, publish_depends=None): """ Parameters ---------- ds: Dataset url: str name: str credential_name: str credential: tuple export: bool existing: {skip, error, reconfigure} known: bool Flag whether the sibling is a known remote (no implied necessary existence of content on the remote). publish_depends: str or None publication dependency to set """ if known and existing == 'skip': yield _get_skip_sibling_result(name, ds, 'git') return remote_url = \ "datalad-annex::?type=webdav&encryption=none" \ "&exporttree={export}&url={url}".format( export='yes' if export else 'no', # urlquote, because it goes into the query part of another URL url=urlquote(url)) if credential_name: # we need to quote the credential name too. # e.g., it is not uncommon for credentials to be named after URLs remote_url += f'&dlacredential={urlquote(credential_name)}' # announce the sibling to not have an annex (we have a dedicated # storage sibling for that) to avoid needless annex-related processing # and speculative whining by `siblings()` ds.config.set(f'remote.{name}.annex-ignore', 'true', scope='local') for r in ds.siblings( # action must always be 'configure' (not 'add'), because above we just # made a remote {name} known, which is detected by `sibling()`. 
Any # conflict detection must have taken place separately before this call # https://github.com/datalad/datalad/issues/6649 action="configure", name=name, url=remote_url, # this is presently the default, but it may change fetch=False, publish_depends=publish_depends, return_type='generator', result_renderer='disabled'): if r.get('action') == 'configure-sibling': r['action'] = 'reconfigure_sibling_webdav' \ if known and existing == 'reconfigure' \ else 'create_sibling_webdav' yield r def _create_storage_sibling( ds, url, name, credential, export, existing, known=False): """ Parameters ---------- ds: Dataset url: str name: str credential: tuple export: bool existing: {skip, error, reconfigure} (Presently unused) known: bool Flag whether the sibling is a known remote (no implied necessary existence of content on the remote). """ if known and existing == 'skip': yield _get_skip_sibling_result(name, ds, 'storage') return cmd_args = [ 'enableremote' if known and existing == 'reconfigure' else 'initremote', name, "type=webdav", f"url={url}", f"exporttree={'yes' if export else 'no'}", "encryption=none", # for now, no autoenable. It would result in unconditional # error messages on clone #https://github.com/datalad/datalad/issues/6634 #"autoenable=true" ] # Add a git-annex webdav special remote. This requires to set # the webdav environment variables accordingly. with patched_env(WEBDAV_USERNAME=credential[0], WEBDAV_PASSWORD=credential[1], ): ds.repo.call_annex(cmd_args) yield get_status_dict( ds=ds, status='ok', action='reconfigure_sibling_webdav.storage' if known and existing == 'reconfigure' else 'create_sibling_webdav.storage', name=name, type='sibling', url=url, ) datalad-next-1.4.1/datalad_next/commands/credentials.py000066400000000000000000000453621462321624600232030ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See LICENSE file distributed along with the datalad_osf package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Credential management and query""" __docformat__ = 'restructuredtext' import json import logging from typing import Dict from datalad import ( cfg as dlcfg, ) from datalad_next.credman import ( CredentialManager, verify_property_names, ) from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, Parameter, ParameterConstraintContext, build_doc, eval_results, generic_result_renderer, get_status_dict, ) from datalad_next.exceptions import CapturedException from datalad_next.datasets import datasetmethod from datalad_next.constraints import ( EnsureChoice, EnsureDataset, EnsureNone, EnsureStr, ) from datalad_next.utils import ParamDictator lgr = logging.getLogger('datalad.local.credentials') credential_actions = ('query', 'get', 'set', 'remove') class CredentialsParamValidator(EnsureCommandParameterization): def __init__(self): super().__init__( param_constraints=dict( action=EnsureChoice(*credential_actions), dataset=EnsureDataset( # if given, it must also exist as a source for # configuration items and/or credentials installed=True, purpose='manage credentials', ), name=EnsureStr(), prompt=EnsureStr(), ), # order in joint_constraints is relevant! 
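            # (the two normalization contexts are listed first, so that the
            # subsequent action-specific requirement checks operate on
            # already-normalized 'name' and 'spec' values)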
joint_constraints={ ParameterConstraintContext(('action', 'name', 'spec'), 'CLI normalization'): self._normalize_cli_params, ParameterConstraintContext(('spec',), 'credential spec normalization'): self._normalize_spec, # check parameter requirements for particular actions ParameterConstraintContext(('action', 'name'), 'remove-action requirements'): self._check_remove_requirements, ParameterConstraintContext(('action', 'name', 'spec'), 'get-action requirements'): self._check_get_requirements, }, ) def _normalize_cli_params(self, action, name, spec): if action in ('get', 'set', 'remove') and not name and spec \ and isinstance(spec, list): # spec came in like from the CLI (but doesn't have to be from # there) and we have no name set if spec[0][0] != ':' and '=' not in spec[0]: name = spec[0] spec = spec[1:] return dict(action=action, name=name, spec=spec) def _normalize_spec(self, spec): # `spec` could be many things, make uniform dict return dict(spec=normalize_specs(spec)) def _check_remove_requirements(self, action, name): if action == 'remove' and not name: self.raise_for( dict(action=action, name=name), 'no credential name provided', ) def _check_get_requirements(self, action, name, spec): if action == 'get' and not name and not spec: self.raise_for( dict(action=action, name=name, spec=spec), 'no name or credential property specified', ) @build_doc class Credentials(ValidatedInterface): """Credential management and query This command enables inspection and manipulation of credentials used throughout DataLad. The command provides four basic actions: QUERY When executed without any property specification, all known credentials with all their properties will be yielded. Please note that this may not include credentials that only comprise of a secret and no other properties, or legacy credentials for which no trace in the configuration can be found. Therefore, the query results are not guaranteed to contain all credentials ever configured by DataLad. When additional property/value pairs are specified, only credentials that have matching values for all given properties will be reported. This can be used, for example, to discover all suitable credentials for a specific "realm", if credentials were annotated with such information. SET This is the companion to 'get', and can be used to store properties and secret of a credential. Importantly, and in contrast to a 'get' operation, given properties with no values indicate a removal request. Any matching properties on record will be removed. If a credential is to be stored for which no secret is on record yet, an interactive session will prompt a user for a manual secret entry. Only changed properties will be contained in the result record. The appearance of the interactive secret entry can be configured with the two settings `datalad.credentials.repeat-secret-entry` and `datalad.credentials.hidden-secret-entry`. REMOVE This action will remove any secret and properties associated with a credential identified by its name. GET (plumbing operation) This is a *read-only* action that will never store (updates of) credential properties or secrets. Given properties will amend/overwrite those already on record. When properties with no value are given, and also no value for the respective properties is on record yet, their value will be requested interactively, if a ``prompt||--prompt`` text was provided too. This can be used to ensure a complete credential record, comprising any number of properties. 
Details on credentials A credential comprises any number of properties, plus exactly one secret. There are no constraints on the format or property values or the secret, as long as they are encoded as a string. Credential properties are normally stored as configuration settings in a user's configuration ('global' scope) using the naming scheme: `datalad.credential..` Therefore both credential name and credential property name must be syntax-compliant with Git configuration items. For property names this means only alphanumeric characters and dashes. For credential names virtually no naming restrictions exist (only null-byte and newline are forbidden). However, when naming credentials it is recommended to use simple names in order to enable convenient one-off credential overrides by specifying DataLad configuration items via their environment variable counterparts (see the documentation of the ``configuration`` command for details. In short, avoid underscores and special characters other than '.' and '-'. While there are no constraints on the number and nature of credential properties, a few particular properties are recognized on used for particular purposes: - 'secret': always refers to the single secret of a credential - 'type': identifies the type of a credential. With each standard type, a list of mandatory properties is associated (see below) - 'last-used': is an ISO 8601 format time stamp that indicated the last (successful) usage of a credential Standard credential types and properties The following standard credential types are recognized, and their mandatory field with their standard names will be automatically included in a 'get' report. - 'user_password': with properties 'user', and the password as secret - 'token': only comprising the token as secret - 'aws-s3': with properties 'key-id', 'session', 'expiration', and the secret_id as the credential secret Legacy support DataLad credentials not configured via this command may not be fully discoverable (i.e., including all their properties). Discovery of such legacy credentials can be assisted by specifying a dedicated 'type' property. """ result_renderer = 'tailored' _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify a dataset whose configuration to inspect rather than the global (user) settings"""), action=Parameter( args=("action",), nargs='?', doc="""which action to perform"""), name=Parameter( # exclude from CLI args=tuple(), doc="""name of a credential to set, get, or remove."""), spec=Parameter( args=("spec",), doc="""specification of[CMD: a credential name and CMD] credential properties. Properties are[CMD: either CMD] given as name/value pairs[CMD: or as a property name prefixed by a colon CMD]. Properties [CMD: prefixed with a colon CMD][PY: with a `None` value PY] indicate a property to be deleted (action 'set'), or a property to be entered interactively, when no value is set yet, and a prompt text is given (action 'get'). All property names are case-insensitive, must start with a letter or a digit, and may only contain '-' apart from these characters. [PY: Property specifications should be given a as dictionary, e.g., spec={'type': 'user_password'}. However, a CLI-like list of string arguments is also supported, e.g., spec=['type=user_password'] PY]""", nargs='*', metavar='[name] [:]property[=value]'), prompt=Parameter( args=("--prompt",), doc="""message to display when entry of missing credential properties is required for action 'get'. 
This can be used to present information on the nature of a credential and for instructions on how to obtain a credential"""), ) _examples_ = [ dict(text="Report all discoverable credentials", code_py="credentials()", code_cmd="datalad credentials"), dict( text="Set a new credential mycred & input its secret interactively", code_py="credentials('set', name='mycred')", code_cmd="datalad credentials set mycred"), dict(text="Remove a credential's type property", code_py="credentials('set', name='mycred', spec={'type': None})", code_cmd="datalad credentials set mycred :type"), dict(text="Get all information on a specific credential in a structured record", code_py="credentials('get', name='mycred')", code_cmd="datalad -f json credentials get mycred"), dict(text="Upgrade a legacy credential by annotating it with a 'type' property", code_py="credentials('set', name='legacycred', spec={'type': 'user_password')", code_cmd="datalad credentials set legacycred type=user_password"), dict(text="Set a new credential of type user_password, with a given user property, " "and input its secret interactively", code_py="credentials('set', name='mycred', spec={'type': 'user_password', 'user': ''})", code_cmd="datalad credentials set mycred type=user_password user="), dict(text="Obtain a (possibly yet undefined) credential with a minimum set of " "properties. All missing properties and secret will be " "prompted for, no information will be stored! " "This is mostly useful for ensuring availability of an " "appropriate credential in an application context", code_py="credentials('get', prompt='Can I haz info plz?', name='newcred', spec={'newproperty': None})", code_cmd="datalad credentials --prompt 'can I haz info plz?' get newcred :newproperty"), ] _validator_ = CredentialsParamValidator() @staticmethod @datasetmethod(name='credentials') @eval_results def __call__(action='query', spec=None, *, name=None, prompt=None, dataset=None): # which config manager to use: global or from dataset # It makes no sense to work with a non-existing dataset's config, # due to https://github.com/datalad/datalad/issues/7299 # so the `dataset` validator must not run for the default value # ``None`` cfg = dataset.ds.config if dataset else dlcfg credman = CredentialManager(cfg) if action == 'set': try: updated = credman.set(name, **spec) except Exception as e: yield get_status_dict( action='credentials', status='error', name=name, message='could not set credential properties', exception=CapturedException(e), ) return # pull name out of report, if entered manually if not name and updated is not None: name = updated.pop('name', None) yield get_status_dict( action='credentials', status='notneeded' if updated is None else 'ok', name=name, **_prefix_result_keys(updated if updated else spec), ) elif action == 'get': cred = credman.get(name=name, _prompt=prompt, **spec) if not cred: yield get_status_dict( action='credentials', status='error', name=name, message='credential not found', ) else: yield get_status_dict( action='credentials', status='ok', name=name, **_prefix_result_keys(cred), ) elif action == 'remove': try: removed = credman.remove(name, type_hint=spec.get('type')) except Exception as e: yield get_status_dict( action='credentials', status='error', name=name, message='could not remove credential properties', exception=CapturedException(e), ) return yield get_status_dict( action='credentials', status='ok' if removed else 'notneeded', name=name, ) elif action == 'query': for name, cred in credman.query_(**spec): yield 
get_status_dict( action='credentials', status='ok', name=name, type='credential', **_prefix_result_keys(cred), ) else: raise RuntimeError('Impossible state reached') # pragma: no cover @staticmethod def custom_result_renderer(res, **kwargs): # we only handle our own stuff in a custom fashion, the rest is generic if res['action'] != 'credentials': generic_result_renderer(res) return # must make a copy, because we modify the record in-place # https://github.com/datalad/datalad/issues/6560 res = res.copy() # the idea here is to twist the result records such that the generic # renderer can be used if 'name' in res: res['action'] = res['name'] res['status'] = '{} {}'.format( res.get('cred_type', 'secret'), '✓' if res.get('cred_secret') else '✗', ) if res.pop('from_backend', None) == 'legacy': res['type'] = 'legacy-credential' if 'message' not in res: # give the names of all properties # but avoid duplicating the type, hide the prefix, # add removal marker for vanished properties res['message'] = ','.join( '{}{}{}{}'.format( f':{p[5:]}' if res[p] is None else p[5:], '' if res[p] is None else '=', '' if res[p] is None else res[p][:25], '…' if res[p] and len(res[p]) > 25 else '', ) for p in sorted(res) if p.startswith('cred_') and p not in ('cred_type', 'cred_secret')) generic_result_renderer(res) def normalize_specs(specs): """Normalize all supported `spec` argument values for `credentials` Parameter --------- specs: JSON-formatted str or list Returns ------- dict Keys are the names of any property (with removal markers stripped), and values are `None` whenever property removal is desired, and not `None` for any value to be stored. Raises ------ ValueError For missing values, missing removal markers, and invalid JSON input """ if not specs: return {} elif isinstance(specs, str): try: specs = json.loads(specs) except json.JSONDecodeError as e: raise ValueError('Invalid JSON input') from e if isinstance(specs, list): # convert property assignment list specs = [ (str(s[0]), str(s[1])) if isinstance(s, tuple) else (str(s),) if '=' not in s else (tuple(s.split('=', 1))) for s in specs ] if isinstance(specs, list): missing = [ i for i in specs if (len(i) == 1 and i[0][0] != ":") or ( len(i) > 1 and (i[0][0] == ':' and i[1] is not None)) ] else: missing = [ k for k, v in specs.items() if k[0] == ":" and v is not None ] if missing: raise ValueError( f'Value or unset flag ":" missing for property {missing!r}') if isinstance(specs, list): # expand absent values in tuples to ease conversion to dict below specs = [(i[0], i[1] if len(i) > 1 else None) for i in specs] # apply "unset marker" specs = { # this stuff goes to git-config, is therefore case-insensitive # and we should normalize right away (k[1:] if k[0] == ':' else k).lower(): None if k[0] == ':' else v for k, v in (specs.items() if isinstance(specs, dict) else specs) } verify_property_names(specs) return specs def _prefix_result_keys(props): return { f'cred_{k}' if not k.startswith('_') else k[1:]: v for k, v in props.items() } datalad-next-1.4.1/datalad_next/commands/download.py000066400000000000000000000334011462321624600225040ustar00rootroot00000000000000""" """ __docformat__ = 'restructuredtext' from logging import getLogger from pathlib import ( Path, PurePosixPath, ) from urllib.parse import urlparse import datalad from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, Parameter, build_doc, datasetmethod, eval_results, get_status_dict, ) from datalad_next.exceptions import ( CapturedException, 
UrlOperationsRemoteError, ) from datalad_next.utils import ensure_list from datalad_next.constraints import ( AnyOf, EnsureChoice, EnsureDataset, EnsureGeneratorFromFileLike, EnsureJSON, EnsureListOf, EnsureMapping, EnsurePath, EnsureURL, EnsureValue, WithDescription, ) from datalad_next.url_operations import AnyUrlOperations lgr = getLogger('datalad.local.download') class EnsureURLFilenamePairFromURL(EnsureURL): """Accept a URL and derive filename from it path component Return original URL and filename as a mapping """ def __init__(self): super().__init__(required=['scheme', 'path']) def __call__(self, value): url = super().__call__(value) # derive a filename from the URL. # we take the last element of the 'path' component # of the URL, or fail parsed = urlparse(url) filename = PurePosixPath(parsed.path).name return {url: filename} @build_doc class Download(ValidatedInterface): """Download from URLs This command is the front-end to an extensible framework for performing downloads from a variety of URL schemes. Built-in support for the schemes 'http', 'https', 'file', and 'ssh' is provided. Extension packages may add additional support. In contrast to other downloader tools, this command integrates with the DataLad credential management and is able to auto-discover credentials. If no credential is available, it automatically prompts for them, and offers to store them for reuse after a successful authentication. Simultaneous hashing (checksumming) of downloaded content is supported with user-specified algorithms. The command can process any number of downloads (serially). it can read download specifications from (command line) arguments, files, or STDIN. It can deposit downloads to individual files, or stream to STDOUT. Implementation and extensibility Each URL scheme is processed by a dedicated handler. Additional schemes can be supported by sub-classing :class:`datalad_next.url_operations.UrlOperations` and implementing the `download()` method. Extension packages can register new handlers, by patching them into the `datalad_next.download._urlscheme_handlers` registry dict. """ # # argument format specifications # # any URL that we would take must have a scheme, because we switch # protocol handling based on that. It is also crucial for distinguishing # stuff like local paths and file names from URLs url_constraint = EnsureURL(required=['scheme']) # other than a plain URL we take a mapping from a URL to a local path. # The special value '-' is used to indicate stdout # if given as a single string, we support single-space-delimited items: # " " url2path_constraint = WithDescription( EnsureMapping( key=url_constraint, value=EnsureValue('-') | EnsurePath(), delimiter=' ', # we disallow length-2 sequences to be able to distinguish from # a length-2 list of URLs. # the key issue is the flexibility of EnsurePath -- pretty much # anything could be a valid unix path allow_length2_sequence=False, ), error_message=f'not a dict, length-2-iterable, or space-delimited str', ) # each specification items is either a mapping url->path, just a url, or a # JSON-encoded url->path mapping. the order is complex-to-simple for the # first two (to be able to distinguish a mapping from an encoded URL. 
The # JSON-encoding is tried last, it want match accidentally) urlonly_item_constraint = WithDescription( EnsureURLFilenamePairFromURL() & url2path_constraint, error_message='not a URL with a path component ' 'from which a filename can be derived', ) json_item_constraint = WithDescription( EnsureJSON() & url2path_constraint, error_message='not a JSON-encoded str with an object or length-2-array', ) any_item_constraint = WithDescription( AnyOf( # TODO explain url2path_constraint, urlonly_item_constraint, json_item_constraint, ), error_message='not a single item\n{__itemized_causes__}', ) # we support reading specification items (matching any format defined # above) as # - a single item # - as a list of items # - a list given in a file, or via stdin (or any file-like in Python) spec_constraint = WithDescription( AnyOf( any_item_constraint, WithDescription( EnsureListOf(any_item_constraint), error_message='not a list of any such item', ), WithDescription( EnsureGeneratorFromFileLike( any_item_constraint, exc_mode='yield', ), error_message="not a path to a file with one such item per-line, " "nor '-' to read any such item from STDIN", ), ), error_message="does not provide URL->(PATH|-) mapping(s)\n{__itemized_causes__}" ) force_choices = EnsureChoice('overwrite-existing') # Interface.validate_args() will inspect this dict for the presence of a # validator for particular parameters _validator_ = EnsureCommandParameterization(dict( spec=spec_constraint, # if given, it must also exist as a source for configuration items # and/or credentials dataset=EnsureDataset(installed=True), force=force_choices | EnsureListOf(force_choices), # TODO EnsureCredential #credential= # TODO EnsureHashAlgorithm #hash=EnsureHashAlgorithm | EnsureIterableOf(EnsureHashAlgorithm) )) # this is largely here for documentation and CLI parser building _params_ = dict( spec=Parameter( args=('spec',), metavar='||', doc="""Download sources and targets can be given in a variety of formats: as a URL, or as a URL-path-pair that is mapping a source URL to a dedicated download target path. Any number of URLs or URL-path-pairs can be provided, either as an argument list, or read from a file (one item per line). Such a specification input file can be given as a path to an existing file (as a single value, not as part of a URL-path-pair). When the special path identifier '-' is used, the download is written to STDOUT. A specification can also be read in JSON-lines encoding (each line being a string with a URL or an object mapping a URL-string to a path-string). [PY: In addition, specifications can also be given as a list or URLs, or as a list of dicts with a URL to path mapping. Paths are supported in string form, or as `Path` objects. PY]""", nargs='+'), dataset=Parameter( args=("-d", "--dataset"), doc="""Dataset to be used as a configuration source. Beyond reading configuration items, this command does not interact with the dataset."""), force=Parameter( args=("--force",), action='append', # TODO only here because datalad-core CLI generates docs from this choices=force_choices._allowed, doc="""By default, a target path for a download must not exist yet. 'force-overwrite' disabled this check."""), credential=Parameter( args=("--credential",), metavar='NAME', doc="""name of a credential to be used for authorization. If no credential is identified, the last-used credential for the authentication realm associated with the download target will be used. If there is no credential available yet, it will be prompted for. 
Once used successfully, a prompt for entering to save such a new credential will be presented.""", ), hash=Parameter( args=("--hash",), action='append', metavar='ALGORITHM', doc="""Name of a hashing algorithm supported by the Python 'hashlib' module, e.g. 'md5' or 'sha256'. [CMD: This option can be given more than once CMD] """), ) _examples_ = [ {'text': 'Download webpage to "myfile.txt"', 'code_cmd': 'datalad download "http://example.com myfile.txt"', 'code_py': 'download({"http://example.com": "myfile.txt"})'}, {'text': 'Read download specification from STDIN (e.g. JSON-lines)', 'code_cmd': 'datalad download -', 'code_py': 'download("-")'}, {'text': 'Simultaneously hash download, hexdigest reported in result record', 'code_cmd': 'datalad download --hash sha256 http://example.com/data.xml"', 'code_py': 'download("http://example.com/data.xml", hash=["sha256"])'}, {'text': 'Download from SSH server', 'code_cmd': 'datalad download "ssh://example.com/home/user/data.xml"', 'code_py': 'download("ssh://example.com/home/user/data.xml")'}, {'text': 'Stream a download to STDOUT', 'code_cmd': 'datalad -f disabled download "http://example.com -"'}, ] @staticmethod @datasetmethod(name="download") @eval_results def __call__(spec, *, dataset=None, force=None, credential=None, hash=None): # which config to inspect for credentials etc cfg = dataset.ds.config if dataset else datalad.cfg if isinstance(spec, (str, dict)): # input validation allows for a non-list item, turn into # list for uniform processing spec = [spec] # cache of already used handlers url_handler = AnyUrlOperations(cfg=cfg) # we are not running any tests upfront on the whole spec, # because the spec can be a generator and consume from a # long-running source (e.g. via stdin) for item in spec: if isinstance(item, CapturedException): # the generator encountered an exception for a particular # item and is relaying it as per instructions # exc_mode='yield'. We report and move on. Outside # flow logic will decide if processing continues yield get_status_dict( action='download', status='impossible', exception=item, ) continue url, dest = item.popitem() # we know that any URL has a scheme if not url_handler.is_supported_url(url): yield get_status_dict( action='download', status='error', message='unsupported URL ' '(custom URL handlers can be declared via DataLad ' 'configuration)', url=url, ) continue # ready destination path try: dest = _prep_dest_path(dest, force) except ValueError as e: yield get_status_dict( action='download', status='error', exception=CapturedException(e), url=url, path=dest, ) continue try: download_props = url_handler.download( url, dest, credential=credential, hash=ensure_list(hash), ) res = get_status_dict( action='download', status='ok', url=url, path=dest, ) # take the reported download properties (e.g. any computed # hashes a a starting point, and overwrite any potentially # conflicting keys with the standard ones) res = dict( download_props, **res) yield res except Exception as e: ce = CapturedException(e) res = get_status_dict( action='download', status='error', message='download failure', url=url, path=dest, exception=ce, ) if issubclass(type(e), UrlOperationsRemoteError): res['status_code'] = e.status_code yield res def _prep_dest_path(dest, force): if dest == '-': # nothing to prep for stdout return # make sure we can replace any existing target path later # on. 
but do not remove here, we might not actually be # able to download for other reasons if _lexists(dest) and ( not force or 'overwrite-existing' not in force): raise ValueError('target path already exists') # create parent directory if needed dest.parent.mkdir(parents=True, exist_ok=True) return dest def _lexists(path: Path): try: path.lstat() return True except FileNotFoundError: return False datalad-next-1.4.1/datalad_next/commands/ls_file_collection.py000066400000000000000000000421301462321624600245240ustar00rootroot00000000000000""" """ from __future__ import annotations __docformat__ = 'restructuredtext' from dataclasses import ( asdict, dataclass, ) from datetime import datetime from humanize import ( naturalsize, naturaldate, naturaltime, ) from logging import getLogger from pathlib import Path from stat import filemode from typing import ( Any, Callable, Dict, Iterator, List, ) from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, Parameter, ParameterConstraintContext, build_doc, eval_results, get_status_dict, ) from datalad_next.constraints import ( EnsureChoice, EnsurePath, EnsureURL, EnsureHashAlgorithm, EnsureListOf, ) from datalad_next.uis import ( ansi_colors as ac, ui_switcher as ui, ) from datalad_next.utils import ensure_list from datalad_next.iter_collections import ( FileSystemItemType, GitTreeItemType, GitWorktreeFileSystemItem, compute_multihash_from_fp, iter_annexworktree, iter_dir, iter_gittree, iter_gitworktree, iter_tar, iter_zip, ) lgr = getLogger('datalad.local.ls_file_collection') # hand-maintain a list of collection type names that should be # advertised and supported. it makes little sense to auto-discover # them, because each collection type likely needs some custom glue # code, and some iterators may not even be about *file* collections _supported_collection_types = ( 'directory', 'tarfile', 'zipfile', 'gittree', 'gitworktree', 'annexworktree', ) @dataclass # sadly PY3.10+ only (kw_only=True) class CollectionSpec: """Internal type for passing a collection specification to ``ls_file_collection``. it is created by the command validator transparently. 
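    For illustration only (hypothetical values, assuming a ``directory``
    collection rooted at ``/tmp`` and no requested hashes), the validator
    would produce roughly::

        CollectionSpec(
            orig_id=Path('/tmp'),
            iter=iter_dir(path=Path('/tmp'), fp=False),
            item2res=fsitem_to_dict,
        )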
""" orig_id: Any iter: Iterator item2res: Callable class LsFileCollectionParamValidator(EnsureCommandParameterization): """Parameter validator for the ``ls_file_collection`` command""" _collection_types = EnsureChoice(*_supported_collection_types) def __init__(self): super().__init__( param_constraints=dict( type=self._collection_types, collection=EnsurePath(lexists=True) | EnsureURL(), hash=EnsureHashAlgorithm() | EnsureListOf(EnsureHashAlgorithm()), ), joint_constraints={ ParameterConstraintContext(('type', 'collection', 'hash'), 'collection iterator'): self.get_collection_iter, }, ) def get_collection_iter(self, **kwargs): type = kwargs['type'] collection = kwargs['collection'] hash = kwargs['hash'] iter_fx = None iter_kwargs = None if type in ('directory', 'tarfile', 'zipfile', 'gitworktree', 'annexworktree'): if not isinstance(collection, Path): self.raise_for( kwargs, "{type} collection requires a Path-type identifier", type=type, ) iter_kwargs = dict( path=collection, fp=hash is not None, ) item2res = fsitem_to_dict if type == 'directory': iter_fx = iter_dir item2res = fsitem_to_dict elif type == 'tarfile': iter_fx = iter_tar item2res = fsitem_to_dict elif type == 'zipfile': iter_fx = iter_zip item2res = fsitem_to_dict elif type == 'gittree': if hash is not None: self.raise_for( kwargs, "gittree collection does not support " "content hash reporting", ) iter_fx = iter_gittree item2res = gittreeitem_to_dict iter_kwargs = dict( path=Path('.'), treeish=collection, ) elif type == 'gitworktree': iter_fx = iter_gitworktree item2res = gitworktreeitem_to_dict elif type == 'annexworktree': iter_fx = iter_annexworktree item2res = annexworktreeitem_to_dict else: raise RuntimeError( 'unhandled collection-type: this is a defect, please report.') assert iter_fx is not None return dict( collection=CollectionSpec( orig_id=collection, iter=iter_fx(**iter_kwargs), item2res=item2res), ) def fsitem_to_dict(item, hash) -> Dict: keymap = {'name': 'item'} # FileSystemItemType is too fine-grained to be used as result type # directly, map some cases! fsitem_type_to_res_type = { 'specialfile': 'file', } # file-objects need special handling (cannot be pickled for asdict()) fp = item.fp item.fp = None # TODO likely could be faster by moving the conditional out of the # dict-comprehension and handling them separately upfront/after d = { keymap.get(k, k): # explicit str value access, until we can use `StrEnum` v if k != 'type' else fsitem_type_to_res_type.get(v.value, v.value) for k, v in asdict(item).items() # strip pointless symlink target reports for anything but symlinks if item.type is FileSystemItemType.symlink or k != 'link_target' } if fp: for hname, hdigest in compute_multihash_from_fp(fp, hash).items(): d[f'hash-{hname}'] = hdigest # we also provide the file pointer to the consumer, although # it may have been "exhausted" by the hashing above and would # need a seek(0) for any further processing. # however, we do not do this here, because it is generic code, # and we do not know whether a particular file-like even supports # seek() under all circumstances. we simply document the fact. 
d['fp'] = fp return d def gittreeitem_to_dict(item, hash) -> Dict: gittreeitem_type_to_res_type = { # permission bits are not distinguished for types GitTreeItemType.executablefile: 'file', # 'dataset' is the commonly used label as the command API # level GitTreeItemType.submodule: 'dataset', } gittype = gittreeitem_type_to_res_type.get( item.gittype, item.gittype.value) if item.gittype else None d = dict(item=item.name) if gittype is not None: d['type'] = gittype if item.gitsha: d['gitsha'] = item.gitsha if gittype is not None: d['gittype'] = gittype return d def gitworktreeitem_to_dict(item, hash) -> Dict: gitworktreeitem_type_to_res_type = { # permission bits are not distinguished for types GitTreeItemType.executablefile: 'file', # 'dataset' is the commonly used label as the command API # level GitTreeItemType.submodule: 'dataset', } gittype = gitworktreeitem_type_to_res_type.get( item.gittype, item.gittype.value) if item.gittype else None if isinstance(item, GitWorktreeFileSystemItem): d = fsitem_to_dict(item, hash) else: d = dict(item=item.name) if gittype is not None: d['type'] = gittype if item.gitsha: d['gitsha'] = item.gitsha if gittype is not None: d['gittype'] = gittype return d def annexworktreeitem_to_dict(item, hash) -> Dict: d = gitworktreeitem_to_dict(item, hash) if item.annexkey: d['type'] = 'annexed file' d['annexkey'] = item.annexkey d['annexsize'] = item.annexsize d['annexobjpath'] = item.annexobjpath return d @build_doc class LsFileCollection(ValidatedInterface): """Report information on files in a collection This is a utility that can be used to query information on files in different file collections. The type of information reported varies across collection types. However, each result at minimum contains some kind of identifier for the collection ('collection' property), and an identifier for the respective collection item ('item' property). Each result also contains a ``type`` property that indicates particular type of file that is being reported on. In most cases this will be ``file``, but other categories like ``symlink`` or ``directory`` are recognized too. If a collection type provides file-access, this command can compute one or more hashes (checksums) for any file in a collection. Supported file collection types are: ``directory`` Reports on the content of a given directory (non-recursively). The collection identifier is the path of the directory. Item identifiers are the names of items within that directory. Standard properties like ``size``, ``mtime``, or ``link_target`` are included in the report. [PY: When hashes are computed, an ``fp`` property with a file-like is provided. Reading file data from it requires a ``seek(0)`` in most cases. This file handle is only open when items are yielded directly by this command (``return_type='generator``) and only until the next result is yielded. PY] ``gittree`` Reports on the content of a Git "tree-ish". The collection identifier is that tree-ish. The command must be executed inside a Git repository. If the working directory for the command is not the repository root (in case of a non-bare repository), the report is constrained to items underneath the working directory. Item identifiers are the relative paths of items within that working directory. Reported properties include ``gitsha`` and ``gittype``; note that the ``gitsha`` is not equivalent to a SHA1 hash of a file's content, but is the SHA-type blob identifier as reported and used by Git. 
Reporting of content hashes beyond the ``gitsha`` is presently not supported. ``gitworktree`` Reports on all tracked and untracked content of a Git repository's work tree. The collection identifier is a path of a directory in a Git repository (which can, but needs not be, its root). Item identifiers are the relative paths of items within that directory. Reported properties include ``gitsha`` and ``gittype``; note that the ``gitsha`` is not equivalent to a SHA1 hash of a file's content, but is the SHA-type blob identifier as reported and used by Git. [PY: When hashes are computed, an ``fp`` property with a file-like is provided. Reading file data from it requires a ``seek(0)`` in most cases. This file handle is only open when items are yielded directly by this command (``return_type='generator``) and only until the next result is yielded. PY] ``annexworktree`` Like ``gitworktree``, but amends the reported items with git-annex information, such as ``annexkey``, ``annexsize``, and ``annnexobjpath``. ``tarfile`` Reports on members of a TAR archive. The collection identifier is the path of the TAR file. Item identifiers are the relative paths of archive members within the archive. Reported properties are similar to the ``directory`` collection type. [PY: When hashes are computed, an ``fp`` property with a file-like is provided. Reading file data from it requires a ``seek(0)`` in most cases. This file handle is only open when items are yielded directly by this command (``return_type='generator``) and only until the next result is yielded. PY] ``zipfile`` Like ``tarfile`` for reporting on ZIP archives. """ _validator_ = LsFileCollectionParamValidator() # this is largely here for documentation and CLI parser building _params_ = dict( type=Parameter( args=("type",), choices=_supported_collection_types, doc="""Name of the type of file collection to report on"""), collection=Parameter( args=('collection',), metavar='ID/LOCATION', doc="""identifier or location of the file collection to report on. Depending on the type of collection to process, the specific nature of this parameter can be different. A common identifier for a file collection is a path (to a directory, to an archive), but might also be a URL. See the documentation for details on supported collection types."""), hash=Parameter( args=("--hash",), action='append', metavar='ALGORITHM', doc="""One or more names of algorithms to be used for reporting file hashes. They must be supported by the Python 'hashlib' module, e.g. 'md5' or 'sha256'. Reporting file hashes typically implies retrieving/reading file content. This processing may also enable reporting of additional properties that may otherwise not be readily available. [CMD: This option can be given more than once CMD] """), ) _examples_: List = [ {'text': 'Report on the content of a directory', 'code_cmd': 'datalad -f json ls-file-collection directory /tmp', 'code_py': 'records = ls_file_collection("directory", "/tmp")'}, {'text': 'Report on the content of a TAR archive with ' 'MD5 and SHA1 file hashes', 'code_cmd': 'datalad -f json ls-file-collection' ' --hash md5 --hash sha1 tarfile myarchive.tar.gz', 'code_py': 'records = ls_file_collection("tarfile",' ' "myarchive.tar.gz", hash=["md5", "sha1"])'}, {'text': "Register URLs for files in a directory that is" " also reachable via HTTP. 
This uses ``ls-file-collection``" " for listing files and computing MD5 hashes," " then using ``jq`` to filter and transform the output" " (just file records, and in a JSON array)," " and passes them to ``addurls``, which generates" " annex keys/files and assigns URLs." " When the command finishes, the dataset contains no" " data, but can retrieve the files after confirming" " their availability (i.e., via `git annex fsck`)", 'code_cmd': 'datalad -f json ls-file-collection directory wwwdir --hash md5 \\\n' ' | jq \'. | select(.type == "file")\' \\\n' ' | jq --slurp . \\\n' " | datalad addurls --key 'et:MD5-s{size}--{hash-md5}' - 'https://example.com/{item}'"}, {'text': 'List annex keys of all files in the working tree of a dataset', 'code_py': "[r['annexkey'] \\\n" "for r in ls_file_collection('annexworktree', '.') \\\n" "if 'annexkey' in r]", 'code_cmd': "datalad -f json ls-file-collection annexworktree . \\\n" "| jq '. | select(.annexkey) | .annexkey'", }, ] @staticmethod @eval_results def __call__( type: str, collection: CollectionSpec, *, hash: str | List[str] | None = None, ): for item in collection.iter: res = collection.item2res( item, hash=ensure_list(hash), ) res.update(get_status_dict( action='ls_file_collection', status='ok', collection=collection.orig_id, )) yield res @staticmethod def custom_result_renderer(res, **kwargs): # given the to-be-expected diversity, this renderer only # outputs identifiers and type info. In almost any real use case # either no rendering or JSON rendering will be needed type = res.get('type', None) # if there is no mode, produces '?---------' # .. or 0 is needed, because some iterators report an explicit # `None` mode mode = filemode(res.get('mode', 0) or 0) size = None if type in ('file', 'hardlink'): size = res.get('size', None) size = '-' if size is None else naturalsize(size, gnu=True) mtime = res.get('mtime', '') if mtime: dt = datetime.fromtimestamp(mtime) hts = naturaldate(dt) if hts == 'today': hts = naturaltime(dt) hts = hts.replace( 'minutes ago', 'min ago').replace( 'seconds ago', 'sec ago') # stick with numerical IDs (although less accessible), we cannot # know in general whether this particular system can map numerical # IDs to valid target names (think stored name in tarballs) owner_info = f'{res["uid"]}:{res["gid"]}' if res.get('uid') else '' ui.message('{mode} {size: >6} {owner: >9} {hts: >11} {item} ({type})'.format( mode=mode, size=size, owner=owner_info, hts=hts if mtime else '', item=ac.color_word( res.get('item', ''), ac.BOLD), type=ac.color_word( res.get('type', ''), ac.MAGENTA), )) datalad-next-1.4.1/datalad_next/commands/results.py000066400000000000000000000105121462321624600223740ustar00rootroot00000000000000from __future__ import annotations from dataclasses import dataclass from enum import Enum import logging from pathlib import ( Path, PurePath, ) from datalad_next.datasets import Dataset from datalad_next.exceptions import CapturedException # TODO Could be `StrEnum`, came with PY3.11 class CommandResultStatus(Enum): """Enumeration of possible statuses of command results """ ok = 'ok' notneeded = 'notneeded' impossible = 'impossible' error = 'error' # which status is a success , which is failure success_status_map = { 'ok': 'success', 'notneeded': 'success', 'impossible': 'failure', 'error': 'failure', } # We really want this to be `kw_only=True`, but cannot, because it only # came with PY3.10 # Until this can be enabled, we cannot have additional _required_ properties # coming from derived classes. 
Instead, we have to make any and all # additional properties optional (with default None), because also in this # base class we do define optional ones (and it makes no sense not to do # that either). #@dataclass(kw_only=True) @dataclass class CommandResult: """Base data class for result records emitted by DataLad commands. Historically, such results records have taken the form of a Python ``dict``. This class provides some API for its instances to be compatible with legacy code that expects a ``dict``. .. seealso:: https://docs.datalad.org/design/result_records.html """ # TODO implement post_init and possibly check for validated of # some arguments (e.g. status is a valid value). Maybe do all of that # conditional on some config flag that could be set during test # execution # mandatory as per # http://docs.datalad.org/design/result_records.html#mandatory-fields action: str """A string label identifying which type of operation a result is associated with. Labels must not contain white space. They should be compact, and lower-cases, and use ``_`` (underscore) to separate words in compound labels. """ status: CommandResultStatus """This field indicates the nature of a result in terms of four categories, identified by a :class:`CommandResultStatus` value. The result status is used by user communication, but also for decision making on the overall success or failure of a command operation. """ path: str | Path """An *absolute* path describing the local entity a result is associated with (the subject of the result record). Paths must be platform-specific (e.g., Windows paths on Windows, and POSIX paths on other operating systems). When a result is about an entity that has no meaningful relation to the local file system (e.g., a URL to be downloaded), the ``path`` value should be determined with respect to the potential impact of the result on any local entity (e.g., a URL downloaded to a local file path, a local dataset modified based on remote information). """ # optional # TODO complete documentation of all members message: str | tuple | None = None exception: CapturedException | None = None error_message: str | tuple | None = None type: str | None = None logger: logging.Logger | None = None refds: str | Path | Dataset = None # any and all of the code below makes it possible to feed such result # instances through the datalad-core result processing loop (which # expects results to be dicts with string keys and (most) values to # be string only also. 
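    # a minimal usage sketch (hypothetical values) of this compatibility
    # layer; legacy consumers can keep treating a result like a dict:
    #
    #     res = CommandResult(
    #         action='download',
    #         status=CommandResultStatus.ok,
    #         path='/tmp/file.txt',
    #     )
    #     'path' in res        # -> True
    #     res['status']        # -> 'ok' (Enum values are stringified)
    #     dict(res.items())    # -> plain dict with legacy-compatible values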
def __contains__(self, key: str) -> bool: return hasattr(self, key) def __getitem__(self, key: str): return self._stringify4legacy(getattr(self, key)) def get(self, key, default=None): return self._stringify4legacy(getattr(self, key, default)) def pop(self, key, default=None): item = getattr(self, key, default) if hasattr(self, key): setattr(self, key, None) return self._stringify4legacy(item) def items(self): for k, v in self.__dict__.items(): yield k, self._stringify4legacy(v) def _stringify4legacy(self, val): if isinstance(val, PurePath): return str(val) elif isinstance(val, Dataset): return val.path elif issubclass(getattr(val, '__class__', None), Enum): return val.value return val datalad-next-1.4.1/datalad_next/commands/status.py000066400000000000000000000344111462321624600222220ustar00rootroot00000000000000""" """ from __future__ import annotations __docformat__ = 'restructuredtext' from dataclasses import dataclass from enum import Enum from logging import getLogger from pathlib import Path from typing import Generator from datalad_next.commands import ( CommandResult, CommandResultStatus, EnsureCommandParameterization, ValidatedInterface, Parameter, ParameterConstraintContext, build_doc, datasetmethod, eval_results, ) from datalad_next.constraints import ( EnsureChoice, EnsureDataset, WithDescription, ) from datalad_next.iter_collections import ( GitDiffStatus, GitTreeItemType, GitContainerModificationType, iter_gitstatus, ) from datalad_next.uis import ( ui_switcher as ui, ansi_colors as ac, ) lgr = getLogger('datalad.core.local.status') # TODO Could be `StrEnum`, came with PY3.11 class StatusState(Enum): """Enumeration of possible states of a status command result The "state" is the condition of the dataset item being reported on. """ clean = 'clean' added = 'added' modified = 'modified' deleted = 'deleted' untracked = 'untracked' unknown = 'unknown' diffstatus2resultstate_map = { GitDiffStatus.addition: StatusState.added, GitDiffStatus.copy: StatusState.added, GitDiffStatus.deletion: StatusState.deleted, GitDiffStatus.modification: StatusState.modified, GitDiffStatus.rename: StatusState.added, GitDiffStatus.typechange: StatusState.modified, GitDiffStatus.unmerged: StatusState.unknown, GitDiffStatus.unknown: StatusState.unknown, GitDiffStatus.other: StatusState.untracked, } # see base class decorator comment for why this is commented out #@dataclass(kw_only=True) @dataclass class StatusResult(CommandResult): # TODO any of the following property are not actually optional # we only have to declare them such for limitations of dataclasses # prior PY3.10 (see kw_only command in base class diff_state: GitDiffStatus | None = None """The ``status`` of the underlying ``GitDiffItem``. It is named "_state" to emphasize the conceptual similarity with the legacy property 'state' """ gittype: GitTreeItemType | None = None """The ``gittype`` of the underlying ``GitDiffItem``.""" prev_gittype: GitTreeItemType | None = None """The ``prev_gittype`` of the underlying ``GitDiffItem``.""" modification_types: tuple[GitContainerModificationType] | None = None """Qualifiers for modification types of container-type items (directories, submodules).""" @property def state(self) -> StatusState: """A (more or less legacy) simplified representation of the subject state. For a more accurate classification use the ``diff_status`` property. 
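        For example (following ``diffstatus2resultstate_map`` above), an
        item with an underlying ``GitDiffStatus.addition`` is reported here
        as ``StatusState.added``, and untracked content
        (``GitDiffStatus.other``) as ``StatusState.untracked``.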
""" return diffstatus2resultstate_map[self.diff_state] # the previous status-implementation did not report plain git-types # we establish a getter to perform this kind of inference/mangling, # when it is needed @property def type(self) -> str | None: """ """ # TODO this is just a placeholder return self.gittype.value if self.gittype else None # we need a setter for this `type`-override stunt @type.setter def type(self, value): self.gittype = value @property def prev_type(self) -> str: """ """ return self.prev_gittype.value if self.prev_gittype else None @property def type_src(self) -> str | None: """Backward-compatibility adaptor""" return self.prev_type opt_untracked_values = ('no', 'whole-dir', 'no-empty-dir', 'normal', 'all') opt_recursive_values = ('no', 'repository', 'datasets', 'mono') opt_eval_subdataset_state_values = ('no', 'commit', 'full') class StatusParamValidator(EnsureCommandParameterization): def __init__(self): super().__init__( param_constraints=dict( # if given, it must also exist dataset=EnsureDataset(installed=True), untracked=EnsureChoice(*opt_untracked_values), recursive=EnsureChoice(*opt_recursive_values), eval_subdataset_state=EnsureChoice( *opt_eval_subdataset_state_values) ), validate_defaults=('dataset',), joint_constraints={ ParameterConstraintContext(('untracked', 'recursive'), 'option normalization'): self.normalize_options, }, ) def normalize_options(self, **kwargs): if kwargs['untracked'] == 'no': kwargs['untracked'] = None if kwargs['untracked'] == 'normal': kwargs['untracked'] = 'no-empty-dir' if kwargs['recursive'] == 'datasets': kwargs['recursive'] = 'submodules' if kwargs['recursive'] == 'mono': kwargs['recursive'] = 'monolithic' return kwargs @build_doc class Status(ValidatedInterface): """Report on the (modification) status of a dataset .. note:: This is a preview of an command implementation aiming to replace the DataLad ``status`` command. For now, expect anything here to change again. This command provides a report that is roughly identical to that of ``git status``. Running with default parameters yields a report that should look familiar to Git and DataLad users alike, and contain the same information as offered by ``git status``. The main difference to ``git status`` are: - Support for recursion into submodule. ``git status`` does that too, but the report is limited to the global state of an entire submodule, whereas this command can issue detailed reports in changes inside a submodule (any nesting depth). - Support for directory-constrained reporting. Much like ``git status`` limits its report to a single repository, this command can optionally limit its report to a single directory and its direct children. In this report subdirectories are considered containers (much like) submodules, and a change summary is provided for them. - Support for a "mono" (monolithic repository) report. Unlike a standard recursion into submodules, and checking each of them for changes with respect to the HEAD commit of the worktree, this report compares a submodule with respect to the state recorded in its parent repository. This provides an equally comprehensive status report from the point of view of a queried repository, but does not include a dedicated item on the global state of a submodule. This makes nested hierarchy of repositories appear like a single (mono) repository. - Support for "adjusted mode" git-annex repositories. These utilize a managed branch that is repeatedly rewritten, hence is not suitable for tracking within a parent repository. 
Instead, the underlying "corresponding branch" is used, which contains the equivalent content in an un-adjusted form, persistently. This command detects this condition and automatically check a repositories state against the corresponding branch state. *Presently missing/planned features* - There is no support for specifying paths (or pathspecs) for constraining the operation to specific dataset parts. This will be added in the future. - There is no reporting of git-annex properties, such as tracked file size. It is undetermined whether this will be added in the future. However, even without a dedicated switch, this command has support for datasets (and their submodules) in git-annex's "adjusted mode". *Differences to the ``status`` command implementation prior DataLad v2* - Like ``git status`` this implementation reports on dataset modification, whereas the previous ``status`` also provided a listing of unchanged dataset content. This is no longer done. Equivalent functionality for listing dataset content is provided by the ``ls_file_collection`` command. - The implementation is substantially faster. Depending on the context the speed-up is typically somewhere between 2x and 100x. - The implementation does not suffer from the limitation re type change detection. - Python and CLI API of the command use uniform parameter validation. """ # Interface.validate_args() will inspect this dict for the presence of a # validator for particular parameters _validator_ = StatusParamValidator() # this is largely here for documentation and CLI parser building _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""Dataset to be used as a configuration source. Beyond reading configuration items, this command does not interact with the dataset."""), untracked=Parameter( args=('--untracked',), choices=opt_untracked_values, doc="""Determine how untracked content is considered and reported when comparing a revision to the state of the working tree. 'no': no untracked content is considered as a change; 'normal': untracked files and entire untracked directories are reported as such; 'all': report individual files even in fully untracked directories. In addition to these git-status modes, 'whole-dir' (like normal, but include empty directories), and 'no-empty-dir' (alias for 'normal') are understood."""), recursive=Parameter( args=('-r', '--recursive'), nargs='?', const='datasets', choices=opt_recursive_values, doc="""Mode of recursion for status reporting. With 'no' the report is restricted to a single directory and its direct children. With 'repository', the report comprises all repository content underneath current working directory or root of a given dataset, but is limited to items directly contained in that repository. With 'datasets', the report also comprises any content in any subdatasets. Each subdataset is evaluated against its respective HEAD commit. With 'mono', a report similar to 'datasets' is generated, but any subdataset is evaluate with respect to the state recorded in its parent repository. In contrast to the 'datasets' mode, no report items on a joint submodule are generated. [CMD: If no particular value is given with this option the 'datasets' mode is selected. CMD] """), eval_subdataset_state=Parameter( args=("-e", "--eval-subdataset-state",), choices=opt_eval_subdataset_state_values, doc="""Evaluation of subdataset state (modified or untracked content) can be expensive for deep dataset hierarchies as subdataset have to be tested recursively for uncommitted modifications. 
Setting this option to 'no' or 'commit' can substantially boost performance by limiting what is being tested. With 'no' no state is evaluated and subdataset are not investigated for modifications. With 'commit' only a discrepancy of the HEAD commit gitsha of a subdataset and the gitsha recorded in the superdataset's record is evaluated. With 'full' any other modifications are considered too."""), ) _examples_ = [ ] @staticmethod @datasetmethod(name="next_status") @eval_results def __call__( # TODO later #path=None, *, dataset=None, # TODO possibly later #annex=None, untracked='normal', recursive='repository', eval_subdataset_state='full', ) -> Generator[StatusResult, None, None] | list[StatusResult]: ds = dataset.ds rootpath = Path.cwd() if dataset.original is None else ds.pathobj for item in iter_gitstatus( path=rootpath, untracked=untracked, recursive=recursive, eval_submodule_state=eval_subdataset_state, ): yield StatusResult( action='status', status=CommandResultStatus.ok, path=rootpath / (item.path or item.prev_path), gittype=item.gittype, prev_gittype=item.prev_gittype, diff_state=item.status, modification_types=item.modification_types, refds=ds, logger=lgr, ) def custom_result_renderer(res, **kwargs): # we are guaranteed to have dataset-arg info through uniform # parameter validation dsarg = kwargs['dataset'] rootpath = Path.cwd() if dsarg.original is None else dsarg.ds.pathobj # because we can always determine the root path of the command # execution environment, we can report meaningful relative paths # unconditionally path = res.path.relative_to(rootpath) # collapse item type information across current and previous states type_ = res.type or res.prev_type or '' max_len = len('untracked') state = res.state.value # message format is same as for previous command implementation ui.message(u'{fill}{state}: {path}{type_}{annot}'.format( fill=' ' * max(0, max_len - len(state)), state=ac.color_word( res.state.value, _get_result_status_render_color(res)), path=path, type_=' ({})'.format(ac.color_word(type_, ac.MAGENTA)) if type_ else '', annot=f' [{", ".join(q.value for q in res.modification_types)}]' if res.modification_types else '', )) @staticmethod def custom_result_summary_renderer(results): # no reports, no changes if len(results) == 0: ui.message("nothing to save, working tree clean") def _get_result_status_render_color(res): if res.state == StatusState.deleted: return ac.RED elif res.state == StatusState.modified: return ac.CYAN elif res.state == StatusState.added: return ac.GREEN else: return ac.BOLD datalad-next-1.4.1/datalad_next/commands/tests/000077500000000000000000000000001462321624600214645ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/commands/tests/__init__.py000066400000000000000000000000001462321624600235630ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/commands/tests/test_create_sibling_webdav.py000066400000000000000000000352301462321624600274020ustar00rootroot00000000000000from pathlib import Path from unittest.mock import ( call, patch, ) from urllib.parse import quote as urlquote from datalad_next.tests import ( assert_in, assert_in_results, assert_raises, assert_result_count, assert_status, create_tree, eq_, ok_, run_main, ) import pytest from datalad.api import ( clone, create_sibling_webdav, ) from datalad_next.utils import chpwd def test_common_workflow_implicit_cred( credman, existing_dataset, tmp_path, webdav_credential, webdav_server, no_result_rendering): check_common_workflow( False, 'annex', credman, existing_dataset, tmp_path, 
webdav_credential, webdav_server) def test_common_workflow_explicit_cred( credman, existing_dataset, tmp_path, webdav_credential, webdav_server, no_result_rendering): check_common_workflow( True, 'annex', credman, existing_dataset, tmp_path, webdav_credential, webdav_server) def test_common_workflow_export( credman, existing_dataset, tmp_path, webdav_credential, webdav_server, no_result_rendering): check_common_workflow( False, 'filetree', credman, existing_dataset, tmp_path, webdav_credential, webdav_server) def check_common_workflow( declare_credential, mode, credman, ds, clonepath, webdav_credential, webdav_server): credman.set(**webdav_credential) # need to amend the test credential, can only do after we know the URL ds.credentials( 'set', name=webdav_credential['name'], # the test webdav webserver uses a realm label '/' spec=dict(realm=webdav_server.url + '/'), ) # we use a nasty target directory that has the potential to ruin the # git-remote URL handling targetdir_name = 'tar&get=mike' targetdir = Path(webdav_server.path) / targetdir_name url = f'{webdav_server.url}/{targetdir_name}' with chpwd(ds.path): res = create_sibling_webdav( url, credential=webdav_credential['name'] if declare_credential else None, mode=mode, ) # Ensure that remote name constraint check works # second time should raise because the sibling exists already with pytest.raises(ValueError) as e: create_sibling_webdav( url, credential=webdav_credential['name'] if declare_credential else None, mode=mode, name='127.0.0.1', ) with pytest.raises(ValueError) as e: create_sibling_webdav( url, credential=webdav_credential['name'] if declare_credential else None, mode=mode, name='other', storage_name='127.0.0.1-storage', ) assert_in_results( res, action='create_sibling_webdav.storage', status='ok', type='sibling', path=ds.path, url=url, name='127.0.0.1-storage', # TODO: name=??? # Parse url for host or depend on serve_path always # delivering 127.0.0.1? (Think IPv6 or a literal `localhost` or # anything like that) Same applies to hardcoded `dlaurl`. 
) # where it should be accessible # needs to be quoted dlaurl = ( 'datalad-annex::?type=webdav&encryption=none&exporttree={exp}&' 'url=http%3A//127.0.0.1%3A43612/tar%26get%3Dmike').format( exp='yes' if 'filetree' in mode else 'no', ) if declare_credential: dlaurl += f'&dlacredential={urlquote(webdav_credential["name"])}' assert_in_results( res, action='create_sibling_webdav', status='ok', path=ds.path, name='127.0.0.1', url=dlaurl, type='sibling', ) ok_(targetdir.exists()) # add some annex payload (ds.pathobj / 'testfile.dat').write_text('dummy') ds.save() res = ds.push(to='127.0.0.1') assert_in_results( res, action='copy', path=str(ds.pathobj / 'testfile.dat'), status='ok', ) assert_in_results(res, action='publish', status='ok') cloneurl = dlaurl if not declare_credential and 'filetree' in mode: # we can use a simplified URL cloneurl = 'webdav://{url}'.format( # strip http:// url=url[7:], ) dsclone = clone(cloneurl, clonepath) # we get the same thing eq_(ds.repo.get_hexsha(ds.repo.get_corresponding_branch()), dsclone.repo.get_hexsha(dsclone.repo.get_corresponding_branch())) # check that it auto-deploys webdav credentials # at some point, clone should be able to do this internally # https://github.com/datalad/datalad/issues/6634 dsclone.siblings('enable', name='127.0.0.1-storage') # verify that we can get testfile.dat # just get the whole damn thing assert_status('ok', dsclone.get('.')) # verify testfile content eq_('dummy', (dsclone.pathobj / 'testfile.dat').read_text()) # ensure that recursive operations succeed # create a subdataset subds = ds.create('mysubds') targetdir_name = 'recursiontest' subtargetdir = Path(webdav_server.path) / targetdir_name / 'mysubds' url = f'{webdav_server.url}/{targetdir_name}' with chpwd(ds.path): res = create_sibling_webdav( url, credential=webdav_credential['name'] if declare_credential else None, name='recursive-sibling', mode=mode, recursive=True, ) assert len(res) == 4 # 2 for create-sibling-webdav, 2 for storage assert_in_results( res, action='create_sibling_webdav.storage', status='ok', type='sibling', path=subds.path, name='recursive-sibling-storage', ) ok_(subtargetdir.exists()) def test_bad_url_catching(existing_dataset): # Ensure that bad URLs are detected and handled ds = existing_dataset check_pairs = [ ( "http://localhost:33322/abc?a", "URL has forbidden 'query' component" ), ( "https://netloc/has-a-fragment#sdsd", "URL has forbidden 'fragment' component" ), ( "https:///has-no-net-location", "URL is missing 'netloc' component" ), ( "xxx://localhost:33322/abc", "url={url!r}\n does not match expression '^(http|https)://'" ), ] for bad_url, expected_message in check_pairs: with pytest.raises(ValueError) as e: create_sibling_webdav(dataset=ds, url=bad_url) assert expected_message.format(url=bad_url) in str(e.value) def test_http_warning(existing_dataset): # Check that usage of http: triggers a warning. ds = existing_dataset url = "http://localhost:33322/abc" with patch("datalad_next.commands.create_sibling_webdav._create_sibling_webdav") as csw_mock, \ patch("datalad_next.commands.create_sibling_webdav.lgr") as lgr_mock: csw_mock.return_value = iter([]) with pytest.raises(Exception): create_sibling_webdav(dataset=ds, url=url) eq_(lgr_mock.warning.call_count, 1) assert_in( call( f"Using 'http:' ({url!r}) means that WebDAV credentials are " f"sent unencrypted over network links. 
Consider using " f"'https:'."), lgr_mock.warning.mock_calls) def test_constraints_checking(existing_dataset): # Ensure that constraints are checked internally ds = existing_dataset url = "http://localhost:22334/abc" for key in ("existing", "mode"): with pytest.raises(ValueError) as e: create_sibling_webdav( dataset=ds, url=url, **{key: "illegal-value"}) assert "is not one of" in str(e.value) def test_name_clash_detection(existing_dataset): # Ensure that constraints are checked internally ds = existing_dataset url = "http://localhost:22334/abc" for mode in ("annex", 'filetree', 'annex-only', 'filetree-only'): with pytest.raises(ValueError) as e: create_sibling_webdav( dataset=ds, url=url, name="abc", storage_name="abc", mode=mode) assert "sibling names must not be equal" in str(e.value) def test_unused_storage_name_warning(existing_dataset): # Ensure that constraints are checked internally ds = existing_dataset url = "https://localhost:22334/abc" with patch("datalad_next.commands.create_sibling_webdav._create_sibling_webdav") as csw_mock, \ patch("datalad_next.commands.create_sibling_webdav.lgr") as lgr_mock: csw_mock.return_value = iter([]) mode_values = ("git-only", "annex-only", "filetree-only") for mode in mode_values: # We set up the mocks to generate the following exception. This allows # us to limit the test to the logic in 'create_sibling_wabdav'. assert_raises( Exception, create_sibling_webdav, dataset=ds, url=url, name="abc", storage_name="abc", mode=mode) eq_(lgr_mock.warning.call_count, len(mode_values)) def test_existing_switch(existing_dataset, credman, webdav_credential, webdav_server, no_result_rendering): credman.set(**webdav_credential) check_existing_switch(existing_dataset, webdav_credential, webdav_server) def check_existing_switch(ds, webdav_credential, webdav_server): create_tree( ds.path, {'sub': {'f0': '0'}, 'sub2': {'subsub': {'f1': '1'}, 'f2': '2'}, 'f3': '3'} ) # use a tricky name: '3f7' will be the hashdir of the XDLRA # key containing the superdataset's datalad-annex archive after a push sub = ds.create('3f7', force=True) sub2 = ds.create('sub2', force=True) subsub = sub2.create('subsub', force=True) ds.save(recursive=True) url = webdav_server.url # need to amend the test credential, can only do after we know the URL ds.credentials( 'set', name=webdav_credential['name'], # the test webdav webserver uses a realm label '/' spec=dict(realm=url + '/'), ) subsub.create_sibling_webdav(f'{url}/sub2/subsub', mode='annex') sub2.create_sibling_webdav(f'{url}/sub2', mode='annex-only') sub.create_sibling_webdav(f'{url}/3f7', mode='git-only') res = ds.create_sibling_webdav(url, mode='annex', existing='skip', recursive=True) dlaurl='datalad-annex::?type=webdav&encryption=none&exporttree=no&' \ 'url=http%3A//127.0.0.1%3A43612/' # results per dataset: # super: assert_in_results( res, action='create_sibling_webdav.storage', status='ok', type='sibling', name='127.0.0.1-storage', path=ds.path, url=url, ) assert_in_results( res, action='create_sibling_webdav', status='ok', type='sibling', path=ds.path, name='127.0.0.1', url=dlaurl[:-1], ) # sub assert_in_results( res, action='create_sibling_webdav.storage', status='ok', type='sibling', name='127.0.0.1-storage', path=sub.path, url=f'{url}/3f7', ) assert_in_results( res, action='create_sibling_webdav', status='notneeded', type='sibling', name='127.0.0.1', path=sub.path, ) # sub2 assert_in_results( res, action='create_sibling_webdav.storage', status='notneeded', type='sibling', name='127.0.0.1-storage', path=sub2.path ) 
assert_in_results( res, action='create_sibling_webdav', status='ok', type='sibling', path=sub2.path, name='127.0.0.1', url=f'{dlaurl}sub2', ) # subsub assert_in_results( res, action='create_sibling_webdav.storage', status='notneeded', type='sibling', name='127.0.0.1-storage', path=subsub.path ) assert_in_results( res, action='create_sibling_webdav', status='notneeded', type='sibling', name='127.0.0.1', path=subsub.path, ) # should fail upfront with first discovered remote that already exist res = ds.create_sibling_webdav( url, mode='annex', existing='error', recursive=True, on_failure='ignore') assert_result_count(res, 8, status='error', type='sibling') # Note: 'message' is expected to be a tuple (and always present). assert all("is already configured" in r['message'][0] for r in res) assert all(r['action'].startswith('create_sibling_webdav') for r in res) srv_rt = Path(webdav_server.path) (srv_rt / '3f7').rmdir() (srv_rt / 'sub2' / 'subsub').rmdir() (srv_rt / 'sub2').rmdir() # existing=skip actually doesn't do anything (other than yielding notneeded) res = ds.create_sibling_webdav(url, mode='annex', existing='skip', recursive=True) assert_result_count(res, 8, status='notneeded') remote_content = list(srv_rt.glob('**')) assert len(remote_content) == 1 # nothing but root dir # reconfigure to move target one directory level: dlaurl += 'reconfigure' url += '/reconfigure' new_root = srv_rt / 'reconfigure' res = ds.create_sibling_webdav(url, mode='annex', existing='reconfigure', recursive=True) assert_result_count(res, 8, status='ok') assert all(r['action'].startswith('reconfigure_sibling_webdav') for r in res) remote_content = list(new_root.glob('**')) assert_in(new_root / '3f7', remote_content) assert_in(new_root / 'sub2', remote_content) assert_in(new_root / 'sub2' / 'subsub', remote_content) def test_result_renderer(existing_dataset, credman, webdav_credential, webdav_server): # need to amend the test credential, can only do after we know the URL # the test webdav webserver uses a realm label '/' credman.set(realm=f'{webdav_server.url}/', **webdav_credential) # consume stdout to make test self-contained out, err = run_main([ 'create-sibling-webdav', '-d', existing_dataset.path, webdav_server.url, ]) # url is somehow reported assert_in('datalad-annex::?type=webdav', out) # and the two custom result renderings assert_in('create_sibling_webdav(ok)', out) assert_in('create_sibling_webdav.storage(ok)', out) datalad-next-1.4.1/datalad_next/commands/tests/test_credentials.py000066400000000000000000000140201462321624600253670ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # -*- coding: utf-8 -*- # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. 
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ import logging from unittest.mock import patch from ..credentials import ( Credentials, normalize_specs, ) from datalad_next.exceptions import IncompleteResultsError from datalad_next.utils import external_versions from datalad_next.tests import ( assert_in, assert_in_results, assert_raises, eq_, run_main, swallow_logs, ) # helper to test error handling class BrokenCredentialManager(object): def __init__(*args, **kwargs): pass def set(self, *args, **kwargs): raise RuntimeError('INTENTIONAL FAILURE') def remove(self, *args, **kwargs): raise RuntimeError('INTENTIONAL FAILURE') def get(self, *args, **kwargs): # if there is no secret return None def test_normalize_specs(): for i, o in ( ('', {}), ('{}', {}), ([], {}), # indicate removal ([':mike'], {'mike': None}), (['this=that'], {'this': 'that'}), # not removal without : prefix (['empty='], {'empty': ''}), # multiple works (['this=that', ':mike'], {'this': 'that', 'mike': None}), # complete specs dict needs no removal marker ('{"this":"that","mike":null}', {'this': 'that', 'mike': None}), # list-spec, but as JSON ('["this=that", ":mike"]', {'this': 'that', 'mike': None}), ): eq_(normalize_specs(i), o) for error in ( # no removal marker ['mike'], # any string would be JSON and must be valid 'brokenjson' ): assert_raises(ValueError, normalize_specs, error) def test_errorhandling_smoketest(no_result_rendering): callcfg = dict(on_failure='ignore') with patch('datalad_next.commands.credentials.CredentialManager', BrokenCredentialManager): cred = Credentials() assert_in_results( cred('set', name='dummy', **callcfg), status='error', name='dummy') assert_in_results( cred('remove', name='dummy', **callcfg), status='error', name='dummy') def test_credentials_cli(tmp_keyring): # usable command cred = Credentials() # unknown action assert_raises(ValueError, cred, 'levitate') with swallow_logs(new_level=logging.ERROR) as cml: # it is a shame that the error is not coming out on # stderr run_main(['credentials', 'remove'], exit_code=1) if external_versions['datalad'] > '0.17.9': # depends on (yet unreleased) # https://github.com/datalad/datalad/pull/7210 assert ' no credential name provided' in cml.lines # catch missing `name` via Python call too assert_raises(IncompleteResultsError, cred, 'set', spec=[':mike']) # no name and no property assert_raises(ValueError, cred, 'get') assert_in_results( cred('get', name='donotexiststest', on_failure='ignore', result_renderer='disabled'), status='error', name='donotexiststest', ) # we are not getting a non-existing credential, and it is also # not blocking for input with a non-interactive session assert_in( 'credential not found', run_main(['credentials', 'get', 'donotexiststest'], exit_code=1)[0] ) # set a credential CLI run_main( ['credentials', 'set', 'mycred', 'secret=some', 'user=mike'], exit_code=0) # which we can retrieve run_main(['credentials', 'get', 'mycred'], exit_code=0) # query runs without asking for input, and comes back clean assert_in( 'some', run_main(['-f', 'json', 'credentials', 'query'], exit_code=0)[0]) # and remove run_main(['credentials', 'remove', 'mycred'], exit_code=0) # nothing bad on second attempt run_main(['credentials', 'remove', 'mycred'], exit_code=0) # query runs without asking for input, and comes back clean run_main(['credentials', 'query'], exit_code=0) def test_interactive_entry_get(tmp_keyring, datalad_interactive_ui, no_result_rendering): ui = datalad_interactive_ui 
ui.staged_responses.extend([ 'attr1', 'attr2', 'secret']) # should ask all properties in order and the secret last cred = Credentials() assert_in_results( cred('get', name='myinteractive_get', # use CLI notation spec=[':attr1', ':attr2'], prompt='dummyquestion'), cred_attr1='attr1', cred_attr2='attr2', cred_secret='secret', ) assert ui.operation_sequence == ['question', 'response'] * 3 def test_interactive_entry_set(tmp_keyring, datalad_interactive_ui, no_result_rendering): ui = datalad_interactive_ui ui.staged_responses.append('secretish') # should ask all properties in order and the secret last cred = Credentials() assert_in_results( cred('set', name='myinteractive_set', prompt='dummyquestion'), cred_secret='secretish', ) assert ui.operation_sequence == ['question', 'response'] def test_result_renderer(): # it must survive a result that is not from the command itself Credentials.custom_result_renderer(dict( action='weird', status='broken', )) def test_extreme_credential_name(tmp_keyring, datalad_cfg, no_result_rendering): cred = Credentials() extreme = 'ΔЙקم๗あ |/;&%b5{}"' assert_in_results( cred( 'set', name=extreme, # use CLI style spec to exercise more code spec=[f'someprop={extreme}', f'secret={extreme}'], ), cred_someprop=extreme, cred_secret=extreme, ) datalad-next-1.4.1/datalad_next/commands/tests/test_download.py000066400000000000000000000216551462321624600247150ustar00rootroot00000000000000from io import StringIO import json import pytest import datalad from datalad.api import ( credentials, download, ) from datalad_next.tests import ( assert_result_count, assert_status, ) from datalad_next.utils import chpwd from datalad_next.utils import CredentialManager @pytest.fixture def hbsurl(httpbin): # shortcut for the standard URL return httpbin["standard"] test_cred = ('dltest-my&=http', 'datalad', 'secure') @pytest.fixture def hbscred(hbsurl): return ( 'hbscred', dict(user='mike', secret='dummy', type='user_password', realm=f'{hbsurl}/Fake Realm'), ) def test_download(tmp_path, http_server, no_result_rendering): wdir = tmp_path srvurl = http_server.url (http_server.path / 'testfile.txt').write_text('test') # simple download, taking the target filename from the URL # single-pass hashing with two algorithms with chpwd(wdir): res = download(f'{srvurl}testfile.txt', hash=['md5', 'SHA256'], return_type='item-or-list') assert (wdir / 'testfile.txt').read_text() == 'test' # keys for hashes keep user-provided captialization assert res['md5'] == '098f6bcd4621d373cade4e832627b4f6' assert res['SHA256'] == \ '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08' # doing it again fails due to overwrite detection with chpwd(wdir): assert_result_count( download(f'{srvurl}testfile.txt', on_failure='ignore'), 1, status='error', error_message='target path already exists') # works with explicit alternative filename with chpwd(wdir): download(f'{srvurl}testfile.txt testfile2.txt') assert (wdir / 'testfile2.txt').read_text() == 'test' # non-existing download source assert_result_count( download(f'{srvurl}nothere', on_failure='ignore'), 1, status='error', message='download failure') def test_download_invalid_calls(monkeypatch, no_result_rendering): # unsupported url scheme, only detected when actually calling # a handler inside, hence error result assert_result_count( download('dummy://mike/file', on_failure='ignore'), 1, status='error', message='unsupported URL (custom URL handlers can be declared ' 'via DataLad configuration)', ) # no target path derivable # immediate error, when all 
information is readily available with pytest.raises(ValueError): download('http://example.com') # deferred error result when a generator is gathering batch-mode # items at runtime monkeypatch.setattr('sys.stdin', StringIO('http://example.com')) assert_result_count( download( '-', on_failure='ignore', ), 1, status='impossible') def test_download_auth( tmp_path, credman, http_credential, http_server_with_basicauth, no_result_rendering): credman.set(**http_credential) wdir = tmp_path srvurl = http_server_with_basicauth.url srvpath = http_server_with_basicauth.path (srvpath / 'testfile.txt').write_text('test') # we have a credential, but there is nothing to discover from # that we should use it for this request assert_result_count( download(f'{srvurl}nothere', on_failure='ignore'), 1, status='error', message='download failure') # amend the test credential with the realm of the test server credman = CredentialManager(datalad.cfg) credman.set(test_cred[0], realm=f'{srvurl}Protected') # now it must be able to auto-detect it download({f'{srvurl}testfile.txt': wdir / 'download1.txt'}) assert (wdir / 'download1.txt').read_text() == 'test' auth_ok_response = {"authenticated": True, "user": "mike"} def test_download_basic_auth(credman, capsys, hbscred, hbsurl, no_result_rendering): credman.set(hbscred[0], **hbscred[1]) # consume stdout to make test self-contained capsys.readouterr() download( {f'{hbsurl}/basic-auth/mike/dummy': '-'}) assert json.loads(capsys.readouterr().out) == auth_ok_response def test_download_bearer_token_auth(credman, capsys, hbsurl): credman.set('dummy', realm=f'{hbsurl}/', type='token', secret='very') # consume stdout to make test self-contained capsys.readouterr() download( {f'{hbsurl}/bearer': '-'}) assert json.loads(capsys.readouterr().out) == { 'authenticated': True, 'token': 'very', } def test_download_digest_auth(credman, capsys, hbscred, hbsurl, no_result_rendering): credman.set(hbscred[0], **dict(hbscred[1], realm=f'{hbsurl}/me@kennethreitz.com')) # consume stdout to make test self-contained capsys.readouterr() for url_suffix in ( '/digest-auth/auth/mike/dummy', # non-default algorithm '/digest-auth/auth/mike/dummy/SHA-256', ): download({f'{hbsurl}{url_suffix}': '-'}) assert json.loads(capsys.readouterr().out) == auth_ok_response # repeated reads do not accumulate assert capsys.readouterr().out == '' def test_download_explicit_credential(credman, capsys, hbscred, hbsurl): # the provided credential has the wrong 'realm' for auto-detection. 
# but choosing it explicitly must put things to work credman.set(hbscred[0], **hbscred[1]) # consume stdout to make test self-contained capsys.readouterr() download({f'{hbsurl}/digest-auth/auth/mike/dummy': '-'}, credential=hbscred[0]) assert json.loads(capsys.readouterr().out) == auth_ok_response def test_download_auth_after_redirect(credman, capsys, hbscred, hbsurl): credman.set(hbscred[0], **hbscred[1]) # consume stdout to make test self-contained capsys.readouterr() download( {f'{hbsurl}/redirect-to?url={hbsurl}/basic-auth/mike/dummy': '-'}) assert json.loads(capsys.readouterr().out) == auth_ok_response def test_download_no_credential_leak_to_http(credman, capsys, hbscred, httpbin, no_result_rendering): credman.set(hbscred[0], **hbscred[1]) redirect_url = f'{httpbin["http"]}/basic-auth/mike/dummy' res = download( # redirect from https to http, must drop provideded credential # to avoid leakage {f'{httpbin["https"]}/redirect-to?url={redirect_url}': '-'}, credential=hbscred[0], on_failure='ignore') assert_status('error', res) assert '401' in res[0]['error_message'] assert f' {redirect_url}' in res[0]['error_message'] # do the same again, but without the explicit credential, # also must not work # this is not the right test, though. What would be suitable # is an authenticated request that then redirects res = download( # redirect from https to http, must drop provideded credential # to avoid leakage {f'{httpbin["https"]}/redirect-to?url={redirect_url}': '-'}, on_failure='ignore') assert_status('error', res) def test_download_new_bearer_token( tmp_keyring, capsys, hbsurl, datalad_interactive_ui, no_result_rendering): ui = datalad_interactive_ui ui.staged_responses.extend([ 'token123', # after download, it asks for a name 'dataladtest_test_download_new_bearer_token', ]) try: download({f'{hbsurl}/bearer': '-'}) # and it was saved under this name assert_result_count( credentials( 'get', name='dataladtest_test_download_new_bearer_token'), 1, cred_secret='token123', cred_type='token', ) finally: credentials( 'remove', name='dataladtest_test_download_new_bearer_token', ) def test_download_new_bearer_token_nosave( capsys, hbsurl, datalad_interactive_ui, no_result_rendering): ui = datalad_interactive_ui ui.staged_responses.extend([ 'datalad_uniquetoken123', # after download, it asks for a name, but skip to save 'skip', ]) download({f'{hbsurl}/bearer': '-'}) # and it was saved under this name assert_result_count( credentials('query', dict(secret='datalad_uniquetoken123')), 0, ) # make sure a 404 is easily discoverable # https://github.com/datalad/datalad/issues/6545 def test_download_404(hbsurl, no_result_rendering): assert_result_count( download(f'{hbsurl}/status/404', on_failure='ignore'), 1, status_code=404, status='error') def test_downloadurl(tmp_path, no_result_rendering): (tmp_path / 'src').mkdir() dst_path = tmp_path / 'dst' dst_path.mkdir() testfile = tmp_path / 'src' / 'myfile.txt' testfile.write_text('some content') res = download( {testfile.as_uri(): dst_path / 'target.txt'}, hash=['md5'], return_type='item-or-list') assert_result_count(res, 1, md5='9893532233caff98cd083a116b013c0b') datalad-next-1.4.1/datalad_next/commands/tests/test_ls_file_collection.py000066400000000000000000000202151462321624600267250ustar00rootroot00000000000000from pathlib import ( Path, PurePath, ) import pytest from datalad.api import ls_file_collection from datalad_next.constraints import CommandParametrizationError # we need this fixture from datalad_next.iter_collections.tests.test_iterzip import sample_zip 
from datalad_next.tests import skipif_no_network

from ..ls_file_collection import LsFileCollectionParamValidator


def test_ls_file_collection_insufficient_args():
    with pytest.raises(CommandParametrizationError):
        ls_file_collection()

    # any collection needs some kind of identifier, just the type
    # parameter is not enough
    with pytest.raises(CommandParametrizationError):
        ls_file_collection('tarfile')

    # individual collection types have particular requirements re
    # the identifiers -- tarfile wants an existing path
    with pytest.raises(CommandParametrizationError):
        ls_file_collection('tarfile', 'http://example.com')

    # not a known collection type
    with pytest.raises(CommandParametrizationError):
        ls_file_collection('bogus', 'http://example.com')


def _check_archive_member_result(r, collection):
    # basics of a result
    assert r['action'] == 'ls_file_collection'
    assert r['status'] == 'ok'
    # a collection identifier, here the tar location
    assert 'collection' in r
    assert r['collection'] == collection
    # an item identifier, here a str-path of an archive member
    assert 'item' in r
    assert isinstance(r['item'], str)
    # item type info, here some filesystem-related category
    assert 'type' in r
    assert r['type'] in ('file', 'directory', 'symlink', 'hardlink')


def test_ls_file_collection_zipfile(sample_zip, no_result_rendering):
    for res in (
            ls_file_collection('zipfile', sample_zip),
            ls_file_collection('zipfile', sample_zip, hash='md5'),
    ):
        assert len(res) == 4
        # test a few basic properties that should be true for any result
        for r in res:
            _check_archive_member_result(r, sample_zip)


@skipif_no_network
def test_ls_file_collection_tarfile(sample_tar_xz, no_result_rendering):
    for res in (
            ls_file_collection('tarfile', sample_tar_xz),
            ls_file_collection('tarfile', sample_tar_xz, hash='md5'),
    ):
        assert len(res) == 6
        # test a few basic properties that should be true for any result
        for r in res:
            _check_archive_member_result(r, sample_tar_xz)


def test_ls_file_collection_directory(tmp_path, no_result_rendering):
    # smoke test on an empty dir
    res = ls_file_collection('directory', tmp_path)
    assert len(res) == 0


def test_ls_file_collection_gitworktree(existing_dataset, no_result_rendering):
    # smoke test on a plain dataset
    res = ls_file_collection('gitworktree', existing_dataset.pathobj)
    assert len(res) > 1
    assert all('gitsha' in r for r in res)

    # and with hashing
    res_hash = ls_file_collection('gitworktree', existing_dataset.pathobj,
                                  hash='md5')
    assert len(res) == len(res_hash)
    assert all('hash-md5' in r for r in res_hash)


def test_ls_file_collection_validator():
    val = LsFileCollectionParamValidator()

    with pytest.raises(RuntimeError):
        val.get_collection_iter(type='bogus', collection='any', hash=None)


@skipif_no_network
def test_replace_add_archive_content(sample_tar_xz, existing_dataset,
                                     no_result_rendering):
    ds = existing_dataset
    archive_path = ds.pathobj / '.datalad' / 'myarchive.tar.xz'
    # get archive copy in dataset (not strictly needed, but
    # add-archive-content worked like this)
    ds.download({sample_tar_xz.as_uri(): archive_path})
    # properly save to dataset (download is ignorant of datasets)
    res = ds.save(message='add archive')
    # the first result has the archive addition, snatch the archive key from it
    assert res[0]['path'] == str(archive_path)
    archive_key = res[0]['key']
    # now we can scan the archive and register keys for its content.
    # the order and specific composition of the following steps is flexible.
    # we could simply extract the local archive, save the content to the
    # dataset, and then register `dl+archive` URLs.
    # however, we will use an approach that does not require any data
    # to be present locally (actually not even the archive that we have locally
    # already for this test), but is instead based on some metadata
    # that is provided by `ls-file-collection` (but could come from elsewhere,
    # including `ls-file-collection` executed on a different host).
    file_recs = [
        r
        for r in ls_file_collection(
            'tarfile',
            sample_tar_xz,
            hash=['md5'],
        )
        # ignore any non-file, would not have an annex key.
        # Also ignores hardlinks (they consume no space (size=0), but could be
        # represented as regular copies of a shared key. however, this
        # requires further processing of the metadata records, in order to find
        # the size of the item that has the same checksum as this one)
        if r.get('type') == 'file'
    ]
    # we enable the `datalad-archives` special remote using a particular
    # configuration that `add-archive-content` would use.
    # this special remote can act on the particular URLs that we will add next
    ds.repo.call_annex([
        'initremote', 'datalad-archives',
        'type=external', 'externaltype=datalad-archives',
        'encryption=none', 'autoenable=true'])
    # assign special `dl+archive` URLs to all file keys
    # the `datalad-archives` special remote will see them and perform the
    # extraction of file content from the archive on demand.
    # the entire operation is not doing any extraction or data retrieval,
    # because we have all information necessary to generate keys
    ds.addurls(
        # takes an iterable of dicts
        file_recs,
        # urlformat: handcrafted archive key, as expected by datalad-archive
        # (double braces to keep item and size as placeholders for addurls)
        f'dl+archive:{archive_key}#path={{item}}&size={{size}}',
        # filenameformat
        '{item}',
        key='et:MD5-s{size}--{hash-md5}',
    )
    # because we have been adding the above URLs using a pure metadata-driven
    # approach, git-annex does not yet know that the archives remote actually
    # has the keys.
we could use `annex setpresentkey` for that (fast local # operation), but here we use `fsck` to achieve a comprehensive smoke test # of compatibility with our hand-crafted and the special remote # implementation # (actually: without --fast the special remote crashes with a protocol # error -- a bug in the special remote probably) ds.repo.call_annex( ['fsck', '--fast', '-f', 'datalad-archives'], files=['test-archive'], ) # at this point we are done # check retrieval for a test file, which is not yet around testfile = ds.pathobj / 'test-archive' / '123_hard.txt' assert ds.status( testfile, annex='availability')[0]['has_content'] is False ds.get(testfile) assert testfile.read_text() == '123\n' def test_ls_renderer(): # nothing more than a smoke test ls_file_collection( 'directory', Path(__file__).parent, result_renderer='tailored', ) def test_ls_annexworktree_empty_dataset(existing_dataset): res = ls_file_collection( 'annexworktree', existing_dataset.pathobj, result_renderer='disabled' ) assert len(res) == 3 annexed_files = [annex_info for annex_info in res if 'annexkey' in annex_info] assert len(annexed_files) == 0 def test_ls_annexworktree_simple_dataset(existing_dataset): (existing_dataset.pathobj / 'sample.bin').write_bytes(b'\x00' * 1024) existing_dataset.save(message='add sample file') res = ls_file_collection( 'annexworktree', existing_dataset.pathobj, result_renderer='disabled' ) assert len(res) == 4 annexed_files = [annex_info for annex_info in res if 'annexkey' in annex_info] assert len(annexed_files) == 1 assert annexed_files[0]['type'] == 'annexed file' assert { 'annexkey', 'annexsize', 'annexobjpath' }.issubset(set(annexed_files[0].keys())) datalad-next-1.4.1/datalad_next/commands/tests/test_results.py000066400000000000000000000033461462321624600246040ustar00rootroot00000000000000from pathlib import ( Path, PurePath, ) import pytest from datalad_next.datasets import Dataset from ..results import ( CommandResult, CommandResultStatus, ) def test_commandresult(): # CommandResult is a plain data class, so there is no much to test # only the partial dict API that is implemented as a compatibility # shim for the datalad core result loop # # we need action, status, and path unconditionally with pytest.raises(TypeError): CommandResult() with pytest.raises(TypeError): CommandResult(action='some') with pytest.raises(TypeError): CommandResult(action='some', status='ok') # no something that works st = CommandResult( action='actionlabel', status=CommandResultStatus.ok, path=PurePath('mypath'), refds=Dataset('myds'), ) # we can get a dict with stringified values (for some types) assert dict(st.items()) == { 'action': 'actionlabel', 'status': 'ok', 'path': 'mypath', 'message': None, 'exception': None, 'error_message': None, 'type': None, 'logger': None, 'refds': str(Path.cwd() / 'myds'), } # 'in' works assert 'action' in st assert 'weird' not in st # getitem works, and gives strings assert st['path'] == 'mypath' # same for get assert st.get('path') == 'mypath' assert st.get('weird', 'normal') == 'normal' # 'pop' is emulated by setting to None assert st.pop('path') == 'mypath' assert st.path is None # popping something unknown doesn't blow assert st.pop('weird', 'default') == 'default' # and does not add cruft to the dataclass instance assert not hasattr(st, 'weird') datalad-next-1.4.1/datalad_next/commands/tests/test_status.py000066400000000000000000000034331462321624600244230ustar00rootroot00000000000000import pytest from datalad.api import next_status from datalad_next.constraints import ( 
CommandParametrizationError, ParameterConstraintContext, ) from datalad_next.utils import chpwd from ..status import ( opt_eval_subdataset_state_values, opt_recursive_values, opt_untracked_values, ) def test_status_invalid(tmp_path, datalad_cfg): # we want exhaustive parameter validation (i.e., continue after # first failure), saves some code here datalad_cfg.set('datalad.runtime.parameter-violation', 'raise-at-end', scope='global') with chpwd(tmp_path): with pytest.raises(CommandParametrizationError) as e: next_status( untracked='weird', recursive='upsidedown', eval_subdataset_state='moonphase', ) errors = e.value.errors assert 'no dataset found' in \ errors[ParameterConstraintContext(('dataset',))].msg.casefold() for opt in ('untracked', 'recursive', 'eval_subdataset_state'): assert 'is not one of' in \ errors[ParameterConstraintContext((opt,))].msg.casefold() def test_status_renderer_smoke(existing_dataset): ds = existing_dataset assert ds.next_status() == [] (ds.pathobj / 'untracked').touch() st = ds.next_status() assert len(st) == 1 def test_status_clean(existing_dataset, no_result_rendering): ds = existing_dataset ds.create('subds') for recmode in opt_recursive_values: assert [] == ds.next_status(recursive=recmode) for untracked in opt_untracked_values: assert [] == ds.next_status(untracked=untracked) for eval_sm in opt_eval_subdataset_state_values: assert [] == ds.next_status(eval_subdataset_state=eval_sm) datalad-next-1.4.1/datalad_next/commands/tests/test_tree.py000066400000000000000000000760121462321624600240420ustar00rootroot00000000000000from contextlib import contextmanager from pathlib import Path from os import sep import pytest from datalad_next.tests import ( BasicGitTestRepo, assert_raises, create_tree, get_deeply_nested_structure, skip_wo_symlink_capability, skip_if_on_windows, skip_if_root, ok_good_symlink, ok_broken_symlink, run_main, ) from datalad_next.utils import chpwd from datalad_next.uis import ui_switcher as ui from datalad_next.datasets import Dataset from ..tree import ( Tree, TreeCommand ) """Tests for the ``datalad tree`` command.""" # ============================ Helper functions =============================== @contextmanager def ensure_no_permissions(path: Path): """Remove all permissions for given file/directory and restore the original permissions at the end""" # modeled after 'datalad.utils.ensure_write_permission' original_mode = path.stat().st_mode try: path.chmod(0o000) yield finally: try: path.chmod(original_mode) except FileNotFoundError: # ignore error if path was deleted in the context block pass @pytest.fixture(scope="module") def path_no_ds(tmp_path_factory): """Fixture for creating a temporary directory tree (**without** datasets) to be used in tests. Returns ------- Path Root directory of the newly created tree """ dir_tree = { "root": { ".dir3": { "dir3_file0": '', ".dir3_file1": '', }, "dir0": {}, "dir1": { "dir1_file0": '', }, "dir2": { "dir2_dir0": {}, "dir2_dir1": { "dir2_dir1_file0": '', }, "dir2_dir2": { "dir2_dir2_file0": '', "dir2_dir2_file1": '', }, "dir2_file0": '', "dir2_file1": '', }, ".file2": '', "file0": '', "file1": '', } } temp_dir_root = tmp_path_factory.mktemp("no-ds") create_tree(temp_dir_root, dir_tree) yield temp_dir_root @pytest.fixture(scope="module") def path_ds(tmp_path_factory): """Fixture for creating a temporary directory tree (**including** datasets) to be used in tests. 
Returns ------- Path Root directory of the newly created tree """ ds_tree = { "root": { "superds0": { "sd0_file0": "", "sd0_subds0": { "sd0_sub0_subds0": {} } }, "superds1": { "sd1_file0": "", "sd1_dir0": { "sd1_d0_repo0": {}, "sd1_d0_subds0": {}, }, "sd1_ds0": {}, # not registered as subdataset "sd1_subds0": {}, # not installed (drop all) }, # plain git repo (contents are defined in BasicGitTestRepo) "repo0": {}, "file0": "", } } temp_dir_root = tmp_path_factory.mktemp('ds') create_tree( temp_dir_root, ds_tree, ) # create datasets / repos root = temp_dir_root / "root" BasicGitTestRepo(path=root / "repo0", puke_if_exists=False) ckwa = dict(force=True, result_renderer="disabled") superds0 = Dataset(root / "superds0").create(**ckwa) sd0_subds0 = superds0.create("sd0_subds0", **ckwa) sd0_subds0.create("sd0_sub0_subds0", **ckwa) superds1 = Dataset(root / "superds1").create(**ckwa) superds1.create(Path("sd1_dir0") / "sd1_d0_subds0", **ckwa) Dataset(root / "superds1" / "sd1_ds0").create(**ckwa) BasicGitTestRepo( path=root / "superds1" / "sd1_dir0" / "sd1_d0_repo0", puke_if_exists=False) sd1_subds0 = superds1.create("sd1_subds0", **ckwa) sd1_subds0.drop(what='all', reckless='kill', recursive=True, result_renderer='disabled') yield temp_dir_root def get_tree_rendered_output(tree_cmd: list, exit_code: int = 0): """ Run 'tree' CLI command with the given list of arguments and return the output of the custom results renderer, broken down into 3 components (tree root, tree body, report line). Assumes command exit code 0 and no additional logging to stdout. Parameters ---------- tree_cmd: list(str) 'tree' command given as list of strings exit_code: int Expected exit code of command (default: 0) Returns ------- Tuple[str, str, str] 3-value tuple consisting of: tree root, tree body, report line """ # remove any empty strings from command out, _ = run_main([c for c in tree_cmd if c != ''], exit_code=exit_code) # remove trailing newline lines = out.rstrip("\n").split("\n") root = lines[0] # first line of tree output body = "\n".join(lines[1:-1]) report = lines[-1] return root, body, report @pytest.fixture(scope="class") def inject_path(request, path_ds, path_no_ds): """ Set a path fixture (root path of temp directory tree) as class attribute, to make it available to all tests in the class. The fixture is chosen based on the class' ``tree_with_ds`` attribute. """ if request.cls.tree_with_ds: request.cls.path = path_ds else: request.cls.path = path_no_ds def format_param_ids(val) -> str: """ Helper to format pytest parameter IDs. If the parameter is a multiline string, we assume it is the parameter 'expected' (expected output of tree), and just give it a fixed ID (otherwise, it would be displayed in the parameter list as a long unreadable string). Parameters ---------- val Parameter value """ if isinstance(val, str) and "\n" in val: return "expected" def build_param_matrix(matrix, params): """Turn inner dicts into lists (required by pytest parametrize)""" matrix_out = [] for combination in matrix: matrix_out.append( # order of combinations does not matter [val for key, val in combination.items() if key in params] ) return matrix_out def pytest_generate_tests(metafunc): """Pytest helper to automatically configure parametrization. Avoids having to duplicate definition of parameter names and values across tests that use the same data. 
See: https://docs.pytest.org/en/7.1.x/example/parametrize.html#parametrizing-test-methods-through-per-class-configuration """ if metafunc.cls and \ hasattr(metafunc.cls, 'params') and \ hasattr(metafunc.cls, 'MATRIX'): test_id = metafunc.function.__name__ test_params_dict = metafunc.cls.params matrix = metafunc.cls.MATRIX if test_id in metafunc.cls.params: param_names = test_params_dict[test_id] metafunc.parametrize( param_names, build_param_matrix(matrix, param_names), ids=format_param_ids ) # ================================= Tests ===================================== @pytest.mark.usefixtures("inject_path") class TestTree: """Base class with tests that should run for multiple Tree configurations. Configurations are defined by: - ``MATRIX``: dicts of pytest parameters and their values, where each dict corresponds to a separate parametrized test instance. - ``params``: a dict defining for each test method, which parameters will be used in that test (from the parameter names contained in ``MATRIX``). """ __test__ = False # tells pytest to not collect tests in this class tree_with_ds = False path = None # will be set by the inject_* fixture to temp dir tree root # matrix of combinations of parameters to be tested and their # expected results MATRIX = [] # dict specifying parameter sets for each test method params = { "test_print_tree": [ "depth", "include_files", "include_hidden", "expected_str" ], "test_print_stats": [ "depth", "include_files", "include_hidden", "expected_stats_str" ], "test_exhausted_levels_are_below_current_depth": [ "depth", "include_files", "include_hidden" ] } class TestTreeWithoutDatasets(TestTree): """Test directory tree without any datasets""" __test__ = True tree_with_ds = False MATRIX = [ { "depth": 1, "include_files": False, "include_hidden": False, "expected_stats_str": "0 datasets, 3 directories", "expected_str": """ ├── dir0/ ├── dir1/ └── dir2/ """ }, { "depth": 3, "include_files": False, "include_hidden": False, "expected_stats_str": "0 datasets, 6 directories", "expected_str": """ ├── dir0/ ├── dir1/ └── dir2/ ├── dir2_dir0/ ├── dir2_dir1/ └── dir2_dir2/ """ }, { "depth": 1, "include_files": True, "include_hidden": False, "expected_stats_str": "0 datasets, 3 directories, 2 files", "expected_str": """ ├── dir0/ ├── dir1/ ├── dir2/ ├── file0 └── file1 """ }, { "depth": 3, "include_files": True, "include_hidden": False, "expected_stats_str": "0 datasets, 6 directories, 8 files", "expected_str": """ ├── dir0/ ├── dir1/ │ └── dir1_file0 ├── dir2/ │ ├── dir2_dir0/ │ ├── dir2_dir1/ │ │ └── dir2_dir1_file0 │ ├── dir2_dir2/ │ │ ├── dir2_dir2_file0 │ │ └── dir2_dir2_file1 │ ├── dir2_file0 │ └── dir2_file1 ├── file0 └── file1 """ }, { "depth": 1, "include_files": True, "include_hidden": True, "expected_stats_str": "0 datasets, 4 directories, 3 files", "expected_str": """ ├── .dir3/ ├── .file2 ├── dir0/ ├── dir1/ ├── dir2/ ├── file0 └── file1 """ }, { "depth": 3, "include_files": True, "include_hidden": True, "expected_stats_str": "0 datasets, 7 directories, 11 files", "expected_str": """ ├── .dir3/ │ ├── .dir3_file1 │ └── dir3_file0 ├── .file2 ├── dir0/ ├── dir1/ │ └── dir1_file0 ├── dir2/ │ ├── dir2_dir0/ │ ├── dir2_dir1/ │ │ └── dir2_dir1_file0 │ ├── dir2_dir2/ │ │ ├── dir2_dir2_file0 │ │ └── dir2_dir2_file1 │ ├── dir2_file0 │ └── dir2_file1 ├── file0 └── file1 """ }, { "depth": 1, "include_files": False, "include_hidden": True, "expected_stats_str": "0 datasets, 4 directories", "expected_str": """ ├── .dir3/ ├── dir0/ ├── dir1/ └── dir2/ """ }, { "depth": 3, 
"include_files": False, "include_hidden": True, "expected_stats_str": "0 datasets, 7 directories", "expected_str": """ ├── .dir3/ ├── dir0/ ├── dir1/ └── dir2/ ├── dir2_dir0/ ├── dir2_dir1/ └── dir2_dir2/ """ }, ] def test_print_tree( self, depth, include_files, include_hidden, expected_str ): root = str(self.path / "root") command = [ 'tree', root, '--depth', str(depth), '--include-hidden' if include_hidden else '', '--include-files' if include_files else '' ] _, actual_res, _ = get_tree_rendered_output(command) expected_res = expected_str.lstrip("\n") # strip first newline ui.message("expected:") ui.message(expected_res) ui.message("actual:") ui.message(actual_res) assert expected_res == actual_res def test_print_stats( self, depth, include_files, include_hidden, expected_stats_str ): root = str(self.path / 'root') command = [ 'tree', root, '--depth', str(depth), '--include-hidden' if include_hidden else '', '--include-files' if include_files else '' ] _, _, actual_res = get_tree_rendered_output(command) expected_res = expected_stats_str assert expected_res == actual_res @pytest.mark.parametrize( "root_dir_name", ["root/", "root/.", "root/./", "root/../root"] ) def test_root_path_is_normalized(self, root_dir_name): """ Test that root path in the first line of string output is normalized path """ root = str(self.path / root_dir_name) command = ['tree', root, '--depth', '0'] actual, _, _ = get_tree_rendered_output(command) expected = str(self.path / "root") assert expected == actual def test_no_difference_if_root_path_absolute_or_relative(self): """Tree output should be identical whether the root directory is given as absolute or relative path""" root = str(self.path / "root") output_abs_path = get_tree_rendered_output(['tree', root]) with chpwd(root): output_rel_path = get_tree_rendered_output(['tree', '.']) assert output_abs_path == output_rel_path def test_print_tree_depth_zero(self): root = str(self.path / "root") # including files should have no effect command = ['tree', root, '--depth', '0', '--include-files'] actual = get_tree_rendered_output(command) expected = (root, '', '0 datasets, 0 directories, 0 files') assert expected == actual def test_exhausted_levels_are_below_current_depth( self, depth, include_files, include_hidden): """For each node, the exhausted levels reported for that node should be smaller or equal to the node's depth""" results = TreeCommand.__call__( self.path, depth=depth, include_files=include_files, include_hidden=include_hidden, result_renderer="disabled", # return only 'depth' and 'exhausted_levels' from result dicts result_xfm=lambda res: {k: res[k] for k in ("depth", "exhausted_levels")} ) # sanity checks assert len(results) > 1 assert any(res["exhausted_levels"] for res in results) # actual test assert all(level <= res["depth"] for res in results for level in res["exhausted_levels"]) class TestTreeWithDatasets(TestTreeWithoutDatasets): """Test directory tree with datasets""" __test__ = True tree_with_ds = True # set `include_files` and `include_hidden` to False, # they should be already covered in `TestTreeWithoutDatasets` MATRIX = [ { "depth": 1, "include_files": False, "include_hidden": False, "expected_stats_str": "2 datasets, 1 directory", "expected_str": """ ├── repo0/ ├── [DS~0] superds0/ └── [DS~0] superds1/ """, }, { "depth": 4, "include_files": False, "include_hidden": False, "expected_stats_str": "7 datasets, 3 directories", "expected_str": """ ├── repo0/ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ │ └── [DS~2] sd0_sub0_subds0/ └── [DS~0] 
superds1/ ├── sd1_dir0/ │ ├── sd1_d0_repo0/ │ └── [DS~1] sd1_d0_subds0/ ├── [DS~0] sd1_ds0/ └── [DS~1] (not installed) sd1_subds0/ """, }, ] class TestDatasetTree(TestTree): """Test dataset tree with max_dataset_depth parameter""" __test__ = True tree_with_ds = True MATRIX = [ { "dataset_depth": 0, "depth": 0, "expected_stats_str": "3 datasets, 0 directories", "expected_str": """ ├── [DS~0] superds0/ └── [DS~0] superds1/ └── [DS~0] sd1_ds0/ """ }, { "dataset_depth": 0, "depth": 1, "expected_stats_str": "3 datasets, 1 directory", "expected_str": """ ├── [DS~0] superds0/ └── [DS~0] superds1/ ├── sd1_dir0/ └── [DS~0] sd1_ds0/ """ }, { "dataset_depth": 0, "depth": 2, "expected_stats_str": "3 datasets, 2 directories", "expected_str": """ ├── [DS~0] superds0/ └── [DS~0] superds1/ ├── sd1_dir0/ │ └── sd1_d0_repo0/ └── [DS~0] sd1_ds0/ """ }, { "dataset_depth": 1, "depth": 0, "expected_stats_str": "6 datasets, 1 directory", "expected_str": """ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ └── [DS~0] superds1/ ├── sd1_dir0/ │ └── [DS~1] sd1_d0_subds0/ ├── [DS~0] sd1_ds0/ └── [DS~1] (not installed) sd1_subds0/ """ }, { "dataset_depth": 1, "depth": 2, "expected_stats_str": "6 datasets, 2 directories", "expected_str": """ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ └── [DS~0] superds1/ ├── sd1_dir0/ │ ├── sd1_d0_repo0/ │ └── [DS~1] sd1_d0_subds0/ ├── [DS~0] sd1_ds0/ └── [DS~1] (not installed) sd1_subds0/ """ }, { "dataset_depth": None, "depth": 0, "expected_stats_str": "7 datasets, 1 directory", "expected_str": """ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ │ └── [DS~2] sd0_sub0_subds0/ └── [DS~0] superds1/ ├── sd1_dir0/ │ └── [DS~1] sd1_d0_subds0/ ├── [DS~0] sd1_ds0/ └── [DS~1] (not installed) sd1_subds0/ """ }, { "dataset_depth": None, "depth": 2, "expected_stats_str": "7 datasets, 2 directories", "expected_str": """ ├── [DS~0] superds0/ │ └── [DS~1] sd0_subds0/ │ └── [DS~2] sd0_sub0_subds0/ └── [DS~0] superds1/ ├── sd1_dir0/ │ ├── sd1_d0_repo0/ │ └── [DS~1] sd1_d0_subds0/ ├── [DS~0] sd1_ds0/ └── [DS~1] (not installed) sd1_subds0/ """ }, ] params = { "test_print_tree": [ "dataset_depth", "depth", "expected_str" ], "test_print_stats": [ "dataset_depth", "depth", "expected_stats_str" ] } def test_print_tree( self, dataset_depth, depth, expected_str ): root = str(self.path / "root") recursive_opts = ["--recursive"] if dataset_depth is not None: recursive_opts = ['--recursion-limit', str(dataset_depth)] command = [ 'tree', root, '--depth', str(depth), *recursive_opts ] _, actual_res, _ = get_tree_rendered_output(command) expected_res = expected_str.lstrip("\n") # strip first newline ui.message("expected:") ui.message(expected_res) ui.message("actual:") ui.message(actual_res) assert expected_res == actual_res def test_print_tree_without_datasets(self): """If there are no datasets, should only print the root""" root = str(self.path / "root" / "repo0") command = [ 'tree', root, '--depth', '10', '--recursive', '--include-files' ] _, actual_res, _ = get_tree_rendered_output(command) expected_res = "" ui.message("expected:") ui.message(expected_res) ui.message("actual:") ui.message(actual_res) assert expected_res == actual_res def test_print_stats( self, dataset_depth, depth, expected_stats_str ): root = str(self.path / "root") recursive_opts = ["--recursive"] if dataset_depth is not None: recursive_opts = ['--recursion-limit', str(dataset_depth)] command = [ 'tree', root, '--depth', str(depth), *recursive_opts ] _, _, actual_res = get_tree_rendered_output(command) expected_res = expected_stats_str assert 
expected_res == actual_res class TestTreeFilesystemIssues: """Test tree with missing permissions, broken symlinks, etc.""" def test_print_tree_fails_for_nonexistent_directory(self, tmp_path): """Obtain nonexistent directory by creating a temp dir and deleting it (may be safest method)""" with assert_raises(ValueError): Tree(tmp_path / 'nonexistent_dir', max_depth=1) @skip_if_root # see https://github.com/datalad/datalad-next/issues/525 @skip_if_on_windows @skip_wo_symlink_capability def test_print_tree_permission_denied(self, tmp_path): """ - If the tree contains a directory for which the user has no permissions (so it would not be possible to traverse it), a message should be displayed next to the affected directory path - The rest of the tree following the forbidden directory should be printed as usual - The command should return error exit status but not crash """ (tmp_path / 'z_dir' / 'subdir').mkdir(parents=True) forbidden_dir = tmp_path / 'a_forbidden_dir' forbidden_dir.mkdir(parents=True) # temporarily remove all permissions (octal 000) # restore permissions at the end, otherwise we can't delete temp dir with ensure_no_permissions(forbidden_dir): command = ['tree', str(tmp_path), '--depth', '2'] # expect exit code 1 _, actual, _ = get_tree_rendered_output(command, exit_code=1) expected = f""" ├── {forbidden_dir.name}/ [error opening dir] └── z_dir/ └── subdir/ """.lstrip("\n") ui.message("expected:") ui.message(expected) ui.message("actual:") ui.message(actual) assert expected == actual @skip_wo_symlink_capability @pytest.mark.parametrize("include_files", (True, False)) def test_tree_with_broken_symlinks(self, tmp_path, include_files): """Test that broken symlinks are reported as such""" # prep dir1 = tmp_path / 'real' / 'dir1' file1 = tmp_path / 'real' / 'dir1' / 'file1' dir1.mkdir(parents=True) file1.touch() (tmp_path / 'links').mkdir() # create symlinks # 1. broken symlink pointing to non-existent target link_to_nonexistent = tmp_path / 'links' / '1_link_to_nonexistent' link_to_nonexistent.symlink_to(tmp_path / 'nonexistent') ok_broken_symlink(link_to_nonexistent) # 2. broken symlink pointing to itself link_to_self = tmp_path / 'links' / '2_link_to_self' link_to_self.symlink_to(link_to_self) with assert_raises((RuntimeError, OSError)): # OSError on Windows # resolution should fail because of infinite loop link_to_self.resolve() # 3. good symlink pointing to existing directory link_to_dir1 = tmp_path / 'links' / '3_link_to_dir1' link_to_dir1.symlink_to(dir1, target_is_directory=True) ok_good_symlink(link_to_dir1) # 4. 
good symlink pointing to existing file link_to_file1 = tmp_path / 'links' / '4_link_to_file1' link_to_file1.symlink_to(file1) ok_good_symlink(link_to_file1) # test results dict using python API # implicitly also tests that command yields tree without crashing actual = TreeCommand.__call__( tmp_path, depth=None, # unlimited include_files=include_files, result_renderer="disabled", result_xfm=lambda res: (Path(res["path"]).name, res["is_broken_symlink"]), result_filter=lambda res: "is_broken_symlink" in res, return_type="list", on_failure="ignore" ) if include_files: expected = [ # (path, is_broken_symlink) (link_to_nonexistent.name, True), (link_to_self.name, True), (link_to_dir1.name, False), (link_to_file1.name, False) ] else: expected = [ (link_to_dir1.name, False) ] assert set(expected) == set(actual) @skip_if_root # see https://github.com/datalad/datalad-next/issues/525 @skip_if_on_windows @skip_wo_symlink_capability @pytest.mark.parametrize("include_files", (True, False)) def test_tree_with_broken_symlinks_to_inaccessible_targets( self, tmp_path, include_files): """Test that symlinks to targets underneath inaccessible directories are reported as broken, whereas symlinks to inaccessible file/directories themselves are not reported as broken.""" # prep root = tmp_path / "root" # tree root root.mkdir(parents=True) # create file and directory without permissions outside of tree # root (permissions will be removed later ad-hoc, because need to # create symlinks first) forbidden_file = tmp_path / "forbidden_file" forbidden_file.touch() # permissions will be removed later ad-hoc forbidden_dir = tmp_path / "forbidden_dir" forbidden_dir.mkdir() file_in_forbidden_dir = forbidden_dir / "file_in_forbidden_dir" file_in_forbidden_dir.touch() dir_in_forbidden_dir = forbidden_dir / "dir_in_forbidden_dir" dir_in_forbidden_dir.mkdir() # create symlinks # 1. broken symlink pointing to file under inaccessible directory link_to_file_in_forbidden_dir = root / "1_link_to_file_in_forbidden_dir" link_to_file_in_forbidden_dir.symlink_to(file_in_forbidden_dir) with ensure_no_permissions(forbidden_dir): with assert_raises(PermissionError): # resolution should fail because of missing permissions link_to_file_in_forbidden_dir.resolve(strict=True) # 2. broken symlink pointing to directory under inaccessible directory link_to_dir_in_forbidden_dir = root / "2_link_to_dir_in_forbidden_dir" link_to_dir_in_forbidden_dir.symlink_to(dir_in_forbidden_dir) with ensure_no_permissions(forbidden_dir): with assert_raises(PermissionError): # resolution should fail because of missing permissions link_to_dir_in_forbidden_dir.resolve(strict=True) # 3. good symlink pointing to existing but inaccessible directory link_to_forbidden_dir = root / "3_link_to_forbidden_dir" link_to_forbidden_dir.symlink_to(forbidden_dir, target_is_directory=True) with ensure_no_permissions(forbidden_dir): ok_good_symlink(link_to_forbidden_dir) # 4. 
good symlink pointing to existing but inaccessible file link_to_forbidden_file = root / "4_link_to_forbidden_file" link_to_forbidden_file.symlink_to(forbidden_file) with ensure_no_permissions(forbidden_file): ok_good_symlink(link_to_forbidden_file) # temporarily remove all permissions (octal 000) # restore permissions at the end, otherwise we can't delete temp dir with ensure_no_permissions(forbidden_dir), \ ensure_no_permissions(forbidden_file): # test results dict using python API # implicitly also tests that command yields tree without crashing actual = TreeCommand.__call__( root, depth=None, include_files=include_files, result_renderer="disabled", result_xfm=lambda res: (Path(res["path"]).name, res["is_broken_symlink"]), result_filter=lambda res: "is_broken_symlink" in res, return_type="list", on_failure="ignore" ) if include_files: expected = [ # (path, is_broken_symlink) (link_to_file_in_forbidden_dir.name, True), (link_to_dir_in_forbidden_dir.name, True), (link_to_forbidden_dir.name, False), (link_to_forbidden_file.name, False) ] else: expected = [ (link_to_forbidden_dir.name, False) ] assert set(expected) == set(actual) @skip_wo_symlink_capability def test_print_tree_with_recursive_symlinks(self, tmp_path): """ TODO: break down into separate tests - Symlinks targets are displayed in custom renderer output - We do not follow symlinks that point to directories underneath the tree root or its parent (to prevent duplicate subtrees) - Symlinks pointing to datasets are not considered dataset nodes themselves, but regular directories (to prevent duplicate counts of datasets) """ ds = get_deeply_nested_structure(str(tmp_path / 'superds')) # change current dir to create symlinks with relative path with chpwd(ds.path): # create symlink to a sibling directory of the tree # (should be recursed into) (tmp_path / 'ext_dir' / 'ext_subdir').mkdir(parents=True) Path('link2extdir').symlink_to(Path('..') / 'ext_dir', target_is_directory=True) # create symlink to grandparent of the tree root (should NOT # be recursed into) Path('link2parent').symlink_to(Path('..') / '..', target_is_directory=True) # create symlink to subdir of the tree root at depth > max_depth # (should be recursed into) deepdir = Path('subds_modified') / 'subdir' / 'deepdir' deepdir.mkdir() (deepdir / 'subdeepdir').mkdir() Path('link2deepdir').symlink_to(deepdir, target_is_directory=True) root = ds.path command = ["tree", "--depth", "2", root] _, actual_res, counts = get_tree_rendered_output(command) s = sep expected_res = f""" ├── directory_untracked/ │ └── link2dir/ -> ..{s}subdir ├── link2deepdir/ -> subds_modified{s}subdir{s}deepdir │ └── subdeepdir/ ├── link2dir/ -> subdir ├── link2extdir/ -> ..{s}ext_dir │ └── ext_subdir/ ├── link2parent/ -> ..{s}.. 
├── link2subdsdir/ -> subds_modified{s}subdir ├── link2subdsroot/ -> subds_modified ├── subdir/ └── [DS~1] subds_modified/ ├── link2superdsdir/ -> ..{s}subdir ├── subdir/ └── [DS~2] subds_lvl1_modified/ """.lstrip("\n") # Compare with output of 'tree' command # ui.message(counts) # import subprocess # subprocess.run(["tree", "-dlL", "2", root]) ui.message("expected:") ui.message(expected_res) ui.message("actual:") ui.message(actual_res) assert expected_res == actual_res datalad-next-1.4.1/datalad_next/commands/tree.py000066400000000000000000001323771462321624600216500ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See LICENSE file distributed along with the datalad_osf package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """'tree'-like command for visualizing dataset hierarchies""" from __future__ import annotations __docformat__ = "numpy" import logging from functools import ( wraps, lru_cache ) from os import readlink from pathlib import Path from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, Parameter, build_doc, eval_results, datasetmethod, get_status_dict, ) from datalad_next.exceptions import ( CapturedException, NoDatasetFound ) from datalad_next.iter_collections import iter_submodules from datalad_next.constraints import ( EnsureBool, EnsureDataset, EnsureInt, EnsurePath, EnsureRange, ) from datalad_next.utils import get_dataset_root from datalad_next.uis import ui_switcher as ui from datalad_next.datasets import ( Dataset, ) lgr = logging.getLogger('datalad.local.tree') @build_doc class TreeCommand(ValidatedInterface): """Visualize directory and dataset hierarchies This command mimics the UNIX/MS-DOS 'tree' utility to generate and display a directory tree, with DataLad-specific enhancements. It can serve the following purposes: 1. Glorified 'tree' command 2. Dataset discovery 3. Programmatic directory traversal *Glorified 'tree' command* The rendered command output uses 'tree'-style visualization:: /tmp/mydir ├── [DS~0] ds_A/ │ └── [DS~1] subds_A/ └── [DS~0] ds_B/ ├── dir_B/ │ ├── file.txt │ ├── subdir_B/ │ └── [DS~1] subds_B0/ └── [DS~1] (not installed) subds_B1/ 5 datasets, 2 directories, 1 file Dataset paths are prefixed by a marker indicating subdataset hierarchy level, like ``[DS~1]``. This is the absolute subdataset level, meaning it may also take into account superdatasets located above the tree root and thus not included in the output. If a subdataset is registered but not installed (such as after a non-recursive ``datalad clone``), it will be prefixed by ``(not installed)``. Only DataLad datasets are considered, not pure git/git-annex repositories. The 'report line' at the bottom of the output shows the count of displayed datasets, in addition to the count of directories and files. In this context, datasets and directories are mutually exclusive categories. By default, only directories (no files) are included in the tree, and hidden directories are skipped. Both behaviours can be changed using command options. Symbolic links are always followed. This means that a symlink pointing to a directory is traversed and counted as a directory (unless it potentially creates a loop in the tree). 
*Dataset discovery* Using the [CMD: ``--recursive`` CMD][PY: ``recursive`` PY] or [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY] option, this command generates the layout of dataset hierarchies based on subdataset nesting level, regardless of their location in the filesystem. In this case, tree depth is determined by subdataset depth. This mode is thus suited for discovering available datasets when their location is not known in advance. By default, only datasets are listed, without their contents. If [CMD: ``--depth`` CMD][PY: ``depth`` PY] is specified additionally, the contents of each dataset will be included up to [CMD: ``--depth`` CMD][PY: ``depth`` PY] directory levels (excluding subdirectories that are themselves datasets). Tree filtering options such as [CMD: ``--include-hidden`` CMD][PY: ``include_hidden`` PY] only affect which directories are reported as dataset contents, not which directories are traversed to find datasets. **Performance note**: since no assumption is made on the location of datasets, running this command with the [CMD: ``--recursive`` CMD][PY: ``recursive`` PY] or [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY] option does a full scan of the whole directory tree. As such, it can be significantly slower than a call with an equivalent output that uses [CMD: ``--depth`` CMD][PY: ``depth`` PY] to limit the tree instead. *Programmatic directory traversal* The command yields a result record for each tree node (dataset, directory or file). The following properties are reported, where available: "path" Absolute path of the tree node "type" Type of tree node: "dataset", "directory" or "file" "depth" Directory depth of node relative to the tree root "exhausted_levels" Depth levels for which no nodes are left to be generated (the respective subtrees have been 'exhausted') "count" Dict with cumulative counts of datasets, directories and files in the tree up until the current node. File count is only included if the command is run with the [CMD: ``--include-files`` CMD][PY: ``include_files`` PY] option. "dataset_depth" Subdataset depth level relative to the tree root. Only included for node type "dataset". "dataset_abs_depth" Absolute subdataset depth level. Only included for node type "dataset". "dataset_is_installed" Whether the registered subdataset is installed. Only included for node type "dataset". "symlink_target" If the tree node is a symlink, the path to the link target "is_broken_symlink" If the tree node is a symlink, whether it is a broken symlink """ result_renderer = 'tailored' _params_ = dict( path=Parameter( args=("path",), nargs='?', doc="""path to directory from which to generate the tree. Defaults to the current directory."""), depth=Parameter( args=("-L", "--depth",), doc="""limit the tree to maximum level of subdirectories. If not specified, will generate the full tree with no depth constraint. If paired with [CMD: ``--recursive`` CMD][PY: ``recursive`` PY] or [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY], refers to the maximum directory level to output below each dataset."""), recursive=Parameter( args=("-r", "--recursive",), doc="""produce a dataset tree of the full hierarchy of nested subdatasets. *Note*: may have slow performance on large directory trees.""", action='store_true'), recursion_limit=Parameter( args=("-R", "--recursion-limit",), metavar="LEVELS", doc="""limit the dataset tree to maximum level of nested subdatasets. 
0 means include only top-level datasets, 1 means top-level datasets and their immediate subdatasets, etc. *Note*: may have slow performance on large directory trees."""), include_files=Parameter( args=("--include-files",), doc="""include files in the tree""", action='store_true'), include_hidden=Parameter( args=("--include-hidden",), doc="""include hidden files/directories in the tree. This option does not affect which directories will be searched for datasets when specifying [CMD: ``--recursive`` CMD][PY: ``recursive`` PY] or [CMD: ``--recursion-limit`` CMD][PY: ``recursion_limit`` PY]. For example, datasets located underneath the hidden folder `.datalad` will be reported even if [CMD: ``--include-hidden`` CMD][PY: ``include_hidden`` PY] is omitted. """, action='store_true'), ) _validator_ = EnsureCommandParameterization(dict( path=EnsurePath(), depth=EnsureInt() & EnsureRange(min=0), recursive=EnsureBool(), recursion_limit=EnsureInt() & EnsureRange(min=0), include_files=EnsureBool(), include_hidden=EnsureBool(), )) _examples_ = [ dict(text="Show up to 3 levels of subdirectories below the current " "directory, including files and hidden contents", code_py="tree(depth=3, include_files=True, include_hidden=True)", code_cmd="datalad tree -L 3 --include-files --include-hidden"), dict(text="Find all top-level datasets located anywhere under ``/tmp``", code_py="tree('/tmp', recursion_limit=0)", code_cmd="datalad tree /tmp -R 0"), dict(text="Report all subdatasets recursively and their directory " "contents, up to 1 subdirectory deep within each " "dataset", code_py="tree(recursive=True, depth=1)", code_cmd="datalad tree -r -L 1"), ] @staticmethod @datasetmethod(name='tree') @eval_results def __call__( path='.', *, depth=None, recursive=False, recursion_limit=None, include_files=False, include_hidden=False): if recursive or recursion_limit is not None: # special tree defined by subdataset nesting depth tree_cls = DatasetTree dataset_tree_args = {"max_dataset_depth": recursion_limit} else: # simple tree defined by directory depth tree_cls = Tree dataset_tree_args = {} tree = tree_cls( path, max_depth=depth, exclude_node_func=build_excluded_node_func( include_hidden=include_hidden, include_files=include_files), **dataset_tree_args ) for node in tree.generate_nodes(): # yield one node at a time to improve UX / perceived speed res_dict = { "action": "tree", "path": str(node.path), "type": node.TYPE, "depth": node.depth, "exhausted_levels": list(tree.exhausted_levels), "count": { "datasets": tree.node_count["DatasetNode"], "directories": tree.node_count["DirectoryNode"], **({"files": tree.node_count["FileNode"]} if include_files else {}) }, } if node.TYPE == "dataset": res_dict.update({ "dataset_depth": node.ds_depth, "dataset_abs_depth": node.ds_absolute_depth, "dataset_is_installed": node.is_installed }) if node.is_symlink(): # TODO: should we inform if the symlink is recursive (as per # `tree.is_recursive_symlink()`) although not broken? The # UNIX 'tree' command shows the message '[recursive, # not followed]' next to the path. Not sure if this is # interesting at all or more confusing. 
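                # for symlink nodes, additionally report the link target and
                # whether the link is broken (see 'symlink_target' and
                # 'is_broken_symlink' in the result record documentation above)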
res_dict["symlink_target"] = node.get_symlink_target() res_dict["is_broken_symlink"] = node.is_broken_symlink() if node.exception is not None: # mimic error message of unix 'tree' command for # permission denied error, otherwise use exception short # message message = "error opening dir" \ if node.exception.name == "PermissionError" \ else node.exception.message yield get_status_dict( status="error", message=message, exception=node.exception, **res_dict ) else: yield get_status_dict( status="ok", **res_dict ) @staticmethod def custom_result_renderer(res, **kwargs): """ Each node is printed on one line. The string uses the format:: [] [] [] Example line:: │ │ ├── path_dir_level3 """ from datalad_next.uis import ansi_colors # get values from result record node_type = res["type"] node_path = res["path"] depth = res["depth"] exhausted_levels = res["exhausted_levels"] # build indentation string indentation = "" if depth > 0: indentation_symbols_for_levels = [ ("│" if level not in exhausted_levels else " ") + " " for level in range(1, depth) ] indentation = "".join(indentation_symbols_for_levels) # build prefix (tree branch tip) prefix = "" if depth > 0: # root node has no prefix is_last_child = depth in exhausted_levels prefix = "└──" if is_last_child else "├──" # build dataset marker if dataset ds_marker = "" if node_type == "dataset": ds_absolute_depth = res["dataset_abs_depth"] ds_is_installed = res["dataset_is_installed"] ds_marker_depth = ansi_colors.color_word( f"DS~{ds_absolute_depth}", ansi_colors.WHITE) install_flag = " (not installed)" if not ds_is_installed else "" ds_marker = f"[{ds_marker_depth}]" + install_flag # build path string with optional color # display only root directory with full path, all other nodes # with basename path = node_path if depth == 0 else Path(node_path).name color_for_type = { "dataset": ansi_colors.MAGENTA, "directory": ansi_colors.BLUE, "file": None, "broken_symlink": ansi_colors.RED } # ANSI color for the path, if terminal colors are enabled color = color_for_type[node_type] if color is not None: path = ansi_colors.color_word(path, color) if res.get("is_broken_symlink", False): path = ansi_colors.color_word(path, color_for_type["broken_symlink"]) # set suffix for directories dir_suffix = "" if depth > 0 and node_type in ("directory", "dataset"): dir_suffix = "/" # append symlink target if symlink symlink_target = "" if "symlink_target" in res: symlink_target = " -> " + res["symlink_target"] # add short error message if there was exception error_msg = "" if "exception" in res: error_msg = f" [{res['message']}]" line = indentation + \ " ".join((s for s in (prefix, ds_marker, path) if s != "")) + \ dir_suffix + symlink_target + error_msg ui.message(line) @staticmethod def custom_result_summary_renderer(res, **kwargs): """Print the summary 'report line' with count of nodes by type""" c_ds = res[-1]['count']['datasets'] c_dirs = res[-1]['count']['directories'] # files may not be included in results (if not using command # option '--include-files') c_files = res[-1]['count'].get('files') descriptions = [ f"{c_ds} " + ("dataset" if int(c_ds) == 1 else "datasets"), f"{c_dirs} " + ("directory" if int(c_dirs) == 1 else "directories") ] if c_files is not None: descriptions.append( f"{c_files} " + ("file" if int(c_files) == 1 else "files")) ui.message("\n" + ", ".join(descriptions)) def build_excluded_node_func(include_hidden=False, include_files=False): """Return a function to exclude ``_TreeNode`` objects from the tree (prevents them from being yielded by the 
node generator). Returns ------- Callable Function that takes the Path object of a ``_TreeNode`` as input, and returns true if the node should *not* be displayed in the tree. """ def is_excluded(node: _TreeNode): return any(( isinstance(node, FileNode) if not include_files else False, node.path.name.startswith(".") if not include_hidden else False )) return is_excluded def increment_node_count(node_generator_func): """Decorator for incrementing the node count whenever a ``_TreeNode`` is yielded. Parameters ---------- node_generator_func: Callable Function that yields ``_TreeNode`` objects """ @wraps(node_generator_func) def _wrapper(*args, **kwargs): self = args[0] # 'self' is a Tree instance for node in node_generator_func(*args, **kwargs): node_type = node.__class__.__name__ if node_type not in self.node_count: raise ValueError( f"No counts collected for unknown node type '{node_type}'" ) if node.depth > 0: # do not count the root directory # TODO: do not count symlinks if they point to # files/directories that are already included in the tree # (to prevent double counting)? Note that UNIX 'tree' does # count double. self.node_count[node_type] += 1 yield node # yield what the generator yielded return _wrapper def yield_with_last_item(generator): """Takes a generator and yields for each item, the item itself and whether it is the last item in the sequence. Returns ------- Tuple[bool, Any] A tuple (is_last_item, item) """ prev_val = next(generator, None) if prev_val is not None: for current_val in generator: yield False, prev_val prev_val = current_val yield True, prev_val def path_depth(path: Path, root: Path): """Calculate directory depth of a path relative to the given root. Can also be a negative integer if the path is a parent of the tree root. Returns ------- int Number of levels of the given path *below* the root (positive integer) or *above* the tree root (negative integer) Raises ------ ValueError Like ``path.relative_to()``, raises ``ValueError`` if the path is not relative to the root """ sign = 1 try: rpath = path.relative_to(root) except ValueError: try: rpath = root.relative_to(path) sign = -1 except ValueError: raise ValueError( "Could not calculate directory depth: " f"'{path}' is not relative to the tree root " f"'{root}' (or vice-versa)") return sign * len(rpath.parts) def is_empty_dir(path: Path): """Does not check that path is a directory (to avoid extra system calls)""" return not any(path.iterdir()) @lru_cache() def is_dataset(path: Path, installed_only=False): """Fast dataset detection. Infer that a directory is a dataset if it is either: - installed, or - not installed, but has an installed superdatset (only if argument ``installed_only`` is False) Only consider datalad datasets, not plain git/git-annex repos. Symlinks pointing to datasets are not resolved, so will always return False for symlinks. This prevents potentially detecting duplicate datasets if the symlink and its target are both included in the tree. Results are cached because the check is somewhat expensive and may be run multiple times on the same path. 
Parameters ---------- path: Path Path to directory to be identified as dataset or non-dataset installed_only: bool Whether to ignore datasets that are not installed """ try: if path.is_symlink(): # ignore symlinks even if pointing to datasets, otherwise we may # get duplicate counts of datasets lgr.debug("Path is a symlink, will not check if it points to a " "dataset: %s", path) return False if (path / ".datalad" / "config").is_file(): # could also query `ds.id`, but checking just for existence # of config file is quicker. return True # if it is not installed, check if it has an installed superdataset. # instead of querying ds.is_installed() (which checks if the # directory has the .git folder), we check if the directory # is empty (faster) -- as e.g. after a non-recursive `datalad clone` if not installed_only: if is_empty_dir(path): return get_superdataset(path) is not None except Exception as ex: # if anything fails (e.g. permission denied), we raise exception # instead of returning False. this can be caught and handled by the # caller. raise NoDatasetFound(f"Cannot determine if '{path.name}' is a " f"dataset") from ex return False @lru_cache() def get_subds_paths(ds_path: Path): """Return paths of immediate subdatasets for a given dataset path.""" # This is an expensive operation because it calls git to read the # submodules. Since we need to run it to (A) calculate dataset depth and # (B) detect non-installed datasets, we cache results, so that the list of # subdatasets is computed only once for each parent dataset. return [ str(ds_path / sm.path) for sm in iter_submodules(ds_path) ] @lru_cache() def get_dataset_root_datalad_only(path: Path): """Get root of dataset containing a given path (datalad datasets only, not pure git/git-annex repo) Parameters ---------- path: Path Path to file or directory Returns ------- Path """ ds_root = path while ds_root: potential_ds_root = get_dataset_root(str(ds_root)) if potential_ds_root is None: return None # we are not inside a dataset potential_ds_root = Path(potential_ds_root) if is_dataset(potential_ds_root, installed_only=True): return potential_ds_root # it's a match # we go one directory higher and try again ds_root = (potential_ds_root / "..").resolve(strict=True) return ds_root @lru_cache() def get_superdataset(path: Path): """Reimplementation of ``Dataset.get_superdataset()`` to allow caching results of `ds.subdatasets()` (the most expensive operation). 
Parameters ---------- path: Path Path to a dataset Returns ------- Dataset or None """ superds_path = None while path: parent_path = (path / "..").resolve(strict=True) sds_path_ = get_dataset_root_datalad_only(parent_path) if sds_path_ is None: # no more parents, use previous found break superds = Dataset(sds_path_) # test if path is registered subdataset of the parent if not str(path) in get_subds_paths(superds.pathobj): break # That was a good candidate superds_path = sds_path_ path = parent_path break if superds_path is None: # None was found return None return Dataset(superds_path) def is_path_relative_to(my_path: Path, other_path: Path): """Port of pathlib's ``Path.is_relative_to()`` (requires python3.9+)""" try: my_path.relative_to(other_path) return True except ValueError: return False class Tree: """Main class for generating and serializing a directory tree""" def __init__(self, root: Path, max_depth=None, exclude_node_func=None): """ Parameters ---------- root: Path Directory to be used as tree root max_depth: int or None Maximum directory depth for traversing the tree exclude_node_func: Callable or None Function to filter out tree nodes from the tree """ try: root = Path(root) self.root = root.resolve(strict=False) assert self.root.is_dir(), f"path is not a directory: {self.root}" except (AssertionError, OSError) as ex: # could be permission error raise ValueError(f"directory not found: '{root}'") from ex self.max_depth = max_depth if max_depth is not None and max_depth < 0: raise ValueError("max_depth must be >= 0") # set callable to exclude nodes from the tree, meaning they # will not be yielded by the node generator self.exclude_node_func = exclude_node_func or self.default_exclude_func # keep track of levels where the subtree is exhausted, i.e. we # have reached the last node of the current subtree. # this is needed for the custom results renderer, to display nodes # differently depending on whether they are the last child or not. self.exhausted_levels = set([]) # store dict with count of nodes for each node type, similar to the # tree command's 'report line' at the end of the output. # the node types (subclasses of ``_TreeNode``) are mutually exclusive, # so the sum of their counts equals to the total node count. # does not count the root itself, only the contents below the root. 
self.node_count = {node_type.__name__: 0 for node_type in _TreeNode.__subclasses__()} def __repr__(self): return self.__class__.__name__ + \ f"('{self.root}', max_depth={self.max_depth})" @staticmethod def default_exclude_func(node): """By default, exclude files and hidden directories from the tree""" return any( (isinstance(node, FileNode), node.path.name.startswith(".")) ) def path_depth(self, path: Path): return path_depth(path, self.root) def _generate_tree_nodes(self, dir_path: Path): """Recursively yield ``_TreeNode`` objects starting from ``dir_path`` Parameters ---------- dir_path: Path Directory from which to calculate the tree """ # yield current directory/dataset node current_depth = self.path_depth(dir_path) current_node = Node(dir_path, current_depth) yield current_node # check that we are within max_depth levels # (None means unlimited depth) if self.max_depth is None or \ current_depth < self.max_depth: if current_node.is_symlink() and \ current_node.is_recursive_symlink(self.max_depth): # if symlink points to directory that we may visit or may # have visited already, do not recurse into it lgr.debug("Symlink is potentially recursive, " "will not traverse target directory: %s", dir_path) return if current_node.exception is not None: # if some exception occurred when instantiating the node # (missing permissions etc), do not recurse into directory lgr.debug("Node has exception, will not traverse directory: " "%r", current_node) return # sort child nodes alphabetically # needs to be done *before* calling the exclusion function, # because the function may depend on sort order all_children = sorted(list(dir_path.iterdir())) child_depth = current_depth + 1 # generator to apply exclusion filter def children(): for child_path in all_children: child_node = Node(child_path, child_depth) if not self.exclude_node_func(child_node): yield child_node # exclusion function could be expensive to compute, so we # use a generator for child nodes. however, we need to be able # to detect the last child node within each subtree (needed for # displaying special end-of-subtree prefix). so we wrap the # generator in another 'lookahead' generator to detect the last # item. for is_last_child, child in yield_with_last_item(children()): if is_last_child: # last child of its subtree self.exhausted_levels.add(child_depth) else: self.exhausted_levels.discard(child_depth) # remove exhausted levels that are deeper than the # current depth (we don't need them anymore) levels = set(self.exhausted_levels) # copy self.exhausted_levels.difference_update( l for l in levels if l > child_depth ) if isinstance(child, (DirectoryNode, DatasetNode)): # recurse into subdirectories yield from self._generate_tree_nodes(child.path) else: # it's a file, just yield it yield child @increment_node_count def generate_nodes(self): """ Traverse a directory tree starting from the root path. Yields ``_TreeNode`` objects, each representing a directory or dataset or file. Nodes are traversed in depth-first order. Returns ------- Generator[_TreeNode] """ # because the node generator is recursive, we cannot directly # decorate it with `increment_node_count` (since it would count # twice whenever the function recurses). # so we decorate a separate function where we just yield from the # underlying generator. yield from self._generate_tree_nodes(self.root) class DatasetTree(Tree): """ ``DatasetTree`` is a ``Tree`` whose depth is determined primarily by the subdataset hierarchy level (parameter ``max_dataset_depth``). 
    Here, ``max_depth`` can also be specified, but it refers to the depth
    of each dataset's content. If this depth is 0, only datasets are
    reported, without any files or subdirectories underneath.

    Because of the different semantics of the ``max_depth`` parameter,
    this class is implemented as a separate subclass of ``Tree``.
    """
    def __init__(self, *args, max_dataset_depth=None, **kwargs):
        super().__init__(*args, **kwargs)
        # default max_dataset_depth 'None' means unlimited subdataset depth
        self.max_dataset_depth = max_dataset_depth
        if self.max_depth is None:
            # by default, do not include datasets' contents
            self.max_depth = 0
        # lazy initialization of list of datasets and their parents,
        # will be computed when generating nodes for the first time
        self.ds_nodes = []

    def __repr__(self):
        return self.__class__.__name__ + \
            f"('{self.root}', " \
            f"max_dataset_depth={self.max_dataset_depth}, " \
            f"max_depth={self.max_depth})"

    @increment_node_count
    def generate_nodes(self):
        # compute full list of dataset nodes and their parents upfront.
        # this requires an unlimited-depth tree traversal, so will
        # be the slowest operation
        if not self.ds_nodes:
            lgr.debug("Started computing dataset nodes for %r", self)
            self.ds_nodes = list(self.generate_dataset_nodes())
            lgr.debug("Finished computing dataset nodes for %r", self)

        if not self.ds_nodes:
            depth = 0  # no datasets to report on, just yield the root
        else:
            depth = max(node.depth for node in self.ds_nodes) + \
                self.max_depth  # max levels below the deepest dataset

        tree = Tree(
            self.root,
            max_depth=depth,
            exclude_node_func=self.exclude_func,
        )
        # synchronize exhausted levels with the main tree
        self.exhausted_levels = tree.exhausted_levels

        yield from tree.generate_nodes()

    def generate_dataset_nodes(self):
        """
        Generator of dataset nodes and their parent directories starting
        from below the tree root and up to ``max_dataset_depth`` levels.

        The assumption is that (super)datasets could be located at any
        level of the directory tree. Therefore, this function does a
        full-depth tree traversal to discover datasets.
Returns ------- Generator[DirectoryNode or DatasetNode] """ def is_excluded(n: _TreeNode): # assumption: we won't find datasets underneath the git folder return isinstance(n, FileNode) or \ (isinstance(n, DirectoryNode) and n.path.name == ".git") # keep track of traversed nodes # (needed to prevent yielding duplicates) visited = set([]) ds_tree = Tree( self.root, max_depth=None, # unlimited depth, datasets could be anywhere exclude_node_func=is_excluded, ) nodes_below_root = ds_tree.generate_nodes() next(nodes_below_root) # skip root node for node in nodes_below_root: # for each dataset node, yield its parents first, then # yield the dataset itself if isinstance(node, DatasetNode) and \ (self.max_dataset_depth is None or node.ds_depth <= self.max_dataset_depth) and \ not self.exclude_node_func(node): # yield parent directories if not already done parents_below_root = node.parents[1:] # first parent is root for par_depth, par_path in enumerate(parents_below_root): parent = Node(par_path, par_depth) if parent not in visited: visited.add(parent) yield parent visited.add(node) yield node def exclude_func(self, node): """Exclusion function for pruning the main tree""" include, exclude = False, True # prevent headaches try: if node in self.ds_nodes: # we hit a dataset or the parent of a dataset return include # if `max_depth` is specified for returning dataset contents, # exclude non-dataset nodes below a dataset that have # depth (relative to parent dataset) > max_depth if self.max_depth > 0 and \ not isinstance(node, DatasetNode): # check that node is the child of a dataset ds_parent = self._find_closest_ds_parent(node) if ds_parent is not None: rel_depth = node.depth - ds_parent.depth exceeds_max_depth = rel_depth > self.max_depth # also filter by the user-supplied # exclusion logic in `exclude_node_func` return exceeds_max_depth or \ self.exclude_node_func(node) except Exception as ex: CapturedException(ex, level=10) # DEBUG level lgr.debug("Excluding node from tree because " "an exception occurred while applying " "exclusion filter: %r", node) return exclude # exclude by default def _find_closest_ds_parent(self, node): ds_parent = None for parent_path in node.path.parents: # bottom-up order ds_parent = next((n for n in self.ds_nodes if n.path == parent_path and isinstance(n, DatasetNode)), None) if ds_parent is not None: break return ds_parent class _TreeNode: """Base class for a directory or file represented as a single tree node and printed as single line of the 'tree' output.""" TYPE = None # needed for command result dict def __init__(self, path: Path, depth: int, exception: CapturedException | None = None): """ Parameters ---------- path: Path Path of the tree node depth: int Directory depth of the node within its tree exception: CapturedException Exception that may have occurred at validation/creation """ self.path = path self.depth = depth self.exception = exception def __eq__(self, other): return self.path == other.path def __hash__(self): return hash(str(self.path)) def __repr__(self): return f"{self.__class__.__name__}('{self.path}', depth={self.depth})" @property def tree_root(self) -> Path: """Calculate tree root path from node path and depth""" parents = self.parents return parents[0] if parents \ else self.path # we are the root @property # More accurate annotation only from PY3.9 onwards # def parents(self) -> list[Path]: def parents(self) -> list: """List of parent paths in top-down order beginning from the tree root. Assumes the node path to be already normalized. 
Returns ------- List[Path] """ parents_from_tree_root = [] for depth, path in enumerate(self.path.parents): if depth >= self.depth: break parents_from_tree_root.append(path) return parents_from_tree_root[::-1] # top-down order def is_symlink(self) -> bool: """Check if node path is a symlink""" try: if self.path.is_symlink(): return True except Exception as ex: # could fail because of permission issues etc. # in which case we just default to False self.exception = CapturedException(ex, level=10) return False def get_symlink_target(self) -> str: """If node path is a symlink, get link target as string. Otherwise, return None. Does not check that target path exists.""" try: if self.is_symlink(): # use os.readlink() instead of Path.readlink() for # Python <3.9 compatibility return readlink(str(self.path)) except Exception as ex: self.exception = CapturedException(ex, level=10) def is_broken_symlink(self) -> bool: """If node path is a symlink, check if it points to a nonexisting or inaccessible target or to itself (self-referencing link). Raise exception if the node path is not a symlink.""" if not self.is_symlink(): raise ValueError("Node path is not a symlink, cannot check if " f"symlink is broken: {self.path}") try: self.path.resolve(strict=True) return False except FileNotFoundError: # target does not exist return True except PermissionError: # target exists but is not accessible return True except (RuntimeError, OSError): # symlink loop (OSError on Windows) return True except Exception as ex: # probably broken in some other way self.exception = CapturedException(ex, level=10) return True def is_recursive_symlink(self, max_depth) -> bool: """Detect symlink pointing to a directory within the same tree (directly or indirectly). The default behaviour is to follow symlinks when traversing the tree. However, we should not follow symlinks to directories that we may visit or have visited already, i.e. are also located under the tree root or any parent of the tree root (within a distance of ``max_depth``). Otherwise, the same subtree could be generated multiple times in different places, potentially in a recursive loop (e.g. if the symlink points to its parent). This is similar to the logic of the UNIX 'tree' command, but goes a step further to prune all duplicate subtrees. Parameters ---------- max_depth Max depth of the ``Tree`` to which this node belongs """ if not self.is_symlink(): raise ValueError("Node path is not a symlink, cannot check if " f"symlink is recursive: {self.path}") if isinstance(self, FileNode): # we are only interested in symlinks pointing to a directory return False if self.is_broken_symlink(): # cannot identify target, no way to know if link is recursive return False target_dir = self.path.resolve() tree_root = self.tree_root # either: # - target dir is within `max_depth` levels beneath the tree # root, so it will likely be yielded or has already been # yielded (bar any exclusion filters) # - target dir is a parent of the tree root, so we may still # get into a loop if we recurse more than `max_depth` levels try: rel_depth = abs(path_depth(target_dir, tree_root)) return max_depth is None or \ rel_depth <= max_depth except ValueError: # cannot compute path depth because target is outside # of the tree root, so no loop is possible return False class Node: """ Factory class for creating a ``_TreeNode`` of a particular subclass. Detects whether the path is a file or a directory or dataset, and handles any exceptions (permission errors, broken symlinks, etc.) 
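    A minimal usage sketch (the outcome depends on what exists at the
    given path, so this is illustrative rather than a verified doctest)::

        # 'some/path' is a hypothetical location
        node = Node(Path('some/path'), depth=1)
        # the factory returns an instance of one of the _TreeNode subclasses
        assert isinstance(node, (DirectoryNode, DatasetNode, FileNode))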
""" def __new__(cls, path: Path, depth: int, **kwargs): if not isinstance(path, Path): raise ValueError("path must be a Path object") node_cls = FileNode captured_ex = None try: if path.is_dir(): if is_dataset(path): node_cls = DatasetNode else: node_cls = DirectoryNode except NoDatasetFound as ex: # means 'is_dataset()' failed # default to directory node # just log the exception, do not set it as node attribute CapturedException(ex, level=10) node_cls = DirectoryNode except Exception as ex: # means 'is_dir()' failed # default to file node # set exception as node attribute captured_ex = CapturedException(ex, level=10) return node_cls(path, depth, exception=captured_ex, **kwargs) class DirectoryNode(_TreeNode): TYPE = "directory" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) try: # get first child if exists. this is a check for whether # we can potentially recurse into the directory or # if there are any filesystem issues (permissions errors, etc) any(self.path.iterdir()) except OSError as ex: # permission errors etc. are logged and stored as node # attribute so they can be passed to results dict. # this will overwrite any exception passed to the constructor, # since we assume that this exception is closer to the root # cause. self.exception = CapturedException(ex, level=10) # DEBUG level class FileNode(_TreeNode): TYPE = "file" class DatasetNode(_TreeNode): TYPE = "dataset" def __init__(self, *args, **kwargs): """Does not check if valid dataset. This needs to be done before creating the instance.""" super().__init__(*args, **kwargs) try: self.ds = EnsureDataset(installed=None)(self.path).ds self.is_installed = self.ds.is_installed() self.ds_depth, self.ds_absolute_depth = self.calculate_dataset_depth() except Exception as ex: if self.exception is not None: # only if exception has not already been passed to constructor self.exception = CapturedException(ex, level=10) @lru_cache() def calculate_dataset_depth(self): """ Calculate 2 measures of a dataset's nesting depth/level: 1. ``ds_depth``: subdataset depth relative to the tree root 2. ``ds_absolute_depth``: absolute subdataset depth in the full hierarchy, potentially taking into account parent datasets at levels above the tree root Returns ------- Tuple[int, int] Tuple of relative dataset depth and absolute dataset depth """ ds_depth = 0 ds_absolute_depth = 0 ds = self.ds while ds: superds = get_superdataset(ds.pathobj) if superds is None: # it is not a dataset, do nothing break else: if superds == ds: # it is a top-level dataset, we are done break ds_absolute_depth += 1 if is_path_relative_to(superds.pathobj, self.tree_root): # if the parent dataset is underneath the tree # root, we increment the relative depth ds_depth += 1 ds = superds return ds_depth, ds_absolute_depth datalad-next-1.4.1/datalad_next/config/000077500000000000000000000000001462321624600177665ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/config/__init__.py000066400000000000000000000003641462321624600221020ustar00rootroot00000000000000"""Configuration query and manipulation This modules provides the central ``ConfigManager`` class. .. currentmodule:: datalad_next.config .. 
autosummary:: :toctree: generated ConfigManager """ from datalad.config import ConfigManager datalad-next-1.4.1/datalad_next/config/tests/000077500000000000000000000000001462321624600211305ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/config/tests/__init__.py000066400000000000000000000000001462321624600232270ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/config/tests/test_core.py000066400000000000000000000005051462321624600234710ustar00rootroot00000000000000from datalad.tests.test_config import * # this datalad-core test is causing a persistent git config modification # this is not legal on datalad-next, we must wrap and protect _test_cross_cfgman_update = test_cross_cfgman_update def test_cross_cfgman_update(datalad_cfg, tmp_path): _test_cross_cfgman_update(tmp_path) datalad-next-1.4.1/datalad_next/config/tests/test_utils.py000066400000000000000000000131631462321624600237050ustar00rootroot00000000000000 import pytest from .. import utils # for patching environ from ..utils import ( get_gitconfig_items_from_env, set_gitconfig_items_in_env, ) def test_get_gitconfig_items_from_env(monkeypatch): with monkeypatch.context() as m: # without the COUNT the rest does not matter and we always # get an empty dict m.delenv('GIT_CONFIG_COUNT', raising=False) assert get_gitconfig_items_from_env() == {} with monkeypatch.context() as m: # setting zero items, also makes everything else irrelevant m.setenv('GIT_CONFIG_COUNT', '0') assert get_gitconfig_items_from_env() == {} with monkeypatch.context() as m: # predictable error for botched count m.setenv('GIT_CONFIG_COUNT', 'rubbish') with pytest.raises(ValueError) as e: get_gitconfig_items_from_env() assert 'bogus count in GIT_CONFIG_COUNT' in str(e) # bunch of std error conditions for env, excstr in ( ({'GIT_CONFIG_COUNT': 1, 'GIT_CONFIG_KEY_0': 'section.name'}, 'missing config value'), ({'GIT_CONFIG_COUNT': 1, 'GIT_CONFIG_VALUE_0': 'value'}, 'missing config key'), ({'GIT_CONFIG_COUNT': 1, 'GIT_CONFIG_KEY_0': '', 'GIT_CONFIG_VALUE_0': 'value'}, 'empty config key'), ({'GIT_CONFIG_COUNT': 1, 'GIT_CONFIG_KEY_0': 'nosection', 'GIT_CONFIG_VALUE_0': 'value'}, 'does not contain a section'), ): with monkeypatch.context() as m: m.setattr(utils, 'environ', env) with pytest.raises(ValueError) as e: get_gitconfig_items_from_env() assert excstr in str(e) # proper functioning for env, target in ( ({'GIT_CONFIG_COUNT': 1, 'GIT_CONFIG_KEY_0': 'section.name', 'GIT_CONFIG_VALUE_0': 'value'}, {'section.name': 'value'}), ({'GIT_CONFIG_COUNT': 2, 'GIT_CONFIG_KEY_0': 'section.name1', 'GIT_CONFIG_VALUE_0': 'value1', 'GIT_CONFIG_KEY_1': 'section.name2', 'GIT_CONFIG_VALUE_1': 'value2'}, {'section.name1': 'value1', 'section.name2': 'value2'}), # double-specification appends # ❯ GIT_CONFIG_COUNT=2 \ # GIT_CONFIG_KEY_0=section.name \ # GIT_CONFIG_VALUE_0=val1 \ # GIT_CONFIG_KEY_1=section.name \ # GIT_CONFIG_VALUE_1=val2 \ # git config --list --show-origin | grep 'command line:' # command line: section.name=val1 # command line: section.name=val2 ({'GIT_CONFIG_COUNT': 3, 'GIT_CONFIG_KEY_0': 'section.name', 'GIT_CONFIG_VALUE_0': 'value0', 'GIT_CONFIG_KEY_1': 'section.name', 'GIT_CONFIG_VALUE_1': 'value1', 'GIT_CONFIG_KEY_2': 'section.name', 'GIT_CONFIG_VALUE_2': 'value2'}, {'section.name': ('value0', 'value1', 'value2')}), ): with monkeypatch.context() as m: m.setattr(utils, 'environ', env) assert get_gitconfig_items_from_env() == target def test_set_gitconfig_items_in_env(monkeypatch): for start, items, target in ( # giving nothing preserves statusquo ({}, 
{}, {}), ({'DUMMY': 'value'}, {}, {'DUMMY': 'value'}), # fixable specification is cleaned up ({'GIT_CONFIG_COUNT': '526'}, {}, {}), # but it has limits ({'GIT_CONFIG_COUNT': 'nochance'}, {}, {'GIT_CONFIG_COUNT': 'nochance'}), # and there is no exhaustive search ({'GIT_CONFIG_KEY_3': 'dummy'}, {}, {'GIT_CONFIG_KEY_3': 'dummy'}), # virgin territory ({}, {'section.name': 'value'}, {'GIT_CONFIG_COUNT': '1', 'GIT_CONFIG_KEY_0': 'section.name', 'GIT_CONFIG_VALUE_0': 'value'}), # "set" means "replace, not amend ({'GIT_CONFIG_COUNT': '1', 'GIT_CONFIG_KEY_0': 'section.name', 'GIT_CONFIG_VALUE_0': 'value'}, {'altsection.name2': 'value2'}, {'GIT_CONFIG_COUNT': '1', 'GIT_CONFIG_KEY_0': 'altsection.name2', 'GIT_CONFIG_VALUE_0': 'value2'}), # full cleanupage ({'GIT_CONFIG_COUNT': '2', 'GIT_CONFIG_KEY_0': 'section.name', 'GIT_CONFIG_VALUE_0': 'value', 'GIT_CONFIG_KEY_1': 'altsection.name2', 'GIT_CONFIG_VALUE_1': 'value2'}, {}, {}), # multi-value support, order preserved ({}, {'section.name': ('c', 'a', 'b')}, {'GIT_CONFIG_COUNT': '3', 'GIT_CONFIG_KEY_0': 'section.name', 'GIT_CONFIG_VALUE_0': 'c', 'GIT_CONFIG_KEY_1': 'section.name', 'GIT_CONFIG_VALUE_1': 'a', 'GIT_CONFIG_KEY_2': 'section.name', 'GIT_CONFIG_VALUE_2': 'b'}), ): with monkeypatch.context() as m: env = dict(start) m.setattr(utils, 'environ', env) set_gitconfig_items_in_env(items) assert env == target def test_get_set_gitconfig_env_roundtrip(monkeypatch): items = {'section.name': ('c', 'a', 'b'), 'space section.na me.so me': 'v al'} with monkeypatch.context() as m: env = {} m.setattr(utils, 'environ', env) # feed in copy to ensure validity of the test set_gitconfig_items_in_env(dict(items)) assert get_gitconfig_items_from_env() == items datalad-next-1.4.1/datalad_next/config/utils.py000066400000000000000000000075001462321624600215020ustar00rootroot00000000000000from __future__ import annotations from os import environ from typing import ( Dict, Mapping, Tuple, ) def get_gitconfig_items_from_env() -> Mapping[str, str | Tuple[str, ...]]: """Parse git-config ENV (``GIT_CONFIG_COUNT|KEY|VALUE``) and return as dict This implementation does not use ``git-config`` directly, but aims to mimic its behavior with respect to parsing the environment as much as possible. Raises ------ ValueError Whenever ``git-config`` would also error out, and includes an message in the respective exception that resembles ``git-config``'s for that specific case. Returns ------- dict Configuration key-value mappings. When a key is declared multiple times, the respective values are aggregated in reported as a tuple for that specific key. """ items: Dict[str, str | Tuple[str, ...]] = {} for k, v in ((_get_gitconfig_var_from_env(i, 'key'), _get_gitconfig_var_from_env(i, 'value')) for i in range(_get_gitconfig_itemcount())): val = items.get(k) if val is None: items[k] = v elif isinstance(val, tuple): items[k] = val + (v,) else: items[k] = (val, v) return items def _get_gitconfig_itemcount() -> int: try: return int(environ.get('GIT_CONFIG_COUNT', '0')) except (TypeError, ValueError) as e: raise ValueError("bogus count in GIT_CONFIG_COUNT") from e def _get_gitconfig_var_from_env(nid: int, kind: str) -> str: envname = f'GIT_CONFIG_{kind.upper()}_{nid}' var = environ.get(envname) if var is None: raise ValueError(f"missing config {kind} {envname}") if kind != 'key': return var if not var: raise ValueError(f"empty config key {envname}") if '.' 
not in var: raise ValueError(f"key {envname} does not contain a section: {var}") return var def set_gitconfig_items_in_env(items: Mapping[str, str | Tuple[str, ...]]): """Set git-config ENV (``GIT_CONFIG_COUNT|KEY|VALUE``) from a mapping Any existing declaration of configuration items in the environment is replaced. Any ENV variable of a *valid* existing declaration is removed, before the set configuration items are posted in the ENV. Multi-value configuration keys are supported (values provided as a tuple). Any item with a value of ``None`` will be posted into the ENV with an empty string as value, i.e. the corresponding ``GIT_CONFIG_VALUE_{count}`` variable will be an empty string. ``None`` item values indicate that the configuration key was unset on the command line, via the global option ``-c``. No verification (e.g., of syntax compliance) is performed. """ _clean_env_from_gitconfig_items() count = 0 for key, value in items.items(): # homogeneous processing of multiple value items, and single values values = value if isinstance(value, tuple) else (value,) for v in values: environ[f'GIT_CONFIG_KEY_{count}'] = key # we support None even though not an allowed input type, because # of https://github.com/datalad/datalad/issues/7589 # this can be removed, when that issue is resolved. environ[f'GIT_CONFIG_VALUE_{count}'] = '' if v is None else str(v) count += 1 if count: environ['GIT_CONFIG_COUNT'] = str(count) def _clean_env_from_gitconfig_items(): # we only care about intact specifications here, if there was cruft # to start with, we have no responsibilities try: count = _get_gitconfig_itemcount() except ValueError: return for i in range(count): environ.pop(f'GIT_CONFIG_KEY_{i}', None) environ.pop(f'GIT_CONFIG_VALUE_{i}', None) environ.pop('GIT_CONFIG_COUNT', None) datalad-next-1.4.1/datalad_next/conftest.py000066400000000000000000000042251462321624600207230ustar00rootroot00000000000000from datalad.conftest import setup_package # fixture setup from datalad_next.tests.fixtures import ( # no test can leave global config modifications behind check_gitconfig_global, # no test can leave secrets behind check_plaintext_keyring, # function-scope credential manager credman, # function-scope config manager datalad_cfg, # function-scope UI wrapper that can provide staged responses datalad_interactive_ui, # function-scope UI wrapper that can will raise when asked for responses datalad_noninteractive_ui, # function-scope temporary keyring tmp_keyring, # function-scope, Dataset instance dataset, # function-scope, Dataset instance with underlying repository existing_dataset, # function-scope, Dataset instance with underlying Git-only repository existing_noannex_dataset, # session-scope, Dataset instance with various modifications, # to-be-treated read-only modified_dataset, # session-scope, standard http credential (full dict) http_credential, # function-scope, auth-less HTTP server http_server, # function-scope, HTTP server with required authentication http_server_with_basicauth, # function-scope relay httpbin_service, unless undesired and skips instead httpbin, # session-scope HTTPBIN instance startup and URLs httpbin_service, # function-scope, disabled datalad command result rendering for all # command calls no_result_rendering, # session-scope redirection of log messages reduce_logging, # session-scope determine setup of an SSH server to use for testing sshserver_setup, # function-scope SSH server base url and local path sshserver, # session-scope, standard webdav credential (full dict) 
webdav_credential, # function-scope, serve a local temp-path via WebDAV webdav_server, ) from datalad_next.iter_collections.tests.test_itertar import ( # session-scope, downloads a tarball with a set of standard # file/dir/link types sample_tar_xz, ) from datalad_next.iter_collections.tests.test_iterzip import ( # session-scope, create a sample zip file sample_zip, ) datalad-next-1.4.1/datalad_next/constraints/000077500000000000000000000000001462321624600210705ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/constraints/__init__.py000066400000000000000000000066421462321624600232110ustar00rootroot00000000000000"""Data validation, coercion, and parameter documentation This module provides a set of uniform classes to validate and document particular aspects of inputs. In a nutshell, each of these :class:`~datalad_next.constraints.Constraint` class: - focuses on a specific aspect, such as data type coercion, or checking particular input properties - is instantiated with a set of parameters to customize such an instance for a particular task - performs its task by receiving an input via its ``__call__()`` method - provides default auto-documentation that can be customized by wrapping an instance in :class:`~datalad_next.constraints.WithDescription` Individual ``Constraint`` instances can be combined with logical AND (:class:`~datalad_next.constraints.AllOf`) and OR (:class:`~datalad_next.constraints.AnyOf`) operations to form arbitrarily complex constructs. On (validation/coercion) error, instances raise :class:`~datalad_next.constraints.ConstraintError`) via their ``raise_for()`` method. This approach to error reporting helps to communicate standard (yet customizable) error messages, aids structured error reporting, and is capable of communication the underlying causes of an error in full detail without the need to generate long textual descriptions. :class:`~datalad_next.constraints.EnsureCommandParameterization` is a particular variant of a ``Constraint`` that is capable of validating a complete parameterization of a command (or function), for each parameter individually, and for arbitrary combinations of parameters. It puts a particular emphasis on structured error reporting. .. currentmodule:: datalad_next.constraints .. 
autosummary:: :toctree: generated Constraint AllOf AnyOf NoConstraint WithDescription ConstraintError CommandParametrizationError ParameterConstraintContext EnsureDataset DatasetParameter EnsureBool EnsureCallable EnsureChoice EnsureFloat EnsureHashAlgorithm EnsureDType EnsureInt EnsureKeyChoice EnsureNone EnsurePath EnsureStr EnsureStrPrefix EnsureRange EnsureValue EnsureIterableOf EnsureListOf EnsureTupleOf EnsureMapping EnsureGeneratorFromFileLike EnsureJSON EnsureURL EnsureParsedURL EnsureGitRefName EnsureRemoteName EnsureSiblingName EnsureCommandParameterization """ from .base import ( AllOf, AnyOf, Constraint, DatasetParameter, ) from .exceptions import ( # this is the key type, almost all consuming code will want to # have this for `except` clauses ConstraintError, CommandParametrizationError, ParameterConstraintContext, ) # expose constraints with direct applicability, but not # base and helper classes from .basic import ( EnsureBool, EnsureCallable, EnsureChoice, EnsureFloat, EnsureHashAlgorithm, EnsureDType, EnsureInt, EnsureKeyChoice, EnsureNone, EnsurePath, EnsureStr, EnsureStrPrefix, EnsureRange, EnsureValue, NoConstraint, ) from .compound import ( EnsureIterableOf, EnsureListOf, EnsureTupleOf, EnsureMapping, EnsureGeneratorFromFileLike, WithDescription, ) from .formats import ( EnsureJSON, EnsureURL, EnsureParsedURL, ) from .dataset import EnsureDataset from .git import ( EnsureGitRefName, EnsureRemoteName, EnsureSiblingName, ) from .parameter import ( EnsureCommandParameterization, ) datalad-next-1.4.1/datalad_next/constraints/base.py000066400000000000000000000210241462321624600223530ustar00rootroot00000000000000"""Base classes for constraints and their logical connectives """ from __future__ import annotations __docformat__ = 'restructuredtext' __all__ = ['Constraint', 'AllOf', 'AnyOf', 'DatasetParameter'] from .exceptions import ConstraintError class DatasetParameter: """Utility class to report an original and resolve dataset parameter value This is used by `EnsureDataset` to be able to report the original argument semantics of a dataset parameter to a receiving command. It is consumed by any ``Constraint.for_dataset()``. The original argument is provided via the `original` property. A corresponding `Dataset` instance is provided via the `ds` property. """ def __init__(self, original, ds): self.original = original self.ds = ds def __repr__(self): return f'{self.__class__.__name__}({self.original}, {self.ds})' class Constraint: """Base class for value coercion/validation. These classes are also meant to be able to generate appropriate documentation on an appropriate parameter value. """ def __str__(self): """Rudimentary self-description""" return f"constraint: {self.short_description()}" def __repr__(self): """Rudimentary repr to avoid default scary to the user Python repr""" return f"{self.__class__.__name__}()" def raise_for(self, value, msg, **ctx) -> None: """Convenience method for raising a ``ConstraintError`` The parameters are identical to those of ``ConstraintError``. This method merely passes the ``Constraint`` instance as ``self`` to the constructor. 
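        A minimal sketch of the intended usage from within a custom
        ``Constraint`` subclass (the validation logic shown here is
        purely illustrative)::

            def __call__(self, value):
                if not str(value).isdigit():
                    self.raise_for(value, 'must contain only digits')
                return value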
""" if ctx: raise ConstraintError(self, value, msg, ctx) else: raise ConstraintError(self, value, msg) def __and__(self, other): return AllOf(self, other) def __or__(self, other): return AnyOf(self, other) def __call__(self, value): # do any necessary checks or conversions, potentially catch exceptions # and generate a meaningful error message raise NotImplementedError("abstract class") @property def input_synopsis(self) -> str: """Returns brief, single line summary of valid input for a constraint This information is user-facing, and to be used in any place where space is limited (tooltips, usage summaries, etc). If possible, the synopsis should be written in a UI/API-agnostic fashion. However, if this is impossible or leads to imprecisions or confusion, it should focus on use within Python code and with Python data types. Tailored documentation can be provided via the ``WithDescription`` wrapper. """ # return the legacy short description for now return self.short_description() @property def input_description(self) -> str: """Returns full description of valid input for a constraint Like ``input_synopsis`` this information is user-facing. In contrast, to the synopsis there is length/line limit. Nevertheless, the information should be presented in a compact fashion that avoids needless verbosity. If possible, a single paragraph is a good format. If multiple paragraphs are necessary, they should be separated by a single, empty line. Rendering code may indent, or rewrap the text, so no line-by-line formatting will be preserved. If possible, the synopsis should be written in a UI/API-agnostic fashion. However, if this is impossible or leads to imprecisions or confusion, it should focus on use within Python code and with Python data types. Tailored documentation can be provided via the ``WithDescription`` wrapper. """ # return the legacy short description for now return self.long_description() def long_description(self): """This method is deprecated. Use ``input_description`` instead""" # return meaningful docs or None # used as a comprehensive description in the parameter list return self.short_description() def short_description(self): """This method is deprecated. Use ``input_synopsis`` instead""" # return meaningful docs or None # used as a condensed primer for the parameter lists raise NotImplementedError("abstract class") def for_dataset(self, dataset: DatasetParameter) -> Constraint: """Return a constraint-variant for a specific dataset context The default implementation returns the unmodified, identical constraint. However, subclasses can implement different behaviors. """ return self class _MultiConstraint(Constraint): """Helper class to override the description methods to reported multiple constraints """ def __init__(self, *constraints): # TODO Why is EnsureNone needed? Remove if possible from .basic import EnsureNone self._constraints = [ EnsureNone() if c is None else c for c in constraints ] def __repr__(self): creprs = ', '.join(f'{c!r}' for c in self.constraints) return f"{self.__class__.__name__}({creprs})" @property def constraints(self): return self._constraints def _get_description(self, attr: str, operation: str) -> str: cs = [ getattr(c, attr)() for c in self.constraints if hasattr(c, attr) ] cs = [c for c in cs if c is not None] doc = f' {operation} '.join(cs) if len(cs) > 1: return f'({doc})' else: return doc class AnyOf(_MultiConstraint): """Logical OR for constraints. An arbitrary number of constraints can be given. 
They are evaluated in the order in which they were specified. The value returned by the first constraint that does not raise an exception is the global return value. Documentation is aggregated for all alternative constraints. """ def __init__(self, *constraints): """ Parameters ---------- *constraints Alternative constraints """ super().__init__(*constraints) def __or__(self, other): constraints = list(self.constraints) if isinstance(other, AnyOf): constraints.extend(other.constraints) else: constraints.append(other) return AnyOf(*constraints) def __call__(self, value): e_list = [] for c in self.constraints: try: return c(value) except Exception as e: e_list.append(e) self.raise_for( value, # plural OK, no sense in having 1 "alternative" 'does not match any of {n_alternatives} alternatives\n' '{__itemized_causes__}', # if any exception would be a ConstraintError # this would not be needed, because they # know the underlying constraint constraints=self.constraints, n_alternatives=len(self.constraints), __caused_by__=e_list, ) def long_description(self): return self._get_description('long_description', 'or') def short_description(self): return self._get_description('short_description', 'or') class AllOf(_MultiConstraint): """Logical AND for constraints. An arbitrary number of constraints can be given. They are evaluated in the order in which they were specified. The return value of each constraint is passed an input into the next. The return value of the last constraint is the global return value. No intermediate exceptions are caught. Documentation is aggregated for all constraints. """ def __init__(self, *constraints): """ Parameters ---------- *constraints Constraints all of which must be satisfied """ super().__init__(*constraints) def __and__(self, other): constraints = list(self.constraints) if isinstance(other, AllOf): constraints.extend(other.constraints) else: constraints.append(other) return AllOf(*constraints) def __call__(self, value): for c in (self.constraints): value = c(value) return value def long_description(self): return self._get_description('long_description', 'and') def short_description(self): return self._get_description('short_description', 'and') # keep for backward compatibility Constraints = AllOf AltConstraints = AnyOf datalad-next-1.4.1/datalad_next/constraints/basic.py000066400000000000000000000404661462321624600225350ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 et: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. 
# # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Basic constraints for declaring essential data types, values, and ranges""" from __future__ import annotations __docformat__ = 'restructuredtext' from hashlib import algorithms_guaranteed as hash_algorithms_guaranteed from pathlib import Path import re from typing import Callable from datalad_next.datasets import resolve_path from .base import ( Constraint, DatasetParameter, ) from .utils import _type_str class NoConstraint(Constraint): """A constraint that represents no constraints""" def short_description(self): return '' def __call__(self, value): return value class EnsureValue(Constraint): """Ensure an input is a particular value""" def __init__(self, value): super().__init__() self._target_value = value def __call__(self, value): if value == self._target_value: return value else: self.raise_for( value, "must be {target_value!r}", target_value=self._target_value, ) def short_description(self): return f'{self._target_value!r}' def long_description(self): return f'value must be {self.short_description()}' class EnsureDType(Constraint): """Ensure that an input (or several inputs) are of a particular data type. Examples: >>> c = EnsureDType(float) >>> type(c(8)) # doctest: +SKIP float >>> import numpy as np # doctest: +SKIP >>> c = EnsureDType(np.float64) # doctest: +SKIP >>> type(c(8)) # doctest: +SKIP numpy.float64 """ def __init__(self, dtype): """ Parameters ---------- dtype : functor """ self._dtype = dtype def __call__(self, value): try: return self._dtype(value) except Exception as e: self.raise_for( value, str(e), ) def short_description(self): return _type_str(self._dtype) def long_description(self): return "value must be convertible to type '%s'" % self.short_description() class EnsureInt(EnsureDType): """Ensure that an input (or several inputs) are of a data type 'int'. """ def __init__(self): """Initializes EnsureDType with int""" EnsureDType.__init__(self, int) class EnsureFloat(EnsureDType): """Ensure that an input (or several inputs) are of a data type 'float'. """ def __init__(self): """Initializes EnsureDType with float""" EnsureDType.__init__(self, float) class EnsureBool(Constraint): """Ensure that an input is a bool. A couple of literal labels are supported, such as: False: '0', 'no', 'off', 'disable', 'false' True: '1', 'yes', 'on', 'enable', 'true' """ def __call__(self, value): if isinstance(value, bool): return value elif isinstance(value, (bytes, str)): value = value.lower() if value in ('0', 'no', 'off', 'disable', 'false'): return False elif value in ('1', 'yes', 'on', 'enable', 'true'): return True self.raise_for(value, "must be convertible to boolean") def long_description(self): return 'value must be convertible to type bool' def short_description(self): return 'bool' class EnsureStr(Constraint): """Ensure an input is a string of some min. length and matching a pattern Pattern matching is optional and minimum length is zero (empty string is OK). No type conversion is performed. """ def __init__(self, min_len: int = 0, match: str | None = None): """ Parameters ---------- min_len: int, optional Minimal length for a string. match: Regular expression used to match any input value against. Values not matching the expression will cause a `ValueError` to be raised. 
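        A brief illustration of the combined length and pattern check
        (kept as a sketch, hence marked to be skipped by doctest):

        >>> c = EnsureStr(min_len=2, match=r'ab.*')  # doctest: +SKIP
        >>> c('abc')  # doctest: +SKIP
        'abc'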
""" assert min_len >= 0 self._min_len = min_len self._match = match super().__init__() if match is not None: self._match = re.compile(match) def __call__(self, value) -> str: if not isinstance(value, (bytes, str)): # do not perform a blind conversion ala str(), as almost # anything can be converted and the result is most likely # unintended self.raise_for(value, "must be a string") if len(value) < self._min_len: self.raise_for(value, "must have minimum length {len}", len=self._min_len) if self._match: if not self._match.match(value): self.raise_for( value, 'does not match {pattern}', pattern=self._match.pattern, ) return value def long_description(self): return 'must be a string{}'.format( f' and match {self._match.pattern}' if self._match else '', ) def short_description(self): return 'str{}'.format( f'({self._match.pattern})' if self._match else '', ) # TODO possibly consolidate on EnsureStr from -gooey, which can take # a regex that could perform this. CON: documentation less clear. # But if custom documentation will be supported, it might get even # more clear nevertheless class EnsureStrPrefix(EnsureStr): """Ensure an input is a string that starts with a given prefix. """ def __init__(self, prefix): """ Parameters ---------- prefix : str Mandatory prefix. """ self._prefix = prefix super().__init__() def __call__(self, value): super().__call__(value) if not value.startswith(self._prefix): self.raise_for( value, "does not start with {prefix!r}", prefix=self._prefix, ) return value def long_description(self): return "value must start with '{}'".format(self._prefix) def short_description(self): return '{}...'.format(self._prefix) class EnsureNone(EnsureValue): """Ensure an input is of value `None`""" def __init__(self): super().__init__(None) class EnsureCallable(Constraint): """Ensure an input is a callable object""" def __call__(self, value): if hasattr(value, '__call__'): return value else: self.raise_for(value, "must be a callable") def short_description(self): return 'callable' def long_description(self): return 'value must be a callable' class EnsureChoice(Constraint): """Ensure an input is element of a set of possible values""" def __init__(self, *values): """ Parameters ---------- *values Possible accepted values. """ self._allowed = values super(EnsureChoice, self).__init__() def __call__(self, value): if value not in self._allowed: self.raise_for( value, "is not one of {allowed}", allowed=self._allowed, ) return value def long_description(self): return 'value must be one of [CMD: %s CMD][PY: %s PY]' % ( str(tuple(i for i in self._allowed if i is not None)), str(self._allowed) ) def short_description(self): return '{%s}' % ', '.join([repr(c) for c in self._allowed]) def __str__(self): return f"one of {self.short_description()}" class EnsureKeyChoice(EnsureChoice): """Ensure value under a key in an input is in a set of possible values""" def __init__(self, key, values): """ Parameters ---------- key : str The to-be-tested values are looked up under the given key in a dict-like input object. values : tuple Possible accepted values. 
""" self._key = key super(EnsureKeyChoice, self).__init__(*values) def __call__(self, value): if self._key not in value: self.raise_for(value, "must be dict-like") super(EnsureKeyChoice, self).__call__(value[self._key]) return value def long_description(self): return "value in '%s' must be one of %s" % (self._key, str(self._allowed),) def short_description(self): return '%s:{%s}' % (self._key, ', '.join([repr(c) for c in self._allowed])) class EnsureRange(Constraint): """Ensure an input is within a particular range No type checks are performed. """ def __init__(self, min=None, max=None): """ Parameters ---------- min Minimal value to be accepted in the range max Maximal value to be accepted in the range """ self._min = min self._max = max if self._min is None and self._max is None: raise ValueError('No range given, min == max == None') super(EnsureRange, self).__init__() def __call__(self, value): if self._min is not None: if self._max is not None: if value < self._min or value > self._max: self.raise_for( value, f"must be in range from {self._min!r} to {self._max!r}" ) else: if value < self._min: self.raise_for(value, f"must be at least {self._min!r}") if self._max is not None: if value > self._max: self.raise_for(value, f"must be at most {self._max!r}") return value def long_description(self): return self.short_description() def short_description(self): if self._max is None: return f'not less than {self._min!r}' elif self._min is None: return f'not greater than {self._max!r}' else: # it is inclusive, but spelling it out would be wordy return f'in range from {self._min!r} to {self._max!r}' class EnsurePath(Constraint): """Ensures input is convertible to a (platform) path and returns a `Path` Optionally, the path can be tested for existence and whether it is absolute or relative. """ def __init__(self, *, path_type: type = Path, is_format: str | None = None, lexists: bool | None = None, is_mode: Callable | None = None, ref: Path | None = None, ref_is: str = 'parent-or-same-as', dsarg: DatasetParameter | None = None): """ Parameters ---------- path_type: Specific pathlib type to convert the input to. The default is `Path`, i.e. the platform's path type. Not all pathlib Path types can be instantiated on all platforms, and not all checks are possible with all path types. is_format: {'absolute', 'relative'} or None If not None, the path is tested whether it matches being relative or absolute. lexists: If not None, the path is tested to confirmed exists or not. A symlink need not point to an existing path to fulfil the "exists" condition. is_mode: If set, this callable will receive the path's `.lstat().st_mode`, and an exception is raised, if the return value does not evaluate to `True`. Typical callables for this feature are provided by the `stat` module, e.g. `S_ISDIR()` ref: If set, defines a reference Path any given path is compared to. The comparison operation is given by `ref_is`. ref_is: {'parent-or-same-as', 'parent-of'} Comparison operation to perform when `ref` is given. dsarg: DatasetParameter, optional If given, incoming paths are resolved in the following fashion: If, and only if, the original "dataset" parameter was a ``Dataset`` object instance, relative paths are interpreted as relative to the given dataset. In all other cases, relative paths are treated as relative to the current working directory. 
""" super().__init__() self._path_type = path_type self._is_format = is_format self._lexists = lexists self._is_mode = is_mode self._ref = ref self._ref_is = ref_is self._dsarg = dsarg assert self._ref_is in ('parent-or-same-as', 'parent-of'), \ 'Unrecognized `ref_is` operation label' def __call__(self, value): # turn it into the target type to make everything below # more straightforward path = self._path_type(value) # we are testing the format first, because resolve_path() # will always turn things into absolute paths if self._is_format is not None: is_abs = path.is_absolute() if self._is_format == 'absolute' and not is_abs: self.raise_for(path, 'is not an absolute path') elif self._is_format == 'relative' and is_abs: self.raise_for(path, 'is not a relative path') # resolve relative paths against a dataset, if given if self._dsarg: path = resolve_path( path, self._dsarg.original, self._dsarg.ds) mode = None if self._lexists is not None or self._is_mode is not None: try: mode = path.lstat().st_mode except FileNotFoundError: # this is fine, handled below pass if self._lexists is not None: if self._lexists and mode is None: self.raise_for(path, 'does not exist') elif not self._lexists and mode is not None: self.raise_for(path, 'does (already) exist') if self._is_mode is not None: if not self._is_mode(mode): self.raise_for(path, 'does not match desired mode') if self._ref: ok = True if self._ref_is == 'parent-or-same-as': ok = (path == self._ref or self._ref in path.parents) elif self._ref_is == 'parent-of': ok = self._ref in path.parents else: # pragma: nocover # this code cannot be reached with normal usage. # it is prevented by an assertion in __init__() raise RuntimeError('Unknown `ref_is` operation label') if not ok: self.raise_for( path, '{ref} is not {ref_is} {path}', ref=self._ref, ref_is=self._ref_is, ) return path def for_dataset(self, dataset: DatasetParameter) -> Constraint: """Return an similarly parametrized variant that resolves paths against a given dataset (argument) """ return self.__class__( path_type=self._path_type, is_format=self._is_format, lexists=self._lexists, is_mode=self._is_mode, ref=self._ref, ref_is=self._ref_is, dsarg=dataset, ) def short_description(self): return '{}{}path{}'.format( 'existing ' if self._lexists else 'non-existing ' if self._lexists else '', 'absolute ' if self._is_format == 'absolute' else 'relative' if self._is_format == 'relative' else '', f' that is {self._ref_is} {self._ref}' if self._ref else '', ) class EnsureHashAlgorithm(EnsureChoice): """Ensure an input matches a name of a ``hashlib`` algorithm Specifically the item must be in the ``algorithms_guaranteed`` collection. """ def __init__(self): super().__init__(*hash_algorithms_guaranteed) datalad-next-1.4.1/datalad_next/constraints/compound.py000066400000000000000000000445531462321624600233010ustar00rootroot00000000000000"""Constraints that wrap or contain other constraints""" from __future__ import annotations from pathlib import Path import sys from typing import ( Any, Callable, Dict, Generator, ) from datalad_next.exceptions import CapturedException from .base import ( Constraint, ConstraintError, DatasetParameter, ) class EnsureIterableOf(Constraint): """Ensure that an input is a list of a particular data type """ # TODO support a delimiter to be able to take str-lists? def __init__(self, iter_type: type, item_constraint: Callable, min_len: int | None = None, max_len: int | None = None): """ Parameters ---------- iter_type: Target type of iterable. 
Common types are `list`, or `tuple`, but also generator type iterables are possible. Type constructor must take an iterable with items as the only required positional argument. item_constraint: Each incoming item will be mapped through this callable before being passed to the iterable type constructor. min_len: If not None, the iterable will be verified to have this minimum number of items. The iterable type must implement `__len__()` for this check to be supported. max_len: If not None, the iterable will be verified to have this maximum number of items. The iterable type must implement `__len__()` for this check to be supported. """ if min_len is not None and max_len is not None and min_len > max_len: raise ValueError( 'Given minimum length exceeds given maximum length') self._iter_type = iter_type self._item_constraint = item_constraint self._min_len = min_len self._max_len = max_len super().__init__() def __repr__(self): # not showing iter_type here, will come via class.name # in general return ( f'{self.__class__.__name__}(' f'item_constraint={self._item_constraint!r}' f', min_len={self._min_len!r}' f', max_len={self._max_len!r})' ) @property def item_constraint(self): return self._item_constraint def __call__(self, value): try: iter = self._iter_type( self._item_constraint(i) for i in value ) except (ConstraintError, TypeError) as e: self.raise_for( value, "{itertype} item is not {itype}", itertype=self._iter_type.__name__, itype=self._item_constraint, __caused_by__=e, ) if self._min_len is not None or self._max_len is not None: # only do this if necessary, generators will not support # __len__, for example iter_len = len(iter) if self._min_len is not None and iter_len < self._min_len: self.raise_for( iter, 'must have minimum length {len}', len=self._min_len, ) if self._max_len is not None and iter_len > self._max_len: self.raise_for( iter, 'must not exceed maximum length {len}', len=self._max_len, ) return iter def short_description(self): return f'{self._iter_type}({self._item_constraint})' class EnsureListOf(EnsureIterableOf): def __init__(self, item_constraint: Callable, min_len: int | None = None, max_len: int | None = None): """ Parameters ---------- item_constraint: Each incoming item will be mapped through this callable before being passed to the list constructor. min_len: If not None, the list will be verified to have this minimum number of items. max_len: If not None, the list will be verified to have this maximum number of items. """ super().__init__(list, item_constraint, min_len=min_len, max_len=max_len) def short_description(self): return f'list({self._item_constraint})' class EnsureTupleOf(EnsureIterableOf): def __init__(self, item_constraint: Callable, min_len: int | None = None, max_len: int | None = None): """ Parameters ---------- item_constraint: Each incoming item will be mapped through this callable before being passed to the tuple constructor. min_len: If not None, the tuple will be verified to have this minimum number of items. max_len: If not None, the tuple will be verified to have this maximum number of items. """ super().__init__(tuple, item_constraint, min_len=min_len, max_len=max_len) def short_description(self): return f'tuple({self._item_constraint})' class EnsureMapping(Constraint): """Ensure a mapping of a key to a value of a specific nature""" def __init__(self, key: Constraint, value: Constraint, delimiter: str = ':', allow_length2_sequence: bool = True): """ Parameters ---------- key: Key constraint instance. value: Value constraint instance. 
delimiter: Delimiter to use for splitting a key from a value for a `str` input. """ super().__init__() self._key_constraint = key self._value_constraint = value self._delimiter = delimiter self._allow_length2_sequence = allow_length2_sequence def __repr__(self): return ( f'{self.__class__.__name__}(' f'key={self._key_constraint!r}' f', value={self._value_constraint!r}' f', delimiter={self._delimiter!r})' ) def short_description(self): return 'mapping of {} -> {}'.format( self._key_constraint.short_description(), self._value_constraint.short_description(), ) def _get_key_value(self, value) -> tuple: # determine key and value from various kinds of input if isinstance(value, str): # will raise if it cannot split into two key, val = value.split(sep=self._delimiter, maxsplit=1) elif isinstance(value, dict): if not len(value): self.raise_for(value, 'dict does not contain a key') elif len(value) > 1: self.raise_for(value, 'dict contains more than one key') key, val = value.copy().popitem() elif self._allow_length2_sequence and isinstance(value, (list, tuple)): if not len(value) == 2: self.raise_for(value, 'key/value sequence does not have length 2') key, val = value else: self.raise_for(value, 'not a recognized mapping') return key, val def __call__(self, value) -> Dict: key, val = self._get_key_value(value) key = self._key_constraint(key) val = self._value_constraint(val) return {key: val} def for_dataset(self, dataset: DatasetParameter) -> Constraint: # tailor both constraints to the dataset and reuse delimiter return EnsureMapping( key=self._key_constraint.for_dataset(dataset), value=self._value_constraint.for_dataset(dataset), delimiter=self._delimiter, ) class EnsureGeneratorFromFileLike(Constraint): """Ensure a constraint for each item read from a file-like. A given value can either be a file-like (the outcome of `open()`, or `StringIO`), or `-` as an alias of STDIN, or a path to an existing file to be read from. """ def __init__( self, item_constraint: Callable, exc_mode: str = 'raise', ): """ Parameters ---------- item_constraint: Each incoming item will be mapped through this callable before being yielded by the generator. exc_mode: {'raise', 'yield'}, optional How to deal with exceptions occurring when processing individual lines/items. With 'yield' the respective exception instance is yielded as a ``CapturedException``, and processing continues. A caller can then decide whether to ignore, or report the exception. With 'raise', an exception is raised immediately and processing stops. """ assert exc_mode in ('raise', 'yield') self._item_constraint = item_constraint self._exc_mode = exc_mode super().__init__() def __repr__(self): # not showing iter_type here, will come via class.name # in general return ( f'{self.__class__.__name__}(' f'item_constraint={self._item_constraint!r})' ) def short_description(self): return \ f'items of type "{self._item_constraint.short_description()}" ' \ 'read from a file-like' def __call__(self, value) -> Generator[Any, None, None]: # we only support a single file-like source. If we happened to get # a length-1 sequence (for technical reasons, such as argparse # having collected the value), we unpack it. 
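        # for illustration, all of the following inputs are meant to be
        # accepted here:
        #   '-'                       -> read items from STDIN
        #   'items.txt' / Path(...)   -> path to an existing file (hypothetical name)
        #   an already open file-like object (e.g. the result of open())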
if isinstance(value, (list, tuple)) and len(value) == 1: value = value[0] opened_file = False if value == '-': value = sys.stdin elif isinstance(value, (str, Path)): # we covered the '-' special case, so this must be a Path path = Path(value) if not isinstance(value, Path) else value if not path.is_file(): self.raise_for( value, "not '-', or a path to an existing file", ) value = path.open() opened_file = True return self._item_yielder(value, opened_file) def _item_yielder(self, fp, close_file): try: for line in fp: try: yield self._item_constraint( # splitlines() removes the newline at the end of # the string that is left in by __iter__() line.splitlines()[0] ) except Exception as e: if self._exc_mode == 'raise': raise else: yield CapturedException(e) finally: if close_file: fp.close() class ConstraintWithPassthrough(Constraint): """Regular constraint, but with a "pass-through" value that is not processed This is different from a `Constraint() | EnsureValue(...)` construct, because the pass-through value is not communicated. This can be useful when a particular value must be supported for technical reasons, but need not, or must not be included in (error) messages. The pass-through is returned as-is, and is not processed except for an identity check (`==`). For almost all reporting (`__str__`, descriptions, ...) the wrapped value constraint is used, making this class virtually invisible. Only ``__repr__`` reflects the wrapping. """ def __init__(self, constraint: Constraint, passthrough: Any): """ Parameters ---------- constraint: Constraint Any ``Constraint`` subclass instance that will be used to validate values. passthrough: A value that will not be subjected to validation by the value constraint, but is returned as-is. This value is not copied. It is a caller's responsibility to guarantee immutability if that is desired. """ super().__init__() self._constraint = constraint self._passthrough = passthrough @property def constraint(self) -> Constraint: """Returns the wrapped constraint instance""" return self._constraint @property def passthrough(self) -> Any: """Returns the set pass-through value""" return self._passthrough def __call__(self, value) -> Any: if value == self._passthrough: val = value else: val = self._constraint(value) return val def __str__(self) -> str: return self._constraint.__str__() def __repr__(self) -> str: return f'{self.__class__.__name__}' \ f'({self._constraint!r}, passthrough={self._passthrough!r})' def for_dataset(self, dataset: DatasetParameter) -> Constraint: """Wrap the wrapped constraint again after tailoring it for the dataset The pass-through value is re-used. """ return self.__class__( self._constraint.for_dataset(dataset), passthrough=self._passthrough, ) def long_description(self) -> str: return self._constraint.long_description() def short_description(self) -> str: return self._constraint.short_description() class WithDescription(Constraint): """Constraint that wraps another constraint and replaces its description Whenever a constraint's self-description does not fit an application context, it can be wrapped with this class. The given synopsis and description of valid inputs replaces those of the wrapped constraint. 
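    A quick sketch of the intended usage, here with ``EnsureInt`` from this
    package (only illustrative, hence marked to be skipped by doctest):

    >>> c = WithDescription(EnsureInt(), input_synopsis='a number')  # doctest: +SKIP
    >>> c.input_synopsis  # doctest: +SKIP
    'a number'
    >>> c(4)  # validation is delegated to the wrapped constraint  # doctest: +SKIP
    4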
""" def __init__(self, constraint: Constraint, *, input_synopsis: str | None = None, input_description: str | None = None, error_message: str | None = None, input_synopsis_for_ds: str | None = None, input_description_for_ds: str | None = None, error_message_for_ds: str | None = None, ): """ Parameters ---------- constraint: Constraint Any ``Constraint`` subclass instance that will be used to validate values. input_synopsis: optional If given, text to be returned as the constraint's ``input_synopsis``. Otherwise the wrapped constraint's ``input_synopsis`` is returned. input_description: optional If given, text to be returned as the constraint's ``input_description``. Otherwise the wrapped constraint's ``input_description`` is returned. error_message: optional If given, replaces the error message of a ``ConstraintError`` raised by the wrapped ``Constraint``. Only the message (template) is replaced, not the error context dictionary. input_synopsis_for_ds: optional If either this, or ``input_description_for_ds``, or ``error_message_for_ds`` are given, the result of tailoring a constraint for a particular dataset (``for_dataset()``) will also be wrapped with this custom synopsis. input_description_for_ds: optional If either this, or ``input_synopsis_for_ds``, or ``error_message_for_ds`` are given, the result of tailoring a constraint for a particular dataset (``for_dataset()``) will also be wrapped with this custom description. error_message: optional If either this, or ``input_synopsis_for_ds``, or ``input_description_for_ds`` are given, the result of tailoring a constraint for a particular dataset (``for_dataset()``) will also be wrapped with this custom error message (template). """ super().__init__() self._constraint = constraint self._synopsis = input_synopsis self._description = input_description self._error_message = error_message self._synopsis_for_ds = input_synopsis_for_ds self._description_for_ds = input_description_for_ds self._error_message_for_ds = error_message_for_ds @property def constraint(self) -> Constraint: """Returns the wrapped constraint instance""" return self._constraint def __call__(self, value) -> Any: try: return self._constraint(value) except ConstraintError as e: # rewrap the error to get access to the top-level # self-description. 
msg, cnstr, value, ctx = e.args raise ConstraintError( self, value, self._error_message or msg, ctx, ) from e def __str__(self) -> str: return \ f'<{self._constraint.__class__.__name__} with custom description>' def __repr__(self) -> str: return f'{self.__class__.__name__}' \ f'({self._constraint!r}, ' \ f'input_synopsis={self._synopsis!r}, ' \ f'input_description={self._description!r}, ' \ f'input_synopsis_for_ds={self._synopsis_for_ds!r}, ' \ f'input_description_for_ds={self._description_for_ds!r}, ' \ f'error_message={self._error_message!r}, ' \ f'error_message_for_ds={self._error_message_for_ds!r})' def for_dataset(self, dataset: DatasetParameter) -> Constraint: """Wrap the wrapped constraint again after tailoring it for the dataset """ if any(x is not None for x in ( self._synopsis_for_ds, self._description_for_ds, self._error_message_for_ds)): # we also want to wrap the tailored constraint return self.__class__( self._constraint.for_dataset(dataset), input_synopsis=self._synopsis_for_ds, input_description=self._description_for_ds, error_message=self._error_message_for_ds, ) else: return self._constraint.for_dataset(dataset) @property def input_synopsis(self): return self._synopsis or self.constraint.input_synopsis @property def input_description(self): return self._description or self.constraint.input_description # legacy compatibility def long_description(self) -> str: return self.input_description def short_description(self) -> str: return self.input_synopsis datalad-next-1.4.1/datalad_next/constraints/dataset.py000066400000000000000000000112461462321624600230730ustar00rootroot00000000000000"""Constraints for DataLad datasets""" from __future__ import annotations from pathlib import ( Path, PurePath, ) from datalad_next.datasets import Dataset from .base import ( Constraint, DatasetParameter, ) from .exceptions import NoDatasetFound class EnsureDataset(Constraint): """Ensure an absent/present `Dataset` from any path or Dataset instance Regardless of the nature of the input (`Dataset` instance or local path) a resulting instance (if it can be created) is optionally tested for absence or presence on the local file system. Due to the particular nature of the `Dataset` class (the same instance is used for a unique path), this constraint returns a `DatasetParameter` rather than a `Dataset` directly. Consuming commands can discover the original parameter value via its `original` property, and access a `Dataset` instance via its `ds` property. In addition to any value representing an explicit path, this constraint also recognizes the special value `None`. This instructs the implementation to find a dataset that contains the process working directory (PWD). Such a dataset need not have its root at PWD, but could be located in any parent directory too. If no such dataset can be found, PWD is used directly. Tests for ``installed`` are performed in the same way as with an explicit dataset location argument. If `None` is given and ``installed=True``, but no dataset is found, an exception is raised (this is the behavior of the ``required_dataset()`` function in the DataLad core package). With ``installed=False`` no exception is raised and a dataset instances matching PWD is returned. """ def __init__(self, installed: bool | None = None, purpose: str | None = None, require_id: bool | None = None): """ Parameters ---------- installed: bool, optional If given, a dataset will be verified to be installed or not. Otherwise the installation-state will not be inspected. 
purpose: str, optional If given, will be used in generated error messages to communicate why a dataset is required (to exist) idcheck: bool, option If given, performs an additional check whether the dataset has a valid dataset ID. """ self._installed = installed self._purpose = purpose self._require_id = require_id super().__init__() def __call__(self, value) -> DatasetParameter: # good-enough test to recognize a dataset instance cheaply if hasattr(value, 'repo') and hasattr(value, 'pathobj'): ds = value # anticipate what require_dataset() could handle and fail if we got # something else elif not isinstance(value, (str, PurePath, type(None))): self.raise_for( value, "cannot create Dataset from {type}", type=type(value) ) else: ds = self._require_dataset(value) assert ds if self._installed is not None: is_installed = ds.is_installed() if self._installed is False and is_installed: self.raise_for(ds, 'already exists locally') if self._installed and not is_installed: self.raise_for(ds, 'not installed') if self._require_id and not ds.id: self.raise_for(ds, 'does not have a valid datalad-id') return DatasetParameter(value, ds) def short_description(self) -> str: return "(path to) {}Dataset".format( 'an existing ' if self._installed is True else 'a non-existing ' if self._installed is False else 'a ') def _require_dataset(self, value): from datalad.distribution.dataset import require_dataset try: ds = require_dataset( value, check_installed=self._installed is True, purpose=self._purpose, ) return ds except NoDatasetFound: # mitigation of non-uniform require_dataset() behavior. # with value == None it does not honor check_installed # https://github.com/datalad/datalad/issues/7281 if self._installed is True: # if we are instructed to ensure an installed dataset raise else: # but otherwise go with CWD. require_dataset() did not # find a dataset in any parent dir either, so this is # the best we can do. Installation absence verification # will happen further down return Dataset(Path.cwd()) datalad-next-1.4.1/datalad_next/constraints/exceptions.py000066400000000000000000000312561462321624600236320ustar00rootroot00000000000000"""Custom exceptions raised by ``Constraint`` implementations""" from __future__ import annotations from collections.abc import Mapping from dataclasses import dataclass from textwrap import indent from types import MappingProxyType from typing import ( Any, Dict, Tuple, ) # needed for imports in other pieced of the ``constraints`` module from datalad_next.exceptions import NoDatasetFound class ConstraintError(ValueError): # we derive from ValueError, because it provides the seemingly best fit # of any built-in exception. It is defined as: # # Raised when an operation or function receives an argument that has # the right type but an inappropriate value, and the situation is not # described by a more precise exception such as IndexError. # # In general a validation error can also occur because of a TypeError, but # ultimately what also matters here is an ability to coerce a given value # to a target type/value, but such an exception is not among the built-ins. # Moreover, many pieces of existing code do raise ValueError in practice, # and we aim to be widely applicable with this specialized class """Exception type raised by constraints when their conditions are violated A primary purpose of this class is to provide uniform means for communicating information on violated constraints. 
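
    A minimal sketch of that reporting (``constraint`` stands for any
    ``Constraint`` instance and is only illustrative)::

      err = ConstraintError(
          constraint, 'some value', 'does not match {__value__!r}')
      err.msg    # "does not match 'some value'"
      err.value  # 'some value'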
""" def __init__(self, constraint, value: Any, msg: str, ctx: Dict[str, Any] | None = None): """ Parameters ---------- constraint: Constraint Instance of the ``Constraint`` class that determined a violation. value: The value that is in violation of a constraint. msg: str A message describing the violation. If ``ctx`` is given too, the message can contain keyword placeholders in Python's ``format()`` syntax that will be applied on-access. ctx: dict, optional Mapping with context information on the violation. This information is used to interpolate a message, but may also contain additional key-value mappings. A recognized key is ``'__caused_by__'``, with a value of one exception (or a list of exceptions) that led to a ``ConstraintError`` being raised. """ # the msg/ctx setup is inspired by pydantic # we put `msg` in the `.args` container first to match where # `ValueError` would have it. Everything else goes after it. super().__init__(msg, constraint, value, ctx) @property def msg(self): """Obtain an (interpolated) message on the constraint violation The error message template can be interpolated with any information available in the error context dict (``ctx``). In addition to the information provided by the ``Constraint`` that raised the error, the following additional placeholders are provided: - ``__value__``: the value reported to have caused the error - ``__itemized_causes__``: an indented bullet list str with on item for each error in the ``caused_by`` report of the error. Message template can use any feature of the Python format mini language. For example ``{__value__!r}`` to get a ``repr()``-style representation of the offending value. """ msg_tmpl = self.args[0] # get interpolation values for message formatting # we need a copy, because we need to mutate the dict ctx = dict(self.context) # support a few standard placeholders # the verbatim value that caused the error: with !r and !s both # types of stringifications are accessible ctx['__value__'] = self.value if self.caused_by: ctx['__itemized_causes__'] = indent( '\n'.join(f'- {str(c)}' for c in self.caused_by), " ", ) return msg_tmpl.format(**ctx) @property def constraint(self): """Get the instance of the constraint that was violated""" return self.args[1] @property def caused_by(self) -> Tuple[Exception] | None: """Returns a tuple of any underlying exceptions that caused a violation """ cb = self.context.get('__caused_by__', None) if cb is None: return None elif isinstance(cb, Exception): return (cb,) else: return tuple(cb) @property def value(self): """Get the value that violated the constraint""" return self.args[2] @property def context(self) -> MappingProxyType: """Get a constraint violation's context This is a mapping of key/value-pairs matching the ``ctx`` constructor argument. """ return MappingProxyType(self.args[3] or {}) def __str__(self): return self.msg def __repr__(self): # rematch constructor arg-order, because we put `msg` first into # `.args` return '{0}({2!r}, {3!r}, {1!r}, {4!r})'.format( self.__class__.__name__, *self.args, ) class ConstraintErrors(ConstraintError): """Exception representing context-specific ConstraintError instances This class enables the association of a context in which any particular constraint was violated. This is done by passing a mapping, of a context identifier (e.g., a label) to the particular ``ConstraintError`` that occurred in this context, to the constructor. 
This is a generic implementation with no requirements regarding the nature of the context identifiers (expect for being hashable). See ``CommandParametrizationError`` for a specialization. """ def __init__(self, exceptions: Dict[Any, ConstraintError]): super().__init__( # this is the main payload, the base class expects a Constraint # but only stores it constraint=exceptions, # all values are already on record in the respective exceptions # no need to pass again value=None, # no support for a dedicated message here (yet?), pass an empty # string to match assumptions msg='', # and no context ctx=None, ) @property def errors(self) -> MappingProxyType[Any, ConstraintError]: # read-only access return MappingProxyType(self.args[1]) def __repr__(self): return '{0}({{{1}}})'.format( self.__class__.__name__, ', '.join(f'{k!r}: {v!r}' for k, v in self.errors.items()), ) class ParameterContextErrors(Mapping): """Read-only convenience that wraps a ``ConstraintErrors`` error mapping """ # TODO extend this class with any query functionality that a command # API would want to use in order to get streamlined information on what # went wrong (in general, for a specific parameter, etc...) def __init__( self, errors: Dict[ParameterConstraintContext, ConstraintError], ): self._errors = errors def __repr__(self): return self._errors.__repr__() def __len__(self): return len(self._errors) def __iter__(self): return self._errors.__iter__() def __getitem__(self, key): return self._errors[key] def items(self): return self._errors.items() @property def messages(self): return [e.msg for e in self._errors.values()] @property def context_labels(self): return [e.label for e in self._errors.keys()] # TODO return all errors related to some parameter @dataclass(frozen=True) class ParameterConstraintContext: """Representation of a parameter constraint context This type is used for the keys in the error map of. ``ParametrizationErrors``. Its purpose is to clearly identify which parameter combination (and its nature) led to a `ConstraintError`. An error context comprises to components: 1) the names of the parameters that were considered, and 2) a description of how the parameters were linked or combined. In the simple case of an error occurring in the context of a single parameter, the second component is superfluous. Otherwise, it can be thought of as an operation label, describing what aspect of the set of parameters is being relevant in a particular context. Example: A command has two parameters `p1` and `p2`. The may also have respective individual constraints, but importantly they 1) must not have identical values, and 2) their sum must be larger than 3. If the command is called with ``cmd(p1=1, p2=1)``, both conditions are violated. The reporting may be implemented using the following ``ParameterConstraintContext`` and ``ConstraintError`` instances:: ParameterConstraintContext(('p1', 'p2'), 'inequality): ConstraintError(EnsureValue(True), False, ) ParameterConstraintContext(('p1', 'p2'), 'sum): ConstraintError(EnsureRange(min=3), False, ) where the ``ConstraintError`` instances are generated by standard ``Constraint`` implementation. For the second error, this could look like:: EnsureRange(min=3)(params['p1'] + params['p2']) """ parameters: Tuple[str] description: str | None = None def __str__(self): return f'Context<{self.label}>' @property def label(self) -> str: """A concise summary of the context This label will be a compact as possible. 
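        For example, a context for parameters ``('p1', 'p2')`` with the
        description ``'sum'`` is labeled ``'p1, p2 (sum)'``.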
""" # XXX this could be __str__ but its intended usage for rendering # a text description of all errors would seemingly forbid adding # type information -- which OTOH seems to be desirable for __str__ return '{param}{descr}'.format( param=", ".join(self.parameters), descr=f" ({self.description})" if self.description else '', ) def get_label_with_parameter_values(self, values: dict) -> str: """Like ``.label`` but each parameter will also state a value""" # TODO truncate the values after repr() to ensure a somewhat compact # output from .parameter import NoValue return '{param}{descr}'.format( param=", ".join( f'{p}=' if isinstance(values[p], NoValue) else f'{p}={values[p]!r}' for p in self.parameters ), descr=f" ({self.description})" if self.description else '', ) class ParametrizationErrors(ConstraintErrors): """Exception type raised on violating parameter constraints This is a ``ConstraintErrors`` variant that uses parameter names (i.e, ``str`` labels) as context identifiers. In addition to individual parameter names an additional ``__all__`` identifier is recognized. It can be used to record a ``ConstraintError`` arising from high-order constraints, such as the violation of "mutually exclusive" requirements across more than one parameter. """ def __init__( self, exceptions: Dict[str, ConstraintError] | Dict[ParameterConstraintContext, ConstraintError]): super().__init__( {k if isinstance(k, ParameterConstraintContext) else ParameterConstraintContext((k,)): v for k, v in exceptions.items()} ) @property def errors(self) -> ParameterContextErrors: # read-only access return ParameterContextErrors(self.args[1]) def __str__(self): return self._render_violations_as_indented_text_list( 'parameter') def _render_violations_as_indented_text_list(self, violation_subject): violations = len(self.errors) return '{ne} {vs}constraint violation{p}\n{el}'.format( ne=violations, vs=f'{violation_subject} ' if violation_subject else '', p='s' if violations > 1 else '', el='\n'.join( '{ctx}\n{msg}'.format( ctx=ctx.get_label_with_parameter_values( c.value if isinstance(c.value, dict) else {ctx.parameters[0]: c.value} ), msg=indent(str(c), ' '), ) for ctx, c in self.errors.items() ), ) class CommandParametrizationError(ParametrizationErrors): """Exception type raised on violating any command parameter constraints .. seealso:: :mod:`~datalad_next.constraints.EnsureCommandParameterization` """ def __str__(self): return self._render_violations_as_indented_text_list( 'command parameter') datalad-next-1.4.1/datalad_next/constraints/formats.py000066400000000000000000000074511462321624600231240ustar00rootroot00000000000000"""Constraints for particular formats or protocols""" # allow for |-type UnionType declarations from __future__ import annotations from json import loads import re from urllib.parse import ( urlparse, ParseResult, ) from .base import Constraint class EnsureJSON(Constraint): """Ensures that string is JSON formatted and can be deserialized. """ def __init__(self): super().__init__() def __call__(self, value: str): try: return loads(value) except Exception as e: self.raise_for( value, str(e), ) def short_description(self): return 'JSON' class EnsureURL(Constraint): """Ensures that a string is a valid URL with a select set of components and/or: - does not contain certain components - matches a particular regular expression Given that a large variety of strings are also a valid URL, a typical use of this constraint would involve using a `required=['scheme']` setting. 
All URL attribute names supported by `urllib.parse.urlparse()` are also supported here: scheme, netloc, path, params, query, fragment, username, password, hostname, port. .. seealso:: https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse """ def __init__( self, required: list | None = None, forbidden: list | None = None, match: str | None = None, ): """ Parameters ---------- required: list, optional List of any URL component names as recognized by ``urlparse()``, such as ``scheme``, ``netloc``, ``path``, ``params``, ``query``, ``fragment``, ``username``, ``password``, ``hostname``, ``port`` forbidden: list, optional Like ``required`` but names URL components that must not be present match: str, optional Regular expression that the URL must match """ self._required = required self._forbidden = forbidden self._match_exp = re.compile(match) if match else None super().__init__() def __call__(self, value: str) -> str: self._validate_parsed(value) # return the str here, see EnsureParsedURL for an alternative return value def _validate_parsed(self, value: str) -> ParseResult: if not isinstance(value, str): self.raise_for(value, 'not a string') if self._match_exp and not self._match_exp.match(value): self.raise_for( value, 'does not match expression {match_expression!r}', match_expression=self._match_exp.pattern, ) parsed = urlparse(value, scheme='', allow_fragments=True) for r in (self._required or []): if not getattr(parsed, r, None): self.raise_for( value, 'URL is missing {component!r} component', component=r, ) for f in (self._forbidden or []): if getattr(parsed, f, None): self.raise_for( value, 'URL has forbidden {component!r} component', component=f, ) return parsed def short_description(self): return 'URL{}{}{}{}'.format( f' with required {self._required}' if self._required else '', ' and' if self._required and self._forbidden else '', f' with no {self._forbidden}' if self._forbidden else '', ' component(s)' if self._required or self._forbidden else '', ) class EnsureParsedURL(EnsureURL): """Like `EnsureURL`, but returns a parsed URL""" def __call__(self, value: str) -> ParseResult: return self._validate_parsed(value) datalad-next-1.4.1/datalad_next/constraints/git.py000066400000000000000000000130451462321624600222300ustar00rootroot00000000000000"""Constraints for Git-related concepts and parameters""" from __future__ import annotations from datalad_next.runners import ( CommandError, call_git, call_git_oneline, ) from .base import ( Constraint, DatasetParameter, ) class EnsureGitRefName(Constraint): """Ensures that a reference name is well formed Validation is performed by calling `git check-ref-format`. """ def __init__(self, allow_onelevel: bool = True, normalize: bool = True, refspec_pattern: bool = False): """ Parameters ---------- allow_onelevel: Flag whether one-level refnames are accepted, e.g. just 'main' instead of 'refs/heads/main'. normalize: Flag whether a normalized refname is validated and return. This includes removing any leading slash (/) characters and collapsing runs of adjacent slashes between name components into a single slash. refspec_pattern: Flag whether to interpret a value as a reference name pattern for a refspec (allowed to contain a single '*'). 
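
        For example (illustrative; validation shells out to Git at runtime)::

          EnsureGitRefName()('main')                      # -> 'main'
          EnsureGitRefName(allow_onelevel=False)('main')  # raises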
""" super().__init__() self._allow_onelevel = allow_onelevel self._normalize = normalize self._refspec_pattern = refspec_pattern def __call__(self, value: str) -> str: if not value: # simple, do here self.raise_for(value, 'refname must not be empty') cmd = ['check-ref-format'] cmd.append('--allow-onelevel' if self._allow_onelevel else '--no-allow-onelevel') if self._refspec_pattern: cmd.append('--refspec-pattern') if self._normalize: cmd.append('--normalize') cmd.append(value) try: res = (call_git_oneline if self._normalize else call_git)(cmd) except CommandError as e: self.raise_for( value, 'is not a valid refname', __caused_by__=e, ) if self._normalize: return res else: return value def short_description(self): return '{}Git refname{}'.format( '(single-level) ' if self._allow_onelevel else '', ' or refspec pattern' if self._refspec_pattern else '', ) class EnsureRemoteName(Constraint): """Ensures a valid remote name, and optionally if such a remote is known """ _label = 'remote' def __init__(self, known: bool | None = None, dsarg: DatasetParameter | None = None): """ Parameters ---------- known: bool, optional By default, a given value is only checked if it is a syntactically correct remote name. If ``True``, also checks that the given name corresponds to a known remote in the dataset given by ``dsarg``. If ``False``, checks that the given remote does not match any known remote in that dataset. dsarg: DatasetParameter, optional Identifies a dataset for testing remote existence, if requested. """ self._label = 'remote' self._known = known self._dsarg = dsarg def __call__(self, value: str) -> str: if not value: # simple, do here self.raise_for( value, f'missing {self._label} name', ) if self._known is not None: assert self._dsarg, \ f"Existence check for {self._label} requires dataset " \ "specification" if self._known: # we don't need to check much, only if a remote of this name # already exists -- no need to check for syntax compliance # again if not any( k.startswith(f"remote.{value}.") for k in self._dsarg.ds.config.keys() ): self.raise_for( value, f'is not a known {self._label}', ) else: # whether or not the remote must not exist, or we would not care, # in all cases we need to check for syntax compliance EnsureGitRefName( allow_onelevel=True, refspec_pattern=False, )(value) if self._known is None: # we only need to know that something was provided, # no further check return value if self._known is False and any( k.startswith(f"remote.{value}.") for k in self._dsarg.ds.config.keys() ): self.raise_for( value, f'name conflicts with a known {self._label}', ) return value def short_description(self): return f"Name of a{{desc}} {self._label}".format( desc=' known' if self._known else ' not-yet-known' if self._known is False else '' ) def for_dataset(self, dataset: DatasetParameter) -> Constraint: """Return an similarly parametrized variant that checks remote names against a given dataset (argument)""" return self.__class__( known=self._known, dsarg=dataset, ) class EnsureSiblingName(EnsureRemoteName): """Identical to ``EnsureRemoteName``, but used the term "sibling" Only error messages and documentation differ, with "remote" being replaced with "sibling". 
""" _label = 'sibling' datalad-next-1.4.1/datalad_next/constraints/parameter.py000066400000000000000000000443551462321624600234350ustar00rootroot00000000000000"""Constraints for command/function parameters""" from __future__ import annotations from collections.abc import Container from itertools import chain from typing import ( Callable, Dict, ) from .base import Constraint from .basic import ( NoConstraint, ) from .dataset import DatasetParameter from .exceptions import ( ConstraintError, ParametrizationErrors, CommandParametrizationError, ParameterConstraintContext, ) class NoValue: """Type to annotate the absence of a value For example in a list of parameter defaults. In general `None` cannot be used, as it may be an actual value, hence we use a local, private type. """ pass class EnsureCommandParameterization(Constraint): """Base class for `ValidatedInterface` parameter validators This class can be used as-is, by declaring individual constraints in the constructor, or it can be subclassed to consolidate all custom validation-related code for a command in a single place. Commonly this constraint is used by declaring particular value constraints for individual parameters as a mapping. Declaring that the ``path`` parameter should receive something that is or can be coerced to a valid ``Path`` object looks like this:: EnsureCommandParameterization({'path': EnsurePath()}) This class differs from a standard ``Constraint`` implementation, because its ``__call__()`` method support additional arguments that are used by the internal ``Interface`` handling code to control how parameters are validated. During validation, when no validator for a particular parameter is declared, any input value is passed on as-is, and otherwise an input is passed through the validator. There is one exception to this rule: When a parameter value is identical to its default value (as declared in the command signature, and communicated via the ``at_default`` argument of ``__call__()``), this default value is also passed as-is, unless the respective parameter name is included in the ``validate_defaults`` constructor argument. An important consequence of this behavior is that validators need not cover a default value. For example, a parameter constraint for ``path=None``, where ``None`` is a special value used to indicate an optional and unset value, but actually only paths are acceptable input values. can simply use ``EnsurePath()`` and it is not necessary to do something like ``EnsurePath() | EnsureNone()``. However, `EnsureCommandParameterization` can also be specifically instructed to perform validation of defaults for individual parameters, as described above. A common use case is the auto-discovery of datasets, where often `None` is the default value of a `dataset` parameter (to make it optional), and an `EnsureDataset` constraint is used. This constraint can perform the auto-discovery (with the `None` value indicating that), but validation of defaults must be turned on for the `dataset` parameter in order to do that. A second difference to a common ``Constraint`` implementation is the ability to perform an "exhaustive validation" on request (via ``__call__(on_error=...)``). In this case, validation is not stopped at the first discovered violation, but all violations are collected and communicated by raising a ``CommandParametrizationError`` exception, which can be inspected by a caller for details on number and nature of all discovered violations. 
Exhaustive validation and joint reporting are only supported for individual constraint implementations that raise `ConstraintError` exceptions. For legacy constraints, any raised exception of another type are not caught and reraised immediately. """ def __init__( self, param_constraints: Dict[str, Constraint], *, validate_defaults: Container[str] | None = None, joint_constraints: Dict[ParameterConstraintContext, Callable] | None = None, tailor_for_dataset: Dict[str, str] | None = None, ): """ Parameters ---------- param_constraints: dict Mapping of parameter names to parameter constraints. On validation an ``EnsureParameterConstraint`` instance will be created for each item in this dict. validate_defaults: container(str), optional If given, this is a set of parameter names for which the default rule, to not validate default values, does not apply and default values shall be passed through a given validator. joint_constraints: dict, optional Specification of higher-order constraints considering multiple parameters together. See the ``joint_validation()`` method for details. Constraints will be processed in the order in which they are declared in the mapping. Earlier validators can modify the parameter values that are eventually passed to validators executed later. tailor_for_dataset: dict, optional If given, this is a mapping of a name of a parameter whose constraint should be tailored to a particular dataset, to a name of a parameter providing this dataset. The dataset-providing parameter constraints will be evaluated first, and the resulting Dataset instances are used to tailor the constraints that require a dataset-context. The tailoring is performed if, and only if, the dataset-providing parameter actually evaluated to a `Dataset` instance. The non-tailored constraint is used otherwise. """ super().__init__() self._param_constraints = param_constraints self._joint_constraints = joint_constraints self._validate_defaults = validate_defaults or set() self._tailor_for_dataset = tailor_for_dataset or {} def joint_validation(self, params: Dict, on_error: str) -> Dict: """Higher-order validation considering multiple parameters at a time This method is called with all, individually validated, command parameters in keyword-argument form in the ``params`` dict argument. Arbitrary additional validation steps can be performed on the full set of parameters that may involve raising exceptions on validation errors, but also value transformation or replacements of individual parameters based on the setting of others. The parameter values returned by the method are passed on to the respective command implementation. The default implementation iterates over the ``joint_validators`` specification given to the constructor, in order to perform any number of validations. This is a mapping of a ``ParameterConstraintContext`` instance to a callable implementing a validation for a particular parameter set. Example:: _joint_validators_ = { ParameterConstraintContext(('p1', 'p2'), 'sum'): MyValidator._check_sum, } def _checksum(self, p1, p2): if (p1 + p2) < 3: self.raise_for( dict(p1=p1, p2=p2), 'parameter sum is too large', ) The callable will be passed the arguments named in the ``ParameterConstraintContext`` as keyword arguments, using the same names as originally given to ``EnsureCommandParameterization``. Any raised ``ConstraintError`` is caught and reported together with the respective ``ParameterConstraintContext``. 
The violating value reported in such a ``ConstraintError`` must be a mapping of parameter name to value, comprising the full parameter set (i.e., keys matching the ``ParameterConstraintContext``). The use of ``self.raise_for()`` is encouraged. If the callable anyhow modifies the passed arguments, it must return them as a kwargs-like mapping. If nothing is modified, it is OK to return ``None``. Returns ------- dict The returned dict must have a value for each item passed in via ``params``. on_error: {'raise-early', 'raise-at-end'} Flag how to handle constraint violation. By default, validation is stopped at the first error and an exception is raised. When an exhaustive validation is performed, an eventual exception contains information on all constraint violations. Raises ------ ConstraintErrors With `on_error='raise-at-end'` an implementation can choose to collect more than one higher-order violation and raise them as a `ConstraintErrors` exception. """ # if we have nothing, do nothing if not self._joint_constraints: return params exceptions = {} validated = params.copy() for ctx, validator in self._joint_constraints.items(): # what the validator will produce res = None try: # call the validator with the parameters given in the context # and only with those, to make sure the context is valid # and not an underspecification. # pull the values form `validated` to be able to benefit # from incremental coercing done in individual checks res = validator(**{p: validated[p] for p in ctx.parameters}) except ConstraintError as e: if not isinstance(e.value, dict) \ or set(ctx.parameters) != e.value.keys(): # pragma: no cover raise RuntimeError( 'on raising a ConstraintError the joint validator ' f'{validator} did not report ' 'a mapping of parameter name to (violating) value ' 'comprising all constraint context parameters. ' 'This is a software defect of the joint validator. ' 'Please report!') exceptions[ctx] = e if on_error == 'raise-early': raise CommandParametrizationError(exceptions) if res is not None: validated.update(**res) if exceptions: raise CommandParametrizationError(exceptions) return validated def __call__( self, kwargs, at_default=None, required=None, on_error='raise-early', ) -> Dict: """ Parameters ---------- kwargs: dict Parameter name (``str``)) to value (any) mapping of the parameter set. at_default: set or None Set of parameter names where the respective values in ``kwargs`` match their respective defaults. This is used for deciding whether or not to process them with an associated value constraint (see the ``validate_defaults`` constructor argument). required: set or None Set of parameter names that are known to be required. on_error: {'raise-early', 'raise-at-end'} Flag how to handle constraint violation. By default, validation is stopped at the first error and an exception is raised. When an exhaustive validation is performed, an eventual exception contains information on all constraint violations. Regardless of this mode more than one error can be reported (in case (future) implementation perform independent validations in parallel). Raises ------ CommandParametrizationError Raised whenever one (or more) ``ConstraintError`` exceptions are caught during validation. Other exception types are not caught and pass through. 
""" assert on_error in ('raise-early', 'raise-at-end') exceptions = {} missing_args = tuple(a for a in (required or []) if a not in kwargs) if missing_args: exceptions[ParameterConstraintContext(missing_args)] = \ ConstraintError( self, dict(zip(missing_args, [NoValue()] * len(missing_args))), 'missing required arguments', ) if on_error == 'raise-early': raise CommandParametrizationError(exceptions) # validators to work with. make a copy of the dict to be able to tailor # them for this run only # TODO copy likely not needed param_constraints = self._param_constraints.copy() # names of parameters we need to process to_validate = set(kwargs) # check for any dataset that are required for tailoring other parameters ds_provider_params = set(self._tailor_for_dataset.values()) # take these out of the set of parameters to validate, because we need # to process them first. # the approach is to simply sort them first, but otherwise apply standard # handling to_validate.difference_update(ds_provider_params) # strip all args provider args that have not been provided ds_provider_params.intersection_update(kwargs) validated = {} # process all parameters. starts with those that are needed as # dependencies for others. # this dependency-based sorting is very crude for now. it does not # consider possible dependencies within `ds_provider_params` at all for argname in chain(ds_provider_params, to_validate): arg = kwargs[argname] if at_default \ and argname not in self._validate_defaults \ and argname in at_default: # do not validate any parameter where the value matches the # default declared in the signature. Often these are just # 'do-nothing' settings or have special meaning that need # not be communicated to a user. Not validating them has # two consequences: # - the condition can simply be referred to as "default # behavior" regardless of complexity # - a command implementation must always be able to handle # its own defaults directly, and cannot delegate a # default value handling to a constraint # # we must nevertheless pass any such default value through # to make/keep them accessible to the general result handling # code validated[argname] = arg continue # look-up validator for this parameter, if there is none use # NoConstraint to avoid complex conditionals in the code below validator = param_constraints.get(argname, NoConstraint()) # do we need to tailor this constraint for a specific dataset? # only do if instructed AND the respective other parameter # validated to a Dataset instance. Any such parameter was sorted # to be validated first in this loop, so the outcome of that is # already available tailor_for = self._tailor_for_dataset.get(argname) if tailor_for and isinstance(validated.get(tailor_for), DatasetParameter): validator = validator.for_dataset(validated[tailor_for]) try: validated[argname] = validator(arg) # we catch only ConstraintError -- only these exceptions have what # we need for reporting. If any validator chooses to raise # something else, we do not handle it here, but let it bubble up. 
# it may be an indication of something being wrong with validation # itself except ConstraintError as e: # standard exception type, record and proceed exceptions[ParameterConstraintContext((argname,))] = e if on_error == 'raise-early': raise CommandParametrizationError(exceptions) except Exception as e: # non-standard exception type # we need to achieve uniform CommandParametrizationError # raising, so let's create a ConstraintError for this # exception e = ConstraintError( validator, arg, '{__caused_by__}', ctx=dict(__caused_by__=e), ) exceptions[ParameterConstraintContext((argname,))] = e if on_error == 'raise-early': raise CommandParametrizationError(exceptions) # do not bother with joint validation when the set of expected # arguments is not complete expected_for_joint_validation = set() for jv in self._joint_constraints or []: expected_for_joint_validation.update(jv.parameters) if not expected_for_joint_validation.issubset(validated): raise CommandParametrizationError(exceptions) try: # call (subclass) method to perform holistic, cross-parameter # validation of the full parameterization final = self.joint_validation(validated, on_error) # check requirements of .joint_validation(), a particular # implementation could be faulty, and we want to report this # problem in the right context try: assert final.keys() == validated.keys() except Exception as e: raise RuntimeError( f"{self.__class__.__name__}.joint_validation() " "did not return items for all passed parameters. " "Invalid implementation.") from e # we catch the good stuff first. the underlying implementation is # providing an exception with detailed context info on possibly # multiple errors except ParametrizationErrors as e: # we can simply suck in the reports, the context keys do not # overlap, unless the provided validators want that for some # reason exceptions.update(e.errors) if exceptions: raise CommandParametrizationError(exceptions) return final datalad-next-1.4.1/datalad_next/constraints/parameter_legacy.py000066400000000000000000000163011462321624600247470ustar00rootroot00000000000000"""Constraints for legacy implementations related to command parameters""" from __future__ import annotations from typing import ( Any, Dict, TYPE_CHECKING, Type, TypeVar, ) from .base import Constraint from .basic import ( EnsureBool, EnsureChoice, EnsureFloat, EnsureInt, EnsureStr, NoConstraint, ) from .compound import ( ConstraintWithPassthrough, EnsureIterableOf, EnsureMapping, ) from .parameter import NoValue if TYPE_CHECKING: # pragma: no cover from datalad_next.commands import Parameter EnsureParameterConstraint_T = TypeVar( 'EnsureParameterConstraint_T', bound='EnsureParameterConstraint', ) class EnsureParameterConstraint(EnsureMapping): """Ensures a mapping from a Python parameter name to a value constraint An optional "pass-though" value can be declare that is then exempt from validation and is returned as-is. This can be used to support, for example, special default values that only indicate the optional nature of a parameter. Declaring them as "pass-through" avoids a needless complexity-increase of a value constraint that would translate onto user-targeted error reporting. 
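
    A usage sketch (the parameter name and values are illustrative)::

      constraint = EnsureParameterConstraint(EnsureInt(), passthrough=None)
      constraint('number=3')        # -> {'number': 3}
      constraint({'number': None})  # -> {'number': None} (pass-through)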
""" # valid parameter name for Python and CLI # - must start with a lower-case letter # - must not contain symbols other than lower-case letters, # digits, and underscore valid_param_name_regex = r'[a-z]{1}[a-z0-9_]*' def __init__(self, constraint: Constraint, passthrough: Any = NoValue): """ Parameters ---------- constraint: Any ``Constraint`` subclass instance that will be used to validate parameter values. passthrough: A value that will not be subjected to validation by the value constraint, but is returned as-is. This can be used to exempt default values from validation, e.g. when defaults are only placeholder values to indicate the optional nature of a parameter. """ super().__init__( key=EnsureStr( match=EnsureParameterConstraint.valid_param_name_regex), value=ConstraintWithPassthrough( constraint, passthrough, ), # make it look like dict(...) delimiter='=', ) @property def parameter_constraint(self): return self._value_constraint @property def passthrough_value(self): return self._value_constraint.passthrough def __call__(self, value) -> Dict: key, val = self._get_key_value(value) key = self._key_constraint(key) val = self._value_constraint(val) \ if val != self.passthrough_value else val return {key: val} @classmethod def from_parameter( cls: Type[EnsureParameterConstraint_T], spec: Parameter, default: Any, item_constraint: Constraint | None = None, nargs: str | int | None = None) -> EnsureParameterConstraint_T: """ Parameters ---------- spec: Parameter Instance of a datalad-core Parameter. If not overwritten by values given to the other arguments of this method, item constraints, number of arguments and other argparse-specific information is taken from this object and processed to built a comprehensive constraint that handles all aspects of the specification in a homogeneous fashion via the Constraint interface. default: Any A parameter's default value. It is configured as a "pass-through" value that will not be subjected to validation. item_constraint: If given, it override any constraint declared in the Parameter instance given to `spec` nargs: If given, it override any nargs setting declared in the Parameter instance given to `spec`. """ value_constraint = _get_comprehensive_constraint( spec, item_constraint, nargs, ) return cls(value_constraint, passthrough=default) # that mapping is NOT to be expanded! # it is a legacy leftover. 
It's usage triggers a DeprecationWarning _constraint_spec_map = { 'float': EnsureFloat(), 'int': EnsureInt(), 'bool': EnsureBool(), 'str': EnsureStr(), } def _get_comprehensive_constraint( param_spec: Parameter, # TODO remove `str` when literal constraint support is removed item_constraint_override: Constraint | str | None = None, nargs_override: str | int | None = None): action = param_spec.cmd_kwargs.get('action') # definitive per-item constraint, consider override # otherwise fall back on Parameter.constraints constraint = item_constraint_override or param_spec.constraints if not (constraint is None or hasattr(constraint, '__call__')): import warnings warnings.warn("Literal constraint labels are no longer supported.", DeprecationWarning) try: return _constraint_spec_map[constraint] except KeyError: raise ValueError( f"unsupported constraint specification '{constraint}'") if not constraint: if action in ('store_true', 'store_false'): constraint = EnsureBool() elif param_spec.cmd_kwargs.get('choices'): constraint = EnsureChoice(*param_spec.cmd_kwargs.get('choices')) else: # always have one for simplicity constraint = NoConstraint() # we must additionally consider the following nargs spec for # a complete constraint specification # (int, '*', '+'), plus action= # - 'store_const' TODO # - 'store_true' and 'store_false' TODO # - 'append' # - 'append_const' TODO # - 'count' TODO # - 'extend' TODO # get the definitive argparse "nargs" value nargs = nargs_override or param_spec.cmd_kwargs.get('nargs', None) # try making a specific number explicit via dtype change try: nargs = int(nargs) except (ValueError, TypeError): pass # TODO reconsider using `list`, with no length-check it could # be a generator if isinstance(nargs, int): # we currently consider nargs=1 to be a request of a # single item, not a forced single-item list if nargs > 1: # sequence of a particular length constraint = EnsureIterableOf( list, constraint, min_len=nargs, max_len=nargs) elif nargs == '*': # datalad expects things often/always to also work for a single item constraint = EnsureIterableOf(list, constraint) | constraint elif nargs == '+': # sequence of at least 1 item, always a sequence, # but again datalad expects things often/always to also work for # a single item constraint = EnsureIterableOf( list, constraint, min_len=1) | constraint # handling of `default` and `const` would be here #elif nargs == '?' if action == 'append': # wrap into a(nother) sequence # (think: list of 2-tuples, etc. 
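        # each incoming item is validated individually and the results are
        # collected into a list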
constraint = EnsureIterableOf(list, constraint) return constraint datalad-next-1.4.1/datalad_next/constraints/tests/000077500000000000000000000000001462321624600222325ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/constraints/tests/__init__.py000066400000000000000000000000001462321624600243310ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/constraints/tests/test_base.py000066400000000000000000000105001462321624600245510ustar00rootroot00000000000000import pytest from ..base import ( Constraint, AllOf, AnyOf, ) from ..basic import ( EnsureDType, EnsureInt, EnsureFloat, EnsureBool, EnsureNone, EnsureRange, EnsureStr, ) def test_base(): # there is no "standard" implementation with pytest.raises(NotImplementedError): Constraint()('whatever') # no standard docs with pytest.raises(NotImplementedError): Constraint().short_description() with pytest.raises(NotImplementedError): Constraint().long_description() # dataset context switching is by default a no-op generic = Constraint() assert id(generic) == id(generic.for_dataset('some')) def test_allof(): # this should always work c = AllOf(EnsureFloat()) assert c(7.0) == 7.0 c = AllOf(EnsureFloat(), EnsureRange(min=4.0)) assert repr(c) == 'AllOf(EnsureFloat(), EnsureRange())' assert c(7.0) == 7.0 # __and__ form c = EnsureFloat() & EnsureRange(min=4.0) assert c.short_description() == '(float and not less than 4.0)' assert 'and not less than 4.0' in c.long_description() assert c(7.0) == 7.0 with pytest.raises(ValueError): c(3.9) c = AllOf(EnsureFloat(), EnsureRange(min=4), EnsureRange(max=9)) assert c(7.0) == 7.0 with pytest.raises(ValueError): c(3.9) with pytest.raises(ValueError): c(9.01) # __and__ form c = EnsureFloat() & EnsureRange(min=4) & EnsureRange(max=9) assert c(7.0) == 7.0 with pytest.raises(ValueError): c(3.99) with pytest.raises(ValueError): c(9.01) # and reordering should not have any effect c = AllOf(EnsureRange(max=4), EnsureRange(min=9), EnsureFloat()) with pytest.raises(ValueError): c(3.99) with pytest.raises(ValueError): c(9.01) # smoke test concat AND constraints c1 = AllOf(EnsureRange(max=10), EnsureRange(min=5)) c2 = AllOf(EnsureRange(max=6), EnsureRange(min=2)) c = c1 & c2 # make sure that neither c1, nor c2 is modified assert len(c1.constraints) == 2 assert len(c2.constraints) == 2 assert len(c.constraints) == 4 assert c(6) == 6 with pytest.raises(ValueError): c(4) def test_anyof(): # this should always work c = AnyOf(EnsureFloat()) # passes the docs through assert c.short_description() == EnsureFloat().short_description() assert c(7.0) == 7.0 c = AnyOf(EnsureFloat(), EnsureNone()) assert repr(c) == 'AnyOf(EnsureFloat(), EnsureNone())' # wraps docs in parenthesis to help appreciate the scope of the # OR'ing assert c.short_description().startswith( f'({EnsureFloat().short_description()}') assert c.short_description(), '(float or None)' assert c(7.0) == 7.0 assert c(None) is None # OR with an alternative just extends c = c | EnsureInt() assert c.short_description(), '(float or None or int)' # OR with an alternative combo also extends c = c | AnyOf(EnsureBool(), EnsureInt()) # yes, no de-duplication assert c.short_description(), '(float or None or int or bool or int)' # spot check long_description, must have some number assert len(c.long_description().split(' or ')) == 5 # __or__ form c = EnsureFloat() | EnsureNone() assert c(7.0) == 7.0 assert c(None) is None # this should always fail c = AllOf(EnsureRange(min=0, max=4), EnsureRange(min=9, max=11)) with pytest.raises(ValueError): c(7.0) c = 
EnsureRange(min=0, max=4) | EnsureRange(min=9, max=11) assert c(3.0) == 3.0 assert c(9.0) == 9.0 with pytest.raises(ValueError): c(7.0) with pytest.raises(ValueError): c(-1.0) # verify no inplace modification c1 = EnsureInt() | EnsureStr() c2 = c1 | EnsureDType(c1) # OR'ing does not "append" the new alternative to c1. assert len(c1.constraints) == 2 # at the same time, c2 does not contain an AnyOf # as an internal constraint, because this would be needless # complexity re the semantics of OR assert len(c2.constraints) == 3 def test_both(): # this should always work c = AnyOf( AllOf( EnsureFloat(), EnsureRange(min=7.0, max=44.0)), EnsureNone(), ) assert c(7.0) == 7.0 assert c(None) is None # this should always fail with pytest.raises(ValueError): c(77.0) datalad-next-1.4.1/datalad_next/constraints/tests/test_basic.py000066400000000000000000000232231462321624600247260ustar00rootroot00000000000000import pathlib import pytest from ..base import DatasetParameter from ..basic import ( EnsureInt, EnsureFloat, EnsureBool, EnsureStr, EnsureStrPrefix, EnsureNone, EnsureCallable, EnsureChoice, EnsureHashAlgorithm, EnsureKeyChoice, EnsureRange, EnsurePath, EnsureValue, NoConstraint, ) from ..exceptions import ConstraintError from ..utils import _type_str def test_noconstraint(): c = NoConstraint() assert c(5) == 5 assert c.short_description() == '' def test_int(): c = EnsureInt() # this should always work assert c(7) == 7 assert c(7.0) == 7 assert c('7') == 7 # no automatic inspection of iterables, should use EnsureIterableOf with pytest.raises(ConstraintError): c([7, 3]) # this should always fail with pytest.raises(ValueError): c('fail') # this will also fail with pytest.raises(ValueError): c('17.0') assert c.short_description() == 'int' def test_float(): c = EnsureFloat() # this should always work assert c(7.0) == 7.0 assert c(7) == 7.0 assert c('7') == 7.0 # no automatic inspection of iterables, should use EnsureIterableOf with pytest.raises(ConstraintError): c([7.0, '3.0']) # this should always fail with pytest.raises(ValueError): c('fail') def test_bool(): c = EnsureBool() # this should always work assert c(True) is True assert c(False) is False # all that results in True assert c('True') is True assert c('true') is True assert c('1') is True assert c('yes') is True assert c('on') is True assert c('enable') is True # all that results in False assert c('false') is False assert c('False') is False assert c('0') is False assert c('no') is False assert c('off') is False assert c('disable') is False # this should always fail with pytest.raises(ValueError): c(0) with pytest.raises(ValueError): c(1) def test_str(): c = EnsureStr() # this should always work assert c('hello') == 'hello' assert c('7.0') == '7.0' # this should always fail with pytest.raises(ValueError): c(['ab']) with pytest.raises(ValueError): c(['a', 'b']) with pytest.raises(ValueError): c(('a', 'b')) # no automatic conversion attempted with pytest.raises(ValueError): c(7.0) assert c.short_description() == 'str' def test_str_min_len(): c = EnsureStr(min_len=1) assert c('hello') == 'hello' assert c('h') == 'h' with pytest.raises(ValueError): c('') c = EnsureStr(min_len=2) assert c('hello') == 'hello' with pytest.raises(ValueError): c('h') def test_EnsureStr_match(): # alphanum plus _ and ., non-empty pattern = '[a-zA-Z0-9-.]+' constraint = EnsureStr(match=pattern) # reports the pattern in the description for m in (constraint.short_description, constraint.long_description): assert pattern in m() # must work assert constraint('a0F-2.') == 
'a0F-2.' for v in ('', '123_abc'): with pytest.raises(ValueError): assert constraint('') def test_EnsureStrPrefix(): c = EnsureStrPrefix('some-') c('some-mess') == 'some-mess' with pytest.raises(ValueError): c('mess') assert c.short_description() == 'some-...' assert c.long_description() == "value must start with 'some-'" def test_EnsureValue(): c = EnsureValue(5) assert c.short_description() == '5' # this should always work assert c(5) == 5 # type mismatch with pytest.raises(ValueError): c('5') # value mismatches with pytest.raises(ValueError): c('None') with pytest.raises(ValueError): c([]) # special case of EnsureValue def test_none(): c = EnsureNone() assert c.short_description() == 'None' # this should always work assert c(None) is None # this should always fail with pytest.raises(ValueError): c('None') with pytest.raises(ValueError): c([]) def test_callable(): c = EnsureCallable() assert c.short_description() == 'callable' assert c.long_description() == 'value must be a callable' # this should always work assert c(range) == range with pytest.raises(ValueError): c('range') def test_choice(): c = EnsureChoice('choice1', 'choice2', None) descr = c.long_description() for i in ('choice1', 'choice2', 'CMD', 'PY'): assert i in descr # short is a "set" or repr()s assert c.short_description() == "{'choice1', 'choice2', None}" assert str(c) == "one of {'choice1', 'choice2', None}" # this should always work assert c('choice1') == 'choice1' assert c(None) is None # this should always fail with pytest.raises(ValueError): c('fail') with pytest.raises(ValueError): c('None') def test_keychoice(): c = EnsureKeyChoice(key='some', values=('choice1', 'choice2', None)) descr = c.long_description() for i in ('some', 'choice1', 'choice2'): assert i in descr assert c.short_description() == "some:{'choice1', 'choice2', None}" assert c({'some': 'choice1'}) == {'some': 'choice1'} assert c({'some': None}) == {'some': None} assert c({'some': None, 'ign': 'ore'}) == {'some': None, 'ign': 'ore'} with pytest.raises(ValueError): c('fail') with pytest.raises(ValueError): c('None') with pytest.raises(ValueError): c({'nope': 'None'}) with pytest.raises(ValueError): c({'some': 'None'}) with pytest.raises(ValueError): c({'some': ('a', 'b')}) def test_range(): with pytest.raises(ValueError): EnsureRange(min=None, max=None) c = EnsureRange(max=7) assert c.short_description() == 'not greater than 7' c = EnsureRange(min=3, max=7) # this should always work assert c(3.0) == 3.0 # this should always fail with pytest.raises(ValueError): c(2.9999999) with pytest.raises(ValueError): c(77) with pytest.raises(TypeError): c('fail') with pytest.raises(TypeError): c((3, 4)) # since no type checks are performed with pytest.raises(TypeError): c('7') # Range doesn't have to be numeric c = EnsureRange(min="e", max="qqq") assert c.short_description() == "in range from 'e' to 'qqq'" assert c('e') == 'e' assert c('fa') == 'fa' assert c('qq') == 'qq' with pytest.raises(ValueError): c('a') with pytest.raises(ValueError): c('qqqa') def test_type_str(): assert _type_str((str,)) == 'str' assert _type_str(str) == 'str' def test_EnsurePath(tmp_path): target = pathlib.Path(tmp_path) assert EnsurePath()(tmp_path) == target assert EnsurePath(lexists=True)(tmp_path) == target with pytest.raises(ValueError): EnsurePath(lexists=False)(tmp_path) with pytest.raises(ValueError): EnsurePath(lexists=True)(tmp_path / 'nothere') assert EnsurePath(is_format='absolute')(tmp_path) == target with pytest.raises(ValueError): EnsurePath(is_format='relative')(tmp_path) 
with pytest.raises(ValueError): EnsurePath(is_format='absolute')(tmp_path.name) from stat import S_ISDIR, S_ISREG assert EnsurePath(is_mode=S_ISDIR)(tmp_path) == target with pytest.raises(ValueError): EnsurePath(is_mode=S_ISREG)(tmp_path) # give particular path type assert EnsurePath(path_type=pathlib.PurePath )(tmp_path) == pathlib.PurePath(tmp_path) # not everything is possible, this is known and OK with pytest.raises(AttributeError): EnsurePath( path_type=pathlib.PurePath, is_mode=S_ISREG, )(tmp_path) assert EnsurePath().short_description() == 'path' assert EnsurePath(is_format='absolute').short_description() == 'absolute path' # default comparison mode is parent-or-same-as c = EnsurePath(ref=target) assert c(target) == target assert c(target / 'some') == target / 'some' with pytest.raises(ValueError): assert c(target.parent) c = EnsurePath(ref=target, ref_is='parent-of') assert c(target / 'some') == target / 'some' with pytest.raises(ValueError): assert c(target) assert c.short_description() == f'path that is parent-of {target}' with pytest.raises(AssertionError): c = EnsurePath(ref=target, ref_is='stupid') def test_EnsurePath_fordataset(existing_dataset): P = pathlib.Path ds = existing_dataset # standard: relative in, relative out c = EnsurePath() assert c('relpath') == P('relpath') # tailor constraint for our dataset # (this is what would be done by EnsureCommandParameterization # 1. dataset given as a path -- resolve against CWD # output is always absolute tc = c.for_dataset(DatasetParameter(None, ds)) assert tc('relpath') == (P.cwd() / 'relpath') # 2. dataset is given as a dataset object tc = c.for_dataset(DatasetParameter(ds, ds)) assert tc('relpath') == (ds.pathobj / 'relpath') def test_EnsureHashAlgorithm(): c = EnsureHashAlgorithm() # simple cases that should pass hashes = [ 'sha3_256', 'shake_256', 'sha3_384', 'md5', 'shake_128', 'sha384', 'sha3_224', 'blake2s', 'sha1', 'blake2b', 'sha224', 'sha512', 'sha256', 'sha3_512' ] for hash in hashes: c(hash) # a few bogus ones: bad_hashes = [ 'md17', 'McGyver', 'sha2', 'bogus' ] for baddie in bad_hashes: with pytest.raises(ConstraintError): c(baddie) # check messaging for i in ('md5', 'shake_256', 'sha3_512'): assert i in c.short_description() assert i in c.long_description() datalad-next-1.4.1/datalad_next/constraints/tests/test_cmdarg_validation.py000066400000000000000000000272521462321624600273220ustar00rootroot00000000000000import pytest from io import StringIO from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import patch from uuid import UUID from datalad_next.commands import ( ValidatedInterface, Parameter, eval_results, ) from datalad_next.consts import on_windows from .. 
import ( ConstraintError, EnsureGeneratorFromFileLike, EnsureInt, EnsureJSON, EnsureListOf, EnsureMapping, EnsurePath, EnsureRange, EnsureStr, EnsureURL, EnsureValue, ) from ..base import ( AnyOf, Constraint, ) from ..dataset import EnsureDataset from ..parameter import EnsureCommandParameterization from ..exceptions import ParameterConstraintContext class EnsureAllUnique(Constraint): def __call__(self, value): if len(set(value)) < len(value): self.raise_for(value, 'not all values are unique') return value class BasicCmdValidator(EnsureCommandParameterization): url_constraint = EnsureURL(required=['scheme']) url2path_constraint = EnsureMapping( key=url_constraint, value=EnsurePath(), delimiter='\t' ) spec_item_constraint = url2path_constraint | url_constraint \ | (EnsureJSON() & url2path_constraint) spec_constraint = AnyOf( EnsureListOf(spec_item_constraint), EnsureGeneratorFromFileLike(spec_item_constraint), spec_item_constraint, ) def __init__(self, **kwargs): # this is the key bit: a mapping of parameter names to validators super().__init__( dict( spec=self.spec_constraint, p1=EnsureInt() | EnsureStr(), ), **kwargs ) class SophisticatedCmdValidator(BasicCmdValidator): def _check_unique_values(self, **kwargs): try: EnsureAllUnique()(kwargs.values()) except ConstraintError as e: self.raise_for( kwargs, e.msg, ) def _check_sum_range(self, p1, p2): try: EnsureRange(min=3)(p1 + p2) except ConstraintError: self.raise_for( dict(p1=p1, p2=p2), "it's too small" ) def _limit_sum_range(self, p1, p2): # random example of a joint constraint that modifies the parameter # set it is given return dict(p1=p1, p2=min(p2, 100 - p1 - p2)) def __init__(self): # this is the key bit: a mapping of parameter names to validators super().__init__( # implementation example of a higher-order constraint joint_constraints={ ParameterConstraintContext(('p1', 'p2'), 'identity'): self._check_unique_values, ParameterConstraintContext(('p1', 'p2'), 'sum'): self._check_sum_range, ParameterConstraintContext(('p1', 'p2'), 'sum-limit'): self._limit_sum_range, }, ) class BrokenJointValidation(SophisticatedCmdValidator): def joint_validation(self, params, on_error): res = super().joint_validation(params, on_error) # remove any report, and it should trigger a RuntimeError on return res.popitem() return res class CmdWithValidation(ValidatedInterface): # this is of little relevance, no validation configured here _params_ = dict(spec=Parameter(args=('spec',), nargs='+')) _validator_ = BasicCmdValidator() # command implementation that only validates and returns the outcome @staticmethod @eval_results def __call__(spec, p1='one', p2='two'): yield dict( action='cmd_with_validation', # list() consumes any potential generator spec=list(spec), status='ok', ) def test_multi_validation(): val = BasicCmdValidator() # we break the parameter specification, and get a ConstraintError with pytest.raises(ConstraintError) as e: val(dict(spec='5')) # but actually, it is a ConstraintErrors instance, and we get the # violation exceptions within the context in which they occurred. 
# here this is a parameter name errors = e.value.errors assert len(errors) == 1 ctx = ParameterConstraintContext(('spec',)) assert ctx in errors assert errors[ctx].constraint == BasicCmdValidator.spec_constraint assert errors[ctx].value == '5' # now we trigger a higher-order error, and receive multiple reports val = SophisticatedCmdValidator() # but first a quick check if it behaves will with valid input valid_input = dict(spec='http://example.com', p1=1, p2=2) assert val(valid_input) == valid_input with pytest.raises(ConstraintError) as e: val(dict(spec='5', p1=1, p2=1), on_error='raise-at-end') errors = e.value.errors assert len(errors) == 3 # the spec-param-only error assert errors.messages[0].startswith('does not match any of') # higher-order issue traces (their order is deterministic) assert 'not all values are unique' == errors.messages[1] assert 'p1, p2 (identity)' == errors.context_labels[1] assert 'p1, p2 (sum)' in errors.context_labels # and now against, but with stop-on-first-error with pytest.raises(ConstraintError) as e: val(dict(spec='5', p1=1, p2=1), on_error='raise-early') errors = e.value.errors # and we only get one! assert len(errors) == 1 # the spec-param-only error assert errors.messages[0].startswith('does not match any of') assert 'not all values are unique' not in errors.messages # now we do it again, but with a valid spec, such that the first # and only error is a higher order error with pytest.raises(ConstraintError) as e: val(dict(spec=5, p1=1, p2=1), on_error='raise-early') errors = e.value.errors assert len(errors) == 1 assert 'not all values are unique' == errors.messages[0] assert 'p1, p2 (identity)' == errors.context_labels[0] # a single-parameter validation error does not lead to a crash # in higher-order validation, instead the latter is performed # when a require argument could not be provided with pytest.raises(ConstraintError) as e: # p1 must be int|str val(dict(spec=5, p1=None, p2=1), on_error='raise-at-end') def test_invalid_multi_validation(): val = BrokenJointValidation() # this works for the underlying validator, but BrokenJointValidation # butchers the result, which must be detected valid_input = dict(spec='http://example.com', p1=1, p2=2) with pytest.raises(RuntimeError): val(valid_input) def test_cmd_with_validation(no_result_rendering): target_urls = ['http://example.com', 'file:///dev/null'] target_url_path_maps = [ {'http://example.com': Path('some/dir/file')}, {'file:///dev/null': Path('/dev/null')}, ] json_lines = '{"http://example.com":"some/dir/file"}\n' \ '{"file:///dev/null":"/dev/null"}' for input, target in ( # perfect input (target_urls, target_urls), (target_url_path_maps, target_url_path_maps), # actual invput conversion ([{'http://example.com': 'some/dir/file'}, {'file:///dev/null': '/dev/null'}], target_url_path_maps), # custom mapping syntax (['http://example.com\tsome/dir/file', 'file:///dev/null\t/dev/null'], target_url_path_maps), # JSON lines (['{"http://example.com":"some/dir/file"}', '{"file:///dev/null":"/dev/null"}'], target_url_path_maps), # from file with JSON lines (StringIO(json_lines), target_url_path_maps), ): res = CmdWithValidation.__call__( input, return_type='item-or-list', ) assert 'spec' in res assert res['spec'] == target # read from file if not on_windows: # on windows the write-rewind-test logic is not possible # (PermissionError) -- too lazy to implement a workaround with NamedTemporaryFile('w+') as f: f.write(json_lines) f.seek(0) res = CmdWithValidation.__call__( f.name, return_type='item-or-list', ) 
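            # The temporary file's path is accepted here because the command's
            # spec constraint includes EnsureGeneratorFromFileLike, which reads
            # the named file and validates one item per line. A minimal
            # standalone sketch of that behavior (illustration only):
            #
            #   constraint = EnsureGeneratorFromFileLike(
            #       BasicCmdValidator.spec_item_constraint)
            #   items = list(constraint(f.name))  # one mapping per JSON line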
assert res['spec'] == target_url_path_maps with patch("sys.stdin", StringIO(json_lines)): res = CmdWithValidation.__call__( '-', return_type='item-or-list', ) assert res['spec'] == target_url_path_maps # and now something that fails # TODO error reporting should be standardized (likely) on an explicit # and dedicated exception type # https://github.com/datalad/datalad/issues/7167 with pytest.raises(ValueError): CmdWithValidation.__call__( 'unsupported', return_type='item-or-list', ) # no call with a required argument missing with pytest.raises(ValueError) as e: CmdWithValidation.__call__() exc_rendering = str(e.value) # must label the issue correctly assert 'missing required argument' in exc_rendering # must identify the missing argument assert 'spec=' in exc_rendering # # test dataset tailoring # class EnsureUUID(Constraint): def __call__(self, value): return UUID(value) class EnsureDatasetID(EnsureUUID): """Makes sure that something is a dataset ID (UUID), or the dataset UUID of a particular dataset when tailored""" def for_dataset(self, dsarg): return EnsureValue(UUID(dsarg.ds.id)) class DsTailoringValidator(EnsureCommandParameterization): def __init__(self, **kwargs): # this is the key bit: a mapping of parameter names to validators super().__init__( dict( dataset=EnsureDataset(), id=EnsureDatasetID(), ), **kwargs ) def test_constraint_dataset_tailoring(existing_dataset): proper_uuid = '152f4fc0-b444-11ed-a9cb-701ab88b716c' no_uuid = '152f4fc0------11ed-a9cb-701ab88b716c' # no tailoring works as expected val = DsTailoringValidator() assert val(dict(id=proper_uuid)) == dict(id=UUID(proper_uuid)) with pytest.raises(ValueError): val(dict(id=no_uuid)) # adding a dataset to the mix does not change a thing re the uuid ds = existing_dataset res = val(dict(dataset=ds.pathobj, id=proper_uuid)) assert res['id'] == UUID(proper_uuid) assert res['dataset'].ds == ds # and we can still break it with pytest.raises(ValueError): val(dict(dataset=ds.pathobj, id=no_uuid)) # now with tailoring the UUID checking to a particular dataset. 
# it is enabled via parameter, because it is a use case specific # choice, not a mandate target_uuid = ds.id tailoring_val = DsTailoringValidator( tailor_for_dataset=dict(id='dataset'), ) # no uuid is still an issue with pytest.raises(ValueError): tailoring_val(dict(dataset=ds.pathobj, id=no_uuid)) # what was good enough above (any UUID), no longer is with pytest.raises(ValueError): tailoring_val(dict(dataset=ds.pathobj, id=proper_uuid)) # only the actual dataset's UUID makes it past the gates res = val(dict(dataset=ds.pathobj, id=target_uuid)) assert res['id'] == UUID(target_uuid) assert res['dataset'].ds == ds # the order in the kwargs does not matter assert val(dict(id=target_uuid, dataset=ds.pathobj))['id'] == \ UUID(target_uuid) # but when no dataset is being provided (and the dataset-constraint # allows for that), no tailoring is performed assert tailoring_val(dict(id=proper_uuid))['id'] == UUID(proper_uuid) # but still no luck with invalid args with pytest.raises(ValueError): val(dict(dataset=ds.pathobj, id=no_uuid)) datalad-next-1.4.1/datalad_next/constraints/tests/test_compound.py000066400000000000000000000224631462321624600254760ustar00rootroot00000000000000from inspect import isgenerator from io import StringIO import pytest from tempfile import NamedTemporaryFile from unittest.mock import patch from pathlib import Path from datalad_next.consts import on_windows from datalad_next.exceptions import CapturedException from ..base import DatasetParameter from ..basic import ( EnsureInt, EnsureBool, EnsurePath, ) from ..compound import ( ConstraintWithPassthrough, EnsureIterableOf, EnsureListOf, EnsureTupleOf, EnsureMapping, EnsureGeneratorFromFileLike, WithDescription, ) from ..exceptions import ConstraintError # imported from ancient test code in datalad-core, # main test is test_EnsureIterableOf def test_EnsureTupleOf(): c = EnsureTupleOf(str) assert c(['a', 'b']) == ('a', 'b') assert c(['a1', 'b2']) == ('a1', 'b2') assert c.short_description() == "tuple()" # imported from ancient test code in datalad-core, # main test is test_EnsureIterableOf def test_EnsureListOf(): c = EnsureListOf(str) assert c(['a', 'b']) == ['a', 'b'] assert c(['a1', 'b2']) == ['a1', 'b2'] assert c.short_description() == "list()" assert repr(c) == \ "EnsureListOf(item_constraint=, min_len=None, max_len=None)" def test_EnsureIterableOf(): c = EnsureIterableOf(list, int) assert c.short_description() == "()" assert c.item_constraint == int # testing aspects that are not covered by test_EnsureListOf tgt = [True, False, True] assert EnsureIterableOf(list, bool)((1, 0, 1)) == tgt assert EnsureIterableOf(list, bool, min_len=3, max_len=3)((1, 0, 1)) == tgt with pytest.raises(ValueError): # too many items EnsureIterableOf(list, bool, max_len=2)((1, 0, 1)) with pytest.raises(ValueError): # too few items EnsureIterableOf(list, bool, min_len=4)((1, 0, 1)) with pytest.raises(ValueError): # invalid specification min>max EnsureIterableOf(list, bool, min_len=1, max_len=0) with pytest.raises(ValueError): # item_constraint fails EnsureIterableOf(list, dict)([5.6, 3.2]) with pytest.raises(ValueError): # item_constraint fails EnsureIterableOf(list, EnsureBool())([5.6, 3.2]) seq = [3.3, 1, 2.6] def _mygen(): for i in seq: yield i def _myiter(iter): for i in iter: yield i # feeding a generator into EnsureIterableOf and getting one out assert list(EnsureIterableOf(_myiter, int)(_mygen())) == [3, 1, 2] def test_EnsureMapping(dataset): true_key = 5 true_value = False constraint = EnsureMapping(EnsureInt(), EnsureBool(), 
delimiter='::') # called without a mapping type with pytest.raises(ValueError): constraint(true_key) assert 'mapping of int -> bool' in constraint.short_description() assert repr(constraint) == \ "EnsureMapping(key=EnsureInt(), value=EnsureBool(), delimiter='::')" # must all work for v in ('5::no', [5, 'false'], ('5', False), {'5': 'False'}, ): d = constraint(v) assert isinstance(d, dict) assert len(d) == 1 k, v = d.popitem() assert k == true_key assert v == true_value # must all fail for v in ('5', [], tuple(), {}, # additional value [5, False, False], {'5': 'False', '6': True}): with pytest.raises(ValueError): d = constraint(v) # test for_dataset() # smoketest ds = dataset cds = constraint.for_dataset(ds) assert cds._key_constraint == constraint._key_constraint.for_dataset(ds) assert cds._value_constraint == \ constraint._value_constraint.for_dataset(ds) # test that the path is resolved for the dataset pathconstraint = \ EnsureMapping(key=EnsurePath(), value=EnsureInt()).for_dataset( DatasetParameter(ds.pathobj, ds)) assert pathconstraint('some:5') == {(Path.cwd() / 'some'): 5} pathconstraint = \ EnsureMapping(key=EnsurePath(), value=EnsurePath()).for_dataset( DatasetParameter(ds, ds)) assert pathconstraint('some:other') == \ {(ds.pathobj / 'some'): (ds.pathobj / 'other')} def test_EnsureGeneratorFromFileLike(): item_constraint = EnsureMapping(EnsureInt(), EnsureBool(), delimiter='::') constraint = EnsureGeneratorFromFileLike(item_constraint) assert 'items of type "mapping of int -> bool" read from a file-like' \ == constraint.short_description() assert repr(constraint) == \ "EnsureGeneratorFromFileLike(" \ "item_constraint=EnsureMapping(key=EnsureInt(), " \ "value=EnsureBool(), delimiter='::'))" c = constraint(StringIO("5::yes\n1234::no\n")) assert isgenerator(c) assert list(c) == [{5: True}, {1234: False}] # missing final newline is not a problem c = constraint(StringIO("5::yes\n1234::no")) assert list(c) == [{5: True}, {1234: False}] # item constraint violation invalid_input = StringIO("1234::BANG\n5::yes") # immediate raise is default with pytest.raises(ValueError) as e: list(constraint(invalid_input)) assert 'be convertible to boolean' in str(e) # but optionally it yields the exception to be able to # continue and enable a caller to raise/report/ignore # (must redefine `invalid_input` to read from start) invalid_input = StringIO("1234::BANG\n5::yes") res = list( EnsureGeneratorFromFileLike( item_constraint, exc_mode='yield', )(invalid_input) ) # we get the result after the exception occurred assert isinstance(res[0], CapturedException) assert res[1] == {5: True} # read from STDIN with patch("sys.stdin", StringIO("5::yes\n1234::no")): assert list(constraint('-')) == [{5: True}, {1234: False}] with patch("sys.stdin", StringIO("5::yes\n1234::no")): # will unpack a length-1 sequence for convenience assert list(constraint(['-'])) == [{5: True}, {1234: False}] # read from file if not on_windows: # on windows the write-rewind-test logic is not possible # (PermissionError) -- too lazy to implement a workaround with NamedTemporaryFile('w+') as f: f.write("5::yes\n1234::no") f.seek(0) assert list(constraint(f.name)) == [{5: True}, {1234: False}] # invalid file with pytest.raises(ValueError) as e: list(constraint('pytestNOTHEREdatalad')) def test_ConstraintWithPassthrough(dataset): wrapped = EnsureInt() cwp = ConstraintWithPassthrough(wrapped, passthrough='mike') # main purpose assert cwp('mike') == 'mike' assert cwp('5') == 5 # most info is coming straight from `wrapped`, the pass-through is 
# meant to be transparent assert str(cwp) == str(wrapped) assert cwp.short_description() == wrapped.short_description() assert cwp.long_description() == wrapped.long_description() # but repr reveals it assert repr(cwp).startswith('ConstraintWithPassthrough(') # tailoring for a dataset keeps the pass-through ds = dataset cwp_ds = cwp.for_dataset(ds) assert cwp_ds.passthrough == cwp.passthrough assert cwp.constraint == wrapped.for_dataset(ds) def test_WithDescription(dataset): wrapped = EnsureInt() # confirm starting point assert wrapped.input_synopsis == 'int' assert wrapped.input_description \ == "value must be convertible to type 'int'" # we are actually not replacing anything c = WithDescription(wrapped) assert c.input_synopsis == wrapped.input_synopsis assert c.input_description == wrapped.input_description # with no dataset docs, the wrapping is removed on tailoring ds = dataset assert isinstance( c.for_dataset(DatasetParameter(None, ds)), EnsureInt) # check all replacements are working c = WithDescription( wrapped, input_synopsis='mysynopsis', input_description='mydescription', input_synopsis_for_ds='dssynopsis', input_description_for_ds='dsdescription', error_message='myerror', error_message_for_ds='dserror', ) # function is maintained assert c('5') == 5 assert str(c) == '' assert repr(c) == \ "WithDescription(EnsureInt(), " \ "input_synopsis='mysynopsis', " \ "input_description='mydescription', " \ "input_synopsis_for_ds='dssynopsis', " \ "input_description_for_ds='dsdescription', " \ "error_message='myerror', " \ "error_message_for_ds='dserror')" assert c.constraint is wrapped assert c.input_synopsis == 'mysynopsis' assert c.input_description == 'mydescription' # description propagates through tailoring cds = c.for_dataset(DatasetParameter(None, ds)) assert isinstance(cds, WithDescription) assert cds.input_synopsis == 'dssynopsis' assert cds.input_description == 'dsdescription' # when the wrapped constraint raises, the wrapper # interjects and reports a different error with pytest.raises(ConstraintError) as e: c(None) assert e.value.msg == 'myerror' # legacy functionality c.short_description() == c.input_synopsis c.long_description() == c.input_description datalad-next-1.4.1/datalad_next/constraints/tests/test_exceptions.py000066400000000000000000000034171462321624600260310ustar00rootroot00000000000000from types import MappingProxyType from ..basic import EnsureInt from ..exceptions import ( CommandParametrizationError, ConstraintError, ConstraintErrors, ParameterConstraintContext, ParameterContextErrors, ParametrizationErrors, ) def test_constrainterror_repr(): c = EnsureInt() ce = ConstraintError(c, 'noint', 'yeah, bullshit') assert repr(ce) == \ f"ConstraintError({c!r}, 'noint', 'yeah, bullshit', None)" def test_constrainterrors(): c = EnsureInt() ce = ConstraintError(c, 'noint', 'yeah, bullshit') emap = dict(c1=ce) ces = ConstraintErrors(emap) assert ces.errors == emap assert isinstance(ces.errors, MappingProxyType) assert repr(ces) == f"ConstraintErrors({emap!r})" def test_parametercontext(): assert str(ParameterConstraintContext(('p1',))) == 'Context' assert str(ParameterConstraintContext( ('p1', 'p2'), 'some details', )) == 'Context' def test_parametercontexterrors(): c = EnsureInt() ce = ConstraintError(c, 'noint', 'yeah, bullshit') emap = { ParameterConstraintContext(('c1',)): ce, } pces = ParameterContextErrors(emap) assert pces.items() == emap.items() assert repr(pces) == repr(emap) def test_parameterizationerrors(): c = EnsureInt() ce = ConstraintError(c, 'noint', 
'yeah, bullshit') emap = { ParameterConstraintContext(('c1',)): ce, } pes = ParametrizationErrors(emap) assert str(pes) == """\ 1 parameter constraint violation c1='noint' yeah, bullshit""" # CommandParametrizationError is pretty much the same thing cpes = CommandParametrizationError(emap) assert str(cpes) == """\ 1 command parameter constraint violation c1='noint' yeah, bullshit""" datalad-next-1.4.1/datalad_next/constraints/tests/test_special_purpose.py000066400000000000000000000346221462321624600270470ustar00rootroot00000000000000from io import StringIO import pytest from datalad_next.commands import Parameter from datalad_next.utils import chpwd from ..base import DatasetParameter from ..basic import ( EnsureInt, EnsureStr, NoConstraint, ) from ..compound import EnsureGeneratorFromFileLike from ..dataset import EnsureDataset from ..exceptions import ( ConstraintError, NoDatasetFound, ) from ..formats import ( EnsureJSON, EnsureURL, EnsureParsedURL, ) from ..git import ( EnsureGitRefName, EnsureRemoteName ) from ..parameter_legacy import EnsureParameterConstraint def test_EnsureGitRefName(): assert EnsureGitRefName().short_description() == '(single-level) Git refname' # standard branch name must work assert EnsureGitRefName()('main') == 'main' # normalize is on by default assert EnsureGitRefName()('/main') == 'main' with pytest.raises(ValueError): EnsureGitRefName(normalize=False)('/main') assert EnsureGitRefName(normalize=False)('main') == 'main' # no empty with pytest.raises(ValueError): EnsureGitRefName()('') with pytest.raises(ValueError): EnsureGitRefName()(None) # be able to turn off onelevel with pytest.raises(ValueError): EnsureGitRefName(allow_onelevel=False)('main') assert EnsureGitRefName(allow_onelevel=False)( 'refs/heads/main') == 'refs/heads/main' # refspec pattern off by default with pytest.raises(ValueError): EnsureGitRefName()('refs/heads/*') assert EnsureGitRefName(refspec_pattern=True)( 'refs/heads/*') == 'refs/heads/*' def test_EnsureRemoteName(existing_dataset): # empty sibling name must raise with pytest.raises(ValueError): EnsureRemoteName()('') assert EnsureRemoteName().short_description() == 'Name of a remote' assert EnsureRemoteName( known=True).short_description() == 'Name of a known remote' assert EnsureRemoteName( known=False).short_description() == 'Name of a not-yet-known remote' ds = existing_dataset c = EnsureRemoteName(known=False) tc = c.for_dataset(DatasetParameter(None, ds)) assert tc('newremotename') == 'newremotename' # add a remote ds._repo.add_remote('my-remote', 'here') # check should fail when it shouldn't exist with pytest.raises(ValueError): tc('my-remote') # should work when it should exist c = EnsureRemoteName(known=True) tc = c.for_dataset(DatasetParameter(None, ds)) assert tc('my-remote') == 'my-remote' # but fail with non-existing remote with pytest.raises(ValueError) as e: tc('not-my-remote') assert str(e.value) == "is not a known remote" # return sibling name with no existence checks assert EnsureRemoteName()('anything') == 'anything' def test_EnsureParameterConstraint(): # most basic case, no value constraint c = EnsureParameterConstraint(NoConstraint()) # invalid name with pytest.raises(ValueError): c({'4way': 123}) assert c('so1230_s82me=value') == dict(so1230_s82me='value') # now some from a standard Parameter declaration c = EnsureParameterConstraint.from_parameter( Parameter(), 'whateverdefault') assert c('some=value') == dict(some='value') # want a bool c = EnsureParameterConstraint.from_parameter( 
Parameter(action="store_true"), False) assert c('some=off') == dict(some=False) with pytest.raises(ValueError): c('some=5') c = EnsureParameterConstraint.from_parameter( # argparse specific choice declaration without # any constraint Parameter(choices=['a', 'b']), # but specifically use a default that is not a valid choice None) assert c('choice=a') == dict(choice='a') # default is valid too assert c({'choice': None}) == dict(choice=None) # multi-item values c = EnsureParameterConstraint.from_parameter( Parameter(nargs=2), (None, None)) assert c({'some': [3, 4]}) == dict(some=[3, 4]) with pytest.raises(ValueError): c({'some': 3}) with pytest.raises(ValueError): c({'some': [3, 4, 5]}) # one or more items c = EnsureParameterConstraint.from_parameter( Parameter(nargs='*'), None) # always prefers a list, no item type conversion by default assert c('some=5') == dict(some=['5']) assert c({'some': [5, 2]}) == dict(some=[5, 2]) # empty ok assert c({'some': []}) == dict(some=[]) # at least one item c = EnsureParameterConstraint.from_parameter( Parameter(nargs='+', constraints=EnsureInt()), None) assert c('some=5') == dict(some=[5]) assert c({'some': [5, 2]}) == dict(some=[5, 2]) # empty not ok with pytest.raises(ValueError): c({'some': []}) # complex case of iterables of length 2 c = EnsureParameterConstraint.from_parameter( Parameter(nargs=2, constraints=EnsureInt(), action='append'), None) # no iterable does not violate assert c({'some': []}) == dict(some=[]) assert c({'some': [[3, 2]]}) == dict(some=[[3, 2]]) assert c({'some': [[3, 2], [5, 4]]}) == dict(some=[[3, 2], [5, 4]]) # length mismatch with pytest.raises(ValueError): c({'some': [[3, 2], [1]]}) # no iterable with pytest.raises(ValueError): c({'some': [3, [1, 2]]}) with pytest.raises(ValueError): c({'some': 3}) # overwrite an item constraint and nargs c = EnsureParameterConstraint.from_parameter( Parameter(nargs=2, constraints=EnsureInt(), action='append'), None, item_constraint=EnsureStr(), nargs=1) assert c({'some': ['5']}) == dict(some=['5']) # literal constraint label # this is no longer supported, but still works: test until removed with pytest.deprecated_call(): c = EnsureParameterConstraint.from_parameter( Parameter(), 2, item_constraint='float') assert c('some=3') == dict(some=3.0) with pytest.raises(ValueError), \ pytest.deprecated_call(): EnsureParameterConstraint.from_parameter( Parameter(), 2, item_constraint='unknown') def test_EnsureParameterConstraint_passthrough(): c = EnsureParameterConstraint(EnsureInt(), passthrough=None) # rejects wrong ones with pytest.raises(ValueError): c('p=mike') # accepts correct ones assert c('p=5') == {'p': 5} # and passes through assert c(dict(p=None)) == {'p': None} # even when the actual value constraint would not with pytest.raises(ConstraintError): c.parameter_constraint.constraint(None) # setting is retrievable assert c.passthrough_value is None # now the "same" via from_parameter() c = EnsureParameterConstraint.from_parameter( Parameter(constraints=EnsureInt()), default=None) assert c(dict(p=None)) == {'p': None} assert c('p=5') == {'p': 5} nested_json = """\ {"name": "Alexa", "wins": [["two pair", "4♠"], ["two pair", "9♠"]]} """ nested_json_decoded = { "name": "Alexa", "wins": [["two pair", "4♠"], ["two pair", "9♠"]], } invalid_json = """\ {"name": BOOM!} """ def test_EnsureJSONLines(): constraint = EnsureGeneratorFromFileLike(EnsureJSON()) assert 'items of type "JSON" read from a file-like' \ == constraint.short_description() # typical is "object", but any valid JSON value type must 
work assert list(constraint(StringIO("5"))) == [5] # unicode must work uc = "ΔЙקم๗あ" assert list(constraint(StringIO(f'"{uc}"'))) == [uc] assert list(constraint(StringIO(nested_json))) == [nested_json_decoded] with pytest.raises(ValueError) as e: list(constraint(StringIO(f'{nested_json}\n{invalid_json}'))) url_testcases = { "http://www.google.com": ['netloc','scheme',], "https://www.google.com": ['netloc','scheme',], "http://google.com": ['netloc','scheme',], "https://google.com": ['netloc','scheme',], "www.google.com": ['path',], "google.com": ['path',], "http://www.google.com/~as_db3.2123/134-1a": ['netloc','path','scheme',], "https://www.google.com/~as_db3.2123/134-1a": ['netloc','path','scheme',], "http://google.com/~as_db3.2123/134-1a": ['netloc','path','scheme',], "https://google.com/~as_db3.2123/134-1a": ['netloc','path','scheme',], "www.google.com/~as_db3.2123/134-1a": ['path',], "google.com/~as_db3.2123/134-1a": ['path',], # .co.uk top level "http://www.google.co.uk": ['netloc','scheme',], "https://www.google.co.uk": ['netloc','scheme',], "http://google.co.uk": ['netloc','scheme',], "https://google.co.uk": ['netloc','scheme',], "www.google.co.uk": ['path',], "google.co.uk": ['path',], "http://www.google.co.uk/~as_db3.2123/134-1a": ['netloc','path','scheme',], "https://www.google.co.uk/~as_db3.2123/134-1a": ['netloc','path','scheme',], "http://google.co.uk/~as_db3.2123/134-1a": ['netloc','path','scheme',], "https://google.co.uk/~as_db3.2123/134-1a": ['netloc','path','scheme',], "www.google.co.uk/~as_db3.2123/134-1a": ['path',], "google.co.uk/~as_db3.2123/134-1a": ['path',], "https://...": ['netloc', 'scheme',], "https://..": ['netloc', 'scheme',], "https://.": ['netloc', 'scheme',], "file:///mike/was/here": ['path','scheme',], "https://.google.com": ['netloc','scheme',], "https://..google.com": ['netloc','scheme',], "https://...google.com": ['netloc','scheme',], "https://.google..com": ['netloc','scheme',], "https://.google...com": ['netloc','scheme',], "https://...google..com": ['netloc','scheme',], "https://...google...com": ['netloc','scheme',], ".google.com": ['path',], ".google.co.": ['path',], "https://google.co.": ['netloc','scheme',], } def test_EnsureURL(): with pytest.raises(ValueError): # only str input EnsureURL()(5) assert EnsureURL().short_description() == 'URL' assert EnsureURL( required=['scheme', 'netloc'] ).short_description() == "URL with required ['scheme', 'netloc'] component(s)" assert EnsureURL( forbidden=['fragment'] ).short_description() == "URL with no ['fragment'] component(s)" assert EnsureURL( # yes, it need not make sense required=['a'], forbidden=['b'] ).short_description() == "URL with required ['a'] and with no ['b'] component(s)" any_url = EnsureURL() for tc in url_testcases.keys(): any_url(tc) for t in ['netloc', 'path', 'scheme']: cnotag = EnsureURL(forbidden=[t]) cnotag_parsed = EnsureParsedURL(forbidden=[t]) for url, tags in url_testcases.items(): if t in tags: with pytest.raises(ConstraintError) as e: cnotag(url) assert f"forbidden '{t}'" in str(e.value) else: cnotag(url) cnotag_parsed(url) ctag = EnsureURL(required=[t]) ctag_parsed = EnsureParsedURL(required=[t]) for url, tags in url_testcases.items(): if t not in tags: with pytest.raises(ConstraintError) as e: ctag(url) assert f"missing '{t}'" in str(e.value) else: ctag(url) ctag_parsed(url) def test_EnsureURL_match(): # must contain a UUID c = EnsureURL( match='^.*([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}).*$', ) with pytest.raises(ValueError): c('http://example.com') # 
it does not matter where it is for url in ( 'https://s.kg.eb.eu/i/a8932c7e-063c-4131-ab96-996d843998e9', 'ssh://4ac9f0bc-560d-47e0-8916-7b24da9bb0ce.com/home', ): c(url) def test_EnsureDataset(tmp_path, no_result_rendering): with pytest.raises(ValueError): # will not return a Dataset from sensless input EnsureDataset()(5) # by default the installation state is not checked # this matches the behavior of the original implementation # from datalad-core assert EnsureDataset()(tmp_path).ds.pathobj == tmp_path # this is same with auto-discovery of a dataset from CWD # (ie. with None as the argument) with chpwd(tmp_path): assert EnsureDataset()(None).ds.pathobj == tmp_path assert EnsureDataset(installed=False)(None).ds.pathobj == tmp_path # unless installation-verification is turned on with pytest.raises(NoDatasetFound): EnsureDataset(installed=True)(None) # any return value created from not-a-dataset-instance # has the original argument as an attribute assert EnsureDataset()(tmp_path).original == tmp_path # but it can be turned on, and then yields the specific # exception that datalad-core's require_dataset() would # give with pytest.raises(NoDatasetFound): EnsureDataset(installed=True)('/nothere_datalad_test') # we can also ensure absence assert EnsureDataset(installed=False)(tmp_path).ds.pathobj == tmp_path # absence detection with a dataset instance with pytest.raises(ValueError): EnsureDataset(installed=True)( # this provides the instance for testing EnsureDataset()(tmp_path).ds ) # # tmp_path has a dataset from here # # create a dataset, making sure it did not exist before ds = EnsureDataset(installed=False)(tmp_path).ds.create() assert EnsureDataset()(ds).ds == ds assert EnsureDataset()(ds).original == ds # existence verified assert EnsureDataset(installed=True)(ds).ds.pathobj == tmp_path # check presence detection with path with pytest.raises(ValueError): EnsureDataset(installed=False)(tmp_path) # check presence detection and with dataset instance with pytest.raises(ValueError): EnsureDataset(installed=False)(ds) assert EnsureDataset().short_description() == '(path to) a Dataset' assert EnsureDataset( installed=True).short_description() == '(path to) an existing Dataset' assert EnsureDataset( installed=False).short_description() == \ '(path to) a non-existing Dataset' # smoke test for idcheck: assert EnsureDataset(require_id=True)(ds).ds == ds assert EnsureDataset(require_id=False)(ds).ds == ds # unset the dataset ID to test whether an ID check would raise, but # bring it back later in case future tests need it id = ds.config.get('datalad.dataset.id') ds.config.unset('datalad.dataset.id', scope='branch') with pytest.raises(ValueError): EnsureDataset(require_id=True)(tmp_path) datalad-next-1.4.1/datalad_next/constraints/tests/test_tutorial.py000066400000000000000000000040301462321624600255030ustar00rootroot00000000000000"""Build a complete (minimal) command that implements batch-mode But without any batch-mode code inside the command implementation """ from io import StringIO from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, Parameter, build_doc, eval_results, get_status_dict, ) from datalad_next.exceptions import CapturedException from datalad_next.constraints import ( EnsureGeneratorFromFileLike, EnsureJSON, ) @build_doc class DoBatch(ValidatedInterface): """Explainer!""" _validator_ = EnsureCommandParameterization(dict( # TODO add constraint that checks composition # of each JSON-line source=EnsureGeneratorFromFileLike( EnsureJSON(), 
exc_mode='yield', ), )) _params_ = dict( source=Parameter(args=('source',)), ) @staticmethod @eval_results def __call__(source): for item in source: if isinstance(item, CapturedException): yield get_status_dict( action='dobatch', status='error', exception=item, ) continue yield get_status_dict( action='dobatch', status='ok', selected=item.get('this'), ) def test_dobatch(monkeypatch, no_result_rendering): data_in = '{"this":[1,2,3],"noise":"some"}\n{"this":true}' monkeypatch.setattr('sys.stdin', StringIO(data_in)) res = DoBatch.__call__('-') assert len(res) == 2 assert res[0]['selected'] == [1, 2, 3] assert res[1]['selected'] is True # now we have an intermediate error monkeypatch.setattr('sys.stdin', StringIO('bug\n' + data_in)) res = DoBatch.__call__( '-', on_failure='ignore') assert len(res) == 3 assert res[0]['status'] == 'error' assert 'Expecting value' in res[0]['error_message'] # second one has the data now assert res[1]['selected'] == [1, 2, 3] assert res[2]['selected'] is True datalad-next-1.4.1/datalad_next/constraints/utils.py000066400000000000000000000020271462321624600226030ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 et: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Helper for parameter validation, documentation and conversion""" __docformat__ = 'restructuredtext' import re def _strip_typerepr(s): """Strip away and decorations for docstrings """ return re.sub(r"<(class|type) '(\S+)'>", r'\2', s) def _type_str(t): """Get string human-readable representation of a data type If type (t) is given as a tuple, assume ability to choose any of the listed types, so those types listing get joined with | """ if isinstance(t, tuple): s = ' or '.join(map(_type_str, t)) return ("(%s)" % s) if len(t) > 1 else s return _strip_typerepr(str(t)) datalad-next-1.4.1/datalad_next/consts/000077500000000000000000000000001462321624600200325ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/consts/__init__.py000066400000000000000000000014071462321624600221450ustar00rootroot00000000000000"""Common constants COPY_BUFSIZE ``shutil`` buffer size default, with Windows platform default changes backported from Python 3.10. PRE_INIT_COMMIT_SHA SHA value for ``git hash-object -t tree /dev/null``, i.e. for nothing. This corresponds to the state of a Git repository before the first commit is made. on_linux ``True`` if executed on the Linux platform. on_windows ``True`` if executed on the Windows platform. """ # import from "utils", but these really are constants from datalad.utils import ( on_linux, on_windows, ) try: from shutil import COPY_BUFSIZE except ImportError: # pragma: no cover # too old # from PY3.10 COPY_BUFSIZE = 1024 * 1024 if on_windows else 64 * 1024 from datalad.consts import PRE_INIT_COMMIT_SHA datalad-next-1.4.1/datalad_next/credman/000077500000000000000000000000001462321624600201325ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/credman/__init__.py000066400000000000000000000003501462321624600222410ustar00rootroot00000000000000"""Credential management .. currentmodule:: datalad_next.credman .. 
autosummary:: :toctree: generated CredentialManager verify_property_names """ from .manager import ( CredentialManager, verify_property_names, ) datalad-next-1.4.1/datalad_next/credman/manager.py000066400000000000000000001255441462321624600221310ustar00rootroot00000000000000# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See LICENSE file distributed along with the datalad_osf package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Credential management and query""" # allow for |-type UnionType declarations from __future__ import annotations __docformat__ = 'restructuredtext' __all__ = ['CredentialManager'] from collections.abc import Set from datetime import datetime import logging import re from typing import ( Dict, List, Tuple, ) import datalad from datalad_next.config import ConfigManager from datalad_next.exceptions import ( CapturedException, CommandError, ) from datalad_next.uis import ui_switcher as ui lgr = logging.getLogger('datalad.credman') class CredentialManager(object): """Facility to get, set, remove and query credentials. A credential in this context is a set of properties (key-value pairs) associated with exactly one secret. At present, the only backend for secret storage is the Python keyring package, as interfaced via a custom DataLad wrapper. Store for credential properties is implemented using DataLad's (i.e. Git's) configuration system. All properties are stored in the `global` (i.e., user) scope under configuration items following the pattern:: datalad.credential.. where ```` is a credential name/identifier, and ```` is an arbitrarily named credential property, whose name must follow the git-config syntax for variable names (case-insensitive, only alphanumeric characters and ``-``, and must start with an alphabetic character). Create a ``CredentialManager`` instance is fast, virtually no initialization needs to be performed. All internal properties are lazily evaluated. This facilitates usage in code where it is difficult to incorporate a long-lived central instance. API With one exception, all parameter names of methods in the core API outside ``**kwargs`` must have a ``_`` prefix that distinguishes credential properties from method parameters. The one exception is the ``name`` parameter, which is used as a primary identifier (albeit being optional for some operations). The ``obtain()`` method is provided as an additional convenience, and implements a standard workflow for obtaining a credential in a wide variety of scenarios (credential name, credential properties, secret either respectively already known or yet unknown). """ valid_property_names_regex = re.compile(r'[a-z0-9]+[a-z0-9-]*$') secret_names = { 'user_password': 'password', } def __init__(self, cfg: ConfigManager | None = None): """ Parameters ---------- cfg: ConfigManager, optional If given, all configuration queries are performed using this ``ConfigManager`` instance. Otherwise ``datalad.cfg`` is used. """ self.__cfg = cfg self.__cred_types = None self.__keyring = None # main API # # # Design remark: # This is the keyhole through which any retrieval-related processing goes # (get, query, obtain). Any normalization performed in here will # automatically also effect these other operations. 
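    #
    # A minimal usage sketch of this retrieval API (illustration only; the
    # credential name and the 'realm' value are hypothetical examples):
    #
    #   credman = CredentialManager()
    #   cred = credman.get('my-github-token', _prompt='Token please!')
    #   name, cred = credman.obtain(query_props={'realm': 'mysecretlair'})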
# def get(self, name=None, *, _prompt=None, _type_hint=None, **kwargs): """Get properties and secret of a credential. This is a read-only method that never modifies information stored on a credential in any backend. Credential property lookup is supported via a number approaches. When providing ``name``, all existing corresponding configuration items are found and reported, and an existing secret is retrieved from name-based secret backends (presently ``keyring``). When providing a ``type`` property or a ``_type_hint`` the lookup of additional properties in the keyring-backend is enabled, using predefined property name lists for a number of known credential types. For all given property keys that have no value assigned after the initial lookup, manual/interactive entry is attempted, whenever a custom ``_prompt`` was provided. This include requesting a secret. If manually entered information is contained in the return credential record, the record contains an additional ``_edited`` property with a value of ``True``. If no secret is known after lookup and a potential manual data entry, a plain ``None`` is returned instead of a full credential record. Parameters ---------- name: str, optional Name of the credential to be retrieved _prompt: str or None Instructions for credential entry to be displayed when missing properties are encountered. If ``None``, manual entry is disabled. _type_hint: str or None In case no ``type`` property is included in ``kwargs``, this parameter is used to determine a credential type, to possibly enable further lookup/entry of additional properties for a known credential type **kwargs: Credential property name/value pairs to overwrite/amend potentially existing properties. For any property with a value of ``None``, manual data entry will be performed, unless a value could be retrieved on lookup, or prompting was not enabled. Returns ------- dict or None Return ``None``, if no secret for the credential was found or entered. Otherwise returns the complete credential record, comprising all properties and the secret. An additional ``_edited`` key with a value of ``True`` is added whenever the returned record contains manually entered information. Raises ------ ValueError When the method is called without any information that could be used to identify a credential """ if name is None and _type_hint is None and not kwargs: # there is no chance that this could work raise ValueError( 'CredentialManager.get() called without any identifying ' 'information') if name is None: # there is no chance we can retrieve any stored properties # but we can prompt for some below cred = {} else: # if we have a chance to query for stored legacy credentials # we do this first to have the more modern parts of the # system overwrite them reliably cred = self._get_legacy_credential_from_keyring( name, type_hint=kwargs.get('type', _type_hint), ) or {} # and now take whatever we got from the legacy store and update # it from what we have in the config cred_update = self._get_credential_from_cfg(name) if set(cred_update.keys()) >= set( k for k in cred.keys() if k != '_from_backend'): # we are overwriting all info from a possible legacy credential # take marker off cred.pop('_from_backend', None) cred.update(cred_update) # we merge existing credential properties with overrides. # we are only adding 'enter-please' markers (i.e. 
None) for properties # that have no known value yet cred.update(**{k: v for k, v in kwargs.items() if v is not None or k not in cred}) # final word on the credential type, if there is any type info at all # `cred` will have a `type` property after this self._assign_credential_type(cred, _type_hint) # determine what is missing and possibly prompt for it self._complete_credential_props(name, cred, _prompt) if not cred.get('secret'): # no secret, no credential if any(not p.startswith('_') for p in cred): lgr.debug( 'Not reporting on credential fragment ' '(name=%r) with no secret: %r', name, cred, ) return return cred def set(self, name, *, _lastused=False, _suggested_name=None, _context=None, **kwargs): """Set credential properties and secret Presently, all supported backends require the specification of a credential ``name`` for storage. This may change in the future, when support for alternative backends is added, at which point the ``name`` parameter would become optional. All properties provided as `kwargs` with keys not starting with `_` and with values that are not ``None`` will be stored. If ``kwargs`` do not contain a ``secret`` specification, manual entry will be attempted. The associated prompt with be either the name of the ``secret`` field of a known credential (as identified via a ``type`` property), or the label ``'secret'``. All properties with an associated value of ``None`` will be removed (unset). Parameters ---------- name: str or None Credential name. If None, the name will be prompted for and setting the credential is skipped if no name is provided. _lastused: bool, optional If set, automatically add an additional credential property ``'last-used'`` with the current timestamp in ISO 8601 format. _suggested_name: str, optional If `name` is None, this name (if given) is presented as a default suggestion that can be accepted without having to enter it manually. If this name suggestion conflicts with an existing credential, it is ignored and not presented as a suggestion. _context: str, optional If given, will be included in the prompt for a missing credential name to provide context for a user. It should be written to fit into a parenthical statement after "Enter a name to save the credential (...)", e.g. "for download from ". **kwargs: Any number of credential property key/value pairs to set (update), or remove. With one exception, values of ``None`` indicate removal of a property from a credential. However, ``secret=None`` does not lead to the removal of a credential's secret, because it would result in an incomplete credential. Instead, it will cause a credential's effective ``secret`` property to be written to the secret store. The effective secret might come from other sources, such as particular configuration scopes or environment variables (i.e., matching the ``datalad.credential..secret`` configuration item. Properties whose names start with an underscore are automatically removed prior storage. Returns ------- dict or None key/values of all modified credential properties with respect to their previously recorded values. None is returned in case a user did not enter a missing credential name. If a user entered a credential name, it is included in the returned dictionary under the 'name' key. Raises ------ RuntimeError This exception is raised whenever a property cannot be removed successfully. Likely cause is that it is defined in a configuration scope or backend for which write-access is not supported. 
ValueError When property names in kwargs are not syntax-compliant. """ updated = {} if not name: known_credentials = self._get_known_credential_names() if _suggested_name in known_credentials: # ignore name suggestion, conflicts with existing credential _suggested_name = None prompt = 'Enter a name to save the credential' if _context: prompt = f'{prompt} ({_context})' prompt = f"{prompt} securely for future reuse, " \ "or 'skip' to not save the credential" if _suggested_name: prompt = f'{prompt}, or leave empty to accept the name ' \ f'{_suggested_name!r}' while not name: entered = self._ask_property('name', prompt=prompt) if entered == 'skip': return elif entered is None: # the user was no able to enter a value (non-interactive # session). we raise to not let this go unnoticed. raise ValueError('no credential name provided for setting') elif not entered: if not _suggested_name: ui.message('Cannot proceed without a credential name') continue # otherwise take the default entered = _suggested_name if entered in known_credentials: ui.message( f'A credential with the name {entered!r} already ' 'exists, please provide a different name.') else: name = entered updated['name'] = name # we strip internal properties, such as '_edited' automatically # forcing each caller to to this by hand is kinda pointless, if # they can never be stored anyway, and e.g. a previous `get()` # would include one for any credentials that was manually entered kwargs = self._strip_internal_properties(kwargs) # check syntax for the rest verify_property_names(kwargs) # retrieve the previous credential state, so we can merge with the # incremental changes, and report on effective updates prev_cred = self.get( name=name, # we never want any manual interaction at this point _prompt=None, # if we know the type, hence we can do a query for legacy secrets # and properties. This will migrate them to the new setup # over time _type_hint=kwargs.get('type'), ) # merge incoming with existing properties to create an updated # credential if prev_cred: prev_cred = self._strip_internal_properties(prev_cred) cred = dict(prev_cred, **kwargs) else: cred = dict(kwargs) # update last-used, if requested if _lastused: cred['last-used'] = datetime.now().isoformat() # remove props # remove_props = [ k for k, v in cred.items() # can we really know that no 'secret' field was deposited # in the config backend? MIH does not think so. However, # secret=None has special semantics (update secret store # from config), hence we cannot use it to perform removal # of secrets from config here. # MIH did not find a rational for this setup. It was already # part of the original implementation. At least this is # documented in the `kwargs` docstring now. if v is None and k != 'secret'] self._unset_credprops_anyscope(name, remove_props) updated.update(**{k: None for k in remove_props}) # set non-secret props # set_props = { k: v for k, v in cred.items() if v is not None and k != 'secret' } for k, v in set_props.items(): var = _get_cred_cfg_var(name, k) if self._cfg.get(var) == v: # desired value already exists, we are not # storing again to preserve the scope it # was defined in continue # we always write to the global scope (ie. 
user config) # credentials are typically a personal, not a repository # specific entity -- likewise secrets go into a personal # not repository-specific store # for custom needs users can directly set the respective # config self._cfg.set(var, v, scope='global', force=True, reload=False) updated[k] = v if set_props: self._cfg.reload() # set secret # # we aim to update the secret in the store, hence we must # query for a previous setting in order to reliably report # updates prev_secret = self._get_secret(prev_cred) if prev_cred else None if 'secret' not in cred: # we have no removal directive, reuse previous secret cred['secret'] = prev_secret if cred.get('secret') is None: # we want to reset the secret, consider active config # (which was already queried when retrieving the previous # credential above) cred['secret'] = prev_secret if cred.get('secret') is None: # we have no secret specified or in the store already: ask # typically we would end up here with an explicit attempt # to set a credential in a context that is known to an # interactive user, hence the messaging here can be simple cred['secret'] = self._ask_secret( CredentialManager.secret_names.get(cred.get('type'), 'secret')) # at this point we will have a secret. it could be from ENV # or provided, or entered. we always want to put it in the # store # we never ever write a secret to any other field-name than # 'secret' if cred['secret'] != self._keyring.get(name, 'secret'): # only report updated if actually different from before. # and "before" is what was in the secret store, because # we will write to it next. A secret could have been # provided via an ENV var, hence even with no change from # `prev_cred` there could be a change in the secret store updated['secret'] = cred['secret'] # TODO make sure that there actually is a secret that is written # and not None self._keyring.set(name, 'secret', cred['secret']) return updated def remove(self, name, *, type_hint=None): """Remove a credential, including all properties and secret Presently, all supported backends require the specification of a credential ``name`` for lookup. This may change in the future, when support for alternative backends is added, at which point the ``name`` parameter would become optional, and additional parameters would be added. Returns ------- bool True if a credential was removed, and False if not (because no respective credential was found). Raises ------ RuntimeError This exception is raised whenever a property cannot be removed successfully. Likely cause is that it is defined in a configuration scope or backend for which write-access is not supported. """ # prefix for all config variables of this credential prefix = _get_cred_cfg_var(name, '') to_remove = [ k[len(prefix):] for k in self._cfg.keys() if k.startswith(prefix) ] removed = False if to_remove: self._unset_credprops_anyscope(name, to_remove) removed = True # delete the secret from the keystore, if there is any def del_field(name, field): global removed try: self._keyring.delete(name, field) removed = True except Exception as e: if self._keyring.get(name, field) is None: # whatever it was, the target is reached CapturedException(e) else: # we could not delete the field raise # pragma: nocover del_field(name, 'secret') if type_hint: # remove legacy records too for field in self._cred_types.get( type_hint, {}).get('fields', []): del_field(name, field) return removed def query_(self, **kwargs): """Query for all (matching) credentials. Credentials are yielded in no particular order. 
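
        A minimal usage sketch (for illustration; the ``realm`` value is a
        hypothetical example)::

            credman = CredentialManager()
            for name, cred in credman.query_(realm='mysecretlair'):
                print(name, cred.get('type'))
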
This method cannot find credentials for which only a secret was deposited in the keyring. This method does support lookup of credentials defined in DataLad's "provider" configurations. Parameters ---------- **kwargs If not given, any found credential is yielded. Otherwise, any credential must match all property name/value pairs Yields ------ tuple(str, dict) The first element in the tuple is the credential name, the second element is the credential record as returned by ``get()`` for any matching credential. """ done = set() known_credentials = set((n, None) for n in self._get_known_credential_names()) from itertools import chain for name, legacy_type_hint in chain( _yield_legacy_credential_types(), known_credentials): done.add(name) cred = self.get(name, _prompt=None, _type_hint=legacy_type_hint) if not cred and legacy_type_hint: # this legacy-type credential is not set. We still want to # report on it, because it is the only way for users that # discover these predefined credential "slots" cred = dict(type=legacy_type_hint) if legacy_type_hint is not None: # leading underscore to distinguish this internal marker from # an actual credential property. # the credentials command will then also treat it as such cred['_from_backend'] = 'legacy' if not cred: # no info on such a credential, not even legacy info # ignore continue if not kwargs: yield (name, cred) else: if all(cred.get(k) == v for k, v in kwargs.items()): yield (name, cred) else: continue def query(self, *, _sortby=None, _reverse=True, **kwargs): """Query for all (matching) credentials, sorted by a property This method is a companion of ``query_()``, and the same limitations regarding credential discovery apply. In contrast to ``query_()``, this method return a list instead of yielding credentials one by one. This returned list is optionally sorted. Parameters ---------- _sortby: str, optional Name of a credential property to provide a value to sort by. Credentials that do not carry the specified property always sort last, regardless of sort order. _reverse: bool, optional Flag whether to sort ascending or descending when sorting. By default credentials are return in descending property value order. This flag does not impact the fact that credentials without the property to sort by always sort last. **kwargs Pass on as-is to ``query_()`` Returns ------- list(str, dict) Each item is a 2-tuple. The first element in each tuple is the credential name, the second element is the credential record as returned by ``get()`` for any matching credential. """ matches = self.query_(**kwargs) if _sortby is None: return list(matches) # this makes sure that any credential that does not have the # sort-by property name sorts to the very end of the list # regardless of whether the sorting is ascending or descending def get_sort_key(x): # x is a tuple as returned by query_() prop_indicator = _sortby in x[1] if not _reverse: prop_indicator = not prop_indicator return (prop_indicator, x[1].get(_sortby)) return sorted(matches, key=get_sort_key, reverse=_reverse) def obtain(self, name: str | None = None, *, prompt: str | None = None, type_hint: str | None = None, query_props: Dict | None = None, expected_props: List | Tuple | None = None): """Obtain a credential by query or prompt (if needed) This convenience method implements a standard workflow to obtain a credential. It supports credential selection by credential name/identifier, and falls back onto querying for a credential matching a set of specified properties (as key-value mappings). 
If no suitable credential is known, a user is prompted to enter one interactively (if possible in the current session). If a credential was entered manually, any given ``type_hint`` will be included as a ``type`` property of the returned credential, and the returned credential has an ``_edited=True`` property. Likewise, any ``realm`` property included in the ``query_props`` is included in the returned credential in this case. If desired, a credential workflow can be completed, after a credential was found to be valid/working, by storing or updating it in the credential store:: cm = CredentialManager() cname, cprops = cm.obtain(...) # verify credential is working ... # set/update cm.set(cname, _lastused=True, **cprops) In the code sketch above, if ``cname`` is ``None`` (as it will be for a newly entered credential, ``set()`` will prompt for a name to store the credential under, and will offer a user the choice to skip storing a credential. For any previously known credential, the ``last-used`` property will be updated to enable preferred selection in future credential discovery attempts via ``obtain()``. Examples -------- Minimal call to get a credential entered (manually):: credman.obtain(type_hint='token', prompt='Credential please!') Without a prompt text no interaction is attempted, and without a type hint it is unknown what (and how much) to enter. Minimal call to retrieve a credential by its identifier:: credman.obtain('my-github-token') Minimal call to retrieve the last-used credential for a particular authentication "realm". In this case "realm" is a property that was previously set to match a particular service/location, and is now used to match credentials against:: credman.obtain(query_props={'realm': 'mysecretlair'}) Parameters ---------- name: str, optional Name of the credential to be retrieved prompt: str, optional Passed to ``CredentialManager.get()`` if a credential name was provided, or no suitable credential could be found by querying. type_hint: str, optional In case no ``type`` property is included in ``query_props``, this parameter is passed to ``CredentialManager.get()``. query_props: dict, optional Credential property to be used for querying for a suitable credential. When multiple credentials match a query, the last-used credential is selected. expected_props: list or tuple, optional When specified, a credential will be inspected to contain properties matching all listed property names, or a ``ValueError`` will be raised. Returns ------- (str, dict) Credential name (possibly different from the input, when a credential was discovered based on properties), and credential properties. Raises ------ ValueError Raised when no matching credential could be found and none was entered. Also raised, when a credential selected from a query result or a manually entered one is missing any of the properties with a name given in ``expected_props``. """ cred = None if not name: if query_props: creds = self.query(_sortby='last-used', **(query_props or {})) if creds: name, cred = creds[0] if not cred: kwargs = dict( # name could be none name=name, _prompt=prompt, _type_hint=type_hint, ) try: cred = self.get(**kwargs) # check if we know the realm, if so include in the credential, realm = (query_props or {}).get('realm') if realm: cred['realm'] = realm if name is None and type_hint: # we are not expecting to retrieve a particular credential. 
# make the type hint the actual type of the credential cred['type'] = type_hint except Exception as e: lgr.debug('Obtaining credential failed: %s', e) if not cred: raise ValueError('No suitable credential found or specified') missing_props = [ ep for ep in (expected_props or []) if ep not in cred ] if any(missing_props): raise ValueError( 'No suitable credential or specified ' f'(missing properties: {missing_props})') return name, cred # internal helpers # def _strip_internal_properties(self, cred: Dict) -> Dict: return {k: v for k, v in cred.items() if not k.startswith('_')} def _assign_credential_type(self, cred, _type_hint=None): """Set 'type' property (in-place)""" _type_hint = cred.get('type', _type_hint) if _type_hint: cred['type'] = _type_hint return # if we get here, we don't know what type this is # let's derive one for a few clear-cut cases where we can be # reasonable sure what type a credential is if set(cred) == set(('token',)): # all we have is a token property -- very likely a token-type # credential. Move the token to the secret property and # assign the type cred['type'] = 'token' cred['secret'] = cred.pop('token') def _complete_credential_props( self, name: str, cred: Dict, prompt: str | None, ) -> None: """Determine missing credential properties, and fill them in What properties are missing is determined based on credential type info, and their values will be prompted for (if a prompt was provided). The given credential is modified in place. """ cred_type = cred.get('type') # import the definition of expected fields from the known # credential types cred_type_def = self._cred_types.get( cred_type, dict(fields=[], secret=None)) required_fields = cred_type_def['fields'] or [] secret_field = cred_type_def['secret'] # mark required fields for this credential type for k in required_fields: if k == secret_field: # do nothing, if this is the secret key continue if k in cred: # do nothing if we have an incoming value for this key already continue # otherwise make sure we prompt for the essential # fields cred[k] = None # - prompt for required but missing prompts # - retrieve a secret prompted = False entered = {} for k, v in cred.items(): if k in ('secret', secret_field): # a secret is either held in a 'secret' field, or in a dedicated field # defined by the cred_type_def. Both are handled below # handled below continue if prompt and v is None: # prevent double-prompting for entering a series of properties # of the same credential v = self._ask_property(k, None if prompted else prompt) if v is not None: prompted = True if v: entered[k] = v # bulk merged, cannot do in-loop above, because we iterate over items() cred.update(entered) # extract the secret, from the assembled properties or a secret store secret = self._get_secret(cred, name=name, secret_field=secret_field) if prompt and secret is None: secret = self._ask_secret( type_hint=secret_field, prompt=None if prompted else prompt, ) if secret: prompted = True if secret: cred['secret'] = secret # report whether there were any edits to the credential record # (incl. 
being entirely new), such that consumers can decide # to save a credentials, once battle-tested if prompted: cred['_edited'] = True def _get_credential_from_cfg(self, name: str) -> Dict: var_prefix = _get_cred_cfg_var(name, '') return { k[len(var_prefix):]: v for k, v in self._cfg.items() if k.startswith(var_prefix) } def _get_known_credential_names(self) -> Set[str]: known_credentials = set( '.'.join(k.split('.')[2:-1]) for k in self._cfg.keys() if k.startswith('datalad.credential.') ) return known_credentials def _ask_property(self, name, prompt=None): if not ui.is_interactive: lgr.debug('Cannot ask for credential property %r in non-interactive session', name) return return ui.question(name, title=prompt) def _ask_secret(self, type_hint=None, prompt=None): if not ui.is_interactive: lgr.debug('Cannot ask for credential secret in non-interactive session') return return ui.question( type_hint or 'secret', title=prompt, repeat=self._cfg.obtain( 'datalad.credentials.repeat-secret-entry'), hidden=self._cfg.obtain( 'datalad.credentials.hidden-secret-entry'), ) def _unset_credprops_anyscope(self, name, keys): """Reloads the config after unsetting all relevant variables This method does not modify the keystore. """ nonremoved_vars = [] for k in keys: var = _get_cred_cfg_var(name, k) if var not in self._cfg: continue try: self._cfg.unset(var, scope='global', reload=False) except CommandError as e: CapturedException(e) try: self._cfg.unset(var, scope='local', reload=False) except CommandError as e: CapturedException(e) nonremoved_vars.append(var) if nonremoved_vars: raise RuntimeError( f"Cannot remove configuration items {nonremoved_vars} " f"for credential, defined outside global or local " "configuration scope. Remove manually") self._cfg.reload() def _get_legacy_credential_from_keyring( self, name: str, type_hint: str | None, ) -> Dict | None: """With a ``type_hint`` given or determined from a known legacy credential, attempts to retrieve a credential comprised of all fields defined in ``self._cred_types``. Otherwise ``None`` is returned. """ if not type_hint: # no type hint given in any form. Last chance is that # this is a known legacy credential. # doing this query is a bit expensive, but getting a # credential is not a high-performance procedure, and # the gain in convenience is substantial -- otherwise # users would need to somehow know what they should be # looking for type_hint = dict(_yield_legacy_credential_types()).get(name) if not type_hint or type_hint not in self._cred_types: return None cred = {} lc = self._cred_types[type_hint] for field in (lc['fields'] or []): if field == lc['secret']: continue val = self._keyring.get(name, field) if val: # legacy credentials used property names with underscores, # but this is no longer syntax-compliant -- fix on read cred[field.replace('_', '-')] = val if not cred: # there is nothing on a legacy credential with this name return None else: # at least some info came from the legacy backend, record that cred['_from_backend'] = 'legacy' cred['type'] = type_hint return cred def _get_secret( self, cred: Dict, name: str | None = None, secret_field: str | None = None, ) -> str | None: """Report a secret Either directly from the set of credential properties, or from a secret store. 
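        The lookup order is: a literal ``secret`` property of the given
        credential, a ``secret`` entry for ``name`` in the secret store, a
        credential property named after ``secret_field``, and finally a
        ``secret_field`` entry for ``name`` in the secret store. The first
        value found is reported.
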
""" # from literal 'secret' property secret = cred.get('secret') if secret: return secret # from secret store under 'secret' label if name: secret = self._keyring.get(name, 'secret') if secret: return secret # `secret_field` property if secret_field: secret = cred.get(secret_field) if secret: return secret # from secret store under `secret_field` label if name and secret_field: secret = self._keyring.get(name, secret_field) if secret: return secret # no secret found anywhere return None @property def _cfg(self): """Return a ConfigManager given to __init__() or the global datalad.cfg """ if self.__cfg: return self.__cfg return datalad.cfg @property def _keyring(self): """Returns the DataLad keyring wrapper This internal property may vanish whenever changes to the supported backends are made. """ if self.__keyring: return self.__keyring from datalad.support.keyring_ import keyring self.__keyring = keyring return keyring @property def _cred_types(self): """Internal property for mapping of credential type names to fields. Returns ------- dict Legacy credential type name ('token', 'user_password', etc.) as keys, and dictionaries as values. Each of these dicts has two keys: 'fields' (the complete list of "fields" that the credential comprises), and 'secret' (the name of the field that represents the secret. If there is no secret, the value associated with that key is ``None``. """ # at present the credential type specifications are built from the # legacy credential types, but this may change at any point in the # future # here is what that was in Mar 2022 # 'user_password': {'fields': ['user', 'password'], # 'secret': 'password'}, # 'token': {'fields': ['token'], 'secret': 'token'}, # 'git': {'fields': ['user', 'password'], 'secret': 'password'} # 'aws-s3': {'fields': ['key_id', 'secret_id', 'session', 'expiration'], # 'secret': 'secret_id'}, # 'nda-s3': {'fields': None, 'secret': None}, # 'loris-token': {'fields': None, 'secret': None}, if self.__cred_types: return self.__cred_types from datalad.downloaders import CREDENTIAL_TYPES mapping = {} for cname, ctype in CREDENTIAL_TYPES.items(): secret_fields = [ f for f in (ctype._FIELDS or {}) if ctype._FIELDS[f].get('hidden') ] mapping[cname] = dict( fields=list(ctype._FIELDS.keys()) if ctype._FIELDS else None, secret=secret_fields[0] if secret_fields else None, ) # an implementation-independent s3-style credential (with the aim to # also work for MinIO and Ceph) mapping['s3'] = dict( # use boto-style names, but strip "aws" prefix, and redundant # non-distinguishing 'key' and 'access' terms fields=['key', 'secret'], secret='secret', ) self.__cred_types = mapping return mapping def _yield_legacy_credential_types(): # query is constrained by non-secrets, no constraints means report all # a constraint means *exact* match on all given properties from datalad.downloaders.providers import ( Providers, CREDENTIAL_TYPES, ) type_hints = {v: k for k, v in CREDENTIAL_TYPES.items()} # ATTN: from Providers.from_config_files() is sensitive to the PWD # it will only read legacy credentials from datasets whenever # PWD is inside a dataset legacy_credentials = set( (p.credential.name, type(p.credential)) # without reload, no changes in files since the last call # will be considered. 
That last call might have happened # in datalad-core, and may have been in another directory for p in Providers.from_config_files(reload=True) if p.credential ) for name, type_ in legacy_credentials: yield (name, type_hints.get(type_)) def verify_property_names(names): """Check credential property names for syntax-compliance. Parameters ---------- names: iterable Raises ------ ValueError When any non-compliant property names were found """ invalid_names = [ k for k in names if not CredentialManager.valid_property_names_regex.match(k) ] if invalid_names: raise ValueError( f'Unsupported property names {invalid_names}, ' 'must match git-config variable syntax (a-z0-9 and - characters)') def _get_cred_cfg_var(name, prop): """Return a config variable name for a credential property Parameters ---------- name : str Credential name prop : str Property name Returns ------- str """ return f'datalad.credential.{name}.{prop}' datalad-next-1.4.1/datalad_next/credman/tests/000077500000000000000000000000001462321624600212745ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/credman/tests/__init__.py000066400000000000000000000000001462321624600233730ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/credman/tests/test_credman.py000066400000000000000000000353761462321624600243340ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # -*- coding: utf-8 -*- # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ import pytest from datalad_next.config import ConfigManager from ..manager import ( CredentialManager, _get_cred_cfg_var, ) from datalad_next.tests import ( assert_in, assert_raises, eq_, ) from datalad_next.utils import chpwd def test_credmanager(tmp_keyring, datalad_cfg, datalad_interactive_ui): ui = datalad_interactive_ui credman = CredentialManager(datalad_cfg) # doesn't work with thing air assert_raises(ValueError, credman.get) eq_(credman.get('donotexiststest'), None) # we get reports as soon as there is a secret available # this makes it possible to discover credential fragments, if only to # expose them for clean-up eq_(credman.get(crazy='empty'), None) eq_(credman.get(crazy='empty', secret='bogus'), {'crazy': 'empty', 'secret': 'bogus'}) # does not fiddle with a secret that is readily provided eq_(credman.get('dummy', secret='mike', _type_hint='token'), dict(type='token', secret='mike')) # remove a secret that has not yet been set eq_(credman.remove('elusive', type_hint='user_password'), False) # but the secret was written to the keystore with pytest.raises(ValueError): credman.set('mycred', forbidden_propname='some') # no instructions what to do, no legacy entry, nothing was changed # but the secret was written to the keystore eq_(credman.set('mycred', secret='some'), dict(secret='some')) # redo but with timestep setprops = credman.set('lastusedcred', _lastused=True, secret='some') assert_in('last-used', setprops) # now re-set, based on the retrieved info, but update the timestamp setprops_new = credman.set('lastusedcred', _lastused=True, **credman.get('lastusedcred')) # must have updated 'last-used' assert setprops['last-used'] != setprops_new['last-used'] # first property store attempt eq_(credman.set('changed', secret='some', prop='val'), dict(secret='some', prop='val')) # second, no changing the secret, but changing the prop, albeit with # the 
same value, change report should be empty eq_(credman.set('changed', prop='val'), dict()) # change secret, with value pulled from config try: datalad_cfg.set('datalad.credential.changed.secret', 'envsec', scope='override') eq_(credman.set('changed', secret=None), dict(secret='envsec')) finally: datalad_cfg.unset('datalad.credential.changed.secret', scope='override') # remove non-existing property, secret not report, because unchanged eq_(credman.set('mycred', dummy=None), dict(dummy=None)) assert _get_cred_cfg_var("mycred", "dummy") not in datalad_cfg # set property eq_(credman.set('mycred', dummy='good', this='that'), dict(dummy='good', this='that')) # ensure set eq_(credman.get('mycred'), dict(dummy='good', this='that', secret='some')) # remove individual property eq_(credman.set('mycred', dummy=None), dict(dummy=None)) # remove individual property that is not actually present eq_(credman.set('mycred', imaginary=None), dict(imaginary=None)) # ensure removal eq_(credman.get('mycred'), dict(this='that', secret='some')) # test full query and constrained query q = list(credman.query_()) # 3 defined here, plus any number of legacy credentials assert len(q) > 3 # now query for one of the creds created above q = list(credman.query_(prop='val')) eq_(len(q), 1) eq_(q[0][0], 'changed') eq_(q[0][1]['prop'], 'val') # and now a query with no match q = list(credman.query_(prop='val', funky='town')) eq_(len(q), 0) # remove complete credential credman.remove('mycred') eq_(credman.get('mycred'), None) # test prompting for a secret when none is given ui.staged_responses.append('mysecret') res = credman.set('mycred', other='prop') assert res == {'other': 'prop', 'secret': 'mysecret'} # test prompting for a name when None is given ui.staged_responses.append('mycustomname') res = credman.set(None, secret='dummy', other='prop') assert res == {'name': 'mycustomname', 'other': 'prop', 'secret': 'dummy'} # test name prompt loop in case of a name collision ui.staged_responses.extend(['mycustomname', 'mycustomname2']) res = credman.set(None, secret='dummy2', other='prop2') assert res == {'name': 'mycustomname2', 'other': 'prop2', 'secret': 'dummy2'} # test skipping at prompt, smoke test _context arg ui.staged_responses.append('skip') res = credman.set( None, _context='for me', secret='dummy', other='prop') assert res is None # accept suggested name ui.staged_responses.append('') res = credman.set( None, _suggested_name='auto1', secret='dummy', other='prop') assert res == {'name': 'auto1', 'other': 'prop', 'secret': 'dummy'} # a suggestion conflicting with an existing credential is like # not making a suggestion at all ui.staged_responses.extend(('', 'auto2')) res = credman.set( None, _suggested_name='auto1', secret='dummy', other='prop') assert res == {'name': 'auto2', 'other': 'prop', 'secret': 'dummy'} def test_credmanager_set_noninteractive( tmp_keyring, datalad_cfg, datalad_noninteractive_ui): credman = CredentialManager(datalad_cfg) # if no name is provided and none _can_ be entered -> raise with pytest.raises(ValueError): credman.set(None, secret='dummy', other='prop') def test_credman_local(existing_dataset): ds = existing_dataset credman = CredentialManager(ds.config) # deposit a credential into the dataset's config, and die trying to # remove it ds.config.set('datalad.credential.stupid.secret', 'really', scope='branch') assert_raises(RuntimeError, credman.remove, 'stupid') # but it manages for the local scope ds.config.set('datalad.credential.notstupid.secret', 'really', scope='local') 
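    # unlike the 'branch' scope above, the 'local' scope is one the credential
    # manager is able to modify (it only ever unsets 'global' and 'local'
    # configuration), so this removal succeeds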
credman.remove('notstupid') def test_query(tmp_keyring, datalad_cfg): credman = CredentialManager(datalad_cfg) # set a bunch of credentials with a common realm AND timestamp for i in range(3): credman.set( f'cred.{i}', _lastused=True, secret=f'diff{i}', realm='http://ex.com/login', ) # now a credential with the common realm, but without a timestamp credman.set( 'cred.no.time', _lastused=False, secret='notime', realm='http://ex.com/login', ) # and the most recent one (with timestamp) is an unrelated one credman.set('unrelated', _lastused=True, secret='unrelated') # smoke test for an unsorted report assert len(credman.query()) > 1 # now we want all credentials that match the realm, sorted by # last-used timestamp -- most recent first slist = credman.query(realm='http://ex.com/login', _sortby='last-used') eq_(['cred.2', 'cred.1', 'cred.0', 'cred.no.time'], [i[0] for i in slist]) # same now, but least recent first, importantly no timestamp stays last slist = credman.query(realm='http://ex.com/login', _sortby='last-used', _reverse=False) eq_(['cred.0', 'cred.1', 'cred.2', 'cred.no.time'], [i[0] for i in slist]) def test_credman_get(datalad_cfg, datalad_interactive_ui): ui = datalad_interactive_ui # we are not making any writes, any config must work credman = CredentialManager(datalad_cfg) # must be prompting for missing properties ui.staged_responses.append('myuser') res = credman.get( None, _type_hint='user_password', _prompt='myprompt', secret='dummy') assert 'myuser' == res['user'] # same for the secret ui.staged_responses.append('mysecret') res = credman.get( None, _type_hint='user_password', _prompt='myprompt', user='dummy') assert 'mysecret' == res['secret'] def test_credman_get_guess_type(): # define token-only-no-type credential in config override credman = CredentialManager( ConfigManager(overrides={ 'datalad.credential.mike.token': 'some', }) ) # we get it reported fine, token property converted to the # 'secret' and a proper 'type' assigned assert credman.get('mike') == { 'secret': 'some', 'type': 'token', } def test_credman_obtain(tmp_keyring, datalad_cfg, datalad_interactive_ui): ui = datalad_interactive_ui credman = CredentialManager(datalad_cfg) # senseless, but valid call # could not possibly report a credential without any info with pytest.raises(ValueError): credman.obtain() # a type_hint is not enough, if no prompt is provided with pytest.raises(ValueError): credman.obtain(type_hint='token') # also a prompt alone is not enough with pytest.raises(ValueError): credman.obtain(prompt='myprompt') # minimal condition prompt and type-hint for manual entry ui.staged_responses.append('mytoken') res = credman.obtain(type_hint='token', prompt='myprompt') assert res == (None, {'type': 'token', 'secret': 'mytoken', '_edited': True}) # no place a credential we could discover cred1_props = dict(secret='sec1', type='token', realm='myrealm') credman.set('cred1', _lastused=True, **cred1_props) # one matching property is all that is needed res = credman.obtain(query_props={'realm': 'myrealm'}) assert res == ('cred1', credman.get('cred1')) # will report the last-used one credman.set('cred2', _lastused=True, **cred1_props) res = credman.obtain(query_props={'realm': 'myrealm'}) assert res == ('cred2', credman.get('cred2')) credman.set('cred1', _lastused=True, **cred1_props) res = credman.obtain(query_props={'realm': 'myrealm'}) assert res == ('cred1', credman.get('cred1')) # built-in test for additional property expectations with pytest.raises(ValueError): credman.obtain(query_props={'realm': 
'myrealm'}, expected_props=['funky']) res = credman.obtain(query_props={'realm': 'myrealm'}) # if we are looking for a realm, we get it back even if a credential # had to be entered ui.staged_responses.append('mynewtoken') res = credman.obtain( type_hint='token', prompt='myprompt', query_props={'realm': 'mytotallynewrealm'}) assert res == (None, {'type': 'token', 'secret': 'mynewtoken', '_edited': True, 'realm': 'mytotallynewrealm'}) legacy_provider_cfg = """\ [provider:credmanuniquetestcredentialsetup] url_re = http://example\\.com/ authentication_type = http_basic_auth credential = credmanuniquetestcredentialsetup [credential:credmanuniquetestcredentialsetup] type = user_password """ def test_legacy_credentials(tmp_keyring, existing_dataset): # - the legacy code will only ever pick up a dataset credential, when # PWD is inside a dataset # - we want all tests to bypass the actual system keyring # 'datalad.downloaders.credentials.keyring_' is what the UserPassword # credential will use to store the credential # - 'datalad.support.keyring_' is what credman uses # - we need to make them one and the same thing, and the tmp_keyring # fixture does this by replacing the keyring storage for the runtime # of the test with chpwd(existing_dataset.path): check_legacy_credentials(tmp_keyring, existing_dataset) def check_legacy_credentials(tmp_keyring, existing_dataset): # we will use a dataset to host a legacy provider config ds = existing_dataset provider_path = ds.pathobj / '.datalad' / 'providers' / 'mylegacycred.cfg' provider_path.parent.mkdir(parents=True, exist_ok=True) provider_path.write_text(legacy_provider_cfg) # shortcut cname = 'credmanuniquetestcredentialsetup' credman = CredentialManager(ds.config) # check that we get legacy reports in a query. this is needed to be # able to even know that they exist res = dict(credman.query()) assert cname in res cred = res[cname] # we always get the type reported assert cred['type'] == 'user_password' # we can know that it is a legacy credential assert cred['_from_backend'] == 'legacy' # but getting an unset legacy credential will unambiguously say # "there is none" assert credman.get(cname) is None # we want all tests to bypass the actual system keyring # 'datalad.downloaders.credentials.keyring_' is what the UserPassword # credential will use to store the credential from datalad.downloaders.credentials import UserPassword lc = UserPassword(cname, dataset=ds) lc.set(user='mike', password='pass123') # now we should be able to get it from credman too # and just by name -- no need to provide a type hint cred = credman.get(cname) assert cred['user'] == 'mike' # reporting of the secret is always under the 'secret' key assert cred['secret'] == 'pass123' assert cred['type'] == 'user_password' assert cred['_from_backend'] == 'legacy' # check migration on set try: # setting a credential, will migrate info into the non-legacy # backend. however, it will not move information _out of_ # the legacy backend, in order to keep old code working # with the old info # confirm starting point: legacy code keeps user in secret store assert tmp_keyring.get_password(f'datalad-{cname}', 'user') == 'mike' assert ds.config.get(f'datalad.credential.{cname}.user') is None credman.set(cname, **cred) # it remains there assert tmp_keyring.get_password(f'datalad-{cname}', 'user') == 'mike' # but is also migrated assert ds.config.get(f'datalad.credential.{cname}.user') == 'mike' # NOTE: This setup is not without problems. 
Users will update # a credential and will leave an outdated (half) as garbage. # however, I did not come up with a better approach that gradually # brings users over. credman.set(cname, user='newmike', secret='othersecret') assert tmp_keyring.get_password(f'datalad-{cname}', 'user') == 'mike' # finally check that the update is reported now cred = credman.get(cname) assert cred['user'] == 'newmike' assert cred['secret'] == 'othersecret' assert cred['type'] == 'user_password' # no legacy info makes it out, hence no marker assert cred.get('_from_backend') != 'legacy' finally: credman.remove(cname) datalad-next-1.4.1/datalad_next/datasets/000077500000000000000000000000001462321624600203315ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/datasets/__init__.py000066400000000000000000000033741462321624600224510ustar00rootroot00000000000000"""Representations of DataLad datasets built on git/git-annex repositories Two sets of repository abstractions are available :class:`LeanGitRepo` and :class:`LeanAnnexRepo` vs. :class:`LegacyGitRepo` and :class:`LegacyAnnexRepo`. :class:`LeanGitRepo` and :class:`LeanAnnexRepo` provide a more modern, small-ish interface and represent the present standard API for low-level repository operations. They are geared towards interacting with Git and git-annex more directly, and are more suitable for generator-like implementations, promoting low response latencies, and a leaner processing footprint. The ``Legacy*Repo`` classes provide a, now legacy, low-level API to repository operations. This functionality stems from the earliest days of DataLad and implements paradigms and behaviors that are no longer common to the rest of the DataLad API. :class:`LegacyGitRepo` and :class:`LegacyAnnexRepo` should no longer be used in new developments, and are not documented here. .. currentmodule:: datalad_next.datasets .. autosummary:: :toctree: generated Dataset LeanGitRepo LeanAnnexRepo LegacyGitRepo LegacyAnnexRepo """ from datalad.distribution.dataset import ( Dataset, # this does nothing but provide documentation # only kept here until this command is converted to # pre-call parameter validation # TODO REMOVE FOR V2.0 EnsureDataset as NoOpEnsureDataset, # TODO REMOVE FOR V2.0 datasetmethod, # TODO REMOVE FOR V2.0 resolve_path, ) from datalad.dataset.gitrepo import GitRepo as LeanGitRepo from datalad.support.gitrepo import GitRepo as LegacyGitRepo from datalad.support.gitrepo import GitRepo as LegacyGitRepo from datalad.support.annexrepo import AnnexRepo as LegacyAnnexRepo from .annexrepo import LeanAnnexRepo datalad-next-1.4.1/datalad_next/datasets/annexrepo.py000066400000000000000000000034151462321624600227050ustar00rootroot00000000000000from pathlib import Path from datalad.dataset.gitrepo import GitRepo as LeanGitRepo from datalad.support.annexrepo import AnnexRepo as LegacyAnnexRepo class LeanAnnexRepo(LegacyAnnexRepo): """git-annex repository representation with a minimized API This is a companion of :class:`LeanGitRepo`. In the same spirit, it restricts its API to a limited set of method that extend :class:`LeanGitRepo`. """ #CA .. autosummary:: #CA call_annex #CA call_annex_oneline #CA call_annex_success # list of attributes permitted in the "lean" API. 
This list extends # the API of LeanGitRepo # TODO extend whitelist of attributes as necessary _lean_attrs = [ #CA # these are the ones we intend to provide #CA 'call_annex', #CA 'call_annex_oneline', #CA 'call_annex_success', # and here are the ones that we need to permit in order to get them # to run '_check_git_version', #CA '_check_git_annex_version', # used by AnnexRepo.__init__() -- should be using `is_valid()` 'is_valid_git', 'is_valid_annex', '_is_direct_mode_from_config', #CA '_call_annex', #CA 'call_annex_items_', ] # intentionally limiting to just `path` as the only constructor argument def __new__(cls, path: Path): for attr in dir(cls): if not hasattr(LeanGitRepo, attr) \ and callable(getattr(cls, attr)) \ and attr not in LeanAnnexRepo._lean_attrs: setattr(cls, attr, _unsupported_method) obj = super(LegacyAnnexRepo, cls).__new__(cls) return obj def _unsupported_method(self, *args, **kwargs): raise NotImplementedError('method unsupported by LeanAnnexRepo') datalad-next-1.4.1/datalad_next/exceptions/000077500000000000000000000000001462321624600207025ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/exceptions/__init__.py000066400000000000000000000014131462321624600230120ustar00rootroot00000000000000"""Special purpose exceptions .. currentmodule:: datalad_next.exceptions .. autosummary:: :toctree: generated CapturedException IncompleteResultsError NoDatasetFound """ # we cannot have CommandError above, sphinx complains # TODO rethink the purpose of this module and possibly # make it about *external* custom exceptions from datalad.runner.exception import CommandError from datalad.support.exceptions import ( CapturedException, IncompleteResultsError, NoDatasetFound, ) # TODO REMOVE FOR V2.0 (they are specific to that module from datalad_next.url_operations import ( UrlOperationsRemoteError, UrlOperationsAuthenticationError, UrlOperationsAuthorizationError, UrlOperationsInteractionError, UrlOperationsResourceUnknown, ) datalad-next-1.4.1/datalad_next/gitremotes/000077500000000000000000000000001462321624600207035ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/gitremotes/__init__.py000066400000000000000000000000001462321624600230020ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/gitremotes/datalad_annex.py000077500000000000000000001466751462321624600240660ustar00rootroot00000000000000#!/usr/bin/env python ## emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- # ex: set sts=4 ts=4 sw=4 noet: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See LICENSE file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """git-remote-datalad-annex to fetch/push via any git-annex special remote In essence, this Git remote helper bootstraps a utility repository in order to push/fetch the state of a repository to any location accessible by any git-annex special remote implementation. All information necessary for this bootstrapping is taken from the remote URL specification. The internal utility repository is removed again after every invocation. Therefore changes to the remote access configuration can be made any time by simply modifying the configured remote URL. When installed, this remote helper is invoked for any "URLs" that start with the prefix ``datalad-annex::``. Following this prefix, two types of specifications are support. 1. 
Plain parameters list:: datalad-annex::?type=&[...][exporttree=yes] In this case the prefix is followed by a URL query string that comprises all necessary (and optional) parameters that would be normally given to the ``git annex initremote`` command. It is required to specify the special remote ``type``, and it is possible to request "export" mode for any special remote that supports it. Depending on the chosen special remote additional parameters may be required or supported. Please consult the git-annex documentation at https://git-annex.branchable.com/special_remotes/ 2. URL:: datalad-annex::[?...] Alternatively, an actual URL can be given after the prefix. In this case, the, now optional, URL query string can still be used to specify arbitrary parameters for special remote initialization. In addition, the query string specification can use Python-format-style placeholder to reference particular URL components as parameters values, in order to avoid double-specification. The list of supported placeholders is ``scheme``, ``netloc``, ``path``, ``fragment``, ``username``, ``password``, ``hostname``, ``port``, corresponding to the respective URL components. In addition, a ``noquery`` placeholder is supported, which resolves to the entire URL except any query string. An example of such a URL specification is:: datalad-annex::file:///tmp/example?type=directory&directory={path}&encryption=none' which would initialize a ``type=directory`` special remote pointing at ``/tmp/example``. Caution with collaborative workflows There is no protection against simultaneous, conflicting repository state uploads from two different locations! Similar to git-annex's "export" feature, this feature is most appropriately used as a dataset deposition mechanism, where uploads are conducted from a single site only -- deposited for consumption by any number of parties. If this Git remote helper is to be used for multi-way collaboration, with two or more parties contributing updates, it is advisable to employ a separate ``datalad-annex::`` target site for each contributor, such that only one site is pushing to any given location. Updates are exchanged by the remaining contributors adding the respective other ``datalad-annex::`` sites as additional Git remotes, analog to forks of a repository. Special remote type support In addition to the regular list of special remotes, plain http(s) access via URLs is also supported via the 'web' special remote. For such cases, only the base URL and the 'type=web' parameter needs to be given, e.g:: git clone 'datalad-annex::https://example.com?type=web&url={noquery}' When a plain URL is given, with no parameter specification in a query string, the parameters ``type=web`` and ``exporttree=yes`` are added automatically by default. This means that this remote helper can clone from any remote deposit accessible via ``http(s)`` that matches the layout depicted in the next section. Remote layout The representation of a repository at a remote depends on the chosen type of special remote. In general, two files will be deposited. One text file containing a list of Git ``refs`` contained in the deposit, and one ZIP file with a (compressed) archive of a bare Git repository. Beside the idiosyncrasies of particular special remotes, to major modes determine the layout of a remote deposit. In "normal" mode, two annex keys (``XDLRA--refs``, ``XDLRA--repo-export``) will be deposited. 
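For example, when such a normal-mode deposit is accessed via plain ``http(s)``
(``type=web``), the two keys are registered at URLs following this pattern
(a simplified sketch; ``<base-url>`` stands for the URL given to the remote
helper)::

    <base-url>/3f7/4a3/XDLRA--refs/XDLRA--refs
    <base-url>/eb3/ca0/XDLRA--repo-export/XDLRA--repo-export
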
In "export" mode, a directory tree is created that is designed to blend with arbitrary repository content, such that a git remote and a git-annex export can be pushed to the same location without conflicting with each other. The aforementioned files will be represented like this:: .datalad └── dotgit # named to not be confused with an actual Git repository ├── refs └── repo.zip The default LZMA-compression of the ZIP file (in both export and normal mode) can be turned off with the ``dladotgit=uncompressed`` URL parameter. Credential handling Some git-annex special remotes require the specification of credentials via environment variables. With the URL parameter ``dlacredential=`` it is possible to query DataLad for a user/password credential to be used for this purpose. This convenience functionality is supported for the special remotes ``glacier``, ``s3``, and ``webdav``. When a credential of the given name does not exist, or no credential name was specified, an attempt is made to determine a suitable credential based on, for example, a detected HTTP authentication realm. If no matching credential could be found, the user will be prompted to enter a credential. After having successfully established access, the entered credential will be saved in the local credential store. DataLad-based credentials are only utilized, when the native git-annex credential setup via environment variables is not in use (see the documentation of a particular special remote implementation for more information). Implementation details This Git remote implementation uses *two* extra repositories, besides the repository (R) it is used with, to do its work: (A) A tiny repository that is entirely bootstrapped from the remote URL, and is used to retrieve/deposit a complete state of the actual repo an a remote site, via a git-annex special remote setup. (B) A local, fully functional mirror repo of the remotely stored repository state. On fetch/push the existence of both additional repositories is ensured. The remote state of retrieved via repo (A), and unpacked to repo (B). The actual fetch/push Git operations are performed locally between the repo (R) and repo (B). On push, repo (B) is then packed up again, and deposited on the remote site via git-annex transfer in repo (A). Due to a limitation of this implementation, it is possible that when the last upload step fails, Git nevertheless advances the pushed refs, making it appear as if the push was completely successful. That being said, Git will still issue a message (``error: failed to push some refs to..``) and the git-push process will also exit with a non-zero status. In addition, all of the remote's refs will be annotated with an additional ref named ``refs/dlra-upload-failed//`` to indicate the upload failure. These markers will be automatically removed after the next successful upload. .. note:: Confirmed to work with git-annex version 8.20211123 onwards. .. todo:: - At the moment, only one format for repository deposition is supported (a ZIP archive of a working bare repository). However this is not a good format for the purpose of long-term archiving, because it require a functional Git installation to work with. It would be fairly doable to make the deposited format configurable, and support additional formats. An interesting one would be a fast-export stream, basically a plain text serialization of an entire repository. 
- recognize that a different repo is being pushed over an existing one at the remote - think about adding additional information into the header of `refs` maybe give it some kind of stamp that also makes it easier to validate by the XDLRA backend - think about preventing duplication between the repo and its local mirror could they safely share git objects? If so, in which direction? """ from __future__ import annotations __all__ = ['RepoAnnexGitRemote'] import datetime import logging import os import sys import zipfile from pathlib import Path from shutil import make_archive from typing import ( IO, ) from unittest.mock import patch from urllib.parse import ( unquote, urlparse, ) from datalad.core.local.repo import repo_from_path from datalad_next.consts import ( PRE_INIT_COMMIT_SHA, on_windows, ) from datalad_next.constraints import EnsureInt from datalad_next.datasets import ( LegacyAnnexRepo as AnnexRepo, LegacyGitRepo as GitRepo, ) from datalad_next.exceptions import CapturedException from datalad_next.runners import ( CommandError, call_git, call_git_oneline, call_git_success, ) from datalad_next.uis import ui_switcher as ui from datalad_next.utils import ( CredentialManager, external_versions, get_specialremote_credential_envpatch, get_specialremote_credential_properties, needs_specialremote_credential_envpatch, patched_env, rmtree, specialremote_credential_envmap, update_specialremote_credential, ) lgr = logging.getLogger('datalad.gitremote.datalad_annex') class RepoAnnexGitRemote(object): """git-remote-helper implementation ``communicate()`` is the entrypoint. """ # hard code relevant keynames for the XDLRA backend # this will always have the refs list refs_key = 'XDLRA--refs' # this will have the repository archive repo_export_key = 'XDLRA--repo-export' xdlra_key_locations = { refs_key: dict( prefix='3f7/4a3', loc='.datalad/dotgit/refs'), repo_export_key: dict( prefix='eb3/ca0', loc='.datalad/dotgit/repo.zip'), } # all top-level content in a repo archive # this is used as a positive-filter when extracting downloaded # archives (to avoid writing to undesirable locations from # high-jacked archives) safe_content = [ 'branches', 'hooks', 'info', 'objects', 'refs', 'config', 'packed-refs', 'description', 'HEAD', ] # define all supported options, including their type-checker support_githelper_options = { 'verbosity': EnsureInt(), } # supported parameters that can come in via the URL, but must not # be relayed to `git annex initremote` internal_parameters = ('dladotgit=uncompressed', 'dlacredential=') def __init__( self, gitdir: str, remote: str, url: str, instream: IO = sys.stdin, outstream: IO = sys.stdout, errstream: IO = sys.stderr, ): """ Parameters ---------- gitdir : str Path to the GITDIR of the repository to operate on (provided by Git). remote : str Remote label to use (provided by Git). url : str URL of the remote (provided by Git). instream : Stream to read communication from Git from. outstream : Stream to communicate outcomes to Git. errstream : Stream for logging. 
""" self.repo = GitRepo(gitdir) # this is the key piece, take special remote params from # URL # this function yields a type= parameter in any case self.initremote_params = get_initremote_params_from_url(url) self.remote_name = remote # internal logic relies on workdir to be an absolute path self.workdir = Path(gitdir, 'dl-repoannex', remote).resolve() self._repoannexdir = self.workdir / 'repoannex' if self._repoannexdir.exists(): # whatever existed here before is an undesirable # leftover of a previous crash rmtree(str(self._repoannexdir), ignore_errors=True) self._repoannex = None self._mirrorrepodir = self.workdir / 'mirrorrepo' self._mirrorrepo = None # cache for remote refs, to avoid repeated queries self._cached_remote_refs: None | str = None self.instream = instream self.outstream = outstream self.errstream = errstream # options communicated by Git # https://www.git-scm.com/docs/gitremote-helpers#_options self.options: dict[str, str] = {} # we want to go for verbose output whenever datalad's log level is # debug or even more verbose. This makes it unnecessary to call # git directly with multiple `-v` options self.verbosity_threshold = 1 if lgr.getEffectiveLevel() > 10 else 10 # ID of the tree to export, if needed self.exporttree: None | str = None self.credman = None self.pending_credential = None # must come after the two above! self.credential_env = self._get_credential_env() annex_version = external_versions['cmd:annex'] if annex_version < '8.20211123': self.log( f'git-annex version {annex_version} is unsupported, ' 'please upgrade', level=1 ) def _get_credential_env(self) -> dict[str, str] | None: """ Returns ------- dict or None A dict with all required items to patch the environment, or None if not enough information is available. Raises ------ ValueError If a credential retrieval is requested for an unsupported special remote type. """ credential_name = None credential_names = [ p[14:] for p in self.initremote_params if p.startswith('dlacredential=') ] or None if credential_names: credential_name = credential_names[0] remote_type = self._get_remote_type() supported_remote_type = remote_type in specialremote_credential_envmap if credential_name and not supported_remote_type: # we have no idea how to deploy credentials for this remote type raise ValueError( f"Deploying credentials for type={remote_type} special " "remote is not supported. Remove dlacredential= parameter from " "the remote URL and provide credentials according to the " "documentation of this particular special remote.") if not needs_specialremote_credential_envpatch(remote_type): return None cred = self._retrieve_credential(credential_name) if not cred: lgr.debug( 'Could not find a matching credential for special remote %s', self.initremote_params) return None return get_specialremote_credential_envpatch(remote_type, cred) def _retrieve_credential( self, name: str | None, ) -> dict[str, str] | None: """Retrieve a credential Successfully retrieved credentials are also placed in self.pending_credential to be picked up by `_store_credential()`. Returns ------- dict or None If a credential could be retrieved, a dict with 'user' and 'secret' keys will be return, or None otherwise. 
""" if not self.credman: self.credman = CredentialManager(self.repo.config) assert self.credman is not None cred = None credprops: dict[str, str] = {} if name: # we can ask blindly first, caller seems to know what to do cred = self.credman.get( name=name, # give to make legacy credentials accessible _type_hint='user_password', ) if not cred: # direct lookup failed, try query. credprops = get_specialremote_credential_properties( self.initremote_params) or {} if credprops: creds = self.credman.query(_sortby='last-used', **credprops) if creds: name, cred = creds[0] if not cred: # credential query failed too, enable manual entry credprops['type'] = 'user_password' cred = self.credman.get( # this might still be None name=name, _type_hint='user_password', _prompt=f'A credential is required for access', # inject anything we already know to make sure we store it # at the very end, and can use it for discovery next time **credprops ) if not cred: return None # stage for eventual (re-)storage after having proven to work self.pending_credential = (name, cred) return {k: cred[k] for k in ('user', 'secret')} def _get_remote_type(self) -> str | None: remote_type = [ p[5:] for p in self.initremote_params if p.startswith('type=') ] if not remote_type: return None return remote_type[0] def _store_credential(self) -> None: """Look for a pending credential and store it Safe to call unconditionally. """ if self.pending_credential and self.credman: name, cred = self.pending_credential update_specialremote_credential( self._get_remote_type(), self.credman, name, cred, credtype_hint='user_password', duplicate_hint= 'Specify a credential name via the dlacredential= ' 'remote URL parameter, and/or configure a credential ' 'with the datalad-credentials command{}'.format( f' with a `realm={cred["realm"]}` property' if 'realm' in cred else ''), ) def _ensure_workdir(self) -> None: self.workdir.mkdir(parents=True, exist_ok=True) @property def repoannex(self) -> AnnexRepo: """Repo annex repository If accessed when there is no repo annex, as new one is created automatically. It is bootstrapped entirely from the parameters encoded in the remote URL. Returns ------- AnnexRepo This is always an annex repository. It is configured with a single special remote, parameterized from the Git repo URL. Raises ------ CommandError ValueError """ if self._repoannex: return self._repoannex self._ensure_workdir() try: # check if there is one already, would only be due to a prior # RUD (rapid unscheduled disassembly) ra = repo_from_path(self._repoannexdir) except ValueError: # funny dance to get to a bare annexrepo ra = GitRepo( self._repoannexdir, create=not GitRepo.is_valid(self._repoannexdir), bare=True, ) try: # send annex into private mode, if supported # this repo will never ever be shared call_git_success(['config', 'annex.private', 'true'], cwd=ra.pathobj, capture_output=True) call_git_success(['annex', 'init'], capture_output=True) ra = AnnexRepo(self._repoannexdir) if 'type=web' in self.initremote_params: self._init_repoannex_type_web(ra) else: # let git-annex-initremote take over with patched_env(**(self.credential_env or {})): ra.call_annex( ['initremote', 'origin'] + [ p for p in self.initremote_params if not any(p.startswith(ip) for ip in self.internal_parameters) ]) # make the new remote config known in the repo instance ra.config.reload() if 'exporttree=yes' in self.initremote_params: # conflicts with type=web, but we checked that above already. 
# plant the to-be-exported tree, still leaving the underlying # keys unfulfilled self.exporttree = make_export_tree(ra) except (CommandError, ValueError): # something blew up. clean up and blow again rmtree(ra.path, ignore_errors=True) raise self._repoannex = ra return ra def _init_repoannex_type_web(self, repoannex: AnnexRepo) -> None: """Uses registerurl to utilize the omnipresent type=web remote Raises ------ ValueError When there is no `url=` parameter or when there are other parameters than the additional `type=web` and `exporttree=yes`, indicating an unsupported setup. """ # for type=web we have to add URLs by hand baseurls = [ v for v in self.initremote_params if v.startswith('url=')] if not len(baseurls) == 1: raise ValueError( "'web'-type remote requires 'url' parameter") # validate the rest of the params, essentially there # must not be any other if not all(p in ('type=web', 'exporttree=yes') or p.startswith('url=') for p in self.initremote_params): raise ValueError( "'web'-type remote only supports 'url' " "and 'exporttree' parameters") baseurl = baseurls[0][4:] for key, kinfo in self.xdlra_key_locations.items(): repoannex.call_annex([ 'registerurl', key, f'{baseurl}/{kinfo["loc"]}' if 'exporttree=yes' in self.initremote_params else f'{baseurl}/{kinfo["prefix"]}/{key}/{key}' ]) @property def mirrorrepo(self) -> GitRepo: """Local remote mirror repository If accessed when there is no local mirror repo, as new one is created automatically, either from the remote state (if there is any), or an empty one. Returns ------- GitRepo This is always only a plain Git repository (bare). """ if self._mirrorrepo: return self._mirrorrepo # ensure we have a mirror repo, either fresh or existing self._ensure_workdir() if not self.get_remote_refs(): existing_repo = False # there is nothing at the remote, hence we must wipe # out the local state, whatever it was to make git # report subsequent pushes properly, and prevent # "impossible" fetches if self._mirrorrepodir.exists(): # if we extract, we cannot tollerate left-overs rmtree(str(self._mirrorrepodir), ignore_errors=True) # null the repohandle to be reconstructed later on-demand self._mirrorrepo = None elif GitRepo.is_valid(self._mirrorrepodir): # so we have remote refs and we also have a local mirror # create an instance, assume it is set up how we need it # must also have bare=True, or the newly created one below # will inherit the config # https://github.com/datalad/datalad/issues/6347 mr = GitRepo(self._mirrorrepodir, bare=True) # make sure any recursion back in here is prevented self._mirrorrepo = mr # this will trigger a download if possible (remote has refs) self.replace_mirrorrepo_from_remote_deposit_if_needed() # reevaluate existing_repo = GitRepo.is_valid(self._mirrorrepodir) else: # we have nothing local, pull from the remote, because it # reports stuff to exist self.replace_mirrorrepo_from_remote_deposit() existing_repo = True # (re-)create an instance mr = GitRepo( self._mirrorrepodir, # if the remote had no refs, there would still be no repo create=not existing_repo, bare=True) if not existing_repo: # align HEAD symbolic ref between source and mirror repo # IF WE CREATED IT LOCALLY JUST NOW, otherwise take whatever # we got. 
# otherwise we can end up in a conflict situation where the mirror # points to 'master' (or something else) and the source actually # has 'main' (or something different) src_head_ref = call_git_oneline( ['symbolic-ref', 'HEAD'], cwd=self.repo.pathobj, ).strip() call_git_success( ['symbolic-ref', 'HEAD', src_head_ref], cwd=mr.pathobj, capture_output=True, ) self.log('Established mirror') self._mirrorrepo = mr return mr def log( self, *args, level: int = 2, ) -> None: """Send log messages to the errstream""" # A value of 0 for means that processes operate quietly, # and the helper produces only error output. # 1 is the default level of verbosity, # and higher values of correspond to the number of -v flags # passed on the command line if int(self.options.get('verbosity', self.verbosity_threshold)) >= level: print('[DATALAD-ANNEX]', *args, file=self.errstream) def send(self, msg: str) -> None: """Communicate with Git""" print(msg, end='', file=self.outstream, flush=True) def communicate(self) -> None: """Implement the necessary pieces of the git-remote-helper protocol Uses the input, output and error streams configured for the class instance. """ self.log('Git remote startup: ' f'{self.remote_name} [{self.initremote_params}]') for line in self.instream: self.log(f'Received Git remote command: {repr(line)}', level=4) if line == '\n': # orderly exit command return elif line == 'capabilities\n': self.send( 'option\n' 'list\n' 'connect\n' '\n' ) elif line == 'connect git-receive-pack\n': # "receive", because we are receiving at the local mirror repo # from a `send-pack` process that is connected to the main local # repo self.log('Connecting git-receive-pack\n') self.send('\n') # we assume the mirror repo is in-sync with the remote at # this point pre_refs = sorted(self.mirrorrepo.for_each_ref_(), key=lambda x: x['refname']) # must not capture -- git is talking to it directly from here call_git( ['receive-pack', self.mirrorrepo.path], cwd=self.mirrorrepo.pathobj, ) post_refs = sorted(self.mirrorrepo.for_each_ref_(), key=lambda x: x['refname']) if pre_refs != post_refs \ or (post_refs != self.get_remote_refs()): # there was a change in the refs of the mirror repo # OR # the mirror is out-of-sync with the remote (could be a # slightly more expensive test) # we must upload it. try: self.replace_remote_deposit_from_mirrorrepo() except Exception: # the bad thing is that we have no way of properly # signaling to git that this happened, # the refs for this remote will look as if the upload # was successful # we do not need to roll-back the refs in the # mirrorrepo as it will be rsync'ed to the remote on # next access self.log('Remote update failed, flagging refs', post_refs) for ref in post_refs: # best MIH can think of is to leave behind another # ref to indicate the unsuccessful upload call_git_success([ 'update-ref', # strip 'refs/heads/' from refname f'refs/dlra-upload-failed/{self.remote_name}/' f'{ref["refname"][11:]}', ref['objectname']], cwd=self.repo.pathobj, capture_output=True, ) raise # clean-up potential upload failure markers for this particular # remote. 
whatever has failed before, we just uploaded a mirror # that was freshly sync'ed with the remote state before for ref in self.repo.for_each_ref_( fields=('refname',), pattern=f'refs/dlra-upload-failed/{self.remote_name}'): call_git_success( ['update-ref', '-d', ref['refname']], cwd=self.repo.pathobj, capture_output=True, ) # we do not need to update `self._cached_remote_refs`, # because we end the remote-helper process here # everything has worked, if we used a credential, update it self._store_credential() return elif line == 'connect git-upload-pack\n': # "upload", because we are uploading from the local mirror repo # to a `fetch-pack` process that is connected to the main local # repo try: self.get_remote_refs(raise_on_error=True) except Exception as e: self.log("fatal: couldn't find remote refs (repository deposit does not exist, or is inaccessible", level=1) self.log(f"query error: {e!r}", level=2) return self.log('Connecting git-upload-pack\n') self.send('\n') # must not capture -- git is talking to it directly from here. # the `self.mirrorrepo` access will ensure that the mirror # is up-to-date call_git( ['upload-pack', self.mirrorrepo.path], cwd=self.mirrorrepo.pathobj, ) # everything has worked, if we used a credential, update it self._store_credential() return elif line.startswith('option '): key, value = line[7:].split(' ', maxsplit=1) if key not in self.support_githelper_options: self.send('unsupported\n') else: try: self.options[key] = \ self.support_githelper_options[key]( value.rstrip('\n')) self.send('ok\n') except ValueError as e: # ensure no-multiline message excstr = str(e).replace('\n', '\\n') # git may not communicate reason for error, do log self.log( f'Type-checking of "{line[:-1]}" failed: {excstr}') self.send(f'error {excstr}\n') else: self.log('UNKNOWN COMMAND', line) # unrecoverable error return def replace_remote_deposit_from_mirrorrepo(self) -> None: """Package the local mirrorrepo up, and copy to the special remote The mirror is assumed to be ready/complete. It will be cleaned with `gc` to minimize the upload size. The mirrorrepo is then compressed into an LZMA ZIP archive, and a separate refs list for it is created in addition. Both are then copied to the special remote. 
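        Depending on the remote parameterization, the upload is performed
        either as a git-annex export of a small tree containing the two
        deposit files (``exporttree=yes``), or by dropping the previous
        content of the two keys on the remote and copying their new content
        over.
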
""" self.log('Replace remote from mirror') mirrorrepo = self.mirrorrepo repoannex = self.repoannex # trim it down, as much as possible call_git(['gc'], cwd=mirrorrepo.pathobj) # update the repo state keys # it is critical to drop the local keys first, otherwise # `setkey` below will not replace them with new content # however, git-annex fails to do so in some edge cases # https://git-annex.branchable.com/bugs/Fails_to_drop_key_on_windows___40__Access_denied__41__/?updated # no regular `drop` works, nor does `dropkeys` #self.log(repoannex.call_annex(['drop', '--force', '--all'])) # nuclear option remains, luckily possible in this utility repo if on_windows: objdir = self.repoannex.dot_git / 'annex' / 'objects' if objdir.exists(): rmtree(str(objdir), ignore_errors=True) objdir.mkdir() else: # more surgical for the rest self.log(repoannex.call_annex([ 'dropkey', '--force', self.refs_key, self.repo_export_key])) # use our zipfile wrapper to get an LZMA compressed archive # via the shutil convenience layer with patch('zipfile.ZipFile', UncompressedZipFile if 'dladotgit=uncompressed' in self.initremote_params else LZMAZipFile): # TODO exclude hooks (the mirror is always plain-git), # would we ever need any archive_file = make_archive( str(self.workdir / 'repoarchive'), 'zip', root_dir=str(mirrorrepo.path), base_dir=os.curdir, ) # hand over archive to annex repoannex.call_annex([ 'setkey', self.repo_export_key, archive_file ]) # generate a list of refs # write to file refs_file = self.workdir / 'reporefs' refs_file.write_text(_format_refs(mirrorrepo)) self.log(refs_file.read_text()) # hand over reflist to annex self.log(repoannex.call_annex([ 'setkey', self.refs_key, str(refs_file), ])) if 'exporttree=yes' in self.initremote_params: # we want to "force" an export, because the content of our # keys can change, but this is not possible. # we cheat be exporting "nothing" (an empty tree) first, # and then reexport try: self.log(repoannex.call_annex( ['export', PRE_INIT_COMMIT_SHA, '--to=origin'])) except Exception as e: # some remotes will error out if we unexport something that # wasn't actually exported (e.g. webdav) CapturedException(e) pass self.log(repoannex.call_annex( ['export', self.exporttree, '--to=origin'])) else: # it is critical to drop the keys from the remote first, otherwise # `copy` below will fail to replace their content self.log(repoannex.call_annex( ['drop', '--force', '-f', 'origin', '--all'])) self.log(repoannex.call_annex( ['copy', '--fast', '--to', 'origin', '--all'])) # update remote refs from local ones # we just updated the remote from local self._cached_remote_refs = self.get_mirror_refs() def replace_mirrorrepo_from_remote_deposit_if_needed( self, ) -> tuple[str | None, str]: """Replace the mirror if the remote has refs and they differ """ self.log("Check if mirror needs to be replaced with remote state") remote_refs = self.get_remote_refs() mirror_refs = self.get_mirror_refs() if remote_refs and remote_refs != mirror_refs: self.log(repr(remote_refs), repr(mirror_refs)) # we must replace the local mirror with the # state of the remote self.replace_mirrorrepo_from_remote_deposit() return remote_refs, mirror_refs def replace_mirrorrepo_from_remote_deposit(self) -> None: """Replaces the local mirror repo with one obtained from the remote This method assumes that the remote does have one. This should be checked by inspecting `get_remote_refs()` before calling this method. 
""" self.log('Set mirror to remote state') ra = self.repoannex # we have to get the key with the repo archive # because the local repoannex is likely a freshly bootstrapped one # without any remote awareness, claim that the remote has this key sremotes = ra.get_special_remotes() if len(sremotes) == 1: # in case of the 'web' special remote, we have no actual special # remote, but URLs for the two individual keys ra.call_annex(['setpresentkey', self.repo_export_key, sremotes.popitem()[0], '1']) # drop locally to ensure re-downlad, the keyname never changes, # even when the content does self.log( ra.call_annex([ 'drop', '--force', '--key', self.repo_export_key]) ) # download the repo archive self.log( ra.call_annex(['get', '--key', self.repo_export_key]) ) # locate it in the local annex, use annex function to do this in order # to cope with any peculiar repo setups we might face across platforms repoexportkeyloc = ra.call_annex_oneline([ 'contentlocation', self.repo_export_key]) repoexportkeyloc = ra.dot_git / repoexportkeyloc if self._mirrorrepodir.exists(): # if we extract, we cannot tollerate left-overs rmtree(str(self._mirrorrepodir), ignore_errors=True) # null the repohandle to be reconstructed later on-demand self._mirrorrepo = None self.log('Extracting repository archive') legacy_deposit = False with zipfile.ZipFile(repoexportkeyloc) as zip: try: zip.getinfo('repo/') legacy_deposit = True safe_content = [f'repo/{i}' for i in self.safe_content] except KeyError: safe_content = self.safe_content zip.extractall( self._mirrorrepodir, # a bit of a safety-net, exclude all unexpected content members=[ m for m in zip.namelist() if any(m.startswith(prefix) for prefix in safe_content)], ) if legacy_deposit: legacy_basedir = self._mirrorrepodir / 'repo' for p in legacy_basedir.iterdir(): p.rename(self._mirrorrepodir / p.relative_to(legacy_basedir)) def get_remote_refs(self, raise_on_error: bool = False) -> str | None: """Report remote refs The underlying special remote is asked whether it has the key containing the refs list for the remote. If it does, it is retrieved and reported. Returns ------- str or None If the remote has a refs record, it is returned as a string, formatted like a refs file in a Git directory. Otherwise, `None` is returned. 
""" if self._cached_remote_refs: # this process already queried them once, return cache return self._cached_remote_refs self.log("Get refs from remote") ra = self.repoannex # in case of the 'web' special remote, we have no actual special # remote, but URLs for the two individual keys sremotes = ra.get_special_remotes() # if we do not have a special remote reported, fall back on # possibly recorded URLs for the XDLRA--refs key sremote_id = sremotes.popitem()[0] if sremotes else 'web' # we want to get the latest refs from the remote under all # circumstances, and transferkey will not attempt a download for # a key that is already present locally -> drop first ra.call_annex([ 'drop', '--force', '--key', self.refs_key]) # now get the key from the determined remote try: ra.call_annex([ 'transferkey', self.refs_key, f'--from={sremote_id}']) except CommandError as e: if raise_on_error: raise CapturedException(e) self.log("Remote appears to have no refs") # download failed, we have no refs # this can happen for legit reasons (prior first push), # but also with broken URLs or lack of permissions return None refskeyloc = ra.call_annex_oneline([ 'contentlocation', self.refs_key]) # read, cache, return refs = (ra.dot_git / refskeyloc).read_text() self._cached_remote_refs = refs return refs def get_mirror_refs(self) -> str: """Return the refs of the current mirror repo Returns ------- str """ self.log("Get refs from mirror") return _format_refs(self.mirrorrepo) # TODO propose as addition to AnnexRepo # https://github.com/datalad/datalad/issues/6316 def call_annex_success(self, args, files=None) -> bool: """Call git-annex and return true if the call exit code of 0. All parameters match those described for `call_annex`. Returns ------- bool """ try: self.call_annex(args, files) except CommandError: return False return True class LZMAZipFile(zipfile.ZipFile): """Tiny wrapper to monkey-patch zipfile in order to have shutil.make_archive produce an LZMA-compressed ZIP""" def __init__(self, *args, **kwargs): kwargs.pop('compression', None) return super().__init__( *args, compression=zipfile.ZIP_LZMA, **kwargs) class UncompressedZipFile(zipfile.ZipFile): """Tiny wrapper to monkey-patch zipfile in order to have shutil.make_archive produce an uncompressed ZIP""" def __init__(self, *args, **kwargs): kwargs.pop('compression', None) return super().__init__( *args, compression=zipfile.ZIP_STORED, **kwargs) def _format_refs(repo: GitRepo) -> str: """Helper to format a standard refs list from for_each_ref() output Parameters ---------- repo: GitRepo Repo which to query for the 'HEAD' symbolic ref Returns ------- str Formatted refs list """ refs = repo.for_each_ref_() # generate a list of refs refstr = '\n'.join( "{objectname} {refname}".format(**r) for r in refs ) if refstr: refstr += '\n' refstr += '@{} HEAD\n'.format( call_git_oneline( ['symbolic-ref', 'HEAD'], cwd=repo.pathobj, ).strip() ) return refstr def get_initremote_params_from_url(url: str) -> list[str]: """Parse a remote URL for initremote parameters Parameters are taken from a URL's query string. In the query parameters can be defined directly, or via placeholder for all URL components (using Python's format language). The following placeholders are supported: 'scheme', 'netloc', 'path', 'fragment', 'username', 'password', 'hostname', 'port'. Their values are determined by urlparse(). There is no placeholder for the 'query' component, but a 'noquery' placeholder is supported, which provides the original (reassembled) URL without the query string. 
Parameters ---------- url : str Returns ------- list git-annex initremote parameter list. Each value string has the format 'key=value'. """ if url.startswith('datalad-annex::'): url = url[15:] if not url: raise ValueError("Given URL only contained 'datalad-annex::' prefix") pu = urlparse(url) expansion = { p: getattr(pu, p) for p in ( 'scheme', 'netloc', 'path', # we do not extract the 'query', because it is the very # thing we iterate over below 'fragment', 'username', 'password', 'hostname', 'port') } expansion['noquery'] = pu._replace(query='').geturl() # expand all parameters in the query params = [ # unquote any string -- should be safe, because # initremote parameter names should not have any special # characters unquote( # apply any expansion from the URL components v.format(**expansion) ) for v in pu.query.split('&') # nothing to pull from an empty string if v ] if all(not p.startswith('type=') for p in params): # if there is no type declared, this is a plain type=web # export using the full URL params = ['type=web', 'exporttree=yes', f'url={url}'] return params def make_export_tree(repo: GitRepo) -> str: """Create an exportable tree The function expects a clean (bare) repository. It requires no checkout, and does not modify any branches or create commits. The tree is always the same, but still has to be created in the repoannex to be accessible for git-annex. It looks like this:: .datalad └── dotgit ├── refs └── repo.zip where the two files under ``dotgit/`` link to the two critical keys. The placement of the files under ``.datalad/`` is chosen so that the export can blend with an export of the underlying dataset without conflict. The name ``dotgit`` rather than ``.git`` is chosen to avoid confusing it with an actual nested Git repo. Parameters ---------- repo: AnnexRepo Repository instance to write to. Returns ------- str ID of the tree object, suitable for `git-annex export`.
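Note: because the exported tree layout and the link targets are fixed, the returned tree ID is expected to be constant across repositories (the implementation below asserts ``7f0e7953e93b4c9920c2bff9534773394f3a5762``).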
""" here = repo.config.get('annex.uuid') # reuse existing, or go with fixed random one origin = repo.config.get('remote.origin.annex-uuid', '8249ffce-770a-11ec-9578-5f6af5e76eaa') assert here, "No 'here'" assert origin, "No 'origin'" # we need to force Git to use a throwaway index file to maintain # the bare nature of the repoannex, git-annex would stop functioning # properly otherwise index_file = repo.pathobj / 'datalad_tmp_index' with patched_env(GIT_INDEX_FILE=index_file): try: for key, kinfo in RepoAnnexGitRemote.xdlra_key_locations.items(): # create a blob for the annex link linkhash = call_git_oneline( ['hash-object', '-w', '--stdin'], cwd=repo.pathobj, input=f'../../.git/annex/objects/{kinfo["prefix"]}/{key}/{key}', ).strip() # place link into a tree call_git_success( ['update-index', '--add', '--cacheinfo', '120000', linkhash, kinfo["loc"]], cwd=repo.pathobj, capture_output=True, ) # write the complete tree, and return ID exporttree = call_git_oneline( ['write-tree'], cwd=repo.pathobj ).strip() # this should always come out identically # unless we made changes in the composition of the export tree assert exporttree == '7f0e7953e93b4c9920c2bff9534773394f3a5762' # clean slate if index_file.exists(): index_file.unlink() # fake export.log record # s : now_ts = datetime.datetime.now().timestamp() exportlog = call_git_oneline( ['hash-object', '-w', '--stdin'], input=f'{now_ts}s {here}:{origin} {exporttree}\n', cwd=repo.pathobj, ).strip() call_git_success( ['read-tree', 'git-annex'], cwd=repo.pathobj, ) call_git_success( ['update-index', '--add', '--cacheinfo', '100644', exportlog, 'export.log'], cwd=repo.pathobj, capture_output=True, ) gaupdate = call_git_oneline( ['write-tree'], cwd=repo.pathobj, ).strip() gacommit = call_git_oneline( ['commit-tree', '-m', 'Fake export', '-p', 'git-annex', gaupdate], cwd=repo.pathobj, ).strip() call_git_success( ['update-ref', 'refs/heads/git-annex', gacommit], cwd=repo.pathobj, ) finally: if index_file.exists(): index_file.unlink() return exporttree def push_caused_change(operations: list[str]) -> bool: ok_operations = ( 'new-tag', 'new-branch', 'forced-update', 'fast-forward', 'deleted' ) return any(o in operations for o in ok_operations) def push_error(operations: list[str]) -> bool: error_operations = ( 'no-match', 'rejected', 'remote-rejected', 'remote-failure', 'error', ) return any(o in operations for o in error_operations) def main(gitremote_cls=RepoAnnexGitRemote): """git-remote helper executable entrypoint""" try: if len(sys.argv) < 3: raise ValueError(f"Usage: {sys.argv[0]} REMOTE-NAME URL") remote, url = sys.argv[1:3] # provided by Git gitdir = os.environ.pop('GIT_DIR') # no fallback, must be present if gitdir is None: raise RuntimeError('GIT_DIR environment variable not defined') # stdin/stdout will be used for interactions with git # the 'annex' backend really doesn't do much annex-specific # albeit maybe progress reporting (unclear to MIH right now) # but it does make credential entry possible here, despite the # remote helper process being connected to Git with its stdin/stdout ui.set_backend('annex') # lock and load remote = gitremote_cls(gitdir, remote, url) remote.communicate() # there is no value in keeping around the downloads # we either have things in the mirror repo or have to # redownload anyways # leaving the table clean and always bootstrap from scratch # has the advantage that we always automatically react to any # git-remote reconfiguration between runs rmtree(remote.repoannex.path, ignore_errors=True) except Exception as 
e: ce = CapturedException(e) # Receiving an exception here is "fatal" by definition. # Mimicking git's error reporting style. print(f"fatal: {ce}", file=sys.stderr) sys.exit(1) if __name__ == '__main__': main() datalad-next-1.4.1/datalad_next/gitremotes/tests/000077500000000000000000000000001462321624600220455ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/gitremotes/tests/__init__.py000066400000000000000000000000001462321624600241440ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/gitremotes/tests/test_datalad_annex.py000066400000000000000000000351351462321624600262500ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 noet: # -*- coding: utf-8 -*- # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """ """ from pathlib import Path from stat import S_IREAD, S_IRGRP, S_IROTH from urllib.parse import quote as urlquote from datalad.api import ( Dataset, clone, ) from datalad_next.tests import ( DEFAULT_BRANCH, DEFAULT_REMOTE, assert_raises, assert_status, eq_, skip_if_root, ) from datalad_next.consts import on_windows from datalad_next.exceptions import CommandError from datalad_next.utils import ( patched_env, rmtree, ) from ..datalad_annex import get_initremote_params_from_url webdav_cred = ('datalad', 'secure') def eq_dla_branch_state(state, path, branch=DEFAULT_BRANCH): """Confirm that the reported branch hexsha at a remote matches a given value""" refsfile = Path(path) / '3f7' / '4a3' / 'XDLRA--refs' / 'XDLRA--refs' if not refsfile.exists(): # this may be an export remote refsfile = Path(path) / '.datalad' / 'dotgit' / 'refs' if not refsfile.exists(): assert None, f'Could not find refs at {path}' for line in refsfile.read_text().splitlines(): if line.strip().endswith(f'heads/{branch}'): eq_(state, line.split(maxsplit=1)[0]) return assert None, f'Could not find state for branch {branch} at {path}' @skip_if_root # see https://github.com/datalad/datalad-next/issues/525 def test_annex_remote(existing_noannex_dataset, tmp_path, no_result_rendering): remotepath = tmp_path / 'remote' # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none' \ if on_windows else \ f'datalad-annex::file://{remotepath}?type=directory&directory={{path}}&encryption=none' ds = existing_noannex_dataset _check_push_fetch_cycle(ds, dlaurl, remotepath, tmp_path) @skip_if_root # see https://github.com/datalad/datalad-next/issues/525 def test_export_remote(existing_noannex_dataset, tmp_path, no_result_rendering): remotepath = tmp_path / 'remote' # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none&exporttree=yes' \ if on_windows else \ f'datalad-annex::file://{remotepath}?type=directory&directory={{path}}&encryption=none&exporttree=yes' ds = existing_noannex_dataset _check_push_fetch_cycle(ds, dlaurl, remotepath, tmp_path) def _check_push_fetch_cycle(ds, remoteurl, remotepath, tmp_path): """Test helper - add a dla remote to the dataset - push the ds to it - clone from it to a tmp location - check error handling when post-git-update upload fails - update cycle starting from the original ds - repeated supposed-to-be-noop push/fetch calls - update cycle starting from the clone """ localtargetpath = 
tmp_path / 'ltarget' probepath = tmp_path / 'probe' remotepath.mkdir() dsrepo = ds.repo dsrepo.call_git(['remote', 'add', 'dla', remoteurl]) # basic push/clone roundtrip on clean locations # Since some version of git > 2.30.2 and <= 2.35.1 # it would work without specifying branch. dsrepo.call_git(['push', '-u', 'dla', DEFAULT_BRANCH]) eq_dla_branch_state(dsrepo.get_hexsha(DEFAULT_BRANCH), remotepath) dsclone = clone(remoteurl, localtargetpath) dsclonerepo = dsclone.repo eq_(dsrepo.get_hexsha(DEFAULT_BRANCH), dsclonerepo.get_hexsha(DEFAULT_BRANCH)) # update round (ds.pathobj / 'file1').write_text('file1text') assert_status('ok', ds.save()) # but first make destination read-only to test error recovery # verify starting point, we are one step ahead of the remote eq_(dsrepo.get_hexsha(DEFAULT_BRANCH + '~1'), dsrepo.get_hexsha(f'dla/{DEFAULT_BRANCH}')) # if we are on a sane system, also test recovery from (temporary) # push failure. MIH cannot force himself to figure out how to do # this on windows/crippledFS, sorry probeds = Dataset(probepath).create() if not probeds.repo.is_managed_branch(): # preserve stat-info for later restore stat_records = {} # must go reverse to not block chmod'ing of children for p in sorted(remotepath.glob('**/*'), reverse=True): stat_records[p] = p.stat().st_mode p.chmod(S_IREAD | S_IRGRP | S_IROTH) # push must fail assert_raises(CommandError, dsrepo.call_git, ['push', 'dla']) # really bad that we cannot roll-back the remote branch state # from within the helper (see code), but we leave an indicator eq_(dsrepo.get_hexsha(DEFAULT_BRANCH), dsrepo.get_hexsha(f'refs/dlra-upload-failed/dla/{DEFAULT_BRANCH}')) # revert read-only permission on the remote side for p in sorted(stat_records): p.chmod(stat_records[p]) # now a push can work (it should internally see that refs need # pushing that previously were reported as pushed, no need for # --force) dsrepo.call_git(['push', 'dla']) # and it has removed the marker assert_raises( ValueError, dsrepo.get_hexsha, f'refs/dlra-upload-failed/dla/{DEFAULT_BRANCH}') # the remote has received the new state eq_dla_branch_state(dsrepo.get_hexsha(DEFAULT_BRANCH), remotepath) # verify that there is something to update assert dsrepo.get_hexsha(DEFAULT_BRANCH) != dsclonerepo.get_hexsha(DEFAULT_BRANCH) # pull dsclonerepo.call_git(['pull', DEFAULT_REMOTE, DEFAULT_BRANCH]) # source and clone are now equal eq_(dsrepo.get_hexsha(DEFAULT_BRANCH), dsclonerepo.get_hexsha(DEFAULT_BRANCH)) # push no update dsrepo.call_git(['push', 'dla']) # twice dsrepo.call_git(['push', 'dla']) # fetch no update dsclonerepo.call_git(['fetch', DEFAULT_REMOTE]) # twice dsclonerepo.call_git(['fetch', DEFAULT_REMOTE]) # push/pull in reverse from clone to source (dsclone.pathobj / 'file2').write_text('file2text') assert_status('ok', dsclone.save()) assert dsrepo.get_hexsha(DEFAULT_BRANCH) != dsclonerepo.get_hexsha(DEFAULT_BRANCH) dsclonerepo.call_git(['push', DEFAULT_REMOTE]) eq_dla_branch_state(dsclonerepo.get_hexsha(DEFAULT_BRANCH), remotepath) dsrepo.call_git(['pull', 'dla', DEFAULT_BRANCH]) eq_(dsrepo.get_hexsha(DEFAULT_BRANCH), dsclonerepo.get_hexsha(DEFAULT_BRANCH)) # now create a non-heads ref and roundtrip that # this is what metalad needs to push metadata refs dsrepo.call_git([ 'update-ref', 'refs/datalad/dummy', dsrepo.get_hexsha(DEFAULT_BRANCH)]) dsrepo.call_git(['push', 'dla', 'refs/datalad/dummy']) dsclonerepo.call_git([ 'fetch', DEFAULT_REMOTE, 'refs/datalad/dummy:refs/datalad/dummy']) eq_(dsrepo.get_hexsha('refs/datalad/dummy'), 
dsclonerepo.get_hexsha('refs/datalad/dummy')) def test_annex_remote_autorepush(existing_noannex_dataset, tmp_path): remotepath = tmp_path # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none' \ if on_windows else \ f'datalad-annex::file://{remotepath}?type=directory&directory={{path}}&encryption=none' _check_repush_after_vanish(existing_noannex_dataset, dlaurl, remotepath) def test_export_remote_autorepush(existing_noannex_dataset, tmp_path): remotepath = tmp_path # bypass the complications of folding a windows path into a file URL dlaurl = \ f'datalad-annex::?type=directory&directory={remotepath}&encryption=none&exporttree=yes' \ if on_windows else \ f'datalad-annex::file://{remotepath}?type=directory&directory={{path}}&encryption=none&exporttree=yes' _check_repush_after_vanish(existing_noannex_dataset, dlaurl, remotepath) def _check_repush_after_vanish(ds, remoteurl, remotepath): dsrepo = ds.repo dsrepo.call_git(['remote', 'add', 'dla', remoteurl]) remotepath = Path(remotepath) dsrepo.call_git(['push', '-u', 'dla', DEFAULT_BRANCH]) eq_dla_branch_state(dsrepo.get_hexsha(DEFAULT_BRANCH), remotepath) # wipe out the remote rmtree(remotepath) assert not remotepath.exists() remotepath.mkdir(parents=True) # helper must detect the discrepancy and re-push, despite the local mirror # repo already being up-to-date dsrepo.call_git(['push', 'dla']) eq_dla_branch_state(dsrepo.get_hexsha(DEFAULT_BRANCH), remotepath) def test_params_from_url(): f = get_initremote_params_from_url # just the query part being used eq_(f('datalad-annex::?encryption=none&type=directory&directory=/this/h'), ['encryption=none', 'type=directory', 'directory=/this/h']) # some url prperty expansion eq_(f('datalad-annex::file:///this/h?type=directory&directory={path}'), ['type=directory', 'directory=/this/h']) # original URL, but query stripped eq_(f('https://ex.com/dav/proj/ds?type=webdav&url={noquery}&keyid=id@ex'), ['type=webdav', 'url=https://ex.com/dav/proj/ds', 'keyid=id@ex']) # proper unquoting eq_(f('http://ex.com?weirdparam=some%26amp'), ['type=web', 'exporttree=yes', 'url=http://ex.com?weirdparam=some%26amp']) # nothing is not valid assert_raises(ValueError, f, '') assert_raises(ValueError, f, 'datalad-annex::') # URL without annotation is type=web export remote eq_(f('http://example.com/path/to/something'), ['type=web', 'exporttree=yes', 'url=http://example.com/path/to/something']) def test_typeweb_annex(existing_noannex_dataset, http_server, tmp_path, no_result_rendering): _check_typeweb( # bypass the complications of folding a windows path into a file URL 'datalad-annex::?type=directory&directory={export}&encryption=none' \ if on_windows else 'datalad-annex::file://{export}?type=directory&directory={{path}}&encryption=none', 'datalad-annex::{url}?type=web&url={{noquery}}', existing_noannex_dataset, http_server, tmp_path, ) # just to exercise the code path leading to an uncompressed ZIP def test_typeweb_annex_uncompressed( existing_noannex_dataset, http_server, tmp_path, no_result_rendering): _check_typeweb( # bypass the complications of folding a windows path into a file URL 'datalad-annex::?type=directory&directory={export}&encryption=none&dladotgit=uncompressed' \ if on_windows else 'datalad-annex::file://{export}?type=directory&directory={{path}}&encryption=none&dladotgit=uncompressed', 'datalad-annex::{url}?type=web&url={{noquery}}', existing_noannex_dataset, http_server, tmp_path, ) def 
test_typeweb_export(existing_noannex_dataset, http_server, tmp_path, no_result_rendering): _check_typeweb( # bypass the complications of folding a windows path into a file URL 'datalad-annex::?type=directory&directory={export}&encryption=none&exporttree=yes' \ if on_windows else 'datalad-annex::file://{export}?type=directory&directory={{path}}&encryption=none&exporttree=yes', # when nothing is given type=web&exporttree=yes is the default 'datalad-annex::{url}', existing_noannex_dataset, http_server, tmp_path, ) def _check_typeweb(pushtmpl, clonetmpl, ds, server, clonepath): ds.repo.call_git([ 'remote', 'add', 'dla', pushtmpl.format(export=server.path), ]) ds.repo.call_git(['push', '-u', 'dla', DEFAULT_BRANCH]) # must override git-annex security setting for localhost with patched_env(**{ "GIT_CONFIG_COUNT": "1", "GIT_CONFIG_KEY_0": "annex.security.allowed-ip-addresses", "GIT_CONFIG_VALUE_0": "127.0.0.1"} ): dsclone = clone( clonetmpl.format(url=server.url), clonepath) eq_(ds.repo.get_hexsha(DEFAULT_BRANCH), dsclone.repo.get_hexsha(DEFAULT_BRANCH)) def test_submodule_url(tmp_path, existing_noannex_dataset, http_server, no_result_rendering): servepath = http_server.path url = http_server.url # a future subdataset that we want to register under a complex URL tobesubds = existing_noannex_dataset # push to test web server, this URL doesn't matter yet tobesubds.repo.call_git([ 'remote', 'add', 'dla', # bypass the complications of folding a windows path into a file URL f'datalad-annex::?type=directory&directory={servepath}&encryption=none&exporttree=yes' if on_windows else f'datalad-annex::file://{servepath}?type=directory&directory={{path}}&encryption=none&exporttree=yes', ]) tobesubds.repo.call_git(['push', '-u', 'dla', DEFAULT_BRANCH]) # create a superdataset to register the subds to super = Dataset(tmp_path / 'super').create() with patched_env(**{ "GIT_CONFIG_COUNT": "1", "GIT_CONFIG_KEY_0": "annex.security.allowed-ip-addresses", "GIT_CONFIG_VALUE_0": "127.0.0.1"} ): # this is the URL that matters # we intentionally use something that leaves a placeholder behind # in the submodule record super.clone( f'datalad-annex::{url}?type=web&url={{noquery}}&exporttree=yes', 'subds', ) # no clone the entire super superclone = clone(super.path, tmp_path / 'superclone') # and auto-fetch the sub via the datalad-annex remote helper superclone.get('subds', get_data=False, recursive=True) # we got the original subds subdsclone = Dataset(superclone.pathobj / 'subds') eq_(tobesubds.id, subdsclone.id) def test_webdav_auth(existing_noannex_dataset, tmp_path, credman, webdav_credential, webdav_server, no_result_rendering): credman.set(**webdav_credential) # this is the dataset we want to roundtrip through webdav ds = existing_noannex_dataset remoteurl = \ f'datalad-annex::{webdav_server.url}' \ '?type=webdav&url={noquery}&encryption=none&' \ f'dlacredential={urlquote(webdav_credential["name"])}' ds.repo.call_git(['remote', 'add', 'dla', remoteurl]) # roundtrip ds.repo.call_git(['push', '-u', 'dla', DEFAULT_BRANCH]) cln = clone(remoteurl, tmp_path) # must give the same thing eq_(ds.repo.get_hexsha(DEFAULT_BRANCH), cln.repo.get_hexsha(DEFAULT_BRANCH)) datalad-next-1.4.1/datalad_next/iter_collections/000077500000000000000000000000001462321624600220625ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/iter_collections/__init__.py000066400000000000000000000042061462321624600241750ustar00rootroot00000000000000"""Iterators for particular types of collections Most importantly this includes different collections 
(or containers) for files, such as a file system directory, or an archive (also see the ``ls_file_collection`` command). However, this module is not per-se limited to file collections. Most, if not all, implementation come in the form of a function that takes a collection identifier or a collection location (e.g., a file system path), and possibly some additional options. When called, an iterator is returned that produces collection items in the form of data class instances of a given type. The particular type can be different across different collections. .. currentmodule:: datalad_next.iter_collections .. autosummary:: :toctree: generated iter_annexworktree iter_dir iter_gitdiff iter_gitstatus iter_gittree iter_gitworktree iter_submodules iter_tar iter_zip TarfileItem ZipfileItem FileSystemItem FileSystemItemType GitTreeItemType GitWorktreeItem GitWorktreeFileSystemItem GitDiffItem GitDiffStatus GitContainerModificationType """ from .tarfile import ( # TODO move to datalad_next.types? TarfileItem, iter_tar, ) from .zipfile import ( # TODO move to datalad_next.types? ZipfileItem, iter_zip, ) # TODO move to datalad_next.types? from .utils import ( # TODO move to datalad_next.types? FileSystemItemType, # TODO move to datalad_next.types? FileSystemItem, compute_multihash_from_fp, ) from .directory import iter_dir from .gittree import ( # TODO move to datalad_next.types? GitTreeItemType, iter_gittree, ) from .gitworktree import ( # TODO move to datalad_next.types? GitWorktreeItem, # TODO move to datalad_next.types? GitWorktreeFileSystemItem, iter_gitworktree, iter_submodules, ) from .annexworktree import ( iter_annexworktree, ) from .gitdiff import ( # TODO move to datalad_next.types? GitDiffItem, # TODO move to datalad_next.types? GitDiffStatus, # TODO move to datalad_next.types? GitContainerModificationType, iter_gitdiff, ) from .gitstatus import ( iter_gitstatus, ) datalad-next-1.4.1/datalad_next/iter_collections/annexworktree.py000066400000000000000000000350111462321624600253300ustar00rootroot00000000000000"""Report on the content of a Git-annex repository worktree The main functionality is provided by the :func:`iter_annexworktree()` function. """ from __future__ import annotations import logging from dataclasses import dataclass from more_itertools import intersperse from pathlib import ( Path, PurePath, ) from typing import ( Any, Generator, ) from datalad_next.consts import on_windows from datalad_next.itertools import ( itemize, load_json, route_in, route_out, StoreOnly, ) from datalad_next.repo_utils import has_initialized_annex from datalad_next.runners import iter_git_subproc from .gitworktree import ( GitWorktreeItem, GitWorktreeFileSystemItem, iter_gitworktree ) from .utils import FileSystemItemType lgr = logging.getLogger('datalad.ext.next.iter_collections.annexworktree') @dataclass class AnnexWorktreeItem(GitWorktreeItem): annexkey: str | None = None annexsize: int | None = None # annex object path, relative to the item annexobjpath: PurePath | None = None @classmethod def from_gitworktreeitem( cls, item: GitWorktreeItem, ): return cls(**item.__dict__) @dataclass class AnnexWorktreeFileSystemItem(GitWorktreeFileSystemItem): annexkey: str | None = None annexsize: int | None = None # annex object path, relative to the item annexobjpath: PurePath | None = None # TODO this iterator should get a filter mechanism to limit it to a single # directory (non-recursive). This will be needed for gooey. # unlike iter_gitworktree() we pay a larger dedicated per item cost. 
# Given that the switch to iterative processing is also made for # iter_gitworktree() we should provide the same filtering for that one # too! def iter_annexworktree( path: Path, *, untracked: str | None = 'all', link_target: bool = False, fp: bool = False, recursive: str = 'repository', ) -> Generator[AnnexWorktreeItem | AnnexWorktreeFileSystemItem, None, None]: """Companion to ``iter_gitworktree()`` for git-annex repositories This iterator wraps :func:`~datalad_next.iter_collections.gitworktree.iter_gitworktree`. For each item, it determines whether it is an annexed file. If so, it amends the yielded item with information on the respective annex key, the byte size of the key, and its (would-be) location in the repository's annex. The basic semantics of all arguments are identical to :func:`~datalad_next.iter_collections.gitworktree.iter_gitworktree`. Importantly, with ``fp=True``, an annex object is opened directly, if available. If not available, no attempt is made to open the associated symlink or pointer file. With ``link_target`` and ``fp`` disabled, items of type :class:`AnnexWorktreeItem` are yielded, otherwise :class:`AnnexWorktreeFileSystemItem` instances are yielded. In both cases, ``annexkey``, ``annexsize``, and ``annexobjpath`` properties are provided. .. note:: Although ``annexobjpath`` is always set for annexed content, that does not imply that an object at this path actually exists. The latter will only be the case if the annexed content is present in the work tree, typically as a result of a `datalad get`- or `git annex get`-call. Parameters ---------- path: Path Path of a directory in a git-annex repository to report on. This directory need not be the root directory of the repository, but must be part of the repository's work tree. untracked: {'all', 'whole-dir', 'no-empty-dir'} or None, optional If not ``None``, also reports on untracked work tree content. ``all`` reports on any untracked file; ``whole-dir`` yields a single report for a directory that is entirely untracked, and not individual untracked files in it; ``no-empty-dir`` skips any reports on untracked empty directories. link_target: bool, optional If ``True``, information matching a :class:`~datalad_next.iter_collections.utils.FileSystemItem` will be included for each yielded item, and the targets of any symlinks will be reported, too. fp: bool, optional If ``True``, information matching a :class:`~datalad_next.iter_collections.utils.FileSystemItem` will be included for each yielded item, but without a link target detection, unless ``link_target`` is given. Moreover, each file-type item includes a file-like object to access the file's content. This file handle will be closed automatically when the next item is yielded. recursive: {'repository', 'no'}, optional Pass on to :func:`~datalad_next.iter_collections.gitworktree.iter_gitworktree`, thereby determining which items this iterator will yield. Yields ------ :class:`AnnexWorktreeItem` or :class:`AnnexWorktreeFileSystemItem` The ``name`` attribute of an item is a ``PurePath`` instance with the corresponding (relative) path, in platform conventions. """ glsf = iter_gitworktree( path, untracked=untracked, link_target=False, fp=False, recursive=recursive, ) if not has_initialized_annex(path): # this is not an annex repo. # we just yield the items from the gitworktree iterator. # we funnel them through the standard result item prep # function for type equality.
# when a recursive-mode other than 'repository' will be # implemented, this implementation needs to be double-checked # to avoid decision making on submodules just based on # the nature of the toplevel repo. for item in glsf: yield _get_worktree_item( path, get_fs_info=link_target, git_item=item) return git_fileinfo_store: list[Any] = list() # this is a technical helper that will just store a bunch of `None`s # for aligning item-results between git-ls-files and git-annex-find _annex_git_align: list[Any] = list() with \ iter_git_subproc( # we get the annex key for any filename # (or empty if not annexed) ['annex', 'find', '--anything', '--format=${key}\\n', '--batch'], # intersperse items with newlines to trigger a batch run # this avoids string operations to append newlines to items input=intersperse( b'\n', # use `GitWorktree*`-elements yielded by `iter_gitworktree` # to create an `AnnexWorktreeItem` or # `AnnexWorktreeFileSystemItem` object, which is stored in # `git_fileinfo_store`. Yield a string representation of # the path contained in the `GitWorktree*`-element yielded # by `iter_gitworktree` route_out( glsf, git_fileinfo_store, lambda git_worktree_item: ( str(git_worktree_item.name).encode(), git_worktree_item ) ) ), cwd=path, ) as gaf, \ iter_git_subproc( # get the key properties JSON-lines style ['annex', 'examinekey', '--json', '--batch'], # use only non-empty keys as input to `git annex examinekey`. input=intersperse( # Add line ending to submit the key to batch processing in # `git annex examinekey`. b'\n', route_out( itemize( gaf, # although we declare a specific key output format # for the git-annex find call, versions of # git-annex <10.20231129 on Windows will terminate # lines with '\r\n' instead of '\n'. We therefore use # `None` as separator, which enables `itemize()` # to use either separator, i.e. '\r\n' or '\n'. sep=None if on_windows else b'\n', ), # we need this route-out solely for the purpose # of maintaining a 1:1 relationship of items reported # by git-ls-files and git-annex-find (merged again # in the `route-in` that gives `results` below). The # "store" here does not actually store anything other # than`None`s (because the `key` --which is consumed by # `git annex examinekey`-- is also present in the # output of `git annex examinekey`). _annex_git_align, # do not process empty key lines. Non-empty key lines # are processed, but nothing needs to be stored because # the processing result includes the key itself. lambda key: (key if key else StoreOnly, None) ) ), cwd=path, ) as gek: results = route_in( # the following `route_in` yields processed keys for annexed # files and `StoreOnly` for non-annexed files. Its # cardinality is the same as the cardinality of # `iter_gitworktree`, i.e. it produces data for each element # yielded by `iter_gitworktree`. route_in( load_json(itemize(gek, sep=None)), _annex_git_align, # `processed` data is either `StoreOnly` or detailed # annex key information. we just return `process_data` as # result, because `join_annex_info` knows how to incorporate # it into an `AnnexWorktree*`-object. 
lambda processed_data, _: processed_data ), git_fileinfo_store, _join_annex_info, ) # at this point, each item in `results` is a dict with a `git_item` # key that holds a `GitWorktreeItem` instance, plus additional annex # related keys added by join_annex_info() for annexed files if not fp: # life is simpler here, we do not need to open any files in the # annex, hence all processing can be based in the information # collected so far for res in results: yield _get_worktree_item(path, get_fs_info=link_target, **res) return # if we get here, this is about file pointers... # for any annexed file we need to open, we need to locate it in # the annex. we get `annexobjpath` in the results. this is # relative to `path`. We could not use the `link_target`, because # we might be in a managed branch without link. path = Path(path) for res in results: try: item = _get_worktree_item(path, get_fs_info=True, **res) except FileNotFoundError: # there is nothing to open, yield non FS item item = _get_worktree_item(path, get_fs_info=False, **res) yield item continue # determine which file we would open fp_src = None if item.annexobjpath is not None: # this is an annexed file fp_src = item.annexobjpath elif item.type == FileSystemItemType.file \ and item.annexkey is None: # regular file (untracked or tracked) fp_src = item.name elif item.type == FileSystemItemType.symlink \ and item.annexkey is None: # regular symlink fp_src = item.name if fp_src is None: # nothing to open yield item else: fp_src_fullpath = path / fp_src if not fp_src_fullpath.exists(): # nothing there to open (would resolve through a symlink) yield item else: with fp_src_fullpath.open('rb') as active_fp: item.fp = active_fp yield item def _get_worktree_item( base_path: Path, get_fs_info: bool, git_item: GitWorktreeItem, annexkey: str | None = None, annexsize: int | None = None, annexobjpath: str | None = None, ) -> AnnexWorktreeFileSystemItem | AnnexWorktreeItem: """Internal helper to get an item from ``_join_annex_info()`` output The assumption is that minimal investigations have been done until this helper is called. In particular, no file system inspections have been performed. Depending on whether a user requested file system information to be contained in the items (``get_fs_info``), either ``AnnexWorktreeFileSystemItem`` or ``AnnexWorktreeItem`` is returned. The main workhorse of this function is ``AnnexWorktreeFileSystemItem.from_path()``. Besides calling it, information is only taken from arguments and injected into the item instances. """ # we did not do any filesystem inspection previously, so # do now when link_target is enabled item = AnnexWorktreeFileSystemItem.from_path( base_path / git_item.name, link_target=True, ) if get_fs_info else AnnexWorktreeItem.from_gitworktreeitem(git_item) # amend the AnnexWorktree* object with the available git info item.gitsha = git_item.gitsha item.gittype = git_item.gittype # amend the AnnexWorktree* object with the available annex info item.annexkey = annexkey item.annexsize = annexsize item.annexobjpath = annexobjpath return item def _join_annex_info( processed_data, stored_data: GitWorktreeItem, ) -> dict: """Internal helper to join results from pipeline stages All that is happening here is that information from git and git-annex inquiries gets merged into a single result dict.
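A sketch of the joined result for an annexed file (field values are purely illustrative)::

    {
        'git_item': GitWorktreeItem(...),
        'annexkey': 'MD5E-s1024--<hash>.dat',
        'annexsize': 1024,
        'annexobjpath': PurePath('.git/annex/objects/...'),
    }

For non-annexed items, only the ``git_item`` key is present.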
""" joined = dict(git_item=stored_data) if processed_data is StoreOnly: # this is a non-annexed item, nothing to join return joined else: # here processed data is a dict with properties from annex examinekey joined.update( annexkey=processed_data['key'], annexsize=int(processed_data['bytesize']), annexobjpath=PurePath(str(processed_data['objectpath'])), ) return joined datalad-next-1.4.1/datalad_next/iter_collections/directory.py000066400000000000000000000042071462321624600244430ustar00rootroot00000000000000"""Report on the content of directories The main functionality is provided by the :func:`iter_dir()` function. """ from __future__ import annotations from dataclasses import dataclass from pathlib import Path from typing import Generator from datalad_next.exceptions import CapturedException from .utils import ( FileSystemItem, FileSystemItemType, ) @dataclass # sadly PY3.10+ only (kw_only=True) class DirectoryItem(FileSystemItem): pass def iter_dir( path: Path, *, fp: bool = False, ) -> Generator[DirectoryItem, None, None]: """Uses ``Path.iterdir()`` to iterate over a directory and reports content The iterator produces an :class:`DirectoryItem` instance with standard information on file system elements, such as ``size``, or ``mtime``. In addition to a plain ``Path.iterdir()`` the report includes a path-type label (distinguished are ``file``, ``directory``, ``symlink``). Parameters ---------- path: Path Path of the directory to report content for (iterate over). fp: bool, optional If ``True``, each file-type item includes a file-like object to access the file's content. This file handle will be closed automatically when the next item is yielded. Yields ------ :class:`DirectoryItem` The ``name`` attribute of an item is a ``Path`` instance, with the format matching the main ``path`` argument. When an absolute ``path`` is given, item names are absolute paths too. When a relative path is given, it is relative to CWD, and items names are relative paths (relative to CWD) too. """ for c in path.iterdir(): # c could disappear while this is running. Example: temp files managed # by other processes. try: item = DirectoryItem.from_path( c, link_target=True, ) except FileNotFoundError as e: CapturedException(e) continue if fp and item.type == FileSystemItemType.file: with c.open('rb') as fp: item.fp = fp yield item else: yield item datalad-next-1.4.1/datalad_next/iter_collections/gitdiff.py000066400000000000000000000441501462321624600240540ustar00rootroot00000000000000"""Report on the difference of two Git tree-ishes or tracked worktree content The main functionality is provided by the :func:`iter_gitdiff()` function. 
""" from __future__ import annotations from dataclasses import dataclass from enum import Enum from functools import cached_property import logging from pathlib import ( Path, PurePosixPath, ) from typing import Generator from datalad_next.consts import PRE_INIT_COMMIT_SHA from datalad_next.runners import ( CommandError, iter_git_subproc, ) from datalad_next.itertools import ( decode_bytes, itemize, ) from datalad_next.runners import ( call_git, call_git_oneline, ) from .gittree import ( GitTreeItem, GitTreeItemType, _mode_type_map, ) lgr = logging.getLogger('datalad.ext.next.iter_collections.gitdiff') # TODO Could be `StrEnum`, came with PY3.11 class GitDiffStatus(Enum): """Enumeration of statuses for diff items """ addition = 'addition' copy = 'copy' deletion = 'deletion' modification = 'modification' rename = 'rename' typechange = 'typechange' unmerged = 'unmerged' unknown = 'unknown' # this is a local addition and not defined by git # AKA "untracked" other = 'other' _diffstatus_map = { 'A': GitDiffStatus.addition, 'C': GitDiffStatus.copy, 'D': GitDiffStatus.deletion, 'M': GitDiffStatus.modification, 'R': GitDiffStatus.rename, 'T': GitDiffStatus.typechange, 'U': GitDiffStatus.unmerged, 'X': GitDiffStatus.unknown, 'O': GitDiffStatus.other, } # TODO Could be `StrEnum`, came with PY3.11 class GitContainerModificationType(Enum): new_commits = 'new commits' untracked_content = 'untracked content' modified_content = 'modified content' @dataclass class GitDiffItem(GitTreeItem): """``GitTreeItem`` with "previous" property values given a state comparison """ prev_name: str | None = None prev_gitsha: str | None = None prev_gittype: GitTreeItemType | None = None status: GitDiffStatus | None = None percentage: int | None = None """This is the percentage of similarity for copy-status and rename-status diff items, and the percentage of dissimilarity for modifications.""" modification_types: tuple[GitContainerModificationType, ...] | None = None """Qualifiers for modification types of container-type items (directories, submodules).""" def __post_init__(self): if self.status == GitDiffStatus.addition and self.gitsha is None: self.add_modification_type(GitContainerModificationType.modified_content) @cached_property def prev_path(self) -> PurePosixPath | None: """Returns the item ``prev_name`` as a ``PurePosixPath`` instance""" if self.prev_name is None: return None return PurePosixPath(self.prev_name) def add_modification_type(self, value: GitContainerModificationType): if self.modification_types is None: self.modification_types = (value,) else: self.modification_types = (*self.modification_types, value) def iter_gitdiff( path: Path, from_treeish: str | None, to_treeish: str | None, *, recursive: str = 'repository', find_renames: int | None = None, find_copies: int | None = None, yield_tree_items: str | None = None, # TODO add documentation eval_submodule_state: str = 'full', ) -> Generator[GitDiffItem, None, None]: """Report differences between Git tree-ishes or tracked worktree content This function is a wrapper around the Git command ``diff-tree`` and ``diff-index``. Therefore most semantics also apply here. The main difference with respect to the Git commands are: 1) uniform support for non-recursive, single tree reporting (no subtrees); and 2) support for submodule recursion. Notes on 'no' recursion mode When comparing to the worktree, ``git diff-index`` always reports on subdirectories. For homogeneity with the report on a committed tree, a non-recursive mode emulation is implemented. 
It compresses all reports from a direct subdirectory into a single report on that subdirectory. The ``gitsha`` of that directory item will always be ``None``. Moreover, no type or typechange inspection, or further filesystem queries are performed. Therefore, ``prev_gittype`` will always be ``None``, and any change other than the addition of the directory will be labeled as a ``GitDiffStatus.modification``. Parameters ---------- path: Path Path of a directory in a Git repository to report on. This directory need not be the root directory of the repository, but must be part of the repository. If the directory is not the root directory of a non-bare repository, the iterator is constrained to items underneath that directory. from_treeish: str or None Git "tree-ish" that defines the comparison reference. If ``None``, ``to_treeish`` must not be ``None`` (see its documentation for details). to_treeish: str or None Git "tree-ish" that defines the comparison target. If ``None``, ``from_treeish`` must not be ``None``, and that tree-ish will be compared against the worktree (see its documentation for details). If ``from_treeish`` is ``None``, the given tree-ish is compared to its immediate parents (see ``git diff-tree`` documentation for details). recursive: {'repository', 'submodules', 'no'}, optional Behavior for recursion into subtrees. By default (``repository``), all trees within the repository underneath ``path`` are reported, but no tree within submodules. With ``submodules``, recursion includes any submodule that is present. If ``no``, only direct children are reported on. find_renames: int, optional If given, this defines the similarity threshold for detecting renames (see ``git diff-{index,tree} --find-renames``). By default, no rename detection is done and reported items never have the ``rename`` status. Instead, a rename would be reported as a deletion and an addition. find_copies: int, optional If given, this defines the similarity threshold for detecting copies (see ``git diff-{index,tree} --find-copies``). By default, no copy detection is done and reported items never have the ``copy`` status. Instead, a copy would be reported as an addition. This option always implies the use of the ``--find-copies-harder`` Git option that enables reporting of copy sources, even when they have not been modified in the same change. This is a very expensive operation for large projects, so use it with caution. yield_tree_items: {'submodules', 'directories', 'all', None}, optional Whether to yield an item for a type of subtree that will also be recursed into. For example, a submodule item, when submodule recursion is enabled. When disabled, subtree items (directories, submodules) will still be reported whenever there is no recursion into them. For example, submodule items are reported when ``recursive='repository'``, even when ``yield_tree_items=None``. Yields ------ :class:`GitDiffItem` The ``name`` and ``prev_name`` attributes of an item are a ``str`` with the corresponding (relative) path, as reported by Git (in POSIX conventions). """ # we force-convert to Path to give us the peace of mind we want. # The docs already ask for that, but it is easy to # forget/ignore and leads to non-obvious errors.
Running this once is # a cheap safety net path = Path(path) # put most args in a container, we need to pass then around quite # a bit kwargs = dict( from_treeish=from_treeish, to_treeish=to_treeish, recursive=recursive, find_renames=find_renames, find_copies=find_copies, yield_tree_items=yield_tree_items, eval_submodule_state=eval_submodule_state, ) cmd = _build_cmd(**kwargs) if cmd[0] == 'diff-index': # when we compare to the index, we need a refresh run to not have # something like plain mtime changes trigger modification reports # https://github.com/datalad/datalad-next/issues/639 call_git([ 'update-index', # must come first, we recurse ourselves '--ignore-submodules', # we want to continue the refresh when the index need updating '-q', '--refresh', ], cwd=path) # when do we need to condense subdir reports into a single dir-report reported_dirs: set[str] = set() _single_dir = (cmd[0] == 'diff-index') and recursive == 'no' # diff-tree reports the compared tree when no from is given, we need # to skip that output below skip_first = (cmd[0] == 'diff-tree') and from_treeish is None pending_props = None for line in _git_diff_something(path, cmd): if skip_first: skip_first = False continue if pending_props: pending_props.append(line) if pending_props[4][0] in ('C', 'R'): # for copies and renames we expect a second path continue yield from _yield_diff_item( cwd=path, single_dir=_single_dir, spec=pending_props, reported_dirs=reported_dirs, **kwargs ) pending_props = None elif line.startswith(':'): pending_props = line[1:].split(' ') else: # pragma: no cover raise RuntimeError( 'we should not get here, unexpected diff output') if pending_props: # flush yield from _yield_diff_item( cwd=path, single_dir=_single_dir, spec=pending_props, reported_dirs=reported_dirs, **kwargs ) def _build_cmd( *, from_treeish, to_treeish, recursive, yield_tree_items, find_renames, find_copies, eval_submodule_state, ) -> list[str]: # from : to : description # --------------------------- # HEAD : None : compare to worktree, not with the index (diff-index) # HEAD~2 : HEAD : compare trees (diff-tree) # None : HEAD~2 : compare tree with its parents (diff-tree) # None : None : exception common_args: list[str] = [ '--no-rename-empty', # ignore changes above CWD '--relative', '--raw', '-z', ] if find_renames is not None: common_args.append(f'--find-renames={find_renames}%') if find_copies is not None: common_args.append(f'--find-copies={find_copies}%') # if someone wants to look for copies, we actually look # for copies. This is expensive, but IMHO is the one # thing that makes this useful # TODO possibly we only want to enable this when # find_copies==100 (exact copies), based on the assumption # that this is cheaper than reading all file content. 
# but if that is actually true remains to be tested common_args.append(f'--find-copies-harder') if eval_submodule_state == 'no': common_args.append('--ignore-submodules=all') elif eval_submodule_state == 'commit': common_args.append('--ignore-submodules=dirty') elif eval_submodule_state == 'full': common_args.append('--ignore-submodules=none') else: raise ValueError( f'unknown submodule evaluation mode {eval_submodule_state!r}') if from_treeish is None and to_treeish is None: raise ValueError( 'either `from_treeish` or `to_treeish` must not be None') elif to_treeish is None: cmd = ['diff-index', *common_args, from_treeish] else: # diff NOT against the working tree cmd = ['diff-tree', *common_args] if recursive == 'repository': cmd.append('-r') if yield_tree_items in ('all', 'directories'): cmd.append('-t') if from_treeish is None: cmd.append(to_treeish) else: # two tree-ishes given cmd.extend((from_treeish, to_treeish)) # add disambiguation marker for pathspec. # even if we do not pass any, we get simpler error messages from Git cmd.append('--') return cmd def _yield_diff_item( *, cwd: Path, recursive: str, from_treeish: str | None, to_treeish: str | None, spec: list, single_dir: bool, reported_dirs: set, yield_tree_items: bool, **kwargs ) -> Generator[GitDiffItem, None, None]: props: dict[str, str | int | GitTreeItemType] = {} props.update( (k, _mode_type_map.get(v, None)) for k, v in (('prev_gittype', spec[0]), ('gittype', spec[1])) ) props.update( (k, None if v == (40 * '0') else v) for k, v in (('prev_gitsha', spec[2]), ('gitsha', spec[3])) ) status = spec[4] props['status'] = _diffstatus_map[status[0]] if len(status) > 1: props['percentage'] = int(status[1:]) if status == 'A': # this is an addition, we want `name` in the right place props['name'] = spec[5] else: props['prev_name'] = spec[5] props['name'] = spec[6] if len(spec) > 6 else spec[5] # at this point we know all about the item # conversion should be cheap, so let's do this here # and get a bit neater code for the rest of this function item = GitDiffItem(**props) if not single_dir: if item.gittype != GitTreeItemType.submodule: yield item return # this is about a present submodule if item.status == GitDiffStatus.modification: if item.gitsha is None: # in 'git diff-index' speak the submodule is "out-of-sync" with # the index: this happens when there are new commits item.add_modification_type( GitContainerModificationType.new_commits) # TODO we cannot give details for other modification types. # depending on --ignore-submodules a range of situations # could be the case #else: # # this modification means that "content" is modified # item.add_modification_type( # GitContainerModificationType.modified_content) if recursive != 'submodules' or yield_tree_items in ( 'all', 'submodules'): # we are instructed to yield it yield item if recursive == 'submodules': # I believe we need no protection against absent submodules. # The only way they can appear here is a reported modification. # The only modification that is possible with an absent submodule # is a deletion. And that would cause the item.gittype to be None # -- a condition that is caught above for i in iter_gitdiff( cwd / PurePosixPath(item.name), **dict( kwargs, # we never want to pass None here # if `prev_gitsha` is None, it means that the # submodule record is new, and we want its full # content reported. Passing None, however, # would only report the change to the current # state. 
from_treeish=item.prev_gitsha or PRE_INIT_COMMIT_SHA, # when comparing the parent to the worktree, we # also want to compare any children to the worktree to_treeish=None if to_treeish is None else item.gitsha, ) ): # prepend any item name with the parent items # name for attr in ('name', 'prev_name'): val = getattr(i, attr) if val is not None: setattr(i, attr, f'{item.name}/{val}') yield i return name = props['name'] or props['prev_name'] # we cannot have items that have no name whatsoever assert name is not None # we decide on mangling the actual report to be on the containing directory # only, or to withhold it entirely dname_l = name.split('/', maxsplit=1) if len(dname_l) < 2: # nothing in a subdirectory yield item return dname = dname_l[0] if dname in reported_dirs: # nothing else todo, we already reported return reported_dirs.add(dname) yield _mangle_item_for_singledir(item, dname, from_treeish, cwd) def _mangle_item_for_singledir(item, dname, from_treeish, cwd): # at this point we have a change report on subdirectory content # we only get here when comparing `from_treeish` to the worktree. item.name = dname # non-committed change -> no SHA (this ignored the index, # like we do elsewhere too) item.gitsha = None item.gittype = GitTreeItemType.directory try: item.prev_gitsha = call_git_oneline( ['rev-parse', '-q', f'{from_treeish}:./{dname}'], cwd=cwd, ) # if we get here, we know that the name was valid in # `from_treeish` too item.prev_name = dname # it would require more calls to figure out the mode and infer # a possible type change. For now, we do not go there item.prev_gittype = None item.status = GitDiffStatus.modification except CommandError: # the was nothing with this name in `from_treeish`, but now # it exists. We compare to the worktree, but not any untracked # content -- this means that we likely compare across multiple # states and the directory become tracked after `from_treeish`. # let's call it an addition item.prev_gitsha = None item.prev_gittype = None item.status = GitDiffStatus.addition return item def _git_diff_something(path, args): with iter_git_subproc([*args], cwd=path) as r: yield from decode_bytes( itemize( r, sep=b'\0', keep_ends=False, ) ) datalad-next-1.4.1/datalad_next/iter_collections/gitstatus.py000066400000000000000000000473131462321624600244730ustar00rootroot00000000000000"""Report on the status of the worktree The main functionality is provided by the :func:`iter_gitstatus` function. """ from __future__ import annotations import logging from pathlib import ( Path, PurePath, ) from typing import Generator from datalad_next.consts import PRE_INIT_COMMIT_SHA from datalad_next.runners import ( CommandError, call_git_lines, iter_git_subproc, ) from datalad_next.itertools import ( decode_bytes, itemize, ) from datalad_next.repo_utils import ( get_worktree_head, ) from .gitdiff import ( GitDiffItem, GitDiffStatus, GitContainerModificationType, iter_gitdiff, ) from .gitworktree import ( GitTreeItem, GitTreeItemType, iter_gitworktree, iter_submodules, lsfiles_untracked_args, _git_ls_files, ) lgr = logging.getLogger('datalad.ext.next.iter_collections.gitstatus') def iter_gitstatus( path: Path, *, untracked: str | None = 'all', recursive: str = 'repository', eval_submodule_state: str = "full", ) -> Generator[GitDiffItem, None, None]: """ Recursion mode 'no' This mode limits the reporting to immediate directory items of a given path. This mode is not necessarily faster than a 'repository' recursion. 
Its primary purpose is the ability to deliver a collapsed report in that subdirectories are treated similar to submodules -- as containers that maybe have modified or untracked content. Parameters ---------- path: Path Path of a directory in a Git repository to report on. This directory need not be the root directory of the repository, but must be part of the repository. If the directory is not the root directory of a non-bare repository, the iterator is constrained to items underneath that directory. untracked: {'all', 'whole-dir', 'no-empty-dir'} or None, optional If not ``None``, also reports on untracked work tree content. ``all`` reports on any untracked file; ``whole-dir`` yields a single report for a directory that is entirely untracked, and not individual untracked files in it; ``no-empty-dir`` skips any reports on untracked empty directories. Also see ``eval_submodule_state`` for how this parameter is applied in submodule recursion. recursive: {'no', 'repository', 'submodules', 'monolithic'}, optional Behavior for recursion into subtrees. By default (``repository``), all trees within the repository underneath ``path``) are reported, but no tree within submodules. With ``submodules``, recursion includes any submodule that is present. If ``no``, only direct children are reported on. eval_submodule_state: {"no", "commit", "full"}, optional If 'full' (default), the state of a submodule is evaluated by considering all modifications, with the treatment of untracked files determined by `untracked`. If 'commit', the modification check is restricted to comparing the submodule's "HEAD" commit to the one recorded in the superdataset. If 'no', the state of the subdataset is not evaluated. When a git-annex repository in adjusted mode is detected, the reference commit that the worktree is being compared to is the basis of the adjusted branch (i.e., the corresponding branch). Yields ------ :class:`GitDiffItem` The ``name`` and ``prev_name`` attributes of an item are a ``str`` with the corresponding (relative) path, as reported by Git (in POSIX conventions). .. note:: The implementation requires `git rev-parse --path-format=relative` that was introduced with Git v2.31. """ path = Path(path) head, corresponding_head = get_worktree_head(path) if head is None: # no commit at all -> compare to an empty repo. head = PRE_INIT_COMMIT_SHA # TODO it would make sense to always (or optionally) compare against any # existing corresponding_head. 
This would make the status communicate # anything that has not made it into the corresponding branch yet common_args = dict( head=head, path=path, untracked=untracked, eval_submodule_state=eval_submodule_state, ) if recursive == 'no': yield from _yield_dir_items(**common_args) return elif recursive == 'repository': yield from _yield_repo_items(**common_args) # TODO what we really want is a status that is not against a per-repository # HEAD, but against the commit that is recorded in the parent repository # TODO we need a name for that elif recursive in ('submodules', 'monolithic'): yield from _yield_hierarchy_items( recursion_mode=recursive, **common_args, ) else: raise ValueError(f'unknown recursion type {recursive!r}') # # status generators for each mode # def _yield_dir_items( *, head: str | None, path: Path, untracked: str | None, eval_submodule_state: str, ): # potential container items in a directory that need content # investigation container_types = ( GitTreeItemType.directory, GitTreeItemType.submodule, ) if untracked == 'no': # no need to look at anything other than the diff report dir_items = {} else: # there is no recursion, avoid wasting cycles on listing individual # files in subdirectories untracked = 'whole-dir' if untracked == 'all' else untracked # gather all dierectory items upfront, we subtract the ones reported # modified later and lastly yield all untracked content from them dir_items = { str(item.name): item for item in iter_gitworktree( path, untracked=untracked, recursive='no', ) } # diff constrained to direct children for item in iter_gitdiff( path, from_treeish=head, # to the worktree to_treeish=None, recursive='no', # TODO trim scope like in repo_items eval_submodule_state=eval_submodule_state, ): if item.status != GitDiffStatus.deletion \ and item.gittype in container_types: if item.gittype == GitTreeItemType.submodule: # issue standard submodule container report _eval_submodule(path, item, eval_submodule_state) else: dir_path = path / item.path # this is on a directory. if it appears here, it has # modified content if dir_path.exists(): item.add_modification_type( GitContainerModificationType.modified_content) if untracked != 'no' \ and _path_has_untracked(path / item.path): item.add_modification_type( GitContainerModificationType.untracked_content) else: # this directory is gone entirely item.status = GitDiffStatus.deletion item.modification_types = None # we dealt with this item completely dir_items.pop(item.name, None) if item.status: yield item if untracked == 'no': return # yield anything untracked, and inspect remaining containers for dir_item in dir_items.values(): if dir_item.gitsha is None and dir_item.gittype is None: # this is untracked yield GitDiffItem( # for homgeneity for report a str-path no matter what name=str(dir_item.name), status=GitDiffStatus.other, ) elif dir_item.gittype in container_types: # none of these containers has any modification other than # possibly untracked content item = GitDiffItem( # for homgeneity for report a str-path no matter what name=str(dir_item.name), # this submodule has not been detected as modified # per-commit, assign reported gitsha to pre and post # state gitsha=dir_item.gitsha, prev_gitsha=dir_item.gitsha, gittype=dir_item.gittype, # TODO others? ) if item.gittype == GitTreeItemType.submodule: # issue standard submodule container report _eval_submodule(path, item, eval_submodule_state) else: # this is on a directory. 
if it appears here, it has # no modified content if _path_has_untracked(path / dir_item.path): item.status = GitDiffStatus.modification item.add_modification_type( GitContainerModificationType.untracked_content) if item.status: yield item def _yield_repo_items( *, head: str | None, path: Path, untracked: str | None, eval_submodule_state: str, ) -> Generator[GitDiffItem, None, None]: """Report status items for a single/whole repsoitory""" present_submodules = { # stringify name for speedy comparison # TODO double-check that comparisons are primarily with # GitDiffItem.name which is str str(item.name): item for item in iter_submodules(path) } # start with a repository-contrained diff against the worktree for item in iter_gitdiff( path, from_treeish=head, # to the worktree to_treeish=None, recursive='repository', # we should be able to go cheaper with the submodule evaluation here. # We need to redo some check for adjusted mode, and other cases anyways eval_submodule_state='commit' if eval_submodule_state == 'full' else eval_submodule_state, ): # immediately investigate any submodules that are already # reported modified by Git if item.gittype == GitTreeItemType.submodule: _eval_submodule(path, item, eval_submodule_state) # we dealt with this submodule present_submodules.pop(item.name, None) if item.status: yield item # we are not generating a recursive report for submodules, hence # we need to look at ALL submodules for untracked content # `or {}` for the case where we got no submodules, which happens # with `eval_submodule_state == 'no'` for subm_name, subm_item in (present_submodules or {}).items(): # none of these submodules has any modification other than # possibly untracked content item = GitDiffItem( # for homgeneity for report a str-path no matter what name=str(subm_item.name), # this submodule has not been detected as modified # per-commit, assign reported gitsha to pre and post # state gitsha=subm_item.gitsha, prev_gitsha=subm_item.gitsha, gittype=subm_item.gittype, # TODO others? ) # TODO possibly trim eval_submodule_state _eval_submodule(path, item, eval_submodule_state) if item.status: yield item if untracked == 'no': return # lastly untracked files of this repo yield from _yield_repo_untracked(path, untracked) def _yield_hierarchy_items( *, head: str | None, path: Path, untracked: str | None, recursion_mode: str, eval_submodule_state: str, ) -> Generator[GitDiffItem, None, None]: for item in _yield_repo_items( head=head, path=path, untracked=untracked, # TODO do we need to adjust the eval mode here for the diff recmodes? eval_submodule_state=eval_submodule_state, ): # there is nothing else to do for any non-submodule item if item.gittype != GitTreeItemType.submodule: yield item continue # we get to see any submodule item passing through here, and can simply # call this function again for a subpath # submodule recursion # the .path of a GitTreeItem is always POSIX sm_path = path / item.path if recursion_mode == 'submodules': # in this mode, we run the submodule status against it own # worktree head sm_head, _ = get_worktree_head(sm_path) # because this need not cover all possible changes with respect # to the parent repository, we yield an item on the submodule # itself yield item elif recursion_mode == 'monolithic': # in this mode we determine the change of the submodule with # respect to the recorded state in the parent. This is either # the current gitsha, or (if git detected a committed # modification) the previous sha. 
This way, any further report # on changes a comprehensive from the point of view of the parent # repository, hence no submodule item is emitted sm_head = item.gitsha or item.prev_gitsha if GitContainerModificationType.new_commits in item.modification_types: # this is a submodule that has new commits compared to # its state in the parent dataset. We need to yield this # item, even if nothing else is modified, because otherwise # this (unsafed) changed would go unnoticed # https://github.com/datalad/datalad-next/issues/645 yield item for i in _yield_hierarchy_items( head=sm_head, path=sm_path, untracked=untracked, # TODO here we could implement handling for a recursion-depth limit recursion_mode=recursion_mode, eval_submodule_state=eval_submodule_state, ): i.name = f'{item.name}/{i.name}' yield i # # Helpers # def _yield_repo_untracked( path: Path, untracked: str | None, ) -> Generator[GitDiffItem, None, None]: """Yield items on all untracked content in a repository""" if untracked is None: return for uf in _git_ls_files( path, *lsfiles_untracked_args[untracked], ): yield GitDiffItem( name=uf, status=GitDiffStatus.other, # it is cheap to discriminate between a directory and anything # else. So let's do that, but not spend the cost of deciding # between files and symlinks gittype=GitTreeItemType.directory if uf.endswith('/') else None ) def _path_has_untracked(path: Path) -> bool: """Recursively check for any untracked content (except empty dirs)""" if not path.exists(): # cannot possibly have untracked return False for ut in _yield_repo_untracked( path, 'no-empty-dir', ): # fast exit on the first detection return True # we need to find all submodules, regardless of mode. # untracked content can also be in a submodule underneath # a directory for subm in iter_submodules(path): if _path_has_untracked(path / subm.path): # fast exit on the first detection return True # only after we saw everything we can say there is nothing return False def _get_submod_worktree_head(path: Path) -> tuple[bool, str | None, bool]: """Returns (submodule exists, SHA | None, adjusted)""" try: HEAD, corresponding_head = get_worktree_head(path) except ValueError: return False, None, False adjusted = corresponding_head is not None if adjusted: # this is a git-annex adjusted branch. do the comparison against # its basis. it is not meaningful to track the managed branch in # a superdataset HEAD = corresponding_head res = call_git_lines( ['rev-parse', '--path-format=relative', '--show-toplevel', HEAD], cwd=path, ) assert len(res) == 2 if res[0].startswith('..'): # this is not a report on a submodule at this location return False, None, adjusted else: return True, res[1], adjusted def _eval_submodule(basepath, item, eval_mode) -> None: """In-place amend GitDiffItem submodule item It does nothing with ``eval_mode='no'``. """ if eval_mode == 'no': return item_path = basepath / item.path # this is the cheapest test for the theoretical chance that a submodule # is present at `item_path`. 
This is beneficial even when we would only # run a single call to `git rev-parse` # https://github.com/datalad/datalad-next/issues/606 if not (item_path / '.git').exists(): return # get head commit, and whether a submodule is actually present, # and/or in adjusted mode subds_present, head_commit, adjusted = _get_submod_worktree_head(item_path) if not subds_present: return if adjusted: _eval_submodule_adjusted(item_path, item, head_commit, eval_mode) else: _eval_submodule_normal(item_path, item, head_commit, eval_mode) def _eval_submodule_normal(item_path, item, head_commit, eval_mode) -> None: if eval_mode == 'full' and item.status is None or ( item.modification_types and GitContainerModificationType.new_commits in item.modification_types ): # if new commits have been detected, the diff-implementation is # not able to report "modified content" at the same time, if it # exists. This requires a dedicated inspection, which conincidentally # is identical to the analysis of an adjusted mode submodule. return _eval_submodule_adjusted( item_path, item, head_commit, eval_mode) if item.gitsha != head_commit: item.status = GitDiffStatus.modification item.add_modification_type(GitContainerModificationType.new_commits) if eval_mode == 'commit': return # check for untracked content (recursively) if _path_has_untracked(item_path): item.status = GitDiffStatus.modification item.add_modification_type( GitContainerModificationType.untracked_content) def _eval_submodule_adjusted(item_path, item, head_commit, eval_mode) -> None: # we cannot rely on the diff-report for a submodule in adjusted mode. # git would make the comparison to the adjusted branch HEAD alone. # this would almost always be invalid, because it is not meaningful to # track a commit in an adjusted branch (it goes away). # # instead, we need to: # - check for a change in the corresponding HEAD to the recorded commit # in the parent repository, consider any change "new commits" # - check for a diff of the worktree to corresponding HEAD, consider # any such diff a "modified content" # - and lastly check for untracked content # start with "no modification" item.status = None item.modification_types = None if item.prev_gitsha != head_commit: item.status = GitDiffStatus.modification item.add_modification_type(GitContainerModificationType.new_commits) if eval_mode == 'commit': return if any( i.status is not None for i in iter_gitdiff( item_path, from_treeish=head_commit, # worktree to_treeish=None, recursive='repository', find_renames=None, find_copies=None, eval_submodule_state='commit', ) ): item.status = GitDiffStatus.modification item.add_modification_type( GitContainerModificationType.modified_content) # check for untracked content (recursively) if _path_has_untracked(item_path): item.status = GitDiffStatus.modification item.add_modification_type( GitContainerModificationType.untracked_content) datalad-next-1.4.1/datalad_next/iter_collections/gittree.py000066400000000000000000000106611462321624600241030ustar00rootroot00000000000000"""Report on the content of a Git tree-ish The main functionality is provided by the :func:`iter_gittree()` function. 
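As an illustration only (``repo_path`` is an assumed stand-in for an existing
Git repository with at least one commit; it is not a name defined by this
module), the iterator is typically consumed like this::

    from pathlib import Path

    for item in iter_gittree(Path(repo_path), 'HEAD', recursive='repository'):
        # each item carries the relative POSIX path, the blob/tree sha,
        # and the decoded item type
        print(item.name, item.gitsha, item.gittype)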
""" from __future__ import annotations from dataclasses import dataclass from enum import Enum from functools import cached_property import logging from pathlib import ( Path, PurePosixPath, ) from typing import Generator from datalad_next.runners import iter_git_subproc from datalad_next.itertools import ( decode_bytes, itemize, ) from .utils import PathBasedItem lgr = logging.getLogger('datalad.ext.next.iter_collections.gittree') # TODO Could be `StrEnum`, came with PY3.11 class GitTreeItemType(Enum): """Enumeration of item types of Git trees """ file = 'file' executablefile = 'executablefile' symlink = 'symlink' directory = 'directory' submodule = 'submodule' @dataclass class GitTreeItem(PathBasedItem): """``PathBasedItem`` with a relative path as a name (in POSIX conventions) """ name: str # gitsha is not the sha1 of the file content, but the output # of `git hash-object` which does something like # `printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum` gitsha: str | None = None gittype: GitTreeItemType | None = None @cached_property def path(self) -> PurePosixPath: """Returns the item name as a ``PurePosixPath`` instance""" return PurePosixPath(self.name) _mode_type_map = { '100644': GitTreeItemType.file, '100755': GitTreeItemType.executablefile, '040000': GitTreeItemType.directory, '120000': GitTreeItemType.symlink, '160000': GitTreeItemType.submodule, } def iter_gittree( path: Path, treeish: str, *, recursive: str = 'repository', ) -> Generator[GitTreeItem, None, None]: """Uses ``git ls-tree`` to report on a tree in a Git repository Parameters ---------- path: Path Path of a directory in a Git repository to report on. This directory need not be the root directory of the repository, but must be part of the repository. If the directory is not the root directory of a non-bare repository, the iterator is constrained to items underneath that directory. recursive: {'repository', 'no'}, optional Behavior for recursion into subtrees. By default (``repository``), all tree within the repository underneath ``path``) are reported, but not tree within submodules. If ``no``, only direct children are reported on. Yields ------ :class:`GitTreeItem` The ``name`` attribute of an item is a ``str`` with the corresponding (relative) path, as reported by Git (in POSIX conventions). """ # we force-convert to Path to give us the piece of mind we want. # The docs already ask for that, but it is easy to # forget/ignore and leads to non-obvious errors. Running this once is # a cheap safety net path = Path(path) # although it would be easy to also query the object size, we do not # do so, because it has a substantial runtime impact. 
It is unclear # what the main factor for the slowdown is, but in test cases I can # see 10x slower #lstree_args = ['--long'] # we do not go for a custom format that would allow for a single split # by tab, because if we do, Git starts quoting paths with special # characters (like tab) again #lstree_args = ['--format=%(objectmode)%x09%(objectname)%x09%(path)'] lstree_args = [] if recursive == 'repository': lstree_args.append('-r') for line in _git_ls_tree(path, treeish, *lstree_args): yield _get_tree_item(line) def _get_tree_item(spec: str) -> GitTreeItem: props, path = spec.split('\t', maxsplit=1) # 0::2 gets the first and third (last) item, effectively skippping the # type name (blob/tree etc.), we have the mode lookup for that, which # provides more detail mode, sha = props.split(' ')[0::2] return GitTreeItem( name=path, gitsha=sha, gittype=_mode_type_map[mode], ) def _git_ls_tree(path, *args): with iter_git_subproc( [ 'ls-tree', # we rely on zero-byte splitting below '-z', # otherwise take whatever is coming in *args, ], cwd=path, ) as r: yield from decode_bytes( itemize( r, sep=b'\0', keep_ends=False, ) ) datalad-next-1.4.1/datalad_next/iter_collections/gitworktree.py000066400000000000000000000315461462321624600250130ustar00rootroot00000000000000"""Report on the content of a Git repository worktree The main functionality is provided by the :func:`iter_gitworktree()` function. """ from __future__ import annotations from dataclasses import dataclass from itertools import chain import logging from pathlib import ( Path, PurePath, PurePosixPath, ) from typing import ( Dict, Generator, Tuple, ) from datalad_next.runners import iter_git_subproc from datalad_next.itertools import ( decode_bytes, itemize, ) from datalad_next.utils import external_versions # Kludge: Filter out paths starting with .git/ to work around # an `ls-files -o` bug that was fixed in Git 2.25. git_needs_filter_kludge = external_versions['cmd:git'] < '2.25' from .utils import ( FileSystemItem, FileSystemItemType, ) from .gittree import ( GitTreeItem, GitTreeItemType, _mode_type_map, ) lgr = logging.getLogger('datalad.ext.next.iter_collections.gitworktree') @dataclass class GitWorktreeItem(GitTreeItem): name: PurePath @dataclass class GitWorktreeFileSystemItem(FileSystemItem): name: PurePath # gitsha is not the sha1 of the file content, but the output # of `git hash-object` which does something like # `printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum` gitsha: str | None = None gittype: GitTreeItemType | None = None lsfiles_untracked_args = { 'all': ('--exclude-standard', '--others'), 'whole-dir': ('--exclude-standard', '--others', '--directory'), 'no-empty-dir': ('--exclude-standard', '--others', '--directory', '--no-empty-directory'), } def iter_gitworktree( path: Path, *, untracked: str | None = 'all', link_target: bool = False, fp: bool = False, recursive: str = 'repository', ) -> Generator[GitWorktreeItem | GitWorktreeFileSystemItem, None, None]: """Uses ``git ls-files`` to report on a work tree of a Git repository This iterator can be used to report on all tracked, and untracked content of a Git repository's work tree. This includes files that have been removed from the work tree (deleted), unless their removal has already been staged. For any tracked content, yielded items include type information and gitsha as last known to Git. This means that such reports reflect the last committed or staged content, not the state of a potential unstaged modification in the work tree. 
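As a hedged illustration (``repo_path`` is an assumed placeholder for an
existing repository worktree, not a name defined here), untracked content can
be singled out like this::

    for item in iter_gitworktree(repo_path, untracked='all',
                                 recursive='repository'):
        # untracked items carry neither a gitsha nor a gittype
        if item.gitsha is None and item.gittype is None:
            print('untracked:', item.name)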
When no reporting of link targets or file-objects are requested, items of type :class:`GitWorktreeItem` are yielded, otherwise :class:`GitWorktreeFileSystemItem` instances. In both cases, ``gitsha`` and ``gittype`` properties are provided. Either of them being ``None`` indicates untracked work tree content. .. note:: The ``gitsha`` is not equivalent to a SHA1 hash of a file's content, but is the SHA-type blob identifier as reported and used by Git. Parameters ---------- path: Path Path of a directory in a Git repository to report on. This directory need not be the root directory of the repository, but must be part of the repository's work tree. untracked: {'all', 'whole-dir', 'no-empty-dir'} or None, optional If not ``None``, also reports on untracked work tree content. ``all`` reports on any untracked file; ``whole-dir`` yields a single report for a directory that is entirely untracked, and not individual untracked files in it; ``no-empty-dir`` skips any reports on untracked empty directories. link_target: bool, optional If ``True``, information matching a :class:`~datalad_next.iter_collections.utils.FileSystemItem` will be included for each yielded item, and the targets of any symlinks will be reported, too. fp: bool, optional If ``True``, information matching a :class:`~datalad_next.iter_collections.utils.FileSystemItem` will be included for each yielded item, but without a link target detection, unless ``link_target`` is given. Moreover, each file-type item includes a file-like object to access the file's content. This file handle will be closed automatically when the next item is yielded. recursive: {'repository', 'no'}, optional Behavior for recursion into subdirectories of ``path``. By default (``repository``), all directories within the repository are reported. This possibly includes untracked ones (see ``untracked``), but not directories within submodules. If ``no``, only direct children of ``path`` are reported on. For any worktree items in subdirectories of ``path`` only a single record for the containing immediate subdirectory ``path`` is yielded. For example, with 'path/subdir/file1' and 'path/subdir/file2' there will only be a single item with ``name='subdir'`` and ``type='directory'``. Yields ------ :class:`GitWorktreeItem` or :class:`GitWorktreeFileSystemItem` The ``name`` attribute of an item is a ``PurePath`` instance with the corresponding (relative) path, in platform conventions. """ # we force-convert to Path to prevent delayed crashing when reading from # the file system. The docs already ask for that, but it is easy to # forget/ignore and leads to non-obvious errors. Running this once is # a cheap safety net # https://github.com/datalad/datalad-next/issues/551 path = Path(path) lsfiles_args = ['--stage', '--cached'] if untracked: lsfiles_args.extend(lsfiles_untracked_args[untracked]) # helper to handle multi-stage reports by ls-files pending_item = (None, None) reported_dirs = set() _single_dir = recursive == 'no' # we add a "fake" `None` record at the end to avoid a special # case for submitting the last pending item after the loop. 
# otherwise the context manager handling of the file pointer # would lead to lots of code duplication for line in chain(_git_ls_files(path, *lsfiles_args), [None]): # a bit ugly, but we need to account for the `None` record # that signals the final loop iteration ipath, lsfiles_props = _lsfiles_line2props(line) \ if line is not None else (None, None) # yield any pending item, if the current record is not an # addendum of it if ipath is None or pending_item[0] not in (None, ipath): if ipath is None and pending_item[0] is None: return # this is the last point where we can still withhold a report. # it is also the point where we can do this with minimal # impact on the rest of the logic. # so act on recursion setup now pending_item_path_parts = pending_item[0].parts if _single_dir and len(pending_item_path_parts) > 1: # this path is pointing inside a subdirectory of the # base directory -> ignore # we do reset pending_item here, although this would also # happen below -- it decomplexifies the conditionals dir_path = pending_item_path_parts[0] if dir_path in reported_dirs: # we only yield each containing dir once, and only once pending_item = (ipath, lsfiles_props) continue item = _get_item( path, # the next two must be passed in order to get the # full logic when to yield a GitWorktreeFileSystemItem # (not just GitWorktreeItem) link_target=link_target, fp=fp, # we know all props already ipath=dir_path, type=GitTreeItemType.directory, gitsha=None, ) yield item reported_dirs.add(dir_path) pending_item = (ipath, lsfiles_props) continue # report on a pending item, this is not a "higher-stage" # report by ls-files item = _get_item( path, link_target, fp, pending_item[0], pending_item[1]['mode'] if pending_item[1] else None, pending_item[1]['gitsha'] if pending_item[1] else None, ) fp_src = _get_fp_src(fp, path, item) if fp_src is None: # nothing to open yield item else: with fp_src.open('rb') as active_fp: item.fp = active_fp yield item if ipath is None: # this is the trailing `None` record. we are done here break if lsfiles_props is None: # when no properties were produced, this is a # category "other" report (i.e., untracked content) # the path is always relative-POSIX pending_item = (ipath, None) else: pending_item = (ipath, lsfiles_props) # do not yield immediately, wait for a possible higher-stage # report in the next loop iteration def iter_submodules( path: Path, ) -> Generator[GitTreeItem, None, None]: """Given a path, report all submodules of a repository worktree underneath This is a thin convenience wrapper around ``iter_gitworktree()``. 
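A minimal usage sketch (``repo_path`` is assumed to point at an existing
repository worktree and is not part of this API)::

    for sm in iter_submodules(repo_path):
        # each reported item is a submodule record known to Git,
        # with the recorded commit sha of the submodule
        print(sm.name, sm.gitsha)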
""" for item in iter_gitworktree( path, untracked=None, link_target=False, fp=False, recursive='repository', ): # exclude non-submodules, or a submodule that was found at # the root path -- which would indicate that the submodule # itself it not around, only its record in the parent if item.gittype == GitTreeItemType.submodule \ and item.name != PurePath('.'): yield item def _get_item( basepath: Path, link_target: bool, fp: bool, ipath: PurePosixPath, type: str | GitTreeItemType | None = None, gitsha: str | None = None, ) -> GitWorktreeItem | GitWorktreeFileSystemItem: if isinstance(type, str): type: GitTreeItemType = _mode_type_map[type] item = None if link_target or fp: fullpath = basepath / ipath try: item = GitWorktreeFileSystemItem.from_path( fullpath, link_target=link_target, ) except FileNotFoundError: pass if item is None: item = GitWorktreeItem(name=ipath) if type is not None: item.gittype = type if gitsha is not None: item.gitsha = gitsha # make sure the name/id is the path relative to the basepath item.name = PurePath(ipath) return item def _lsfiles_line2props( line: str ) -> Tuple[PurePosixPath, Dict[str, str] | None]: items = line.split('\t', maxsplit=1) # check if we cannot possibly have a 'staged' report with mode and gitsha if len(items) < 2: if git_needs_filter_kludge and line.startswith(".git/"): # pragma nocover lgr.debug("Filtering out .git/ file: %s", line) return # not known to Git, but Git always reports POSIX path = PurePosixPath(line) # early exist, we have nothing but the path (untracked) return path, None props = items[0].split(' ') if len(props) != 3: if git_needs_filter_kludge and line.startswith(".git/"): # pragma nocover lgr.debug("Filtering out .git/ file: %s", line) return # not known to Git, but Git always reports POSIX path = PurePosixPath(line) # early exist, we have nothing but the path (untracked) return path, None # again Git reports always in POSIX path = PurePosixPath(items[1]) return path, dict( gitsha=props[1], mode=props[0], ) def _git_ls_files(path, *args): with iter_git_subproc( [ 'ls-files', # we rely on zero-byte splitting below '-z', # otherwise take whatever is coming in *args, ], cwd=path, ) as r: yield from decode_bytes( itemize( r, sep=b'\0', keep_ends=False, ) ) def _get_fp_src( fp: bool, basepath: Path, item: GitWorktreeItem | GitWorktreeFileSystemItem, ) -> Path | None: if not fp or isinstance(item, GitWorktreeItem): # no file pointer request, we are done return None # if we get here, this is about file pointers... fp_src = None if item.type in (FileSystemItemType.file, FileSystemItemType.symlink): fp_src = item.name if fp_src is None: # nothing to open return None fp_src_fullpath = basepath / fp_src if not fp_src_fullpath.exists(): # nothing there to open (would resolve through a symlink) return None return fp_src_fullpath datalad-next-1.4.1/datalad_next/iter_collections/tarfile.py000066400000000000000000000066231462321624600240710ustar00rootroot00000000000000"""Report on the content of TAR archives The main functionality is provided by the :func:`iter_tar()` function. """ from __future__ import annotations from dataclasses import dataclass from functools import cached_property from pathlib import ( Path, PurePosixPath, ) import tarfile from typing import Generator from .utils import ( FileSystemItem, FileSystemItemType, ) @dataclass # sadly PY3.10+ only (kw_only=True) class TarfileItem(FileSystemItem): name: str """TAR uses POSIX paths as item identifiers. 
Not all POSIX paths can be represented on all (non-POSIX) file systems, therefore the item name is represented in POSIX form, instead of in platform conventions. """ link_target: str | None = None """Just as for ``name``, a link target is also reported in POSIX format.""" @cached_property def path(self) -> PurePosixPath: """Returns the item name as a ``PurePosixPath`` instance""" return PurePosixPath(self.name) @cached_property def link_target_path(self) -> PurePosixPath: """Returns the link_target as a ``PurePosixPath`` instance""" return PurePosixPath(self.link_target) def iter_tar( path: Path, *, fp: bool = False, ) -> Generator[TarfileItem, None, None]: """Uses the standard library ``tarfile`` module to report on TAR archives A TAR archive can represent more or less the full bandwidth of file system properties, therefore reporting on archive members is implemented similar to :func:`~datalad_next.iter_collections.directory.iter_dir()`. The iterator produces an :class:`TarfileItem` instance with standard information on file system elements, such as ``size``, or ``mtime``. Parameters ---------- path: Path Path of the TAR archive to report content for (iterate over). fp: bool, optional If ``True``, each file-type item includes a file-like object to access the file's content. This file handle will be closed automatically when the next item is yielded or the function returns. Yields ------ :class:`TarfileItem` The ``name`` attribute of an item is a ``str`` with the corresponding archive member name (in POSIX conventions). """ with tarfile.open(path, 'r') as tar: for member in tar: # reduce the complexity of tar member types to the desired # level (i.e. disregard the diversity of special files and # block devices) mtype = FileSystemItemType.file if member.isreg() \ else FileSystemItemType.directory if member.isdir() \ else FileSystemItemType.symlink if member.issym() \ else FileSystemItemType.hardlink if member.islnk() \ else FileSystemItemType.specialfile item = TarfileItem( name=member.name, type=mtype, size=member.size, mode=member.mode, mtime=member.mtime, uid=member.uid, gid=member.gid, link_target=member.linkname if member.linkname else None, ) if fp and mtype in ( FileSystemItemType.file, FileSystemItemType.hardlink): with tar.extractfile(member) as fp: item.fp = fp yield item else: yield item datalad-next-1.4.1/datalad_next/iter_collections/tests/000077500000000000000000000000001462321624600232245ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/iter_collections/tests/__init__.py000066400000000000000000000000001462321624600253230ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/iter_collections/tests/test_iterannexworktree.py000066400000000000000000000115131462321624600304160ustar00rootroot00000000000000from pathlib import ( PurePath, ) from datalad import cfg as dlcfg from datalad_next.datasets import Dataset from datalad_next.utils import check_symlink_capability from ..gitworktree import ( GitTreeItemType, iter_gitworktree, ) from ..annexworktree import iter_annexworktree from .test_itergitworktree import prep_fp_tester def _mkds(tmp_path_factory, monkeypatch, cfg_overrides): with monkeypatch.context() as m: for k, v in cfg_overrides.items(): m.setitem(dlcfg.overrides, k, v) dlcfg.reload() ds = Dataset(tmp_path_factory.mktemp('ds')).create( result_renderer='disabled') dlcfg.reload() return ds def _dotests(ds): test_file_content = 'test_file' test_file = ds.pathobj / 'annexed' / 'subdir' / 'file1.txt' test_file.parent.mkdir(parents=True) 
test_file.write_text(test_file_content) # we create an additional file where the content will be dropped # to test behavior on unavailable annex key droptest_content = 'somethingdropped' droptest_file = ds.pathobj / 'annexed' / 'dropped.txt' droptest_file.write_text(droptest_content) ds.save(result_renderer='disabled') ds.drop(droptest_file, reckless='availability', result_renderer='disabled') # get results for the annexed files query_path = ds.pathobj / 'annexed' res = list(iter_annexworktree( query_path, untracked=None, link_target=True, )) assert len(res) == 2 # # pick the present annex file to start r = [r for r in res if r.name.name == 'file1.txt'][0] assert r.name == query_path / 'subdir' / 'file1.txt' # we cannot check gitsha and symlink content for identity, it will change # depending on the tuning # we cannot check the item type, because it will vary across repository # modes (e.g., adjusted unlocked) assert r.annexsize == len(test_file_content) assert r.annexkey == 'MD5E-s9--37b87ee8c563af911dcc0f949826b1c9.txt' # with `link_target=True` we get an objpath that is relative to the # query path, and we find the actual key file there assert (query_path / r.annexobjpath).read_text() == test_file_content # # now pick the dropped annex file r = [r for r in res if r.name.name == 'dropped.txt'][0] assert r.name == query_path / 'dropped.txt' # we get basic info regardless of availability assert r.annexsize == len(droptest_content) assert r.annexkey == 'MD5E-s16--770a06889bc88f8743d1ed9a1977ff7b.txt' # even with an absent key file, we get its would-be location, # and it is relative to the query path assert r.annexobjpath.parts[:2] == ('..', '.git') def test_iter_annexworktree(tmp_path_factory, monkeypatch): ds = _mkds(tmp_path_factory, monkeypatch, {}) _dotests(ds) def test_iter_annexworktree_tuned(tmp_path_factory, monkeypatch): # same as test_file_content(), but with a "tuned" annexed that # no longer matches the traditional setup. 
# we need to be able to cope with that too ds = _mkds(tmp_path_factory, monkeypatch, { 'annex.tune.objecthash1': 'true', 'annex.tune.branchhash1': 'true', 'annex.tune.objecthashlower': 'true', }) _dotests(ds) def test_iter_annexworktree_basic_fp(existing_dataset, no_result_rendering): ds = existing_dataset fcount, content_tmpl = prep_fp_tester(ds) for ai in filter( lambda i: str(i.name.name).startswith('file_'), iter_annexworktree(ds.pathobj, fp=True) ): fcount -= 1 if getattr(ai, 'fp', False): assert content_tmpl.format( ai.name.name[5:]) == ai.fp.read().decode() else: assert (ai.annexobjpath and ( ds.pathobj / ai.annexobjpath).exists() is False) or ( (ds.pathobj / ai.name).exists() is False) assert not fcount def test_iter_annexworktree_nonrecursive(existing_dataset): # just a smoke test # given that iter_annexworktree() only wraps iter_gitworktree() # there is nothing to test here, any item not yielded by # iter_gitworktree() will also not be amended all_items = list(iter_annexworktree( existing_dataset.pathobj, recursive='no')) # we get a .datalad directory-tyoe item, rather than the file item from # inside the dir dirs = [i for i in all_items if i.gittype == GitTreeItemType.directory] assert len(dirs) == 1 dirs[0].name == PurePath('.datalad') def test_iter_annexworktree_noannex(existing_noannex_dataset): # plain smoke test to ensure this can run on a dataset without an annex all_annex_items = list( iter_annexworktree(existing_noannex_dataset.pathobj)) all_git_items = list(iter_gitworktree(existing_noannex_dataset.pathobj)) assert len(all_annex_items) == len(all_git_items) for a, g in zip(all_annex_items, all_git_items): assert a.name == g.name datalad-next-1.4.1/datalad_next/iter_collections/tests/test_iterdir.py000066400000000000000000000054341462321624600263050ustar00rootroot00000000000000import os from pathlib import PurePath import pytest from datalad_next.tests import ( create_tree, ) from datalad_next.utils import ( check_symlink_capability, rmtree, ) from ..directory import ( DirectoryItem, FileSystemItemType, iter_dir, ) from ..utils import compute_multihash_from_fp @pytest.fixture(scope="function") def dir_tree(tmp_path_factory): path = tmp_path_factory.mktemp("dir_tree") create_tree( path, { "random_file1.txt": "some content", "some_dir": { "file_in_dir.txt": "some content in file in dir", }, } ) symlink = path / 'symlink' symlink_target = path / 'some_dir' / "file_in_dir.txt" if check_symlink_capability(symlink, symlink_target): symlink.symlink_to(symlink_target) yield path rmtree(path) def test_iter_dir(dir_tree): target_hash = dict(md5='9893532233caff98cd083a116b013c0b', SHA1='94e66df8cd09d410c62d9e0dc59d3a884e458e05') target_paths = [ (dir_tree / 'random_file1.txt', FileSystemItemType.file, {}), (dir_tree / 'some_dir', FileSystemItemType.directory, {}), ] if check_symlink_capability(dir_tree / '__dummy1__', dir_tree / '__dummy2__'): target_paths.append(( dir_tree / 'symlink', FileSystemItemType.symlink, # how `readlink()` behaves on windows is fairly complex # rather than anticipating a result (that changes with # python version, see https://bugs.python.org/issue42957), # we simply test that this is compatible with `os.readlink()` dict(link_target=PurePath(os.readlink(dir_tree / 'symlink'))), )) target = [ DirectoryItem( name=PurePath(path.name), type=type, size=path.lstat().st_size, mode=path.lstat().st_mode, mtime=path.lstat().st_mtime, uid=path.lstat().st_uid, gid=path.lstat().st_gid, **kwa ) for path, type, kwa in target_paths ] iter_dir_res = [] for i in 
iter_dir(dir_tree, fp=True): if i.fp: # capitalization of algorithm labels is preserved assert compute_multihash_from_fp( i.fp, ['md5', 'SHA1']) == target_hash # we null the file pointers to ease the comparison i.fp = None iter_dir_res.append(i) assert len(iter_dir_res) == len(target) # check iter_dir() to be robust to concurrent removal it = iter_dir(dir_tree) # start iteration next(it) # wipe out content for i in dir_tree.glob('*'): rmtree(i) # consume the rest of the generator, nothing more, but also no crashing assert [] == list(it) datalad-next-1.4.1/datalad_next/iter_collections/tests/test_itergitdiff.py000066400000000000000000000302241462321624600271360ustar00rootroot00000000000000from pathlib import PurePosixPath import pytest import shutil from datalad_next.utils import rmtree from ..gitdiff import ( GitTreeItemType, GitDiffStatus, iter_gitdiff, ) def test_iter_gitdiff_invalid(): with pytest.raises(ValueError): # no meaningful comparison list(iter_gitdiff('.', None, None)) with pytest.raises(ValueError): # unsupported eval mode list(iter_gitdiff('.', None, None, eval_submodule_state='weird')) def test_iter_gitdiff_basic(existing_dataset, no_result_rendering): ds = existing_dataset dsp = ds.pathobj # we compare based on the last state of the corresponding # branch if there is any, or the HEAD of the current # branch comp_base = ds.repo.get_corresponding_branch() or 'HEAD' # we use two distinct content blobs below, hardcode sha here # for readability empty_sha = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391' content = '123' content_sha = 'd800886d9c86731ae5c4a62b0b77c437015e00d2' status_args = ( # we always test against the root of the dataset dsp, comp_base, # we always compare to the worktree None, ) diff_args = ( # we always test against the root of the dataset dsp, # we always compare to last committed state f'{comp_base}~1', comp_base, ) # clean dataset, no items assert list(iter_gitdiff(*status_args)) == [] testpath = dsp / 'sub' / 'test' testpath.parent.mkdir() testpath.touch() # dataset with untracked file, no items assert list(iter_gitdiff(*status_args)) == [] ds.save(to_git=True) # clean dataset again, no items assert list(iter_gitdiff(*status_args)) == [] # added file diff = list(iter_gitdiff(*diff_args)) assert len(diff) == 1 di = diff[0] assert di.status == GitDiffStatus.addition assert di.name == 'sub/test' assert di.prev_name is di.prev_gitsha is di.prev_gittype is None assert di.gitsha == empty_sha assert di.gittype == GitTreeItemType.file # modified file testpath.write_text(content) diff = list(iter_gitdiff(*status_args)) assert len(diff) == 1 di = diff[0] # labeled as modified assert di.status == GitDiffStatus.modification # the name is plain str in POSIX assert di.name == di.prev_name == 'sub/test' # path conversion yield POSIX relpath assert di.path == di.prev_path == PurePosixPath(testpath.relative_to(dsp)) # unstaged modification reports no shasum assert di.gitsha is None assert di.prev_gitsha == empty_sha assert di.gittype == di.prev_gittype == GitTreeItemType.file # make clean ds.save(to_git=True) moved_testpath = testpath.parent / 'moved_test' testpath.rename(moved_testpath) # renamed file, unstaged, reported as deletion, we do not see the addition # yet (untracked) diff = list(iter_gitdiff(*status_args)) assert len(diff) == 1 di = diff[0] assert di.status == GitDiffStatus.deletion assert di.name == di.prev_name == 'sub/test' assert di.prev_gitsha == content_sha assert di.prev_gittype == GitTreeItemType.file assert di.gitsha is di.gittype is None # make clean 
ds.save(to_git=True) # now we can look at the rename diff = list(iter_gitdiff(*diff_args, find_renames=100)) assert len(diff) == 1 di = diff[0] assert di.status == GitDiffStatus.rename assert di.name == 'sub/moved_test' assert di.prev_name == 'sub/test' assert di.gitsha == di.prev_gitsha == content_sha assert di.prev_gittype is di.gittype is GitTreeItemType.file assert di.percentage == 100 # now a copy shutil.copyfile(moved_testpath, testpath) ds.save(to_git=True) diff = list(iter_gitdiff(*diff_args, find_copies=100)) assert len(diff) == 1 di = diff[0] assert di.status == GitDiffStatus.copy assert di.name == 'sub/test' assert di.prev_name == 'sub/moved_test' assert di.gitsha == di.prev_gitsha == content_sha assert di.percentage == 100 # now replace file with submodule testpath.unlink() # we must safe to appease datalad's content collision detection ds.save(to_git=True) # intermediate smoke test for describing a single tree (diff from parents) diff = list(iter_gitdiff(dsp, None, comp_base)) assert len(diff) == 1 assert diff[0].status == GitDiffStatus.deletion # now cause typechange ds.create(testpath) diff = list(iter_gitdiff( dsp, # because we have an intermediate safe, compare to two states # back f'{comp_base}~2', comp_base, )) assert len(diff) == 2 # let's ignore the uninteresting .gitmodules addition for further tests di = [i for i in diff if i.name != '.gitmodules'][0] assert di.status == GitDiffStatus.typechange assert di.name == di.prev_name == 'sub/test' assert di.gitsha != di.prev_gitsha assert di.prev_gitsha == content_sha assert di.prev_gittype == GitTreeItemType.file assert di.gittype == GitTreeItemType.submodule def test_iter_gitdiff_nonroot(existing_dataset, no_result_rendering): ds = existing_dataset comp_base = ds.repo.get_corresponding_branch() or 'HEAD' # all tests are concerned with running not in the dataset root root = ds.pathobj nonroot = root / 'sub' nonroot.mkdir() status_args = (nonroot, comp_base, None) diff_args = (nonroot, f'{comp_base}~1', comp_base) # nothing to report, no problem assert list(iter_gitdiff(*status_args)) == [] # change above CWD is not reported (root / 'rootfile').touch() ds.save(to_git=True) assert list(iter_gitdiff(*diff_args)) == [] # check worktree modification detection too (root / 'rootfile').write_text('some') assert list(iter_gitdiff(*status_args)) == [] # and now test that reporting is relative to # CWD (nonroot / 'nonrootfile').touch() ds.save(to_git=True) assert list(iter_gitdiff(*diff_args))[0].name == 'nonrootfile' (nonroot / 'nonrootfile').write_text('other') assert list(iter_gitdiff(*diff_args))[0].name == 'nonrootfile' def test_iter_gitdiff_nonrec(existing_dataset, no_result_rendering): ds = existing_dataset dsp = ds.pathobj comp_base = ds.repo.get_corresponding_branch() or 'HEAD' subdir = dsp / 'sub' subdir.mkdir() for fn in ('f1.txt', 'f2.txt'): (subdir / fn).touch() ds.save(to_git=True) diff = list(iter_gitdiff(dsp, f'{comp_base}~1', comp_base, recursive='no')) assert len(diff) == 1 di = diff[0] assert di.name == 'sub' assert di.gittype == GitTreeItemType.directory assert di.status == GitDiffStatus.addition di_tree = di # same behavior for a worktree modification for fn in ('f1.txt', 'f2.txt'): (subdir / fn).write_text('modified') diff = list(iter_gitdiff(dsp, f'{comp_base}~1', None, recursive='no')) assert len(diff) == 1 di = diff[0] # these are identical to the diff-tree based report for p in ('name', 'gittype', 'prev_gitsha', 'prev_gittype'): assert getattr(di, p) == getattr(di_tree, p) # and there are different # not 
staged, no gitsha assert di.gitsha is None # it does no type inference for the previous state (expensive) assert di.prev_gittype is None # when the directory existed in the from-state it becomes a # modification diff = list(iter_gitdiff(dsp, f'{comp_base}~1', None, recursive='no')) assert len(diff) == 1 diff[0].status == GitDiffStatus.modification # now remove the subdir rmtree(subdir) diff = list(iter_gitdiff(dsp, comp_base, None, recursive='no')) assert len(diff) == 1 # it still reports a modification, even though the directory is empty/gone. # it would require a filesystem STAT to detect a deletion, and a further # type investigation in `from_treeish` to detect a type change. # This is not done until there is evidence for a real use case diff[0].status == GitDiffStatus.modification def test_iter_gitdiff_typechange_issue6791( existing_dataset, no_result_rendering): # verify that we can handle to problem described in # https://github.com/datalad/datalad/issues/6791 # # a subdataset is wiped out (uncommitted) and replaced by a file ds = existing_dataset ds.create('subds') rmtree(ds.pathobj / 'subds') (ds.pathobj / 'subds').touch() diff = list(iter_gitdiff( ds.pathobj, ds.repo.get_corresponding_branch() or 'HEAD', None, )) assert len(diff) == 1 di = diff[0] assert di.status == GitDiffStatus.typechange assert di.name == di.prev_name == 'subds' # unstaged change assert di.gitsha is None assert di.prev_gittype == GitTreeItemType.submodule assert di.gittype == GitTreeItemType.file def test_iter_gitdiff_rec(existing_dataset, no_result_rendering): ds = existing_dataset subds = ds.create('subds') dsp = ds.pathobj comp_base = ds.repo.get_corresponding_branch() or 'HEAD' status_args = (dsp, comp_base, None) diff_args = (dsp, f'{comp_base}~1', comp_base) diff = list(iter_gitdiff(*diff_args, recursive='submodules')) # we get more than just .gitmodules and a submodule record assert len(diff) > 2 # the entire submodule is new and the first one, so everything # is an addition assert all(i.status == GitDiffStatus.addition for i in diff) # only files, no submodule record, by default assert all(i.gittype == GitTreeItemType.file for i in diff) # when we ask for it, we get the submodule item too diff_w_sm = list(iter_gitdiff(*diff_args, recursive='submodules', yield_tree_items='submodules')) assert len(diff) + 1 == len(diff_w_sm) assert any(i.name == 'subds' and i.gittype == GitTreeItemType.submodule for i in diff_w_sm) # smoke test for an all-clean diff against the worktrees assert list(iter_gitdiff(*status_args, recursive='submodules')) == [] # make subdataset record modified (subds.pathobj / 'file').touch() subds.save(to_git=True) diff = list(iter_gitdiff(*status_args, recursive='submodules')) assert len(diff) == 1 di = diff[0] assert di.name == 'subds/file' assert di.status == GitDiffStatus.addition # now with submodule item diff_w_sm = list(iter_gitdiff(*status_args, recursive='submodules', yield_tree_items='all')) assert len(diff_w_sm) == 2 di = diff_w_sm[0] # the submodule item is always first assert di.name == 'subds' assert di.gittype == GitTreeItemType.submodule assert di.status == GitDiffStatus.modification assert diff_w_sm[1] == diff[0] # safe the whole hierarchy ds.save(recursive=True) # we get the exact same change report via the diff to HEAD~1:HEAD assert diff == list(iter_gitdiff(*diff_args, recursive='submodules')) # modify a tracked file in the subdataset (subds.pathobj / 'file').write_text('123') diff_w_sm = list(iter_gitdiff(*status_args, recursive='submodules', yield_tree_items='all')) # 
same report for the submodule (and it is first again) assert diff_w_sm[0].name == 'subds' assert diff_w_sm[0].gittype == GitTreeItemType.submodule assert diff_w_sm[0].status == GitDiffStatus.modification # but this time the file is not an addition but a modification assert diff_w_sm[1].name == 'subds/file' assert diff_w_sm[1].status == GitDiffStatus.modification # force-wipe the subdataset, and create a condition where the subdatasets # is expected but missing rmtree(subds.pathobj) diff = list(iter_gitdiff(*status_args)) assert len(diff) == 1 di = diff[0] assert di.name == 'subds' assert di.status == GitDiffStatus.deletion # if we now run with recursion, we get the exact same result, the absent # submodule is a subtree that we do not recurse into, hence the report # is only on the tree itself assert diff == list(iter_gitdiff(*status_args, recursive='submodules')) # use the opportunity to check equality of recursive='all' for this case assert diff == list(iter_gitdiff(*status_args, recursive='all')) datalad-next-1.4.1/datalad_next/iter_collections/tests/test_itergitstatus.py000066400000000000000000000252501462321624600275540ustar00rootroot00000000000000from itertools import chain import pytest from datalad_next.runners import ( call_git_success, ) from ..gitstatus import ( GitDiffStatus, GitContainerModificationType, iter_gitstatus, ) def test_status_homogeneity(modified_dataset): """Test things that should always be true, no matter the precise parameterization A main purpose of this test is also to exercise all (main) code paths. """ ds = modified_dataset for kwargs in ( # default dict(path=ds.pathobj), dict(path=ds.pathobj, recursive='no'), dict(path=ds.pathobj, recursive='repository'), dict(path=ds.pathobj, recursive='submodules'), # same as above, but with the submodules in the root dict(path=ds.pathobj / 'dir_sm', recursive='no'), dict(path=ds.pathobj / 'dir_sm', recursive='repository'), dict(path=ds.pathobj / 'dir_sm', recursive='submodules'), # no submodule state dict(path=ds.pathobj, eval_submodule_state='no', recursive='no'), dict(path=ds.pathobj, eval_submodule_state='no', recursive='repository'), dict(path=ds.pathobj, eval_submodule_state='no', recursive='submodules'), # just the commit dict(path=ds.pathobj, eval_submodule_state='commit', recursive='no'), dict(path=ds.pathobj, eval_submodule_state='commit', recursive='repository'), dict(path=ds.pathobj, eval_submodule_state='commit', recursive='submodules'), # without untracked dict(path=ds.pathobj, untracked='no', recursive='no'), dict(path=ds.pathobj, untracked='no', recursive='repository'), dict(path=ds.pathobj, untracked='no', recursive='submodules'), # special untracked modes dict(path=ds.pathobj, untracked='whole-dir', recursive='no'), dict(path=ds.pathobj, untracked='whole-dir', recursive='repository'), dict(path=ds.pathobj, untracked='whole-dir', recursive='submodules'), dict(path=ds.pathobj, untracked='no-empty-dir', recursive='no'), dict(path=ds.pathobj, untracked='no-empty-dir', recursive='repository'), dict(path=ds.pathobj, untracked='no-empty-dir', recursive='submodules'), # call in the mountpoint of a dropped submodule dict(path=ds.pathobj / 'dir_sm' / 'droppedsm_c'), ): st = {item.name: item for item in iter_gitstatus(**kwargs)} # we get no report on anything clean (implicitly also tests # whether all item names are plain strings assert all(not i.name.endswith('_c') for i in st.values()) # anything untracked is labeled as such assert all( i.status == GitDiffStatus.other # we would not see a submodule modification 
qualifier when instructed # not to evaluate a submodule or kwargs.get('eval_submodule_state') in ('no', 'commit') or GitContainerModificationType.untracked_content in i.modification_types for i in st.values() if 'u' in i.path.name.split('_')[1] ) # anything modified is labeled as such assert all( # either directly i.status == GitDiffStatus.modification # or as an addition with a modification on top or (i.status == GitDiffStatus.addition and GitContainerModificationType.modified_content in i.modification_types) for i in st.values() if 'm' in i.path.name.split('_')[1] ) # anything deleted is labeled as such assert all( i.status == GitDiffStatus.deletion for i in st.values() if 'd' in i.path.name.split('_')[1] ) def test_status_invalid_params(existing_dataset): ds = existing_dataset with pytest.raises(ValueError): list(iter_gitstatus(ds.pathobj, recursive='fromspace')) test_cases_repository_recursion = [ {'name': 'file_a', 'status': GitDiffStatus.addition}, {'name': 'dir_m/file_a', 'status': GitDiffStatus.addition}, {'name': 'file_u', 'status': GitDiffStatus.other}, {'name': 'dir_u/file_u', 'status': GitDiffStatus.other}, {'name': 'dir_m/file_u', 'status': GitDiffStatus.other}, {'name': 'dir_m/dir_u/file_u', 'status': GitDiffStatus.other}, {'name': 'file_d', 'status': GitDiffStatus.deletion}, {'name': 'dir_d/file_d', 'status': GitDiffStatus.deletion}, {'name': 'dir_m/file_d', 'status': GitDiffStatus.deletion}, {'name': 'file_m', 'status': GitDiffStatus.modification}, {'name': 'dir_m/file_m', 'status': GitDiffStatus.modification}, {'name': 'dir_sm/sm_d', 'status': GitDiffStatus.deletion}, {'name': 'dir_sm/sm_n', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.new_commits,)}, {'name': 'dir_sm/sm_m', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.modified_content,)}, {'name': 'dir_sm/sm_nm', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.modified_content, GitContainerModificationType.new_commits)}, {'name': 'dir_sm/sm_nmu', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.modified_content, GitContainerModificationType.untracked_content, GitContainerModificationType.new_commits)}, {'name': 'dir_sm/sm_u', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.untracked_content,)}, {'name': 'dir_sm/sm_mu', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.modified_content, GitContainerModificationType.untracked_content)}, ] test_cases_submodule_recursion = [ {'name': 'dir_sm/sm_m/file_a', 'status': GitDiffStatus.addition}, {'name': 'dir_sm/sm_nm/file_a', 'status': GitDiffStatus.addition}, {'name': 'dir_sm/sm_mu/file_a', 'status': GitDiffStatus.addition}, {'name': 'dir_sm/sm_nmu/file_a', 'status': GitDiffStatus.addition}, {'name': 'dir_sm/sm_m/file_m', 'status': GitDiffStatus.modification}, {'name': 'dir_sm/sm_mu/file_m', 'status': GitDiffStatus.modification}, {'name': 'dir_sm/sm_nmu/file_m', 'status': GitDiffStatus.modification}, {'name': 'dir_sm/sm_u/file_u', 'status': GitDiffStatus.other}, {'name': 'dir_sm/sm_mu/file_u', 'status': GitDiffStatus.other}, {'name': 'dir_sm/sm_nmu/file_u', 'status': GitDiffStatus.other}, {'name': 'dir_sm/sm_u/dir_u/file_u', 'status': GitDiffStatus.other}, {'name': 'dir_sm/sm_mu/dir_u/file_u', 'status': GitDiffStatus.other}, {'name': 'dir_sm/sm_nmu/dir_u/file_u', 'status': GitDiffStatus.other}, ] def _assert_testcases(st, tc): for c in tc: assert st[c['name']].status == c['status'] mod_types = 
st[c['name']].modification_types if 'qual' in c: assert set(mod_types) == set(c['qual']) else: assert mod_types is None def test_status_vs_git(modified_dataset): """Implements a comparison against how git-status behaved when the test was written (see fixture docstring) """ st = { item.name: item for item in iter_gitstatus( path=modified_dataset.pathobj, recursive='repository', eval_submodule_state='full', untracked='all', ) } _assert_testcases(st, test_cases_repository_recursion) def test_status_norec(modified_dataset): st = { item.name: item for item in iter_gitstatus( path=modified_dataset.pathobj, recursive='no', eval_submodule_state='full', untracked='all', ) } test_cases = [ {'name': 'file_a', 'status': GitDiffStatus.addition}, {'name': 'dir_d', 'status': GitDiffStatus.deletion}, {'name': 'dir_m', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.modified_content, GitContainerModificationType.untracked_content)}, {'name': 'dir_sm', 'status': GitDiffStatus.modification, 'qual': (GitContainerModificationType.modified_content, GitContainerModificationType.untracked_content)}, {'name': 'file_d', 'status': GitDiffStatus.deletion}, {'name': 'file_m', 'status': GitDiffStatus.modification}, {'name': 'dir_u', 'status': GitDiffStatus.other}, {'name': 'file_u', 'status': GitDiffStatus.other}, ] _assert_testcases(st, test_cases) def test_status_smrec(modified_dataset): st = { item.name: item for item in iter_gitstatus( path=modified_dataset.pathobj, recursive='submodules', eval_submodule_state='full', untracked='all', ) } # in this mode we expect ALL results of a 'repository' mode recursion, # including the submodule-type items, plus additional ones from within # the submodules _assert_testcases(st, chain(test_cases_repository_recursion, test_cases_submodule_recursion)) def test_status_monorec(modified_dataset): st = { item.name: item for item in iter_gitstatus( path=modified_dataset.pathobj, recursive='monolithic', eval_submodule_state='full', untracked='all', ) } # in this mode we expect ALL results of a 'repository' mode recursion, # including the submodule-type items, plus additional ones from within # the submodules _assert_testcases( st, # repository and recursive test cases [c for c in chain(test_cases_repository_recursion, test_cases_submodule_recursion) # minus any submodule that have no new commits # (this only thing that is not attributable to individual # content changes) if not c['name'].split('/')[-1] in ( 'sm_m', 'sm_mu', 'sm_u', )]) def test_status_gitinit(tmp_path): # initialize a fresh git repo, but make no commits assert call_git_success(['init'], cwd=tmp_path) for recmode in ('no', 'repository', 'submodules'): assert [] == list(iter_gitstatus(tmp_path, recursive=recmode)) # untracked reporting must be working normal (tmp_path / 'untracked').touch() for recmode in ('no', 'repository', 'submodules'): res = list(iter_gitstatus(tmp_path, recursive=recmode)) assert len(res) == 1 assert res[0].name == 'untracked' assert res[0].status == GitDiffStatus.other def test_status_nohead_staged(tmp_path): # initialize a fresh git repo, but make no commits assert call_git_success(['init'], cwd=tmp_path) # stage a file (tmp_path / 'probe').write_text('tostage') assert call_git_success(['add', 'probe'], cwd=tmp_path) _assert_testcases( {i.name: i for i in iter_gitstatus(tmp_path)}, [{'name': 'probe', 'status': GitDiffStatus.addition}], ) 
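# A minimal usage sketch of `iter_gitstatus` outside of pytest. It relies on
# the `GitDiffStatus` and `iter_gitstatus` imports at the top of this module;
# the `path` argument is an assumption and must point at an existing
# Git/DataLad worktree. This is an illustration, not an exhaustive test.
def _example_report_status(path):
    """Print one line per changed or untracked item found under ``path``"""
    for item in iter_gitstatus(path, recursive='repository', untracked='all'):
        # `status` is a `GitDiffStatus` member; untracked content is
        # reported with the `other` status
        label = 'untracked' if item.status == GitDiffStatus.other \
            else item.status.name
        # `name` is the repository-relative item name as a plain string
        print(f'{label}: {item.name}')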
datalad-next-1.4.1/datalad_next/iter_collections/tests/test_itergittree.py000066400000000000000000000064311462321624600271700ustar00rootroot00000000000000from pathlib import ( PurePosixPath, ) import pytest from datalad_next.utils import rmtree from ..gittree import ( GitTreeItem, GitTreeItemType, iter_gittree, ) def test_iter_gittree(existing_dataset, no_result_rendering): ds = existing_dataset is_crippled_fs = ds.repo.is_crippled_fs() tracked_items = list(iter_gittree(ds.pathobj, 'HEAD')) # without untracked's and no link resolution this is plain and fast assert all( isinstance(i, GitTreeItem) and i.gitsha and i.gittype for i in tracked_items ) # we add a new file and test its expected properties probe_name = 'probe.txt' # on crippled FS we are testing the managed branch which contains # pointer files, not symlinks expected_probe_sha = '969921a905b411137fce77c104954f742e1ee6c7' \ if is_crippled_fs \ else '7c38bf0378c31f8696e5869e7828a32c9dc2684e' probe = ds.pathobj / 'subdir' / probe_name probe.parent.mkdir() probe.write_text('probe') ds.save() assert any( # let's query a Path instance here, to get that covered too i.path == PurePosixPath(f'subdir/{probe_name}') and i.gitsha == expected_probe_sha and ( i.gittype in (GitTreeItemType.file, GitTreeItemType.executablefile) if is_crippled_fs else i.gittype == GitTreeItemType.symlink ) for i in iter_gittree(ds.pathobj, 'HEAD') ) if not is_crippled_fs: # if we check the prior version, we do not see it (hence the # tree-ish passing is working assert not any( i.path == PurePosixPath(f'subdir/{probe_name}') for i in iter_gittree(ds.pathobj, 'HEAD~1') ) # if we disable recursion, the probe is not listed, but its # parent dir is tracked_toplevel_items = list( iter_gittree(ds.pathobj, 'HEAD', recursive='no')) assert not any( i.name == f'subdir/{probe_name}' for i in tracked_toplevel_items ) assert any( i.name == 'subdir' and (True if is_crippled_fs else 'eb4aa65f42b90178837350571a227445b996cf90') and i.gittype == GitTreeItemType.directory for i in tracked_toplevel_items ) # iterating on a subdir does constrain the report tracked_subdir_items = list(iter_gittree(probe.parent, 'HEAD')) assert len(tracked_subdir_items) == 1 probe_item = tracked_subdir_items[0] assert probe_item.name == probe_name assert probe_item.gitsha == expected_probe_sha def test_name_starting_with_tab(existing_dataset, no_result_rendering): ds = existing_dataset if ds.repo.is_crippled_fs(): pytest.skip("not applicable on crippled filesystems") tabbed_file_name = "\ttab.txt" tabbed_name = ds.pathobj / tabbed_file_name tabbed_name.write_text('name of this file starts with a tab') ds.save() iter_names = [item.path for item in iter_gittree(ds.pathobj, 'HEAD')] assert PurePosixPath(tabbed_file_name) in iter_names def test_iter_gittree_empty(existing_dataset, no_result_rendering): ds = existing_dataset rmtree(ds.pathobj / '.datalad') (ds.pathobj / '.gitattributes').unlink() ds.save() assert len(ds.status()) == 0 all_items = list(iter_gittree(ds.pathobj, 'HEAD')) assert len(all_items) == 0 datalad-next-1.4.1/datalad_next/iter_collections/tests/test_itergitworktree.py000066400000000000000000000202011462321624600300620ustar00rootroot00000000000000from pathlib import ( PurePath, PurePosixPath, ) import pytest from datalad_next.utils import ( check_symlink_capability, rmtree, ) from ..gitworktree import ( GitWorktreeItem, GitWorktreeFileSystemItem, iter_gitworktree, ) def test_iter_gitworktree(existing_dataset): ds = existing_dataset (ds.pathobj / 'emptydir').mkdir() untracked = ds.pathobj / 
'subdir' / 'untracked' untracked.parent.mkdir() untracked.write_text('untracked') tracked_items = list(iter_gitworktree(ds.pathobj, untracked=None)) # without untracked's and no link resolution this is plain and fast assert all( isinstance(i, GitWorktreeItem) and i.gitsha and i.gittype for i in tracked_items ) all_items = list(iter_gitworktree(ds.pathobj, untracked='all')) # empty-dir is not reported, only untracked files assert len(all_items) == len(tracked_items) + 1 assert any( i.name == PurePath('subdir', 'untracked') and i.gitsha is None and i.gittype is None for i in all_items ) # same again, but with a different untracked reporting all_items = list(iter_gitworktree(ds.pathobj, untracked='whole-dir')) # emptydir is reported too assert len(all_items) == len(tracked_items) + 2 assert any( i.name == PurePath('subdir') and i.gitsha is None and i.gittype is None for i in all_items ) # and again for the last variant all_items = list(iter_gitworktree(ds.pathobj, untracked='no-empty-dir')) # and again no emptydir assert len(all_items) == len(tracked_items) + 1 assert any( i.name == PurePath('subdir') and i.gitsha is None and i.gittype is None for i in all_items ) # if we ask for file objects or link_targets, we get a different item type, # but including the same for kwargs in ( dict(link_target=True, fp=False, untracked=None), dict(link_target=False, fp=True, untracked=None), dict(link_target=True, fp=True, untracked=None), ): assert all( isinstance(i, GitWorktreeFileSystemItem) and i.gitsha and i.gittype for i in iter_gitworktree(ds.pathobj, **kwargs) ) # check that file pointers work for tracked and untracked content checked_tracked = False checked_untracked = False for item in iter_gitworktree(ds.pathobj, fp=True): if item.name == PurePath('.datalad', 'config'): assert ds.id in (ds.pathobj / item.name).read_text() checked_tracked = True elif item.name == PurePath('subdir', 'untracked'): assert 'untracked' == (ds.pathobj / item.name).read_text() checked_untracked = True assert checked_tracked assert checked_untracked def test_name_starting_with_tab(existing_dataset, no_result_rendering): ds = existing_dataset if ds.repo.is_crippled_fs(): pytest.skip("not applicable on crippled filesystems") tabbed_file_name = "\ttab.txt" tabbed_name = ds.pathobj / tabbed_file_name tabbed_name.write_text('name of this file starts with a tab') ds.save() iter_names = [item.name for item in iter_gitworktree(ds.pathobj)] assert PurePosixPath(tabbed_file_name) in iter_names def test_iter_gitworktree_recursive(existing_dataset, no_result_rendering): # actually, this tests non-recursive, because within-repo # recursion is the default. 
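    # Within-repo recursion means: with recursive='no' only top-level items of
    # the repository are reported (a subdirectory shows up as a single entry),
    # while untracked='whole-dir' additionally reports untracked directories,
    # including empty ones, as single entries -- see the assertions below.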
# later, we might also test subdataset recursion here ds = existing_dataset # some tracked content tracked1 = ds.pathobj / 'tracked1' tracked2 = ds.pathobj / 'subdir' / 'tracked2' tracked3 = ds.pathobj / 'subdir' / 'tracked3' for p in (tracked1, tracked2, tracked3): p.parent.mkdir(exist_ok=True) p.write_text(p.name) ds.save() # an "invisible" directory (no content) (ds.pathobj / 'emptydir').mkdir() # untracked file in subdir untracked = ds.pathobj / 'subdir_u' / 'untracked' untracked.parent.mkdir() untracked.write_text('untracked') # matches git report with untracked=all all_content = set(( PurePath('.datalad'), PurePath('subdir'), PurePath('.gitattributes'), PurePath('subdir_u'), PurePath('tracked1'), )) # without any recursion, we see all top-level content, except for # the empty directory with no content all_items = list(iter_gitworktree(ds.pathobj, recursive='no')) assert set(i.name for i in all_items) == all_content # no we test a query that gooey would want to make, # give me all content in a single directory, and also include any # untracked files and even untracked/empty directories all_items = list(iter_gitworktree(ds.pathobj, recursive='no', untracked='whole-dir')) assert set(i.name for i in all_items) == \ all_content.union((PurePath('emptydir'),)) def test_iter_gitworktree_empty(existing_dataset, no_result_rendering): ds = existing_dataset rmtree(ds.pathobj / '.datalad') (ds.pathobj / '.gitattributes').unlink() ds.save() assert len(ds.status()) == 0 all_items = list(iter_gitworktree(ds.pathobj)) assert len(all_items) == 0 def test_iter_gitworktree_deadsymlinks(existing_dataset, no_result_rendering): ds = existing_dataset dpath = ds.pathobj / 'subdir' dpath.mkdir() fpath = dpath / 'file1' test_content = 'content' fpath.write_text(test_content) ds.save() ds.drop(fpath, reckless='availability') try: # if there is a file we can open, it should not have the content # (annex pointer file) assert fpath.read_text() != test_content except FileNotFoundError: # with dead symlinks, we end up here and that is normal pass # next one must not crash all_items = list(iter_gitworktree(dpath)) # we get our "dead symlink" -- but depending on the p[latform # it may take a different form, hence not checking for type assert len(all_items) == 1 assert all_items[0].name == PurePath('file1') def prep_fp_tester(ds): # we expect to process an exact number of files below # 3 annexed files, 1 untracked, 1 in git, # and possibly 1 symlink in git, 1 symlink untracked # we count them up on creation, and then down on test fcount = 0 content_tmpl = 'content: #ö file_{}' for i in ('annex1', 'annex2', 'annex3'): (ds.pathobj / f'file_{i}').write_text( content_tmpl.format(i), encoding='utf-8') fcount += 1 ds.save() ds.drop( ds.pathobj / 'file_annex1', reckless='availability', ) # and also add a file to git directly and a have one untracked too for i in ('untracked', 'ingit', 'deleted'): (ds.pathobj / f'file_{i}').write_text( content_tmpl.format(i), encoding='utf-8') fcount += 1 ds.save(['file_ingit', 'file_deleted'], to_git=True) # and add symlinks (untracked and in git) if check_symlink_capability( ds.pathobj / '_dummy', ds.pathobj / '_dummy_target' ): for i in ('symlinkuntracked', 'symlinkingit'): tpath = ds.pathobj / f'target_{i}' lpath = ds.pathobj / f'file_{i}' tpath.write_text( content_tmpl.format(i), encoding='utf-8') lpath.symlink_to(tpath) fcount += 1 ds.save('file_symlinkingit', to_git=True) (ds.pathobj / 'file_deleted').unlink() return fcount, content_tmpl def 
test_iter_gitworktree_basic_fp(existing_dataset, no_result_rendering): ds = existing_dataset fcount, content_tmpl = prep_fp_tester(ds) for ai in filter( lambda i: i.name.name.startswith('file_'), iter_gitworktree(ds.pathobj, fp=True) ): fcount -= 1 if getattr(ai, 'fp', False): # for annexed files the fp can be an annex pointer file. # in the context of `iter_gitworktree` this is not a # recognized construct assert content_tmpl.format( ai.name.name[5:]) == ai.fp.read().decode() \ or ai.name.name.startswith('file_annex') else: assert (ds.pathobj / ai.name).exists() is False assert not fcount datalad-next-1.4.1/datalad_next/iter_collections/tests/test_itertar.py000066400000000000000000000064351462321624600263170ustar00rootroot00000000000000from pathlib import PurePosixPath import pytest from datalad.api import download from datalad_next.tests import skipif_no_network from ..tarfile import ( TarfileItem, FileSystemItemType, iter_tar, ) from ..utils import compute_multihash_from_fp @pytest.fixture(scope="session") def sample_tar_xz(tmp_path_factory): """Provides a path to a tarball with file, directory, hard link, and soft link. Any file content is '123\n'. The associated hashes are: md5: ba1f2511fc30423bdbb183fe33f3dd0f sha1: a8fdc205a9f19cc1c7507a60c4f01b13d11d7fd0 Layout:: ❯ datalad tree --include-files test-archive test-archive ├── 123.txt -> subdir/onetwothree_again.txt ├── 123_hard.txt ├── onetwothree.txt └── subdir/ └── onetwothree_again.txt """ path = tmp_path_factory.mktemp("tarfile") tfpath = path / 'sample.tar.xz' download( {'https://github.com/datalad/datalad-next/releases/download/0.1.0/test_archive.tar.xz': tfpath}, result_renderer='disabled', ) yield tfpath tfpath.unlink() @skipif_no_network def test_iter_tar(sample_tar_xz): target_hash = {'SHA1': 'a8fdc205a9f19cc1c7507a60c4f01b13d11d7fd0', 'md5': 'ba1f2511fc30423bdbb183fe33f3dd0f'} targets = [ TarfileItem( name='test-archive', type=FileSystemItemType.directory, size=0, mtime=1683657433, mode=509, uid=1000, gid=1000), TarfileItem( name='test-archive/123.txt', type=FileSystemItemType.symlink, size=0, mtime=1683657414, mode=511, uid=1000, gid=1000, link_target='subdir/onetwothree_again.txt'), TarfileItem( name='test-archive/123_hard.txt', type=FileSystemItemType.file, size=4, mtime=1683657364, mode=436, uid=1000, gid=1000, link_target=None), TarfileItem( name='test-archive/subdir', type=FileSystemItemType.directory, size=0, mtime=1683657400, mode=509, uid=1000, gid=1000), TarfileItem( name='test-archive/subdir/onetwothree_again.txt', type=FileSystemItemType.file, size=4, mtime=1683657400, mode=436, uid=1000, gid=1000, link_target=None), TarfileItem( name='test-archive/onetwothree.txt', type=FileSystemItemType.hardlink, size=0, mtime=1683657364, mode=436, uid=1000, gid=1000, link_target='test-archive/123_hard.txt'), ] ires = [] for i in iter_tar(sample_tar_xz, fp=True): # check that file pointer is usable if i.fp: assert compute_multihash_from_fp( i.fp, ['md5', 'SHA1']) == target_hash # we null the file pointers to ease the comparison i.fp = None ires.append(i) # root + subdir, 2 files, softlink, hardlink assert 6 == len(ires) for t in targets: assert t in ires datalad-next-1.4.1/datalad_next/iter_collections/tests/test_iterzip.py000066400000000000000000000046261462321624600263330ustar00rootroot00000000000000import pytest import zipfile from pathlib import PurePosixPath from ..zipfile import ( FileSystemItemType, ZipfileItem, iter_zip, ) from ..utils import compute_multihash_from_fp @pytest.fixture(scope="session") def 
sample_zip(tmp_path_factory): """Create a sample zip file Provides a path to a zip with files and directories. Any file content is 'zip-123\n'. The associated hashes are: md5: d700214df5487801e8ee23d31e60382a sha1: b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4 Layout:: test-archive/ ├── onetwothree.txt └── subdir/ └── onetwothree<>again.txt """ path = tmp_path_factory.mktemp('zipfile') / 'sample.zip' file_content = b'zip-123\n' with zipfile.ZipFile(path, mode='w') as zip_file: zip_file.writestr('test-archive/', '') zip_file.writestr('test-archive/subdir/', '') with zip_file.open('test-archive/onetwothree.txt', mode='w') as fp: fp.write(file_content) with zip_file.open('test-archive/subdir/onetwothree<>again.txt', mode='w') as fp: fp.write(file_content) yield path path.unlink() def test_iter_zip(sample_zip): target_hash = { 'SHA1': 'b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4', 'md5': 'd700214df5487801e8ee23d31e60382a', } root = 'test-archive' targets = [ ZipfileItem( name=f'{root}/', type=FileSystemItemType.directory, size=0, ), ZipfileItem( name=f'{root}/onetwothree.txt', type=FileSystemItemType.file, size=8, ), ZipfileItem( name=f'{root}/subdir/', type=FileSystemItemType.directory, size=0, ), ZipfileItem( name=f'{root}/subdir/onetwothree<>again.txt', type=FileSystemItemType.file, size=8, ), ] ires = [] for i in iter_zip(sample_zip, fp=True): # check that file pointer is usable if i.fp: assert compute_multihash_from_fp( i.fp, ['md5', 'SHA1']) == target_hash # we null the file pointers to ease the comparison i.fp = None ires.append(i) # root + subdir, 2 files assert 4 == len(ires) for r in ires: # do not compare mtime r.mtime = None for t in targets: assert t in ires datalad-next-1.4.1/datalad_next/iter_collections/tests/test_utils.py000066400000000000000000000017421462321624600260010ustar00rootroot00000000000000from datalad_next.tests import skip_wo_symlink_capability from ..utils import FileSystemItem def test_FileSystemItem(tmp_path): testfile = tmp_path / 'file1.txt' testfile_content = 'content' testfile.write_text(testfile_content) item = FileSystemItem.from_path(testfile) assert item.size == len(testfile_content) assert item.link_target is None @skip_wo_symlink_capability def test_FileSystemItem_linktarget(tmp_path): testfile = tmp_path / 'file1.txt' testfile_content = 'short' testfile.write_text(testfile_content) testlink = tmp_path / 'link' testlink.symlink_to(testfile) item = FileSystemItem.from_path(testlink) assert testfile.samefile(item.link_target) # size of the link file does not anyhow propagate the size of the # link target assert item.size != len(testfile_content) # we can disable link resolution item = FileSystemItem.from_path(testlink, link_target=False) assert item.link_target is None datalad-next-1.4.1/datalad_next/iter_collections/utils.py000066400000000000000000000071171462321624600236020ustar00rootroot00000000000000"""Utilities and types for collection iterators""" from __future__ import annotations from dataclasses import dataclass from enum import Enum import os from pathlib import ( Path, PurePath, ) import stat from typing import ( Any, IO, List, ) from datalad_next.consts import COPY_BUFSIZE from datalad_next.utils import MultiHash # TODO Could be `StrEnum`, came with PY3.11 class FileSystemItemType(Enum): """Enumeration of file system path types The associated ``str`` values are chosen to be appropriate for downstream use (e.g, as type labels in DataLad result records). 
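    For example, ``FileSystemItemType.file.value`` is the string ``'file'``,
    and ``FileSystemItemType.symlink.value`` is ``'symlink'``.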
""" file = 'file' directory = 'directory' symlink = 'symlink' hardlink = 'hardlink' specialfile = 'specialfile' @dataclass class NamedItem: name: Any @dataclass class TypedItem: type: Any @dataclass class PathBasedItem(NamedItem): """An item with a path as its ``name`` A dedicated property supports the conversion of the native name representation into a ``PurePath`` instance. Any argument understood by the ``PurePath`` constructor can be used as ``name``, such as a a filename, a relative path, or an absolute path -- in string form, as path segment sequence, or a ``Path`` instance. It is recommended to use name/path values that are relative to the containing collection (directory, archive, repository, etc.). """ def path(self) -> PurePath: """Returns the item name as a ``PurePath`` instance This default implementation assumes the ``name`` to be platform path conventions. """ return PurePath(self.name) @dataclass # sadly PY3.10+ only (kw_only=True) class FileSystemItem(PathBasedItem, TypedItem): type: FileSystemItemType size: int mtime: float | None = None mode: int | None = None uid: int | None = None gid: int | None = None link_target: Any | None = None fp: IO | None = None def link_target_path(self) -> PurePath: """Returns the link_target as a ``PurePath`` instance""" return PurePath(self.link_target) @classmethod def from_path( cls, path: Path, *, link_target: bool = True, ): """Populate item properties from a single `stat` and `readlink` call The given ``path`` must exist. The ``link_target`` flag indicates whether to report the result of ``readlink`` for a symlink-type path. """ cstat = path.lstat() cmode = cstat.st_mode if stat.S_ISLNK(cmode): ctype = FileSystemItemType.symlink elif stat.S_ISDIR(cmode): ctype = FileSystemItemType.directory else: # the rest is a file # there could be fifos and sockets, etc. # but we do not recognize them here ctype = FileSystemItemType.file item = cls( name=path, type=ctype, size=cstat.st_size, mode=cmode, mtime=cstat.st_mtime, uid=cstat.st_uid, gid=cstat.st_gid, ) if link_target and ctype == FileSystemItemType.symlink: # could be p.readlink() from PY3.9+ # but check performance difference item.link_target = os.readlink(path) return item def compute_multihash_from_fp(fp, hash: List[str], bufsize=COPY_BUFSIZE): """Compute multiple hashes from a file-like """ hash = MultiHash(hash) while True: chunk = fp.read(bufsize) if not chunk: break hash.update(chunk) return hash.get_hexdigest() datalad-next-1.4.1/datalad_next/iter_collections/zipfile.py000066400000000000000000000053741462321624600241070ustar00rootroot00000000000000"""Report on the content of ZIP file The main functionality is provided by the :func:`iter_zip()` function. """ from __future__ import annotations import datetime from functools import cached_property import time import zipfile from dataclasses import dataclass from pathlib import ( Path, PurePosixPath, ) from typing import Generator from .utils import ( FileSystemItem, FileSystemItemType, ) @dataclass class ZipfileItem(FileSystemItem): name: str @cached_property def path(self) -> PurePosixPath: """Returns the item name as a ``PurePosixPath`` instance ZIP uses POSIX paths as item identifiers from version 6.3.3 onwards. Not all POSIX paths are legal paths on non-POSIX file systems or platforms. Therefore we cannot use a platform-dependent ``PurePath``-instance to address ZIP-file items, anq we use ``PurePosixPath``-instances instead. 
""" return PurePosixPath(self.name) def iter_zip( path: Path, *, fp: bool = False, ) -> Generator[ZipfileItem, None, None]: """Uses the standard library ``zipfile`` module to report on ZIP-files A ZIP archive can represent more or less the full bandwidth of file system properties, therefore reporting on archive members is implemented similar to :func:`~datalad_next.iter_collections.directory.iter_dir()`. The iterator produces an :class:`ZipfileItem` instance with standard information on file system elements, such as ``size``, or ``mtime``. Parameters ---------- path: Path Path of the ZIP archive to report content for (iterate over). fp: bool, optional If ``True``, each file-type item includes a file-like object to access the file's content. This file handle will be closed automatically when the next item is yielded or the function returns. Yields ------ :class:`ZipfileItem` The ``name`` attribute of an item is a ``str`` with the corresponding archive member name (in POSIX conventions). """ with zipfile.ZipFile(path, mode='r') as zip_file: for zip_info in zip_file.infolist(): item = _get_zipfile_item(zip_info) if fp and item.type == FileSystemItemType.file: with zip_file.open(zip_info) as amfp: item.fp = amfp yield item else: yield item def _get_zipfile_item(zip_info: zipfile.ZipInfo) -> ZipfileItem: return ZipfileItem( name=zip_info.filename, type=FileSystemItemType.directory if zip_info.is_dir() else FileSystemItemType.file, size=zip_info.file_size, mtime=time.mktime( datetime.datetime(*zip_info.date_time).timetuple() ) ) datalad-next-1.4.1/datalad_next/iterable_subprocess/000077500000000000000000000000001462321624600225605ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/iterable_subprocess/.coveragerc000066400000000000000000000000241462321624600246750ustar00rootroot00000000000000[run] branch = True datalad-next-1.4.1/datalad_next/iterable_subprocess/.github/000077500000000000000000000000001462321624600241205ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/iterable_subprocess/.github/workflows/000077500000000000000000000000001462321624600261555ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/iterable_subprocess/.github/workflows/deploy-package-to-pypi.yml000066400000000000000000000017411462321624600331670ustar00rootroot00000000000000name: Deploy package to PyPI on: release: types: [published] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v3 with: python-version: 3.11 - name: Update version in pyproject.toml from current git tag run: >- sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF/refs\/tags\/v/}/g" pyproject.toml - run: | pip install build python -m build - uses: actions/upload-artifact@v3 with: path: ./dist deploy: needs: ['build'] environment: 'pypi' name: upload release to PyPI runs-on: ubuntu-latest permissions: # IMPORTANT: this permission is mandatory for trusted publishing id-token: write steps: - uses: actions/download-artifact@v3 - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: packages_dir: artifact/ datalad-next-1.4.1/datalad_next/iterable_subprocess/.github/workflows/test.yml000066400000000000000000000014631462321624600276630ustar00rootroot00000000000000name: Tests on: push: branches: [ "main" ] pull_request: branches: [ "main" ] jobs: test: name: Test runs-on: ubuntu-20.04 strategy: matrix: python-version: - "3.6.7" - "3.7.1" - "3.8.0" - "3.9.0" - "3.10.0" - "3.11.0" steps: - name: "Checkout" uses: "actions/checkout@v3" - uses: 
"actions/setup-python@v4" with: python-version: '${{ matrix.python-version }}' - name: "Install funzip" run: | sudo apt-get update sudo apt-get install unzip - name: "Install package and python dependencies" run: | pip install .[dev] - name: "Test" run: | pytest --cov - uses: codecov/codecov-action@v3 datalad-next-1.4.1/datalad_next/iterable_subprocess/.gitignore000066400000000000000000000034071462321624600245540ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ datalad-next-1.4.1/datalad_next/iterable_subprocess/LICENSE000066400000000000000000000021031462321624600235610ustar00rootroot00000000000000MIT License Copyright (c) 2021 Department for International Trade Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
datalad-next-1.4.1/datalad_next/iterable_subprocess/README.md000066400000000000000000000071761462321624600240520ustar00rootroot00000000000000# iterable-subprocess [![PyPI package](https://img.shields.io/pypi/v/iterable-subprocess?label=PyPI%20package&color=%234c1)](https://pypi.org/project/iterable-subprocess/) [![Test suite](https://img.shields.io/github/actions/workflow/status/uktrade/iterable-subprocess/test.yml?label=Test%20suite)](https://github.com/uktrade/iterable-subprocess/actions/workflows/test.yml) [![Code coverage](https://img.shields.io/codecov/c/github/uktrade/iterable-subprocess?label=Code%20coverage)](https://app.codecov.io/gh/uktrade/iterable-subprocess) Python context manager to communicate with a subprocess using iterables. This offers a higher level interface to subprocesses than Python's built-in subprocess module, and is particularly helpful when data won't fit in memory and has to be streamed. This also allows an external subprocess to be naturally placed in a chain of iterables as part of a data processing pipeline. ## Installation ```bash pip install iterable-subprocess ``` ## Usage A single context manager `iterable_subprocess` is exposed. The first parameter is the `args` argument passed to the [Popen Constructor](https://docs.python.org/3/library/subprocess.html#popen-constructor), and the second is an iterable whose items must be `bytes` instances and are sent to the subprocess's standard input. Returned from the function is an iterable whose items are `bytes` instances of the process's standard output. ```python from iterable_subprocess import iterable_subprocess # In a real case could be a generator function that reads from the filesystem or the network iterable_of_bytes = ( b'first\n', b'second\n', b'third\n', ) with iterable_subprocess(['cat'], iterable_of_bytes) as output: for chunk in output: print(chunk) ``` ## Exceptions Python's `subprocess.Popen` is used to start the process, and any exceptions it raises are propagated without transformation. For example, if the subprocess can't be found, then a `FileNotFoundError` is raised. If the process starts, but exits with a non-zero return code, then an `iterable_subprocess.IterableSubprocessError` exception will be raised with two members: - `returncode` - the return code of the process - `stderr` - the final 65536 bytes of the standard error of the process However, if the process starts, but an exception is raised from inside the context or from the source iterable, then this exception is propagated, even if the process subsequently exits with a non-zero return code. ## Example: unzip the first file of a ZIP archive while downloading It's possible to download the bytes of a ZIP file in Python, and unzip by passing the bytes to `funzip`, as in the following example. ```python import httpx from iterable_subprocess import iterable_subprocess with \ httpx.stream('GET', 'https://www.example.com/my.zip') as r, \ iterable_subprocess(['funzip'], r.iter_bytes()) as unzipped_chunks: for chunk in unzipped_chunks: print(chunk) ``` Note that it's also possible to stream unzip files without resorting to another process using [stream-unzip](https://github.com/uktrade/stream-unzip). ## Example: download file using curl and process in Python You would usually download directly from Python, but as an example, you can download using the curl executable and process its output in Python. 
```python from iterable_subprocess import iterable_subprocess url = 'https://data.api.trade.gov.uk/v1/datasets/uk-tariff-2021-01-01/versions/v3.0.212/tables/measures-on-declarable-commodities/data?format=csv' with iterable_subprocess(['curl', '--no-progress-meter', '--fail-with-body', url], ()) as output: for chunk in output: print(chunk) ``` datalad-next-1.4.1/datalad_next/iterable_subprocess/__init__.py000066400000000000000000000015331462321624600246730ustar00rootroot00000000000000"""Context manager to communicate with a subprocess using iterables This offers a higher level interface to subprocesses than Python's built-in subprocess module, and is particularly helpful when data won't fit in memory and has to be streamed. This also allows an external subprocess to be naturally placed in a chain of iterables as part of a data processing pipeline. This code has been taken from https://pypi.org/project/iterable-subprocess/ and was subsequently adjusted for cross-platform compatibility and performance, as well as tighter integration with DataLad. The original code was made available under the terms of the MIT License, and was written by Michal Charemza. .. currentmodule:: datalad_next.iterable_subprocess .. autosummary:: :toctree: generated iterable_subprocess """ from .iterable_subprocess import iterable_subprocess datalad-next-1.4.1/datalad_next/iterable_subprocess/codecov.yml000066400000000000000000000000171462321624600247230ustar00rootroot00000000000000comment: false datalad-next-1.4.1/datalad_next/iterable_subprocess/iterable_subprocess.py000066400000000000000000000156561462321624600272060ustar00rootroot00000000000000from collections import deque from collections.abc import Generator from contextlib import contextmanager from subprocess import PIPE, Popen from threading import Thread # Importing from datalad-core to prevent circular imports from datalad_next.exceptions import CommandError class OutputFrom(Generator): def __init__(self, stdout, stderr_deque, chunk_size=65536): self.stdout = stdout self.stderr_deque = stderr_deque self.chunk_size = chunk_size self.returncode = None def send(self, _): chunk = self.stdout.read(self.chunk_size) if not chunk: raise StopIteration return chunk def throw(self, typ, value=None, traceback=None): return super().throw(typ, value, traceback) @contextmanager def iterable_subprocess( program, input_chunks, chunk_size=65536, cwd=None, bufsize=-1, ): # This context starts a thread that populates the subprocess's standard input. It # also starts a threads that reads the process's standard error. Otherwise we risk # a deadlock - there is no output because the process is waiting for more input. # # This itself introduces its own complications and risks, but hopefully mitigated # by having a well defined start and stop mechanism that also avoid sending data # to the process if it's not running # # To start, i.e. on entry to the context from client code # - The process is started # - The thread to read from standard error is started # - The thread to populate input is started # # When running: # - The standard input thread iterates over the input, passing chunks to the process # - While the standard error thread fetches the error output # - And while this thread iterates over the processe's output from client code # in the context # # To stop, i.e. 
on exit of the context from client code # - This thread closes the process's standard output # - Wait for the standard input thread to exit # - Wait for the standard error thread to exit # - Wait for the process to exit # # By using context managers internally, this also gives quite strong guarantees that # the above order is enforced to make sure the thread doesn't send data to the process # whose standard input is closed and so we don't get BrokenPipe errors # Writing to the process can result in a BrokenPipeError. If this then results in # a non-zero code from the process, the process's standard error probably has useful # information on the cause of this. However, the non-zero error code happens after # BrokenPipeError, so propagating "what happens first" isn't helpful in this case. # So, we re-raise BrokenPipeError as _BrokenPipeError so we can catch it after the # process ends to then allow us to branch on its error code: # - if it's non-zero raise a CommandError containing its standard error # - if it's zero, re-raise the original BrokenPipeError class _BrokenPipeError(Exception): pass @contextmanager def thread(target, *args): exception = None def wrapper(): nonlocal exception try: target(*args) except BaseException as e: exception = e t = Thread(target=wrapper) def start(): t.start() def join(): if t.ident: t.join() return exception yield start, join def input_to(stdin): try: for chunk in input_chunks: try: stdin.write(chunk) except BrokenPipeError: raise _BrokenPipeError() except OSError as e: if e.errno != 22: # Errno22 indicates an IO failure with a # file descriptor (maybe process is dead already) raise _BrokenPipeError() else: # no idea what this could be, let it bubble up raise finally: try: stdin.close() except BrokenPipeError: raise _BrokenPipeError() except OSError as e: # silently ignore Errno22, which happens on # windows when trying to interacted with file descriptors # associated with a process that exited already if e.errno != 22: raise def keep_only_most_recent(stderr, stderr_deque): total_length = 0 while True: chunk = stderr.read(chunk_size) total_length += len(chunk) if not chunk: break stderr_deque.append(chunk) if total_length - len(stderr_deque[0]) >= chunk_size: total_length -= len(stderr_deque[0]) stderr_deque.popleft() def raise_if_not_none(exception): if exception is not None: raise exception from None proc = None stderr_deque = deque() chunk_generator = None exception_stdin = None exception_stderr = None try: with \ Popen( # nosec - all arguments are controlled by the caller program, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd, bufsize=bufsize, ) as proc, \ thread( keep_only_most_recent, proc.stderr, stderr_deque, ) as (start_t_stderr, join_t_stderr), \ thread( input_to, proc.stdin, ) as (start_t_stdin, join_t_stdin): try: start_t_stderr() start_t_stdin() chunk_generator = OutputFrom( proc.stdout, stderr_deque, chunk_size ) yield chunk_generator except BaseException: proc.terminate() raise finally: proc.stdout.close() exception_stdin = join_t_stdin() exception_stderr = join_t_stderr() raise_if_not_none(exception_stdin) raise_if_not_none(exception_stderr) except _BrokenPipeError as e: if chunk_generator: chunk_generator.returncode = proc.returncode if proc.returncode == 0: raise e.__context__ from None except BaseException: if chunk_generator: chunk_generator.returncode = proc.returncode raise chunk_generator.returncode = proc.returncode if proc.returncode: raise CommandError( cmd=program, code=proc.returncode, 
stderr=b''.join(stderr_deque)[-chunk_size:], cwd=cwd, ) datalad-next-1.4.1/datalad_next/iterable_subprocess/pyproject.toml000066400000000000000000000013401462321624600254720ustar00rootroot00000000000000[build-system] requires = ["hatchling"] build-backend = "hatchling.build" [project] name = "iterable-subprocess" version = "0.0.0.dev0" authors = [ { name="Department for International Trade", email="sre@digital.trade.gov.uk" }, ] description = "Python context manager to communicate with a subprocess using iterables of bytes rather Python's built-in subprocess module" readme = "README.md" requires-python = ">=3.6.7" classifiers = [ 'Programming Language :: Python :: 3', 'License :: OSI Approved :: MIT License', ] [project.optional-dependencies] dev = [ "psutil", "pytest-cov", ] [project.urls] "Source" = "https://github.com/uktrade/iterable-subprocess" [tool.hatch.build] include = [ "iterable_subprocess.py" ] datalad-next-1.4.1/datalad_next/iterable_subprocess/test_iterable_subprocess.py000066400000000000000000000266721462321624600302450ustar00rootroot00000000000000import io import sys import threading import time import zipfile import psutil import pytest from threading import Thread from .iterable_subprocess import ( iterable_subprocess, CommandError, ) def test_cat_not_necessarily_streamed(): def yield_small_input(): yield b'first' yield b'second' yield b'third' with iterable_subprocess(['cat'], yield_small_input()) as output: assert b''.join(output) == b'firstsecondthird' def test_cat_streamed(): latest_input = None def yield_input(): nonlocal latest_input for i in range(0, 10000000): yield b'*' * 10 latest_input = i with iterable_subprocess(['cat'], yield_input()) as output: latest_input_during_output = [latest_input for _ in output] # Make sure the input is progressing during the output. 
In test, there # are about 915 steps, so checking that it's greater than 50 shouldm't # make this test too flakey num_steps = 0 prev_i = 0 for i in latest_input_during_output: if i != prev_i: num_steps += 1 prev_i = i assert num_steps > 50 def test_process_closed_after(): # in datalad-next we do not necessarily have no child-processes # so determine the number of test incrementally #assert len(psutil.Process().children(recursive=True)) == 0 n_children = len(psutil.Process().children(recursive=True)) with iterable_subprocess(['cat'], ()) as output: assert len(psutil.Process().children(recursive=True)) == (n_children + 1) assert len(psutil.Process().children(recursive=True)) == n_children def test_exception_from_input_before_yield_propagated(): def yield_input(): raise Exception('Something went wrong') with pytest.raises(Exception, match='Something went wrong'): with iterable_subprocess(['cat'], yield_input()) as output: pass def test_exception_from_input_after_yield_propagated(): def yield_input(): yield b'*' raise Exception('Something went wrong') with pytest.raises(Exception, match='Something went wrong'): with iterable_subprocess(['cat'], yield_input()) as output: pass def test_exception_from_input_incorrect_type_propagated(): def yield_input(): yield 'this-should-be-bytes' with pytest.raises(TypeError): with iterable_subprocess(['cat'], yield_input()) as output: pass @pytest.mark.parametrize("size", [ 1, 100, 10000, 1000000, ]) def test_exception_from_output_during_input_iterating_propagates_and_does_not_hang(size): event = threading.Event() def yield_input(): while True: event.set() yield b'*' * size with pytest.raises(Exception, match='My error'): with iterable_subprocess(['cat'], yield_input()) as output: event.wait() raise Exception('My error') @pytest.mark.parametrize("chunk_size", [ 1, 100, 10000, 1000000, ]) @pytest.mark.parametrize("at_iteration", [ 0, 1, 100, ]) def test_exception_from_output_iterating_propagates_and_does_not_hang(at_iteration, chunk_size): def yield_input(): while True: yield b'*' * chunk_size with pytest.raises(Exception, match='My error'): with iterable_subprocess(['cat'], yield_input(), chunk_size=chunk_size) as output: for i, chunk in enumerate(output): if i == at_iteration: raise Exception('My error') def test_exception_from_not_found_process_propagated(): with pytest.raises(FileNotFoundError): with iterable_subprocess(['does-not-exist'], ()) as output: b''.join(output) def test_exception_from_return_code(monkeypatch): monkeypatch.setenv('LANG', 'C') with pytest.raises(CommandError, match='No such file or directory') as excinfo: with iterable_subprocess(['ls', 'does-not-exist'], ()) as output: a = b''.join(output) assert excinfo.value.returncode > 0 assert b'No such file or directory' in excinfo.value.stderr def test_exception_from_context_even_though_return_code_with_long_standard_error(): with pytest.raises(Exception, match="Another exception"): with iterable_subprocess([sys.executable, '-c', 'import sys; print("Out"); print("Error message" * 100000, file=sys.stderr); sys.exit(1)'], ()) as output: for _ in output: pass raise Exception('Another exception') def test_exception_from_return_code_with_long_standard_error(): with pytest.raises(CommandError) as excinfo: with iterable_subprocess([sys.executable, '-c', 'import sys; print("Out"); print("Error message" * 100000, file=sys.stderr); sys.exit(2)'], ()) as output: for _ in output: pass assert excinfo.value.returncode == 2 assert len(excinfo.value.stderr) == 65536 def 
test_if_process_exits_with_non_zero_error_code_and_inner_exception_it_propagates(): def yield_input(): while True: yield b'*' * 10 with pytest.raises(Exception, match='Another exception'): with iterable_subprocess([ sys.executable, '-c', 'import sys; print("The error", file=sys.stderr); print("After output"); sys.exit(1)', ], yield_input()) as output: all_output = b''.join(output) raise Exception('Another exception') # rstrip to account for different platform line endings here assert all_output.rstrip() == b'After output' def test_if_process_closes_standard_input_but_exits_with_non_zero_error_code_then_broken_pipe_error(): def yield_input(): while True: yield b'*' * 10 with pytest.raises(BrokenPipeError): with iterable_subprocess([ sys.executable, '-c', 'import sys; sys.stdin.close(); print("The error", file=sys.stderr); print("After output"); sys.exit(0)', ], yield_input()) as output: all_output = b''.join(output) # rstrip to account for different platform line endings here assert all_output.rstrip() == b'After output' def test_if_process_closes_standard_input_but_exits_with_non_zero_error_code_then_iterable_subprocess_error(): def yield_input(): while True: yield b'*' * 10 with pytest.raises(CommandError) as excinfo: with iterable_subprocess([ sys.executable, '-c', 'import sys; sys.stdin.close(); print("The error", file=sys.stderr); print("After output"); sys.exit(3)', ], yield_input()) as output: all_output = b''.join(output) # rstrip to account for different platform line endings here assert all_output.rstrip() == b'After output' assert excinfo.value.returncode == 3 assert excinfo.value.stderr.rstrip()== b'The error' def test_program_that_outputs_for_a_long_time_is_interrupted_on_context_exit(): start = time.monotonic() with pytest.raises(CommandError) as excinfo: with iterable_subprocess([sys.executable, '-c', 'import time; start = time.monotonic()\nwhile (time.monotonic() - start) < 60:\n print("Output" * 1000)'], ()) as output: pass end = time.monotonic() assert excinfo.value.returncode != 0 # alternative condition reflects error communication on windows (errno22) assert b'BrokenPipeError' in excinfo.value.stderr or b'Errno 22' in excinfo.value.stderr assert end - start < 10 def test_program_that_sleeps_exits_quickly_if_exception(): start = time.monotonic() with pytest.raises(Exception, match='From context'): with iterable_subprocess([sys.executable, '-c', 'import time; time.sleep(60)'], ()) as output: raise Exception('From context') end = time.monotonic() assert end - start < 10 def test_program_that_sleeps_exits_quickly_if_keyboard_interrupt(): start = time.monotonic() with pytest.raises(KeyboardInterrupt, match='From context'): with iterable_subprocess([sys.executable, '-c', 'import time; time.sleep(60)'], ()) as output: raise KeyboardInterrupt('From context') end = time.monotonic() assert end - start < 10 def test_program_that_sleeps_exits_quickly_if_keyboard_interrupt_just_before_thread_starts(monkeypatch): start = time.monotonic() def start_that_raises_keyboard_interrupt(self): raise KeyboardInterrupt('Just before starting thread') monkeypatch.setattr(Thread, 'start', start_that_raises_keyboard_interrupt) with pytest.raises(KeyboardInterrupt, match='Just before starting thread'): iterable_subprocess([sys.executable, '-c', 'import time; time.sleep(60)'], ()).__enter__() end = time.monotonic() assert end - start < 10 def test_program_that_sleeps_exits_quickly_if_keyboard_interrupt_just_after_thread_starts(monkeypatch): start = time.monotonic() original_start = Thread.start def 
start_that_raises_keyboard_interrupt(self): original_start(self) raise KeyboardInterrupt('Just after starting thread') monkeypatch.setattr(Thread, 'start', start_that_raises_keyboard_interrupt) with pytest.raises(KeyboardInterrupt, match='Just after starting thread'): iterable_subprocess([sys.executable, '-c', 'import time; time.sleep(60)'], ()).__enter__() end = time.monotonic() assert end - start < 10 def test_program_that_sleeps_not_quickly_if_no_exception(): start = time.monotonic() with iterable_subprocess([sys.executable, '-c', 'import time; time.sleep(2)'], ()) as output: pass end = time.monotonic() assert end - start > 2 def test_funzip_no_compression(): contents = b'*' * 100000 def yield_input(): file = io.BytesIO() with zipfile.ZipFile(file, 'w', zipfile.ZIP_STORED) as zf: zf.writestr('any.txt', contents) yield file.getvalue() with iterable_subprocess(['funzip'], yield_input()) as output: assert b''.join(output) == contents def test_funzip_deflate(): contents = b'*' * 100000 def yield_input(): file = io.BytesIO() with zipfile.ZipFile(file, 'w', zipfile.ZIP_DEFLATED) as zf: zf.writestr('any.txt', contents) yield file.getvalue() with iterable_subprocess(['funzip'], yield_input()) as output: assert b''.join(output) == contents def test_error_returncode_available_from_generator(): with pytest.raises(CommandError): with iterable_subprocess(['ls', 'does-not-exist'], ()) as ls: tuple(ls) assert ls.returncode != 0 def test_error_returncode_available_from_generator_with_exception(): with pytest.raises(StopIteration): with iterable_subprocess(['ls', 'does-not-exist'], ()) as ls: while True: next(ls) assert ls.returncode != 0 def test_returncode_available_from_generator_with_exception(): with pytest.raises(StopIteration): with iterable_subprocess(['echo', 'a'], ()) as echo: while True: next(echo) # On a Linux system, all exceptions that are raised before the subprocess # exited will lead to a -15 return code. If StopIteration is raised, the # subprocess will either have terminated which results in a 0-return code, # or the subprocess is still running and will therefore be terminated which # results in a -15 return code. Any other exception than StopIteration, # e.g. a CommandError because echo could not be found, would lead to an # early test-exit and not proceed to the assign-statement. assert echo.returncode in (0, -15) datalad-next-1.4.1/datalad_next/itertools/000077500000000000000000000000001462321624600205455ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/itertools/__init__.py000066400000000000000000000010131462321624600226510ustar00rootroot00000000000000"""Various iterators, e.g., for subprocess pipelining and output processing .. currentmodule:: datalad_next.itertools .. 
autosummary:: :toctree: generated align_pattern decode_bytes itemize load_json load_json_with_flag route_out route_in """ from .align_pattern import align_pattern from .decode_bytes import decode_bytes from .itemize import itemize from .load_json import ( load_json, load_json_with_flag, ) from .reroute import ( route_in, route_out, StoreOnly, ) datalad-next-1.4.1/datalad_next/itertools/align_pattern.py000066400000000000000000000106501462321624600237500ustar00rootroot00000000000000""" Function to ensure that a pattern is completely contained in single chunks """ from __future__ import annotations import re from typing import ( Generator, Iterable, ) def align_pattern(iterable: Iterable[str | bytes | bytearray], pattern: str | bytes | bytearray ) -> Generator[str | bytes | bytearray, None, None]: """ Yield data chunks that contain a complete pattern, if it is present ``align_pattern`` makes it easy to find a pattern (``str``, ``bytes``, or ``bytearray``) in data chunks. It joins data-chunks in such a way, that a simple containment-check (e.g. ``pattern in chunk``) on the chunks that ``align_pattern`` yields will suffice to determine whether the pattern is present in the stream yielded by the underlying iterable or not. To achieve this, ``align_pattern`` will join consecutive chunks to ensures that the following two assertions hold: 1. Each chunk that is yielded by ``align_pattern`` has at least the length of the pattern (unless the underlying iterable is exhausted before the length of the pattern is reached). 2. The pattern is not split between two chunks, i.e. no chunk that is yielded by ``align_pattern`` ends with a prefix of the pattern (unless it is the last chunk that the underlying iterable yield). The pattern might be present multiple times in a yielded data chunk. Note: the ``pattern`` is compared verbatim to the content in the data chunks, i.e. no parsing of the ``pattern`` is performed and no regular expressions or wildcards are supported. .. code-block:: python >>> from datalad_next.itertools import align_pattern >>> tuple(align_pattern([b'abcd', b'e', b'fghi'], pattern=b'def')) (b'abcdefghi',) >>> # The pattern can be present multiple times in a yielded chunk >>> tuple(align_pattern([b'abcd', b'e', b'fdefghi'], pattern=b'def')) (b'abcdefdefghi',) Use this function if you want to locate a pattern in an input stream. It allows to use a simple ``in``-check to determine whether the pattern is present in the yielded result chunks. The function always yields everything it has fetched from the underlying iterable. So after a yield it does not cache any data from the underlying iterable. That means, if the functionality of ``align_pattern`` is no longer required, the underlying iterator can be used, when ``align_pattern`` has yielded a data chunk. This allows more efficient processing of the data that remains in the underlying iterable. Parameters ---------- iterable: Iterable An iterable that yields data chunks. pattern: str | bytes | bytearray The pattern that should be contained in the chunks. Its type must be compatible to the type of the elements in ``iterable``. Yields ------- str | bytes | bytearray data chunks that have at least the size of the pattern and do not end with a prefix of the pattern. Note that a data chunk might contain the pattern multiple times. """ # Create pattern matcher for all if isinstance(pattern, str): regex: str | bytes | bytearray = '(' + '|'.join( '.' 
* (len(pattern) - index - 1) + re.escape(pattern[:index]) + '$' for index in range(1, len(pattern)) ) + ')' else: regex = b'(' + b'|'.join( b'.' * (len(pattern) - index - 1) + re.escape(pattern[:index]) + b'$' for index in range(1, len(pattern)) ) + b')' pattern_matcher = re.compile(regex, re.DOTALL) pattern_sub = len(pattern) - 1 # Join data chunks until they are sufficiently long to contain the pattern, # i.e. have at least size: `len(pattern)`. Continue joining, if the chunk # ends with a prefix of the pattern. current_chunk = None for data_chunk in iterable: # get the type of current_chunk from the type of this data_chunk if current_chunk is None: current_chunk = data_chunk else: current_chunk += data_chunk if len(current_chunk) >= len(pattern) \ and not ( current_chunk[-1] in pattern and pattern_matcher.match(current_chunk, len(current_chunk) - pattern_sub)): yield current_chunk current_chunk = None if current_chunk is not None: yield current_chunk datalad-next-1.4.1/datalad_next/itertools/decode_bytes.py000066400000000000000000000122461462321624600235550ustar00rootroot00000000000000"""Get strings decoded from chunks of bytes """ from __future__ import annotations from typing import ( Generator, Iterable, ) __all__ = ['decode_bytes'] def decode_bytes( iterable: Iterable[bytes], encoding: str = 'utf-8', backslash_replace: bool = True, ) -> Generator[str, None, None]: """Decode bytes in an ``iterable`` into strings This function decodes ``bytes`` or ``bytearray`` into ``str`` objects, using the specified encoding. Importantly, the decoding input can be spread across multiple chunks of heterogeneous sizes, for example output read from a process or pieces of a download. Multi-byte encodings that are spread over multiple byte chunks are supported, and chunks are joined as necessary. For example, the utf-8 encoding for ö is ``b'\\xc3\\xb6'``. If the encoding is split in the middle because a chunk ends with ``b'\\xc3'`` and the next chunk starts with ``b'\\xb6'``, a naive decoding approach like the following would fail: .. code-block:: python >>> [chunk.decode() for chunk in [b'\\xc3', b'\\xb6']] # doctest: +SKIP Traceback (most recent call last): File "", line 1, in File "", line 1, in UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: unexpected end of data Compared to: .. code-block:: python >>> from datalad_next.itertools import decode_bytes >>> tuple(decode_bytes([b'\\xc3', b'\\xb6'])) ('ö',) Input chunks are only joined, if it is necessary to properly decode bytes: .. code-block:: python >>> from datalad_next.itertools import decode_bytes >>> tuple(decode_bytes([b'\\xc3', b'\\xb6', b'a'])) ('ö', 'a') If ``backslash_replace`` is ``True``, undecodable bytes will be replaced with a backslash-substitution. Otherwise, undecodable bytes will raise a ``UnicodeDecodeError``: .. code-block:: python >>> tuple(decode_bytes([b'\\xc3'])) ('\\\\xc3',) >>> tuple(decode_bytes([b'\\xc3'], backslash_replace=False)) # doctest: +SKIP Traceback (most recent call last): ... UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 1: invalid continuation byte Backslash-replacement of undecodable bytes is an ambiguous mapping, because, for example, ``b'\\xc3'`` can already be present in the input. Parameters ---------- iterable: Iterable[bytes] Iterable that yields bytes that should be decoded encoding: str (default: ``'utf-8'``) Encoding to be used for decoding. backslash_replace: bool (default: ``True``) If ``True``, backslash-escapes are used for undecodable bytes. 
If ``False``, a ``UnicodeDecodeError`` is raised if a byte sequence cannot be decoded. Yields ------ str Decoded strings that are generated by decoding the data yielded by ``iterable`` with the specified ``encoding`` Raises ------ UnicodeDecodeError If ``backslash_replace`` is ``False`` and the data yielded by ``iterable`` cannot be decoded with the specified ``encoding`` """ def handle_decoding_error(position: int, exc: UnicodeDecodeError ) -> tuple[int, str]: """ Handle a UnicodeDecodeError """ if not backslash_replace: # Signal the error to the caller raise exc else: return ( position + exc.end, joined_data[:position + exc.start].decode(encoding) + joined_data[position + exc.start:position + exc.end].decode( encoding, errors='backslashreplace' ), ) joined_data = b'' pending_error = None position = 0 for chunk in iterable: joined_data += chunk while position < len(joined_data): try: yield joined_data[position:].decode(encoding) joined_data = b'' except UnicodeDecodeError as e: # If an encoding error occurs, we first check whether it was # in the middle of `joined_data` or whether it extends until the # end of `joined_data`. # If it occurred in the middle of # `joined_data`, we replace it with backslash encoding or # re-raise the decoding error. # If it occurred at the end of `joined_data`, we wait for the # next chunk, which might fix the problem. if position + e.end == len(joined_data): # Wait for the next chunk, which might fix the problem pending_error = e break else: pending_error = None position, string = handle_decoding_error(position, e) yield string if pending_error: # If the last chunk has a decoding error at the end, process it. position, string = handle_decoding_error(position, pending_error) if string: yield string datalad-next-1.4.1/datalad_next/itertools/itemize.py000066400000000000000000000131071462321624600225670ustar00rootroot00000000000000"""Get complete items from input chunks""" from __future__ import annotations from typing import ( Generator, Iterable, TypeVar, ) __all__ = ['itemize'] T = TypeVar('T', str, bytes, bytearray) def itemize( iterable: Iterable[T], sep: T | None, *, keep_ends: bool = False, ) -> Generator[T, None, None]: """Yields complete items (only), assembled from an iterable This function consumes chunks from an iterable and yields items defined by a separator. An item might span multiple input chunks. Input (chunks) can be ``bytes``, ``bytearray``, or ``str`` objects. The result type is determined by the type of the first input chunk. During its runtime, the type of the elements in ``iterable`` must not change. Items are defined by a separator given via ``sep``. If ``sep`` is ``None``, the line-separators built into ``str.splitlines()`` are used, and each yielded item will be a line. If ``sep`` is not `None`, its type must be compatible to the type of the elements in ``iterable``. A separator could, for example, be ``b'\\n'``, in which case the items would be terminated by Unix line-endings, i.e. each yielded item is a single line. The separator could also be, ``b'\\x00'`` (or ``'\\x00'``), to split zero-byte delimited content, like the output of ``git ls-files -z``. Separators can be longer than one byte or character, e.g. ``b'\\r\\n'``, or ``b'\\n-------------------\\n'``. Content after the last separator, possibly merged across input chunks, is always yielded as the last item, even if it is not terminated by the separator. Performance notes: - Using ``None`` as a separator (splitlines-mode) is slower than providing a specific separator. 
- If another separator than ``None`` is used, the runtime with ``keep_end=False`` is faster than with ``keep_end=True``. Parameters ---------- iterable: Iterable[str | bytes | bytearray] The iterable that yields the input data sep: str | bytes | bytearray | None The separator that defines items. If ``None``, the items are determined by the line-separators that are built into ``str.splitlines()``. keep_ends: bool If `True`, the item-separator will remain at the end of a yielded item. If `False`, items will not contain the separator. Preserving separators implies a runtime cost, unless the separator is ``None``. Yields ------ str | bytes | bytearray The items determined from the input iterable. The type of the yielded items depends on the type of the first element in ``iterable``. Examples -------- .. code-block:: python >>> from datalad_next.itertools import itemize >>> with open('/etc/passwd', 'rt') as f: # doctest: +SKIP ... print(tuple(itemize(iter(f.read, ''), sep=None))[0:2]) # doctest: +SKIP ('root:x:0:0:root:/root:/bin/bash', 'systemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin') >>> with open('/etc/passwd', 'rt') as f: # doctest: +SKIP ... print(tuple(itemize(iter(f.read, ''), sep=':'))[0:10]) # doctest: +SKIP ('root', 'x', '0', '0', 'root', '/root', '/bin/bash\\nsystemd-timesync', 'x', '497', '497') >>> with open('/etc/passwd', 'rt') as f: # doctest: +SKIP ... print(tuple(itemize(iter(f.read, ''), sep=':', keep_ends=True))[0:10]) # doctest: +SKIP ('root:', 'x:', '0:', '0:', 'root:', '/root:', '/bin/bash\\nsystemd-timesync:', 'x:', '497:', '497:') """ if sep is None: yield from _split_lines(iterable, keep_ends=keep_ends) else: yield from _split_items_with_separator( iterable, sep=sep, keep_ends=keep_ends, ) def _split_items_with_separator(iterable: Iterable[T], sep: T, keep_ends: bool = False, ) -> Generator[T, None, None]: assembled = None for chunk in iterable: if not assembled: assembled = chunk else: assembled += chunk items = assembled.split(sep=sep) if len(items) == 1: continue if assembled.endswith(sep): assembled = None else: assembled = items[-1] items.pop(-1) if keep_ends: for item in items: yield item + sep else: yield from items if assembled: yield assembled def _split_lines(iterable: Iterable[T], keep_ends: bool = False, ) -> Generator[T, None, None]: assembled = None for chunk in iterable: if not assembled: assembled = chunk else: assembled += chunk # We don't know all elements on which python splits lines, therefore we # split once with ends and once without ends. Lines that differ have no # ending lines_with_end = assembled.splitlines(keepends=True) lines_without_end = assembled.splitlines(keepends=False) if lines_with_end[-1] == lines_without_end[-1]: assembled = lines_with_end[-1] lines_with_end.pop(-1) lines_without_end.pop(-1) else: assembled = None if keep_ends: yield from lines_with_end else: yield from lines_without_end if assembled: yield assembled datalad-next-1.4.1/datalad_next/itertools/load_json.py000066400000000000000000000077411462321624600231000ustar00rootroot00000000000000""" Functions that yield JSON objects converted from input items """ from __future__ import annotations import json from typing import ( Any, Generator, Iterable, ) __all__ = ['load_json', 'load_json_with_flag'] def load_json(iterable: Iterable[bytes | str], ) -> Generator[Any, None, None]: """ Convert items yielded by ``iterable`` into JSON objects and yield them This function fetches items from the underlying iterable. 
The items are expected to be ``bytes``, ``str``, or ``bytearry``, and contain one JSON-encoded object. Items are converted into a JSON-object, by feeding them into ``json.loads``. On successful conversion to a JSON-object, ``load_json`` will yield the resulting JSON-object. If the conversion to a JSON-object fails, ``load_json`` will raise a ``json.decoder.JSONDecodeError``: .. code-block:: python >>> from datalad_next.itertools import load_json, load_json_with_flag >>> tuple(load_json(['{"a": 1}'])) ({'a': 1},) >>> tuple(load_json(['{"c": 3'])) # Faulty JSON-encoding, doctest: +SKIP Traceback (most recent call last): ... json.decoder.JSONDecodeError: Expecting ',' delimiter: line 1 column 8 (char 7) Using ``load_json`` together with ``itemize`` allows the processing of JSON-lines data. ``itemize`` will yield a single item for each line and ``load_json`` will convert it into a JSON-object. Note: JSON-decoding is slightly faster if the items of type ``str``. Items of type ``bytes`` or ``bytearray`` will work as well, but processing might be slower. Parameters ---------- iterable: Iterable[bytes | str] The iterable that yields the JSON-strings or -bytestrings that should be parsed and converted into JSON-objects Yields ------ Any The JSON-object that are generated from the data yielded by ``iterable`` Raises ------ json.decoder.JSONDecodeError If the data yielded by ``iterable`` is not a valid JSON-string """ for json_string in iterable: yield json.loads(json_string) def load_json_with_flag( iterable: Iterable[bytes | str], ) -> Generator[tuple[Any | json.decoder.JSONDecodeError, bool], None, None]: """ Convert items from ``iterable`` into JSON objects and a success flag ``load_json_with_flag`` works analogous to ``load_json``, but reports success and failure differently. On successful conversion to a JSON-object, ``load_json_with_flag`` will yield a tuple of two elements. The first element contains the JSON-object, the second element is ``True``. If the conversion to a JSON-object fails, ``load_json_with_flag`` will yield a tuple of two elements, where the first element contains the ``json.decoder.JSONDecodeError`` that was raised during conversion, and the second element is ``False``: .. code-block:: python >>> from datalad_next.itertools import load_json, load_json_with_flag >>> tuple(load_json_with_flag(['{"b": 2}'])) (({'b': 2}, True),) >>> tuple(load_json_with_flag(['{"d": 4'])) # Faulty JSON-encoding ((JSONDecodeError("Expecting ',' delimiter: line 1 column 8 (char 7)"), False),) Parameters ---------- iterable: Iterable[bytes | str] The iterable that yields the JSON-strings or -bytestrings that should be parsed and converted into JSON-objects Yields ------ tuple[Any | json.decoder.JSONDecodeError, bool] A tuple containing of a decoded JSON-object and ``True``, if the JSON string could be decoded correctly. If the JSON string could not be decoded correctly, the tuple will contain the ``json.decoder.JSONDecodeError`` that was raised during JSON-decoding and ``False``. 
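    A typical use is decoding a JSON-lines stream that arrives in arbitrary
    byte chunks, by combining this function with ``itemize`` and
    ``decode_bytes`` (this mirrors the combination exercised in the unit
    tests; the chunk boundaries below are made up for illustration):

    .. code-block:: python

        >>> from datalad_next.itertools import decode_bytes, itemize
        >>> chunks = [b'{"a": 1}\\n{"b"', b': 2}\\n']
        >>> tuple(load_json_with_flag(decode_bytes(itemize(chunks, b'\\n'))))
        (({'a': 1}, True), ({'b': 2}, True))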
""" for json_string in iterable: try: yield json.loads(json_string), True except json.decoder.JSONDecodeError as e: yield e, False datalad-next-1.4.1/datalad_next/itertools/reroute.py000066400000000000000000000202721462321624600226070ustar00rootroot00000000000000""" Functions that allow to route data around upstream iterator """ from __future__ import annotations from typing import ( Any, Callable, Generator, Iterable, ) __all__ = ['StoreOnly', 'route_in', 'route_out'] class StoreOnly: pass def route_out(iterable: Iterable, data_store: list, splitter: Callable[[Any], tuple[Any, Any]], ) -> Generator: """ Route data around the consumer of this iterable :func:`route_out` allows its user to: 1. store data that is received from an iterable, 2. determine whether this data should be yielded to a consumer of ``route_out``, by calling :func:`splitter`. To determine which data is to be yielded to the consumer and which data should only be stored but not yielded, :func:`route_out` calls :func:`splitter`. :func:`splitter` is called for each item of the input iterable, with the item as sole argument. The function should return a tuple of two elements. The first element is the data that is to be yielded to the consumer. The second element is the data that is to be stored in the list ``data_store``. If the first element of the tuple is ``datalad_next.itertools.StoreOnly``, no data is yielded to the consumer. :func:`route_in` can be used to combine data that was previously stored by :func:`route_out` with the data that is yielded by :func:`route_out` and with the data the was not processed, i.e. not yielded by :func:`route_out`. The items yielded by :func:`route_in` will be in the same order in which they were passed into :func:`route_out`, including the items that were not yielded by :func:`route_out` because :func:`splitter` returned ``StoreOnly`` in the first element of the result-tuple. The combination of the two functions :func:`route_out` and :func:`route_in` can be used to "carry" additional data along with data that is processed by iterators. And it can be used to route data around iterators that cannot process certain data. For example, a user has an iterator to divide the number ``2`` by all numbers in a list. The user wants the iterator to process all numbers in a divisor list, except from zeros, In this case :func:`route_out` and :func:`route_in` can be used as follows: .. code-block:: python from math import nan from datalad_next.itertools import route_out, route_in, StoreOnly def splitter(divisor): # if divisor == 0, return `StoreOnly` in the first element of the # result tuple to indicate that route_out should not yield this # element to its consumer return (StoreOnly, divisor) if divisor == 0 else (divisor, divisor) def joiner(processed_data, stored_data): # return nan if processed_data is StoreOnly else processed_data divisors = [0, 1, 0, 2, 0, 3, 0, 4] store = list() r = route_in( map( lambda x: 2.0 / x, route_out( divisors, store, splitter ) ), store, joiner ) print(list(r)) The example about will print ``[nan, 2.0, nan, 1.0, nan, 0.6666666666666666, nan, 0.5]``. Parameters ---------- iterable: Iterable The iterable that yields the input data data_store: list The list that is used to store the data that is routed out splitter: Callable[[Any], tuple[Any, Any | None]] The function that is used to determine which part of the input data, if any, is to be yielded to the consumer and which data is to be stored in the list ``data_store``. 
The function is called for each item of the input iterable with the item as sole argument. It should return a tuple of two elements. If the first element is not ``datalad_next.itertools.StoreOnly``, it is yielded to the consumer. If the first element is ``datalad_next.itertools.StoreOnly``, nothing is yielded to the consumer. The second element is stored in the list ``data_store``. The cardinality of ``data_store`` will be the same as the cardinality of the input iterable. """ for item in iterable: data_to_process, data_to_store = splitter(item) data_store.append((data_to_process, data_to_store)) if data_to_process is not StoreOnly: yield data_to_process def route_in(iterable: Iterable, data_store: list, joiner: Callable[[Any, Any], Any] ) -> Generator: """ Yield previously rerouted data to the consumer This function is the counter-part to :func:`route_out`. It takes the iterable ``iterable`` and a data store given in ``data_store`` and yields items in the same order in which :func:`route_out` received them from its underlying iterable (using the same data store). This includes items that were not yielded by :func:`route_out`, but only stored. :func:`route_in` uses :func:`joiner`-function to determine how stored and optionally processed data should be joined into a single item, which is then yielded by :func:`route_in`. :func:`route_in` calls :func:`joiner` with a 2-tuple. The first element of the tuple is either ``datalad_next.itertools.StoreOnly`` or the next item from the underlying iterator. The second element is the data that was stored in the data store. The result of :func:`joiner` which will be yielded by :func:`route_in`. This module provides a standard joiner-function: :func:`join_with_list` that works with splitter-functions that return a list as second element of the result tuple. The cardinality of ``iterable`` must match the number of processed data elements in the data store. The output cardinality of :func:`route_in` will be the cardinality of the input iterable of the corresponding :func:`route_out`-call. Given the following code: .. code-block:: python store_1 = list() route_in( some_generator( route_out(input_iterable, store_1, splitter_1) ), store_1, joiner_1 ) :func:`route_in` will yield the same number of elements as ``input_iterable``. But, the number of elements processed by ``some_generator`` is determined by the :func:`splitter_1` in :func:`route_out`, i.e. by the number of :func:`splitter_1`-results that have don't have ``datalad_next.itertools.don_process`` as first element. Parameters ---------- iterable: Iterable The iterable that yields the input data. data_store: list The list from which the data that is to be "routed in" is read. joiner: Callable[[Any, Any], Any] A function that determines how the items that are yielded by ``iterable`` should be combined with the corresponding data from ``data_store``, in order to yield the final result. The first argument to ``joiner`` is the item that is yielded by ``iterable``, or ``datalad_next.itertools.StoreOnly`` if no data was processed in the corresponding step. The second argument is the data that was stored in ``data_store`` in the corresponding step. """ for element in iterable: processed, stored = data_store.pop(0) # yield stored-only content until we find an item that was processed while processed is StoreOnly: yield joiner(processed, stored) processed, stored = data_store.pop(0) yield joiner(element, stored) # we reached the end of the incoming iterable. 
# this means that we must not find any remaining items in `data_store` # that indicate that they would have a corresponding item in the # iterable (processed is not StoreOnly) for processed, stored in data_store: assert processed is StoreOnly, \ "iterable did not yield matching item for route-in item, cardinality mismatch?" yield joiner(processed, stored) # rather than pop() in the last loop, we just yielded from the list # now this information is no longer needed del data_store[:] datalad-next-1.4.1/datalad_next/itertools/tests/000077500000000000000000000000001462321624600217075ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/itertools/tests/__init__.py000066400000000000000000000000001462321624600240060ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/itertools/tests/test_align_pattern.py000066400000000000000000000040001462321624600261410ustar00rootroot00000000000000from __future__ import annotations import timeit import pytest from ..align_pattern import align_pattern @pytest.mark.parametrize('data_chunks,pattern,expected', [ (['a', 'b', 'c', 'd', 'e'], 'abc', ['abc', 'de']), (['a', 'b', 'c', 'a', 'b', 'c'], 'abc', ['abc', 'abc']), # Ensure that unaligned pattern prefixes are not keeping data chunks short. (['a', 'b', 'c', 'dddbbb', 'a', 'b', 'x'], 'abc', ['abc', 'dddbbb', 'abx']), # Expect that a trailing minimum length-chunk that ends with a pattern # prefix is not returned as data, but as remainder, if it is not the final # chunk. (['a', 'b', 'c', 'd', 'a'], 'abc', ['abc', 'da']), # Expect the last chunk to be returned as data, if final is True, although # it ends with a pattern prefix. If final is false, the last chunk will be # returned as a remainder, because it ends with a pattern prefix. (['a', 'b', 'c', 'dddbbb', 'a'], 'abc', ['abc', 'dddbbb', 'a']), (['a', 'b', 'c', '9', 'a'], 'abc', ['abc', '9a']), ]) def test_pattern_processor(data_chunks, pattern, expected): assert expected == list(align_pattern(data_chunks, pattern=pattern)) def test_performance(): # Ensure that the performance of align_pattern is acceptable for large # data chunks and patterns. 
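    # The comparison below is informational only: it times plain tuple()
    # consumption of the chunks vs. consumption through align_pattern and
    # prints the ratio, without asserting a hard threshold (so the test
    # stays robust on slow or busy CI workers).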
number = 10 pattern = b'01234' data_chunks = [b'a' * 1000 for _ in range(100 * 1000)] + [pattern] result_base = timeit.timeit( lambda: tuple(data_chunks), number=number, ) result_iter = timeit.timeit( lambda: tuple(align_pattern(data_chunks, pattern=pattern)), number=number, ) print(result_base, result_iter, result_iter / result_base) def test_newline_matches(): pattern = b'----datalad-end-marker-3654137433-rekram-dne-dalatad----\n' chunk1 = b'Have a lot of fun...\n----datalad-end-marker-3654137433-r' chunk2 = b'e' chunk3 = b'kram-dne-dalatad----\n' result = list(align_pattern([chunk1, chunk2, chunk3], pattern)) assert result == [chunk1 + chunk2 + chunk3] datalad-next-1.4.1/datalad_next/itertools/tests/test_decode_bytes.py000066400000000000000000000020661462321624600257550ustar00rootroot00000000000000from __future__ import annotations import pytest from ..decode_bytes import decode_bytes def test_split_decoding(): encoded = 'ö'.encode('utf-8') part_1, part_2 = encoded[:1], encoded[1:] # check that incomplete encodings are caught r = tuple(decode_bytes([b'abc' + part_1, part_2 + b'def'])) assert ''.join(r) == 'abcödef' def test_unfixable_error_decoding(): encoded = 'ö'.encode('utf-8') part_1, part_2 = encoded[:1], encoded[1:] # check that incomplete encodings are caught r = tuple(decode_bytes([b'abc' + part_1 + b'def' + part_1, part_2 + b'ghi'])) assert ''.join(r) == 'abc\\xc3deföghi' def test_single_undecodable_byte(): # check that a single undecodable byte is handled properly r = tuple(decode_bytes([b'\xc3'])) assert ''.join(r) == '\\xc3' with pytest.raises(UnicodeDecodeError): tuple(decode_bytes([b'\xc3'], backslash_replace=False)) def test_no_empty_strings(): # check that empty strings are not yielded r = tuple(decode_bytes([b'\xc3', b'\xb6'])) assert r == ('ö',) datalad-next-1.4.1/datalad_next/itertools/tests/test_itemize.py000066400000000000000000000023111462321624600247630ustar00rootroot00000000000000from __future__ import annotations import pytest from ..itemize import itemize text_chunks = [ 'abc', 'def\n012', '\n', '\n' ] byte_chunks = [chunk.encode() for chunk in text_chunks] text_chunks_other = [chunk.replace('\n', '\r\n') for chunk in text_chunks] byte_chunks_other = [chunk.encode() for chunk in text_chunks_other] @pytest.mark.parametrize( 'input_chunks,separator', [ (text_chunks, '\n'), (byte_chunks, b'\n'), (text_chunks_other, '\r\n'), (byte_chunks_other, b'\r\n') ] ) def test_assembling_and_splitting(input_chunks, separator): empty = input_chunks[0][:0] r = tuple(itemize(input_chunks, None, keep_ends=True)) assert len(r) == 3 assert empty.join(r) == empty.join(input_chunks) r = tuple(itemize(input_chunks, sep=separator, keep_ends=True)) assert len(r) == 3 assert empty.join(r) == empty.join(input_chunks) r = tuple(itemize(input_chunks, sep=separator)) assert len(r) == 3 assert empty.join(r) == empty.join(input_chunks).replace(separator, empty) r = tuple(itemize(input_chunks + input_chunks[:1], sep=separator, keep_ends=True)) assert len(r) == 4 assert r[3] == input_chunks[0] datalad-next-1.4.1/datalad_next/itertools/tests/test_load_json.py000066400000000000000000000025451462321624600252760ustar00rootroot00000000000000from __future__ import annotations import json from json.decoder import JSONDecodeError import pytest from ..load_json import ( load_json, load_json_with_flag, ) from ..decode_bytes import decode_bytes from ..itemize import itemize json_object = { 'list1': [ 'a', 'bäöl', 1 ], 'dict1': { 'x': 123, 'y': 234, 'z': 456, } } correct_json = b'\n'.join( 
json.dumps(x).encode() for x in [json_object] * 10 ) + b'\n' correct_chunks = [ correct_json[i:i + 10] for i in range(0, len(correct_json) + 10, 10) ] faulty_json = correct_json.replace(b'}\n', b'\n') faulty_chunks = [ faulty_json[i:i + 10] for i in range(0, len(correct_json) + 10, 10) ] def test_load_json_on_decoded_bytes(): assert all(x == json_object for x in load_json( decode_bytes(itemize(correct_chunks, b'\n')))) with pytest.raises(JSONDecodeError): list(load_json(decode_bytes(itemize(faulty_chunks, b'\n')))) def test_load_json_with_flag(): assert all( obj == json_object and success is True for (obj, success) in load_json_with_flag(decode_bytes(itemize(correct_chunks, b'\n'))) ) assert all( isinstance(exc, JSONDecodeError) and success is False for (exc, success) in load_json_with_flag(decode_bytes(itemize(faulty_chunks, b'\n'))) ) datalad-next-1.4.1/datalad_next/itertools/tests/test_reroute.py000066400000000000000000000030641462321624600250100ustar00rootroot00000000000000 from more_itertools import intersperse from ..reroute import ( route_in, route_out, StoreOnly ) def test_route_around(): """Test routing of data around a consumer""" # Route 0 around `lambda x: 2.0 / x. store = list() r = route_in( map( lambda divisor: 2.0 / divisor, route_out( intersperse(0, range(2, 20)), store, lambda divisor: (StoreOnly, [divisor]) if divisor == 0 else (divisor, None) ) ), store, lambda processed_data, stored_data: processed_data if processed_data is not StoreOnly else 'divisor is 0' ) # The result should be a list in which every odd element consists of a list # with the elements `[n / 2.0, n]` and every even element consists of a list # with `[dont_process, 0]`, because the `0`s were routed around the # consumer, i.e. around `lambda x: x / 2.0`. assert list(r) == list( intersperse('divisor is 0', map(lambda x: 2.0 / x, range(2, 20))) ) def test_route_no_processing(): """Test routing of data without processing""" store = list() r = route_in( map( lambda x: x, route_out( range(10), store, lambda x: (StoreOnly, x) ) ), store, lambda x, y: y ) assert list(r) == list(range(10)) datalad-next-1.4.1/datalad_next/patches/000077500000000000000000000000001462321624600201505ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/patches/__init__.py000066400000000000000000000037311462321624600222650ustar00rootroot00000000000000from __future__ import annotations from importlib import import_module import logging from typing import Any lgr = logging.getLogger('datalad.ext.next.patches') def apply_patch( modname: str, objname: str | None, attrname: str, patch: Any, msg: str | None = None, expect_attr_present=True, ): """ Monkey patch helper Parameters ---------- modname: str Importable name of the module with the patch target. objname: str or None If `None`, patch will target an attribute of the module, otherwise an object name within the module can be specified. attrname: str Name of the attribute to replace with the patch, either in the module or the given object (see ``objname``) patch: The identified attribute will be replaced with this object. msg: str or None If given, a debug-level log message with this text will be emitted, otherwise a default message is generated. expect_attr_present: bool If True (default) an exception is raised when the target attribute is not found. Returns ------- object The original, unpatched attribute -- if ``expect_attr_present`` is enabled, or ``None`` otherwise. 
Raises ------ ImportError When the target module cannot be imported AttributedError When the target object is not found, or the target attribute is not found. """ orig_attr = None msg = msg or f'Apply patch to {modname}.{attrname}' lgr.debug(msg) # we want to fail on ImportError mod = import_module(modname, package='datalad') if objname: # we want to fail on a missing object obj = getattr(mod, objname) else: # the target is the module itself obj = mod if expect_attr_present: # we want to fail on a missing attribute/object orig_attr = getattr(obj, attrname) setattr(obj, attrname, patch) return orig_attr datalad-next-1.4.1/datalad_next/patches/add_method_url2transport_path.py000066400000000000000000000043141462321624600265510ustar00rootroot00000000000000"""Add the method :meth:`url2transport_path` to RIA IO-abstraction classes This patch adds the method :meth:`url2transport_path` to the IO-abstraction classes: :class:`datalad.distributed.ora_remote.LocalIO`, and to the class :class:`datalad.distributed.ora_remote.HTTPRemoteIO`. This method is required by the patches that add Windows-client support to RIA-code. It converts internally used abstract paths to concrete paths that are platform- andIO-abstraction specific and on which IO-operations cam be performed. """ from __future__ import annotations import logging from re import compile from pathlib import ( Path, PurePosixPath, ) from datalad_next.consts import on_windows from . import apply_patch # The methods are patched into the ora_remote/ria_remote. Use the same logger. lgr = logging.getLogger('datalad.customremotes.ria_remote') drive_letter_matcher = compile('^/[A-Z]:') def str2windows_path(url_path: PurePosixPath): path_str = str(url_path) match = drive_letter_matcher.match(path_str) if match: if path_str[3] == '/': return Path(*([f'{path_str[1]}:', '/'] + path_str[4:].split('/'))) else: lgr.warning(f'Non-absolute Windows-path detected: {path_str}') return Path(*([f'{path_str[1]}:'] + path_str[3:].split('/'))) else: return Path(path_str) def local_io_url2transport_path( self, url_path: PurePosixPath ) -> Path | PurePosixPath: assert isinstance(url_path, PurePosixPath) if on_windows: return str2windows_path(url_path) else: return Path(url_path) def http_remote_io_url2transport_path( self, url_path: PurePosixPath ) -> Path | PurePosixPath: assert isinstance(url_path, PurePosixPath) return url_path # Add a `url2transport_path`-method to `ora_remote.LocalIO` apply_patch( 'datalad.distributed.ora_remote', 'LocalIO', 'url2transport_path', local_io_url2transport_path, expect_attr_present=False, ) # Add a `url2transport_path`-method to `ora_remote.HTTPRemoteIO` apply_patch( 'datalad.distributed.ora_remote', 'HTTPRemoteIO', 'url2transport_path', http_remote_io_url2transport_path, expect_attr_present=False, ) datalad-next-1.4.1/datalad_next/patches/annexrepo.py000066400000000000000000000101151462321624600225170ustar00rootroot00000000000000"""Credential support for ``AnnexRepo.enable_remote()`` and ``siblings enable`` Supported targets for automatic credential deployments are determined by ``needs_specialremote_credential_envpatch()``. At the time of this writing this includes the git-annex built-in remote types ``webdav``, ``s3``, and ``glacier``. This patch also changes the function to raise its custom exception with the context of an original underlying exception for better error reporting. 
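The sketch below illustrates the effect from a user's perspective; the
sibling name ``mywebdav`` and the presence of a matching stored credential
are assumptions made for illustration only:

.. code-block:: python

    >>> from datalad_next.datasets import Dataset  # doctest: +SKIP
    >>> ds = Dataset('.')  # doctest: +SKIP
    >>> # a credential matching the remote's properties was stored beforehand
    >>> # (e.g., with the `credentials` command); enabling the remote deploys
    >>> # it into the environment of the underlying git-annex call, so no
    >>> # interactive prompt is needed
    >>> ds.repo.enable_remote('mywebdav')  # doctest: +SKIP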
""" import logging import os import re from unittest.mock import patch from datalad.support.exceptions import ( AccessDeniedError, AccessFailedError, ) from datalad_next.exceptions import ( CommandError, ) from datalad_next.utils import ( CredentialManager, ensure_list, get_specialremote_credential_envpatch, get_specialremote_param_dict, get_specialremote_credential_properties, needs_specialremote_credential_envpatch, ) from . import apply_patch # reuse logger from -core, despite the unconventional name lgr = logging.getLogger('datalad.annex') # This function is taken from datalad-core@2ed709613ecde8218a215dcb7d74b4a352825685 # datalad/support/annexrepo.py:AnnexRepo def annexRepo__enable_remote(self, name, options=None, env=None): """Enables use of an existing special remote Parameters ---------- name: str name, the special remote was created with options: list, optional """ # MIH thinks there should be no `env` argument at all # https://github.com/datalad/datalad/issues/5162 # if it would not be there, this whole dance is pretty much # obsolete env = env or self._git_runner.env # an enableremote can do pretty much anything, including a type change. # in order to be able to determine whether credentials *will* be needed, # we have to look ahead and form the special remote parameters that will # be there at the end -- more or less # pull info for present config sp_remotes = {v['name']: dict(v, uuid=k) for k, v in self.get_special_remotes().items()} remote_info = sp_remotes.get(name, {}) # TODO if remote_info is empty, we can fail right here if options: # and now update with given params remote_info.update(get_specialremote_param_dict(options)) # careful here, `siblings()` also calls this for regular remotes, check # for a known type if 'type' in remote_info \ and needs_specialremote_credential_envpatch(remote_info['type']): # see if we can identify any matching credentials credprops = get_specialremote_credential_properties(remote_info) credman = None credspec = None if credprops: credman = CredentialManager(self.config) creds = credman.query(_sortby='last-used', **credprops) if creds: # found one credspec = creds[0] # TODO manual entry could be supported here too! (also see at the end) if env: env.copy() if credspec: credpatch = get_specialremote_credential_envpatch( remote_info['type'], credspec[1]) if credpatch: if not env: env = os.environ.copy() env.update(credpatch) try: with patch.object(self._git_runner, 'env', env): # TODO: outputs are nohow used/displayed. Eventually convert to # to a generator style yielding our "dict records" self.call_annex(['enableremote', name] + ensure_list(options)) except CommandError as e: if re.match(r'.*StatusCodeException.*statusCode = 401', e.stderr): raise AccessDeniedError(e.stderr) from e elif 'FailedConnectionException' in e.stderr: raise AccessFailedError(e.stderr) from e else: raise e self.config.reload() # TODO when manual credential entry is supported, # implement store-after-success here apply_patch( 'datalad.support.annexrepo', 'AnnexRepo', 'enable_remote', annexRepo__enable_remote, msg='Apply datalad-next patch to annexrepo.py:AnnexRepo.enable_remote') datalad-next-1.4.1/datalad_next/patches/cli_configoverrides.py000066400000000000000000000040331462321624600245410ustar00rootroot00000000000000"""Post DataLad config overrides CLI/ENV as GIT_CONFIG items in process ENV This enables their propagation to any subprocess. This includes the specification of overrides via the ``datalad -c ...`` option of the main CLI entrypoint. 
""" from datalad.config import _update_from_env as _update_from_datalad_env from datalad.cli.helpers import _parse_overrides_from_cmdline from datalad_next.config.utils import ( get_gitconfig_items_from_env, set_gitconfig_items_in_env, ) from . import apply_patch def parse_overrides_from_cmdline(cmdlineargs): # read from cmdlineargs first to error on any syntax issues # before any other processing cli_overrides = _parse_overrides_from_cmdline(cmdlineargs) # reuse datalad-core implementation of datalad-specific ENV parsing # for config items overrides = {} _update_from_datalad_env(overrides) # let CLI settings override any ENV -- in-line with the behavior of Git overrides.update(cli_overrides) # read any existing GIT_CONFIG ENV vars and superimpose our # overrides on them, repost in ENV using git-native approach. # This will apply the overrides to any git(-config) calls # in this process and any subprocess gc_overrides = get_gitconfig_items_from_env() gc_overrides.update(overrides) set_gitconfig_items_in_env(gc_overrides) # we do not actually disclose any of these overrides. # the CLI runs a `datalad.cfg.reload(force=True)` # immediately after executing this function and thereby # pulls in the overrides we just posted into the ENV # here. This change reduced the scope of # `datalad.cfg.overrides` to be mere instance overrides # and no longer process overrides. This rectifies the mismatch # between appearance and actual impact of this information # in the ConfigManager return {} apply_patch( 'datalad.cli.helpers', None, '_parse_overrides_from_cmdline', parse_overrides_from_cmdline, msg='Enable posting DataLad config overrides CLI/ENV as ' 'GIT_CONFIG items in process ENV', ) datalad-next-1.4.1/datalad_next/patches/commanderror.py000066400000000000000000000024561462321624600232210ustar00rootroot00000000000000"""Improve ``CommandError`` rendering and add ``returncode`` alias for ``code`` This patch does two things: It overwrites ``__repr__``, otherwise ``CommandError` would use ``RuntimeError``'s variant and ignore all additional structured information except for ``.msg`` -- which is frequently empty and confuses with a `CommandError('')` display. It adds a ``returncode`` alias for ``code``. This unifies return code access between ``CommandError`` and `Popen``-like objects, which usually have a ``returncode`` attribute. """ from datalad.runner.exception import CommandError def commanderror_repr(self) -> str: return self.to_str() CommandError.__repr__ = commanderror_repr # Basic alias idea taken from here: # _commanderror_aliases = { 'returncode': 'code', } def commanderror_getattr(self, item): return object.__getattribute__(self, _commanderror_aliases.get(item, item)) def commanderror_setattr(self, key, value): if key == '_aliases': raise AttributeError('Cannot set `_aliases`') return object.__setattr__(self, _commanderror_aliases.get(key, key), value) CommandError.__getattr__ = commanderror_getattr CommandError.__setattr__ = commanderror_setattr datalad-next-1.4.1/datalad_next/patches/common_cfg.py000066400000000000000000000007531462321624600226360ustar00rootroot00000000000000"""Change the default of ``datalad.annex.retry`` to ``1`` This prevents unconditional retries, and thereby improves the legibility of errors (now only one error instead of three identical errors). This change does not override user-settings, only the default. 
""" from datalad.support.extensions import has_config if has_config('datalad.annex.retry'): from datalad.interface.common_cfg import definitions retrycfg = definitions['datalad.annex.retry'] retrycfg['default'] = 1 datalad-next-1.4.1/datalad_next/patches/configuration.py000066400000000000000000000153011462321624600233710ustar00rootroot00000000000000"""Enable ``configuration()`` to query ``global`` scope without a dataset """ __docformat__ = 'restructuredtext' import logging from datalad import cfg as dlcfg from datalad.distribution.dataset import require_dataset from datalad_next.commands import ( build_doc, datasetmethod, eval_results, get_status_dict, ) from datalad.interface.common_cfg import definitions as cfg_defs from datalad.local import configuration as conf_mod from datalad.local.configuration import ( config_actions, _dump, _get, _set, _unset, ) from datalad_next.exceptions import NoDatasetFound from datalad_next.utils import ensure_list from datalad_next.datasets import ( Dataset, ) lgr = logging.getLogger('datalad.local.configuration') @build_doc class Configuration(conf_mod.Configuration): """""" @staticmethod @datasetmethod(name='configuration') @eval_results def __call__( action='dump', spec=None, *, scope=None, dataset=None, recursive=False, recursion_limit=None): # check conditions # - global and recursion makes no sense if action == 'dump': if scope: raise ValueError( 'Scope selection is not supported for dumping') # normalize variable specifications specs = [] for s in ensure_list(spec): if isinstance(s, tuple): specs.append((str(s[0]), str(s[1]))) elif '=' not in s: specs.append((str(s),)) else: specs.append(tuple(s.split('=', 1))) if action == 'set': missing_values = [s[0] for s in specs if len(s) < 2] if missing_values: raise ValueError( 'Values must be provided for all configuration ' 'settings. Missing: {}'.format(missing_values)) invalid_names = [s[0] for s in specs if '.' not in s[0]] if invalid_names: raise ValueError( 'Name must contain a section (i.e. "section.name"). ' 'Invalid: {}'.format(invalid_names)) ds = None if scope != 'global' or recursive: try: ds = require_dataset( dataset, check_installed=True, purpose='configure') except NoDatasetFound: if action not in ('dump', 'get') or dataset: raise res_kwargs = dict( action='configuration', logger=lgr, ) if ds: res_kwargs['refds'] = ds.path yield from configuration(action, scope, specs, res_kwargs, ds) if not recursive: return for subds in ds.subdatasets( state='present', recursive=True, recursion_limit=recursion_limit, on_failure='ignore', return_type='generator', result_renderer='disabled'): yield from configuration( action, scope, specs, res_kwargs, Dataset(subds['path'])) def configuration(action, scope, specs, res_kwargs, ds=None): # go with the more specific dataset configmanager, if we are # operating on a dataset cfg = dlcfg if ds is None else ds.config if action not in config_actions: raise ValueError("Unsupported action '{}'".format(action)) if action == 'dump': if not specs: # dumping is querying for all known keys specs = [ (n,) for n in sorted( set(cfg_defs.keys()).union(cfg.keys())) ] scope = None for spec in specs: if '.' not in spec[0]: yield get_status_dict( ds=ds, status='error', message=( "Configuration key without a section: '%s'", spec[0], ), **res_kwargs) continue # TODO without get-all there is little sense in having add #if action == 'add': # res = _add(cfg, scope, spec) if action == 'get': res = _get(cfg, scope, spec[0]) # `None` is a value that cannot be set in the config. 
# if it is returned, it indicates that no value was set # we need to communicate that back, because a None value # cannot be reported as such by the CLI # (only via, e.g. JSON, encoding). # It makes sense to communicate that getting this specific # configuration item is "impossible" (because it is not set). # if a caller wants to tollerate this scenario, they can # set on_failure='ignore' if res.get('value') is None: res['status'] = 'impossible' res['message'] = ( 'key %r not set in configuration%s', res['name'], f" scope '{scope}'" if scope else '', ) elif action == 'dump': res = _dump(cfg, spec[0]) # TODO this should be there, if we want to be comprehensive # however, we turned this off by default in the config manager # because we hardly use it, and the handling in ConfigManager # is not really well done. #elif action == 'get-all': # res = _get_all(cfg, scope, spec) elif action == 'set': res = _set(cfg, scope, *spec) elif action == 'unset': res = _unset(cfg, scope, spec[0]) if ds: res['path'] = ds.path if 'status' not in res: res['status'] = 'ok' yield dict(res_kwargs, **res) if action in ('add', 'set', 'unset'): # we perform a single reload, rather than one for each modification # TODO: can we detect a call from cmdline? We could skip the reload. cfg.reload(force=True) conf_mod.Configuration.__call__ = Configuration.__call__ conf_mod.Configuration._params_['scope']._doc = """\ scope for getting or setting configuration. If no scope is declared for a query, all configuration sources (including overrides via environment variables) are considered according to the normal rules of precedence. A 'get' action can be constrained to scope 'branch', otherwise 'global' is used when not operating on a dataset, or 'local' (including 'global', when operating on a dataset. For action 'dump', a scope selection is ignored and all available scopes are considered.""" conf_mod.Configuration.__call__.__doc__ = None conf_mod.Configuration = build_doc(conf_mod.Configuration) datalad-next-1.4.1/datalad_next/patches/create_sibling_ghlike.py000066400000000000000000000125031462321624600250200ustar00rootroot00000000000000"""Improved credential handling for ``create_sibling_()`` This patch makes the storage of a newly entered credential conditional on a successful authorization, in the spirit of `datalad/datalad#3126 `__. Moreover, stored credentials now contain a ``realm`` property that identified the API endpoint. This makes it possible to identify candidates of suitable credentials without having to specific their name, similar to a request context url used by the old providers setup. This automatic realm-based credential lookup is now also implemented. When no credential name is specified, the most recently used credential matching the API realm will be used automatically. If determined like this, it will be tested for successful authorization, and will then be stored again with an updated ``last-used`` timestamp. 
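The realm-based lookup sketched below is what the patched code performs
internally; the API URL is merely an example:

.. code-block:: python

    >>> from datalad_next.utils import CredentialManager
    >>> credman = CredentialManager()  # doctest: +SKIP
    >>> # most recently used credential stored for this API realm, if any
    >>> creds = credman.query(realm='https://api.github.com', _sortby='last-used')  # doctest: +SKIP
    >>> if creds:  # doctest: +SKIP
    ...     credential_name, credential = creds[0]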
""" import logging from urllib.parse import urlparse from datalad.distributed.create_sibling_ghlike import _GitHubLike from datalad.downloaders.http import DEFAULT_USER_AGENT from datalad_next.exceptions import CapturedException from datalad_next.utils import CredentialManager # use same logger as -core lgr = logging.getLogger('datalad.distributed.create_sibling_ghlike') def _set_request_headers(self, credential_name, auth_info, require_token): credman = CredentialManager() from_query = False credential = None if not credential_name: # get the most recent credential by realm, because none was identified creds = credman.query(realm=self.api_url, _sortby='last-used') if creds: # found one, also assign the name to be able to update # it below credential_name, credential = creds[0] from_query = True if not credential_name: # if we have no name given, fall back on a generated one # that may exist from times before realms were recorded # properly, otherwise we would not be here credential_name = urlparse(self.api_url).netloc if not credential: # no credential yet try: credential = credman.get( credential_name, _prompt=auth_info, type='token', realm=self.api_url, ) if credential is None or 'secret' not in credential: raise ValueError('No credential found') except Exception as e: CapturedException(e) lgr.debug('Token retrieval failed: %s', e) lgr.warning( 'Cannot determine authorization token for %s', credential_name) if require_token: raise ValueError( f'Authorization required for {self.fullname}, ' f'cannot find token for a credential {credential_name}.') else: lgr.warning("No token found for credential '%s'", credential_name) credential = {} self.request_headers = { 'user-agent': DEFAULT_USER_AGENT, 'authorization': f'token {credential.get("secret", "NO-TOKEN-AVAILABLE")}', } edited_credential = credential.pop('_edited', False) if from_query or edited_credential: # if the credential was determined based on the api realm or edited, # test it so we know it (still) works before we save/update it try: self.authenticated_user except Exception as e: raise ValueError( ("{state} credential {name!r} did not yield successful " "authorization. {advice}").format( state='Entered' if edited_credential else "Auto-selected", name=credential_name, advice='Please try again with a valid credential' if edited_credential else 'Please select a different credential via the ' '`credential` option of this command, ' 'or remove/edit the credential with the DataLad command ' '`credentials`.' ) ) from e # this went well, store try: credman.set( credential_name, _lastused=True, **credential, ) except Exception as e: # we do not want to crash for any failure to store a # credential lgr.warn( 'Exception raised when storing credential %r %r: %s', credential_name, credential, CapturedException(e), ) # patch the core class lgr.debug('Apply datalad-next patch to create_sibling_ghlike.py:_GitHubLike._set_request_headers') _GitHubLike._set_request_headers = _set_request_headers # update docs _GitHubLike.create_sibling_params['credential']._doc = """\ name of the credential providing a personal access token to be used for authorization. The token can be supplied via configuration setting 'datalad.credential..secret', or environment variable DATALAD_CREDENTIAL__SECRET, or will be queried from the active credential store using the provided name. If none is provided, the last-used token for the API URL realm will be used. 
If no matching credential exists, a credential named after the hostname part of the API URL is tried as a last fallback.""" datalad-next-1.4.1/datalad_next/patches/create_sibling_gitlab.py000066400000000000000000000373771462321624600250370ustar00rootroot00000000000000"""Streamline user experience Discontinue advertizing the ``hierarchy`` layout, and better explain limitations of the command. """ import datalad.distributed.create_sibling_gitlab as mod_gitlab # provide some symbols from it for the patch below CapturedException = mod_gitlab.CapturedException GitLabSite = mod_gitlab.GitLabSite known_access_labels = mod_gitlab.known_access_labels lgr = mod_gitlab.lgr from datalad_next.commands import build_doc from . import apply_patch known_layout_labels = ('collection', 'flat') command_doc = """ Create dataset sibling at a GitLab site An existing GitLab project, or a project created via the GitLab web interface can be configured as a sibling with the :command:`siblings` command. Alternatively, this command can create a GitLab project at any location/path a given user has appropriate permissions for. This is particularly helpful for recursive sibling creation for subdatasets. API access and authentication are implemented via python-gitlab, and all its features are supported. A particular GitLab site must be configured in a named section of a python-gitlab.cfg file (see https://python-gitlab.readthedocs.io/en/stable/cli.html#configuration for details), such as:: [mygit] url = https://git.example.com api_version = 4 private_token = abcdefghijklmnopqrst Subsequently, this site is identified by its name ('mygit' in the example above). (Recursive) sibling creation for all, or a selected subset of subdatasets is supported with two different project layouts (see --layout): "flat" All datasets are placed as GitLab projects in the same group. The project name of the top-level dataset follows the configured datalad.gitlab-SITENAME-project configuration. The project names of contained subdatasets extend the configured name with the subdatasets' s relative path within the root dataset, with all path separator characters replaced by '-'. This path separator is configurable (see Configuration). "collection" A new group is created for the dataset hierarchy, following the datalad.gitlab-SITENAME-project configuration. The root dataset is placed in a "project" project inside this group, and all nested subdatasets are represented inside the group using a "flat" layout. The root datasets project name is configurable (see Configuration). This command cannot create root-level groups! To use this layout for a collection located in the root of an account, create the target group via the GitLab web UI first. GitLab cannot host dataset content. However, in combination with other data sources (and siblings), publishing a dataset to GitLab can facilitate distribution and exchange, while still allowing any dataset consumer to obtain actual data content from alternative sources. *Configuration* Many configuration switches and options for GitLab sibling creation can be provided arguments to the command. However, it is also possible to specify a particular setup in a dataset's configuration. This is particularly important when managing large collections of datasets. 
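For example, a superdataset could ship a setup like the following in its
``.datalad/config`` (the site, layout, access method, and project names are
illustrative)::

    [datalad]
        gitlab-default-site = mygit
        gitlab-mygit-layout = flat
        gitlab-mygit-access = ssh
        gitlab-mygit-project = myaccount/myproject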
Configuration options are: "datalad.gitlab-default-site" Name of the default GitLab site (see --site) "datalad.gitlab-SITENAME-siblingname" Name of the sibling configured for the local dataset that points to the GitLab instance SITENAME (see --name) "datalad.gitlab-SITENAME-layout" Project layout used at the GitLab instance SITENAME (see --layout) "datalad.gitlab-SITENAME-access" Access method used for the GitLab instance SITENAME (see --access) "datalad.gitlab-SITENAME-project" Project "location/path" used for a datasets at GitLab instance SITENAME (see --project). Configuring this is useful for deriving project paths for subdatasets, relative to superdataset. The root-level group ("location") needs to be created beforehand via GitLab's web interface. "datalad.gitlab-default-projectname" The collection layout publishes (sub)datasets as projects with a custom name. The default name "project" can be overridden with this configuration. "datalad.gitlab-default-pathseparator" The flat and collection layout represent subdatasets with project names that correspond to the path, with the regular path separator replaced with a "-": superdataset-subdataset. This configuration can override this default separator. This command can be configured with "datalad.create-sibling-ghlike.extra-remote-settings.NETLOC.KEY=VALUE" in order to add any local KEY = VALUE configuration to the created sibling in the local `.git/config` file. NETLOC is the domain of the Gitlab instance to apply the configuration for. This leads to a behavior that is equivalent to calling datalad's ``siblings('configure', ...)``||``siblings configure`` command with the respective KEY-VALUE pair after creating the sibling. The configuration, like any other, could be set at user- or system level, so users do not need to add this configuration to every sibling created with the service at NETLOC themselves. """ # # This replacement function is taken from # https://github.com/datalad/datalad/pull/7410 # @7c83f4ac282dc3b48be8439dbbbe0f0c2c57d467 # The actual change over the patch target is only four lines, but the spaghetti # nature of the function does not allow for a lean patch. 
def _proc_dataset(refds, ds, site, project, remotename, layout, existing, access, dry_run, siteobjs, depends, description): # basic result setup res_kwargs = dict( action='create_sibling_gitlab', refds=refds.path, path=ds.path, type='dataset', logger=lgr, ) if description: res_kwargs['description'] = description if site is None: # always try pulling the base config from a parent dataset # even if paths were given (may be overwritten later) basecfgsite = ds.config.get('datalad.gitlab-default-site', None) # let the dataset config overwrite the target site, if none # was given site = refds.config.get( 'datalad.gitlab-default-site', basecfgsite) \ if site is None else site if site is None: # this means the most top-level dataset has no idea about # gitlab, and no site was specified as an argument # fail rather then give an error result, as this is very # unlikely to be intentional raise ValueError( 'No GitLab site was specified (--site) or configured ' 'in {} (datalad.gitlab.default-site)'.format(ds)) res_kwargs['site'] = site # determine target remote name, unless given if remotename is None: remotename_var = 'datalad.gitlab-{}-siblingname'.format(site) remotename = ds.config.get( remotename_var, # use config from parent, if needed refds.config.get( remotename_var, # fall back on site name, if nothing else can be used site)) res_kwargs['sibling'] = remotename # check against existing remotes dremotes = { r['name']: r for r in ds.siblings( action='query', # fastest possible get_annex_info=False, recursive=False, return_type='generator', result_renderer='disabled') } if remotename in dremotes and existing not in ['replace', 'reconfigure']: # we already know a sibling with this name yield dict( res_kwargs, status='error' if existing == 'error' else 'notneeded', message=('already has a configured sibling "%s"', remotename), ) return if layout is None: # figure out the layout of projects on the site # use the reference dataset as default, and fall back # on 'collection' as the most generic method of representing # the filesystem in a group/subproject structure layout_var = 'datalad.gitlab-{}-layout'.format(site) layout = ds.config.get( layout_var, refds.config.get( layout_var, 'collection')) if layout not in known_layout_labels: raise ValueError( "Unknown site layout '{}' given or configured, " "known ones are: {}".format(layout, known_layout_labels)) if access is None: access_var = 'datalad.gitlab-{}-access'.format(site) access = ds.config.get( access_var, refds.config.get( access_var, 'http')) if access not in known_access_labels: raise ValueError( "Unknown site access '{}' given or configured, " "known ones are: {}".format(access, known_access_labels)) pathsep = ds.config.get("datalad.gitlab-default-pathseparator", None) or "-" project_stub = \ ds.config.get("datalad.gitlab-default-projectname", None) or "project" project_var = 'datalad.gitlab-{}-project'.format(site) process_root = refds == ds if project is None: # look for a specific config in the dataset project = ds.config.get(project_var, None) if project and process_root and layout != 'flat': # the root of a collection project = f'{project}/{project_stub}' elif project is None and not process_root: # check if we can build one from the refds config ref_project = refds.config.get(project_var, None) if ref_project: # layout-specific derivation of a path from # the reference dataset configuration rproject = ds.pathobj.relative_to(refds.pathobj).as_posix() if layout == 'collection': project = '{}/{}'.format( ref_project, rproject.replace('/', 
pathsep)) else: project = '{}-{}'.format( ref_project, rproject.replace('/', pathsep)) if project is None: yield dict( res_kwargs, status='error', message='No project name/location specified, and no configuration ' 'to derive one', ) return res_kwargs['project'] = project if dry_run: # this is as far as we can get without talking to GitLab yield dict( res_kwargs, status='ok', dryrun=True, ) return # and now talk to GitLab for real site_api = siteobjs[site] if site in siteobjs else GitLabSite(site) site_project = site_api.get_project(project) if site_project is None: try: site_project = site_api.create_project(project, description) # report success message = "sibling repository '%s' created at %s",\ remotename, site_project.get('web_url', None) yield dict( res_kwargs, # relay all attributes project_attributes=site_project, message=message, status='ok', ) except Exception as e: ce = CapturedException(e) yield dict( res_kwargs, # relay all attributes status='error', message=('Failed to create GitLab project: %s', ce), exception=ce ) return else: # there already is a project if existing == 'error': # be nice and only actually error if there is a real mismatch if remotename not in dremotes: yield dict( res_kwargs, project_attributes=site_project, status='error', message=( "There is already a project at '%s' on site '%s', " "but no sibling with name '%s' is configured, " "maybe use --existing=reconfigure", project, site, remotename, ) ) return elif access in ('ssh', 'ssh+http') \ and dremotes[remotename].get( 'url', None) != site_project.get( # use False as a default so that there is a # mismatch, complain if both are missing 'ssh_url_to_repo', False): yield dict( res_kwargs, project_attributes=site_project, status='error', message=( "There is already a project at '%s' on site '%s', " "but SSH access URL '%s' does not match '%s', " "maybe use --existing=reconfigure", project, site, dremotes[remotename].get('url', None), site_project.get('ssh_url_to_repo', None) ) ) return elif access == 'http' \ and dremotes[remotename].get( 'url', None) != site_project.get( # use False as a default so that there is a # mismatch, veen if both are missing 'http_url_to_repo', False): yield dict( res_kwargs, project_attributes=site_project, status='error', message=( "There is already a project at '%s' on site '%s', " "but HTTP access URL '%s' does not match '%s', " "maybe use --existing=reconfigure", project, site, dremotes[remotename].get('url', None), site_project.get('http_url_to_repo', None) ) ) return yield dict( res_kwargs, project_attributes=site_project, status='notneeded', message=( "There is already a project at '%s' on site '%s'", project, site, ) ) # first make sure that annex doesn't touch this one # but respect any existing config ignore_var = 'remote.{}.annex-ignore'.format(remotename) if ignore_var not in ds.config: ds.config.add(ignore_var, 'true', scope='local') for res in ds.siblings( 'configure', name=remotename, url=site_project['http_url_to_repo'] if access in ('http', 'ssh+http') else site_project['ssh_url_to_repo'], pushurl=site_project['ssh_url_to_repo'] if access in ('ssh', 'ssh+http') else None, recursive=False, publish_depends=depends, result_renderer='disabled', return_type='generator'): yield res apply_patch( 'datalad.distributed.create_sibling_gitlab', None, '_proc_dataset', _proc_dataset) apply_patch( 'datalad.distributed.create_sibling_gitlab', None, 'known_layout_labels', known_layout_labels, msg='Stop advertising discontinued "hierarchy" layout for ' '`create_siblign_gitlab()`') # 
also put in effect for the constraint, add None to address limitation that # the default also needs to be covered for datalad-core mod_gitlab.CreateSiblingGitlab._params_['layout'].constraints._allowed = \ (None,) + known_layout_labels # rebuild command docs mod_gitlab.CreateSiblingGitlab.__call__.__doc__ = None mod_gitlab.CreateSiblingGitlab.__doc__ = command_doc mod_gitlab.CreateSiblingGitlab = build_doc(mod_gitlab.CreateSiblingGitlab) datalad-next-1.4.1/datalad_next/patches/customremotes_main.py000066400000000000000000000141201462321624600244350ustar00rootroot00000000000000"""Connect ``log_progress``-style progress reporting to git-annex, add `close()` This patch introduces a dedicated progress log handler as a proxy between standard datalad progress logging and a git-annex special remote as an approach to report (data transfer) progress to a git-annex parent process. This functionality is only (to be) used in dedicated special remote processes. This patch also adds a standard `close()` handler to special remotes, and calls that handler in a context manager to ensure releasing any resources. This replaces the custom `stop()` method, which is undocumented and only used by the `datalad-archive` special remote. This patch also adds code that allows to patch a class that is already loaded """ from contextlib import closing import logging from typing import ( Dict, Type, ) from . import apply_patch from datalad_next.annexremotes import SpecialRemote def only_progress_logrecords(record: logging.LogRecord) -> bool: """Log filter to ignore any non-progress log message""" return hasattr(record, 'dlm_progress') class AnnexProgressLogHandler(logging.Handler): """Log handler to funnel progress logs to git-annex For this purpose the handler wraps :class:`datalad_next.annexremotes.SpecialRemote` instance. When it receives progress log messages, it converts any increment reports to absolute values, and then calls the special remote's ``send_progress()`` method, which will cause the respective progress update protocol message to be issued. .. note:: Git-annex only supports "context-free" progress reporting. When a progress report is send, it is assumed to be on a currently running transfer. Only a single integer value can be reported, and it corresponds to the number of bytes transferred. This approach implemented here cannot distinguish progress reports that corresponding to git-annex triggered data transfers and other (potentially co-occurring) operations. The likelihood of unrelated operations reporting progress is relatively low, because this handler is only supposed to be used in dedicated special remote processes, but remains possible. This implementation is set up to support tracking multiple processes, and could report one of them selectively. However, at present any progress update is relayed to git-annex directly. This could lead to confusing and non-linear progress reporting. """ def __init__(self, annexremote: SpecialRemote): super().__init__() self.annexremote = annexremote self._ptrackers: Dict[str, int] = {} def emit(self, record: logging.LogRecord): """Process a log record Any incoming log record, compliant with http://docs.datalad.org/design/progress_reporting.html is processed. Increment reports are converted to absolute values, and each update is eventually passed on to special remote, which issues a progress report to git-annex. 
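        For illustration (hypothetical values, not taken from a real
        transfer): a record announcing an initial count of 0, followed by
        two increment updates of 512 bytes each, results in
        ``send_progress(0)``, ``send_progress(512)``, and
        ``send_progress(1024)`` being issued for that process.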
""" if not hasattr(record, 'dlm_progress'): # a filter should have been used to prevent this call return maint = getattr(record, 'dlm_progress_maint', None) if maint in ('clear', 'refresh'): return pid = getattr(record, 'dlm_progress') update = getattr(record, 'dlm_progress_update', None) if pid not in self._ptrackers: # this is new prg = getattr(record, 'dlm_progress_initial', 0) self._ptrackers[pid] = prg self.annexremote.send_progress(prg) elif update is None: # not an update -> done self._ptrackers.pop(pid) else: prg = self._ptrackers[pid] if getattr(record, 'dlm_progress_increment', False): prg += update else: prg = update self._ptrackers[pid] = prg self.annexremote.send_progress(prg) def patched_underscore_main(args: list, cls: Type[SpecialRemote]): """Full replacement for datalad.customremotes.main._main() Its only purpose is to create a running instance of a SpecialRemote. The only difference to the original in datalad-core is that once this instance exists, it is linked to a log handler that converts incoming progress log messages to the equivalent annex protocol progress reports. This additional log handler is a strict addition to the log handling setup established at this point. There should be no interference with any other log message processing. .. seealso:: :class:`AnnexProgressLogHandler` """ assert cls is not None from annexremote import Master # Reload the class, to allow `cls` itself to be patched. new_module = __import__(cls.__module__, fromlist=[cls.__name__]) cls = getattr(new_module, cls.__name__) master = Master() # this context manager use relies on patching in a close() below with closing(cls(master)) as remote: master.LinkRemote(remote) # we add an additional handler to the logger to deal with # progress reports dlroot_lgr = logging.getLogger('datalad') phandler = AnnexProgressLogHandler(remote) phandler.addFilter(only_progress_logrecords) dlroot_lgr.addHandler(phandler) # run the remote master.Listen() # cleanup special case datalad-core `archive` remote # nobody should do this, use `close()` if hasattr(remote, 'stop'): remote.stop() # a default cleanup handler for CoreBaseSpecialRemote # this enables us to use a standard `closing()` context manager with # special remotes def specialremote_defaultclose_noop(self): pass apply_patch( 'datalad.customremotes', 'SpecialRemote', 'close', specialremote_defaultclose_noop, msg='Retrofit `SpecialRemote` with a `close()` handler', expect_attr_present=False, ) apply_patch( 'datalad.customremotes.main', None, '_main', patched_underscore_main, msg='Replace special remote _main() ' "with datalad-next's progress logging enabled variant") datalad-next-1.4.1/datalad_next/patches/distribution_dataset.py000066400000000000000000000022251462321624600247470ustar00rootroot00000000000000"""``DatasetParameter`` support for ``resolve_path()`` This is the standard result of ``EnsureDataset``, which unlike the datalad-core version actually carries a ``Dataset`` instance. This patch ensure the traditional handling of "dataset instance from a string-type parameter in this context. """ import logging from . 
import apply_patch # use same logger as -core, looks weird but is correct lgr = logging.getLogger('datalad.dataset') def resolve_path(path, ds=None, ds_resolved=None): if hasattr(ds, 'auto_instance_from_path'): # this instance came from datalad-next's EnsureDataset, # pretend that we resolved the dataset by hand return orig_resolve_path( ds=ds.auto_instance_from_path, ds_resolved=ds, ) else: return orig_resolve_path(ds=ds, ds_resolved=ds_resolved) # we need to preserve it as the workhorse, this patch only wraps around it orig_resolve_path = apply_patch( 'datalad.distribution.dataset', None, 'resolve_path', resolve_path, msg='Apply datalad-next patch to distribution.dataset:resolve_path') # reuse docs resolve_path.__doc__ = orig_resolve_path.__doc__ datalad-next-1.4.1/datalad_next/patches/enabled.py000066400000000000000000000006621462321624600221200ustar00rootroot00000000000000from . import ( cli_configoverrides, commanderror, common_cfg, annexrepo, configuration, create_sibling_ghlike, interface_utils, push_to_export_remote, push_optimize, siblings, test_keyring, customremotes_main, create_sibling_gitlab, run, update, # the following two patches have been taken verbatim from datalad-ria ssh_exec, sshconnector, patch_ria_ora, ) datalad-next-1.4.1/datalad_next/patches/fix_ria_ora_tests.py000066400000000000000000001110201462321624600242210ustar00rootroot00000000000000"""Patch ria-, ora-, ria_utils-, and clone-tests to work with modified ria_utils The ria-utils-patches use an abstract path representation for RIA-store elements. This patch adapts the tests that use `ria_utils.create_store` and `ria_utils.create_ds_in_store` to these modifications. """ from __future__ import annotations import logging import shutil import stat from pathlib import ( Path, PurePosixPath, ) from urllib.request import pathname2url from datalad.api import ( Dataset, clone, create_sibling_ria, ) from datalad.cmd import ( WitlessRunner as Runner, NoCapture, ) from datalad.customremotes.ria_utils import ( UnknownLayoutVersion, create_ds_in_store, create_store, get_layout_locations, ) from datalad.distributed.ora_remote import ( LocalIO, SSHRemoteIO, ) from datalad.distributed.tests.ria_utils import ( common_init_opts, get_all_files, populate_dataset, ) from datalad.support.exceptions import ( CommandError, IncompleteResultsError, ) from datalad.support.network import get_local_file_url from datalad.tests.utils_pytest import ( SkipTest, assert_equal, assert_false, assert_in, assert_not_in, assert_raises, assert_repo_status, assert_result_count, assert_status, assert_true, create_tree, has_symlink_capability, known_failure_githubci_win, known_failure_windows, rmtree, serve_path_via_http, skip_if_adjusted_branch, swallow_logs, with_tempfile, ) from . import apply_patch def local_path2pure_posix_path(path: Path | str): return PurePosixPath(pathname2url(str(path))) # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @with_tempfile def patched__postclonetest_prepare(lcl, storepath, storepath2, link): from datalad.customremotes.ria_utils import ( create_ds_in_store, create_store, get_layout_locations, ) from datalad.distributed.ora_remote import LocalIO create_tree(lcl, tree={ 'ds': { 'test.txt': 'some', 'subdir': { 'subds': {'testsub.txt': 'somemore'}, 'subgit': {'testgit.txt': 'even more'} }, }, }) lcl = Path(lcl) storepath = Path(storepath) storepath2 = Path(storepath2) # PATCH: introduce `ppp_storepath` and `ppp_storepath2` and use them instead # of `storepath` and `storepath2`. 
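    # Rough illustration of the conversion (not part of the original test):
    # on POSIX, local_path2pure_posix_path('/tmp/store') yields
    # PurePosixPath('/tmp/store'); on Windows a path like r'C:\store' would
    # come out roughly as PurePosixPath('/C:/store'), i.e. the drive letter
    # becomes a leading path component of the abstract store path.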
ppp_storepath = local_path2pure_posix_path(storepath) ppp_storepath2 = local_path2pure_posix_path(storepath2) link = Path(link) link.symlink_to(storepath) # create a local dataset with a subdataset subds = Dataset(lcl / 'ds' / 'subdir' / 'subds').create(force=True) subds.save() # add a plain git dataset as well subgit = Dataset(lcl / 'ds' / 'subdir' / 'subgit').create(force=True, annex=False) subgit.save() ds = Dataset(lcl / 'ds').create(force=True) ds.save(version_tag='original') assert_repo_status(ds.path) io = LocalIO() # Have a second store with valid ORA remote. This should not interfere with # reconfiguration of the first one, when that second store is not the one we # clone from. However, don't push data into it for easier get-based testing # later on. # Doing this first, so datasets in "first"/primary store know about this. create_store(io, ppp_storepath2, '1') url2 = "ria+{}".format(get_local_file_url(str(storepath2))) for d in (ds, subds, subgit): create_ds_in_store(io, ppp_storepath2, d.id, '2', '1') d.create_sibling_ria(url2, "anotherstore", new_store_ok=True) d.push('.', to='anotherstore', data='nothing') store2_loc, _, _ = get_layout_locations(1, ppp_storepath2, d.id) Runner(cwd=str(store2_loc)).run(['git', 'update-server-info']) # Now the store to clone from: create_store(io, ppp_storepath, '1') # URL to use for upload. Point is, that this should be invalid for the clone # so that autoenable would fail. Therefore let it be based on a to be # deleted symlink upl_url = "ria+{}".format(get_local_file_url(str(link))) for d in (ds, subds, subgit): # TODO: create-sibling-ria required for config! => adapt to RF'd # creation (missed on rebase?) create_ds_in_store(io, ppp_storepath, d.id, '2', '1') d.create_sibling_ria(upl_url, "store", new_store_ok=True) if d is not subgit: # Now, simulate the problem by reconfiguring the special remote to # not be autoenabled. # Note, however, that the actual intention is a URL, that isn't # valid from the point of view of the clone (doesn't resolve, no # credentials, etc.) and therefore autoenabling on git-annex-init # when datalad-cloning would fail to succeed. Runner(cwd=d.path).run(['git', 'annex', 'enableremote', 'store-storage', 'autoenable=false']) d.push('.', to='store') store_loc, _, _ = get_layout_locations(1, ppp_storepath, d.id) Runner(cwd=str(store_loc)).run(['git', 'update-server-info']) link.unlink() # We should now have a store with datasets that have an autoenabled ORA # remote relying on an inaccessible URL. # datalad-clone is supposed to reconfigure based on the URL we cloned from. # Test this feature for cloning via HTTP, SSH and FILE URLs. return ds.id # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @known_failure_githubci_win # in datalad/git-annex as e.g. of 20201218 @with_tempfile(mkdir=True) @with_tempfile @with_tempfile def patched_test_ria_postclone_noannex(dspath=None, storepath=None, clonepath=None): # Test for gh-5186: Cloning from local FS, shouldn't lead to annex # initializing origin. dspath = Path(dspath) storepath = Path(storepath) clonepath = Path(clonepath) # PATCH: introduce `ppp_storepath` and use it instead of `storepath`. 
ppp_storepath = local_path2pure_posix_path(storepath) from datalad.customremotes.ria_utils import ( create_ds_in_store, create_store, get_layout_locations, ) from datalad.distributed.ora_remote import LocalIO # First create a dataset in a RIA store the standard way somefile = dspath / 'a_file.txt' somefile.write_text('irrelevant') ds = Dataset(dspath).create(force=True) io = LocalIO() create_store(io, ppp_storepath, '1') lcl_url = "ria+{}".format(get_local_file_url(str(storepath))) create_ds_in_store(io, ppp_storepath, ds.id, '2', '1') ds.create_sibling_ria(lcl_url, "store", new_store_ok=True) ds.push('.', to='store') # now, remove annex/ tree from store in order to see, that clone # doesn't cause annex to recreate it. store_loc, _, _ = get_layout_locations(1, storepath, ds.id) annex = store_loc / 'annex' rmtree(str(annex)) assert_false(annex.exists()) clone_url = get_local_file_url(str(storepath), compatibility='git') + \ '#{}'.format(ds.id) clone("ria+{}".format(clone_url), clonepath) # no need to test the cloning itself - we do that over and over in here # bare repo in store still has no local annex: assert_false(annex.exists()) # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @with_tempfile def patched_test_setup_store(io_cls, io_args, store=None): io = io_cls(*io_args) store = Path(store) # PATCH: introduce `ppp_store` and use it instead of `store` ppp_store = local_path2pure_posix_path(store) version_file = store / 'ria-layout-version' error_logs = store / 'error_logs' # invalid version raises: assert_raises(UnknownLayoutVersion, create_store, io, ppp_store, '2') # non-existing path should work: create_store(io, ppp_store, '1') assert_true(version_file.exists()) assert_true(error_logs.exists()) assert_true(error_logs.is_dir()) assert_equal([f for f in error_logs.iterdir()], []) # empty target directory should work as well: rmtree(str(store)) store.mkdir(exist_ok=False) create_store(io, ppp_store, '1') assert_true(version_file.exists()) assert_true(error_logs.exists()) assert_true(error_logs.is_dir()) assert_equal([f for f in error_logs.iterdir()], []) # re-execution also fine: create_store(io, ppp_store, '1') # but version conflict with existing target isn't: version_file.write_text("2|unknownflags\n") assert_raises(ValueError, create_store, io, ppp_store, '1') # TODO: check output reporting conflicting version "2" # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @with_tempfile def patched_test_setup_ds_in_store(io_cls, io_args, store=None): io = io_cls(*io_args) store = Path(store) # PATCH: introduce `ppp_store` and use it instead of `store` ppp_store = local_path2pure_posix_path(store) # ATM create_ds_in_store doesn't care what kind of ID is provided dsid = "abc123456" ds_path = store / dsid[:3] / dsid[3:] # store layout version 1 version_file = ds_path / 'ria-layout-version' archives = ds_path / 'archives' objects = ds_path / 'annex' / 'objects' git_config = ds_path / 'config' # invalid store version: assert_raises(UnknownLayoutVersion, create_ds_in_store, io, ppp_store, dsid, '1', 'abc') # invalid obj version: assert_raises(UnknownLayoutVersion, create_ds_in_store, io, ppp_store, dsid, 'abc', '1') # version 1 create_store(io, ppp_store, '1') create_ds_in_store(io, ppp_store, dsid, '1', '1') for p in [ds_path, archives, objects]: assert_true(p.is_dir(), msg="Not a directory: %s" % str(p)) for p in [version_file]: assert_true(p.is_file(), msg="Not a file: %s" % str(p)) assert_equal(version_file.read_text(), "1\n") # conflicting version exists at 
target: assert_raises(ValueError, create_ds_in_store, io, ppp_store, dsid, '2', '1') # version 2 # Note: The only difference between version 1 and 2 are supposed to be the # key paths (dirhashlower vs mixed), which has nothing to do with # setup routine. rmtree(str(store)) create_store(io, ppp_store, '1') create_ds_in_store(io, ppp_store, dsid, '2', '1') for p in [ds_path, archives, objects]: assert_true(p.is_dir(), msg="Not a directory: %s" % str(p)) for p in [version_file]: assert_true(p.is_file(), msg="Not a file: %s" % str(p)) assert_equal(version_file.read_text(), "2\n") # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @with_tempfile(mkdir=True) @serve_path_via_http @with_tempfile def patched_test_initremote(store_path=None, store_url=None, ds_path=None): ds = Dataset(ds_path).create() store_path = Path(store_path) # PATCH: introduce `ppp_store_path` and use it instead of `store_path` ppp_store_path = local_path2pure_posix_path(store_path) url = "ria+" + store_url init_opts = common_init_opts + ['url={}'.format(url)] # fail when there's no RIA store at the destination assert_raises(CommandError, ds.repo.init_remote, 'ora-remote', options=init_opts) # Doesn't actually create a remote if it fails assert_not_in('ora-remote', [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()] ) # now make it a store io = LocalIO() create_store(io, ppp_store_path, '1') create_ds_in_store(io, ppp_store_path, ds.id, '2', '1') # fails on non-RIA URL assert_raises(CommandError, ds.repo.init_remote, 'ora-remote', options=common_init_opts + ['url={}' ''.format(store_path.as_uri())] ) # Doesn't actually create a remote if it fails assert_not_in('ora-remote', [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()] ) ds.repo.init_remote('ora-remote', options=init_opts) assert_in('ora-remote', [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()] ) assert_repo_status(ds.path) # git-annex:remote.log should have: # - url # - common_init_opts # - archive_id (which equals ds id) remote_log = ds.repo.call_git(['cat-file', 'blob', 'git-annex:remote.log'], read_only=True) assert_in("url={}".format(url), remote_log) [assert_in(c, remote_log) for c in common_init_opts] assert_in("archive-id={}".format(ds.id), remote_log) # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 # TODO: on crippled FS copytree to populate store doesn't seem to work. # Or may be it's just the serving via HTTP that doesn't work. # Either way, after copytree and fsck, whereis doesn't report # the store as an available source. @skip_if_adjusted_branch @known_failure_windows # see gh-4469 @with_tempfile(mkdir=True) @serve_path_via_http @with_tempfile def patched_test_read_access(store_path=None, store_url=None, ds_path=None): ds = Dataset(ds_path).create() populate_dataset(ds) files = [Path('one.txt'), Path('subdir') / 'two'] store_path = Path(store_path) # PATCH: introduce `ppp_store_path` and use it instead of `store_path` ppp_store_path = local_path2pure_posix_path(store_path) url = "ria+" + store_url init_opts = common_init_opts + ['url={}'.format(url)] io = LocalIO() create_store(io, ppp_store_path, '1') create_ds_in_store(io, ppp_store_path, ds.id, '2', '1') ds.repo.init_remote('ora-remote', options=init_opts) fsck_results = ds.repo.fsck(remote='ora-remote', fast=True) # Note: Failures in the special remote will show up as a success=False # result for fsck -> the call itself would not fail. 
for r in fsck_results: if "note" in r: # we could simply assert "note" to not be in r, but we want proper # error reporting - content of note, not just its unexpected # existence. assert_equal(r["success"], "true", msg="git-annex-fsck failed with ORA over HTTP: %s" % r) assert_equal(r["error-messages"], []) store_uuid = ds.siblings(name='ora-remote', return_type='item-or-list', result_renderer='disabled')['annex-uuid'] here_uuid = ds.siblings(name='here', return_type='item-or-list', result_renderer='disabled')['annex-uuid'] # nothing in store yet: for f in files: known_sources = ds.repo.whereis(str(f)) assert_in(here_uuid, known_sources) assert_not_in(store_uuid, known_sources) annex_obj_target = str(store_path / ds.id[:3] / ds.id[3:] / 'annex' / 'objects') shutil.rmtree(annex_obj_target) shutil.copytree(src=str(ds.repo.dot_git / 'annex' / 'objects'), dst=annex_obj_target) ds.repo.fsck(remote='ora-remote', fast=True) # all in store now: for f in files: known_sources = ds.repo.whereis(str(f)) assert_in(here_uuid, known_sources) assert_in(store_uuid, known_sources) ds.drop('.') res = ds.get('.') assert_equal(len(res), 4) assert_result_count(res, 4, status='ok', type='file', action='get', message="from ora-remote...") # try whether the reported access URL is correct one_url = ds.repo.whereis('one.txt', output='full' )[store_uuid]['urls'].pop() assert_status('ok', ds.download_url(urls=[one_url], path=str(ds.pathobj / 'dummy'))) # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @with_tempfile @with_tempfile def patched_test_initremote_basic(url, io, store, ds_path, link): ds_path = Path(ds_path) store = Path(store) # PATCH: introduce `ppp_store` and use it instead of `store` ppp_store = local_path2pure_posix_path(store) link = Path(link) ds = Dataset(ds_path).create() populate_dataset(ds) init_opts = common_init_opts + ['url={}'.format(url)] # fails on non-existing storage location assert_raises(CommandError, ds.repo.init_remote, 'ria-remote', options=init_opts) # Doesn't actually create a remote if it fails assert_not_in('ria-remote', [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()] ) # fails on non-RIA URL assert_raises(CommandError, ds.repo.init_remote, 'ria-remote', options=common_init_opts + ['url={}'.format(store.as_uri())] ) # Doesn't actually create a remote if it fails assert_not_in('ria-remote', [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()] ) # set up store: create_store(io, ppp_store, '1') # still fails, since ds isn't setup in the store assert_raises(CommandError, ds.repo.init_remote, 'ria-remote', options=init_opts) # Doesn't actually create a remote if it fails assert_not_in('ria-remote', [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()] ) # set up the dataset as well create_ds_in_store(io, ppp_store, ds.id, '2', '1') # now should work ds.repo.init_remote('ria-remote', options=init_opts) assert_in('ria-remote', [cfg['name'] for uuid, cfg in ds.repo.get_special_remotes().items()] ) assert_repo_status(ds.path) # git-annex:remote.log should have: # - url # - common_init_opts # - archive_id (which equals ds id) remote_log = ds.repo.call_git(['cat-file', 'blob', 'git-annex:remote.log'], read_only=True) assert_in("url={}".format(url), remote_log) [assert_in(c, remote_log) for c in common_init_opts] assert_in("archive-id={}".format(ds.id), remote_log) # re-configure with invalid URL should fail: assert_raises( CommandError, ds.repo.call_annex, ['enableremote', 'ria-remote'] + common_init_opts + [ 
'url=ria+file:///non-existing']) # but re-configure with valid URL should work if has_symlink_capability(): link.symlink_to(store) new_url = 'ria+{}'.format(link.as_uri()) ds.repo.call_annex( ['enableremote', 'ria-remote'] + common_init_opts + [ 'url={}'.format(new_url)]) # git-annex:remote.log should have: # - url # - common_init_opts # - archive_id (which equals ds id) remote_log = ds.repo.call_git(['cat-file', 'blob', 'git-annex:remote.log'], read_only=True) assert_in("url={}".format(new_url), remote_log) [assert_in(c, remote_log) for c in common_init_opts] assert_in("archive-id={}".format(ds.id), remote_log) # we can deal with --sameas, which leads to a special remote not having a # 'name' property, but only a 'sameas-name'. See gh-4259 try: ds.repo.init_remote('ora2', options=init_opts + ['--sameas', 'ria-remote']) except CommandError as e: if 'Invalid option `--sameas' in e.stderr: # annex too old - doesn't know --sameas pass else: raise # TODO: - check output of failures to verify it's failing the right way # - might require to run initremote directly to get the output # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @known_failure_windows # see gh-4469 @with_tempfile @with_tempfile @with_tempfile def patched_test_remote_layout(host, dspath, store, archiv_store): dspath = Path(dspath) store = Path(store) archiv_store = Path(archiv_store) # PATCH: introduce `ppp_store` and use it instead of `store` ppp_store = local_path2pure_posix_path(store) ppp_archiv_store = local_path2pure_posix_path(archiv_store) ds = Dataset(dspath).create() populate_dataset(ds) assert_repo_status(ds.path) # set up store: io = SSHRemoteIO(host) if host else LocalIO() if host: store_url = "ria+ssh://{host}{path}".format(host=host, path=store) arch_url = "ria+ssh://{host}{path}".format(host=host, path=archiv_store) else: store_url = "ria+{}".format(store.as_uri()) arch_url = "ria+{}".format(archiv_store.as_uri()) create_store(io, ppp_store, '1') # TODO: Re-establish test for version 1 # version 2: dirhash create_ds_in_store(io, ppp_store, ds.id, '2', '1') # add special remote init_opts = common_init_opts + ['url={}'.format(store_url)] ds.repo.init_remote('store', options=init_opts) # copy files into the RIA store ds.push('.', to='store') # we should see the exact same annex object tree dsgit_dir, archive_dir, dsobj_dir = \ get_layout_locations(1, store, ds.id) store_objects = get_all_files(dsobj_dir) local_objects = get_all_files(ds.pathobj / '.git' / 'annex' / 'objects') assert_equal(len(store_objects), 4) if not ds.repo.is_managed_branch(): # with managed branches the local repo uses hashdirlower instead # TODO: However, with dataset layout version 1 this should therefore # work on adjusted branch the same way # TODO: Wonder whether export-archive-ora should account for that and # rehash according to target layout. 
assert_equal(sorted([p for p in store_objects]), sorted([p for p in local_objects]) ) if not io.get_7z(): raise SkipTest("No 7z available in RIA store") # we can simply pack up the content of the remote into a # 7z archive and place it in the right location to get a functional # archive remote create_store(io, ppp_archiv_store, '1') create_ds_in_store(io, ppp_archiv_store, ds.id, '2', '1') whereis = ds.repo.whereis('one.txt') dsgit_dir, archive_dir, dsobj_dir = \ get_layout_locations(1, archiv_store, ds.id) ds.export_archive_ora(archive_dir / 'archive.7z') init_opts = common_init_opts + ['url={}'.format(arch_url)] ds.repo.init_remote('archive', options=init_opts) # now fsck the new remote to get the new special remote indexed ds.repo.fsck(remote='archive', fast=True) assert_equal(len(ds.repo.whereis('one.txt')), len(whereis) + 1) # test creating an archive with filters on files ds.export_archive_ora(archive_dir / 'archive2.7z', annex_wanted='(include=*.txt)') # test with wanted expression of a specific remote ds.repo.set_preferred_content("wanted", "include=subdir/*", remote="store") ds.export_archive_ora(archive_dir / 'archive3.7z', remote="store") # test with the current sha ds.export_archive_ora( archive_dir / 'archive4.7z', froms=ds.repo.get_revisions()[1], ) # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @known_failure_windows # see gh-4469 @with_tempfile @with_tempfile def patched_test_version_check(host, dspath, store): dspath = Path(dspath) store = Path(store) # PATCH: introduce `ppp_store` and use it instead of `store` ppp_store = local_path2pure_posix_path(store) ds = Dataset(dspath).create() populate_dataset(ds) assert_repo_status(ds.path) # set up store: io = SSHRemoteIO(host) if host else LocalIO() if host: store_url = "ria+ssh://{host}{path}".format(host=host, path=store) else: store_url = "ria+{}".format(store.as_uri()) create_store(io, ppp_store, '1') # TODO: Re-establish test for version 1 # version 2: dirhash create_ds_in_store(io, ppp_store, ds.id, '2', '1') # add special remote init_opts = common_init_opts + ['url={}'.format(store_url)] ds.repo.init_remote('store', options=init_opts) ds.push('.', to='store') # check version files remote_ds_tree_version_file = store / 'ria-layout-version' dsgit_dir, archive_dir, dsobj_dir = \ get_layout_locations(1, store, ds.id) remote_obj_tree_version_file = dsgit_dir / 'ria-layout-version' assert_true(remote_ds_tree_version_file.exists()) assert_true(remote_obj_tree_version_file.exists()) with open(str(remote_ds_tree_version_file), 'r') as f: assert_equal(f.read().strip(), '1') with open(str(remote_obj_tree_version_file), 'r') as f: assert_equal(f.read().strip(), '2') # Accessing the remote should not yield any output regarding versioning, # since it's the "correct" version. Note that "fsck" is an arbitrary choice. # We need just something to talk to the special remote. 
with swallow_logs(new_level=logging.INFO) as cml: ds.repo.fsck(remote='store', fast=True) # TODO: For some reason didn't get cml.assert_logged to assert # "nothing was logged" assert not cml.out # Now fake-change the version with open(str(remote_obj_tree_version_file), 'w') as f: f.write('X\n') # Now we should see a message about it with swallow_logs(new_level=logging.INFO) as cml: ds.repo.fsck(remote='store', fast=True) cml.assert_logged(level="INFO", msg="Remote object tree reports version X", regex=False) # reading still works: ds.drop('.') assert_status('ok', ds.get('.')) # but writing doesn't: with open(str(Path(ds.path) / 'new_file'), 'w') as f: f.write("arbitrary addition") ds.save(message="Add a new_file") with assert_raises((CommandError, IncompleteResultsError)): ds.push('new_file', to='store') # However, we can force it by configuration ds.config.add("annex.ora-remote.store.force-write", "true", scope='local') ds.push('new_file', to='store') # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 # git-annex-testremote is way too slow on crippled FS. # Use is_managed_branch() as a proxy and skip only here # instead of in a decorator @skip_if_adjusted_branch @known_failure_windows # see gh-4469 @with_tempfile @with_tempfile def patched_test_gitannex(host, store, dspath): dspath = Path(dspath) store = Path(store) # PATCH: introduce `ppp_store` and use it instead of `store` ppp_store = local_path2pure_posix_path(store) ds = Dataset(dspath).create() populate_dataset(ds) assert_repo_status(ds.path) # set up store: io = SSHRemoteIO(host) if host else LocalIO() if host: store_url = "ria+ssh://{host}{path}".format(host=host, path=store) else: store_url = "ria+{}".format(store.as_uri()) create_store(io, ppp_store, '1') # TODO: Re-establish test for version 1 # version 2: dirhash create_ds_in_store(io, ppp_store, ds.id, '2', '1') # add special remote init_opts = common_init_opts + ['url={}'.format(store_url)] ds.repo.init_remote('store', options=init_opts) from datalad.support.external_versions import external_versions if '8.20200330' < external_versions['cmd:annex'] < '8.20200624': # https://git-annex.branchable.com/bugs/testremote_breeds_way_too_many_instances_of_the_externals_remote/?updated raise SkipTest( "git-annex might lead to overwhelming number of external " "special remote instances") # run git-annex-testremote # note, that we don't want to capture output. If something goes wrong we # want to see it in test build's output log. 
ds.repo._call_annex(['testremote', 'store'], protocol=NoCapture) # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 @known_failure_windows @with_tempfile @with_tempfile @with_tempfile def patched_test_push_url(storepath=None, dspath=None, blockfile=None): dspath = Path(dspath) store = Path(storepath) # PATCH: introduce `ppp_store` and use it instead of `store` ppp_store = local_path2pure_posix_path(store) blockfile = Path(blockfile) blockfile.touch() ds = Dataset(dspath).create() populate_dataset(ds) assert_repo_status(ds.path) repo = ds.repo # set up store: io = LocalIO() store_url = "ria+{}".format(store.as_uri()) create_store(io, ppp_store, '1') create_ds_in_store(io, ppp_store, ds.id, '2', '1') # initremote fails with invalid url (not a ria+ URL): invalid_url = (store.parent / "non-existent").as_uri() init_opts = common_init_opts + ['url={}'.format(store_url), 'push-url={}'.format(invalid_url)] assert_raises(CommandError, ds.repo.init_remote, 'store', options=init_opts) # initremote succeeds with valid but inaccessible URL (pointing to a file # instead of a store): block_url = "ria+" + blockfile.as_uri() init_opts = common_init_opts + ['url={}'.format(store_url), 'push-url={}'.format(block_url)] repo.init_remote('store', options=init_opts) store_uuid = ds.siblings(name='store', return_type='item-or-list')['annex-uuid'] here_uuid = ds.siblings(name='here', return_type='item-or-list')['annex-uuid'] # but a push will fail: assert_raises(CommandError, ds.repo.call_annex, ['copy', 'one.txt', '--to', 'store']) # reconfigure w/ local overwrite: repo.config.add("remote.store.ora-push-url", store_url, scope='local') # push works now: repo.call_annex(['copy', 'one.txt', '--to', 'store']) # remove again (config and file from store) repo.call_annex(['move', 'one.txt', '--from', 'store']) repo.config.unset("remote.store.ora-push-url", scope='local') repo.call_annex(['fsck', '-f', 'store']) known_sources = repo.whereis('one.txt') assert_in(here_uuid, known_sources) assert_not_in(store_uuid, known_sources) # reconfigure (this time committed) init_opts = common_init_opts + ['url={}'.format(store_url), 'push-url={}'.format(store_url)] repo.enable_remote('store', options=init_opts) # push works now: repo.call_annex(['copy', 'one.txt', '--to', 'store']) known_sources = repo.whereis('one.txt') assert_in(here_uuid, known_sources) assert_in(store_uuid, known_sources) # taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 # Skipping on adjusted branch as a proxy for crippledFS. Write permissions of # the owner on a directory can't be revoked on VFAT. "adjusted branch" is a # bit broad but covers the CI cases. And everything RIA/ORA doesn't currently # properly run on crippled/windows anyway. Needs to be more precise when # RF'ing will hopefully lead to support on windows in principle. @skip_if_adjusted_branch @known_failure_windows @with_tempfile @with_tempfile def patched_test_permission(host, storepath, dspath): # Test whether ORA correctly revokes and obtains write permissions within # the annex object tree. That is: Revoke after ORA pushed a key to store # in order to allow the object tree to safely be used with an ephemeral # clone. And on removal obtain write permissions, like annex would # internally on a drop (but be sure to restore if something went wrong). 
dspath = Path(dspath) storepath = Path(storepath) # PATCH: introduce `ppp_storepath` and use it instead of `storepath` ppp_storepath = local_path2pure_posix_path(storepath) ds = Dataset(dspath).create() populate_dataset(ds) ds.save() assert_repo_status(ds.path) testfile = 'one.txt' # set up store: io = SSHRemoteIO(host) if host else LocalIO() if host: store_url = "ria+ssh://{host}{path}".format(host=host, path=storepath) else: store_url = "ria+{}".format(storepath.as_uri()) create_store(io, ppp_storepath, '1') create_ds_in_store(io, ppp_storepath, ds.id, '2', '1') _, _, obj_tree = get_layout_locations(1, storepath, ds.id) assert_true(obj_tree.is_dir()) file_key_in_store = obj_tree / 'X9' / '6J' / 'MD5E-s8--7e55db001d319a94b0b713529a756623.txt' / 'MD5E-s8--7e55db001d319a94b0b713529a756623.txt' init_opts = common_init_opts + ['url={}'.format(store_url)] ds.repo.init_remote('store', options=init_opts) store_uuid = ds.siblings(name='store', return_type='item-or-list')['annex-uuid'] here_uuid = ds.siblings(name='here', return_type='item-or-list')['annex-uuid'] known_sources = ds.repo.whereis(testfile) assert_in(here_uuid, known_sources) assert_not_in(store_uuid, known_sources) assert_false(file_key_in_store.exists()) ds.repo.call_annex(['copy', testfile, '--to', 'store']) known_sources = ds.repo.whereis(testfile) assert_in(here_uuid, known_sources) assert_in(store_uuid, known_sources) assert_true(file_key_in_store.exists()) # Revoke write permissions from parent dir in-store to test whether we # still can drop (if we can obtain the permissions). Note, that this has # no effect on VFAT. file_key_in_store.parent.chmod(file_key_in_store.parent.stat().st_mode & ~stat.S_IWUSR) # we can't directly delete; key in store should be protected assert_raises(PermissionError, file_key_in_store.unlink) # ORA can still drop, since it obtains permission to: ds.repo.call_annex(['drop', testfile, '--from', 'store']) known_sources = ds.repo.whereis(testfile) assert_in(here_uuid, known_sources) assert_not_in(store_uuid, known_sources) assert_false(file_key_in_store.exists()) # Overwrite `_postclonetest_prepare` to handle paths properly apply_patch( 'datalad.core.distributed.tests.test_clone', None, '_postclonetest_prepare', patched__postclonetest_prepare, 'modify _postclonetest_prepare to use PurePosixPath-arguments ' 'in RIA-methodes' ) apply_patch( 'datalad.core.distributed.tests.test_clone', None, 'test_ria_postclone_noannex', patched_test_ria_postclone_noannex, 'modify test_ria_postclone_noannex to use PurePosixPath-arguments ' 'in RIA-methods' ) apply_patch( 'datalad.customremotes.tests.test_ria_utils', None, '_test_setup_store', patched_test_setup_store, 'modify _test_setup_store to use PurePosixPath-arguments in RIA-methods' ) apply_patch( 'datalad.customremotes.tests.test_ria_utils', None, '_test_setup_ds_in_store', patched_test_setup_ds_in_store, 'modify _test_setup_ds_in_store to use PurePosixPath-arguments ' 'in RIA-methods' ) apply_patch( 'datalad.distributed.tests.test_ora_http', None, 'test_initremote', patched_test_initremote, 'modify test_initremote to use PurePosixPath-arguments in RIA-methods' ) apply_patch( 'datalad.distributed.tests.test_ora_http', None, 'test_read_access', patched_test_read_access, 'modify test_read_access to use PurePosixPath-arguments in RIA-methods' ) apply_patch( 'datalad.distributed.tests.test_ria_basics', None, '_test_initremote_basic', patched_test_initremote_basic, 'modify _test_initremote_basic to use PurePosixPath-arguments ' 'in RIA-methods' ) apply_patch( 
'datalad.distributed.tests.test_ria_basics', None, '_test_remote_layout', patched_test_remote_layout, 'modify _test_remote_layout to use PurePosixPath-arguments in RIA-methods' ) apply_patch( 'datalad.distributed.tests.test_ria_basics', None, '_test_version_check', patched_test_version_check, 'modify _test_version_check to use PurePosixPath-arguments in RIA-methods' ) apply_patch( 'datalad.distributed.tests.test_ria_basics', None, '_test_gitannex', patched_test_gitannex, 'modify _test_gitannex to use PurePosixPath-arguments in RIA-methods' ) apply_patch( 'datalad.distributed.tests.test_ria_basics', None, 'test_push_url', patched_test_push_url, 'modify test_push_url to use PurePosixPath-arguments in RIA-methods' ) apply_patch( 'datalad.distributed.tests.test_ria_basics', None, '_test_permission', patched_test_permission, 'modify _test_permission to use PurePosixPath-arguments in RIA-methods' ) datalad-next-1.4.1/datalad_next/patches/interface_utils.py000066400000000000000000000254741462321624600237160ustar00rootroot00000000000000"""Uniform pre-execution parameter validation for commands With this patch commands can now opt-in to receive fully validated parameters. This can substantially simplify the implementation complexity of a command at the expense of a more elaborate specification of the structural and semantic properties of the parameters. For details on implementing validation for individual commands see :class:`datalad_next.commands.ValidatedInterface`. """ import logging from typing import ( Callable, Dict, Generator, ) from datalad import cfg as dlcfg from datalad.core.local.resulthooks import ( get_jsonhooks_from_config, match_jsonhook2result, run_jsonhook, ) from datalad.interface.results import known_result_xfms from datalad.interface.utils import ( anInterface, get_result_filter, keep_result, render_action_summary, xfm_result, _process_results, ) from datalad_next.exceptions import IncompleteResultsError from . import apply_patch from datalad_next.constraints import DatasetParameter # use same logger as -core lgr = logging.getLogger('datalad.interface.utils') # this is a replacement for datalad.interface.base.get_allargs_as_kwargs # it reports which arguments were at their respective defaults def get_allargs_as_kwargs(call, args, kwargs): """Generate a kwargs dict from a call signature and ``*args``, ``**kwargs`` Basically resolving the argnames for all positional arguments, and resolving the defaults for all kwargs that are not given in a kwargs dict Returns ------- (dict, set, set) The first return value is a mapping of argument names to their respective values. The second return value in the tuple is a set of argument names for which the effective value is identical to the default declared in the signature of the callable. The third value is a set with names of all mandatory arguments, whether or not they are included in the returned mapping. 
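    For example, for a callable ``def f(a, b=1)`` invoked as ``f(1)``, the
    returned tuple would be ``({'a': 1, 'b': 1}, {'b'}, {'a', 'b'})``.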
""" from datalad_next.utils import getargspec argspec = getargspec(call, include_kwonlyargs=True) defaults = argspec.defaults nargs = len(argspec.args) defaults = defaults or [] # ensure it is a list and not None assert (nargs >= len(defaults)) # map any args to their name argmap = list(zip(argspec.args[:len(args)], args)) kwargs_ = dict(argmap) # map defaults of kwargs to their names (update below) default_map = dict(zip(argspec.args[-len(defaults):], defaults)) for k, v in default_map.items(): if k not in kwargs_: kwargs_[k] = v # update with provided kwarg args kwargs_.update(kwargs) # determine which arguments still have values identical to their declared # defaults at_default = set( k for k in kwargs_ if k in default_map and default_map[k] == kwargs_[k] ) # XXX we cannot assert the following, because our own highlevel # API commands support more kwargs than what is discoverable # from their signature... #assert (nargs == len(kwargs_)) return ( # argument name/value mapping kwargs_, # names of arguments that are at their default at_default, # names of mandatory arguments (set for uniformity) set(argspec.args), ) # This function interface is taken from # datalad-core@209bc319db8f34cceae4fee86493bf41927676fd def _execute_command_( *, interface: anInterface, cmd: Callable[..., Generator[Dict, None, None]], cmd_args: tuple, cmd_kwargs: Dict, exec_kwargs: Dict, ) -> Generator[Dict, None, None]: """Internal helper to drive a command execution generator-style Parameters ---------- interface: Interface class of associated with the `cmd` callable cmd: A DataLad command implementation. Typically the `__call__()` of the given `interface`. cmd_args: Positional arguments for `cmd`. cmd_kwargs: Keyword arguments for `cmd`. exec_kwargs: Keyword argument affecting the result handling. See `datalad.interface.common_opts.eval_params`. """ # for result filters and validation # we need to produce a dict with argname/argvalue pairs for all args # incl. defaults and args given as positionals allkwargs, at_default, required_args = get_allargs_as_kwargs( cmd, cmd_args, {**cmd_kwargs, **exec_kwargs}, ) # validate the complete parameterization param_validator = interface.get_parameter_validator() \ if hasattr(interface, 'get_parameter_validator') else None if param_validator is None: lgr.debug( 'Command parameter validation skipped. 
%s declares no validator', interface) else: lgr.debug('Command parameter validation for %s', interface) validator_kwargs = dict( at_default=at_default, required=required_args or None, ) # make immediate vs exhaustive parameter validation # configurable raise_on_error = dlcfg.get( 'datalad.runtime.parameter-violation', None) if raise_on_error: validator_kwargs['on_error'] = raise_on_error allkwargs = param_validator( allkwargs, **validator_kwargs ) lgr.debug('Command parameter validation ended for %s', interface) # look for potential override of logging behavior result_log_level = dlcfg.get('datalad.log.result-level', 'debug') # resolve string labels for transformers too result_xfm = known_result_xfms.get( allkwargs['result_xfm'], # use verbatim, if not a known label allkwargs['result_xfm']) result_filter = get_result_filter(allkwargs['result_filter']) result_renderer = allkwargs['result_renderer'] if result_renderer == 'tailored' and not hasattr(interface, 'custom_result_renderer'): # a tailored result renderer is requested, but the class # does not provide any, fall back to the generic one result_renderer = 'generic' if result_renderer == 'default': # standardize on the new name 'generic' to avoid more complex # checking below result_renderer = 'generic' # figure out which hooks are relevant for this command execution # query cfg for defaults # .is_installed and .config can be costly, so ensure we do # it only once. See https://github.com/datalad/datalad/issues/3575 dataset_arg = allkwargs.get('dataset', None) ds = None if dataset_arg is not None: from datalad_next.datasets import Dataset if isinstance(dataset_arg, Dataset): ds = dataset_arg elif isinstance(dataset_arg, DatasetParameter): ds = dataset_arg.ds else: try: ds = Dataset(dataset_arg) except ValueError: pass # look for hooks hooks = get_jsonhooks_from_config(ds.config if ds else dlcfg) # end of hooks discovery # flag whether to raise an exception incomplete_results = [] # track what actions were performed how many times action_summary = {} # if a custom summary is to be provided, collect the results # of the command execution results = [] do_custom_result_summary = result_renderer in ( 'tailored', 'generic', 'default') and hasattr( interface, 'custom_result_summary_renderer') pass_summary = do_custom_result_summary \ and getattr(interface, 'custom_result_summary_renderer_pass_summary', None) # process main results for r in _process_results( # execution, call with any arguments from the validated # set that are no result-handling related cmd(**{k: v for k, v in allkwargs.items() if k not in exec_kwargs}), interface, allkwargs['on_failure'], # bookkeeping action_summary, incomplete_results, # communication result_renderer, result_log_level, # let renderers get to see how a command was called allkwargs): for hook, spec in hooks.items(): # run the hooks before we yield the result # this ensures that they are executed before # a potentially wrapper command gets to act # on them if match_jsonhook2result(hook, r, spec['match']): lgr.debug('Result %s matches hook %s', r, hook) # a hook is also a command that yields results # so yield them outside too # users need to pay attention to void infinite # loops, i.e. 
when a hook yields a result that # triggers that same hook again for hr in run_jsonhook(hook, spec, r, dataset_arg): # apply same logic as for main results, otherwise # any filters would only tackle the primary results # and a mixture of return values could happen if not keep_result(hr, result_filter, **allkwargs): continue hr = xfm_result(hr, result_xfm) # rationale for conditional is a few lines down if hr: yield hr if not keep_result(r, result_filter, **allkwargs): continue r = xfm_result(r, result_xfm) # in case the result_xfm decided to not give us anything # exclude it from the results. There is no particular reason # to do so other than that it was established behavior when # this comment was written. This will not affect any real # result record if r: yield r # collect if summary is desired if do_custom_result_summary: results.append(r) # result summary before a potential exception # custom first if do_custom_result_summary: if pass_summary: summary_args = (results, action_summary) else: summary_args = (results,) interface.custom_result_summary_renderer(*summary_args) elif result_renderer in ('generic', 'default') \ and action_summary \ and sum(sum(s.values()) for s in action_summary.values()) > 1: # give a summary in generic mode, when there was more than one # action performed render_action_summary(action_summary) if incomplete_results: raise IncompleteResultsError( failed=incomplete_results, msg="Command did not complete successfully") # apply patch patch_msg = \ 'Apply datalad-next patch to interface.(utils|base).py:_execute_command_' apply_patch('datalad.interface.base', None, '_execute_command_', _execute_command_, msg=patch_msg) datalad-next-1.4.1/datalad_next/patches/patch_ria_ora.py000066400000000000000000000016711462321624600233220ustar00rootroot00000000000000"""This file collects all patches for ORA/RIA-related code. The patches have to goals: 1. Improve stability and consolidate code by using persistent shell support in class :class:`SSHRemoteIO`. 2. Improve ORA/RIA-related code so that it also works on Windows. """ from os import environ from . import ( add_method_url2transport_path, # this replaces SSHRemoteIO entirely replace_sshremoteio, # The following patches add Windows-support to ORA/RIA code ria_utils, replace_ora_remote, ) # we only want to import the patches for the tests when actually running # under pytest. this prevents inflating the runtime dependency with # test-only dependencies -- which would be needed for the necessary imports if environ.get("PYTEST_VERSION"): from . import fix_ria_ora_tests from . import ( # `replace_create_sibling_ria` be imported after `replace_sshremoteio` # and `ria_utils`. replace_create_sibling_ria, ) datalad-next-1.4.1/datalad_next/patches/push_optimize.py000066400000000000000000000412211462321624600234210ustar00rootroot00000000000000"""Make push avoid refspec handling for special remote push targets This change introduces a replacement for core's ``push.py:_push()`` with a more intelligible flow. It replaces the stalled https://github.com/datalad/datalad/pull/6666 Importantly, it makes one behavior change, which is desirable IMHO. Instead of rejecting to git-push any refspec for a repo with a detached HEAD, it will attempt to push a git-annex branch for an AnnexRepo. The respective test that ensured this behavior beyond the particular conditions the original problem occurred in was adjusted accordingly. All ``push`` tests from core are imported and executed to ensure proper functioning. 
Summary of the original commits patching the core implementation. - Consolidate publication dependency handling in one place - Consolidate tracking of git-push-dryrun exec Make a failed attempt discriminable from no prior attempt. - Factor out helper to determine refspecs-to-push for a target - Consolidate more handling of git-pushed and make conditional on an actual git-remote target This change is breaking behavior, because previously a source repository without an active branch would have been rejected for a push attempt. However, this is a bit questionable, because the git-annex branch might well need a push. - Simplify push-logic: no need for a fetch, if there is no git-push - Factor out helper to sync a remote annex-branch - Adjust test to constrain the evaluated conditions (replacement tests is included here) As per the reasoning recorded in datalad#1811 (comment) the test ensuring the continue fix of datalad#1811 is actually verifying a situation that is not fully desirable. It prevents pushing of thew 'git-annex' branch whenever a repo is on a detached HEAD. This change let's the test run on a plain Git repo, where there is indeed nothing to push in this case. """ from itertools import chain import logging import re import datalad.core.distributed.push as mod_push from datalad_next.utils import ( ensure_list, log_progress, ) from datalad_next.datasets import ( LegacyAnnexRepo as AnnexRepo, Dataset, ) from . import apply_patch lgr = logging.getLogger('datalad.core.distributed.push') def _push(dspath, content, target, data, force, jobs, res_kwargs, pbars, got_path_arg=False): force_git_push = force in ('all', 'gitpush') # nothing recursive in here, we only need a repo to work with ds = Dataset(dspath) repo = ds.repo res_kwargs.update(type='dataset', path=dspath) # content will be unique for every push (even on the same dataset) pbar_id = 'push-{}-{}'.format(target, id(content)) # register for final orderly take down pbars[pbar_id] = ds log_progress( lgr.info, pbar_id, 'Determine push target', unit=' Steps', label='Push', total=4, ) # # First we must figure out where to push to, if needed # # will contain info to determine what refspecs need to be pushed # and to which remote, if none is given wannabe_gitpush = None # pristine input arg _target = target # verified or auto-detected target sibling name target, status, message, wannabe_gitpush = _get_push_target(repo, target) if target is None: yield dict( res_kwargs, status=status, message=message, ) return log_progress( lgr.info, pbar_id, "Push refspecs", label="Push to '{}'".format(target), update=1, total=4) # cache repo type is_annex_repo = isinstance(ds.repo, AnnexRepo) # handling pure special remotes is a lot simpler target_is_git_remote = repo.config.get( f'remote.{target}.url', None) is not None # TODO would is be useful to also check whether the # target is set 'annex-ignore' right here? 
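    # Note (added for clarity): a pure special remote sibling typically has
    # no remote.<name>.url in the git config, so target_is_git_remote is
    # False for it. In that case the refspec determination and the final
    # git push below are skipped; annex data transfer and publication
    # dependency handling still apply to such a target.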
if target_is_git_remote: # branch and refspec only need handling for Git remotes refspecs2push = _get_refspecs2push( repo, is_annex_repo, target, target_arg=_target, wannabe_gitpush=wannabe_gitpush) if not refspecs2push: # nothing was set up for push, and we have no active branch # this is a weird one, let's confess and stop here # I don't think we need to support such a scenario yield dict( res_kwargs, status='impossible', message= 'There is no active branch, cannot determine remote ' 'branch' ) return # # We know where to push to, honor dependencies # XXX we could do this right after we know the value of `target`, # but this would mean we would also push to dependencies # even when no actual push to the primary target is needed # # list of remotes that are publication dependencies for the # target remote # multiple dependencies could come from multiple declarations # of such a config items, but each declaration could also # contain a white-space separated list of remote names # see https://github.com/datalad/datalad/issues/6867 publish_depends = list(chain.from_iterable( d.split() for d in ensure_list( ds.config.get( f'remote.{target}.datalad-publish-depends', [], get_all=True)))) if publish_depends: lgr.debug("Discovered publication dependencies for '%s': %s'", target, publish_depends) # we know what to push and where, now dependency processing first for r in publish_depends: # simply make a call to this function again, all the same, but # target is different # TODO: what if a publication dependency doesn't have any of the # determined refspecs2push, yet. Should we not attempt to push them, # because the main target has it? yield from _push( dspath, content, # to this particular dependency r, data, force, jobs, res_kwargs.copy(), pbars, got_path_arg=got_path_arg, ) # and lastly the primary push target # git-annex data copy # if is_annex_repo: if data != "nothing": log_progress( lgr.info, pbar_id, "Transfer data", label="Transfer data to '{}'".format(target), update=2, total=4) yield from mod_push._transfer_data( repo, ds, target, content, data, force, jobs, res_kwargs.copy(), got_path_arg=got_path_arg, ) else: lgr.debug("Data transfer to '%s' disabled by argument", target) else: lgr.debug("No data transfer: %s is not a git annex repository", repo) if not target_is_git_remote or not refspecs2push: # there is nothing that we need to push or sync with on the git-side # of things with this remote lgr.debug('No git-remote or no refspecs found that need to be pushed') # TODO ensure progress bar is ended properly return log_progress( lgr.info, pbar_id, "Update availability information", label="Update availability for '{}'".format(target), update=3, total=4) # TODO fetch is only needed if anything was actually transferred. Collect this # info and make the following conditional on it # after file transfer the remote might have different commits to # the annex branch. 
They have to be merged locally, otherwise a # push of it further down will fail _sync_remote_annex_branch(repo, target, is_annex_repo) # and push all relevant branches, plus the git-annex branch to announce # local availability info too yield from mod_push._push_refspecs( repo, target, refspecs2push, force_git_push, res_kwargs.copy(), ) def _append_branch_to_refspec_if_needed(repo, refspecs, branch): # try to anticipate any flavor of an idea of a branch ending up in a # refspec looks_like_that_branch = re.compile( r'((^|.*:)refs/heads/|.*:|^){}$'.format(branch)) if all(not looks_like_that_branch.match(r) for r in refspecs): refspecs.append( branch if repo.config.get('branch.{}.merge'.format(branch), None) else '{branch}:{branch}'.format(branch=branch) ) def _get_push_dryrun(repo, remote=None): """ Returns ------- list The result of the dry-run. Will be an empty list if the dry-run failed for any reason. """ try: wannabe_gitpush = repo.push(remote=remote, git_options=['--dry-run']) except Exception as e: lgr.debug( 'Dry-run push to %r remote failed, ' 'assume no configuration: %s', remote if remote else 'default', e) wannabe_gitpush = [] return wannabe_gitpush def _get_push_target(repo, target_arg): """ Returns ------- str or None, str, str or None, list or None Target label, if determined; status label; optional message; git-push-dryrun result for reuse or None, if no dry-run was attempted. """ # verified or auto-detected target = None # for reuse wannabe_gitpush = None if not target_arg: # let Git figure out what needs doing # we will reuse the result further down again, so nothing is wasted wannabe_gitpush = _get_push_dryrun(repo) # we did not get an explicit push target, get it from Git target = set(p.get('remote', None) for p in wannabe_gitpush) # handle case where a pushinfo record did not have a 'remote' # property -- should not happen, but be robust target.discard(None) if not len(target): return ( None, 'impossible', 'No push target given, and none could be ' 'auto-detected, please specify via --to', wannabe_gitpush, ) elif len(target) > 1: # dunno if this can ever happen, but if it does, report # nicely return ( None, 'error', ('No push target given, ' 'multiple candidates auto-detected: %s', list(target)), wannabe_gitpush, ) else: # can only be a single one at this point target = target.pop() if not target: if target_arg not in repo.get_remotes(): return ( None, 'error', ("Unknown target sibling '%s'.", target_arg), wannabe_gitpush, ) target = target_arg # we must have a valid target label now assert target return (target, 'ok', None, wannabe_gitpush) def _get_refspecs2push(repo, is_annex_repo, target, target_arg=None, wannabe_gitpush=None): """Determine which refspecs shall be pushed to target Parameters ---------- repo: Repo target: str Pre-determined push target target_arg: str, optional Target level given to original push() call, if any. 
wannabe_gitpush: list, optional Any cashed git-push-dryrun results for `target` Returns ------- list Refspec labels """ # (possibly redo) a push attempt to figure out what needs pushing # do this on the main target only, and apply the result to all # dependencies if target_arg and wannabe_gitpush is None: # only do it when an explicit target was given, otherwise # we can reuse the result from the auto-probing above wannabe_gitpush = _get_push_dryrun(repo, remote=target) refspecs2push = [ # if an upstream branch is set, go with it p['from_ref'] if repo.config.get( # refs come in as refs/heads/ # need to cut the prefix 'branch.{}.remote'.format(p['from_ref'][11:]), None) == target and repo.config.get( 'branch.{}.merge'.format(p['from_ref'][11:]), None) # if not, define target refspec explicitly to avoid having to # set an upstream branch, which would happen implicitly from # a users POV, and may also be hard to decide when publication # dependencies are present else '{}:{}'.format(p['from_ref'], p['to_ref']) for p in wannabe_gitpush if 'uptodate' not in p['operations'] and ( # cannot think of a scenario where we would want to push a # managed branch directly, instead of the corresponding branch 'refs/heads/adjusted' not in p['from_ref']) ] active_branch = repo.get_active_branch() if active_branch and is_annex_repo: # we could face a managed branch, in which case we need to # determine the actual one and make sure it is sync'ed with the # managed one, and push that one instead. following methods can # be called unconditionally repo.localsync(managed_only=True) active_branch = repo.get_corresponding_branch( active_branch) or active_branch # make sure that we always push the active branch (the context for the # potential path arguments) and the annex branch -- because we claim # to know better than any git config must_have_branches = [active_branch] if active_branch else [] if is_annex_repo: must_have_branches.append('git-annex') for branch in must_have_branches: # refspecs2push= (in-place modification inside) _append_branch_to_refspec_if_needed(repo, refspecs2push, branch) return refspecs2push def _sync_remote_annex_branch(repo, target, is_annex_repo): """Fetch remote annex-branch and merge locally Useful to ensure a push to the target will not fail due to unmerged remote changes. Parameters ---------- repo: Repo target: str is_annex_repo: bool """ try: # fetch remote, let annex sync them locally, so that the push # later on works. # We have to fetch via the push url (if there is any), # not a pull url. # The latter might be dumb and without the execution of a # post-update hook we might not be able to retrieve the # server-side git-annex branch updates (and git-annex does # not trigger the hook on copy), but we know we have # full access via the push url -- we have just used it to copy. lgr.debug("Fetching 'git-annex' branch updates from '%s'", target) fetch_cmd = ['fetch', target, 'git-annex'] pushurl = repo.config.get( 'remote.{}.pushurl'.format(target), None) if pushurl: # for some reason overwriting remote.{target}.url # does not have any effect... fetch_cmd = [ '-c', 'url.{}.insteadof={}'.format( pushurl, repo.config.get( 'remote.{}.url'.format(target), None) ) ] + fetch_cmd lgr.debug( "Sync local annex branch from pushurl after remote " 'availability update.') # XXX when this is changed to `call_git()`, # make sure to `force_c_locale=True` repo.call_git(fetch_cmd) # If no CommandError was raised, it means that remote has git-annex # but local repo might not be an annex yet. 
Since there is nothing to "sync" # from us, we just skip localsync without mutating repo into an AnnexRepo if is_annex_repo: repo.localsync(target) except mod_push.CommandError as e: # it is OK if the remote doesn't have a git-annex branch yet # (e.g. fresh repo) # Is this even possible? we just copied? Maybe check if anything # was actually copied? # Yes, this is possible. The current implementation of the datalad-annex # special remote would run into this situation. It would copy annex objects # to a new location just fine, but until a repository deposit was made # (and this implementation of push only does this as a second step), it # could not retrieve any refs from the remote. # the following conditional tests for the common prefix of the respective # error message by Git and the Git-channeled error message from the # datalad-annex remote helper. if "fatal: couldn't find remote ref" not in e.stderr.lower(): raise lgr.debug('Remote does not have a git-annex branch: %s', e) apply_patch('datalad.core.distributed.push', None, '_push', _push) datalad-next-1.4.1/datalad_next/patches/push_to_export_remote.py000066400000000000000000000236201462321624600251620ustar00rootroot00000000000000"""Add support for export to WebDAV remotes to ``push()`` This approach generally works for any special remote configured with ``exporttree=yes``, but is only tested for ``type=webdav``. A smooth operation requires automatic deployment of credentials. Support for that is provide and limited by the capabilities of ``needs_specialremote_credential_envpatch()``. """ import logging from typing import ( Dict, Generator, Iterable, Optional, Union, ) import datalad.core.distributed.push as mod_push from datalad_next.constraints import EnsureChoice from datalad_next.exceptions import CapturedException from datalad_next.commands import Parameter from datalad_next.datasets import ( LegacyAnnexRepo as AnnexRepo, Dataset, ) from datalad_next.utils import ( CredentialManager, get_specialremote_credential_envpatch, get_specialremote_credential_properties, needs_specialremote_credential_envpatch, patched_env, ) from . import apply_patch lgr = logging.getLogger('datalad.core.distributed.push') def _is_export_remote(remote_info: Optional[Dict]) -> bool: """Check if remote_info is valid and has exporttree set to "yes" Parameters ---------- remote_info: Optional[Dict] Optional dictionary the contains git annex special. Returns ------- bool True if exporttree key is contained in remote_info and is set to yes, else False. """ if remote_info is not None: return remote_info.get("exporttree") == "yes" return False def _get_credentials(ds: Dataset, remote_info: Dict ) -> Optional[Dict]: # Check for credentials params = { "type": remote_info.get("type"), "url": remote_info.get("url") } credentials = None credential_properties = get_specialremote_credential_properties(params) if credential_properties: # TODO: lower prio: factor this if clause out, also used in # create_sibling_webdav.py credential_manager = CredentialManager(ds.config) credentials = (credential_manager.query( _sortby='last-used', **credential_properties) or [(None, None)])[0][1] return credentials def get_export_records(repo: AnnexRepo) -> Generator: """Read exports that git-annex recorded in its 'export.log'-file Interpret the lines in export.log. 
Each line has the following structure: time-stamp " " source-annex-uuid ":" destination-annex-uuid " " treeish Parameters ---------- repo: AnnexRepo The annex repo from which exports should be determined Returns ------- Generator Generator yielding one dictionary for each export entry in git-annex. Each dictionary contains the keys: "timestamp", "source-annex-uuid", "destination-annex-uuid", "treeish". The timestamp-value is a float, all other values are strings. """ try: # XXX when this is changed to `call_git()`, make sure to use # `force_c_locale=True` for line in repo.call_git_items_(["cat-file", "blob", "git-annex:export.log"]): result_dict = dict(zip( [ "timestamp", "source-annex-uuid", "destination-annex-uuid", "treeish" ], line.replace(":", " ").split() )) result_dict["timestamp"] = float(result_dict["timestamp"][:-1]) yield result_dict except mod_push.CommandError as command_error: # Some errors indicate that there was no export yet. # May depend on Git version expected_errors = ( "fatal: Not a valid object name git-annex:export.log", "fatal: path 'export.log' does not exist in 'git-annex'", # v2.36 ) if command_error.stderr.strip() in expected_errors: return raise def _get_export_log_entry(repo: AnnexRepo, target_uuid: str ) -> Optional[Dict]: target_entries = [ entry for entry in repo.get_export_records() if entry["destination-annex-uuid"] == target_uuid] if not target_entries: return None return sorted(target_entries, key=lambda e: e["timestamp"])[-1] def _is_valid_treeish(repo: AnnexRepo, export_entry: Dict, ) -> bool: # Due to issue https://github.com/datalad/datalad-next/issues/39 # fast-forward validation has to be re-designed. return True #for line in repo.call_git_items_(["log", "--pretty=%H %T"]): # commit_hash, treeish = line.split() # if treeish == export_entry["treeish"]: # return True #return False def _transfer_data(repo: AnnexRepo, ds: Dataset, target: str, content: Iterable, data: str, force: Optional[str], jobs: Optional[Union[str, int]], res_kwargs: Dict, got_path_arg: bool ) -> Generator: target_uuid, remote_info = ([ (uuid, info) for uuid, info in repo.get_special_remotes().items() if info.get("name") == target] or [(None, None)])[0] if not _is_export_remote(remote_info): yield from mod_push._push_data( ds, target, content, data, force, jobs, res_kwargs.copy(), got_path_arg=got_path_arg, ) return from datalad.interface.results import annexjson2result # TODO: # - check for configuration entries, e.g. what to export lgr.debug(f"Exporting HEAD of {ds} to remote {remote_info}") if ds.config.getbool('remote.{}'.format(target), 'annex-ignore', False): lgr.debug( "Target '%s' is set to annex-ignore, exclude from data-export.", target) return if force not in ("all", "export"): export_entry = _get_export_log_entry(repo, target_uuid) if export_entry: if export_entry["source-annex-uuid"] != repo.uuid: yield dict( **res_kwargs, status="error", message=f"refuse to export to {target}, because the " f"last known export came from another repo " f"({export_entry['source-annex-uuid']}). Use " f"--force=export to enforce the export anyway.") return if not _is_valid_treeish(repo, export_entry): yield dict( **res_kwargs, status="error", message=f"refuse to export to {target}, because the " f"current state is not a fast-forward of the " f"last known exported state. 
Use " f"--force=export to enforce the export anyway.") return credentials = _get_credentials(ds, remote_info) # If we have credentials, check whether we require an environment patch env_patch = {} remote_type = remote_info.get("type") if credentials and needs_specialremote_credential_envpatch(remote_type): env_patch = get_specialremote_credential_envpatch( remote_type, credentials) res_kwargs['target'] = target with patched_env(**env_patch): try: for result in repo._call_annex_records_items_( [ "export", "HEAD", "--to", target ], progress=True ): result_adjusted = \ annexjson2result(result, ds, **res_kwargs) # annexjson2result overwrites 'action' with annex' 'command', # even if we provided our 'action' within res_kwargs. Therefore, # change afterwards instead: result_adjusted['action'] = "copy" yield result_adjusted except mod_push.CommandError as cmd_error: ce = CapturedException(cmd_error) yield { **res_kwargs, "action": "copy", "status": "error", "message": str(ce), "exception": ce } apply_patch('datalad.core.distributed.push', None, '_transfer_data', _transfer_data) lgr.debug( "Patching datalad.core.distributed.push.Push docstring and parameters") mod_push.Push.__doc__ += """\ The following feature is added by the datalad-next extension: If a target is a git-annex special remote that has "exporttree" set to "yes", push will call 'git-annex export' to export the current HEAD to the remote target. This will usually result in a copy of the file tree, to which HEAD refers, on the remote target. A git-annex special remote with "exporttree" set to "yes" can, for example, be created with the datalad command "create-sibling-webdav" with the option "--mode=filetree" or "--mode=filetree-only". """ mod_push.Push._params_["force"] = Parameter( args=("-f", "--force",), doc="""force particular operations, possibly overruling safety protections or optimizations: use --force with git-push ('gitpush'); do not use --fast with git-annex copy ('checkdatapresent'); force an annex export (to git annex remotes with "exporttree" set to "yes"); combine all force modes ('all').""", constraints=EnsureChoice( 'all', 'gitpush', 'checkdatapresent', 'export', None)) from datalad.interface.base import build_doc mod_push.Push.__call__.__doc__ = None mod_push.Push = build_doc(mod_push.Push) apply_patch( 'datalad_next.datasets', 'LegacyAnnexRepo', 'get_export_records', get_export_records, msg="Patching datalad.support.AnnexRepo.get_export_records (new method)", expect_attr_present=False, ) datalad-next-1.4.1/datalad_next/patches/replace_create_sibling_ria.py000066400000000000000000001055541462321624600260340ustar00rootroot00000000000000"""This file contains an updated :class:`CreateSiblingRia`-class. It replaces the :class:`CreateSiblingRia`-class in `datalad.distributed.create_sibling_ria`. The updated class uses `PurePosixPath` for all RIA-store path calculations. It uses `url2transport_path` to convert the abstract paths to concrete paths for the IO-abstraction. The updated class also uses a canonified representation of path-anchors in URL-paths. This allows to handle the differences in path-anchor encoding between git-annex and RFC 8089-style file-URLs. 
""" from __future__ import annotations import logging from pathlib import PurePosixPath from datalad.cmd import WitlessRunner as Runner from datalad.core.distributed.clone import decode_source_spec from datalad.customremotes.ria_utils import ( create_ds_in_store, get_layout_locations, verify_ria_url, ) from datalad.distributed.ora_remote import ( LocalIO, RemoteCommandFailedError, RIARemoteError, SSHRemoteIO, ) from datalad.distribution.dataset import ( EnsureDataset, datasetmethod, require_dataset, ) from datalad.distribution.utils import _yield_ds_w_matching_siblings from datalad.interface.base import ( Interface, build_doc, eval_results, ) from datalad.interface.common_opts import ( recursion_flag, recursion_limit, ) from datalad.interface.results import get_status_dict from datalad.log import log_progress from datalad.support.annexrepo import AnnexRepo from datalad.support.constraints import ( EnsureBool, EnsureChoice, EnsureNone, EnsureStr, ) from datalad.support.exceptions import CommandError from datalad.support.gitrepo import GitRepo from datalad.support.param import Parameter from datalad.utils import quote_cmdlinearg from datalad_next.consts import on_windows from . import apply_patch from .replace_ora_remote import ( canonify_url, de_canonify_url, ) lgr = logging.getLogger('datalad.distributed.create_sibling_ria') # `CreateSiblingRia` taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732: @build_doc class CreateSiblingRia(Interface): """Creates a sibling to a dataset in a RIA store Communication with a dataset in a RIA store is implemented via two siblings. A regular Git remote (repository sibling) and a git-annex special remote for data transfer (storage sibling) -- with the former having a publication dependency on the latter. By default, the name of the storage sibling is derived from the repository sibling's name by appending "-storage". The store's base path is expected to not exist, be an empty directory, or a valid RIA store. Notes ----- **RIA URL format** Interactions with new or existing RIA stores require RIA URLs to identify the store or specific datasets inside of it. The general structure of a RIA URL pointing to a store takes the form ``ria+[scheme]://`` (e.g., ``ria+ssh://[user@]hostname:/absolute/path/to/ria-store``, or ``ria+file:///absolute/path/to/ria-store``) The general structure of a RIA URL pointing to a dataset in a store (for example for cloning) takes a similar form, but appends either the datasets UUID or a "~" symbol followed by the dataset's alias name: ``ria+[scheme]://#`` or ``ria+[scheme]://#~``. In addition, specific version identifiers can be appended to the URL with an additional "@" symbol: ``ria+[scheme]://#@``, where ``dataset-version`` refers to a branch or tag. **RIA store layout** A RIA store is a directory tree with a dedicated subdirectory for each dataset in the store. The subdirectory name is constructed from the DataLad dataset ID, e.g. ``124/68afe-59ec-11ea-93d7-f0d5bf7b5561``, where the first three characters of the ID are used for an intermediate subdirectory in order to mitigate files system limitations for stores containing a large number of datasets. By default, a dataset in a RIA store consists of two components: A Git repository (for all dataset contents stored in Git) and a storage sibling (for dataset content stored in git-annex). It is possible to selectively disable either component using ``storage-sibling 'off'`` or ``storage-sibling 'only'``, respectively. 
If neither component is disabled, a dataset's subdirectory layout in a RIA store contains a standard bare Git repository and an ``annex/`` subdirectory inside of it. The latter holds a Git-annex object store and comprises the storage sibling. Disabling the standard git-remote (``storage-sibling='only'``) will result in not having the bare git repository, disabling the storage sibling (``storage-sibling='off'``) will result in not having the ``annex/`` subdirectory. Optionally, there can be a further subdirectory ``archives`` with (compressed) 7z archives of annex objects. The storage remote is able to pull annex objects from these archives, if it cannot find in the regular annex object store. This feature can be useful for storing large collections of rarely changing data on systems that limit the number of files that can be stored. Each dataset directory also contains a ``ria-layout-version`` file that identifies the data organization (as, for example, described above). Lastly, there is a global ``ria-layout-version`` file at the store's base path that identifies where dataset subdirectories themselves are located. At present, this file must contain a single line stating the version (currently "1"). This line MUST end with a newline character. It is possible to define an alias for an individual dataset in a store by placing a symlink to the dataset location into an ``alias/`` directory in the root of the store. This enables dataset access via URLs of format: ``ria+://#~``. Compared to standard git-annex object stores, the ``annex/`` subdirectories used as storage siblings follow a different layout naming scheme ('dirhashmixed' instead of 'dirhashlower'). This is mostly noted as a technical detail, but also serves to remind git-annex powerusers to refrain from running git-annex commands directly in-store as it can cause severe damage due to the layout difference. Interactions should be handled via the ORA special remote instead. **Error logging** To enable error logging at the remote end, append a pipe symbol and an "l" to the version number in ria-layout-version (like so: ``1|l\\n``). Error logging will create files in an "error_log" directory whenever the git-annex special remote (storage sibling) raises an exception, storing the Python traceback of it. The logfiles are named according to the scheme ``..log`` showing "who" ran into this issue with which dataset. Because logging can potentially leak personal data (like local file paths for example), it can be disabled client-side by setting the configuration variable ``annex.ora-remote..ignore-remote-config``. """ # TODO: description? _params_ = dict( dataset=Parameter( args=("-d", "--dataset"), doc="""specify the dataset to process. If no dataset is given, an attempt is made to identify the dataset based on the current working directory""", constraints=EnsureDataset() | EnsureNone()), url=Parameter( args=("url",), metavar="ria+://[/path]", doc="""URL identifying the target RIA store and access protocol. If ``push_url||--push-url`` is given in addition, this is used for read access only. Otherwise it will be used for write access too and to create the repository sibling in the RIA store. Note, that HTTP(S) currently is valid for consumption only thus requiring to provide ``push_url||--push-url``. """, constraints=EnsureStr() | EnsureNone()), push_url=Parameter( args=("--push-url",), metavar="ria+://[/path]", doc="""URL identifying the target RIA store and access protocol for write access to the storage sibling. 
If given this will also be used for creation of the repository sibling in the RIA store.""", constraints=EnsureStr() | EnsureNone()), name=Parameter( args=('-s', '--name',), metavar='NAME', doc="""Name of the sibling. With `recursive`, the same name will be used to label all the subdatasets' siblings.""", constraints=EnsureStr() | EnsureNone(), required=True), storage_name=Parameter( args=("--storage-name",), metavar="NAME", doc="""Name of the storage sibling (git-annex special remote). Must not be identical to the sibling name. If not specified, defaults to the sibling name plus '-storage' suffix. If only a storage sibling is created, this setting is ignored, and the primary sibling name is used.""", constraints=EnsureStr() | EnsureNone()), alias=Parameter( args=('--alias',), metavar='ALIAS', doc="""Alias for the dataset in the RIA store. Add the necessary symlink so that this dataset can be cloned from the RIA store using the given ALIAS instead of its ID. With `recursive=True`, only the top dataset will be aliased.""", constraints=EnsureStr() | EnsureNone()), post_update_hook=Parameter( args=("--post-update-hook",), doc="""Enable Git's default post-update-hook for the created sibling. This is useful when the sibling is made accessible via a "dumb server" that requires running 'git update-server-info' to let Git interact properly with it.""", action="store_true"), shared=Parameter( args=("--shared",), metavar='{false|true|umask|group|all|world|everybody|0xxx}', doc="""If given, configures the permissions in the RIA store for multi-users access. Possible values for this option are identical to those of `git init --shared` and are described in its documentation.""", constraints=EnsureStr() | EnsureBool() | EnsureNone()), group=Parameter( args=("--group",), metavar="GROUP", doc="""Filesystem group for the repository. Specifying the group is crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""", constraints=EnsureStr() | EnsureNone()), storage_sibling=Parameter( args=("--storage-sibling",), dest='storage_sibling', metavar='MODE', constraints=EnsureChoice('only') | EnsureBool() | EnsureNone(), doc="""By default, an ORA storage sibling and a Git repository sibling are created ([CMD: on CMD][PY: True|'on' PY]). Alternatively, creation of the storage sibling can be disabled ([CMD: off CMD][PY: False|'off' PY]), or a storage sibling created only and no Git sibling ([CMD: only CMD][PY: 'only' PY]). In the latter mode, no Git installation is required on the target host."""), existing=Parameter( args=("--existing",), constraints=EnsureChoice('skip', 'error', 'reconfigure'), metavar='MODE', doc="""Action to perform, if a (storage) sibling is already configured under the given name and/or a target already exists. In this case, a dataset can be skipped ('skip'), an existing target repository be forcefully re-initialized, and the sibling (re-)configured ('reconfigure'), or the command be instructed to fail ('error').""", ), new_store_ok=Parameter( args=("--new-store-ok",), action='store_true', doc="""When set, a new store will be created, if necessary. Otherwise, a sibling will only be created if the url points to an existing RIA store.""", ), recursive=recursion_flag, recursion_limit=recursion_limit, trust_level=Parameter( args=("--trust-level",), metavar="TRUST-LEVEL", constraints=EnsureChoice('trust', 'semitrust', 'untrust', None), doc="""specify a trust level for the storage sibling. If not specified, the default git-annex trust level is used. 
'trust' should be used with care (see the git-annex-trust man page).""",), disable_storage__=Parameter( args=("--no-storage-sibling",), dest='disable_storage__', doc="""This option is deprecated. Use '--storage-sibling off' instead.""", action="store_false"), ) @staticmethod @datasetmethod(name='create_sibling_ria') @eval_results def __call__(url, name, *, # note that `name` is required but not posarg in CLI dataset=None, storage_name=None, alias=None, post_update_hook=False, shared=None, group=None, storage_sibling=True, existing='error', new_store_ok=False, trust_level=None, recursive=False, recursion_limit=None, disable_storage__=None, push_url=None ): if disable_storage__ is not None: import warnings warnings.warn("datalad-create-sibling-ria --no-storage-sibling " "is deprecated, use --storage-sibling off instead.", DeprecationWarning) # recode to new setup disable_storage__ = None storage_sibling = False if storage_sibling == 'only' and storage_name: lgr.warning( "Sibling name will be used for storage sibling in " "storage-sibling-only mode, but a storage sibling name " "was provided" ) ds = require_dataset( dataset, check_installed=True, purpose='create RIA sibling(s)') res_kwargs = dict( ds=ds, action="create-sibling-ria", logger=lgr, ) # parse target URL # Note: URL parsing is done twice ATM (for top-level ds). This can't be # reduced to single instance, since rewriting url based on config could # be different for subdatasets. # PATCH: use canonified representation of `url` and `push_url` url = canonify_url(url) push_url = canonify_url(url) try: ssh_host, url_base_path_str, rewritten_url = \ verify_ria_url(push_url if push_url else url, ds.config) except ValueError as e: yield get_status_dict( status='error', message=str(e), **res_kwargs ) return # PATCH: use `PurePosixPath` to represent the base path of the store url_base_path = PurePosixPath(url_base_path_str) if ds.repo.get_hexsha() is None or ds.id is None: raise RuntimeError( "Repository at {} is not a DataLad dataset, " "run 'datalad create [--force]' first.".format(ds.path)) if not storage_sibling and storage_name: lgr.warning( "Storage sibling setup disabled, but a storage sibling name " "was provided" ) if storage_sibling and not storage_name: storage_name = "{}-storage".format(name) if storage_sibling and name == storage_name: # leads to unresolvable, circular dependency with publish-depends raise ValueError("sibling names must not be equal") if not isinstance(url, str): raise TypeError("url is not a string, but %s" % type(url)) # Query existing siblings upfront in order to fail early on # existing=='error', since misconfiguration (particularly of special # remotes) only to fail in a subdataset later on with that config, can # be quite painful. # TODO: messages - this is "create-sibling". 
Don't confuse existence of # local remotes with existence of the actual remote sibling # in wording if existing == 'error': failed = False for dpath, sname in _yield_ds_w_matching_siblings( ds, (name, storage_name), recursive=recursive, recursion_limit=recursion_limit): res = get_status_dict( status='error', message=( "a sibling %r is already configured in dataset %r", sname, dpath), type='sibling', name=sname, **res_kwargs, ) failed = True yield res if failed: return # TODO: - URL parsing + store creation needs to be RF'ed based on # command abstractions # - more generally consider store creation a dedicated command or # option io = SSHRemoteIO(ssh_host) if ssh_host else LocalIO() try: # determine the existence of a store by trying to read its layout. # Because this raises a FileNotFound error if non-existent, we need # to catch it # PATCH: convert ria-layout-version location to a concreate path # before giving it to an IO-abstraction. io.read_file(io.url2transport_path(url_base_path / 'ria-layout-version')) except (FileNotFoundError, RIARemoteError, RemoteCommandFailedError) as e: if not new_store_ok: # we're instructed to only act in case of an existing RIA store res = get_status_dict( status='error', message="No store found at '{}'. Forgot " "--new-store-ok ?".format(url_base_path), **res_kwargs) yield res return log_progress( lgr.info, 'create-sibling-ria', 'Creating a new RIA store at %s', url_base_path, ) create_store(io, url_base_path, '1') yield from _create_sibling_ria( ds, url, push_url, name, storage_sibling, storage_name, alias, existing, shared, group, post_update_hook, trust_level, res_kwargs) if recursive: # Note: subdatasets can be treated independently, so go full # recursion when querying for them and _no_recursion with the # actual call. Theoretically this can be parallelized. for subds in ds.subdatasets(state='present', recursive=True, recursion_limit=recursion_limit, return_type='generator', result_renderer='disabled', result_xfm='datasets'): yield from _create_sibling_ria( subds, url, push_url, name, storage_sibling, storage_name, None, # subdatasets can't have the same alias as the parent existing, shared, group, post_update_hook, trust_level, res_kwargs) def _create_sibling_ria( ds, url, push_url, name, storage_sibling, storage_name, alias, existing, shared, group, post_update_hook, trust_level, res_kwargs): # be safe across datasets res_kwargs = res_kwargs.copy() # update dataset res_kwargs['ds'] = ds if not isinstance(ds.repo, AnnexRepo): # No point in dealing with a special remote when there's no annex. # Note, that in recursive invocations this might only apply to some of # the datasets. Therefore dealing with it here rather than one level up. lgr.debug("No annex at %s. 
Ignoring special remote options.", ds.path) storage_sibling = False storage_name = None # parse target URL try: ssh_host, url_base_path_str, rewritten_url = \ verify_ria_url(push_url if push_url else url, ds.config) except ValueError as e: yield get_status_dict( status='error', message=str(e), **res_kwargs ) return # PATCH: use `PurePosixPath` to represent the base path of the RIA-store url_base_path = PurePosixPath(url_base_path_str) git_url = decode_source_spec( # append dataset id to url and use magic from clone-helper: url + '#{}'.format(ds.id), cfg=ds.config )['giturl'] git_push_url = decode_source_spec( push_url + '#{}'.format(ds.id), cfg=ds.config )['giturl'] if push_url else None # determine layout locations; go for a v1 store-level layout repo_path, _, _ = get_layout_locations(1, url_base_path, ds.id) ds_siblings = [ r['name'] for r in ds.siblings( result_renderer='disabled', return_type='generator') ] # Figure whether we are supposed to skip this very dataset if existing == 'skip' and ( name in ds_siblings or ( storage_name and storage_name in ds_siblings)): yield get_status_dict( status='notneeded', message="Skipped on existing sibling", **res_kwargs ) # if we skip here, nothing else can change that decision further # down return # figure whether we need to skip or error due an existing target repo before # we try to init a special remote. if ssh_host: from datalad import ssh_manager ssh = ssh_manager.get_connection( ssh_host, use_remote_annex_bundle=False) ssh.open() exists = False if existing in ['skip', 'error', 'reconfigure']: config_path = repo_path / 'config' # No .git -- if it's an existing repo in a RIA store it should be a # bare repo. # Theoretically we could have additional checks for whether we have # an empty repo dir or a non-bare repo or whatever else. if ssh_host: try: ssh('[ -e {p} ]'.format(p=quote_cmdlinearg(str(config_path)))) exists = True except CommandError: exists = False else: # PATCH: use concrete path for `config_path` to check for existence exists = LocalIO().url2transport_path(config_path).exists() if exists: if existing == 'skip': # 1. not rendered by default # 2. message doesn't show up in ultimate result # record as shown by -f json_pp yield get_status_dict( status='notneeded', message="Skipped on existing remote " "directory {}".format(repo_path), **res_kwargs ) return elif existing == 'error': yield get_status_dict( status='error', message="remote directory {} already " "exists.".format(repo_path), **res_kwargs ) return else: # reconfigure will be handled later in the code pass if storage_sibling == 'only': lgr.info("create storage sibling '%s' ...", name) else: lgr.info("create sibling%s '%s'%s ...", 's' if storage_name else '', name, " and '{}'".format(storage_name) if storage_name else '', ) create_ds_in_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(), # PATCH: use abstract `url_base_path` url_base_path, ds.id, '2', '1', alias, init_obj_tree=storage_sibling is not False) if storage_sibling: # we are using the main `name`, if the only thing we are creating # is the storage sibling srname = name if storage_sibling == 'only' else storage_name lgr.debug('init special remote %s', srname) special_remote_options = [ 'type=external', 'externaltype=ora', 'encryption=none', 'autoenable=true', # PATCH: de-canonify `url`, because git-annex expects the path # anchor in the netloc position. f'url={de_canonify_url(url)}'] if push_url: # PATCH: de-canonify `push_url`, because git-annex expects the path # anchor in the netloc position. 
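            # (Illustration with a hypothetical Windows store: the canonical
            #  'ria+file:///C:/ria-store' is recorded for git-annex as
            #  'ria+file://C:/ria-store'.)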
special_remote_options.append(f'push-url={de_canonify_url(push_url)}') try: ds.repo.init_remote( srname, options=special_remote_options) except CommandError as e: if existing == 'reconfigure' \ and 'There is already a special remote' \ in e.stderr: # run enableremote instead lgr.debug( "special remote '%s' already exists. " "Run enableremote instead.", srname) # TODO: Use AnnexRepo.enable_remote (which needs to get # `options` first) ds.repo.call_annex([ 'enableremote', srname] + special_remote_options) else: yield get_status_dict( status='error', message="initremote failed.\nstdout: %s\nstderr: %s" % (e.stdout, e.stderr), **res_kwargs ) return if trust_level: trust_cmd = [trust_level] if trust_level == 'trust': # Following git-annex 8.20201129-73-g6a0030a11, using `git # annex trust` requires --force. trust_cmd.append('--force') ds.repo.call_annex(trust_cmd + [srname]) # get uuid for use in bare repo's config uuid = ds.config.get("remote.{}.annex-uuid".format(srname)) if storage_sibling == 'only': # we can stop here, the rest of the function is about setting up # the git remote part of the sibling yield get_status_dict( status='ok', **res_kwargs, ) return # 2. create a bare repository in-store: lgr.debug("init bare repository") # TODO: we should prob. check whether it's there already. How? # Note: like the special remote itself, we assume local FS if no # SSH host is specified disabled_hook = repo_path / 'hooks' / 'post-update.sample' enabled_hook = repo_path / 'hooks' / 'post-update' if group: chgrp_cmd = "chgrp -R {} {}".format( quote_cmdlinearg(str(group)), quote_cmdlinearg(str(repo_path))) if ssh_host: ssh('cd {rootdir} && git init --bare{shared}'.format( rootdir=quote_cmdlinearg(str(repo_path)), shared=" --shared='{}'".format( quote_cmdlinearg(shared)) if shared else '' )) if storage_sibling: # write special remote's uuid into git-config, so clone can # which one it is supposed to be and enable it even with # fallback URL ssh("cd {rootdir} && git config datalad.ora-remote.uuid {uuid}" "".format(rootdir=quote_cmdlinearg(str(repo_path)), uuid=uuid)) if post_update_hook: ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)), quote_cmdlinearg(str(enabled_hook)))) if group: # Either repository existed before or a new directory was # created for it, set its group to a desired one if was # provided with the same chgrp ssh(chgrp_cmd) # finally update server if post_update_hook: # Conditional on post_update_hook, since one w/o the other doesn't # seem to make much sense. ssh('cd {rootdir} && git update-server-info'.format( rootdir=quote_cmdlinearg(str(repo_path)) )) else: gr = GitRepo( # PATCH: convert `repo_path` to a concrete path before stringifying # it and giving it to `GitRepo`. 
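            # (`repo_path` is a PurePosixPath derived from the store URL;
            # LocalIO's url2transport_path maps it onto the local
            # filesystem, e.g. a native drive-letter path on Windows.)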
str(LocalIO().url2transport_path(repo_path)), create=True, bare=True, shared=shared if shared else None ) if exists and existing == 'reconfigure': # if the repo exists at the given path, the GitRepo would not # (re)-run git init, and just return an instance of GitRepo; # skip & error have been handled at this point gr.init( sanity_checks=False, init_options=["--bare"] + ([f"--shared={shared}"] if shared else []), ) if storage_sibling: # write special remote's uuid into git-config, so clone can # which one it is supposed to be and enable it even with # fallback URL gr.config.add("datalad.ora-remote.uuid", uuid, scope='local') if post_update_hook: # PATCH: convert `disabled_hook` to a concrete path before renaming LocalIO().url2transport_path(disabled_hook).rename(enabled_hook) if group: if on_windows: # PATCH: skip group-handling on Windows because there is no # `chgrp`. lgr.warning( "Group '%s' was provided, but chgrp is not available on " "Windows. Skipping.", group ) else: # No CWD needed here, since `chgrp` is expected to be found via PATH # and the path it's operating on is absolute (repo_path). No # repository operation involved. Runner().run(chgrp_cmd) # finally update server if post_update_hook: # Conditional on post_update_hook, since one w/o the other doesn't # seem to make much sense. gr.call_git(["update-server-info"]) # add a git remote to the bare repository # Note: needs annex-ignore! Otherwise we might push into dirhash # lower annex/object tree instead of mixed, since it's a bare # repo. This in turn would be an issue, if we want to pack the # entire thing into an archive. Special remote will then not be # able to access content in the "wrong" place within the archive lgr.debug("set up git remote") if name in ds_siblings: # otherwise we should have skipped or failed before assert existing == 'reconfigure' ds.config.set( "remote.{}.annex-ignore".format(name), value="true", scope="local") yield from ds.siblings( 'configure', name=name, # PATCH: convert `repo_path` as a concrete path before stringifying it url=str(LocalIO().url2transport_path(repo_path)) if url.startswith("ria+file") else git_url, pushurl=git_push_url, recursive=False, # Note, that this should be None if storage_sibling was not set publish_depends=storage_name, result_renderer='disabled', return_type='generator', # Note, that otherwise a subsequent publish will report # "notneeded". fetch=True ) yield get_status_dict( status='ok', **res_kwargs, ) # Replace the complete `CreateSiblingRia`-class apply_patch( 'datalad.distributed.create_sibling_ria', None, 'CreateSiblingRia', CreateSiblingRia, ) class UnknownLayoutVersion(Exception): pass known_versions_objt = ['1', '2'] # Dataset tree versions we introduced so far. This is about the layout of # datasets in a RIA store known_versions_dst = ['1'] def _ensure_version(io, base_url, version): """Check a store or dataset version and make sure it is declared Parameters ---------- io: SSHRemoteIO or LocalIO base_url: PurePosixPath root path of a store or dataset version: str target layout version of the store (dataset tree) """ version_file = base_url / 'ria-layout-version' if io.exists(io.url2transport_path(version_file)): existing_version = io.read_file( io.url2transport_path(version_file) ).split('|')[0].strip() if existing_version != version.split('|')[0]: # We have an already existing location with a conflicting version on # record. # Note, that a config flag after pipe symbol is fine. 
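            # For example, a remote 'ria-layout-version' containing "1|l"
            # still matches a requested version "1"; only the part before
            # the pipe symbol is compared.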
raise ValueError("Conflicting version found at target: {}" .format(existing_version)) else: # already exists, recorded version fits - nothing to do return # Note, that the following does create the base-path dir as well, since # mkdir has parents=True: io.mkdir(io.url2transport_path(base_url)) io.write_file(io.url2transport_path(version_file), version) def create_store(io, base_url, version): """Helper to create a RIA store Note, that this is meant as an internal helper and part of intermediate RF'ing. Ultimately should lead to dedicated command or option for create-sibling-ria. Parameters ---------- io: SSHRemoteIO or LocalIO Respective execution instance. Note: To be replaced by proper command abstraction base_url: PurePosixPath root url path of the store version: str layout version of the store (dataset tree) """ assert isinstance(base_url, PurePosixPath) # At store level the only version we know as of now is 1. if version not in known_versions_dst: raise UnknownLayoutVersion("RIA store layout version unknown: {}." "Supported versions: {}" .format(version, known_versions_dst)) _ensure_version(io, base_url, version) error_logs = base_url / 'error_logs' io.mkdir(io.url2transport_path(error_logs)) datalad-next-1.4.1/datalad_next/patches/replace_ora_remote.py000066400000000000000000001205341462321624600243560ustar00rootroot00000000000000""" Patch ``datalad.distributed.ora_remote.ORARemote`` This patch replaces the class :class:`datalad.distributed.ora_remote.ORARemote` with an updated version that should work properly on Linux, OSX, and Windows. The main difference to the original code is that all path-operations are performed on URL-paths. Those are represented by instances of `PurePosixPath`. All subclasses of :class:`BaseIO`, i.e. :class:`LocalIO`, :class:`SSHRemoteIO`, and :class:`HTTPRemoteIO`, are extended to contain the method :meth:`url2transport_path`. This method converts an URL-path into the correct path for the transport, i.e. the IO abstraction. Before methods on a subclass of :class:`BaseIO` that require a path are called, the generic URL-path is converted into the correct path for the IO-class by calling :meth:`url2transport_path` on the respective IO-class. The patch keeps changes to the necessary minimum. That means the source is mostly identical to the original. Besides the changes described above, more debug output was added. NOTE: this patch only provides :class:`ORARemote`. The patches that add a :meth:`url2transport_path`-method to :class:`LocalIO` and to :class:`HTTPRemoteIO` are contained in module ``datalad_next.patches.add_method_url2localpath``. The reason to keep them separate is that the patch from module ``datalad_next.patches.replace_create_sibling_ria`` require them as well. For :class:`SSHRemoteIO` the method is included in the patch definition of :class:`SSHRemoteIO`, which is contained in the module ``datalad_next.patches.replace_sshremoteio``. 
""" from __future__ import annotations import re import urllib.parse from pathlib import ( Path, PurePosixPath, ) from shlex import quote as sh_quote import logging from datalad.config import anything2bool from datalad.customremotes import ( ProtocolError, SpecialRemote, ) from datalad.distributed.ora_remote import ( HTTPRemoteIO, LocalIO, RIARemoteError, SSHRemoteIO, _get_datalad_id, _get_gitcfg, handle_errors, NoLayoutVersion, ) from datalad.support.annex_utils import _sanitize_key from datalad.support.annexrepo import AnnexRepo from datalad.customremotes.ria_utils import ( get_layout_locations, UnknownLayoutVersion, verify_ria_url, ) from datalad.utils import on_windows from . import apply_patch lgr = logging.getLogger('datalad.customremotes.ria_remote') drive_letter_matcher = re.compile('^[A-Z]:') slash_drive_letter_matcher = re.compile('^/[A-Z]:') DEFAULT_BUFFER_SIZE = 65536 def canonify_url(url: str | None): """For file URLs on windows: put the drive letter into the path component""" if not on_windows or url is None: return url url_parts = urllib.parse.urlparse(url) if url_parts.scheme not in ('ria+file', 'file'): return url match = drive_letter_matcher.match(url_parts.netloc) if not match: return url return f'{url_parts.scheme}:///{match.string}{url_parts.path}' def de_canonify_url(url: str | None): """For file URLs on windows: put the drive letter into the netloc component""" if not on_windows or url is None: return url url_parts = urllib.parse.urlparse(url) if url_parts.scheme not in ('ria+file', 'file'): return url match = slash_drive_letter_matcher.match(url_parts.path) if not match: return url return f'{url_parts.scheme}://{url_parts.path[1:3]}{url_parts.path[3:]}' # `ORARemote` taken from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732: class ORARemote(SpecialRemote): """This is the class of RIA remotes. """ dataset_tree_version = '1' object_tree_version = '2' # TODO: Move known versions. Needed by creation routines as well. known_versions_objt = ['1', '2'] known_versions_dst = ['1'] @handle_errors def __init__(self, annex): super(ORARemote, self).__init__(annex) if hasattr(self, 'configs'): # introduced in annexremote 1.4.2 to support LISTCONFIGS self.configs['url'] = "RIA store to use" self.configs['push-url'] = "URL for pushing to the RIA store. " \ "Optional." self.configs['archive-id'] = "Dataset ID (fallback: annex uuid. " \ "Should be set automatically by " \ "datalad" # the local repo self._repo = None self.gitdir = None self.name = None # name of the special remote self.gitcfg_name = None # name in respective git remote self.ria_store_url = None self.ria_store_pushurl = None # machine to SSH-log-in to access/store the data # subclass must set this self.storage_host = None self.storage_host_push = None # must be absolute, and POSIX (will be instance of PurePosixPath) # subclass must set this self.store_base_path = None self.store_base_path_push = None # by default we can read and write self.read_only = False self.force_write = None self.ignore_remote_config = None self.remote_log_enabled = None self.remote_dataset_tree_version = None self.remote_object_tree_version = None # for caching the remote's layout locations: self.remote_git_dir = None self.remote_archive_dir = None self.remote_obj_dir = None # lazy IO: self._io = None self._push_io = None # cache obj_locations: self._last_archive_path = None self._last_keypath = (None, None) # SSH "streaming" buffer self.buffer_size = DEFAULT_BUFFER_SIZE # PATCH: add a helper to assert the type of a path. 
@staticmethod def _assert_pure_posix_path(path): assert isinstance(path, PurePosixPath) # PATCH: add a close function to ensure that all IO-abstraction objects are # closed. def close(self): if self._io: self._io.close() self._io = None if self._push_io: self._push_io.close() self._push_io = None def verify_store(self): """Check whether the store exists and reports a layout version we know The layout of the store is recorded in base_path/ria-layout-version. If the version found on the remote end isn't supported and `force-write` isn't configured, sets the remote to read-only operation. """ # THE PATCH: assert path type and perform operation on abstract path self._assert_pure_posix_path(self.store_base_path) dataset_tree_version_file = self.store_base_path / 'ria-layout-version' # check dataset tree version try: self.remote_dataset_tree_version = \ self._get_version_config(dataset_tree_version_file) except Exception as exc: raise RIARemoteError("RIA store unavailable.") from exc if self.remote_dataset_tree_version not in self.known_versions_dst: # Note: In later versions, condition might change in order to # deal with older versions. raise UnknownLayoutVersion(f"RIA store layout version unknown: " f"{self.remote_dataset_tree_version}") def verify_ds_in_store(self): """Check whether the dataset exists in store and reports a layout version we know The layout is recorded in 'dataset_somewhere_beneath_base_path/ria-layout-version.' If the version found on the remote end isn't supported and `force-write` isn't configured, sets the remote to read-only operation. """ object_tree_version_file = self.remote_git_dir / 'ria-layout-version' # check (annex) object tree version try: self.remote_object_tree_version =\ self._get_version_config(object_tree_version_file) except Exception as e: raise RIARemoteError("Dataset unavailable from RIA store.") if self.remote_object_tree_version not in self.known_versions_objt: raise UnknownLayoutVersion(f"RIA dataset layout version unknown: " f"{self.remote_object_tree_version}") def _load_local_cfg(self): # this will work, even when this is not a bare repo # but it is not capable of reading out dataset/branch config self._repo = AnnexRepo(self.gitdir) cfg_map = {"ora-force-write": "force_write", "ora-ignore-ria-config": "ignore_remote_config", "ora-buffer-size": "buffer_size", "ora-url": "ria_store_url", "ora-push-url": "ria_store_pushurl" } # in initremote we may not have a reliable name of the git remote config # yet. Go with the default. gitcfg_name = self.gitcfg_name or self.name if gitcfg_name: for cfg, att in cfg_map.items(): value = self._repo.config.get(f"remote.{gitcfg_name}.{cfg}") if value is not None: self.__setattr__(cfg_map[cfg], value) if cfg == "ora-url": self.ria_store_url_source = 'local' elif cfg == "ora-push-url": self.ria_store_pushurl_source = 'local' if self.buffer_size: try: self.buffer_size = int(self.buffer_size) except ValueError: self.message(f"Invalid value of config " f"'remote.{gitcfg_name}." f"ora-buffer-size': {self.buffer_size}") self.buffer_size = DEFAULT_BUFFER_SIZE if self.name: # Consider deprecated configs if there's no value yet if self.force_write is None: self.force_write = self._repo.config.get( f'annex.ora-remote.{self.name}.force-write') if self.force_write: self.message("WARNING: config " "'annex.ora-remote.{}.force-write' is " "deprecated. 
Use 'remote.{}.ora-force-write' " "instead.".format(self.name, self.gitcfg_name)) try: self.force_write = anything2bool(self.force_write) except TypeError: raise RIARemoteError("Invalid value of config " "'annex.ora-remote.{}.force-write'" ": {}".format(self.name, self.force_write)) if self.ignore_remote_config is None: self.ignore_remote_config = self._repo.config.get( f"annex.ora-remote.{self.name}.ignore-remote-config") if self.ignore_remote_config: self.message("WARNING: config " "'annex.ora-remote.{}.ignore-remote-config' is" " deprecated. Use " "'remote.{}.ora-ignore-ria-config' instead." "".format(self.name, self.gitcfg_name)) try: self.ignore_remote_config = \ anything2bool(self.ignore_remote_config) except TypeError: raise RIARemoteError( "Invalid value of config " "'annex.ora-remote.{}.ignore-remote-config': {}" "".format(self.name, self.ignore_remote_config)) def _load_committed_cfg(self, fail_noid=True): # which repo are we talking about self.gitdir = self.annex.getgitdir() # go look for an ID self.archive_id = self.annex.getconfig('archive-id') if fail_noid and not self.archive_id: # TODO: Message! "archive ID" is confusing. dl-id or annex-uuid raise RIARemoteError( "No archive ID configured. This should not happen.") # what is our uuid? self.uuid = self.annex.getuuid() # RIA store URL(s) self.ria_store_url = self.annex.getconfig('url') if self.ria_store_url: self.ria_store_url_source = 'annex' self.ria_store_pushurl = self.annex.getconfig('push-url') if self.ria_store_pushurl: self.ria_store_pushurl_source = 'annex' # TODO: This should prob. not be done! Would only have an effect if # force-write was committed annex-special-remote-config and this # is likely a bad idea. self.force_write = self.annex.getconfig('force-write') if self.force_write == "": self.force_write = None # Get the special remote name # TODO: Make 'name' a property of `SpecialRemote`; # Same for `gitcfg_name`, `_repo`? self.name = self.annex.getconfig('name') if not self.name: self.name = self.annex.getconfig('sameas-name') if not self.name: # TODO: Do we need to crash? Not necessarily, I think. We could # still find configs and if not - might work out. raise RIARemoteError( "Cannot determine special remote name, got: {}".format( repr(self.name))) # Get the name of the remote entry in .git/config. # Note, that this by default is the same as the stored name of the # special remote, but can be different (for example after # git-remote-rename). The actual connection is the uuid of the special # remote, not the name. try: self.gitcfg_name = self.annex.getgitremotename() except (ProtocolError, AttributeError): # GETGITREMOTENAME not supported by annex version or by annexremote # version. # Lets try to find ourselves: Find remote with matching annex uuid response = _get_gitcfg(self.gitdir, r"^remote\..*\.annex-uuid", regex=True) response = response.splitlines() if response else [] candidates = set() for line in response: k, v = line.split() if v == self.annex.getuuid(): # TODO: Where else? self.uuid? candidates.add(''.join(k.split('.')[1:-1])) num_candidates = len(candidates) if num_candidates == 1: self.gitcfg_name = candidates.pop() elif num_candidates > 1: self.message("Found multiple used remote names in git " "config: %s" % str(candidates)) # try same name: if self.name in candidates: self.gitcfg_name = self.name self.message("Choose '%s'" % self.name) else: self.gitcfg_name = None self.message("Ignore git config") else: # No entry found. # Possible if we are in "initremote". 
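                # (Later code falls back to the special remote name in this
                # case, see _load_local_cfg:
                # `gitcfg_name = self.gitcfg_name or self.name`.)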
self.gitcfg_name = None def _load_cfg(self, gitdir, name): # Whether or not to force writing to the remote. Currently used to # overrule write protection due to layout version mismatch. self.force_write = self._repo.config.get( f'annex.ora-remote.{name}.force-write') # whether to ignore config flags set at the remote end self.ignore_remote_config = \ self._repo.config.get( f'annex.ora-remote.{name}.ignore-remote-config') # buffer size for reading files over HTTP and SSH self.buffer_size = self._repo.config.get( f"remote.{name}.ora-buffer-size") if self.buffer_size: self.buffer_size = int(self.buffer_size) def _verify_config(self, fail_noid=True): # try loading all needed info from (git) config # first load committed config self._load_committed_cfg(fail_noid=fail_noid) # now local configs (possible overwrite of committed) self._load_local_cfg() # PATCH: use canonified URLs self.ria_store_url = canonify_url(self.ria_store_url) self.ria_store_pushurl = canonify_url(self.ria_store_pushurl) # get URL rewriting config url_cfgs = {k: v for k, v in self._repo.config.items() if k.startswith('url.')} if self.ria_store_url: self.storage_host, self.store_base_path, self.ria_store_url = \ verify_ria_url(self.ria_store_url, url_cfgs) else: # There's one exception to the precedence of local configs: # Age-old "ssh-host" + "base-path" configs are only considered, # if there was no RIA URL (local or committed). However, issue # deprecation warning, if that situation is encountered: host = None path = None if self.name: host = self._repo.config.get( f'annex.ora-remote.{self.name}.ssh-host') or \ self.annex.getconfig('ssh-host') # Note: Special value '0' is replaced by None only after checking # the repository's annex config. This is to uniformly handle '0' and # None later on, but let a user's config '0' overrule what's # stored by git-annex. self.storage_host = None if host == '0' else host path = self._repo.config.get( f'annex.ora-remote.{self.name}.base-path') or \ self.annex.getconfig('base-path') self.store_base_path = path.strip() if path else path if path or host: self.message("WARNING: base-path + ssh-host configs are " "deprecated and won't be considered in the future." " Use 'git annex enableremote {} " "url=' to store a ria+:" "//... URL in the special remote's config." "".format(self.name), type='info') if not self.store_base_path: raise RIARemoteError( "No base path configured for RIA store. Specify a proper " "ria+://... URL.") # the base path is ultimately derived from a URL, always treat as POSIX self.store_base_path = PurePosixPath(self.store_base_path) if not self.store_base_path.is_absolute(): raise RIARemoteError( 'Non-absolute RIA store base path configuration: %s' '' % str(self.store_base_path)) if self.ria_store_pushurl: if self.ria_store_pushurl.startswith("ria+http"): raise RIARemoteError("Invalid push-url: {}. Pushing over HTTP " "not implemented." 
"".format(self.ria_store_pushurl)) self.storage_host_push, \ self.store_base_path_push, \ self.ria_store_pushurl = \ verify_ria_url(self.ria_store_pushurl, url_cfgs) self.store_base_path_push = PurePosixPath(self.store_base_path_push) def _get_version_config(self, path): """ Get version and config flags from RIA store's layout file """ if self.ria_store_url: # construct path to ria_layout_version file for reporting # PATCH: use abstract path local_store_base_path = self.store_base_path target_ri = ( self.ria_store_url[4:] + "/" + path.relative_to(local_store_base_path).as_posix() ) elif self.storage_host: target_ri = "ssh://{}{}".format(self.storage_host, path.as_posix()) else: target_ri = path.as_uri() try: # PATCH: convert abstract path to io-specific concrete path file_content = self.io.read_file( self.io.url2transport_path(path) ).strip().split('|') # Note, that we enhance the reporting here, as the IO classes don't # uniformly operate on that kind of RI (which is more informative # as it includes the store base address including the access # method). except FileNotFoundError as exc: raise NoLayoutVersion( f"{target_ri} not found, " f"self.ria_store_url: {self.ria_store_url}, " f"self.store_base_path: {self.store_base_path}, " f"self.store_base_path_push: {self.store_base_path_push}, " f"path: {type(path)} {path}") from exc except PermissionError as exc: raise PermissionError(f"Permission denied: {target_ri}") from exc except Exception as exc: raise RuntimeError(f"Failed to access {target_ri}") from exc if not (1 <= len(file_content) <= 2): self.message("invalid version file {}".format(path), type='info') return None remote_version = file_content[0] remote_config_flags = file_content[1] \ if len(file_content) == 2 else None if not self.ignore_remote_config and remote_config_flags: # Note: 'or', since config flags can come from toplevel # (dataset-tree-root) as well as from dataset-level. # toplevel is supposed flag the entire tree. self.remote_log_enabled = self.remote_log_enabled or \ 'l' in remote_config_flags return remote_version def get_store(self): """checks the remote end for an existing store and dataset Furthermore reads and stores version and config flags, layout locations, etc. If this doesn't raise, the remote end should be fine to work with. """ # make sure the base path is a platform path when doing local IO # the incoming Path object is a PurePosixPath # XXX this else branch is wrong: Incoming is PurePosixPath # but it is subsequently assumed to be a platform path, by # get_layout_locations() etc. Hence it must be converted # to match the *remote* platform, not the local client # cache remote layout directories # PATCH: use the abstract `self.store_base_path` to calculate RIA-store # directory paths. self.remote_git_dir, self.remote_archive_dir, self.remote_obj_dir = \ self.get_layout_locations(self.store_base_path, self.archive_id) read_only_msg = "Treating remote as read-only in order to " \ "prevent damage by putting things into an unknown " \ "version of the target layout. You can overrule this " \ "by setting 'annex.ora-remote..force-write=true'." try: self.verify_store() except UnknownLayoutVersion: reason = "Remote dataset tree reports version {}. Supported " \ "versions are: {}. Consider upgrading datalad or " \ "fix the 'ria-layout-version' file at the RIA store's " \ "root. 
".format(self.remote_dataset_tree_version, self.known_versions_dst) self._set_read_only(reason + read_only_msg) except NoLayoutVersion: reason = "Remote doesn't report any dataset tree version. " \ "Consider upgrading datalad or add a fitting " \ "'ria-layout-version' file at the RIA store's " \ "root." self._set_read_only(reason + read_only_msg) try: self.verify_ds_in_store() except UnknownLayoutVersion: reason = "Remote object tree reports version {}. Supported" \ "versions are {}. Consider upgrading datalad or " \ "fix the 'ria-layout-version' file at the remote " \ "dataset root. " \ "".format(self.remote_object_tree_version, self.known_versions_objt) self._set_read_only(reason + read_only_msg) except NoLayoutVersion: reason = "Remote doesn't report any object tree version. " \ "Consider upgrading datalad or add a fitting " \ "'ria-layout-version' file at the remote " \ "dataset root. " self._set_read_only(reason + read_only_msg) @handle_errors def initremote(self): self._verify_config(fail_noid=False) if not self.archive_id: self.archive_id = _get_datalad_id(self.gitdir) if not self.archive_id: # fall back on the UUID for the annex remote self.archive_id = self.annex.getuuid() self.get_store() self.annex.setconfig('archive-id', self.archive_id) # Make sure, we store the potentially rewritten URL. But only, if the # source was annex as opposed to a local config. if self.ria_store_url and self.ria_store_url_source == 'annex': self.annex.setconfig('url', self.ria_store_url) if self.ria_store_pushurl and self.ria_store_pushurl_source == 'annex': self.annex.setconfig('push-url', self.ria_store_pushurl) def _local_io(self): """Are we doing local operations?""" # let's not make this decision dependent on the existence # of a directory the matches the name of the configured # store tree base dir. Such a match could be pure # coincidence. Instead, let's do remote whenever there # is a remote host configured #return self.store_base_path.is_dir() # TODO: Isn't that wrong with HTTP anyway? # + just isinstance(LocalIO)? # XXX isinstance(LocalIO) would not work, this method is used # before LocalIO is instantiated return not self.storage_host def _set_read_only(self, msg): if not self.force_write: self.read_only = True self.message(msg, type='info') else: self.message("Was instructed to force write", type='info') def _ensure_writeable(self): if self.read_only: raise RIARemoteError("Remote is treated as read-only. " "Set 'ora-remote..force-write=true' to " "overrule this.") if isinstance(self.push_io, HTTPRemoteIO): raise RIARemoteError("Write access via HTTP not implemented") @property def io(self): if not self._io: if self._local_io(): self._io = LocalIO() elif self.ria_store_url.startswith("ria+http"): # TODO: That construction of "http(s)://host/" should probably # be moved, so that we get that when we determine # self.storage_host. In other words: Get the parsed URL # instead and let HTTPRemoteIO + SSHRemoteIO deal with it # uniformly. Also: Don't forget about a possible port. 
url_parts = self.ria_store_url[4:].split('/') # we expect parts: ("http(s):", "", host:port, path) self._io = HTTPRemoteIO( url_parts[0] + "//" + url_parts[2], self.buffer_size ) elif self.storage_host: self._io = SSHRemoteIO(self.storage_host, self.buffer_size) from atexit import register register(self._io.close) else: raise RIARemoteError( "Local object tree base path does not exist, and no SSH" "host configuration found.") return self._io @property def push_io(self): # Instance of an IOBase subclass for execution based on configured # 'push-url' if such exists. Otherwise identical to `self.io`. # Note, that once we discover we need to use the push-url (that is on # TRANSFER_STORE and REMOVE), we should switch all operations to that IO # instance instead of using different connections for read and write # operations. Ultimately this is due to the design of annex' special # remote protocol - we don't know which annex command is running and # therefore we don't know whether to use fetch or push URL during # PREPARE. if not self._push_io: if self.ria_store_pushurl: self.message("switching ORA to push-url") # Not-implemented-push-HTTP is ruled out already when reading # push-url, so either local or SSH: if not self.storage_host_push: # local operation self._push_io = LocalIO() else: self._push_io = SSHRemoteIO(self.storage_host_push, self.buffer_size) # We have a new instance. Kill the existing one and replace. from atexit import register, unregister if hasattr(self.io, 'close'): unregister(self.io.close) self.io.close() # XXX now also READ IO is done with the write IO # this explicitly ignores the remote config # that distinguishes READ from WRITE with different # methods self._io = self._push_io if hasattr(self.io, 'close'): register(self.io.close) self.storage_host = self.storage_host_push self.store_base_path = self.store_base_path_push # delete/update cached locations: self._last_archive_path = None self._last_keypath = (None, None) self.remote_git_dir, \ self.remote_archive_dir, \ self.remote_obj_dir = \ self.get_layout_locations( # PATCH: use abstract path to calculate RIA-store dirs self.store_base_path, self.archive_id ) else: # no push-url: use existing IO self._push_io = self._io return self._push_io @handle_errors def prepare(self): gitdir = self.annex.getgitdir() self._repo = AnnexRepo(gitdir) self._verify_config() self.get_store() # report active special remote configuration/status self.info = { 'store_base_path': str(self.store_base_path), 'storage_host': 'local' if self._local_io() else self.storage_host, } # TODO: following prob. needs hasattr instead: if not isinstance(self.io, HTTPRemoteIO): self.info['7z'] = ("not " if not self.io.get_7z() else "") + \ "available" @handle_errors def transfer_store(self, key, filename): self._ensure_writeable() # we need a file-system compatible name for the key key = _sanitize_key(key) dsobj_dir, archive_path, key_path = self._get_obj_location(key) key_path = dsobj_dir / key_path # PATCH: convert abstract path `key_path` to io-specific concrete path # and use that. transport_key_path = self.push_io.url2transport_path(key_path) if self.push_io.exists(transport_key_path): # if the key is here, we trust that the content is in sync # with the key return self.push_io.mkdir(transport_key_path.parent) # We need to copy to a temp location to let checkpresent fail while the # transfer is still in progress and furthermore not interfere with # administrative tasks in annex/objects. 
# In addition include uuid, to not interfere with parallel uploads from # different clones. transfer_dir = \ self.remote_git_dir / "ora-remote-{}".format(self._repo.uuid) / "transfer" # PATCH: convert abstract path `transfer_dir` to io-specific concrete # path and use that transport_transfer_dir = self.push_io.url2transport_path(transfer_dir) self.push_io.mkdir(transport_transfer_dir) tmp_path = transfer_dir / key # PATCH: convert abstract path `transport_tmp_path` to io-specific # concrete path and use that transport_tmp_path = self.push_io.url2transport_path(tmp_path) try: self.push_io.put(filename, transport_tmp_path, self.annex.progress) # copy done, atomic rename to actual target self.push_io.rename(transport_tmp_path, transport_key_path) except Exception as e: # whatever went wrong, we don't want to leave the transfer location # blocked self.push_io.remove(transport_tmp_path) raise e @handle_errors def transfer_retrieve(self, key, filename): # we need a file-system compatible name for the key key = _sanitize_key(key) dsobj_dir, archive_path, key_path = self._get_obj_location(key) abs_key_path = dsobj_dir / key_path # PATCH: convert abstract path `abs_key_path` to io-specific # concrete path and use that transport_abs_key_path = self.io.url2transport_path(abs_key_path) # sadly we have no idea what type of source gave checkpresent->true # we can either repeat the checks, or just make two opportunistic # attempts (at most) try: self.io.get(transport_abs_key_path, filename, self.annex.progress) except Exception as e1: if isinstance(self.io, HTTPRemoteIO): # no client-side archive access over HTTP # Note: This is intentional, as it would mean one additional # request per key. However, server response to the GET can # consider archives on their end. raise # catch anything and keep it around for a potential re-raise try: # PATCH: convert abstract path `archive_path` to io-specific # concrete path and use that transport_archive_path = self.io.url2transport_path( archive_path ) self.io.get_from_archive( transport_archive_path, key_path, filename, self.annex.progress ) except Exception as e2: # TODO properly report the causes raise RIARemoteError('Failed to obtain key: {}' ''.format([str(e1), str(e2)])) @handle_errors def checkpresent(self, key): # we need a file-system compatible name for the key key = _sanitize_key(key) dsobj_dir, archive_path, key_path = self._get_obj_location(key) abs_key_path = dsobj_dir / key_path # PATCH: convert abstract path `abs_key_path` to io-specific concrete # path and use that transport_abs_key_path = self.io.url2transport_path(abs_key_path) if self.io.exists(transport_abs_key_path): # we have an actual file for this key return True if isinstance(self.io, HTTPRemoteIO): # no client-side archive access over HTTP return False # do not make a careful check whether an archive exists, because at # present this requires an additional SSH call for remote operations # which may be rather slow. 
Instead just try to run 7z on it and let # it fail if no archive is around # TODO honor future 'archive-mode' flag # PATCH: convert abstract path `archive_path` to io-specific concrete # path and use that transport_archive_path = self.io.url2transport_path(archive_path) return self.io.in_archive(transport_archive_path, key_path) @handle_errors def remove(self, key): # we need a file-system compatible name for the key key = _sanitize_key(key) self._ensure_writeable() dsobj_dir, archive_path, key_path = self._get_obj_location(key) key_path = dsobj_dir / key_path # PATCH: convert abstract path `key_path` to io-specific concrete path # and use that transport_key_path = self.push_io.url2transport_path(key_path) if self.push_io.exists(transport_key_path): self.push_io.remove(transport_key_path) key_dir = key_path # remove at most two levels of empty directories for level in range(2): key_dir = key_dir.parent try: # PATCH: convert abstract path `key_dir` to io-specific concrete # path and use that transport_key_dir = self.push_io.url2transport_path(key_dir) self.push_io.remove_dir(transport_key_dir) except Exception: break @handle_errors def getcost(self): # 100 is cheap, 200 is expensive (all relative to Config/Cost.hs) # 100/200 are the defaults for local and remote operations in # git-annex # if we have the object tree locally, operations are cheap (100) # otherwise expensive (200) return '100' if self._local_io() else '200' @handle_errors def whereis(self, key): # we need a file-system compatible name for the key key = _sanitize_key(key) dsobj_dir, archive_path, key_path = self._get_obj_location(key) if isinstance(self.io, HTTPRemoteIO): # display the URL for a request # TODO: method of HTTPRemoteIO # in case of a HTTP remote (unchecked for others), storage_host # is not just a host, but a full URL without a path return f'{self.storage_host}{dsobj_dir}/{key_path}' return str(dsobj_dir / key_path) if self._local_io() \ else '{}: {}:{}'.format( self.storage_host, self.remote_git_dir, sh_quote(str(key_path)), ) @staticmethod def get_layout_locations(base_path, dsid): # PATCH: type of `base_path` is `PurePosixPath` ORARemote._assert_pure_posix_path(base_path) return get_layout_locations(1, base_path, dsid) def _get_obj_location(self, key): # Notes: - Changes to this method may require an update of # ORARemote._layout_version # - archive_path is always the same ATM. However, it might depend # on `key` in the future. Therefore build the actual filename # for the archive herein as opposed to `get_layout_locations`. if not self._last_archive_path: # PATCH: type of `base_path` is `PurePosixPath` self._assert_pure_posix_path(self.remote_archive_dir) self._last_archive_path = self.remote_archive_dir / 'archive.7z' if self._last_keypath[0] != key: if self.remote_object_tree_version == '1': # PATCH: dir-hashes are always in platform format. We convert it # to a platform-specific `Path` and then to `PurePosixPath`. key_dir = PurePosixPath(Path(self.annex.dirhash_lower(key))) # If we didn't recognize the remote layout version, we set to # read-only and promised to at least try and read according to our # current version. So, treat that case as if remote version was our # (client's) version. else: # PATCH: dir-hashes are always in platform format. We convert it # to a platform-specific `Path` and then to `PurePosixPath`. 
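# (illustrative, hypothetical hash value) a dirhash reported as 'Xq\5V'
# on Windows, or 'Xq/5V' on POSIX, parses into Path('Xq', '5V') on the
# respective platform and is then rendered uniformly as
# PurePosixPath('Xq/5V')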
key_dir = PurePosixPath(Path(self.annex.dirhash(key))) # double 'key' is not a mistake, but needed to achieve the exact # same layout as the annex/objects tree # PATCH: use the abstract `key_dir` path self._last_keypath = (key, key_dir / key / key) self._assert_pure_posix_path(self.remote_obj_dir) return self.remote_obj_dir, self._last_archive_path, \ self._last_keypath[1] apply_patch( 'datalad.distributed.ora_remote', None, 'ORARemote', ORARemote, ) datalad-next-1.4.1/datalad_next/patches/replace_sshremoteio.py000066400000000000000000000324141462321624600245620ustar00rootroot00000000000000"""Provide a full replacement of `SSHRemoteIO` First and foremost, this replacement no longer uses the remote shell implementation of the previous version, but is based on `datalad_next.shell`. Moreover, the ``cmd``-argument for the shell ssh-process, is not correct, if ``self.ssh`` is an instance of ``NoMultiplexSSHConnection``. The changes in this patch build the correct ``cmd``-argument by adding additional arguments to ``cmd``, if `self.ssh` is an instance of ``NoMultiplexSSHConnection``. More precisely, the arguments that are required to open a "shell" in a ``NoMultiplexSSHConnection`` are stored in ``NoMultiplexSSHConnection._ssh_open_args`` and not in ``NoMultiplexSSHConnection._ssh_args``. This patch therefore provides arguments from both lists, i.e. from ``_ssh_args`` and ``_ssh_open_args`` in the call that opens a "shell", if ``self.ssh`` is an instance of ``NoMultiplexSSHConnection``. The implementation also no longer assumes that local and remote platform are identical. This patch introduces an actual remote platform/system determination. This patch also adds the method :meth:`url2transport_path`, which is used to convert abstract paths, which are used in the patched RIA/ORA-code, into paths that SSHRemoteIO can operate on. """ from __future__ import annotations from pathlib import ( Path, PurePosixPath, ) from urllib.parse import ( unquote, urlparse, ) from datalad.distributed.ora_remote import ( DEFAULT_BUFFER_SIZE, IOBase, RemoteError, RIARemoteError, contextmanager, functools, on_osx, sh_quote, ssh_manager, stat, ) from datalad.support.sshconnector import NoMultiplexSSHConnection from datalad_next.exceptions import CapturedException from datalad_next.patches import apply_patch from datalad_next.runners import CommandError from datalad_next.shell import ( FixedLengthResponseGeneratorPosix, shell, posix as posix_ops, ) class SSHRemoteIO(IOBase): """IO operation if the object tree is SSH-accessible It doesn't even think about a windows server. """ def __init__(self, ssh_url, buffer_size=DEFAULT_BUFFER_SIZE): """ Parameters ---------- ssh_url : str SSH-accessible host(name) to perform remote IO operations on. buffer_size: int or None The buffer size to be used as the `chunk_size` for communication with the remote shell. """ parsed_url = urlparse(ssh_url) self.url = ssh_url self._remote_system = None # the connection to the remote # we don't open it yet, not yet clear if needed self.ssh = ssh_manager.get_connection( ssh_url, use_remote_annex_bundle=False, ) self.ssh.open() ssh_args = self.ssh._ssh_args if isinstance(self.ssh, NoMultiplexSSHConnection): ssh_args.extend(self.ssh._ssh_open_args) cmd = ['ssh'] + ssh_args + [self.ssh.sshri.as_str()] # we settle on `bash` as a shell. 
It should be around and then we # can count on it cmd.append('bash') # open the remote shell self.servershell_context = shell( cmd, chunk_size=buffer_size, ) self.servershell = self.servershell_context.__enter__() # if the URL had a path, we try to 'cd' into it to make operations on # relative paths intelligible if parsed_url.path: # unquote path real_path = unquote(parsed_url.path) try: self.servershell( f'cd {sh_quote(real_path)}', check=True, ) except Exception as e: # it may be a legit use case to point to a directory that is # not yet existing. Log and continue CapturedException(e) def close(self): if self.servershell_context is None: return self.servershell_context.__exit__(None, None, None) self.servershell_context = None def url2transport_path( self, url_path: PurePosixPath ) -> Path | PurePosixPath: assert isinstance(url_path, PurePosixPath) return url_path @property def remote_system(self): if self._remote_system is None: self._remote_system = self.servershell( "uname -s", check=True ).stdout.strip().decode().casefold() return self._remote_system @contextmanager def ensure_writeable(self, path): """Context manager to get write permission on `path` and restore original mode afterwards. If git-annex ever touched the key store, the keys will be in mode 444 directories, and we need to obtain permission first. Parameters ---------- path: Path path to the target file """ path = sh_quote(str(path)) # remember original mode -- better than to prescribe a fixed mode if self.remote_system == 'darwin': format_option = "-f%Dp" # on macOS this would return decimal representation of mode (same # as python's stat().st_mode conversion = int else: # win is currently ignored anyway format_option = "--format=\"%f\"" # in opposition to the above form for macOS, on debian this would # yield the hexadecimal representation of the mode; hence conversion # needed. 
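# (illustrative) GNU stat prints the raw mode in hex here, e.g. '81a4';
# int('81a4', base=16) == 0o100644, i.e. a regular file with mode 644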
conversion = functools.partial(int, base=16) output = self.servershell( f"stat {format_option} {path}", check=True, ).stdout.decode() mode = conversion(output) if not mode & stat.S_IWRITE: new_mode = oct(mode | stat.S_IWRITE)[-3:] self.servershell(f"chmod {new_mode} {path}", check=True) changed = True else: changed = False try: yield finally: if changed: # restore original mode self.servershell( f"chmod {oct(mode)[-3:]} {path}", # don't fail if path doesn't exist anymore check=False, ) def mkdir(self, path): self.servershell( f'mkdir -p {sh_quote(str(path))}', check=True, ) def symlink(self, target, link_name): self.servershell( f'ln -s {sh_quote(str(target))} {sh_quote(str(link_name))}', check=True, ) def put(self, src, dst, progress_cb): posix_ops.upload( self.servershell, Path(src), PurePosixPath(dst), # the given callback only takes a single int, but posix.upload # gives two (cur, target) -> have an adaptor progress_callback=lambda c, m: progress_cb(c), check=True, ) def get(self, src, dst, progress_cb): posix_ops.download( self.servershell, PurePosixPath(src), Path(dst), # the given callback only takes a single int, but posix.download # gives two (cur, target) -> have an adaptor progress_callback=lambda c, m: progress_cb(c), check=True, ) def rename(self, src, dst): with self.ensure_writeable(dst.parent): self.servershell( f'mv {sh_quote(str(src))} {sh_quote(str(dst))}', check=True, ) def remove(self, path): try: with self.ensure_writeable(path.parent): self.servershell( f'rm {sh_quote(str(path))}', check=True, ) except CommandError as e: raise RIARemoteError( f"Unable to remove {path} " "or to obtain write permission in parent directory.") from e def remove_dir(self, path): with self.ensure_writeable(path.parent): self.servershell( f'rmdir {sh_quote(str(path))}', check=True, ) def exists(self, path): try: self.servershell( f'test -e {sh_quote(str(path))}', check=True, ) return True except CommandError: return False def in_archive(self, archive_path, file_path): if not self.exists(archive_path): return False loc = str(file_path) # query 7z for the specific object location, keeps the output # lean, even for big archives cmd = f'7z l {sh_quote(str(archive_path))} {sh_quote(loc)}' # Note: Currently relies on file_path not showing up in case of failure # including non-existent archive. If need be could be more sophisticated # and called with check=True + catch RemoteCommandFailedError out = self.servershell( cmd, check=False, ).stdout.decode() return loc in out def get_from_archive(self, archive, src, dst, progress_cb): # Note, that as we are in blocking mode, we can't easily fail on the # actual get (that is 'cat'). Therefore check beforehand. if not self.exists(archive): raise RIARemoteError("archive {arc} does not exist." "".format(arc=archive)) # with `7z -slt` we get an info block per file like this # # Path = some.txt # Size = 4 # Packed Size = 8 # Modified = 2024-04-18 14:55:39.2376272 # Attributes = A -rw-rw-r-- # CRC = 5A82FD08 # Encrypted = - # Method = LZMA2:12 # Block = 0 # # we use -scsUTF-8 to be able to match an UTF filename properly, # and otherwise use basic grep/cut to get the integer byte size of # the file to be extracted # size_cmd = \ f'7z -slt -scsUTF-8 l "{archive}" | grep -A9 "Path = {src}" ' \ '| grep "^Size =" | cut -d " " -f 3' res = self.servershell(size_cmd, check=True) nbytes = res.stdout.strip().decode() if not nbytes: raise RIARemoteError( 'Cannot determine archive member size. 
Invalid name?') member_size = int(res.stdout.strip().decode()) cmd = f'7z x -so -- {sh_quote(str(archive))} {sh_quote(str(src))}' resgen = self.servershell.start( cmd, response_generator=FixedLengthResponseGeneratorPosix( self.servershell.stdout, member_size, ), ) bytes_received = 0 with open(dst, 'wb') as target_file: for chunk in resgen: bytes_received += len(chunk) target_file.write(chunk) progress_cb(bytes_received) assert resgen.returncode == 0 if member_size: assert member_size == bytes_received def read_file(self, file_path): cmd = f"cat {sh_quote(str(file_path))}" try: out = self.servershell( cmd, check=True, ).stdout.decode() except CommandError as e: # Currently we don't read stderr. All we know is, we couldn't read. # Try narrowing it down by calling a subsequent exists() if not self.exists(file_path): raise FileNotFoundError(f"{str(file_path)} not found.") from e else: raise RuntimeError(f"Could not read {file_path}") from e return out def write_file(self, file_path, content, mode='w'): if mode == 'w': mode = ">" elif mode == 'a': mode = ">>" else: raise ValueError("Unknown mode '{}'".format(mode)) # it really should read from stdin, but MIH cannot make it happen stdin = content.encode() cmd = f"head -c {len(stdin)} | cat {mode} {sh_quote(str(file_path))}" try: self.servershell( cmd, check=True, stdin=[stdin], ) except CommandError as e: raise RIARemoteError(f"Could not write to {file_path}") from e def get_7z(self): # TODO: To not rely on availability in PATH we might want to use `which` # (`where` on windows) and get the actual path to 7z to reuse in # in_archive() and get(). # Note: `command -v XXX` or `type` might be cross-platform # solution! # However, for availability probing only, it would be sufficient # to just call 7z and see whether it returns zero. try: self.servershell( "7z", check=True, ) return True except CommandError: return False def oraremote_close_io_onclose(self): if self._io: self._io.close() self._io = None if self._push_io: self._push_io.close() self._push_io = None # replace the whole class apply_patch('datalad.distributed.ora_remote', None, 'SSHRemoteIO', SSHRemoteIO) # add close handler that calls the io.close() apply_patch('datalad.distributed.ora_remote', 'ORARemote', 'close', oraremote_close_io_onclose) datalad-next-1.4.1/datalad_next/patches/ria_utils.py000066400000000000000000000176351462321624600225310ustar00rootroot00000000000000"""Patch ria_utils.py tp work with abstract RIA-paths The ORARemote and CreateSiblingRia-patches use an abstract representation of all paths that are related to elements of a RIA-store, e.g. `ria-layout-version` or `ria-object-dir`. This patch adapts `ria_utils.py` to this modification. """ from __future__ import annotations import logging from pathlib import PurePosixPath from datalad.customremotes.ria_utils import ( UnknownLayoutVersion, get_layout_locations, ) from . import apply_patch lgr = logging.getLogger('datalad.customremotes.ria_utils') # The following two blocks of comments and definitions are verbatim copies from # `datalad.cutomremotes.ria_utils` # TODO: Make versions a tuple of (label, description)? # Object tree versions we introduced so far. This is about the layout within a # dataset in a RIA store known_versions_objt = ['1', '2'] # Dataset tree versions we introduced so far. 
This is about the layout of # datasets in a RIA store known_versions_dst = ['1'] # taken from `ria_utils._ensure_version` from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 def ria_utils__ensure_version(io, base_path, version): """Check a store or dataset version and make sure it is declared Parameters ---------- io: SSHRemoteIO or LocalIO base_path: PurePosixPath root path of a store or dataset version: str target layout version of the store (dataset tree) """ # PATCH: ensure that `base_path` is an instance of `PurePosixPath`. assert isinstance(base_path, PurePosixPath) # PATCH: convert abstract `ria-layout-version`-path to concrete IO-specific # path version_file = io.url2transport_path(base_path / 'ria-layout-version') if io.exists(version_file): existing_version = io.read_file(version_file).split('|')[0].strip() if existing_version != version.split('|')[0]: # We have an already existing location with a conflicting version on # record. # Note, that a config flag after pipe symbol is fine. raise ValueError("Conflicting version found at target: {}" .format(existing_version)) else: # already exists, recorded version fits - nothing to do return # Note, that the following does create the base-path dir as well, since # mkdir has parents=True: # PATCH: convert abstract path `base_path` to concrete IO-specific path # before handing it to `mkdir`. io.mkdir(io.url2transport_path(base_path)) io.write_file(version_file, version) # taken from `ria_utils.create_store` from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 def ria_utils_create_store(io, base_path, version): """Helper to create a RIA store Note, that this is meant as an internal helper and part of intermediate RF'ing. Ultimately should lead to dedicated command or option for create-sibling-ria. Parameters ---------- io: SSHRemoteIO or LocalIO Respective execution instance. Note: To be replaced by proper command abstraction base_path: PurePosixPath root url path of the store version: str layout version of the store (dataset tree) """ # PATCH: ensure that `base_path` is an instance of `PurePosixPath`. assert isinstance(base_path, PurePosixPath) # At store level the only version we know as of now is 1. if version not in known_versions_dst: raise UnknownLayoutVersion("RIA store layout version unknown: {}." "Supported versions: {}" .format(version, known_versions_dst)) _ensure_version(io, base_path, version) error_logs = base_path / 'error_logs' # PATCH: convert abstract path `error_logs` to concrete IO-specific path # before handing it to `mkdir`. io.mkdir(io.url2transport_path(error_logs)) # taken from `ria_utils.create_ds_in_store` from datalad-core@864dc4ae24c8aac0ec4003604543b86de4735732 def ria_utils_create_ds_in_store(io, base_path, dsid, obj_version, store_version, alias=None, init_obj_tree=True ): """Helper to create a dataset in a RIA store Note, that this is meant as an internal helper and part of intermediate RF'ing. Ultimately should lead to a version option for create-sibling-ria in conjunction with a store creation command/option. Parameters ---------- io: SSHRemoteIO or LocalIO Respective execution instance. 
Note: To be replaced by proper command abstraction base_path: PurePosixPath root path of the store dsid: str dataset id store_version: str layout version of the store (dataset tree) obj_version: str layout version of the dataset itself (object tree) alias: str, optional alias for the dataset in the store init_obj_tree: bool whether or not to create the base directory for an annex objects tree ( 'annex/objects') """ # PATCH: ensure that `base_path` is an instance of `PurePosixPath`. assert isinstance(base_path, PurePosixPath) # TODO: Note for RF'ing, that this is about setting up a valid target # for the special remote not a replacement for create-sibling-ria. # There's currently no git (bare) repo created. try: # TODO: This is currently store layout version! # Too entangled by current get_layout_locations. dsgit_dir, archive_dir, dsobj_dir = \ get_layout_locations(int(store_version), base_path, dsid) except ValueError as e: raise UnknownLayoutVersion(str(e)) if obj_version not in known_versions_objt: raise UnknownLayoutVersion("Dataset layout version unknown: {}. " "Supported: {}" .format(obj_version, known_versions_objt)) _ensure_version(io, dsgit_dir, obj_version) # PATCH: convert abstract path `archive_dir` to concrete IO-specific path # before handing it to `mkdir`. io.mkdir(io.url2transport_path(archive_dir)) if init_obj_tree: # PATCH: convert abstract path `dsobj_dir` to concrete IO-specific path # before handing it to `mkdir`. io.mkdir(io.url2transport_path(dsobj_dir)) if alias: alias_dir = base_path / "alias" # PATCH: convert abstract path `alias_dir` to concrete IO-specific path # before handing it to `mkdir`. io.mkdir(io.url2transport_path(alias_dir)) try: # go for a relative path to keep the alias links valid # when moving a store io.symlink( # PATCH: convert abstract relative path to concrete IO-specific # path before handing it to `symlink`. io.url2transport_path( PurePosixPath('..') / dsgit_dir.relative_to(base_path) ), # PATCH: convert abstract alias-path to concrete IO-specific path # before handing it to `symlink`. io.url2transport_path(alias_dir / alias) ) except FileExistsError: lgr.warning("Alias %r already exists in the RIA store, not adding an " "alias.", alias) _ensure_version = ria_utils__ensure_version # Overwrite `create_store` to handle paths properly apply_patch( 'datalad.customremotes.ria_utils', None, 'create_store', ria_utils_create_store, ) # Overwrite `create_ds_in_store` to handle paths properly apply_patch( 'datalad.customremotes.ria_utils', None, 'create_ds_in_store', ria_utils_create_ds_in_store, ) # Overwrite `_ensure_version` to handle paths properly apply_patch( 'datalad.customremotes.ria_utils', None, '_ensure_version', ria_utils__ensure_version, ) datalad-next-1.4.1/datalad_next/patches/run.py000066400000000000000000000057731462321624600213420ustar00rootroot00000000000000"""Enhance ``run()`` placeholder substitutions to honor configuration defaults Previously, ``run()`` would not recognize configuration defaults for placeholder substitution. This means that any placeholders globally declared in ``datalad.interface.common_cfg``, or via ``register_config()`` in DataLad extensions would not be effective. This patch makes run's ``format_command()`` helper include such defaults explicitly, and thereby enable the global declaration of substitution defaults. Moreoever a ``{python}`` placeholder is now defined via this mechanism, and points to the value of ``sys.executable`` by default. 
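For example (with a hypothetical script path), a portable call could be recorded as ``datalad run '{python} code/analysis.py'``, which substitutes the configured interpreter when the command is (re)run.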
This particular placeholder was found to be valuable for improving the portability of run-recording across (specific) Python versions, or across different (virtual) environments. See https://github.com/datalad/datalad-container/issues/224 for an example use case. https://github.com/datalad/datalad/pull/7509 """ from itertools import filterfalse import sys from datalad.core.local.run import ( GlobbedPaths, SequenceFormatter, normalize_command, quote_cmdlinearg, ) from datalad.interface.common_cfg import definitions as cfg_defs from datalad.support.constraints import EnsureStr from datalad.support.extensions import register_config from . import apply_patch # This function is taken from datalad-core@a96c51c0b2794b2a2b4432ec7bd51f260cb91a37 # datalad/core/local/run.py # The change has been proposed in https://github.com/datalad/datalad/pull/7509 def format_command(dset, command, **kwds): """Plug in placeholders in `command`. Parameters ---------- dset : Dataset command : str or list `kwds` is passed to the `format` call. `inputs` and `outputs` are converted to GlobbedPaths if necessary. Returns ------- formatted command (str) """ command = normalize_command(command) sfmt = SequenceFormatter() cprefix = 'datalad.run.substitutions.' def not_subst(x): return not x.startswith(cprefix) for k in set(filterfalse(not_subst, cfg_defs.keys())).union( filterfalse(not_subst, dset.config.keys())): v = dset.config.get( k, # pull a default from the config definitions # if we have no value, but a key cfg_defs.get(k, {}).get('default', None)) sub_key = k.replace(cprefix, "") if sub_key not in kwds: kwds[sub_key] = v for name in ["inputs", "outputs"]: io_val = kwds.pop(name, None) if not isinstance(io_val, GlobbedPaths): io_val = GlobbedPaths(io_val, pwd=kwds.get("pwd")) kwds[name] = list(map(quote_cmdlinearg, io_val.expand(dot=False))) return sfmt.format(command, **kwds) apply_patch( 'datalad.core.local.run', None, 'format_command', format_command) register_config( 'datalad.run.substitutions.python', 'Substitution for {python} placeholder', description='Path to a Python interpreter executable', type=EnsureStr(), default=sys.executable, dialog='question', ) datalad-next-1.4.1/datalad_next/patches/siblings.py000066400000000000000000000045701462321624600223420ustar00rootroot00000000000000"""Auto-deploy credentials when enabling special remotes This is the companion of the ``annexRepo__enable_remote`` patch, and simply removes the webdav-specific credential handling in ``siblings()``. It is no longer needed, because credential deployment moved to a lower layer, covering more special remote types. Manual credential entry on ``enableremote`` is not implemented here, but easily possible following the patterns from `datalad-annex::` and ``create_sibling_webdav()`` """ import logging from datalad_next.datasets import LegacyAnnexRepo as AnnexRepo from datalad.support.exceptions import ( AccessDeniedError, AccessFailedError, CapturedException, ) from . 
import apply_patch # use same logger as -core lgr = logging.getLogger('datalad.distribution.siblings') # This function is taken from datalad-core@2ed709613ecde8218a215dcb7d74b4a352825685 # datalad/distribution/siblings.py # Changes # - removed credential lookup for webdav-remotes # - exception logging via CapturedException def _enable_remote(ds, repo, name, res_kwargs, **unused_kwargs): result_props = dict( action='enable-sibling', path=ds.path, type='sibling', name=name, **res_kwargs) if not isinstance(repo, AnnexRepo): yield dict( result_props, status='impossible', message='cannot enable sibling of non-annex dataset') return if name is None: yield dict( result_props, status='error', message='require `name` of sibling to enable') return # get info on special remote sp_remotes = { v['name']: dict(v, uuid=k) for k, v in repo.get_special_remotes().items() } remote_info = sp_remotes.get(name, None) if remote_info is None: yield dict( result_props, status='impossible', message=("cannot enable sibling '%s', not known", name)) return try: repo.enable_remote(name) result_props['status'] = 'ok' except (AccessDeniedError, AccessFailedError) as e: CapturedException(e) result_props['status'] = 'error' # TODO should use proper way of injecting exceptions in result records result_props['message'] = str(e) yield result_props apply_patch( 'datalad.distribution.siblings', None, '_enable_remote', _enable_remote) datalad-next-1.4.1/datalad_next/patches/ssh_exec.py000066400000000000000000000047351462321624600223340ustar00rootroot00000000000000"""Enable SSH-based remote command execution on Windows This change introduces a replacement for core's ``datalad/support/sshconnector.py:BaseSSHConnection._exec_ssh()`` with a dedicated handling of ``stdin`` for Windows. The OpenSSH client in Windows modifies its ``stdin``-descriptor in such a way, that it becomes unusable for the python process, if the ``stdin``-descriptor is shared between the ``python``-process and the ``ssh``-process. As a result, all read-operations that the ``python``-process performs on ``stdin`` will block and leave the python-process "hanging". This change passes an explicit, empty, byte-string as ``stdin`` to the SSH client call, in order to avoid any interaction of SSH with the parent process's ``stdin`` descriptor. """ import logging from datalad.support.sshconnector import ( StdOutErrCapture, NoCapture, ) from datalad_next.patches import apply_patch from datalad_next.utils import on_windows # use same logger as -core lgr = logging.getLogger('datalad.support.sshconnector') # This method interface/original implementation is taken from # datalad-core@58b8e06317fe1a03290aed80526bff1e2d5b7797 # datalad/support/sshconnector.py:BaseSSHConnection def _exec_ssh(self, ssh_cmd, cmd, options=None, stdin=None, log_output=True): cmd = self._adjust_cmd_for_bundle_execution(cmd) for opt in options or []: ssh_cmd.extend(["-o", opt]) # THIS IS THE PATCH if on_windows and stdin is None: # SSH on windows requires a special stdin handling. If we'd let # stdin=None do its normal thing, the Python process would hang, # because it looses touch with its own file descriptor. # See https://github.com/datalad/datalad-ria/issues/68 stdin = b'' # build SSH call, feed remote command as a single last argument # whatever it contains will go to the remote machine for execution # we cannot perform any sort of escaping, because it will limit # what we can do on the remote, e.g. 
concatenate commands with '&&' ssh_cmd += [self.sshri.as_str()] + [cmd] lgr.debug("%s is used to run %s", self, ssh_cmd) # TODO: pass expect parameters from above? # Hard to explain to toplevel users ... So for now, just set True out = self.runner.run( ssh_cmd, protocol=StdOutErrCapture if log_output else NoCapture, stdin=stdin) return out['stdout'], out['stderr'] apply_patch( 'datalad.support.sshconnector', 'BaseSSHConnection', '_exec_ssh', _exec_ssh, ) datalad-next-1.4.1/datalad_next/patches/sshconnector.py000066400000000000000000000103521462321624600232330ustar00rootroot00000000000000"""Provide proper arguments for scp-command calls in `SSHConnection` The original code has errors in the methods ``BaseSSHConnection.put`` ``BaseSSHConnection.get``. Both methods use ``self.sshri.hostname`` to determine the target for an ``scp``-command. They should instead use ``self.sshri.as_str()`` in order to include a user specification into the target. The changes in this patch use ``self.sshri.as_str()`` to provide the correct targets for ``scp``-commands. """ import logging from datalad.support.sshconnector import ( StdOutErrCapture, ensure_list, ) from datalad_next.patches import apply_patch # use same logger as -core lgr = logging.getLogger('datalad.support.sshconnector') # The method 'BaseSSHConnection_get' is a patched version of # 'datalad/support/sshconnector.py:BaseSSHConnection.get' # from datalad@e0b357d9b8ca5f432638c23c0cb7c373028c8e52 def BaseSSHConnection_get(self, source, destination, recursive=False, preserve_attrs=False): """Copies source file/folder from remote to a local destination. Note: this method performs escaping of filenames to an extent that moderately weird ones should work (spaces, quotes, pipes, other characters with special shell meaning), but more complicated cases might require appropriate external preprocessing of filenames. Parameters ---------- source : str or list file/folder path(s) to copy from the remote host destination : str file/folder path to copy to on the local host recursive : bool flag to enable recursive copying of given sources preserve_attrs : bool preserve modification times, access times, and modes from the original file Returns ------- str stdout, stderr of the copy operation. """ # make sure we have an open connection, will test if action is needed # by itself self.open() scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs) # add source filepath(s) to scp command, prefixed with the remote host # PATCH in the line below: replaces `self.sshri.hostname` with `self.sshri.as_str()` scp_cmd += ["%s:%s" % (self.sshri.as_str(), self._quote_filename(s)) for s in ensure_list(source)] # add destination path scp_cmd += [destination] out = self.runner.run(scp_cmd, protocol=StdOutErrCapture) return out['stdout'], out['stderr'] # The method 'BaseSSHConnection_put' is a patched version of # 'datalad/support/sshconnector.py:BaseSSHConnection.put' # from datalad@e0b357d9b8ca5f432638c23c0cb7c373028c8e52 def BaseSSHConnection_put(self, source, destination, recursive=False, preserve_attrs=False): """Copies source file/folder to destination on the remote. Note: this method performs escaping of filenames to an extent that moderately weird ones should work (spaces, quotes, pipes, other characters with special shell meaning), but more complicated cases might require appropriate external preprocessing of filenames. 
Parameters ---------- source : str or list file/folder path(s) to copy from on local destination : str file/folder path to copy to on remote recursive : bool flag to enable recursive copying of given sources preserve_attrs : bool preserve modification times, access times, and modes from the original file Returns ------- str stdout, stderr of the copy operation. """ # make sure we have an open connection, will test if action is needed # by itself self.open() scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs) # add source filepath(s) to scp command scp_cmd += ensure_list(source) # add destination path scp_cmd += ['%s:%s' % ( # PATCH in the line below: replaces `self.sshri.hostname` with `self.sshri.as_str()` self.sshri.as_str(), self._quote_filename(destination), )] out = self.runner.run(scp_cmd, protocol=StdOutErrCapture) return out['stdout'], out['stderr'] apply_patch( modname='datalad.support.sshconnector', objname='BaseSSHConnection', attrname='get', patch=BaseSSHConnection_get, ) apply_patch( modname='datalad.support.sshconnector', objname='BaseSSHConnection', attrname='put', patch=BaseSSHConnection_put, ) datalad-next-1.4.1/datalad_next/patches/test_keyring.py000066400000000000000000000014731462321624600232360ustar00rootroot00000000000000"""Recognize DATALAD_TESTS_TMP_KEYRING_PATH to set alternative secret storage Within `pytest` DataLad uses the plaintext keyring backend. This backend has no built-in way to configure a custom file location for secret storage from the outside. This patch looks for a DATALAD_TESTS_TMP_KEYRING_PATH environment variable, and uses its value as a file path for the storage. This makes it possible to (temporarily) switch storage. This feature is used by the ``tmp_keyring`` pytest fixture. This patch is needed in addition to the test fixture in order to apply such changes also to child processes, such as special remotes and git remotes. 
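For example, a test run could be pointed at a throw-away secret store by exporting ``DATALAD_TESTS_TMP_KEYRING_PATH=/tmp/test-keyring.cfg`` (hypothetical path) before invoking ``pytest``.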
""" from os import environ if 'DATALAD_TESTS_TMP_KEYRING_PATH' in environ: import keyring kr = keyring.get_keyring() kr.file_path = environ['DATALAD_TESTS_TMP_KEYRING_PATH'] datalad-next-1.4.1/datalad_next/patches/tests/000077500000000000000000000000001462321624600213125ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/patches/tests/__init__.py000066400000000000000000000000001462321624600234110ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/patches/tests/test_add_method_url2transport_path.py000066400000000000000000000031071462321624600307510ustar00rootroot00000000000000from pathlib import ( Path, PurePosixPath, ) from ..add_method_url2transport_path import ( local_io_url2transport_path, http_remote_io_url2transport_path, ) from datalad.utils import on_windows from datalad.tests.utils_pytest import skip_if @skip_if(on_windows) def test_local_io_url2transport_path_posix(): for url, transport_path in ( ('/a/b/c', '/a/b/c'), ('/C:/a/b/c', '/C:/a/b/c'), ('C:/a/b/c', 'C:/a/b/c'), ): assert local_io_url2transport_path( None, PurePosixPath(url) ) == Path(transport_path) @skip_if(not on_windows) def test_local_io_url2transport_path_windows(monkeypatch): monkeypatch.setattr( 'datalad_next.patches.add_method_url2transport_path.on_windows', True, ) warnings = [] monkeypatch.setattr( 'datalad_next.patches.add_method_url2transport_path.lgr.warning', lambda x: warnings.append(x), ) for url, transport_path in ( ('/a/b/c', '/a/b/c'), ('C:/a/b/c', 'C:/a/b/c'), ('C:a/b/c', 'C:a/b/c'), ('/C:a/b/c', 'C:a/b/c'), ('/C:/a/b/c', 'C:/a/b/c'), ): assert local_io_url2transport_path( None, PurePosixPath(url) ) == Path(transport_path) assert len(warnings) > 0 def test_http_remote_io_url2transport_path(): for url in ('a/b/c', '/a/b/c', '/C:/a/b/c', '/C:a/b/c', 'C:/a/b/c'): assert http_remote_io_url2transport_path( None, PurePosixPath(url) ) == PurePosixPath(url) datalad-next-1.4.1/datalad_next/patches/tests/test_annex_progress_logging.py000066400000000000000000000044531462321624600274740ustar00rootroot00000000000000from datalad_next.tests.marker import skipif_no_network @skipif_no_network def test_uncurl_progress_reporting_to_annex(existing_dataset, monkeypatch): """Set up a repo that is used to download a key, check that we see progress reports """ repo = existing_dataset.repo # enable uncurl to get a datalad code piece generate progress info repo.call_annex([ 'initremote', 'uncurl', 'type=external', 'externaltype=uncurl', 'encryption=none', ]) # 1.7MB download, should never change testfilekey = 'MD5E-s1725572--3f9f0f5c05517686c008115a611586b1.zip' testfileurl = \ 'https://github.com/datalad/datalad/archive/refs/tags/0.18.3.zip' testfilename = 'datalad.zip' # register the key in the dataset with the source URL repo.call_annex(['registerurl', testfilekey, testfileurl]) # record the key to be available from uncurl uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid'] repo.call_annex(['setpresentkey', testfilekey, uncurl_uuid, '1']) # place key in worktree (not strictly required, but a more common setup) repo.call_annex(['fromkey', '--force', testfilekey, testfilename]) # intercept progress logs in this process. in order for progress reports # to appear here, uncurl needs to report them to git-annex, and our runner # setup needs to catch them and call `log_progress`. So this really is an # end-to-end test. 
logs = [] # patch the log_progress() used in annexrepo.py def catch_progress(*args, **kwargs): logs.append(kwargs) import datalad.support.annexrepo monkeypatch.setattr( datalad.support.annexrepo, "log_progress", catch_progress, ) # trigger a download. use git-annex directly such that there is # little chance that the uncurl remote process is talking to a # datalad parent process directly repo._call_annex_records( args=['get'], files=[testfilename], progress=True, total_nbytes=1725572, ) # check that we got the progress init report assert any('total' in log for log in logs) # and at least one progress update -- do not check for more, because # on fast systems this may take very little time assert any('update' in log for log in logs) datalad-next-1.4.1/datalad_next/patches/tests/test_cli_configoverrides.py000066400000000000000000000011471462321624600267450ustar00rootroot00000000000000from datalad_next.utils import chpwd from datalad_next.tests import run_main def test_cli_configoverrides(existing_dataset): # test whether a `datalad -c ...` is effective within the # execution environment of a subprocess (for a non-datalad # configuration item with chpwd(existing_dataset.path): out, err = run_main( [ '-c', 'bogusdataladtestsec.subsec=unique', 'run', 'git config bogusdataladtestsec.subsec', ], # git-config would fail, if the config item is unknown exit_code=0, ) datalad-next-1.4.1/datalad_next/patches/tests/test_commanderror.py000066400000000000000000000013211462321624600254100ustar00rootroot00000000000000import pytest from datalad_next.exceptions import CommandError def test_repr_str(): # standard case of a command that failed with non-zero exit # many git/git-annex plumbing commands purposefully signal # statuses like this e = CommandError('some command', code=1) assert 'some command' in str(e) assert 'some command' in repr(e) def test_returncode_code_alias(): # check that `returncode` is an alias for `code` e = CommandError('some command', code=1) assert e.returncode == 1 e.returncode = 2 assert e.returncode == 2 assert e.code == 2 with pytest.raises(AttributeError): assert e.xyz == 3 with pytest.raises(AttributeError): e._aliases = 1 datalad-next-1.4.1/datalad_next/patches/tests/test_configuration.py000066400000000000000000000034061462321624600255750ustar00rootroot00000000000000from datalad_next.tests import ( assert_in_results, assert_raises, ) from datalad_next.utils import chpwd from datalad.api import configuration from datalad_next.exceptions import IncompleteResultsError # run all -core tests from datalad.local.tests.test_configuration import * def test_config_get_global(existing_dataset, tmp_path, no_result_rendering): """Make sure `get` does not require a dataset to be present""" # enter a tempdir to be confident that there is no dataset around with chpwd(str(tmp_path)): res = configuration('get', 'user.name') assert_in_results( res, name='user.name', status='ok', ) # verify that the dataset method was replaced too assert "'get' action can be constrained" \ in existing_dataset.configuration.__doc__ def test_getset_None(tmp_path, no_result_rendering): # enter a tempdir to be confident that there is no dataset around with chpwd(str(tmp_path)): # set an empty string, this is not the same as `None` configuration('set', 'some.item=', scope='global') assert_in_results( configuration('get', 'some.item'), value='', ) # an unset config item is equivalent to `None` configuration('unset', 'some.item', scope='global'), # retrieving an unset item triggers an exception ... 
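# (grounded in the checks below) the 'impossible' result counts as a
# failure, so the call raises IncompleteResultsError unless
# on_failure='ignore' is passed, in which case the result record is
# returned instead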
assert_raises( IncompleteResultsError, configuration, 'get', 'some.item') # ... because the status of the respective result is "impossible" assert_in_results( configuration('get', 'some.item', on_failure='ignore'), value=None, status='impossible', ) datalad-next-1.4.1/datalad_next/patches/tests/test_create_sibling_ghlike.py000066400000000000000000000026711462321624600272260ustar00rootroot00000000000000# ex: set sts=4 ts=4 sw=4 et: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## # # See COPYING file distributed along with the datalad package for the # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## """Test create publication target on Github-like platforms""" from datalad.distributed.tests.test_create_sibling_ghlike import * from datalad.distributed.tests.test_create_sibling_gin import * from datalad.distributed.tests.test_create_sibling_gitea import * from datalad.distributed.tests.test_create_sibling_github import * from datalad.distributed.tests.test_create_sibling_gogs import * # we overwrite this one from core, because it assumed the old credential # system to be used def test_invalid_call(dataset, existing_dataset, no_result_rendering): # no dataset assert_raises(ValueError, dataset.create_sibling_gin, 'bogus') ds = existing_dataset # unsupported name assert_raises( ValueError, ds.create_sibling_gin, 'bo gus', credential='some') # conflicting sibling name ds.siblings('add', name='gin', url='http://example.com') res = ds.create_sibling_gin( 'bogus', name='gin', credential='some', on_failure='ignore', dry_run=True) assert_status('error', res) assert_in_results( res, status='error', message=('already has a configured sibling "%s"', 'gin')) datalad-next-1.4.1/datalad_next/patches/tests/test_create_sibling_gitlab.py000066400000000000000000000414171462321624600272260ustar00rootroot00000000000000import pytest import os # this must import ok with and without gitlab from datalad.api import ( Dataset, create, create_sibling_gitlab, ) from datalad.tests.utils_pytest import ( assert_repo_status, assert_result_count, assert_status, ) from datalad.utils import chpwd @pytest.fixture(autouse=False, scope="function") def nested_collections(tmp_path, no_result_rendering): ds = Dataset(tmp_path).create() c1 = ds.create(ds.pathobj / 'subdir' / 'collection1') c1s1 = c1.create('sub1') c1s2 = c1.create('sub2') c2 = ds.create('collection2') c2s1 = c2.create('sub1') c2s11 = c2s1.create('deepsub1') ds.save(recursive=True) assert_repo_status(ds.path) # return a catalog return dict( root=ds, c1=c1, c1s1=c1s1, c1s2=c1s2, c2=c2, c2s1=c2s1, c2s11=c2s11, ) # doesn't actually need gitlab and exercises most of the decision logic def test_dryrun(nested_collections): ctlg = nested_collections # no site config -> error with pytest.raises(ValueError): ctlg['root'].create_sibling_gitlab() # wrong path specification -> impossible result res = ctlg['root'].create_sibling_gitlab( dry_run=True, on_failure='ignore', site='dummy', path='imaghost' ) assert_result_count(res, 1) assert_result_count( res, 1, path=ctlg['root'].pathobj / 'imaghost', type='dataset', status='impossible') # single project vs multi-dataset call with pytest.raises(ValueError): ctlg['root'].create_sibling_gitlab( site='site', project='one', recursive=True) with pytest.raises(ValueError): ctlg['root'].create_sibling_gitlab( site='site', project='one', path=['one', 'two']) # explicit cite, no path constraints, fails for lack of project path config res = 
ctlg['root'].create_sibling_gitlab( dry_run=True, on_failure='ignore', site='dummy', ) assert_result_count(res, 1) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='error', site='dummy', sibling='dummy', ) # now a working, fully manual call for p in (None, ctlg['root'].path): res = ctlg['root'].create_sibling_gitlab( dry_run=True, on_failure='ignore', site='dummy', project='here', path=p, ) assert_result_count(res, 1) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='ok', site='dummy', sibling='dummy', project='here/project', ) # now configure a default gitlab site ctlg['root'].config.set('datalad.gitlab-default-site', 'theone') # we don't need to specify one anymore, but we can still customize # the sibling name res = ctlg['root'].create_sibling_gitlab( dry_run=True, on_failure='ignore', name='ursula', project='here', ) assert_result_count(res, 1) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='ok', site='theone', sibling='ursula', project='here/project', ) # now configure a sibling name for this site ctlg['root'].config.set('datalad.gitlab-theone-siblingname', 'dieter') # and another one for another site ctlg['root'].config.set('datalad.gitlab-otherone-siblingname', 'ulf') # no need to specific 'name' anymore res = ctlg['root'].create_sibling_gitlab( dry_run=True, on_failure='ignore', project='here', ) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='ok', site='theone', sibling='dieter', project='here/project', ) # properly switches the name based on site res = ctlg['root'].create_sibling_gitlab( dry_run=True, on_failure='ignore', site='otherone', project='here', ) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='ok', site='otherone', sibling='ulf', project='here/project', ) # reports notneeded on existing='skip' with an existing remote ctlg['root'].repo.add_remote('dieter', 'http://example.com') res = ctlg['root'].create_sibling_gitlab( dry_run=True, on_failure='ignore', project='here', existing='skip', ) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='notneeded', site='theone', sibling='dieter', ) ctlg['root'].repo.remove_remote('dieter') # lastly, configure a project path ctlg['root'].config.set('datalad.gitlab-theone-project', 'secret') # now we can drive it blind res = ctlg['root'].create_sibling_gitlab(dry_run=True) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='ok', site='theone', sibling='dieter', project='secret/project', ) # we can make use of the config in the base dataset to drive # calls on subdatasets: use -d plus a path res = ctlg['root'].create_sibling_gitlab(path='subdir', dry_run=True) # only a single result, doesn't touch the parent assert_result_count(res, 1) assert_result_count( res, 1, path=ctlg['c1'].path, type='dataset', status='ok', site='theone', sibling='dieter', # collection setup: superdataset becomes group name and "project" # project underneath, subdirectories and subdatasets are projects # with path separators replaced underneath the group. 
project='secret/{}'.format(str( ctlg['c1'].pathobj.relative_to(ctlg['root'].pathobj)).replace( os.sep, '-')), ) # we get the same result with an explicit layout request expl_res = ctlg['root'].create_sibling_gitlab( path='subdir', layout='collection', dry_run=True) assert res == expl_res # layout can be configured too, "collection" is "flat" in a group ctlg['root'].config.set('datalad.gitlab-theone-layout', 'collection') res = ctlg['root'].create_sibling_gitlab( path='subdir', dry_run=True) assert_result_count( res, 1, path=ctlg['c1'].path, type='dataset', status='ok', # http://site/group/dir-dir-dir-name.git project='secret/{}'.format(str( ctlg['c1'].pathobj.relative_to(ctlg['root'].pathobj)).replace( os.sep, '-')), ) # make sure the reference dataset does not conflict with its group in this # case res = ctlg['root'].create_sibling_gitlab(dry_run=True) assert_result_count( res, 1, path=ctlg['root'].path, type='dataset', status='ok', project='secret/project') # "flat" does GitHub-style ctlg['root'].config.set('datalad.gitlab-theone-layout', 'flat') res = ctlg['root'].create_sibling_gitlab( path='subdir', dry_run=True) assert_result_count( res, 1, path=ctlg['c1'].path, type='dataset', status='ok', # http://site/base-dir-dir-dir-name.git project='secret-{}'.format(str( ctlg['c1'].pathobj.relative_to(ctlg['root'].pathobj)).replace( os.sep, '-')), ) # the results do not depend on explicitly given datasets, if we just enter # the parent dataset we get the same results with chpwd(str(ctlg['root'].pathobj / 'subdir')): rel_res = create_sibling_gitlab(path=os.curdir, dry_run=True) assert res == rel_res # and again the same results if we are in a subdataset and point to a parent # dataset as a reference and config provider with chpwd(ctlg['c1'].path): rel_res = create_sibling_gitlab( dataset=ctlg['root'].path, path=os.curdir, dry_run=True) assert res == rel_res # blows on unknown layout ctlg['root'].config.unset('datalad.gitlab-theone-layout') with pytest.raises(ValueError): ctlg['root'].create_sibling_gitlab(layout='funny', dry_run=True) # and finally recursion res = ctlg['root'].create_sibling_gitlab(recursive=True, dry_run=True) # one result per dataset assert_result_count(res, len(ctlg)) # verbose check of target layout (easier to see target pattern for humans) # default layout: collection expected_collection_res = [ 'secret/collection2', 'secret/collection2-sub1', 'secret/collection2-sub1-deepsub1', 'secret/project', 'secret/subdir-collection1', 'secret/subdir-collection1-sub1', 'secret/subdir-collection1-sub2', ] assert sorted(r['project'] for r in res) == expected_collection_res # should be the same when explicitly requested res = ctlg['root'].create_sibling_gitlab( recursive=True, layout='collection', dry_run=True) assert_result_count(res, len(ctlg)) assert sorted(r['project'] for r in res) == expected_collection_res res = ctlg['root'].create_sibling_gitlab( recursive=True, layout='flat', dry_run=True) assert_result_count(res, len(ctlg)) assert sorted(r['project'] for r in res) == \ [ 'secret', 'secret-collection2', 'secret-collection2-sub1', 'secret-collection2-sub1-deepsub1', 'secret-subdir-collection1', 'secret-subdir-collection1-sub1', 'secret-subdir-collection1-sub2', ] # test that the configurations work ctlg['root'].config.set("datalad.gitlab-default-projectname", 'myownname') ctlg['c1s1'].config.set("datalad.gitlab-default-pathseparator", '+') res = ctlg['root'].create_sibling_gitlab( recursive=True, layout='flat', dry_run=True) assert_result_count(res, len(ctlg)) assert 
sorted(r['project'] for r in res) == \ [ 'secret', 'secret-collection2', 'secret-collection2-sub1', 'secret-collection2-sub1-deepsub1', 'secret-subdir+collection1+sub1', 'secret-subdir-collection1', 'secret-subdir-collection1-sub2', ] res = ctlg['root'].create_sibling_gitlab( recursive=True, layout='collection', dry_run=True) assert_result_count(res, len(ctlg)) assert sorted(r['project'] for r in res) == \ [ 'secret/collection2', 'secret/collection2-sub1', 'secret/collection2-sub1-deepsub1', 'secret/myownname', 'secret/subdir+collection1+sub1', 'secret/subdir-collection1', 'secret/subdir-collection1-sub2', ] class _FakeGitLab(object): def __init__(self, site): pass class _NewProjectGitLab(_FakeGitLab): def get_project(self, path): return None def create_project(self, path, description=None): return dict( http_url_to_repo='http://example.com', ssh_url_to_repo='example.com', description=description, ) class _ExistingProjectGitLab(_FakeGitLab): def get_project(self, path): return dict( http_url_to_repo='http://example.com', ssh_url_to_repo='example.com', ) class _ExistingProjectOtherURLGitLab(_FakeGitLab): def get_project(self, path): return dict( http_url_to_repo='http://example2.com', ssh_url_to_repo='example2.com', ) class _CreateFailureGitLab(_FakeGitLab): def get_project(self, path): None def create_project(self, path, description=None): raise RuntimeError def test_fake_gitlab(tmp_path, monkeypatch, no_result_rendering): path = str(tmp_path) ds = Dataset(path).create() import datalad_next.patches.create_sibling_gitlab as glpatch with monkeypatch.context() as m: m.setattr(glpatch, 'GitLabSite', _NewProjectGitLab) res = ds.create_sibling_gitlab(site='dummy', project='here', description='thisisit') assert_result_count(res, 2) # GitLab success assert_result_count( res, 1, action='create_sibling_gitlab', path=path, type='dataset', site='dummy', sibling='dummy', project='here/project', description='thisisit', project_attributes={ 'http_url_to_repo': 'http://example.com', 'ssh_url_to_repo': 'example.com', 'description': 'thisisit' }, status='ok') assert_result_count( res, 1, action='configure-sibling', path=path, name='dummy', url='http://example.com', status='ok') # test sibling name conflicts with monkeypatch.context() as m: m.setattr(glpatch, 'GitLabSite', _ExistingProjectGitLab) res = ds.create_sibling_gitlab(path=ds.path, site='dummy', project='here', existing='skip') assert_result_count(res, 1) assert_result_count( res, 0, action='create_sibling_gitlab', message=['already has a configured sibling "%s"', "dummy"], path=path, refds=path, site='dummy', sibling='dummy', status='notneeded', type='dataset' ) # sibling name conflict with existing='error' should yiel error with monkeypatch.context() as m: m.setattr(glpatch, 'GitLabSite', _ExistingProjectGitLab) res = ds.create_sibling_gitlab(path=ds.path, site='dummy', project='here', existing='skip') assert_result_count(res, 1) assert_result_count( res, 0, action='create_sibling_gitlab', message=['already has a configured sibling "%s"', "dummy"], path=path, refds=path, site='dummy', sibling='dummy', status='error', type='dataset' ) # try recreation, the sibling is already configured, same setup, no error with monkeypatch.context() as m: m.setattr(glpatch, 'GitLabSite', _ExistingProjectGitLab) res = ds.create_sibling_gitlab(path=ds.path, site='dummy', project='here', existing='reconfigure') assert_result_count( res, 1, action='configure-sibling', path=path, name='dummy', url='http://example.com', status='ok') # but error when the name differs res 
= ds.create_sibling_gitlab( site='dummy', project='here', name='othername', on_failure='ignore') assert_result_count(res, 1) assert_result_count( res, 1, action='create_sibling_gitlab', path=path, site='dummy', sibling='othername', project='here/project', project_attributes={ 'http_url_to_repo': 'http://example.com', 'ssh_url_to_repo': 'example.com' }, status='error') with monkeypatch.context() as m: m.setattr(glpatch, 'GitLabSite', _CreateFailureGitLab) assert_status( 'error', ds.create_sibling_gitlab(site='dummy', project='here', on_failure='ignore') ) # new sibling, ssh access with monkeypatch.context() as m: m.setattr(glpatch, 'GitLabSite', _NewProjectGitLab) res = ds.create_sibling_gitlab(site='sshsite', project='here', access='ssh') assert_result_count(res, 2) assert_result_count( res, 1, action='create_sibling_gitlab', path=path, type='dataset', site='sshsite', sibling='sshsite', project='here/project', project_attributes={ 'http_url_to_repo': 'http://example.com', 'ssh_url_to_repo': 'example.com', 'description': None }, status='ok') assert_result_count( res, 1, action='configure-sibling', path=path, name='sshsite', url='example.com', status='ok') with monkeypatch.context() as m: m.setattr(glpatch, 'GitLabSite', _ExistingProjectOtherURLGitLab) res = ds.create_sibling_gitlab(site='sshsite', project='here', access='ssh', on_failure='ignore', name='sshsite2') assert_result_count(res, 1) assert_result_count( res, 0, action='create_sibling_gitlab', message=["There is already a project at '%s' on site '%s', " "but no sibling with name '%s' is configured, " "maybe use --existing=reconfigure", "here", "sshsite", "sshsite2"], path=path, refds=path, site='sshsite', sibling='sshsite2', project='here/project', project_attributes={ 'http_url_to_repo': 'http://example2.com', 'ssh_url_to_repo': 'example2.com' }, status='error', type='dataset') # same goes for switching the access type without --reconfigure assert_status( 'error', ds.create_sibling_gitlab(site='sshsite', project='here', access='http', on_failure='ignore') ) datalad-next-1.4.1/datalad_next/patches/tests/test_push.py000066400000000000000000000026151462321624600237060ustar00rootroot00000000000000from datalad_next.tests import ( DEFAULT_REMOTE, assert_result_count, ) from datalad.core.distributed.clone import Clone # run all -core tests, because with _push() we patched a central piece from datalad.core.distributed.tests.test_push import * from datalad_next.datasets import Dataset # we override this specific test, because the original behavior is no longer # value, because our implementation behaves "better" def test_gh1811(tmp_path, no_result_rendering): srcpath = tmp_path / 'src' clonepath = tmp_path / 'clone' # `annex=false` is the only change from the -core implementation # of the test. 
For normal datasets with an annex, the problem underlying # gh1811 is no longer valid, because of more comprehensive analysis of # what needs pushing in this case orig = Dataset(srcpath).create(annex=False) (orig.pathobj / 'some').write_text('some') orig.save() clone = Clone.__call__(source=orig.path, path=clonepath) (clone.pathobj / 'somemore').write_text('somemore') clone.save() clone.repo.call_git(['checkout', 'HEAD~1']) res = clone.push(to=DEFAULT_REMOTE, on_failure='ignore') assert_result_count(res, 1) assert_result_count( res, 1, path=clone.path, type='dataset', action='publish', status='impossible', message='There is no active branch, cannot determine remote ' 'branch', ) datalad-next-1.4.1/datalad_next/patches/tests/test_push_to_export_remote.py000066400000000000000000000171661462321624600273730ustar00rootroot00000000000000from pathlib import Path import pytest from typing import Generator from unittest.mock import ( MagicMock, call, patch, ) from datalad_next.tests import ( assert_in, assert_in_results, eq_, ) from datalad_next.patches.push_to_export_remote import ( _get_export_log_entry, _is_export_remote, _is_valid_treeish, _transfer_data, get_export_records, mod_push, ) module_name = "datalad_next.patches.push_to_export_remote" class MockRepo: def __init__(self, return_special_remotes: bool = True): self.return_special_remotes = return_special_remotes def get_special_remotes(self): if self.return_special_remotes: return { 0: { "name": "no-target", "exporttree": "no" }, 1: { "name": "yes-target", "exporttree": "yes" }, 2: { "name": "some-target", "exporttree": "no" } } else: return {} def call_git(self, *args, **kwargs): return def _call_annex_records_items_(self, *args, **kwargs): yield { "command": f"export {args[0][3]}", "file": "file.txt", "success": True, "input": [], "error-messages": [] } yield { "command": f"export {args[0][3]}", "success": False, "input": [], "error-messages": ["external special remote error: WHATEVER WENT WRONG"], "file": "somefile"} def _call_transfer(target: str, config_result: bool, return_special_remotes: bool = True) -> Generator: ds_mock = MagicMock() ds_mock.config.getbool.return_value = config_result ds_mock.pathobj = Path("/root") return _transfer_data( repo=MockRepo(return_special_remotes), ds=ds_mock, target=target, content=[], data="", force=None, jobs=None, res_kwargs={"path": str(Path("/root"))}, got_path_arg=False) def test_is_export_remote(): # Ensure that None is handled properly assert not _is_export_remote(None) # Ensure that dicts without "exporttree" keyword are handled correctly assert not _is_export_remote({}) # Ensure that "exporttree" is interpreted correctly assert not _is_export_remote({"exporttree": "no"}) assert _is_export_remote({"exporttree": "yes"}) def test_patch_pass_through(): # Ensure that the original _transfer_data is called if the target remote # has exporttree # not set to "yes" with patch("datalad_next.patches.push_to_export_remote.mod_push._push_data") as pd_mock: tuple(_call_transfer("no-target", False)) eq_(pd_mock.call_count, 1) def test_patch_execute_export(): # Ensure that export is called if the target remote has exporttree set to # "yes" with patch(f"{module_name}.mod_push._push_data") as pd_mock, \ patch(f"{module_name}._get_export_log_entry") as gele_mock: gele_mock.return_value = None results = tuple(_call_transfer("yes-target", False)) eq_(pd_mock.call_count, 0) assert_in_results(results, path=str(Path("/root/file.txt")), target="yes-target", action="copy", status="ok") assert_in_results(results, 
path=str(Path("/root/somefile")), target="yes-target", action="copy", status="error") def test_patch_skip_ignore_targets_export(): with patch(f"{module_name}.lgr") as lgr_mock: tuple(_call_transfer("yes-target", True)) assert_in( call.debug( "Target '%s' is set to annex-ignore, exclude from data-export.", 'yes-target' ), lgr_mock.mock_calls ) def test_patch_check_envpatch(): # Ensure that export is called if the target remote has exporttree not set # to "yes" with patch(f"{module_name}.mod_push._push_data") as pd_mock, \ patch(f"{module_name}.needs_specialremote_credential_envpatch") as nsce_mock, \ patch(f"{module_name}.get_specialremote_credential_envpatch") as gsce_mock, \ patch(f"{module_name}._get_export_log_entry") as gele_mock, \ patch(f"{module_name}._get_credentials") as gc_mock: nsce_mock.return_value = True gsce_mock.return_value = {"WEBDAVU": "hans", "WEBDAVP": "abc"} gele_mock.return_value = None gc_mock.return_value = {"secret": "abc", "user": "hans"} results = tuple(_call_transfer("yes-target", False)) eq_(pd_mock.call_count, 0) assert_in_results(results, path=str(Path("/root/file.txt")), target="yes-target", action="copy", status="ok") assert_in_results(results, path=str(Path("/root/somefile")), target="yes-target", action="copy", status="error") def test_no_special_remotes(): # Ensure that the code works if no special remotes exist with patch(f"{module_name}.mod_push._push_data") as pd_mock: tuple(_call_transfer("no-target", False, False)) eq_(pd_mock.call_count, 1) def test_get_export_records_no_exports(): class NoExportRepo: def call_git_items_(self, *args, **kwargs): raise mod_push.CommandError( stderr="fatal: Not a valid object name git-annex:export.log") results = tuple(get_export_records(NoExportRepo())) eq_(results, ()) def test_get_export_records(): class SomeExportsRepo: def call_git_items_(self, *args, **kwargs): return [ f"{i}.3s from{i}:to 0000{i}" for i in (3, 1, 4, 5, 2) ] result = tuple(get_export_records(SomeExportsRepo())) expected = tuple( { "timestamp": float(i + .3), "source-annex-uuid": f"from{i}", "destination-annex-uuid": f"to", "treeish": f"0000{i}" } for i in range(1, 6) ) for remote_info in expected: assert_in(remote_info, result) def test_get_export_log_entry(): # Expect the youngest entry to be returned. 
class ManyExportsRepo: def call_git_items_(self, *args, **kwargs): return [ f"{i}.3s from{i}:to 0000{i}" for i in (3, 4, 1, 5, 2) ] def get_export_records(self): yield from get_export_records(self) result = _get_export_log_entry(ManyExportsRepo(), "to") eq_( result, { "timestamp": 5.3, "source-annex-uuid": "from5", "destination-annex-uuid": "to", "treeish": f"00005" } ) def test_is_valid_treeish(): pytest.skip( "this test is skipped until issue " "https://github.com/datalad/datalad-next/issues/39 is solved") class LogRepo: def call_git_items_(self, *args, **kwargs): return [ f"commit{i} 0000{i}" for i in range(4) ] # Check successful validation export_entry = {"treeish": "00002"} assert _is_valid_treeish(LogRepo(), export_entry) # Check unsuccessful validation export_entry = {"treeish": "10000"} assert not _is_valid_treeish(LogRepo(), export_entry) datalad-next-1.4.1/datalad_next/patches/tests/test_replace_ora_remote.py000066400000000000000000000026341462321624600265570ustar00rootroot00000000000000from __future__ import annotations import pytest from ..replace_ora_remote import ( canonify_url, de_canonify_url, ) @pytest.mark.parametrize("scheme", ['ria+file', 'file']) def test_canonify(scheme, monkeypatch): url_uncanonified = scheme + '://C:/a/b/c' url_canonified = scheme + ':///C:/a/b/c' monkeypatch.setattr( 'datalad_next.patches.replace_ora_remote.on_windows', True, ) assert canonify_url(url_canonified) == url_canonified assert canonify_url(url_uncanonified) == url_canonified monkeypatch.setattr( 'datalad_next.patches.replace_ora_remote.on_windows', False, ) assert canonify_url(url_canonified) == url_canonified assert canonify_url(url_uncanonified) == url_uncanonified @pytest.mark.parametrize("scheme", ['ria+file', 'file']) def test_de_canonify(scheme, monkeypatch): url_uncanonified = scheme + '://C:/a/b/c' url_canonified = scheme + ':///C:/a/b/c' monkeypatch.setattr( 'datalad_next.patches.replace_ora_remote.on_windows', True, ) assert de_canonify_url(url_canonified) == url_uncanonified assert de_canonify_url(url_uncanonified) == url_uncanonified monkeypatch.setattr( 'datalad_next.patches.replace_ora_remote.on_windows', False, ) assert de_canonify_url(url_canonified) == url_canonified assert de_canonify_url(url_uncanonified) == url_uncanonified datalad-next-1.4.1/datalad_next/patches/tests/test_ria.py000066400000000000000000000017211462321624600234770ustar00rootroot00000000000000from datalad.api import clone from datalad_next.tests import skip_if_on_windows # we cannot yet run on windows. see # https://github.com/datalad/datalad-next/issues/654 def test_ria_ssh_roundtrip( sshserver, existing_dataset, no_result_rendering, tmp_path): ds = existing_dataset sshurl, sshlocalpath = sshserver testfile = ds.pathobj / 'testfile1.txt' testfile_content = 'uppytyup!' testfile.write_text(testfile_content) ds.save() # create store ds.create_sibling_ria( f'ria+{sshurl}', name='datastore', new_store_ok=True, ) # push to store ds.push(to='datastore') # clone from store into a new location dsclone = clone( source=f'ria+{sshurl}#{ds.id}', path=tmp_path, ) dsclone.get('.') assert ds.id == dsclone.id assert (ds.pathobj / 'testfile1.txt').read_text() \ == (dsclone.pathobj / 'testfile1.txt').read_text() \ == 'uppytyup!' 
datalad-next-1.4.1/datalad_next/patches/tests/test_run.py000066400000000000000000000015071462321624600235320ustar00rootroot00000000000000import pytest from datalad_next.exceptions import IncompleteResultsError from datalad_next.tests import assert_result_count def test_substitution_config_default(existing_dataset, no_result_rendering): ds = existing_dataset if ds.config.get('datalad.run.substitutions.python') is not None: # we want to test default handling when no config is set pytest.skip( 'Test assumptions conflict with effective configuration') # the {python} placeholder is not explicitly defined, but it has # a default, which run() should discover and use res = ds.run('{python} -c "True"') assert_result_count(res, 1, action='run', status='ok') # make sure we could actually detect breakage with the check above with pytest.raises(IncompleteResultsError): ds.run('{python} -c "breakage"') datalad-next-1.4.1/datalad_next/patches/tests/test_sshremoteio.py000066400000000000000000000065011462321624600252660ustar00rootroot00000000000000from pathlib import PurePosixPath import pytest import subprocess from datalad.distributed.ora_remote import ( RIARemoteError, SSHRemoteIO, ) def test_sshremoteio(sshserver, tmp_path): sshurl, sshlocalpath = sshserver io = SSHRemoteIO(sshurl) # relative path, must be interpreted relative to given base url testfpath = 'dummy.txt' # we run in a tmp dir, test file must not exit assert not io.exists(testfpath) # TODO this content has a trailing newline, because "write_file()" requires # that. Madness. Remove when fixed, must work without. testcontent = 'two\nlines' io.write_file(testfpath, testcontent) # now we have a file assert io.exists(testfpath) # read content matches assert io.read_file(testfpath) == testcontent # create directory, make it interesting and have a space in the name testdirpath = 'something spacy' assert not io.exists(testdirpath) io.mkdir(testdirpath) assert io.exists(testdirpath) # download the testfile to local storage local_testfpath = tmp_path / testfpath # no progress callback io.get(testfpath, local_testfpath, lambda x: x) assert local_testfpath.read_text() == testcontent # upload to subdir testfpath_subdir = f'{testdirpath}/{testfpath}' assert not io.exists(testfpath_subdir) # TODO make absolutification unnecessary from urllib.parse import urlparse io.put( local_testfpath, f'{urlparse(sshurl).path}/{testfpath_subdir}', # no progress callback lambda x: x) assert io.exists(testfpath_subdir) # symlinks testfpath_link = 'dummy_link.txt' assert not io.exists(testfpath_link) io.symlink(testfpath, testfpath_link) assert io.exists(testfpath_link) assert io.read_file(testfpath_link) == testcontent # rename and delete # requires a Pure(Posix)Path object here io.rename(testfpath_subdir, PurePosixPath('deleteme')) assert not io.exists(testfpath_subdir) io.remove(PurePosixPath('deleteme')) assert not io.exists('deleteme') io.remove_dir(PurePosixPath(testdirpath)) assert not io.exists(testdirpath) def test_sshremoteio_7z(sshserver, tmp_path): sshurl, sshlocalpath = sshserver io = SSHRemoteIO(sshurl) # ensure we have a remote 7z if not io.get_7z(): raise pytest.skip("No 7z available on SSH server target") testarchivefpath = 'my.7z' testfpath = 'dummy space.txt' testcontent = 'two\nlines\n' io.write_file(testfpath, testcontent) io.servershell( f'7z a "{testarchivefpath}" "{testfpath}"', check=True, ) # we have an archive assert io.exists(testarchivefpath) # we have the test file in it assert io.in_archive(testarchivefpath, testfpath) # the "in" test means 
something assert not io.in_archive(testarchivefpath, "random_name") # we can pull from the archive extractfpath = tmp_path / 'extracted.txt' io.get_from_archive(testarchivefpath, testfpath, extractfpath, lambda x: x) assert extractfpath.read_text() == testcontent with pytest.raises(RIARemoteError): io.get_from_archive( 'invalid_archive', testfpath, extractfpath, lambda x: x) with pytest.raises(RIARemoteError): io.get_from_archive( testarchivefpath, 'invalid_member', extractfpath, lambda x: x) datalad-next-1.4.1/datalad_next/patches/update.py000066400000000000000000000041211462321624600220020ustar00rootroot00000000000000"""Robustify ``update()`` target detection for adjusted mode datasets The true cause of the problem is not well understood. https://github.com/datalad/datalad/issues/7507 documents that it is not easy to capture the breakage in a test. """ from . import apply_patch # This function is taken from datalad-core@cdc0ceb30ae04265c5369186acf2ab2683a8ec96 # datalad/distribution/update.py # The change has been proposed in https://github.com/datalad/datalad/pull/7522 def _choose_update_target(repo, branch, remote, cfg_remote): """Select a target to update `repo` from. Note: This function is not concerned with _how_ the update is done (e.g., merge, reset, ...). Parameters ---------- repo : Repo instance branch : str The current branch. remote : str The remote which updates are coming from. cfg_remote : str The configured upstream remote. Returns ------- str (the target) or None if a choice wasn't made. """ target = None if cfg_remote and remote == cfg_remote: # Use the configured cfg_remote branch as the target. # # In this scenario, it's tempting to use FETCH_HEAD as the target. For # a merge, that would be the equivalent of 'git pull REMOTE'. But doing # so would be problematic when the GitRepo.fetch() call was passed # all_=True. Given we can't use FETCH_HEAD, it's tempting to use the # branch.*.merge value, but that assumes a value for remote.*.fetch. target = repo.call_git_oneline( ["rev-parse", "--symbolic-full-name", "--abbrev-ref=strict", # THIS IS THE PATCH: prefix @{upstream} with the branch name # of the corresponding branch f"{repo.get_corresponding_branch(branch) or ''}" "@{upstream}"], read_only=True) elif branch: remote_branch = "{}/{}".format(remote, branch) if repo.commit_exists(remote_branch): target = remote_branch return target apply_patch( 'datalad.distribution.update', None, '_choose_update_target', _choose_update_target) datalad-next-1.4.1/datalad_next/repo_utils/000077500000000000000000000000001462321624600207065ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/repo_utils/__init__.py000066400000000000000000000004131462321624600230150ustar00rootroot00000000000000"""Common repository operations .. currentmodule:: datalad_next.repo_utils .. autosummary:: :toctree: generated get_worktree_head has_initialized_annex """ from .annex import ( has_initialized_annex, ) from .worktree import ( get_worktree_head, ) datalad-next-1.4.1/datalad_next/repo_utils/annex.py000066400000000000000000000023301462321624600223670ustar00rootroot00000000000000from pathlib import Path from datalad_next.runners import call_git_success def has_initialized_annex( path: Path, ) -> bool: """Return whether there is an initialized annex for ``path`` The given ``path`` can be any directory, inside or outside a Git repository. ``True`` is returned when the path is found to be within a (locally) initialized git-annex repository. 
When this test returns ``True`` it can be expected that no subsequent call to an annex command fails with `git-annex: First run: git-annex init` for this ``path``. """ # this test is about 3ms in MIH's test system. # datalad-core tests for a git repo and then for .git/annex, this # achieves both in one step (although the test in datalad-core is # likely still faster, because it only inspects the filesystem # for a few key members of a Git repo. In order for that test to # work, though, it has to traverse the filesystem to find a repo root # -- if there even is any). # also ee https://git-annex.branchable.com/forum/Cheapest_test_for_an_initialized_annex__63__/ return call_git_success( ['config', '--local', 'annex.uuid'], cwd=path, capture_output=True, ) datalad-next-1.4.1/datalad_next/repo_utils/tests/000077500000000000000000000000001462321624600220505ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/repo_utils/tests/__init__.py000066400000000000000000000000001462321624600241470ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/repo_utils/tests/test_annex.py000066400000000000000000000011421462321624600245700ustar00rootroot00000000000000from ..annex import has_initialized_annex def test_has_initialized_annex(existing_dataset): # for the root assert has_initialized_annex(existing_dataset.pathobj) # for a subdir assert has_initialized_annex(existing_dataset.pathobj / '.datalad') def test_no_initialized_annex(existing_noannex_dataset, tmp_path): # for the root assert not has_initialized_annex(existing_noannex_dataset.pathobj) # for a subdir assert not has_initialized_annex( existing_noannex_dataset.pathobj / '.datalad') # for a random directory assert not has_initialized_annex(tmp_path) datalad-next-1.4.1/datalad_next/repo_utils/tests/test_head.py000066400000000000000000000021121462321624600243560ustar00rootroot00000000000000import pytest from datalad_next.runners import call_git from .. import get_worktree_head def test_get_worktree_head(tmp_path, existing_dataset): ds = existing_dataset with pytest.raises(ValueError) as e: get_worktree_head(tmp_path / 'IDONOTEXISTONTHEFILESYSTEM') assert str(e.value) == 'path not found' norepo = tmp_path / 'norepo' norepo.mkdir() with pytest.raises(ValueError) as e: get_worktree_head(norepo) assert str(e.value) == f'no Git repository at {norepo!r}' reponohead = tmp_path / 'reponohead' reponohead.mkdir() call_git(['init'], cwd=reponohead) assert (None, None) == get_worktree_head(reponohead) # and actual repo with a commit head, chead = get_worktree_head(ds.pathobj) # we always get a HEAD # we always get fullname symbolic info assert head.startswith('refs/heads/') if chead is not None: # there is a corresponding head, and we get it as the # git-annex 'basis' ref assert head.startswith('refs/heads/adjusted/') assert chead.startswith('refs/basis/') datalad-next-1.4.1/datalad_next/repo_utils/worktree.py000066400000000000000000000035651462321624600231330ustar00rootroot00000000000000from __future__ import annotations from pathlib import Path from datalad_next.exceptions import CapturedException from datalad_next.runners import ( CommandError, call_git_lines, ) def get_worktree_head( path: Path, ) -> tuple[str | None, str | None]: """Returns the symbolic name of the worktree `HEAD` at the given path Returns ------- tuple The first item is the symbolic name of the worktree `HEAD`, or `None` if there is no commit. The second item is the symbolic name of the "corresponding branch" in an adjusted-mode git-annex repository, or `None`. 
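
    A minimal usage sketch (``repo_path`` stands for an assumed local
    repository path; variable names are illustrative only)::

        head, corresponding = get_worktree_head(repo_path)
        if corresponding is not None:
            # adjusted-mode git-annex repository: the corresponding branch
            # is reported as its 'refs/basis/...' ref
            ref_to_track = corresponding
        else:
            # plain repository; `head` may still be None if there is no commit
            ref_to_track = head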
""" try: HEAD = call_git_lines( # we add the pathspec disambiguator to get cleaner error messages # (and we only report the first item below, to take it off again) ['rev-parse', '-q', '--symbolic-full-name', 'HEAD', '--'], cwd=path, # we are doing error message parsing below, fix the language # to avoid making it even more fragile force_c_locale=True, )[0] except (NotADirectoryError, FileNotFoundError) as e: raise ValueError('path not found') from e except CommandError as e: CapturedException(e) if 'fatal: not a git repository' in e.stderr: raise ValueError(f'no Git repository at {path!r}') from e elif 'fatal: bad revision' in e.stderr: return (None, None) else: # no idea reraise raise if HEAD.startswith('refs/heads/adjusted/'): # this is a git-annex adjusted branch. do the comparison against # its basis. it is not meaningful to track the managed branch in # a superdataset return ( HEAD, # replace 'refs/heads' with 'refs/basis' f'refs/basis/{HEAD[11:]}', ) else: return (HEAD, None) datalad-next-1.4.1/datalad_next/runners/000077500000000000000000000000001462321624600202155ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/runners/__init__.py000066400000000000000000000056711462321624600223370ustar00rootroot00000000000000"""Execution of subprocesses This module provides all relevant components for subprocess execution. The main work horse is :func:`~datalad_next.runners.iter_subproc`, a context manager that enables interaction with a subprocess in the form of an iterable for input/output processing. Execution errors are communicated with the :class:`~datalad_next.runners.CommandError` exception. In addition, a few convenience functions are provided to execute Git commands (including git-annex). .. currentmodule:: datalad_next.runners .. autosummary:: :toctree: generated iter_subproc call_git call_git_lines call_git_oneline call_git_success iter_git_subproc CommandError Low-level tooling from datalad-core ----------------------------------- .. deprecated:: 1.4 The functionality described here has been deprecated, and the associated imports from datalad-core are scheduled for removal with version 2.0. Use the implementations listed above instead. Few process execution/management utilities are provided, for generic command execution, and for execution command in the context of a Git repository. .. autosummary:: :toctree: generated GitRunner Runner Additional information on the design of the subprocess execution tooling is available from https://docs.datalad.org/design/threaded_runner.html A standard exception type is used to communicate any process termination with a non-zero exit code .. autosummary:: :toctree: generated CommandError Command output can be processed via "protocol" implementations that are inspired by ``asyncio.SubprocessProtocol``. .. 
autosummary:: :toctree: generated KillOutput NoCapture StdOutCapture StdErrCapture StdOutErrCapture """ from .iter_subproc import ( iter_subproc, ) from .git import ( call_git, call_git_lines, call_git_oneline, call_git_success, iter_git_subproc, ) # runners # TODO REMOVE FOR V2.0 from datalad.runner import ( GitRunner, Runner, ) # TODO REMOVE FOR V2.0 from datalad.runner.nonasyncrunner import ThreadedRunner # protocols # TODO REMOVE FOR V2.0 from datalad.runner import ( KillOutput, NoCapture, Protocol, StdOutCapture, StdErrCapture, StdOutErrCapture, ) # TODO REMOVE FOR V2.0 from datalad.runner.protocol import GeneratorMixIn # TODO REMOVE FOR V2.0 from .protocols import ( NoCaptureGeneratorProtocol, StdOutCaptureGeneratorProtocol, ) # exceptions # The following import supports legacy code that uses `CommandError` from this # module. If you are writing new code, please use `CommandError` from # `datalad.support.exceptions`. We intend to remove this import in the future. from datalad_next.exceptions import CommandError # utilities # TODO REMOVE FOR V2.0 from datalad.runner.nonasyncrunner import ( STDOUT_FILENO, STDERR_FILENO, ) # TODO REMOVE FOR V2.0 from datalad.runner.utils import ( LineSplitter, ) # TODO REMOVE FOR V2.0 from subprocess import ( DEVNULL, ) datalad-next-1.4.1/datalad_next/runners/git.py000066400000000000000000000143541462321624600213610ustar00rootroot00000000000000from __future__ import annotations import os from pathlib import Path import subprocess from datalad_next.exceptions import CapturedException from .iter_subproc import ( CommandError, iter_subproc, ) def _call_git( args: list[str], *, capture_output: bool = False, cwd: Path | None = None, check: bool = False, text: bool | None = None, input: str | bytes | None = None, force_c_locale: bool = False, ) -> subprocess.CompletedProcess: """Wrapper around ``subprocess.run`` for calling Git command ``args`` is a list of argument for the Git command. This list must not contain the Git executable itself. It will be prepended (unconditionally) to the arguments before passing them on. If ``force_c_locale`` is ``True`` the environment of the Git process is altered to ensure output according to the C locale. This is useful when output has to be processed in a locale invariant fashion. All other argument are pass on to ``subprocess.run()`` verbatim. """ env = None if force_c_locale: env = dict(os.environ, LC_ALL='C') # make configurable git_executable = 'git' cmd = [git_executable, *args] try: return subprocess.run( cmd, capture_output=capture_output, cwd=cwd, check=check, text=text, input=input, env=env, ) except subprocess.CalledProcessError as e: # TODO we could support post-error forensics, but some client # might call this knowing that it could fail, and may not # appreciate the slow-down. Add option `expect_fail=False`? # # normalize exception to datalad-wide standard raise CommandError( cmd=cmd, code=e.returncode, stdout=e.stdout, stderr=e.stderr, cwd=cwd, ) from e def call_git( args: list[str], *, cwd: Path | None = None, force_c_locale: bool = False, ) -> None: """Call Git with no output capture, raises on non-zero exit. If ``cwd`` is not None, the function changes the working directory to ``cwd`` before executing the command. If ``force_c_locale`` is ``True`` the environment of the Git process is altered to ensure output according to the C locale. This is useful when output has to be processed in a locale invariant fashion. 
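
    A minimal usage sketch (``repo_path`` is an assumed path to an existing
    Git repository; a ``CommandError`` is raised if Git exits with a
    non-zero status)::

        call_git(['status', '--short'], cwd=repo_path)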
""" _call_git( args, capture_output=False, cwd=cwd, check=True, force_c_locale=force_c_locale, ) def call_git_success( args: list[str], *, cwd: Path | None = None, capture_output: bool = False, ) -> bool: """Call Git and report success or failure of the command ``args`` is a list of arguments for the Git command. This list must not contain the Git executable itself. It will be prepended (unconditionally) to the arguments before passing them on. If ``cwd`` is not None, the function changes the working directory to ``cwd`` before executing the command. If ``capture_output`` is ``True``, process output is captured, but not returned. By default process output is not captured. """ try: _call_git( args, capture_output=capture_output, cwd=cwd, check=True, ) except CommandError as e: CapturedException(e) return False return True def call_git_lines( args: list[str], *, cwd: Path | None = None, input: str | None = None, force_c_locale: bool = False, ) -> list[str]: """Call Git for any (small) number of lines of output ``args`` is a list of arguments for the Git command. This list must not contain the Git executable itself. It will be prepended (unconditionally) to the arguments before passing them on. If ``cwd`` is not None, the function changes the working directory to ``cwd`` before executing the command. If ``input`` is not None, the argument becomes the subprocess’s stdin. This is intended for small-scale inputs. For call that require processing large inputs, ``iter_git_subproc()`` is to be preferred. If ``force_c_locale`` is ``True`` the environment of the Git process is altered to ensure output according to the C locale. This is useful when output has to be processed in a locale invariant fashion. Raises ------ CommandError if the call exits with a non-zero status. """ res = _call_git( args, capture_output=True, cwd=cwd, check=True, text=True, input=input, force_c_locale=force_c_locale, ) return res.stdout.splitlines() def call_git_oneline( args: list[str], *, cwd: Path | None = None, input: str | None = None, force_c_locale: bool = False, ) -> str: """Call Git for a single line of output If ``cwd`` is not None, the function changes the working directory to ``cwd`` before executing the command. If ``input`` is not None, the argument becomes the subprocess’s stdin. This is intended for small-scale inputs. For call that require processing large inputs, ``iter_git_subproc()`` is to be preferred. If ``force_c_locale`` is ``True`` the environment of the Git process is altered to ensure output according to the C locale. This is useful when output has to be processed in a locale invariant fashion. Raises ------ CommandError if the call exits with a non-zero status. AssertionError if there is more than one line of output. """ lines = call_git_lines(args, cwd=cwd, input=input, force_c_locale=force_c_locale) if len(lines) > 1: raise AssertionError( f"Expected Git {args} to return a single line, but got {lines}" ) return lines[0] def iter_git_subproc( args: list[str], **kwargs ): """``iter_subproc()`` wrapper for calling Git commands All argument semantics are identical to those of ``iter_subproc()``, except that ``args`` must not contain the Git binary, but need to be exclusively arguments to it. The respective `git` command/binary is automatically added internally. 
""" cmd = ['git'] cmd.extend(args) return iter_subproc(cmd, **kwargs) datalad-next-1.4.1/datalad_next/runners/iter_subproc.py000066400000000000000000000101071462321624600232660ustar00rootroot00000000000000from __future__ import annotations from pathlib import Path from typing import ( Iterable, List, ) from datalad_next.iterable_subprocess.iterable_subprocess import ( iterable_subprocess, OutputFrom, ) from datalad_next.exceptions import CommandError from datalad_next.consts import COPY_BUFSIZE __all__ = ['iter_subproc'] def iter_subproc( args: List[str], *, input: Iterable[bytes] | None = None, chunk_size: int = COPY_BUFSIZE, cwd: Path | None = None, bufsize: int = -1, ): """Context manager to communicate with a subprocess using iterables This offers a higher level interface to subprocesses than Python's built-in ``subprocess`` module. It allows a subprocess to be naturally placed in a chain of iterables as part of a data processing pipeline. It is also helpful when data won't fit in memory and has to be streamed. This is a convenience wrapper around ``datalad_next.iterable_subprocess``, which itself is a slightly modified (for use on Windows) fork of https://github.com/uktrade/iterable-subprocess, written by Michal Charemza. This function provides a context manager. On entering the context, the subprocess is started, the thread to read from standard error is started, the thread to populate subprocess input is started. When running, the standard input thread iterates over the input, passing chunks to the process, while the standard error thread fetches the error output, and while the main thread iterates over the process's output from client code in the context. On context exit, the main thread closes the process's standard output, waits for the standard input thread to exit, waits for the standard error thread to exit, and wait for the process to exit. If the process exited with a non-zero return code, a ``CommandError`` is raised, containing the process's return code. If the context is exited due to an exception that was raised in the context, the main thread terminates the process via ``Popen.terminate()``, closes the process's standard output, waits for the standard input thread to exit, waits for the standard error thread to exit, waits for the process to exit, and re-raises the exception. Note, if an exception is raised in the context, this exception will bubble up to the main thread. That means no ``CommandError`` will be raised if the subprocess exited with a non-zero return code. To access the return code in case of an exception inside the context, use the ``code``-attribute of the ``as``-variable. This object will always contain the return code of the subprocess. For example, the following code will raise a ``StopIteration``-exception in the context (by repeatedly using :func:`next`). The subprocess will exit with ``2`` due to the illegal option ``-@``, and no ``CommandError`` is raised. The return code is read from the variable ``ls_stdout`` .. code-block:: python >>> from datalad_next.runners import iter_subproc >>> try: ... with iter_subproc(['ls', '-@']) as ls_stdout: ... while True: ... next(ls_stdout) ... except Exception as e: ... print(repr(e), ls_stdout.returncode) StopIteration() 2 Parameters ---------- args: list Sequence of program arguments to be passed to ``subprocess.Popen``. input: iterable, optional If given, chunks of ``bytes`` to be written, iteratively, to the subprocess's ``stdin``. 
chunk_size: int, optional Size of chunks to read from the subprocess's stdout/stderr in bytes. cwd: Path Working directory for the subprocess, passed to ``subprocess.Popen``. bufsize: int, optional Buffer size to use for the subprocess's ``stdin``, ``stdout``, and ``stderr``. See ``subprocess.Popen`` for details. Returns ------- contextmanager """ return iterable_subprocess( args, tuple() if input is None else input, chunk_size=chunk_size, cwd=cwd, bufsize=bufsize, ) datalad-next-1.4.1/datalad_next/runners/protocols.py000066400000000000000000000015671462321624600226240ustar00rootroot00000000000000from . import ( GeneratorMixIn, NoCapture, StdOutCapture, ) # # Below are generic generator protocols that should be provided # upstream # class NoCaptureGeneratorProtocol(NoCapture, GeneratorMixIn): def __init__(self, done_future=None, encoding=None): NoCapture.__init__(self, done_future, encoding) GeneratorMixIn.__init__(self) def timeout(self, fd): raise TimeoutError(f"Runner timeout: process has not terminated yet") class StdOutCaptureGeneratorProtocol(StdOutCapture, GeneratorMixIn): def __init__(self, done_future=None, encoding=None): StdOutCapture.__init__(self, done_future, encoding) GeneratorMixIn.__init__(self) def pipe_data_received(self, fd: int, data: bytes): assert fd == 1 self.send_result(data) def timeout(self, fd): raise TimeoutError(f"Runner timeout {fd}") datalad-next-1.4.1/datalad_next/runners/tests/000077500000000000000000000000001462321624600213575ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/runners/tests/__init__.py000066400000000000000000000000001462321624600234560ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/runners/tests/test_git.py000066400000000000000000000022461462321624600235570ustar00rootroot00000000000000import pytest from ..git import ( CommandError, call_git, call_git_lines, call_git_oneline, call_git_success, iter_git_subproc, ) def test_call_git(): # smoke test call_git(['--version']) # raises properly with pytest.raises(CommandError): call_git(['notacommand']) def test_call_git_success(): assert call_git_success(['--version']) assert not call_git_success(['notacommand']) def test_call_git_lines(): lines = call_git_lines(['--version']) assert len(lines) == 1 assert lines[0].startswith('git version') # check that we can force Git into LC_ALL mode. 
# this test is only meaningful on systems that # run with some other locale call_git_lines(['-h'])[0].casefold().startswith('usage') def test_call_git_oneline(): line = call_git_oneline(['--version']) assert line.startswith('git version') with pytest.raises(AssertionError): # TODO may not yield multiple lines on all systems call_git_oneline(['config', '-l']) def test_iter_git_subproc(): # just a smoke test that 'git' gets prepended with iter_git_subproc(['--version']) as g: assert list(g) datalad-next-1.4.1/datalad_next/runners/tests/test_iter_subproc.py000066400000000000000000000016621462321624600254750ustar00rootroot00000000000000import pytest import sys from ..iter_subproc import ( iter_subproc, CommandError, ) def test_iter_subproc_cwd(tmp_path): test_content = 'some' test_file_name = 'testfile' test_file = tmp_path / test_file_name test_file.write_text(test_content) check_fx = \ "import sys\n" \ "if open('{input}').read() == '{content}':\n" \ " print('okidoki')".format( input=test_file_name, content=test_content, ) # we cannot read the test file without a full path, because # CWD is not `tmp_path` with pytest.raises(CommandError) as e: with iter_subproc([sys.executable, '-c', check_fx]): pass assert 'FileNotFoundError' in e.value # but if we make it change to CWD, the same code runs with iter_subproc([sys.executable, '-c', check_fx], cwd=tmp_path) as proc: out = b''.join(proc) assert b'okidoki' in out datalad-next-1.4.1/datalad_next/shell/000077500000000000000000000000001462321624600176305ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/shell/__init__.py000066400000000000000000000172431462321624600217500ustar00rootroot00000000000000"""A persistent shell connection This module provides a context manager that establishes a connection to a shell and can be used to execute multiple commands in that shell. Shells are usually remote shells, e.g. connected via an ``ssh``-client, but local shells like ``zsh``, ``bash`` or ``PowerShell`` can also be used. The context manager returns an instance of :class:`ShellCommandExecutor` that can be used to execute commands in the shell via the method :meth:`ShellCommandExecutor.__call__`. The method will return an instance of a subclass of :class:`ShellCommandResponseGenerator` that can be used to retrieve the output of the command, the result code of the command, and the stderr-output of the command. Every response generator expects a certain output structure. It is responsible for ensuring that the output structure is generated. To this end every response generator provides a method :meth:`ShellCommandResponseGenerator.get_command_list`. The method :class:`ShellCommandExecutor.__call__` will pass the user-provided command to :meth:`ShellCommandResponseGenerator.get_command_list` and receive a list of final commands that should be executed in the connected shell and that will generate the expected output structure. Instances of :class:`ShellCommandResponseGenerator` have therefore four tasks: 1. Create a final command list that is used to execute the user provided command. This could, for example, execute the command, print an end marker, and print the return code of the command. 2. Parse the output of the command, yield it to the user. 3. Read the return code and provide it to the user. 4. Provide stderr-output to the user. A very versatile example of a response generator is the class :class:`VariableLengthResponseGenerator`. It can be used to execute a command that will result in an output of unknown length, e.g. 
``ls``, and will yield the output of the command to the user. It does that by using a random *end marker* to detect the end of the output and read the trailing return code. This is suitable for almost all commands. If :class:`VariableLengthResponseGenerator` is so versatile, why not just implement its functionality in :class:`ShellCommandExecutor`? There are two major reasons for that: 1. Although the :class:`VariableLengthResponseGenerator` is very versatile, it is not the most efficient implementation for commands that produce large amounts of output. In addition, there is also a minimal risk that the end marker is part of the output of the command, which would trip up the response generator. Putting response generation into a separate class allows to implement specific operations more efficiently and more safely. For example, :class:`DownloadResponseGenerator` implements the download of files. It takes a remote file name as user "command" and creates a final command list that emits the length of the file, a newline, the file content, a return code, and a newline. This allows :class:`DownloadResponseGenerator` to parse the output without relying on an end marker, thus increasing efficiency and safety 2. Factoring out the response generation creates an interface that can be used to support the syntax of different shells and the difference in command names and options in different operating systems. For example, the response generator class :class:`VariableLengthResponseGeneratorPowerShell` supports the invocation of commands with variable length output in a ``PowerShell``. In short, parser generator classes encapsulate details of shell-syntax and operation implementation. That allows support of different shell syntax, and the efficient implementation of specific higher level operations, e.g. ``download``. It also allows users to extend the functionality of :class:`ShellCommandExecutor` by providing their own response generator classes. The module :mod:`datalad_next.shell.response_generators` provides two generally applicable abstract response generator classes: - :class:`VariableLengthResponseGenerator` - :class:`FixedLengthResponseGenerator` The functionality of the former is described above. The latter can be used to execute a command that will result in output of known length, e.g. ``echo -n 012345``. It reads the specified number of bytes and a trailing return code. This is more performant than the variable length response generator (because it does not have to search for the end marker). In addition, it does not rely on the uniqueness of the end marker. It is most useful for operation like ``download``, where the length of the output can be known in advance. As mentioned above, the classes :class:`VariableLengthResponseGenerator` and :class:`FixedLengthResponseGenerator` are abstract. The module :mod:`datalad_next.shell.response_generators` provides the following concrete implementations for them: - :class:`VariableLengthResponseGeneratorPosix` - :class:`VariableLengthResponseGeneratorPowerShell` - :class:`FixedLengthResponseGeneratorPosix` - :class:`FixedLengthResponseGeneratorPowerShell` When :func:`datalad_next.shell.shell` is executed it will use a :class:`VariableLengthResponseClass` to skip the login message of the shell. This is done by executing a *zero command* (a command that will possibly generate some output, and successfully return) in the shell. The zero command is provided by the concrete implementation of class :class:`VariableLengthResponseGenerator`. 
For example, the zero command for POSIX shells is ``test 0 -eq 0``, for PowerShell it is ``Write-Host hello``. Because there is no way for func:`shell` to determine the kind of shell it connects to, the user can provide an alternative response generator class, in the ``zero_command_rg_class``-parameter. Instance of that class will then be used to execute the zero command. Currently, the following two response generator classes are available: - :class:`VariableLengthResponseGeneratorPosix`: works with POSIX-compliant shells, e.g. ``sh`` or ``bash``. This is the default. - :class:`VariableLengthResponseGeneratorPowerShell`: works with PowerShell. Whenever a command is executed via :meth:`ShellCommandExecutor.__call__`, the class identified by ``zero_command_rg_class`` will be used by default to create the final command list and to parse the result. Users can override this on a per-call basis by providing a different response generator class in the ``response_generator``-parameter of :meth:`ShellCommandExecutor.__call__`. Examples -------- See the documentation of :func:`datalad_next.shell.shell` for examples of how to use the shell-function and different response generator classes. API overview ------------ .. currentmodule:: datalad_next.shell .. autosummary:: :toctree: generated :recursive: ShellCommandExecutor ShellCommandResponseGenerator VariableLengthResponseGenerator VariableLengthResponseGeneratorPosix VariableLengthResponseGeneratorPowerShell FixedLengthResponseGenerator FixedLengthResponseGeneratorPosix FixedLengthResponseGeneratorPowerShell DownloadResponseGenerator DownloadResponseGeneratorPosix operations.posix.upload operations.posix.download operations.posix.delete """ __all__ = [ 'shell', 'posix', ] from .shell import ( shell, ShellCommandExecutor, ) from .operations import posix from .operations.posix import ( DownloadResponseGenerator, DownloadResponseGeneratorPosix, ) from .response_generators import ( FixedLengthResponseGenerator, FixedLengthResponseGeneratorPosix, FixedLengthResponseGeneratorPowerShell, ShellCommandResponseGenerator, VariableLengthResponseGenerator, VariableLengthResponseGeneratorPosix, VariableLengthResponseGeneratorPowerShell, ) datalad-next-1.4.1/datalad_next/shell/operations/000077500000000000000000000000001462321624600220135ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/shell/operations/__init__.py000066400000000000000000000000001462321624600241120ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/shell/operations/common.py000066400000000000000000000057761462321624600236740ustar00rootroot00000000000000from __future__ import annotations from abc import ABCMeta from logging import getLogger from datalad_next.runners.iter_subproc import OutputFrom from ..response_generators import ShellCommandResponseGenerator lgr = getLogger('datalad.ext.next.shell.operations') class DownloadResponseGenerator(ShellCommandResponseGenerator, metaclass=ABCMeta): """Response generator interface for efficient download This response generator is used to implement download in a single command call (instead of using one command to determine the length of a file and a subsequent fixed-length command to download the file). It assumes that the shell sends ``\\n``, the content of the file, and ``\\n``. The response generator delegates the creation of the appropriate final command list to its subclasses. 
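
    Schematically, :meth:`send` consumes a stream of the form (a sketch of
    the layout, not a verbatim transcript)::

        <decimal content length>\\n<file content><decimal return code>\\n

    where a negative length signals an error before any content is sent.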
""" def __init__(self, stdout: OutputFrom, ) -> None: super().__init__(stdout, stdout.stderr_deque) self.length = 0 self.read = 0 self.state = 1 self.returncode_chunk = b'' def send(self, _) -> bytes: chunk = b'' # Use a while loop to make arbitrary order of state checks possible. # This allows us to put the most active state at the top of the loop # and increase performance. while True: if self.state == 2: if not chunk: chunk = next(self.stdout_gen) self.read += len(chunk) if self.read >= self.length: self.state = 3 excess = self.read - self.length if excess > 0: chunk, self.returncode_chunk = chunk[:-excess], chunk[-excess:] else: self.returncode_chunk = b'' if chunk: return chunk else: return chunk if self.state == 1: self.length, chunk = self._get_number_and_newline( b'', self.stdout_gen, ) # a negative length indicates an error during download length # determination or download length-communication. if self.length < 0: self.state = 1 self.returncode = 23 raise StopIteration self.state = 2 continue if self.state == 3: self.returncode, trailing = self._get_number_and_newline( self.returncode_chunk, self.stdout_gen, ) if trailing: lgr.warning( 'unexpected output after return code: %s', repr(trailing)) self.state = 4 if self.state == 4: self.state = 1 raise StopIteration raise RuntimeError(f'unknown state: {self.state}') datalad-next-1.4.1/datalad_next/shell/operations/posix.py000066400000000000000000000262741462321624600235420ustar00rootroot00000000000000from __future__ import annotations import logging from pathlib import ( Path, PurePosixPath, ) from queue import Queue from shlex import quote as posix_quote from typing import ( BinaryIO, Callable, ) from .common import DownloadResponseGenerator from ..shell import ( ExecutionResult, ShellCommandExecutor, create_result, ) from datalad_next.consts import COPY_BUFSIZE __all__ = [ 'DownloadResponseGenerator', 'DownloadResponseGeneratorPosix', 'upload', 'download', 'delete', ] lgr = logging.getLogger("datalad.ext.next.shell.operations") class DownloadResponseGeneratorPosix(DownloadResponseGenerator): """A response generator for efficient download commands from Linux systems""" def get_final_command(self, remote_file_name: bytes) -> bytes: """Return a final command list for the download of ``remote_file_name`` The POSIX version for download response generators. This method is usually only called by :meth:`ShellCommandExecutor.__call__`. Parameters ---------- remote_file_name : bytes The name of the file that should be downloaded. If the file name contains special character, e.g. space or ``$``, it must be quoted for a POSIX shell, for example with ``shlex.quote``. Returns ------- bytes The final command that will be executed in the persistent shell in order to start the download in the connected shell. """ command = b""" test -r {remote_file_name} if [ $? -eq 0 ]; then LC_ALL=C ls -dln -- {remote_file_name} | awk '{print $5; exit}' cat {remote_file_name} echo $? else echo -1; fi """.replace(b'{remote_file_name}', remote_file_name) return command def upload( shell: ShellCommandExecutor, local_path: Path, remote_path: PurePosixPath, *, progress_callback: Callable[[int, int], None] | None = None, check: bool = False, ) -> ExecutionResult: """Upload a local file to a named file in the connected shell This function uploads a file to the connected shell ``shell``. It uses ``head`` to limit the number of bytes that the remote shell will read. This ensures that the upload is terminated. 
The requirements for upload are: - The connected shell must be a POSIX shell. - ``head`` must be installed in the remote shell. Parameters ---------- shell : ShellCommandExecutor The shell that should be used to upload the file. local_path : Path The path of the file that should be uploaded. remote_path : PurePosixPath The path of the file on the connected shell that will contain the uploaded content. progress_callback : callable[[int, int], None], optional, default: None If given, the callback is called with the number of bytes that have been sent and the total number of bytes that should be sent. check : bool, optional, default: False If ``True``, raise a :class:`CommandError` if the remote operation does not exit with a ``0`` as return code. Returns ------- ExecutionResult The result of the upload operation. Raises ------- CommandError: If the remote operation does not exit with a ``0`` as return code, and ``check`` is ``True``, a :class:`CommandError` is raised. It will contain the exit code and the last (up to ``chunk_size`` (defined by the ``chunk_size`` keyword argument to :func:`shell`)) bytes of stderr output. """ def signaling_read( file: BinaryIO, size: int, queue: Queue, *, chunk_size: int = COPY_BUFSIZE ): """iterator that reads from a file and signals EOF via a queue This iterator is used to prevent the situation where a file that should be uploaded is completely read and uploaded, but the final EOF-triggering `read()` call has not yet been made. In this case it can happen that the server provides an answer. If the answer is interpreted as indicator for a completed operation, the calling code assumes that it can close all file handles associated with the operation. This can lead to the final `read()` call being performed on a closed file, which would raise a `ValueError`. To prevent this, ``signaling_read`` signals the end of the read-operation, i.e. an EOF was read, by enqueuing ``Ǹone`` into the signaling queue. The caller can wait for that event to ensure that the read operation is really done. """ processed = 0 while True: data = file.read(chunk_size) if data == b"": break yield data processed += len(data) if progress_callback is not None: progress_callback(processed, size) queue.put(None) # The following command line ensures that content that we send to the shell # either goes to the destination file or into `/dev/null`, but not into the # stdin of the shell. In the latter case it would be interpreted as the # next command, and that might be bad, e.g. if the uploaded content was # `rm -rf $HOME`. file_size = local_path.stat().st_size cmd_line = ( f'head -c {file_size} > {posix_quote(str(remote_path))}' f"|| (head -c {file_size} > /dev/null; test 1 == 2)" ) with local_path.open("rb") as local_file: # We use the `signaling_read` iterator to deal with the situation where # the content of a file that should be uploaded is completely read and # uploaded, but the final, EOF-triggering, `read()` call has not yet been # made. In this case it can happen that the server provides an answer, # and we leave the context, thereby closing the file. When the # `iterable_subprocess..input_to`-thread then tries to read # from the file, a `ValueError` would be raised. This exception would # in turn lead to the closing of stdin of the `shell`-subprocess and # render it unusable.`signaling_read` allows us to wait for a completed # read, including the EOF reading. 
signal_queue: Queue = Queue() result = shell( cmd_line, stdin=signaling_read(local_file, file_size, signal_queue) ) signal_queue.get() if check: result.to_exception(cmd_line, 'upload failed') return result def download( shell: ShellCommandExecutor, remote_path: PurePosixPath, local_path: Path, *, progress_callback: Callable[[int, int], None] | None = None, response_generator_class: type[ DownloadResponseGenerator ] = DownloadResponseGeneratorPosix, check: bool = False, ) -> ExecutionResult: """Download a file from the connected shell This method downloads a file from the connected shell. The requirements for download via instances of class :class:`DownloadResponseGeneratorPosix` are: - The connected shell must support `ls -dln`. - The connected shell must support `echo -e`. - The connected shell must support `awk`. - The connected shell must support `cat`. Parameters ---------- shell: ShellCommandExecutor The shell from which a file should be downloaded. remote_path : PurePosixPath The path of the file on the connected shell that should be downloaded. local_path : Path The path of the local file that will contain the downloaded content. progress_callback : callable[[int, int], None], optional, default: None If given, the callback is called with the number of bytes that have been received and the total number of bytes that should be received. response_generator_class : type[DownloadResponseGenerator], optional, default: DownloadResponseGeneratorPosix The response generator that should be used to handle the download output. It must be a subclass of :class:`DownloadResponseGenerator`. The default works if the connected shell runs on a Unix-like system that provides `ls -dln`, `cat`, `echo`, and `awk`, e.g. ``Linux`` or ``OSX``. check : bool, optional, default: False If ``True``, raise a :class:`CommandError` if the remote operation does not exit with a ``0`` as return code. Returns ------- ExecutionResult The result of the download operation. Raises ------- CommandError: If the remote operation does not exit with a ``0`` as return code, and ``check`` is ``True``, a :class:`CommandError` is raised. It will contain the exit code and the last (up to ``chunk_size`` (defined by the ``chunk_size`` keyword argument to :func:`shell`)) bytes of stderr output. """ command = posix_quote(str(remote_path)).encode() response_generator = response_generator_class(shell.stdout) result_generator = shell.start( command, response_generator=response_generator, ) with local_path.open("wb") as local_file: processed = 0 for chunk in result_generator: local_file.write(chunk) processed += len(chunk) if progress_callback is not None: progress_callback(processed, response_generator.length) stderr = b''.join(result_generator.stderr_deque) result_generator.stderr_deque.clear() return create_result( result_generator, command, stdout=b'', stderr=stderr, check=check, error_message='download failed', ) def delete( shell: ShellCommandExecutor, files: list[PurePosixPath], *, force: bool = False, check: bool = False, ) -> ExecutionResult: """Delete files on the connected shell The requirements for delete are: - The connected shell must be a POSIX shell. - ``rm`` must be installed in the remote shell. Parameters ---------- shell: ShellCommandExecutor The shell from which a file should be downloaded. files : list[PurePosixPath] The "paths" of the files that should be deleted. force : bool If ``True``, enforce removal, if possible. 
For example, the command could change the permissions of the files to be deleted to ensure their removal. check : bool, optional, default: False If ``True``, raise a :class:`CommandError` if the remote operation does not exit with a ``0`` as return code. Raises ------- CommandError: If the remote operation does not exit with a ``0`` as return code, and ``check`` is ``True``, a :class:`CommandError` is raised. It will contain the exit code and the last (up to ``chunk_size`` (defined by the ``chunk_size`` keyword argument to :func:`shell`)) bytes of stderr output. """ cmd_line = ( "rm " + ("-f " if force else "") + " ".join( f"{posix_quote(str(f))}" for f in files ) ) result = shell(cmd_line.encode()) if check: result.to_exception(cmd_line, 'delete failed') return result datalad-next-1.4.1/datalad_next/shell/response_generators.py000066400000000000000000000253551462321624600243030ustar00rootroot00000000000000from __future__ import annotations import logging from abc import ( ABCMeta, abstractmethod, ) from collections import deque from collections.abc import Generator from random import randint from datalad_next.itertools import align_pattern from datalad_next.runners.iter_subproc import OutputFrom __all__ = [ 'FixedLengthResponseGenerator', 'FixedLengthResponseGeneratorPosix', 'FixedLengthResponseGeneratorPowerShell', 'ShellCommandResponseGenerator', 'VariableLengthResponseGenerator', 'VariableLengthResponseGeneratorPosix', 'VariableLengthResponseGeneratorPowerShell', ] lgr = logging.getLogger('datalad.ext.next.shell.protocol') class ShellCommandResponseGenerator(Generator, metaclass=ABCMeta): """An abstract class the specifies the minimal functionality of a response generator Subclasses of this class can be used to implement operation-specific, shell-specific or OS-specific details of the command execution and the command output parsing. The return code is available in the ``returncode``-attribute, the stderr-output is available in the ``stderr_deque``-attribute (a ``deque``-instance), of instances of this class. """ def __init__(self, stdout_gen: Generator, stderr_deque: deque) -> None: self.stdout_gen = stdout_gen self.stderr_deque = stderr_deque self.state: str | int = 'output' self.returncode_chunk = b'' self.returncode: int | None = None @staticmethod def _get_number_and_newline(chunk, iterable) -> tuple[int, bytes]: """Help that reads a trailing number and a newline from a chunk Parameters ---------- chunk : bytes An chunk of bytes that should contain the number and the newline. iterable : Iterable An iterable that will be used to extend ``chunk`` if no newline is found in ``chunk``. Returns ------- int A tuple that contains the number that was found in the chunk and the trailing portion of the chunk that was not parsed. """ while b'\n' not in chunk: lgr.log(5, 'completing number chunk') chunk += next(iterable) digits, trailing = chunk.split(b'\n', 1) return int(digits), trailing @abstractmethod def send(self, _) -> bytes: """Deliver the next part of generated output Whenever the response generator is iterated over, this method is called and should deliver the next part of the command output or raise ``StopIteration`` if the command has finished. """ raise NotImplementedError @abstractmethod def get_final_command(self, command: bytes) -> bytes: """Return a final command list that executes ``command`` This method should return a "final" command-pipeline that executes ``command`` and generates the output structure that the response generator expects. 
This structure will typically be parsed in the implementation of :meth:`send`. This method is usually only called by :meth:`ShellCommandExecutor.__call__`. """ raise NotImplementedError def throw(self, typ, val=..., tb=...): # pragma: no cover return super().throw(typ, val, tb) class VariableLengthResponseGenerator(ShellCommandResponseGenerator, metaclass=ABCMeta): """Response generator that handles outputs of unknown length This response generator is used to execute a command that will result in an output of unknown length, e.g. ``ls``. The final command list it creates will execute the command and print a random end-marker and the return code after the output of the command. The :meth:`send`-method of this class uses the end-marker to determine then end of the command output. """ def __init__(self, stdout: OutputFrom, ) -> None: self.end_marker = _create_end_marker() self.stream_marker = self.end_marker + b'\n' self.plain_stdout = stdout super().__init__( align_pattern(stdout, self.stream_marker), stdout.stderr_deque ) def send(self, _) -> bytes: if self.state == 'output': chunk = next(self.stdout_gen) if self.stream_marker in chunk: self.state = 'returncode' chunk, self.returncode_chunk = chunk.split(self.stream_marker) if chunk: return chunk else: return chunk if self.state == 'returncode': self.returncode, trailing = self._get_number_and_newline( self.returncode_chunk, self.plain_stdout, ) if trailing: lgr.warning( 'unexpected output after return code: %s', repr(trailing)) self.state = 'exhausted' if self.state == 'exhausted': self.state = 'output' raise StopIteration() raise RuntimeError(f'unknown state: {self.state}') @property @abstractmethod def zero_command(self) -> bytes: """Return a command that functions as "zero command" """ raise NotImplementedError class VariableLengthResponseGeneratorPosix(VariableLengthResponseGenerator): """A variable length response generator for POSIX shells""" def __init__(self, stdout): """ Parameters ---------- stdout : OutputFrom A generator that yields output from a shell. Usually the object that is returned by :func:`iter_proc`. """ super().__init__(stdout) def get_final_command(self, command: bytes) -> bytes: """Return a command list that executes ``command`` and prints the end-marker The POSIX version for variable length response generators. This method is usually only called by :meth:`ShellCommandExecutor.__call__`. """ return ( command + b' ; x=$?; echo -e -n "' + self.end_marker + b'\\n"; echo $x\n' ) @property def zero_command(self) -> bytes: return b'test 0 -eq 0' class VariableLengthResponseGeneratorPowerShell(VariableLengthResponseGenerator): """A variable length response generator for PowerShell shells""" def __init__(self, stdout): """ Parameters ---------- stdout : OutputFrom A generator that yields output from a shell. Usually the object that is returned by :func:`iter_proc`. """ super().__init__(stdout) def get_final_command(self, command: bytes) -> bytes: """Return a command list that executes ``command`` and prints the end-marker The PowerShell version for variable length response generators. This method is usually only called by :meth:`ShellCommandExecutor.__call__`. """ # TODO: check whether `command` sets `$LASTEXITCODE` and assign that # to `$x`, iff set. 
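        # For illustration (command and marker are example values): for the
        # command b'Get-ChildItem' and the end marker
        # b'----datalad-end-marker-1234...', the returned final command is
        # roughly:
        #
        #   $x=0; try {Get-ChildItem} catch { $x=1 }
        #   Write-Host -NoNewline ----datalad-end-marker-1234...`n$x`n
        #
        # i.e. the command is executed first, then the end marker and a
        # pseudo return code ($x is 0 on success, 1 if an exception was
        # caught) are emitted, each followed by a newline.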
return ( b'$x=0; try {' + command + b'} catch { $x=1 }\n' + b'Write-Host -NoNewline ' + self.end_marker + b'`n$x`n\n' ) @property def zero_command(self) -> bytes: return b'Write-Host hello' class FixedLengthResponseGenerator(ShellCommandResponseGenerator, metaclass=ABCMeta): """Response generator for efficient handling of outputs of known length This response generator is used to execute commands that have an output of known length. The final command list it creates will execute the command and print the return code followed by a newline. The :meth:`send`-method of this response generator will read the specified number of bytes and a trailing return code. This is more performant than scanning the output for an end-marker. """ def __init__(self, stdout: OutputFrom, length: int, ) -> None: """ Parameters ---------- stdout : OutputFrom A generator that yields output from a shell. Usually the object that is returned by :func:`iter_proc`. length : int The length (in bytes) of the output that a command will generate. """ super().__init__(stdout, stdout.stderr_deque) self.length = length self.read = 0 def send(self, _) -> bytes: if self.state == 'output': chunk = next(self.stdout_gen) self.read += len(chunk) if self.read >= self.length: self.state = 'returncode' excess = self.read - self.length if excess > 0: chunk, self.returncode_chunk = chunk[:-excess], chunk[-excess:] else: self.returncode_chunk = b'' if chunk: return chunk else: return chunk if self.state == 'returncode': self.returncode, trailing = self._get_number_and_newline( self.returncode_chunk, self.stdout_gen, ) if trailing: lgr.warning( 'unexpected output after return code: %s', repr(trailing)) self.state = 'exhausted' if self.state == 'exhausted': self.state = 'output' raise StopIteration() raise RuntimeError(f'unknown state: {self.state}') class FixedLengthResponseGeneratorPosix(FixedLengthResponseGenerator): def get_final_command(self, command: bytes) -> bytes: """Return a final command list for a command with a fixed length output The POSIX version for fixed length response generators. This method is usually only called by :meth:`ShellCommandExecutor.__call__`. """ return command + b' ; echo $?\n' class FixedLengthResponseGeneratorPowerShell(FixedLengthResponseGenerator): def get_final_command(self, command: bytes) -> bytes: """Return a final command list for a command with a fixed length output The PowerShell version for fixed length response generators. This method is usually only called by :meth:`ShellCommandExecutor.__call__`. """ return ( b'$x=0; try {' + command + b'} catch { $x=1 }\n' + b'Write-Host -NoNewline $x`n\n' ) def _create_end_marker() -> bytes: """ Create a hopefully unique marker for the shell """ # The following line is marked with `nosec` because `randint` is only # used to diversify markers, not for cryptographic purposes. 
marker_id = f'{randint(1000000000, 9999999999)}'.encode() # nosec fixed_part = b'----datalad-end-marker-' return fixed_part + marker_id + fixed_part[::-1] datalad-next-1.4.1/datalad_next/shell/shell.py000066400000000000000000000662701462321624600213240ustar00rootroot00000000000000""" -- autoclass:: ShellCommandExecutor :special-members: __call__ """ from __future__ import annotations import logging from contextlib import contextmanager from dataclasses import dataclass from queue import Queue from typing import ( Generator, Iterable, ) from .response_generators import ( ShellCommandResponseGenerator, VariableLengthResponseGenerator, VariableLengthResponseGeneratorPosix, ) from datalad_next.consts import COPY_BUFSIZE from datalad_next.exceptions import CommandError from datalad_next.runners.iter_subproc import ( OutputFrom, iter_subproc, ) __all__ = [ 'shell', 'ExecutionResult', 'ShellCommandExecutor', ] lgr = logging.getLogger('datalad.ext.next.shell') @dataclass class ExecutionResult: stdout: bytes stderr: bytes returncode: int | None def to_exception(self, command: bytes | str | list[str], message: str = '' ): if self.returncode != 0: raise CommandError( cmd=command.decode() if isinstance(command, bytes) else str(command), msg=message, code=self.returncode, stdout=self.stdout, stderr=self.stderr, ) @contextmanager def shell(shell_cmd: list[str], *, credential: str | None = None, chunk_size: int = COPY_BUFSIZE, zero_command_rg_class: type[VariableLengthResponseGenerator] = VariableLengthResponseGeneratorPosix, ) -> Generator[ShellCommandExecutor, None, None]: """Context manager that provides an interactive connection to a shell This context manager uses the provided argument ``shell_cmd`` to start a shell-subprocess. Usually the commands provided in ``shell_cmd`` will start a client for a remote shell, e.g. ``ssh``. :func:`shell` returns an instance of :class:`ShellCommandExecutor` in the ``as``-variable. This instance can be used to interact with the shell. That means, it can be used to execute commands in the shell, receive the data that the commands write to their ``stdout`` and ``stderr``, and retrieve the return code of the executed commands. All commands that are executed via the returned instance of :class:`ShellCommandExecutor` are executed in the same shell instance. Parameters ---------- shell_cmd : list[str] The command to execute the shell. It should be a list of strings that is given to :func:`iter_subproc` as `args`-parameter. For example: ``['ssh', '-p', '2222', 'localhost']``. chunk_size : int, optional The size of the chunks that are read from the shell's ``stdout`` and ``stderr``. This also defines the size of stored ``stderr``-content. zero_command_rg_class : type[VariableLengthResponseGenerator], optional, default: 'VariableLengthResponseGeneratorPosix' Shell uses an instance of the specified response generator class to execute the *zero command* ("zero command" is the command used to skip the login messages of the shell). This class will also be used as the default response generator for all further commands executed in the :class:`ShellCommandExecutor`-instances that is returned by :func:`shell`. Currently, the following concrete subclasses of :class:`VariableLengthResponseGenerator` exist: - :class:`VariableLengthResponseGeneratorPosix`: compatible with POSIX-compliant shells, e.g. ``sh`` or ``bash``. - :class:`VariableLengthResponseGeneratorPowerShell`: compatible with PowerShell. 
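        For example, if ``shell_cmd`` starts a PowerShell (e.g.
        ``['powershell', '-Command', '-']`` on a Windows system), pass
        ``zero_command_rg_class=VariableLengthResponseGeneratorPowerShell``.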
Yields ------ :class:`ShellCommandExecutor` Examples -------- **Example 1:** a simple example that invokes a single command, prints its output and its return code:: >>> from datalad_next.shell import shell >>> with shell(['ssh', 'localhost']) as ssh: ... result = ssh(b'ls -l /etc/passwd') ... print(result.stdout) ... print(result.returncode) ... b'-rw-r--r-- 1 root root 2773 Nov 14 10:05 /etc/passwd\\n' 0 **Example 2:** this example invokes two commands, the second of which exits with a non-zero return code. The error output is retrieved from ``result.stderr``, which contains all ``stderr`` data that was written since the last command was executed:: >>> from datalad_next.shell import shell >>> with shell(['ssh', 'localhost']) as ssh: ... print(ssh(b'head -1 /etc/passwd').stdout) ... result = ssh(b'ls /no-such-file') ... print(result.stdout) ... print(result.returncode) ... print(result.stderr) ... b'root:x:0:0:root:/root:/bin/bash\\n' b'' 2 b"Pseudo-terminal will not be allocated because stdin is not a terminal.\\r\\nls: cannot access '/no-such-file': No such file or directory\\n" **Example 3:** demonstrates how to use the ``check``-parameter to raise a :class:`CommandError`-exception if the return code of the command is not zero. This delegates error handling to the calling code and helps to keep the code clean:: >>> from datalad_next.shell import shell >>> with shell(['ssh', 'localhost']) as ssh: ... print(ssh(b'ls /no-such-file', check=True).stdout) ... Traceback (most recent call last): File "", line 2, in File "/home/cristian/Develop/datalad-next/datalad_next/shell/shell.py", line 279, in __call__ return create_result( File "/home/cristian/Develop/datalad-next/datalad_next/shell/shell.py", line 349, in create_result result.to_exception(command, error_message) File "/home/cristian/Develop/datalad-next/datalad_next/shell/shell.py", line 52, in to_exception raise CommandError( datalad.runner.exception.CommandError: CommandError: 'ls /no-such-file' failed with exitcode 2 [err: 'cannot access '/no-such-file': No such file or directory'] **Example 4:** an example for manual checking of the return code:: >>> from datalad_next.shell import shell >>> def file_exists(file_name): ... with shell(['ssh', 'localhost']) as ssh: ... result = ssh(f'ls {file_name}') ... return result.returncode == 0 ... print(file_exists('/etc/passwd')) True >>> print(file_exists('/no-such-file')) False **Example 5:** an example for result content checking:: >>> from datalad_next.shell import shell >>> with shell(['ssh', 'localhost']) as ssh: ... result = ssh(f'grep root /etc/passwd', check=True).stdout ... if len(result.splitlines()) != 1: ... raise ValueError('Expected exactly one line') **Example 6:** how to work with generator-based results. For long running commands a generator-based result fetching can be used. To use generator-based output the command has to be executed with the method :meth:`ShellCommandExecutor.start`. This method returns a generator that provides command output as soon as it is available:: >>> import time >>> from datalad_next.shell import shell >>> with shell(['ssh', 'localhost']) as ssh: ... result_generator = ssh.start(b'c=0; while [ $c -lt 6 ]; do head -2 /etc/passwd; sleep 2; c=$(( $c + 1 )); done') ... for result in result_generator: ... print(time.time(), result) ... 
assert result_generator.returncode == 0 1713358098.82588 b'root:x:0:0:root:/root:/bin/bash\\nsystemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin\\n' 1713358100.8315682 b'root:x:0:0:root:/root:/bin/bash\\nsystemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin\\n' 1713358102.8402972 b'root:x:0:0:root:/root:/bin/bash\\nsystemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin\\n' 1713358104.8490314 b'root:x:0:0:root:/root:/bin/bash\\nsystemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin\\n' 1713358106.8577306 b'root:x:0:0:root:/root:/bin/bash\\nsystemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin\\n' 1713358108.866439 b'root:x:0:0:root:/root:/bin/bash\\nsystemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin\\n' (The exact output of the above example might differ, depending on the length of the first two entries in the ``/etc/passwd``-file.) **Example 7:** how to use the ``stdin``-parameter to feed data to a command that is executed in the persistent shell. The methods :meth:`ShellCommandExecutor.__call__` and :meth:`ShellCommandExecutor.start` allow to pass an iterable in the ``stdin``-argument. The content of this iterable will be sent to ``stdin`` of the executed command:: >>> from datalad_next.shell import shell >>> with shell(['ssh', 'localhost']) as ssh: ... result = ssh(b'head -c 4', stdin=(b'ab', b'c', b'd')) ... print(result.stdout) b'abcd' **Example 8:** how to work with commands that consume ``stdin`` completely. In the previous example, the command ``head -c 4`` was used to consume data from ``stdin``. This command terminates after reading exactly 4 bytes from ``stdin``. If ``cat`` was used instead of ``head -c 4``, the command would have continued to run until its ``stdin`` was closed. The ``stdin`` of the command that is executed in the persistent shell can be close by calling :meth:`ssh.close`. But, in order to be able to call :meth:`ssh.close`, any process that consumes ``stdin`` completely should be executed by calling the :meth:`ssh.start`-method. The reason for this is that :meth:`ssh.start` will return immediately which allows to call the :meth:`ssh.close`-method, as shown in the following code (:meth:`ssh.__call__` would have waited for ``cat`` to terminate, but because :meth:`ssh.close` is not called, ``cat`` would never terminate):: >>> from datalad_next.shell import shell >>> with shell(['ssh', 'localhost']) as ssh: ... result_generator = ssh.start(b'cat', stdin=(b'12', b'34', b'56')) ... ssh.close() ... print(tuple(result_generator)) (b'123456',) Note that the ``ssh``-object cannot be used for further command execution after :meth:`ssh.close` was called. Further command execution requires to spin up a new persistent shell-object. To prevent this overhead, it is advised to limit the number of bytes that a shell-command consumes, either by their number, e.g. by using ``head -c``, or by some other means, e.g. by interpreting the content or using a command like ``timeout``. **Example 9:** upload a file to the persistent shell. The command ``head -c`` can be used to implement the upload a file to a remote shell. The basic idea is to determine the number of bytes that will be uploaded and create a command in the remote shell that will consume exactly this amount of bytes. 
The following code implements this idea (without file-name escaping and error handling):: >>> import os >>> import time >>> from datalad_next.shell import shell >>> def upload(ssh, file_name, remote_file_name): ... size = os.stat(file_name).st_size ... f = open(file_name, 'rb') ... return ssh(f'head -c {size} > {remote_file_name}', stdin=iter(f.read, b'')) ... >>> with shell(['ssh', 'localhost']) as ssh: ... upload(ssh, '/etc/passwd', '/tmp/uploaded-1') Note: in this example, ``f`` is not explicitly closed, it is only closed when the program exits. The reason for this is that the shell uses threads internally for stdin-feeding, and there is no simple way to determine whether the thread that reads ``f`` has yet read an EOF and exited. If ``f`` is closed before the thread exits, and the thread tries to read from ``f``, a ``ValueError`` will be raised (the function :func:`datalad_next.shell.posix.upload` contains a solution for this problem that has slightly more code. For the sake of simplicity, this solution was not implemented in the example above). **Example 10:** download a file. This example uses a fixed-length response generator to download a file from a remote shell. The basic idea is to determine the number of bytes that will be downloaded and create a fixed-length response generator that reads exactly this number of bytes. The fixed length response generator is then passed to :meth:`ssh.start` in the keyword-argument ``response_generator``. This instructs :meth:`ssh.start` to use the response generator to interpret the output of this command invocation (the example code has no file-name escaping or error handling):: >>> from datalad_next.shell import shell >>> from datalad_next.shell.response_generators import FixedLengthResponseGeneratorPosix >>> def download(ssh, remote_file_name, local_file_name): ... size = ssh(f'stat -c %s {remote_file_name}').stdout ... with open(local_file_name, 'wb') as f: ... response_generator = FixedLengthResponseGeneratorPosix(ssh.stdout, int(size)) ... results = ssh.start(f'cat {remote_file_name}', response_generator=response_generator) ... for chunk in results: ... f.write(chunk) ... >>> with shell(['ssh', 'localhost']) as ssh: ... download(ssh, '/etc/passwd', '/tmp/downloaded-1') ... Note that :meth:`ssh.start` is used to start the download. This allows to process downloaded data as soon as it is available. **Example 11:** This example implements interaction with a *Python* interpreter (which can be local or remote). Interaction in the context of this example means, executing a line of python code, returning the result, i.e. the output on ``stdout``, and detect whether an exception was raised or not. To this end a Python-specific variable-length response generator is created by subclassing the generic class :class:`VariableLengthResponseGenerator`. The new response generator implements the method :meth:`get_final_command`, which takes a python statement and returns a ``try``-``except``-block that executes the python statement, prints the end-marker and a return code (which is ``0`` if the statement was executed successfully, and ``1`` if an exception was raised):: >>> from datalad_next.shell import shell >>> from datalad_next.shell.response_generators import VariableLengthResponseGenerator >>> class PythonResponseGenerator(VariableLengthResponseGenerator): ... def get_final_command(self, command: bytes) -> bytes: ... return f'''try: ... {command.decode()} ... print('{self.end_marker.decode()}') ... print(0) ... except: ... 
print('{self.end_marker.decode()}') ... print(1) ... '''.encode() ... @property ... def zero_command(self) -> bytes: ... return b'True' ... >>> with shell(['python', '-u', '-i']) as py: ... print(py('1 + 1')) ... print(py('1 / 0')) ... ExecutionResult(stdout=b'2\\n', stderr=b'>>> ... ... ... ... ... ... ... ... ', returncode=0) ExecutionResult(stdout=b'', stderr=b'... ... ... ... ... ... ... ... Traceback (most recent call last):\\n File "", line 2, in \\nZeroDivisionError: division by zero', returncode=1) The python response generator could be extended to deliver exception information in an extended ``ExecutionResult``. This can be achieved by *pickling* (see the ``pickle``-module) a caught exception to a byte-string, printing this byte-string after the return-code line, and printing another end-marker. The :meth:`send`-method of the response generator must then be overwritten to unpickle the exception information and store it in an extended ``ExecutionResult`` (or raise it in the shell-context, if that is preferred). **Example 12:** this example shows how to use the shell context handler in situations were a ``with``-statement is not suitable, e.g. if a shell object should be used in multiple, independently called functions. In this case the context manager can be manually entered and exited. The following code generates a global ``ShellCommandExecutor``-instance in the ``ssh``-variable:: >>> from datalad_next.shell import shell >>> context_manager = shell(['ssh', 'localhost']) >>> ssh = context_manager.__enter__() >>> print(ssh(b'ls /etc/passwd').stdout) b'/etc/passwd\\n' >>> context_manager.__exit__(None, None, None) False """ def train(queue: Queue): """Use a queue to allow chaining of iterables at different times""" for iterable in iter(queue.get, None): yield from iterable subprocess_inputs: Queue = Queue() with iter_subproc(shell_cmd, input=train(subprocess_inputs), chunk_size=chunk_size, bufsize=0) as shell_output: assert issubclass(zero_command_rg_class, VariableLengthResponseGenerator) cmd_executor = ShellCommandExecutor( subprocess_inputs, shell_output, shell_cmd, zero_command_rg_class ) try: cmd_executor.command_zero(zero_command_rg_class(shell_output)) # Return the now ready connection yield cmd_executor finally: # Ensure that the shell is terminated if an exception is raised by # code that uses `shell`. This is necessary because # the `terminate`-call that is invoked when leaving the # `iterable_subprocess`-context will not end the shell-process. It # will only terminate if its stdin is closed, or if it is killed. subprocess_inputs.put(None) class ShellCommandExecutor: """Execute a command in a shell and return a generator that yields output Instances of :class:`ShellCommandExecutor` allow to execute commands that are provided as byte-strings via its :meth:`__call__`-method. To execute the command and collect its output, return code, and stderr-output, :class:`ShellCommandExecutor` uses instances of subclasses of :class:`ShellCommandResponseGenerator`, e.g. :class:`VariableLengthResponseGeneratorPosix`. 
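    A minimal usage sketch (assuming a local POSIX shell is available; see
    the documentation of :func:`shell` for a comprehensive set of
    examples)::

        with shell(['bash']) as executor:
            result = executor(b'echo hello')
            assert result.stdout == b'hello\\n'
            assert result.returncode == 0

    Instances of this class are normally not created directly, but obtained
    from the :func:`shell` context manager.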
""" def __init__(self, process_inputs: Queue, stdout: OutputFrom, shell_cmd: list[str], default_rg_class: type[VariableLengthResponseGenerator], ) -> None: self.process_inputs = process_inputs self.stdout = stdout self.shell_cmd = shell_cmd self.default_rg_class = default_rg_class def __call__(self, command: bytes | str, *, stdin: Iterable[bytes] | None = None, response_generator: ShellCommandResponseGenerator | None = None, encoding: str = 'utf-8', check: bool = False ) -> ExecutionResult: """Execute a command in the connected shell and return the result This method executes the given command in the connected shell. It assembles all output on stdout, all output on stderr that was written during the execution of the command, and the return code of the command. (The response generator defines when the command output is considered complete. Usually that is done by checking for a random end-of-output marker.) Parameters ---------- command : bytes | str The command to execute. If the command is given as a string, it will be encoded to bytes using the encoding given in `encoding`. stdin : Iterable[byte] | None, optional, default: None If given, the bytes are sent to stdin of the command. Note: If the command reads its ``stdin`` until EOF, you have to use :meth:`self.close` to close ``stdin`` of the command. Otherwise, the command will usually not terminate. Once :meth:`self.close` is called, no more commands can be executed with this :class:`ShellCommandExecutor`-instance. If you want to execute further commands in the same :class:`ShellCommandExecutor`-instance, you must ensure that commands consume a fixed amount of input, for example, by using `head -c | `. response_generator : ShellCommandResponseGenerator | None, optional, default: None If given, the responder generator (usually an instance of a subclass of ``ShellCommandResponseGenerator``), that is used to generate the command line and to parse the output of the command. This can be used to implement, for example, fixed length output processing. encoding : str, optional, default: 'utf-8' The encoding that is used to encode the command if it is given as a string. Note: the encoding should match the decoding the is used in the connected shell. check : bool, optional, default: False If True, a :class:`CommandError`-exception is raised if the return code of the command is not zero. Returns ------- :class:`ExecutionResult` An instance of :class:`ExecutionResult` that contains the ``stdout``-output, the ``stderr``-output, and the return code of the command. Raises ------ :class:`CommandError` If the return code of the command is not zero and `check` is True. """ response_generator = self.start( command, stdin=stdin, response_generator=response_generator, encoding=encoding, ) stdout = b''.join(response_generator) stderr = b''.join(self.stdout.stderr_deque) self.stdout.stderr_deque.clear() return create_result( response_generator, command, stdout, stderr, check=check ) def start(self, command: bytes | str, *, stdin: Iterable[bytes] | None = None, response_generator: ShellCommandResponseGenerator | None = None, encoding: str = 'utf-8', ) -> ShellCommandResponseGenerator: """Execute a command in the connected shell Execute a command in the connected shell and return a generator that provides the content written to stdout of the command. After the generator is exhausted, the return code of the command is available in the ``returncode``-attribute of the generator. Parameters ---------- command : bytes | str The command to execute. 
If the command is given as a string, it will be encoded to bytes using the encoding given in `encoding`. stdin : Iterable[byte] | None, optional, default: None If given, the bytes are sent to stdin of the command. Note: If the command reads its ``stdin`` until EOF, you have to use :meth:`self.close` to close ``stdin`` of the command. Otherwise, the command will usually not terminate. Once :meth:`self.close` is called, no more commands can be executed with this :class:`ShellCommandExecutor`-instance. If you want to execute further commands in the same :class:`ShellCommandExecutor`-instance, you must ensure that commands consume a fixed amount of input, for example, by using `head -c | `. response_generator : ShellCommandResponseGenerator | None, optional, default: None If given, the responder generator (usually an instance of a subclass of ``ShellCommandResponseGenerator``), that is used to generate the command line and to parse the output of the command. This can be used to implement, for example, fixed length output processing. encoding : str, optional, default: 'utf-8' The encoding that is used to encode the command if it is given as a string. Note: the encoding should match the decoding the is used in the connected shell. Returns ------- :class:`ShellCommandResponseGenerator` A generator that yields the output of ``stdout`` of the command. The generator is exhausted when all output is read. After that, the return code of the command execution is available in the ``returncode``-attribute of the generator, and the stderr-output is available in the ``stderr_deque``-attribute of the response generator. If a response generator was passed in via the ``response_generator``-parameter, the same instance will be returned. """ if response_generator is None: response_generator = self.default_rg_class(self.stdout) if isinstance(command, str): command = command.encode(encoding) final_command = response_generator.get_final_command(command) # Store the command list to report it in `CommandError`-exceptions. # This is done here to relieve the response generator classes from # this task. self.process_inputs.put([final_command]) if stdin is not None: self.process_inputs.put(stdin) return response_generator def __repr__(self): return f'{self.__class__.__name__}({self.shell_cmd!r})' def close(self): """stop input to the shell This method closes stdin of the shell. This will in turn terminate the shell, no further commands can be executed in the shell. 
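        For illustration (``executor`` stands for a
        :class:`ShellCommandExecutor` instance, see also example 8 in the
        documentation of :func:`shell`)::

            result_generator = executor.start(b'cat', stdin=(b'12', b'34'))
            executor.close()
            print(tuple(result_generator))

        Closing is required here because ``cat`` only terminates once its
        ``stdin`` is closed.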
""" self.process_inputs.put(None) def command_zero(self, response_generator: VariableLengthResponseGenerator ) -> None: """Execute the zero command This method is only used by :func:`shell` to skip any login messages """ result_zero = self( response_generator.zero_command, response_generator=response_generator, check=True, ) lgr.debug('skipped login message: %s', result_zero.stdout) def create_result(response_generator: ShellCommandResponseGenerator, command: bytes | str | list[str], stdout: bytes, stderr: bytes, error_message: str = '', check: bool = False) -> ExecutionResult: result = ExecutionResult( stdout=stdout, stderr=stderr, returncode=response_generator.returncode ) if check is True: result.to_exception(command, error_message) return result datalad-next-1.4.1/datalad_next/shell/tests/000077500000000000000000000000001462321624600207725ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/shell/tests/__init__.py000066400000000000000000000000001462321624600230710ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/shell/tests/test_response_generators.py000066400000000000000000000073731462321624600265040ustar00rootroot00000000000000from __future__ import annotations from typing import cast import pytest from datalad_next.runners.iter_subproc import OutputFrom from ..response_generators import ( FixedLengthResponseGeneratorPosix, VariableLengthResponseGeneratorPosix, lgr as response_generator_lgr ) from ..operations.posix import DownloadResponseGeneratorPosix from ..operations.common import lgr as posix_common_lgr class DummyOutputFrom(OutputFrom): def __init__(self, iterable: list[bytes] | None = None, ) -> None: super().__init__(None, None) self.iterable = iterable def send(self, _): if self.iterable: return self.iterable.pop(0) raise StopIteration def test_unknown_state_detection_in_variable(): # Check that the response generator detects an unknown internal state. # Since different response_generator = VariableLengthResponseGeneratorPosix( cast(OutputFrom, DummyOutputFrom()) ) response_generator.state = '<-no-such-state->' with pytest.raises(RuntimeError): response_generator.send(b'') def test_unknown_state_detection(): # Check that the response generator detects an unknown internal state. # Since different response_generators = [ VariableLengthResponseGeneratorPosix( cast(OutputFrom, DummyOutputFrom()) ), FixedLengthResponseGeneratorPosix( cast(OutputFrom, DummyOutputFrom()), 100, ), DownloadResponseGeneratorPosix( cast(OutputFrom, DummyOutputFrom()) ), ] for response_generator in response_generators: response_generator.state = '<-no-such-state->' with pytest.raises(RuntimeError): response_generator.send(b'') def test_trailing_content_detection_in_variable(monkeypatch): # Check that the response generator detects a trailing newline. input_list = [] warning_list = [] response_generator = VariableLengthResponseGeneratorPosix( cast(OutputFrom, DummyOutputFrom(input_list)) ) input_list.extend([ b'123\n', response_generator.stream_marker, b'0\nEXTRA-CONTENT\n', ]) monkeypatch.setattr( response_generator_lgr, 'warning', lambda *args, **kwargs: warning_list.append((args, kwargs)) ) assert tuple(response_generator) == (b'123\n',) assert warning_list == [( ('unexpected output after return code: %s', "b'EXTRA-CONTENT\\n'"), {} )] def test_trailing_content_detection_in_fixed(monkeypatch): # Check that the response generator detects a trailing newline. 
input_list = [b'1230\nEXTRA-CONTENT\n'] warning_list = [] response_generator = FixedLengthResponseGeneratorPosix( cast(OutputFrom, DummyOutputFrom(input_list)), 3, ) monkeypatch.setattr( response_generator_lgr, 'warning', lambda *args, **kwargs: warning_list.append((args, kwargs)) ) assert tuple(response_generator) == (b'123',) assert warning_list == [( ('unexpected output after return code: %s', "b'EXTRA-CONTENT\\n'"), {} )] def test_trailing_content_detection_in_download(monkeypatch): # Check that the response generator detects a trailing newline. input_list = [b'3\n1230\nEXTRA-CONTENT\n'] warning_list = [] response_generator = DownloadResponseGeneratorPosix( cast(OutputFrom, DummyOutputFrom(input_list)), ) monkeypatch.setattr( posix_common_lgr, 'warning', lambda *args, **kwargs: warning_list.append((args, kwargs)) ) assert tuple(response_generator) == (b'123',) assert warning_list == [( ('unexpected output after return code: %s', "b'EXTRA-CONTENT\\n'"), {} )] datalad-next-1.4.1/datalad_next/shell/tests/test_shell.py000066400000000000000000000434151462321624600235210ustar00rootroot00000000000000from __future__ import annotations import os import sys from pathlib import PurePosixPath from shlex import quote as posix_quote import pytest from more_itertools import consume import datalad from datalad.tests.utils_pytest import ( on_windows, skip_if, ) from datalad_next.runners import ( CommandError, iter_subproc, ) from datalad_next.url_operations.ssh import ssh_url2openargs from ..response_generators import ( FixedLengthResponseGeneratorPosix, FixedLengthResponseGeneratorPowerShell, VariableLengthResponseGeneratorPosix, VariableLengthResponseGeneratorPowerShell, lgr as response_generator_lgr ) from ..shell import ( ShellCommandExecutor, shell, ) from .. import posix # Some files that are usually found on POSIX systems, i.e. Linux, OSX common_files = [b'/etc/passwd', b'/etc/shells'] # Select "challenging" file names that need proper quoting based on Windows, on # POSIX, and on FAT file systems. if os.getenv('TMPDIR', '').startswith('/crippledfs'): upload_file_name = "up 1" download_file_name = "down 1" files_to_delete = ('f 1', 'f 2', 'f 3') elif on_windows: upload_file_name = "upload 123" download_file_name = "download 123" files_to_delete = ('f 1', 'f 2', 'f 3') else: upload_file_name = "upload $123 \"'" download_file_name = "download $123 \" ' " files_to_delete = ('f $1', 'f \\2 " ', 'f 3 \' ') def _get_cmdline(ssh_url: str): args, parsed = ssh_url2openargs(ssh_url, datalad.cfg) return ['ssh'] + args, parsed.path @pytest.mark.parametrize('file_name', common_files) def test_basic_functionality(sshserver, file_name): ssh_url = sshserver[0] with shell(_get_cmdline(ssh_url)[0]) as ssh: _check_ls_result(ssh, file_name) def test_basic_functionality_multi(sshserver): # Similar to `test_basic_functionality`, but executes all commands on the # same connection. 
ssh_url = sshserver[0] with shell(_get_cmdline(ssh_url)[0]) as ssh_executor: for file_name in common_files: _check_ls_result(ssh_executor, file_name) def _quote_file_name(file_name: bytes, *, encoding: str = 'utf-8') -> bytes: return posix_quote(file_name.decode(encoding)).encode(encoding) def _check_ls_result(ssh_executor, file_name: bytes): quoted_file_name = _quote_file_name(file_name) result = ssh_executor(b'ls ' + quoted_file_name) assert result.stdout == file_name + b'\n' result = ssh_executor('ls ' + quoted_file_name.decode()) assert result.stdout == file_name + b'\n' def test_return_code_functionality(sshserver): ssh_url = sshserver[0] with shell(_get_cmdline(ssh_url)[0]) as ssh: result = ssh(b'bash -c "exit 123"') assert result.returncode == 123 @pytest.mark.parametrize('cmd,expected', [ (b'echo 0123456789', b'0123456789\n'), (b'echo -n 0123456789', b'0123456789') ]) def test_stdout_forwarding(sshserver, cmd, expected): ssh_url = sshserver[0] with shell(_get_cmdline(ssh_url)[0]) as ssh: _check_echo_result(ssh, cmd, expected) def test_stdout_forwarding_multi(sshserver): # Similar to `test_stdout_forwarding`, but executes all commands on the # same connection. ssh_url = sshserver[0] with shell(_get_cmdline(ssh_url)[0]) as ssh: for cmd, expected in [(b'echo 0123456789', b'0123456789\n'), (b'echo -n 0123456789', b'0123456789')]: _check_echo_result(ssh, cmd, expected) def _check_echo_result(ssh: ShellCommandExecutor, cmd: bytes, expected: bytes): result = ssh(cmd) assert result.stdout == expected assert result.returncode == 0 def test_exit_if_unlimited_stdin_is_closed(sshserver): # Check that the test terminates if stdin is closed ssh_url, local_path = sshserver ssh_args, ssh_path = _get_cmdline(ssh_url) test_file_name = 'cat-123' # We know the ssh-server is on a POSIX system ssh_path = (ssh_path + '/' + test_file_name).encode() with \ shell(ssh_args) as ssh_executor, \ iter_subproc([sys.executable, '-c', 'print("0123456789")']) as cat_feed: response_generator = ssh_executor.start(b'cat >' + ssh_path, stdin=cat_feed) ssh_executor.close() consume(response_generator) assert response_generator.returncode == 0 assert (local_path / test_file_name).read_text() == '0123456789\n' def test_continuation_after_stdin_reading(sshserver): # check that the connection continues to work, after stdin was fed into the # remote command. 
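    # Each `dd bs=1 count=10` invocation consumes exactly the ten bytes that
    # the feed process writes, so the shell itself remains usable for the
    # final `ls`-based check.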
ssh_url, local_path = sshserver ssh_args, ssh_path = _get_cmdline(ssh_url) feed_command = [sys.executable, '-c', 'print("0123456789", end="")'] with \ shell(ssh_args) as ssh_executor, \ iter_subproc(feed_command) as dd_feed_1, \ iter_subproc(feed_command) as dd_feed_2: for file_name, feed in (('dd-123', dd_feed_1), ('dd-456', dd_feed_2)): server_path = (ssh_path + '/' + file_name).encode() result = ssh_executor( b'dd bs=1 count=10 of=' + server_path, stdin=feed ) assert result.returncode == 0 assert (local_path / file_name).read_text() == '0123456789' _check_ls_result(ssh_executor, common_files[0]) def test_upload(sshserver, tmp_path): ssh_url, local_path = sshserver ssh_args, ssh_path = _get_cmdline(ssh_url) content = '0123456789' upload_file = tmp_path / upload_file_name upload_file.write_text(content) progress = [] with shell(ssh_args) as ssh_executor: # perform an operation on the remote shell _check_ls_result(ssh_executor, common_files[0]) # upload file to server and verify its content result = posix.upload( ssh_executor, upload_file, PurePosixPath(ssh_path + '/' + upload_file_name), progress_callback=lambda a, b: progress.append((a, b)) ) assert result.returncode == 0 assert (local_path / upload_file_name).read_text() == content assert len(progress) > 0 # perform another operation on the remote shell to ensure functionality _check_ls_result(ssh_executor, common_files[0]) def test_download_ssh(sshserver, tmp_path): ssh_url, local_path = sshserver ssh_args, ssh_path = _get_cmdline(ssh_url) content = '0123456789' server_file = local_path / download_file_name server_file.write_text(content) download_file = tmp_path / download_file_name progress = [] with shell(ssh_args) as ssh_executor: # perform an operation on the remote shell _check_ls_result(ssh_executor, common_files[0]) # download file from server and verify its content result = posix.download( ssh_executor, PurePosixPath(ssh_path + '/' + download_file_name), download_file, progress_callback=lambda a, b: progress.append((a, b)) ) assert result.returncode == 0 assert download_file.read_text() == content assert len(progress) > 0 # perform another operation on the remote shell to ensure functionality _check_ls_result(ssh_executor, common_files[0]) # This test only works on Posix-like systems because it executes a local # bash command. 
@skip_if(on_windows) def test_download_local_bash(tmp_path): content = '0123456789' download_file = tmp_path / download_file_name download_file.write_text(content) result_file = tmp_path / ('result' + download_file_name) progress = [] with shell(['bash']) as bash: _check_ls_result(bash, common_files[0]) # download file from server and verify its content posix.download( bash, PurePosixPath(download_file), result_file, progress_callback=lambda a, b: progress.append((a, b)), ) assert result_file.read_text() == content assert len(progress) > 0 # perform another operation on the remote shell to ensure functionality _check_ls_result(bash, common_files[0]) # This test only works on Posix-like systems because it executes a local bash @skip_if(on_windows) def test_upload_local_bash(tmp_path): content = '0123456789' upload_file = tmp_path / upload_file_name upload_file.write_text(content) result_file = tmp_path / ('result' + upload_file_name) progress = [] with shell(['bash']) as bash: _check_ls_result(bash, common_files[0]) # upload file to server and verify its content posix.upload( bash, upload_file, PurePosixPath(result_file), progress_callback=lambda a, b: progress.append((a, b)), ) assert result_file.read_text() == content assert len(progress) > 0 # perform another operation on the remote shell to ensure functionality _check_ls_result(bash, common_files[0]) # This test only works on Posix-like systems because it executes a local bash @skip_if(on_windows) def test_upload_local_bash_error(tmp_path): content = '0123456789' source_file = tmp_path / 'upload_123' source_file.write_text(content) destination_file = PurePosixPath('/result_123') progress = [] with shell(['bash']) as bash: _check_ls_result(bash, common_files[0]) # upload file to a root on the server result = posix.upload( bash, source_file, destination_file, progress_callback=lambda a, b: progress.append((a, b)), ) assert result.returncode != 0 assert len(progress) > 0 # perform another operation on the remote shell to ensure functionality _check_ls_result(bash, common_files[0]) with pytest.raises(CommandError): posix.upload(bash, source_file, destination_file, check=True) # perform another operation on the remote shell to ensure functionality _check_ls_result(bash, common_files[0]) def test_delete(sshserver): ssh_url, local_path = sshserver ssh_args, ssh_path = _get_cmdline(ssh_url) with shell(ssh_args) as ssh_executor: for file in files_to_delete: (local_path / file).write_text(f'content_{file}') # verify that the remote files exist on the server _check_ls_result(ssh_executor, (ssh_path + '/' + file).encode()) # delete files on server posix.delete( ssh_executor, [PurePosixPath(ssh_path) / file for file in files_to_delete], force=False, ) # verify that the remote files were deleted for file in files_to_delete: assert not (local_path / file).exists() def test_delete_error(sshserver): ssh_url, local_path = sshserver ssh_args, ssh_path = _get_cmdline(ssh_url) with shell(ssh_args) as ssh_executor: _check_ls_result(ssh_executor, common_files[0]) # Try to delete a non-existing file result = posix.delete( ssh_executor, [PurePosixPath('/no-such-file')], force=False, ) assert result.returncode != 0 _check_ls_result(ssh_executor, common_files[0]) with pytest.raises(CommandError): posix.delete( ssh_executor, [PurePosixPath('/no-such-file')], force=False, check=True, ) _check_ls_result(ssh_executor, common_files[0]) # Try to delete an existing file for which we are not authorized # Try to delete a non-existing file result = posix.delete( 
ssh_executor, [PurePosixPath('/etc/passwd')], force=False, ) assert result.returncode != 0 _check_ls_result(ssh_executor, common_files[0]) with pytest.raises(CommandError): posix.delete( ssh_executor, [PurePosixPath('/etc/passwd')], force=False, check=True, ) _check_ls_result(ssh_executor, common_files[0]) def test_returncode(): with pytest.raises(RuntimeError): with shell(['ssh', 'xyz@localhost:22']): pass @skip_if(not on_windows) def test_powershell_basic(): with shell( ['powershell', '-Command', '-'], zero_command_rg_class=VariableLengthResponseGeneratorPowerShell, ) as pwsh: r = pwsh(b'Get-ChildItem') assert r.returncode == 0 r = pwsh(b'Get-ChildItem -Path C:\\') assert r.returncode == 0 pwsh.close() @skip_if(not on_windows) def test_powershell_repr(): with shell( ['powershell', '-Command', '-'], zero_command_rg_class=VariableLengthResponseGeneratorPowerShell, ) as pwsh: assert "ShellCommandExecutor(['powershell', '-Command', '-'])" == repr(pwsh) pwsh.close() @skip_if(on_windows) def test_posix_repr(): with shell(['bash']) as ssh: assert "ShellCommandExecutor(['bash'])" == repr(ssh) ssh.close() # This test only works on Posix-like systems because it executes a local # bash command @skip_if(on_windows) def test_variable_length_reuse(monkeypatch): # This test ensures that the `VariableLengthResponseGenerator` can be # reused, e.g. after it was used for command zero, even if there is # unexpected output after the return code. def mocked_get_final_command(command: bytes) -> bytes: return ( command + b' ; x=$?; echo -e -n "' + response_generator.end_marker + b'\\n"; echo -e "$x\\nsome stuff"\n' ) log_messages = [] def mocked_log(*args): log_messages.append(args[0]) with shell(['bash']) as bash: response_generator = VariableLengthResponseGeneratorPosix(bash.stdout) monkeypatch.setattr( response_generator, 'get_final_command', mocked_get_final_command ) monkeypatch.setattr(response_generator_lgr, 'warning', mocked_log) result = bash( b'echo hello', response_generator=response_generator ) assert result.stdout == b'hello\n' assert all( msg.startswith('unexpected output after return code: ') for msg in log_messages ) bash.close() # This test only works on Posix-like systems because it executes a local bash @skip_if(on_windows) def test_bad_zero_command(monkeypatch): monkeypatch.setattr( VariableLengthResponseGeneratorPosix, 'zero_command', b'tessdsdsdt 0 -eq 1' ) with pytest.raises(RuntimeError): with shell(['bash']): pass # This test only works on Unix-like systems because it executes a local bash. @skip_if(on_windows) def test_fixed_length_response_generator_bash(): with shell(['bash']) as bash: response_generator = FixedLengthResponseGeneratorPosix( bash.stdout, length=10 ) result = bash( b'echo -n 0123456789', response_generator=response_generator ) assert result.stdout == b'0123456789' # Check that only 10 bytes are consumed and any excess bytes show up # in the return code. with pytest.raises(ValueError): result = bash( b'echo -n 0123456789abc', response_generator=response_generator ) # This test mostly works on Windows systems because it executes a local powershell. 
@skip_if(not on_windows) def test_fixed_length_response_generator_powershell(): with shell( ['powershell', '-Command', '-'], zero_command_rg_class=VariableLengthResponseGeneratorPowerShell, ) as powershell: result = powershell(b'Write-Host -NoNewline 0123456789') assert result.returncode == 0 if result.stdout.startswith(b'\nOops,'): pytest.skip(f'skipping test because powershell detected a bug') assert result.stdout == b'0123456789' # Check that only 10 bytes are consumed and any excess bytes show up # in the return code. Because the extra bytes are `'abc'`, the return # code is invalid, which leads to a `ValueError`. response_generator = FixedLengthResponseGeneratorPowerShell( powershell.stdout, length=10 ) with pytest.raises(ValueError): powershell( b'Write-Host -NoNewline 0123456789abc', response_generator=response_generator ) # This test only works on Unix-like systems because it executes a local bash. @skip_if(on_windows) def test_download_length_error(): with shell(['bash']) as bash: response_generator = posix.DownloadResponseGeneratorPosix(bash.stdout) result = bash(b'unknown_file', response_generator=response_generator) assert result.stdout == b'' assert result.returncode == 23 # perform another operation on the remote shell to ensure functionality _check_ls_result(bash, common_files[0]) # This test does not work on Windows systems because it executes a local bash. @skip_if(on_windows) def test_download_error(tmp_path): progress = [] with shell(['bash']) as bash: with pytest.raises(CommandError): posix.download( bash, PurePosixPath('/thisdoesnotexist'), tmp_path / 'downloaded_file', progress_callback=lambda a, b: progress.append((a, b)), check=True, ) _check_ls_result(bash, common_files[0]) result = posix.download( bash, PurePosixPath('/thisdoesnotexist'), tmp_path / 'downloaded_file', progress_callback=lambda a, b: progress.append((a, b)), check=False, ) assert result.returncode not in (0, None) _check_ls_result(bash, common_files[0]) datalad-next-1.4.1/datalad_next/tests/000077500000000000000000000000001462321624600176635ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/tests/__init__.py000066400000000000000000000020711462321624600217740ustar00rootroot00000000000000"""Tooling for test implementations .. currentmodule:: datalad_next.tests .. 
autosummary:: :toctree: generated BasicGitTestRepo DEFAULT_BRANCH DEFAULT_REMOTE assert_in assert_in_results assert_result_count assert_status create_tree eq_ get_deeply_nested_structure ok_ ok_good_symlink ok_broken_symlink run_main skip_if_on_windows skip_if_root skip_wo_symlink_capability swallow_logs skipif_no_network """ # TODO `assert_raises` is excluded above to avoid syntax errors in the docstring # rather than fixing those, we should replace it with `pytest.raises` entirely from .utils import ( BasicGitTestRepo, DEFAULT_BRANCH, DEFAULT_REMOTE, assert_in, assert_in_results, assert_raises, assert_result_count, assert_status, create_tree, eq_, get_deeply_nested_structure, ok_, ok_good_symlink, ok_broken_symlink, run_main, skip_if_on_windows, skip_if_root, skip_wo_symlink_capability, swallow_logs, ) from .marker import ( skipif_no_network, ) datalad-next-1.4.1/datalad_next/tests/fixtures.py000066400000000000000000000604401462321624600221120ustar00rootroot00000000000000"""Collection of fixtures for facilitation test implementations """ import getpass import logging import os from pathlib import Path import subprocess import pytest from tempfile import NamedTemporaryFile from time import sleep from urllib.request import urlopen from datalad_next.datasets import Dataset from datalad_next.runners import ( call_git_lines, call_git_success, ) from datalad_next.utils import patched_env from .utils import ( HTTPPath, WebDAVPath, assert_ssh_access, external_versions, get_git_config_global_fpath, md5sum, rmtree, ) lgr = logging.getLogger('datalad.next.tests.fixtures') @pytest.fixture(autouse=True, scope="session") def reduce_logging(): """Reduce the logging output during test runs DataLad emits a large amount of repetitive INFO log messages that only clutter the test output, and hardly ever help to identify an issue. This fixture modifies the standard logger to throw away all INFO level log messages. With this approach, such messages are still fed to and processes by the logger (in contrast to an apriori level setting). """ dllgr = logging.getLogger('datalad') # leave a trace that this is happening dllgr.info("Test fixture starts suppressing INFO level messages") class NoInfo(logging.Filter): def filter(self, record): # it seems unnecessary to special case progress logs, moreover # not filtering them out will make clone/fetch/push very visible # in the logs with trivial messages #if hasattr(record, 'dlm_progress'): # # this is a progress log message that may trigger something # # a test is looking for # return True if record.levelno == 20: # this is a plain INFO message, ignore return False else: return True noinfo = NoInfo() # we need to attach the filter to any handler to make it effective. # adding to the logger only will not effect any log messages produced # via descendant loggers for hdlr in dllgr.handlers: hdlr.addFilter(noinfo) @pytest.fixture(autouse=False, scope="function") def no_result_rendering(monkeypatch): """Disable datalad command result rendering for all command calls This is achieved by forcefully supplying `result_renderer='disabled'` to any command call via a patch to internal argument normalizer ``get_allargs_as_kwargs()``. 
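    A minimal usage sketch -- the test function name and the ``status()``
    call are purely illustrative::

        def test_something(existing_dataset, no_result_rendering):
            # any DataLad command called here produces no result rendering
            existing_dataset.status()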
""" # we need to patch our patch function, because datalad-core's is no # longer used import datalad_next.patches.interface_utils as dnpiu old_get_allargs_as_kwargs = dnpiu.get_allargs_as_kwargs def no_render_get_allargs_as_kwargs(call, args, kwargs): kwargs, one, two = old_get_allargs_as_kwargs(call, args, kwargs) kwargs['result_renderer'] = 'disabled' return kwargs, one, two with monkeypatch.context() as m: m.setattr(dnpiu, 'get_allargs_as_kwargs', no_render_get_allargs_as_kwargs) yield @pytest.fixture(autouse=False, scope="function") def tmp_keyring(): """Patch plaintext keyring to temporarily use a different storage No credential read or write actions will impact any existing credential store of any configured backend. The patched backend is yielded by the fixture. """ import keyring # the testsetup assumes this to be a plaintext backend. # this backend is based on a filename and maintains no state. # each operation opens, reads/writes, and then closes the file. # hence we can simply point to a different file backend = keyring.get_keyring() prev_fpath = backend.file_path # no tmp keyring yet, make one with NamedTemporaryFile( 'r', prefix='datalad_tmp_keyring_', delete=True) as tf: # we must close, because windows does not like the file being open # already when ConfigManager would open it for reading tf.close() backend.file_path = tf.name with patched_env(DATALAD_TESTS_TMP_KEYRING_PATH=tf.name): yield backend backend.file_path = prev_fpath # the following is taken from datalad/conftest.py # sadly, this is defined inline and cannot be reused directly standard_gitconfig = """\ [user] name = DataLad Tester email = test@example.com [core] askPass = [datalad "log"] exc = 1 [datalad "extensions"] # load the next extension to be able to test patches of annex remotes # that run in subprocesses load = next [annex "security"] # from annex 6.20180626 file:/// and http://localhost access isn't # allowed by default allowed-url-schemes = http https file allowed-http-addresses = all [protocol "file"] # since git 2.38.1 cannot by default use local clones for submodules # https://github.blog/2022-10-18-git-security-vulnerabilities-announced/#cve-2022-39253 allow = always """ + os.environ.get('DATALAD_TESTS_GITCONFIG', '').replace('\\n', os.linesep) @pytest.fixture(autouse=False, scope="function") def datalad_cfg(): """Temporarily alter configuration to use a plain "global" configuration The global configuration manager at `datalad.cfg` is reloaded after adjusting `GIT_CONFIG_GLOBAL` to point to a new temporary `.gitconfig` file. After test execution the file is removed, and the global `ConfigManager` is reloaded once more. Any test using this fixture will be skipped for Git versions earlier than 2.32, because the `GIT_CONFIG_GLOBAL` environment variable used here was only introduced with that version. """ if external_versions['cmd:git'] < "2.32": pytest.skip( "Git configuration redirect via GIT_CONFIG_GLOBAL " "only supported since Git v2.32" ) from datalad import cfg with NamedTemporaryFile( 'w', prefix='datalad_gitcfg_global_', delete=False) as tf: tf.write(standard_gitconfig) # we must close, because windows does not like the file being open # already when ConfigManager would open it for reading tf.close() with patched_env(GIT_CONFIG_GLOBAL=tf.name): cfg.reload(force=True) yield cfg # reload to put the previous config in effect again cfg.reload(force=True) @pytest.fixture(autouse=True, scope="function") def check_gitconfig_global(): """No test must modify a user's global Git config. 
If such modifications are needed, a custom configuration setup limited to the scope of the test requiring it must be arranged. """ globalcfg_fname = get_git_config_global_fpath() if not globalcfg_fname.exists(): lgr.warning( 'No global/user Git config file exists. This is an unexpected ' 'test environment, no config modifications checks can be ' 'performed. Proceeding nevertheless.') # let the test run yield # and exit quietly return # we have a config file. hash it pre and post test. Fail is changed. pre = md5sum(globalcfg_fname) yield post = md5sum(globalcfg_fname) assert pre == post, \ "Global Git config modification detected. Test must be modified to use " \ "a temporary configuration target. Hint: use the `datalad_cfg` fixture." @pytest.fixture(autouse=True, scope="function") def check_plaintext_keyring(): """No test must modify a user's keyring. If such modifications are needed, a custom keyring setup limited to the scope of the test requiring it must be arranged. The ``tmp_keyring`` fixture can be employed in such cases. """ # datalad-core configures keyring to use a plaintext backend # we will look for the underlying file and verify that it is either # no there, or remains unmodified import keyring kr = keyring.get_keyring() if not hasattr(kr, 'file_path'): # this is not the plain text keyring, nothing we can do here # run as-is, but leave a message lgr.warning('Running without the expected plain-text keyring') yield return kr_fpath = Path(kr.file_path) pre = md5sum(kr_fpath) if kr_fpath.exists() else '' yield post = md5sum(kr_fpath) if kr_fpath.exists() else '' assert pre == post, \ "Keyring modification detected. Test must be modified to use " \ "a temporary keyring. Hint: use the `tmp_keyring` fixture." @pytest.fixture(autouse=False, scope="function") def credman(datalad_cfg, tmp_keyring): """Provides a temporary credential manager It comes with a temporary global datalad config and a temporary keyring as well. This manager can be used to deploy or manipulate credentials within the scope of a single test. """ from datalad import cfg from datalad_next.credman import CredentialManager cm = CredentialManager(cfg) yield cm @pytest.fixture(autouse=False, scope="function") def dataset(datalad_cfg, tmp_path_factory): """Provides a ``Dataset`` instance for a not-yet-existing repository The instance points to an existing temporary path, but ``create()`` has not been called on it yet. """ # must use the factory to get a unique path even when a concrete # test also uses `tmp_path` ds = Dataset(tmp_path_factory.mktemp("dataset")) yield ds @pytest.fixture(autouse=False, scope="function") def existing_dataset(dataset): """Provides a ``Dataset`` instance pointing to an existing dataset/repo This fixture uses an instance provided by the ``dataset`` fixture and calls ``create()`` on it, before it yields the ``Dataset`` instance. """ dataset.create(result_renderer='disabled') yield dataset @pytest.fixture(autouse=False, scope="function") def existing_noannex_dataset(dataset): """just like ``existing_dataset``, but created with ``annex=False`` """ dataset.create(annex=False, result_renderer='disabled') yield dataset @pytest.fixture(scope="session") def modified_dataset(tmp_path_factory): """Produces a dataset with various modifications The fixture is module-scope, aiming to be reused by many tests focused on reporting. It does not support any further modification. The fixture will fail, if any such modification is detected. 
``git status`` will report:: ❯ git status -uall On branch dl-test-branch Changes to be committed: (use "git restore --staged ..." to unstage) new file: dir_m/file_a new file: file_a new file: file_am Changes not staged for commit: (use "git add/rm ..." to update what will be committed) (use "git restore ..." to discard changes in working directory) (commit or discard the untracked or modified content in submodules) deleted: dir_d/file_d deleted: dir_m/file_d modified: dir_m/file_m deleted: dir_sm/sm_d modified: dir_sm/sm_m (modified content) modified: dir_sm/sm_mu (modified content, untracked content) modified: dir_sm/sm_n (new commits) modified: dir_sm/sm_nm (new commits, modified content) modified: dir_sm/sm_nmu (new commits, modified content, untracked content) modified: dir_sm/sm_u (untracked content) modified: file_am deleted: file_d modified: file_m Untracked files: (use "git add ..." to include in what will be committed) dir_m/dir_u/file_u dir_m/file_u dir_u/file_u file_u Suffix indicates the ought-to state (multiple possible): a - added c - clean d - deleted n - new commits m - modified u - untracked content Prefix indicated the item type: file - file sm - submodule dir - directory """ ds = Dataset(tmp_path_factory.mktemp("modified_dataset")) ds.create(result_renderer='disabled') ds_dir = ds.pathobj / 'dir_m' ds_dir.mkdir() ds_dir_d = ds.pathobj / 'dir_d' ds_dir_d.mkdir() (ds_dir / 'file_m').touch() (ds.pathobj / 'file_m').touch() dirsm = ds.pathobj / 'dir_sm' dss = {} for smname in ( 'sm_d', 'sm_c', 'sm_n', 'sm_m', 'sm_nm', 'sm_u', 'sm_mu', 'sm_nmu', 'droppedsm_c', ): sds = Dataset(dirsm / smname).create(result_renderer='disabled') # for the plain modification, commit the reference right here if smname in ('sm_m', 'sm_nm', 'sm_mu', 'sm_nmu'): (sds.pathobj / 'file_m').touch() sds.save(to_git=True, result_renderer='disabled') dss[smname] = sds # files in superdataset to be deleted for d in (ds_dir_d, ds_dir, ds.pathobj): (d / 'file_d').touch() dss['.'] = ds dss['dir'] = ds_dir ds.save(to_git=True, result_renderer='disabled') ds.drop(dirsm / 'droppedsm_c', what='datasets', reckless='availability', result_renderer='disabled') # a new commit for smname in ('.', 'sm_n', 'sm_nm', 'sm_nmu'): sds = dss[smname] (sds.pathobj / 'file_c').touch() sds.save(to_git=True, result_renderer='disabled') # modified file for smname in ('.', 'dir', 'sm_m', 'sm_nm', 'sm_mu', 'sm_nmu'): obj = dss[smname] pobj = obj.pathobj if isinstance(obj, Dataset) else obj (pobj / 'file_m').write_text('modify!') # untracked for smname in ('.', 'dir', 'sm_u', 'sm_mu', 'sm_nmu'): obj = dss[smname] pobj = obj.pathobj if isinstance(obj, Dataset) else obj (pobj / 'file_u').touch() (pobj / 'dirempty_u').mkdir() (pobj / 'dir_u').mkdir() (pobj / 'dir_u' / 'file_u').touch() # delete items rmtree(dss['sm_d'].pathobj) rmtree(ds_dir_d) (ds_dir / 'file_d').unlink() (ds.pathobj / 'file_d').unlink() # added items for smname in ('.', 'dir', 'sm_m', 'sm_nm', 'sm_mu', 'sm_nmu'): obj = dss[smname] pobj = obj.pathobj if isinstance(obj, Dataset) else obj (pobj / 'file_a').write_text('added') assert call_git_success(['add', 'file_a'], cwd=pobj) # added and then modified file file_am_obj = ds.pathobj / 'file_am' file_am_obj.write_text('added') assert call_git_success(['add', 'file_am'], cwd=ds.pathobj) file_am_obj.write_text('modified') # record git-status output as a reference status_start = call_git_lines(['status'], cwd=ds.pathobj) yield ds # compare with initial git-status output, if there are any # differences the assumptions of any 
consuming test could be # invalidated. The modifying code must be found and fixed assert status_start == call_git_lines(['status'], cwd=ds.pathobj), \ "Unexpected modification of the testbed" @pytest.fixture(autouse=False, scope="session") def webdav_credential(): """Provides HTTP Basic authentication credential necessary to access the server provided by the ``webdav_server`` fixture.""" yield dict( name='dltest-my&=webdav', user='datalad', secret='secure', type='user_password', ) @pytest.fixture(autouse=False, scope="function") def webdav_server(tmp_path_factory, webdav_credential): """Provides a WebDAV server, serving a temporary directory The fixtures yields an instance of ``WebDAVPath``, providing the following essential attributes: - ``path``: ``Path`` instance of the served temporary directory - ``url``: HTTP URL to access the WebDAV server Server access requires HTTP Basic authentication with the credential provided by the ``webdav_credential`` fixture. """ auth = (webdav_credential['user'], webdav_credential['secret']) # must use the factory to get a unique path even when a concrete # test also uses `tmp_path` path = tmp_path_factory.mktemp("webdav") # this looks a little awkward, but is done to avoid a change in # WebDAVPath. server = WebDAVPath(path, auth=auth) with server as server_url: server.url = server_url yield server @pytest.fixture(autouse=False, scope="session") def http_credential(): """Provides the HTTP Basic authentication credential necessary to access the HTTP server provided by the ``http_server_with_basicauth`` fixture.""" yield dict( name='dltest-my&=http', user='datalad', secret='secure', type='user_password', ) @pytest.fixture(autouse=False, scope="function") def http_server(tmp_path_factory): """Provides an HTTP server, serving a temporary directory The fixtures yields an instance of ``HTTPPath``, providing the following essential attributes: - ``path``: ``Path`` instance of the served temporary directory - ``url``: HTTP URL to access the HTTP server """ # must use the factory to get a unique path even when a concrete # test also uses `tmp_path` path = tmp_path_factory.mktemp("webdav") server = HTTPPath(path, use_ssl=False, auth=None) with server: # overwrite path with Path object for convenience server.path = path yield server @pytest.fixture(autouse=False, scope="function") def http_server_with_basicauth(tmp_path_factory, http_credential): """Like ``http_server`` but requiring authentication via ``http_credential`` """ path = tmp_path_factory.mktemp("webdav") server = HTTPPath( path, use_ssl=False, auth=(http_credential['user'], http_credential['secret']), ) with server: # overwrite path with Path object for convenience server.path = path yield server @pytest.fixture(scope="session") def httpbin_service(): """Return canonical access URLs for the HTTPBIN service This fixture tries to spin up a httpbin Docker container at localhost:8765; if successful, it returns this URL as the 'standard' URL. If the attempt fails, a URL pointing to the canonical instance is returned. For tests that need to have the service served via a specific protocol (https vs http), the corresponding URLs are returned too. They always point to the canonical deployment, as some tests require both protocols simultaneously and a local deployment generally won't have https. 
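    A sketch of how a consuming test may use the returned mapping -- the
    test body is illustrative, but the keys correspond to what is yielded
    below::

        def test_something(httpbin_service):
            url = httpbin_service['standard']
            # e.g. issue requests against f'{url}/get'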
""" hburl = 'http://httpbin.org' hbsurl = 'https://httpbin.org' ciurl = 'http://localhost:8765' if os.name == "posix": try: r = subprocess.run( ["docker", "run", "-d", "-p", "127.0.0.1:8765:80", "kennethreitz/httpbin"], check=True, stdout=subprocess.PIPE, text=True, ) except (OSError, subprocess.CalledProcessError): lgr.warning("Failed to spin up httpbin Docker container:", exc_info=True) container_id = None else: container_id = r.stdout.strip() else: container_id = None try: if container_id is not None: # Wait for container to fully start: for _ in range(25): try: urlopen(ciurl) except Exception: sleep(1) else: break else: raise RuntimeError("httpbin container did not start up in time") yield { "standard": ciurl if container_id is not None else hbsurl, "http": hburl, "https": hbsurl, } finally: if container_id is not None: subprocess.run(["docker", "rm", "-f", container_id], check=True) @pytest.fixture(scope="function") def httpbin(httpbin_service): """Does the same thing as ``httpbin_service``, but skips on function-scope ``httpbin_service`` always returns access URLs for HTTPBIN. However, in some cases it is simply not desirable to run a test. For example, the appveyor workers are more or less constantly unable to access the public service. This fixture is evaluated at function-scope and skips the test whenever any of these undesired conditions is detected. Otherwise it just relays ``httpbin_service``. """ if os.environ.get('DATALAD_TESTS_NONETWORK'): pytest.skip( 'Not running httpbin-based test: NONETWORK flag set' ) if 'APPVEYOR' in os.environ and 'DEPLOY_HTTPBIN_IMAGE' not in os.environ: pytest.skip( "Not running httpbin-based test on appveyor without " "docker-deployed instance -- too unreliable" ) yield httpbin_service @pytest.fixture(autouse=False, scope="function") def datalad_interactive_ui(monkeypatch): """Yields a UI replacement to query for operations and stage responses No output will be written to STDOUT/ERR by this UI. A standard usage pattern is to stage one or more responses, run the to-be-tested code, and verify that the desired user interaction took place:: > datalad_interactive_ui.staged_responses.append('skip') > ... > assert ... datalad_interactive_ui.log """ from datalad_next.uis import ui_switcher from .utils import InteractiveTestUI with monkeypatch.context() as m: m.setattr(ui_switcher, '_ui', InteractiveTestUI()) yield ui_switcher.ui @pytest.fixture(autouse=False, scope="function") def datalad_noninteractive_ui(monkeypatch): """Yields a UI replacement to query for operations No output will be written to STDOUT/ERR by this UI. A standard usage pattern is to run the to-be-tested code, and verify that the desired user messaging took place:: > ... > assert ... 
datalad_interactive_ui.log """ from datalad_next.uis import ui_switcher from .utils import TestUI with monkeypatch.context() as m: m.setattr(ui_switcher, '_ui', TestUI()) yield ui_switcher.ui @pytest.fixture(autouse=False, scope="session") def sshserver_setup(tmp_path_factory): if not os.environ.get('DATALAD_TESTS_SSH'): pytest.skip( "set DATALAD_TESTS_SSH=1 to enable") # query a bunch of recognized configuration environment variables, # fill in the blanks, then check if the given configuration is working, # and post the full configuration again as ENV vars, to be picked up by # the function-scope `datalad_cfg` tmp_root = str(tmp_path_factory.mktemp("sshroot")) host = os.environ.get('DATALAD_TESTS_SERVER_SSH_HOST', 'localhost') port = os.environ.get('DATALAD_TESTS_SERVER_SSH_PORT', '22') login = os.environ.get( 'DATALAD_TESTS_SERVER_SSH_LOGIN', getpass.getuser()) seckey = os.environ.get( 'DATALAD_TESTS_SERVER_SSH_SECKEY', str(Path.home() / '.ssh' / 'id_rsa')) path = os.environ.get('DATALAD_TESTS_SERVER_SSH_PATH', tmp_root) # TODO this should not use `tmp_root` unconditionally, but only if # the SSH_PATH is known to be the same. This might not be if SSH_PATH # is explicitly configured and LOCALPATH is not -- which could be # an indication that there is none localpath = os.environ.get('DATALAD_TESTS_SERVER_LOCALPATH', tmp_root) assert_ssh_access(host, port, login, seckey, path, localpath) info = {} # as far as we can tell, this is good, post effective config in ENV for v, e in ( (host, 'HOST'), # this is SSH_*, because elsewhere we also have other properties # for other services (port, 'SSH_PORT'), (login, 'SSH_LOGIN'), (seckey, 'SSH_SECKEY'), (path, 'SSH_PATH'), (localpath, 'LOCALPATH'), ): os.environ[f"DATALAD_TESTS_SERVER_{e}"] = v info[e] = v yield info @pytest.fixture(autouse=False, scope="function") def sshserver(sshserver_setup, datalad_cfg, monkeypatch): # strip any leading / from the path, we add one, and # only one below sshserver_path = sshserver_setup['SSH_PATH'].lstrip('/') baseurl = f"ssh://{sshserver_setup['SSH_LOGIN']}" \ f"@{sshserver_setup['HOST']}" \ f":{sshserver_setup['SSH_PORT']}" \ f"/{sshserver_path}" with monkeypatch.context() as m: m.setenv("DATALAD_SSH_IDENTITYFILE", sshserver_setup['SSH_SECKEY']) # force reload the config manager, to ensure the private key setting # makes it into the active config datalad_cfg.reload(force=True) yield baseurl, Path(sshserver_setup['LOCALPATH']) datalad-next-1.4.1/datalad_next/tests/marker.py000066400000000000000000000002341462321624600215150ustar00rootroot00000000000000import os import pytest skipif_no_network = pytest.mark.skipif( 'DATALAD_TESTS_NONETWORK' in os.environ, reason='DATALAD_TESTS_NONETWORK is set' ) datalad-next-1.4.1/datalad_next/tests/test_common_cfg.py000066400000000000000000000002161462321624600234020ustar00rootroot00000000000000def test_annexretry(): from datalad.interface.common_cfg import definitions assert definitions['datalad.annex.retry']['default'] == 1 datalad-next-1.4.1/datalad_next/tests/test_register.py000066400000000000000000000001701462321624600231160ustar00rootroot00000000000000 def test_register(): import datalad.api as da assert hasattr(da, 'credentials') assert hasattr(da, 'tree') datalad-next-1.4.1/datalad_next/tests/test_testutils.py000066400000000000000000000007461462321624600233430ustar00rootroot00000000000000from webdav3.client import Client as DAVClient def test_serve_webdav_fixture(webdav_credential, webdav_server): webdav_cfg = dict( webdav_hostname=webdav_server.url, 
webdav_login=webdav_credential['user'], webdav_password=webdav_credential['secret'], webdav_root='/', ) webdav = DAVClient(webdav_cfg) # plain use should work without error webdav.list() (webdav_server.path / 'probe').touch() assert 'probe' in webdav.list() datalad-next-1.4.1/datalad_next/tests/utils.py000066400000000000000000000223341462321624600214010ustar00rootroot00000000000000from __future__ import annotations from collections import deque import logging from functools import wraps import os from os import environ from pathlib import Path import pytest import subprocess from typing import Any from datalad.support.external_versions import external_versions # all datalad-core test utils needed for datalad-next from datalad.tests.utils_pytest import ( DEFAULT_BRANCH, DEFAULT_REMOTE, HTTPPath, # TODO REMOVE FOR V2.0 SkipTest, assert_in, assert_in_results, assert_raises, assert_result_count, assert_status, attr, chpwd, eq_, get_deeply_nested_structure, ok_, ok_broken_symlink, ok_exists, ok_good_symlink, # TODO REMOVE FOR V2.0 rmtree, skip_if_on_windows, skip_if_root, skip_wo_symlink_capability, swallow_logs, ) from datalad.tests.test_utils_testrepos import BasicGitTestRepo from datalad.cli.tests.test_main import run_main from datalad.ui.progressbars import SilentProgressBar from datalad.utils import ( create_tree, md5sum, ) from datalad_next.shell import shell from datalad_next.utils import ( CredentialManager, ) lgr = logging.getLogger("datalad.tests.utils") class WebDAVPath(object): """Serve the content of a path via an HTTP WebDAV URL. This class is a context manager. Parameters ---------- path : str Directory with content to serve. auth : tuple Username, password Returns ------- str WebDAV server URL """ def __init__(self, path, auth=None): self.path = Path(path) self.auth = auth self.server = None self.server_thread = None def __enter__(self): try: from cheroot import wsgi from wsgidav.wsgidav_app import WsgiDAVApp except ImportError as e: pytest.skip(f'No WSGI capabilities. Install cheroot and/or wsgidav ({e!r})') if self.auth: auth = {self.auth[0]: {'password': self.auth[1]}} else: auth = True self.path.mkdir(exist_ok=True, parents=True) config = { "host": "127.0.0.1", # random fixed number, maybe make truly random and deal with taken ports "port": 43612, "provider_mapping": {"/": str(self.path)}, "simple_dc": {"user_mapping": {'*': auth}}, # disable DAV server logging to avoid clustering the test output # unless logger runs at least on debug log level "logging": {"enable": lgr.isEnabledFor(10)}, } app = WsgiDAVApp(config) self.server = wsgi.Server( bind_addr=(config["host"], config["port"]), wsgi_app=app, ) lgr.debug('Starting WebDAV server') from threading import Thread self.server.prepare() self.server_thread = Thread(target=self.server.serve) self.server_thread.start() lgr.debug('WebDAV started') return f'http://{config["host"]}:{config["port"]}' def __exit__(self, *args): lgr.debug('Stopping WebDAV server') # graceful exit self.server.stop() lgr.debug('WebDAV server stopped, waiting for server thread to exit') # wait for shutdown self.server_thread.join() lgr.debug('WebDAV server thread exited') def with_credential(name, **kwargs): """A decorator to temporarily deploy a credential. If a credential of the given name already exists, it will be temporarily replaced by the given one. In pretty much all cases, the keyword arguments need to include `secret`. Otherwise any properties are supported. 
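    A usage sketch -- the credential name and properties are illustrative
    only::

        @with_credential('dummy-cred', secret='irrelevant', type='token')
        def test_something():
            ...

    Note that this helper is deprecated in favor of the ``credman`` fixture,
    as the warning issued below indicates.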
""" import warnings warnings.warn( "datalad_next.tests.utils.with_credential was replaced by a `credman` " "fixture in datalad_next 1.0, and will be removed in " "datalad_next 2.0.", DeprecationWarning, ) def with_credential_decorator(fx): @wraps(fx) def _with_credential(*dargs, **dkw): credman = CredentialManager() # retrieve anything that might be conflicting with the # to-be-deployed credential prev_cred = credman.get(name) try: credman.set(name, **kwargs) fx(*dargs, **dkw) finally: if prev_cred: credman.set(name, **prev_cred) else: credman.remove(name) return _with_credential return with_credential_decorator def get_git_config_global_fpath() -> Path: """Returns the file path for the "global" (aka user) Git config scope""" fpath_str = environ.get('GIT_CONFIG_GLOBAL') if fpath_str is None: # this can happen with the datalad-core setup for Git < 2.32. # we provide a fallback, but we do not aim to support all # possible variants fpath = Path(environ['HOME']) / '.gitconfig' else: fpath = Path(fpath_str) return fpath class TestUI: """Drop-in replacement for the DataLad UI to protocol any calls""" is_interactive = False """Flag is inspected in generic UI code to check for the possibility of interactivity""" def __init__(self): # this member will hold a log of all calls made to the UI wrapper self._log = [] def __str__(self) -> str: return "{cls}(\n{log}\n)".format( cls=self.__class__.__name__, log='\n'.join(f' {i[0]}: {i[1]}' for i in self.log), ) @property def log(self) -> list: """Call log Returns ------- list Each item is a two-tuple with the label of the UI operation as first element, and the verbatim parameters/values of the respective operation. """ return self._log @property def operation_sequence(self) -> list: """Same as ``.log()``, but limited to just the operation labels""" return [i[0] for i in self.log] def question(self, *args, **kwargs) -> Any: """Raise ``RuntimeError`` when a question needs to be asked""" self._log.append(('question', (args, kwargs))) raise RuntimeError( 'non-interactive test UI was asked for a response to a question') def message(self, msg, cr='\n'): """Post a message""" self._log.append(('message', (msg, cr))) def get_progressbar(self, *args, **kwargs): """Return a progress handler""" self._log.append(('get_progressbar', (args, kwargs))) return SilentProgressBar(*args, **kwargs) class InteractiveTestUI(TestUI): """DataLad UI that can also provide staged user responses""" is_interactive = True def __init__(self): super().__init__() # queue to provision responses self._responses = deque() def __str__(self) -> str: return "{cls}(\n{log}\n (unused responses: {res})\n)".format( cls=self.__class__.__name__, log='\n'.join(f' {i[0]}: {i[1]}' for i in self.log), res=list(self.staged_responses), ) @property def staged_responses(self) -> deque: """``deque`` for staging user responses and retrieving them""" return self._responses def question(self, *args, **kwargs) -> Any: """Report a provisioned response when a question is asked""" self._log.append(('question', (args, kwargs))) if not self.staged_responses: raise AssertionError( "UI response requested, but no further are provisioned") response = self.staged_responses.popleft() self._log.append(('response', response)) return response def assert_ssh_access( host: str, port: str, login: str, seckey: str, path: str, localpath: str | None = None, ): """Test for a working SSH connection and sufficient permissions to write This helper establishes a connection to an SSH server identified by ``host`` and ``port``, using a given 
SSH private key file (``seckey``) for authentication. Once logged in successfully, it tries to create a directory and a file at POSIX ``path`` on the server. If ``localpath`` is given, it must be a representation of that server-side path on the local file system (e.g., a bindmount), and the helper tests whether the created content is also reflected in this directory. """ # we can only handle openssh ssh_bin = os.environ.get('DATALAD_SSH_EXECUTABLE', 'ssh') ssh_bash_call = [ ssh_bin, '-i', seckey, '-p', port, f'{login}@{host}', 'bash', ] # now try if this is a viable configuration # verify execute and write permissions (implicitly also POSIX path handling from more_itertools import consume with shell(ssh_bash_call) as ssh: # each call here will crash with CommandError, if it does not work ssh(f'mkdir -p {path}', check=True) ssh(f'touch {path}/datalad-tests-probe', check=True) if localpath: # we should see the probe file locally assert (Path(localpath) / 'datalad-tests-probe').exists() # cleanup ssh(f'rm {path}/datalad-tests-probe', check=True) if localpath: assert not (Path(localpath) / 'datalad-tests-probe').exists() datalad-next-1.4.1/datalad_next/types/000077500000000000000000000000001462321624600176655ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/types/__init__.py000066400000000000000000000004131462321624600217740ustar00rootroot00000000000000"""Custom types and dataclasses .. currentmodule:: datalad_next.types .. autosummary:: :toctree: generated AnnexKey ArchivistLocator ArchiveType """ from .annexkey import AnnexKey from .archivist import ArchivistLocator from .enums import ArchiveType datalad-next-1.4.1/datalad_next/types/annexkey.py000066400000000000000000000027631462321624600220710ustar00rootroot00000000000000"""git-annex key representation""" from __future__ import annotations from dataclasses import dataclass import re # BACKEND[-sNNNN][-mNNNN][-SNNNN-CNNNN]--NAME _annexkey_regex = re.compile( '(?P[A-Z0-9]+)' '(|-s(?P[0-9]+))' '(|-m(?P[0-9]+))' '(|-S(?P[0-9]+)-C(?P[0-9]+))' '--(?P.*)$' ) @dataclass(frozen=True) class AnnexKey: """Representation of a git-annex key https://git-annex.branchable.com/internals/key_format/ """ name: str backend: str size: int | None = None mtime: int | None = None chunksize: int | None = None chunknumber: int | None = None @classmethod def from_str(cls, key: str): """Return an ``AnnexKey`` instance from a key string""" key_matched = _annexkey_regex.match(key) if not key_matched: # without a sensible key there is no hope raise ValueError(f'{key!r} is not a valid git-annex key') return cls(**key_matched.groupdict()) def __str__(self) -> str: return '{backend}{size}{mtime}{chunk}--{name}'.format( name=self.name, backend=self.backend, size=f'-s{self.size}' if self.size else '', mtime=f'-m{self.mtime}' if self.mtime else '', # if me reading of the spec is correct, the two chunk props # can only occur together chunk=f'-S{self.chunksize}-C{self.chunknumber}' if self.chunknumber else '', ) datalad-next-1.4.1/datalad_next/types/archivist.py000066400000000000000000000124511462321624600222360ustar00rootroot00000000000000"""``dl+archive:`` archive member locator""" from __future__ import annotations from dataclasses import dataclass from pathlib import PurePosixPath import re from .annexkey import AnnexKey from .enums import ArchiveType # be relatively permissive _recognized_urls = re.compile(r'^dl\+archive:(?P.*)#(?P.*)') # each archive member is identified by a (relative) path inside # the archive. 
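# The fragment matched below looks, for example, like
# 'path=dir/file.csv&size=234&atype=tar' (see the ArchivistLocator docstring
# further down for the full locator syntax).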
_archive_member_props = re.compile( # a path may contain any char but '&' # TODO check that something in the machinery ensures proper # quoting 'path=(?P[^&]+)' # size info (in bytes) is optional '(&size=(?P[0-9]+)|)' # archive type label is optional '(&atype=(?P[a-z0-9]+)|)' ) @dataclass class ArchivistLocator: """Representation of a ``dl+archive:`` archive member locator These locators are used by the ``datalad-archives`` and ``archivist`` git-annex special remotes. They identify a member of a archive that is itself identified by an annex key. Each member is annotated with its size (in bytes). Optionally, the file format type of the archive can be annotated too. Syntax of ``dl+archives:`` locators ----------------------------------- The locators the following minimal form:: dl+archive:#path= where ```` is a regular git-annex key of an archive file, and ```` is a POSIX-style relative path pointing to a member within the archive. Two optional, additional attributes ``size`` and ``atype`` are recognized (only ``size`` is also understood by the ``datalad-archives`` special remote). ``size`` declares the size of the (extracted) archive member in bytes:: dl+archive:#path=&size= ``atype`` declares the type of the containing archive using a label. Currently recognized labels are ``tar`` (a TAR archive, compressed or not), and ``zip`` (a ZIP archive). See :class:`~datalad_next.types.enums.ArchiveType` for all recognized labels. If no type information is given, :func:`ArchivistLocator.from_str()` will try to determine the archive type from the archive key (via ``*E``-type git-annex backends, such as DataLad's default ``MD5E``). The order in the fragment part of the URL (after ``#``) is significant. ``path`` must come first, followed by ``size`` or ``atype``. If both ``size`` and ``atype`` are present, ``size`` must be declared first. A complete example of a URL is:: dl+archive:MD5-s389--e9f624eb778e6f945771c543b6e9c7b2#path=dir/file.csv&size=234&atype=tar """ akey: AnnexKey member: PurePosixPath size: int | None = None # datalad-archives did not have the type info, we want to be # able to handle those too, make optional atype: ArchiveType | None = None def __str__(self) -> str: return 'dl+archive:{akey}#path={member}&size={size}{atype}'.format( akey=self.akey, # TODO needs quoting? member=self.member, size=self.size, atype=f'&atype={self.atype.value}' if self.atype else '', ) @classmethod def from_str(cls, url: str): """Return ``ArchivistLocator`` from ``str`` form""" url_match = _recognized_urls.match(url) if not url_match: raise ValueError('Unrecognized dl+archives locator syntax') url_matched = url_match.groupdict() # convert to desired type akey = AnnexKey.from_str(url_matched['key']) # archive member properties props_match = _archive_member_props.match(url_matched['props']) if not props_match: # without at least a 'path' there is nothing we can do here raise ValueError( 'dl+archives locator contains invalid archive member ' f'specification: {url_matched["props"]!r}') props_matched = props_match.groupdict() amember_path = PurePosixPath(props_matched['path']) if amember_path.is_absolute(): raise ValueError( 'dl+archives locator contains absolute archive member path') if '..' 
in amember_path.parts: raise ValueError( 'dl+archives locator archive member path contains ".."') # size is optional, regex ensure that it is an int size = props_matched.get('size') if size is not None: size = int(size) # archive type, could be None atype = props_matched.get('atype') if atype is not None: # if given, most be known type try: atype = getattr(ArchiveType, atype) except AttributeError as e: raise ValueError( 'dl+archives locator archive type unrecognized') from e if atype is None and akey.backend.endswith('E'): # try by key name extension suf = PurePosixPath(akey.name).suffixes if '.zip' == suf[-1]: atype = ArchiveType.zip elif '.tar' in suf: atype = ArchiveType.tar elif '.tgz' in suf: atype = ArchiveType.tar return cls( akey=akey, member=amember_path, size=size, atype=atype, ) datalad-next-1.4.1/datalad_next/types/enums.py000066400000000000000000000004121462321624600213630ustar00rootroot00000000000000"""Type ENUMs""" from enum import Enum class ArchiveType(Enum): """Enumeration of archive types Each one should have an associated ArchiveOperations handler. """ # TODO the values could also be handler classes ... tar = 'tar' zip = 'zip' datalad-next-1.4.1/datalad_next/types/tests/000077500000000000000000000000001462321624600210275ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/types/tests/__init__.py000066400000000000000000000000001462321624600231260ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/types/tests/test_annexkey.py000066400000000000000000000016111462321624600242610ustar00rootroot00000000000000import pytest from ..annexkey import AnnexKey def test_annexkey(): for key in ( 'MD5E-s792207360--985e680a221e47db05063a12b91d7d89.tar', 'SHA256E-s31390--f50d7ac4c6b9031379986bc362fcefb65f1e52621ce1708d537e740fefc59cc0.mp3', 'URL-s1899248--http&c%%ai.stanford.edu%,126nilsson%MLBOOK.pdf/URL-s1899248--http&c%%ai.stanford.edu%,126nilsson%MLBOOK.pdf', ): # round-tripping for any key must give same outcome assert key == str(AnnexKey.from_str(key)) # check that it can be used as a dict-key, i.e. 
is hashable key = AnnexKey.from_str('MD5-s9--985e680a221e47db05063a12b91d7d89') d = {key: 'some'} def test_annexkey_errors(): for wrong in ( 'MD5E-985e680a221e47db05063a12b91d7d89.tar', 'MD5E-SUPRISE--985e680a221e47db05063a12b91d7d89.tar', ): with pytest.raises(ValueError): AnnexKey.from_str(wrong) datalad-next-1.4.1/datalad_next/types/tests/test_archivist.py000066400000000000000000000037441462321624600244440ustar00rootroot00000000000000import pytest from ..annexkey import AnnexKey from ..archivist import ArchivistLocator from ..enums import ArchiveType some_key = 'MD5-s389--e9f624eb778e6f945771c543b6e9c7b2' def test_archivistlocator(): test_locator = \ f'dl+archive:{some_key}#path=dir/file.csv&size=234&atype=tar' al = ArchivistLocator.from_str(test_locator) assert al.akey == AnnexKey.from_str(some_key) assert al.atype == ArchiveType.tar # round trip assert str(al) == test_locator # type determination from key assert ArchivistLocator.from_str( 'dl+archive:MD5E-s1--e9f624eb778e6f945771c543b6e9c7b2.tar#path=f.txt' ).atype == ArchiveType.tar assert ArchivistLocator.from_str( 'dl+archive:MD5E-s1--e9f624eb778e6f945771c543b6e9c7b2.tgz#path=f.txt' ).atype == ArchiveType.tar assert ArchivistLocator.from_str( 'dl+archive:MD5E-s1--e9f624eb778e6f945771c543b6e9c7b2.tar.gz#path=f.txt' ).atype == ArchiveType.tar assert ArchivistLocator.from_str( 'dl+archive:MD5E-s1--e9f624eb778e6f945771c543b6e9c7b2.zip#path=f.txt' ).atype == ArchiveType.zip def test_archivistlocatori_errors(): for wrong in ( # no chance without prefix 'bogus', # not just a prefix or some bogus properties 'dl+archive:', 'dl+archive:#', 'dl+archive:keything', 'dl+archive:#props', 'dl+archive:keything#props', # a real key is required, but not sufficient f'dl+archive:{some_key}#props', # we require a member path, the rest is optional f'dl+archive:{some_key}#size=123', f'dl+archive:{some_key}#size=123&atype=tar', # must be a proper POSIX path, relative, no .. f'dl+archive:{some_key}#path=/dummy', f'dl+archive:{some_key}#path=../dd', # cannot work with unknown archive type f'dl+archive:{some_key}#path=good&size=123&atype=eh!', ): with pytest.raises(ValueError): ArchivistLocator.from_str(wrong) datalad-next-1.4.1/datalad_next/uis/000077500000000000000000000000001462321624600173215ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/uis/__init__.py000066400000000000000000000005211462321624600214300ustar00rootroot00000000000000"""UI abstractions for user communication .. currentmodule:: datalad_next.uis .. autosummary:: :toctree: generated ansi_colors ui_switcher """ # make more obvious that this is a frontend that behaves # differently depending on many conditions from datalad.ui import ui as ui_switcher from datalad.support import ansi_colors datalad-next-1.4.1/datalad_next/url_operations/000077500000000000000000000000001462321624600215665ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/url_operations/__init__.py000066400000000000000000000026131462321624600237010ustar00rootroot00000000000000"""Handlers for operations on various URL types and protocols Available handlers: .. currentmodule:: datalad_next.url_operations .. 
autosummary:: :toctree: generated UrlOperations AnyUrlOperations FileUrlOperations HttpUrlOperations SshUrlOperations UrlOperationsRemoteError UrlOperationsResourceUnknown UrlOperationsInteractionError UrlOperationsAuthenticationError UrlOperationsAuthorizationError """ from .base import ( # base class for 3rd-party extensions and implementations UrlOperations, ) # operation support for different protocols from .any import AnyUrlOperations from .file import FileUrlOperations from .http import HttpUrlOperations from .ssh import SshUrlOperations # primary exceptions types from .exceptions import ( UrlOperationsRemoteError, UrlOperationsResourceUnknown, UrlOperationsInteractionError, UrlOperationsAuthenticationError, UrlOperationsAuthorizationError, ) # TODO REMOVE EVERYTHING BELOW FOR V2.0 import logging from functools import partial from more_itertools import side_effect from pathlib import Path from typing import ( Any, Dict, Generator, Iterable, ) import datalad from datalad_next.config import ConfigManager from datalad_next.utils import log_progress from datalad_next.utils.multihash import ( MultiHash, NoOpHash, ) lgr = logging.getLogger('datalad.ext.next.url_operations') datalad-next-1.4.1/datalad_next/url_operations/any.py000066400000000000000000000176141462321624600227400ustar00rootroot00000000000000"""Meta URL handler with automatic scheme-based switching of implementations""" # allow for |-type UnionType declarations from __future__ import annotations from importlib import import_module import json import logging from pathlib import Path import re from typing import Dict from datalad_next.config import ConfigManager from datalad_next.exceptions import CapturedException from .base import UrlOperations lgr = logging.getLogger('datalad.ext.next.url_operations.any') __all__ = ['AnyUrlOperations'] # define handlers for each supported URL pattern # FORMAT OF HANDLER REGISTRY (dict) # - key: regex match expression to be apply on a URL (to test whether a # particular handler should be used for a given URL) # - value: tuple (handler specification, see below) # FORMAT OF HANDLER SPECIFICATION # - tuple of min-length 1 # - item1: str, handler class to import # e.g., package.module.class # - item2: dict, optional, kwargs to pass to the handler constructor # TODO support proper entrypoint mechanism # It is best to only record handlers here for which there is no alternative, # because the best handler is determined based on this information # and only this handler is imported. If that fails, there is no fallback. # Handlers that may or may not work under given conditions should only # be added via external logic after they have been found to be "working" # on a given installation. _url_handlers = { 'http': ('datalad_next.url_operations.http.HttpUrlOperations',), 'file': ('datalad_next.url_operations.file.FileUrlOperations',), 'ssh': ('datalad_next.url_operations.ssh.SshUrlOperations',), } class AnyUrlOperations(UrlOperations): """Handler for operations on any supported URLs The methods inspect a given URL and call the corresponding methods for the `UrlOperations` implementation that matches the URL best. The "best match" is the match expression of a registered URL handler that yields the longest match against the given URL. Parameter identity and semantics are unchanged with respect to the underlying implementations. See their documentation for details. 
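    A minimal usage sketch -- the URL is illustrative only::

        ops = AnyUrlOperations()
        if ops.is_supported_url('https://example.com/file.dat'):
            props = ops.stat('https://example.com/file.dat')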
An instance retains and reuses URL scheme handler instances for subsequent operations, such that held connections or cached credentials can be reused efficiently. """ def __init__(self, cfg: ConfigManager | None = None): """ Parameters ---------- cfg: ConfigManager, optional A config manager instance that is consulted for any configuration filesystem configuration individual handlers may support. """ super().__init__(cfg=cfg) self._load_handler_registery() # cache of already used handlers self._url_handler_cache = dict() def _load_handler_registery(self): # update with handlers from config # https://github.com/datalad/datalad-next/issues/217 cfgh = {} for citem in self.cfg.keys(): if not citem.startswith('datalad.url-handler.'): # none of our business continue # the match expression is right in the item key # (all but the first two and the last segment) citem_l = citem.split('.') match = '.'.join(citem_l[2:-1]) prop = citem_l[-1] value = self.cfg[citem] if prop != 'class': try: value = json.loads(value) except Exception as e: ce = CapturedException(e) lgr.debug( 'Ignoring invalid URL handler configuration ' 'for %r(%s): %r [%s]', match, prop, value, ce) continue hc = cfgh.get(match, {}) hc[prop] = value cfgh[match] = hc # merge all specs uh = dict(_url_handlers) for match, spec in cfgh.items(): try: uh[match] = (spec['class'], spec['kwargs']) except KeyError: try: uh[match] = (spec['class'],) except Exception as e: CapturedException(e) lgr.debug( 'Ignoring incomplete URL handler specification ' 'for %r: %r', match, spec) self._url_handlers = {} for k, v in uh.items(): # compile matches to finalize lgr.log(8, 'Add URL handler for %r: %r', k, v) self._url_handlers[re.compile(k)] = v def _get_handler(self, url: str) -> UrlOperations: # match URL against all registered handlers and get the one with the # longest (AKA best) match longest_match = 0 best_match = None for r in self._url_handlers: m = r.match(url) if not m: continue length = m.end() - m.start() if length > longest_match: best_match = r longest_match = length if best_match is None: raise ValueError(f'unsupported URL {url!r}') # reuse existing handler, they might already have an idea on # authentication etc. 
from a previously processed URL if best_match in self._url_handler_cache: return self._url_handler_cache[best_match] # we need to import the handler try: handler_spec = self._url_handlers[best_match] # split the import declaration into units toimport = handler_spec[0].split('.') # the handler class is the last unit cls = toimport[-1] # the rest is the module mod = '.'.join(toimport[:-1]) module = import_module(mod, package='datalad') handler_cls = getattr(module, cls) handler_kwargs = handler_spec[1] if len(handler_spec) > 1 else {} url_handler = handler_cls(cfg=self.cfg, **handler_kwargs) except Exception as e: raise ValueError( 'Cannot create URL handler instance for ' f'{best_match.pattern!r} from {self._url_handlers[best_match]}') from e self._url_handler_cache[best_match] = url_handler return url_handler def is_supported_url(self, url) -> bool: return any(r.match(url) for r in self._url_handlers) def stat(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Call `*UrlOperations.stat()` for the respective URL scheme""" return self._get_handler(url).stat( url, credential=credential, timeout=timeout) def download(self, from_url: str, to_path: Path | None, *, credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Call `*UrlOperations.download()` for the respective URL scheme""" return self._get_handler(from_url).download( from_url, to_path, credential=credential, hash=hash, timeout=timeout) def upload(self, from_path: Path | None, to_url: str, *, credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Call `*UrlOperations.upload()` for the respective URL scheme""" return self._get_handler(to_url).upload( from_path, to_url, credential=credential, hash=hash, timeout=timeout) def delete(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Call `*UrlOperations.delete()` for the respective URL scheme""" return self._get_handler(url).delete( url, credential=credential, timeout=timeout) datalad-next-1.4.1/datalad_next/url_operations/base.py000066400000000000000000000350631462321624600230610ustar00rootroot00000000000000"""API base class""" from __future__ import annotations import logging from functools import partial from more_itertools import side_effect from pathlib import Path from typing import ( Any, Dict, Generator, Iterable, ) import datalad from datalad_next.config import ConfigManager from datalad_next.utils import log_progress from datalad_next.utils.multihash import ( MultiHash, NoOpHash, ) lgr = logging.getLogger('datalad.ext.next.url_operations') class UrlOperations: """Abstraction for operations on URLs Support for specific URL schemes can be implemented via sub-classes. Such classes must comply with the following conditions: - Any configuration look-up must be performed with the `self.cfg` property, which is guaranteed to be a `ConfigManager` instance. - When downloads are to be supported, implement the `download()` method and comply with the behavior described in its documentation. This class provides a range of helper methods to aid computation of hashes and progress reporting. """ def __init__(self, *, cfg: ConfigManager | None = None): """ Parameters ---------- cfg: ConfigManager, optional A config manager instance that implementations will consult for any configuration items they may support. 
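          If no manager is given, the ``cfg`` property below lazily falls
          back to the session-wide ``datalad.cfg`` instance, i.e. (sketch)::

              UrlOperations().cfg is datalad.cfg  # True when no cfg was given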
""" self._cfg = cfg @property def cfg(self) -> ConfigManager: if self._cfg is None: self._cfg = datalad.cfg return self._cfg def stat(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Gather information on a URL target, without downloading it Returns ------- dict A mapping of property names to values of the URL target. The particular composition of properties depends on the specific URL. A standard property is 'content-length', indicating the size of a download. Raises ------ UrlOperationsRemoteError This exception is raised on any access-related error on the remote side, with a summary of the underlying issues as its message. It may carry a status code (e.g. HTTP status code) as its ``status_code`` property. Any underlying exception must be linked via the `__cause__` property (e.g. `raise UrlOperationsRemoteError(...) from ...`). UrlOperationsInteractionError UrlOperationsAuthenticationError UrlOperationsAuthorizationError UrlOperationsResourceUnknown Implementations that can distinguish several remote error types beyond indication a general ``UrlOperationsRemoteError``: ``UrlOperationsInteractionError`` general issues in communicating with the remote side; ``UrlOperationsAuthenticationError`` for errors related to (failed) authentication at the remote; ``UrlOperationsAuthorizationError`` for (lack of) authorizating to access a particular resource of perform a particular operation; ``UrlOperationsResourceUnknown`` if the target of an operation does not exist. TimeoutError If `timeout` is given and the operation does not complete within the number of seconds that a specified by `timeout`. """ raise NotImplementedError def download(self, from_url: str, to_path: Path | None, *, credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Download from a URL to a local file or stream to stdout Parameters ---------- from_url: str Valid URL with any scheme supported by a particular implementation. to_path: Path or None A local platform-native path or `None`. If `None` the downloaded data is written to `stdout`, otherwise it is written to a file at the given path. The path is assumed to not exist. Any existing file will be overwritten. credential: str, optional The name of a dedicated credential to be used for authentication in order to perform the download. Particular implementations may or may not require or support authentication. They also may or may not support automatic credential lookup. hash: list(algorithm_names), optional If given, must be a list of hash algorithm names supported by the `hashlib` module. A corresponding hash will be computed simultaenous to the download (without reading the data twice), and included in the return value. timeout: float, optional If given, specifies a timeout in seconds. If the operation is not completed within this time, it will raise a `TimeoutError`-exception. If timeout is None, the operation will never timeout. Returns ------- dict A mapping of property names to values for the completed download. If `hash` algorithm names are provided, a corresponding key for each algorithm is included in this mapping, with the hexdigest of the corresponding checksum as the value. Raises ------ UrlOperationsRemoteError This exception is raised on any deletion-related error on the remote side, with a summary of the underlying issues as its message. It may carry a status code (e.g. HTTP status code) as its ``status_code`` property. 
Any underlying exception must be linked via the `__cause__` property (e.g. `raise UrlOperationsRemoteError(...) from ...`). UrlOperationsInteractionError UrlOperationsAuthenticationError UrlOperationsAuthorizationError UrlOperationsResourceUnknown Implementations that can distinguish several remote error types beyond indication a general ``UrlOperationsRemoteError``: ``UrlOperationsInteractionError`` general issues in communicating with the remote side; ``UrlOperationsAuthenticationError`` for errors related to (failed) authentication at the remote; ``UrlOperationsAuthorizationError`` for (lack of) authorizating to access a particular resource of perform a particular operation; ``UrlOperationsResourceUnknown`` if the target of an operation does not exist. TimeoutError If `timeout` is given and the operation does not complete within the number of seconds that a specified by `timeout`. """ raise NotImplementedError def upload(self, from_path: Path | None, to_url: str, *, credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Upload from a local file or stream to a URL Parameters ---------- from_path: Path or None A local platform-native path or `None`. If `None` the upload data is read from `stdin`, otherwise it is read from a file at the given path. to_url: str Valid URL with any scheme supported by a particular implementation. The target is assumed to not conflict with existing content, and may be overwritten. credential: str, optional The name of a dedicated credential to be used for authentication in order to perform the upload. Particular implementations may or may not require or support authentication. They also may or may not support automatic credential lookup. hash: list(algorithm_names), optional If given, must be a list of hash algorithm names supported by the `hashlib` module. A corresponding hash will be computed simultaenous to the upload (without reading the data twice), and included in the return value. timeout: float, optional If given, specifies a timeout in seconds. If the operation is not completed within this time, it will raise a `TimeoutError`-exception. If timeout is None, the operation will never timeout. Returns ------- dict A mapping of property names to values for the completed upload. If `hash` algorithm names are provided, a corresponding key for each algorithm is included in this mapping, with the hexdigest of the corresponding checksum as the value. Raises ------ FileNotFoundError If the source file cannot be found. UrlOperationsRemoteError This exception is raised on any deletion-related error on the remote side, with a summary of the underlying issues as its message. It may carry a status code (e.g. HTTP status code) as its ``status_code`` property. Any underlying exception must be linked via the `__cause__` property (e.g. `raise UrlOperationsRemoteError(...) from ...`). UrlOperationsInteractionError UrlOperationsAuthenticationError UrlOperationsAuthorizationError UrlOperationsResourceUnknown Implementations that can distinguish several remote error types beyond indication a general ``UrlOperationsRemoteError``: ``UrlOperationsInteractionError`` general issues in communicating with the remote side; ``UrlOperationsAuthenticationError`` for errors related to (failed) authentication at the remote; ``UrlOperationsAuthorizationError`` for (lack of) authorizating to access a particular resource of perform a particular operation; ``UrlOperationsResourceUnknown`` if the target of an operation does not exist. 
TimeoutError If `timeout` is given and the operation does not complete within the number of seconds that a specified by `timeout`. """ raise NotImplementedError def delete(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Delete a resource identified by a URL Parameters ---------- url: str Valid URL with any scheme supported by a particular implementation. credential: str, optional The name of a dedicated credential to be used for authentication in order to perform the deletion. Particular implementations may or may not require or support authentication. They also may or may not support automatic credential lookup. timeout: float, optional If given, specifies a timeout in seconds. If the operation is not completed within this time, it will raise a `TimeoutError`-exception. If timeout is None, the operation will never timeout. Returns ------- dict A mapping of property names to values for the deletion. Raises ------ UrlOperationsRemoteError This exception is raised on any deletion-related error on the remote side, with a summary of the underlying issues as its message. It may carry a status code (e.g. HTTP status code) as its ``status_code`` property. Any underlying exception must be linked via the `__cause__` property (e.g. `raise UrlOperationsRemoteError(...) from ...`). UrlOperationsInteractionError UrlOperationsAuthenticationError UrlOperationsAuthorizationError UrlOperationsResourceUnknown Implementations that can distinguish several remote error types beyond indication a general ``UrlOperationsRemoteError``: ``UrlOperationsInteractionError`` general issues in communicating with the remote side; ``UrlOperationsAuthenticationError`` for errors related to (failed) authentication at the remote; ``UrlOperationsAuthorizationError`` for (lack of) authorizating to access a particular resource of perform a particular operation; ``UrlOperationsResourceUnknown`` if the target of an operation does not exist. TimeoutError If `timeout` is given and the operation does not complete within the number of seconds that a specified by `timeout`. 
""" raise NotImplementedError def _get_progress_id(self, from_id: str, to_id: str): return f'progress_transport_{from_id}_{to_id}' def _progress_report_start(self, pid: str, log_msg: tuple, label: str, expected_size: int | None): log_progress( lgr.info, pid, *log_msg, unit=' Bytes', label=label, total=expected_size, noninteractive_level=logging.DEBUG, ) def _progress_report_update(self, pid: str, log_msg: tuple, increment: int): log_progress( lgr.info, pid, *log_msg, update=increment, increment=True, noninteractive_level=logging.DEBUG, ) def _progress_report_stop(self, pid: str, log_msg: tuple): log_progress( lgr.info, pid, *log_msg, noninteractive_level=logging.DEBUG, ) def _get_hasher(self, hash: list[str] | None) -> NoOpHash | MultiHash: return MultiHash(hash) if hash is not None else NoOpHash() def _with_progress(self, stream: Iterable[Any], *, progress_id: str, label: str, expected_size: int | None, start_log_msg: tuple, end_log_msg: tuple, update_log_msg: tuple ) -> Generator[Any, None, None]: yield from side_effect( lambda chunk: self._progress_report_update( progress_id, update_log_msg, len(chunk) ), stream, before=partial( self._progress_report_start, progress_id, start_log_msg, label, expected_size ), after=partial( self._progress_report_stop, progress_id, end_log_msg ) ) datalad-next-1.4.1/datalad_next/url_operations/exceptions.py000066400000000000000000000040531462321624600243230ustar00rootroot00000000000000"""Exceptions to be used by all handlers""" from __future__ import annotations from typing import ( Any, ) class UrlOperationsRemoteError(Exception): def __init__(self, url, message=None, status_code: Any = None): # use base exception feature to store all arguments in a tuple # and have named properties to access them super().__init__( url, message, status_code, ) def __str__(self): url, message, status_code = self.args if message: return message if status_code: return f"error {status_code} for {url!r}" return f"{self.__class__.__name__} for {url!r}" def __repr__(self) -> str: url, message, status_code = self.args return f"{self.__class__.__name__}(" \ f"{url!r}, {message!r}, {status_code!r})" @property def url(self): return self.args[0] @property def message(self): return self.args[1] @property def status_code(self): return self.args[2] class UrlOperationsResourceUnknown(UrlOperationsRemoteError): """A connection request succeeded in principle, but target was not found Equivalent of an HTTP404 response. 
""" pass class UrlOperationsInteractionError(UrlOperationsRemoteError): pass class UrlOperationsAuthenticationError(UrlOperationsInteractionError): def __init__(self, url: str, credential: dict | None = None, message: str | None = None, status_code: Any = None): super().__init__(url, message=message, status_code=status_code) self.credential = credential class UrlOperationsAuthorizationError(UrlOperationsRemoteError): def __init__(self, url: str, credential: dict | None = None, message: str | None = None, status_code: Any | None = None): super().__init__(url, message=message, status_code=status_code) self.credential = credential datalad-next-1.4.1/datalad_next/url_operations/file.py000066400000000000000000000215261462321624600230650ustar00rootroot00000000000000"""Handler for operations, such as "download", on file:// URLs""" # allow for |-type UnionType declarations from __future__ import annotations import logging from pathlib import Path import sys from typing import Dict from urllib import ( request, parse, ) from datalad_next.consts import COPY_BUFSIZE from .base import UrlOperations from .exceptions import ( UrlOperationsRemoteError, UrlOperationsResourceUnknown, ) lgr = logging.getLogger('datalad.ext.next.file_url_operations') __all__ = ['FileUrlOperations'] class FileUrlOperations(UrlOperations): """Handler for operations on `file://` URLs Access to local data via file-scheme URLs is supported with the same API and feature set as other URL-schemes (simultaneous content hashing and progress reporting. """ def _file_url_to_path(self, url): assert url.startswith('file://') parsed = parse.urlparse(url) path = request.url2pathname(parsed.path) return Path(path) def stat(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Gather information on a URL target, without downloading it See :meth:`datalad_next.url_operations.UrlOperations.stat` for parameter documentation and exception behavior. Raises ------ UrlOperationsResourceUnknown For access targets found absent. """ # filter out internals return { k: v for k, v in self._stat(url, credential).items() if not k.startswith('_') } def _stat(self, url: str, credential: str | None = None) -> Dict: # turn url into a native path from_path = self._file_url_to_path(url) # if anything went wrong with the conversion, or we lack # permissions: die here try: size = from_path.stat().st_size except FileNotFoundError as e: raise UrlOperationsResourceUnknown(url) from e return { 'content-length': size, '_path': from_path, } def download(self, from_url: str, to_path: Path | None, *, # unused, but theoretically could be used to # obtain escalated/different privileges on a system # to gain file access credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Copy a file:// URL target to a local path See :meth:`datalad_next.url_operations.UrlOperations.download` for parameter documentation and exception behavior. Raises ------ UrlOperationsResourceUnknown For download targets found absent. 
""" dst_fp = None try: props = self._stat(from_url, credential=credential) from_path = props['_path'] expected_size = props['content-length'] dst_fp = sys.stdout.buffer if to_path is None \ else open(to_path, 'wb') with from_path.open('rb') as src_fp: props.update(self._copyfp( src_fp, dst_fp, expected_size, hash, start_log=('Download %s to %s', from_url, to_path), update_log=('Downloaded chunk',), finish_log=('Finished download',), progress_label='downloading', )) return props except PermissionError: # would be a local issue, pass-through raise except UrlOperationsResourceUnknown: # would come from stat(), pass_through raise except Exception as e: # wrap this into the datalad-standard, but keep the # original exception linked raise UrlOperationsRemoteError(from_url, message=str(e)) from e finally: if dst_fp and to_path is not None: dst_fp.close() def upload(self, from_path: Path | None, to_url: str, *, credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Copy a local file to a file:// URL target Any missing parent directories of the URL target are created as necessary. See :meth:`datalad_next.url_operations.UrlOperations.upload` for parameter documentation and exception behavior. Raises ------ FileNotFoundError If the source file cannot be found. """ # get the size, or die if inaccessible props = {} if from_path: expected_size = from_path.stat().st_size props['content-length'] = expected_size else: expected_size = None to_path = self._file_url_to_path(to_url) # create parent dir(s) as necessary to_path.parent.mkdir(exist_ok=True, parents=True) src_fp = None try: src_fp = sys.stdin.buffer if from_path is None \ else open(from_path, 'rb') with to_path.open('wb') as dst_fp: props.update(self._copyfp( src_fp, dst_fp, expected_size, hash, start_log=('Upload %s to %s', from_path, to_url), update_log=('Uploaded chunk',), finish_log=('Finished upload',), progress_label='uploading', )) return props except FileNotFoundError as e: raise UrlOperationsResourceUnknown(url) from e except Exception as e: # wrap this into the datalad-standard, but keep the # original exception linked raise UrlOperationsRemoteError(from_url, message=str(e)) from e finally: if src_fp and from_path is not None: src_fp.close() def delete(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Delete the target of a file:// URL The target can be a file or a directory. If it is a directory, it has to be empty. See :meth:`datalad_next.url_operations.UrlOperations.delete` for parameter documentation and exception behavior. Raises ------ UrlOperationsResourceUnknown For deletion targets found absent. 
""" path = self._file_url_to_path(url) try: path.unlink() except FileNotFoundError as e: raise UrlOperationsResourceUnknown(url) from e except IsADirectoryError: try: path.rmdir() except Exception as e: raise UrlOperationsRemoteError(url, message=str(e)) from e except Exception as e: # wrap this into the datalad-standard, but keep the # original exception linked raise UrlOperationsRemoteError(url, message=str(e)) from e def _copyfp(self, src_fp: file, dst_fp: file, expected_size: int, hash: list[str] | None, start_log: tuple, update_log: tuple, finish_log: tuple, progress_label: str, ) -> dict: # this is pretty much shutil.copyfileobj() with the necessary # wrapping to perform hashing and progress reporting hasher = self._get_hasher(hash) progress_id = self._get_progress_id(id(src_fp), id(src_fp)) # Localize variable access to minimize overhead src_fp_read = src_fp.read dst_fp_write = dst_fp.write props = {} self._progress_report_start( progress_id, start_log, progress_label, expected_size) copy_size = 0 try: while True: chunk = src_fp_read(COPY_BUFSIZE) if not chunk: break dst_fp_write(chunk) chunk_size = len(chunk) self._progress_report_update( progress_id, update_log, chunk_size) # compute hash simultaneously hasher.update(chunk) copy_size += chunk_size props.update(hasher.get_hexdigest()) # return how much was copied. we could compare with # `expected_size` and error on mismatch, but not all # sources can provide that (e.g. stdin) props['content-length'] = copy_size return props finally: self._progress_report_stop(progress_id, finish_log) datalad-next-1.4.1/datalad_next/url_operations/http.py000066400000000000000000000271111462321624600231210ustar00rootroot00000000000000"""Handler for operations, such as "download", on http(s):// URLs""" # allow for |-type UnionType declarations from __future__ import annotations import logging from pathlib import Path import sys from typing import Dict import requests from requests_toolbelt import user_agent import datalad from datalad_next.utils import ( DataladAuth, parse_www_authenticate, ) from .base import UrlOperations from .exceptions import ( UrlOperationsRemoteError, UrlOperationsResourceUnknown, ) lgr = logging.getLogger('datalad.ext.next.url_operations.http') __all__ = ['HttpUrlOperations'] class HttpUrlOperations(UrlOperations): """Handler for operations on `http(s)://` URLs This handler is built on the `requests` package. For authentication, it employes :class:`datalad_next.utils.requests_auth.DataladAuth`, an adaptor that consults the DataLad credential system in order to fulfill HTTP authentication challenges. """ def __init__(self, cfg=None, headers: Dict | None = None): """ Parameters ---------- cfg: ConfigManager, optional A config manager instance that is consulted for any configuration filesystem configuration individual handlers may support. headers: dict, optional Additional or alternative headers to add to a request. The default headers contain a ``user-agent`` declaration. Any headers provided here override corresponding defaults. 
""" super().__init__(cfg=cfg) self._headers = { 'user-agent': user_agent('datalad', datalad.__version__), } if headers: self._headers.update(headers) def get_headers(self, headers: Dict | None = None) -> Dict: # start with the default hdrs = dict(self._headers) if headers is not None: hdrs.update(headers) return hdrs def stat(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Gather information on a URL target, without downloading it See :meth:`datalad_next.url_operations.UrlOperations.stat` for parameter documentation and exception behavior. Raises ------ UrlOperationsResourceUnknown For access targets found absent. """ auth = DataladAuth(self.cfg, credential=credential) with requests.head( url, headers=self.get_headers(), auth=auth, # we want to match the `get` behavior explicitly # in order to arrive at the final URL after any # redirects that get would also end up with allow_redirects=True, ) as r: # fail visible for any non-OK outcome try: r.raise_for_status() except requests.exceptions.RequestException as e: # wrap this into the datalad-standard, but keep the # original exception linked if e.response.status_code == 404: # special case reporting for a 404 raise UrlOperationsResourceUnknown( url, status_code=e.response.status_code) from e else: raise UrlOperationsRemoteError( url, message=str(e), status_code=e.response.status_code ) from e props = { # standardize on lower-case header keys. # also prefix anything other than 'content-length' to make # room for future standardizations k.lower() if k.lower() == 'content-length' else f'http-{k.lower()}': v for k, v in r.headers.items() } props['url'] = r.url auth.save_entered_credential( context=f"for accessing {url}" ) if 'content-length' in props: # make an effort to return size in bytes as int try: props['content-length'] = int(props['content-length']) except (TypeError, ValueError): # but be reasonably robust against unexpected responses pass return props def download(self, from_url: str, to_path: Path | None, *, credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Download via HTTP GET request See :meth:`datalad_next.url_operations.UrlOperations.download` for parameter documentation and exception behavior. Raises ------ UrlOperationsResourceUnknown For download targets found absent. """ # a new manager per request # TODO optimize later to cache credentials per target # similar to requests_toolbelt.auth.handler.AuthHandler auth = DataladAuth(self.cfg, credential=credential) with requests.get( from_url, stream=True, headers=self.get_headers(), auth=auth, ) as r: # fail visible for any non-OK outcome try: r.raise_for_status() except requests.exceptions.RequestException as e: # wrap this into the datalad-standard, but keep the # original exception linked if e.response.status_code == 404: # special case reporting for a 404 raise UrlOperationsResourceUnknown( from_url, status_code=e.response.status_code) from e else: raise UrlOperationsRemoteError( from_url, message=str(e), status_code=e.response.status_code ) from e download_props = self._stream_download_from_request( r, to_path, hash=hash) auth.save_entered_credential( context=f'download from {from_url}' ) return download_props def probe_url(self, url, timeout=10.0, headers=None): """Probe a HTTP(S) URL for redirects and authentication needs This functions performs a HEAD request against the given URL, while waiting at most for the given timeout duration for a server response. 
Parameters ---------- url: str URL to probe timeout: float, optional Maximum time to wait for a server response to the probe headers: dict, optional Any custom headers to use for the probe request. If none are provided, or the provided headers contain no 'user-agent' field, the default DataLad user agent is added automatically. Returns ------- str or None, dict The first value is the URL against the final request was performed, after following any redirects and applying normalizations. The second value is a mapping with a particular set of properties inferred from probing the webserver. The following key-value pairs are supported: - 'is_redirect' (bool), True if any redirection occurred. This boolean property is a more accurate test than comparing input and output URL - 'status_code' (int), HTTP response code (of the final request in case of redirection). - 'auth' (dict), present if the final server response contained any 'www-authenticate' headers, typically the case for 401 responses. The dict contains a mapping of server-reported authentication scheme names (e.g., 'basic', 'bearer') to their respective properties (dict). These can be any nature and number, depending on the respective authentication scheme. Most notably, they may contain a 'realm' property that can be used to determine suitable credentials for authentication. Raises ------ requests.RequestException May raise any exception of the `requests` package, most notably `ConnectionError`, `Timeout`, `TooManyRedirects`, etc. """ hdrs = self.get_headers() if headers is None: headers = hdrs elif 'user-agent' not in headers: headers.update(hdrs) props = {} req = requests.head( url, allow_redirects=True, timeout=timeout, headers=headers, ) if 'www-authenticate' in req.headers: props['auth'] = parse_www_authenticate( req.headers['www-authenticate']) props['is_redirect'] = True if req.history else False props['status_code'] = req.status_code return req.url, props def _stream_download_from_request( self, r, to_path, hash: list[str] | None = None) -> Dict: from_url = r.url hasher = self._get_hasher(hash) progress_id = self._get_progress_id(from_url, to_path) # try to get download size, it might not be provided, e.g. if # chunked transport encoding is used try: # for compressed downloads the content length refers to the # compressed content expected_size = int(r.headers.get('content-length')) except (ValueError, TypeError): # some HTTP-200 responses do not have a `content-length` header, # e.g. if chunked transport encoding is used. in this case, set # up everything to calculate size by ourselves expected_size = None self._progress_report_start( progress_id, ('Download %s to %s', from_url, to_path), 'downloading', # can be None, and that is OK expected_size, ) fp = None props: Dict[str, str] = {} try: # we can only write to file-likes opened in bytes mode fp = sys.stdout.buffer if to_path is None else open(to_path, 'wb') # we need to track how much came down the pipe for progress # reporting downloaded_bytes = 0 # TODO make chunksize a config item, 65536 is the default in # requests_toolbelt for chunk in r.raw.stream(amt=65536, decode_content=True): # update how much data was transferred from the remote server. if expected_size: # if we have an expected size, we don't use the size of the # chunk for that because content might be downloaded with # transparent (de)compression. instead we ask the download # stream itself for its "position". 
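                    # note: raw.tell() reports the number of bytes received
                    # over the wire (i.e., the compressed size), which is
                    # also what the 'content-length' header refers to for
                    # compressed downloads, keeping progress total and
                    # updates consistent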
tell = r.raw.tell() else: # if we do not have an expected size, all we can use is # the size of the downloaded chunk. tell = downloaded_bytes + len(chunk) self._progress_report_update( progress_id, ('Downloaded chunk',), tell - downloaded_bytes, ) fp.write(chunk) downloaded_bytes = tell # compute hash simultaneously hasher.update(chunk) props.update(hasher.get_hexdigest()) return props finally: if fp and to_path is not None: fp.close() self._progress_report_stop(progress_id, ('Finished download',)) datalad-next-1.4.1/datalad_next/url_operations/ssh.py000066400000000000000000000346251462321624600227470ustar00rootroot00000000000000"""Handler for operations, such as "download", on ssh:// URLs""" # allow for |-type UnionType declarations from __future__ import annotations import logging import sys from functools import partial from itertools import chain from pathlib import ( Path, PurePosixPath, ) from queue import ( Full, Queue, ) from typing import ( Dict, Generator, IO, cast, ) from urllib.parse import ( urlparse, ParseResult, ) from datalad_next.consts import COPY_BUFSIZE from datalad_next.config import ConfigManager from datalad_next.itertools import align_pattern from datalad_next.runners import ( iter_subproc, CommandError, ) from .base import UrlOperations from .exceptions import ( UrlOperationsRemoteError, UrlOperationsResourceUnknown, ) lgr = logging.getLogger('datalad.ext.next.ssh_url_operations') __all__ = ['SshUrlOperations'] class SshUrlOperations(UrlOperations): """Handler for operations on ``ssh://`` URLs For downloading files, only servers that support execution of the commands 'printf', 'ls -nl', 'awk', and 'cat' are supported. This includes a wide range of operating systems, including devices that provide these commands via the 'busybox' software. .. note:: The present implementation does not support SSH connection multiplexing, (re-)authentication is performed for each request. This limitation is likely to be removed in the future, and connection multiplexing supported where possible (non-Windows platforms). """ # first try ls'ing the path, and catch a missing path with a dedicated 244 # exit code, to be able to distinguish the original exit=2 that ls-call # from a later exit=2 from awk in case of a "fatal error". # when executed through ssh, only a missing file would yield 244, while # a connection error or other problem unrelated to the present of a file # would a different error code (255 in case of a connection error) _stat_cmd = "printf \"\\1\\2\\3\"; ls '{fpath}' &> /dev/null " \ "&& ls -nl '{fpath}' | awk 'BEGIN {{ORS=\"\\1\"}} {{print $5}}' " \ "|| exit 244" _cat_cmd = "cat '{fpath}'" @staticmethod def _check_return_code(return_code: int, url: str): # At this point the subprocess has either exited, was terminated, or # was killed. if return_code == 244: # this is the special code for a file-not-found raise UrlOperationsResourceUnknown(url) elif return_code != 0: raise UrlOperationsRemoteError( url, message=f'ssh process returned {return_code}' ) def stat(self, url: str, *, credential: str | None = None, timeout: float | None = None) -> Dict: """Gather information on a URL target, without downloading it See :meth:`datalad_next.url_operations.UrlOperations.stat` for parameter documentation and exception behavior. 
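
        Notes
        -----
        A small shell command (see ``_stat_cmd``) is executed on the
        remote side to report the file size. A missing file is signaled
        with the dedicated exit code 244, which is translated into
        ``UrlOperationsResourceUnknown``.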
""" ssh_cat = _SshCommandBuilder(url, self.cfg) cmd = ssh_cat.get_cmd(SshUrlOperations._stat_cmd) try: with iter_subproc(cmd) as stream: try: props = self._get_props(url, stream) except StopIteration: # we did not receive all data that should be sent, if a # remote file exists. This indicates a non-existing # resource or some other problem. The remotely executed # command should signal the error via a non-zero exit code. # That will trigger a `CommandError` below. pass except CommandError: self._check_return_code(stream.returncode, url) return {k: v for k, v in props.items() if not k.startswith('_')} def _get_props(self, url, stream: Generator) -> dict: # Any stream must start with this magic marker, or we do not # recognize what is happening # after this marker, the server will send the size of the # to-be-downloaded file in bytes, followed by another magic # b'\1', and the file content after that. magic_marker = b'\1\2\3' # use the `align_pattern` iterable to guarantees, that the magic # marker is always contained in a complete chunk. aligned_stream = align_pattern(stream, magic_marker) # Because the stream should start with the pattern, the first chunk of # the aligned stream must contain it. # We know that the stream will deliver bytes, cast the result # accordingly. chunk = cast(bytes, next(aligned_stream)) if chunk[:len(magic_marker)] != magic_marker: raise RuntimeError("Protocol error: report header not received") chunk = chunk[len(magic_marker):] # We are done with the aligned stream, use the original stream again. # This is possible because `align_pattern` does not cache any data # after a `yield`. del aligned_stream # The length is transferred now and terminated by b'\x01'. while b'\x01' not in chunk: chunk += next(stream) marker_index = chunk.index(b'\x01') expected_size = int(chunk[:marker_index]) chunk = chunk[marker_index + 1:] props = { 'content-length': expected_size, # go back to the original iterator, no need to keep looking for # a pattern '_stream': chain([chunk], stream) if chunk else stream } return props def download(self, from_url: str, to_path: Path | None, *, # unused, but theoretically could be used to # obtain escalated/different privileges on a system # to gain file access credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Download a file by streaming it through an SSH connection. On the server-side, the file size is determined and sent. Afterwards the file content is sent via `cat` to the SSH client. See :meth:`datalad_next.url_operations.UrlOperations.download` for parameter documentation and exception behavior. """ # this is pretty much shutil.copyfileobj() with the necessary # wrapping to perform hashing and progress reporting hasher = self._get_hasher(hash) progress_id = self._get_progress_id(from_url, str(to_path)) dst_fp = None ssh_cat = _SshCommandBuilder(from_url, self.cfg) cmd = ssh_cat.get_cmd(f'{SshUrlOperations._stat_cmd}; {SshUrlOperations._cat_cmd}') try: with iter_subproc(cmd) as stream: try: props = self._get_props(from_url, stream) expected_size = props['content-length'] # The stream might have changed due to not yet processed, but # fetched data, that is now chained in front of it. 
Therefore we # get the updated stream from the props download_stream = props.pop('_stream') dst_fp = sys.stdout.buffer \ if to_path is None \ else open(to_path, 'wb') # Localize variable access to minimize overhead dst_fp_write = dst_fp.write # download can start for chunk in self._with_progress( download_stream, progress_id=progress_id, label='downloading', expected_size=expected_size, start_log_msg=('Download %s to %s', from_url, to_path), end_log_msg=('Finished download',), update_log_msg=('Downloaded chunk',) ): # write data dst_fp_write(chunk) # compute hash simultaneously hasher.update(chunk) except StopIteration: # we did not receive all data that should be sent, if a # remote file exists. This indicates a non-existing # resource or some other problem. The remotely executed # command should signal the error via a non-zero exit code. # That will trigger a `CommandError` below. pass except CommandError: self._check_return_code(stream.returncode, from_url) finally: if dst_fp and to_path is not None: dst_fp.close() return { **props, **hasher.get_hexdigest(), } def upload(self, from_path: Path | None, to_url: str, *, credential: str | None = None, hash: list[str] | None = None, timeout: float | None = None) -> Dict: """Upload a file by streaming it through an SSH connection. It, more or less, runs `ssh 'cat > '`. See :meth:`datalad_next.url_operations.UrlOperations.upload` for parameter documentation and exception behavior. """ if from_path is None: source_name = '' return self._perform_upload( src_fp=sys.stdin.buffer, source_name=source_name, to_url=to_url, hash_names=hash, expected_size=None, timeout=timeout, ) else: # die right away, if we lack read permissions or there is no file with from_path.open("rb") as src_fp: return self._perform_upload( src_fp=src_fp, source_name=str(from_path), to_url=to_url, hash_names=hash, expected_size=from_path.stat().st_size, timeout=timeout, ) def _perform_upload(self, src_fp: IO, source_name: str, to_url: str, hash_names: list[str] | None, expected_size: int | None, timeout: float | None) -> dict: hasher = self._get_hasher(hash_names) # we use a queue to implement timeouts. # we limit the queue to few items in order to `make queue.put()` # block relatively quickly, and thereby have the progress report # actually track the upload, i.e. the feeding of the stdin pipe # of the ssh-process, and not just the feeding of the # queue. # If we did not support timeouts, we could just use the following # as `input`-iterable for `iter_subproc`: # # `iter(partial(src_fp.read, COPY_BUFSIZE), b'') # upload_queue: Queue = Queue(maxsize=2) cmd = _SshCommandBuilder(to_url, self.cfg).get_cmd( # leave special exit code when writing fails, but not the # general SSH access "( mkdir -p '{fdir}' && cat > '{fpath}' ) || exit 244" ) progress_id = self._get_progress_id(source_name, to_url) try: with iter_subproc( cmd, input=self._with_progress( iter(upload_queue.get, None), progress_id=progress_id, label='uploading', expected_size=expected_size, start_log_msg=('Upload %s to %s', source_name, to_url), end_log_msg=('Finished upload',), update_log_msg=('Uploaded chunk',) ) ): upload_size = 0 for chunk in iter(partial(src_fp.read, COPY_BUFSIZE), b''): # we are just putting stuff in the queue, and rely on # its maxsize to cause it to block the next call to # have the progress reports be anyhow valid, we also # rely on put-timeouts to implement timeout. 
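                    # a put() that exceeds `timeout` raises queue.Full,
                    # which is turned into a TimeoutError in the handler
                    # below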
upload_queue.put(chunk, timeout=timeout) # compute hash simultaneously hasher.update(chunk) upload_size += len(chunk) upload_queue.put(None, timeout=timeout) except CommandError as e: self._check_return_code(e.returncode, to_url) except Full: if chunk != b'': # we had a timeout while uploading raise TimeoutError return { **hasher.get_hexdigest(), # return how much was copied. we could compare with # `expected_size` and error on mismatch, but not all # sources can provide that (e.g. stdin) 'content-length': upload_size } class _SshCommandBuilder: def __init__( self, url: str, cfg: ConfigManager, ): self.ssh_args, self._parsed = ssh_url2openargs(url, cfg) self.ssh_args.extend(('-e', 'none')) # make sure the essential pieces exist assert self._parsed.path self.substitutions = dict( fdir=str(PurePosixPath(self._parsed.path).parent), fpath=self._parsed.path, ) def get_cmd(self, payload_cmd: str, ) -> list[str]: cmd = ['ssh'] cmd.extend(self.ssh_args) cmd.append(payload_cmd.format(**self.substitutions)) return cmd def ssh_url2openargs( url: str, cfg: ConfigManager, ) -> tuple[list[str], ParseResult]: """Helper to report ssh-open arguments from a URL and config Returns a tuple with the argument list and the parsed URL. """ args: list[str] = list() parsed = urlparse(url) # make sure the essential pieces exist assert parsed.hostname for opt, arg in (('-p', parsed.port), ('-l', parsed.username), ('-i', cfg.get('datalad.ssh.identityfile'))): if arg: # f-string, because port is not str args.extend((opt, f'{arg}')) # we could also use .netloc here and skip -p/-l above args.append(parsed.hostname) return args, parsed datalad-next-1.4.1/datalad_next/url_operations/tests/000077500000000000000000000000001462321624600227305ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/url_operations/tests/__init__.py000066400000000000000000000000001462321624600250270ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/url_operations/tests/test_any.py000066400000000000000000000037711462321624600251400ustar00rootroot00000000000000import pytest from .. 
import ( UrlOperationsResourceUnknown, ) from ..any import ( _url_handlers, AnyUrlOperations, ) from ..http import HttpUrlOperations from ..file import FileUrlOperations def test_get_best_url_handler(monkeypatch): ops = AnyUrlOperations() assert type(ops._get_handler('https://example.com')) == HttpUrlOperations # it will report the "longest-matching" Handler # we create a non-sensicle FileUrlOperations record to test that with monkeypatch.context() as m: m.setitem( _url_handlers, r'https://ex.*\.co', ('datalad_next.url_operations.file.FileUrlOperations',), ) # the handlers are sucked into the class, so we need a new instance ops = AnyUrlOperations() assert type(ops._get_handler('https://example.com')) == FileUrlOperations def test_any_url_operations(tmp_path): test_path = tmp_path / 'myfile' test_url = test_path.as_uri() ops = AnyUrlOperations() # no target file (yet), precise exception with pytest.raises(UrlOperationsResourceUnknown): ops.stat(test_url) # now put something at the target location test_path.write_text('surprise!') # and now it works props = ops.stat(test_url) # we get the correct file size reported assert props['content-length'] == test_path.stat().st_size # and download download_path = tmp_path / 'download' props = ops.download(test_url, download_path, hash=['sha256']) assert props['sha256'] == '71de4622cf536ed4aa9b65fc3701f4fc5a198ace2fa0bda234fd71924267f696' assert props['content-length'] == 9 == test_path.stat().st_size # remove source and try again test_path.unlink() with pytest.raises(UrlOperationsResourceUnknown): ops.download(test_url, download_path) # try some obscure URL scheme with pytest.raises(ValueError): ops.stat('weird://stuff') # and it could have been figured out before assert ops.is_supported_url('weird://stuff') == False datalad-next-1.4.1/datalad_next/url_operations/tests/test_file.py000066400000000000000000000065631462321624600252720ustar00rootroot00000000000000import io import locale import pytest import sys from datalad_next.utils import on_linux from ..file import ( FileUrlOperations, UrlOperationsRemoteError, UrlOperationsResourceUnknown, ) def test_file_url_download(tmp_path): test_path = tmp_path / 'myfile' test_url = test_path.as_uri() ops = FileUrlOperations() # no target file (yet), precise exception with pytest.raises(UrlOperationsResourceUnknown): ops.stat(test_url) # now put something at the target location test_path.write_text('surprise!') # and now it works props = ops.stat(test_url) # we get the correct file size reported assert props['content-length'] == test_path.stat().st_size # and download download_path = tmp_path / 'download' props = ops.download(test_url, download_path, hash=['sha256']) assert props['sha256'] == '71de4622cf536ed4aa9b65fc3701f4fc5a198ace2fa0bda234fd71924267f696' assert props['content-length'] == 9 == test_path.stat().st_size # remove source and try again test_path.unlink() with pytest.raises(UrlOperationsResourceUnknown): ops.download(test_url, download_path) def test_file_url_upload(tmp_path, monkeypatch): payload = 'payload' payload_file = tmp_path / 'payload' test_upload_path = tmp_path / 'myfile' test_upload_url = test_upload_path.as_uri() ops = FileUrlOperations() # missing source file # standard exception, makes no sense to go custom thinks mih with pytest.raises(FileNotFoundError): ops.upload(payload_file, test_upload_url) # no empty targets lying around assert not test_upload_path.exists() # now again payload_file.write_text(payload) props = ops.upload(payload_file, test_upload_url, hash=['md5']) assert 
test_upload_path.read_text() == 'payload' assert props['content-length'] == len(payload) assert props['md5'] == '321c3cf486ed509164edec1e1981fec8' # upload from STDIN from_stdin_url = (tmp_path / 'missingdir' / 'from_stdin').as_uri() with monkeypatch.context() as m: m.setattr(sys, 'stdin', io.TextIOWrapper(io.BytesIO( bytes(payload, encoding='utf-8')))) props = ops.upload(None, from_stdin_url, hash=['md5']) assert props['md5'] == '321c3cf486ed509164edec1e1981fec8' assert props['content-length'] == len(payload) # TODO test missing write permissions def test_file_url_delete(tmp_path): payload = 'payload' test_path = tmp_path / 'subdir' / 'myfile' test_path.parent.mkdir() test_url = test_path.as_uri() ops = FileUrlOperations() # missing file with pytest.raises(UrlOperationsResourceUnknown): ops.delete(test_url) # place file test_path.write_text(payload) assert test_path.read_text() == payload # try deleting a non-empty dir with pytest.raises(UrlOperationsRemoteError): ops.delete(test_path.parent.as_uri()) # file deletion works ops.delete(test_url) assert not test_path.exists() # both windows and mac give incomprehensible AccessDenied # errors on appveyor, although the directory is confirmed # to be empty if on_linux: # empty dir deletion works too # confirm it is indeed empty assert not list(test_path.parent.iterdir()) ops.delete(test_path.parent.as_uri()) assert not test_path.parent.exists() datalad-next-1.4.1/datalad_next/url_operations/tests/test_http.py000066400000000000000000000117121462321624600253220ustar00rootroot00000000000000from __future__ import annotations import gzip import pytest import requests from datalad_next.tests import skipif_no_network from ..any import AnyUrlOperations from ..http import ( HttpUrlOperations, UrlOperationsRemoteError, UrlOperationsResourceUnknown, ) def test_http_url_operations(credman, httpbin, tmp_path): hbsurl = httpbin['standard'] hbscred = ( 'hbscred', dict(user='mike', secret='dummy', type='user_password', realm=f'{hbsurl}/Fake Realm'), ) credman.set(hbscred[0], **hbscred[1]) ops = HttpUrlOperations() # authentication after redirect target_url = f'{hbsurl}/basic-auth/mike/dummy' props = ops.stat(f'{hbsurl}/redirect-to?url={target_url}') # we get the resolved URL after redirect back assert props['url'] == target_url # same again, but credentials are wrong target_url = f'{hbsurl}/basic-auth/mike/WRONG' with pytest.raises(UrlOperationsRemoteError): ops.stat(f'{hbsurl}/redirect-to?url={target_url}') # make sure we get the size info assert ops.stat(f'{hbsurl}/bytes/63')['content-length'] == 63 # download # SFRUUEJJTiBpcyBhd2Vzb21l == 'HTTPBIN is awesome' props = ops.download(f'{hbsurl}/base64/SFRUUEJJTiBpcyBhd2Vzb21l', tmp_path / 'mydownload', hash=['md5']) assert (tmp_path / 'mydownload').read_text() == 'HTTPBIN is awesome' # 404s with pytest.raises(UrlOperationsResourceUnknown): ops.stat(f'{hbsurl}/status/404') with pytest.raises(UrlOperationsResourceUnknown): ops.download(f'{hbsurl}/status/404', tmp_path / 'dontmatter') def test_custom_http_headers_via_config(datalad_cfg): for k, v in ( ('datalad.url-handler.http.*.class', 'datalad_next.url_operations.http.HttpUrlOperations'), ('datalad.url-handler.http.*.kwargs', '{"headers": {"X-Funky": "Stuff"}}'), ): datalad_cfg.set(k, v, scope='global', reload=False) datalad_cfg.reload() auo = AnyUrlOperations() huo = auo._get_handler(f'http://example.com') assert huo._headers['X-Funky'] == 'Stuff' @skipif_no_network def test_transparent_decompression(tmp_path): # this file is offered with transparent 
compression/decompression # by the github webserver url = 'https://raw.githubusercontent.com/datalad/datalad-next/' \ 'd0c4746425a48ef20e3b1c218e68954db9412bee/pyproject.toml' dpath = tmp_path / 'test.txt' ops = HttpUrlOperations() ops.download(from_url=url, to_path=dpath) # make sure it ends up on disk uncompressed assert dpath.read_text() == \ '[build-system]\nrequires = ["setuptools >= 43.0.0", "wheel"]\n' @skipif_no_network def test_compressed_file_stay_compressed(tmp_path): # this file is offered with transparent compression/decompression # by the github webserver, but is also actually gzip'ed url = \ 'https://github.com/datalad/datalad-neuroimaging/raw/' \ '05b45c8c15d24b6b894eb59544daa17159a88945/' \ 'datalad_neuroimaging/tests/data/files/nifti1.nii.gz' # first confirm validity of the test approach, opening an # uncompressed file should raise an exception with pytest.raises(gzip.BadGzipFile): testpath = tmp_path / 'uncompressed' testpath.write_text('some') with gzip.open(testpath, 'rb') as f: f.read(1000) # and now with a compressed file dpath = tmp_path / 'test.nii.gz' ops = HttpUrlOperations() ops.download(from_url=url, to_path=dpath) # make sure it ends up on disk compressed! with gzip.open(dpath, 'rb') as f: f.read(1000) def test_size_less_progress_reporting(http_server, monkeypatch): test_file = (http_server.path / 'test.bin').open('wb') test_file.seek(100000) test_file.write(b'a') test_file.close() r = requests.get(http_server.url + '/test.bin', stream=True) del r.headers['content-length'] logs = [] # patch the log_progress() used in http.py def catch_progress(*_, **kwargs): logs.append(kwargs) import datalad_next.url_operations.base monkeypatch.setattr(datalad_next.url_operations.base, 'log_progress', catch_progress) http_handler = HttpUrlOperations() http_handler._stream_download_from_request(r, None) assert any('update' in kwargs for kwargs in logs) assert any(('total', None) in kwargs.items() for kwargs in logs) def test_header_adding(): default_headers = dict(key_1='value_1') added_headers = dict(key_2='value_2') url_ops = HttpUrlOperations(headers=default_headers) assert 'key_1' in url_ops.get_headers() # ensure that header entries from `headers` show up in result combined_keys = {'key_1', 'key_2'} result_1 = url_ops.get_headers(headers=dict(added_headers)) assert combined_keys.issubset(set(result_1)) # ensure that `headers` did not change the stored headers result_2 = url_ops.get_headers() assert 'key_2' not in set(result_2) datalad-next-1.4.1/datalad_next/url_operations/tests/test_ssh.py000066400000000000000000000107661462321624600251500ustar00rootroot00000000000000import contextlib import io import pytest from datalad_next.tests import ( skip_if_on_windows, ) from ..ssh import ( SshUrlOperations, UrlOperationsRemoteError, UrlOperationsResourceUnknown, ) # path magic inside the test is posix only @skip_if_on_windows def test_ssh_url_download(tmp_path, monkeypatch, sshserver): ssh_url, ssh_localpath = sshserver test_path = ssh_localpath / 'myfile' test_url = f'{ssh_url}/myfile' ops = SshUrlOperations() # no target file (yet), precise exception with pytest.raises(UrlOperationsResourceUnknown): ops.stat(test_url) # this is different for a general connection error with pytest.raises(UrlOperationsRemoteError): ops.stat(f'ssh://localhostnotaround{test_path}') # now put something at the target location test_path.write_text('surprise!') # and now it works props = ops.stat(test_url) # we get the correct file size reported assert props['content-length'] == 
test_path.stat().st_size # simulate a "protocol error" where the server-side command # is not reporting the magic header with monkeypatch.context() as m: m.setattr(SshUrlOperations, '_stat_cmd', 'echo nothing') # we get a distinct exception with pytest.raises(RuntimeError): ops.stat(test_url) # and download download_path = tmp_path / 'download' props = ops.download(test_url, download_path, hash=['sha256']) assert props['sha256'] == '71de4622cf536ed4aa9b65fc3701f4fc5a198ace2fa0bda234fd71924267f696' assert props['content-length'] == 9 == test_path.stat().st_size # remove source and try again test_path.unlink() with pytest.raises(UrlOperationsResourceUnknown): ops.download(test_url, download_path) # this is different for a general connection error with pytest.raises(UrlOperationsRemoteError): ops.download(f'ssh://localhostnotaround{test_path}', download_path) # path magic inside the test is posix only @skip_if_on_windows def test_ssh_url_upload(tmp_path, monkeypatch, sshserver): ssh_url, ssh_localpath = sshserver payload = 'surprise!' payload_path = tmp_path / 'payload' upload_path = ssh_localpath / 'subdir' / 'myfile' upload_url = f'{ssh_url}/subdir/myfile' ops = SshUrlOperations() # standard error if local source is not around with pytest.raises(FileNotFoundError): ops.upload(payload_path, upload_url) payload_path.write_text(payload) # upload creates parent dirs, so the next just works. # this may seem strange for SSH, but FILE does it too. # likewise an HTTP upload is also not required to establish # server-side preconditions first. # this functionality is not about exposing a full # remote FS abstraction -- just upload ops.upload(payload_path, upload_url) assert upload_path.read_text() == payload def test_ssh_url_upload_from_stdin(tmp_path, monkeypatch, sshserver): ssh_url, ssh_localpath = sshserver payload = 'surprise!' upload_path = ssh_localpath / 'uploaded.txt' upload_url = f'{ssh_url}/uploaded.txt' ops = SshUrlOperations() class StdinBufferMock: def __init__(self, byte_stream: bytes): self.buffer = io.BytesIO(byte_stream) with monkeypatch.context() as mp_ctx: mp_ctx.setattr('sys.stdin', StdinBufferMock(payload.encode())) ops.upload(None, upload_url) assert upload_path.exists() assert upload_path.read_text() == payload def test_ssh_url_upload_timeout(tmp_path, monkeypatch): payload = 'surprise!' payload_path = tmp_path / 'payload' payload_path.write_text(payload) upload_url = f'ssh://localhost/not_used' ssh_url_ops = SshUrlOperations() @contextlib.contextmanager def mocked_iter_subproc(*args, **kwargs): yield None with monkeypatch.context() as mp_ctx: import datalad_next.url_operations.ssh mp_ctx.setattr(datalad_next.url_operations.ssh, 'iter_subproc', mocked_iter_subproc) mp_ctx.setattr(datalad_next.url_operations.ssh, 'COPY_BUFSIZE', 1) with pytest.raises(TimeoutError): ssh_url_ops.upload(payload_path, upload_url, timeout=1) def test_check_return_code(): SshUrlOperations._check_return_code(0, 'test-0') with pytest.raises(UrlOperationsResourceUnknown): SshUrlOperations._check_return_code(244, 'test-244') with pytest.raises(UrlOperationsRemoteError): SshUrlOperations._check_return_code(None, 'test-None') SshUrlOperations._check_return_code(1, 'test-1') datalad-next-1.4.1/datalad_next/utils/000077500000000000000000000000001462321624600176615ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/utils/__init__.py000066400000000000000000000053321462321624600217750ustar00rootroot00000000000000"""Assorted utility functions .. currentmodule:: datalad_next.utils .. 
autosummary:: :toctree: generated DataladAuth MultiHash check_symlink_capability chpwd ensure_list external_versions log_progress parse_www_authenticate patched_env rmtree get_specialremote_param_dict get_specialremote_credential_properties update_specialremote_credential needs_specialremote_credential_envpatch get_specialremote_credential_envpatch """ from datalad.utils import ( # TODO REMOVE FOR V2.0 Path, check_symlink_capability, chpwd, # TODO REMOVE FOR V2.0 ensure_bool, ensure_list, # TODO https://github.com/datalad/datalad-next/issues/626 get_dataset_root, # TODO REMOVE FOR V2.0 # only needed for `interface_utils` patch and better be imported # in there getargspec, # TODO REMOVE FOR V2.0 get_wrapped_class, # TODO REMOVE FOR V2.0 knows_annex, # TODO REMOVE FOR V2.0 # in datalad_next.consts on_linux, # TODO REMOVE FOR V2.0 # in datalad_next.consts on_windows, # TODO REMOVE FOR V2.0 rmtemp, rmtree, # TODO REMOVE FOR V2.0 # only a test utility and should move there swallow_outputs, ) # TODO REMOVE FOR V2.0 # internal helper of create_sibling_webdav from datalad.distribution.utils import _yield_ds_w_matching_siblings from datalad.support.external_versions import external_versions # TODO REMOVE FOR V2.0 from datalad_next.credman import CredentialManager from .log import log_progress from .multihash import MultiHash from .requests_auth import ( DataladAuth, parse_www_authenticate, ) from .specialremote import ( get_specialremote_param_dict, get_specialremote_credential_properties, update_specialremote_credential, specialremote_credential_envmap, needs_specialremote_credential_envpatch, get_specialremote_credential_envpatch, ) from .patch import patched_env # TODO REMOVE EVERYTHING BELOW FOR V2.0 # https://github.com/datalad/datalad-next/issues/611 from typing import ( Any, Dict, ) class ParamDictator: """Parameter dict access helper This class can be used to wrap a dict containing function parameter name-value mapping, and get/set values by parameter name attribute rather than via the ``__getitem__`` dict API. """ def __init__(self, params: Dict): self.__params = params def __getattr__(self, attr: str): if attr.startswith('__'): return super().__getattribute__(attr) return self.__params[attr] def __setattr__(self, attr: str, value: Any): if attr.startswith('_ParamDictator__'): super().__setattr__(attr, value) else: self.__params[attr] = value datalad-next-1.4.1/datalad_next/utils/consts.py000066400000000000000000000002061462321624600215420ustar00rootroot00000000000000# ATTN! These are legacy imports. DO NOT ADD ANYTHING! from datalad_next.consts import ( COPY_BUFSIZE, PRE_INIT_COMMIT_SHA, ) datalad-next-1.4.1/datalad_next/utils/credman.py000066400000000000000000000005061462321624600216450ustar00rootroot00000000000000import warnings warnings.warn( "datalad_next.utils.credman was replaced by datalad_next.credman in " "datalad_next 1.0. This transition helper module will be removed in " "datalad_next 2.0.", DeprecationWarning, ) from datalad_next.credman.manager import ( CredentialManager, verify_property_names, ) datalad-next-1.4.1/datalad_next/utils/deprecate.py000066400000000000000000000075271462321624600222020ustar00rootroot00000000000000from __future__ import annotations from collections.abc import Callable from functools import wraps import warnings __all__ = ['deprecated'] _base_tmpl = "{mod}.{func} was deprecated in version {version}. 
{msg}" _kwarg_tmpl = f"argument {{kwarg!r}} of {_base_tmpl}" _kwarg_val_tmpl = f"Use of values {{kwarg_values!r}} for {_kwarg_tmpl}" # we must have a secret value that indicates "no value deprecation", otherwise # we cannot tell whether `None` is deprecated or not class _NoDeprecatedValue: pass def deprecated( msg: str, *, version: str, kwarg: str | None = None, kwarg_values: list | _NoDeprecatedValue = _NoDeprecatedValue, ) -> Callable: """Annotate functions, classes, or (required) keyword-arguments with standardized deprecation warnings. Support for deprecation messages on individual keyword arguments is limited to calls with explicit keyword-argument use, not (implicit) use as a positional argument. Parameters ---------- msg: str Custom message to append to a deprecation warning version: str Software version number at which the deprecation was made kwarg: str, optional Name of the particular deprecated keyword argument (instead of entire function/class) kwarg_values: list, optional Particular deprecated values of the specified keyword-argument """ # normalize to a set(), when the set is empty, no particular value # was deprecated kwarg_values = set() \ if kwarg_values is _NoDeprecatedValue else set(kwarg_values) def decorator(func): @wraps(func) def func_with_deprecation_warning(*args, **kwargs): # this is the layer that run for a deprecated call # do we have a deprecated kwargs, but it has not been used? # -> quick way out if kwarg is not None and kwarg not in kwargs.keys(): # there is nothing to deprecate return func(*args, **kwargs) # pick the right message template # whole thing deprecated, or kwargs, or particular kwarg-value template = _base_tmpl if kwarg is None \ else _kwarg_tmpl if not kwarg_values \ else _kwarg_val_tmpl # deprecated value to compare against val = kwargs.get(kwarg, _NoDeprecatedValue) # comprehensive set of conditions when to issue deprecation # warning # - no particular kwarg is deprecated, but the whole callable # - no particular value is deprecated, but the whole argument # - given value matches any deprecated value # - given list/tuple/dict-keys match any deprecated value if (# no particular kwarg is deprecated, but the whole callable kwarg is None # no particular value is deprecated, but the whole argument or not kwarg_values # given value matches any deprecated value # exluce tuple/list, because they are not hashable or (not isinstance(val, (list, dict)) and val in kwarg_values) # given list/tuple-item or dict-key match any deprecated value or (isinstance(val, (tuple, list, dict)) and kwarg_values.intersection(val)) ): warnings.warn( template.format( mod=func.__module__, func=func.__name__, kwarg=kwarg, kwarg_values=kwarg_values, version=version, msg=msg, ), DeprecationWarning, ) return func(*args, **kwargs) return func_with_deprecation_warning return decorator datalad-next-1.4.1/datalad_next/utils/http_helpers.py000066400000000000000000000051531462321624600227400ustar00rootroot00000000000000"""Small helpers for HTTP operations """ # allow for |-type UnionType declarations from __future__ import annotations import logging from urllib.parse import urlparse lgr = logging.getLogger('datalad.ext.next.utils.http_helpers') __all__ = ['get_auth_realm'] def get_auth_realm(url, auth_info, scheme=None): """Determine an authentication realm identifier from a HTTP response. 
Examples -------- Robustly determine a realm identifier for any URL:: > url, props = HttpUrlOperations().probe_url( 'https://fz-juelich.sciebo.de/...') > get_auth_realm(url, props.get('auth')) 'https://fz-juelich.sciebo.de/login' Parameters ---------- url: str A URL as returned by `probe_url()` auth_info: dict A mapping of supported authentication schemes to the properties (i.e., a 'www-authenticate' response header), as returned by `probe_url()`'s 'auth' property. scheme: str, optional Which specific authentication to report a realm for, in case multiple are supported (such as 'basic', or 'token'). If not given, the first (if any) reported authentication scheme is reported on. Returns ------- str A server-specific realm identifier """ if not auth_info: # no info from the server on what it needs # out best bet is the URL itself return url if scheme: auth_info = auth_info[scheme] else: scheme, auth_info = auth_info.copy().popitem() # take any, but be satisfied with none too realm = auth_info.get('realm') if auth_info else '' # a realm is supposed to indicate a validity scope of a credential # on a server. so we make sure to have the return realm identifier # actually indicate a server too, in order to make it suitable for # a global credential lookup if _is_valid_url(realm): # the realm is already a valid URL with a server specification. # we can simply relay it as such, following the admins' judgement return realm else: # the realm was just some kind of string. we prefix it with the # netloc of the given URL (ignoring its path) to achieve # the same server-specific realm semantics parsed = urlparse(url) return '{scheme}://{netloc}{slash}{realm}'.format( scheme=parsed.scheme, netloc=parsed.netloc, slash='' if realm.startswith('/') else'/', realm=realm, ) def _is_valid_url(url): try: parsed = urlparse(url) return all([parsed.scheme, parsed.netloc]) except: return False datalad-next-1.4.1/datalad_next/utils/log.py000066400000000000000000000000451462321624600210130ustar00rootroot00000000000000from datalad.log import log_progress datalad-next-1.4.1/datalad_next/utils/multihash.py000066400000000000000000000033111462321624600222270ustar00rootroot00000000000000"""Compute more than one hash for the same data in one go""" from __future__ import annotations import hashlib from typing import ( ByteString, Dict, ) class NoOpHash: """Companion of :class:`MultiHash` that computes no hash at all This can be used wherever ``MultiHash`` would be used, because it implements its API. However, no hash is computed and no hexdigest is reported. """ def __init__(self, algorithms: None = None): pass def update(self, data): pass def get_hexdigest(self): return {} class MultiHash: """Compute any number of hashes as if computing just one Supports any hash algorithm supported by the ``hashlib`` module of the standard library. """ def __init__(self, algorithms: list[str]): """ Parameters ---------- algorithms: list Hash names, must match the name of the algorithms in the ``hashlib`` module (case insensitive). 
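
        Examples
        --------
        A minimal sketch (the digest value matches this package's test
        data)::

            mh = MultiHash(['md5', 'sha256'])
            mh.update(b'payload')
            mh.get_hexdigest()['md5']
            # -> '321c3cf486ed509164edec1e1981fec8'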
""" # yes, this will crash, if an invalid hash algorithm name # is given _hasher = [] for h in algorithms: hr = getattr(hashlib, h.lower(), None) if hr is None: raise ValueError(f'unsupported hash algorithm {h}') _hasher.append(hr()) self._hasher = dict(zip(algorithms, _hasher)) def update(self, data: ByteString) -> None: """Updates all configured digests""" for h in self._hasher.values(): h.update(data) def get_hexdigest(self) -> Dict[str, str]: """Returns a mapping of algorithm name to hexdigest for all algorithms """ return {a: h.hexdigest() for a, h in self._hasher.items()} datalad-next-1.4.1/datalad_next/utils/patch.py000066400000000000000000000015311462321624600213320ustar00rootroot00000000000000import contextlib from os import environ # legacy import from datalad_next.patches import apply_patch @contextlib.contextmanager def patched_env(**env): """Context manager for patching the process environment Any number of kwargs can be given. Keys represent environment variable names, and values their values. A value of ``None`` indicates that the respective variable should be unset, i.e., removed from the environment. """ preserve = {} for name, val in env.items(): preserve[name] = environ.get(name, None) if val is None: del environ[name] else: environ[name] = str(val) try: yield finally: for name, val in preserve.items(): if val is None: del environ[name] else: environ[name] = val datalad-next-1.4.1/datalad_next/utils/requests_auth.py000066400000000000000000000356311462321624600231370ustar00rootroot00000000000000"""python-requests-compatible authentication handler using DataLad credentials """ # allow for |-type UnionType declarations from __future__ import annotations import logging from typing import Dict from urllib.parse import urlparse import requests from datalad_next.config import ConfigManager from datalad_next.credman import CredentialManager from .http_helpers import get_auth_realm lgr = logging.getLogger('datalad.ext.next.utils.requests_auth') __all__ = ['DataladAuth', 'HTTPBearerTokenAuth', 'parse_www_authenticate'] def parse_www_authenticate(hdr: str) -> dict: """Parse HTTP www-authenticate header This helper uses ``requests`` utilities to parse the ``www-authenticate`` header as represented in a ``requests.Response`` instance. The header may contain any number of challenge specifications. The implementation follows RFC7235, where a challenge parameters set is specified as: either a comma-separated list of parameters, or a single sequence of characters capable of holding base64-encoded information, and parameters are name=value pairs, where the name token is matched case-insensitively, and each parameter name MUST only occur once per challenge. Returns ------- dict Keys are casefolded challenge labels (e.g., 'basic', 'digest'). 
Values are: ``None`` (no parameter), ``str`` (a token68), or ``dict`` (name/value mapping of challenge parameters) """ plh = requests.utils.parse_list_header pdh = requests.utils.parse_dict_header challenges = {} challenge = None # challenges as well as their properties are in a single # comma-separated list for item in plh(hdr): # parse the item into a key/value set # the value will be `None` if this item was no mapping k, v = pdh(item).popitem() # split the key to check for a challenge spec start key_split = k.split(' ', maxsplit=1) if len(key_split) > 1 or v is None: item_suffix = item[len(key_split[0]) + 1:] challenge = [item[len(key_split[0]) + 1:]] if item_suffix else None challenges[key_split[0].casefold()] = challenge else: # implementation logic assumes that the above conditional # was triggered before we ever get here assert challenge challenge.append(item) return { challenge: _convert_www_authenticate_items(items) for challenge, items in challenges.items() } def _convert_www_authenticate_items(items: list) -> None | str | dict: pdh = requests.utils.parse_dict_header # according to RFC7235, items can be: # either a comma-separated list of parameters # or a single sequence of characters capable of holding base64-encoded # information. # parameters are name=value pairs, where the name token is matched # case-insensitively, and each parameter name MUST only occur once # per challenge. if items is None: return None elif len(items) == 1 and pdh(items[0].rstrip('=')).popitem()[1] is None: # this items matches the token68 appearance (no name value # pair after potential base64 padding its removed return items[0] else: return { k.casefold(): v for i in items for k, v in pdh(i).items() } class DataladAuth(requests.auth.AuthBase): """Requests-style authentication handler using DataLad credentials Similar to request_toolbelt's `AuthHandler`, this is a meta implementation that can be used with different actual authentication schemes. In contrast to `AuthHandler`, a credential can not only be specified directly, but credentials can be looked up based on the target URL and the server-supported authentication schemes. In addition to programmatic specification and automated lookup, manual credential entry using interactive prompts is also supported. At present, this implementation is not thread-safe. """ _supported_auth_schemes = { 'basic': 'user_password', 'digest': 'user_password', 'bearer': 'token', } def __init__(self, cfg: ConfigManager, credential: str | None = None): """ Parameters ---------- cfg: ConfigManager Is passed to CredentialManager() as `cfg`-parameter. credential: str, optional Name of a particular credential to be used for any operations. """ self._credman = CredentialManager(cfg) self._credential = credential self._entered_credential = None def save_entered_credential(self, suggested_name: str | None = None, context: str | None = None) -> Dict | None: """Utility method to save a pending credential in the store Pending credentials have been entered manually, and were subsequently used successfully for authentication. Saving a credential will prompt for entering a name to identify the credentials. 
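Example ------- A rough usage sketch; the URL and the context string are purely illustrative and not part of the API:: > auth = DataladAuth(cfg) > requests.get('https://example.com/data', auth=auth) > auth.save_entered_credential(context='download from example.com')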
""" if self._entered_credential is None: # nothing to do return None return self._credman.set( name=None, _lastused=True, _suggested_name=suggested_name, _context=context, **self._entered_credential ) def __call__(self, r): # TODO support being called from multiple threads #self.init_per_thread_state() # register hooks to be executed from a response to this # request is available # Redirect: reset credentials to avoid leakage to other server r.register_hook("response", self.handle_redirect) # 401 Unauthorized: look for a credential and try again r.register_hook("response", self.handle_401) return r def _get_credential(self, url, auth_schemes ) -> tuple[str | None, str | None, Dict | None]: """Get a credential for access to `url` given server-supported schemes If a particular credential to use was given to the `DataladAuth` constructor it reported here. In all other situations a credential will be looked up, based on the access URL and the authentication schemes supported by the host. The authentication schemes will be tested in the order in which they are reported by the remote host. If no matching credential can be identified, a prompt to enter a credential is presented. The credential type will match, and be used with the first authentication scheme that is both reported by the host, and by this implementation. The methods returns a 3-tuple. The first element is an identifier for the authentication scheme ('basic', digest', etc.) to use with the credential. The second item is the name for the reported credential, and the third element is a dictionary with the credential properties and its secret. Any of the three items can be `None` if the respective information is not available. """ if self._credential: cred = self._credman.get(name=self._credential) # this credential is scheme independent return None, self._credential, cred # no credential identified, find one for ascheme in auth_schemes: if ascheme not in DataladAuth._supported_auth_schemes: # nothing we can handle continue ctype = DataladAuth._supported_auth_schemes[ascheme] # get a realm ID for this authentication scheme realm = get_auth_realm(url, auth_schemes, scheme=ascheme) # ask for matching credentials creds = [ (name, cred) for name, cred in self._credman.query( _sortby='last-used', type=ctype, realm=realm, ) # we can only work with complete credentials, although # query() might return others. We exclude them here # to be able to fall back on manual entry further down if cred.get('secret') ] if creds: # we have matches, go with the last used one name, cred = creds[0] return ascheme, name, cred # no success finding an existing credential, now ask, if possible # pick a scheme that is supported by the server and by us ascheme = [s for s in auth_schemes if s in DataladAuth._supported_auth_schemes] if not ascheme: # f-string OK, only executed on failure lgr.debug( 'Only unsupported HTTP auth schemes offered ' f'{list(auth_schemes.keys())!r}') # go with the first supported scheme ascheme = ascheme[0] ctype = DataladAuth._supported_auth_schemes[ascheme] try: realm = get_auth_realm(url, auth_schemes) cred = self._credman.get( name=None, _prompt=f'Credential needed for accessing {url} (authentication realm {realm!r})', _type_hint=ctype, type=ctype, # include the realm in the credential to avoid asking for it # interactively (it is a server-specified property # users would generally not know, if they do, they can use the # `credentials` command upfront. 
realm=realm ) self._entered_credential = cred return ascheme, None, cred except Exception as e: lgr.debug('Credential retrieval failed: %s', e) return ascheme, None, None def handle_401(self, r, **kwargs): """Callback that receives any response to a request Any non-4xx response or a response lacking a 'www-authenticate' header is ignored. Server-provided 'www-authenticate' challenges are inspected, and corresponding credentials are looked up (if needed) and subsequently tried in a re-request to the original URL after performing any necessary actions to meet a given challenge. Such a re-request then uses the same connection as the original request. Particular challenges are implemented in dedicated classes, e.g. :class:`requests.auth.HTTPBasicAuth`. Credential look-up or entry is performed by :meth:`datalad_next.requests_auth.DataladAuth._get_credential`. """ if not 400 <= r.status_code < 500: # fast return if this is no error, see # https://github.com/psf/requests/issues/3772 for background return r if 'www-authenticate' not in r.headers: # no info on how to authenticate to react to, leave it as-is. # this also catches any non-401-like error code (e.g. 429). # doing this looser check (rather than going for 401 # specifically) makes it possible to support services that send # www-authenticate with e.g. 403s return r # which auth schemes does the server support? auth_schemes = parse_www_authenticate(r.headers['www-authenticate']) ascheme, credname, cred = self._get_credential(r.url, auth_schemes) if cred is None or 'secret' not in cred: # we got nothing, leave things as they are return r # TODO add safety check. if a credential somehow contains # information on its scope (i.e. only for github.com) # prevent its use for other hosts -- maybe unless given explicitly. if ascheme is None: # if there is no authentication scheme identified, look at the # credential, if it knows ascheme = cred.get('http_auth_scheme') # if it does not, go with the first supported scheme that matches # the credential type, one is guaranteed to match ascheme = [ c for c in auth_schemes if c in DataladAuth._supported_auth_schemes and cred.get('type') == DataladAuth._supported_auth_schemes[c] ][0] if ascheme == 'basic': return self._authenticated_rerequest( r, requests.auth.HTTPBasicAuth(cred['user'], cred['secret']), **kwargs) elif ascheme == 'digest': return self._authenticated_rerequest( r, requests.auth.HTTPDigestAuth(cred['user'], cred['secret']), **kwargs) elif ascheme == 'bearer': return self._authenticated_rerequest( r, HTTPBearerTokenAuth(cred['secret']), **kwargs) else: raise NotImplementedError( 'Only unsupported HTTP auth schemes offered ' f'{list(auth_schemes.keys())!r} need {ascheme!r}') def handle_redirect(self, r, **kwargs): """Callback that receives any response to a request Any non-redirect response is ignored. This callback drops an explicitly set credential whenever the redirect causes a non-encrypted connection to be used after the original request was encrypted, or when the `netloc` of the redirect differs from the original target.
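For example (URLs are illustrative only):: https://example.com/data -> http://example.com/data credential dropped (HTTPS to HTTP) https://example.com/data -> https://elsewhere.org/data credential dropped (different netloc) https://example.com/data -> https://example.com/other credential kept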
""" if r.is_redirect and self._credential: og_p = urlparse(r.url) rd_p = urlparse(r.headers.get('location'), '') if og_p.netloc != rd_p.netloc or ( rd_p.scheme == 'http' and og_p.scheme == 'https'): lgr.debug( 'URL redirect, discarded given credential %r ' 'to avoid leakage', self._credential) self._credential = None def _authenticated_rerequest( self, response: requests.models.Response, auth: requests.auth.AuthBase, **kwargs ) -> requests.models.Response: """Helper to rerun a request, but with authentication added""" prep = _get_renewed_request(response) auth(prep) _r = response.connection.send(prep, **kwargs) _r.history.append(response) _r.request = prep return _r def _get_renewed_request(r: requests.models.Response ) -> requests.models.PreparedRequest: """Helper. Logic taken from requests.auth.HTTPDigestAuth""" # Consume content and release the original connection # to allow our new request to reuse the same one. r.content r.close() prep = r.request.copy() requests.cookies.extract_cookies_to_jar( prep._cookies, r.request, r.raw) prep.prepare_cookies(prep._cookies) return prep class HTTPBearerTokenAuth(requests.auth.AuthBase): """Attaches HTTP Bearer Token Authentication to the given Request object. """ def __init__(self, token): super().__init__() self.token = token def __call__(self, r): r.headers["Authorization"] = f'Bearer {self.token}' return r datalad-next-1.4.1/datalad_next/utils/specialremote.py000066400000000000000000000133431462321624600230730ustar00rootroot00000000000000import logging import os lgr = logging.getLogger('datalad.utils.specialremote') def get_specialremote_param_dict(params): """ Parameters ---------- params : list Returns ------- dict """ return dict(p.split('=', maxsplit=1) for p in params) def get_specialremote_credential_properties(params): """Determine properties of credentials special remote configuration The input is a parameterization as it would be given to `git annex initremote|enableremote ...`, or as stored in `remote.log`. These parameters are inspected and a dictionary of credential properties, suitable for `CredentialManager.query()` is returned. This inspection may involve network activity, e.g. HTTP requests. Parameters ---------- params : list or dict Either a list of strings of the format 'param=value', or a dictionary with parameter names as keys. Returns ------- dict or None Credential property name-value mapping. This mapping can be passed to `CredentialManager.query()`. If no credential properties could be inferred, for example, because the special remote type is not recognized `None` is returned. 
""" if isinstance(params, (list, tuple)): params = get_specialremote_param_dict(params) props = {} # no other way to do this specifically for each supported remote type remote_type = params.get('type') if remote_type == 'webdav': from .http_helpers import get_auth_realm from datalad_next.url_operations import HttpUrlOperations url = params.get('url') if not url: return url, urlprops = HttpUrlOperations().probe_url(url) realm = get_auth_realm(url, urlprops.get('auth')) if realm: props['realm'] = realm else: return return props or None def update_specialremote_credential( srtype, credman, credname, credprops, credtype_hint=None, duplicate_hint=None): """ Parameters ---------- srtype: str credman: CredentialManager credname: str or Name credprops: dict """ if not credname: # name could still be None, if this was entered # create a default name, and check if it has not been used credname = '{type}{udelim}{user}{delim}{realm}'.format( type=srtype, udelim='-' if 'user' in credprops else '', user=credprops.get('user', ''), delim='-' if 'realm' in credprops else '', realm=credprops.get('realm', ''), ) if credman.get( name=credname, # give to make legacy credentials accessible _type_hint=credtype_hint): # this is already in use, do not override lgr.warning( 'The entered credential will not be stored, ' 'a credential with the default name %r already exists.%s', credname, f' {duplicate_hint}' if duplicate_hint else '') return # we have used a credential, store it with updated usage info try: credman.set(credname, _lastused=True, **credprops) except Exception as e: from datalad_next.exceptions import CapturedException # we do not want to crash for any failure to store a # credential lgr.warn( 'Exception raised when storing credential %r %r: %s', credname, credprops, CapturedException(e), ) # mapping for credential properties for specific special remote # types. this is unpleasantly non-generic, but only a small # subset of git-annex special remotes require credentials to be # given via ENV vars, and all of rclone's handle it internally specialremote_credential_envmap = dict( # it makes no sense to pull a short-lived access token from # a credential store, it can be given via AWS_SESSION_TOKEN # in any case glacier=dict( user='AWS_ACCESS_KEY_ID', # nosec secret='AWS_SECRET_ACCESS_KEY'), # nosec s3=dict( user='AWS_ACCESS_KEY_ID', # nosec secret='AWS_SECRET_ACCESS_KEY'), # nosec webdav=dict( user='WEBDAV_USERNAME', # nosec secret='WEBDAV_PASSWORD'), # nosec ) def needs_specialremote_credential_envpatch(remote_type): """Returns whether the environment needs to be patched with credentials Returns ------- bool False, if the special remote type is not recognized as one needing credentials, or if there are credentials already present. True, otherwise. 
""" if remote_type not in specialremote_credential_envmap: lgr.debug('Special remote type %r not supported for credential setup', remote_type) return False # retrieve deployment mapping env_map = specialremote_credential_envmap[remote_type] if all(k in os.environ for k in env_map.values()): # the ENV is fully set up # let's prefer the environment to behave like git-annex lgr.debug( 'Not deploying credentials for special remote type %r, ' 'already present in environment', remote_type) return False # no counterevidence return True def get_specialremote_credential_envpatch(remote_type, cred): """Create an environment path for a particular remote type and credential Returns ------- dict or None A dict with all required items to patch the environment, or None if not enough information is available, or nothing needs to be patched. """ env_map = specialremote_credential_envmap.get(remote_type, {}) return { # take whatever partial setup the ENV has already v: cred[k] for k, v in env_map.items() if v not in os.environ } or None datalad-next-1.4.1/datalad_next/utils/tests/000077500000000000000000000000001462321624600210235ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/utils/tests/__init__.py000066400000000000000000000000001462321624600231220ustar00rootroot00000000000000datalad-next-1.4.1/datalad_next/utils/tests/test_deprecated.py000066400000000000000000000130511462321624600245340ustar00rootroot00000000000000import warnings from ..deprecate import deprecated import pytest @deprecated(msg='nothing to see here', version='1.0') def deprecated_function(inputstring): return inputstring class RandomClass(object): @deprecated(msg="nothing to see here", version='1.0') def deprecated_method(self, inputstring): return inputstring @deprecated(msg="nothing to see here", version='1.0') class DeprecatedClass(object): def __call__(self, inputstring): return inputstring @deprecated(msg='nothing to see here', kwarg='inputmode', version='1.0') def deprecated_function_param(inputmode='default', other_param=None): return inputmode class RandomClassParam(object): @deprecated(msg="nothing to see here", kwarg='inputmode', version='1.0') def deprecated_method(self, inputmode='default', other_param=None): return inputmode @deprecated(msg='nothing to see here', kwarg='inputmode', kwarg_values=['default'], version='1.0') def deprecated_function_param_value(inputmode='default'): return inputmode @deprecated(msg='nothing to see here', kwarg='inputmode', kwarg_values=[None], version='1.0') def deprecated_function_param_value_none(inputmode=None): return inputmode class RandomClassParamValue(object): @deprecated(msg="nothing to see here", kwarg='inputmode', kwarg_values=['default'], version='1.0') def deprecated_method(self, inputmode='default'): return inputmode @deprecated(msg='nothing to see here', version='1.0', kwarg='mode') @deprecated(msg='even less to see here', version='1.0', kwarg='othermode') def double_deprecated_function(mode='default', othermode='moredefault'): return (mode, othermode) @deprecated(msg='nothing to see here', version='1.0', kwarg='mode', kwarg_values=['1', '2']) def two_deprecated_values(mode='default'): return mode def test_deprecated(): # deprecations for entire functions/classes input_string = 'hello world' for func in [deprecated_function, RandomClass().deprecated_method]: with pytest.warns( DeprecationWarning, match=f"{func.__module__}.{func.__name__} was deprecated in version 1.0. 
nothing to see here"): res = func(inputstring=input_string) assert res == input_string with pytest.warns(DeprecationWarning, match="nothing to see here"): DeprecatedClass() # deprecations for a kwarg inputmode = 'default' for func in [deprecated_function_param, RandomClassParam().deprecated_method]: with pytest.warns(DeprecationWarning, match="argument 'inputmode'"): res = func(inputmode=inputmode) assert res == inputmode # deprecations for a kwarg value for func in [deprecated_function_param_value, RandomClassParamValue().deprecated_method, ]: with pytest.warns( DeprecationWarning, match="Use of values {'default'} for argument 'inputmode'"): res = func(inputmode=inputmode) assert res == inputmode # `None` value deprecation is supported # test in many complicated forms for v in (None, (None,), [None], {None: 'some'}): with pytest.warns( DeprecationWarning, match="Use of values {None} for argument 'inputmode'"): res = deprecated_function_param_value_none(inputmode=v) assert res == v # no deprecations for an unused deprecated parameter or parameter value for func in [deprecated_function_param_value, RandomClassParamValue().deprecated_method, ]: with warnings.catch_warnings(record=True) as record: res = func(inputmode='not-deprecated') assert res == 'not-deprecated' assert len(record) == 0 for func in [deprecated_function_param, RandomClassParam().deprecated_method]: with warnings.catch_warnings(record=True) as record: res = func(other_param='something!') assert res == inputmode assert len(record) == 0 # make sure it catches the parameter even if its a list for func in [deprecated_function_param_value, RandomClassParamValue().deprecated_method, ]: with pytest.warns( DeprecationWarning, match="Use of values {'default'} for argument 'inputmode'"): res = func(inputmode=[inputmode]) assert res == [inputmode] with warnings.catch_warnings(record=True) as record: res = func(inputmode=['not-deprecated']) assert res == ['not-deprecated'] assert len(record) == 0 # two decorators work as expected with pytest.warns(DeprecationWarning) as record: res = double_deprecated_function(mode='1', othermode='2') assert res == ('1', '2') assert len(record) == 2 assert 'nothing to see here' in str(record.list[0].message) assert 'even less to see here' in str(record.list[1].message) # test that everything works when the function has several deprecated values with pytest.warns(DeprecationWarning): res = two_deprecated_values(mode='1') assert res == '1' # shouldn't matter if the parameter value is a list res = two_deprecated_values(mode=['1']) assert res == ['1'] with warnings.catch_warnings(record=True) as record: res = two_deprecated_values(mode='safe') assert res == 'safe' assert len(record) == 0 datalad-next-1.4.1/datalad_next/utils/tests/test_multihash.py000066400000000000000000000010371462321624600244330ustar00rootroot00000000000000import pytest from ..multihash import ( MultiHash, NoOpHash, ) def test_multihash(): mh = MultiHash(['sha1', 'MD5']) mh.update(b'') hd = mh.get_hexdigest() assert len(hd) == 2 # algorithm label preserves original casing assert hd['MD5'] == 'd41d8cd98f00b204e9800998ecf8427e' assert hd['sha1'] == 'da39a3ee5e6b4b0d3255bfef95601890afd80709' with pytest.raises(ValueError): MultiHash(['bogus']) def test_noophash(): mh = NoOpHash() mh.update(b'') assert mh.get_hexdigest() == {} datalad-next-1.4.1/datalad_next/utils/tests/test_paramdictator.py000066400000000000000000000003601462321624600252650ustar00rootroot00000000000000import pytest from .. 
import ParamDictator def test_paramdictator(): d = {'a': 1, 'b': 2} pd = ParamDictator(d) assert pd.a == 1 assert pd.b == 2 with pytest.raises(AssertionError): assert pd.__dir__ is None datalad-next-1.4.1/datalad_next/utils/tests/test_parse_www_authenticate.py000066400000000000000000000027161462321624600272160ustar00rootroot00000000000000 from ..requests_auth import parse_www_authenticate challenges = ( # just challenge type ('Negotiate', [('negotiate', None)]), # challenge and just a token, tolerate any base64 padding ('Negotiate abcdef', [('negotiate', 'abcdef')]), ('Negotiate abcdef=', [('negotiate', 'abcdef=')]), ('Negotiate abcdef==', [('negotiate', 'abcdef==')]), # standard bearer ('Bearer realm=example.com', [('bearer', {'realm': 'example.com'})]), # standard digest ('Digest realm="example.com", qop="auth,auth-int", nonce="abcdef", ' 'opaque="ghijkl"', [('digest', {'realm': 'example.com', 'qop': 'auth,auth-int', 'nonce': 'abcdef', 'opaque': 'ghijkl'})]), # multi challenge ('Basic speCial="paf ram", realm="basIC", ' 'Bearer, ' 'Digest realm="http-auth@example.org", qop="auth, auth-int", ' 'algorithm=MD5', [('basic', {'special': 'paf ram', 'realm': 'basIC'}), ('bearer', None), ('digest', {'realm': "http-auth@example.org", 'qop': "auth, auth-int", 'algorithm': 'MD5'})]), # same challenge, multiple times, last one wins ('Basic realm="basIC", ' 'Basic realm="complex"', [('basic', {'realm': 'complex'})]), ) def test_parse_www_authenticate(): for hdr, targets in challenges: res = parse_www_authenticate(hdr) for ctype, props in targets: assert ctype in res assert res[ctype] == props datalad-next-1.4.1/datalad_next/utils/tests/test_patch.py000066400000000000000000000007431462321624600235370ustar00rootroot00000000000000from ..patch import patched_env from os import environ def test_patched_env(): if 'HOME' in environ: home = environ['HOME'] with patched_env(HOME=None): assert 'HOME' not in environ assert environ['HOME'] == home unusual_name = 'DATALADPATCHENVTESTVAR' if unusual_name not in environ: with patched_env(**{unusual_name: 'dummy'}): assert environ[unusual_name] == 'dummy' assert unusual_name not in environ datalad-next-1.4.1/docs/000077500000000000000000000000001462321624600150215ustar00rootroot00000000000000datalad-next-1.4.1/docs/CODEOWNERS000066400000000000000000000012221462321624600164110ustar00rootroot00000000000000# The release team (RT) is responsible for all code (reviews), unless # a more precise, additional specification further down matches a particular # changeset. # # It is the responsibility of any RT member to act on code review # requests in a timely manner. # # RT member approval is required for any merge request. # # Merge requests are accepted (automatically) when all (relevant) # status checks have passed, and RT approval was given. * michael.hanke@gmail.com /iter_collections/ christian.moench@web.de /iterable_subprocess/ christian.moench@web.de /patches/ christian.moench@web.de /runners/ christian.moench@web.de /shell/ christian.moench@web.de datalad-next-1.4.1/docs/Makefile000066400000000000000000000165101462321624600164640ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = -W SPHINXBUILD = python -m sphinx PAPER = BUILDDIR = build # User-friendly check for sphinx-build #ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) #$(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) #endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* source/generated source/_extras/schema.json html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." 
qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/datalad_next.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/datalad_next.qhc" applehelp: $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp @echo @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." @echo "N.B. You won't be able to view it unless you put it in" \ "~/Library/Documentation/Help or install it in your application" \ "bundle." devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/datalad_next" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/datalad_next" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." coverage: $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 
pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." datalad-next-1.4.1/docs/README.md000066400000000000000000000033331462321624600163020ustar00rootroot00000000000000## Editing, building, and publishing extension documentation The `datalad-extension-template` uses [Sphinx](https://www.sphinx-doc.org/en/master/index.html#) for document generation and suggests using [Read the Docs](https://docs.readthedocs.io/en/stable/) for automatic documentation building, versioning, and hosting. Once you are ready to document your extension software, take note of the following: ### Document editing Edit your `docs/source/index.rst` file using [reStructuredText](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html), which is the default plaintext markup language used by Sphinx. Add further documentation as needed. ### Local testing To test locally whether your documentation builds and renders correctly, first install the developer requirements from the repository's root directory: ``` pip install -r requirements-devel.txt ``` Then build the documentation locally: ``` make -C docs html ``` Navigate to `docs/build/html/` and open `index.html` in your browser to view your documentation. ### Remote building and testing The GitHub Action workflow located at `.github/workflows/docbuild.yml` will run on a push or pull request to your GitHub repository's master/main branch. This builds the documentation remotely and serves as an automated documentation test. ### Publishing your documentation - If you maintain your extension yourself *outside of the scope of the DataLad GitHub organization*, you can follow [these instructions](https://docs.readthedocs.io/en/stable/integrations.html) for integrating your version control system (such as GitHub) with Read the Docs. - If your extension is *maintained by the DataLad developer team*, please create an issue asking for help with the setup. datalad-next-1.4.1/docs/policy/000077500000000000000000000000001462321624600163205ustar00rootroot00000000000000datalad-next-1.4.1/docs/policy/release-management.md000066400000000000000000000014721462321624600224000ustar00rootroot00000000000000# Release team The release team (RT) is in charge of reviewing merge requests and issuing new releases. The members of the RT are defined in `docs/CODEOWNERS` in the `main` branch of the repository. The RT itself adds or removes RT members. It is the RT's duty to act on any merge request in a timely manner. A code review by at least one RT member is required for any changeset to be merged into the `main` branch. When all technical checks pass (e.g., CI success, resolved pull-request conversations), any RT member approval is a sufficient condition for an (automatic) merge of a changeset into the `main` branch. RT members are not expected to be experts in all techniques, features, and parts of the code base. Consequently, a team member should seek feedback prior to approving merge requests whenever necessary.
datalad-next-1.4.1/docs/source/000077500000000000000000000000001462321624600163215ustar00rootroot00000000000000datalad-next-1.4.1/docs/source/_static/000077500000000000000000000000001462321624600177475ustar00rootroot00000000000000datalad-next-1.4.1/docs/source/_static/datalad_logo.png000066400000000000000000000016761462321624600231010ustar00rootroot00000000000000PNG  IHDRddGPA@ncO gw iYAJ# a 0H!S X* Hhm#A,v6,]i&RFkV2Lijd ʰʏ6 >}]SD| ~p~mX/OTD~ Up4adQ/™ǣ%x!Ex4ʵꈾd8uّ@h[i(mBB!㠽PH2?c!)^U͙pw~?gGx =l9u9aJ0$O8xh$pIENDB`datalad-next-1.4.1/docs/source/_templates/000077500000000000000000000000001462321624600204565ustar00rootroot00000000000000datalad-next-1.4.1/docs/source/_templates/autosummary/000077500000000000000000000000001462321624600230445ustar00rootroot00000000000000datalad-next-1.4.1/docs/source/_templates/autosummary/class.rst000066400000000000000000000001641462321624600247040ustar00rootroot00000000000000{{ fullname }} {{ underline }} .. autoclass:: {{ fullname }} :members: :undoc-members: :show-inheritance: datalad-next-1.4.1/docs/source/_templates/autosummary/module.rst000066400000000000000000000006641462321624600250710ustar00rootroot00000000000000{% if fullname == 'datalad.api' -%} `{{ name }}` =={%- for c in name %}={%- endfor %} .. automodule:: datalad.api .. currentmodule:: datalad.api {% for item in members if not item.startswith('_') %} `{{ item }}` --{%- for c in item %}-{%- endfor %} .. autofunction:: {{ item }} {% endfor %} {% else -%} {{ fullname }} {{ underline }} .. automodule:: {{ fullname }} :members: :undoc-members: :show-inheritance: {% endif %} datalad-next-1.4.1/docs/source/annex-backends.rst000066400000000000000000000002161462321624600217330ustar00rootroot00000000000000Git-annex backends ****************** .. currentmodule:: datalad_next.annexbackends .. autosummary:: :toctree: generated base xdlradatalad-next-1.4.1/docs/source/annex-specialremotes.rst000066400000000000000000000002631462321624600232020ustar00rootroot00000000000000Git-annex special remotes ************************* .. currentmodule:: datalad_next.annexremotes .. autosummary:: :toctree: generated SpecialRemote archivist uncurl datalad-next-1.4.1/docs/source/api.rst000066400000000000000000000003321462321624600176220ustar00rootroot00000000000000High-level API commands *********************** .. currentmodule:: datalad.api .. autosummary:: :toctree: generated create_sibling_webdav credentials download ls_file_collection next_status tree datalad-next-1.4.1/docs/source/cmd.rst000066400000000000000000000004621462321624600176200ustar00rootroot00000000000000Command line reference ********************** .. toctree:: :maxdepth: 1 generated/man/datalad-create-sibling-webdav generated/man/datalad-credentials generated/man/datalad-download generated/man/datalad-ls-file-collection generated/man/datalad-next-status generated/man/datalad-tree datalad-next-1.4.1/docs/source/conf.py000066400000000000000000000113531462321624600176230ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # datalad_next documentation build configuration file, created by # sphinx-quickstart on Tue Oct 13 08:41:19 2015. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. 
from __future__ import annotations import sys import subprocess import datetime from os.path import ( abspath, dirname, exists, join as opj, ) from os import pardir import datalad_next # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.insert(0, os.path.abspath('.')) # generate missing pieces for setup_py_path in (opj(pardir, 'setup.py'), # travis opj(pardir, pardir, 'setup.py')): # RTD if exists(setup_py_path): sys.path.insert(0, abspath(dirname(setup_py_path))) # Build manpage try: subprocess.run( args=[setup_py_path, 'build_manpage', '--cmdsuite', 'datalad_next:command_suite', '--manpath', abspath(opj( dirname(setup_py_path), 'build', 'man')), '--rstpath', opj(dirname(__file__), 'generated', 'man'), ], check=True, ) except (FileNotFoundError, subprocess.CalledProcessError): # shut up and do your best pass # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', 'sphinx.ext.ifconfig', 'sphinx.ext.inheritance_diagram', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', ] # for the module reference autosummary_generate = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # General information about the project. project = u'Datalad Next' copyright = u'2018-{}, DataLad team'.format(datetime.datetime.now().year) author = u'DataLad team' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. version = datalad_next.__version__ release = version # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = 'en' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns: list[str] = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = 'sphinx_rtd_theme' # The name of an image file (relative to this directory) to place at the top # of the sidebar. html_logo = '_static/datalad_logo.png' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # If true, the index is split into individual pages for each letter. html_split_index = True # If true, links to the reST sources are added to the pages. html_show_sourcelink = False # smart quotes are incompatible with the RST flavor of the generated manpages # but see `smartquotes_action` for more fine-grained control, in case # some of this functionality is needed smartquotes = False # render docstrings for dunder-methods, e.g. `__call__`. napoleon_include_special_with_doc = True datalad-next-1.4.1/docs/source/developer_guide/000077500000000000000000000000001462321624600214635ustar00rootroot00000000000000datalad-next-1.4.1/docs/source/developer_guide/constraints.rst000066400000000000000000000065471462321624600246000ustar00rootroot00000000000000.. _constraints: ``datalad-next``'s Constraint System ************************************ ``datalad_next.constraints`` implements a system to perform data validation, coercion, and parameter documentation for commands via a flexible set of "Constraints". You can find an overview of available Constraints in the respective module overview of the :ref:`pyutils`. Adding parameter validation to a command ---------------------------------------- In order to equip an existing or new command with the constraint system, the following steps are required: * Set the commands base class to ``ValidatedInterface``: .. code-block:: python from datalad_next.commands import ValidatedInterface @build_doc class MyCommand(ValidatedInterface): """Download from URLs""" * Declare a ``_validator_`` class member: .. code-block:: python from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, ) @build_doc class MyCommand(ValidatedInterface): """Download from URLs""" _validator_ = EnsureCommandParameterization(dict( [...] )) * Determine for each parameter of the command whether it has constraints, and what those constraints are. If you're transitioning an existing command, remove any ``constraints=`` declaration in the ``_parameter_`` class member. * Add a fitting Constraint declaration for each parameter into the ``_validator_`` as a key-value pair where the key is the parameter and its value is a Constraint. There does not need to be a Constraint per parameter; only add entries for parameters that need validation. .. code-block:: python from datalad_next.commands import ( EnsureCommandParameterization, ValidatedInterface, ) from datalad_next.constraints import EnsureChoice from datalad_next.constraints import EnsureDataset @build_doc class Download(ValidatedInterface): """Download from URLs""" _validator_ = EnsureCommandParameterization(dict( dataset=EnsureDataset(installed=True), force=EnsureChoice('yes','no','maybe'), )) Combining constraints """"""""""""""""""""" Constraints can be combined in different ways. The ``|``, ``&``, and ``()`` operators allow ``AND``, ``OR``, and grouping of Constraints. The following example from the ``download`` command defines a chain of possible Constraints: .. code-block:: python spec_item_constraint = url2path_constraint | ( ( EnsureJSON() | EnsureURLFilenamePairFromURL() ) & url2path_constraint) Constrains can also be combined using ``AnyOf`` or ``AllOf`` MultiConstraints, which correspond almost entirely to ``|`` and ``&``. Here's another example from the ``download`` command: .. 
code-block:: python spec_constraint = AnyOf( spec_item_constraint, EnsureListOf(spec_item_constraint), EnsureGeneratorFromFileLike( spec_item_constraint, exc_mode='yield', ), One can combine an arbitrary number of Constraints. They are evaluated in the order in which they were specified. Logical OR constraints will return the value from the first constraint that does not raise an exception, and logical AND constraints pass the return values of each constraint into the next. Implementing additional constraints ----------------------------------- TODO Parameter Documentation ----------------------- TODO datalad-next-1.4.1/docs/source/developer_guide/contributing.rst000066400000000000000000000007611462321624600247300ustar00rootroot00000000000000.. _contributing: Contributing to ``datalad-next`` ******************************** We're happy about contributions of any kind to this project - thanks for considering making one! Please take a look at `CONTRIBUTING.md `_ for an overview of development principles and common questions, and `get in touch `_ in case of questions or to discuss features, bugs, or other issues.datalad-next-1.4.1/docs/source/developer_guide/index.rst000066400000000000000000000004721462321624600233270ustar00rootroot00000000000000.. _devguide: Developer Guide =============== This guide sheds light on new and reusable subsystems developed in ``datalad-next``. The target audience are developers that intend to build up on or use functionality provided by this extension. .. toctree:: :maxdepth: 2 constraints.rst contributing.rst datalad-next-1.4.1/docs/source/git-remote-helpers.rst000066400000000000000000000002141462321624600225640ustar00rootroot00000000000000Git-remote helpers ****************** .. currentmodule:: datalad_next.gitremotes .. autosummary:: :toctree: generated datalad_annex datalad-next-1.4.1/docs/source/index.rst000066400000000000000000000066241462321624600201720ustar00rootroot00000000000000DataLad NEXT extension ********************** This `DataLad `__ extension can be thought of as a staging area for additional functionality, or for improved performance and user experience. Unlike other topical or more experimental extensions, the focus here is on functionality with broad applicability. This extension is a suitable dependency for other software packages that intend to build on this improved set of functionality. Installation and usage ====================== Install from PyPi or Github like any other Python package:: # create and enter a new virtual environment (optional) $ virtualenv --python=python3 ~/env/dl-next $ . ~/env/dl-next/bin/activate # install from PyPi $ python -m pip install datalad-next Once installed, additional commands provided by this extension are immediately available. However, in order to fully benefit from all improvements, the extension has to be enabled for auto-loading by executing:: git config --global --add datalad.extensions.load next Doing so will enable the extension to also alter the behavior the core DataLad package and its commands. Functionality provided by DataLad NEXT ====================================== The following table of contents offers entry points to the main components provided by this extension. The `project README `__ offers a more detailed summary in a different format. .. 
toctree:: :maxdepth: 1 api.rst cmd.rst Infrastructure classes and utilities git-remote-helpers.rst annex-backends.rst annex-specialremotes.rst patches.rst Developing with DataLad NEXT ============================ This extension package moves fast in comparison to the DataLad core package. Nevertheless, attention is paid to API stability, adequate semantic versioning, and informative changelogs. Besides the DataLad commands shipped with this extension package, a number of Python utilities are provided that facilitate the implementation of workflows and additional functionality. An overview is available in the :ref:`reference manual `. Public vs internal Python API ----------------------------- Anything that can be imported directly from any of the top-level sub-packages in `datalad_next` is considered to be part of the public API. Changes to this API determine the versioning, and development is done with the aim to keep this API as stable as possible. This includes signatures and return value behavior. As an example:: from datalad_next.runners import iter_git_subproc imports a part of the public API, but:: from datalad_next.runners.git import iter_git_subproc does not. Use of the internal API ----------------------- Developers can obviously use parts of the non-public API. However, this should only be done with the understanding that these components may change from one release to another, with no guarantee of transition periods, deprecation warnings, etc. Developers are advised to never reuse any components with names starting with `_` (underscore). Their use should be limited to their individual sub-package. Contributor information ======================= .. toctree:: :maxdepth: 2 developer_guide/index.rst Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` .. |---| unicode:: U+02014 .. em dash datalad-next-1.4.1/docs/source/patches.rst000066400000000000000000000012621462321624600205030ustar00rootroot00000000000000DataLad patches *************** Patches that are automatically applied to DataLad when loading the ``datalad-next`` extension package. .. currentmodule:: datalad_next.patches .. autosummary:: :toctree: generated add_method_url2transport_path annexrepo cli_configoverrides commanderror common_cfg configuration create_sibling_ghlike create_sibling_gitlab customremotes_main distribution_dataset fix_ria_ora_tests interface_utils patch_ria_ora push_optimize push_to_export_remote replace_create_sibling_ria replace_ora_remote replace_sshremoteio ria_utils run siblings ssh_exec sshconnector test_keyring update datalad-next-1.4.1/docs/source/pyutils.rst000066400000000000000000000013501462321624600205630ustar00rootroot00000000000000.. _pyutils: Python tooling ************** ``datalad-next`` comprises a number of more-or-less self-contained mini-packages providing particular functionality. These implementations are candidates for a migration into the DataLad core package, and are provided here for immediate use. If and when components are migrated, transition modules will be kept to prevent API breakage in dependent packages. .. currentmodule:: datalad_next .. 
autosummary:: :toctree: generated archive_operations commands config constraints consts credman datasets exceptions iterable_subprocess itertools iter_collections repo_utils runners shell tests tests.fixtures types uis url_operations utils datalad-next-1.4.1/pyproject.toml000066400000000000000000000041051462321624600170050ustar00rootroot00000000000000[build-system] requires = ["setuptools >= 43.0.0", "wheel"] [tool.commitizen] name = "cz_customize" tag_format = "$version" version_scheme = "pep440" version_provider = "scm" changelog_incremental = true template = ".changelog.md.j2" gpg_sign = true [tool.commitizen.customize] commit_parser = "^((?Pfeat|fix|rf|perf|test|docs|BREAKING CHANGE)(?:\\((?P[^()\r\n]*)\\)|\\()?(?P!)?|\\w+!):\\s(?P.*)?(?P.*)?" change_type_order = ["BREAKING CHANGE", "feat", "fix", "rf", "perf", "docs", "test"] changelog_pattern = "^((BREAKING[\\-\\ ]CHANGE|\\w+)(\\(.+\\))?!?):" bump_pattern = "^((BREAKING[\\-\\ ]CHANGE|\\w+)(\\(.+\\))?!?):" schema_pattern = "(?s)(ci|docs|feat|fix|perf|rf|style|test|chore|revert|bump)(\\(\\S+\\))?!?:( [^\\n\\r]+)((\\n\\n.*)|(\\s*))?$" [tool.commitizen.customize.bump_map] "^\\w+!" = "MAJOR" "^BREAKING" = "MAJOR" "^feat" = "MINOR" "^fix" = "PATCH" [tool.commitizen.customize.change_type_map] "BREAKING CHANGE" = "Breaking changes" docs = "📝 Documentation" feat = "💫 New features" fix = "🐛 Bug Fixes" test = "🛡 Tests" rf = "Refactorings" perf = "Performance improvements" [tool.pytest.ini_options] addopts = "--strict-markers" markers = [ # datalad-next custom markers "skip_if_no_network", # (implicitly) used markers from datalad-core, which are only declared # in its tox.ini (inaccessible to pytest here) "fail_slow", "githubci_osx", "githubci_win", "integration", "known_failure", "known_failure_githubci_osx", "known_failure_githubci_win", "known_failure_osx", "known_failure_windows", "network", "osx", "probe_known_failure", "serve_path_via_http", "skip_if_adjusted_branch", "skip_if_no_network", "skip_if_on_windows", "skip_if_root", "skip_known_failure", "skip_nomultiplex_ssh", "skip_ssh", "skip_wo_symlink_capability", "slow", "turtle", "usecase", "windows", "with_config", "with_fake_cookies_db", "with_memory_keyring", "with_sameas_remotes", "with_testrepos", "without_http_proxy", ] datalad-next-1.4.1/readthedocs.yml000066400000000000000000000010521462321624600170770ustar00rootroot00000000000000# .readthedocs.yaml # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-20.04 tools: python: "3.9" # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/source/conf.py formats: all # Optionally declare the Python requirements required to build your docs python: install: - path: . 
method: pip - requirements: requirements-devel.txt datalad-next-1.4.1/requirements-devel.txt000066400000000000000000000002001462321624600204420ustar00rootroot00000000000000# requirements for a development environment # (also) to get the docs built properly by RTD -e .[devel] sphinx sphinx_rtd_theme datalad-next-1.4.1/setup.cfg000066400000000000000000000037731462321624600157240ustar00rootroot00000000000000[metadata] url = https://github.com/datalad/datalad-next author = The DataLad Team and Contributors author_email = team@datalad.org description = What is next in DataLad long_description = file:README.md long_description_content_type = text/markdown; charset=UTF-8 license = MIT classifiers = Programming Language :: Python License :: OSI Approved :: BSD License Programming Language :: Python :: 3 [options] python_requires = >= 3.8 install_requires = annexremote datalad >= 0.18.4 humanize more-itertools packages = find_namespace: include_package_data = True [options.packages.find] include = datalad_next* [options.extras_require] # this matches the name used by -core and what is expected by some CI setups devel = pytest pytest-cov coverage # for iterable_subprocess psutil # for webdav testing cheroot wsgidav webdavclient3 httpsupport = requests requests_toolbelt [options.entry_points] # 'datalad.extensions' is THE entrypoint inspected by the datalad API builders datalad.extensions = # the label in front of '=' is the command suite label # the entrypoint can point to any symbol of any name, as long it is # valid datalad interface specification (see demo in this extensions) next = datalad_next:command_suite console_scripts = git-annex-backend-XDLRA = datalad_next.annexbackends.xdlra:main git-remote-datalad-annex = datalad_next.gitremotes.datalad_annex:main git-annex-remote-uncurl = datalad_next.annexremotes.uncurl:main git-annex-remote-archivist = datalad_next.annexremotes.archivist:main [versioneer] # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. 
VCS = git style = pep440 versionfile_source = datalad_next/_version.py versionfile_build = datalad_next/_version.py tag_prefix = parentdir_prefix = [coverage:report] show_missing = True omit = # versioneer code datalad_next/_version.py datalad-next-1.4.1/setup.py000077500000000000000000000005541462321624600156120ustar00rootroot00000000000000#!/usr/bin/env python import sys from setuptools import setup import versioneer from _datalad_buildsupport.setup import ( BuildManPage, ) cmdclass = versioneer.get_cmdclass() cmdclass.update(build_manpage=BuildManPage) if __name__ == '__main__': setup(name='datalad_next', version=versioneer.get_version(), cmdclass=cmdclass, ) datalad-next-1.4.1/tools/000077500000000000000000000000001462321624600152315ustar00rootroot00000000000000datalad-next-1.4.1/tools/appveyor/000077500000000000000000000000001462321624600170765ustar00rootroot00000000000000datalad-next-1.4.1/tools/appveyor/chmod600.bat000066400000000000000000000004431462321624600211070ustar00rootroot00000000000000set key=%1 :: remove inheritance icacls %key% /c /t /Inheritance:d :: set ownership to owner icacls %key% /c /t /Grant %UserName%:F :: remove all users except owner icacls %key% /c /t /Remove:g "Authenticated Users" BUILTIN\Administrators BUILTIN Everyone System Users :: cleanup set "key=" datalad-next-1.4.1/tools/appveyor/docker-load-httpbin000077500000000000000000000004671462321624600226650ustar00rootroot00000000000000#!/bin/sh set -e -u imgfile=~/cache/httpbin.dockerimg if [ -f "$imgfile" ]; then # we have the image cached docker load < $imgfile else # pull from dockerhub docker pull kennethreitz/httpbin # and export for caching mkdir -p $(dirname $imgfile) docker save kennethreitz/httpbin > "$imgfile" fi datalad-next-1.4.1/tools/appveyor/enable-ssh-login000077500000000000000000000001741462321624600221550ustar00rootroot00000000000000#!/bin/bash set -e -u curl -sflL 'https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-ssh.sh' | bash -e - datalad-next-1.4.1/tools/appveyor/env_setup.bat000066400000000000000000000001451462321624600215760ustar00rootroot00000000000000set PY=%1-x64 set TMP=C:\DLTMP set TEMP=C:\DLTMP set PATH=C:\Python%PY%;C:\Python%PY%\Scripts;%PATH% datalad-next-1.4.1/tools/appveyor/install-git-annex000077500000000000000000000006171462321624600223660ustar00rootroot00000000000000#!/bin/bash # # Install git-annex. 
Any environment setup to source would be # written to ${HOME}/dlinstaller_env.sh # set -e -u # no install requested -> exit [ -z "$1" ] && exit 0 || true # assumes a virtualenv or equivalent python env # get the installer for this python -m pip install datalad-installer${DATALAD_INSTALLER_VERSION:-} datalad-installer -E ${HOME}/dlinstaller_env.sh --sudo ok $* datalad-next-1.4.1/tools/appveyor/install-syspkgs000077500000000000000000000005131462321624600221720ustar00rootroot00000000000000#!/bin/bash set -e # no install requested -> exit [ -z "$1" ] && exit 0 || true if (which apt-get > /dev/null ); then sudo apt-get update -qq -y --allow-releaseinfo-change sudo apt-get install -q --no-install-recommends -y eatmydata sudo eatmydata apt-get install -q --no-install-recommends -y $* else brew install -q $* fi datalad-next-1.4.1/tools/appveyor/setup-sshd000077500000000000000000000026701462321624600211300ustar00rootroot00000000000000#!/bin/bash set -e -u -x DATALAD_TESTS_SERVER_SSH_SECKEY=${DATALAD_TESTS_SERVER_SSH_SECKEY:-$HOME/.ssh/id_rsa} function setup_docker () { # obtain the docker image for SSH testing curl -fsSL --ssl-no-revoke -o sshd.dockerimg.gz "${DATALAD_TESTS_DOCKER_SSHD_DOWNLOADURL}" gzip -c -d sshd.dockerimg.gz | docker load # obtain the matching SSH private key for SSH server login curl \ -fsSL \ -o "${DATALAD_TESTS_SERVER_SSH_SECKEY}" \ "${DATALAD_TESTS_DOCKER_SSHD_SECKEY_DOWNLOADURL}" # start docker container docker run \ --rm -dit \ --name "${DATALAD_TESTS_DOCKER_SSHD_CONTAINER_NAME}" \ -p "${DATALAD_TESTS_SERVER_SSH_PORT}:22" \ -v "${DATALAD_TESTS_SERVER_LOCALPATH}:${DATALAD_TESTS_SERVER_SSH_PATH}" \ sshd } function setup_ssh_localhost () { ssh-keygen -f "${DATALAD_TESTS_SERVER_SSH_SECKEY}" -N '' cat "${DATALAD_TESTS_SERVER_SSH_SECKEY}.pub" >> ${HOME}/.ssh/authorized_keys } # if there is docker use it, if not, use the worker itself docker -v && setup_docker || setup_ssh_localhost # wipe any other known host keys ssh-keygen \ -f "${HOME}/.ssh/known_hosts" \ -R "[${DATALAD_TESTS_SERVER_SSH_HOST}]:${DATALAD_TESTS_SERVER_SSH_PORT}" # establish expected permission setup for SSH key chmod 600 "${DATALAD_TESTS_SERVER_SSH_SECKEY}" # ingest actual host key ssh-keyscan \ -t ecdsa \ -p "${DATALAD_TESTS_SERVER_SSH_PORT}" \ -H "${DATALAD_TESTS_SERVER_SSH_HOST}" >> "${HOME}/.ssh/known_hosts" datalad-next-1.4.1/tools/appveyor/setup-sshd.bat000066400000000000000000000020361462321624600216660ustar00rootroot00000000000000:: set -x @echo on :: download and ingest docker image curl -fsSL --ssl-no-revoke -o sshd.dockerimg.gz %DATALAD_TESTS_DOCKER_SSHD_DOWNLOADURL% gzip -c -d sshd.dockerimg.gz | docker load :: start container docker run --rm -dit --name %DATALAD_TESTS_DOCKER_SSHD_CONTAINER_NAME% -p %DATALAD_TESTS_SERVER_SSH_PORT%:22 -v %DATALAD_TESTS_SERVER_LOCALPATH%:%DATALAD_TESTS_SERVER_SSH_PATH% sshd :: give the service a moment to start (otherwise we may run into timeouts on windows) sleep 10 :: wipe any other known host keys ssh-keygen -f C:\Users\appveyor\.ssh\known_hosts -R "[%DATALAD_TESTS_SERVER_SSH_HOST%]:%DATALAD_TESTS_SERVER_SSH_PORT%" :: ingest actual host key ssh-keyscan -t ecdsa -p %DATALAD_TESTS_SERVER_SSH_PORT% %DATALAD_TESTS_SERVER_SSH_HOST% >> C:\Users\appveyor\.ssh\known_hosts :: get the ssh key matching the container curl -fsSL --ssl-no-revoke -o %DATALAD_TESTS_SERVER_SSH_SECKEY% %DATALAD_TESTS_DOCKER_SSHD_SECKEY_DOWNLOADURL% :: establish expected permission setup for SSH key tools\appveyor\chmod600.bat %DATALAD_TESTS_SERVER_SSH_SECKEY% 
datalad-next-1.4.1/tools/appveyor/verify-ssh-access000077500000000000000000000005121462321624600223600ustar00rootroot00000000000000#!/bin/bash set -e -u touch ${DATALAD_TESTS_SERVER_LOCALPATH}/probe ssh \ -i "${DATALAD_TESTS_SERVER_SSH_SECKEY}" \ -p "${DATALAD_TESTS_SERVER_SSH_PORT}" \ "${DATALAD_TESTS_SERVER_SSH_LOGIN}@${DATALAD_TESTS_SERVER_SSH_HOST}" \ test -f ${DATALAD_TESTS_SERVER_SSH_PATH}/probe rm ${DATALAD_TESTS_SERVER_LOCALPATH}/probe datalad-next-1.4.1/tools/appveyor/verify-ssh-access.bat000066400000000000000000000005271462321624600231300ustar00rootroot00000000000000:: set -x @echo on type nul >> %DATALAD_TESTS_SERVER_LOCALPATH%\probe ssh -i %DATALAD_TESTS_SERVER_SSH_SECKEY% -p %DATALAD_TESTS_SERVER_SSH_PORT% %DATALAD_TESTS_SERVER_SSH_LOGIN%@%DATALAD_TESTS_SERVER_SSH_HOST% test -f %DATALAD_TESTS_SERVER_SSH_PATH%/probe if %errorlevel% neq 0 exit /b %errorlevel% del %DATALAD_TESTS_SERVER_LOCALPATH%\probe datalad-next-1.4.1/versioneer.py000066400000000000000000002512251462321624600166330ustar00rootroot00000000000000 # Version: 0.29 """The Versioneer - like a rocketeer, but for versions. The Versioneer ============== * like a rocketeer, but for versions! * https://github.com/python-versioneer/python-versioneer * Brian Warner * License: Public Domain (Unlicense) * Compatible with: Python 3.7, 3.8, 3.9, 3.10, 3.11 and pypy3 * [![Latest Version][pypi-image]][pypi-url] * [![Build Status][travis-image]][travis-url] This is a tool for managing a recorded version number in setuptools-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. Making a new release should be as easy as recording a new tag in your version-control system, and maybe making new tarballs. ## Quick Install Versioneer provides two installation modes. The "classic" vendored mode installs a copy of versioneer into your repository. The experimental build-time dependency mode is intended to allow you to skip this step and simplify the process of upgrading. 
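In both modes, a project that keeps a `setup.py` typically hands version computation over to Versioneer there, as the `setup.py` of this source tree does. A minimal sketch (the project name is illustrative, not prescriptive):

```python
# Minimal sketch: delegate version computation to Versioneer in setup.py.
from setuptools import setup

import versioneer

setup(
    name="myproject",                      # illustrative project name
    version=versioneer.get_version(),      # version string derived from VCS tags
    cmdclass=versioneer.get_cmdclass(),    # build/sdist commands that rewrite _version.py
)
```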
### Vendored mode * `pip install versioneer` to somewhere in your $PATH * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is available, so you can also use `conda install -c conda-forge versioneer` * add a `[tool.versioneer]` section to your `pyproject.toml` or a `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) * Note that you will need to add `tomli; python_version < "3.11"` to your build-time dependencies if you use `pyproject.toml` * run `versioneer install --vendor` in your source tree, commit the results * verify version information with `python setup.py version` ### Build-time dependency mode * `pip install versioneer` to somewhere in your $PATH * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is available, so you can also use `conda install -c conda-forge versioneer` * add a `[tool.versioneer]` section to your `pyproject.toml` or a `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) * add `versioneer` (with `[toml]` extra, if configuring in `pyproject.toml`) to the `requires` key of the `build-system` table in `pyproject.toml`: ```toml [build-system] requires = ["setuptools", "versioneer[toml]"] build-backend = "setuptools.build_meta" ``` * run `versioneer install --no-vendor` in your source tree, commit the results * verify version information with `python setup.py version` ## Version Identifiers Source trees come from a variety of places: * a version-control system checkout (mostly used by developers) * a nightly tarball, produced by build automation * a snapshot tarball, produced by a web-based VCS browser, like github's "tarball from tag" feature * a release tarball, produced by "setup.py sdist", distributed through PyPI Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places: * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id * the name of the directory into which the tarball was unpacked * an expanded VCS keyword ($Id$, etc) * a `_version.py` created by some earlier build step For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier. For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes). The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. 
`_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation See [INSTALL.md](./INSTALL.md) for detailed installation instructions. ## Version-String Flavors Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`. Both functions return a dictionary with different flavors of version information: * `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles. * `['full-revisionid']`: detailed revision identifier. For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the commit date in ISO 8601 format. This will be None if the date is not available. * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that this is only accurate if run in a VCS checkout, otherwise it is likely to be False or None * `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown". Some variants are more useful than others. Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers). `version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases. The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`: from ._version import get_versions __version__ = get_versions()['version'] del get_versions ## Styles The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string. The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. 
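As a hedged illustration of the flavors and styles described above, the computed version data can be inspected from the project root roughly like this (the example values in the comments are illustrative, not actual output):

```python
# Sketch: print the version "flavors" Versioneer computes for the current tree.
import versioneer

info = versioneer.get_versions()
print(info["version"])          # e.g. "0.11+2.g1076c97.dirty" with the default "pep440" style
print(info["full-revisionid"])  # full VCS revision id, or None
print(info["dirty"])            # True if the checkout has uncommitted changes; may be None outside a VCS checkout
print(info["error"])            # None, or a message explaining why no version could be computed
print(info["date"])             # ISO 8601 date of the latest commit, or None
```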
For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11". Other styles are available. See [details.md](details.md) in the Versioneer source tree for descriptions. ## Debugging Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong). ## Known Limitations Some situations are known to cause problems for Versioneer. This details the most significant ones. More can be found on Github [issues page](https://github.com/python-versioneer/python-versioneer/issues). ### Subprojects Versioneer has limited support for source trees in which `setup.py` is not in the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are two common reasons why `setup.py` might not be in the root: * Source trees which contain multiple subprojects, such as [Buildbot](https://github.com/buildbot/buildbot), which contains both "master" and "slave" subprojects, each with their own `setup.py`, `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs and implementation details which frequently cause `pip install .` from a subproject directory to fail to find a correct version string (so it usually defaults to `0+unknown`). `pip install --editable .` should work correctly. `setup.py install` might work too. Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. [Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking this issue. The discussion in [PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. [pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve pip to let Versioneer work correctly. Versioneer-0.16 and earlier only looked for a `.git` directory next to the `setup.cfg`, so subprojects were completely unsupported with those releases. ### Editable installs with setuptools <= 18.5 `setup.py develop` and `pip install --editable .` allow you to install a project into a virtualenv once, then continue editing the source code (and test) without re-installing after every change. "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a convenient way to specify executable scripts that should be installed along with the python package. These both work as expected when using modern setuptools. When using setuptools-18.5 or earlier, however, certain operations will cause `pkg_resources.DistributionNotFound` errors when running the entrypoint script, which must be resolved by re-installing the package. This happens when the install happens with one version, then the egg_info data is regenerated while a different version is checked out.
Many setup.py commands cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. [Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) * edit `setup.cfg` and `pyproject.toml`, if necessary, to include any new configuration settings indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. * re-run `versioneer install --[no-]vendor` in your source tree, to replace `SRC/_version.py` * commit any changed files ## Future Directions This tool is designed to make it easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py . In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## Similar projects * [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time dependency * [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of versioneer * [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools plugin ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. Specifically, both are released under the "Unlicense", as described in https://unlicense.org/. [pypi-image]: https://img.shields.io/pypi/v/versioneer.svg [pypi-url]: https://pypi.python.org/pypi/versioneer/ [travis-image]: https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg [travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer """ # pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring # pylint:disable=missing-class-docstring,too-many-branches,too-many-statements # pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error # pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with # pylint:disable=attribute-defined-outside-init,too-many-arguments import configparser import errno import json import os import re import subprocess import sys from pathlib import Path from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union from typing import NoReturn import functools have_tomllib = True if sys.version_info >= (3, 11): import tomllib else: try: import tomli as tomllib except ImportError: have_tomllib = False class VersioneerConfig: """Container for Versioneer configuration parameters.""" VCS: str style: str tag_prefix: str versionfile_source: str versionfile_build: Optional[str] parentdir_prefix: Optional[str] verbose: Optional[bool] def get_root() -> str: """Get the project root directory. We require that all commands are run from the project root, i.e. the directory that contains setup.py, setup.cfg, and versioneer.py . 
""" root = os.path.realpath(os.path.abspath(os.getcwd())) setup_py = os.path.join(root, "setup.py") pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") if not ( os.path.exists(setup_py) or os.path.exists(pyproject_toml) or os.path.exists(versioneer_py) ): # allow 'python path/to/setup.py COMMAND' root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) setup_py = os.path.join(root, "setup.py") pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") if not ( os.path.exists(setup_py) or os.path.exists(pyproject_toml) or os.path.exists(versioneer_py) ): err = ("Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " "its immediate directory (like 'python setup.py COMMAND'), " "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND').") raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools # tree) execute all dependencies in a single python process, so # "versioneer" may be imported multiple times, and python's shared # module-import table will cache the first one. So we can't use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. my_path = os.path.realpath(os.path.abspath(__file__)) me_dir = os.path.normcase(os.path.splitext(my_path)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals(): print("Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(my_path), versioneer_py)) except NameError: pass return root def get_config_from_root(root: str) -> VersioneerConfig: """Read the project setup.cfg file to determine Versioneer config.""" # This might raise OSError (if setup.cfg is missing), or # configparser.NoSectionError (if it lacks a [versioneer] section), or # configparser.NoOptionError (if it lacks "VCS="). See the docstring at # the top of versioneer.py for instructions on writing your setup.cfg . root_pth = Path(root) pyproject_toml = root_pth / "pyproject.toml" setup_cfg = root_pth / "setup.cfg" section: Union[Dict[str, Any], configparser.SectionProxy, None] = None if pyproject_toml.exists() and have_tomllib: try: with open(pyproject_toml, 'rb') as fobj: pp = tomllib.load(fobj) section = pp['tool']['versioneer'] except (tomllib.TOMLDecodeError, KeyError) as e: print(f"Failed to load config from {pyproject_toml}: {e}") print("Try to load it from setup.cfg") if not section: parser = configparser.ConfigParser() with open(setup_cfg) as cfg_file: parser.read_file(cfg_file) parser.get("versioneer", "VCS") # raise error if missing section = parser["versioneer"] # `cast`` really shouldn't be used, but its simplest for the # common VersioneerConfig users at the moment. 
We verify against # `None` values elsewhere where it matters cfg = VersioneerConfig() cfg.VCS = section['VCS'] cfg.style = section.get("style", "") cfg.versionfile_source = cast(str, section.get("versionfile_source")) cfg.versionfile_build = section.get("versionfile_build") cfg.tag_prefix = cast(str, section.get("tag_prefix")) if cfg.tag_prefix in ("''", '""', None): cfg.tag_prefix = "" cfg.parentdir_prefix = section.get("parentdir_prefix") if isinstance(section, configparser.SectionProxy): # Make sure configparser translates to bool cfg.verbose = section.getboolean("verbose") else: cfg.verbose = section.get("verbose") return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" # these dictionaries contain VCS-specific tools LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" HANDLERS.setdefault(vcs, {})[method] = f return f return decorate def run_command( commands: List[str], args: List[str], cwd: Optional[str] = None, verbose: bool = False, hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs: Dict[str, Any] = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs) break except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, process.returncode return stdout, process.returncode LONG_VERSION_PY['git'] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. # Generated by versioneer-0.29 # https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" import errno import os import re import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Tuple import functools def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" VCS: str style: str tag_prefix: str parentdir_prefix: str versionfile_source: str verbose: bool def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command( commands: List[str], args: List[str], cwd: Optional[str] = None, verbose: bool = False, hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs: Dict[str, Any] = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs) break except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) return None, process.returncode return stdout, process.returncode def versions_from_parentdir( parentdir_prefix: str, root: str, verbose: bool, ) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. 
We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords( keywords: Dict[str, str], tag_prefix: str, verbose: bool, ) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r'\d', r): continue if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner(GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*" ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. Good or bad. branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. 
It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards (a feature branch will appear "older" than the master branch). Exceptions: 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the post-release version number (or -1 if no post-release segment is present). 
""" vc = str.split(ver, ".post") return vc[0], int(vc[1] or 0) if len(vc) == 2 else None def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 0.post0.devDISTANCE """ if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: rendered += ".post%%d.dev%%d" %% (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%%d" %% (pieces["distance"]) else: # no commits, use the tag as the version rendered = pieces["closest-tag"] else: # exception #1 rendered = "0.post0.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. Exceptions: 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-branch": rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-post-branch": rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. for _ in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} ''' @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords( keywords: Dict[str, str], tag_prefix: str, verbose: bool, ) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r'\d', r): continue if verbose: print("picking %s" % r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. 
This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner(GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*" ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. Good or bad. branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? 
pieces["error"] = ("unable to parse git-describe output: '%s'" % describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def do_vcs_install(versionfile_source: str, ipy: Optional[str]) -> None: """Git-specific installation logic for Versioneer. For Git, this means creating/changing .gitattributes to mark _version.py for export-subst keyword substitution. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] files = [versionfile_source] if ipy: files.append(ipy) if "VERSIONEER_PEP518" not in globals(): try: my_path = __file__ if my_path.endswith((".pyc", ".pyo")): my_path = os.path.splitext(my_path)[0] + ".py" versioneer_file = os.path.relpath(my_path) except NameError: versioneer_file = "versioneer.py" files.append(versioneer_file) present = False try: with open(".gitattributes", "r") as fobj: for line in fobj: if line.strip().startswith(versionfile_source): if "export-subst" in line.strip().split()[1:]: present = True break except OSError: pass if not present: with open(".gitattributes", "a+") as fobj: fobj.write(f"{versionfile_source} export-subst\n") files.append(".gitattributes") run_command(GITS, ["add", "--"] + files) def versions_from_parentdir( parentdir_prefix: str, root: str, verbose: bool, ) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") SHORT_VERSION_PY = """ # This file was generated by 'versioneer.py' (0.29) from # revision-control system data, or from the parent directory name of an # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. 
import json version_json = ''' %s ''' # END VERSION_JSON def get_versions(): return json.loads(version_json) """ def versions_from_file(filename: str) -> Dict[str, Any]: """Try to determine the version from _version.py if present.""" try: with open(filename) as f: contents = f.read() except OSError: raise NotThisMethod("unable to read _version.py") mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) def write_to_version_file(filename: str, versions: Dict[str, Any]) -> None: """Write the given version number to the given _version.py file.""" contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) print("set %s to '%s'" % (filename, versions["version"])) def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards (a feature branch will appear "older" than the master branch). Exceptions: 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the post-release version number (or -1 if no post-release segment is present). """ vc = str.split(ver, ".post") return vc[0], int(vc[1] or 0) if len(vc) == 2 else None def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 
0.post0.devDISTANCE """ if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: # no commits, use the tag as the version rendered = pieces["closest-tag"] else: # exception #1 rendered = "0.post0.dev%d" % pieces["distance"] return rendered def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%s" % pieces["short"] else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%s" % pieces["short"] return rendered def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. Exceptions: 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%s" % pieces["short"] if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += "+g%s" % pieces["short"] if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-branch": rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-post-branch": rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%s'" % style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} class VersioneerBadRootError(Exception): """The project root directory is unknown or missing key files.""" def get_versions(verbose: bool = False) -> Dict[str, Any]: """Get the project version from whatever source is available. Returns dict with two keys: 'version' and 'full'. """ if "versioneer" in sys.modules: # see the discussion in cmdclass.py:get_cmdclass() del sys.modules["versioneer"] root = get_root() cfg = get_config_from_root(root) assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or bool(cfg.verbose) # `bool()` used to avoid `None` assert cfg.versionfile_source is not None, \ "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) # extract version from first of: _version.py, VCS command (e.g. 'git # describe'), parentdir. This is meant to work for developers using a # source checkout, for users of a tarball created by 'setup.py sdist', # and for users of a tarball/zipball created by 'git archive' or github's # download-from-tag feature or the equivalent in other VCSes. 
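    # Descriptive sketch (added comment, not upstream Versioneer code): the
    # lookups below are tried in order -- expanded git-archive keywords, a
    # previously rewritten _version.py, a live VCS query via 'git describe',
    # and finally the parent directory name -- and the first method that does
    # not raise NotThisMethod wins. Each method returns a dict shaped like
    #   {"version": "1.4.1+3.g9bbe486", "full-revisionid": "...",
    #    "dirty": False, "error": None, "date": "..."}
    # (the values shown here are hypothetical).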
get_keywords_f = handlers.get("get_keywords") from_keywords_f = handlers.get("keywords") if get_keywords_f and from_keywords_f: try: keywords = get_keywords_f(versionfile_abs) ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) if verbose: print("got version from expanded keyword %s" % ver) return ver except NotThisMethod: pass try: ver = versions_from_file(versionfile_abs) if verbose: print("got version from file %s %s" % (versionfile_abs, ver)) return ver except NotThisMethod: pass from_vcs_f = handlers.get("pieces_from_vcs") if from_vcs_f: try: pieces = from_vcs_f(cfg.tag_prefix, root, verbose) ver = render(pieces, cfg.style) if verbose: print("got version from VCS %s" % ver) return ver except NotThisMethod: pass try: if cfg.parentdir_prefix: ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) if verbose: print("got version from parentdir %s" % ver) return ver except NotThisMethod: pass if verbose: print("unable to compute version") return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} def get_version() -> str: """Get the short version string for this project.""" return get_versions()["version"] def get_cmdclass(cmdclass: Optional[Dict[str, Any]] = None): """Get the custom setuptools subclasses used by Versioneer. If the package uses a different cmdclass (e.g. one from numpy), it should be provide as an argument. """ if "versioneer" in sys.modules: del sys.modules["versioneer"] # this fixes the "python setup.py develop" case (also 'install' and # 'easy_install .'), in which subdependencies of the main project are # built (using setup.py bdist_egg) in the same python process. Assume # a main project A and a dependency B, which use different versions # of Versioneer. A's setup.py imports A's Versioneer, leaving it in # sys.modules by the time B's setup.py is executed, causing B to run # with the wrong versioneer. Setuptools wraps the sub-dep builds in a # sandbox that restores sys.modules to it's pre-build state, so the # parent is protected against the child's "import versioneer". By # removing ourselves from sys.modules here, before the child build # happens, we protect the child from the parent's versioneer too. # Also see https://github.com/python-versioneer/python-versioneer/issues/52 cmds = {} if cmdclass is None else cmdclass.copy() # we add "version" to setuptools from setuptools import Command class cmd_version(Command): description = "report generated version string" user_options: List[Tuple[str, str, str]] = [] boolean_options: List[str] = [] def initialize_options(self) -> None: pass def finalize_options(self) -> None: pass def run(self) -> None: vers = get_versions(verbose=True) print("Version: %s" % vers["version"]) print(" full-revisionid: %s" % vers.get("full-revisionid")) print(" dirty: %s" % vers.get("dirty")) print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) cmds["version"] = cmd_version # we override "build_py" in setuptools # # most invocation pathways end up running build_py: # distutils/build -> build_py # distutils/install -> distutils/build ->.. # setuptools/bdist_wheel -> distutils/install ->.. # setuptools/bdist_egg -> distutils/install_lib -> build_py # setuptools/install -> bdist_egg ->.. # setuptools/develop -> ? # pip install: # copies source tree to a tempdir before running egg_info/etc # if .git isn't copied too, 'git describe' will fail # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? 
# pip install -e . and setuptool/editable_wheel will invoke build_py # but the build_py command is not expected to copy any files. # we override different "build_py" commands for both environments if 'build_py' in cmds: _build_py: Any = cmds['build_py'] else: from setuptools.command.build_py import build_py as _build_py class cmd_build_py(_build_py): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) if getattr(self, "editable_mode", False): # During editable installs `.py` and data files are # not copied to build_lib return # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_py"] = cmd_build_py if 'build_ext' in cmds: _build_ext: Any = cmds['build_ext'] else: from setuptools.command.build_ext import build_ext as _build_ext class cmd_build_ext(_build_ext): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_ext.run(self) if self.inplace: # build_ext --inplace will only build extensions in # build/lib<..> dir with no _version.py to write to. # As in place builds will already have a _version.py # in the module dir, we do not need to write one. return # now locate _version.py in the new build/ directory and replace # it with an updated value if not cfg.versionfile_build: return target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) if not os.path.exists(target_versionfile): print(f"Warning: {target_versionfile} does not exist, skipping " "version update. This can happen if you are running build_ext " "without first running build_py.") return print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_ext"] = cmd_build_ext if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe # type: ignore # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION # "product_version": versioneer.get_version(), # ... class cmd_build_exe(_build_exe): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _build_exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] if 'py2exe' in sys.modules: # py2exe enabled? 
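        # Descriptive note (added comment, not upstream Versioneer code): like
        # the cx_Freeze branch above, this override temporarily writes the
        # computed version into versionfile_source so the frozen executable
        # ships a static version string, then removes that file after the
        # build and restores the LONG_VERSION_PY template.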
try: from py2exe.setuptools_buildexe import py2exe as _py2exe # type: ignore except ImportError: from py2exe.distutils_buildexe import py2exe as _py2exe # type: ignore class cmd_py2exe(_py2exe): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _py2exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["py2exe"] = cmd_py2exe # sdist farms its file list building out to egg_info if 'egg_info' in cmds: _egg_info: Any = cmds['egg_info'] else: from setuptools.command.egg_info import egg_info as _egg_info class cmd_egg_info(_egg_info): def find_sources(self) -> None: # egg_info.find_sources builds the manifest list and writes it # in one shot super().find_sources() # Modify the filelist and normalize it root = get_root() cfg = get_config_from_root(root) self.filelist.append('versioneer.py') if cfg.versionfile_source: # There are rare cases where versionfile_source might not be # included by default, so we must be explicit self.filelist.append(cfg.versionfile_source) self.filelist.sort() self.filelist.remove_duplicates() # The write method is hidden in the manifest_maker instance that # generated the filelist and was thrown away # We will instead replicate their final normalization (to unicode, # and POSIX-style paths) from setuptools import unicode_utils normalized = [unicode_utils.filesys_decode(f).replace(os.sep, '/') for f in self.filelist.files] manifest_filename = os.path.join(self.egg_info, 'SOURCES.txt') with open(manifest_filename, 'w') as fobj: fobj.write('\n'.join(normalized)) cmds['egg_info'] = cmd_egg_info # we override different "sdist" commands for both environments if 'sdist' in cmds: _sdist: Any = cmds['sdist'] else: from setuptools.command.sdist import sdist as _sdist class cmd_sdist(_sdist): def run(self) -> None: versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old # version self.distribution.metadata.version = versions["version"] return _sdist.run(self) def make_release_tree(self, base_dir: str, files: List[str]) -> None: root = get_root() cfg = get_config_from_root(root) _sdist.make_release_tree(self, base_dir, files) # now locate _version.py in the new base_dir directory # (remembering that it may be a hardlink) and replace it with an # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist return cmds CONFIG_ERROR = """ setup.cfg is missing the necessary Versioneer configuration. You need a section like: [versioneer] VCS = git style = pep440 versionfile_source = src/myproject/_version.py versionfile_build = myproject/_version.py tag_prefix = parentdir_prefix = myproject- You will also need to edit your setup.py to use the results: import versioneer setup(version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), ...) Please read the docstring in ./versioneer.py for configuration instructions, edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. 
""" SAMPLE_CONFIG = """ # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. [versioneer] #VCS = git #style = pep440 #versionfile_source = #versionfile_build = #tag_prefix = #parentdir_prefix = """ OLD_SNIPPET = """ from ._version import get_versions __version__ = get_versions()['version'] del get_versions """ INIT_PY_SNIPPET = """ from . import {0} __version__ = {0}.get_versions()['version'] """ def do_setup() -> int: """Do main VCS-independent setup function for installing Versioneer.""" root = get_root() try: cfg = get_config_from_root(root) except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) return 1 print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") maybe_ipy: Optional[str] = ipy if os.path.exists(ipy): try: with open(ipy, "r") as f: old = f.read() except OSError: old = "" module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] snippet = INIT_PY_SNIPPET.format(module) if OLD_SNIPPET in old: print(" replacing boilerplate in %s" % ipy) with open(ipy, "w") as f: f.write(old.replace(OLD_SNIPPET, snippet)) elif snippet not in old: print(" appending to %s" % ipy) with open(ipy, "a") as f: f.write(snippet) else: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) maybe_ipy = None # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword # substitution. do_vcs_install(cfg.versionfile_source, maybe_ipy) return 0 def scan_setup_py() -> int: """Validate the contents of setup.py against Versioneer's expectations.""" found = set() setters = False errors = 0 with open("setup.py", "r") as f: for line in f.readlines(): if "import versioneer" in line: found.add("import") if "versioneer.get_cmdclass()" in line: found.add("cmdclass") if "versioneer.get_version()" in line: found.add("get_version") if "versioneer.VCS" in line: setters = True if "versioneer.versionfile_source" in line: setters = True if len(found) != 3: print("") print("Your setup.py appears to be missing some important items") print("(but I might be wrong). Please make sure it has something") print("roughly like the following:") print("") print(" import versioneer") print(" setup( version=versioneer.get_version(),") print(" cmdclass=versioneer.get_cmdclass(), ...)") print("") errors += 1 if setters: print("You should remove lines like 'versioneer.VCS = ' and") print("'versioneer.versionfile_source = ' . This configuration") print("now lives in setup.cfg, and should be removed from setup.py") print("") errors += 1 return errors def setup_command() -> NoReturn: """Set up Versioneer and exit with appropriate error code.""" errors = do_setup() errors += scan_setup_py() sys.exit(1 if errors else 0) if __name__ == "__main__": cmd = sys.argv[1] if cmd == "setup": setup_command()